├── LICENSE
├── README.md
└── llamacpp_mock_api.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Blake Wyatt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Code Llama for VSCode
2 |
3 | An API which mocks [Llama.cpp](https://github.com/ggerganov/llama.cpp) to enable support for Code Llama with the
4 | [Continue Visual Studio Code extension](https://continue.dev/).
5 |
6 | As of the time of writing and to my knowledge, this is the only way to use Code Llama with VSCode locally without having
7 | to sign up for or get an API key from a service. The one exception is Continue with [Ollama](https://ollama.ai/), but at the
8 | time of writing Ollama doesn't support Windows or Linux. Code Llama for VSCode, on the other hand, is completely
9 | cross-platform and will run wherever Meta's own [codellama](https://github.com/facebookresearch/codellama) code will run.
10 |
11 | Now let's get started!
12 |
13 | ### Setup
14 |
15 | Prerequisites:
16 | - [Download and run one of the Code Llama Instruct models](https://github.com/facebookresearch/codellama)
17 | - [Install the Continue VSCode extension](https://marketplace.visualstudio.com/items?itemName=Continue.continue)
18 |
19 | After you are able to use both independently, we will glue them together with Code Llama for VSCode.
20 |
21 | Steps:
22 | 1. Move `llamacpp_mock_api.py` into your [`codellama`](https://github.com/facebookresearch/codellama) folder and install Flask into your environment with `pip install flask`.
23 | 2. Run `llamacpp_mock_api.py` with your [Code Llama Instruct torchrun command](https://github.com/facebookresearch/codellama#fine-tuned-instruction-models), setting `--nproc_per_node` to the MP value for your model (1 for the 7B model, 2 for 13B, 4 for 34B). For example:
24 | ```
25 | torchrun --nproc_per_node 1 llamacpp_mock_api.py \
26 | --ckpt_dir CodeLlama-7b-Instruct/ \
27 | --tokenizer_path CodeLlama-7b-Instruct/tokenizer.model \
28 | --max_seq_len 512 --max_batch_size 4
29 | ```
30 | 3. Click the settings button at the bottom right of Continue's UI in VSCode and edit `config.json` so it looks like [this](https://docs.continue.dev/reference/Model%20Providers/llamacpp)[\[archive\]](http://web.archive.org/web/20240531162330/https://docs.continue.dev/reference/Model%20Providers/llamacpp), replacing `MODEL_NAME` with `codellama-7b` (see the sample entry below).
31 |
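For reference, a minimal `config.json` entry for this setup looks roughly like the sketch below. The exact schema can differ between Continue versions, so treat the linked reference above as authoritative; `apiBase` assumes the default port of 8080 used by `llamacpp_mock_api.py`.
```
{
  "models": [
    {
      "title": "Code Llama",
      "provider": "llama.cpp",
      "model": "codellama-7b",
      "apiBase": "http://localhost:8080"
    }
  ]
}
```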
32 | Restart VSCode or reload the Continue extension and you should now be able to use Code Llama for VSCode!
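To sanity-check the server without VSCode, you can POST a prompt to the mock API directly (again assuming the default port of 8080); the reply comes back as a single `data:` line in the same format Continue consumes:
```
curl -X POST http://localhost:8080/completion \
  -H "Content-Type: application/json" \
  -d '{"prompt": "[INST] Write a hello world program in Python. [/INST]"}'
```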
33 |
--------------------------------------------------------------------------------
/llamacpp_mock_api.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | import fire
4 | from flask import Flask, jsonify, request, Response
5 | import torch.distributed as dist
6 |
7 | from llama import Llama
8 |
9 | def main(
10 | ckpt_dir: str,
11 | tokenizer_path: str,
12 | temperature: float = 0.2,
13 | top_p: float = 0.95,
14 | max_seq_len: int = 512,
15 | max_batch_size: int = 8,
16 | max_gen_len: Optional[int] = None,
17 | port: int = 8080,
18 | ):
19 | print("Loading Code Llama...", end="", flush=True)
20 |
21 | # Create our Code Llama object.
22 | generator = Llama.build(
23 | ckpt_dir=ckpt_dir,
24 | tokenizer_path=tokenizer_path,
25 | max_seq_len=max_seq_len,
26 | max_batch_size=max_batch_size,
27 | )
28 |
29 | print("Done!", flush=True)
30 | print()
31 |
32 | # With torchrun and distributed PyTorch, multiple copies of this code can be
33 | # run at once. We only want one of them (rank 0) to host the Flask API, and we
34 | # will use it to control the rest.
35 | if dist.get_rank() == 0:
36 | app = Flask(__name__)
37 |
38 | def prompt_to_instructions(prompt):
39 | # Remove unnecessary tokens and spacing from Continue's prompt format.
40 | prompt = prompt.replace("\n", "")
41 | prompt = prompt.replace("[INST] ", "[INST]")
42 | prompt = prompt.replace(" [/INST]", "[/INST]")
43 |
44 | # Consume Continue's prompt string and transform it into a list of
45 | # message dicts which contain role information.
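# For example (illustrative only), a normalized prompt such as
#   "[INST]Hello[/INST]Hi there![INST]Write a loop.[/INST]"
# becomes
#   [{"role": "user", "content": "Hello"},
#    {"role": "assistant", "content": "Hi there!"},
#    {"role": "user", "content": "Write a loop."}]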
46 | messages = []
47 | prompt_start = 0
48 | while True:
49 | user_message_start = prompt.find("[INST]", prompt_start) + len("[INST]")
50 | user_message_end = prompt.find("[/INST]", prompt_start)
51 | assistant_message_end = prompt.find("[INST]", user_message_end)
52 |
53 | messages += [{"role": "user", "content": prompt[user_message_start:user_message_end]}]
54 |
55 | if assistant_message_end != -1:
56 | messages += [{"role": "assistant", "content": prompt[user_message_end + 7:assistant_message_end]}]
57 | else:
58 | break
59 |
60 | prompt_start = assistant_message_end
61 |
62 | # Send back the message instructions.
63 | return [messages]
64 |
65 | def run_chat_completion(prompt):
66 | # Transform the prompt format Continue uses into a list of
67 | # message dicts Code Llama supports.
68 | instructions = prompt_to_instructions(prompt)
69 |
70 | # Broadcast what should be processed to the other ranks (rank 0 acts as the controller).
71 | dist.broadcast_object_list([instructions, max_gen_len, temperature, top_p])
72 |
73 | # Start Code Llama inferencing.
74 | results = generator.chat_completion(
75 | instructions,
76 | max_gen_len=max_gen_len,
77 | temperature=temperature,
78 | top_p=top_p,
79 | )
80 |
81 | # Send the response back.
82 | return results[0]["generation"]["content"].strip()
83 |
84 | @app.route("/completion", methods=["POST"])
85 | def completion():
86 | content = request.json
87 |
88 | print("Incoming request: " + str(content))
89 |
90 | # Perform Code Llama chat completion.
91 | response = run_chat_completion(content["prompt"])
92 | response = jsonify({"content": response}).get_data(as_text=True)
93 |
94 | print("Outgoing response: " + str(response))
95 |
96 | # Llama.cpp's HTTP server uses Server-Sent Events (SSE) to stream results to the client,
97 | # so we reimplement it here as a single event sent to Continue which contains the
98 | # entire Code Llama response. SSE events are terminated by a blank line, hence "\n\n".
99 | def generate():
100 | yield "data: " + response + "\n\n"
101 | yield "data: [DONE]\n\n"
102 |
103 | # Send back the response as an event stream.
104 | return Response(generate(), mimetype="text/event-stream")
105 |
106 | # Run the Flask API server on the Llama.cpp port.
107 | app.run(port=port)
108 |
109 | # Ranks other than 0 wait for inference tasks broadcast by rank 0.
110 | else:
111 | while True:
112 | config = [None] * 4
113 | try:
114 | dist.broadcast_object_list(config)
115 | generator.chat_completion(
116 | config[0], max_gen_len=config[1], temperature=config[2], top_p=config[3]
117 | )
118 | except Exception:
119 | pass
120 |
121 | if __name__ == "__main__":
122 | fire.Fire(main)
123 |
--------------------------------------------------------------------------------