├── .gitmodules ├── LICENSE ├── MODEL_LICENSE ├── README.md ├── README_zh.md ├── api ├── README_zh.md ├── codegeex-api-example-java │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── cn │ │ └── aminer │ │ └── codegeex │ │ └── example │ │ ├── CodeGenerationExample.java │ │ └── pojo │ │ └── Payload.java └── codegeex-api-example-python │ └── generation_example.py ├── codegeex ├── __init__.py ├── benchmark │ ├── README.md │ ├── README_zh.md │ ├── __init__.py │ ├── evaluate_humaneval_x.py │ ├── execution.py │ ├── gather_output.py │ ├── humaneval-x │ │ ├── __init__.py │ │ ├── cpp │ │ │ ├── data │ │ │ │ └── humaneval_cpp.jsonl.gz │ │ │ └── evaluation │ │ │ │ └── test.cpp │ │ ├── evaluate_humaneval_x.py │ │ ├── generate_humaneval_x.py │ │ ├── go │ │ │ ├── data │ │ │ │ └── humaneval_go.jsonl.gz │ │ │ └── evaluation │ │ │ │ ├── go.mod │ │ │ │ ├── go.sum │ │ │ │ └── vendor.tar.gz │ │ ├── java │ │ │ └── data │ │ │ │ └── humaneval_java.jsonl.gz │ │ ├── js │ │ │ └── data │ │ │ │ └── humaneval_js.jsonl.gz │ │ ├── python │ │ │ └── data │ │ │ │ └── humaneval_python.jsonl.gz │ │ ├── rust │ │ │ └── data │ │ │ │ └── humaneval_rust.jsonl.gz │ │ └── translate_humaneval_x.py │ ├── inspect_result.py │ ├── metric.py │ ├── rust │ │ ├── Cargo.lock │ │ └── Cargo.toml │ └── utils.py ├── data │ ├── __init__.py │ ├── data_utils.py │ ├── process_pretrain_dataset.py │ ├── processor.py │ └── types.py ├── docker │ └── Dockerfile ├── kernels │ ├── __init__.py │ └── quantization.fatbin ├── megatron │ ├── __init__.py │ ├── arguments.py │ ├── checkpointing.py │ ├── code_generation_utils.py │ ├── convert_ckpt_parallel.py │ ├── data │ │ ├── __init__.py │ │ ├── blendable_dataset.py │ │ ├── data_samplers.py │ │ ├── dataset_utils.py │ │ ├── helpers.cpp │ │ ├── indexed_dataset.py │ │ └── prompt_dataset.py │ ├── enums.py │ ├── global_vars.py │ ├── inference.py │ ├── initialize.py │ ├── learning_rates.py │ ├── memory.py │ ├── merge_ckpt_parallel.py │ ├── microbatches.py │ ├── mindspore_to_megatron.py │ ├── model │ │ ├── __init__.py │ │ ├── codegeex_model.py │ │ ├── distributed.py │ │ ├── language_model.py │ │ ├── module.py │ │ ├── transformer.py │ │ └── utils.py │ ├── mpu │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── initialize.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ └── utils.py │ ├── optimizer │ │ ├── __init__.py │ │ ├── clip_grads.py │ │ ├── grad_scaler.py │ │ └── optimizer.py │ ├── p2p_communication.py │ ├── schedules.py │ ├── tokenizer │ │ ├── __init__.py │ │ ├── gpt2_tokenization.py │ │ └── tokenizer.py │ ├── tools │ │ ├── collect_env.py │ │ ├── finetune_codegeex.py │ │ └── pretrain_codegeex.py │ ├── training.py │ └── utils.py ├── mindspore │ ├── configs │ │ ├── 13B.sh │ │ ├── 13B_128p_save_1p.sh │ │ ├── 13B_128p_save_8p_ckpt.sh │ │ ├── 13B_1p_to_torch.sh │ │ ├── 13B_finetune.sh │ │ ├── 13B_generate.sh │ │ ├── 13B_generate_1p.sh │ │ ├── 13B_generate_1p_values.sh │ │ ├── 13B_generate_finetune.sh │ │ ├── 13B_generate_humaneval.sh │ │ └── 13B_generate_values.sh │ ├── convertion_1p.py │ ├── finetune.py │ ├── generation.py │ ├── generation_1p.py │ ├── generation_batch.py │ ├── generation_finetune.py │ ├── generation_humaneval.py │ ├── generation_values.py │ ├── generation_values_1p.py │ ├── save_1p_ckpt_from_8p_ckpt.py │ ├── save_8p_ckpt.py │ ├── scripts │ │ ├── custom_tune_bank_new │ │ │ └── Ascend910ProA │ │ │ │ ├── cube │ │ │ │ ├── repository_ascend910ProA_matmul.bin │ │ │ │ └── repository_ascend910ProA_matmul.json │ │ │ │ └── vector │ │ │ │ └── 
Ascend910ProA_AiCore_32_v001_20220509_200939_588817.json │ │ ├── layer_norm.py │ │ ├── layer_norm_x_backprop_v2.py │ │ ├── ma-pre-start.sh │ │ ├── run_modelarts.py │ │ ├── run_modelarts_gen_finetune.py │ │ └── run_modelarts_gen_humaneval_x.py │ ├── src │ │ ├── __init__.py │ │ ├── adam.py │ │ ├── callbacks.py │ │ ├── code_tokenizer.py │ │ ├── dataset.py │ │ ├── dataset_finetune.py │ │ ├── generate.py │ │ ├── generate_finetune.py │ │ ├── generate_greedy.py │ │ ├── generate_humaneval.py │ │ ├── metrics.py │ │ ├── pangu_alpha.py │ │ ├── pangu_alpha_config.py │ │ ├── pangu_alpha_fp16_predict.py │ │ ├── pangu_alpha_wrapcell.py │ │ ├── pangu_alpha_wrapcell_finetune.py │ │ ├── preprocess.py │ │ ├── sat_dataset.py │ │ ├── tokenization_jieba.py │ │ └── utils.py │ └── train.py ├── oneflow │ ├── __init__.py │ ├── codegeex_model.py │ └── inference.py ├── paddle │ ├── __init__.py │ ├── codegeex_model.py │ ├── inference.py │ └── pt_to_pdparams.py ├── quantization │ ├── __init__.py │ ├── quantize.py │ └── quantize_oneflow.py ├── tokenizer │ ├── __init__.py │ ├── added_tokens.json │ ├── merges.txt │ ├── special_tokens_map.json │ ├── tokenizer.py │ ├── tokenizer_config.json │ └── vocab.json └── torch │ ├── __init__.py │ ├── codegeex_model.py │ ├── get_ckpt_qkv.py │ └── inference.py ├── configs ├── codegeex_13b.sh ├── codegeex_13b_paddle.sh └── codegeex_13b_parallel.sh ├── deployment ├── example_inputs.jsonl └── server_gradio.py ├── generations ├── humaneval_python_generations.jsonl.gz └── humaneval_rust_generations.jsonl.gz ├── requirements.txt ├── resources ├── api │ ├── api_step_1.png │ ├── api_step_2.png │ ├── api_step_3.png │ ├── api_step_4.png │ └── api_step_5.png ├── en │ ├── codegeex_training.png │ ├── hx_boxplot.png │ ├── hx_examples.png │ ├── hx_generattion_radar_horizon.png │ ├── hx_pass_rate_vs_language.png │ ├── hx_tasks.png │ └── hx_translation.png ├── logo │ └── codegeex_logo.png └── zh │ ├── hx_boxplot_zh.png │ ├── hx_generattion_radar_horizon_zh.png │ ├── hx_pass_rate_vs_language_zh.png │ ├── hx_tasks_zh.png │ ├── hx_translation_zh.png │ ├── join_wechat.png │ └── wechat.md ├── scripts ├── convert_ckpt_parallel.sh ├── convert_mindspore_to_megatron.sh ├── evaluate_humaneval_x.py ├── evaluate_humaneval_x.sh ├── finetune_codegeex.sh ├── gather_output.sh ├── generate_humaneval_x.sh ├── pretrain_codegeex.sh ├── process_pretrain_dataset.sh ├── test_inference.sh ├── test_inference_oneflow.sh ├── test_inference_oneflow_quantized.sh ├── test_inference_paddle.sh ├── test_inference_parallel.sh ├── test_inference_quantized.sh └── translate_humaneval_x.sh ├── setup.py ├── tests ├── test_inference.py ├── test_inference_megatron.py ├── test_inference_oneflow.py ├── test_inference_paddle.py └── test_prompt.txt └── vscode-extension ├── README.md └── README_zh.md /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "vscode-extension/codegeex-vscode-extension"] 2 | path = vscode-extension/codegeex-vscode-extension 3 | url = git@github.com:CodeGeeX/codegeex-vscode-extension.git 4 | -------------------------------------------------------------------------------- /MODEL_LICENSE: -------------------------------------------------------------------------------- 1 | The CodeGeeX License 2 | 3 | 1. Definitions 4 | 5 | “Licensor” means the CodeGeeX Model Team that distributes its Software. 6 | 7 | “Software” means the CodeGeeX model parameters made available under this license. 8 | 9 | 2. 
License Grant 10 | 11 | Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes. 12 | 13 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 14 | 15 | 3. Restriction 16 | 17 | You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes. 18 | 19 | You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings. 20 | 21 | 4. Disclaimer 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | 25 | 5. Limitation of Liability 26 | 27 | EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 28 | 29 | 6. Dispute Resolution 30 | 31 | This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing. 32 | 33 | Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at report@aminer.cn. 
-------------------------------------------------------------------------------- /api/README_zh.md: -------------------------------------------------------------------------------- 1 | ![codegeex_logo](../resources/logo/codegeex_logo.png) 2 | 3 | # 创建CodeGeeX API 4 | 5 | 使用[天启 · API开放平台](https://tianqi.aminer.cn/open/)申请CodeGeeX API: 6 | 7 | 8 | 9 | 点击首页中的天启平台体验入口: 10 | 11 | 点击API应用: 12 | 13 | 输入任意名称,创建API应用。创建后会得到API Key/Secret,用于调用API: 14 | 15 | 16 | 在API信息中,可以查看代码生成/代码翻译的请求地址和使用文档: 17 | 18 | 19 | 根据文档中的描述使用API,Python版参考目录``api/codegeex-api-example-python``;JAVA版参考工程:``api/codegeex-api-example-java`` 20 | -------------------------------------------------------------------------------- /api/codegeex-api-example-java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | cn.aminer 8 | codegeex-api-example-java 9 | 1.0-SNAPSHOT 10 | 11 | 15 | 16 | UTF-8 17 | UTF-8 18 | 19 | 20 | 21 | 22 | 23 | org.apache.maven.plugins 24 | maven-compiler-plugin 25 | 3.8.1 26 | 27 | 29 | 1.8 30 | 1.8 31 | UTF-8 32 | 33 | 34 | 35 | org.apache.maven.plugins 36 | maven-assembly-plugin 37 | 3.3.0 38 | 39 | 40 | jar-with-dependencies 41 | 42 | 43 | 44 | 45 | 46 | package 47 | 48 | single 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | com.fasterxml.jackson.module 59 | jackson-module-parameter-names 60 | 2.6.6 61 | 62 | 63 | com.fasterxml.jackson.datatype 64 | jackson-datatype-jdk8 65 | 2.6.6 66 | 67 | 68 | com.fasterxml.jackson.datatype 69 | jackson-datatype-jsr310 70 | 2.6.6 71 | 72 | 73 | com.squareup.okhttp3 74 | okhttp 75 | 76 | 77 | org.slf4j 78 | slf4j-log4j12 79 | 80 | 81 | log4j 82 | log4j 83 | 84 | 85 | org.projectlombok 86 | lombok 87 | provided 88 | 89 | 90 | 91 | 92 | 93 | 94 | com.fasterxml.jackson.module 95 | jackson-module-parameter-names 96 | 97 | 98 | com.fasterxml.jackson.datatype 99 | jackson-datatype-jdk8 100 | 101 | 102 | com.fasterxml.jackson.datatype 103 | jackson-datatype-jsr310 104 | 105 | 106 | com.fasterxml.jackson.core 107 | jackson-databind 108 | 109 | 110 | com.squareup.okhttp3 111 | okhttp 112 | 4.10.0 113 | 114 | 115 | log4j 116 | log4j 117 | 1.2.17 118 | 119 | 120 | org.slf4j 121 | slf4j-log4j12 122 | 1.7.5 123 | 124 | 125 | org.projectlombok 126 | lombok 127 | 1.18.20 128 | provided 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | central 137 | ALiYun 138 | http://maven.aliyun.com/nexus/content/groups/public 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /api/codegeex-api-example-java/src/main/java/cn/aminer/codegeex/example/CodeGenerationExample.java: -------------------------------------------------------------------------------- 1 | package cn.aminer.codegeex.example; 2 | 3 | import cn.aminer.codegeex.example.pojo.Payload; 4 | import com.fasterxml.jackson.databind.ObjectMapper; 5 | import okhttp3.*; 6 | 7 | import java.io.IOException; 8 | 9 | /** 10 | * 调用 CodeGeeX API 生成代码的例子。 11 | * 12 | * @author Darran Zhang @ codelast.com 13 | * @version 2023-01-20 14 | */ 15 | public class CodeGenerationExample { 16 | public static final String API_KEY = "your_api_key"; // 在"天启开放平台"上申请到的API Key 17 | public static final String API_SECRET = "your_api_secret"; // 在"天启开放平台"上申请到的API Secret 18 | public static final int NUMBER = 3; // 生成几个候选 19 | public static final String LANGUAGE = "Java"; // 编程语言 20 | public static final String REQUEST_URL = "https://tianqi.aminer.cn/api/v2/multilingual_code_generate"; // 请求地址 21 | 22 | public static void main(String[] args) throws 
Exception { 23 | CodeGenerationExample example = new CodeGenerationExample(); 24 | String prompt = "// use OkHttpClient library to write a function to perform http post request\n\n" + 25 | "public class HttpPost {\n" + 26 | " public static void main(String[] args) {\n"; 27 | example.generateCode(prompt); 28 | } 29 | 30 | /** 31 | * 生成代码。 32 | * 33 | * @param prompt 待补全的代码 34 | */ 35 | public void generateCode(String prompt) throws Exception { 36 | ObjectMapper objectMapper = new ObjectMapper(); 37 | Payload payload = new Payload().setApiKey(API_KEY).setApiSecret(API_SECRET).setPrompt(prompt).setNumber(NUMBER) 38 | .setLanguage(LANGUAGE); 39 | String response = performHttpPost(REQUEST_URL, objectMapper.writeValueAsString(payload)); 40 | System.out.println(response); 41 | } 42 | 43 | /** 44 | * 发起 HTTP POST 请求。 45 | * 46 | * @param url 请求的URL 47 | * @param payload 请求的JSON数据 48 | * @return 请求返回的内容,若出错则返回 null。 49 | */ 50 | public String performHttpPost(String url, String payload) { 51 | HttpUrl.Builder builder = null; 52 | try { 53 | HttpUrl httpUrl = HttpUrl.parse(url); 54 | if (httpUrl != null) { 55 | builder = httpUrl.newBuilder(); 56 | } 57 | } catch (IllegalArgumentException e) { 58 | System.out.println("failed to create HttpUrl.Builder from url " + url + ":" + e); 59 | } 60 | if (builder == null) { 61 | return null; 62 | } 63 | OkHttpClient client = new OkHttpClient(); 64 | RequestBody requestBody = RequestBody.create(payload, MediaType.parse("application/json; charset=utf-8")); 65 | Request request = new Request.Builder() 66 | .url(builder.build()) 67 | .post(requestBody) 68 | .build(); 69 | 70 | try { 71 | Response response = client.newCall(request).execute(); 72 | ResponseBody body = response.body(); 73 | if (body == null) { 74 | System.out.println("null response body"); 75 | return null; 76 | } 77 | return body.string(); 78 | } catch (IOException e) { 79 | System.out.println("failed to send POST request: " + e); 80 | } 81 | return null; 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /api/codegeex-api-example-java/src/main/java/cn/aminer/codegeex/example/pojo/Payload.java: -------------------------------------------------------------------------------- 1 | package cn.aminer.codegeex.example.pojo; 2 | 3 | import com.fasterxml.jackson.annotation.JsonIgnoreProperties; 4 | import com.fasterxml.jackson.annotation.JsonProperty; 5 | import lombok.Data; 6 | import lombok.experimental.Accessors; 7 | 8 | /** 9 | * 发送到 CodeGeex API 的请求中包含的JSON payload对象。 10 | * 11 | * @author Darran Zhang @ codelast.com 12 | * @version 2023-01-20 13 | */ 14 | @JsonIgnoreProperties(ignoreUnknown = true) 15 | @Data 16 | @Accessors(chain = true) 17 | public class Payload { 18 | @JsonProperty("apikey") 19 | String apiKey; // 在"天启开放平台"上申请到的API Key 20 | 21 | @JsonProperty("apisecret") 22 | String apiSecret; // 在"天启开放平台"上申请到的API Secret 23 | 24 | String prompt; // 待补全的代码 25 | 26 | @JsonProperty("n") 27 | int number; // 生成几个候选 28 | 29 | @JsonProperty("lang") 30 | String language; // 编程语言 31 | } 32 | -------------------------------------------------------------------------------- /api/codegeex-api-example-python/generation_example.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | 3 | import json 4 | 5 | import requests 6 | 7 | ''' 8 | Code Generation 9 | ''' 10 | API_KEY = "" # Get from Tianqi console. 从控制台获取 11 | API_SECRET = "" # Get from Tianqi console. 
从控制台获取 12 | PROMPT = "from typing import List\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n " \ 13 | "\"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given " \ 14 | "threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements(" \ 15 | "[1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n" 16 | NUMBER = 3 17 | LANG = "Python" 18 | request_url = "https://tianqi.aminer.cn/api/v2/" 19 | api = 'multilingual_code_generate' 20 | 21 | # Request is in json format. 指定请求参数格式为json 22 | headers = {'Content-Type': 'application/json'} 23 | request_url = request_url + api 24 | data = { 25 | "apikey": API_KEY, 26 | "apisecret": API_SECRET, 27 | "prompt": PROMPT, 28 | "n": NUMBER, 29 | "lang": LANG 30 | } 31 | 32 | 33 | def main(): 34 | response = requests.post(request_url, headers=headers, data=json.dumps(data)) 35 | if response: 36 | print(response.json()) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /codegeex/__init__.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from typing import * 4 | from codegeex.tokenizer import CodeGeeXTokenizer 5 | from codegeex.torch.inference import get_token_stream 6 | 7 | 8 | def get_model( 9 | backend: str = "megatron", 10 | quantized: bool = False, 11 | ): 12 | pass 13 | 14 | 15 | def generate( 16 | model, 17 | tokenizer: CodeGeeXTokenizer, 18 | prompt: str, 19 | out_seq_length: int, 20 | seq_length: int = 2048, 21 | top_k: int = 0, 22 | top_p: float = 1.0, 23 | temperature: float = 1.0, 24 | micro_batch_size: int = 1, 25 | backend: str = "megatron", 26 | greedy: bool = False, 27 | verbose: bool = False, 28 | ): 29 | tokens = tokenizer.encode_code(prompt) 30 | n_token_prompt = len(tokens) 31 | 32 | if verbose: 33 | print(f"Current prompt:\n{prompt}") 34 | print("N_token_prompt:", n_token_prompt) 35 | 36 | generated_codes = [] 37 | if backend == "megatron": 38 | token_stream = get_token_stream( 39 | model, 40 | tokenizer, 41 | seq_length, 42 | out_seq_length, 43 | [copy.deepcopy(tokens) for _ in range(micro_batch_size)], 44 | micro_batch_size=micro_batch_size, 45 | topk=top_k, 46 | topp=top_p, 47 | temperature=temperature, 48 | greedy=greedy, 49 | ) 50 | is_finished = [False for _ in range(micro_batch_size)] 51 | for i, generated in enumerate(token_stream): 52 | generated_tokens = generated[0] 53 | for j in range(micro_batch_size): 54 | if is_finished[j]: 55 | continue 56 | 57 | if generated_tokens[j].cpu().numpy()[-1] == tokenizer.eos_token_id or len(generated_tokens[j]) >= out_seq_length: 58 | is_finished[j] = True 59 | generated_tokens_ = generated_tokens[j].cpu().numpy().tolist() 60 | generated_code = tokenizer.decode_code(generated_tokens_[n_token_prompt:]) 61 | generated_code = "".join(generated_code) 62 | generated_codes.append(generated_code) 63 | if verbose: 64 | print(f"\nGenerated code {i}:\n{generated_code}") 65 | 66 | if all(is_finished): 67 | break 68 | 69 | return generated_codes -------------------------------------------------------------------------------- /codegeex/benchmark/README.md: -------------------------------------------------------------------------------- 1 | # HumanEval-X: A new benchmark for Multilingual Program Synthesis 2 | 3 | 🌐 中文 4 | 5 | HumanEval-X is a new benchmark for better evaluating the multilingual ability of code generation models. 
While previous works evaluate multilingual program synthesis under semantic similarity (e.g., [CodeBLEU](https://arxiv.org/abs/2009.10297)) which is often misleading, HumanEval-X evaluates the functional correctness of the generated programs. HumanEval-X consists of 820 high-quality human-crafted data samples (each with test cases) in Python, C++, Java, JavaScript, and Go, and can be used for various tasks. 6 | 7 | 8 | 9 |

An illustration of tasks supported by HumanEval-X. Declarations, docstrings, and solutions are marked with red, green, and blue respectively. Code generation uses the declaration and docstring as input to generate the solution. Code translation uses the declarations in both languages and translates the solution in the source language into the one in the target language.

10 | 11 | In HumanEval-X, every sample in each language contains declaration, docstring, and solution, which can be combined in various ways to support different downstream tasks including generation, translation, summarization, etc. We currently focus on two tasks: **code generation** and **code translation**. For code generation, the model uses declaration and docstring as input to generate the solution. For code translation, the model uses declarations in both languages and the solution in the source language as input, to generate solutions in the target language. We remove the description during code translation to prevent the model from directly solving the problem. For both tasks, we use the unbiased pass@k metric proposed in [Codex](https://arxiv.org/abs/2107.03374): $\text{pass}@k:= \mathbb{E}[1-\frac{\tbinom{n-c}{k}}{\tbinom{n}{k}}]$, with $n=200$ and $k\in(1,10,100)$. 12 | 13 | ## How to use HumanEval-X 14 | 15 | Data are stored in ``codegeex/benchmark/humaneval-x/[LANG]/data/humaneval_[LANG].jsonl.gz``, using JSON list format. There are six keys: 16 | 17 | * ``task_id``: indicates the target language and ID of the problem. Language is one of ["Python", "Java", "JavaScript", "CPP", "Go"]. 18 | * ``prompt``: the function declaration and docstring, used for code generation. 19 | * ``declaration``: only the function declaration, used for code translation. 20 | * ``canonical_solution``: human-crafted example solutions. 21 | * ``test``: hidden test samples, used for evaluation. 22 | * ``example_test``: public test samples (appeared in prompt), used for evaluation. 23 | 24 | ### Evaluation Environment 25 | 26 | The evaluation of the generated codes involves compiling and running in multiple programming languages. The versions of the programming language environments and packages we use are as follows: 27 | 28 | | Dependency | Version | 29 | | ---------- | -------- | 30 | | Python | 3.8.12 | 31 | | JDK | 18.0.2.1 | 32 | | Node.js | 16.14.0 | 33 | | js-md5 | 0.7.3 | 34 | | C++ | 11 | 35 | | g++ | 7.5.0 | 36 | | Boost | 1.71.0 | 37 | | OpenSSL | 3.0.0 | 38 | | go | 1.18.4 | 39 | 40 | In order to save everyone the trouble of setting up the environments for these languages, we build a Docker image with the required environments and CodeGeeX installed. 41 | 42 | You can directly pull the image from Docker Hub: 43 | 44 | ```bash 45 | docker pull rishubi/codegeex:latest 46 | ``` 47 | 48 | Alternatively, if you are familiar with Dockerfile, you can build the image from `codegeex/docker/Dockerfile` or configure the Dockerfile as you like it: 49 | 50 | ```bash 51 | cd codegeex/docker 52 | docker build [OPTIONS] . 53 | ``` 54 | 55 | After obtaining the image, you can build a container using the following command: 56 | 57 | ```bash 58 | docker run -it --gpus all --mount type=bind,source=,target= [OPTIONS] 59 | ``` 60 | 61 | ### Evaluation 62 | 63 | We recommend evaluating in [the provided image](#evaluation-environment). To evaluate the generated samples, save generated codes in the following JSON list format: 64 | 65 | ``` 66 | {"task_id": "../..", "generation: "..."} 67 | {"task_id": "../..", "generation: "..."} 68 | ... 69 | ``` 70 | 71 | and evaluate them using the following script under the root directory of the repository (please execute with caution, the generated codes might have unexpected behaviours though with very low possibility. 
See the warnings in [execution.py](execution.py) and uncomment the execution lines at your own risk): 72 | 73 | ```bash 74 | bash scripts/evaluate_humaneval_x.sh 75 | ``` 76 | -------------------------------------------------------------------------------- /codegeex/benchmark/README_zh.md: -------------------------------------------------------------------------------- 1 | # HumanEval-X: 多语言代码生成基准 2 | 3 | 🌐 English 4 | 5 | 为了更好地评测代码生成模型的多语言生成能力,我们构建了一个新基准HumanEval-X。此前,多语言代码生成能力是基于语义相似度(比如[CodeBLEU](https://arxiv.org/abs/2009.10297))衡量的,具有一定误导性;HumanEval-X则可用于衡量生成代码的功能正确性。HumanEval-X包含820个高质量手写样本,覆盖Python、C++、Java、JavaScript、Go,可用于多种任务。 6 | 7 | 8 | 9 |

HumanEval-X支持的任务示例。声明、描述、解答分别用红、绿、蓝色标注。代码生成将声明与描述作为输入,输出解答。代码翻译将两种语言的声明与源语言的解答作为输入,输出目标语言的解答。

10 | 11 | HumanEval-X中每个语言的样本,包含了声明、描述和解答,它们之间的组合可以支持不同的下游任务,包括生成、翻译、概括等。我们目前关注两个任务:**代码生成**与**代码翻译**。对于代码生成任务,模型将函数声明与文档字符串作为输入,输出函数实现;对于代码翻译任务,模型将两种语言的函数声明与源语言的实现作为输入,输出目标语言上的实现。我们在代码翻译任务中不将文档字符串输入模型,以避免模型直接通过描述生成答案。在两种任务下,我们都采用[Codex](https://arxiv.org/abs/2107.03374)所使用的无偏pass@k指标:$\text{pass}@k:= \mathbb{E}[1-\frac{\tbinom{n-c}{k}}{\tbinom{n}{k}}]$, $n=200$, $k\in(1,10,100)$。 12 | 13 | ## 如何使用HumanEval-X 14 | 15 | 样本使用JSON列表格式存储在``codegeex/benchmark/humaneval-x/[LANG]/data/humaneval_[LANG].jsonl.gz``,每条样本包含6个部分: 16 | 17 | * ``task_id``: 题目的目标语言与ID。语言为["Python", "Java", "JavaScript", "CPP", "Go"]中之一。 18 | * ``prompt``: 函数声明与描述,用于代码生成。 19 | * ``declaration``: 仅有函数声明,用于代码翻译。 20 | * ``canonical_solution``: 手写的示例解答。 21 | * ``test``: 隐藏测例,用于评测。 22 | * ``example_test``: 提示中出现的公开测例,用于评测。 23 | 24 | ### 评测环境 25 | 26 | 评测生成的代码需要使用多种语言编译、运行。我们使用的各编程语言依赖及所用包的版本如下: 27 | 28 | | 依赖 | 版本 | 29 | | ------- | -------- | 30 | | Python | 3.8.12 | 31 | | JDK | 18.0.2.1 | 32 | | Node.js | 16.14.0 | 33 | | js-md5 | 0.7.3 | 34 | | C++ | 11 | 35 | | g++ | 7.5.0 | 36 | | Boost | 1.71.0 | 37 | | OpenSSL | 3.0.0 | 38 | | go | 1.18.4 | 39 | 40 | 为了省去使用者配置这些语言环境的麻烦,我们构建了一个Docker镜像,并在其中配置了所需要的环境。 41 | 42 | 可以直接从Docker Hub拉取镜像: 43 | 44 | ```bash 45 | docker pull rishubi/codegeex:latest 46 | ``` 47 | 48 | 如果您熟悉Dockerfile,也可以从`codegeex/docker/Dockerfile`构建镜像,或者修改之以定制自己的配置: 49 | 50 | ```bash 51 | cd codegeex/docker 52 | docker build [OPTIONS] . 53 | ``` 54 | 55 | 获取镜像后,使用如下命令创建容器: 56 | 57 | ```bash 58 | docker run -it --gpus all --mount type=bind,source=,target= [OPTIONS] 59 | ``` 60 | 61 | ### 评测 62 | 63 | 我们推荐使用给定的[评测环境](#评测环境)进行评测。在评测前,将生成的代码以如下JSON列表形式存储: 64 | 65 | ``` 66 | {"task_id": "../..", "generation: "..."} 67 | {"task_id": "../..", "generation: "..."} 68 | ... 69 | ``` 70 | 71 | 并在本仓库的根目录下使用如下指令(请谨慎执行,生成的代码可能有极低概率产生意外行为。在[execution.py](execution.py)中查看警告并取消执行代码的注释,风险自负): 72 | 73 | ```bash 74 | bash scripts/evaluate_humaneval_x.sh 75 | ``` 76 | -------------------------------------------------------------------------------- /codegeex/benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/benchmark/__init__.py -------------------------------------------------------------------------------- /codegeex/benchmark/gather_output.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import fire 4 | import glob 5 | 6 | 7 | def gather_output( 8 | output_dir: str = "./output", 9 | output_prefix: str = None, 10 | if_remove_rank_files: int = 0, 11 | ): 12 | if output_prefix is None: 13 | output_list = glob.glob(output_dir + "/*") 14 | else: 15 | output_list = glob.glob(os.path.join(output_dir, output_prefix + "*")) 16 | 17 | for output_file in output_list: 18 | if "rank0" in output_file: 19 | output_prefix_ = output_file.split("_rank0.jsonl")[0] 20 | rank_files = glob.glob(output_prefix_ + "_rank*") 21 | with open(output_prefix_ + ".jsonl", "w") as f_out: 22 | for rank_file in rank_files: 23 | with open(rank_file, "r") as f_in: 24 | for line in f_in: 25 | f_out.write(line) 26 | if if_remove_rank_files: 27 | os.remove(rank_file) 28 | print(f"Removing {rank_file}...") 29 | 30 | if output_prefix is None: 31 | output_list = glob.glob(output_dir + "/*") 32 | else: 33 | output_list = glob.glob(os.path.join(output_dir, output_prefix + "*")) 34 | 35 | for output_file in output_list: 36 | if "rank" in output_file or 
"_unfinished" in output_file or "all" in output_file or "_result" in output_file: 37 | continue 38 | if "_finished" not in output_file: 39 | continue 40 | output_prefix_ = output_file.split("_finished.jsonl")[0] 41 | files = [output_file, output_prefix_ + "_unfinished.jsonl"] 42 | with open(output_prefix_ + "_all.jsonl", "w") as f_out: 43 | for f in files: 44 | with open(f, "r") as f_in: 45 | for line in f_in: 46 | f_out.write(line) 47 | 48 | print("Gathering finished. Saved in {}".format(output_prefix_ + "_all.jsonl")) 49 | 50 | 51 | def main(): 52 | fire.Fire(gather_output) 53 | 54 | 55 | if __name__ == "__main__": 56 | sys.exit(main()) 57 | -------------------------------------------------------------------------------- /codegeex/benchmark/humaneval-x/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/benchmark/humaneval-x/__init__.py -------------------------------------------------------------------------------- /codegeex/benchmark/humaneval-x/cpp/data/humaneval_cpp.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/benchmark/humaneval-x/cpp/data/humaneval_cpp.jsonl.gz -------------------------------------------------------------------------------- /codegeex/benchmark/humaneval-x/cpp/evaluation/test.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Input to this function is a string containing multiple groups of nested parentheses. Your goal is to 3 | separate those group into separate strings and return the vector of those. 4 | Separate groups are balanced (each open brace is properly closed) and not nested within each other 5 | Ignore any spaces in the input string. 6 | >>> separate_paren_groups("( ) (( )) (( )( ))") 7 | {"()", "(())", "(()())"} 8 | */ 9 | #include 10 | #include 11 | #include 12 | using namespace std; 13 | vector separate_paren_groups(string paren_string){ 14 | 15 | vector all_parens; 16 | string current_paren; 17 | int level=0; 18 | char chr; 19 | int i; 20 | for (i=0;i 43 | bool issame(vector a,vectorb){ 44 | if (a.size()!=b.size()) return false; 45 | for (int i=0;i np.ndarray: 32 | """ 33 | Estimates pass@k of each problem and returns them in an array. 34 | """ 35 | 36 | def estimator(n: int, c: int, k: int) -> float: 37 | """ 38 | Calculates 1 - comb(n - c, k) / comb(n, k). 39 | """ 40 | if n - c < k: 41 | return 1.0 42 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 43 | 44 | if isinstance(num_samples, int): 45 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 46 | else: 47 | assert len(num_samples) == len(num_correct) 48 | num_samples_it = iter(num_samples) 49 | 50 | return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]) 51 | -------------------------------------------------------------------------------- /codegeex/benchmark/rust/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "0.7.20" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "fuchsia-cprng" 16 | version = "0.1.1" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" 19 | 20 | [[package]] 21 | name = "libc" 22 | version = "0.2.139" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" 25 | 26 | [[package]] 27 | name = "md5" 28 | version = "0.7.0" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" 31 | 32 | [[package]] 33 | name = "memchr" 34 | version = "2.5.0" 35 | source = "registry+https://github.com/rust-lang/crates.io-index" 36 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 37 | 38 | [[package]] 39 | name = "rand" 40 | version = "0.4.6" 41 | source = "registry+https://github.com/rust-lang/crates.io-index" 42 | checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" 43 | dependencies = [ 44 | "fuchsia-cprng", 45 | "libc", 46 | "rand_core 0.3.1", 47 | "rdrand", 48 | "winapi", 49 | ] 50 | 51 | [[package]] 52 | name = "rand_core" 53 | version = "0.3.1" 54 | source = "registry+https://github.com/rust-lang/crates.io-index" 55 | checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" 56 | dependencies = [ 57 | "rand_core 0.4.2", 58 | ] 59 | 60 | [[package]] 61 | name = "rand_core" 62 | version = "0.4.2" 63 | source = "registry+https://github.com/rust-lang/crates.io-index" 64 | checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" 65 | 66 | [[package]] 67 | name = "rdrand" 68 | version = "0.4.0" 69 | source = "registry+https://github.com/rust-lang/crates.io-index" 70 | checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" 71 | dependencies = [ 72 | "rand_core 0.3.1", 73 | ] 74 | 75 | [[package]] 76 | name = "regex" 77 | version = "1.7.1" 78 | source = "registry+https://github.com/rust-lang/crates.io-index" 79 | checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" 80 | dependencies = [ 81 | "aho-corasick", 82 | "memchr", 83 | "regex-syntax", 84 | ] 85 | 86 | [[package]] 87 | name = "regex-syntax" 88 | version = "0.6.28" 89 | source = "registry+https://github.com/rust-lang/crates.io-index" 90 | checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" 91 | 92 | [[package]] 93 | name = "rust" 94 | version = "0.1.0" 95 | dependencies = [ 96 | "md5", 97 | "rand", 98 | "regex", 99 | ] 100 | 101 | [[package]] 102 | name = "winapi" 103 | version = "0.3.9" 104 | source = "registry+https://github.com/rust-lang/crates.io-index" 105 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 106 | dependencies = [ 107 | "winapi-i686-pc-windows-gnu", 108 | "winapi-x86_64-pc-windows-gnu", 109 | ] 110 | 111 | [[package]] 112 | name = "winapi-i686-pc-windows-gnu" 113 | version = "0.4.0" 114 | source = "registry+https://github.com/rust-lang/crates.io-index" 115 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 116 | 117 | [[package]] 118 | name = 
"winapi-x86_64-pc-windows-gnu" 119 | version = "0.4.0" 120 | source = "registry+https://github.com/rust-lang/crates.io-index" 121 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 122 | -------------------------------------------------------------------------------- /codegeex/benchmark/rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rust" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | rand = "0.4" 10 | regex = "1" 11 | md5 = "0.7.0" 12 | 13 | -------------------------------------------------------------------------------- /codegeex/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/data/__init__.py -------------------------------------------------------------------------------- /codegeex/data/data_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gzip 3 | import json 4 | 5 | from typing import * 6 | 7 | LANGUAGE_TAG = { 8 | "c" : "// language: C", 9 | "c++" : "// language: C++", 10 | "cpp" : "// language: C++", 11 | "c#" : "// language: C#", 12 | "csharp" : "// language: C#", 13 | "css" : "/* language: CSS */", 14 | "cuda" : "// language: Cuda", 15 | "dart" : "// language: Dart", 16 | "lua" : "// language: Lua", 17 | "objectivec" : "// language: Objective-C", 18 | "objective-c" : "// language: Objective-C", 19 | "objective-c++": "// language: Objective-C++", 20 | "python" : "# language: Python", 21 | "perl" : "# language: Perl", 22 | "prolog" : f"% language: Prolog", 23 | "swift" : "// language: swift", 24 | "lisp" : "; language: Lisp", 25 | "java" : "// language: Java", 26 | "scala" : "// language: Scala", 27 | "tex" : f"% language: TeX", 28 | "vue" : "", 29 | "markdown" : "", 30 | "html" : "", 31 | "php" : "// language: PHP", 32 | "js" : "// language: JavaScript", 33 | "javascript" : "// language: JavaScript", 34 | "typescript" : "// language: TypeScript", 35 | "go" : "// language: Go", 36 | "shell" : "# language: Shell", 37 | "rust" : "// language: Rust", 38 | "sql" : "-- language: SQL", 39 | "kotlin" : "// language: Kotlin", 40 | "vb" : "' language: Visual Basic", 41 | "ruby" : "# language: Ruby", 42 | "pascal" : "// language: Pascal", 43 | "r" : "# language: R", 44 | "fortran" : "!language: Fortran", 45 | "lean" : "-- language: Lean", 46 | "matlab" : f"% language: Matlab", 47 | "delphi" : "{language: Delphi}", 48 | "scheme" : "; language: Scheme", 49 | "basic" : "' language: Basic", 50 | "assembly" : "; language: Assembly", 51 | "groovy" : "// language: Groovy", 52 | "abap" : "* language: Abap", 53 | "gdscript" : "# language: GDScript", 54 | "haskell" : "-- language: Haskell", 55 | "julia" : "# language: Julia", 56 | "elixir" : "# language: Elixir", 57 | "excel" : "' language: Excel", 58 | "clojure" : "; language: Clojure", 59 | "actionscript" : "// language: ActionScript", 60 | "solidity" : "// language: Solidity", 61 | "powershell" : "# language: PowerShell", 62 | "erlang" : f"% language: Erlang", 63 | "cobol" : "// language: Cobol", 64 | } 65 | 66 | 67 | def stream_jsonl(filename: str) -> Iterable[Dict]: 68 | """ 69 | Parses each jsonl line and yields it as a dictionary 70 | """ 71 | if filename.endswith(".gz"): 72 | with open(filename, "rb") 
as gzfp: 73 | with gzip.open(gzfp, "rt") as fp: 74 | for line in fp: 75 | if any(not x.isspace() for x in line): 76 | yield json.loads(line) 77 | else: 78 | with open(filename, "r") as fp: 79 | for line in fp: 80 | if any(not x.isspace() for x in line): 81 | yield json.loads(line) 82 | 83 | 84 | def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False): 85 | """ 86 | Writes an iterable of dictionaries to jsonl 87 | """ 88 | if append: 89 | mode = "ab" 90 | else: 91 | mode = "wb" 92 | filename = os.path.expanduser(filename) 93 | if filename.endswith(".gz"): 94 | with open(filename, mode) as fp: 95 | with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp: 96 | for x in data: 97 | gzfp.write((json.dumps(x) + "\n").encode("utf-8")) 98 | else: 99 | with open(filename, mode) as fp: 100 | for x in data: 101 | fp.write((json.dumps(x) + "\n").encode("utf-8")) 102 | 103 | 104 | def sliding_window( 105 | prompt_tokens: list, 106 | code_tokens: list, 107 | seq_len: int, 108 | sliding_stride: int, 109 | minimum_code_len: int = 1, 110 | ) -> Iterable[Tuple[list, list]]: 111 | """ 112 | Generate a series of (prompt, code) pairs by sliding the window over the code. 113 | """ 114 | prompt_len = len(prompt_tokens) 115 | code_len = len(code_tokens) 116 | total_len = prompt_len + code_len 117 | 118 | start_idx = max(0, prompt_len - seq_len + minimum_code_len) # at least `minimum_code_len` code token should be in the window 119 | end_idx = max(0, total_len - seq_len) 120 | start_idx = min(start_idx, end_idx) 121 | 122 | for i in range(start_idx, end_idx + 1, sliding_stride): 123 | current_prompt = prompt_tokens[i:i + seq_len] 124 | current_code = code_tokens[max(i - prompt_len, 0):i - prompt_len + seq_len] 125 | yield current_prompt, current_code 126 | 127 | if (end_idx - start_idx) % sliding_stride != 0: 128 | current_prompt = prompt_tokens[end_idx:end_idx + seq_len] 129 | current_code = code_tokens[max(end_idx - prompt_len, 0):end_idx - prompt_len + seq_len] 130 | yield current_prompt, current_code 131 | -------------------------------------------------------------------------------- /codegeex/data/process_pretrain_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import fire 4 | import torch 5 | import multiprocessing 6 | 7 | from typing import * 8 | from tqdm.auto import tqdm 9 | from time import perf_counter 10 | from black import format_str, FileMode 11 | 12 | from codegeex.data.types import PromptDataset, PromptSample 13 | from codegeex.data.processor import PromptDatasetProcessor 14 | from codegeex.data.data_utils import stream_jsonl, LANGUAGE_TAG 15 | from codegeex.megatron.data.indexed_dataset import make_mmap_builder 16 | from codegeex.tokenizer import CodeGeeXTokenizer 17 | 18 | 19 | def try_format_code(code: str): 20 | # Auto-correct to PEP8 format (Change tab to 4-whitespaces; 21 | # add whitespace around some special symbols; 22 | # reformat line length < 100, etc.) 
23 | try: 24 | res = format_str(code, mode=FileMode(line_length=200)) 25 | except Exception as e: 26 | res = code 27 | print(e) 28 | print("Wrong python format: {}".format(code)) 29 | return res 30 | 31 | 32 | def load_pretrain_dataset(dataset_path: Union[str, List[str]]) -> Dict: 33 | if type(dataset_path) is str: 34 | dataset_path = [dataset_path] 35 | 36 | for p in dataset_path: 37 | if not os.path.isdir(p): 38 | if p.endswith(".gz") or p.endswith(".jsonl"): 39 | print(f"loading from {p}") 40 | yield from stream_jsonl(p) 41 | else: 42 | p_list = glob.glob(p + "/*") 43 | for p_ in p_list: 44 | if p_.endswith(".gz") or p_.endswith(".jsonl"): 45 | print(f"loading from {p_}") 46 | yield from stream_jsonl(p_) 47 | 48 | 49 | def process_sample( 50 | sample: Dict, 51 | language: str=None, 52 | mode: str="pretrain", 53 | ) -> Iterable[PromptSample]: 54 | if mode == "pretrain": 55 | prompt = "" 56 | else: 57 | prompt = sample["prompt"] 58 | 59 | try: 60 | if language is not None and language in LANGUAGE_TAG.keys(): 61 | code = LANGUAGE_TAG[language] + "\n" + sample["code"] 62 | else: 63 | code = sample["code"] 64 | except Exception as e: 65 | print(e) 66 | print("The key 'code' is missing in data. Aborted") 67 | exit(0) 68 | 69 | yield PromptSample(prompt, code) 70 | 71 | 72 | def generate_prompt_samples( 73 | dataset: Iterable[Dict], 74 | language: str = None, 75 | mode: str = "pretrain", 76 | ) -> PromptDataset: 77 | for sample in dataset: 78 | yield from process_sample(sample, language, mode) 79 | 80 | 81 | def main( 82 | tokenizer_path: str, 83 | dataset_path: Union[str, List[str]], 84 | output_prefix: str, 85 | language: str = None, 86 | mode: str = "pretrain", 87 | discard_overlong: bool = False, 88 | sliding_stride: int = 200, 89 | num_workers: int = 32, 90 | seq_len: int = 2048, 91 | ): 92 | DATA_KEYS = ["input_ids", "attention_mask", "labels"] 93 | 94 | # create output dir 95 | os.makedirs(os.path.dirname(output_prefix), exist_ok=True) 96 | 97 | tokenizer = CodeGeeXTokenizer(tokenizer_path=tokenizer_path) 98 | pad_token_id = tokenizer.eos_token_id 99 | 100 | dataset = load_pretrain_dataset(dataset_path) 101 | prompt_dataset = generate_prompt_samples(dataset, language=language, mode=mode) 102 | 103 | if num_workers == 0: 104 | num_workers = multiprocessing.cpu_count() 105 | pool = multiprocessing.Pool(num_workers) 106 | output_bin_files = {} 107 | output_idx_files = {} 108 | builders = {} 109 | 110 | for key in DATA_KEYS: 111 | output_bin_files[key] = "{}_{}.bin".format(output_prefix, key) 112 | output_idx_files[key] = "{}_{}.idx".format(output_prefix, key) 113 | builders[key] = make_mmap_builder( 114 | output_bin_files[key], 115 | vocab_size=None, # magic number, should change it 116 | ) 117 | 118 | # NOTE that we use seq_len + 1 instead of seq_len, since the input tokens will be shifted by one. 
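# (Illustrative note, not part of the original file:) keeping seq_len + 1 tokens per sample
# means that after the usual causal-LM shift, e.g.
#     inputs  = tokens[:-1]
#     targets = tokens[1:]
# there are still exactly seq_len (input, target) positions to train on.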
119 | processor = PromptDatasetProcessor( 120 | tokenize=tokenizer.encode_code, 121 | pad_token=pad_token_id, 122 | max_seq_len=seq_len + 1, 123 | discard_overlong=discard_overlong, 124 | sliding_stride=sliding_stride, 125 | eod_token=pad_token_id) 126 | 127 | processor.start_time = perf_counter() 128 | doc_iter = pool.imap_unordered(processor.process_sample_strict, 129 | prompt_dataset, 130 | chunksize=20) 131 | 132 | for doc_idx, docs in tqdm(enumerate(doc_iter, start=1)): 133 | processor.doc_processed += 1 134 | for doc in docs: 135 | processor.doc_generated += 1 136 | for key in DATA_KEYS: 137 | builders[key].add_item(torch.IntTensor(doc[key])) 138 | 139 | for key in DATA_KEYS: 140 | builders[key].finalize(output_idx_files[key]) 141 | 142 | 143 | if __name__ == "__main__": 144 | fire.Fire(main) 145 | -------------------------------------------------------------------------------- /codegeex/data/types.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass 6 | class PromptSample: 7 | prompt: str 8 | code: str 9 | extra: dict = None 10 | 11 | 12 | PromptDataset = Iterable[PromptSample] 13 | 14 | @dataclass 15 | class LabelSample: 16 | prompt: str 17 | label: int 18 | extra: dict = None 19 | 20 | LabelDataset = Iterable[LabelSample] -------------------------------------------------------------------------------- /codegeex/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime 2 | 3 | RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak \ 4 | && sed -i "s@http://.*archive.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list \ 5 | && sed -i "s@http://.*security.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list \ 6 | && pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple \ 7 | && apt-get update && apt-get install -y curl npm git nano \ 8 | && pip install fire zmq transformers tokenizers \ 9 | && mkdir /workspace/download 10 | 11 | RUN curl -o /workspace/download/go.tar.gz -SL https://go.dev/dl/go1.18.4.linux-amd64.tar.gz \ 12 | && tar -zxf /workspace/download/go.tar.gz -C /usr/local && rm /workspace/download/go.tar.gz 13 | ENV PATH=/bin:/usr/local/go/bin:$PATH 14 | 15 | RUN curl -o /workspace/download/node.tar.gz -SL https://nodejs.org/download/release/v16.14.0/node-v16.14.0-linux-x64.tar.gz \ 16 | && mkdir -p /usr/local/lib/nodejs && tar -zxf /workspace/download/node.tar.gz -C /usr/local/lib/nodejs && mv /usr/local/lib/nodejs/node-v16.14.0-linux-x64 /usr/local/lib/nodejs/node \ 17 | && rm /workspace/download/node.tar.gz && npm install -g js-md5@0.7.3 18 | ENV PATH=/usr/local/lib/nodejs/node/bin:$PATH 19 | ENV NODE_PATH=/usr/local/lib/node_modules 20 | 21 | RUN apt-get install -y build-essential && apt-get install -y g++ \ 22 | && curl -o /workspace/download/boost_1_71_0.tar.gz -SL https://boostorg.jfrog.io/artifactory/main/release/1.71.0/source/boost_1_71_0.tar.gz \ 23 | && tar -zxf /workspace/download/boost_1_71_0.tar.gz && rm /workspace/download/boost_1_71_0.tar.gz && cd boost_1_71_0 \ 24 | && ./bootstrap.sh --prefix=/usr/ && ./b2 && ./b2 install \ 25 | && cd .. 
&& rm -r boost_1_71_0 26 | RUN curl -o /workspace/download/openssl.tar.gz -SL https://www.openssl.org/source/old/3.0/openssl-3.0.0.tar.gz \ 27 | && tar -zxf /workspace/download/openssl.tar.gz && cd openssl-3.0.0 && ./Configure && make && make install \ 28 | && rm /workspace/download/openssl.tar.gz && rm -r /workspace/openssl-3.0.0 29 | ENV PATH=/usr/bin/openssl:$PATH 30 | 31 | RUN curl -o /workspace/download/jdk.tar.gz -SL https://download.oracle.com/java/18/latest/jdk-18_linux-x64_bin.tar.gz \ 32 | && mkdir /usr/java && tar -zxf /workspace/download/jdk.tar.gz -C /usr/java && rm /workspace/download/jdk.tar.gz \ 33 | && java_path=`ls /usr/java/${path}` && echo "export JAVA_HOME=/usr/java/${java_path}" >> ~/.profile 34 | 35 | RUN git clone https://github.com/THUDM/CodeGeeX.git && pip install -e /workspace/CodeGeeX \ 36 | && tar -zxf /workspace/CodeGeeX/codegeex/benchmark/humaneval-x/go/evaluation/vendor.tar.gz -C /workspace/CodeGeeX/codegeex/benchmark/humaneval-x/go/evaluation 37 | 38 | SHELL ["sh", "-lc"] 39 | RUN update-alternatives --install /usr/bin/java java $JAVA_HOME/bin/java 20000 \ 40 | && update-alternatives --install /usr/bin/javac javac $JAVA_HOME/bin/javac 20000 \ 41 | && rm -r /workspace/download \ 42 | && env 43 | -------------------------------------------------------------------------------- /codegeex/kernels/__init__.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | import torch 3 | import ctypes 4 | 5 | from typing import List 6 | from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up 7 | 8 | RESOURCE_PACKAGE_NAME = __name__ 9 | 10 | 11 | class Kernel: 12 | def __init__(self, filename: str, function_names: List[str]): 13 | filename = filename + ".fatbin" 14 | if not pkg_resources.resource_exists(RESOURCE_PACKAGE_NAME, filename): 15 | raise RuntimeError("File `%s` not found in `%s`" % (filename, RESOURCE_PACKAGE_NAME)) 16 | self.filename = filename 17 | self.code = pkg_resources.resource_string(RESOURCE_PACKAGE_NAME, filename) 18 | self._function_names = function_names 19 | self._cmodule = LazyKernelCModule(self.code) 20 | 21 | for name in self._function_names: 22 | setattr(self, name, KernelFunction(self._cmodule, name)) 23 | 24 | 25 | kernels = Kernel( 26 | "quantization", 27 | [ 28 | "int4WeightCompression", 29 | "int4WeightExtractionFloat", 30 | "int4WeightExtractionHalf", 31 | "int8WeightExtractionFloat", 32 | "int8WeightExtractionHalf", 33 | ], 34 | ) 35 | 36 | 37 | def compress_int4_weight(weight: torch.Tensor): # (n, m) 38 | with torch.cuda.device(weight.device): 39 | n, m = weight.size(0), weight.size(1) 40 | assert m % 2 == 0 41 | m = m // 2 42 | out = torch.empty(n, m, dtype=torch.int8, device="cuda") 43 | stream = torch.cuda.current_stream() 44 | 45 | gridDim = (n, 1, 1) 46 | blockDim = (min(round_up(m, 32), 1024), 1, 1) 47 | 48 | kernels.int4WeightCompression( 49 | gridDim, 50 | blockDim, 51 | 0, 52 | stream, 53 | [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], 54 | ) 55 | return out 56 | 57 | 58 | def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int): 59 | if source_bit_width == 8: 60 | func = kernels.int8WeightExtractionHalf 61 | elif source_bit_width == 4: 62 | func = kernels.int4WeightExtractionHalf 63 | else: 64 | assert False, "Unsupported bit-width" 65 | 66 | with torch.cuda.device(weight.device): 67 | n, m = weight.size(0), weight.size(1) 68 | out = 
torch.empty(n, m * (8 // source_bit_width), dtype=torch.half, device="cuda") 69 | stream = torch.cuda.current_stream() 70 | 71 | gridDim = (n, 1, 1) 72 | blockDim = (min(round_up(m, 32), 1024), 1, 1) 73 | 74 | func( 75 | gridDim, 76 | blockDim, 77 | 0, 78 | stream, 79 | [ 80 | ctypes.c_void_p(weight.data_ptr()), 81 | ctypes.c_void_p(scale_list.data_ptr()), 82 | ctypes.c_void_p(out.data_ptr()), 83 | ctypes.c_int32(n), 84 | ctypes.c_int32(m), 85 | ], 86 | ) 87 | return out 88 | 89 | 90 | if __name__ == "__main__": 91 | weight = torch.randn(4, 32).to(torch.int8).cuda() 92 | scale = torch.ones(weight.size(0)).to(torch.half).cuda() 93 | 94 | print(weight) 95 | b = compress_int4_weight(weight) 96 | print(b) 97 | 98 | a = extract_weight_to_half(b, scale, source_bit_width=4) 99 | print(a) 100 | -------------------------------------------------------------------------------- /codegeex/kernels/quantization.fatbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/kernels/quantization.fatbin -------------------------------------------------------------------------------- /codegeex/megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | import torch 16 | 17 | from .global_vars import get_args 18 | from .global_vars import get_current_global_batch_size 19 | from .global_vars import get_num_microbatches 20 | from .global_vars import update_num_microbatches 21 | from .global_vars import get_tokenizer 22 | from .global_vars import get_tensorboard_writer 23 | from .global_vars import get_adlr_autoresume 24 | from .global_vars import get_timers 25 | from .initialize import initialize_megatron 26 | 27 | 28 | def print_rank_0(message): 29 | """If distributed is initialized, print only on rank 0.""" 30 | if torch.distributed.is_initialized(): 31 | if torch.distributed.get_rank() == 0: 32 | print(message, flush=True) 33 | else: 34 | print(message, flush=True) 35 | 36 | 37 | def is_last_rank(): 38 | return torch.distributed.get_rank() == (torch.distributed.get_world_size() - 1) 39 | 40 | 41 | def print_rank_last(message): 42 | """If distributed is initialized, print only on last rank.""" 43 | if torch.distributed.is_initialized(): 44 | if is_last_rank(): 45 | print(message, flush=True) 46 | else: 47 | print(message, flush=True) 48 | -------------------------------------------------------------------------------- /codegeex/megatron/convert_ckpt_parallel.py: -------------------------------------------------------------------------------- 1 | """Get model parallel partitions.""" 2 | 3 | import os 4 | import torch 5 | import argparse 6 | 7 | 8 | def get_change_ckpt_args(parser): 9 | """Provide extra arguments required for merging.""" 10 | group = parser.add_argument_group(title='Mindspore to megatron') 11 | group.add_argument( 12 | '--load-ckpt-path', 13 | type=str, 14 | required=True, 15 | help='path to load ".pt" checkpoint.', 16 | ) 17 | group.add_argument( 18 | '--save-ckpt-path', 19 | type=str, 20 | required=True, 21 | help='dir to save converted checkpoints.', 22 | ) 23 | group.add_argument( 24 | '--target-tensor-model-parallel-size', 25 | type=int, 26 | default=2, 27 | help='target tensor model parallel size', 28 | ) 29 | 30 | return parser 31 | 32 | 33 | def get_element_from_dict_by_path(d, path): 34 | """ 35 | Get element from dictionary by path. If element is not present, recursively add empty dictionaries. 36 | Args: 37 | d (dict): the dictionary to get the element from 38 | path (list): the path to the element which is delimited by "." 
39 | """ 40 | path = path.split(".") 41 | for k in path: 42 | if k not in d: 43 | d[k] = {} 44 | d = d[k] 45 | return d 46 | 47 | 48 | def main(): 49 | parser = argparse.ArgumentParser() 50 | parser = get_change_ckpt_args(parser) 51 | args, _ = parser.parse_known_args() 52 | 53 | print(f"Load ckpt from {args.load_ckpt_path}...") 54 | state_dict = torch.load(args.load_ckpt_path, map_location="cpu") 55 | 56 | print(f"Spliting ckpt into {args.target_tensor_model_parallel_size} parts...") 57 | output_state_dict = [] 58 | for i in range(args.target_tensor_model_parallel_size): 59 | output_state_dict.append({}) 60 | 61 | print("Converting Embedding layers...") 62 | word_embeddings = state_dict['module']['language_model']['embedding']['word_embeddings']['weight'] 63 | position_embeddings = state_dict['module']['language_model']['embedding']['position_embeddings']['weight'] 64 | out_word_embeddings = torch.chunk(word_embeddings, args.target_tensor_model_parallel_size, dim=0) 65 | 66 | for i in range(args.target_tensor_model_parallel_size): 67 | pos_emb_dict = get_element_from_dict_by_path( 68 | output_state_dict[i], "module.language_model.embedding.position_embeddings" 69 | ) 70 | pos_emb_dict["weight"] = position_embeddings 71 | 72 | word_emb_dict = get_element_from_dict_by_path( 73 | output_state_dict[i], "module.language_model.embedding.word_embeddings" 74 | ) 75 | word_emb_dict["weight"] = out_word_embeddings[i].clone() 76 | 77 | print("Converting QueryEmbedding layers...") 78 | query_embeddings = state_dict['module']['language_model']['topQueryEmbedding']['top_query_embeddings']['weight'] 79 | out_query_embeddings = torch.chunk(query_embeddings, args.target_tensor_model_parallel_size, dim=0) 80 | 81 | for i in range(args.target_tensor_model_parallel_size): 82 | query_emb_dict = get_element_from_dict_by_path( 83 | output_state_dict[i], "module.language_model.topQueryEmbedding.top_query_embeddings" 84 | ) 85 | query_emb_dict["weight"] = out_query_embeddings[i].clone() 86 | 87 | print("Converting Transformer layers...") 88 | for layer_name in state_dict['module']['language_model']['transformer'].keys(): 89 | params = state_dict['module']['language_model']['transformer'][layer_name] 90 | if "layernorm" in layer_name: 91 | pass 92 | elif "attention" in layer_name and "weight" in layer_name: 93 | if "dense" in layer_name: 94 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=1) 95 | else: 96 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=0) 97 | elif "weight" in layer_name and "dense" in layer_name: 98 | if "h_to_4h" in layer_name: 99 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=0) 100 | else: 101 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=1) 102 | elif "bias" in layer_name: 103 | if "dense" not in layer_name or "mlp" in layer_name: 104 | if "4h_to_h" in layer_name: 105 | pass 106 | else: 107 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=0) 108 | 109 | for i in range(args.target_tensor_model_parallel_size): 110 | params_dict = get_element_from_dict_by_path(output_state_dict[i], "module.language_model.transformer") 111 | if type(params) is tuple: 112 | params_dict[layer_name] = params[i].clone() 113 | else: 114 | params_dict[layer_name] = params 115 | 116 | os.makedirs(args.save_ckpt_path, exist_ok=True) 117 | for rank in range(args.target_tensor_model_parallel_size): 118 | save_ckpt_path = os.path.join(args.save_ckpt_path, 
f"mp_rank_{rank:02d}_model_states.pt") 119 | torch.save(output_state_dict[rank], save_ckpt_path) 120 | print(f"Converted checkpoint saved in {save_ckpt_path}.") 121 | 122 | 123 | if __name__ == '__main__': 124 | main() 125 | -------------------------------------------------------------------------------- /codegeex/megatron/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/megatron/data/__init__.py -------------------------------------------------------------------------------- /codegeex/megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Blendable dataset.""" 17 | 18 | import time 19 | import torch 20 | import numpy as np 21 | 22 | from codegeex.megatron import print_rank_0 23 | 24 | 25 | class BlendableDataset(torch.utils.data.Dataset): 26 | def __init__(self, datasets, weights): 27 | 28 | self.datasets = datasets 29 | num_datasets = len(datasets) 30 | assert num_datasets == len(weights) 31 | 32 | self.size = 0 33 | for dataset in self.datasets: 34 | self.size += len(dataset) 35 | 36 | # Normalize weights. 37 | weights = np.array(weights, dtype=np.float64) 38 | sum_weights = np.sum(weights) 39 | assert sum_weights > 0.0 40 | weights /= sum_weights 41 | 42 | # Build indecies. 43 | start_time = time.time() 44 | assert num_datasets < 255 45 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 46 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 47 | 48 | from megatron.data import helpers 49 | 50 | helpers.build_blending_indices( 51 | self.dataset_index, 52 | self.dataset_sample_index, 53 | weights, 54 | num_datasets, 55 | self.size, 56 | torch.distributed.get_rank() == 0, 57 | ) 58 | print_rank_0( 59 | "> elapsed time for building blendable dataset indices: " 60 | "{:.2f} (sec)".format(time.time() - start_time) 61 | ) 62 | 63 | def __len__(self): 64 | return self.size 65 | 66 | def __getitem__(self, idx): 67 | dataset_idx = self.dataset_index[idx] 68 | sample_idx = self.dataset_sample_index[idx] 69 | return self.datasets[dataset_idx][sample_idx] 70 | -------------------------------------------------------------------------------- /codegeex/megatron/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import enum 17 | 18 | 19 | class LayerType(enum.Enum): 20 | encoder = 1 21 | decoder = 2 22 | 23 | 24 | class AttnType(enum.Enum): 25 | self_attn = 1 26 | cross_attn = 2 27 | 28 | 29 | class AttnMaskType(enum.Enum): 30 | padding = 1 31 | causal = 2 32 | -------------------------------------------------------------------------------- /codegeex/megatron/memory.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | 20 | # A dictionary of all the memory buffers allocated. 21 | _MEM_BUFFS = dict() 22 | 23 | 24 | def allocate_mem_buff(name, numel, dtype, track_usage): 25 | """Allocate a memory buffer.""" 26 | assert name not in _MEM_BUFFS, "memory buffer {} already allocated.".format(name) 27 | _MEM_BUFFS[name] = MemoryBuffer(name, numel, dtype, track_usage) 28 | return _MEM_BUFFS[name] 29 | 30 | 31 | def get_mem_buff(name): 32 | """Get the memory buffer.""" 33 | return _MEM_BUFFS[name] 34 | 35 | 36 | class MemoryBuffer: 37 | """Contiguous memory buffer. 38 | Allocate a contiguous memory of type `dtype` and size `numel`. It is 39 | used to reduce memory fragmentation. 40 | 41 | Usage: After the allocation, the `_start` index is set tot the first 42 | index of the memory. A memory chunk starting from `_start` index 43 | can be `allocated` for an input tensor, with the elements of the 44 | tensor being coppied. The buffer can be reused by resetting the 45 | `_start` index. 46 | 47 | """ 48 | 49 | def __init__(self, name, numel, dtype, track_usage): 50 | if torch.distributed.get_rank() == 0: 51 | element_size = torch.tensor([], dtype=dtype).element_size() 52 | print( 53 | "> building the {} memory buffer with {} num elements " 54 | "and {} dtype ({:.1f} MB)...".format( 55 | name, numel, dtype, numel * element_size / 1024 / 1024 56 | ), 57 | flush=True, 58 | ) 59 | self.name = name 60 | self.numel = numel 61 | self.dtype = dtype 62 | self.data = torch.empty( 63 | self.numel, 64 | dtype=self.dtype, 65 | device=torch.cuda.current_device(), 66 | requires_grad=False, 67 | ) 68 | 69 | # Index tracking the start of the free memory. 70 | self._start = 0 71 | 72 | # Values used for tracking usage. 
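        # (When enabled, get_data() below accumulates both the number of
        # elements currently in use and the total buffer capacity, so that
        # print_average_usage() can report average utilization over time.)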
73 | self.track_usage = track_usage 74 | if self.track_usage: 75 | self.in_use_value = 0.0 76 | self.total_value = 0.0 77 | 78 | def reset(self): 79 | """Reset the buffer start index to the beginning of the buffer.""" 80 | self._start = 0 81 | 82 | def is_in_use(self): 83 | """Whether the current buffer hold on to any memory.""" 84 | return self._start > 0 85 | 86 | def numel_in_use(self): 87 | """Return number of elements in use.""" 88 | return self._start 89 | 90 | def add(self, tensor): 91 | """Allocate a chunk of memory from the buffer to tensor and copy 92 | the values.""" 93 | assert ( 94 | tensor.dtype == self.dtype 95 | ), "Input tensor type {} different from buffer type {}".format( 96 | tensor.dtype, self.dtype 97 | ) 98 | # Number of elements of the input tensor. 99 | tensor_numel = torch.numel(tensor) 100 | new_start = self._start + tensor_numel 101 | assert ( 102 | new_start <= self.numel 103 | ), "Not enough memory left in the buffer ({} > {})".format( 104 | tensor_numel, self.numel - self._start 105 | ) 106 | # New tensor is a view into the memory. 107 | new_tensor = self.data[self._start : new_start] 108 | self._start = new_start 109 | new_tensor = new_tensor.view(tensor.shape) 110 | new_tensor.copy_(tensor) 111 | # Return a pointer to the new tensor. 112 | return new_tensor 113 | 114 | def get_data(self): 115 | """Return the data currently in use.""" 116 | if self.track_usage: 117 | self.in_use_value += float(self._start) 118 | self.total_value += float(self.numel) 119 | return self.data[: self._start] 120 | 121 | def print_average_usage(self): 122 | """Print memory usage average over time. We would like this value 123 | to be as high as possible.""" 124 | assert self.track_usage, "You need to enable track usage." 125 | if torch.distributed.get_rank() == 0: 126 | print( 127 | " > usage of {} memory buffer: {:.2f} %".format( 128 | self.name, self.in_use_value * 100.0 / self.total_value 129 | ), 130 | flush=True, 131 | ) 132 | 133 | 134 | class RingMemBuffer: 135 | """A ring of memory buffers.""" 136 | 137 | def __init__(self, name, num_buffers, numel, dtype, track_usage): 138 | self.num_buffers = num_buffers 139 | self.buffers = [ 140 | allocate_mem_buff(name + " {}".format(i), numel, dtype, track_usage) 141 | for i in range(num_buffers) 142 | ] 143 | self._index = -1 144 | 145 | def get_next_buffer(self): 146 | self._index += 1 147 | self._index = self._index % self.num_buffers 148 | buff = self.buffers[self._index] 149 | assert not buff.is_in_use(), "buffer is already in use." 150 | return buff 151 | -------------------------------------------------------------------------------- /codegeex/megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
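A minimal usage sketch for the MemoryBuffer API defined in codegeex/megatron/memory.py above (illustrative only: it assumes torch.distributed is already initialized, a CUDA device is available, and the buffer name, size, and tensor are placeholders rather than values from the repository):

import torch
from codegeex.megatron.memory import allocate_mem_buff

# Reserve one contiguous fp16 buffer; track_usage=True enables the statistics.
buf = allocate_mem_buff("activations", numel=1 << 20, dtype=torch.half, track_usage=True)

x = torch.randn(128, 1024, dtype=torch.half, device="cuda")
view = buf.add(x)            # copies x into the buffer and returns a view shaped like x
used = buf.get_data()        # flat view of everything allocated so far
buf.print_average_usage()    # average in-use percentage (requires track_usage=True)
buf.reset()                  # start reusing the buffer from the beginning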
15 | 16 | from torch.nn import LayerNorm 17 | from .distributed import DistributedDataParallel 18 | from .codegeex_model import CodeGeeXModel 19 | from .language_model import get_language_model 20 | from .module import Float16Module -------------------------------------------------------------------------------- /codegeex/megatron/model/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Utilities for models.""" 17 | 18 | import math 19 | import torch 20 | 21 | 22 | def init_method_normal(sigma): 23 | """Init method based on N(0, sigma).""" 24 | 25 | def init_(tensor): 26 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 27 | 28 | return init_ 29 | 30 | 31 | def scaled_init_method_normal(sigma, num_layers): 32 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 33 | std = sigma / math.sqrt(2.0 * num_layers) 34 | 35 | def init_(tensor): 36 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 37 | 38 | return init_ 39 | 40 | 41 | def attention_mask_func(attention_scores, attention_mask): 42 | attention_scores.masked_fill_(attention_mask, -10000.0) 43 | 44 | return attention_scores 45 | 46 | 47 | def get_linear_layer(rows, columns, init_method): 48 | """Simple linear layer with weight initialization.""" 49 | layer = torch.nn.Linear(rows, columns) 50 | init_method(layer.weight) 51 | with torch.no_grad(): 52 | layer.bias.zero_() 53 | return layer 54 | 55 | 56 | def fast_gelu(x): 57 | """Mindspore's fast gelu implementation.""" 58 | return x / (1 + torch.exp(-1.702 * torch.abs(x))) * torch.exp(0.851 * (x - torch.abs(x))) 59 | 60 | 61 | @torch.jit.script 62 | def gelu_impl(x): 63 | """OpenAI's gelu implementation.""" 64 | return ( 65 | 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) 66 | ) 67 | 68 | 69 | def openai_gelu(x): 70 | return gelu_impl(x) 71 | 72 | 73 | # This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 74 | @torch.jit.script 75 | def erf_gelu(x): 76 | return ( 77 | x 78 | * 0.5 79 | * ( 80 | torch.erf(x / 1.41421).to(dtype=x.dtype) 81 | + torch.ones_like(x).to(dtype=x.dtype) 82 | ) 83 | ) 84 | -------------------------------------------------------------------------------- /codegeex/megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .initialize import is_unitialized 23 | from .initialize import destroy_model_parallel 24 | from .initialize import get_data_parallel_group 25 | from .initialize import get_data_parallel_rank 26 | from .initialize import get_data_parallel_world_size 27 | from .initialize import get_embedding_group 28 | from .initialize import get_model_parallel_group 29 | from .initialize import get_tensor_model_parallel_group 30 | from .initialize import get_pipeline_model_parallel_group 31 | from .initialize import get_tensor_model_parallel_rank, set_tensor_model_parallel_rank 32 | from .initialize import ( 33 | get_pipeline_model_parallel_rank, 34 | set_pipeline_model_parallel_rank, 35 | ) 36 | from .initialize import is_pipeline_first_stage, is_pipeline_last_stage 37 | from .initialize import get_tensor_model_parallel_src_rank 38 | from .initialize import get_pipeline_model_parallel_first_rank 39 | from .initialize import get_pipeline_model_parallel_last_rank 40 | from .initialize import get_pipeline_model_parallel_next_rank 41 | from .initialize import get_pipeline_model_parallel_prev_rank 42 | from .initialize import ( 43 | get_tensor_model_parallel_world_size, 44 | set_tensor_model_parallel_world_size, 45 | ) 46 | from .initialize import ( 47 | get_pipeline_model_parallel_world_size, 48 | set_pipeline_model_parallel_world_size, 49 | ) 50 | from .initialize import ( 51 | get_virtual_pipeline_model_parallel_rank, 52 | set_virtual_pipeline_model_parallel_rank, 53 | ) 54 | from .initialize import initialize_model_parallel 55 | from .initialize import model_parallel_is_initialized 56 | from .initialize import get_model_parallel_world_size, get_model_parallel_rank 57 | 58 | from .layers import ColumnParallelLinear 59 | from .layers import RowParallelLinear 60 | from .layers import VocabParallelEmbedding 61 | from .layers import ( 62 | set_tensor_model_parallel_attributes, 63 | set_defaults_if_not_set_tensor_model_parallel_attributes, 64 | copy_tensor_model_parallel_attributes, 65 | ) 66 | 67 | from .mappings import copy_to_tensor_model_parallel_region 68 | from .mappings import gather_from_tensor_model_parallel_region 69 | from .mappings import reduce_from_tensor_model_parallel_region 70 | from .mappings import scatter_to_tensor_model_parallel_region 71 | 72 | from .random import checkpoint 73 | from .random import get_cuda_rng_tracker 74 | from .random import init_checkpointed_activations_memory_buffer 75 | from .random import model_parallel_cuda_manual_seed 76 | from .random import reset_checkpointed_activations_memory_buffer 77 | from .random import gather_split_1d_tensor 78 | from .random import split_tensor_into_1d_equal_chunks 79 | 80 | from .utils import divide 81 | from .utils import split_tensor_along_last_dim 82 | -------------------------------------------------------------------------------- /codegeex/megatron/mpu/cross_entropy.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | from .initialize import get_tensor_model_parallel_group 20 | from .initialize import get_tensor_model_parallel_rank 21 | from .initialize import get_tensor_model_parallel_world_size 22 | from .utils import VocabUtility 23 | 24 | 25 | class _VocabParallelCrossEntropy(torch.autograd.Function): 26 | @staticmethod 27 | def forward(ctx, vocab_parallel_logits, target): 28 | 29 | # Maximum value along vocab dimension across all GPUs. 30 | logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] 31 | torch.distributed.all_reduce( 32 | logits_max, 33 | op=torch.distributed.ReduceOp.MAX, 34 | group=get_tensor_model_parallel_group(), 35 | ) 36 | # Subtract the maximum value. 37 | vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1)) 38 | 39 | # Get the partition's vocab indecies 40 | get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size 41 | partition_vocab_size = vocab_parallel_logits.size()[-1] 42 | rank = get_tensor_model_parallel_rank() 43 | world_size = get_tensor_model_parallel_world_size() 44 | vocab_start_index, vocab_end_index = get_vocab_range( 45 | partition_vocab_size, rank, world_size 46 | ) 47 | 48 | # Create a mask of valid vocab ids (1 means it needs to be masked). 49 | target_mask = (target < vocab_start_index) | (target >= vocab_end_index) 50 | masked_target = target.clone() - vocab_start_index 51 | masked_target[target_mask] = 0 52 | 53 | # Get predicted-logits = logits[target]. 54 | # For Simplicity, we convert logits to a 2-D tensor with size 55 | # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. 56 | logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) 57 | masked_target_1d = masked_target.view(-1) 58 | arange_1d = torch.arange( 59 | start=0, end=logits_2d.size()[0], device=logits_2d.device 60 | ) 61 | predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] 62 | predicted_logits_1d = predicted_logits_1d.clone().contiguous() 63 | predicted_logits = predicted_logits_1d.view_as(target) 64 | predicted_logits[target_mask] = 0.0 65 | # All reduce is needed to get the chunks from other GPUs. 66 | torch.distributed.all_reduce( 67 | predicted_logits, 68 | op=torch.distributed.ReduceOp.SUM, 69 | group=get_tensor_model_parallel_group(), 70 | ) 71 | 72 | # Sum of exponential of logits along vocab dimension across all GPUs. 73 | exp_logits = vocab_parallel_logits 74 | torch.exp(vocab_parallel_logits, out=exp_logits) 75 | sum_exp_logits = exp_logits.sum(dim=-1) 76 | torch.distributed.all_reduce( 77 | sum_exp_logits, 78 | op=torch.distributed.ReduceOp.SUM, 79 | group=get_tensor_model_parallel_group(), 80 | ) 81 | 82 | # Loss = log(sum(exp(logits))) - predicted-logit. 
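        # This is the standard cross-entropy identity
        #     -log softmax(z)[t] = log(sum_i exp(z_i)) - z_t
        # evaluated with the vocabulary sharded across tensor-parallel ranks:
        # sum_exp_logits and predicted_logits were each all-reduced above, so
        # every rank in the group computes the same loss. (The logits were
        # already shifted by their per-row maximum for numerical stability.)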
83 | loss = torch.log(sum_exp_logits) - predicted_logits 84 | 85 | # Store softmax, target-mask and masked-target for backward pass. 86 | exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) 87 | ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) 88 | 89 | return loss 90 | 91 | @staticmethod 92 | def backward(ctx, grad_output): 93 | 94 | # Retreive tensors from the forward path. 95 | softmax, target_mask, masked_target_1d = ctx.saved_tensors 96 | 97 | # All the inputs have softmax as their gradient. 98 | grad_input = softmax 99 | # For simplicity, work with the 2D gradient. 100 | partition_vocab_size = softmax.size()[-1] 101 | grad_2d = grad_input.view(-1, partition_vocab_size) 102 | 103 | # Add the gradient from matching classes. 104 | arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) 105 | grad_2d[arange_1d, masked_target_1d] -= 1.0 - target_mask.view(-1).float() 106 | 107 | # Finally elementwise multiplication with the output gradients. 108 | grad_input.mul_(grad_output.unsqueeze(dim=-1)) 109 | 110 | return grad_input, None 111 | 112 | 113 | def vocab_parallel_cross_entropy(vocab_parallel_logits, target): 114 | """Helper function for the cross entropy.""" 115 | return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) 116 | -------------------------------------------------------------------------------- /codegeex/megatron/mpu/data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | from .initialize import get_tensor_model_parallel_group 19 | from .initialize import get_tensor_model_parallel_rank 20 | from .initialize import get_tensor_model_parallel_src_rank 21 | 22 | 23 | _MAX_DATA_DIM = 5 24 | 25 | 26 | def _check_data_types(keys, data, target_dtype): 27 | """Check that all the keys have the same target data type.""" 28 | for key in keys: 29 | assert ( 30 | data[key].dtype == target_dtype 31 | ), "{} has data type {} which " "is different than {}".format( 32 | key, data[key].dtype, target_dtype 33 | ) 34 | 35 | 36 | def _build_key_size_numel_dictionaries(keys, data): 37 | """Build the size on rank 0 and broadcast.""" 38 | max_dim = _MAX_DATA_DIM 39 | sizes = [0 for _ in range(max_dim) for _ in keys] 40 | 41 | # Pack the sizes on rank zero. 42 | if get_tensor_model_parallel_rank() == 0: 43 | offset = 0 44 | for key in keys: 45 | assert data[key].dim() < max_dim, "you should increase MAX_DATA_DIM" 46 | size = data[key].size() 47 | for i, s in enumerate(size): 48 | sizes[i + offset] = s 49 | offset += max_dim 50 | 51 | # Move to GPU and broadcast. 52 | sizes_cuda = torch.cuda.LongTensor(sizes) 53 | torch.distributed.broadcast( 54 | sizes_cuda, 55 | get_tensor_model_parallel_src_rank(), 56 | group=get_tensor_model_parallel_group(), 57 | ) 58 | 59 | # Move back to cpu and unpack. 
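    # Each key owns a fixed window of max_dim slots in `sizes`; unused slots
    # remain 0, so the while-loop below reads dimensions until it hits a zero.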
60 | sizes_cpu = sizes_cuda.cpu() 61 | key_size = {} 62 | key_numel = {} 63 | total_numel = 0 64 | offset = 0 65 | for key in keys: 66 | i = 0 67 | size = [] 68 | numel = 1 69 | while sizes_cpu[offset + i] > 0: 70 | this_size = sizes_cpu[offset + i] 71 | size.append(this_size) 72 | numel *= this_size 73 | i += 1 74 | key_size[key] = size 75 | key_numel[key] = numel 76 | total_numel += numel 77 | offset += max_dim 78 | 79 | return key_size, key_numel, total_numel 80 | 81 | 82 | def broadcast_data(keys, data, datatype): 83 | """Broadcast data from rank zero of each model parallel group to the 84 | members of the same model parallel group. 85 | 86 | Arguments: 87 | keys: list of keys in the data disctionary to be broadcasted 88 | data: data dictionary of string keys and cpu tensor values. 89 | datatype: torch data type of all tensors in data associated 90 | with keys. 91 | """ 92 | # Build (key, size) and (key, number of elements) dictionaries along 93 | # with the total number of elements on all ranks. 94 | key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data) 95 | 96 | # Pack on rank zero. 97 | if get_tensor_model_parallel_rank() == 0: 98 | # Check that all keys have the same data type. 99 | _check_data_types(keys, data, datatype) 100 | # Flatten the data associated with the keys 101 | flatten_data = torch.cat( 102 | [data[key].contiguous().view(-1) for key in keys], dim=0 103 | ).cuda() 104 | else: 105 | flatten_data = torch.empty( 106 | total_numel, device=torch.cuda.current_device(), dtype=datatype 107 | ) 108 | 109 | # Broadcast 110 | torch.distributed.broadcast( 111 | flatten_data, 112 | get_tensor_model_parallel_src_rank(), 113 | group=get_tensor_model_parallel_group(), 114 | ) 115 | 116 | # Unpack 117 | output = {} 118 | offset = 0 119 | for key in keys: 120 | size = key_size[key] 121 | numel = key_numel[key] 122 | output[key] = flatten_data.narrow(0, offset, numel).view(size) 123 | offset += numel 124 | 125 | return output 126 | -------------------------------------------------------------------------------- /codegeex/megatron/mpu/mappings.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | from .initialize import ( 19 | get_tensor_model_parallel_group, 20 | get_tensor_model_parallel_world_size, 21 | get_tensor_model_parallel_rank, 22 | ) 23 | from .utils import split_tensor_along_last_dim 24 | 25 | 26 | def _reduce(input_): 27 | """All-reduce the the input tensor across model parallel group.""" 28 | 29 | # Bypass the function if we are using only 1 GPU. 30 | if get_tensor_model_parallel_world_size() == 1: 31 | return input_ 32 | 33 | # All-reduce. 
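    # all_reduce sums `input_` in place across the tensor model parallel
    # group, so every rank in the group ends up with the same reduced tensor.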
34 | torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group()) 35 | 36 | return input_ 37 | 38 | 39 | def _split(input_): 40 | """Split the tensor along its last dimension and keep the 41 | corresponding slice.""" 42 | 43 | world_size = get_tensor_model_parallel_world_size() 44 | # Bypass the function if we are using only 1 GPU. 45 | if world_size == 1: 46 | return input_ 47 | 48 | # Split along last dimension. 49 | input_list = split_tensor_along_last_dim(input_, world_size) 50 | 51 | # Note: torch.split does not create contiguous tensors by default. 52 | rank = get_tensor_model_parallel_rank() 53 | output = input_list[rank].contiguous() 54 | 55 | return output 56 | 57 | 58 | def _gather(input_): 59 | """Gather tensors and concatinate along the last dimension.""" 60 | 61 | world_size = get_tensor_model_parallel_world_size() 62 | # Bypass the function if we are using only 1 GPU. 63 | if world_size == 1: 64 | return input_ 65 | 66 | # Size and dimension. 67 | last_dim = input_.dim() - 1 68 | rank = get_tensor_model_parallel_rank() 69 | 70 | tensor_list = [torch.empty_like(input_) for _ in range(world_size)] 71 | tensor_list[rank] = input_ 72 | torch.distributed.all_gather( 73 | tensor_list, input_, group=get_tensor_model_parallel_group() 74 | ) 75 | 76 | # Note: torch.cat already creates a contiguous tensor. 77 | output = torch.cat(tensor_list, dim=last_dim).contiguous() 78 | 79 | return output 80 | 81 | 82 | class _CopyToModelParallelRegion(torch.autograd.Function): 83 | """Pass the input to the model parallel region.""" 84 | 85 | @staticmethod 86 | def symbolic(graph, input_): 87 | return input_ 88 | 89 | @staticmethod 90 | def forward(ctx, input_): 91 | return input_ 92 | 93 | @staticmethod 94 | def backward(ctx, grad_output): 95 | return _reduce(grad_output) 96 | 97 | 98 | class _ReduceFromModelParallelRegion(torch.autograd.Function): 99 | """All-reduce the input from the model parallel region.""" 100 | 101 | @staticmethod 102 | def symbolic(graph, input_): 103 | return _reduce(input_) 104 | 105 | @staticmethod 106 | def forward(ctx, input_): 107 | return _reduce(input_) 108 | 109 | @staticmethod 110 | def backward(ctx, grad_output): 111 | return grad_output 112 | 113 | 114 | class _ScatterToModelParallelRegion(torch.autograd.Function): 115 | """Split the input and keep only the corresponding chuck to the rank.""" 116 | 117 | @staticmethod 118 | def symbolic(graph, input_): 119 | return _split(input_) 120 | 121 | @staticmethod 122 | def forward(ctx, input_): 123 | return _split(input_) 124 | 125 | @staticmethod 126 | def backward(ctx, grad_output): 127 | return _gather(grad_output) 128 | 129 | 130 | class _GatherFromModelParallelRegion(torch.autograd.Function): 131 | """Gather the input from model parallel region and concatinate.""" 132 | 133 | @staticmethod 134 | def symbolic(graph, input_): 135 | return _gather(input_) 136 | 137 | @staticmethod 138 | def forward(ctx, input_): 139 | return _gather(input_) 140 | 141 | @staticmethod 142 | def backward(ctx, grad_output): 143 | return _split(grad_output) 144 | 145 | 146 | # ----------------- 147 | # Helper functions. 
148 | # ----------------- 149 | 150 | 151 | def copy_to_tensor_model_parallel_region(input_): 152 | return _CopyToModelParallelRegion.apply(input_) 153 | 154 | 155 | def reduce_from_tensor_model_parallel_region(input_): 156 | return _ReduceFromModelParallelRegion.apply(input_) 157 | 158 | 159 | def scatter_to_tensor_model_parallel_region(input_): 160 | return _ScatterToModelParallelRegion.apply(input_) 161 | 162 | 163 | def gather_from_tensor_model_parallel_region(input_): 164 | return _GatherFromModelParallelRegion.apply(input_) 165 | -------------------------------------------------------------------------------- /codegeex/megatron/mpu/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | 20 | def ensure_divisibility(numerator, denominator): 21 | """Ensure that numerator is divisible by the denominator.""" 22 | assert numerator % denominator == 0, "{} is not divisible by {}".format( 23 | numerator, denominator 24 | ) 25 | 26 | 27 | def divide(numerator, denominator): 28 | """Ensure that numerator is divisible by the denominator and return 29 | the division value.""" 30 | ensure_divisibility(numerator, denominator) 31 | return numerator // denominator 32 | 33 | 34 | def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False): 35 | """Split a tensor along its last dimension. 36 | Arguments: 37 | tensor: input tensor. 38 | num_partitions: number of partitions to split the tensor 39 | contiguous_split_chunks: If True, make each chunk contiguous 40 | in memory. 41 | """ 42 | # Get the size and dimension. 43 | last_dim = tensor.dim() - 1 44 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 45 | # Split. 46 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 47 | # Note: torch.split does not create contiguous tensors by default. 
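    # Callers that need a chunk to own its memory (e.g. before reshaping it or
    # passing it to a collective) either request contiguous_split_chunks=True
    # here or call .contiguous() themselves, as _split() in mappings.py does.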
48 | if contiguous_split_chunks: 49 | return tuple(chunk.contiguous() for chunk in tensor_list) 50 | 51 | return tensor_list 52 | 53 | 54 | class VocabUtility: 55 | """Split the vocabulary into `world_size` chunks amd return the 56 | first and last index of the vocabulary belonging to the `rank` 57 | partition: Note that indecies in [fist, last)""" 58 | 59 | @staticmethod 60 | def vocab_range_from_per_partition_vocab_size( 61 | per_partition_vocab_size, rank, world_size 62 | ): 63 | index_f = rank * per_partition_vocab_size 64 | index_l = index_f + per_partition_vocab_size 65 | return index_f, index_l 66 | 67 | @staticmethod 68 | def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size): 69 | per_partition_vocab_size = divide(global_vocab_size, world_size) 70 | return VocabUtility.vocab_range_from_per_partition_vocab_size( 71 | per_partition_vocab_size, rank, world_size 72 | ) 73 | -------------------------------------------------------------------------------- /codegeex/megatron/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from apex.optimizers import FusedAdam as Adam 17 | from apex.optimizers import FusedSGD as SGD 18 | 19 | from codegeex.megatron import get_args 20 | from codegeex.megatron.model import LayerNorm 21 | 22 | from .grad_scaler import ConstantGradScaler, DynamicGradScaler 23 | from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer 24 | 25 | 26 | def _get_params_for_weight_decay_optimization(modules): 27 | """Divide params into with-weight-decay and without-weight-decay groups. 28 | Layernorms and baises will have no weight decay but the rest will. 
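    Returns a pair of parameter-group dicts, (weight_decay_params,
    no_weight_decay_params), that can be passed directly as parameter groups
    to the optimizers constructed below.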
29 | """ 30 | 31 | weight_decay_params = {"params": []} 32 | no_weight_decay_params = {"params": [], "weight_decay": 0.0} 33 | for module in modules: 34 | for module_ in module.modules(): 35 | if isinstance(module_, LayerNorm): 36 | no_weight_decay_params["params"].extend( 37 | [p for p in list(module_._parameters.values()) if p is not None] 38 | ) 39 | else: 40 | weight_decay_params["params"].extend( 41 | [ 42 | p 43 | for n, p in list(module_._parameters.items()) 44 | if p is not None and n != "bias" 45 | ] 46 | ) 47 | no_weight_decay_params["params"].extend( 48 | [ 49 | p 50 | for n, p in list(module_._parameters.items()) 51 | if p is not None and n == "bias" 52 | ] 53 | ) 54 | 55 | return weight_decay_params, no_weight_decay_params 56 | 57 | 58 | def get_megatron_optimizer(model): 59 | args = get_args() 60 | 61 | if args.cpu_optimizer: 62 | raise NotImplementedError("need to add cpu adam") 63 | 64 | param_groups = _get_params_for_weight_decay_optimization(model) 65 | 66 | if args.optimizer == "adam": 67 | optimizer = Adam( 68 | param_groups, 69 | lr=args.lr, 70 | weight_decay=args.weight_decay, 71 | betas=(args.adam_beta1, args.adam_beta2), 72 | eps=args.adam_eps, 73 | ) 74 | elif args.optimizer == "sgd": 75 | optimizer = SGD( 76 | param_groups, 77 | lr=args.lr, 78 | weight_decay=args.weight_decay, 79 | momentum=args.sgd_momentum, 80 | ) 81 | else: 82 | raise Exception("{} optimizer is not supported.".format(args.optimizer)) 83 | 84 | if args.deepspeed: 85 | return optimizer 86 | 87 | # Determine whether the params have main-grad field. 88 | params_have_main_grad = False 89 | if args.DDP_impl == "local": 90 | params_have_main_grad = True 91 | 92 | if args.fp16 or args.bf16: 93 | 94 | # Grad scaler: 95 | # if loss-scale is provided, instantiate the constant scaler. 96 | # if we are using fp16 and loss-scale is not present, use a 97 | # dynamic scaler. 98 | # otherwise we are running in bf16 with no loss-scale so 99 | # leave it as None. 100 | grad_scaler = None 101 | # Constant loss scale. 102 | if args.loss_scale: 103 | grad_scaler = ConstantGradScaler(args.loss_scale) 104 | # Dynamic loss scale. 105 | else: 106 | if args.fp16: 107 | grad_scaler = DynamicGradScaler( 108 | initial_scale=args.initial_loss_scale, 109 | min_scale=args.min_loss_scale, 110 | growth_factor=2.0, 111 | backoff_factor=0.5, 112 | growth_interval=args.loss_scale_window, 113 | hysteresis=args.hysteresis, 114 | ) 115 | 116 | # Megatron optimizer. 117 | return Float16OptimizerWithFloat16Params( 118 | optimizer, 119 | args.clip_grad, 120 | args.log_num_zeros_in_grad, 121 | params_have_main_grad, 122 | args.bf16, 123 | grad_scaler, 124 | ) 125 | 126 | # FP32. 127 | return FP32Optimizer( 128 | optimizer, args.clip_grad, args.log_num_zeros_in_grad, params_have_main_grad 129 | ) 130 | -------------------------------------------------------------------------------- /codegeex/megatron/optimizer/clip_grads.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Gradient clipping.""" 17 | 18 | import torch 19 | from torch._six import inf 20 | 21 | from apex.multi_tensor_apply import multi_tensor_applier 22 | import amp_C 23 | 24 | from codegeex.megatron import mpu 25 | from codegeex.megatron.model.module import param_is_not_shared 26 | from codegeex.megatron.mpu.layers import param_is_not_tensor_parallel_duplicate 27 | 28 | 29 | def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): 30 | """Clips gradient norm of an iterable of parameters whose gradients 31 | are in fp32. 32 | 33 | This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and 34 | added functionality to handle model parallel parameters. Note that 35 | the gradients are modified in place. 36 | 37 | Arguments: 38 | parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a 39 | single Tensor that will have gradients normalized 40 | max_norm (float or int): max norm of the gradients 41 | norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for 42 | infinity norm. 43 | 44 | Returns: 45 | Total norm of the parameters (viewed as a single vector). 46 | """ 47 | 48 | if isinstance(parameters, torch.Tensor): 49 | parameters = [parameters] 50 | 51 | # Filter parameters based on: 52 | # - grad should not be none 53 | # - parameter should not be shared 54 | # - should not be a replica due to tensor model parallelism 55 | grads = [] 56 | grads_for_norm = [] 57 | for param in parameters: 58 | grad_not_none = param.grad is not None 59 | is_not_shared = param_is_not_shared(param) 60 | is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) 61 | grad = param.grad.detach() 62 | if grad_not_none: 63 | # Make sure the grads are in fp32 64 | assert param.grad.type() == "torch.cuda.FloatTensor" 65 | grads.append(grad) 66 | if grad_not_none and is_not_shared and is_not_tp_duplicate: 67 | grads_for_norm.append(grad) 68 | 69 | # Norm parameters. 70 | max_norm = float(max_norm) 71 | norm_type = float(norm_type) 72 | total_norm = 0.0 73 | 74 | # Calculate norm. 75 | if norm_type == inf: 76 | total_norm = max(grad.abs().max() for grad in grads_for_norm) 77 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) 78 | # Take max across all model-parallel GPUs. 79 | torch.distributed.all_reduce( 80 | total_norm_cuda, 81 | op=torch.distributed.ReduceOp.MAX, 82 | group=mpu.get_model_parallel_group(), 83 | ) 84 | total_norm = total_norm_cuda[0].item() 85 | 86 | else: 87 | if norm_type == 2.0: 88 | dummy_overflow_buf = torch.cuda.IntTensor([0]) 89 | # Use apex's multi-tensor applier for efficiency reasons. 90 | # Multi-tensor applier takes a function and a list of list 91 | # and performs the operation on that list all in one kernel. 92 | grad_norm, _ = multi_tensor_applier( 93 | amp_C.multi_tensor_l2norm, 94 | dummy_overflow_buf, 95 | [grads_for_norm], 96 | False, # no per-parameter norm 97 | ) 98 | # Since we will be summing across data parallel groups, 99 | # we need the pow(norm-type). 
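            # (Raising the local norm to `norm_type` turns it back into a sum
            # of squares for the 2-norm, so the all-reduce below can add the
            # per-rank contributions before the final 1/norm_type root.)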
100 | total_norm = grad_norm ** norm_type 101 | 102 | else: 103 | for grad in grads_for_norm: 104 | grad_norm = torch.norm(grad, norm_type) 105 | total_norm += grad_norm ** norm_type 106 | 107 | # Sum across all model-parallel GPUs. 108 | torch.distributed.all_reduce( 109 | total_norm, 110 | op=torch.distributed.ReduceOp.SUM, 111 | group=mpu.get_model_parallel_group(), 112 | ) 113 | total_norm = total_norm.item() ** (1.0 / norm_type) 114 | 115 | # Scale. 116 | clip_coeff = max_norm / (total_norm + 1.0e-6) 117 | if clip_coeff < 1.0: 118 | dummy_overflow_buf = torch.cuda.IntTensor([0]) 119 | multi_tensor_applier( 120 | amp_C.multi_tensor_scale, dummy_overflow_buf, [grads, grads], clip_coeff 121 | ) 122 | 123 | return total_norm 124 | 125 | 126 | def count_zeros_fp32(parameters): 127 | 128 | if isinstance(parameters, torch.Tensor): 129 | parameters = [parameters] 130 | 131 | # Filter parameters based on: 132 | # - grad should not be none 133 | # - parameter should not be shared 134 | # - should not be a replica due to tensor model parallelism 135 | total_num_zeros = 0.0 136 | for param in parameters: 137 | grad_not_none = param.grad is not None 138 | is_not_shared = param_is_not_shared(param) 139 | is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) 140 | if grad_not_none and is_not_shared and is_not_tp_duplicate: 141 | grad = param.grad.detach() 142 | num_zeros = grad.numel() - torch.count_nonzero(grad) 143 | total_num_zeros = num_zeros + total_num_zeros 144 | 145 | # Sum across all model-parallel GPUs. 146 | torch.distributed.all_reduce( 147 | total_num_zeros, 148 | op=torch.distributed.ReduceOp.SUM, 149 | group=mpu.get_model_parallel_group(), 150 | ) 151 | total_num_zeros = total_num_zeros.item() 152 | 153 | return total_num_zeros 154 | -------------------------------------------------------------------------------- /codegeex/megatron/optimizer/grad_scaler.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
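As a concrete illustration of the clipping step in clip_grads.py above (made-up numbers, not values from the repository): with max_norm = 1.0 and a measured total_norm of 20.0, clip_coeff = 1.0 / (20.0 + 1e-6) ≈ 0.05, so every gradient is scaled by roughly 0.05 and the resulting global norm becomes ≈ 1.0; when total_norm is already below max_norm, clip_coeff ≥ 1.0 and the gradients are left unchanged.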
15 | 16 | """Megatron grad scaler.""" 17 | 18 | from abc import ABC 19 | from abc import abstractmethod 20 | 21 | import torch 22 | 23 | 24 | class MegatronGradScaler(ABC): 25 | def __init__(self, initial_scale): 26 | """Initialize scale value with the input initial scale.""" 27 | assert initial_scale > 0.0 28 | self._scale = torch.cuda.FloatTensor([initial_scale]) 29 | 30 | @property 31 | def scale(self): 32 | return self._scale 33 | 34 | @property 35 | def inv_scale(self): 36 | return self._scale.double().reciprocal().float() 37 | 38 | @abstractmethod 39 | def update(self, found_inf): 40 | pass 41 | 42 | @abstractmethod 43 | def state_dict(self): 44 | pass 45 | 46 | @abstractmethod 47 | def load_state_dict(self, state_dict): 48 | pass 49 | 50 | 51 | class ConstantGradScaler(MegatronGradScaler): 52 | def update(self, found_inf): 53 | pass 54 | 55 | def state_dict(self): 56 | return dict() 57 | 58 | def load_state_dict(self, state_dict): 59 | pass 60 | 61 | 62 | class DynamicGradScaler(MegatronGradScaler): 63 | def __init__( 64 | self, 65 | initial_scale, 66 | min_scale, 67 | growth_factor, 68 | backoff_factor, 69 | growth_interval, 70 | hysteresis, 71 | ): 72 | """ "Grad scaler with dynamic scale that gets adjusted 73 | during training.""" 74 | super(DynamicGradScaler, self).__init__(initial_scale) 75 | 76 | # Lower bound on the scale. 77 | assert min_scale > 0.0 78 | assert min_scale <= initial_scale 79 | self.min_scale = torch.cuda.FloatTensor([min_scale]) 80 | # Growth and backoff factors for the scale. 81 | assert growth_factor > 1.0 82 | self.growth_factor = torch.cuda.FloatTensor([growth_factor]) 83 | assert backoff_factor < 1.0 84 | assert backoff_factor > 0.0 85 | self.backoff_factor = torch.cuda.FloatTensor([backoff_factor]) 86 | # Interval over which if we don't see any inf/nan, 87 | # we will scale the grad scale by the growth factor. 88 | assert growth_interval > 0 89 | self.growth_interval = growth_interval 90 | # Number of inf/nans we should see before scaling down 91 | # the grad scale by the backoff factor. 92 | assert hysteresis > 0 93 | self.hysteresis = hysteresis 94 | 95 | # Trackers. 96 | self._growth_tracker = 0 97 | self._hysteresis_tracker = self.hysteresis 98 | 99 | def update(self, found_inf): 100 | 101 | # If we have an inf/nan, growth tracker is set to 0 102 | # and hysterisis tracker is reduced by 1. 103 | if found_inf: 104 | self._growth_tracker = 0 105 | self._hysteresis_tracker -= 1 106 | # Now if we are out of hysteresis count, scale down the loss. 107 | if self._hysteresis_tracker <= 0: 108 | self._scale = torch.max( 109 | self._scale * self.backoff_factor, self.min_scale 110 | ) 111 | else: 112 | # If there is no nan/inf, increment the growth tracker. 113 | self._growth_tracker += 1 114 | # If we have had enough consequitive intervals with no nan/inf: 115 | if self._growth_tracker == self.growth_interval: 116 | # Reset the tracker and hysteresis trackers, 117 | self._growth_tracker = 0 118 | self._hysteresis_tracker = self.hysteresis 119 | # and scale up the loss scale. 
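                # (With the values chosen in optimizer/__init__.py this doubles
                # the scale after `growth_interval` clean steps, while repeated
                # overflows multiply it by backoff_factor, floored at min_scale.)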
120 | self._scale = self._scale * self.growth_factor 121 | 122 | def state_dict(self): 123 | state_dict = {} 124 | state_dict["scale"] = self._scale 125 | state_dict["growth_tracker"] = self._growth_tracker 126 | state_dict["hysteresis_tracker"] = self._hysteresis_tracker 127 | return state_dict 128 | 129 | def load_state_dict(self, state_dict): 130 | self._scale = state_dict["scale"].cuda(torch.cuda.current_device()) 131 | self._growth_tracker = state_dict["growth_tracker"] 132 | self._hysteresis_tracker = state_dict["hysteresis_tracker"] 133 | -------------------------------------------------------------------------------- /codegeex/megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /codegeex/megatron/tools/collect_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | ENV_NAMES = ["CUDA_HOME", "LD_LIBRARY_PATH", "PATH", "TORCH_EXTENSIONS_DIR", "CUDA_LAUNCH_BLOCKING"] 5 | 6 | 7 | def main(): 8 | s = "" 9 | for name in ENV_NAMES: 10 | if name in os.environ: 11 | value = os.environ[name] 12 | s += "{}={}\n".format(name, value) 13 | print(f"{name}={value}") 14 | else: 15 | print(f"{name} is not set") 16 | 17 | # write env vars to .deepspeed_env 18 | with open(".deepspeed_env", "w") as f: 19 | f.write(s) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=1 9 | 10 | python ${script_dir}/../train.py \ 11 | --distribute true \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type train \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --ckpt_name_prefix code-13B \ 23 | --save_checkpoint=True \ 24 | --save_checkpoint_path /cache/ckpts \ 25 | --save_checkpoint_obs_path \ # TODO: set to obs path for saving ckpts 26 | --save_checkpoint_steps 250 \ 27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt 29 | --per_batch_size 16 \ 30 | --dropout_rate 0.1 \ 31 | --full_batch 0 \ 32 | --epoch_size 1 \ 33 | --micro_interleaved_size 1 \ 34 | --profiling 0 \ 35 | --tb_dir $LOG_PATH -------------------------------------------------------------------------------- 
/codegeex/mindspore/configs/13B_128p_save_1p.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | script_path=$(realpath $BASH_SOURCE) 3 | script_dir=$(dirname $script_path) 4 | 5 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 6 | 7 | GAS=32 8 | 9 | python ${script_dir}/../save_1p_ckpt_from_8p_ckpt.py \ 10 | --distribute true \ 11 | --run_type train \ 12 | --train_and_eval_mode 0 \ 13 | --mode 13B \ 14 | --code_data $CODE_DATA_DIR \ 15 | --param_init_type fp32 \ 16 | --micro_size $GAS \ 17 | --seq_length 2048 \ 18 | --vocab_size 51200 \ 19 | --ckpt_name_prefix code-13B \ 20 | --save_checkpoint=True \ 21 | --save_checkpoint_path /cache/ckpts \ 22 | --save_checkpoint_obs_path \ # TODO: set to obs path for saving ckpts 23 | --save_checkpoint_steps \ # TODO: set to epoch number of loaded ckpt 24 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 25 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps 26 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 27 | --per_batch_size 16 \ 28 | --full_batch 0 \ 29 | --epoch_size 1 \ 30 | --micro_interleaved_size 1 \ 31 | --profiling 0 \ 32 | --tb_dir $LOG_PATH 33 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_128p_save_8p_ckpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../save_8p_ckpt.py \ 11 | --distribute true \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type train \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --ckpt_name_prefix code-13B \ 23 | --save_checkpoint=True \ 24 | --save_checkpoint_path /cache/ckpts \ 25 | --save_checkpoint_obs_path \ # TODO: set to obs path for saving ckpts 26 | --save_checkpoint_steps 99999 \ 27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps 29 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 30 | --per_batch_size 16 \ 31 | --full_batch 0 \ 32 | --epoch_size 1 \ 33 | --micro_interleaved_size 1 \ 34 | --profiling 0 \ 35 | --tb_dir $LOG_PATH -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_1p_to_torch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../convertion_1p.py \ 11 | --distribute false \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type predict \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --ckpt_name_prefix code-13B \ 23 | --save_checkpoint=True \ 24 | --save_checkpoint_path /cache/ckpts \ 25 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 26 
| --per_batch_size 1 \ 27 | --full_batch 1 \ 28 | --epoch_size 1 \ 29 | --micro_interleaved_size 1 \ 30 | --profiling 0 \ 31 | --use_past "true" \ 32 | --top_p 0.95 \ 33 | --top_k_num 100 \ 34 | --temperature 0.8 \ 35 | --op_level_model_parallel_num 1 \ 36 | --frequency_penalty 0.0 \ 37 | --presence_penalty 0.0 \ 38 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 39 | --tb_dir $LOG_PATH 40 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=1 9 | 10 | python ${script_dir}/../../finetune.py \ 11 | --distribute true \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type train \ 15 | --train_and_eval_mode 1 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --ckpt_name_prefix code-13B \ 23 | --save_checkpoint=True \ 24 | --save_checkpoint_path /cache/ckpts \ 25 | --save_checkpoint_obs_path \ # TODO: set to obs path for saving ckpts 26 | --save_checkpoint_steps 20 \ 27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt 29 | --per_batch_size 16 \ 30 | --dropout_rate 0.1 \ 31 | --full_batch 0 \ 32 | --epoch_size 5 \ 33 | --micro_interleaved_size 1 \ 34 | --profiling 0 \ 35 | --tb_dir $LOG_PATH \ -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../generation.py \ 11 | --distribute true \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type predict \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --ckpt_name_prefix code-13B \ 23 | --save_checkpoint=True \ 24 | --save_checkpoint_path /cache/ckpts \ 25 | --save_checkpoint_obs_path /home \ # TODO: set at will 26 | --save_checkpoint_steps 99999 \ # TODO: set at will 27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps 29 | --per_batch_size 1 \ 30 | --full_batch 1 \ 31 | --epoch_size 1 \ 32 | --micro_interleaved_size 1 \ 33 | --profiling 0 \ 34 | --use_past "true" \ 35 | --top_p 0.95 \ 36 | --top_k_num 100 \ 37 | --temperature 0.8 \ 38 | --frequency_penalty 0.0 \ 39 | --presence_penalty 0.0 \ 40 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 41 | --tb_dir $LOG_PATH 42 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_generate_1p.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set 
the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../generation_1p.py \ 11 | --distribute false \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type predict \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp16 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --ckpt_name_prefix code-13B \ 23 | --save_checkpoint=True \ 24 | --save_checkpoint_path /cache/ckpts \ 25 | --save_checkpoint_obs_path /home \ # TODO: set at will 26 | --save_checkpoint_steps 99999 \ # TODO: set at will 27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps 29 | --per_batch_size 1 \ 30 | --full_batch 1 \ 31 | --epoch_size 1 \ 32 | --micro_interleaved_size 1 \ 33 | --profiling 0 \ 34 | --use_past "true" \ 35 | --top_p 0.95 \ 36 | --top_k_num 100 \ 37 | --temperature 0.8 \ 38 | --op_level_model_parallel_num 1 \ 39 | --frequency_penalty 0.0 \ 40 | --presence_penalty 0.0 \ 41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 42 | --tb_dir $LOG_PATH 43 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_generate_1p_values.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../generation_values_1p.py \ 11 | --distribute false \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type predict \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp16 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --ckpt_name_prefix code-13B \ 23 | --save_checkpoint=True \ 24 | --save_checkpoint_path /cache/ckpts \ 25 | --save_checkpoint_obs_path /home \ # TODO: set at will 26 | --save_checkpoint_steps 213000 \ # TODO: set at will 27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt 29 | --per_batch_size 1 \ 30 | --full_batch 1 \ 31 | --epoch_size 1 \ 32 | --micro_interleaved_size 1 \ 33 | --profiling 0 \ 34 | --use_past "false" \ 35 | --top_p 0.95 \ 36 | --top_k_num 100 \ 37 | --temperature 0.8 \ 38 | --op_level_model_parallel_num 1 \ 39 | --frequency_penalty 0.0 \ 40 | --presence_penalty 0.0 \ 41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 42 | --tb_dir $LOG_PATH 43 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_generate_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../generation_finetune.py \ 11 | --distribute true \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type predict \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --max_generate_length 1024 \ 23 | 
--ckpt_name_prefix code-13B \ 24 | --save_checkpoint=True \ 25 | --save_checkpoint_path /cache/ckpts \ 26 | --save_checkpoint_obs_path /home \ # TODO: set at will 27 | --save_checkpoint_steps 99999 \ # TODO: set at will 28 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 29 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps 30 | --per_batch_size 6 \ 31 | --full_batch 1 \ 32 | --epoch_size 1 \ 33 | --micro_interleaved_size 1 \ 34 | --profiling 0 \ 35 | --use_past "true" \ 36 | --top_p 0.95 \ 37 | --top_k_num 100 \ 38 | --temperature 0.2 \ 39 | --frequency_penalty 0.0 \ 40 | --presence_penalty 0.0 \ 41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 42 | --tb_dir $LOG_PATH \ 43 | --language $LANGUAGE 44 | 45 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_generate_humaneval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../generation_humaneval.py \ 11 | --distribute true \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type predict \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --max_generate_length 1024 \ 23 | --ckpt_name_prefix code-13B \ 24 | --save_checkpoint=True \ 25 | --save_checkpoint_path /cache/ckpts \ 26 | --save_checkpoint_obs_path /home \ # TODO: set at will 27 | --save_checkpoint_steps 99999 \ # TODO: set at will 28 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 29 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps 30 | --per_batch_size 6 \ 31 | --full_batch 1 \ 32 | --epoch_size 1 \ 33 | --micro_interleaved_size 1 \ 34 | --profiling 0 \ 35 | --use_past "true" \ 36 | --top_p 0.95 \ 37 | --top_k_num 100 \ 38 | --temperature 0.8 \ 39 | --frequency_penalty 0.0 \ 40 | --presence_penalty 0.0 \ 41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 42 | --tb_dir $LOG_PATH \ 43 | --part $PART 44 | 45 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_generate_values.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../generation_values.py \ 11 | --distribute true \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type predict \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --max_generate_length 2048 \ 23 | --ckpt_name_prefix code-13B \ 24 | --save_checkpoint=True \ 25 | --save_checkpoint_path /cache/ckpts \ 26 | --save_checkpoint_obs_path /home \ # TODO: set at will 27 | --save_checkpoint_steps 99999 \ # TODO: set at will 28 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 29 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as 
save_checkpoint_steps 30 | --per_batch_size 6 \ 31 | --full_batch 1 \ 32 | --epoch_size 1 \ 33 | --micro_interleaved_size 1 \ 34 | --profiling 0 \ 35 | --use_past "true" \ 36 | --top_p 0.95 \ 37 | --top_k_num 100 \ 38 | --temperature 1.0 \ 39 | --frequency_penalty 0.0 \ 40 | --presence_penalty 0.0 \ 41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 42 | --tb_dir $LOG_PATH 43 | -------------------------------------------------------------------------------- /codegeex/mindspore/scripts/custom_tune_bank_new/Ascend910ProA/cube/repository_ascend910ProA_matmul.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/mindspore/scripts/custom_tune_bank_new/Ascend910ProA/cube/repository_ascend910ProA_matmul.bin -------------------------------------------------------------------------------- /codegeex/mindspore/scripts/ma-pre-start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ~/.bashrc 4 | echo "Start to intall the run package" 5 | WORK_DIR=start_1.7 6 | RUN_DIR=run 7 | mindspore_file=mindspore_ascend-1.7.0-cp37-cp37m-linux_aarch64.whl 8 | LOCAL_DIR=$(cd "$(dirname "$0")";pwd) 9 | echo $LOCAL_DIR 10 | 11 | echo "===current dir=" 12 | ls ./${WORK_DIR}/${RUN_DIR} 13 | 14 | pip install ./${WORK_DIR}/${mindspore_file} -i http://100.125.33.126:8888/repository/pypi/simple --trusted-host=100.125.33.126 15 | sudo chmod +755 -R /usr/local/Ascend/nnae 16 | sudo rm -rf /usr/local/Ascend/nnae 17 | 18 | sudo chmod +x ./${WORK_DIR}/${RUN_DIR}/*.run 19 | sudo bash ./${WORK_DIR}/${RUN_DIR}/Ascend* --full --quiet 20 | 21 | export HCCL_CONNECT_TIMEOUT=1800 # 通信建链最长等待时间,单位s 22 | 23 | echo "======/usr/local/Ascend======" 24 | ls -al /usr/local/Ascend 25 | echo "======/usr/local/Ascend/ascend-toolkit/======" 26 | ls -al /usr/local/Ascend/ascend-toolkit/ 27 | echo "======/usr/local/Ascend/ascend-toolkit/latest======" 28 | ls -al /usr/local/Ascend/ascend-toolkit/latest 29 | echo "======/usr/local/Ascend/driver/lib64========" 30 | ls -al /usr/local/Ascend/driver/lib64 31 | echo "======/usr/local/Ascend/driver/lib64/common=======" 32 | ls -al /usr/local/Ascend/driver/lib64/common 33 | echo "=======/usr/local/Ascend/driver/lib64/driver=======" 34 | ls -al /usr/local/Ascend/driver/lib64/driver 35 | echo "============/usr/local/Ascend/ascend-toolkit/5.1.RC1=============" 36 | ls -al /usr/local/Ascend/ascend-toolkit/5.1.RC1 37 | sudo mkdir /usr/local/Ascend/nnae 38 | sudo chmod +755 -R /usr/local/Ascend/nnae 39 | #sudo mkdir /usr/local/Ascend/nnae/latest 40 | #sudo chmod +755 -R /usr/local/Ascend/nnae/latest 41 | sudo ln -s /usr/local/Ascend/ascend-toolkit/5.1.RC1 /usr/local/Ascend/nnae/latest 42 | echo "======/usr/local/Ascend/nnae======" 43 | ls -al /usr/local/Ascend/nnae 44 | echo "======/usr/local/Ascend/nnae/latest======" 45 | ls -al /usr/local/Ascend/nnae/latest 46 | echo "======/usr/local/Ascend/nnae/latest/lib64/libhccl.so======" 47 | ls -al /usr/local/Ascend/nnae/latest/lib64/libhccl.so 48 | 49 | # sudo cp -fp ${LOCAL_DIR}/${WORK_DIR}/libhccl.so /usr/local/Ascend/nnae/latest/lib64/libhccl.so 50 | echo "======/usr/local/Ascend/nnae/latest/lib64/libhccl.so======" 51 | ls -al /usr/local/Ascend/nnae/latest/lib64/libhccl.so 52 | 53 | echo "======/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py======" 54 | ls -al 
/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py 55 | 56 | echo "======/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py======" 57 | ls -al /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm_x_backprop_v2.py 58 | 59 | 60 | sudo cp -fp ${LOCAL_DIR}/${WORK_DIR}/layer_norm.py /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py 61 | sudo cp -fp ${LOCAL_DIR}/${WORK_DIR}/layer_norm_x_backprop_v2.py /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm_x_backprop_v2.py 62 | 63 | chmod +777 /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm_x_backprop_v2.py 64 | chmod +777 /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py 65 | 66 | echo "======/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py====new==" 67 | ls -al /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py 68 | 69 | echo "======/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py====new==" 70 | ls -al /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm_x_backprop_v2.py 71 | 72 | ls -al ${LOCAL_DIR}/${WORK_DIR}/custom_tune_bank_new 73 | 74 | export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH 75 | export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/plugin/nnengine:$LD_LIBRARY_PATH 76 | export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:$PATH 77 | export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest 78 | export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp 79 | export TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit 80 | export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest:$ASCEND_HOME_PATH 81 | 82 | echo "-------------------uninstall te topi and hccl--------------------------" 83 | sudo pip uninstall te -y 84 | sudo pip uninstall topi -y 85 | sudo pip uninstall hccl -y 86 | echo "-------------------install te topi and hccl--------------------------" 87 | pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/topi-0.4.0-py3-none-any.whl 88 | pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/te-0.4.0-py3-none-any.whl 89 | pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/hccl-0.1.0-py3-none-any.whl 90 | pip install /usr/local/Ascend/ascend-toolkit/latest/tools/hccl_parser-0.1-py3-none-any.whl 91 | 92 | 93 | export GLOG_v=3 # mindspore日志开关,1:Info, 2:Warning, 3:Error 94 | export ASCEND_GLOBAL_LOG_LEVEL=3 # 底层软件的日志级别开关 1:Info, 2:Warning, 3:Error 95 | export ASCEND_GLOBAL_EVENT_ENABLE=1 # 底层软件的日志event日志开关 0:disable, 1:enable 96 | export ASCEND_SLOG_PRINT_TO_STDOUT=0 # 是否把底层日志重定向到打屏,0:disable, 1:enable 97 | 98 | export ENABLE_TUNE_BANK=True 99 | export TUNE_BANK_PATH=${LOCAL_DIR}/${WORK_DIR}/custom_tune_bank_new 100 | 101 | env 102 | 103 | mkdir -p /cache/ckpts 104 | mkdir -p /home/work/sfs/cache/${BATCH_JOB_ID}/1 105 | mkdir -p /home/work/sfs/cache/${BATCH_JOB_ID}/2 106 | 107 | sudo chmod +777 -R /cache/ckpts 108 | sudo chmod +777 -R /home/work/sfs/cache/${BATCH_JOB_ID} 109 | 110 | export 
GROUP_INFO_FILE=/home/work/sfs/cache/${BATCH_JOB_ID}/group_info_file.pb 111 | -------------------------------------------------------------------------------- /codegeex/mindspore/scripts/run_modelarts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | from pathlib import Path 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--work_dir", type=str, required=True) 8 | parser.add_argument("--script", type=str, required=True) 9 | parser.add_argument("--data_url", type=str, default=None) 10 | parser.add_argument("--train_url", type=str, default=None) 11 | 12 | args = parser.parse_args() 13 | 14 | log_path = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID"), f'device{os.environ.get("RANK_ID")}') 15 | tb_path = os.path.join(args.work_dir, "runs", os.environ.get("JOB_ID")) 16 | 17 | Path(log_path).mkdir(parents=True, exist_ok=True) 18 | Path(tb_path).mkdir(parents=True, exist_ok=True) 19 | 20 | log_path_prefix_1 = os.path.join(args.work_dir, "logs") 21 | 22 | os.environ["LOG_PATH"] = tb_path 23 | 24 | print("=================RANK_TABLE_FILE: ", os.environ["RANK_TABLE_FILE"], flush=True) 25 | print("=================ms import done", flush=True) 26 | time.sleep(10) 27 | os.system( 28 | "cp /home/work/rank_table/jobstart_hccl.json /home/work/sfs/xx; sudo chmod +777 /home/work/rank_table/jobstart_hccl.json") 29 | ret = os.system(f"cd {log_path} && bash {args.script} 2>&1 | tee output.log") 30 | if os.environ.get("RANK_ID") == 0: 31 | log_dir = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID")) 32 | os.system(f"sudo chmod +777 -R {tb_path}") 33 | os.system(f"sudo chmod +777 -R {log_dir}") 34 | print("==========ret code is: ", ret, flush=True) 35 | if ret != 0: 36 | raise RuntimeError("ret code is :" + str(ret)) 37 | -------------------------------------------------------------------------------- /codegeex/mindspore/scripts/run_modelarts_gen_finetune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | from pathlib import Path 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--work_dir", type=str, required=True) 8 | parser.add_argument("--script", type=str, required=True) 9 | parser.add_argument("--data_url", type=str, default=None) 10 | parser.add_argument("--train_url", type=str, default=None) 11 | parser.add_argument("--language", type=str, default=None) 12 | 13 | args = parser.parse_args() 14 | 15 | log_path = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID"), f'device{os.environ.get("RANK_ID")}') 16 | tb_path = os.path.join(args.work_dir, "runs", os.environ.get("JOB_ID")) 17 | 18 | Path(log_path).mkdir(parents=True, exist_ok=True) 19 | Path(tb_path).mkdir(parents=True, exist_ok=True) 20 | 21 | log_path_prefix_1 = os.path.join(args.work_dir, "logs") 22 | 23 | os.environ["LOG_PATH"] = tb_path 24 | if args.language is not None: 25 | os.environ["LANGUAGE"] = args.language 26 | else: 27 | os.environ["LANGUAGE"] = "Null" 28 | 29 | print("=================RANK_TABLE_FILE: ", os.environ["RANK_TABLE_FILE"], flush=True) 30 | print("=================ms import done", flush=True) 31 | time.sleep(10) 32 | os.system( 33 | "cp /home/work/rank_table/jobstart_hccl.json /home/work/sfs/xx; sudo chmod +777 /home/work/rank_table/jobstart_hccl.json") 34 | ret = os.system(f"cd {log_path} && bash {args.script} 2>&1 | tee output.log") 35 | if os.environ.get("RANK_ID") == 0: 36 | 
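# Annotation (not in the original source): os.environ.get("RANK_ID") returns a string (or None),
# so the comparison with the integer 0 in the line above is never True and the chmod block below
# is effectively skipped on every rank; comparing against the string "0" appears to be the intent.
# The same pattern occurs in run_modelarts.py and run_modelarts_gen_humaneval_x.py.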
log_dir = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID")) 37 | os.system(f"sudo chmod +777 -R {tb_path}") 38 | os.system(f"sudo chmod +777 -R {log_dir}") 39 | print("==========ret code is: ", ret, flush=True) 40 | if ret != 0: 41 | raise RuntimeError("ret code is :" + str(ret)) 42 | -------------------------------------------------------------------------------- /codegeex/mindspore/scripts/run_modelarts_gen_humaneval_x.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | from pathlib import Path 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--work_dir", type=str, required=True) 8 | parser.add_argument("--script", type=str, required=True) 9 | parser.add_argument("--data_url", type=str, default=None) 10 | parser.add_argument("--train_url", type=str, default=None) 11 | parser.add_argument("--part", type=str, default=None) 12 | 13 | args = parser.parse_args() 14 | 15 | log_path = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID"), f'device{os.environ.get("RANK_ID")}') 16 | tb_path = os.path.join(args.work_dir, "runs", os.environ.get("JOB_ID")) 17 | 18 | Path(log_path).mkdir(parents=True, exist_ok=True) 19 | Path(tb_path).mkdir(parents=True, exist_ok=True) 20 | 21 | log_path_prefix_1 = os.path.join(args.work_dir, "logs") 22 | 23 | os.environ["LOG_PATH"] = tb_path 24 | if args.part is not None: 25 | os.environ["PART"] = args.part 26 | else: 27 | os.environ["PART"] = "-1" 28 | 29 | print("=================RANK_TABLE_FILE: ", os.environ["RANK_TABLE_FILE"], flush=True) 30 | print("=================ms import done", flush=True) 31 | time.sleep(10) 32 | os.system( 33 | "cp /home/work/rank_table/jobstart_hccl.json /home/work/sfs/xx; sudo chmod +777 /home/work/rank_table/jobstart_hccl.json") 34 | ret = os.system(f"cd {log_path} && bash {args.script} 2>&1 | tee output.log") 35 | if os.environ.get("RANK_ID") == 0: 36 | log_dir = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID")) 37 | os.system(f"sudo chmod +777 -R {tb_path}") 38 | os.system(f"sudo chmod +777 -R {log_dir}") 39 | print("==========ret code is: ", ret, flush=True) 40 | if ret != 0: 41 | raise RuntimeError("ret code is :" + str(ret)) 42 | -------------------------------------------------------------------------------- /codegeex/mindspore/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/mindspore/src/__init__.py -------------------------------------------------------------------------------- /codegeex/mindspore/src/code_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | import numpy as np 4 | from transformers import AutoTokenizer 5 | from transformers.models.gpt2 import GPT2TokenizerFast 6 | 7 | 8 | def encode_whitespaces(text, start_extra_id: int, max_len: int): 9 | """ Encode whitespaces to extra tokens in GPT-J. 
10 | 11 | >>> encode_whitespaces('a\\n b\\n c', 10, 10) 12 | 'a\\n<|extratoken_10|>b\\n<|extratoken_11|>c' 13 | """ 14 | 15 | def push_acc_space(acc_len: int, text: str): 16 | if acc_len == 0: 17 | return text 18 | if acc_len == 1: 19 | return text + ' ' 20 | assert acc_len <= max_len, f'Max whitespace run length {max_len}, but found {acc_len}' 21 | extra_id = start_extra_id - 2 + acc_len 22 | extra_token = f'<|extratoken_{extra_id}|>' 23 | return text + extra_token 24 | 25 | acc_len = 0 26 | res = '' 27 | for ch in text: 28 | if ch == ' ': 29 | acc_len += 1 30 | if acc_len == max_len: 31 | res = push_acc_space(acc_len, res) 32 | acc_len = 0 33 | else: 34 | res = push_acc_space(acc_len, res) 35 | acc_len = 0 36 | res = res + ch 37 | 38 | res = push_acc_space(acc_len, res) 39 | 40 | return res 41 | 42 | 43 | def decode_whitespaces(text: str, start_extra_id: int, max_len: int): 44 | """ Decode the whitespace-encoded strings produced by encode_whitespace. 45 | 46 | >>> text = 'a\\n b\\n c' 47 | >>> s, l = 10, 10 48 | >>> text == decode_whitespaces(encode_whitespaces(text, s, l), s, l) 49 | True 50 | """ 51 | for l in range(2, max_len + 1): 52 | token_id = start_extra_id - 2 + l 53 | token = f'<|extratoken_{token_id}|>' 54 | text = text.replace(token, ' ' * l) 55 | return text 56 | 57 | 58 | class Code13BDictionary(object): 59 | def __init__( 60 | self, 61 | dict_file: str, 62 | extra_token_ids: List[str] = None, 63 | pad_to_vocab_size: int = -1, 64 | ): 65 | self._idx = dict() 66 | self._count = dict() 67 | self._num_symbols = 0 68 | self._symbols = [] 69 | 70 | self._add_symbol("", 0) 71 | self._add_symbol("", 0) 72 | self._add_symbol("", 0) 73 | self._add_symbol("", 0) 74 | self._load_dict(dict_file) 75 | 76 | if extra_token_ids is None: 77 | extra_token_ids = [ 78 | str(x) for x in range(50257, 50400) 79 | ] # follows GPT-J settings 80 | 81 | for token_id in extra_token_ids: 82 | self._add_symbol(token_id, 0) 83 | 84 | if pad_to_vocab_size > 0: 85 | self._pad_to_vocab_size(pad_to_vocab_size) 86 | 87 | def _pad_to_vocab_size(self, vocab_size: int): 88 | num_pad = vocab_size - len(self) 89 | if num_pad <= 0: 90 | return 91 | for i in range(1, num_pad + 1): 92 | self._add_symbol("vocab_pad_token{}".format(i), 0) 93 | 94 | def _load_dict(self, dict_file: str): 95 | with open(dict_file, "r") as f: 96 | for line in f: 97 | line = line.strip() 98 | if line == "" or line.startswith("#"): 99 | continue 100 | sym, count = line.split() 101 | self._add_symbol(sym, int(count)) 102 | 103 | def _add_symbol(self, sym: str, count: int): 104 | self._idx[sym] = self._num_symbols 105 | self._count[sym] = count 106 | self._symbols.append(sym) 107 | self._num_symbols += 1 108 | 109 | def __len__(self): 110 | return self._num_symbols 111 | 112 | def index(self, sym: str): 113 | return self._idx[sym] 114 | 115 | def string(self, idx: int): 116 | return self._symbols[idx] 117 | 118 | def map_token(self, token: Union[int, str]): 119 | if isinstance(token, int): 120 | token = str(token) 121 | return self.index(token) 122 | 123 | def map_tokens(self, tokens): 124 | return [self.map_token(token) for token in tokens] 125 | 126 | def decode_tokens(self, tokens): 127 | decoded = [self.string(token) for token in tokens] 128 | return [int(x) for x in decoded if not x.startswith("vocab_pad_token")] 129 | 130 | 131 | class CodeTokenizer(object): 132 | def __init__( 133 | self, 134 | tokenizer: GPT2TokenizerFast = None, 135 | start_extra_id: int = 10, 136 | max_len: int = 10, 137 | mode='13b', 138 | dict_file: str = None, 139 
| ): 140 | self.tokenizer = tokenizer if tokenizer is not None else AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B") 141 | if mode not in ['6b', '13b']: 142 | raise ValueError(f"Invalid mode {mode}, choose from ['6b', '13b']") 143 | self.start_extra_id = start_extra_id 144 | self.max_len = max_len 145 | self.mode = mode 146 | self.code_dict = Code13BDictionary(dict_file, pad_to_vocab_size=51200) if self.mode == '13b' else None 147 | self.eos_token_id = self.tokenizer.eos_token_id 148 | 149 | def encode_code(self, code: str): 150 | if self.mode == '6b': 151 | code = encode_whitespaces(code, self.start_extra_id, self.max_len) 152 | input_ids = self.tokenizer(code).input_ids 153 | 154 | elif self.mode == '13b': 155 | code = encode_whitespaces(code, self.start_extra_id, self.max_len) 156 | input_ids = self.code_dict.map_tokens(self.tokenizer.encode(code)) 157 | input_ids = np.array(input_ids, dtype=np.int64).reshape(1, -1) 158 | 159 | return input_ids 160 | 161 | def decode_code(self, input_ids): 162 | if self.mode == '6b': 163 | texts = self.tokenizer.batch_decode(input_ids) 164 | output_code = [decode_whitespaces(text, self.start_extra_id, self.max_len) for text in texts] 165 | 166 | elif self.mode == '13b': 167 | input_ids = [self.code_dict.decode_tokens(input_ids.tolist()[0])] 168 | texts = self.tokenizer.batch_decode(input_ids) 169 | output_code = [decode_whitespaces(text, self.start_extra_id, self.max_len) for text in texts] 170 | 171 | return output_code 172 | -------------------------------------------------------------------------------- /codegeex/mindspore/src/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | """ 16 | Eval metrics 17 | """ 18 | 19 | import math 20 | 21 | from mindspore import context 22 | from mindspore.communication.management import get_rank, get_group_size 23 | from mindspore.nn.metrics import Metric 24 | 25 | 26 | class PPLMetric(Metric): 27 | """ 28 | Ppl metric 29 | """ 30 | 31 | def __init__(self, data_length): 32 | super(PPLMetric, self).__init__() 33 | self.clear() 34 | self.data_length = data_length 35 | pipeline_stages = context.get_auto_parallel_context("pipeline_stages") 36 | per_stage_device_num = get_group_size() // pipeline_stages 37 | stage_id = get_rank() // per_stage_device_num 38 | self.is_last_stage = (stage_id == pipeline_stages - 1) 39 | 40 | def clear(self): 41 | """Clear the internal evaluation result.""" 42 | self.PPL = [] 43 | self.tokens_count = 0 44 | 45 | def update(self, *inputs): # inputs 46 | """Update list of ppl""" 47 | if not self.is_last_stage: 48 | return 49 | logits = inputs[0].asnumpy().flatten().tolist() # logits 50 | self.PPL.append(logits[0] * self.data_length) 51 | self.tokens_count += 1 52 | 53 | def eval(self): 54 | if not self.is_last_stage: 55 | return 0 56 | if self.tokens_count == 0: 57 | print("Warning: tokens_count is 0") 58 | return 0 59 | val_loss = sum(self.PPL) / (self.tokens_count * self.data_length) 60 | ppl = math.exp(min(20, val_loss)) 61 | # print("====" * 20 + " ppl end") 62 | # print("====" * 20 + " ppl: {}".format(ppl)) 63 | # return ppl 64 | return val_loss 65 | 66 | 67 | class ValidationLoss(Metric): 68 | def __init__(self, data_length): 69 | super(ValidationLoss, self).__init__() 70 | self.clear() 71 | self.data_length = data_length 72 | pipeline_stages = context.get_auto_parallel_context("pipeline_stages") 73 | per_stage_device_num = get_group_size() // pipeline_stages 74 | stage_id = get_rank() // per_stage_device_num 75 | self.is_last_stage = (stage_id == pipeline_stages - 1) 76 | 77 | def clear(self): 78 | """Clear the internal evaluation result.""" 79 | self.metric = [] 80 | self.tokens_count = 0 81 | 82 | def update(self, *inputs): # inputs 83 | """Update list of ppl""" 84 | # logits = inputs[0].asnumpy() 85 | # if self.rank % 8 == 0: 86 | # print("====" * 2 + " logits: {}".format(logits), flush=True) 87 | # self.metric.append(logits) 88 | if not self.is_last_stage: 89 | return 90 | logits = inputs[0].asnumpy().flatten().tolist() # logits 91 | self.metric.append(logits[0] * self.data_length) 92 | self.tokens_count += 1 93 | 94 | def eval(self): 95 | if not self.is_last_stage == 0: 96 | return 0 97 | val_loss = sum(self.metric) / (self.tokens_count * self.data_length) 98 | return val_loss 99 | -------------------------------------------------------------------------------- /codegeex/mindspore/src/tokenization_jieba.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for OpenAI GPT.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | from io import open 20 | 21 | import jieba 22 | import sentencepiece as spm 23 | 24 | 25 | class JIEBATokenizer(): 26 | r""" 27 | Jieba Tokenizer 28 | """ 29 | 30 | def __init__(self, vocab_file, model_file, max_len=None): 31 | self.max_len = max_len if max_len is not None else int(1e12) 32 | f = open(vocab_file, 'r') 33 | lines = f.readlines() 34 | self.encoder = {} 35 | for line in enumerate(lines): 36 | key = line[1].split('\t')[0] 37 | self.encoder[key] = line[0] 38 | 39 | self.decoder = {v: k for k, v in self.encoder.items()} 40 | 41 | self.sp = spm.SentencePieceProcessor(model_file=model_file) 42 | self.translator = str.maketrans(" \n", "\u2582\u2583") 43 | 44 | self.eod_id = self.encoder[''] 45 | self.eot_id = self.encoder[''] 46 | self.pad_id = self.encoder[''] 47 | 48 | @property 49 | def vocab_size(self): 50 | return len(self.encoder) 51 | 52 | def __len__(self): 53 | return len(self.encoder) + len(self.special_tokens) 54 | 55 | @property 56 | def eod(self): 57 | return self.eod_id 58 | 59 | def tokenize(self, text): 60 | """ Tokenize a string. """ 61 | seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)] 62 | new_seg = " ".join(seg_list) 63 | return self.sp.encode(new_seg) 64 | 65 | def convert_tokens_to_ids(self, tokens): 66 | return tokens 67 | 68 | def convert_ids_to_tokens(self, ids): 69 | return self.decode(ids) 70 | 71 | def encode(self, text): 72 | res = self.tokenize(text) 73 | return res 74 | 75 | def decode(self, tokens): 76 | text = self.sp.decode(tokens) 77 | text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n') 78 | return text 79 | -------------------------------------------------------------------------------- /codegeex/oneflow/__init__.py: -------------------------------------------------------------------------------- 1 | from .codegeex_model import CodeGeeXModel -------------------------------------------------------------------------------- /codegeex/paddle/__init__.py: -------------------------------------------------------------------------------- 1 | from .codegeex_model import CodeGeeXModel -------------------------------------------------------------------------------- /codegeex/paddle/pt_to_pdparams.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import paddle 3 | import torch 4 | 5 | linear_layer = [ 6 | "mlp.dense_h_to_4h", 7 | "mlp.dense_4h_to_h", 8 | "attention.query", 9 | "attention.key", 10 | "attention.value", 11 | "attention.dense", 12 | ] 13 | 14 | 15 | def WalkDict(x): 16 | for i in x: 17 | if isinstance(x[i], dict): 18 | WalkDict(x[i]) 19 | elif isinstance(x[i], torch.Tensor): 20 | print(f"Converting '{i}' from 'torch.Tensor' to 'numpy.ndarray'.") 21 | npy = x[i].cpu().numpy() 22 | if any([f".{layer}.weight" in i for layer in linear_layer]): 23 | print(f"Transposing linear layer weight '{i}'.") 24 | x[i] = npy.T 25 | else: 26 | x[i] = npy 27 | 28 | 29 | def parse_opt(): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument( 32 | "--pt", 33 | type=str, 34 | required=True, 35 | help="Path to pt checkpoint." 36 | ) 37 | parser.add_argument( 38 | "--pdparams", 39 | type=str, 40 | required=True, 41 | help="Path to pdparams checkpoint." 
42 | ) 43 | opt = parser.parse_args() 44 | return opt 45 | 46 | 47 | def main(opt): 48 | state_dict = torch.load(opt.pt) 49 | WalkDict(state_dict) 50 | paddle.save(state_dict, opt.pdparams) 51 | 52 | 53 | if __name__ == "__main__": 54 | opt = parse_opt() 55 | main(opt) 56 | -------------------------------------------------------------------------------- /codegeex/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from .quantize import quantize 2 | try: 3 | from .quantize_oneflow import quantize_oneflow 4 | from .quantize_oneflow import QuantizedLinear 5 | except ModuleNotFoundError: 6 | pass 7 | -------------------------------------------------------------------------------- /codegeex/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .tokenizer import CodeGeeXTokenizer -------------------------------------------------------------------------------- /codegeex/tokenizer/added_tokens.json: -------------------------------------------------------------------------------- 1 | {"<|extratoken_14|>": 50270, "<|extratoken_121|>": 50377, "<|extratoken_3|>": 50259, "<|extratoken_25|>": 50281, "<|extratoken_101|>": 50357, "<|extratoken_138|>": 50394, "<|extratoken_10|>": 50266, "<|extratoken_21|>": 50277, "<|extratoken_32|>": 50288, "<|extratoken_46|>": 50302, "<|extratoken_22|>": 50278, "<|extratoken_40|>": 50296, "<|extratoken_96|>": 50352, "<|extratoken_92|>": 50348, "<|extratoken_95|>": 50351, "<|extratoken_141|>": 50397, "<|extratoken_78|>": 50334, "<|extratoken_86|>": 50342, "<|extratoken_56|>": 50312, "<|extratoken_124|>": 50380, "<|extratoken_127|>": 50383, "<|extratoken_122|>": 50378, "<|extratoken_123|>": 50379, "<|extratoken_111|>": 50367, "<|extratoken_93|>": 50349, "<|extratoken_130|>": 50386, "<|extratoken_113|>": 50369, "<|extratoken_50|>": 50306, "<|extratoken_97|>": 50353, "<|extratoken_1|>": 50257, "<|extratoken_55|>": 50311, "<|extratoken_34|>": 50290, "<|extratoken_143|>": 50399, "<|extratoken_62|>": 50318, "<|extratoken_74|>": 50330, "<|extratoken_136|>": 50392, "<|extratoken_117|>": 50373, "<|extratoken_38|>": 50294, "<|extratoken_120|>": 50376, "<|extratoken_39|>": 50295, "<|extratoken_65|>": 50321, "<|extratoken_29|>": 50285, "<|extratoken_104|>": 50360, "<|extratoken_13|>": 50269, "<|extratoken_5|>": 50261, "<|extratoken_107|>": 50363, "<|extratoken_19|>": 50275, "<|extratoken_84|>": 50340, "<|extratoken_77|>": 50333, "<|extratoken_135|>": 50391, "<|extratoken_24|>": 50280, "<|extratoken_134|>": 50390, "<|extratoken_15|>": 50271, "<|extratoken_67|>": 50323, "<|extratoken_89|>": 50345, "<|extratoken_2|>": 50258, "<|extratoken_73|>": 50329, "<|extratoken_129|>": 50385, "<|extratoken_126|>": 50382, "<|extratoken_30|>": 50286, "<|extratoken_41|>": 50297, "<|extratoken_28|>": 50284, "<|extratoken_114|>": 50370, "<|extratoken_128|>": 50384, "<|extratoken_118|>": 50374, "<|extratoken_131|>": 50387, "<|extratoken_68|>": 50324, "<|extratoken_125|>": 50381, "<|extratoken_103|>": 50359, "<|extratoken_8|>": 50264, "<|extratoken_64|>": 50320, "<|extratoken_52|>": 50308, "<|extratoken_45|>": 50301, "<|extratoken_43|>": 50299, "<|extratoken_18|>": 50274, "<|extratoken_139|>": 50395, "<|extratoken_85|>": 50341, "<|extratoken_88|>": 50344, "<|extratoken_63|>": 50319, "<|extratoken_4|>": 50260, "<|extratoken_48|>": 50304, "<|extratoken_112|>": 50368, "<|extratoken_17|>": 50273, "<|extratoken_49|>": 50305, "<|extratoken_108|>": 50364, "<|extratoken_110|>": 50366, 
"<|extratoken_42|>": 50298, "<|extratoken_70|>": 50326, "<|extratoken_6|>": 50262, "<|extratoken_35|>": 50291, "<|extratoken_23|>": 50279, "<|extratoken_66|>": 50322, "<|extratoken_60|>": 50316, "<|extratoken_71|>": 50327, "<|extratoken_51|>": 50307, "<|extratoken_133|>": 50389, "<|extratoken_20|>": 50276, "<|extratoken_76|>": 50332, "<|extratoken_81|>": 50337, "<|extratoken_142|>": 50398, "<|extratoken_116|>": 50372, "<|extratoken_57|>": 50313, "<|extratoken_75|>": 50331, "<|extratoken_37|>": 50293, "<|extratoken_33|>": 50289, "<|extratoken_16|>": 50272, "<|extratoken_61|>": 50317, "<|extratoken_7|>": 50263, "<|extratoken_12|>": 50268, "<|extratoken_36|>": 50292, "<|extratoken_80|>": 50336, "<|extratoken_98|>": 50354, "<|extratoken_105|>": 50361, "<|extratoken_91|>": 50347, "<|extratoken_53|>": 50309, "<|extratoken_137|>": 50393, "<|extratoken_9|>": 50265, "<|extratoken_79|>": 50335, "<|extratoken_83|>": 50339, "<|extratoken_109|>": 50365, "<|extratoken_99|>": 50355, "<|extratoken_140|>": 50396, "<|extratoken_72|>": 50328, "<|extratoken_11|>": 50267, "<|extratoken_94|>": 50350, "<|extratoken_26|>": 50282, "<|extratoken_59|>": 50315, "<|extratoken_106|>": 50362, "<|extratoken_115|>": 50371, "<|extratoken_58|>": 50314, "<|extratoken_90|>": 50346, "<|extratoken_31|>": 50287, "<|extratoken_102|>": 50358, "<|extratoken_47|>": 50303, "<|extratoken_100|>": 50356, "<|extratoken_82|>": 50338, "<|extratoken_44|>": 50300, "<|extratoken_69|>": 50325, "<|extratoken_54|>": 50310, "<|extratoken_132|>": 50388, "<|extratoken_27|>": 50283, "<|extratoken_87|>": 50343, "<|extratoken_119|>": 50375} 2 | -------------------------------------------------------------------------------- /codegeex/tokenizer/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}} 2 | -------------------------------------------------------------------------------- /codegeex/tokenizer/tokenizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import * 3 | from transformers import AutoTokenizer 4 | from transformers.models.gpt2 import GPT2TokenizerFast 5 | 6 | 7 | def encode_whitespaces(text: str, start_extra_id: int, max_len: int): 8 | """ Encode whitespaces to extra tokens. 9 | 10 | >>> encode_whitespaces('a\\n b\\n c', 10, 10) 11 | 'a\\n<|extratoken_10|>b\\n<|extratoken_11|>c' 12 | """ 13 | for i in np.arange(max_len, 1, -1): 14 | text = text.replace(" " * i, f"<|extratoken_{start_extra_id + i - 2}|>") 15 | return text 16 | 17 | 18 | def decode_whitespaces(text: str, start_extra_id: int, max_len: int): 19 | """ Decode the whitespace-encoded strings produced by encode_whitespace. 
20 | 21 | >>> text = 'a\\n b\\n c' 22 | >>> s, l = 10, 10 23 | >>> text == decode_whitespaces(encode_whitespaces(text, s, l), s, l) 24 | True 25 | """ 26 | for l in range(2, max_len + 1): 27 | token_id = start_extra_id - 2 + l 28 | token = f'<|extratoken_{token_id}|>' 29 | text = text.replace(token, ' ' * l) 30 | return text 31 | 32 | 33 | class CodeGeeXTokenizer(object): 34 | def __init__( 35 | self, 36 | tokenizer: GPT2TokenizerFast = None, 37 | tokenizer_path: str = "EleutherAI/gpt-j-6B", 38 | start_extra_id: int = 10, 39 | max_len : int = 10, 40 | mode='codegeex-13b', 41 | dict_file: str = None, 42 | ): 43 | self.tokenizer = tokenizer if tokenizer is not None else AutoTokenizer.from_pretrained(tokenizer_path) 44 | if mode not in ['codegeex-13b']: 45 | raise ValueError(f"Invalid mode {mode}, choose from ['codegeex-13b']") 46 | self.start_extra_id = start_extra_id 47 | self.max_len = max_len 48 | self.mode = mode 49 | self.eos_token_id = self.tokenizer.eos_token_id 50 | 51 | def encode_code(self, code: str): 52 | if self.mode == 'codegeex-13b': 53 | code = encode_whitespaces(code, self.start_extra_id, self.max_len) 54 | input_ids = self.tokenizer(code, is_split_into_words=False, verbose=False).input_ids 55 | 56 | return input_ids 57 | 58 | def decode_code(self, input_ids): 59 | if self.mode == 'codegeex-13b': 60 | text = self.tokenizer.decode(input_ids, skip_special_tokens=False, verbose=False) 61 | output_code = decode_whitespaces(text, self.start_extra_id, self.max_len) 62 | 63 | return output_code -------------------------------------------------------------------------------- /codegeex/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "model_max_length": 2048, "special_tokens_map_file": null, "name_or_path": "gpt-j-6B", "from_slow": true, "tokenizer_class": "GPT2Tokenizer"} 2 | -------------------------------------------------------------------------------- /codegeex/torch/__init__.py: -------------------------------------------------------------------------------- 1 | from .codegeex_model import CodeGeeXModel -------------------------------------------------------------------------------- /codegeex/torch/get_ckpt_qkv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--load-path", 12 | type=str, 13 | default="/zhangpai24/workspace/ckpt_ms/ckpt_ms_213000_fp32_52224.pt") 14 | parser.add_argument("--save-path", 15 | type=str, 16 | default="/zhangpai24/workspace/ckpt_ms/ckpt_ms_213000_qkv.pt") 17 | 18 | args, _ = parser.parse_known_args() 19 | 20 | state_dict_path = args.load_path 21 | print("Loading state dict ...") 22 | sd = torch.load(state_dict_path, map_location="cpu") 23 | 24 | for i in range(40): 25 | if i < 39: 26 | query_weight = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.query.weight', 
None) 27 | query_bias = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.query.bias', None) 28 | key_weight = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.key.weight', None) 29 | key_bias = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.key.bias', None) 30 | value_weight = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.value.weight', None) 31 | value_bias = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.value.bias', None) 32 | qkv_weight = torch.cat([query_weight, key_weight, value_weight], dim=0) 33 | qkv_bias = torch.cat([query_bias, key_bias, value_bias]) 34 | sd['module']['language_model']['transformer'][f'layers.{i}.attention.query_key_value.weight'] = qkv_weight 35 | sd['module']['language_model']['transformer'][f'layers.{i}.attention.query_key_value.bias'] = qkv_bias 36 | else: 37 | tq_key_weight = sd['module']['language_model']['transformer'].pop('topQueryLayer.attention.key.weight', None) 38 | tq_key_bias = sd['module']['language_model']['transformer'].pop('topQueryLayer.attention.key.bias', None) 39 | tq_value_weight = sd['module']['language_model']['transformer'].pop('topQueryLayer.attention.value.weight', None) 40 | tq_value_bias = sd['module']['language_model']['transformer'].pop('topQueryLayer.attention.value.bias', None) 41 | tq_kv_weight = torch.cat([tq_key_weight, tq_value_weight], dim=0) 42 | tq_kv_bias = torch.cat([tq_key_bias, tq_value_bias]) 43 | sd['module']['language_model']['transformer']['topQueryLayer.attention.key_value.weight'] = tq_kv_weight 44 | sd['module']['language_model']['transformer']['topQueryLayer.attention.key_value.bias'] = tq_kv_bias 45 | 46 | save_ckpt_path = args.save_path 47 | torch.save(sd, save_ckpt_path) 48 | 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /configs/codegeex_13b.sh: -------------------------------------------------------------------------------- 1 | # CodeGeeX-13B configuration 2 | 3 | CHECKPOINT_PATH="" 4 | 5 | MODEL_ARGS="--num-layers 39 \ 6 | --hidden-size 5120 \ 7 | --num-attention-heads 40 \ 8 | --max-position-embeddings 2048 \ 9 | --attention-softmax-in-fp32 \ 10 | --load "$CHECKPOINT_PATH" \ 11 | --layernorm-epsilon 1e-5 \ 12 | --fp16 \ 13 | --ws-encoding-start-id 10 \ 14 | --ws-encoding-length 10 \ 15 | --make-vocab-size-divisible-by 52224 \ 16 | --seq-length 2048" -------------------------------------------------------------------------------- /configs/codegeex_13b_paddle.sh: -------------------------------------------------------------------------------- 1 | # CodeGeeX-13B paddle configuration 2 | 3 | CHECKPOINT_PATH="" 4 | 5 | MODEL_ARGS="--num-layers 39 \ 6 | --hidden-size 5120 \ 7 | --num-attention-heads 40 \ 8 | --max-position-embeddings 2048 \ 9 | --attention-softmax-in-fp32 \ 10 | --load "$CHECKPOINT_PATH" \ 11 | --layernorm-epsilon 1e-5 \ 12 | --fp16 \ 13 | --ws-encoding-start-id 10 \ 14 | --ws-encoding-length 10 \ 15 | --make-vocab-size-divisible-by 52224 \ 16 | --seq-length 2048" -------------------------------------------------------------------------------- /configs/codegeex_13b_parallel.sh: -------------------------------------------------------------------------------- 1 | # CodeGeeX-13B parallel configuration 2 | # Parallel checkpoints are named under the format "mp_rank_0{i}_model_states.pt", where i is the rank, start from 0. 
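# Illustrative layout (annotation, not part of the original config), assuming a 2-way
# model-parallel checkpoint saved under CHECKPOINT_PATH:
#   $CHECKPOINT_PATH/mp_rank_00_model_states.pt
#   $CHECKPOINT_PATH/mp_rank_01_model_states.pt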
3 | 4 | CHECKPOINT_PATH="" 5 | 6 | MODEL_ARGS="--num-layers 39 \ 7 | --hidden-size 5120 \ 8 | --num-attention-heads 40 \ 9 | --max-position-embeddings 2048 \ 10 | --attention-softmax-in-fp32 \ 11 | --load "$CHECKPOINT_PATH" \ 12 | --layernorm-epsilon 1e-5 \ 13 | --fp16 \ 14 | --ws-encoding-start-id 10 \ 15 | --ws-encoding-length 10 \ 16 | --make-vocab-size-divisible-by 52224 \ 17 | --seq-length 2048" -------------------------------------------------------------------------------- /deployment/example_inputs.jsonl: -------------------------------------------------------------------------------- 1 | {"code": "# Write a function that returns the sum of the numbers from 1 to n.\n# For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\n# You may assume that n is a positive integer.\ndef sum_of_numbers(n):", "langauge": "Python"} 2 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\n#include \nusing namespace std;\nint sum_of_numbers(int n) {", "langauge": "C++"} 3 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\n#include \n#include \nint sum(int n)\n{", "langauge": "C"} 4 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\nprivate int sum(int n) {", "langauge": "C#"} 5 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\npublic class SumOfNumbers {", "langauge": "Java"} 6 | {"code": "\n\n
", "langauge": "HTML"} 7 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n// If n is 0, then the function should return 0.\n// If n is less than 0, then the function should return -1.\n/**\n * @param {number} n\n * @return {number}\n */\nfunction sum ($n) {", "langauge": "PHP"} 8 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\nfunction sum(n) {", "langauge": "JavaScript"} 9 | {"code": "// Write a function that returns the sum of the numbers from 1 to n,\n// but using a for loop instead of a while loop.\n\nfunction sumForLoop(n) {", "langauge": "TypeScript"} 10 | {"code": "// Write a function that returns the sum of the numbers from 1 to n,\n// but using a for loop instead of a while loop.\n\nfunc sumN(n int) int {", "langauge": "Go"} 11 | {"code": "// Write a function that returns the sum of the numbers from 1 to n,\n// but using a for loop instead of a while loop.\n\nfn sum_numbers(n: usize) -> usize {", "langauge": "Rust"} 12 | {"code": "-- Search all the records from the table CodeGeeX\n-- Delete iterms with odd indices", "langauge": "SQL"} 13 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\nfun sum(n: Int): Int {", "langauge": "Kotlin"} 14 | {"code": "! Write a function that returns the sum of the numbers from 1 to n.\n! For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\n! Use the following header:\n! module sum_numbers\n! end\nmodule sum_numbers", "langauge": "Fortran"} 15 | {"code": "# Write a function that returns the sum of the numbers from 1 to n.\n# For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\nsum_numbers <- function(n) {", "langauge": "R"} 16 | -------------------------------------------------------------------------------- /generations/humaneval_python_generations.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/generations/humaneval_python_generations.jsonl.gz -------------------------------------------------------------------------------- /generations/humaneval_rust_generations.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/generations/humaneval_rust_generations.jsonl.gz -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fire>=0.4.0 2 | ipython>=8.4.0 3 | numpy>=1.22.0 4 | pandas>=1.3.5 5 | pyzmq>=23.2.1 6 | regex>=2022.3.15 7 | setuptools>=58.0.4 8 | transformers>=4.22.0 9 | torch>=1.10.0 10 | tqdm>=4.63.0 11 | cpm_kernels 12 | deepspeed>0.6.1 -------------------------------------------------------------------------------- /resources/api/api_step_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_1.png -------------------------------------------------------------------------------- /resources/api/api_step_2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_2.png -------------------------------------------------------------------------------- /resources/api/api_step_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_3.png -------------------------------------------------------------------------------- /resources/api/api_step_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_4.png -------------------------------------------------------------------------------- /resources/api/api_step_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_5.png -------------------------------------------------------------------------------- /resources/en/codegeex_training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/codegeex_training.png -------------------------------------------------------------------------------- /resources/en/hx_boxplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_boxplot.png -------------------------------------------------------------------------------- /resources/en/hx_examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_examples.png -------------------------------------------------------------------------------- /resources/en/hx_generattion_radar_horizon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_generattion_radar_horizon.png -------------------------------------------------------------------------------- /resources/en/hx_pass_rate_vs_language.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_pass_rate_vs_language.png -------------------------------------------------------------------------------- /resources/en/hx_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_tasks.png -------------------------------------------------------------------------------- /resources/en/hx_translation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_translation.png -------------------------------------------------------------------------------- /resources/logo/codegeex_logo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/logo/codegeex_logo.png -------------------------------------------------------------------------------- /resources/zh/hx_boxplot_zh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_boxplot_zh.png -------------------------------------------------------------------------------- /resources/zh/hx_generattion_radar_horizon_zh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_generattion_radar_horizon_zh.png -------------------------------------------------------------------------------- /resources/zh/hx_pass_rate_vs_language_zh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_pass_rate_vs_language_zh.png -------------------------------------------------------------------------------- /resources/zh/hx_tasks_zh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_tasks_zh.png -------------------------------------------------------------------------------- /resources/zh/hx_translation_zh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_translation_zh.png -------------------------------------------------------------------------------- /resources/zh/join_wechat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/join_wechat.png -------------------------------------------------------------------------------- /resources/zh/wechat.md: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | 扫码关注公众号加入「CodeGeeX交流群」
5 | Scan the QR code to join the "CodeGeeX WeChat Group"
6 |
7 | -------------------------------------------------------------------------------- /scripts/convert_ckpt_parallel.sh: -------------------------------------------------------------------------------- 1 | # This script is used to convert checkpoint model parallel partitions. 2 | 3 | LOAD_CKPT_PATH=$1 # Path to weights in .pt format. 4 | SAVE_CKPT_PATH=$2 # Path to save the output MP checkpoints. 5 | MP_SIZE=$3 # Model parallel size 6 | 7 | SCRIPT_PATH=$(realpath "$0") 8 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 9 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 10 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 11 | 12 | if [ -z "$MP_SIZE" ]; then 13 | MP_SIZE=1 14 | fi 15 | 16 | # export CUDA settings 17 | export CUDA_HOME=/usr/local/cuda-11.1/ 18 | export CUDA_VISIBLE_DEVICES=0,1 19 | 20 | 21 | CMD="python $MAIN_DIR/codegeex/megatron/convert_ckpt_parallel.py \ 22 | --load-ckpt-path $LOAD_CKPT_PATH \ 23 | --save-ckpt-path $SAVE_CKPT_PATH \ 24 | --tokenizer-path $TOKENIZER_PATH \ 25 | --target-tensor-model-parallel-size $MP_SIZE \ 26 | --num-layers 39 \ 27 | --hidden-size 5120 \ 28 | --num-attention-heads 40 \ 29 | --max-position-embeddings 2048 \ 30 | --attention-softmax-in-fp32 \ 31 | --fp16 \ 32 | --micro-batch-size 1 \ 33 | --make-vocab-size-divisible-by 52224 \ 34 | --seq-length 2048" 35 | 36 | echo "$CMD" 37 | eval "$CMD" -------------------------------------------------------------------------------- /scripts/convert_mindspore_to_megatron.sh: -------------------------------------------------------------------------------- 1 | # This script is used to convert mindspore checkpoint to the megatron format. 2 | 3 | NPY_CKPT_PATH=$1 # Path to Mindspore exported weights in .npy format. 4 | SAVE_CKPT_PATH=$2 # Path to save the output .pt checkpoint. 5 | GPU=$3 6 | 7 | SCRIPT_PATH=$(realpath "$0") 8 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 9 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 10 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 11 | 12 | # export CUDA settings 13 | if [ -z "$GPU" ]; then 14 | GPU=0 15 | fi 16 | 17 | export CUDA_HOME=/usr/local/cuda-11.1/ 18 | export CUDA_VISIBLE_DEVICES=$GPU 19 | 20 | 21 | CMD="python $MAIN_DIR/codegeex/megatron/mindspore_to_megatron.py \ 22 | --npy-ckpt-path $NPY_CKPT_PATH \ 23 | --save-ckpt-path $SAVE_CKPT_PATH \ 24 | --tokenizer-path $TOKENIZER_PATH \ 25 | $MODEL_ARGS" 26 | 27 | echo "$CMD" 28 | eval "$CMD" -------------------------------------------------------------------------------- /scripts/evaluate_humaneval_x.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from pathlib import Path 4 | from codegeex.benchmark.evaluate_humaneval_x import evaluate_functional_correctness 5 | #GLOBALS 6 | INPUT_FILE: str 7 | LANGUAGE: str 8 | N_WORKERS: int 9 | TIMEOUT: int 10 | 11 | 12 | parser = argparse.ArgumentParser("Debugging evaluate humaneval_x") 13 | # Path to the .jsonl file that contains the generated codes. 14 | parser.add_argument("-s","--samples", type=str) 15 | 16 | # Target programming language, currently support one of ["python", "java", "cpp", "js", "go"] 17 | parser.add_argument("-l","--language", default="python", type=str) 18 | 19 | # Number of parallel workers. 20 | parser.add_argument("-w","--workers", default=64, type=int) 21 | 22 | # Timeout in seconds. 
23 | parser.add_argument("-t","--timeout", default=5, type=int) 24 | 25 | args = parser.parse_args() 26 | 27 | INPUT_FILE = args.samples 28 | LANGUAGE = args.language 29 | N_WORKERS = args.workers 30 | TIMEOUT = args.timeout 31 | 32 | 33 | 34 | SCRIPT_PATH: str = Path(os.path.abspath(__file__)) 35 | print(SCRIPT_PATH) 36 | SCRIPT_DIR: str = os.path.dirname(SCRIPT_PATH) 37 | print(SCRIPT_DIR) 38 | MAIN_DIR: str = os.path.dirname(SCRIPT_DIR) 39 | print(MAIN_DIR) 40 | 41 | DATA_DIR=os.path.join(MAIN_DIR,"codegeex/benchmark/humaneval-x/" + LANGUAGE + "/data/humaneval_" + LANGUAGE + ".jsonl.gz") 42 | print(DATA_DIR) 43 | 44 | TMP_DIR=os.path.join(MAIN_DIR, "codegeex/benchmark/humaneval-x/") 45 | 46 | 47 | #Debugging 48 | INPUT_FILE='/home/rog0d/Escritorio/CodeGeeX/generations/humaneval_rust_generations.jsonl.gz' 49 | LANGUAGE='rust' 50 | DATA_DIR=os.path.join(MAIN_DIR,"codegeex/benchmark/humaneval-x/" + LANGUAGE + "/data/humaneval_" + LANGUAGE + ".jsonl.gz") 51 | 52 | """ 53 | input_file: str = None, 54 | tmp_dir: str = "./", 55 | n_workers: int = 32, 56 | timeout: float = 5.0, 57 | problem_file: str = "../data/humaneval_python.jsonl.gz", 58 | out_dir: str = None, 59 | k: List[int] = [1, 10, 100], 60 | test_groundtruth: bool = False, 61 | example_test: bool = False, 62 | 63 | """ 64 | 65 | evaluate_functional_correctness(input_file=INPUT_FILE, 66 | n_workers=N_WORKERS, 67 | tmp_dir=TMP_DIR, 68 | problem_file=DATA_DIR, 69 | timeout=300.0) 70 | 71 | 72 | -------------------------------------------------------------------------------- /scripts/evaluate_humaneval_x.sh: -------------------------------------------------------------------------------- 1 | # This script is for evaluating the functional correctness of the generated codes of HumanEval-X. 2 | 3 | INPUT_FILE=$1 # Path to the .jsonl file that contains the generated codes. 4 | LANGUAGE=$2 # Target programming language, currently support one of ["python", "java", "cpp", "js", "go"] 5 | N_WORKERS=$3 # Number of parallel workers. 6 | TIMEOUT=$4 # Timeout in seconds. 
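# Example invocation (illustrative values; $2-$4 fall back to the defaults set further below,
# and the sample generations file ships with the repository under generations/):
#   bash scripts/evaluate_humaneval_x.sh generations/humaneval_python_generations.jsonl.gz python 64 5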
7 | 8 | SCRIPT_PATH=$(realpath "$0") 9 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 10 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 11 | 12 | echo "$INPUT_FILE" 13 | 14 | if [ -z "$N_WORKERS" ] 15 | then 16 | N_WORKERS=64 17 | fi 18 | 19 | if [ -z "$LANGUAGE" ] 20 | then 21 | LANGUAGE=python 22 | fi 23 | 24 | if [ -z "$TIMEOUT" ] 25 | then 26 | TIMEOUT=5 27 | fi 28 | 29 | DATA_DIR=$MAIN_DIR/codegeex/benchmark/humaneval-x/$LANGUAGE/data/humaneval_$LANGUAGE.jsonl.gz 30 | 31 | if [ $LANGUAGE = go ]; then 32 | export PATH=$PATH:/usr/local/go/bin 33 | fi 34 | 35 | if [ $LANGUAGE = cpp ]; then 36 | export PATH=$PATH:/usr/bin/openssl 37 | fi 38 | 39 | CMD="python $MAIN_DIR/codegeex/benchmark/humaneval-x/evaluate_humaneval_x.py \ 40 | --input_file "$INPUT_FILE" \ 41 | --n_workers $N_WORKERS \ 42 | --tmp_dir $MAIN_DIR/codegeex/benchmark/humaneval-x/ \ 43 | --problem_file $DATA_DIR \ 44 | --timeout $TIMEOUT" 45 | 46 | echo "$CMD" 47 | eval "$CMD" -------------------------------------------------------------------------------- /scripts/finetune_codegeex.sh: -------------------------------------------------------------------------------- 1 | SCRIPT_PATH=$(realpath "$0") 2 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 3 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 4 | 5 | # ====== Environment ====== 6 | # - NCCL & IB 7 | export NCCL_DEBUG=info 8 | export NCCL_IB_DISABLE=0 9 | export NCCL_IB_GID_INDEX=3 10 | 11 | HOSTFILE="" 12 | MASTER_IP=$(cat $HOSTFILE | head -n 1) 13 | cat $HOSTFILE | awk '{print $1 " slots=8"}' > $SCRIPT_DIR/hostfile 14 | echo "MASTER_IP=$MASTER_IP" 15 | 16 | # ====== Parameters ====== 17 | DATA_PATH="" 18 | CKPT_PATH="" 19 | DS_CONFIG=ds_config.json 20 | # - 13b 21 | TP=1 22 | PP=1 23 | NLAYERS=39 24 | HIDDEN=5120 25 | NATTN_HEAD=40 26 | EMBED_VOCAB=52224 27 | GLOBAL_BATCH=560 28 | MICRO_BATCH=10 29 | NTRAIN_ITERS=100000 30 | EVAL_INT=10 31 | SAVE_INT=10 32 | TRIAL_TAG="13b-test" 33 | # - trial 34 | TRIAL_NAME="pretrain-codegeex" 35 | # - zero stage 36 | ZERO_STAGE=2 37 | # - logging & output 38 | NOW=$(date +"%Y%m%d_%H%M%S") 39 | OUTPUT_DIR="-$TRIAL_NAME-$TRIAL_TAG" 40 | TB_DIR=$OUTPUT_DIR/tb$NOW 41 | mkdir -p $OUTPUT_DIR 42 | mkdir -p $TB_DIR 43 | 44 | # Deepspeed config 45 | cat < $DS_CONFIG 46 | { 47 | "train_batch_size" : $GLOBAL_BATCH, 48 | "train_micro_batch_size_per_gpu": $MICRO_BATCH, 49 | "steps_per_print": 5, 50 | "zero_optimization": { 51 | "stage": $ZERO_STAGE, 52 | "reduce_bucket_size": 50000000, 53 | "allgather_bucket_size": 50000000, 54 | "overlap_comm": true, 55 | "contiguous_gradients": false 56 | }, 57 | "fp16": { 58 | "enabled": true, 59 | "loss_scale": 0, 60 | "loss_scale_window": 500, 61 | "hysteresis": 2, 62 | "min_loss_scale": 1, 63 | "initial_scale_power": 12 64 | }, 65 | "wall_clock_breakdown" : true 66 | } 67 | EOT 68 | 69 | ds_args="" 70 | ds_args=" --deepspeed ${ds_args}" 71 | ds_args=" --no-pipeline-parallel ${ds_args}" 72 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 73 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 74 | ds_args=" --deepspeed-activation-checkpointing ${ds_args}" 75 | 76 | echo "Launching deepspeed" 77 | deepspeed \ 78 | --hostfile hostfile \ 79 | --master_addr $MASTER_IP \ 80 | $MAIN_DIR/codegeex/megatron/tools/pretrain_codegeex.py \ 81 | --tensor-model-parallel-size $TP \ 82 | --pipeline-model-parallel-size $PP \ 83 | --no-pipeline-parallel \ 84 | --num-layers $NLAYERS \ 85 | --hidden-size $HIDDEN \ 86 | --make-vocab-size-divisible-by $EMBED_VOCAB \ 87 | --num-attention-heads $NATTN_HEAD \ 88 | --seq-length 512 \ 89 | --loss-scale 12 \ 90 | 
--max-position-embeddings 2048 \ 91 | --micro-batch-size $MICRO_BATCH \ 92 | --global-batch-size $GLOBAL_BATCH \ 93 | --train-iters $NTRAIN_ITERS \ 94 | --lr 1e-6 \ 95 | --min-lr 1e-7 \ 96 | --lr-decay-iters 100000 \ 97 | --lr-decay-style cosine \ 98 | --lr-warmup-iters 1000 \ 99 | --log-interval 1 \ 100 | --eval-iters 10 \ 101 | --eval-interval $EVAL_INT \ 102 | --data-path $DATA_PATH \ 103 | --vocab-file $MAIN_DIR/codegeex/tokenizer/vocab.json \ 104 | --merge-file $MAIN_DIR/codegeex/tokenizer/merges.txt \ 105 | --save-interval $SAVE_INT \ 106 | --save $OUTPUT_DIR \ 107 | --load $OUTPUT_DIR \ 108 | --load-state $CKPT_PATH \ 109 | --split 98,2,0 \ 110 | --clip-grad 1.0 \ 111 | --weight-decay 0.1 \ 112 | --adam-beta1 0.9 \ 113 | --adam-beta2 0.95 \ 114 | --fp16 \ 115 | --ln-fp16 \ 116 | --attention-softmax-in-fp32 \ 117 | --checkpoint-activations \ 118 | --override-lr-scheduler \ 119 | --tensorboard-dir $TB_DIR \ 120 | $ds_args |& tee ${OUTPUT_DIR}/$NOW.log -------------------------------------------------------------------------------- /scripts/gather_output.sh: -------------------------------------------------------------------------------- 1 | # This script is used to gather the distributed outputs of different ranks. 2 | 3 | OUTPUT_DIR=$1 4 | OUTPUT_PREFIX=$2 5 | IF_REMOVE_RANK_FILES=$3 6 | 7 | echo "$OUTPUT_DIR" 8 | echo "$OUTPUT_PREFIX" 9 | 10 | if [ -z "$IF_REMOVE_RANK_FILES" ] 11 | then 12 | IF_REMOVE_RANK_FILES=0 13 | fi 14 | 15 | SCRIPT_PATH=$(realpath "$0") 16 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 17 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 18 | 19 | 20 | CMD="python $MAIN_DIR/codegeex/benchmark/gather_output.py \ 21 | --output_dir $OUTPUT_DIR \ 22 | --output_prefix $OUTPUT_PREFIX \ 23 | --if_remove_rank_files $IF_REMOVE_RANK_FILES" 24 | 25 | echo "$CMD" 26 | eval "$CMD" -------------------------------------------------------------------------------- /scripts/generate_humaneval_x.sh: -------------------------------------------------------------------------------- 1 | # This script is used to generate solutions of HumanEval-X. 2 | 3 | LANGUAGE=$1 # Target programming language, currently support one of ["python", "java", "cpp", "js", "go"] 4 | OUTPUT_PATH=$2 # Output path of the generated programs. 
5 | HOSTLIST=$3 # Provide hostfile if generating distributedly 6 | 7 | SCRIPT_PATH=$(realpath "$0") 8 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 9 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 10 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 11 | 12 | # export CUDA settings 13 | export CUDA_HOME=/usr/local/cuda-11.1/ 14 | 15 | # import model configuration 16 | source "$MAIN_DIR/configs/codegeex_13b.sh" 17 | 18 | # nccl options 19 | OPTIONS_NCCL="export NCCL_DEBUG=warn; export NCCL_IB_DISABLE=0; export NCCL_IB_GID_INDEX=3" 20 | OPTIONS_PATH="export PATH=$PATH; export LD_LIBRARY_PATH=$LD_LIBRARY_PATH" 21 | CWD=$(pwd) 22 | 23 | # set master ip for zmq server 24 | if [ -z "$HOSTLIST" ]; then 25 | ZMQ_ADDR=$(hostname -i) 26 | echo "$ZMQ_ADDR" > "./hostfile" 27 | HOSTLIST="./hostfile" 28 | else 29 | ZMQ_ADDR=$(cat $HOSTLIST | head -n 1) 30 | fi 31 | echo "master_ip: $ZMQ_ADDR" 32 | 33 | NUM_SAMPLES=1 34 | MICRO_BSZ=1 35 | WORLD_SIZE=1 36 | TEMP=0.8 37 | TOPP=0.95 38 | SEED=42 39 | DATASET=humaneval 40 | TODAY=$(date +%y%m%d) 41 | CHANNEL_PORT=$(expr $RANDOM + 5000) 42 | MASTER_PORT=$(expr $RANDOM + 8000) 43 | 44 | # save log file 45 | LOG_DIR=$MAIN_DIR/log 46 | mkdir -p "$LOG_DIR" 47 | LOG_PATH="$LOG_DIR/$TODAY-generation.log" 48 | 49 | if [ -z "$LANGUAGE" ]; then 50 | LANGUAGE=python 51 | fi 52 | 53 | if [ -z "$INPUT_PATH" ]; then 54 | INPUT_PATH=$MAIN_DIR/codegeex/benchmark/humaneval-x/$LANGUAGE/data/humaneval_$LANGUAGE.jsonl.gz 55 | fi 56 | 57 | if [ -z "$OUTPUT_PATH" ]; then 58 | OUTPUT_PATH=$MAIN_DIR/codegeex/benchmark/output/humaneval-x/codegeex/ 59 | mkdir -p "$OUTPUT_PATH" 60 | fi 61 | 62 | JOB_ID=codegeex-ns$NUM_SAMPLES-t$TEMP-topp$TOPP-seed$SEED-$LANGUAGE 63 | 64 | RUN_CMD="python \ 65 | $MAIN_DIR/codegeex/benchmark/humaneval-x/generate_humaneval_x.py \ 66 | --hostfile $HOSTLIST \ 67 | --channel-ip $ZMQ_ADDR \ 68 | --channel-port $CHANNEL_PORT \ 69 | --master-port $MASTER_PORT \ 70 | --tokenizer-path $TOKENIZER_PATH \ 71 | --load-deepspeed \ 72 | --temperature $TEMP \ 73 | --top-p $TOPP \ 74 | --out-seq-length 1024 \ 75 | --micro-batch-size $MICRO_BSZ \ 76 | --samples-per-problem $NUM_SAMPLES \ 77 | --language-type $LANGUAGE \ 78 | --dataset $DATASET \ 79 | --input-path $INPUT_PATH \ 80 | --output-prefix $OUTPUT_PATH/$JOB_ID \ 81 | --gen-node-world-size $WORLD_SIZE \ 82 | --seed $SEED \ 83 | $MODEL_ARGS" 84 | 85 | RUN_CMD="$OPTIONS_NCCL; $OPTIONS_PATH; $RUN_CMD" 86 | RUN_CMD="cd $CWD; $RUN_CMD" 87 | 88 | if (( WORLD_SIZE != 1 )); then 89 | RUN_CMD="pdsh -R ssh -w ^$HOSTLIST \"$RUN_CMD\"" 90 | fi 91 | 92 | echo "$RUN_CMD" 93 | echo "Writing log to $LOG_PATH" 94 | eval "$RUN_CMD" > "$LOG_PATH" 95 | bash $MAIN_DIR/scripts/gather_output.sh $OUTPUT_PATH $JOB_ID 1 96 | -------------------------------------------------------------------------------- /scripts/pretrain_codegeex.sh: -------------------------------------------------------------------------------- 1 | SCRIPT_PATH=$(realpath "$0") 2 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 3 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 4 | 5 | # ====== Environment ====== 6 | # - NCCL & IB 7 | export NCCL_DEBUG=info 8 | export NCCL_IB_DISABLE=0 9 | export NCCL_IB_GID_INDEX=3 10 | 11 | HOSTFILE="" 12 | MASTER_IP=$(cat $HOSTFILE | head -n 1) 13 | cat $HOSTFILE | awk '{print $1 " slots=8"}' > $SCRIPT_DIR/hostfile 14 | echo "MASTER_IP=$MASTER_IP" 15 | 16 | # ====== Parameters ====== 17 | DATA_PATH="" 18 | CKPT_PATH="" 19 | DS_CONFIG=ds_config.json 20 | # - 13b 21 | TP=1 22 | PP=1 23 | NLAYERS=39 24 | HIDDEN=5120 25 | NATTN_HEAD=40 26 | EMBED_VOCAB=52224 27 | GLOBAL_BATCH=560 
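# Note on the batch-size settings above (assuming standard DeepSpeed semantics):
# train_batch_size = micro_batch_size_per_gpu x gradient_accumulation_steps x data-parallel world size,
# so GLOBAL_BATCH must be divisible by MICRO_BATCH times the number of GPUs in use.
# For example, on a single 8-GPU node, 560 = 10 x 7 x 8, i.e. 7 gradient-accumulation steps.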
28 | MICRO_BATCH=10 29 | NTRAIN_ITERS=100000 30 | EVAL_INT=10 31 | SAVE_INT=10 32 | TRIAL_TAG="13b-test" 33 | # - trial 34 | TRIAL_NAME="pretrain-codegeex" 35 | # - zero stage 36 | ZERO_STAGE=2 37 | # - logging & output 38 | NOW=$(date +"%Y%m%d_%H%M%S") 39 | OUTPUT_DIR="-$TRIAL_NAME-$TRIAL_TAG" 40 | TB_DIR=$OUTPUT_DIR/tb$NOW 41 | mkdir -p $OUTPUT_DIR 42 | mkdir -p $TB_DIR 43 | 44 | # Deepspeed config 45 | cat < $DS_CONFIG 46 | { 47 | "train_batch_size" : $GLOBAL_BATCH, 48 | "train_micro_batch_size_per_gpu": $MICRO_BATCH, 49 | "steps_per_print": 5, 50 | "zero_optimization": { 51 | "stage": $ZERO_STAGE, 52 | "reduce_bucket_size": 50000000, 53 | "allgather_bucket_size": 50000000, 54 | "overlap_comm": true, 55 | "contiguous_gradients": false 56 | }, 57 | "fp16": { 58 | "enabled": true, 59 | "loss_scale": 0, 60 | "loss_scale_window": 500, 61 | "hysteresis": 2, 62 | "min_loss_scale": 1, 63 | "initial_scale_power": 12 64 | }, 65 | "wall_clock_breakdown" : true 66 | } 67 | EOT 68 | 69 | ds_args="" 70 | ds_args=" --deepspeed ${ds_args}" 71 | ds_args=" --no-pipeline-parallel ${ds_args}" 72 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 73 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 74 | ds_args=" --deepspeed-activation-checkpointing ${ds_args}" 75 | 76 | echo "Launching deepspeed" 77 | deepspeed \ 78 | --hostfile hostfile \ 79 | --master_addr $MASTER_IP \ 80 | $MAIN_DIR/codegeex/megatron/tools/pretrain_codegeex.py \ 81 | --tensor-model-parallel-size $TP \ 82 | --pipeline-model-parallel-size $PP \ 83 | --no-pipeline-parallel \ 84 | --num-layers $NLAYERS \ 85 | --hidden-size $HIDDEN \ 86 | --make-vocab-size-divisible-by $EMBED_VOCAB \ 87 | --num-attention-heads $NATTN_HEAD \ 88 | --seq-length 512 \ 89 | --loss-scale 12 \ 90 | --max-position-embeddings 2048 \ 91 | --micro-batch-size $MICRO_BATCH \ 92 | --global-batch-size $GLOBAL_BATCH \ 93 | --train-iters $NTRAIN_ITERS \ 94 | --lr 2e-4 \ 95 | --min-lr 1e-7 \ 96 | --lr-decay-iters 100000 \ 97 | --lr-decay-style cosine \ 98 | --lr-warmup-iters 1500 \ 99 | --log-interval 1 \ 100 | --eval-iters 10 \ 101 | --eval-interval $EVAL_INT \ 102 | --data-path $DATA_PATH \ 103 | --vocab-file $MAIN_DIR/codegeex/tokenizer/vocab.json \ 104 | --merge-file $MAIN_DIR/codegeex/tokenizer/merges.txt \ 105 | --save-interval $SAVE_INT \ 106 | --save $OUTPUT_DIR \ 107 | --load $OUTPUT_DIR \ 108 | --load-state $CKPT_PATH \ 109 | --split 98,2,0 \ 110 | --clip-grad 1.0 \ 111 | --weight-decay 0.1 \ 112 | --adam-beta1 0.9 \ 113 | --adam-beta2 0.95 \ 114 | --fp16 \ 115 | --ln-fp16 \ 116 | --attention-softmax-in-fp32 \ 117 | --checkpoint-activations \ 118 | --override-lr-scheduler \ 119 | --tensorboard-dir $TB_DIR \ 120 | $ds_args |& tee ${OUTPUT_DIR}/$NOW.log 121 | 122 | -------------------------------------------------------------------------------- /scripts/process_pretrain_dataset.sh: -------------------------------------------------------------------------------- 1 | # Process dataset for CodeGeeX pretraining 2 | 3 | DATASET_PATH=$1 4 | OUTPUT_PATH=$2 5 | LANGUAGE=$3 6 | 7 | SCRIPT_PATH=$(realpath "$0") 8 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 9 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 10 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 11 | 12 | if [ -z "$LANGUAGE" ]; then 13 | LANGUAGE=python 14 | fi 15 | 16 | CMD="python $MAIN_DIR/codegeex/data/process_pretrain_dataset.py \ 17 | --dataset_path $DATASET_PATH \ 18 | --tokenizer_path $TOKENIZER_PATH \ 19 | --output_prefix $OUTPUT_PATH \ 20 | --language $LANGUAGE \ 21 | --mode pretrain \ 22 | --seq_len 2048" 23 | 24 | echo 
"$CMD" 25 | eval "$CMD" -------------------------------------------------------------------------------- /scripts/test_inference.sh: -------------------------------------------------------------------------------- 1 | # This script is used to test the inference of CodeGeeX. 2 | 3 | GPU=$1 4 | PROMPT_FILE=$2 5 | 6 | SCRIPT_PATH=$(realpath "$0") 7 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 8 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 9 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 10 | 11 | # import model configuration 12 | source "$MAIN_DIR/configs/codegeex_13b.sh" 13 | 14 | # export CUDA settings 15 | if [ -z "$GPU" ]; then 16 | GPU=0 17 | fi 18 | 19 | export CUDA_HOME=/usr/local/cuda-11.1/ 20 | export CUDA_VISIBLE_DEVICES=$GPU 21 | 22 | if [ -z "$PROMPT_FILE" ]; then 23 | PROMPT_FILE=$MAIN_DIR/tests/test_prompt.txt 24 | fi 25 | 26 | # remove --greedy if using sampling 27 | CMD="python $MAIN_DIR/tests/test_inference.py \ 28 | --prompt-file $PROMPT_FILE \ 29 | --tokenizer-path $TOKENIZER_PATH \ 30 | --micro-batch-size 1 \ 31 | --out-seq-length 1024 \ 32 | --temperature 0.8 \ 33 | --top-p 0.95 \ 34 | --top-k 0 \ 35 | --greedy \ 36 | $MODEL_ARGS" 37 | 38 | echo "$CMD" 39 | eval "$CMD" 40 | -------------------------------------------------------------------------------- /scripts/test_inference_oneflow.sh: -------------------------------------------------------------------------------- 1 | # This script is used to test the inference of CodeGeeX. 2 | 3 | GPU=$1 4 | PROMPT_FILE=$2 5 | 6 | SCRIPT_PATH=$(realpath "$0") 7 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 8 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 9 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 10 | 11 | # import model configuration 12 | source "$MAIN_DIR/configs/codegeex_13b.sh" 13 | 14 | # export CUDA settings 15 | if [ -z "$GPU" ]; then 16 | GPU=0 17 | fi 18 | 19 | export CUDA_HOME=/usr/local/cuda-11.1/ 20 | export CUDA_VISIBLE_DEVICES=$GPU 21 | 22 | if [ -z "$PROMPT_FILE" ]; then 23 | PROMPT_FILE=$MAIN_DIR/tests/test_prompt.txt 24 | fi 25 | 26 | # remove --greedy if using sampling 27 | CMD="python $MAIN_DIR/tests/test_inference_oneflow.py \ 28 | --prompt-file $PROMPT_FILE \ 29 | --tokenizer-path $TOKENIZER_PATH \ 30 | --micro-batch-size 1 \ 31 | --out-seq-length 1024 \ 32 | --temperature 0.8 \ 33 | --top-p 0.95 \ 34 | --top-k 0 \ 35 | --greedy \ 36 | $MODEL_ARGS" 37 | 38 | echo "$CMD" 39 | eval "$CMD" 40 | -------------------------------------------------------------------------------- /scripts/test_inference_oneflow_quantized.sh: -------------------------------------------------------------------------------- 1 | # This script is used to test the inference of CodeGeeX. 
2 | 3 | GPU=$1 4 | PROMPT_FILE=$2 5 | 6 | SCRIPT_PATH=$(realpath "$0") 7 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 8 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 9 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 10 | 11 | # import model configuration 12 | source "$MAIN_DIR/configs/codegeex_13b.sh" 13 | 14 | # export CUDA settings 15 | if [ -z "$GPU" ]; then 16 | GPU=1 17 | fi 18 | 19 | export CUDA_HOME=/usr/local/cuda-11.1/ 20 | export CUDA_VISIBLE_DEVICES=$GPU 21 | 22 | if [ -z "$PROMPT_FILE" ]; then 23 | PROMPT_FILE=$MAIN_DIR/tests/test_prompt.txt 24 | fi 25 | 26 | # remove --greedy if using sampling 27 | CMD="python $MAIN_DIR/tests/test_inference_oneflow.py \ 28 | --prompt-file $PROMPT_FILE \ 29 | --tokenizer-path $TOKENIZER_PATH \ 30 | --micro-batch-size 1 \ 31 | --out-seq-length 1024 \ 32 | --temperature 0.2 \ 33 | --top-p 0.95 \ 34 | --top-k 0 \ 35 | --quantize \ 36 | $MODEL_ARGS" 37 | 38 | echo "$CMD" 39 | eval "$CMD" 40 | -------------------------------------------------------------------------------- /scripts/test_inference_paddle.sh: -------------------------------------------------------------------------------- 1 | # This script is used to test the inference of CodeGeeX. 2 | 3 | GPU=$1 4 | PROMPT_FILE=$2 5 | 6 | SCRIPT_PATH=$(realpath "$0") 7 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 8 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 9 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 10 | 11 | # import model configuration 12 | source "$MAIN_DIR/configs/codegeex_13b_paddle.sh" 13 | 14 | # export CUDA settings 15 | if [ -z "$GPU" ]; then 16 | GPU=0 17 | fi 18 | 19 | export CUDA_HOME=/usr/local/cuda-11.1/ 20 | export CUDA_VISIBLE_DEVICES=$GPU 21 | 22 | if [ -z "$PROMPT_FILE" ]; then 23 | PROMPT_FILE=$MAIN_DIR/tests/test_prompt.txt 24 | fi 25 | 26 | # remove --greedy if using sampling 27 | CMD="python $MAIN_DIR/tests/test_inference_paddle.py \ 28 | --prompt-file $PROMPT_FILE \ 29 | --tokenizer-path $TOKENIZER_PATH \ 30 | --micro-batch-size 1 \ 31 | --out-seq-length 1024 \ 32 | --temperature 0.8 \ 33 | --top-p 0.95 \ 34 | --top-k 0 \ 35 | --greedy \ 36 | $MODEL_ARGS" 37 | 38 | echo "$CMD" 39 | eval "$CMD" 40 | -------------------------------------------------------------------------------- /scripts/test_inference_parallel.sh: -------------------------------------------------------------------------------- 1 | # This script is used to test the inference of CodeGeeX. 
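# Model-parallel variant: launches MP_SIZE processes via torchrun and sources the parallel config
# when MP_SIZE > 1. Illustrative invocation (assumes a checkpoint already split into MP_SIZE
# partitions, e.g. with scripts/convert_ckpt_parallel.sh):
#   bash scripts/test_inference_parallel.sh 2 tests/test_prompt.txt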
2 | 3 | MP_SIZE=$1 4 | PROMPT_FILE=$2 5 | 6 | SCRIPT_PATH=$(realpath "$0") 7 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 8 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 9 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 10 | 11 | if [ -z "$MP_SIZE" ]; then 12 | MP_SIZE=1 13 | fi 14 | 15 | if [ "$MP_SIZE" -eq 1 ]; then 16 | source "$MAIN_DIR/configs/codegeex_13b.sh" 17 | echo "Load config from $MAIN_DIR/configs/codegeex_13b.sh" 18 | else 19 | source "$MAIN_DIR/configs/codegeex_13b_parallel.sh" 20 | echo "Load config from $MAIN_DIR/configs/codegeex_13b_parallel.sh" 21 | fi 22 | 23 | # export CUDA settings 24 | export CUDA_HOME=/usr/local/cuda-11.1/ 25 | # export CUDA_VISIBLE_DEVICES=0,1 26 | 27 | if [ -z "$PROMPT_FILE" ]; then 28 | PROMPT_FILE=$MAIN_DIR/tests/test_prompt.txt 29 | fi 30 | 31 | # remove --greedy if using sampling 32 | CMD="torchrun --nproc_per_node $MP_SIZE $MAIN_DIR/tests/test_inference_megatron.py \ 33 | --tensor-model-parallel-size $MP_SIZE \ 34 | --prompt-file $PROMPT_FILE \ 35 | --tokenizer-path $TOKENIZER_PATH \ 36 | --micro-batch-size 1 \ 37 | --out-seq-length 1024 \ 38 | --temperature 0.8 \ 39 | --top-p 0.95 \ 40 | --top-k 0 \ 41 | --greedy \ 42 | --use-cpu-initialization \ 43 | --ln-fp16 \ 44 | $MODEL_ARGS" 45 | 46 | echo "$CMD" 47 | eval "$CMD" 48 | -------------------------------------------------------------------------------- /scripts/test_inference_quantized.sh: -------------------------------------------------------------------------------- 1 | # This script is used to test the inference of CodeGeeX. 2 | 3 | GPU=$1 4 | PROMPT_FILE=$2 5 | 6 | SCRIPT_PATH=$(realpath "$0") 7 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 8 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 9 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 10 | 11 | # import model configuration 12 | source "$MAIN_DIR/configs/codegeex_13b.sh" 13 | 14 | # export CUDA settings 15 | if [ -z "$GPU" ]; then 16 | GPU=0 17 | fi 18 | 19 | export CUDA_HOME=/usr/local/cuda-11.1/ 20 | export CUDA_VISIBLE_DEVICES=$GPU 21 | 22 | if [ -z "$PROMPT_FILE" ]; then 23 | PROMPT_FILE=$MAIN_DIR/tests/test_prompt.txt 24 | fi 25 | 26 | # remove --greedy if using sampling 27 | CMD="python $MAIN_DIR/tests/test_inference.py \ 28 | --prompt-file $PROMPT_FILE \ 29 | --tokenizer-path $TOKENIZER_PATH \ 30 | --micro-batch-size 1 \ 31 | --out-seq-length 1024 \ 32 | --temperature 0.2 \ 33 | --top-p 0.95 \ 34 | --top-k 0 \ 35 | --quantize \ 36 | $MODEL_ARGS" 37 | 38 | echo "$CMD" 39 | eval "$CMD" 40 | -------------------------------------------------------------------------------- /scripts/translate_humaneval_x.sh: -------------------------------------------------------------------------------- 1 | # This script is used to translate solutions of HumanEval-X. 2 | 3 | LANG_SRC_TYPE=$1 # Source programming language, currently support one of ["python", "java", "cpp", "js", "go"] 4 | LANG_TGT_TYPE=$2 # Target programming language, currently support one of ["python", "java", "cpp", "js", "go"] 5 | OUTPUT_PATH=$3 # Output path of the generated programs. 
6 | HOSTLIST=$4 # Provide hostfile if generating distributedly 7 | 8 | SCRIPT_PATH=$(realpath "$0") 9 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 10 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 11 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 12 | 13 | # export CUDA settings 14 | export CUDA_HOME=/usr/local/cuda-11.1/ 15 | 16 | # import model configuration 17 | source "$MAIN_DIR/configs/codegeex_13b.sh" 18 | 19 | # nccl options 20 | OPTIONS_NCCL="export NCCL_DEBUG=warn; export NCCL_IB_DISABLE=0; export NCCL_IB_GID_INDEX=3" 21 | OPTIONS_PATH="export PATH=$PATH; export LD_LIBRARY_PATH=$LD_LIBRARY_PATH" 22 | CWD=$(pwd) 23 | 24 | # set master ip for zmq server 25 | if [ -z "$HOSTLIST" ]; then 26 | ZMQ_ADDR=$(hostname -i) 27 | echo "$ZMQ_ADDR" > "./hostfile" 28 | HOSTLIST="./hostfile" 29 | else 30 | ZMQ_ADDR=$(cat $HOSTLIST | head -n 1) 31 | fi 32 | echo "master_ip: $ZMQ_ADDR" 33 | 34 | NUM_SAMPLES=1 35 | MICRO_BSZ=1 36 | WORLD_SIZE=1 37 | TEMP=0.8 38 | TOPP=0.95 39 | SEED=42 40 | DATASET=humaneval 41 | TODAY=$(date +%y%m%d) 42 | CHANNEL_PORT=$(expr $RANDOM + 5000) 43 | MASTER_PORT=$(expr $RANDOM + 8000) 44 | 45 | # save log file 46 | LOG_DIR=$MAIN_DIR/log 47 | mkdir -p "$LOG_DIR" 48 | LOG_PATH="$LOG_DIR/$TODAY-translation.log" 49 | 50 | if [ -z "$LANG_SRC_TYPE" ] 51 | then 52 | LANG_SRC_TYPE=python 53 | fi 54 | 55 | if [ -z "$LANG_TGT_TYPE" ] 56 | then 57 | LANG_TGT_TYPE=java 58 | fi 59 | 60 | if [ -z "$INPUT_SRC_PATH" ] 61 | then 62 | INPUT_SRC_PATH=$MAIN_DIR/codegeex/benchmark/humaneval-x/$LANG_SRC_TYPE/data/humaneval_$LANG_SRC_TYPE.jsonl.gz 63 | fi 64 | 65 | if [ -z "$INPUT_TGT_PATH" ] 66 | then 67 | INPUT_TGT_PATH=$MAIN_DIR/codegeex/benchmark/humaneval-x/$LANG_TGT_TYPE/data/humaneval_$LANG_TGT_TYPE.jsonl.gz 68 | fi 69 | 70 | if [ -z "$OUTPUT_PATH" ]; then 71 | OUTPUT_PATH=$MAIN_DIR/codegeex/benchmark/output/humaneval-x/codegeex/ 72 | mkdir -p "$OUTPUT_PATH" 73 | fi 74 | 75 | JOB_ID=codegeex-ns$NUM_SAMPLES-t$TEMP-topp$TOPP-seed$SEED-$LANGUAGE 76 | 77 | RUN_CMD="python \ 78 | $MAIN_DIR/codegeex/benchmark/humaneval-x/translate_humaneval_x.py \ 79 | --hostfile $HOSTLIST \ 80 | --channel-ip $ZMQ_ADDR \ 81 | --channel-port $CHANNEL_PORT \ 82 | --master-port $MASTER_PORT \ 83 | --tokenizer-path $TOKENIZER_PATH \ 84 | --load-deepspeed \ 85 | --temperature $TEMP \ 86 | --top-p $TOPP \ 87 | --out-seq-length 1024 \ 88 | --micro-batch-size $MICRO_BSZ \ 89 | --samples-per-problem $NUM_SAMPLES \ 90 | --language-src-type $LANG_SRC_TYPE \ 91 | --language-tgt-type $LANG_TGT_TYPE \ 92 | --src-path $INPUT_SRC_PATH \ 93 | --tgt-path $INPUT_TGT_PATH \ 94 | --dataset $DATASET \ 95 | --output-prefix $OUTPUT_PATH/$JOB_ID \ 96 | --gen-node-world-size $WORLD_SIZE \ 97 | --seed $SEED \ 98 | $MODEL_ARGS" 99 | 100 | RUN_CMD="$OPTIONS_NCCL; $OPTIONS_PATH; $RUN_CMD" 101 | RUN_CMD="cd $CWD; $RUN_CMD" 102 | 103 | if (( WORLD_SIZE != 1 )); then 104 | RUN_CMD="pdsh -R ssh -w ^$HOSTLIST \"$RUN_CMD\"" 105 | fi 106 | 107 | echo "$RUN_CMD" 108 | echo "Writing log to $LOG_PATH" 109 | eval "$RUN_CMD" > "$LOG_PATH" 110 | bash $MAIN_DIR/scripts/gather_output.sh $OUTPUT_PATH $JOB_ID 1 111 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="codegeex", 5 | py_modules=["codegeex"], 6 | version="1.0", 7 | description="CodeGeeX: A Open Multilingual Code Generation Model.", 8 | author="Qinkai Zheng", 9 | packages=find_packages(), 10 | install_requires=[ 11 
| "fire>=0.4.0", 12 | "ipython>=8.4.0", 13 | "numpy>=1.22.0", 14 | "pandas>=1.3.5", 15 | "pyzmq>=23.2.1", 16 | "regex>=2022.3.15", 17 | "setuptools>=58.0.4", 18 | "transformers>=4.22.0", 19 | "tokenizers>=0.11.0", 20 | "torch>=1.10.0", 21 | "tqdm>=4.63.0", 22 | "cpm_kernels", 23 | "deepspeed>0.6.1", 24 | ], 25 | entry_points={} 26 | ) 27 | -------------------------------------------------------------------------------- /tests/test_inference.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import argparse 4 | import numpy as np 5 | 6 | import codegeex 7 | from codegeex.torch import CodeGeeXModel 8 | from codegeex.tokenizer import CodeGeeXTokenizer 9 | from codegeex.quantization import quantize 10 | 11 | 12 | def model_provider(args): 13 | """Build the model.""" 14 | 15 | model = CodeGeeXModel( 16 | args.hidden_size, 17 | args.num_layers, 18 | args.num_attention_heads, 19 | args.padded_vocab_size, 20 | args.max_position_embeddings 21 | ) 22 | 23 | return model 24 | 25 | 26 | def add_code_generation_args(parser): 27 | group = parser.add_argument_group(title="code generation") 28 | group.add_argument( 29 | "--num-layers", 30 | type=int, 31 | default=39, 32 | ) 33 | group.add_argument( 34 | "--hidden-size", 35 | type=int, 36 | default=5120, 37 | ) 38 | group.add_argument( 39 | "--num-attention-heads", 40 | type=int, 41 | default=40, 42 | ) 43 | group.add_argument( 44 | "--padded-vocab-size", 45 | type=int, 46 | default=52224, 47 | ) 48 | group.add_argument( 49 | "--max-position-embeddings", 50 | type=int, 51 | default=2048, 52 | ) 53 | group.add_argument( 54 | "--temperature", 55 | type=float, 56 | default=1.0, 57 | help="Sampling temperature.", 58 | ) 59 | group.add_argument( 60 | "--greedy", 61 | action="store_true", 62 | default=False, 63 | help="Use greedy sampling.", 64 | ) 65 | group.add_argument( 66 | "--top-p", 67 | type=float, 68 | default=0.0, 69 | help="Top p sampling.", 70 | ) 71 | group.add_argument( 72 | "--top-k", 73 | type=int, 74 | default=0, 75 | help="Top k sampling.", 76 | ) 77 | group.add_argument( 78 | "--out-seq-length", 79 | type=int, 80 | default=2048, 81 | help="Size of the output generated text.", 82 | ) 83 | group.add_argument( 84 | "--prompt-file", 85 | type=str, 86 | default="./test_prompt.txt", 87 | ) 88 | group.add_argument( 89 | "--tokenizer-path", 90 | type=str, 91 | default="./tokenizer", 92 | ) 93 | group.add_argument( 94 | "--load", 95 | type=str, 96 | ) 97 | group.add_argument( 98 | "--state-dict-path", 99 | type=str, 100 | ) 101 | group.add_argument( 102 | "--micro-batch-size", 103 | type=int, 104 | default=1, 105 | ) 106 | group.add_argument( 107 | "--quantize", 108 | action="store_true", 109 | ) 110 | group.add_argument( 111 | "--interative", 112 | action="store_true", 113 | ) 114 | 115 | return parser 116 | 117 | 118 | def main(): 119 | parser = argparse.ArgumentParser() 120 | parser = add_code_generation_args(parser) 121 | args, _ = parser.parse_known_args() 122 | 123 | print("Loading tokenizer ...") 124 | tokenizer = CodeGeeXTokenizer( 125 | tokenizer_path=args.tokenizer_path, 126 | mode="codegeex-13b") 127 | 128 | print("Loading state dict ...") 129 | state_dict = torch.load(args.load, map_location="cpu") 130 | state_dict = state_dict["module"] 131 | 132 | print("Building CodeGeeX model ...") 133 | model = model_provider(args) 134 | model.load_state_dict(state_dict) 135 | model.eval() 136 | model.half() 137 | if args.quantize: 138 | model = quantize(model, weight_bit_width=8, 
backend="torch") 139 | model.cuda() 140 | torch.cuda.synchronize() 141 | 142 | with open(args.prompt_file, "r") as f: 143 | prompt = f.readlines() 144 | prompt = "".join(prompt) 145 | 146 | out_seq_lengths = [args.out_seq_length] 147 | for out_seq_length in out_seq_lengths: 148 | print(f"Generating with out_seq_len {out_seq_length}...") 149 | while True: 150 | print("\nPlease Input Query (Ctrl-D to save multiple lines, 'stop' to exit) >>> ") 151 | prompts = [] 152 | while True: 153 | try: 154 | line = input() 155 | except EOFError: 156 | break 157 | prompts.append(line) 158 | prompt = "\n".join(prompts) 159 | prompt = prompt.strip() 160 | if not prompt: 161 | print('Query should not be empty!') 162 | continue 163 | if prompt == "stop": 164 | return 165 | try: 166 | t0 = time.perf_counter() 167 | generated_code = codegeex.generate( 168 | model, 169 | tokenizer, 170 | prompt, 171 | out_seq_length=out_seq_length, 172 | seq_length=args.max_position_embeddings, 173 | top_k=args.top_k, 174 | top_p=args.top_p, 175 | temperature=args.temperature, 176 | micro_batch_size=args.micro_batch_size, 177 | backend="megatron", 178 | verbose=True, 179 | ) 180 | t1 = time.perf_counter() 181 | print("Total generation time:", t1 - t0) 182 | except (ValueError, FileNotFoundError) as e: 183 | print(e) 184 | continue 185 | 186 | print("Generation finished.") 187 | 188 | 189 | if __name__ == "__main__": 190 | main() -------------------------------------------------------------------------------- /tests/test_prompt.txt: -------------------------------------------------------------------------------- 1 | code translation 2 | Java: 3 | public class Solution { 4 | public static boolean hasCloseElements(int[] nums, int threshold) { 5 | for (int i = 0; i < nums.length - 1; i++) { 6 | for (int j = i + 1; j < nums.length; j++) { 7 | if (Math.abs(nums[i] - nums[j]) < threshold) { 8 | return true; 9 | } 10 | } 11 | } 12 | return false; 13 | } 14 | } 15 | Python: 16 | -------------------------------------------------------------------------------- /vscode-extension/README_zh.md: -------------------------------------------------------------------------------- 1 | ![codegeex_logo](../resources/logo/codegeex_logo.png) 2 | 3 | 🌐 English 4 | 5 | ![CodeGeeX vscode extension version](https://img.shields.io/visual-studio-marketplace/v/aminer.codegeex?colorA=0B9FE0&colorB=brightgreen) 6 | ![CodeGeeX vscode extension last update](https://img.shields.io/visual-studio-marketplace/last-updated/aminer.codegeex?colorA=0B9FE0&colorB=brightgreen) 7 | ![CodeGeeX download](https://img.shields.io/visual-studio-marketplace/d/aminer.codegeex?colorA=0B9FE0&colorB=brightgreen) 8 | ![CodeGeeX vscode extension rating](https://img.shields.io/visual-studio-marketplace/stars/aminer.codegeex?colorA=0B9FE0&colorB=brightgreen) 9 | ![CodeGeeX github stars](https://img.shields.io/github/stars/THUDM/CodeGeeX?style=social) 10 | 11 | CodeGeeX是一个具有130亿参数的多编程语言代码生成预训练模型,使用超过二十种编程语言训练得到。基于CodeGeeX开发的插件可以实现通过描述生成代码、补全代码、代码翻译等一系列功能。CodeGeeX同样提供可以定制的**提示模式(Prompt Mode)**,构建专属的编程助手。Happy Coding! 12 | 13 | VS Code插件市场搜索"codegeex"即可免费使用(需要VS Code版本不低于1.68.0),更多关于CodeGeeX信息请见我们的[主页](https://models.aminer.cn/codegeex/) and [GitHub仓库](https://github.com/THUDM/CodeGeeX)。 14 | 15 | 如使用过程中遇到问题或有任何改进意见,欢迎发送邮件到[codegeex@aminer.cn](mailto:codegeex@aminer.cn)反馈! 
16 | 17 | - [基本用法](#基本用法) 18 | - [隐私声明](#隐私声明) 19 | - [使用指南](#使用指南) 20 | - [隐匿模式](#隐匿模式) 21 | - [交互模式](#交互模式) 22 | - [翻译模式](#翻译模式) 23 | - [提示模式(实验功能)](#提示模式实验功能) 24 | 25 | ## 基本用法 26 | 需要保证VS Code版本 >= 1.68.0。安装插件并全局激活CodeGeeX,有以下四种使用模式: 27 | 28 | - **隐匿模式**: 保持CodeGeeX处于激活状态,当您停止输入时,会从当前光标处开始生成(右下角CodeGeeX图标转圈表示正在生成)。 生成完毕之后会以灰色显示,按``Tab``即可插入生成结果。 29 | - **交互模式**: 按``Ctrl+Enter``激活交互模式,CodeGeeX将生成``X``个候选,并显示在右侧窗口中(``X`` 数量可以在设置的``Candidate Num``中修改)。 点击候选代码上方的``use code``即可插入。 30 | - **翻译模式**: 选择代码,然后按下``Ctrl+Alt+T``激活翻译模式,CodeGeeX会把该代码翻译成匹配您当前编辑器语言的代码。点击翻译结果上方的``use code``插入。您还可以在设置中选择您希望插入的时候如何处理被翻译的代码,您可以选择注释它们或者覆盖它们。 31 | - **提示模式(实验功能)**: 选择需要作为输入的代码,按``Alt/Option+t``触发提示模式,会显示预定义模板列表,选择其中一个模板,即可将代码插入到模板中进行生成。 这个模式高度自定义,可以在设置中 ``Prompt Templates``修改或添加模板内容,为模型加入额外的提示。 32 | 33 | ## 隐私声明 34 | 35 | 我们高度尊重用户代码的隐私,代码仅用来辅助编程。在您第一次使用时,我们会询问您是否同意将生成的代码用于研究用途,帮助CodeGeeX变得更好(该选项默认**关闭**)。 36 | ## 使用指南 37 | 38 | 以下是CodeGeeX几种模式的详细用法: 39 | 40 | ### 隐匿模式 41 | 42 | 在该模式中,CodeGeeX将在您停止输入时,从光标处开始生成(右下角CodeGeeX图标转圈表示正在生成)。生成完毕之后会以灰色显示,按``Tab``即可插入生成结果。 在生成多个候选的情况下,可以使用``Alt/Option+[`` 或 ``]``在几个候选间进行切换。如果你对现有建议不满意,可以使用``Alt/Option+N``去获得新的候选。可以在设置中改变``Candidate Num``(增加个数会导致生成速度相对变慢)。**注意**:生成总是从当前光标位置开始,如果您在生成结束前移动光标位置,可能会导致一些bugs。我们正在努力使生成速度变得更快以提升用户体验。 43 | 44 | ![image](https://lfs.aminer.cn/misc/wangshan/pretrain/codegeex/bubble_sort_go.gif) 45 | 46 | ### 交互模式 47 | 48 | 在该模式中,按``Ctrl+Enter``激活交互模式,CodeGeeX将生成``X``个候选,并显示在右侧窗口中(``X`` 数量可以在设置的``Candidate Num``中修改)。 点击候选代码上方的``use code``即可插入结果到为当前光标位置。 49 | 50 | ![image](https://lfs.aminer.cn/misc/wangshan/pretrain/codegeex/interactive_mode2.gif) 51 | 52 | ### 翻译模式 53 | 54 | 在当前的语言的文本编辑器中输入或者粘贴其他语言的代码,您用鼠标选择这些代码,然后按下``Ctrl+Alt+T``激活翻译模式,您根据提示选择该代码的语言,然后CodeGeeX会帮您把该代码翻译成匹配您当前编辑器语言的代码。点击翻译结果上方的``use code``即可插入。您还可以在设置中选择您希望插入的时候如何处理被翻译的代码,您可以选择注释它们或者覆盖它们。 55 | 56 | ![image](https://lfs.aminer.cn/misc/wangshan/pretrain/codegeex/translation_cpp_to_python.gif) 57 | 58 | ### 提示模式(实验功能) 59 | 60 | 在该模式中,您可以在输入中添加额外的提示来实现一些有趣的功能,包括并不限于代码解释、概括、以特定风格生成等。该模式的原理是利用了CodeGeeX强大的少样本生成能力。当您在输入中提供一些例子时,CodeGeeX会模仿这些例子并实现相应的功能。比如,您可以自定义模板中提供一段逐行解释代码的例子。选择您想要解释的代码,按``Alt/Option+t``触发提示模式,选择您写好的模板(如``explanation``),CodeGeeX就会解释您输入的代码。以下我们会详细介绍如何制作模板。 61 | 62 | ![image](https://lfs.aminer.cn/misc/wangshan/pretrain/codegeex/explanation_python.gif) 63 | 64 | 上述例子中的模板如下图所示,由``[示例代码]``, ````, ``[带解释的示例代码]`` and ``[输出函数头]`` 。````表示您选中的代码将会插入的位置。 ```` 这一句用来保证模型解释的是同一个函数。当使用提示模式时,CodeGeeX会将您选择的代码(插入到部分)和模板代码相结合,一起作为模型的输入。 65 | 66 | ```python 67 | # language: Python 68 | 69 | def sum_squares(lst): 70 | sum = 0 71 | for i in range(len(lst)): 72 | if i % 3 == 0: 73 | lst[i] = lst[i]**2 74 | elif i % 4 == 0: 75 | lst[i] = lst[i]**3 76 | sum += lst[i] 77 | return sum 78 | 79 | 80 | 81 | # Explain the code line by line 82 | def sum_squares(lst): 83 | # initialize sum 84 | sum = 0 85 | # loop through the list 86 | for i in range(len(lst)): 87 | # if the index is a multiple of 3 88 | if i % 3 == 0: 89 | # square the entry 90 | lst[i] = lst[i]**2 91 | # if the index is a multiple of 4 92 | elif i % 4 == 0: 93 | # cube the entry 94 | lst[i] = lst[i]**3 95 | # add the entry to the sum 96 | sum += lst[i] 97 | # return the sum 98 | return sum 99 | 100 | # Explain the code line by line 101 | 102 | ``` 103 | 104 | 以下是另一个Python文档字符串生成的例子,CodeGeeX在您写新函数时会模仿该注释的格式: 105 | ```python 106 | def add_binary(a, b): 107 | ''' 108 | Returns the sum of two decimal numbers in binary digits. 
109 | 110 | Parameters: 111 | a (int): A decimal integer 112 | b (int): Another decimal integer 113 | 114 | Returns: 115 | binary_sum (str): Binary string of the sum of a and b 116 | ''' 117 | binary_sum = bin(a+b)[2:] 118 | return binary_sum 119 | 120 | 121 | ``` 122 | 123 | 模板文件是高度自定义化的,您可以将自定义模板添加到插件设置中的``Prompt Templates``中。 ``key``表示模板的名字, ``value``是模板文件的路径(可以是您电脑上的任一路径,``.txt``, ``.py``, ``.h``, 等格式文件均可)。通过该功能,您可以让CodeGeeX生成具有特定风格或功能的代码,快尝试定义自己的专属模板吧! --------------------------------------------------------------------------------