├── .gitmodules
├── LICENSE
├── MODEL_LICENSE
├── README.md
├── README_zh.md
├── api
├── README_zh.md
├── codegeex-api-example-java
│ ├── pom.xml
│ └── src
│ │ └── main
│ │ └── java
│ │ └── cn
│ │ └── aminer
│ │ └── codegeex
│ │ └── example
│ │ ├── CodeGenerationExample.java
│ │ └── pojo
│ │ └── Payload.java
└── codegeex-api-example-python
│ └── generation_example.py
├── codegeex
├── __init__.py
├── benchmark
│ ├── README.md
│ ├── README_zh.md
│ ├── __init__.py
│ ├── evaluate_humaneval_x.py
│ ├── execution.py
│ ├── gather_output.py
│ ├── humaneval-x
│ │ ├── __init__.py
│ │ ├── cpp
│ │ │ ├── data
│ │ │ │ └── humaneval_cpp.jsonl.gz
│ │ │ └── evaluation
│ │ │ │ └── test.cpp
│ │ ├── evaluate_humaneval_x.py
│ │ ├── generate_humaneval_x.py
│ │ ├── go
│ │ │ ├── data
│ │ │ │ └── humaneval_go.jsonl.gz
│ │ │ └── evaluation
│ │ │ │ ├── go.mod
│ │ │ │ ├── go.sum
│ │ │ │ └── vendor.tar.gz
│ │ ├── java
│ │ │ └── data
│ │ │ │ └── humaneval_java.jsonl.gz
│ │ ├── js
│ │ │ └── data
│ │ │ │ └── humaneval_js.jsonl.gz
│ │ ├── python
│ │ │ └── data
│ │ │ │ └── humaneval_python.jsonl.gz
│ │ ├── rust
│ │ │ └── data
│ │ │ │ └── humaneval_rust.jsonl.gz
│ │ └── translate_humaneval_x.py
│ ├── inspect_result.py
│ ├── metric.py
│ ├── rust
│ │ ├── Cargo.lock
│ │ └── Cargo.toml
│ └── utils.py
├── data
│ ├── __init__.py
│ ├── data_utils.py
│ ├── process_pretrain_dataset.py
│ ├── processor.py
│ └── types.py
├── docker
│ └── Dockerfile
├── kernels
│ ├── __init__.py
│ └── quantization.fatbin
├── megatron
│ ├── __init__.py
│ ├── arguments.py
│ ├── checkpointing.py
│ ├── code_generation_utils.py
│ ├── convert_ckpt_parallel.py
│ ├── data
│ │ ├── __init__.py
│ │ ├── blendable_dataset.py
│ │ ├── data_samplers.py
│ │ ├── dataset_utils.py
│ │ ├── helpers.cpp
│ │ ├── indexed_dataset.py
│ │ └── prompt_dataset.py
│ ├── enums.py
│ ├── global_vars.py
│ ├── inference.py
│ ├── initialize.py
│ ├── learning_rates.py
│ ├── memory.py
│ ├── merge_ckpt_parallel.py
│ ├── microbatches.py
│ ├── mindspore_to_megatron.py
│ ├── model
│ │ ├── __init__.py
│ │ ├── codegeex_model.py
│ │ ├── distributed.py
│ │ ├── language_model.py
│ │ ├── module.py
│ │ ├── transformer.py
│ │ └── utils.py
│ ├── mpu
│ │ ├── __init__.py
│ │ ├── cross_entropy.py
│ │ ├── data.py
│ │ ├── initialize.py
│ │ ├── layers.py
│ │ ├── mappings.py
│ │ ├── random.py
│ │ └── utils.py
│ ├── optimizer
│ │ ├── __init__.py
│ │ ├── clip_grads.py
│ │ ├── grad_scaler.py
│ │ └── optimizer.py
│ ├── p2p_communication.py
│ ├── schedules.py
│ ├── tokenizer
│ │ ├── __init__.py
│ │ ├── gpt2_tokenization.py
│ │ └── tokenizer.py
│ ├── tools
│ │ ├── collect_env.py
│ │ ├── finetune_codegeex.py
│ │ └── pretrain_codegeex.py
│ ├── training.py
│ └── utils.py
├── mindspore
│ ├── configs
│ │ ├── 13B.sh
│ │ ├── 13B_128p_save_1p.sh
│ │ ├── 13B_128p_save_8p_ckpt.sh
│ │ ├── 13B_1p_to_torch.sh
│ │ ├── 13B_finetune.sh
│ │ ├── 13B_generate.sh
│ │ ├── 13B_generate_1p.sh
│ │ ├── 13B_generate_1p_values.sh
│ │ ├── 13B_generate_finetune.sh
│ │ ├── 13B_generate_humaneval.sh
│ │ └── 13B_generate_values.sh
│ ├── convertion_1p.py
│ ├── finetune.py
│ ├── generation.py
│ ├── generation_1p.py
│ ├── generation_batch.py
│ ├── generation_finetune.py
│ ├── generation_humaneval.py
│ ├── generation_values.py
│ ├── generation_values_1p.py
│ ├── save_1p_ckpt_from_8p_ckpt.py
│ ├── save_8p_ckpt.py
│ ├── scripts
│ │ ├── custom_tune_bank_new
│ │ │ └── Ascend910ProA
│ │ │ │ ├── cube
│ │ │ │ ├── repository_ascend910ProA_matmul.bin
│ │ │ │ └── repository_ascend910ProA_matmul.json
│ │ │ │ └── vector
│ │ │ │ └── Ascend910ProA_AiCore_32_v001_20220509_200939_588817.json
│ │ ├── layer_norm.py
│ │ ├── layer_norm_x_backprop_v2.py
│ │ ├── ma-pre-start.sh
│ │ ├── run_modelarts.py
│ │ ├── run_modelarts_gen_finetune.py
│ │ └── run_modelarts_gen_humaneval_x.py
│ ├── src
│ │ ├── __init__.py
│ │ ├── adam.py
│ │ ├── callbacks.py
│ │ ├── code_tokenizer.py
│ │ ├── dataset.py
│ │ ├── dataset_finetune.py
│ │ ├── generate.py
│ │ ├── generate_finetune.py
│ │ ├── generate_greedy.py
│ │ ├── generate_humaneval.py
│ │ ├── metrics.py
│ │ ├── pangu_alpha.py
│ │ ├── pangu_alpha_config.py
│ │ ├── pangu_alpha_fp16_predict.py
│ │ ├── pangu_alpha_wrapcell.py
│ │ ├── pangu_alpha_wrapcell_finetune.py
│ │ ├── preprocess.py
│ │ ├── sat_dataset.py
│ │ ├── tokenization_jieba.py
│ │ └── utils.py
│ └── train.py
├── oneflow
│ ├── __init__.py
│ ├── codegeex_model.py
│ └── inference.py
├── paddle
│ ├── __init__.py
│ ├── codegeex_model.py
│ ├── inference.py
│ └── pt_to_pdparams.py
├── quantization
│ ├── __init__.py
│ ├── quantize.py
│ └── quantize_oneflow.py
├── tokenizer
│ ├── __init__.py
│ ├── added_tokens.json
│ ├── merges.txt
│ ├── special_tokens_map.json
│ ├── tokenizer.py
│ ├── tokenizer_config.json
│ └── vocab.json
└── torch
│ ├── __init__.py
│ ├── codegeex_model.py
│ ├── get_ckpt_qkv.py
│ └── inference.py
├── configs
├── codegeex_13b.sh
├── codegeex_13b_paddle.sh
└── codegeex_13b_parallel.sh
├── deployment
├── example_inputs.jsonl
└── server_gradio.py
├── generations
├── humaneval_python_generations.jsonl.gz
└── humaneval_rust_generations.jsonl.gz
├── requirements.txt
├── resources
├── api
│ ├── api_step_1.png
│ ├── api_step_2.png
│ ├── api_step_3.png
│ ├── api_step_4.png
│ └── api_step_5.png
├── en
│ ├── codegeex_training.png
│ ├── hx_boxplot.png
│ ├── hx_examples.png
│ ├── hx_generattion_radar_horizon.png
│ ├── hx_pass_rate_vs_language.png
│ ├── hx_tasks.png
│ └── hx_translation.png
├── logo
│ └── codegeex_logo.png
└── zh
│ ├── hx_boxplot_zh.png
│ ├── hx_generattion_radar_horizon_zh.png
│ ├── hx_pass_rate_vs_language_zh.png
│ ├── hx_tasks_zh.png
│ ├── hx_translation_zh.png
│ ├── join_wechat.png
│ └── wechat.md
├── scripts
├── convert_ckpt_parallel.sh
├── convert_mindspore_to_megatron.sh
├── evaluate_humaneval_x.py
├── evaluate_humaneval_x.sh
├── finetune_codegeex.sh
├── gather_output.sh
├── generate_humaneval_x.sh
├── pretrain_codegeex.sh
├── process_pretrain_dataset.sh
├── test_inference.sh
├── test_inference_oneflow.sh
├── test_inference_oneflow_quantized.sh
├── test_inference_paddle.sh
├── test_inference_parallel.sh
├── test_inference_quantized.sh
└── translate_humaneval_x.sh
├── setup.py
├── tests
├── test_inference.py
├── test_inference_megatron.py
├── test_inference_oneflow.py
├── test_inference_paddle.py
└── test_prompt.txt
└── vscode-extension
├── README.md
└── README_zh.md
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "vscode-extension/codegeex-vscode-extension"]
2 | path = vscode-extension/codegeex-vscode-extension
3 | url = git@github.com:CodeGeeX/codegeex-vscode-extension.git
4 |
--------------------------------------------------------------------------------
/MODEL_LICENSE:
--------------------------------------------------------------------------------
1 | The CodeGeeX License
2 |
3 | 1. Definitions
4 |
5 | “Licensor” means the CodeGeeX Model Team that distributes its Software.
6 |
7 | “Software” means the CodeGeeX model parameters made available under this license.
8 |
9 | 2. License Grant
10 |
11 | Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes.
12 |
13 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
14 |
15 | 3. Restriction
16 |
17 | You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes.
18 |
19 | You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.
20 |
21 | 4. Disclaimer
22 |
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 |
25 | 5. Limitation of Liability
26 |
27 | EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
28 |
29 | 6. Dispute Resolution
30 |
31 | This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.
32 |
33 | Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at report@aminer.cn.
--------------------------------------------------------------------------------
/api/README_zh.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | # Creating a CodeGeeX API
4 |
5 | Apply for the CodeGeeX API on the [Tianqi Open API Platform](https://tianqi.aminer.cn/open/):
6 |
7 |
8 |
9 | Click the Tianqi platform entry on the home page:
10 |
11 | Click "API Applications":
12 |
13 | Enter any name to create an API application. After creation, you will receive an API Key/Secret for calling the API:
14 |
15 |
16 | In the API information page, you can find the request URLs and usage documentation for code generation / code translation:
17 |
18 |
19 | Use the API as described in the documentation. For the Python version, see the directory ``api/codegeex-api-example-python``; for the Java version, see the project ``api/codegeex-api-example-java``.
20 |
--------------------------------------------------------------------------------
/api/codegeex-api-example-java/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | cn.aminer
8 | codegeex-api-example-java
9 | 1.0-SNAPSHOT
10 |
11 |
15 |
16 | UTF-8
17 | UTF-8
18 |
19 |
20 |
21 |
22 |
23 | org.apache.maven.plugins
24 | maven-compiler-plugin
25 | 3.8.1
26 |
27 |
29 | 1.8
30 | 1.8
31 | UTF-8
32 |
33 |
34 |
35 | org.apache.maven.plugins
36 | maven-assembly-plugin
37 | 3.3.0
38 |
39 |
40 | jar-with-dependencies
41 |
42 |
43 |
44 |
45 |
46 | package
47 |
48 | single
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 | com.fasterxml.jackson.module
59 | jackson-module-parameter-names
60 | 2.6.6
61 |
62 |
63 | com.fasterxml.jackson.datatype
64 | jackson-datatype-jdk8
65 | 2.6.6
66 |
67 |
68 | com.fasterxml.jackson.datatype
69 | jackson-datatype-jsr310
70 | 2.6.6
71 |
72 |
73 | com.squareup.okhttp3
74 | okhttp
75 |
76 |
77 | org.slf4j
78 | slf4j-log4j12
79 |
80 |
81 | log4j
82 | log4j
83 |
84 |
85 | org.projectlombok
86 | lombok
87 | provided
88 |
89 |
90 |
91 |
92 |
93 |
94 | com.fasterxml.jackson.module
95 | jackson-module-parameter-names
96 |
97 |
98 | com.fasterxml.jackson.datatype
99 | jackson-datatype-jdk8
100 |
101 |
102 | com.fasterxml.jackson.datatype
103 | jackson-datatype-jsr310
104 |
105 |
106 | com.fasterxml.jackson.core
107 | jackson-databind
108 |
109 |
110 | com.squareup.okhttp3
111 | okhttp
112 | 4.10.0
113 |
114 |
115 | log4j
116 | log4j
117 | 1.2.17
118 |
119 |
120 | org.slf4j
121 | slf4j-log4j12
122 | 1.7.5
123 |
124 |
125 | org.projectlombok
126 | lombok
127 | 1.18.20
128 | provided
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 | central
137 | ALiYun
138 | http://maven.aliyun.com/nexus/content/groups/public
139 |
140 |
141 |
142 |
--------------------------------------------------------------------------------
/api/codegeex-api-example-java/src/main/java/cn/aminer/codegeex/example/CodeGenerationExample.java:
--------------------------------------------------------------------------------
1 | package cn.aminer.codegeex.example;
2 |
3 | import cn.aminer.codegeex.example.pojo.Payload;
4 | import com.fasterxml.jackson.databind.ObjectMapper;
5 | import okhttp3.*;
6 |
7 | import java.io.IOException;
8 |
9 | /**
10 | * An example of calling the CodeGeeX API to generate code.
11 | *
12 | * @author Darran Zhang @ codelast.com
13 | * @version 2023-01-20
14 | */
15 | public class CodeGenerationExample {
16 | public static final String API_KEY = "your_api_key"; // API Key obtained from the Tianqi open platform
17 | public static final String API_SECRET = "your_api_secret"; // API Secret obtained from the Tianqi open platform
18 | public static final int NUMBER = 3; // number of candidates to generate
19 | public static final String LANGUAGE = "Java"; // programming language
20 | public static final String REQUEST_URL = "https://tianqi.aminer.cn/api/v2/multilingual_code_generate"; // request URL
21 |
22 | public static void main(String[] args) throws Exception {
23 | CodeGenerationExample example = new CodeGenerationExample();
24 | String prompt = "// use OkHttpClient library to write a function to perform http post request\n\n" +
25 | "public class HttpPost {\n" +
26 | " public static void main(String[] args) {\n";
27 | example.generateCode(prompt);
28 | }
29 |
30 | /**
31 | * Generate code.
32 | *
33 | * @param prompt the code to be completed
34 | */
35 | public void generateCode(String prompt) throws Exception {
36 | ObjectMapper objectMapper = new ObjectMapper();
37 | Payload payload = new Payload().setApiKey(API_KEY).setApiSecret(API_SECRET).setPrompt(prompt).setNumber(NUMBER)
38 | .setLanguage(LANGUAGE);
39 | String response = performHttpPost(REQUEST_URL, objectMapper.writeValueAsString(payload));
40 | System.out.println(response);
41 | }
42 |
43 | /**
44 | * Send an HTTP POST request.
45 | *
46 | * @param url the request URL
47 | * @param payload the JSON data of the request
48 | * @return the response content, or null if an error occurs.
49 | */
50 | public String performHttpPost(String url, String payload) {
51 | HttpUrl.Builder builder = null;
52 | try {
53 | HttpUrl httpUrl = HttpUrl.parse(url);
54 | if (httpUrl != null) {
55 | builder = httpUrl.newBuilder();
56 | }
57 | } catch (IllegalArgumentException e) {
58 | System.out.println("failed to create HttpUrl.Builder from url " + url + ":" + e);
59 | }
60 | if (builder == null) {
61 | return null;
62 | }
63 | OkHttpClient client = new OkHttpClient();
64 | RequestBody requestBody = RequestBody.create(payload, MediaType.parse("application/json; charset=utf-8"));
65 | Request request = new Request.Builder()
66 | .url(builder.build())
67 | .post(requestBody)
68 | .build();
69 |
70 | try {
71 | Response response = client.newCall(request).execute();
72 | ResponseBody body = response.body();
73 | if (body == null) {
74 | System.out.println("null response body");
75 | return null;
76 | }
77 | return body.string();
78 | } catch (IOException e) {
79 | System.out.println("failed to send POST request: " + e);
80 | }
81 | return null;
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/api/codegeex-api-example-java/src/main/java/cn/aminer/codegeex/example/pojo/Payload.java:
--------------------------------------------------------------------------------
1 | package cn.aminer.codegeex.example.pojo;
2 |
3 | import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
4 | import com.fasterxml.jackson.annotation.JsonProperty;
5 | import lombok.Data;
6 | import lombok.experimental.Accessors;
7 |
8 | /**
9 | * The JSON payload object included in requests sent to the CodeGeeX API.
10 | *
11 | * @author Darran Zhang @ codelast.com
12 | * @version 2023-01-20
13 | */
14 | @JsonIgnoreProperties(ignoreUnknown = true)
15 | @Data
16 | @Accessors(chain = true)
17 | public class Payload {
18 | @JsonProperty("apikey")
19 | String apiKey; // API Key obtained from the Tianqi open platform
20 |
21 | @JsonProperty("apisecret")
22 | String apiSecret; // API Secret obtained from the Tianqi open platform
23 |
24 | String prompt; // the code to be completed
25 |
26 | @JsonProperty("n")
27 | int number; // number of candidates to generate
28 |
29 | @JsonProperty("lang")
30 | String language; // programming language
31 | }
32 |
--------------------------------------------------------------------------------
/api/codegeex-api-example-python/generation_example.py:
--------------------------------------------------------------------------------
1 | # encoding:utf-8
2 |
3 | import json
4 |
5 | import requests
6 |
7 | '''
8 | Code Generation
9 | '''
10 | API_KEY = "" # Get from Tianqi console. 从控制台获取
11 | API_SECRET = "" # Get from Tianqi console. 从控制台获取
12 | PROMPT = "from typing import List\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n " \
13 | "\"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given " \
14 | "threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements(" \
15 | "[1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n"
16 | NUMBER = 3
17 | LANG = "Python"
18 | request_url = "https://tianqi.aminer.cn/api/v2/"
19 | api = 'multilingual_code_generate'
20 |
21 | # Request is in json format. 指定请求参数格式为json
22 | headers = {'Content-Type': 'application/json'}
23 | request_url = request_url + api
24 | data = {
25 | "apikey": API_KEY,
26 | "apisecret": API_SECRET,
27 | "prompt": PROMPT,
28 | "n": NUMBER,
29 | "lang": LANG
30 | }
31 |
32 |
33 | def main():
34 | response = requests.post(request_url, headers=headers, data=json.dumps(data))
35 | if response:
36 | print(response.json())
37 |
38 |
39 | if __name__ == '__main__':
40 | main()
41 |
--------------------------------------------------------------------------------
/codegeex/__init__.py:
--------------------------------------------------------------------------------
1 | import copy
2 |
3 | from typing import *
4 | from codegeex.tokenizer import CodeGeeXTokenizer
5 | from codegeex.torch.inference import get_token_stream
6 |
7 |
8 | def get_model(
9 | backend: str = "megatron",
10 | quantized: bool = False,
11 | ):
12 | pass
13 |
14 |
15 | def generate(
16 | model,
17 | tokenizer: CodeGeeXTokenizer,
18 | prompt: str,
19 | out_seq_length: int,
20 | seq_length: int = 2048,
21 | top_k: int = 0,
22 | top_p: float = 1.0,
23 | temperature: float = 1.0,
24 | micro_batch_size: int = 1,
25 | backend: str = "megatron",
26 | greedy: bool = False,
27 | verbose: bool = False,
28 | ):
29 | tokens = tokenizer.encode_code(prompt)
30 | n_token_prompt = len(tokens)
31 |
32 | if verbose:
33 | print(f"Current prompt:\n{prompt}")
34 | print("N_token_prompt:", n_token_prompt)
35 |
36 | generated_codes = []
37 | if backend == "megatron":
38 | token_stream = get_token_stream(
39 | model,
40 | tokenizer,
41 | seq_length,
42 | out_seq_length,
43 | [copy.deepcopy(tokens) for _ in range(micro_batch_size)],
44 | micro_batch_size=micro_batch_size,
45 | topk=top_k,
46 | topp=top_p,
47 | temperature=temperature,
48 | greedy=greedy,
49 | )
50 | is_finished = [False for _ in range(micro_batch_size)]
51 | for i, generated in enumerate(token_stream):
52 | generated_tokens = generated[0]
53 | for j in range(micro_batch_size):
54 | if is_finished[j]:
55 | continue
56 |
57 | if generated_tokens[j].cpu().numpy()[-1] == tokenizer.eos_token_id or len(generated_tokens[j]) >= out_seq_length:
58 | is_finished[j] = True
59 | generated_tokens_ = generated_tokens[j].cpu().numpy().tolist()
60 | generated_code = tokenizer.decode_code(generated_tokens_[n_token_prompt:])
61 | generated_code = "".join(generated_code)
62 | generated_codes.append(generated_code)
63 | if verbose:
64 | print(f"\nGenerated code {i}:\n{generated_code}")
65 |
66 | if all(is_finished):
67 | break
68 |
69 | return generated_codes
--------------------------------------------------------------------------------
/codegeex/benchmark/README.md:
--------------------------------------------------------------------------------
1 | # HumanEval-X: A new benchmark for Multilingual Program Synthesis
2 |
3 | 🌐 中文
4 |
5 | HumanEval-X is a new benchmark for better evaluating the multilingual ability of code generation models. While previous works evaluate multilingual program synthesis with semantic-similarity metrics (e.g., [CodeBLEU](https://arxiv.org/abs/2009.10297)), which are often misleading, HumanEval-X evaluates the functional correctness of the generated programs. HumanEval-X consists of 820 high-quality human-crafted data samples (each with test cases) in Python, C++, Java, JavaScript, and Go, and can be used for various tasks.
6 |
7 |
8 |
9 | An illustration of tasks supported by HumanEval-X. Declarations, docstrings, and solutions are marked in red, green, and blue respectively. Code generation uses the declaration and docstring as input to generate the solution. Code translation uses the declarations in both languages and translates the solution in the source language into one in the target language.
10 |
11 | In HumanEval-X, every sample in each language contains declaration, docstring, and solution, which can be combined in various ways to support different downstream tasks including generation, translation, summarization, etc. We currently focus on two tasks: **code generation** and **code translation**. For code generation, the model uses declaration and docstring as input to generate the solution. For code translation, the model uses declarations in both languages and the solution in the source language as input, to generate solutions in the target language. We remove the description during code translation to prevent the model from directly solving the problem. For both tasks, we use the unbiased pass@k metric proposed in [Codex](https://arxiv.org/abs/2107.03374): $\text{pass}@k:= \mathbb{E}[1-\frac{\tbinom{n-c}{k}}{\tbinom{n}{k}}]$, with $n=200$ and $k\in(1,10,100)$.
12 |
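As a reference, a minimal sketch of this unbiased estimator (the same computation as the estimator in `codegeex/benchmark/metric.py`), where `n` is the number of generated samples for a problem and `c` the number of samples that pass all tests:

```python
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimate: 1 - C(n-c, k) / C(n, k)."""
    if n - c < k:
        return 1.0  # every size-k subset contains at least one correct sample
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# Example: 200 samples for one problem, 37 of which pass its tests.
print(pass_at_k(200, 37, 1), pass_at_k(200, 37, 10), pass_at_k(200, 37, 100))
```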
13 | ## How to use HumanEval-X
14 |
15 | Data are stored in ``codegeex/benchmark/humaneval-x/[LANG]/data/humaneval_[LANG].jsonl.gz``, using JSON list format. There are six keys:
16 |
17 | * ``task_id``: indicates the target language and ID of the problem. Language is one of ["Python", "Java", "JavaScript", "CPP", "Go"].
18 | * ``prompt``: the function declaration and docstring, used for code generation.
19 | * ``declaration``: only the function declaration, used for code translation.
20 | * ``canonical_solution``: human-crafted example solutions.
21 | * ``test``: hidden test samples, used for evaluation.
22 | * ``example_test``: public test samples (appeared in prompt), used for evaluation.
23 |
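For reference, a minimal sketch of reading one of these files with the `stream_jsonl` helper from `codegeex/data/data_utils.py` (the Python split is used here only as an example):

```python
from codegeex.data.data_utils import stream_jsonl

# Each line of the gzipped file is a JSON object with the six keys listed above.
path = "codegeex/benchmark/humaneval-x/python/data/humaneval_python.jsonl.gz"
for sample in stream_jsonl(path):
    print(sample["task_id"])      # target language and problem ID
    print(sorted(sample.keys()))  # the six keys described above
    break
```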
24 | ### Evaluation Environment
25 |
26 | Evaluating the generated code involves compiling and running programs in multiple programming languages. The versions of the language environments and packages we use are as follows:
27 |
28 | | Dependency | Version |
29 | | ---------- | -------- |
30 | | Python | 3.8.12 |
31 | | JDK | 18.0.2.1 |
32 | | Node.js | 16.14.0 |
33 | | js-md5 | 0.7.3 |
34 | | C++ | 11 |
35 | | g++ | 7.5.0 |
36 | | Boost | 1.71.0 |
37 | | OpenSSL | 3.0.0 |
38 | | go | 1.18.4 |
39 |
40 | To save you the trouble of setting up the environments for all these languages, we have built a Docker image with the required environments and CodeGeeX installed.
41 |
42 | You can directly pull the image from Docker Hub:
43 |
44 | ```bash
45 | docker pull rishubi/codegeex:latest
46 | ```
47 |
48 | Alternatively, if you are familiar with Dockerfiles, you can build the image from `codegeex/docker/Dockerfile`, or modify the Dockerfile to customize your own configuration:
49 |
50 | ```bash
51 | cd codegeex/docker
52 | docker build [OPTIONS] .
53 | ```
54 |
55 | After obtaining the image, you can create a container with the following command:
56 |
57 | ```bash
58 | docker run -it --gpus all --mount type=bind,source=,target= [OPTIONS]
59 | ```
60 |
61 | ### Evaluation
62 |
63 | We recommend evaluating inside [the provided image](#evaluation-environment). To evaluate the generated samples, save the generated code in the following JSON list format:
64 |
65 | ```
66 | {"task_id": "../..", "generation": "..."}
67 | {"task_id": "../..", "generation": "..."}
68 | ...
69 | ```
70 |
71 | and evaluate them using the following script from the root directory of the repository (please execute with caution: although the probability is very low, the generated code might have unexpected behaviours. See the warnings in [execution.py](execution.py) and uncomment the execution lines at your own risk):
72 |
73 | ```bash
74 | bash scripts/evaluate_humaneval_x.sh
75 | ```
76 |
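To produce a file in the format shown above, here is a minimal sketch using the `write_jsonl` helper from `codegeex/data/data_utils.py`; the task IDs, completions, and output path are placeholders:

```python
from codegeex.data.data_utils import write_jsonl

# One JSON object per line, with the generated code under the "generation" key.
samples = [
    {"task_id": "Python/0", "generation": "    return True\n"},
    {"task_id": "Python/1", "generation": "    return False\n"},
]
write_jsonl("humaneval_python_generations.jsonl", samples)
```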
--------------------------------------------------------------------------------
/codegeex/benchmark/README_zh.md:
--------------------------------------------------------------------------------
1 | # HumanEval-X: A Multilingual Benchmark for Code Generation
2 |
3 | 🌐 English
4 |
5 | To better evaluate the multilingual generation ability of code generation models, we built a new benchmark, HumanEval-X. Previously, multilingual code generation ability was measured with semantic-similarity metrics (e.g., [CodeBLEU](https://arxiv.org/abs/2009.10297)), which can be misleading; HumanEval-X instead measures the functional correctness of the generated code. HumanEval-X contains 820 high-quality hand-written samples covering Python, C++, Java, JavaScript, and Go, and can be used for various tasks.
6 |
7 |
8 |
9 |
10 |
11 | Each sample in each language of HumanEval-X contains a declaration, a docstring, and a solution, which can be combined in different ways to support various downstream tasks, including generation, translation, and summarization. We currently focus on two tasks: **code generation** and **code translation**. For code generation, the model takes the function declaration and docstring as input and outputs the function implementation; for code translation, the model takes the function declarations in both languages and the source-language implementation as input, and outputs an implementation in the target language. We do not feed the docstring to the model in the code translation task, to prevent the model from generating the answer directly from the description. For both tasks, we use the unbiased pass@k metric used by [Codex](https://arxiv.org/abs/2107.03374): $\text{pass}@k:= \mathbb{E}[1-\frac{\tbinom{n-c}{k}}{\tbinom{n}{k}}]$, $n=200$, $k\in(1,10,100)$.
12 |
13 | ## How to Use HumanEval-X
14 |
15 | Samples are stored in ``codegeex/benchmark/humaneval-x/[LANG]/data/humaneval_[LANG].jsonl.gz`` in JSON list format; each sample contains six fields:
16 |
17 | * ``task_id``: the target language and ID of the problem. The language is one of ["Python", "Java", "JavaScript", "CPP", "Go"].
18 | * ``prompt``: the function declaration and docstring, used for code generation.
19 | * ``declaration``: the function declaration only, used for code translation.
20 | * ``canonical_solution``: a hand-written example solution.
21 | * ``test``: hidden test cases, used for evaluation.
22 | * ``example_test``: public test cases that appear in the prompt, used for evaluation.
23 |
24 | ### Evaluation Environment
25 |
26 | Evaluating the generated code requires compiling and running it in multiple programming languages. The language dependencies and package versions we use are as follows:
27 |
28 | | Dependency | Version |
29 | | ------- | -------- |
30 | | Python | 3.8.12 |
31 | | JDK | 18.0.2.1 |
32 | | Node.js | 16.14.0 |
33 | | js-md5 | 0.7.3 |
34 | | C++ | 11 |
35 | | g++ | 7.5.0 |
36 | | Boost | 1.71.0 |
37 | | OpenSSL | 3.0.0 |
38 | | go | 1.18.4 |
39 |
40 | To save users the trouble of configuring these language environments, we have built a Docker image with the required environments preconfigured.
41 |
42 | You can pull the image directly from Docker Hub:
43 |
44 | ```bash
45 | docker pull rishubi/codegeex:latest
46 | ```
47 |
48 | If you are familiar with Dockerfiles, you can also build the image from `codegeex/docker/Dockerfile`, or modify it to customize your own configuration:
49 |
50 | ```bash
51 | cd codegeex/docker
52 | docker build [OPTIONS] .
53 | ```
54 |
55 | After obtaining the image, create a container with the following command:
56 |
57 | ```bash
58 | docker run -it --gpus all --mount type=bind,source=,target= [OPTIONS]
59 | ```
60 |
61 | ### Evaluation
62 |
63 | We recommend evaluating in the provided [evaluation environment](#evaluation-environment). Before evaluation, store the generated code in the following JSON list format:
64 |
65 | ```
66 | {"task_id": "../..", "generation": "..."}
67 | {"task_id": "../..", "generation": "..."}
68 | ...
69 | ```
70 |
71 | and run the following command from the root directory of this repository (execute with caution: although the probability is very low, the generated code might have unexpected behaviours. See the warnings in [execution.py](execution.py) and uncomment the execution lines at your own risk):
72 |
73 | ```bash
74 | bash scripts/evaluate_humaneval_x.sh
75 | ```
76 |
--------------------------------------------------------------------------------
/codegeex/benchmark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/benchmark/__init__.py
--------------------------------------------------------------------------------
/codegeex/benchmark/gather_output.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import fire
4 | import glob
5 |
6 |
7 | def gather_output(
8 | output_dir: str = "./output",
9 | output_prefix: str = None,
10 | if_remove_rank_files: int = 0,
11 | ):
12 | if output_prefix is None:
13 | output_list = glob.glob(output_dir + "/*")
14 | else:
15 | output_list = glob.glob(os.path.join(output_dir, output_prefix + "*"))
16 |
17 | for output_file in output_list:
18 | if "rank0" in output_file:
19 | output_prefix_ = output_file.split("_rank0.jsonl")[0]
20 | rank_files = glob.glob(output_prefix_ + "_rank*")
21 | with open(output_prefix_ + ".jsonl", "w") as f_out:
22 | for rank_file in rank_files:
23 | with open(rank_file, "r") as f_in:
24 | for line in f_in:
25 | f_out.write(line)
26 | if if_remove_rank_files:
27 | os.remove(rank_file)
28 | print(f"Removing {rank_file}...")
29 |
30 | if output_prefix is None:
31 | output_list = glob.glob(output_dir + "/*")
32 | else:
33 | output_list = glob.glob(os.path.join(output_dir, output_prefix + "*"))
34 |
35 | for output_file in output_list:
36 | if "rank" in output_file or "_unfinished" in output_file or "all" in output_file or "_result" in output_file:
37 | continue
38 | if "_finished" not in output_file:
39 | continue
40 | output_prefix_ = output_file.split("_finished.jsonl")[0]
41 | files = [output_file, output_prefix_ + "_unfinished.jsonl"]
42 | with open(output_prefix_ + "_all.jsonl", "w") as f_out:
43 | for f in files:
44 | with open(f, "r") as f_in:
45 | for line in f_in:
46 | f_out.write(line)
47 |
48 | print("Gathering finished. Saved in {}".format(output_prefix_ + "_all.jsonl"))
49 |
50 |
51 | def main():
52 | fire.Fire(gather_output)
53 |
54 |
55 | if __name__ == "__main__":
56 | sys.exit(main())
57 |
--------------------------------------------------------------------------------
/codegeex/benchmark/humaneval-x/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/benchmark/humaneval-x/__init__.py
--------------------------------------------------------------------------------
/codegeex/benchmark/humaneval-x/cpp/data/humaneval_cpp.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/benchmark/humaneval-x/cpp/data/humaneval_cpp.jsonl.gz
--------------------------------------------------------------------------------
/codegeex/benchmark/humaneval-x/cpp/evaluation/test.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Input to this function is a string containing multiple groups of nested parentheses. Your goal is to
3 | separate those group into separate strings and return the vector of those.
4 | Separate groups are balanced (each open brace is properly closed) and not nested within each other
5 | Ignore any spaces in the input string.
6 | >>> separate_paren_groups("( ) (( )) (( )( ))")
7 | {"()", "(())", "(()())"}
8 | */
9 | #include
10 | #include
11 | #include
12 | using namespace std;
13 | vector separate_paren_groups(string paren_string){
14 |
15 | vector all_parens;
16 | string current_paren;
17 | int level=0;
18 | char chr;
19 | int i;
20 | for (i=0;i
43 | bool issame(vector a,vectorb){
44 | if (a.size()!=b.size()) return false;
45 | for (int i=0;i
--------------------------------------------------------------------------------
/codegeex/benchmark/metric.py:
--------------------------------------------------------------------------------
31 | ) -> np.ndarray:
32 | """
33 | Estimates pass@k of each problem and returns them in an array.
34 | """
35 |
36 | def estimator(n: int, c: int, k: int) -> float:
37 | """
38 | Calculates 1 - comb(n - c, k) / comb(n, k).
39 | """
40 | if n - c < k:
41 | return 1.0
42 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
43 |
44 | if isinstance(num_samples, int):
45 | num_samples_it = itertools.repeat(num_samples, len(num_correct))
46 | else:
47 | assert len(num_samples) == len(num_correct)
48 | num_samples_it = iter(num_samples)
49 |
50 | return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
51 |
--------------------------------------------------------------------------------
/codegeex/benchmark/rust/Cargo.lock:
--------------------------------------------------------------------------------
1 | # This file is automatically @generated by Cargo.
2 | # It is not intended for manual editing.
3 | version = 3
4 |
5 | [[package]]
6 | name = "aho-corasick"
7 | version = "0.7.20"
8 | source = "registry+https://github.com/rust-lang/crates.io-index"
9 | checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
10 | dependencies = [
11 | "memchr",
12 | ]
13 |
14 | [[package]]
15 | name = "fuchsia-cprng"
16 | version = "0.1.1"
17 | source = "registry+https://github.com/rust-lang/crates.io-index"
18 | checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
19 |
20 | [[package]]
21 | name = "libc"
22 | version = "0.2.139"
23 | source = "registry+https://github.com/rust-lang/crates.io-index"
24 | checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
25 |
26 | [[package]]
27 | name = "md5"
28 | version = "0.7.0"
29 | source = "registry+https://github.com/rust-lang/crates.io-index"
30 | checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
31 |
32 | [[package]]
33 | name = "memchr"
34 | version = "2.5.0"
35 | source = "registry+https://github.com/rust-lang/crates.io-index"
36 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
37 |
38 | [[package]]
39 | name = "rand"
40 | version = "0.4.6"
41 | source = "registry+https://github.com/rust-lang/crates.io-index"
42 | checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293"
43 | dependencies = [
44 | "fuchsia-cprng",
45 | "libc",
46 | "rand_core 0.3.1",
47 | "rdrand",
48 | "winapi",
49 | ]
50 |
51 | [[package]]
52 | name = "rand_core"
53 | version = "0.3.1"
54 | source = "registry+https://github.com/rust-lang/crates.io-index"
55 | checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b"
56 | dependencies = [
57 | "rand_core 0.4.2",
58 | ]
59 |
60 | [[package]]
61 | name = "rand_core"
62 | version = "0.4.2"
63 | source = "registry+https://github.com/rust-lang/crates.io-index"
64 | checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc"
65 |
66 | [[package]]
67 | name = "rdrand"
68 | version = "0.4.0"
69 | source = "registry+https://github.com/rust-lang/crates.io-index"
70 | checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2"
71 | dependencies = [
72 | "rand_core 0.3.1",
73 | ]
74 |
75 | [[package]]
76 | name = "regex"
77 | version = "1.7.1"
78 | source = "registry+https://github.com/rust-lang/crates.io-index"
79 | checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733"
80 | dependencies = [
81 | "aho-corasick",
82 | "memchr",
83 | "regex-syntax",
84 | ]
85 |
86 | [[package]]
87 | name = "regex-syntax"
88 | version = "0.6.28"
89 | source = "registry+https://github.com/rust-lang/crates.io-index"
90 | checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
91 |
92 | [[package]]
93 | name = "rust"
94 | version = "0.1.0"
95 | dependencies = [
96 | "md5",
97 | "rand",
98 | "regex",
99 | ]
100 |
101 | [[package]]
102 | name = "winapi"
103 | version = "0.3.9"
104 | source = "registry+https://github.com/rust-lang/crates.io-index"
105 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
106 | dependencies = [
107 | "winapi-i686-pc-windows-gnu",
108 | "winapi-x86_64-pc-windows-gnu",
109 | ]
110 |
111 | [[package]]
112 | name = "winapi-i686-pc-windows-gnu"
113 | version = "0.4.0"
114 | source = "registry+https://github.com/rust-lang/crates.io-index"
115 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
116 |
117 | [[package]]
118 | name = "winapi-x86_64-pc-windows-gnu"
119 | version = "0.4.0"
120 | source = "registry+https://github.com/rust-lang/crates.io-index"
121 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
122 |
--------------------------------------------------------------------------------
/codegeex/benchmark/rust/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "rust"
3 | version = "0.1.0"
4 | edition = "2021"
5 |
6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7 |
8 | [dependencies]
9 | rand = "0.4"
10 | regex = "1"
11 | md5 = "0.7.0"
12 |
13 |
--------------------------------------------------------------------------------
/codegeex/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/data/__init__.py
--------------------------------------------------------------------------------
/codegeex/data/data_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gzip
3 | import json
4 |
5 | from typing import *
6 |
7 | LANGUAGE_TAG = {
8 | "c" : "// language: C",
9 | "c++" : "// language: C++",
10 | "cpp" : "// language: C++",
11 | "c#" : "// language: C#",
12 | "csharp" : "// language: C#",
13 | "css" : "/* language: CSS */",
14 | "cuda" : "// language: Cuda",
15 | "dart" : "// language: Dart",
16 | "lua" : "// language: Lua",
17 | "objectivec" : "// language: Objective-C",
18 | "objective-c" : "// language: Objective-C",
19 | "objective-c++": "// language: Objective-C++",
20 | "python" : "# language: Python",
21 | "perl" : "# language: Perl",
22 | "prolog" : f"% language: Prolog",
23 | "swift" : "// language: swift",
24 | "lisp" : "; language: Lisp",
25 | "java" : "// language: Java",
26 | "scala" : "// language: Scala",
27 | "tex" : f"% language: TeX",
28 | "vue" : "",
29 | "markdown" : "",
30 | "html" : "",
31 | "php" : "// language: PHP",
32 | "js" : "// language: JavaScript",
33 | "javascript" : "// language: JavaScript",
34 | "typescript" : "// language: TypeScript",
35 | "go" : "// language: Go",
36 | "shell" : "# language: Shell",
37 | "rust" : "// language: Rust",
38 | "sql" : "-- language: SQL",
39 | "kotlin" : "// language: Kotlin",
40 | "vb" : "' language: Visual Basic",
41 | "ruby" : "# language: Ruby",
42 | "pascal" : "// language: Pascal",
43 | "r" : "# language: R",
44 | "fortran" : "!language: Fortran",
45 | "lean" : "-- language: Lean",
46 | "matlab" : f"% language: Matlab",
47 | "delphi" : "{language: Delphi}",
48 | "scheme" : "; language: Scheme",
49 | "basic" : "' language: Basic",
50 | "assembly" : "; language: Assembly",
51 | "groovy" : "// language: Groovy",
52 | "abap" : "* language: Abap",
53 | "gdscript" : "# language: GDScript",
54 | "haskell" : "-- language: Haskell",
55 | "julia" : "# language: Julia",
56 | "elixir" : "# language: Elixir",
57 | "excel" : "' language: Excel",
58 | "clojure" : "; language: Clojure",
59 | "actionscript" : "// language: ActionScript",
60 | "solidity" : "// language: Solidity",
61 | "powershell" : "# language: PowerShell",
62 | "erlang" : f"% language: Erlang",
63 | "cobol" : "// language: Cobol",
64 | }
65 |
66 |
67 | def stream_jsonl(filename: str) -> Iterable[Dict]:
68 | """
69 | Parses each jsonl line and yields it as a dictionary
70 | """
71 | if filename.endswith(".gz"):
72 | with open(filename, "rb") as gzfp:
73 | with gzip.open(gzfp, "rt") as fp:
74 | for line in fp:
75 | if any(not x.isspace() for x in line):
76 | yield json.loads(line)
77 | else:
78 | with open(filename, "r") as fp:
79 | for line in fp:
80 | if any(not x.isspace() for x in line):
81 | yield json.loads(line)
82 |
83 |
84 | def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
85 | """
86 | Writes an iterable of dictionaries to jsonl
87 | """
88 | if append:
89 | mode = "ab"
90 | else:
91 | mode = "wb"
92 | filename = os.path.expanduser(filename)
93 | if filename.endswith(".gz"):
94 | with open(filename, mode) as fp:
95 | with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp:
96 | for x in data:
97 | gzfp.write((json.dumps(x) + "\n").encode("utf-8"))
98 | else:
99 | with open(filename, mode) as fp:
100 | for x in data:
101 | fp.write((json.dumps(x) + "\n").encode("utf-8"))
102 |
103 |
104 | def sliding_window(
105 | prompt_tokens: list,
106 | code_tokens: list,
107 | seq_len: int,
108 | sliding_stride: int,
109 | minimum_code_len: int = 1,
110 | ) -> Iterable[Tuple[list, list]]:
111 | """
112 | Generate a series of (prompt, code) pairs by sliding the window over the code.
113 | """
114 | prompt_len = len(prompt_tokens)
115 | code_len = len(code_tokens)
116 | total_len = prompt_len + code_len
117 |
118 | start_idx = max(0, prompt_len - seq_len + minimum_code_len) # at least `minimum_code_len` code token should be in the window
119 | end_idx = max(0, total_len - seq_len)
120 | start_idx = min(start_idx, end_idx)
121 |
122 | for i in range(start_idx, end_idx + 1, sliding_stride):
123 | current_prompt = prompt_tokens[i:i + seq_len]
124 | current_code = code_tokens[max(i - prompt_len, 0):i - prompt_len + seq_len]
125 | yield current_prompt, current_code
126 |
127 | if (end_idx - start_idx) % sliding_stride != 0:
128 | current_prompt = prompt_tokens[end_idx:end_idx + seq_len]
129 | current_code = code_tokens[max(end_idx - prompt_len, 0):end_idx - prompt_len + seq_len]
130 | yield current_prompt, current_code
131 |
--------------------------------------------------------------------------------
/codegeex/data/process_pretrain_dataset.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import fire
4 | import torch
5 | import multiprocessing
6 |
7 | from typing import *
8 | from tqdm.auto import tqdm
9 | from time import perf_counter
10 | from black import format_str, FileMode
11 |
12 | from codegeex.data.types import PromptDataset, PromptSample
13 | from codegeex.data.processor import PromptDatasetProcessor
14 | from codegeex.data.data_utils import stream_jsonl, LANGUAGE_TAG
15 | from codegeex.megatron.data.indexed_dataset import make_mmap_builder
16 | from codegeex.tokenizer import CodeGeeXTokenizer
17 |
18 |
19 | def try_format_code(code: str):
20 | # Auto-correct to PEP8 format (Change tab to 4-whitespaces;
21 | # add whitespace around some special symbols;
22 | # reformat line length < 100, etc.)
23 | try:
24 | res = format_str(code, mode=FileMode(line_length=200))
25 | except Exception as e:
26 | res = code
27 | print(e)
28 | print("Wrong python format: {}".format(code))
29 | return res
30 |
31 |
32 | def load_pretrain_dataset(dataset_path: Union[str, List[str]]) -> Dict:
33 | if type(dataset_path) is str:
34 | dataset_path = [dataset_path]
35 |
36 | for p in dataset_path:
37 | if not os.path.isdir(p):
38 | if p.endswith(".gz") or p.endswith(".jsonl"):
39 | print(f"loading from {p}")
40 | yield from stream_jsonl(p)
41 | else:
42 | p_list = glob.glob(p + "/*")
43 | for p_ in p_list:
44 | if p_.endswith(".gz") or p_.endswith(".jsonl"):
45 | print(f"loading from {p_}")
46 | yield from stream_jsonl(p_)
47 |
48 |
49 | def process_sample(
50 | sample: Dict,
51 | language: str=None,
52 | mode: str="pretrain",
53 | ) -> Iterable[PromptSample]:
54 | if mode == "pretrain":
55 | prompt = ""
56 | else:
57 | prompt = sample["prompt"]
58 |
59 | try:
60 | if language is not None and language in LANGUAGE_TAG.keys():
61 | code = LANGUAGE_TAG[language] + "\n" + sample["code"]
62 | else:
63 | code = sample["code"]
64 | except Exception as e:
65 | print(e)
66 | print("The key 'code' is missing in data. Aborted")
67 | exit(0)
68 |
69 | yield PromptSample(prompt, code)
70 |
71 |
72 | def generate_prompt_samples(
73 | dataset: Iterable[Dict],
74 | language: str = None,
75 | mode: str = "pretrain",
76 | ) -> PromptDataset:
77 | for sample in dataset:
78 | yield from process_sample(sample, language, mode)
79 |
80 |
81 | def main(
82 | tokenizer_path: str,
83 | dataset_path: Union[str, List[str]],
84 | output_prefix: str,
85 | language: str = None,
86 | mode: str = "pretrain",
87 | discard_overlong: bool = False,
88 | sliding_stride: int = 200,
89 | num_workers: int = 32,
90 | seq_len: int = 2048,
91 | ):
92 | DATA_KEYS = ["input_ids", "attention_mask", "labels"]
93 |
94 | # create output dir
95 | os.makedirs(os.path.dirname(output_prefix), exist_ok=True)
96 |
97 | tokenizer = CodeGeeXTokenizer(tokenizer_path=tokenizer_path)
98 | pad_token_id = tokenizer.eos_token_id
99 |
100 | dataset = load_pretrain_dataset(dataset_path)
101 | prompt_dataset = generate_prompt_samples(dataset, language=language, mode=mode)
102 |
103 | if num_workers == 0:
104 | num_workers = multiprocessing.cpu_count()
105 | pool = multiprocessing.Pool(num_workers)
106 | output_bin_files = {}
107 | output_idx_files = {}
108 | builders = {}
109 |
110 | for key in DATA_KEYS:
111 | output_bin_files[key] = "{}_{}.bin".format(output_prefix, key)
112 | output_idx_files[key] = "{}_{}.idx".format(output_prefix, key)
113 | builders[key] = make_mmap_builder(
114 | output_bin_files[key],
115 | vocab_size=None, # magic number, should change it
116 | )
117 |
118 | # NOTE that we use seq_len + 1 instead of seq_len, since the input tokens will be shifted by one.
119 | processor = PromptDatasetProcessor(
120 | tokenize=tokenizer.encode_code,
121 | pad_token=pad_token_id,
122 | max_seq_len=seq_len + 1,
123 | discard_overlong=discard_overlong,
124 | sliding_stride=sliding_stride,
125 | eod_token=pad_token_id)
126 |
127 | processor.start_time = perf_counter()
128 | doc_iter = pool.imap_unordered(processor.process_sample_strict,
129 | prompt_dataset,
130 | chunksize=20)
131 |
132 | for doc_idx, docs in tqdm(enumerate(doc_iter, start=1)):
133 | processor.doc_processed += 1
134 | for doc in docs:
135 | processor.doc_generated += 1
136 | for key in DATA_KEYS:
137 | builders[key].add_item(torch.IntTensor(doc[key]))
138 |
139 | for key in DATA_KEYS:
140 | builders[key].finalize(output_idx_files[key])
141 |
142 |
143 | if __name__ == "__main__":
144 | fire.Fire(main)
145 |
--------------------------------------------------------------------------------
/codegeex/data/types.py:
--------------------------------------------------------------------------------
1 | from typing import *
2 | from dataclasses import dataclass
3 |
4 |
5 | @dataclass
6 | class PromptSample:
7 | prompt: str
8 | code: str
9 | extra: dict = None
10 |
11 |
12 | PromptDataset = Iterable[PromptSample]
13 |
14 | @dataclass
15 | class LabelSample:
16 | prompt: str
17 | label: int
18 | extra: dict = None
19 |
20 | LabelDataset = Iterable[LabelSample]
--------------------------------------------------------------------------------
/codegeex/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime
2 |
3 | RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak \
4 | && sed -i "s@http://.*archive.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list \
5 | && sed -i "s@http://.*security.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list \
6 | && pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple \
7 | && apt-get update && apt-get install -y curl npm git nano \
8 | && pip install fire zmq transformers tokenizers \
9 | && mkdir /workspace/download
10 |
11 | RUN curl -o /workspace/download/go.tar.gz -SL https://go.dev/dl/go1.18.4.linux-amd64.tar.gz \
12 | && tar -zxf /workspace/download/go.tar.gz -C /usr/local && rm /workspace/download/go.tar.gz
13 | ENV PATH=/bin:/usr/local/go/bin:$PATH
14 |
15 | RUN curl -o /workspace/download/node.tar.gz -SL https://nodejs.org/download/release/v16.14.0/node-v16.14.0-linux-x64.tar.gz \
16 | && mkdir -p /usr/local/lib/nodejs && tar -zxf /workspace/download/node.tar.gz -C /usr/local/lib/nodejs && mv /usr/local/lib/nodejs/node-v16.14.0-linux-x64 /usr/local/lib/nodejs/node \
17 | && rm /workspace/download/node.tar.gz && npm install -g js-md5@0.7.3
18 | ENV PATH=/usr/local/lib/nodejs/node/bin:$PATH
19 | ENV NODE_PATH=/usr/local/lib/node_modules
20 |
21 | RUN apt-get install -y build-essential && apt-get install -y g++ \
22 | && curl -o /workspace/download/boost_1_71_0.tar.gz -SL https://boostorg.jfrog.io/artifactory/main/release/1.71.0/source/boost_1_71_0.tar.gz \
23 | && tar -zxf /workspace/download/boost_1_71_0.tar.gz && rm /workspace/download/boost_1_71_0.tar.gz && cd boost_1_71_0 \
24 | && ./bootstrap.sh --prefix=/usr/ && ./b2 && ./b2 install \
25 | && cd .. && rm -r boost_1_71_0
26 | RUN curl -o /workspace/download/openssl.tar.gz -SL https://www.openssl.org/source/old/3.0/openssl-3.0.0.tar.gz \
27 | && tar -zxf /workspace/download/openssl.tar.gz && cd openssl-3.0.0 && ./Configure && make && make install \
28 | && rm /workspace/download/openssl.tar.gz && rm -r /workspace/openssl-3.0.0
29 | ENV PATH=/usr/bin/openssl:$PATH
30 |
31 | RUN curl -o /workspace/download/jdk.tar.gz -SL https://download.oracle.com/java/18/latest/jdk-18_linux-x64_bin.tar.gz \
32 | && mkdir /usr/java && tar -zxf /workspace/download/jdk.tar.gz -C /usr/java && rm /workspace/download/jdk.tar.gz \
33 | && java_path=`ls /usr/java/${path}` && echo "export JAVA_HOME=/usr/java/${java_path}" >> ~/.profile
34 |
35 | RUN git clone https://github.com/THUDM/CodeGeeX.git && pip install -e /workspace/CodeGeeX \
36 | && tar -zxf /workspace/CodeGeeX/codegeex/benchmark/humaneval-x/go/evaluation/vendor.tar.gz -C /workspace/CodeGeeX/codegeex/benchmark/humaneval-x/go/evaluation
37 |
38 | SHELL ["sh", "-lc"]
39 | RUN update-alternatives --install /usr/bin/java java $JAVA_HOME/bin/java 20000 \
40 | && update-alternatives --install /usr/bin/javac javac $JAVA_HOME/bin/javac 20000 \
41 | && rm -r /workspace/download \
42 | && env
43 |
--------------------------------------------------------------------------------
/codegeex/kernels/__init__.py:
--------------------------------------------------------------------------------
1 | import pkg_resources
2 | import torch
3 | import ctypes
4 |
5 | from typing import List
6 | from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up
7 |
8 | RESOURCE_PACKAGE_NAME = __name__
9 |
10 |
11 | class Kernel:
12 | def __init__(self, filename: str, function_names: List[str]):
13 | filename = filename + ".fatbin"
14 | if not pkg_resources.resource_exists(RESOURCE_PACKAGE_NAME, filename):
15 | raise RuntimeError("File `%s` not found in `%s`" % (filename, RESOURCE_PACKAGE_NAME))
16 | self.filename = filename
17 | self.code = pkg_resources.resource_string(RESOURCE_PACKAGE_NAME, filename)
18 | self._function_names = function_names
19 | self._cmodule = LazyKernelCModule(self.code)
20 |
21 | for name in self._function_names:
22 | setattr(self, name, KernelFunction(self._cmodule, name))
23 |
24 |
25 | kernels = Kernel(
26 | "quantization",
27 | [
28 | "int4WeightCompression",
29 | "int4WeightExtractionFloat",
30 | "int4WeightExtractionHalf",
31 | "int8WeightExtractionFloat",
32 | "int8WeightExtractionHalf",
33 | ],
34 | )
35 |
36 |
37 | def compress_int4_weight(weight: torch.Tensor): # (n, m)
38 | with torch.cuda.device(weight.device):
39 | n, m = weight.size(0), weight.size(1)
40 | assert m % 2 == 0
41 | m = m // 2
42 | out = torch.empty(n, m, dtype=torch.int8, device="cuda")
43 | stream = torch.cuda.current_stream()
44 |
45 | gridDim = (n, 1, 1)
46 | blockDim = (min(round_up(m, 32), 1024), 1, 1)
47 |
48 | kernels.int4WeightCompression(
49 | gridDim,
50 | blockDim,
51 | 0,
52 | stream,
53 | [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)],
54 | )
55 | return out
56 |
57 |
58 | def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
59 | if source_bit_width == 8:
60 | func = kernels.int8WeightExtractionHalf
61 | elif source_bit_width == 4:
62 | func = kernels.int4WeightExtractionHalf
63 | else:
64 | assert False, "Unsupported bit-width"
65 |
66 | with torch.cuda.device(weight.device):
67 | n, m = weight.size(0), weight.size(1)
68 | out = torch.empty(n, m * (8 // source_bit_width), dtype=torch.half, device="cuda")
69 | stream = torch.cuda.current_stream()
70 |
71 | gridDim = (n, 1, 1)
72 | blockDim = (min(round_up(m, 32), 1024), 1, 1)
73 |
74 | func(
75 | gridDim,
76 | blockDim,
77 | 0,
78 | stream,
79 | [
80 | ctypes.c_void_p(weight.data_ptr()),
81 | ctypes.c_void_p(scale_list.data_ptr()),
82 | ctypes.c_void_p(out.data_ptr()),
83 | ctypes.c_int32(n),
84 | ctypes.c_int32(m),
85 | ],
86 | )
87 | return out
88 |
89 |
90 | if __name__ == "__main__":
91 | weight = torch.randn(4, 32).to(torch.int8).cuda()
92 | scale = torch.ones(weight.size(0)).to(torch.half).cuda()
93 |
94 | print(weight)
95 | b = compress_int4_weight(weight)
96 | print(b)
97 |
98 | a = extract_weight_to_half(b, scale, source_bit_width=4)
99 | print(a)
100 |
--------------------------------------------------------------------------------
/codegeex/kernels/quantization.fatbin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/kernels/quantization.fatbin
--------------------------------------------------------------------------------
/codegeex/megatron/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | import torch
16 |
17 | from .global_vars import get_args
18 | from .global_vars import get_current_global_batch_size
19 | from .global_vars import get_num_microbatches
20 | from .global_vars import update_num_microbatches
21 | from .global_vars import get_tokenizer
22 | from .global_vars import get_tensorboard_writer
23 | from .global_vars import get_adlr_autoresume
24 | from .global_vars import get_timers
25 | from .initialize import initialize_megatron
26 |
27 |
28 | def print_rank_0(message):
29 | """If distributed is initialized, print only on rank 0."""
30 | if torch.distributed.is_initialized():
31 | if torch.distributed.get_rank() == 0:
32 | print(message, flush=True)
33 | else:
34 | print(message, flush=True)
35 |
36 |
37 | def is_last_rank():
38 | return torch.distributed.get_rank() == (torch.distributed.get_world_size() - 1)
39 |
40 |
41 | def print_rank_last(message):
42 | """If distributed is initialized, print only on last rank."""
43 | if torch.distributed.is_initialized():
44 | if is_last_rank():
45 | print(message, flush=True)
46 | else:
47 | print(message, flush=True)
48 |
--------------------------------------------------------------------------------
/codegeex/megatron/convert_ckpt_parallel.py:
--------------------------------------------------------------------------------
1 | """Get model parallel partitions."""
2 |
3 | import os
4 | import torch
5 | import argparse
6 |
7 |
8 | def get_change_ckpt_args(parser):
9 | """Provide extra arguments required for merging."""
10 | group = parser.add_argument_group(title='Mindspore to megatron')
11 | group.add_argument(
12 | '--load-ckpt-path',
13 | type=str,
14 | required=True,
15 | help='path to load ".pt" checkpoint.',
16 | )
17 | group.add_argument(
18 | '--save-ckpt-path',
19 | type=str,
20 | required=True,
21 | help='dir to save converted checkpoints.',
22 | )
23 | group.add_argument(
24 | '--target-tensor-model-parallel-size',
25 | type=int,
26 | default=2,
27 | help='target tensor model parallel size',
28 | )
29 |
30 | return parser
31 |
32 |
33 | def get_element_from_dict_by_path(d, path):
34 | """
35 | Get element from dictionary by path. If element is not present, recursively add empty dictionaries.
36 | Args:
37 | d (dict): the dictionary to get the element from
38 | path (list): the path to the element which is delimited by "."
39 | """
40 | path = path.split(".")
41 | for k in path:
42 | if k not in d:
43 | d[k] = {}
44 | d = d[k]
45 | return d
46 |
47 |
48 | def main():
49 | parser = argparse.ArgumentParser()
50 | parser = get_change_ckpt_args(parser)
51 | args, _ = parser.parse_known_args()
52 |
53 | print(f"Load ckpt from {args.load_ckpt_path}...")
54 | state_dict = torch.load(args.load_ckpt_path, map_location="cpu")
55 |
56 | print(f"Splitting ckpt into {args.target_tensor_model_parallel_size} parts...")
57 | output_state_dict = []
58 | for i in range(args.target_tensor_model_parallel_size):
59 | output_state_dict.append({})
60 |
61 | print("Converting Embedding layers...")
62 | word_embeddings = state_dict['module']['language_model']['embedding']['word_embeddings']['weight']
63 | position_embeddings = state_dict['module']['language_model']['embedding']['position_embeddings']['weight']
64 | out_word_embeddings = torch.chunk(word_embeddings, args.target_tensor_model_parallel_size, dim=0)
65 |
66 | for i in range(args.target_tensor_model_parallel_size):
67 | pos_emb_dict = get_element_from_dict_by_path(
68 | output_state_dict[i], "module.language_model.embedding.position_embeddings"
69 | )
70 | pos_emb_dict["weight"] = position_embeddings
71 |
72 | word_emb_dict = get_element_from_dict_by_path(
73 | output_state_dict[i], "module.language_model.embedding.word_embeddings"
74 | )
75 | word_emb_dict["weight"] = out_word_embeddings[i].clone()
76 |
77 | print("Converting QueryEmbedding layers...")
78 | query_embeddings = state_dict['module']['language_model']['topQueryEmbedding']['top_query_embeddings']['weight']
79 | out_query_embeddings = torch.chunk(query_embeddings, args.target_tensor_model_parallel_size, dim=0)
80 |
81 | for i in range(args.target_tensor_model_parallel_size):
82 | query_emb_dict = get_element_from_dict_by_path(
83 | output_state_dict[i], "module.language_model.topQueryEmbedding.top_query_embeddings"
84 | )
85 | query_emb_dict["weight"] = out_query_embeddings[i].clone()
86 |
87 | print("Converting Transformer layers...")
88 | for layer_name in state_dict['module']['language_model']['transformer'].keys():
89 | params = state_dict['module']['language_model']['transformer'][layer_name]
90 | if "layernorm" in layer_name:
91 | pass
92 | elif "attention" in layer_name and "weight" in layer_name:
93 | if "dense" in layer_name:
94 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=1)
95 | else:
96 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=0)
97 | elif "weight" in layer_name and "dense" in layer_name:
98 | if "h_to_4h" in layer_name:
99 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=0)
100 | else:
101 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=1)
102 | elif "bias" in layer_name:
103 | if "dense" not in layer_name or "mlp" in layer_name:
104 | if "4h_to_h" in layer_name:
105 | pass
106 | else:
107 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=0)
108 |
109 | for i in range(args.target_tensor_model_parallel_size):
110 | params_dict = get_element_from_dict_by_path(output_state_dict[i], "module.language_model.transformer")
111 | if type(params) is tuple:
112 | params_dict[layer_name] = params[i].clone()
113 | else:
114 | params_dict[layer_name] = params
115 |
116 | os.makedirs(args.save_ckpt_path, exist_ok=True)
117 | for rank in range(args.target_tensor_model_parallel_size):
118 | save_ckpt_path = os.path.join(args.save_ckpt_path, f"mp_rank_{rank:02d}_model_states.pt")
119 | torch.save(output_state_dict[rank], save_ckpt_path)
120 | print(f"Converted checkpoint saved in {save_ckpt_path}.")
121 |
122 |
123 | if __name__ == '__main__':
124 | main()
125 |
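
A small sketch of how get_element_from_dict_by_path builds the nested output state dict (the tensor value is a hypothetical stand-in, not taken from a real checkpoint):

    d = {}
    leaf = get_element_from_dict_by_path(d, "module.language_model.embedding.word_embeddings")
    leaf["weight"] = torch.zeros(4, 4)   # placeholder standing in for an embedding shard
    # d == {"module": {"language_model": {"embedding": {"word_embeddings": {"weight": <tensor>}}}}}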
--------------------------------------------------------------------------------
/codegeex/megatron/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/megatron/data/__init__.py
--------------------------------------------------------------------------------
/codegeex/megatron/data/blendable_dataset.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Blendable dataset."""
17 |
18 | import time
19 | import torch
20 | import numpy as np
21 |
22 | from codegeex.megatron import print_rank_0
23 |
24 |
25 | class BlendableDataset(torch.utils.data.Dataset):
26 | def __init__(self, datasets, weights):
27 |
28 | self.datasets = datasets
29 | num_datasets = len(datasets)
30 | assert num_datasets == len(weights)
31 |
32 | self.size = 0
33 | for dataset in self.datasets:
34 | self.size += len(dataset)
35 |
36 | # Normalize weights.
37 | weights = np.array(weights, dtype=np.float64)
38 | sum_weights = np.sum(weights)
39 | assert sum_weights > 0.0
40 | weights /= sum_weights
41 |
42 | # Build indices.
43 | start_time = time.time()
44 | assert num_datasets < 255
45 | self.dataset_index = np.zeros(self.size, dtype=np.uint8)
46 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)
47 |
48 | from megatron.data import helpers
49 |
50 | helpers.build_blending_indices(
51 | self.dataset_index,
52 | self.dataset_sample_index,
53 | weights,
54 | num_datasets,
55 | self.size,
56 | torch.distributed.get_rank() == 0,
57 | )
58 | print_rank_0(
59 | "> elapsed time for building blendable dataset indices: "
60 | "{:.2f} (sec)".format(time.time() - start_time)
61 | )
62 |
63 | def __len__(self):
64 | return self.size
65 |
66 | def __getitem__(self, idx):
67 | dataset_idx = self.dataset_index[idx]
68 | sample_idx = self.dataset_sample_index[idx]
69 | return self.datasets[dataset_idx][sample_idx]
70 |
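
A hedged construction sketch for the class above (requires the compiled megatron.data.helpers extension and an initialized torch.distributed process group; the list datasets are simple stand-ins for map-style datasets):

    ds_a = list(range(700))
    ds_b = list(range(300))
    blended = BlendableDataset([ds_a, ds_b], weights=[0.7, 0.3])
    sample = blended[5]   # routed to one of the two underlying datasets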
--------------------------------------------------------------------------------
/codegeex/megatron/enums.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import enum
17 |
18 |
19 | class LayerType(enum.Enum):
20 | encoder = 1
21 | decoder = 2
22 |
23 |
24 | class AttnType(enum.Enum):
25 | self_attn = 1
26 | cross_attn = 2
27 |
28 |
29 | class AttnMaskType(enum.Enum):
30 | padding = 1
31 | causal = 2
32 |
--------------------------------------------------------------------------------
/codegeex/megatron/memory.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | import torch
18 |
19 |
20 | # A dictionary of all the memory buffers allocated.
21 | _MEM_BUFFS = dict()
22 |
23 |
24 | def allocate_mem_buff(name, numel, dtype, track_usage):
25 | """Allocate a memory buffer."""
26 | assert name not in _MEM_BUFFS, "memory buffer {} already allocated.".format(name)
27 | _MEM_BUFFS[name] = MemoryBuffer(name, numel, dtype, track_usage)
28 | return _MEM_BUFFS[name]
29 |
30 |
31 | def get_mem_buff(name):
32 | """Get the memory buffer."""
33 | return _MEM_BUFFS[name]
34 |
35 |
36 | class MemoryBuffer:
37 | """Contiguous memory buffer.
38 | Allocate a contiguous memory of type `dtype` and size `numel`. It is
39 | used to reduce memory fragmentation.
40 |
41 | Usage: After the allocation, the `_start` index is set to the first
42 | index of the memory. A memory chunk starting from `_start` index
43 | can be `allocated` for an input tensor, with the elements of the
44 | tensor being copied. The buffer can be reused by resetting the
45 | `_start` index.
46 |
47 | """
48 |
49 | def __init__(self, name, numel, dtype, track_usage):
50 | if torch.distributed.get_rank() == 0:
51 | element_size = torch.tensor([], dtype=dtype).element_size()
52 | print(
53 | "> building the {} memory buffer with {} num elements "
54 | "and {} dtype ({:.1f} MB)...".format(
55 | name, numel, dtype, numel * element_size / 1024 / 1024
56 | ),
57 | flush=True,
58 | )
59 | self.name = name
60 | self.numel = numel
61 | self.dtype = dtype
62 | self.data = torch.empty(
63 | self.numel,
64 | dtype=self.dtype,
65 | device=torch.cuda.current_device(),
66 | requires_grad=False,
67 | )
68 |
69 | # Index tracking the start of the free memory.
70 | self._start = 0
71 |
72 | # Values used for tracking usage.
73 | self.track_usage = track_usage
74 | if self.track_usage:
75 | self.in_use_value = 0.0
76 | self.total_value = 0.0
77 |
78 | def reset(self):
79 | """Reset the buffer start index to the beginning of the buffer."""
80 | self._start = 0
81 |
82 | def is_in_use(self):
83 | """Whether the current buffer hold on to any memory."""
84 | return self._start > 0
85 |
86 | def numel_in_use(self):
87 | """Return number of elements in use."""
88 | return self._start
89 |
90 | def add(self, tensor):
91 | """Allocate a chunk of memory from the buffer to tensor and copy
92 | the values."""
93 | assert (
94 | tensor.dtype == self.dtype
95 | ), "Input tensor type {} different from buffer type {}".format(
96 | tensor.dtype, self.dtype
97 | )
98 | # Number of elements of the input tensor.
99 | tensor_numel = torch.numel(tensor)
100 | new_start = self._start + tensor_numel
101 | assert (
102 | new_start <= self.numel
103 | ), "Not enough memory left in the buffer ({} > {})".format(
104 | tensor_numel, self.numel - self._start
105 | )
106 | # New tensor is a view into the memory.
107 | new_tensor = self.data[self._start : new_start]
108 | self._start = new_start
109 | new_tensor = new_tensor.view(tensor.shape)
110 | new_tensor.copy_(tensor)
111 | # Return a pointer to the new tensor.
112 | return new_tensor
113 |
114 | def get_data(self):
115 | """Return the data currently in use."""
116 | if self.track_usage:
117 | self.in_use_value += float(self._start)
118 | self.total_value += float(self.numel)
119 | return self.data[: self._start]
120 |
121 | def print_average_usage(self):
122 | """Print memory usage average over time. We would like this value
123 | to be as high as possible."""
124 | assert self.track_usage, "You need to enable track usage."
125 | if torch.distributed.get_rank() == 0:
126 | print(
127 | " > usage of {} memory buffer: {:.2f} %".format(
128 | self.name, self.in_use_value * 100.0 / self.total_value
129 | ),
130 | flush=True,
131 | )
132 |
133 |
134 | class RingMemBuffer:
135 | """A ring of memory buffers."""
136 |
137 | def __init__(self, name, num_buffers, numel, dtype, track_usage):
138 | self.num_buffers = num_buffers
139 | self.buffers = [
140 | allocate_mem_buff(name + " {}".format(i), numel, dtype, track_usage)
141 | for i in range(num_buffers)
142 | ]
143 | self._index = -1
144 |
145 | def get_next_buffer(self):
146 | self._index += 1
147 | self._index = self._index % self.num_buffers
148 | buff = self.buffers[self._index]
149 | assert not buff.is_in_use(), "buffer is already in use."
150 | return buff
151 |
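
A hedged sketch of the buffer API above (assumes CUDA and an initialized process group, since allocation logs from rank 0 and places the storage on the current device; sizes are illustrative):

    buf = allocate_mem_buff("activations", numel=1 << 20, dtype=torch.float16, track_usage=True)
    t = torch.ones(256, 512, dtype=torch.float16, device="cuda")
    view = buf.add(t)   # a view into the buffer holding a copy of `t`
    buf.reset()         # hand the whole buffer back for reuse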
--------------------------------------------------------------------------------
/codegeex/megatron/model/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from torch.nn import LayerNorm
17 | from .distributed import DistributedDataParallel
18 | from .codegeex_model import CodeGeeXModel
19 | from .language_model import get_language_model
20 | from .module import Float16Module
--------------------------------------------------------------------------------
/codegeex/megatron/model/utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Utilities for models."""
17 |
18 | import math
19 | import torch
20 |
21 |
22 | def init_method_normal(sigma):
23 | """Init method based on N(0, sigma)."""
24 |
25 | def init_(tensor):
26 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
27 |
28 | return init_
29 |
30 |
31 | def scaled_init_method_normal(sigma, num_layers):
32 | """Init method based on N(0, sigma/sqrt(2*num_layers)."""
33 | std = sigma / math.sqrt(2.0 * num_layers)
34 |
35 | def init_(tensor):
36 | return torch.nn.init.normal_(tensor, mean=0.0, std=std)
37 |
38 | return init_
39 |
40 |
41 | def attention_mask_func(attention_scores, attention_mask):
42 | attention_scores.masked_fill_(attention_mask, -10000.0)
43 |
44 | return attention_scores
45 |
46 |
47 | def get_linear_layer(rows, columns, init_method):
48 | """Simple linear layer with weight initialization."""
49 | layer = torch.nn.Linear(rows, columns)
50 | init_method(layer.weight)
51 | with torch.no_grad():
52 | layer.bias.zero_()
53 | return layer
54 |
55 |
56 | def fast_gelu(x):
57 | """Mindspore's fast gelu implementation."""
58 | return x / (1 + torch.exp(-1.702 * torch.abs(x))) * torch.exp(0.851 * (x - torch.abs(x)))
59 |
60 |
61 | @torch.jit.script
62 | def gelu_impl(x):
63 | """OpenAI's gelu implementation."""
64 | return (
65 | 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x)))
66 | )
67 |
68 |
69 | def openai_gelu(x):
70 | return gelu_impl(x)
71 |
72 |
73 | # This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter
74 | @torch.jit.script
75 | def erf_gelu(x):
76 | return (
77 | x
78 | * 0.5
79 | * (
80 | torch.erf(x / 1.41421).to(dtype=x.dtype)
81 | + torch.ones_like(x).to(dtype=x.dtype)
82 | )
83 | )
84 |
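
A short sketch of the helpers above (values are illustrative):

    make_init = init_method_normal(sigma=0.02)
    make_output_init = scaled_init_method_normal(sigma=0.02, num_layers=40)  # std = 0.02 / sqrt(80)
    dense = get_linear_layer(1024, 4096, make_init)   # weight ~ N(0, 0.02), bias zeroed
    y = openai_gelu(torch.randn(8, 1024))             # tanh-approximation GELU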
--------------------------------------------------------------------------------
/codegeex/megatron/mpu/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Model parallel utility interface."""
17 |
18 | from .cross_entropy import vocab_parallel_cross_entropy
19 |
20 | from .data import broadcast_data
21 |
22 | from .initialize import is_unitialized
23 | from .initialize import destroy_model_parallel
24 | from .initialize import get_data_parallel_group
25 | from .initialize import get_data_parallel_rank
26 | from .initialize import get_data_parallel_world_size
27 | from .initialize import get_embedding_group
28 | from .initialize import get_model_parallel_group
29 | from .initialize import get_tensor_model_parallel_group
30 | from .initialize import get_pipeline_model_parallel_group
31 | from .initialize import get_tensor_model_parallel_rank, set_tensor_model_parallel_rank
32 | from .initialize import (
33 | get_pipeline_model_parallel_rank,
34 | set_pipeline_model_parallel_rank,
35 | )
36 | from .initialize import is_pipeline_first_stage, is_pipeline_last_stage
37 | from .initialize import get_tensor_model_parallel_src_rank
38 | from .initialize import get_pipeline_model_parallel_first_rank
39 | from .initialize import get_pipeline_model_parallel_last_rank
40 | from .initialize import get_pipeline_model_parallel_next_rank
41 | from .initialize import get_pipeline_model_parallel_prev_rank
42 | from .initialize import (
43 | get_tensor_model_parallel_world_size,
44 | set_tensor_model_parallel_world_size,
45 | )
46 | from .initialize import (
47 | get_pipeline_model_parallel_world_size,
48 | set_pipeline_model_parallel_world_size,
49 | )
50 | from .initialize import (
51 | get_virtual_pipeline_model_parallel_rank,
52 | set_virtual_pipeline_model_parallel_rank,
53 | )
54 | from .initialize import initialize_model_parallel
55 | from .initialize import model_parallel_is_initialized
56 | from .initialize import get_model_parallel_world_size, get_model_parallel_rank
57 |
58 | from .layers import ColumnParallelLinear
59 | from .layers import RowParallelLinear
60 | from .layers import VocabParallelEmbedding
61 | from .layers import (
62 | set_tensor_model_parallel_attributes,
63 | set_defaults_if_not_set_tensor_model_parallel_attributes,
64 | copy_tensor_model_parallel_attributes,
65 | )
66 |
67 | from .mappings import copy_to_tensor_model_parallel_region
68 | from .mappings import gather_from_tensor_model_parallel_region
69 | from .mappings import reduce_from_tensor_model_parallel_region
70 | from .mappings import scatter_to_tensor_model_parallel_region
71 |
72 | from .random import checkpoint
73 | from .random import get_cuda_rng_tracker
74 | from .random import init_checkpointed_activations_memory_buffer
75 | from .random import model_parallel_cuda_manual_seed
76 | from .random import reset_checkpointed_activations_memory_buffer
77 | from .random import gather_split_1d_tensor
78 | from .random import split_tensor_into_1d_equal_chunks
79 |
80 | from .utils import divide
81 | from .utils import split_tensor_along_last_dim
82 |
--------------------------------------------------------------------------------
/codegeex/megatron/mpu/cross_entropy.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | import torch
18 |
19 | from .initialize import get_tensor_model_parallel_group
20 | from .initialize import get_tensor_model_parallel_rank
21 | from .initialize import get_tensor_model_parallel_world_size
22 | from .utils import VocabUtility
23 |
24 |
25 | class _VocabParallelCrossEntropy(torch.autograd.Function):
26 | @staticmethod
27 | def forward(ctx, vocab_parallel_logits, target):
28 |
29 | # Maximum value along vocab dimension across all GPUs.
30 | logits_max = torch.max(vocab_parallel_logits, dim=-1)[0]
31 | torch.distributed.all_reduce(
32 | logits_max,
33 | op=torch.distributed.ReduceOp.MAX,
34 | group=get_tensor_model_parallel_group(),
35 | )
36 | # Subtract the maximum value.
37 | vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))
38 |
39 | # Get the partition's vocab indices
40 | get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
41 | partition_vocab_size = vocab_parallel_logits.size()[-1]
42 | rank = get_tensor_model_parallel_rank()
43 | world_size = get_tensor_model_parallel_world_size()
44 | vocab_start_index, vocab_end_index = get_vocab_range(
45 | partition_vocab_size, rank, world_size
46 | )
47 |
48 | # Create a mask of valid vocab ids (1 means it needs to be masked).
49 | target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
50 | masked_target = target.clone() - vocab_start_index
51 | masked_target[target_mask] = 0
52 |
53 | # Get predicted-logits = logits[target].
54 | # For Simplicity, we convert logits to a 2-D tensor with size
55 | # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
56 | logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size)
57 | masked_target_1d = masked_target.view(-1)
58 | arange_1d = torch.arange(
59 | start=0, end=logits_2d.size()[0], device=logits_2d.device
60 | )
61 | predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
62 | predicted_logits_1d = predicted_logits_1d.clone().contiguous()
63 | predicted_logits = predicted_logits_1d.view_as(target)
64 | predicted_logits[target_mask] = 0.0
65 | # All reduce is needed to get the chunks from other GPUs.
66 | torch.distributed.all_reduce(
67 | predicted_logits,
68 | op=torch.distributed.ReduceOp.SUM,
69 | group=get_tensor_model_parallel_group(),
70 | )
71 |
72 | # Sum of exponential of logits along vocab dimension across all GPUs.
73 | exp_logits = vocab_parallel_logits
74 | torch.exp(vocab_parallel_logits, out=exp_logits)
75 | sum_exp_logits = exp_logits.sum(dim=-1)
76 | torch.distributed.all_reduce(
77 | sum_exp_logits,
78 | op=torch.distributed.ReduceOp.SUM,
79 | group=get_tensor_model_parallel_group(),
80 | )
81 |
82 | # Loss = log(sum(exp(logits))) - predicted-logit.
83 | loss = torch.log(sum_exp_logits) - predicted_logits
84 |
85 | # Store softmax, target-mask and masked-target for backward pass.
86 | exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
87 | ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
88 |
89 | return loss
90 |
91 | @staticmethod
92 | def backward(ctx, grad_output):
93 |
94 | # Retrieve tensors from the forward path.
95 | softmax, target_mask, masked_target_1d = ctx.saved_tensors
96 |
97 | # All the inputs have softmax as their gradient.
98 | grad_input = softmax
99 | # For simplicity, work with the 2D gradient.
100 | partition_vocab_size = softmax.size()[-1]
101 | grad_2d = grad_input.view(-1, partition_vocab_size)
102 |
103 | # Add the gradient from matching classes.
104 | arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device)
105 | grad_2d[arange_1d, masked_target_1d] -= 1.0 - target_mask.view(-1).float()
106 |
107 | # Finally elementwise multiplication with the output gradients.
108 | grad_input.mul_(grad_output.unsqueeze(dim=-1))
109 |
110 | return grad_input, None
111 |
112 |
113 | def vocab_parallel_cross_entropy(vocab_parallel_logits, target):
114 | """Helper function for the cross entropy."""
115 | return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
116 |
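
A hedged calling sketch (tensor model parallelism must already be initialized, with one process per rank; the shapes and the 51200 vocabulary size are illustrative):

    vocab_shard = 51200 // get_tensor_model_parallel_world_size()
    logits = torch.randn(4, 2048, vocab_shard, device="cuda")   # this rank's slice of the logits
    labels = torch.randint(0, 51200, (4, 2048), device="cuda")  # global vocabulary ids
    loss = vocab_parallel_cross_entropy(logits, labels)         # per-token loss, shape (4, 2048)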
--------------------------------------------------------------------------------
/codegeex/megatron/mpu/data.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import torch
17 |
18 | from .initialize import get_tensor_model_parallel_group
19 | from .initialize import get_tensor_model_parallel_rank
20 | from .initialize import get_tensor_model_parallel_src_rank
21 |
22 |
23 | _MAX_DATA_DIM = 5
24 |
25 |
26 | def _check_data_types(keys, data, target_dtype):
27 | """Check that all the keys have the same target data type."""
28 | for key in keys:
29 | assert (
30 | data[key].dtype == target_dtype
31 | ), "{} has data type {} which " "is different than {}".format(
32 | key, data[key].dtype, target_dtype
33 | )
34 |
35 |
36 | def _build_key_size_numel_dictionaries(keys, data):
37 | """Build the size on rank 0 and broadcast."""
38 | max_dim = _MAX_DATA_DIM
39 | sizes = [0 for _ in range(max_dim) for _ in keys]
40 |
41 | # Pack the sizes on rank zero.
42 | if get_tensor_model_parallel_rank() == 0:
43 | offset = 0
44 | for key in keys:
45 | assert data[key].dim() < max_dim, "you should increase MAX_DATA_DIM"
46 | size = data[key].size()
47 | for i, s in enumerate(size):
48 | sizes[i + offset] = s
49 | offset += max_dim
50 |
51 | # Move to GPU and broadcast.
52 | sizes_cuda = torch.cuda.LongTensor(sizes)
53 | torch.distributed.broadcast(
54 | sizes_cuda,
55 | get_tensor_model_parallel_src_rank(),
56 | group=get_tensor_model_parallel_group(),
57 | )
58 |
59 | # Move back to cpu and unpack.
60 | sizes_cpu = sizes_cuda.cpu()
61 | key_size = {}
62 | key_numel = {}
63 | total_numel = 0
64 | offset = 0
65 | for key in keys:
66 | i = 0
67 | size = []
68 | numel = 1
69 | while sizes_cpu[offset + i] > 0:
70 | this_size = sizes_cpu[offset + i]
71 | size.append(this_size)
72 | numel *= this_size
73 | i += 1
74 | key_size[key] = size
75 | key_numel[key] = numel
76 | total_numel += numel
77 | offset += max_dim
78 |
79 | return key_size, key_numel, total_numel
80 |
81 |
82 | def broadcast_data(keys, data, datatype):
83 | """Broadcast data from rank zero of each model parallel group to the
84 | members of the same model parallel group.
85 |
86 | Arguments:
87 | keys: list of keys in the data dictionary to be broadcast
88 | data: data dictionary of string keys and cpu tensor values.
89 | datatype: torch data type of all tensors in data associated
90 | with keys.
91 | """
92 | # Build (key, size) and (key, number of elements) dictionaries along
93 | # with the total number of elements on all ranks.
94 | key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data)
95 |
96 | # Pack on rank zero.
97 | if get_tensor_model_parallel_rank() == 0:
98 | # Check that all keys have the same data type.
99 | _check_data_types(keys, data, datatype)
100 | # Flatten the data associated with the keys
101 | flatten_data = torch.cat(
102 | [data[key].contiguous().view(-1) for key in keys], dim=0
103 | ).cuda()
104 | else:
105 | flatten_data = torch.empty(
106 | total_numel, device=torch.cuda.current_device(), dtype=datatype
107 | )
108 |
109 | # Broadcast
110 | torch.distributed.broadcast(
111 | flatten_data,
112 | get_tensor_model_parallel_src_rank(),
113 | group=get_tensor_model_parallel_group(),
114 | )
115 |
116 | # Unpack
117 | output = {}
118 | offset = 0
119 | for key in keys:
120 | size = key_size[key]
121 | numel = key_numel[key]
122 | output[key] = flatten_data.narrow(0, offset, numel).view(size)
123 | offset += numel
124 |
125 | return output
126 |
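
A hedged usage sketch (the tensor-parallel group must be initialized; rank 0 of each group supplies the CPU tensors and every rank receives CUDA copies; the batch contents are illustrative):

    batch = {
        "tokens": torch.randint(0, 51200, (4, 2048)),
        "labels": torch.randint(0, 51200, (4, 2048)),
    }
    out = broadcast_data(["tokens", "labels"], batch, torch.int64)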
--------------------------------------------------------------------------------
/codegeex/megatron/mpu/mappings.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import torch
17 |
18 | from .initialize import (
19 | get_tensor_model_parallel_group,
20 | get_tensor_model_parallel_world_size,
21 | get_tensor_model_parallel_rank,
22 | )
23 | from .utils import split_tensor_along_last_dim
24 |
25 |
26 | def _reduce(input_):
27 | """All-reduce the the input tensor across model parallel group."""
28 |
29 | # Bypass the function if we are using only 1 GPU.
30 | if get_tensor_model_parallel_world_size() == 1:
31 | return input_
32 |
33 | # All-reduce.
34 | torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group())
35 |
36 | return input_
37 |
38 |
39 | def _split(input_):
40 | """Split the tensor along its last dimension and keep the
41 | corresponding slice."""
42 |
43 | world_size = get_tensor_model_parallel_world_size()
44 | # Bypass the function if we are using only 1 GPU.
45 | if world_size == 1:
46 | return input_
47 |
48 | # Split along last dimension.
49 | input_list = split_tensor_along_last_dim(input_, world_size)
50 |
51 | # Note: torch.split does not create contiguous tensors by default.
52 | rank = get_tensor_model_parallel_rank()
53 | output = input_list[rank].contiguous()
54 |
55 | return output
56 |
57 |
58 | def _gather(input_):
59 | """Gather tensors and concatinate along the last dimension."""
60 |
61 | world_size = get_tensor_model_parallel_world_size()
62 | # Bypass the function if we are using only 1 GPU.
63 | if world_size == 1:
64 | return input_
65 |
66 | # Size and dimension.
67 | last_dim = input_.dim() - 1
68 | rank = get_tensor_model_parallel_rank()
69 |
70 | tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
71 | tensor_list[rank] = input_
72 | torch.distributed.all_gather(
73 | tensor_list, input_, group=get_tensor_model_parallel_group()
74 | )
75 |
76 | # Note: torch.cat already creates a contiguous tensor.
77 | output = torch.cat(tensor_list, dim=last_dim).contiguous()
78 |
79 | return output
80 |
81 |
82 | class _CopyToModelParallelRegion(torch.autograd.Function):
83 | """Pass the input to the model parallel region."""
84 |
85 | @staticmethod
86 | def symbolic(graph, input_):
87 | return input_
88 |
89 | @staticmethod
90 | def forward(ctx, input_):
91 | return input_
92 |
93 | @staticmethod
94 | def backward(ctx, grad_output):
95 | return _reduce(grad_output)
96 |
97 |
98 | class _ReduceFromModelParallelRegion(torch.autograd.Function):
99 | """All-reduce the input from the model parallel region."""
100 |
101 | @staticmethod
102 | def symbolic(graph, input_):
103 | return _reduce(input_)
104 |
105 | @staticmethod
106 | def forward(ctx, input_):
107 | return _reduce(input_)
108 |
109 | @staticmethod
110 | def backward(ctx, grad_output):
111 | return grad_output
112 |
113 |
114 | class _ScatterToModelParallelRegion(torch.autograd.Function):
115 | """Split the input and keep only the corresponding chuck to the rank."""
116 |
117 | @staticmethod
118 | def symbolic(graph, input_):
119 | return _split(input_)
120 |
121 | @staticmethod
122 | def forward(ctx, input_):
123 | return _split(input_)
124 |
125 | @staticmethod
126 | def backward(ctx, grad_output):
127 | return _gather(grad_output)
128 |
129 |
130 | class _GatherFromModelParallelRegion(torch.autograd.Function):
131 | """Gather the input from model parallel region and concatinate."""
132 |
133 | @staticmethod
134 | def symbolic(graph, input_):
135 | return _gather(input_)
136 |
137 | @staticmethod
138 | def forward(ctx, input_):
139 | return _gather(input_)
140 |
141 | @staticmethod
142 | def backward(ctx, grad_output):
143 | return _split(grad_output)
144 |
145 |
146 | # -----------------
147 | # Helper functions.
148 | # -----------------
149 |
150 |
151 | def copy_to_tensor_model_parallel_region(input_):
152 | return _CopyToModelParallelRegion.apply(input_)
153 |
154 |
155 | def reduce_from_tensor_model_parallel_region(input_):
156 | return _ReduceFromModelParallelRegion.apply(input_)
157 |
158 |
159 | def scatter_to_tensor_model_parallel_region(input_):
160 | return _ScatterToModelParallelRegion.apply(input_)
161 |
162 |
163 | def gather_from_tensor_model_parallel_region(input_):
164 | return _GatherFromModelParallelRegion.apply(input_)
165 |
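
A hedged sketch of the region helpers above (the tensor model parallel group must be initialized; with a world size of 1 each forward pass below is an identity):

    x = torch.randn(4, 1024, device="cuda", requires_grad=True)
    y = copy_to_tensor_model_parallel_region(x)        # forward: identity, backward: all-reduce
    z = gather_from_tensor_model_parallel_region(y)    # forward: all-gather on last dim, backward: split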
--------------------------------------------------------------------------------
/codegeex/megatron/mpu/utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | import torch
18 |
19 |
20 | def ensure_divisibility(numerator, denominator):
21 | """Ensure that numerator is divisible by the denominator."""
22 | assert numerator % denominator == 0, "{} is not divisible by {}".format(
23 | numerator, denominator
24 | )
25 |
26 |
27 | def divide(numerator, denominator):
28 | """Ensure that numerator is divisible by the denominator and return
29 | the division value."""
30 | ensure_divisibility(numerator, denominator)
31 | return numerator // denominator
32 |
33 |
34 | def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False):
35 | """Split a tensor along its last dimension.
36 | Arguments:
37 | tensor: input tensor.
38 | num_partitions: number of partitions to split the tensor
39 | contiguous_split_chunks: If True, make each chunk contiguous
40 | in memory.
41 | """
42 | # Get the size and dimension.
43 | last_dim = tensor.dim() - 1
44 | last_dim_size = divide(tensor.size()[last_dim], num_partitions)
45 | # Split.
46 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
47 | # Note: torch.split does not create contiguous tensors by default.
48 | if contiguous_split_chunks:
49 | return tuple(chunk.contiguous() for chunk in tensor_list)
50 |
51 | return tensor_list
52 |
53 |
54 | class VocabUtility:
55 | """Split the vocabulary into `world_size` chunks amd return the
56 | first and last index of the vocabulary belonging to the `rank`
57 | partition: Note that indecies in [fist, last)"""
58 |
59 | @staticmethod
60 | def vocab_range_from_per_partition_vocab_size(
61 | per_partition_vocab_size, rank, world_size
62 | ):
63 | index_f = rank * per_partition_vocab_size
64 | index_l = index_f + per_partition_vocab_size
65 | return index_f, index_l
66 |
67 | @staticmethod
68 | def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
69 | per_partition_vocab_size = divide(global_vocab_size, world_size)
70 | return VocabUtility.vocab_range_from_per_partition_vocab_size(
71 | per_partition_vocab_size, rank, world_size
72 | )
73 |
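
The two helpers above are pure functions, so a direct sketch needs no distributed setup (numbers are illustrative):

    chunks = split_tensor_along_last_dim(torch.randn(2, 8, 1024), 4)
    # each chunk has shape (2, 8, 256); pass contiguous_split_chunks=True for contiguous copies
    first, last = VocabUtility.vocab_range_from_global_vocab_size(51200, rank=1, world_size=4)
    # first, last == 12800, 25600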
--------------------------------------------------------------------------------
/codegeex/megatron/optimizer/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from apex.optimizers import FusedAdam as Adam
17 | from apex.optimizers import FusedSGD as SGD
18 |
19 | from codegeex.megatron import get_args
20 | from codegeex.megatron.model import LayerNorm
21 |
22 | from .grad_scaler import ConstantGradScaler, DynamicGradScaler
23 | from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer
24 |
25 |
26 | def _get_params_for_weight_decay_optimization(modules):
27 | """Divide params into with-weight-decay and without-weight-decay groups.
28 | Layernorms and biases will have no weight decay but the rest will.
29 | """
30 |
31 | weight_decay_params = {"params": []}
32 | no_weight_decay_params = {"params": [], "weight_decay": 0.0}
33 | for module in modules:
34 | for module_ in module.modules():
35 | if isinstance(module_, LayerNorm):
36 | no_weight_decay_params["params"].extend(
37 | [p for p in list(module_._parameters.values()) if p is not None]
38 | )
39 | else:
40 | weight_decay_params["params"].extend(
41 | [
42 | p
43 | for n, p in list(module_._parameters.items())
44 | if p is not None and n != "bias"
45 | ]
46 | )
47 | no_weight_decay_params["params"].extend(
48 | [
49 | p
50 | for n, p in list(module_._parameters.items())
51 | if p is not None and n == "bias"
52 | ]
53 | )
54 |
55 | return weight_decay_params, no_weight_decay_params
56 |
57 |
58 | def get_megatron_optimizer(model):
59 | args = get_args()
60 |
61 | if args.cpu_optimizer:
62 | raise NotImplementedError("need to add cpu adam")
63 |
64 | param_groups = _get_params_for_weight_decay_optimization(model)
65 |
66 | if args.optimizer == "adam":
67 | optimizer = Adam(
68 | param_groups,
69 | lr=args.lr,
70 | weight_decay=args.weight_decay,
71 | betas=(args.adam_beta1, args.adam_beta2),
72 | eps=args.adam_eps,
73 | )
74 | elif args.optimizer == "sgd":
75 | optimizer = SGD(
76 | param_groups,
77 | lr=args.lr,
78 | weight_decay=args.weight_decay,
79 | momentum=args.sgd_momentum,
80 | )
81 | else:
82 | raise Exception("{} optimizer is not supported.".format(args.optimizer))
83 |
84 | if args.deepspeed:
85 | return optimizer
86 |
87 | # Determine whether the params have main-grad field.
88 | params_have_main_grad = False
89 | if args.DDP_impl == "local":
90 | params_have_main_grad = True
91 |
92 | if args.fp16 or args.bf16:
93 |
94 | # Grad scaler:
95 | # if loss-scale is provided, instantiate the constant scaler.
96 | # if we are using fp16 and loss-scale is not present, use a
97 | # dynamic scaler.
98 | # otherwise we are running in bf16 with no loss-scale so
99 | # leave it as None.
100 | grad_scaler = None
101 | # Constant loss scale.
102 | if args.loss_scale:
103 | grad_scaler = ConstantGradScaler(args.loss_scale)
104 | # Dynamic loss scale.
105 | else:
106 | if args.fp16:
107 | grad_scaler = DynamicGradScaler(
108 | initial_scale=args.initial_loss_scale,
109 | min_scale=args.min_loss_scale,
110 | growth_factor=2.0,
111 | backoff_factor=0.5,
112 | growth_interval=args.loss_scale_window,
113 | hysteresis=args.hysteresis,
114 | )
115 |
116 | # Megatron optimizer.
117 | return Float16OptimizerWithFloat16Params(
118 | optimizer,
119 | args.clip_grad,
120 | args.log_num_zeros_in_grad,
121 | params_have_main_grad,
122 | args.bf16,
123 | grad_scaler,
124 | )
125 |
126 | # FP32.
127 | return FP32Optimizer(
128 | optimizer, args.clip_grad, args.log_num_zeros_in_grad, params_have_main_grad
129 | )
130 |
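
A hedged sketch of the weight-decay split in isolation (get_megatron_optimizer itself additionally needs megatron args and apex installed; the toy model here is purely illustrative):

    toy = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.LayerNorm(16))
    decay, no_decay = _get_params_for_weight_decay_optimization([toy])
    # decay["params"]    -> the Linear weight
    # no_decay["params"] -> the Linear bias plus both LayerNorm parameters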
--------------------------------------------------------------------------------
/codegeex/megatron/optimizer/clip_grads.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Gradient clipping."""
17 |
18 | import torch
19 | from torch._six import inf
20 |
21 | from apex.multi_tensor_apply import multi_tensor_applier
22 | import amp_C
23 |
24 | from codegeex.megatron import mpu
25 | from codegeex.megatron.model.module import param_is_not_shared
26 | from codegeex.megatron.mpu.layers import param_is_not_tensor_parallel_duplicate
27 |
28 |
29 | def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
30 | """Clips gradient norm of an iterable of parameters whose gradients
31 | are in fp32.
32 |
33 | This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
34 | added functionality to handle model parallel parameters. Note that
35 | the gradients are modified in place.
36 |
37 | Arguments:
38 | parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
39 | single Tensor that will have gradients normalized
40 | max_norm (float or int): max norm of the gradients
41 | norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
42 | infinity norm.
43 |
44 | Returns:
45 | Total norm of the parameters (viewed as a single vector).
46 | """
47 |
48 | if isinstance(parameters, torch.Tensor):
49 | parameters = [parameters]
50 |
51 | # Filter parameters based on:
52 | # - grad should not be none
53 | # - parameter should not be shared
54 | # - should not be a replica due to tensor model parallelism
55 | grads = []
56 | grads_for_norm = []
57 | for param in parameters:
58 | grad_not_none = param.grad is not None
59 | is_not_shared = param_is_not_shared(param)
60 | is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
61 | if grad_not_none:
62 | grad = param.grad.detach()
63 | # Make sure the grads are in fp32
64 | assert param.grad.type() == "torch.cuda.FloatTensor"
65 | grads.append(grad)
66 | if grad_not_none and is_not_shared and is_not_tp_duplicate:
67 | grads_for_norm.append(grad)
68 |
69 | # Norm parameters.
70 | max_norm = float(max_norm)
71 | norm_type = float(norm_type)
72 | total_norm = 0.0
73 |
74 | # Calculate norm.
75 | if norm_type == inf:
76 | total_norm = max(grad.abs().max() for grad in grads_for_norm)
77 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
78 | # Take max across all model-parallel GPUs.
79 | torch.distributed.all_reduce(
80 | total_norm_cuda,
81 | op=torch.distributed.ReduceOp.MAX,
82 | group=mpu.get_model_parallel_group(),
83 | )
84 | total_norm = total_norm_cuda[0].item()
85 |
86 | else:
87 | if norm_type == 2.0:
88 | dummy_overflow_buf = torch.cuda.IntTensor([0])
89 | # Use apex's multi-tensor applier for efficiency reasons.
90 | # Multi-tensor applier takes a function and a list of list
91 | # and performs the operation on that list all in one kernel.
92 | grad_norm, _ = multi_tensor_applier(
93 | amp_C.multi_tensor_l2norm,
94 | dummy_overflow_buf,
95 | [grads_for_norm],
96 | False, # no per-parameter norm
97 | )
98 | # Since we will be summing across data parallel groups,
99 | # we need the pow(norm-type).
100 | total_norm = grad_norm ** norm_type
101 |
102 | else:
103 | for grad in grads_for_norm:
104 | grad_norm = torch.norm(grad, norm_type)
105 | total_norm += grad_norm ** norm_type
106 |
107 | # Sum across all model-parallel GPUs.
108 | torch.distributed.all_reduce(
109 | total_norm,
110 | op=torch.distributed.ReduceOp.SUM,
111 | group=mpu.get_model_parallel_group(),
112 | )
113 | total_norm = total_norm.item() ** (1.0 / norm_type)
114 |
115 | # Scale.
116 | clip_coeff = max_norm / (total_norm + 1.0e-6)
117 | if clip_coeff < 1.0:
118 | dummy_overflow_buf = torch.cuda.IntTensor([0])
119 | multi_tensor_applier(
120 | amp_C.multi_tensor_scale, dummy_overflow_buf, [grads, grads], clip_coeff
121 | )
122 |
123 | return total_norm
124 |
125 |
126 | def count_zeros_fp32(parameters):
127 |
128 | if isinstance(parameters, torch.Tensor):
129 | parameters = [parameters]
130 |
131 | # Filter parameters based on:
132 | # - grad should not be none
133 | # - parameter should not be shared
134 | # - should not be a replica due to tensor model parallelism
135 | total_num_zeros = 0.0
136 | for param in parameters:
137 | grad_not_none = param.grad is not None
138 | is_not_shared = param_is_not_shared(param)
139 | is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
140 | if grad_not_none and is_not_shared and is_not_tp_duplicate:
141 | grad = param.grad.detach()
142 | num_zeros = grad.numel() - torch.count_nonzero(grad)
143 | total_num_zeros = num_zeros + total_num_zeros
144 |
145 | # Sum across all model-parallel GPUs.
146 | torch.distributed.all_reduce(
147 | total_num_zeros,
148 | op=torch.distributed.ReduceOp.SUM,
149 | group=mpu.get_model_parallel_group(),
150 | )
151 | total_num_zeros = total_num_zeros.item()
152 |
153 | return total_num_zeros
154 |
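
A hedged calling sketch (apex's amp_C kernels and an initialized model-parallel group are required, and gradients must already be fp32 CUDA tensors; the toy module is illustrative):

    toy = torch.nn.Linear(8, 8).cuda()
    toy(torch.randn(4, 8, device="cuda")).sum().backward()
    grad_norm = clip_grad_norm_fp32(list(toy.parameters()), max_norm=1.0, norm_type=2)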
--------------------------------------------------------------------------------
/codegeex/megatron/optimizer/grad_scaler.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Megatron grad scaler."""
17 |
18 | from abc import ABC
19 | from abc import abstractmethod
20 |
21 | import torch
22 |
23 |
24 | class MegatronGradScaler(ABC):
25 | def __init__(self, initial_scale):
26 | """Initialize scale value with the input initial scale."""
27 | assert initial_scale > 0.0
28 | self._scale = torch.cuda.FloatTensor([initial_scale])
29 |
30 | @property
31 | def scale(self):
32 | return self._scale
33 |
34 | @property
35 | def inv_scale(self):
36 | return self._scale.double().reciprocal().float()
37 |
38 | @abstractmethod
39 | def update(self, found_inf):
40 | pass
41 |
42 | @abstractmethod
43 | def state_dict(self):
44 | pass
45 |
46 | @abstractmethod
47 | def load_state_dict(self, state_dict):
48 | pass
49 |
50 |
51 | class ConstantGradScaler(MegatronGradScaler):
52 | def update(self, found_inf):
53 | pass
54 |
55 | def state_dict(self):
56 | return dict()
57 |
58 | def load_state_dict(self, state_dict):
59 | pass
60 |
61 |
62 | class DynamicGradScaler(MegatronGradScaler):
63 | def __init__(
64 | self,
65 | initial_scale,
66 | min_scale,
67 | growth_factor,
68 | backoff_factor,
69 | growth_interval,
70 | hysteresis,
71 | ):
72 | """ "Grad scaler with dynamic scale that gets adjusted
73 | during training."""
74 | super(DynamicGradScaler, self).__init__(initial_scale)
75 |
76 | # Lower bound on the scale.
77 | assert min_scale > 0.0
78 | assert min_scale <= initial_scale
79 | self.min_scale = torch.cuda.FloatTensor([min_scale])
80 | # Growth and backoff factors for the scale.
81 | assert growth_factor > 1.0
82 | self.growth_factor = torch.cuda.FloatTensor([growth_factor])
83 | assert backoff_factor < 1.0
84 | assert backoff_factor > 0.0
85 | self.backoff_factor = torch.cuda.FloatTensor([backoff_factor])
86 | # Interval over which if we don't see any inf/nan,
87 | # we will scale the grad scale by the growth factor.
88 | assert growth_interval > 0
89 | self.growth_interval = growth_interval
90 | # Number of inf/nans we should see before scaling down
91 | # the grad scale by the backoff factor.
92 | assert hysteresis > 0
93 | self.hysteresis = hysteresis
94 |
95 | # Trackers.
96 | self._growth_tracker = 0
97 | self._hysteresis_tracker = self.hysteresis
98 |
99 | def update(self, found_inf):
100 |
101 | # If we have an inf/nan, growth tracker is set to 0
102 | # and hysteresis tracker is reduced by 1.
103 | if found_inf:
104 | self._growth_tracker = 0
105 | self._hysteresis_tracker -= 1
106 | # Now if we are out of hysteresis count, scale down the loss.
107 | if self._hysteresis_tracker <= 0:
108 | self._scale = torch.max(
109 | self._scale * self.backoff_factor, self.min_scale
110 | )
111 | else:
112 | # If there is no nan/inf, increment the growth tracker.
113 | self._growth_tracker += 1
114 | # If we have had enough consecutive intervals with no nan/inf:
115 | if self._growth_tracker == self.growth_interval:
116 | # Reset the tracker and hysteresis trackers,
117 | self._growth_tracker = 0
118 | self._hysteresis_tracker = self.hysteresis
119 | # and scale up the loss scale.
120 | self._scale = self._scale * self.growth_factor
121 |
122 | def state_dict(self):
123 | state_dict = {}
124 | state_dict["scale"] = self._scale
125 | state_dict["growth_tracker"] = self._growth_tracker
126 | state_dict["hysteresis_tracker"] = self._hysteresis_tracker
127 | return state_dict
128 |
129 | def load_state_dict(self, state_dict):
130 | self._scale = state_dict["scale"].cuda(torch.cuda.current_device())
131 | self._growth_tracker = state_dict["growth_tracker"]
132 | self._hysteresis_tracker = state_dict["hysteresis_tracker"]
133 |
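
A hedged behavior sketch (CUDA is assumed, since the scale is stored in a torch.cuda.FloatTensor; the numbers mirror typical defaults rather than values taken from this repository):

    scaler = DynamicGradScaler(initial_scale=2.0 ** 16, min_scale=1.0,
                               growth_factor=2.0, backoff_factor=0.5,
                               growth_interval=1000, hysteresis=2)
    scaler.update(found_inf=True)    # with hysteresis=2, the scale is halved only after two overflow steps
    scaler.update(found_inf=False)   # growth_interval clean steps in a row multiply it by growth_factor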
--------------------------------------------------------------------------------
/codegeex/megatron/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | from .tokenizer import build_tokenizer
18 |
--------------------------------------------------------------------------------
/codegeex/megatron/tools/collect_env.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | ENV_NAMES = ["CUDA_HOME", "LD_LIBRARY_PATH", "PATH", "TORCH_EXTENSIONS_DIR", "CUDA_LAUNCH_BLOCKING"]
5 |
6 |
7 | def main():
8 | s = ""
9 | for name in ENV_NAMES:
10 | if name in os.environ:
11 | value = os.environ[name]
12 | s += "{}={}\n".format(name, value)
13 | print(f"{name}={value}")
14 | else:
15 | print(f"{name} is not set")
16 |
17 | # write env vars to .deepspeed_env
18 | with open(".deepspeed_env", "w") as f:
19 | f.write(s)
20 |
21 |
22 | if __name__ == "__main__":
23 | main()
24 |
--------------------------------------------------------------------------------
/codegeex/mindspore/configs/13B.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | script_path=$(realpath $BASH_SOURCE)
4 | script_dir=$(dirname $script_path)
5 |
6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory
7 |
8 | GAS=1
9 |
10 | python ${script_dir}/../train.py \
11 | --distribute true \
12 | --device_num $RANK_SIZE \
13 | --sink_size 2 \
14 | --run_type train \
15 | --train_and_eval_mode 0 \
16 | --mode 13B \
17 | --code_data $CODE_DATA_DIR \
18 | --param_init_type fp32 \
19 | --micro_size $GAS \
20 | --seq_length 2048 \
21 | --vocab_size 51200 \
22 | --ckpt_name_prefix code-13B \
23 | --save_checkpoint=True \
24 | --save_checkpoint_path /cache/ckpts \
25 | --save_checkpoint_obs_path \ # TODO: set to obs path for saving ckpts
26 | --save_checkpoint_steps 250 \
27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt
28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt
29 | --per_batch_size 16 \
30 | --dropout_rate 0.1 \
31 | --full_batch 0 \
32 | --epoch_size 1 \
33 | --micro_interleaved_size 1 \
34 | --profiling 0 \
35 | --tb_dir $LOG_PATH
--------------------------------------------------------------------------------
/codegeex/mindspore/configs/13B_128p_save_1p.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | script_path=$(realpath $BASH_SOURCE)
3 | script_dir=$(dirname $script_path)
4 |
5 | CODE_DATA_DIR="" # TODO: set the path to the code data directory
6 |
7 | GAS=32
8 |
9 | python ${script_dir}/../save_1p_ckpt_from_8p_ckpt.py \
10 | --distribute true \
11 | --run_type train \
12 | --train_and_eval_mode 0 \
13 | --mode 13B \
14 | --code_data $CODE_DATA_DIR \
15 | --param_init_type fp32 \
16 | --micro_size $GAS \
17 | --seq_length 2048 \
18 | --vocab_size 51200 \
19 | --ckpt_name_prefix code-13B \
20 | --save_checkpoint=True \
21 | --save_checkpoint_path /cache/ckpts \
22 | --save_checkpoint_obs_path \ # TODO: set to obs path for saving ckpts
23 | --save_checkpoint_steps \ # TODO: set to epoch number of loaded ckpt
24 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt
25 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps
26 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \
27 | --per_batch_size 16 \
28 | --full_batch 0 \
29 | --epoch_size 1 \
30 | --micro_interleaved_size 1 \
31 | --profiling 0 \
32 | --tb_dir $LOG_PATH
33 |
--------------------------------------------------------------------------------
/codegeex/mindspore/configs/13B_128p_save_8p_ckpt.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | script_path=$(realpath $BASH_SOURCE)
4 | script_dir=$(dirname $script_path)
5 |
6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory
7 |
8 | GAS=32
9 |
10 | python ${script_dir}/../save_8p_ckpt.py \
11 | --distribute true \
12 | --device_num $RANK_SIZE \
13 | --sink_size 2 \
14 | --run_type train \
15 | --train_and_eval_mode 0 \
16 | --mode 13B \
17 | --code_data $CODE_DATA_DIR \
18 | --param_init_type fp32 \
19 | --micro_size $GAS \
20 | --seq_length 2048 \
21 | --vocab_size 51200 \
22 | --ckpt_name_prefix code-13B \
23 | --save_checkpoint=True \
24 | --save_checkpoint_path /cache/ckpts \
25 | --save_checkpoint_obs_path \ # TODO: set to obs path for saving ckpts
26 | --save_checkpoint_steps 99999 \
27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt
28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps
29 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \
30 | --per_batch_size 16 \
31 | --full_batch 0 \
32 | --epoch_size 1 \
33 | --micro_interleaved_size 1 \
34 | --profiling 0 \
35 | --tb_dir $LOG_PATH
--------------------------------------------------------------------------------
/codegeex/mindspore/configs/13B_1p_to_torch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | script_path=$(realpath $BASH_SOURCE)
4 | script_dir=$(dirname $script_path)
5 |
6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory
7 |
8 | GAS=32
9 |
10 | python ${script_dir}/../convertion_1p.py \
11 | --distribute false \
12 | --device_num $RANK_SIZE \
13 | --sink_size 2 \
14 | --run_type predict \
15 | --train_and_eval_mode 0 \
16 | --mode 13B \
17 | --code_data $CODE_DATA_DIR \
18 | --param_init_type fp32 \
19 | --micro_size $GAS \
20 | --seq_length 2048 \
21 | --vocab_size 51200 \
22 | --ckpt_name_prefix code-13B \
23 | --save_checkpoint=True \
24 | --save_checkpoint_path /cache/ckpts \
25 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt
26 | --per_batch_size 1 \
27 | --full_batch 1 \
28 | --epoch_size 1 \
29 | --micro_interleaved_size 1 \
30 | --profiling 0 \
31 | --use_past "true" \
32 | --top_p 0.95 \
33 | --top_k_num 100 \
34 | --temperature 0.8 \
35 | --op_level_model_parallel_num 1 \
36 | --frequency_penalty 0.0 \
37 | --presence_penalty 0.0 \
38 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \
39 | --tb_dir $LOG_PATH
40 |
--------------------------------------------------------------------------------
/codegeex/mindspore/configs/13B_finetune.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | script_path=$(realpath $BASH_SOURCE)
4 | script_dir=$(dirname $script_path)
5 |
6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory
7 |
8 | GAS=1
9 |
10 | python ${script_dir}/../finetune.py \
11 | --distribute true \
12 | --device_num $RANK_SIZE \
13 | --sink_size 2 \
14 | --run_type train \
15 | --train_and_eval_mode 1 \
16 | --mode 13B \
17 | --code_data $CODE_DATA_DIR \
18 | --param_init_type fp32 \
19 | --micro_size $GAS \
20 | --seq_length 2048 \
21 | --vocab_size 51200 \
22 | --ckpt_name_prefix code-13B \
23 | --save_checkpoint=True \
24 | --save_checkpoint_path /cache/ckpts \
25 | --save_checkpoint_obs_path \ # TODO: set to obs path for saving ckpts
26 | --save_checkpoint_steps 20 \
27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt
28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt
29 | --per_batch_size 16 \
30 | --dropout_rate 0.1 \
31 | --full_batch 0 \
32 | --epoch_size 5 \
33 | --micro_interleaved_size 1 \
34 | --profiling 0 \
35 | --tb_dir $LOG_PATH
--------------------------------------------------------------------------------
/codegeex/mindspore/configs/13B_generate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | script_path=$(realpath $BASH_SOURCE)
4 | script_dir=$(dirname $script_path)
5 |
6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory
7 |
8 | GAS=32
9 |
10 | python ${script_dir}/../generation.py \
11 | --distribute true \
12 | --device_num $RANK_SIZE \
13 | --sink_size 2 \
14 | --run_type predict \
15 | --train_and_eval_mode 0 \
16 | --mode 13B \
17 | --code_data $CODE_DATA_DIR \
18 | --param_init_type fp32 \
19 | --micro_size $GAS \
20 | --seq_length 2048 \
21 | --vocab_size 51200 \
22 | --ckpt_name_prefix code-13B \
23 | --save_checkpoint=True \
24 | --save_checkpoint_path /cache/ckpts \
25 | --save_checkpoint_obs_path /home \ # TODO: set at will
26 | --save_checkpoint_steps 99999 \ # TODO: set at will
27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt
28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps
29 | --per_batch_size 1 \
30 | --full_batch 1 \
31 | --epoch_size 1 \
32 | --micro_interleaved_size 1 \
33 | --profiling 0 \
34 | --use_past "true" \
35 | --top_p 0.95 \
36 | --top_k_num 100 \
37 | --temperature 0.8 \
38 | --frequency_penalty 0.0 \
39 | --presence_penalty 0.0 \
40 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \
41 | --tb_dir $LOG_PATH
42 |
--------------------------------------------------------------------------------
/codegeex/mindspore/configs/13B_generate_1p.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | script_path=$(realpath $BASH_SOURCE)
4 | script_dir=$(dirname $script_path)
5 |
6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory
7 |
8 | GAS=32
9 |
10 | python ${script_dir}/../generation_1p.py \
11 | --distribute false \
12 | --device_num $RANK_SIZE \
13 | --sink_size 2 \
14 | --run_type predict \
15 | --train_and_eval_mode 0 \
16 | --mode 13B \
17 | --code_data $CODE_DATA_DIR \
18 | --param_init_type fp16 \
19 | --micro_size $GAS \
20 | --seq_length 2048 \
21 | --vocab_size 51200 \
22 | --ckpt_name_prefix code-13B \
23 | --save_checkpoint=True \
24 | --save_checkpoint_path /cache/ckpts \
25 | --save_checkpoint_obs_path /home \ # TODO: set at will
26 | --save_checkpoint_steps 99999 \ # TODO: set at will
27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt
28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps
29 | --per_batch_size 1 \
30 | --full_batch 1 \
31 | --epoch_size 1 \
32 | --micro_interleaved_size 1 \
33 | --profiling 0 \
34 | --use_past "true" \
35 | --top_p 0.95 \
36 | --top_k_num 100 \
37 | --temperature 0.8 \
38 | --op_level_model_parallel_num 1 \
39 | --frequency_penalty 0.0 \
40 | --presence_penalty 0.0 \
41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \
42 | --tb_dir $LOG_PATH
43 |
--------------------------------------------------------------------------------
/codegeex/mindspore/configs/13B_generate_1p_values.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | script_path=$(realpath $BASH_SOURCE)
4 | script_dir=$(dirname $script_path)
5 |
6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory
7 |
8 | GAS=32
9 |
10 | python ${script_dir}/../generation_values_1p.py \
11 | --distribute false \
12 | --device_num $RANK_SIZE \
13 | --sink_size 2 \
14 | --run_type predict \
15 | --train_and_eval_mode 0 \
16 | --mode 13B \
17 | --code_data $CODE_DATA_DIR \
18 | --param_init_type fp16 \
19 | --micro_size $GAS \
20 | --seq_length 2048 \
21 | --vocab_size 51200 \
22 | --ckpt_name_prefix code-13B \
23 | --save_checkpoint=True \
24 | --save_checkpoint_path /cache/ckpts \
25 | --save_checkpoint_obs_path /home \ # TODO: set at will
26 | --save_checkpoint_steps 213000 \ # TODO: set at will
27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt
28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt
29 | --per_batch_size 1 \
30 | --full_batch 1 \
31 | --epoch_size 1 \
32 | --micro_interleaved_size 1 \
33 | --profiling 0 \
34 | --use_past "false" \
35 | --top_p 0.95 \
36 | --top_k_num 100 \
37 | --temperature 0.8 \
38 | --op_level_model_parallel_num 1 \
39 | --frequency_penalty 0.0 \
40 | --presence_penalty 0.0 \
41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \
42 | --tb_dir $LOG_PATH
43 |
--------------------------------------------------------------------------------
/codegeex/mindspore/configs/13B_generate_finetune.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | script_path=$(realpath $BASH_SOURCE)
4 | script_dir=$(dirname $script_path)
5 |
6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory
7 |
8 | GAS=32
9 |
10 | python ${script_dir}/../generation_finetune.py \
11 | --distribute true \
12 | --device_num $RANK_SIZE \
13 | --sink_size 2 \
14 | --run_type predict \
15 | --train_and_eval_mode 0 \
16 | --mode 13B \
17 | --code_data $CODE_DATA_DIR \
18 | --param_init_type fp32 \
19 | --micro_size $GAS \
20 | --seq_length 2048 \
21 | --vocab_size 51200 \
22 | --max_generate_length 1024 \
23 | --ckpt_name_prefix code-13B \
24 | --save_checkpoint=True \
25 | --save_checkpoint_path /cache/ckpts \
26 | --save_checkpoint_obs_path /home \ # TODO: set at will
27 | --save_checkpoint_steps 99999 \ # TODO: set at will
28 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt
29 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps
30 | --per_batch_size 6 \
31 | --full_batch 1 \
32 | --epoch_size 1 \
33 | --micro_interleaved_size 1 \
34 | --profiling 0 \
35 | --use_past "true" \
36 | --top_p 0.95 \
37 | --top_k_num 100 \
38 | --temperature 0.2 \
39 | --frequency_penalty 0.0 \
40 | --presence_penalty 0.0 \
41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \
42 | --tb_dir $LOG_PATH \
43 | --language $LANGUAGE
44 |
45 |
--------------------------------------------------------------------------------
/codegeex/mindspore/configs/13B_generate_humaneval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | script_path=$(realpath $BASH_SOURCE)
4 | script_dir=$(dirname $script_path)
5 |
6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory
7 |
8 | GAS=32
9 |
10 | python ${script_dir}/../generation_humaneval.py \
11 | --distribute true \
12 | --device_num $RANK_SIZE \
13 | --sink_size 2 \
14 | --run_type predict \
15 | --train_and_eval_mode 0 \
16 | --mode 13B \
17 | --code_data $CODE_DATA_DIR \
18 | --param_init_type fp32 \
19 | --micro_size $GAS \
20 | --seq_length 2048 \
21 | --vocab_size 51200 \
22 | --max_generate_length 1024 \
23 | --ckpt_name_prefix code-13B \
24 | --save_checkpoint=True \
25 | --save_checkpoint_path /cache/ckpts \
26 | --save_checkpoint_obs_path /home \ # TODO: set at will
27 | --save_checkpoint_steps 99999 \ # TODO: set at will
28 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt
29 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps
30 | --per_batch_size 6 \
31 | --full_batch 1 \
32 | --epoch_size 1 \
33 | --micro_interleaved_size 1 \
34 | --profiling 0 \
35 | --use_past "true" \
36 | --top_p 0.95 \
37 | --top_k_num 100 \
38 | --temperature 0.8 \
39 | --frequency_penalty 0.0 \
40 | --presence_penalty 0.0 \
41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \
42 | --tb_dir $LOG_PATH \
43 | --part $PART
44 |
45 |
--------------------------------------------------------------------------------
/codegeex/mindspore/configs/13B_generate_values.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | script_path=$(realpath $BASH_SOURCE)
4 | script_dir=$(dirname $script_path)
5 |
6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory
7 |
8 | GAS=32
9 |
10 | python ${script_dir}/../generation_values.py \
11 | --distribute true \
12 | --device_num $RANK_SIZE \
13 | --sink_size 2 \
14 | --run_type predict \
15 | --train_and_eval_mode 0 \
16 | --mode 13B \
17 | --code_data $CODE_DATA_DIR \
18 | --param_init_type fp32 \
19 | --micro_size $GAS \
20 | --seq_length 2048 \
21 | --vocab_size 51200 \
22 | --max_generate_length 2048 \
23 | --ckpt_name_prefix code-13B \
24 | --save_checkpoint=True \
25 | --save_checkpoint_path /cache/ckpts \
26 | --save_checkpoint_obs_path /home \ # TODO: set at will
27 | --save_checkpoint_steps 99999 \ # TODO: set at will
28 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt
29 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps
30 | --per_batch_size 6 \
31 | --full_batch 1 \
32 | --epoch_size 1 \
33 | --micro_interleaved_size 1 \
34 | --profiling 0 \
35 | --use_past "true" \
36 | --top_p 0.95 \
37 | --top_k_num 100 \
38 | --temperature 1.0 \
39 | --frequency_penalty 0.0 \
40 | --presence_penalty 0.0 \
41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \
42 | --tb_dir $LOG_PATH
43 |
--------------------------------------------------------------------------------
/codegeex/mindspore/scripts/custom_tune_bank_new/Ascend910ProA/cube/repository_ascend910ProA_matmul.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/mindspore/scripts/custom_tune_bank_new/Ascend910ProA/cube/repository_ascend910ProA_matmul.bin
--------------------------------------------------------------------------------
/codegeex/mindspore/scripts/ma-pre-start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | source ~/.bashrc
4 | echo "Start to install the run package"
5 | WORK_DIR=start_1.7
6 | RUN_DIR=run
7 | mindspore_file=mindspore_ascend-1.7.0-cp37-cp37m-linux_aarch64.whl
8 | LOCAL_DIR=$(cd "$(dirname "$0")";pwd)
9 | echo $LOCAL_DIR
10 |
11 | echo "===current dir="
12 | ls ./${WORK_DIR}/${RUN_DIR}
13 |
14 | pip install ./${WORK_DIR}/${mindspore_file} -i http://100.125.33.126:8888/repository/pypi/simple --trusted-host=100.125.33.126
15 | sudo chmod +755 -R /usr/local/Ascend/nnae
16 | sudo rm -rf /usr/local/Ascend/nnae
17 |
18 | sudo chmod +x ./${WORK_DIR}/${RUN_DIR}/*.run
19 | sudo bash ./${WORK_DIR}/${RUN_DIR}/Ascend* --full --quiet
20 |
21 | export HCCL_CONNECT_TIMEOUT=1800 # maximum wait time for establishing HCCL communication links, in seconds
22 |
23 | echo "======/usr/local/Ascend======"
24 | ls -al /usr/local/Ascend
25 | echo "======/usr/local/Ascend/ascend-toolkit/======"
26 | ls -al /usr/local/Ascend/ascend-toolkit/
27 | echo "======/usr/local/Ascend/ascend-toolkit/latest======"
28 | ls -al /usr/local/Ascend/ascend-toolkit/latest
29 | echo "======/usr/local/Ascend/driver/lib64========"
30 | ls -al /usr/local/Ascend/driver/lib64
31 | echo "======/usr/local/Ascend/driver/lib64/common======="
32 | ls -al /usr/local/Ascend/driver/lib64/common
33 | echo "=======/usr/local/Ascend/driver/lib64/driver======="
34 | ls -al /usr/local/Ascend/driver/lib64/driver
35 | echo "============/usr/local/Ascend/ascend-toolkit/5.1.RC1============="
36 | ls -al /usr/local/Ascend/ascend-toolkit/5.1.RC1
37 | sudo mkdir /usr/local/Ascend/nnae
38 | sudo chmod +755 -R /usr/local/Ascend/nnae
39 | #sudo mkdir /usr/local/Ascend/nnae/latest
40 | #sudo chmod +755 -R /usr/local/Ascend/nnae/latest
41 | sudo ln -s /usr/local/Ascend/ascend-toolkit/5.1.RC1 /usr/local/Ascend/nnae/latest
42 | echo "======/usr/local/Ascend/nnae======"
43 | ls -al /usr/local/Ascend/nnae
44 | echo "======/usr/local/Ascend/nnae/latest======"
45 | ls -al /usr/local/Ascend/nnae/latest
46 | echo "======/usr/local/Ascend/nnae/latest/lib64/libhccl.so======"
47 | ls -al /usr/local/Ascend/nnae/latest/lib64/libhccl.so
48 |
49 | # sudo cp -fp ${LOCAL_DIR}/${WORK_DIR}/libhccl.so /usr/local/Ascend/nnae/latest/lib64/libhccl.so
50 | echo "======/usr/local/Ascend/nnae/latest/lib64/libhccl.so======"
51 | ls -al /usr/local/Ascend/nnae/latest/lib64/libhccl.so
52 |
53 | echo "======/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py======"
54 | ls -al /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py
55 |
56 | echo "======/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py======"
57 | ls -al /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm_x_backprop_v2.py
58 |
59 |
60 | sudo cp -fp ${LOCAL_DIR}/${WORK_DIR}/layer_norm.py /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py
61 | sudo cp -fp ${LOCAL_DIR}/${WORK_DIR}/layer_norm_x_backprop_v2.py /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm_x_backprop_v2.py
62 |
63 | chmod +777 /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm_x_backprop_v2.py
64 | chmod +777 /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py
65 |
66 | echo "======/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py====new=="
67 | ls -al /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py
68 |
69 | echo "======/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py====new=="
70 | ls -al /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm_x_backprop_v2.py
71 |
72 | ls -al ${LOCAL_DIR}/${WORK_DIR}/custom_tune_bank_new
73 |
74 | export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH
75 | export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/plugin/nnengine:$LD_LIBRARY_PATH
76 | export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:$PATH
77 | export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest
78 | export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp
79 | export TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit
80 | export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest:$ASCEND_HOME_PATH
81 |
82 | echo "-------------------uninstall te topi and hccl--------------------------"
83 | sudo pip uninstall te -y
84 | sudo pip uninstall topi -y
85 | sudo pip uninstall hccl -y
86 | echo "-------------------install te topi and hccl--------------------------"
87 | pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/topi-0.4.0-py3-none-any.whl
88 | pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/te-0.4.0-py3-none-any.whl
89 | pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/hccl-0.1.0-py3-none-any.whl
90 | pip install /usr/local/Ascend/ascend-toolkit/latest/tools/hccl_parser-0.1-py3-none-any.whl
91 |
92 |
93 | export GLOG_v=3 # MindSpore log level: 1: Info, 2: Warning, 3: Error
94 | export ASCEND_GLOBAL_LOG_LEVEL=3 # log level of the underlying Ascend software: 1: Info, 2: Warning, 3: Error
95 | export ASCEND_GLOBAL_EVENT_ENABLE=1 # event logging of the underlying Ascend software: 0: disable, 1: enable
96 | export ASCEND_SLOG_PRINT_TO_STDOUT=0 # whether to redirect the underlying logs to stdout: 0: disable, 1: enable
97 |
98 | export ENABLE_TUNE_BANK=True
99 | export TUNE_BANK_PATH=${LOCAL_DIR}/${WORK_DIR}/custom_tune_bank_new
100 |
101 | env
102 |
103 | mkdir -p /cache/ckpts
104 | mkdir -p /home/work/sfs/cache/${BATCH_JOB_ID}/1
105 | mkdir -p /home/work/sfs/cache/${BATCH_JOB_ID}/2
106 |
107 | sudo chmod +777 -R /cache/ckpts
108 | sudo chmod +777 -R /home/work/sfs/cache/${BATCH_JOB_ID}
109 |
110 | export GROUP_INFO_FILE=/home/work/sfs/cache/${BATCH_JOB_ID}/group_info_file.pb
111 |
--------------------------------------------------------------------------------
/codegeex/mindspore/scripts/run_modelarts.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import time
4 | from pathlib import Path
5 |
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument("--work_dir", type=str, required=True)
8 | parser.add_argument("--script", type=str, required=True)
9 | parser.add_argument("--data_url", type=str, default=None)
10 | parser.add_argument("--train_url", type=str, default=None)
11 |
12 | args = parser.parse_args()
13 |
14 | log_path = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID"), f'device{os.environ.get("RANK_ID")}')
15 | tb_path = os.path.join(args.work_dir, "runs", os.environ.get("JOB_ID"))
16 |
17 | Path(log_path).mkdir(parents=True, exist_ok=True)
18 | Path(tb_path).mkdir(parents=True, exist_ok=True)
19 |
20 | log_path_prefix_1 = os.path.join(args.work_dir, "logs")
21 |
22 | os.environ["LOG_PATH"] = tb_path
23 |
24 | print("=================RANK_TABLE_FILE: ", os.environ["RANK_TABLE_FILE"], flush=True)
25 | print("=================ms import done", flush=True)
26 | time.sleep(10)
27 | os.system(
28 | "cp /home/work/rank_table/jobstart_hccl.json /home/work/sfs/xx; sudo chmod +777 /home/work/rank_table/jobstart_hccl.json")
29 | ret = os.system(f"cd {log_path} && bash {args.script} 2>&1 | tee output.log")
30 | if os.environ.get("RANK_ID") == "0":  # RANK_ID is an environment string, compare against "0"
31 | log_dir = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID"))
32 | os.system(f"sudo chmod +777 -R {tb_path}")
33 | os.system(f"sudo chmod +777 -R {log_dir}")
34 | print("==========ret code is: ", ret, flush=True)
35 | if ret != 0:
36 | raise RuntimeError("ret code is :" + str(ret))
37 |
--------------------------------------------------------------------------------
/codegeex/mindspore/scripts/run_modelarts_gen_finetune.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import time
4 | from pathlib import Path
5 |
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument("--work_dir", type=str, required=True)
8 | parser.add_argument("--script", type=str, required=True)
9 | parser.add_argument("--data_url", type=str, default=None)
10 | parser.add_argument("--train_url", type=str, default=None)
11 | parser.add_argument("--language", type=str, default=None)
12 |
13 | args = parser.parse_args()
14 |
15 | log_path = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID"), f'device{os.environ.get("RANK_ID")}')
16 | tb_path = os.path.join(args.work_dir, "runs", os.environ.get("JOB_ID"))
17 |
18 | Path(log_path).mkdir(parents=True, exist_ok=True)
19 | Path(tb_path).mkdir(parents=True, exist_ok=True)
20 |
21 | log_path_prefix_1 = os.path.join(args.work_dir, "logs")
22 |
23 | os.environ["LOG_PATH"] = tb_path
24 | if args.language is not None:
25 | os.environ["LANGUAGE"] = args.language
26 | else:
27 | os.environ["LANGUAGE"] = "Null"
28 |
29 | print("=================RANK_TABLE_FILE: ", os.environ["RANK_TABLE_FILE"], flush=True)
30 | print("=================ms import done", flush=True)
31 | time.sleep(10)
32 | os.system(
33 | "cp /home/work/rank_table/jobstart_hccl.json /home/work/sfs/xx; sudo chmod +777 /home/work/rank_table/jobstart_hccl.json")
34 | ret = os.system(f"cd {log_path} && bash {args.script} 2>&1 | tee output.log")
35 | if os.environ.get("RANK_ID") == "0":  # RANK_ID is an environment string, compare against "0"
36 | log_dir = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID"))
37 | os.system(f"sudo chmod +777 -R {tb_path}")
38 | os.system(f"sudo chmod +777 -R {log_dir}")
39 | print("==========ret code is: ", ret, flush=True)
40 | if ret != 0:
41 | raise RuntimeError("ret code is :" + str(ret))
42 |
--------------------------------------------------------------------------------
/codegeex/mindspore/scripts/run_modelarts_gen_humaneval_x.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import time
4 | from pathlib import Path
5 |
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument("--work_dir", type=str, required=True)
8 | parser.add_argument("--script", type=str, required=True)
9 | parser.add_argument("--data_url", type=str, default=None)
10 | parser.add_argument("--train_url", type=str, default=None)
11 | parser.add_argument("--part", type=str, default=None)
12 |
13 | args = parser.parse_args()
14 |
15 | log_path = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID"), f'device{os.environ.get("RANK_ID")}')
16 | tb_path = os.path.join(args.work_dir, "runs", os.environ.get("JOB_ID"))
17 |
18 | Path(log_path).mkdir(parents=True, exist_ok=True)
19 | Path(tb_path).mkdir(parents=True, exist_ok=True)
20 |
21 | log_path_prefix_1 = os.path.join(args.work_dir, "logs")
22 |
23 | os.environ["LOG_PATH"] = tb_path
24 | if args.part is not None:
25 | os.environ["PART"] = args.part
26 | else:
27 | os.environ["PART"] = "-1"
28 |
29 | print("=================RANK_TABLE_FILE: ", os.environ["RANK_TABLE_FILE"], flush=True)
30 | print("=================ms import done", flush=True)
31 | time.sleep(10)
32 | os.system(
33 | "cp /home/work/rank_table/jobstart_hccl.json /home/work/sfs/xx; sudo chmod +777 /home/work/rank_table/jobstart_hccl.json")
34 | ret = os.system(f"cd {log_path} && bash {args.script} 2>&1 | tee output.log")
35 | if os.environ.get("RANK_ID") == "0":  # RANK_ID is an environment string, compare against "0"
36 | log_dir = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID"))
37 | os.system(f"sudo chmod +777 -R {tb_path}")
38 | os.system(f"sudo chmod +777 -R {log_dir}")
39 | print("==========ret code is: ", ret, flush=True)
40 | if ret != 0:
41 | raise RuntimeError("ret code is :" + str(ret))
42 |
--------------------------------------------------------------------------------
/codegeex/mindspore/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/mindspore/src/__init__.py
--------------------------------------------------------------------------------
/codegeex/mindspore/src/code_tokenizer.py:
--------------------------------------------------------------------------------
1 | from typing import *
2 |
3 | import numpy as np
4 | from transformers import AutoTokenizer
5 | from transformers.models.gpt2 import GPT2TokenizerFast
6 |
7 |
8 | def encode_whitespaces(text, start_extra_id: int, max_len: int):
9 | """ Encode whitespaces to extra tokens in GPT-J.
10 |
11 | >>> encode_whitespaces('a\\n b\\n c', 10, 10)
12 | 'a\\n<|extratoken_10|>b\\n<|extratoken_11|>c'
13 | """
14 |
15 | def push_acc_space(acc_len: int, text: str):
16 | if acc_len == 0:
17 | return text
18 | if acc_len == 1:
19 | return text + ' '
20 | assert acc_len <= max_len, f'Max whitespace run length {max_len}, but found {acc_len}'
21 | extra_id = start_extra_id - 2 + acc_len
22 | extra_token = f'<|extratoken_{extra_id}|>'
23 | return text + extra_token
24 |
25 | acc_len = 0
26 | res = ''
27 | for ch in text:
28 | if ch == ' ':
29 | acc_len += 1
30 | if acc_len == max_len:
31 | res = push_acc_space(acc_len, res)
32 | acc_len = 0
33 | else:
34 | res = push_acc_space(acc_len, res)
35 | acc_len = 0
36 | res = res + ch
37 |
38 | res = push_acc_space(acc_len, res)
39 |
40 | return res
41 |
42 |
43 | def decode_whitespaces(text: str, start_extra_id: int, max_len: int):
44 | """ Decode the whitespace-encoded strings produced by encode_whitespace.
45 |
46 | >>> text = 'a\\n b\\n c'
47 | >>> s, l = 10, 10
48 | >>> text == decode_whitespaces(encode_whitespaces(text, s, l), s, l)
49 | True
50 | """
51 | for l in range(2, max_len + 1):
52 | token_id = start_extra_id - 2 + l
53 | token = f'<|extratoken_{token_id}|>'
54 | text = text.replace(token, ' ' * l)
55 | return text
56 |
57 |
58 | class Code13BDictionary(object):
59 | def __init__(
60 | self,
61 | dict_file: str,
62 | extra_token_ids: List[str] = None,
63 | pad_to_vocab_size: int = -1,
64 | ):
65 | self._idx = dict()
66 | self._count = dict()
67 | self._num_symbols = 0
68 | self._symbols = []
69 |
70 | self._add_symbol("<s>", 0)
71 | self._add_symbol("<pad>", 0)
72 | self._add_symbol("</s>", 0)
73 | self._add_symbol("<unk>", 0)
74 | self._load_dict(dict_file)
75 |
76 | if extra_token_ids is None:
77 | extra_token_ids = [
78 | str(x) for x in range(50257, 50400)
79 | ] # follows GPT-J settings
80 |
81 | for token_id in extra_token_ids:
82 | self._add_symbol(token_id, 0)
83 |
84 | if pad_to_vocab_size > 0:
85 | self._pad_to_vocab_size(pad_to_vocab_size)
86 |
87 | def _pad_to_vocab_size(self, vocab_size: int):
88 | num_pad = vocab_size - len(self)
89 | if num_pad <= 0:
90 | return
91 | for i in range(1, num_pad + 1):
92 | self._add_symbol("vocab_pad_token{}".format(i), 0)
93 |
94 | def _load_dict(self, dict_file: str):
95 | with open(dict_file, "r") as f:
96 | for line in f:
97 | line = line.strip()
98 | if line == "" or line.startswith("#"):
99 | continue
100 | sym, count = line.split()
101 | self._add_symbol(sym, int(count))
102 |
103 | def _add_symbol(self, sym: str, count: int):
104 | self._idx[sym] = self._num_symbols
105 | self._count[sym] = count
106 | self._symbols.append(sym)
107 | self._num_symbols += 1
108 |
109 | def __len__(self):
110 | return self._num_symbols
111 |
112 | def index(self, sym: str):
113 | return self._idx[sym]
114 |
115 | def string(self, idx: int):
116 | return self._symbols[idx]
117 |
118 | def map_token(self, token: Union[int, str]):
119 | if isinstance(token, int):
120 | token = str(token)
121 | return self.index(token)
122 |
123 | def map_tokens(self, tokens):
124 | return [self.map_token(token) for token in tokens]
125 |
126 | def decode_tokens(self, tokens):
127 | decoded = [self.string(token) for token in tokens]
128 | return [int(x) for x in decoded if not x.startswith("vocab_pad_token")]
129 |
130 |
131 | class CodeTokenizer(object):
132 | def __init__(
133 | self,
134 | tokenizer: GPT2TokenizerFast = None,
135 | start_extra_id: int = 10,
136 | max_len: int = 10,
137 | mode='13b',
138 | dict_file: str = None,
139 | ):
140 | self.tokenizer = tokenizer if tokenizer is not None else AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
141 | if mode not in ['6b', '13b']:
142 | raise ValueError(f"Invalid mode {mode}, choose from ['6b', '13b']")
143 | self.start_extra_id = start_extra_id
144 | self.max_len = max_len
145 | self.mode = mode
146 | self.code_dict = Code13BDictionary(dict_file, pad_to_vocab_size=51200) if self.mode == '13b' else None
147 | self.eos_token_id = self.tokenizer.eos_token_id
148 |
149 | def encode_code(self, code: str):
150 | if self.mode == '6b':
151 | code = encode_whitespaces(code, self.start_extra_id, self.max_len)
152 | input_ids = self.tokenizer(code).input_ids
153 |
154 | elif self.mode == '13b':
155 | code = encode_whitespaces(code, self.start_extra_id, self.max_len)
156 | input_ids = self.code_dict.map_tokens(self.tokenizer.encode(code))
157 | input_ids = np.array(input_ids, dtype=np.int64).reshape(1, -1)
158 |
159 | return input_ids
160 |
161 | def decode_code(self, input_ids):
162 | if self.mode == '6b':
163 | texts = self.tokenizer.batch_decode(input_ids)
164 | output_code = [decode_whitespaces(text, self.start_extra_id, self.max_len) for text in texts]
165 |
166 | elif self.mode == '13b':
167 | input_ids = [self.code_dict.decode_tokens(input_ids.tolist()[0])]
168 | texts = self.tokenizer.batch_decode(input_ids)
169 | output_code = [decode_whitespaces(text, self.start_extra_id, self.max_len) for text in texts]
170 |
171 | return output_code
172 |
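A minimal round-trip sketch for the whitespace helpers defined above; the import path is an assumption, and the two functions can also be copied out directly:

from codegeex.mindspore.src.code_tokenizer import encode_whitespaces, decode_whitespaces  # assumed import path

text = "def f():\n    return 1"
encoded = encode_whitespaces(text, start_extra_id=10, max_len=10)
# the run of 4 spaces maps to extra token 10 - 2 + 4 = 12:
# "def f():\n<|extratoken_12|>return 1"
assert decode_whitespaces(encoded, 10, 10) == text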
--------------------------------------------------------------------------------
/codegeex/mindspore/src/metrics.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Huawei Technologies Co., Ltd
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================
15 | """
16 | Eval metrics
17 | """
18 |
19 | import math
20 |
21 | from mindspore import context
22 | from mindspore.communication.management import get_rank, get_group_size
23 | from mindspore.nn.metrics import Metric
24 |
25 |
26 | class PPLMetric(Metric):
27 | """
28 | Ppl metric
29 | """
30 |
31 | def __init__(self, data_length):
32 | super(PPLMetric, self).__init__()
33 | self.clear()
34 | self.data_length = data_length
35 | pipeline_stages = context.get_auto_parallel_context("pipeline_stages")
36 | per_stage_device_num = get_group_size() // pipeline_stages
37 | stage_id = get_rank() // per_stage_device_num
38 | self.is_last_stage = (stage_id == pipeline_stages - 1)
39 |
40 | def clear(self):
41 | """Clear the internal evaluation result."""
42 | self.PPL = []
43 | self.tokens_count = 0
44 |
45 | def update(self, *inputs): # inputs
46 | """Update list of ppl"""
47 | if not self.is_last_stage:
48 | return
49 | logits = inputs[0].asnumpy().flatten().tolist() # logits
50 | self.PPL.append(logits[0] * self.data_length)
51 | self.tokens_count += 1
52 |
53 | def eval(self):
54 | if not self.is_last_stage:
55 | return 0
56 | if self.tokens_count == 0:
57 | print("Warning: tokens_count is 0")
58 | return 0
59 | val_loss = sum(self.PPL) / (self.tokens_count * self.data_length)
60 | ppl = math.exp(min(20, val_loss))
61 | # print("====" * 20 + " ppl end")
62 | # print("====" * 20 + " ppl: {}".format(ppl))
63 | # return ppl
64 | return val_loss
65 |
66 |
67 | class ValidationLoss(Metric):
68 | def __init__(self, data_length):
69 | super(ValidationLoss, self).__init__()
70 | self.clear()
71 | self.data_length = data_length
72 | pipeline_stages = context.get_auto_parallel_context("pipeline_stages")
73 | per_stage_device_num = get_group_size() // pipeline_stages
74 | stage_id = get_rank() // per_stage_device_num
75 | self.is_last_stage = (stage_id == pipeline_stages - 1)
76 |
77 | def clear(self):
78 | """Clear the internal evaluation result."""
79 | self.metric = []
80 | self.tokens_count = 0
81 |
82 | def update(self, *inputs): # inputs
83 | """Update list of ppl"""
84 | # logits = inputs[0].asnumpy()
85 | # if self.rank % 8 == 0:
86 | # print("====" * 2 + " logits: {}".format(logits), flush=True)
87 | # self.metric.append(logits)
88 | if not self.is_last_stage:
89 | return
90 | logits = inputs[0].asnumpy().flatten().tolist() # logits
91 | self.metric.append(logits[0] * self.data_length)
92 | self.tokens_count += 1
93 |
94 | def eval(self):
95 | if not self.is_last_stage:
96 | return 0
97 | val_loss = sum(self.metric) / (self.tokens_count * self.data_length)
98 | return val_loss
99 |
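PPLMetric above averages the per-step losses (each scaled by data_length) and exponentiates the clipped mean to obtain perplexity. A standalone sketch of the same arithmetic, with step_losses as assumed example values:

import math

step_losses = [2.1, 1.9, 2.0]  # assumed per-step loss values reported during evaluation
data_length = 1                # same role as the data_length passed to PPLMetric
val_loss = sum(l * data_length for l in step_losses) / (len(step_losses) * data_length)
ppl = math.exp(min(20, val_loss))  # clipped at 20 to avoid overflow, as in PPLMetric.eval
print(val_loss, ppl)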
--------------------------------------------------------------------------------
/codegeex/mindspore/src/tokenization_jieba.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for OpenAI GPT."""
16 | from __future__ import (absolute_import, division, print_function,
17 | unicode_literals)
18 |
19 | from io import open
20 |
21 | import jieba
22 | import sentencepiece as spm
23 |
24 |
25 | class JIEBATokenizer():
26 | r"""
27 | Jieba Tokenizer
28 | """
29 |
30 | def __init__(self, vocab_file, model_file, max_len=None):
31 | self.max_len = max_len if max_len is not None else int(1e12)
32 | f = open(vocab_file, 'r')
33 | lines = f.readlines()
34 | self.encoder = {}
35 | for line in enumerate(lines):
36 | key = line[1].split('\t')[0]
37 | self.encoder[key] = line[0]
38 |
39 | self.decoder = {v: k for k, v in self.encoder.items()}
40 |
41 | self.sp = spm.SentencePieceProcessor(model_file=model_file)
42 | self.translator = str.maketrans(" \n", "\u2582\u2583")
43 |
44 | self.eod_id = self.encoder['<eod>']
45 | self.eot_id = self.encoder['<eot>']
46 | self.pad_id = self.encoder['<pad>']
47 |
48 | @property
49 | def vocab_size(self):
50 | return len(self.encoder)
51 |
52 | def __len__(self):
53 | return len(self.encoder) + len(self.special_tokens)
54 |
55 | @property
56 | def eod(self):
57 | return self.eod_id
58 |
59 | def tokenize(self, text):
60 | """ Tokenize a string. """
61 | seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]
62 | new_seg = " ".join(seg_list)
63 | return self.sp.encode(new_seg)
64 |
65 | def convert_tokens_to_ids(self, tokens):
66 | return tokens
67 |
68 | def convert_ids_to_tokens(self, ids):
69 | return self.decode(ids)
70 |
71 | def encode(self, text):
72 | res = self.tokenize(text)
73 | return res
74 |
75 | def decode(self, tokens):
76 | text = self.sp.decode(tokens)
77 | text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n')
78 | return text
79 |
--------------------------------------------------------------------------------
/codegeex/oneflow/__init__.py:
--------------------------------------------------------------------------------
1 | from .codegeex_model import CodeGeeXModel
--------------------------------------------------------------------------------
/codegeex/paddle/__init__.py:
--------------------------------------------------------------------------------
1 | from .codegeex_model import CodeGeeXModel
--------------------------------------------------------------------------------
/codegeex/paddle/pt_to_pdparams.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import paddle
3 | import torch
4 |
5 | linear_layer = [
6 | "mlp.dense_h_to_4h",
7 | "mlp.dense_4h_to_h",
8 | "attention.query",
9 | "attention.key",
10 | "attention.value",
11 | "attention.dense",
12 | ]
13 |
14 |
15 | def WalkDict(x):
16 | for i in x:
17 | if isinstance(x[i], dict):
18 | WalkDict(x[i])
19 | elif isinstance(x[i], torch.Tensor):
20 | print(f"Converting '{i}' from 'torch.Tensor' to 'numpy.ndarray'.")
21 | npy = x[i].cpu().numpy()
22 | if any([f".{layer}.weight" in i for layer in linear_layer]):
23 | print(f"Transposing linear layer weight '{i}'.")
24 | x[i] = npy.T
25 | else:
26 | x[i] = npy
27 |
28 |
29 | def parse_opt():
30 | parser = argparse.ArgumentParser()
31 | parser.add_argument(
32 | "--pt",
33 | type=str,
34 | required=True,
35 | help="Path to pt checkpoint."
36 | )
37 | parser.add_argument(
38 | "--pdparams",
39 | type=str,
40 | required=True,
41 | help="Path to pdparams checkpoint."
42 | )
43 | opt = parser.parse_args()
44 | return opt
45 |
46 |
47 | def main(opt):
48 | state_dict = torch.load(opt.pt)
49 | WalkDict(state_dict)
50 | paddle.save(state_dict, opt.pdparams)
51 |
52 |
53 | if __name__ == "__main__":
54 | opt = parse_opt()
55 | main(opt)
56 |
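A hedged usage sketch for the converter above; the checkpoint paths are placeholders, and it assumes the repository root is on PYTHONPATH with both torch and paddle installed (the script can equally be run from the command line with --pt and --pdparams):

import argparse
from codegeex.paddle.pt_to_pdparams import main  # assumes repo root on PYTHONPATH

# Placeholder paths: a merged PyTorch checkpoint in, a Paddle .pdparams file out.
# Linear-layer weights are transposed along the way, since paddle.nn.Linear stores
# weights as [in_features, out_features] rather than PyTorch's [out_features, in_features].
opt = argparse.Namespace(pt="ckpt/codegeex_13b.pt", pdparams="ckpt/codegeex_13b.pdparams")
main(opt)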
--------------------------------------------------------------------------------
/codegeex/quantization/__init__.py:
--------------------------------------------------------------------------------
1 | from .quantize import quantize
2 | try:
3 | from .quantize_oneflow import quantize_oneflow
4 | from .quantize_oneflow import QuantizedLinear
5 | except ModuleNotFoundError:
6 | pass
7 |
--------------------------------------------------------------------------------
/codegeex/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | from .tokenizer import CodeGeeXTokenizer
--------------------------------------------------------------------------------
/codegeex/tokenizer/added_tokens.json:
--------------------------------------------------------------------------------
1 | {"<|extratoken_14|>": 50270, "<|extratoken_121|>": 50377, "<|extratoken_3|>": 50259, "<|extratoken_25|>": 50281, "<|extratoken_101|>": 50357, "<|extratoken_138|>": 50394, "<|extratoken_10|>": 50266, "<|extratoken_21|>": 50277, "<|extratoken_32|>": 50288, "<|extratoken_46|>": 50302, "<|extratoken_22|>": 50278, "<|extratoken_40|>": 50296, "<|extratoken_96|>": 50352, "<|extratoken_92|>": 50348, "<|extratoken_95|>": 50351, "<|extratoken_141|>": 50397, "<|extratoken_78|>": 50334, "<|extratoken_86|>": 50342, "<|extratoken_56|>": 50312, "<|extratoken_124|>": 50380, "<|extratoken_127|>": 50383, "<|extratoken_122|>": 50378, "<|extratoken_123|>": 50379, "<|extratoken_111|>": 50367, "<|extratoken_93|>": 50349, "<|extratoken_130|>": 50386, "<|extratoken_113|>": 50369, "<|extratoken_50|>": 50306, "<|extratoken_97|>": 50353, "<|extratoken_1|>": 50257, "<|extratoken_55|>": 50311, "<|extratoken_34|>": 50290, "<|extratoken_143|>": 50399, "<|extratoken_62|>": 50318, "<|extratoken_74|>": 50330, "<|extratoken_136|>": 50392, "<|extratoken_117|>": 50373, "<|extratoken_38|>": 50294, "<|extratoken_120|>": 50376, "<|extratoken_39|>": 50295, "<|extratoken_65|>": 50321, "<|extratoken_29|>": 50285, "<|extratoken_104|>": 50360, "<|extratoken_13|>": 50269, "<|extratoken_5|>": 50261, "<|extratoken_107|>": 50363, "<|extratoken_19|>": 50275, "<|extratoken_84|>": 50340, "<|extratoken_77|>": 50333, "<|extratoken_135|>": 50391, "<|extratoken_24|>": 50280, "<|extratoken_134|>": 50390, "<|extratoken_15|>": 50271, "<|extratoken_67|>": 50323, "<|extratoken_89|>": 50345, "<|extratoken_2|>": 50258, "<|extratoken_73|>": 50329, "<|extratoken_129|>": 50385, "<|extratoken_126|>": 50382, "<|extratoken_30|>": 50286, "<|extratoken_41|>": 50297, "<|extratoken_28|>": 50284, "<|extratoken_114|>": 50370, "<|extratoken_128|>": 50384, "<|extratoken_118|>": 50374, "<|extratoken_131|>": 50387, "<|extratoken_68|>": 50324, "<|extratoken_125|>": 50381, "<|extratoken_103|>": 50359, "<|extratoken_8|>": 50264, "<|extratoken_64|>": 50320, "<|extratoken_52|>": 50308, "<|extratoken_45|>": 50301, "<|extratoken_43|>": 50299, "<|extratoken_18|>": 50274, "<|extratoken_139|>": 50395, "<|extratoken_85|>": 50341, "<|extratoken_88|>": 50344, "<|extratoken_63|>": 50319, "<|extratoken_4|>": 50260, "<|extratoken_48|>": 50304, "<|extratoken_112|>": 50368, "<|extratoken_17|>": 50273, "<|extratoken_49|>": 50305, "<|extratoken_108|>": 50364, "<|extratoken_110|>": 50366, "<|extratoken_42|>": 50298, "<|extratoken_70|>": 50326, "<|extratoken_6|>": 50262, "<|extratoken_35|>": 50291, "<|extratoken_23|>": 50279, "<|extratoken_66|>": 50322, "<|extratoken_60|>": 50316, "<|extratoken_71|>": 50327, "<|extratoken_51|>": 50307, "<|extratoken_133|>": 50389, "<|extratoken_20|>": 50276, "<|extratoken_76|>": 50332, "<|extratoken_81|>": 50337, "<|extratoken_142|>": 50398, "<|extratoken_116|>": 50372, "<|extratoken_57|>": 50313, "<|extratoken_75|>": 50331, "<|extratoken_37|>": 50293, "<|extratoken_33|>": 50289, "<|extratoken_16|>": 50272, "<|extratoken_61|>": 50317, "<|extratoken_7|>": 50263, "<|extratoken_12|>": 50268, "<|extratoken_36|>": 50292, "<|extratoken_80|>": 50336, "<|extratoken_98|>": 50354, "<|extratoken_105|>": 50361, "<|extratoken_91|>": 50347, "<|extratoken_53|>": 50309, "<|extratoken_137|>": 50393, "<|extratoken_9|>": 50265, "<|extratoken_79|>": 50335, "<|extratoken_83|>": 50339, "<|extratoken_109|>": 50365, "<|extratoken_99|>": 50355, "<|extratoken_140|>": 50396, "<|extratoken_72|>": 50328, "<|extratoken_11|>": 50267, "<|extratoken_94|>": 50350, "<|extratoken_26|>": 
50282, "<|extratoken_59|>": 50315, "<|extratoken_106|>": 50362, "<|extratoken_115|>": 50371, "<|extratoken_58|>": 50314, "<|extratoken_90|>": 50346, "<|extratoken_31|>": 50287, "<|extratoken_102|>": 50358, "<|extratoken_47|>": 50303, "<|extratoken_100|>": 50356, "<|extratoken_82|>": 50338, "<|extratoken_44|>": 50300, "<|extratoken_69|>": 50325, "<|extratoken_54|>": 50310, "<|extratoken_132|>": 50388, "<|extratoken_27|>": 50283, "<|extratoken_87|>": 50343, "<|extratoken_119|>": 50375}
2 |
--------------------------------------------------------------------------------
/codegeex/tokenizer/special_tokens_map.json:
--------------------------------------------------------------------------------
1 | {"bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}}
2 |
--------------------------------------------------------------------------------
/codegeex/tokenizer/tokenizer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import *
3 | from transformers import AutoTokenizer
4 | from transformers.models.gpt2 import GPT2TokenizerFast
5 |
6 |
7 | def encode_whitespaces(text: str, start_extra_id: int, max_len: int):
8 | """ Encode whitespaces to extra tokens.
9 |
10 | >>> encode_whitespaces('a\\n b\\n c', 10, 10)
11 | 'a\\n<|extratoken_10|>b\\n<|extratoken_11|>c'
12 | """
13 | for i in np.arange(max_len, 1, -1):
14 | text = text.replace(" " * i, f"<|extratoken_{start_extra_id + i - 2}|>")
15 | return text
16 |
17 |
18 | def decode_whitespaces(text: str, start_extra_id: int, max_len: int):
19 | """ Decode the whitespace-encoded strings produced by encode_whitespace.
20 |
21 | >>> text = 'a\\n b\\n c'
22 | >>> s, l = 10, 10
23 | >>> text == decode_whitespaces(encode_whitespaces(text, s, l), s, l)
24 | True
25 | """
26 | for l in range(2, max_len + 1):
27 | token_id = start_extra_id - 2 + l
28 | token = f'<|extratoken_{token_id}|>'
29 | text = text.replace(token, ' ' * l)
30 | return text
31 |
32 |
33 | class CodeGeeXTokenizer(object):
34 | def __init__(
35 | self,
36 | tokenizer: GPT2TokenizerFast = None,
37 | tokenizer_path: str = "EleutherAI/gpt-j-6B",
38 | start_extra_id: int = 10,
39 | max_len : int = 10,
40 | mode='codegeex-13b',
41 | dict_file: str = None,
42 | ):
43 | self.tokenizer = tokenizer if tokenizer is not None else AutoTokenizer.from_pretrained(tokenizer_path)
44 | if mode not in ['codegeex-13b']:
45 | raise ValueError(f"Invalid mode {mode}, choose from ['codegeex-13b']")
46 | self.start_extra_id = start_extra_id
47 | self.max_len = max_len
48 | self.mode = mode
49 | self.eos_token_id = self.tokenizer.eos_token_id
50 |
51 | def encode_code(self, code: str):
52 | if self.mode == 'codegeex-13b':
53 | code = encode_whitespaces(code, self.start_extra_id, self.max_len)
54 | input_ids = self.tokenizer(code, is_split_into_words=False, verbose=False).input_ids
55 |
56 | return input_ids
57 |
58 | def decode_code(self, input_ids):
59 | if self.mode == 'codegeex-13b':
60 | text = self.tokenizer.decode(input_ids, skip_special_tokens=False, verbose=False)
61 | output_code = decode_whitespaces(text, self.start_extra_id, self.max_len)
62 |
63 | return output_code
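A minimal usage sketch for CodeGeeXTokenizer; the first call downloads the EleutherAI/gpt-j-6B tokenizer from the Hugging Face hub:

from codegeex.tokenizer import CodeGeeXTokenizer

tokenizer = CodeGeeXTokenizer(tokenizer_path="EleutherAI/gpt-j-6B", mode="codegeex-13b")
ids = tokenizer.encode_code("def add(a, b):\n    return a + b")  # list of token ids
print(tokenizer.decode_code(ids))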
--------------------------------------------------------------------------------
/codegeex/tokenizer/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "model_max_length": 2048, "special_tokens_map_file": null, "name_or_path": "gpt-j-6B", "from_slow": true, "tokenizer_class": "GPT2Tokenizer"}
2 |
--------------------------------------------------------------------------------
/codegeex/torch/__init__.py:
--------------------------------------------------------------------------------
1 | from .codegeex_model import CodeGeeXModel
--------------------------------------------------------------------------------
/codegeex/torch/get_ckpt_qkv.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import torch
4 | import random
5 | import argparse
6 | import numpy as np
7 |
8 |
9 | def main():
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument("--load-path",
12 | type=str,
13 | default="/zhangpai24/workspace/ckpt_ms/ckpt_ms_213000_fp32_52224.pt")
14 | parser.add_argument("--save-path",
15 | type=str,
16 | default="/zhangpai24/workspace/ckpt_ms/ckpt_ms_213000_qkv.pt")
17 |
18 | args, _ = parser.parse_known_args()
19 |
20 | state_dict_path = args.load_path
21 | print("Loading state dict ...")
22 | sd = torch.load(state_dict_path, map_location="cpu")
23 |
24 | for i in range(40):
25 | if i < 39:
26 | query_weight = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.query.weight', None)
27 | query_bias = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.query.bias', None)
28 | key_weight = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.key.weight', None)
29 | key_bias = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.key.bias', None)
30 | value_weight = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.value.weight', None)
31 | value_bias = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.value.bias', None)
32 | qkv_weight = torch.cat([query_weight, key_weight, value_weight], dim=0)
33 | qkv_bias = torch.cat([query_bias, key_bias, value_bias])
34 | sd['module']['language_model']['transformer'][f'layers.{i}.attention.query_key_value.weight'] = qkv_weight
35 | sd['module']['language_model']['transformer'][f'layers.{i}.attention.query_key_value.bias'] = qkv_bias
36 | else:
37 | tq_key_weight = sd['module']['language_model']['transformer'].pop('topQueryLayer.attention.key.weight', None)
38 | tq_key_bias = sd['module']['language_model']['transformer'].pop('topQueryLayer.attention.key.bias', None)
39 | tq_value_weight = sd['module']['language_model']['transformer'].pop('topQueryLayer.attention.value.weight', None)
40 | tq_value_bias = sd['module']['language_model']['transformer'].pop('topQueryLayer.attention.value.bias', None)
41 | tq_kv_weight = torch.cat([tq_key_weight, tq_value_weight], dim=0)
42 | tq_kv_bias = torch.cat([tq_key_bias, tq_value_bias])
43 | sd['module']['language_model']['transformer']['topQueryLayer.attention.key_value.weight'] = tq_kv_weight
44 | sd['module']['language_model']['transformer']['topQueryLayer.attention.key_value.bias'] = tq_kv_bias
45 |
46 | save_ckpt_path = args.save_path
47 | torch.save(sd, save_ckpt_path)
48 |
49 | if __name__ == '__main__':
50 | main()
51 |
--------------------------------------------------------------------------------
/configs/codegeex_13b.sh:
--------------------------------------------------------------------------------
1 | # CodeGeeX-13B configuration
2 |
3 | CHECKPOINT_PATH=""
4 |
5 | MODEL_ARGS="--num-layers 39 \
6 | --hidden-size 5120 \
7 | --num-attention-heads 40 \
8 | --max-position-embeddings 2048 \
9 | --attention-softmax-in-fp32 \
10 | --load "$CHECKPOINT_PATH" \
11 | --layernorm-epsilon 1e-5 \
12 | --fp16 \
13 | --ws-encoding-start-id 10 \
14 | --ws-encoding-length 10 \
15 | --make-vocab-size-divisible-by 52224 \
16 | --seq-length 2048"
--------------------------------------------------------------------------------
/configs/codegeex_13b_paddle.sh:
--------------------------------------------------------------------------------
1 | # CodeGeeX-13B paddle configuration
2 |
3 | CHECKPOINT_PATH=""
4 |
5 | MODEL_ARGS="--num-layers 39 \
6 | --hidden-size 5120 \
7 | --num-attention-heads 40 \
8 | --max-position-embeddings 2048 \
9 | --attention-softmax-in-fp32 \
10 | --load "$CHECKPOINT_PATH" \
11 | --layernorm-epsilon 1e-5 \
12 | --fp16 \
13 | --ws-encoding-start-id 10 \
14 | --ws-encoding-length 10 \
15 | --make-vocab-size-divisible-by 52224 \
16 | --seq-length 2048"
--------------------------------------------------------------------------------
/configs/codegeex_13b_parallel.sh:
--------------------------------------------------------------------------------
1 | # CodeGeeX-13B parallel configuration
2 | # Parallel checkpoints are named under the format "mp_rank_0{i}_model_states.pt", where i is the rank, start from 0.
3 |
4 | CHECKPOINT_PATH=""
5 |
6 | MODEL_ARGS="--num-layers 39 \
7 | --hidden-size 5120 \
8 | --num-attention-heads 40 \
9 | --max-position-embeddings 2048 \
10 | --attention-softmax-in-fp32 \
11 | --load "$CHECKPOINT_PATH" \
12 | --layernorm-epsilon 1e-5 \
13 | --fp16 \
14 | --ws-encoding-start-id 10 \
15 | --ws-encoding-length 10 \
16 | --make-vocab-size-divisible-by 52224 \
17 | --seq-length 2048"
--------------------------------------------------------------------------------
/deployment/example_inputs.jsonl:
--------------------------------------------------------------------------------
1 | {"code": "# Write a function that returns the sum of the numbers from 1 to n.\n# For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\n# You may assume that n is a positive integer.\ndef sum_of_numbers(n):", "langauge": "Python"}
2 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\n#include \nusing namespace std;\nint sum_of_numbers(int n) {", "langauge": "C++"}
3 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\n#include \n#include \nint sum(int n)\n{", "langauge": "C"}
4 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\nprivate int sum(int n) {", "langauge": "C#"}
5 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\npublic class SumOfNumbers {", "langauge": "Java"}
6 | {"code": "\n\n
", "langauge": "HTML"}
7 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n// If n is 0, then the function should return 0.\n// If n is less than 0, then the function should return -1.\n/**\n * @param {number} n\n * @return {number}\n */\nfunction sum ($n) {", "langauge": "PHP"}
8 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\nfunction sum(n) {", "langauge": "JavaScript"}
9 | {"code": "// Write a function that returns the sum of the numbers from 1 to n,\n// but using a for loop instead of a while loop.\n\nfunction sumForLoop(n) {", "langauge": "TypeScript"}
10 | {"code": "// Write a function that returns the sum of the numbers from 1 to n,\n// but using a for loop instead of a while loop.\n\nfunc sumN(n int) int {", "langauge": "Go"}
11 | {"code": "// Write a function that returns the sum of the numbers from 1 to n,\n// but using a for loop instead of a while loop.\n\nfn sum_numbers(n: usize) -> usize {", "langauge": "Rust"}
12 | {"code": "-- Search all the records from the table CodeGeeX\n-- Delete iterms with odd indices", "langauge": "SQL"}
13 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\nfun sum(n: Int): Int {", "langauge": "Kotlin"}
14 | {"code": "! Write a function that returns the sum of the numbers from 1 to n.\n! For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\n! Use the following header:\n! module sum_numbers\n! end\nmodule sum_numbers", "langauge": "Fortran"}
15 | {"code": "# Write a function that returns the sum of the numbers from 1 to n.\n# For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\nsum_numbers <- function(n) {", "langauge": "R"}
16 |
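The deployment examples above are one JSON object per line; note that the key is spelled "langauge" in the file itself. A small loading sketch:

import json

with open("deployment/example_inputs.jsonl") as f:
    examples = [json.loads(line) for line in f if line.strip()]

for ex in examples:
    print(ex["langauge"], repr(ex["code"][:40]))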
--------------------------------------------------------------------------------
/generations/humaneval_python_generations.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/generations/humaneval_python_generations.jsonl.gz
--------------------------------------------------------------------------------
/generations/humaneval_rust_generations.jsonl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/generations/humaneval_rust_generations.jsonl.gz
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fire>=0.4.0
2 | ipython>=8.4.0
3 | numpy>=1.22.0
4 | pandas>=1.3.5
5 | pyzmq>=23.2.1
6 | regex>=2022.3.15
7 | setuptools>=58.0.4
8 | transformers>=4.22.0
9 | torch>=1.10.0
10 | tqdm>=4.63.0
11 | cpm_kernels
12 | deepspeed>0.6.1
--------------------------------------------------------------------------------
/resources/api/api_step_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_1.png
--------------------------------------------------------------------------------
/resources/api/api_step_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_2.png
--------------------------------------------------------------------------------
/resources/api/api_step_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_3.png
--------------------------------------------------------------------------------
/resources/api/api_step_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_4.png
--------------------------------------------------------------------------------
/resources/api/api_step_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_5.png
--------------------------------------------------------------------------------
/resources/en/codegeex_training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/codegeex_training.png
--------------------------------------------------------------------------------
/resources/en/hx_boxplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_boxplot.png
--------------------------------------------------------------------------------
/resources/en/hx_examples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_examples.png
--------------------------------------------------------------------------------
/resources/en/hx_generattion_radar_horizon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_generattion_radar_horizon.png
--------------------------------------------------------------------------------
/resources/en/hx_pass_rate_vs_language.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_pass_rate_vs_language.png
--------------------------------------------------------------------------------
/resources/en/hx_tasks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_tasks.png
--------------------------------------------------------------------------------
/resources/en/hx_translation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_translation.png
--------------------------------------------------------------------------------
/resources/logo/codegeex_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/logo/codegeex_logo.png
--------------------------------------------------------------------------------
/resources/zh/hx_boxplot_zh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_boxplot_zh.png
--------------------------------------------------------------------------------
/resources/zh/hx_generattion_radar_horizon_zh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_generattion_radar_horizon_zh.png
--------------------------------------------------------------------------------
/resources/zh/hx_pass_rate_vs_language_zh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_pass_rate_vs_language_zh.png
--------------------------------------------------------------------------------
/resources/zh/hx_tasks_zh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_tasks_zh.png
--------------------------------------------------------------------------------
/resources/zh/hx_translation_zh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_translation_zh.png
--------------------------------------------------------------------------------
/resources/zh/join_wechat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/join_wechat.png
--------------------------------------------------------------------------------
/resources/zh/wechat.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 扫码关注公众号加入「CodeGeeX交流群」
5 | Scan the QR code to join the "CodeGeeX WeChat Group"