├── .gitmodules ├── LICENSE ├── MODEL_LICENSE ├── README.md ├── README_zh.md ├── api ├── README_zh.md ├── codegeex-api-example-java │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── cn │ │ └── aminer │ │ └── codegeex │ │ └── example │ │ ├── CodeGenerationExample.java │ │ └── pojo │ │ └── Payload.java └── codegeex-api-example-python │ └── generation_example.py ├── codegeex ├── __init__.py ├── benchmark │ ├── README.md │ ├── README_zh.md │ ├── __init__.py │ ├── evaluate_humaneval_x.py │ ├── execution.py │ ├── gather_output.py │ ├── humaneval-x │ │ ├── __init__.py │ │ ├── cpp │ │ │ ├── data │ │ │ │ └── humaneval_cpp.jsonl.gz │ │ │ └── evaluation │ │ │ │ └── test.cpp │ │ ├── evaluate_humaneval_x.py │ │ ├── generate_humaneval_x.py │ │ ├── go │ │ │ ├── data │ │ │ │ └── humaneval_go.jsonl.gz │ │ │ └── evaluation │ │ │ │ ├── go.mod │ │ │ │ ├── go.sum │ │ │ │ └── vendor.tar.gz │ │ ├── java │ │ │ └── data │ │ │ │ └── humaneval_java.jsonl.gz │ │ ├── js │ │ │ └── data │ │ │ │ └── humaneval_js.jsonl.gz │ │ ├── python │ │ │ └── data │ │ │ │ └── humaneval_python.jsonl.gz │ │ ├── rust │ │ │ └── data │ │ │ │ └── humaneval_rust.jsonl.gz │ │ └── translate_humaneval_x.py │ ├── inspect_result.py │ ├── metric.py │ ├── rust │ │ ├── Cargo.lock │ │ └── Cargo.toml │ └── utils.py ├── data │ ├── __init__.py │ ├── data_utils.py │ ├── process_pretrain_dataset.py │ ├── processor.py │ └── types.py ├── docker │ └── Dockerfile ├── kernels │ ├── __init__.py │ └── quantization.fatbin ├── megatron │ ├── __init__.py │ ├── arguments.py │ ├── checkpointing.py │ ├── code_generation_utils.py │ ├── convert_ckpt_parallel.py │ ├── data │ │ ├── __init__.py │ │ ├── blendable_dataset.py │ │ ├── data_samplers.py │ │ ├── dataset_utils.py │ │ ├── helpers.cpp │ │ ├── indexed_dataset.py │ │ └── prompt_dataset.py │ ├── enums.py │ ├── global_vars.py │ ├── inference.py │ ├── initialize.py │ ├── learning_rates.py │ ├── memory.py │ ├── merge_ckpt_parallel.py │ ├── microbatches.py │ ├── mindspore_to_megatron.py │ ├── model │ │ ├── __init__.py │ │ ├── codegeex_model.py │ │ ├── distributed.py │ │ ├── language_model.py │ │ ├── module.py │ │ ├── transformer.py │ │ └── utils.py │ ├── mpu │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── initialize.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ └── utils.py │ ├── optimizer │ │ ├── __init__.py │ │ ├── clip_grads.py │ │ ├── grad_scaler.py │ │ └── optimizer.py │ ├── p2p_communication.py │ ├── schedules.py │ ├── tokenizer │ │ ├── __init__.py │ │ ├── gpt2_tokenization.py │ │ └── tokenizer.py │ ├── tools │ │ ├── collect_env.py │ │ ├── finetune_codegeex.py │ │ └── pretrain_codegeex.py │ ├── training.py │ └── utils.py ├── mindspore │ ├── configs │ │ ├── 13B.sh │ │ ├── 13B_128p_save_1p.sh │ │ ├── 13B_128p_save_8p_ckpt.sh │ │ ├── 13B_1p_to_torch.sh │ │ ├── 13B_finetune.sh │ │ ├── 13B_generate.sh │ │ ├── 13B_generate_1p.sh │ │ ├── 13B_generate_1p_values.sh │ │ ├── 13B_generate_finetune.sh │ │ ├── 13B_generate_humaneval.sh │ │ └── 13B_generate_values.sh │ ├── convertion_1p.py │ ├── finetune.py │ ├── generation.py │ ├── generation_1p.py │ ├── generation_batch.py │ ├── generation_finetune.py │ ├── generation_humaneval.py │ ├── generation_values.py │ ├── generation_values_1p.py │ ├── save_1p_ckpt_from_8p_ckpt.py │ ├── save_8p_ckpt.py │ ├── scripts │ │ ├── custom_tune_bank_new │ │ │ └── Ascend910ProA │ │ │ │ ├── cube │ │ │ │ ├── repository_ascend910ProA_matmul.bin │ │ │ │ └── repository_ascend910ProA_matmul.json │ │ │ │ └── vector │ │ │ │ └── 
Ascend910ProA_AiCore_32_v001_20220509_200939_588817.json │ │ ├── layer_norm.py │ │ ├── layer_norm_x_backprop_v2.py │ │ ├── ma-pre-start.sh │ │ ├── run_modelarts.py │ │ ├── run_modelarts_gen_finetune.py │ │ └── run_modelarts_gen_humaneval_x.py │ ├── src │ │ ├── __init__.py │ │ ├── adam.py │ │ ├── callbacks.py │ │ ├── code_tokenizer.py │ │ ├── dataset.py │ │ ├── dataset_finetune.py │ │ ├── generate.py │ │ ├── generate_finetune.py │ │ ├── generate_greedy.py │ │ ├── generate_humaneval.py │ │ ├── metrics.py │ │ ├── pangu_alpha.py │ │ ├── pangu_alpha_config.py │ │ ├── pangu_alpha_fp16_predict.py │ │ ├── pangu_alpha_wrapcell.py │ │ ├── pangu_alpha_wrapcell_finetune.py │ │ ├── preprocess.py │ │ ├── sat_dataset.py │ │ ├── tokenization_jieba.py │ │ └── utils.py │ └── train.py ├── oneflow │ ├── __init__.py │ ├── codegeex_model.py │ └── inference.py ├── paddle │ ├── __init__.py │ ├── codegeex_model.py │ ├── inference.py │ └── pt_to_pdparams.py ├── quantization │ ├── __init__.py │ ├── quantize.py │ └── quantize_oneflow.py ├── tokenizer │ ├── __init__.py │ ├── added_tokens.json │ ├── merges.txt │ ├── special_tokens_map.json │ ├── tokenizer.py │ ├── tokenizer_config.json │ └── vocab.json └── torch │ ├── __init__.py │ ├── codegeex_model.py │ ├── get_ckpt_qkv.py │ └── inference.py ├── configs ├── codegeex_13b.sh ├── codegeex_13b_paddle.sh └── codegeex_13b_parallel.sh ├── deployment ├── example_inputs.jsonl └── server_gradio.py ├── generations ├── humaneval_python_generations.jsonl.gz └── humaneval_rust_generations.jsonl.gz ├── requirements.txt ├── resources ├── api │ ├── api_step_1.png │ ├── api_step_2.png │ ├── api_step_3.png │ ├── api_step_4.png │ └── api_step_5.png ├── en │ ├── codegeex_training.png │ ├── hx_boxplot.png │ ├── hx_examples.png │ ├── hx_generattion_radar_horizon.png │ ├── hx_pass_rate_vs_language.png │ ├── hx_tasks.png │ └── hx_translation.png ├── logo │ └── codegeex_logo.png └── zh │ ├── hx_boxplot_zh.png │ ├── hx_generattion_radar_horizon_zh.png │ ├── hx_pass_rate_vs_language_zh.png │ ├── hx_tasks_zh.png │ ├── hx_translation_zh.png │ ├── join_wechat.png │ └── wechat.md ├── scripts ├── convert_ckpt_parallel.sh ├── convert_mindspore_to_megatron.sh ├── evaluate_humaneval_x.py ├── evaluate_humaneval_x.sh ├── finetune_codegeex.sh ├── gather_output.sh ├── generate_humaneval_x.sh ├── pretrain_codegeex.sh ├── process_pretrain_dataset.sh ├── test_inference.sh ├── test_inference_oneflow.sh ├── test_inference_oneflow_quantized.sh ├── test_inference_paddle.sh ├── test_inference_parallel.sh ├── test_inference_quantized.sh └── translate_humaneval_x.sh ├── setup.py ├── tests ├── test_inference.py ├── test_inference_megatron.py ├── test_inference_oneflow.py ├── test_inference_paddle.py └── test_prompt.txt └── vscode-extension ├── README.md └── README_zh.md /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "vscode-extension/codegeex-vscode-extension"] 2 | path = vscode-extension/codegeex-vscode-extension 3 | url = git@github.com:CodeGeeX/codegeex-vscode-extension.git 4 | -------------------------------------------------------------------------------- /MODEL_LICENSE: -------------------------------------------------------------------------------- 1 | The CodeGeeX License 2 | 3 | 1. Definitions 4 | 5 | “Licensor” means the CodeGeeX Model Team that distributes its Software. 6 | 7 | “Software” means the CodeGeeX model parameters made available under this license. 8 | 9 | 2. 
License Grant 10 | 11 | Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes. 12 | 13 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 14 | 15 | 3. Restriction 16 | 17 | You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes. 18 | 19 | You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings. 20 | 21 | 4. Disclaimer 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | 25 | 5. Limitation of Liability 26 | 27 | EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 28 | 29 | 6. Dispute Resolution 30 | 31 | This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing. 32 | 33 | Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at report@aminer.cn. 
-------------------------------------------------------------------------------- /api/README_zh.md: -------------------------------------------------------------------------------- 1 | ![codegeex_logo](../resources/logo/codegeex_logo.png) 2 | 3 | # 创建CodeGeeX API 4 | 5 | 使用[天启 · API开放平台](https://tianqi.aminer.cn/open/)申请CodeGeeX API: 6 | 7 | 8 | 9 | 点击首页中的天启平台体验入口: 10 | 11 | 点击API应用: 12 | 13 | 输入任意名称,创建API应用。创建后会得到API Key/Secret,用于调用API: 14 | 15 | 16 | 在API信息中,可以查看代码生成/代码翻译的请求地址和使用文档: 17 | 18 | 19 | 根据文档中的描述使用API,Python版参考目录``api/codegeex-api-example-python``;JAVA版参考工程:``api/codegeex-api-example-java`` 20 | -------------------------------------------------------------------------------- /api/codegeex-api-example-java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | cn.aminer 8 | codegeex-api-example-java 9 | 1.0-SNAPSHOT 10 | 11 | 15 | 16 | UTF-8 17 | UTF-8 18 | 19 | 20 | 21 | 22 | 23 | org.apache.maven.plugins 24 | maven-compiler-plugin 25 | 3.8.1 26 | 27 | 29 | 1.8 30 | 1.8 31 | UTF-8 32 | 33 | 34 | 35 | org.apache.maven.plugins 36 | maven-assembly-plugin 37 | 3.3.0 38 | 39 | 40 | jar-with-dependencies 41 | 42 | 43 | 44 | 45 | 46 | package 47 | 48 | single 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | com.fasterxml.jackson.module 59 | jackson-module-parameter-names 60 | 2.6.6 61 | 62 | 63 | com.fasterxml.jackson.datatype 64 | jackson-datatype-jdk8 65 | 2.6.6 66 | 67 | 68 | com.fasterxml.jackson.datatype 69 | jackson-datatype-jsr310 70 | 2.6.6 71 | 72 | 73 | com.squareup.okhttp3 74 | okhttp 75 | 76 | 77 | org.slf4j 78 | slf4j-log4j12 79 | 80 | 81 | log4j 82 | log4j 83 | 84 | 85 | org.projectlombok 86 | lombok 87 | provided 88 | 89 | 90 | 91 | 92 | 93 | 94 | com.fasterxml.jackson.module 95 | jackson-module-parameter-names 96 | 97 | 98 | com.fasterxml.jackson.datatype 99 | jackson-datatype-jdk8 100 | 101 | 102 | com.fasterxml.jackson.datatype 103 | jackson-datatype-jsr310 104 | 105 | 106 | com.fasterxml.jackson.core 107 | jackson-databind 108 | 109 | 110 | com.squareup.okhttp3 111 | okhttp 112 | 4.10.0 113 | 114 | 115 | log4j 116 | log4j 117 | 1.2.17 118 | 119 | 120 | org.slf4j 121 | slf4j-log4j12 122 | 1.7.5 123 | 124 | 125 | org.projectlombok 126 | lombok 127 | 1.18.20 128 | provided 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | central 137 | ALiYun 138 | http://maven.aliyun.com/nexus/content/groups/public 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /api/codegeex-api-example-java/src/main/java/cn/aminer/codegeex/example/CodeGenerationExample.java: -------------------------------------------------------------------------------- 1 | package cn.aminer.codegeex.example; 2 | 3 | import cn.aminer.codegeex.example.pojo.Payload; 4 | import com.fasterxml.jackson.databind.ObjectMapper; 5 | import okhttp3.*; 6 | 7 | import java.io.IOException; 8 | 9 | /** 10 | * 调用 CodeGeeX API 生成代码的例子。 11 | * 12 | * @author Darran Zhang @ codelast.com 13 | * @version 2023-01-20 14 | */ 15 | public class CodeGenerationExample { 16 | public static final String API_KEY = "your_api_key"; // 在"天启开放平台"上申请到的API Key 17 | public static final String API_SECRET = "your_api_secret"; // 在"天启开放平台"上申请到的API Secret 18 | public static final int NUMBER = 3; // 生成几个候选 19 | public static final String LANGUAGE = "Java"; // 编程语言 20 | public static final String REQUEST_URL = "https://tianqi.aminer.cn/api/v2/multilingual_code_generate"; // 请求地址 21 | 22 | public static void main(String[] args) throws 
Exception { 23 | CodeGenerationExample example = new CodeGenerationExample(); 24 | String prompt = "// use OkHttpClient library to write a function to perform http post request\n\n" + 25 | "public class HttpPost {\n" + 26 | " public static void main(String[] args) {\n"; 27 | example.generateCode(prompt); 28 | } 29 | 30 | /** 31 | * 生成代码。 32 | * 33 | * @param prompt 待补全的代码 34 | */ 35 | public void generateCode(String prompt) throws Exception { 36 | ObjectMapper objectMapper = new ObjectMapper(); 37 | Payload payload = new Payload().setApiKey(API_KEY).setApiSecret(API_SECRET).setPrompt(prompt).setNumber(NUMBER) 38 | .setLanguage(LANGUAGE); 39 | String response = performHttpPost(REQUEST_URL, objectMapper.writeValueAsString(payload)); 40 | System.out.println(response); 41 | } 42 | 43 | /** 44 | * 发起 HTTP POST 请求。 45 | * 46 | * @param url 请求的URL 47 | * @param payload 请求的JSON数据 48 | * @return 请求返回的内容,若出错则返回 null。 49 | */ 50 | public String performHttpPost(String url, String payload) { 51 | HttpUrl.Builder builder = null; 52 | try { 53 | HttpUrl httpUrl = HttpUrl.parse(url); 54 | if (httpUrl != null) { 55 | builder = httpUrl.newBuilder(); 56 | } 57 | } catch (IllegalArgumentException e) { 58 | System.out.println("failed to create HttpUrl.Builder from url " + url + ":" + e); 59 | } 60 | if (builder == null) { 61 | return null; 62 | } 63 | OkHttpClient client = new OkHttpClient(); 64 | RequestBody requestBody = RequestBody.create(payload, MediaType.parse("application/json; charset=utf-8")); 65 | Request request = new Request.Builder() 66 | .url(builder.build()) 67 | .post(requestBody) 68 | .build(); 69 | 70 | try { 71 | Response response = client.newCall(request).execute(); 72 | ResponseBody body = response.body(); 73 | if (body == null) { 74 | System.out.println("null response body"); 75 | return null; 76 | } 77 | return body.string(); 78 | } catch (IOException e) { 79 | System.out.println("failed to send POST request: " + e); 80 | } 81 | return null; 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /api/codegeex-api-example-java/src/main/java/cn/aminer/codegeex/example/pojo/Payload.java: -------------------------------------------------------------------------------- 1 | package cn.aminer.codegeex.example.pojo; 2 | 3 | import com.fasterxml.jackson.annotation.JsonIgnoreProperties; 4 | import com.fasterxml.jackson.annotation.JsonProperty; 5 | import lombok.Data; 6 | import lombok.experimental.Accessors; 7 | 8 | /** 9 | * 发送到 CodeGeex API 的请求中包含的JSON payload对象。 10 | * 11 | * @author Darran Zhang @ codelast.com 12 | * @version 2023-01-20 13 | */ 14 | @JsonIgnoreProperties(ignoreUnknown = true) 15 | @Data 16 | @Accessors(chain = true) 17 | public class Payload { 18 | @JsonProperty("apikey") 19 | String apiKey; // 在"天启开放平台"上申请到的API Key 20 | 21 | @JsonProperty("apisecret") 22 | String apiSecret; // 在"天启开放平台"上申请到的API Secret 23 | 24 | String prompt; // 待补全的代码 25 | 26 | @JsonProperty("n") 27 | int number; // 生成几个候选 28 | 29 | @JsonProperty("lang") 30 | String language; // 编程语言 31 | } 32 | -------------------------------------------------------------------------------- /api/codegeex-api-example-python/generation_example.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | 3 | import json 4 | 5 | import requests 6 | 7 | ''' 8 | Code Generation 9 | ''' 10 | API_KEY = "" # Get from Tianqi console. 从控制台获取 11 | API_SECRET = "" # Get from Tianqi console. 
从控制台获取 12 | PROMPT = "from typing import List\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n " \ 13 | "\"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given " \ 14 | "threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements(" \ 15 | "[1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n" 16 | NUMBER = 3 17 | LANG = "Python" 18 | request_url = "https://tianqi.aminer.cn/api/v2/" 19 | api = 'multilingual_code_generate' 20 | 21 | # Request is in json format. 指定请求参数格式为json 22 | headers = {'Content-Type': 'application/json'} 23 | request_url = request_url + api 24 | data = { 25 | "apikey": API_KEY, 26 | "apisecret": API_SECRET, 27 | "prompt": PROMPT, 28 | "n": NUMBER, 29 | "lang": LANG 30 | } 31 | 32 | 33 | def main(): 34 | response = requests.post(request_url, headers=headers, data=json.dumps(data)) 35 | if response: 36 | print(response.json()) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /codegeex/__init__.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from typing import * 4 | from codegeex.tokenizer import CodeGeeXTokenizer 5 | from codegeex.torch.inference import get_token_stream 6 | 7 | 8 | def get_model( 9 | backend: str = "megatron", 10 | quantized: bool = False, 11 | ): 12 | pass 13 | 14 | 15 | def generate( 16 | model, 17 | tokenizer: CodeGeeXTokenizer, 18 | prompt: str, 19 | out_seq_length: int, 20 | seq_length: int = 2048, 21 | top_k: int = 0, 22 | top_p: float = 1.0, 23 | temperature: float = 1.0, 24 | micro_batch_size: int = 1, 25 | backend: str = "megatron", 26 | greedy: bool = False, 27 | verbose: bool = False, 28 | ): 29 | tokens = tokenizer.encode_code(prompt) 30 | n_token_prompt = len(tokens) 31 | 32 | if verbose: 33 | print(f"Current prompt:\n{prompt}") 34 | print("N_token_prompt:", n_token_prompt) 35 | 36 | generated_codes = [] 37 | if backend == "megatron": 38 | token_stream = get_token_stream( 39 | model, 40 | tokenizer, 41 | seq_length, 42 | out_seq_length, 43 | [copy.deepcopy(tokens) for _ in range(micro_batch_size)], 44 | micro_batch_size=micro_batch_size, 45 | topk=top_k, 46 | topp=top_p, 47 | temperature=temperature, 48 | greedy=greedy, 49 | ) 50 | is_finished = [False for _ in range(micro_batch_size)] 51 | for i, generated in enumerate(token_stream): 52 | generated_tokens = generated[0] 53 | for j in range(micro_batch_size): 54 | if is_finished[j]: 55 | continue 56 | 57 | if generated_tokens[j].cpu().numpy()[-1] == tokenizer.eos_token_id or len(generated_tokens[j]) >= out_seq_length: 58 | is_finished[j] = True 59 | generated_tokens_ = generated_tokens[j].cpu().numpy().tolist() 60 | generated_code = tokenizer.decode_code(generated_tokens_[n_token_prompt:]) 61 | generated_code = "".join(generated_code) 62 | generated_codes.append(generated_code) 63 | if verbose: 64 | print(f"\nGenerated code {i}:\n{generated_code}") 65 | 66 | if all(is_finished): 67 | break 68 | 69 | return generated_codes -------------------------------------------------------------------------------- /codegeex/benchmark/README.md: -------------------------------------------------------------------------------- 1 | # HumanEval-X: A new benchmark for Multilingual Program Synthesis 2 | 3 | 🌐 中文 4 | 5 | HumanEval-X is a new benchmark for better evaluating the multilingual ability of code generation models. 
While previous works evaluate multilingual program synthesis under semantic similarity (e.g., [CodeBLEU](https://arxiv.org/abs/2009.10297)) which is often misleading, HumanEval-X evaluates the functional correctness of the generated programs. HumanEval-X consists of 820 high-quality human-crafted data samples (each with test cases) in Python, C++, Java, JavaScript, and Go, and can be used for various tasks. 6 | 7 | 8 | 9 |

An illustration of tasks supported by HumanEval-X. Declarations, docstrings, and solutions are marked with red, green, and blue respectively. Code generation uses the declaration and docstring as input to generate the solution. Code translation uses the declarations in both languages and translates the solution in the source language into the one in the target language.

10 | 11 | In HumanEval-X, every sample in each language contains declaration, docstring, and solution, which can be combined in various ways to support different downstream tasks including generation, translation, summarization, etc. We currently focus on two tasks: **code generation** and **code translation**. For code generation, the model uses declaration and docstring as input to generate the solution. For code translation, the model uses declarations in both languages and the solution in the source language as input, to generate solutions in the target language. We remove the description during code translation to prevent the model from directly solving the problem. For both tasks, we use the unbiased pass@k metric proposed in [Codex](https://arxiv.org/abs/2107.03374): $\text{pass}@k:= \mathbb{E}[1-\frac{\tbinom{n-c}{k}}{\tbinom{n}{k}}]$, with $n=200$ and $k\in(1,10,100)$. 12 | 13 | ## How to use HumanEval-X 14 | 15 | Data are stored in ``codegeex/benchmark/humaneval-x/[LANG]/data/humaneval_[LANG].jsonl.gz``, using JSON list format. There are six keys: 16 | 17 | * ``task_id``: indicates the target language and ID of the problem. Language is one of ["Python", "Java", "JavaScript", "CPP", "Go"]. 18 | * ``prompt``: the function declaration and docstring, used for code generation. 19 | * ``declaration``: only the function declaration, used for code translation. 20 | * ``canonical_solution``: human-crafted example solutions. 21 | * ``test``: hidden test samples, used for evaluation. 22 | * ``example_test``: public test samples (appeared in prompt), used for evaluation. 23 | 24 | ### Evaluation Environment 25 | 26 | The evaluation of the generated codes involves compiling and running in multiple programming languages. The versions of the programming language environments and packages we use are as follows: 27 | 28 | | Dependency | Version | 29 | | ---------- | -------- | 30 | | Python | 3.8.12 | 31 | | JDK | 18.0.2.1 | 32 | | Node.js | 16.14.0 | 33 | | js-md5 | 0.7.3 | 34 | | C++ | 11 | 35 | | g++ | 7.5.0 | 36 | | Boost | 1.71.0 | 37 | | OpenSSL | 3.0.0 | 38 | | go | 1.18.4 | 39 | 40 | In order to save everyone the trouble of setting up the environments for these languages, we build a Docker image with the required environments and CodeGeeX installed. 41 | 42 | You can directly pull the image from Docker Hub: 43 | 44 | ```bash 45 | docker pull rishubi/codegeex:latest 46 | ``` 47 | 48 | Alternatively, if you are familiar with Dockerfile, you can build the image from `codegeex/docker/Dockerfile` or configure the Dockerfile as you like it: 49 | 50 | ```bash 51 | cd codegeex/docker 52 | docker build [OPTIONS] . 53 | ``` 54 | 55 | After obtaining the image, you can build a container using the following command: 56 | 57 | ```bash 58 | docker run -it --gpus all --mount type=bind,source=,target= [OPTIONS] 59 | ``` 60 | 61 | ### Evaluation 62 | 63 | We recommend evaluating in [the provided image](#evaluation-environment). To evaluate the generated samples, save generated codes in the following JSON list format: 64 | 65 | ``` 66 | {"task_id": "../..", "generation: "..."} 67 | {"task_id": "../..", "generation: "..."} 68 | ... 69 | ``` 70 | 71 | and evaluate them using the following script under the root directory of the repository (please execute with caution, the generated codes might have unexpected behaviours though with very low possibility. 
See the warnings in [execution.py](execution.py) and uncomment the execution lines at your own risk): 72 | 73 | ```bash 74 | bash scripts/evaluate_humaneval_x.sh 75 | ``` 76 | -------------------------------------------------------------------------------- /codegeex/benchmark/README_zh.md: -------------------------------------------------------------------------------- 1 | # HumanEval-X: 多语言代码生成基准 2 | 3 | 🌐 English 4 | 5 | 为了更好地评测代码生成模型的多语言生成能力,我们构建了一个新基准HumanEval-X。此前,多语言代码生成能力是基于语义相似度(比如[CodeBLEU](https://arxiv.org/abs/2009.10297))衡量的,具有一定误导性;HumanEval-X则可用于衡量生成代码的功能正确性。HumanEval-X包含820个高质量手写样本,覆盖Python、C++、Java、JavaScript、Go,可用于多种任务。 6 | 7 | 8 | 9 |

HumanEval-X支持的任务示例。声明、描述、解答分别用红、绿、蓝色标注。代码生成将声明与描述作为输入,输出解答。代码翻译将两种语言的声明与源语言的解答作为输入,输出目标语言的解答。

10 | 11 | HumanEval-X中每个语言的样本,包含了声明、描述和解答,它们之间的组合可以支持不同的下游任务,包括生成、翻译、概括等。我们目前关注两个任务:**代码生成**与**代码翻译**。对于代码生成任务,模型将函数声明与文档字符串作为输入,输出函数实现;对于代码翻译任务,模型将两种语言的函数声明与源语言的实现作为输入,输出目标语言上的实现。我们在代码翻译任务中不将文档字符串输入模型,以避免模型直接通过描述生成答案。在两种任务下,我们都采用[Codex](https://arxiv.org/abs/2107.03374)所使用的无偏pass@k指标:$\text{pass}@k:= \mathbb{E}[1-\frac{\tbinom{n-c}{k}}{\tbinom{n}{k}}]$, $n=200$, $k\in(1,10,100)$。 12 | 13 | ## 如何使用HumanEval-X 14 | 15 | 样本使用JSON列表格式存储在``codegeex/benchmark/humaneval-x/[LANG]/data/humaneval_[LANG].jsonl.gz``,每条样本包含6个部分: 16 | 17 | * ``task_id``: 题目的目标语言与ID。语言为["Python", "Java", "JavaScript", "CPP", "Go"]中之一。 18 | * ``prompt``: 函数声明与描述,用于代码生成。 19 | * ``declaration``: 仅有函数声明,用于代码翻译。 20 | * ``canonical_solution``: 手写的示例解答。 21 | * ``test``: 隐藏测例,用于评测。 22 | * ``example_test``: 提示中出现的公开测例,用于评测。 23 | 24 | ### 评测环境 25 | 26 | 评测生成的代码需要使用多种语言编译、运行。我们使用的各编程语言依赖及所用包的版本如下: 27 | 28 | | 依赖 | 版本 | 29 | | ------- | -------- | 30 | | Python | 3.8.12 | 31 | | JDK | 18.0.2.1 | 32 | | Node.js | 16.14.0 | 33 | | js-md5 | 0.7.3 | 34 | | C++ | 11 | 35 | | g++ | 7.5.0 | 36 | | Boost | 1.71.0 | 37 | | OpenSSL | 3.0.0 | 38 | | go | 1.18.4 | 39 | 40 | 为了省去使用者配置这些语言环境的麻烦,我们构建了一个Docker镜像,并在其中配置了所需要的环境。 41 | 42 | 可以直接从Docker Hub拉取镜像: 43 | 44 | ```bash 45 | docker pull rishubi/codegeex:latest 46 | ``` 47 | 48 | 如果您熟悉Dockerfile,也可以从`codegeex/docker/Dockerfile`构建镜像,或者修改之以定制自己的配置: 49 | 50 | ```bash 51 | cd codegeex/docker 52 | docker build [OPTIONS] . 53 | ``` 54 | 55 | 获取镜像后,使用如下命令创建容器: 56 | 57 | ```bash 58 | docker run -it --gpus all --mount type=bind,source=,target= [OPTIONS] 59 | ``` 60 | 61 | ### 评测 62 | 63 | 我们推荐使用给定的[评测环境](#评测环境)进行评测。在评测前,将生成的代码以如下JSON列表形式存储: 64 | 65 | ``` 66 | {"task_id": "../..", "generation: "..."} 67 | {"task_id": "../..", "generation: "..."} 68 | ... 69 | ``` 70 | 71 | 并在本仓库的根目录下使用如下指令(请谨慎执行,生成的代码可能有极低概率产生意外行为。在[execution.py](execution.py)中查看警告并取消执行代码的注释,风险自负): 72 | 73 | ```bash 74 | bash scripts/evaluate_humaneval_x.sh 75 | ``` 76 | -------------------------------------------------------------------------------- /codegeex/benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/benchmark/__init__.py -------------------------------------------------------------------------------- /codegeex/benchmark/gather_output.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import fire 4 | import glob 5 | 6 | 7 | def gather_output( 8 | output_dir: str = "./output", 9 | output_prefix: str = None, 10 | if_remove_rank_files: int = 0, 11 | ): 12 | if output_prefix is None: 13 | output_list = glob.glob(output_dir + "/*") 14 | else: 15 | output_list = glob.glob(os.path.join(output_dir, output_prefix + "*")) 16 | 17 | for output_file in output_list: 18 | if "rank0" in output_file: 19 | output_prefix_ = output_file.split("_rank0.jsonl")[0] 20 | rank_files = glob.glob(output_prefix_ + "_rank*") 21 | with open(output_prefix_ + ".jsonl", "w") as f_out: 22 | for rank_file in rank_files: 23 | with open(rank_file, "r") as f_in: 24 | for line in f_in: 25 | f_out.write(line) 26 | if if_remove_rank_files: 27 | os.remove(rank_file) 28 | print(f"Removing {rank_file}...") 29 | 30 | if output_prefix is None: 31 | output_list = glob.glob(output_dir + "/*") 32 | else: 33 | output_list = glob.glob(os.path.join(output_dir, output_prefix + "*")) 34 | 35 | for output_file in output_list: 36 | if "rank" in output_file or 
"_unfinished" in output_file or "all" in output_file or "_result" in output_file: 37 | continue 38 | if "_finished" not in output_file: 39 | continue 40 | output_prefix_ = output_file.split("_finished.jsonl")[0] 41 | files = [output_file, output_prefix_ + "_unfinished.jsonl"] 42 | with open(output_prefix_ + "_all.jsonl", "w") as f_out: 43 | for f in files: 44 | with open(f, "r") as f_in: 45 | for line in f_in: 46 | f_out.write(line) 47 | 48 | print("Gathering finished. Saved in {}".format(output_prefix_ + "_all.jsonl")) 49 | 50 | 51 | def main(): 52 | fire.Fire(gather_output) 53 | 54 | 55 | if __name__ == "__main__": 56 | sys.exit(main()) 57 | -------------------------------------------------------------------------------- /codegeex/benchmark/humaneval-x/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/benchmark/humaneval-x/__init__.py -------------------------------------------------------------------------------- /codegeex/benchmark/humaneval-x/cpp/data/humaneval_cpp.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/benchmark/humaneval-x/cpp/data/humaneval_cpp.jsonl.gz -------------------------------------------------------------------------------- /codegeex/benchmark/humaneval-x/cpp/evaluation/test.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Input to this function is a string containing multiple groups of nested parentheses. Your goal is to 3 | separate those group into separate strings and return the vector of those. 4 | Separate groups are balanced (each open brace is properly closed) and not nested within each other 5 | Ignore any spaces in the input string. 6 | >>> separate_paren_groups("( ) (( )) (( )( ))") 7 | {"()", "(())", "(()())"} 8 | */ 9 | #include 10 | #include 11 | #include 12 | using namespace std; 13 | vector separate_paren_groups(string paren_string){ 14 | 15 | vector all_parens; 16 | string current_paren; 17 | int level=0; 18 | char chr; 19 | int i; 20 | for (i=0;i 43 | bool issame(vector a,vectorb){ 44 | if (a.size()!=b.size()) return false; 45 | for (int i=0;i np.ndarray: 32 | """ 33 | Estimates pass@k of each problem and returns them in an array. 34 | """ 35 | 36 | def estimator(n: int, c: int, k: int) -> float: 37 | """ 38 | Calculates 1 - comb(n - c, k) / comb(n, k). 39 | """ 40 | if n - c < k: 41 | return 1.0 42 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 43 | 44 | if isinstance(num_samples, int): 45 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 46 | else: 47 | assert len(num_samples) == len(num_correct) 48 | num_samples_it = iter(num_samples) 49 | 50 | return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]) 51 | -------------------------------------------------------------------------------- /codegeex/benchmark/rust/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "0.7.20" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "fuchsia-cprng" 16 | version = "0.1.1" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" 19 | 20 | [[package]] 21 | name = "libc" 22 | version = "0.2.139" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" 25 | 26 | [[package]] 27 | name = "md5" 28 | version = "0.7.0" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" 31 | 32 | [[package]] 33 | name = "memchr" 34 | version = "2.5.0" 35 | source = "registry+https://github.com/rust-lang/crates.io-index" 36 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 37 | 38 | [[package]] 39 | name = "rand" 40 | version = "0.4.6" 41 | source = "registry+https://github.com/rust-lang/crates.io-index" 42 | checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" 43 | dependencies = [ 44 | "fuchsia-cprng", 45 | "libc", 46 | "rand_core 0.3.1", 47 | "rdrand", 48 | "winapi", 49 | ] 50 | 51 | [[package]] 52 | name = "rand_core" 53 | version = "0.3.1" 54 | source = "registry+https://github.com/rust-lang/crates.io-index" 55 | checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" 56 | dependencies = [ 57 | "rand_core 0.4.2", 58 | ] 59 | 60 | [[package]] 61 | name = "rand_core" 62 | version = "0.4.2" 63 | source = "registry+https://github.com/rust-lang/crates.io-index" 64 | checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" 65 | 66 | [[package]] 67 | name = "rdrand" 68 | version = "0.4.0" 69 | source = "registry+https://github.com/rust-lang/crates.io-index" 70 | checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" 71 | dependencies = [ 72 | "rand_core 0.3.1", 73 | ] 74 | 75 | [[package]] 76 | name = "regex" 77 | version = "1.7.1" 78 | source = "registry+https://github.com/rust-lang/crates.io-index" 79 | checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" 80 | dependencies = [ 81 | "aho-corasick", 82 | "memchr", 83 | "regex-syntax", 84 | ] 85 | 86 | [[package]] 87 | name = "regex-syntax" 88 | version = "0.6.28" 89 | source = "registry+https://github.com/rust-lang/crates.io-index" 90 | checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" 91 | 92 | [[package]] 93 | name = "rust" 94 | version = "0.1.0" 95 | dependencies = [ 96 | "md5", 97 | "rand", 98 | "regex", 99 | ] 100 | 101 | [[package]] 102 | name = "winapi" 103 | version = "0.3.9" 104 | source = "registry+https://github.com/rust-lang/crates.io-index" 105 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 106 | dependencies = [ 107 | "winapi-i686-pc-windows-gnu", 108 | "winapi-x86_64-pc-windows-gnu", 109 | ] 110 | 111 | [[package]] 112 | name = "winapi-i686-pc-windows-gnu" 113 | version = "0.4.0" 114 | source = "registry+https://github.com/rust-lang/crates.io-index" 115 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 116 | 117 | [[package]] 118 | name = 
"winapi-x86_64-pc-windows-gnu" 119 | version = "0.4.0" 120 | source = "registry+https://github.com/rust-lang/crates.io-index" 121 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 122 | -------------------------------------------------------------------------------- /codegeex/benchmark/rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rust" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | rand = "0.4" 10 | regex = "1" 11 | md5 = "0.7.0" 12 | 13 | -------------------------------------------------------------------------------- /codegeex/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/data/__init__.py -------------------------------------------------------------------------------- /codegeex/data/data_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gzip 3 | import json 4 | 5 | from typing import * 6 | 7 | LANGUAGE_TAG = { 8 | "c" : "// language: C", 9 | "c++" : "// language: C++", 10 | "cpp" : "// language: C++", 11 | "c#" : "// language: C#", 12 | "csharp" : "// language: C#", 13 | "css" : "/* language: CSS */", 14 | "cuda" : "// language: Cuda", 15 | "dart" : "// language: Dart", 16 | "lua" : "// language: Lua", 17 | "objectivec" : "// language: Objective-C", 18 | "objective-c" : "// language: Objective-C", 19 | "objective-c++": "// language: Objective-C++", 20 | "python" : "# language: Python", 21 | "perl" : "# language: Perl", 22 | "prolog" : f"% language: Prolog", 23 | "swift" : "// language: swift", 24 | "lisp" : "; language: Lisp", 25 | "java" : "// language: Java", 26 | "scala" : "// language: Scala", 27 | "tex" : f"% language: TeX", 28 | "vue" : "", 29 | "markdown" : "", 30 | "html" : "", 31 | "php" : "// language: PHP", 32 | "js" : "// language: JavaScript", 33 | "javascript" : "// language: JavaScript", 34 | "typescript" : "// language: TypeScript", 35 | "go" : "// language: Go", 36 | "shell" : "# language: Shell", 37 | "rust" : "// language: Rust", 38 | "sql" : "-- language: SQL", 39 | "kotlin" : "// language: Kotlin", 40 | "vb" : "' language: Visual Basic", 41 | "ruby" : "# language: Ruby", 42 | "pascal" : "// language: Pascal", 43 | "r" : "# language: R", 44 | "fortran" : "!language: Fortran", 45 | "lean" : "-- language: Lean", 46 | "matlab" : f"% language: Matlab", 47 | "delphi" : "{language: Delphi}", 48 | "scheme" : "; language: Scheme", 49 | "basic" : "' language: Basic", 50 | "assembly" : "; language: Assembly", 51 | "groovy" : "// language: Groovy", 52 | "abap" : "* language: Abap", 53 | "gdscript" : "# language: GDScript", 54 | "haskell" : "-- language: Haskell", 55 | "julia" : "# language: Julia", 56 | "elixir" : "# language: Elixir", 57 | "excel" : "' language: Excel", 58 | "clojure" : "; language: Clojure", 59 | "actionscript" : "// language: ActionScript", 60 | "solidity" : "// language: Solidity", 61 | "powershell" : "# language: PowerShell", 62 | "erlang" : f"% language: Erlang", 63 | "cobol" : "// language: Cobol", 64 | } 65 | 66 | 67 | def stream_jsonl(filename: str) -> Iterable[Dict]: 68 | """ 69 | Parses each jsonl line and yields it as a dictionary 70 | """ 71 | if filename.endswith(".gz"): 72 | with open(filename, "rb") 
as gzfp: 73 | with gzip.open(gzfp, "rt") as fp: 74 | for line in fp: 75 | if any(not x.isspace() for x in line): 76 | yield json.loads(line) 77 | else: 78 | with open(filename, "r") as fp: 79 | for line in fp: 80 | if any(not x.isspace() for x in line): 81 | yield json.loads(line) 82 | 83 | 84 | def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False): 85 | """ 86 | Writes an iterable of dictionaries to jsonl 87 | """ 88 | if append: 89 | mode = "ab" 90 | else: 91 | mode = "wb" 92 | filename = os.path.expanduser(filename) 93 | if filename.endswith(".gz"): 94 | with open(filename, mode) as fp: 95 | with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp: 96 | for x in data: 97 | gzfp.write((json.dumps(x) + "\n").encode("utf-8")) 98 | else: 99 | with open(filename, mode) as fp: 100 | for x in data: 101 | fp.write((json.dumps(x) + "\n").encode("utf-8")) 102 | 103 | 104 | def sliding_window( 105 | prompt_tokens: list, 106 | code_tokens: list, 107 | seq_len: int, 108 | sliding_stride: int, 109 | minimum_code_len: int = 1, 110 | ) -> Iterable[Tuple[list, list]]: 111 | """ 112 | Generate a series of (prompt, code) pairs by sliding the window over the code. 113 | """ 114 | prompt_len = len(prompt_tokens) 115 | code_len = len(code_tokens) 116 | total_len = prompt_len + code_len 117 | 118 | start_idx = max(0, prompt_len - seq_len + minimum_code_len) # at least `minimum_code_len` code token should be in the window 119 | end_idx = max(0, total_len - seq_len) 120 | start_idx = min(start_idx, end_idx) 121 | 122 | for i in range(start_idx, end_idx + 1, sliding_stride): 123 | current_prompt = prompt_tokens[i:i + seq_len] 124 | current_code = code_tokens[max(i - prompt_len, 0):i - prompt_len + seq_len] 125 | yield current_prompt, current_code 126 | 127 | if (end_idx - start_idx) % sliding_stride != 0: 128 | current_prompt = prompt_tokens[end_idx:end_idx + seq_len] 129 | current_code = code_tokens[max(end_idx - prompt_len, 0):end_idx - prompt_len + seq_len] 130 | yield current_prompt, current_code 131 | -------------------------------------------------------------------------------- /codegeex/data/process_pretrain_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import fire 4 | import torch 5 | import multiprocessing 6 | 7 | from typing import * 8 | from tqdm.auto import tqdm 9 | from time import perf_counter 10 | from black import format_str, FileMode 11 | 12 | from codegeex.data.types import PromptDataset, PromptSample 13 | from codegeex.data.processor import PromptDatasetProcessor 14 | from codegeex.data.data_utils import stream_jsonl, LANGUAGE_TAG 15 | from codegeex.megatron.data.indexed_dataset import make_mmap_builder 16 | from codegeex.tokenizer import CodeGeeXTokenizer 17 | 18 | 19 | def try_format_code(code: str): 20 | # Auto-correct to PEP8 format (Change tab to 4-whitespaces; 21 | # add whitespace around some special symbols; 22 | # reformat line length < 100, etc.) 
23 | try: 24 | res = format_str(code, mode=FileMode(line_length=200)) 25 | except Exception as e: 26 | res = code 27 | print(e) 28 | print("Wrong python format: {}".format(code)) 29 | return res 30 | 31 | 32 | def load_pretrain_dataset(dataset_path: Union[str, List[str]]) -> Dict: 33 | if type(dataset_path) is str: 34 | dataset_path = [dataset_path] 35 | 36 | for p in dataset_path: 37 | if not os.path.isdir(p): 38 | if p.endswith(".gz") or p.endswith(".jsonl"): 39 | print(f"loading from {p}") 40 | yield from stream_jsonl(p) 41 | else: 42 | p_list = glob.glob(p + "/*") 43 | for p_ in p_list: 44 | if p_.endswith(".gz") or p_.endswith(".jsonl"): 45 | print(f"loading from {p_}") 46 | yield from stream_jsonl(p_) 47 | 48 | 49 | def process_sample( 50 | sample: Dict, 51 | language: str=None, 52 | mode: str="pretrain", 53 | ) -> Iterable[PromptSample]: 54 | if mode == "pretrain": 55 | prompt = "" 56 | else: 57 | prompt = sample["prompt"] 58 | 59 | try: 60 | if language is not None and language in LANGUAGE_TAG.keys(): 61 | code = LANGUAGE_TAG[language] + "\n" + sample["code"] 62 | else: 63 | code = sample["code"] 64 | except Exception as e: 65 | print(e) 66 | print("The key 'code' is missing in data. Aborted") 67 | exit(0) 68 | 69 | yield PromptSample(prompt, code) 70 | 71 | 72 | def generate_prompt_samples( 73 | dataset: Iterable[Dict], 74 | language: str = None, 75 | mode: str = "pretrain", 76 | ) -> PromptDataset: 77 | for sample in dataset: 78 | yield from process_sample(sample, language, mode) 79 | 80 | 81 | def main( 82 | tokenizer_path: str, 83 | dataset_path: Union[str, List[str]], 84 | output_prefix: str, 85 | language: str = None, 86 | mode: str = "pretrain", 87 | discard_overlong: bool = False, 88 | sliding_stride: int = 200, 89 | num_workers: int = 32, 90 | seq_len: int = 2048, 91 | ): 92 | DATA_KEYS = ["input_ids", "attention_mask", "labels"] 93 | 94 | # create output dir 95 | os.makedirs(os.path.dirname(output_prefix), exist_ok=True) 96 | 97 | tokenizer = CodeGeeXTokenizer(tokenizer_path=tokenizer_path) 98 | pad_token_id = tokenizer.eos_token_id 99 | 100 | dataset = load_pretrain_dataset(dataset_path) 101 | prompt_dataset = generate_prompt_samples(dataset, language=language, mode=mode) 102 | 103 | if num_workers == 0: 104 | num_workers = multiprocessing.cpu_count() 105 | pool = multiprocessing.Pool(num_workers) 106 | output_bin_files = {} 107 | output_idx_files = {} 108 | builders = {} 109 | 110 | for key in DATA_KEYS: 111 | output_bin_files[key] = "{}_{}.bin".format(output_prefix, key) 112 | output_idx_files[key] = "{}_{}.idx".format(output_prefix, key) 113 | builders[key] = make_mmap_builder( 114 | output_bin_files[key], 115 | vocab_size=None, # magic number, should change it 116 | ) 117 | 118 | # NOTE that we use seq_len + 1 instead of seq_len, since the input tokens will be shifted by one. 
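# (Illustrative note, not part of the original file:) keeping seq_len + 1 tokens per sample
# means that after the usual causal-LM shift, e.g.
#     inputs  = tokens[:-1]
#     targets = tokens[1:]
# there are still exactly seq_len (input, target) positions to train on.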
119 | processor = PromptDatasetProcessor( 120 | tokenize=tokenizer.encode_code, 121 | pad_token=pad_token_id, 122 | max_seq_len=seq_len + 1, 123 | discard_overlong=discard_overlong, 124 | sliding_stride=sliding_stride, 125 | eod_token=pad_token_id) 126 | 127 | processor.start_time = perf_counter() 128 | doc_iter = pool.imap_unordered(processor.process_sample_strict, 129 | prompt_dataset, 130 | chunksize=20) 131 | 132 | for doc_idx, docs in tqdm(enumerate(doc_iter, start=1)): 133 | processor.doc_processed += 1 134 | for doc in docs: 135 | processor.doc_generated += 1 136 | for key in DATA_KEYS: 137 | builders[key].add_item(torch.IntTensor(doc[key])) 138 | 139 | for key in DATA_KEYS: 140 | builders[key].finalize(output_idx_files[key]) 141 | 142 | 143 | if __name__ == "__main__": 144 | fire.Fire(main) 145 | -------------------------------------------------------------------------------- /codegeex/data/types.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass 6 | class PromptSample: 7 | prompt: str 8 | code: str 9 | extra: dict = None 10 | 11 | 12 | PromptDataset = Iterable[PromptSample] 13 | 14 | @dataclass 15 | class LabelSample: 16 | prompt: str 17 | label: int 18 | extra: dict = None 19 | 20 | LabelDataset = Iterable[LabelSample] -------------------------------------------------------------------------------- /codegeex/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime 2 | 3 | RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak \ 4 | && sed -i "s@http://.*archive.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list \ 5 | && sed -i "s@http://.*security.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list \ 6 | && pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple \ 7 | && apt-get update && apt-get install -y curl npm git nano \ 8 | && pip install fire zmq transformers tokenizers \ 9 | && mkdir /workspace/download 10 | 11 | RUN curl -o /workspace/download/go.tar.gz -SL https://go.dev/dl/go1.18.4.linux-amd64.tar.gz \ 12 | && tar -zxf /workspace/download/go.tar.gz -C /usr/local && rm /workspace/download/go.tar.gz 13 | ENV PATH=/bin:/usr/local/go/bin:$PATH 14 | 15 | RUN curl -o /workspace/download/node.tar.gz -SL https://nodejs.org/download/release/v16.14.0/node-v16.14.0-linux-x64.tar.gz \ 16 | && mkdir -p /usr/local/lib/nodejs && tar -zxf /workspace/download/node.tar.gz -C /usr/local/lib/nodejs && mv /usr/local/lib/nodejs/node-v16.14.0-linux-x64 /usr/local/lib/nodejs/node \ 17 | && rm /workspace/download/node.tar.gz && npm install -g js-md5@0.7.3 18 | ENV PATH=/usr/local/lib/nodejs/node/bin:$PATH 19 | ENV NODE_PATH=/usr/local/lib/node_modules 20 | 21 | RUN apt-get install -y build-essential && apt-get install -y g++ \ 22 | && curl -o /workspace/download/boost_1_71_0.tar.gz -SL https://boostorg.jfrog.io/artifactory/main/release/1.71.0/source/boost_1_71_0.tar.gz \ 23 | && tar -zxf /workspace/download/boost_1_71_0.tar.gz && rm /workspace/download/boost_1_71_0.tar.gz && cd boost_1_71_0 \ 24 | && ./bootstrap.sh --prefix=/usr/ && ./b2 && ./b2 install \ 25 | && cd .. 
&& rm -r boost_1_71_0 26 | RUN curl -o /workspace/download/openssl.tar.gz -SL https://www.openssl.org/source/old/3.0/openssl-3.0.0.tar.gz \ 27 | && tar -zxf /workspace/download/openssl.tar.gz && cd openssl-3.0.0 && ./Configure && make && make install \ 28 | && rm /workspace/download/openssl.tar.gz && rm -r /workspace/openssl-3.0.0 29 | ENV PATH=/usr/bin/openssl:$PATH 30 | 31 | RUN curl -o /workspace/download/jdk.tar.gz -SL https://download.oracle.com/java/18/latest/jdk-18_linux-x64_bin.tar.gz \ 32 | && mkdir /usr/java && tar -zxf /workspace/download/jdk.tar.gz -C /usr/java && rm /workspace/download/jdk.tar.gz \ 33 | && java_path=`ls /usr/java/${path}` && echo "export JAVA_HOME=/usr/java/${java_path}" >> ~/.profile 34 | 35 | RUN git clone https://github.com/THUDM/CodeGeeX.git && pip install -e /workspace/CodeGeeX \ 36 | && tar -zxf /workspace/CodeGeeX/codegeex/benchmark/humaneval-x/go/evaluation/vendor.tar.gz -C /workspace/CodeGeeX/codegeex/benchmark/humaneval-x/go/evaluation 37 | 38 | SHELL ["sh", "-lc"] 39 | RUN update-alternatives --install /usr/bin/java java $JAVA_HOME/bin/java 20000 \ 40 | && update-alternatives --install /usr/bin/javac javac $JAVA_HOME/bin/javac 20000 \ 41 | && rm -r /workspace/download \ 42 | && env 43 | -------------------------------------------------------------------------------- /codegeex/kernels/__init__.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | import torch 3 | import ctypes 4 | 5 | from typing import List 6 | from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up 7 | 8 | RESOURCE_PACKAGE_NAME = __name__ 9 | 10 | 11 | class Kernel: 12 | def __init__(self, filename: str, function_names: List[str]): 13 | filename = filename + ".fatbin" 14 | if not pkg_resources.resource_exists(RESOURCE_PACKAGE_NAME, filename): 15 | raise RuntimeError("File `%s` not found in `%s`" % (filename, RESOURCE_PACKAGE_NAME)) 16 | self.filename = filename 17 | self.code = pkg_resources.resource_string(RESOURCE_PACKAGE_NAME, filename) 18 | self._function_names = function_names 19 | self._cmodule = LazyKernelCModule(self.code) 20 | 21 | for name in self._function_names: 22 | setattr(self, name, KernelFunction(self._cmodule, name)) 23 | 24 | 25 | kernels = Kernel( 26 | "quantization", 27 | [ 28 | "int4WeightCompression", 29 | "int4WeightExtractionFloat", 30 | "int4WeightExtractionHalf", 31 | "int8WeightExtractionFloat", 32 | "int8WeightExtractionHalf", 33 | ], 34 | ) 35 | 36 | 37 | def compress_int4_weight(weight: torch.Tensor): # (n, m) 38 | with torch.cuda.device(weight.device): 39 | n, m = weight.size(0), weight.size(1) 40 | assert m % 2 == 0 41 | m = m // 2 42 | out = torch.empty(n, m, dtype=torch.int8, device="cuda") 43 | stream = torch.cuda.current_stream() 44 | 45 | gridDim = (n, 1, 1) 46 | blockDim = (min(round_up(m, 32), 1024), 1, 1) 47 | 48 | kernels.int4WeightCompression( 49 | gridDim, 50 | blockDim, 51 | 0, 52 | stream, 53 | [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], 54 | ) 55 | return out 56 | 57 | 58 | def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int): 59 | if source_bit_width == 8: 60 | func = kernels.int8WeightExtractionHalf 61 | elif source_bit_width == 4: 62 | func = kernels.int4WeightExtractionHalf 63 | else: 64 | assert False, "Unsupported bit-width" 65 | 66 | with torch.cuda.device(weight.device): 67 | n, m = weight.size(0), weight.size(1) 68 | out = 
torch.empty(n, m * (8 // source_bit_width), dtype=torch.half, device="cuda") 69 | stream = torch.cuda.current_stream() 70 | 71 | gridDim = (n, 1, 1) 72 | blockDim = (min(round_up(m, 32), 1024), 1, 1) 73 | 74 | func( 75 | gridDim, 76 | blockDim, 77 | 0, 78 | stream, 79 | [ 80 | ctypes.c_void_p(weight.data_ptr()), 81 | ctypes.c_void_p(scale_list.data_ptr()), 82 | ctypes.c_void_p(out.data_ptr()), 83 | ctypes.c_int32(n), 84 | ctypes.c_int32(m), 85 | ], 86 | ) 87 | return out 88 | 89 | 90 | if __name__ == "__main__": 91 | weight = torch.randn(4, 32).to(torch.int8).cuda() 92 | scale = torch.ones(weight.size(0)).to(torch.half).cuda() 93 | 94 | print(weight) 95 | b = compress_int4_weight(weight) 96 | print(b) 97 | 98 | a = extract_weight_to_half(b, scale, source_bit_width=4) 99 | print(a) 100 | -------------------------------------------------------------------------------- /codegeex/kernels/quantization.fatbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/kernels/quantization.fatbin -------------------------------------------------------------------------------- /codegeex/megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | import torch 16 | 17 | from .global_vars import get_args 18 | from .global_vars import get_current_global_batch_size 19 | from .global_vars import get_num_microbatches 20 | from .global_vars import update_num_microbatches 21 | from .global_vars import get_tokenizer 22 | from .global_vars import get_tensorboard_writer 23 | from .global_vars import get_adlr_autoresume 24 | from .global_vars import get_timers 25 | from .initialize import initialize_megatron 26 | 27 | 28 | def print_rank_0(message): 29 | """If distributed is initialized, print only on rank 0.""" 30 | if torch.distributed.is_initialized(): 31 | if torch.distributed.get_rank() == 0: 32 | print(message, flush=True) 33 | else: 34 | print(message, flush=True) 35 | 36 | 37 | def is_last_rank(): 38 | return torch.distributed.get_rank() == (torch.distributed.get_world_size() - 1) 39 | 40 | 41 | def print_rank_last(message): 42 | """If distributed is initialized, print only on last rank.""" 43 | if torch.distributed.is_initialized(): 44 | if is_last_rank(): 45 | print(message, flush=True) 46 | else: 47 | print(message, flush=True) 48 | -------------------------------------------------------------------------------- /codegeex/megatron/convert_ckpt_parallel.py: -------------------------------------------------------------------------------- 1 | """Get model parallel partitions.""" 2 | 3 | import os 4 | import torch 5 | import argparse 6 | 7 | 8 | def get_change_ckpt_args(parser): 9 | """Provide extra arguments required for merging.""" 10 | group = parser.add_argument_group(title='Mindspore to megatron') 11 | group.add_argument( 12 | '--load-ckpt-path', 13 | type=str, 14 | required=True, 15 | help='path to load ".pt" checkpoint.', 16 | ) 17 | group.add_argument( 18 | '--save-ckpt-path', 19 | type=str, 20 | required=True, 21 | help='dir to save converted checkpoints.', 22 | ) 23 | group.add_argument( 24 | '--target-tensor-model-parallel-size', 25 | type=int, 26 | default=2, 27 | help='target tensor model parallel size', 28 | ) 29 | 30 | return parser 31 | 32 | 33 | def get_element_from_dict_by_path(d, path): 34 | """ 35 | Get element from dictionary by path. If element is not present, recursively add empty dictionaries. 36 | Args: 37 | d (dict): the dictionary to get the element from 38 | path (list): the path to the element which is delimited by "." 
39 | """ 40 | path = path.split(".") 41 | for k in path: 42 | if k not in d: 43 | d[k] = {} 44 | d = d[k] 45 | return d 46 | 47 | 48 | def main(): 49 | parser = argparse.ArgumentParser() 50 | parser = get_change_ckpt_args(parser) 51 | args, _ = parser.parse_known_args() 52 | 53 | print(f"Load ckpt from {args.load_ckpt_path}...") 54 | state_dict = torch.load(args.load_ckpt_path, map_location="cpu") 55 | 56 | print(f"Spliting ckpt into {args.target_tensor_model_parallel_size} parts...") 57 | output_state_dict = [] 58 | for i in range(args.target_tensor_model_parallel_size): 59 | output_state_dict.append({}) 60 | 61 | print("Converting Embedding layers...") 62 | word_embeddings = state_dict['module']['language_model']['embedding']['word_embeddings']['weight'] 63 | position_embeddings = state_dict['module']['language_model']['embedding']['position_embeddings']['weight'] 64 | out_word_embeddings = torch.chunk(word_embeddings, args.target_tensor_model_parallel_size, dim=0) 65 | 66 | for i in range(args.target_tensor_model_parallel_size): 67 | pos_emb_dict = get_element_from_dict_by_path( 68 | output_state_dict[i], "module.language_model.embedding.position_embeddings" 69 | ) 70 | pos_emb_dict["weight"] = position_embeddings 71 | 72 | word_emb_dict = get_element_from_dict_by_path( 73 | output_state_dict[i], "module.language_model.embedding.word_embeddings" 74 | ) 75 | word_emb_dict["weight"] = out_word_embeddings[i].clone() 76 | 77 | print("Converting QueryEmbedding layers...") 78 | query_embeddings = state_dict['module']['language_model']['topQueryEmbedding']['top_query_embeddings']['weight'] 79 | out_query_embeddings = torch.chunk(query_embeddings, args.target_tensor_model_parallel_size, dim=0) 80 | 81 | for i in range(args.target_tensor_model_parallel_size): 82 | query_emb_dict = get_element_from_dict_by_path( 83 | output_state_dict[i], "module.language_model.topQueryEmbedding.top_query_embeddings" 84 | ) 85 | query_emb_dict["weight"] = out_query_embeddings[i].clone() 86 | 87 | print("Converting Transformer layers...") 88 | for layer_name in state_dict['module']['language_model']['transformer'].keys(): 89 | params = state_dict['module']['language_model']['transformer'][layer_name] 90 | if "layernorm" in layer_name: 91 | pass 92 | elif "attention" in layer_name and "weight" in layer_name: 93 | if "dense" in layer_name: 94 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=1) 95 | else: 96 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=0) 97 | elif "weight" in layer_name and "dense" in layer_name: 98 | if "h_to_4h" in layer_name: 99 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=0) 100 | else: 101 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=1) 102 | elif "bias" in layer_name: 103 | if "dense" not in layer_name or "mlp" in layer_name: 104 | if "4h_to_h" in layer_name: 105 | pass 106 | else: 107 | params = torch.chunk(params, args.target_tensor_model_parallel_size, dim=0) 108 | 109 | for i in range(args.target_tensor_model_parallel_size): 110 | params_dict = get_element_from_dict_by_path(output_state_dict[i], "module.language_model.transformer") 111 | if type(params) is tuple: 112 | params_dict[layer_name] = params[i].clone() 113 | else: 114 | params_dict[layer_name] = params 115 | 116 | os.makedirs(args.save_ckpt_path, exist_ok=True) 117 | for rank in range(args.target_tensor_model_parallel_size): 118 | save_ckpt_path = os.path.join(args.save_ckpt_path, 
f"mp_rank_{rank:02d}_model_states.pt") 119 | torch.save(output_state_dict[rank], save_ckpt_path) 120 | print(f"Converted checkpoint saved in {save_ckpt_path}.") 121 | 122 | 123 | if __name__ == '__main__': 124 | main() 125 | -------------------------------------------------------------------------------- /codegeex/megatron/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/megatron/data/__init__.py -------------------------------------------------------------------------------- /codegeex/megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Blendable dataset.""" 17 | 18 | import time 19 | import torch 20 | import numpy as np 21 | 22 | from codegeex.megatron import print_rank_0 23 | 24 | 25 | class BlendableDataset(torch.utils.data.Dataset): 26 | def __init__(self, datasets, weights): 27 | 28 | self.datasets = datasets 29 | num_datasets = len(datasets) 30 | assert num_datasets == len(weights) 31 | 32 | self.size = 0 33 | for dataset in self.datasets: 34 | self.size += len(dataset) 35 | 36 | # Normalize weights. 37 | weights = np.array(weights, dtype=np.float64) 38 | sum_weights = np.sum(weights) 39 | assert sum_weights > 0.0 40 | weights /= sum_weights 41 | 42 | # Build indecies. 43 | start_time = time.time() 44 | assert num_datasets < 255 45 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 46 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 47 | 48 | from megatron.data import helpers 49 | 50 | helpers.build_blending_indices( 51 | self.dataset_index, 52 | self.dataset_sample_index, 53 | weights, 54 | num_datasets, 55 | self.size, 56 | torch.distributed.get_rank() == 0, 57 | ) 58 | print_rank_0( 59 | "> elapsed time for building blendable dataset indices: " 60 | "{:.2f} (sec)".format(time.time() - start_time) 61 | ) 62 | 63 | def __len__(self): 64 | return self.size 65 | 66 | def __getitem__(self, idx): 67 | dataset_idx = self.dataset_index[idx] 68 | sample_idx = self.dataset_sample_index[idx] 69 | return self.datasets[dataset_idx][sample_idx] 70 | -------------------------------------------------------------------------------- /codegeex/megatron/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import enum 17 | 18 | 19 | class LayerType(enum.Enum): 20 | encoder = 1 21 | decoder = 2 22 | 23 | 24 | class AttnType(enum.Enum): 25 | self_attn = 1 26 | cross_attn = 2 27 | 28 | 29 | class AttnMaskType(enum.Enum): 30 | padding = 1 31 | causal = 2 32 | -------------------------------------------------------------------------------- /codegeex/megatron/memory.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | 20 | # A dictionary of all the memory buffers allocated. 21 | _MEM_BUFFS = dict() 22 | 23 | 24 | def allocate_mem_buff(name, numel, dtype, track_usage): 25 | """Allocate a memory buffer.""" 26 | assert name not in _MEM_BUFFS, "memory buffer {} already allocated.".format(name) 27 | _MEM_BUFFS[name] = MemoryBuffer(name, numel, dtype, track_usage) 28 | return _MEM_BUFFS[name] 29 | 30 | 31 | def get_mem_buff(name): 32 | """Get the memory buffer.""" 33 | return _MEM_BUFFS[name] 34 | 35 | 36 | class MemoryBuffer: 37 | """Contiguous memory buffer. 38 | Allocate a contiguous memory of type `dtype` and size `numel`. It is 39 | used to reduce memory fragmentation. 40 | 41 | Usage: After the allocation, the `_start` index is set tot the first 42 | index of the memory. A memory chunk starting from `_start` index 43 | can be `allocated` for an input tensor, with the elements of the 44 | tensor being coppied. The buffer can be reused by resetting the 45 | `_start` index. 46 | 47 | """ 48 | 49 | def __init__(self, name, numel, dtype, track_usage): 50 | if torch.distributed.get_rank() == 0: 51 | element_size = torch.tensor([], dtype=dtype).element_size() 52 | print( 53 | "> building the {} memory buffer with {} num elements " 54 | "and {} dtype ({:.1f} MB)...".format( 55 | name, numel, dtype, numel * element_size / 1024 / 1024 56 | ), 57 | flush=True, 58 | ) 59 | self.name = name 60 | self.numel = numel 61 | self.dtype = dtype 62 | self.data = torch.empty( 63 | self.numel, 64 | dtype=self.dtype, 65 | device=torch.cuda.current_device(), 66 | requires_grad=False, 67 | ) 68 | 69 | # Index tracking the start of the free memory. 70 | self._start = 0 71 | 72 | # Values used for tracking usage. 
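        # (When enabled, get_data() below accumulates both the number of
        # elements currently in use and the total buffer capacity, so that
        # print_average_usage() can report average utilization over time.)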
73 | self.track_usage = track_usage 74 | if self.track_usage: 75 | self.in_use_value = 0.0 76 | self.total_value = 0.0 77 | 78 | def reset(self): 79 | """Reset the buffer start index to the beginning of the buffer.""" 80 | self._start = 0 81 | 82 | def is_in_use(self): 83 | """Whether the current buffer hold on to any memory.""" 84 | return self._start > 0 85 | 86 | def numel_in_use(self): 87 | """Return number of elements in use.""" 88 | return self._start 89 | 90 | def add(self, tensor): 91 | """Allocate a chunk of memory from the buffer to tensor and copy 92 | the values.""" 93 | assert ( 94 | tensor.dtype == self.dtype 95 | ), "Input tensor type {} different from buffer type {}".format( 96 | tensor.dtype, self.dtype 97 | ) 98 | # Number of elements of the input tensor. 99 | tensor_numel = torch.numel(tensor) 100 | new_start = self._start + tensor_numel 101 | assert ( 102 | new_start <= self.numel 103 | ), "Not enough memory left in the buffer ({} > {})".format( 104 | tensor_numel, self.numel - self._start 105 | ) 106 | # New tensor is a view into the memory. 107 | new_tensor = self.data[self._start : new_start] 108 | self._start = new_start 109 | new_tensor = new_tensor.view(tensor.shape) 110 | new_tensor.copy_(tensor) 111 | # Return a pointer to the new tensor. 112 | return new_tensor 113 | 114 | def get_data(self): 115 | """Return the data currently in use.""" 116 | if self.track_usage: 117 | self.in_use_value += float(self._start) 118 | self.total_value += float(self.numel) 119 | return self.data[: self._start] 120 | 121 | def print_average_usage(self): 122 | """Print memory usage average over time. We would like this value 123 | to be as high as possible.""" 124 | assert self.track_usage, "You need to enable track usage." 125 | if torch.distributed.get_rank() == 0: 126 | print( 127 | " > usage of {} memory buffer: {:.2f} %".format( 128 | self.name, self.in_use_value * 100.0 / self.total_value 129 | ), 130 | flush=True, 131 | ) 132 | 133 | 134 | class RingMemBuffer: 135 | """A ring of memory buffers.""" 136 | 137 | def __init__(self, name, num_buffers, numel, dtype, track_usage): 138 | self.num_buffers = num_buffers 139 | self.buffers = [ 140 | allocate_mem_buff(name + " {}".format(i), numel, dtype, track_usage) 141 | for i in range(num_buffers) 142 | ] 143 | self._index = -1 144 | 145 | def get_next_buffer(self): 146 | self._index += 1 147 | self._index = self._index % self.num_buffers 148 | buff = self.buffers[self._index] 149 | assert not buff.is_in_use(), "buffer is already in use." 150 | return buff 151 | -------------------------------------------------------------------------------- /codegeex/megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
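A minimal usage sketch for the MemoryBuffer API defined in codegeex/megatron/memory.py above (illustrative only: it assumes torch.distributed is already initialized, a CUDA device is available, and the buffer name, size, and tensor are placeholders rather than values from the repository):

import torch
from codegeex.megatron.memory import allocate_mem_buff

# Reserve one contiguous fp16 buffer; track_usage=True enables the statistics.
buf = allocate_mem_buff("activations", numel=1 << 20, dtype=torch.half, track_usage=True)

x = torch.randn(128, 1024, dtype=torch.half, device="cuda")
view = buf.add(x)            # copies x into the buffer and returns a view shaped like x
used = buf.get_data()        # flat view of everything allocated so far
buf.print_average_usage()    # average in-use percentage (requires track_usage=True)
buf.reset()                  # start reusing the buffer from the beginning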
15 | 16 | from torch.nn import LayerNorm 17 | from .distributed import DistributedDataParallel 18 | from .codegeex_model import CodeGeeXModel 19 | from .language_model import get_language_model 20 | from .module import Float16Module -------------------------------------------------------------------------------- /codegeex/megatron/model/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Utilities for models.""" 17 | 18 | import math 19 | import torch 20 | 21 | 22 | def init_method_normal(sigma): 23 | """Init method based on N(0, sigma).""" 24 | 25 | def init_(tensor): 26 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 27 | 28 | return init_ 29 | 30 | 31 | def scaled_init_method_normal(sigma, num_layers): 32 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 33 | std = sigma / math.sqrt(2.0 * num_layers) 34 | 35 | def init_(tensor): 36 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 37 | 38 | return init_ 39 | 40 | 41 | def attention_mask_func(attention_scores, attention_mask): 42 | attention_scores.masked_fill_(attention_mask, -10000.0) 43 | 44 | return attention_scores 45 | 46 | 47 | def get_linear_layer(rows, columns, init_method): 48 | """Simple linear layer with weight initialization.""" 49 | layer = torch.nn.Linear(rows, columns) 50 | init_method(layer.weight) 51 | with torch.no_grad(): 52 | layer.bias.zero_() 53 | return layer 54 | 55 | 56 | def fast_gelu(x): 57 | """Mindspore's fast gelu implementation.""" 58 | return x / (1 + torch.exp(-1.702 * torch.abs(x))) * torch.exp(0.851 * (x - torch.abs(x))) 59 | 60 | 61 | @torch.jit.script 62 | def gelu_impl(x): 63 | """OpenAI's gelu implementation.""" 64 | return ( 65 | 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) 66 | ) 67 | 68 | 69 | def openai_gelu(x): 70 | return gelu_impl(x) 71 | 72 | 73 | # This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 74 | @torch.jit.script 75 | def erf_gelu(x): 76 | return ( 77 | x 78 | * 0.5 79 | * ( 80 | torch.erf(x / 1.41421).to(dtype=x.dtype) 81 | + torch.ones_like(x).to(dtype=x.dtype) 82 | ) 83 | ) 84 | -------------------------------------------------------------------------------- /codegeex/megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .initialize import is_unitialized 23 | from .initialize import destroy_model_parallel 24 | from .initialize import get_data_parallel_group 25 | from .initialize import get_data_parallel_rank 26 | from .initialize import get_data_parallel_world_size 27 | from .initialize import get_embedding_group 28 | from .initialize import get_model_parallel_group 29 | from .initialize import get_tensor_model_parallel_group 30 | from .initialize import get_pipeline_model_parallel_group 31 | from .initialize import get_tensor_model_parallel_rank, set_tensor_model_parallel_rank 32 | from .initialize import ( 33 | get_pipeline_model_parallel_rank, 34 | set_pipeline_model_parallel_rank, 35 | ) 36 | from .initialize import is_pipeline_first_stage, is_pipeline_last_stage 37 | from .initialize import get_tensor_model_parallel_src_rank 38 | from .initialize import get_pipeline_model_parallel_first_rank 39 | from .initialize import get_pipeline_model_parallel_last_rank 40 | from .initialize import get_pipeline_model_parallel_next_rank 41 | from .initialize import get_pipeline_model_parallel_prev_rank 42 | from .initialize import ( 43 | get_tensor_model_parallel_world_size, 44 | set_tensor_model_parallel_world_size, 45 | ) 46 | from .initialize import ( 47 | get_pipeline_model_parallel_world_size, 48 | set_pipeline_model_parallel_world_size, 49 | ) 50 | from .initialize import ( 51 | get_virtual_pipeline_model_parallel_rank, 52 | set_virtual_pipeline_model_parallel_rank, 53 | ) 54 | from .initialize import initialize_model_parallel 55 | from .initialize import model_parallel_is_initialized 56 | from .initialize import get_model_parallel_world_size, get_model_parallel_rank 57 | 58 | from .layers import ColumnParallelLinear 59 | from .layers import RowParallelLinear 60 | from .layers import VocabParallelEmbedding 61 | from .layers import ( 62 | set_tensor_model_parallel_attributes, 63 | set_defaults_if_not_set_tensor_model_parallel_attributes, 64 | copy_tensor_model_parallel_attributes, 65 | ) 66 | 67 | from .mappings import copy_to_tensor_model_parallel_region 68 | from .mappings import gather_from_tensor_model_parallel_region 69 | from .mappings import reduce_from_tensor_model_parallel_region 70 | from .mappings import scatter_to_tensor_model_parallel_region 71 | 72 | from .random import checkpoint 73 | from .random import get_cuda_rng_tracker 74 | from .random import init_checkpointed_activations_memory_buffer 75 | from .random import model_parallel_cuda_manual_seed 76 | from .random import reset_checkpointed_activations_memory_buffer 77 | from .random import gather_split_1d_tensor 78 | from .random import split_tensor_into_1d_equal_chunks 79 | 80 | from .utils import divide 81 | from .utils import split_tensor_along_last_dim 82 | -------------------------------------------------------------------------------- /codegeex/megatron/mpu/cross_entropy.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | from .initialize import get_tensor_model_parallel_group 20 | from .initialize import get_tensor_model_parallel_rank 21 | from .initialize import get_tensor_model_parallel_world_size 22 | from .utils import VocabUtility 23 | 24 | 25 | class _VocabParallelCrossEntropy(torch.autograd.Function): 26 | @staticmethod 27 | def forward(ctx, vocab_parallel_logits, target): 28 | 29 | # Maximum value along vocab dimension across all GPUs. 30 | logits_max = torch.max(vocab_parallel_logits, dim=-1)[0] 31 | torch.distributed.all_reduce( 32 | logits_max, 33 | op=torch.distributed.ReduceOp.MAX, 34 | group=get_tensor_model_parallel_group(), 35 | ) 36 | # Subtract the maximum value. 37 | vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1)) 38 | 39 | # Get the partition's vocab indecies 40 | get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size 41 | partition_vocab_size = vocab_parallel_logits.size()[-1] 42 | rank = get_tensor_model_parallel_rank() 43 | world_size = get_tensor_model_parallel_world_size() 44 | vocab_start_index, vocab_end_index = get_vocab_range( 45 | partition_vocab_size, rank, world_size 46 | ) 47 | 48 | # Create a mask of valid vocab ids (1 means it needs to be masked). 49 | target_mask = (target < vocab_start_index) | (target >= vocab_end_index) 50 | masked_target = target.clone() - vocab_start_index 51 | masked_target[target_mask] = 0 52 | 53 | # Get predicted-logits = logits[target]. 54 | # For Simplicity, we convert logits to a 2-D tensor with size 55 | # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. 56 | logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size) 57 | masked_target_1d = masked_target.view(-1) 58 | arange_1d = torch.arange( 59 | start=0, end=logits_2d.size()[0], device=logits_2d.device 60 | ) 61 | predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] 62 | predicted_logits_1d = predicted_logits_1d.clone().contiguous() 63 | predicted_logits = predicted_logits_1d.view_as(target) 64 | predicted_logits[target_mask] = 0.0 65 | # All reduce is needed to get the chunks from other GPUs. 66 | torch.distributed.all_reduce( 67 | predicted_logits, 68 | op=torch.distributed.ReduceOp.SUM, 69 | group=get_tensor_model_parallel_group(), 70 | ) 71 | 72 | # Sum of exponential of logits along vocab dimension across all GPUs. 73 | exp_logits = vocab_parallel_logits 74 | torch.exp(vocab_parallel_logits, out=exp_logits) 75 | sum_exp_logits = exp_logits.sum(dim=-1) 76 | torch.distributed.all_reduce( 77 | sum_exp_logits, 78 | op=torch.distributed.ReduceOp.SUM, 79 | group=get_tensor_model_parallel_group(), 80 | ) 81 | 82 | # Loss = log(sum(exp(logits))) - predicted-logit. 
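        # This is the standard cross-entropy identity
        #     -log softmax(z)[t] = log(sum_i exp(z_i)) - z_t
        # evaluated with the vocabulary sharded across tensor-parallel ranks:
        # sum_exp_logits and predicted_logits were each all-reduced above, so
        # every rank in the group computes the same loss. (The logits were
        # already shifted by their per-row maximum for numerical stability.)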
83 | loss = torch.log(sum_exp_logits) - predicted_logits 84 | 85 | # Store softmax, target-mask and masked-target for backward pass. 86 | exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) 87 | ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) 88 | 89 | return loss 90 | 91 | @staticmethod 92 | def backward(ctx, grad_output): 93 | 94 | # Retreive tensors from the forward path. 95 | softmax, target_mask, masked_target_1d = ctx.saved_tensors 96 | 97 | # All the inputs have softmax as their gradient. 98 | grad_input = softmax 99 | # For simplicity, work with the 2D gradient. 100 | partition_vocab_size = softmax.size()[-1] 101 | grad_2d = grad_input.view(-1, partition_vocab_size) 102 | 103 | # Add the gradient from matching classes. 104 | arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) 105 | grad_2d[arange_1d, masked_target_1d] -= 1.0 - target_mask.view(-1).float() 106 | 107 | # Finally elementwise multiplication with the output gradients. 108 | grad_input.mul_(grad_output.unsqueeze(dim=-1)) 109 | 110 | return grad_input, None 111 | 112 | 113 | def vocab_parallel_cross_entropy(vocab_parallel_logits, target): 114 | """Helper function for the cross entropy.""" 115 | return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) 116 | -------------------------------------------------------------------------------- /codegeex/megatron/mpu/data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | from .initialize import get_tensor_model_parallel_group 19 | from .initialize import get_tensor_model_parallel_rank 20 | from .initialize import get_tensor_model_parallel_src_rank 21 | 22 | 23 | _MAX_DATA_DIM = 5 24 | 25 | 26 | def _check_data_types(keys, data, target_dtype): 27 | """Check that all the keys have the same target data type.""" 28 | for key in keys: 29 | assert ( 30 | data[key].dtype == target_dtype 31 | ), "{} has data type {} which " "is different than {}".format( 32 | key, data[key].dtype, target_dtype 33 | ) 34 | 35 | 36 | def _build_key_size_numel_dictionaries(keys, data): 37 | """Build the size on rank 0 and broadcast.""" 38 | max_dim = _MAX_DATA_DIM 39 | sizes = [0 for _ in range(max_dim) for _ in keys] 40 | 41 | # Pack the sizes on rank zero. 42 | if get_tensor_model_parallel_rank() == 0: 43 | offset = 0 44 | for key in keys: 45 | assert data[key].dim() < max_dim, "you should increase MAX_DATA_DIM" 46 | size = data[key].size() 47 | for i, s in enumerate(size): 48 | sizes[i + offset] = s 49 | offset += max_dim 50 | 51 | # Move to GPU and broadcast. 52 | sizes_cuda = torch.cuda.LongTensor(sizes) 53 | torch.distributed.broadcast( 54 | sizes_cuda, 55 | get_tensor_model_parallel_src_rank(), 56 | group=get_tensor_model_parallel_group(), 57 | ) 58 | 59 | # Move back to cpu and unpack. 
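    # Each key owns a fixed window of max_dim slots in `sizes`; unused slots
    # remain 0, so the while-loop below reads dimensions until it hits a zero.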
60 | sizes_cpu = sizes_cuda.cpu() 61 | key_size = {} 62 | key_numel = {} 63 | total_numel = 0 64 | offset = 0 65 | for key in keys: 66 | i = 0 67 | size = [] 68 | numel = 1 69 | while sizes_cpu[offset + i] > 0: 70 | this_size = sizes_cpu[offset + i] 71 | size.append(this_size) 72 | numel *= this_size 73 | i += 1 74 | key_size[key] = size 75 | key_numel[key] = numel 76 | total_numel += numel 77 | offset += max_dim 78 | 79 | return key_size, key_numel, total_numel 80 | 81 | 82 | def broadcast_data(keys, data, datatype): 83 | """Broadcast data from rank zero of each model parallel group to the 84 | members of the same model parallel group. 85 | 86 | Arguments: 87 | keys: list of keys in the data disctionary to be broadcasted 88 | data: data dictionary of string keys and cpu tensor values. 89 | datatype: torch data type of all tensors in data associated 90 | with keys. 91 | """ 92 | # Build (key, size) and (key, number of elements) dictionaries along 93 | # with the total number of elements on all ranks. 94 | key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data) 95 | 96 | # Pack on rank zero. 97 | if get_tensor_model_parallel_rank() == 0: 98 | # Check that all keys have the same data type. 99 | _check_data_types(keys, data, datatype) 100 | # Flatten the data associated with the keys 101 | flatten_data = torch.cat( 102 | [data[key].contiguous().view(-1) for key in keys], dim=0 103 | ).cuda() 104 | else: 105 | flatten_data = torch.empty( 106 | total_numel, device=torch.cuda.current_device(), dtype=datatype 107 | ) 108 | 109 | # Broadcast 110 | torch.distributed.broadcast( 111 | flatten_data, 112 | get_tensor_model_parallel_src_rank(), 113 | group=get_tensor_model_parallel_group(), 114 | ) 115 | 116 | # Unpack 117 | output = {} 118 | offset = 0 119 | for key in keys: 120 | size = key_size[key] 121 | numel = key_numel[key] 122 | output[key] = flatten_data.narrow(0, offset, numel).view(size) 123 | offset += numel 124 | 125 | return output 126 | -------------------------------------------------------------------------------- /codegeex/megatron/mpu/mappings.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | from .initialize import ( 19 | get_tensor_model_parallel_group, 20 | get_tensor_model_parallel_world_size, 21 | get_tensor_model_parallel_rank, 22 | ) 23 | from .utils import split_tensor_along_last_dim 24 | 25 | 26 | def _reduce(input_): 27 | """All-reduce the the input tensor across model parallel group.""" 28 | 29 | # Bypass the function if we are using only 1 GPU. 30 | if get_tensor_model_parallel_world_size() == 1: 31 | return input_ 32 | 33 | # All-reduce. 
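    # all_reduce sums `input_` in place across the tensor model parallel
    # group, so every rank in the group ends up with the same reduced tensor.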
34 | torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group()) 35 | 36 | return input_ 37 | 38 | 39 | def _split(input_): 40 | """Split the tensor along its last dimension and keep the 41 | corresponding slice.""" 42 | 43 | world_size = get_tensor_model_parallel_world_size() 44 | # Bypass the function if we are using only 1 GPU. 45 | if world_size == 1: 46 | return input_ 47 | 48 | # Split along last dimension. 49 | input_list = split_tensor_along_last_dim(input_, world_size) 50 | 51 | # Note: torch.split does not create contiguous tensors by default. 52 | rank = get_tensor_model_parallel_rank() 53 | output = input_list[rank].contiguous() 54 | 55 | return output 56 | 57 | 58 | def _gather(input_): 59 | """Gather tensors and concatinate along the last dimension.""" 60 | 61 | world_size = get_tensor_model_parallel_world_size() 62 | # Bypass the function if we are using only 1 GPU. 63 | if world_size == 1: 64 | return input_ 65 | 66 | # Size and dimension. 67 | last_dim = input_.dim() - 1 68 | rank = get_tensor_model_parallel_rank() 69 | 70 | tensor_list = [torch.empty_like(input_) for _ in range(world_size)] 71 | tensor_list[rank] = input_ 72 | torch.distributed.all_gather( 73 | tensor_list, input_, group=get_tensor_model_parallel_group() 74 | ) 75 | 76 | # Note: torch.cat already creates a contiguous tensor. 77 | output = torch.cat(tensor_list, dim=last_dim).contiguous() 78 | 79 | return output 80 | 81 | 82 | class _CopyToModelParallelRegion(torch.autograd.Function): 83 | """Pass the input to the model parallel region.""" 84 | 85 | @staticmethod 86 | def symbolic(graph, input_): 87 | return input_ 88 | 89 | @staticmethod 90 | def forward(ctx, input_): 91 | return input_ 92 | 93 | @staticmethod 94 | def backward(ctx, grad_output): 95 | return _reduce(grad_output) 96 | 97 | 98 | class _ReduceFromModelParallelRegion(torch.autograd.Function): 99 | """All-reduce the input from the model parallel region.""" 100 | 101 | @staticmethod 102 | def symbolic(graph, input_): 103 | return _reduce(input_) 104 | 105 | @staticmethod 106 | def forward(ctx, input_): 107 | return _reduce(input_) 108 | 109 | @staticmethod 110 | def backward(ctx, grad_output): 111 | return grad_output 112 | 113 | 114 | class _ScatterToModelParallelRegion(torch.autograd.Function): 115 | """Split the input and keep only the corresponding chuck to the rank.""" 116 | 117 | @staticmethod 118 | def symbolic(graph, input_): 119 | return _split(input_) 120 | 121 | @staticmethod 122 | def forward(ctx, input_): 123 | return _split(input_) 124 | 125 | @staticmethod 126 | def backward(ctx, grad_output): 127 | return _gather(grad_output) 128 | 129 | 130 | class _GatherFromModelParallelRegion(torch.autograd.Function): 131 | """Gather the input from model parallel region and concatinate.""" 132 | 133 | @staticmethod 134 | def symbolic(graph, input_): 135 | return _gather(input_) 136 | 137 | @staticmethod 138 | def forward(ctx, input_): 139 | return _gather(input_) 140 | 141 | @staticmethod 142 | def backward(ctx, grad_output): 143 | return _split(grad_output) 144 | 145 | 146 | # ----------------- 147 | # Helper functions. 
148 | # ----------------- 149 | 150 | 151 | def copy_to_tensor_model_parallel_region(input_): 152 | return _CopyToModelParallelRegion.apply(input_) 153 | 154 | 155 | def reduce_from_tensor_model_parallel_region(input_): 156 | return _ReduceFromModelParallelRegion.apply(input_) 157 | 158 | 159 | def scatter_to_tensor_model_parallel_region(input_): 160 | return _ScatterToModelParallelRegion.apply(input_) 161 | 162 | 163 | def gather_from_tensor_model_parallel_region(input_): 164 | return _GatherFromModelParallelRegion.apply(input_) 165 | -------------------------------------------------------------------------------- /codegeex/megatron/mpu/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | import torch 18 | 19 | 20 | def ensure_divisibility(numerator, denominator): 21 | """Ensure that numerator is divisible by the denominator.""" 22 | assert numerator % denominator == 0, "{} is not divisible by {}".format( 23 | numerator, denominator 24 | ) 25 | 26 | 27 | def divide(numerator, denominator): 28 | """Ensure that numerator is divisible by the denominator and return 29 | the division value.""" 30 | ensure_divisibility(numerator, denominator) 31 | return numerator // denominator 32 | 33 | 34 | def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False): 35 | """Split a tensor along its last dimension. 36 | Arguments: 37 | tensor: input tensor. 38 | num_partitions: number of partitions to split the tensor 39 | contiguous_split_chunks: If True, make each chunk contiguous 40 | in memory. 41 | """ 42 | # Get the size and dimension. 43 | last_dim = tensor.dim() - 1 44 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 45 | # Split. 46 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 47 | # Note: torch.split does not create contiguous tensors by default. 
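    # Callers that need a chunk to own its memory (e.g. before reshaping it or
    # passing it to a collective) either request contiguous_split_chunks=True
    # here or call .contiguous() themselves, as _split() in mappings.py does.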
48 | if contiguous_split_chunks: 49 | return tuple(chunk.contiguous() for chunk in tensor_list) 50 | 51 | return tensor_list 52 | 53 | 54 | class VocabUtility: 55 | """Split the vocabulary into `world_size` chunks amd return the 56 | first and last index of the vocabulary belonging to the `rank` 57 | partition: Note that indecies in [fist, last)""" 58 | 59 | @staticmethod 60 | def vocab_range_from_per_partition_vocab_size( 61 | per_partition_vocab_size, rank, world_size 62 | ): 63 | index_f = rank * per_partition_vocab_size 64 | index_l = index_f + per_partition_vocab_size 65 | return index_f, index_l 66 | 67 | @staticmethod 68 | def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size): 69 | per_partition_vocab_size = divide(global_vocab_size, world_size) 70 | return VocabUtility.vocab_range_from_per_partition_vocab_size( 71 | per_partition_vocab_size, rank, world_size 72 | ) 73 | -------------------------------------------------------------------------------- /codegeex/megatron/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from apex.optimizers import FusedAdam as Adam 17 | from apex.optimizers import FusedSGD as SGD 18 | 19 | from codegeex.megatron import get_args 20 | from codegeex.megatron.model import LayerNorm 21 | 22 | from .grad_scaler import ConstantGradScaler, DynamicGradScaler 23 | from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer 24 | 25 | 26 | def _get_params_for_weight_decay_optimization(modules): 27 | """Divide params into with-weight-decay and without-weight-decay groups. 28 | Layernorms and baises will have no weight decay but the rest will. 
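    Returns a pair of parameter-group dicts, (weight_decay_params,
    no_weight_decay_params), that can be passed directly as parameter groups
    to the optimizers constructed below.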
29 | """ 30 | 31 | weight_decay_params = {"params": []} 32 | no_weight_decay_params = {"params": [], "weight_decay": 0.0} 33 | for module in modules: 34 | for module_ in module.modules(): 35 | if isinstance(module_, LayerNorm): 36 | no_weight_decay_params["params"].extend( 37 | [p for p in list(module_._parameters.values()) if p is not None] 38 | ) 39 | else: 40 | weight_decay_params["params"].extend( 41 | [ 42 | p 43 | for n, p in list(module_._parameters.items()) 44 | if p is not None and n != "bias" 45 | ] 46 | ) 47 | no_weight_decay_params["params"].extend( 48 | [ 49 | p 50 | for n, p in list(module_._parameters.items()) 51 | if p is not None and n == "bias" 52 | ] 53 | ) 54 | 55 | return weight_decay_params, no_weight_decay_params 56 | 57 | 58 | def get_megatron_optimizer(model): 59 | args = get_args() 60 | 61 | if args.cpu_optimizer: 62 | raise NotImplementedError("need to add cpu adam") 63 | 64 | param_groups = _get_params_for_weight_decay_optimization(model) 65 | 66 | if args.optimizer == "adam": 67 | optimizer = Adam( 68 | param_groups, 69 | lr=args.lr, 70 | weight_decay=args.weight_decay, 71 | betas=(args.adam_beta1, args.adam_beta2), 72 | eps=args.adam_eps, 73 | ) 74 | elif args.optimizer == "sgd": 75 | optimizer = SGD( 76 | param_groups, 77 | lr=args.lr, 78 | weight_decay=args.weight_decay, 79 | momentum=args.sgd_momentum, 80 | ) 81 | else: 82 | raise Exception("{} optimizer is not supported.".format(args.optimizer)) 83 | 84 | if args.deepspeed: 85 | return optimizer 86 | 87 | # Determine whether the params have main-grad field. 88 | params_have_main_grad = False 89 | if args.DDP_impl == "local": 90 | params_have_main_grad = True 91 | 92 | if args.fp16 or args.bf16: 93 | 94 | # Grad scaler: 95 | # if loss-scale is provided, instantiate the constant scaler. 96 | # if we are using fp16 and loss-scale is not present, use a 97 | # dynamic scaler. 98 | # otherwise we are running in bf16 with no loss-scale so 99 | # leave it as None. 100 | grad_scaler = None 101 | # Constant loss scale. 102 | if args.loss_scale: 103 | grad_scaler = ConstantGradScaler(args.loss_scale) 104 | # Dynamic loss scale. 105 | else: 106 | if args.fp16: 107 | grad_scaler = DynamicGradScaler( 108 | initial_scale=args.initial_loss_scale, 109 | min_scale=args.min_loss_scale, 110 | growth_factor=2.0, 111 | backoff_factor=0.5, 112 | growth_interval=args.loss_scale_window, 113 | hysteresis=args.hysteresis, 114 | ) 115 | 116 | # Megatron optimizer. 117 | return Float16OptimizerWithFloat16Params( 118 | optimizer, 119 | args.clip_grad, 120 | args.log_num_zeros_in_grad, 121 | params_have_main_grad, 122 | args.bf16, 123 | grad_scaler, 124 | ) 125 | 126 | # FP32. 127 | return FP32Optimizer( 128 | optimizer, args.clip_grad, args.log_num_zeros_in_grad, params_have_main_grad 129 | ) 130 | -------------------------------------------------------------------------------- /codegeex/megatron/optimizer/clip_grads.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Gradient clipping.""" 17 | 18 | import torch 19 | from torch._six import inf 20 | 21 | from apex.multi_tensor_apply import multi_tensor_applier 22 | import amp_C 23 | 24 | from codegeex.megatron import mpu 25 | from codegeex.megatron.model.module import param_is_not_shared 26 | from codegeex.megatron.mpu.layers import param_is_not_tensor_parallel_duplicate 27 | 28 | 29 | def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): 30 | """Clips gradient norm of an iterable of parameters whose gradients 31 | are in fp32. 32 | 33 | This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and 34 | added functionality to handle model parallel parameters. Note that 35 | the gradients are modified in place. 36 | 37 | Arguments: 38 | parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a 39 | single Tensor that will have gradients normalized 40 | max_norm (float or int): max norm of the gradients 41 | norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for 42 | infinity norm. 43 | 44 | Returns: 45 | Total norm of the parameters (viewed as a single vector). 46 | """ 47 | 48 | if isinstance(parameters, torch.Tensor): 49 | parameters = [parameters] 50 | 51 | # Filter parameters based on: 52 | # - grad should not be none 53 | # - parameter should not be shared 54 | # - should not be a replica due to tensor model parallelism 55 | grads = [] 56 | grads_for_norm = [] 57 | for param in parameters: 58 | grad_not_none = param.grad is not None 59 | is_not_shared = param_is_not_shared(param) 60 | is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) 61 | grad = param.grad.detach() 62 | if grad_not_none: 63 | # Make sure the grads are in fp32 64 | assert param.grad.type() == "torch.cuda.FloatTensor" 65 | grads.append(grad) 66 | if grad_not_none and is_not_shared and is_not_tp_duplicate: 67 | grads_for_norm.append(grad) 68 | 69 | # Norm parameters. 70 | max_norm = float(max_norm) 71 | norm_type = float(norm_type) 72 | total_norm = 0.0 73 | 74 | # Calculate norm. 75 | if norm_type == inf: 76 | total_norm = max(grad.abs().max() for grad in grads_for_norm) 77 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) 78 | # Take max across all model-parallel GPUs. 79 | torch.distributed.all_reduce( 80 | total_norm_cuda, 81 | op=torch.distributed.ReduceOp.MAX, 82 | group=mpu.get_model_parallel_group(), 83 | ) 84 | total_norm = total_norm_cuda[0].item() 85 | 86 | else: 87 | if norm_type == 2.0: 88 | dummy_overflow_buf = torch.cuda.IntTensor([0]) 89 | # Use apex's multi-tensor applier for efficiency reasons. 90 | # Multi-tensor applier takes a function and a list of list 91 | # and performs the operation on that list all in one kernel. 92 | grad_norm, _ = multi_tensor_applier( 93 | amp_C.multi_tensor_l2norm, 94 | dummy_overflow_buf, 95 | [grads_for_norm], 96 | False, # no per-parameter norm 97 | ) 98 | # Since we will be summing across data parallel groups, 99 | # we need the pow(norm-type). 
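            # (Raising the local norm to `norm_type` turns it back into a sum
            # of squares for the 2-norm, so the all-reduce below can add the
            # per-rank contributions before the final 1/norm_type root.)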
100 | total_norm = grad_norm ** norm_type 101 | 102 | else: 103 | for grad in grads_for_norm: 104 | grad_norm = torch.norm(grad, norm_type) 105 | total_norm += grad_norm ** norm_type 106 | 107 | # Sum across all model-parallel GPUs. 108 | torch.distributed.all_reduce( 109 | total_norm, 110 | op=torch.distributed.ReduceOp.SUM, 111 | group=mpu.get_model_parallel_group(), 112 | ) 113 | total_norm = total_norm.item() ** (1.0 / norm_type) 114 | 115 | # Scale. 116 | clip_coeff = max_norm / (total_norm + 1.0e-6) 117 | if clip_coeff < 1.0: 118 | dummy_overflow_buf = torch.cuda.IntTensor([0]) 119 | multi_tensor_applier( 120 | amp_C.multi_tensor_scale, dummy_overflow_buf, [grads, grads], clip_coeff 121 | ) 122 | 123 | return total_norm 124 | 125 | 126 | def count_zeros_fp32(parameters): 127 | 128 | if isinstance(parameters, torch.Tensor): 129 | parameters = [parameters] 130 | 131 | # Filter parameters based on: 132 | # - grad should not be none 133 | # - parameter should not be shared 134 | # - should not be a replica due to tensor model parallelism 135 | total_num_zeros = 0.0 136 | for param in parameters: 137 | grad_not_none = param.grad is not None 138 | is_not_shared = param_is_not_shared(param) 139 | is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param) 140 | if grad_not_none and is_not_shared and is_not_tp_duplicate: 141 | grad = param.grad.detach() 142 | num_zeros = grad.numel() - torch.count_nonzero(grad) 143 | total_num_zeros = num_zeros + total_num_zeros 144 | 145 | # Sum across all model-parallel GPUs. 146 | torch.distributed.all_reduce( 147 | total_num_zeros, 148 | op=torch.distributed.ReduceOp.SUM, 149 | group=mpu.get_model_parallel_group(), 150 | ) 151 | total_num_zeros = total_num_zeros.item() 152 | 153 | return total_num_zeros 154 | -------------------------------------------------------------------------------- /codegeex/megatron/optimizer/grad_scaler.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
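As a concrete illustration of the clipping step in clip_grads.py above (made-up numbers, not values from the repository): with max_norm = 1.0 and a measured total_norm of 20.0, clip_coeff = 1.0 / (20.0 + 1e-6) ≈ 0.05, so every gradient is scaled by roughly 0.05 and the resulting global norm becomes ≈ 1.0; when total_norm is already below max_norm, clip_coeff ≥ 1.0 and the gradients are left unchanged.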
15 | 16 | """Megatron grad scaler.""" 17 | 18 | from abc import ABC 19 | from abc import abstractmethod 20 | 21 | import torch 22 | 23 | 24 | class MegatronGradScaler(ABC): 25 | def __init__(self, initial_scale): 26 | """Initialize scale value with the input initial scale.""" 27 | assert initial_scale > 0.0 28 | self._scale = torch.cuda.FloatTensor([initial_scale]) 29 | 30 | @property 31 | def scale(self): 32 | return self._scale 33 | 34 | @property 35 | def inv_scale(self): 36 | return self._scale.double().reciprocal().float() 37 | 38 | @abstractmethod 39 | def update(self, found_inf): 40 | pass 41 | 42 | @abstractmethod 43 | def state_dict(self): 44 | pass 45 | 46 | @abstractmethod 47 | def load_state_dict(self, state_dict): 48 | pass 49 | 50 | 51 | class ConstantGradScaler(MegatronGradScaler): 52 | def update(self, found_inf): 53 | pass 54 | 55 | def state_dict(self): 56 | return dict() 57 | 58 | def load_state_dict(self, state_dict): 59 | pass 60 | 61 | 62 | class DynamicGradScaler(MegatronGradScaler): 63 | def __init__( 64 | self, 65 | initial_scale, 66 | min_scale, 67 | growth_factor, 68 | backoff_factor, 69 | growth_interval, 70 | hysteresis, 71 | ): 72 | """ "Grad scaler with dynamic scale that gets adjusted 73 | during training.""" 74 | super(DynamicGradScaler, self).__init__(initial_scale) 75 | 76 | # Lower bound on the scale. 77 | assert min_scale > 0.0 78 | assert min_scale <= initial_scale 79 | self.min_scale = torch.cuda.FloatTensor([min_scale]) 80 | # Growth and backoff factors for the scale. 81 | assert growth_factor > 1.0 82 | self.growth_factor = torch.cuda.FloatTensor([growth_factor]) 83 | assert backoff_factor < 1.0 84 | assert backoff_factor > 0.0 85 | self.backoff_factor = torch.cuda.FloatTensor([backoff_factor]) 86 | # Interval over which if we don't see any inf/nan, 87 | # we will scale the grad scale by the growth factor. 88 | assert growth_interval > 0 89 | self.growth_interval = growth_interval 90 | # Number of inf/nans we should see before scaling down 91 | # the grad scale by the backoff factor. 92 | assert hysteresis > 0 93 | self.hysteresis = hysteresis 94 | 95 | # Trackers. 96 | self._growth_tracker = 0 97 | self._hysteresis_tracker = self.hysteresis 98 | 99 | def update(self, found_inf): 100 | 101 | # If we have an inf/nan, growth tracker is set to 0 102 | # and hysterisis tracker is reduced by 1. 103 | if found_inf: 104 | self._growth_tracker = 0 105 | self._hysteresis_tracker -= 1 106 | # Now if we are out of hysteresis count, scale down the loss. 107 | if self._hysteresis_tracker <= 0: 108 | self._scale = torch.max( 109 | self._scale * self.backoff_factor, self.min_scale 110 | ) 111 | else: 112 | # If there is no nan/inf, increment the growth tracker. 113 | self._growth_tracker += 1 114 | # If we have had enough consequitive intervals with no nan/inf: 115 | if self._growth_tracker == self.growth_interval: 116 | # Reset the tracker and hysteresis trackers, 117 | self._growth_tracker = 0 118 | self._hysteresis_tracker = self.hysteresis 119 | # and scale up the loss scale. 
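                # (With the values chosen in optimizer/__init__.py this doubles
                # the scale after `growth_interval` clean steps, while repeated
                # overflows multiply it by backoff_factor, floored at min_scale.)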
120 | self._scale = self._scale * self.growth_factor 121 | 122 | def state_dict(self): 123 | state_dict = {} 124 | state_dict["scale"] = self._scale 125 | state_dict["growth_tracker"] = self._growth_tracker 126 | state_dict["hysteresis_tracker"] = self._hysteresis_tracker 127 | return state_dict 128 | 129 | def load_state_dict(self, state_dict): 130 | self._scale = state_dict["scale"].cuda(torch.cuda.current_device()) 131 | self._growth_tracker = state_dict["growth_tracker"] 132 | self._hysteresis_tracker = state_dict["hysteresis_tracker"] 133 | -------------------------------------------------------------------------------- /codegeex/megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /codegeex/megatron/tools/collect_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | ENV_NAMES = ["CUDA_HOME", "LD_LIBRARY_PATH", "PATH", "TORCH_EXTENSIONS_DIR", "CUDA_LAUNCH_BLOCKING"] 5 | 6 | 7 | def main(): 8 | s = "" 9 | for name in ENV_NAMES: 10 | if name in os.environ: 11 | value = os.environ[name] 12 | s += "{}={}\n".format(name, value) 13 | print(f"{name}={value}") 14 | else: 15 | print(f"{name} is not set") 16 | 17 | # write env vars to .deepspeed_env 18 | with open(".deepspeed_env", "w") as f: 19 | f.write(s) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=1 9 | 10 | python ${script_dir}/../train.py \ 11 | --distribute true \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type train \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --ckpt_name_prefix code-13B \ 23 | --save_checkpoint=True \ 24 | --save_checkpoint_path /cache/ckpts \ 25 | --save_checkpoint_obs_path \ # TODO: set to obs path for saving ckpts 26 | --save_checkpoint_steps 250 \ 27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt 29 | --per_batch_size 16 \ 30 | --dropout_rate 0.1 \ 31 | --full_batch 0 \ 32 | --epoch_size 1 \ 33 | --micro_interleaved_size 1 \ 34 | --profiling 0 \ 35 | --tb_dir $LOG_PATH -------------------------------------------------------------------------------- 
/codegeex/mindspore/configs/13B_128p_save_1p.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | script_path=$(realpath $BASH_SOURCE) 3 | script_dir=$(dirname $script_path) 4 | 5 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 6 | 7 | GAS=32 8 | 9 | python ${script_dir}/../save_1p_ckpt_from_8p_ckpt.py \ 10 | --distribute true \ 11 | --run_type train \ 12 | --train_and_eval_mode 0 \ 13 | --mode 13B \ 14 | --code_data $CODE_DATA_DIR \ 15 | --param_init_type fp32 \ 16 | --micro_size $GAS \ 17 | --seq_length 2048 \ 18 | --vocab_size 51200 \ 19 | --ckpt_name_prefix code-13B \ 20 | --save_checkpoint=True \ 21 | --save_checkpoint_path /cache/ckpts \ 22 | --save_checkpoint_obs_path \ # TODO: set to obs path for saving ckpts 23 | --save_checkpoint_steps \ # TODO: set to epoch number of loaded ckpt 24 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 25 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps 26 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 27 | --per_batch_size 16 \ 28 | --full_batch 0 \ 29 | --epoch_size 1 \ 30 | --micro_interleaved_size 1 \ 31 | --profiling 0 \ 32 | --tb_dir $LOG_PATH 33 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_128p_save_8p_ckpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../save_8p_ckpt.py \ 11 | --distribute true \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type train \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --ckpt_name_prefix code-13B \ 23 | --save_checkpoint=True \ 24 | --save_checkpoint_path /cache/ckpts \ 25 | --save_checkpoint_obs_path \ # TODO: set to obs path for saving ckpts 26 | --save_checkpoint_steps 99999 \ 27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps 29 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 30 | --per_batch_size 16 \ 31 | --full_batch 0 \ 32 | --epoch_size 1 \ 33 | --micro_interleaved_size 1 \ 34 | --profiling 0 \ 35 | --tb_dir $LOG_PATH -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_1p_to_torch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../convertion_1p.py \ 11 | --distribute false \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type predict \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --ckpt_name_prefix code-13B \ 23 | --save_checkpoint=True \ 24 | --save_checkpoint_path /cache/ckpts \ 25 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 26 
| --per_batch_size 1 \ 27 | --full_batch 1 \ 28 | --epoch_size 1 \ 29 | --micro_interleaved_size 1 \ 30 | --profiling 0 \ 31 | --use_past "true" \ 32 | --top_p 0.95 \ 33 | --top_k_num 100 \ 34 | --temperature 0.8 \ 35 | --op_level_model_parallel_num 1 \ 36 | --frequency_penalty 0.0 \ 37 | --presence_penalty 0.0 \ 38 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 39 | --tb_dir $LOG_PATH 40 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=1 9 | 10 | python ${script_dir}/../../finetune.py \ 11 | --distribute true \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type train \ 15 | --train_and_eval_mode 1 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --ckpt_name_prefix code-13B \ 23 | --save_checkpoint=True \ 24 | --save_checkpoint_path /cache/ckpts \ 25 | --save_checkpoint_obs_path \ # TODO: set to obs path for saving ckpts 26 | --save_checkpoint_steps 20 \ 27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt 29 | --per_batch_size 16 \ 30 | --dropout_rate 0.1 \ 31 | --full_batch 0 \ 32 | --epoch_size 5 \ 33 | --micro_interleaved_size 1 \ 34 | --profiling 0 \ 35 | --tb_dir $LOG_PATH \ -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../generation.py \ 11 | --distribute true \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type predict \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --ckpt_name_prefix code-13B \ 23 | --save_checkpoint=True \ 24 | --save_checkpoint_path /cache/ckpts \ 25 | --save_checkpoint_obs_path /home \ # TODO: set at will 26 | --save_checkpoint_steps 99999 \ # TODO: set at will 27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps 29 | --per_batch_size 1 \ 30 | --full_batch 1 \ 31 | --epoch_size 1 \ 32 | --micro_interleaved_size 1 \ 33 | --profiling 0 \ 34 | --use_past "true" \ 35 | --top_p 0.95 \ 36 | --top_k_num 100 \ 37 | --temperature 0.8 \ 38 | --frequency_penalty 0.0 \ 39 | --presence_penalty 0.0 \ 40 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 41 | --tb_dir $LOG_PATH 42 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_generate_1p.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set 
the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../generation_1p.py \ 11 | --distribute false \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type predict \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp16 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --ckpt_name_prefix code-13B \ 23 | --save_checkpoint=True \ 24 | --save_checkpoint_path /cache/ckpts \ 25 | --save_checkpoint_obs_path /home \ # TODO: set at will 26 | --save_checkpoint_steps 99999 \ # TODO: set at will 27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps 29 | --per_batch_size 1 \ 30 | --full_batch 1 \ 31 | --epoch_size 1 \ 32 | --micro_interleaved_size 1 \ 33 | --profiling 0 \ 34 | --use_past "true" \ 35 | --top_p 0.95 \ 36 | --top_k_num 100 \ 37 | --temperature 0.8 \ 38 | --op_level_model_parallel_num 1 \ 39 | --frequency_penalty 0.0 \ 40 | --presence_penalty 0.0 \ 41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 42 | --tb_dir $LOG_PATH 43 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_generate_1p_values.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../generation_values_1p.py \ 11 | --distribute false \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type predict \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp16 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --ckpt_name_prefix code-13B \ 23 | --save_checkpoint=True \ 24 | --save_checkpoint_path /cache/ckpts \ 25 | --save_checkpoint_obs_path /home \ # TODO: set at will 26 | --save_checkpoint_steps 213000 \ # TODO: set at will 27 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 28 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt 29 | --per_batch_size 1 \ 30 | --full_batch 1 \ 31 | --epoch_size 1 \ 32 | --micro_interleaved_size 1 \ 33 | --profiling 0 \ 34 | --use_past "false" \ 35 | --top_p 0.95 \ 36 | --top_k_num 100 \ 37 | --temperature 0.8 \ 38 | --op_level_model_parallel_num 1 \ 39 | --frequency_penalty 0.0 \ 40 | --presence_penalty 0.0 \ 41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 42 | --tb_dir $LOG_PATH 43 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_generate_finetune.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../generation_finetune.py \ 11 | --distribute true \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type predict \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --max_generate_length 1024 \ 23 | 
--ckpt_name_prefix code-13B \ 24 | --save_checkpoint=True \ 25 | --save_checkpoint_path /cache/ckpts \ 26 | --save_checkpoint_obs_path /home \ # TODO: set at will 27 | --save_checkpoint_steps 99999 \ # TODO: set at will 28 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 29 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps 30 | --per_batch_size 6 \ 31 | --full_batch 1 \ 32 | --epoch_size 1 \ 33 | --micro_interleaved_size 1 \ 34 | --profiling 0 \ 35 | --use_past "true" \ 36 | --top_p 0.95 \ 37 | --top_k_num 100 \ 38 | --temperature 0.2 \ 39 | --frequency_penalty 0.0 \ 40 | --presence_penalty 0.0 \ 41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 42 | --tb_dir $LOG_PATH \ 43 | --language $LANGUAGE 44 | 45 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_generate_humaneval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../generation_humaneval.py \ 11 | --distribute true \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type predict \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --max_generate_length 1024 \ 23 | --ckpt_name_prefix code-13B \ 24 | --save_checkpoint=True \ 25 | --save_checkpoint_path /cache/ckpts \ 26 | --save_checkpoint_obs_path /home \ # TODO: set at will 27 | --save_checkpoint_steps 99999 \ # TODO: set at will 28 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 29 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as save_checkpoint_steps 30 | --per_batch_size 6 \ 31 | --full_batch 1 \ 32 | --epoch_size 1 \ 33 | --micro_interleaved_size 1 \ 34 | --profiling 0 \ 35 | --use_past "true" \ 36 | --top_p 0.95 \ 37 | --top_k_num 100 \ 38 | --temperature 0.8 \ 39 | --frequency_penalty 0.0 \ 40 | --presence_penalty 0.0 \ 41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 42 | --tb_dir $LOG_PATH \ 43 | --part $PART 44 | 45 | -------------------------------------------------------------------------------- /codegeex/mindspore/configs/13B_generate_values.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $BASH_SOURCE) 4 | script_dir=$(dirname $script_path) 5 | 6 | CODE_DATA_DIR="" # TODO: set the path to the code data directory 7 | 8 | GAS=32 9 | 10 | python ${script_dir}/../generation_values.py \ 11 | --distribute true \ 12 | --device_num $RANK_SIZE \ 13 | --sink_size 2 \ 14 | --run_type predict \ 15 | --train_and_eval_mode 0 \ 16 | --mode 13B \ 17 | --code_data $CODE_DATA_DIR \ 18 | --param_init_type fp32 \ 19 | --micro_size $GAS \ 20 | --seq_length 2048 \ 21 | --vocab_size 51200 \ 22 | --max_generate_length 2048 \ 23 | --ckpt_name_prefix code-13B \ 24 | --save_checkpoint=True \ 25 | --save_checkpoint_path /cache/ckpts \ 26 | --save_checkpoint_obs_path /home \ # TODO: set at will 27 | --save_checkpoint_steps 99999 \ # TODO: set at will 28 | --load_ckpt_path \ # TODO: set to obs path for loading ckpt 29 | --load_ckpt_epoch \ # TODO: set to epoch number of loaded ckpt, same as 
save_checkpoint_steps 30 | --per_batch_size 6 \ 31 | --full_batch 1 \ 32 | --epoch_size 1 \ 33 | --micro_interleaved_size 1 \ 34 | --profiling 0 \ 35 | --use_past "true" \ 36 | --top_p 0.95 \ 37 | --top_k_num 100 \ 38 | --temperature 1.0 \ 39 | --frequency_penalty 0.0 \ 40 | --presence_penalty 0.0 \ 41 | --strategy_load_ckpt_path "/home/work/user-job-dir/start_1.6/strategy.ckpt" \ 42 | --tb_dir $LOG_PATH 43 | -------------------------------------------------------------------------------- /codegeex/mindspore/scripts/custom_tune_bank_new/Ascend910ProA/cube/repository_ascend910ProA_matmul.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/mindspore/scripts/custom_tune_bank_new/Ascend910ProA/cube/repository_ascend910ProA_matmul.bin -------------------------------------------------------------------------------- /codegeex/mindspore/scripts/ma-pre-start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ~/.bashrc 4 | echo "Start to intall the run package" 5 | WORK_DIR=start_1.7 6 | RUN_DIR=run 7 | mindspore_file=mindspore_ascend-1.7.0-cp37-cp37m-linux_aarch64.whl 8 | LOCAL_DIR=$(cd "$(dirname "$0")";pwd) 9 | echo $LOCAL_DIR 10 | 11 | echo "===current dir=" 12 | ls ./${WORK_DIR}/${RUN_DIR} 13 | 14 | pip install ./${WORK_DIR}/${mindspore_file} -i http://100.125.33.126:8888/repository/pypi/simple --trusted-host=100.125.33.126 15 | sudo chmod +755 -R /usr/local/Ascend/nnae 16 | sudo rm -rf /usr/local/Ascend/nnae 17 | 18 | sudo chmod +x ./${WORK_DIR}/${RUN_DIR}/*.run 19 | sudo bash ./${WORK_DIR}/${RUN_DIR}/Ascend* --full --quiet 20 | 21 | export HCCL_CONNECT_TIMEOUT=1800 # 通信建链最长等待时间,单位s 22 | 23 | echo "======/usr/local/Ascend======" 24 | ls -al /usr/local/Ascend 25 | echo "======/usr/local/Ascend/ascend-toolkit/======" 26 | ls -al /usr/local/Ascend/ascend-toolkit/ 27 | echo "======/usr/local/Ascend/ascend-toolkit/latest======" 28 | ls -al /usr/local/Ascend/ascend-toolkit/latest 29 | echo "======/usr/local/Ascend/driver/lib64========" 30 | ls -al /usr/local/Ascend/driver/lib64 31 | echo "======/usr/local/Ascend/driver/lib64/common=======" 32 | ls -al /usr/local/Ascend/driver/lib64/common 33 | echo "=======/usr/local/Ascend/driver/lib64/driver=======" 34 | ls -al /usr/local/Ascend/driver/lib64/driver 35 | echo "============/usr/local/Ascend/ascend-toolkit/5.1.RC1=============" 36 | ls -al /usr/local/Ascend/ascend-toolkit/5.1.RC1 37 | sudo mkdir /usr/local/Ascend/nnae 38 | sudo chmod +755 -R /usr/local/Ascend/nnae 39 | #sudo mkdir /usr/local/Ascend/nnae/latest 40 | #sudo chmod +755 -R /usr/local/Ascend/nnae/latest 41 | sudo ln -s /usr/local/Ascend/ascend-toolkit/5.1.RC1 /usr/local/Ascend/nnae/latest 42 | echo "======/usr/local/Ascend/nnae======" 43 | ls -al /usr/local/Ascend/nnae 44 | echo "======/usr/local/Ascend/nnae/latest======" 45 | ls -al /usr/local/Ascend/nnae/latest 46 | echo "======/usr/local/Ascend/nnae/latest/lib64/libhccl.so======" 47 | ls -al /usr/local/Ascend/nnae/latest/lib64/libhccl.so 48 | 49 | # sudo cp -fp ${LOCAL_DIR}/${WORK_DIR}/libhccl.so /usr/local/Ascend/nnae/latest/lib64/libhccl.so 50 | echo "======/usr/local/Ascend/nnae/latest/lib64/libhccl.so======" 51 | ls -al /usr/local/Ascend/nnae/latest/lib64/libhccl.so 52 | 53 | echo "======/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py======" 54 | ls -al 
/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py 55 | 56 | echo "======/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py======" 57 | ls -al /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm_x_backprop_v2.py 58 | 59 | 60 | sudo cp -fp ${LOCAL_DIR}/${WORK_DIR}/layer_norm.py /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py 61 | sudo cp -fp ${LOCAL_DIR}/${WORK_DIR}/layer_norm_x_backprop_v2.py /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm_x_backprop_v2.py 62 | 63 | chmod +777 /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm_x_backprop_v2.py 64 | chmod +777 /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py 65 | 66 | echo "======/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py====new==" 67 | ls -al /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py 68 | 69 | echo "======/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm.py====new==" 70 | ls -al /usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/impl/layer_norm_x_backprop_v2.py 71 | 72 | ls -al ${LOCAL_DIR}/${WORK_DIR}/custom_tune_bank_new 73 | 74 | export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH 75 | export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/plugin/nnengine:$LD_LIBRARY_PATH 76 | export PATH=/usr/local/Ascend/ascend-toolkit/latest/bin:/usr/local/Ascend/ascend-toolkit/latest/compiler/ccec_compiler/bin:$PATH 77 | export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest 78 | export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp 79 | export TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit 80 | export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest:$ASCEND_HOME_PATH 81 | 82 | echo "-------------------uninstall te topi and hccl--------------------------" 83 | sudo pip uninstall te -y 84 | sudo pip uninstall topi -y 85 | sudo pip uninstall hccl -y 86 | echo "-------------------install te topi and hccl--------------------------" 87 | pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/topi-0.4.0-py3-none-any.whl 88 | pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/te-0.4.0-py3-none-any.whl 89 | pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/hccl-0.1.0-py3-none-any.whl 90 | pip install /usr/local/Ascend/ascend-toolkit/latest/tools/hccl_parser-0.1-py3-none-any.whl 91 | 92 | 93 | export GLOG_v=3 # mindspore日志开关,1:Info, 2:Warning, 3:Error 94 | export ASCEND_GLOBAL_LOG_LEVEL=3 # 底层软件的日志级别开关 1:Info, 2:Warning, 3:Error 95 | export ASCEND_GLOBAL_EVENT_ENABLE=1 # 底层软件的日志event日志开关 0:disable, 1:enable 96 | export ASCEND_SLOG_PRINT_TO_STDOUT=0 # 是否把底层日志重定向到打屏,0:disable, 1:enable 97 | 98 | export ENABLE_TUNE_BANK=True 99 | export TUNE_BANK_PATH=${LOCAL_DIR}/${WORK_DIR}/custom_tune_bank_new 100 | 101 | env 102 | 103 | mkdir -p /cache/ckpts 104 | mkdir -p /home/work/sfs/cache/${BATCH_JOB_ID}/1 105 | mkdir -p /home/work/sfs/cache/${BATCH_JOB_ID}/2 106 | 107 | sudo chmod +777 -R /cache/ckpts 108 | sudo chmod +777 -R /home/work/sfs/cache/${BATCH_JOB_ID} 109 | 110 | export 
GROUP_INFO_FILE=/home/work/sfs/cache/${BATCH_JOB_ID}/group_info_file.pb 111 | -------------------------------------------------------------------------------- /codegeex/mindspore/scripts/run_modelarts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | from pathlib import Path 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--work_dir", type=str, required=True) 8 | parser.add_argument("--script", type=str, required=True) 9 | parser.add_argument("--data_url", type=str, default=None) 10 | parser.add_argument("--train_url", type=str, default=None) 11 | 12 | args = parser.parse_args() 13 | 14 | log_path = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID"), f'device{os.environ.get("RANK_ID")}') 15 | tb_path = os.path.join(args.work_dir, "runs", os.environ.get("JOB_ID")) 16 | 17 | Path(log_path).mkdir(parents=True, exist_ok=True) 18 | Path(tb_path).mkdir(parents=True, exist_ok=True) 19 | 20 | log_path_prefix_1 = os.path.join(args.work_dir, "logs") 21 | 22 | os.environ["LOG_PATH"] = tb_path 23 | 24 | print("=================RANK_TABLE_FILE: ", os.environ["RANK_TABLE_FILE"], flush=True) 25 | print("=================ms import done", flush=True) 26 | time.sleep(10) 27 | os.system( 28 | "cp /home/work/rank_table/jobstart_hccl.json /home/work/sfs/xx; sudo chmod +777 /home/work/rank_table/jobstart_hccl.json") 29 | ret = os.system(f"cd {log_path} && bash {args.script} 2>&1 | tee output.log") 30 | if os.environ.get("RANK_ID") == 0: 31 | log_dir = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID")) 32 | os.system(f"sudo chmod +777 -R {tb_path}") 33 | os.system(f"sudo chmod +777 -R {log_dir}") 34 | print("==========ret code is: ", ret, flush=True) 35 | if ret != 0: 36 | raise RuntimeError("ret code is :" + str(ret)) 37 | -------------------------------------------------------------------------------- /codegeex/mindspore/scripts/run_modelarts_gen_finetune.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | from pathlib import Path 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--work_dir", type=str, required=True) 8 | parser.add_argument("--script", type=str, required=True) 9 | parser.add_argument("--data_url", type=str, default=None) 10 | parser.add_argument("--train_url", type=str, default=None) 11 | parser.add_argument("--language", type=str, default=None) 12 | 13 | args = parser.parse_args() 14 | 15 | log_path = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID"), f'device{os.environ.get("RANK_ID")}') 16 | tb_path = os.path.join(args.work_dir, "runs", os.environ.get("JOB_ID")) 17 | 18 | Path(log_path).mkdir(parents=True, exist_ok=True) 19 | Path(tb_path).mkdir(parents=True, exist_ok=True) 20 | 21 | log_path_prefix_1 = os.path.join(args.work_dir, "logs") 22 | 23 | os.environ["LOG_PATH"] = tb_path 24 | if args.language is not None: 25 | os.environ["LANGUAGE"] = args.language 26 | else: 27 | os.environ["LANGUAGE"] = "Null" 28 | 29 | print("=================RANK_TABLE_FILE: ", os.environ["RANK_TABLE_FILE"], flush=True) 30 | print("=================ms import done", flush=True) 31 | time.sleep(10) 32 | os.system( 33 | "cp /home/work/rank_table/jobstart_hccl.json /home/work/sfs/xx; sudo chmod +777 /home/work/rank_table/jobstart_hccl.json") 34 | ret = os.system(f"cd {log_path} && bash {args.script} 2>&1 | tee output.log") 35 | if os.environ.get("RANK_ID") == 0: 36 | 
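# Annotation (not in the original source): os.environ.get("RANK_ID") returns a string (or None),
# so the comparison with the integer 0 in the line above is never True and the chmod block below
# is effectively skipped on every rank; comparing against the string "0" appears to be the intent.
# The same pattern occurs in run_modelarts.py and run_modelarts_gen_humaneval_x.py.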
log_dir = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID")) 37 | os.system(f"sudo chmod +777 -R {tb_path}") 38 | os.system(f"sudo chmod +777 -R {log_dir}") 39 | print("==========ret code is: ", ret, flush=True) 40 | if ret != 0: 41 | raise RuntimeError("ret code is :" + str(ret)) 42 | -------------------------------------------------------------------------------- /codegeex/mindspore/scripts/run_modelarts_gen_humaneval_x.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | from pathlib import Path 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("--work_dir", type=str, required=True) 8 | parser.add_argument("--script", type=str, required=True) 9 | parser.add_argument("--data_url", type=str, default=None) 10 | parser.add_argument("--train_url", type=str, default=None) 11 | parser.add_argument("--part", type=str, default=None) 12 | 13 | args = parser.parse_args() 14 | 15 | log_path = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID"), f'device{os.environ.get("RANK_ID")}') 16 | tb_path = os.path.join(args.work_dir, "runs", os.environ.get("JOB_ID")) 17 | 18 | Path(log_path).mkdir(parents=True, exist_ok=True) 19 | Path(tb_path).mkdir(parents=True, exist_ok=True) 20 | 21 | log_path_prefix_1 = os.path.join(args.work_dir, "logs") 22 | 23 | os.environ["LOG_PATH"] = tb_path 24 | if args.part is not None: 25 | os.environ["PART"] = args.part 26 | else: 27 | os.environ["PART"] = "-1" 28 | 29 | print("=================RANK_TABLE_FILE: ", os.environ["RANK_TABLE_FILE"], flush=True) 30 | print("=================ms import done", flush=True) 31 | time.sleep(10) 32 | os.system( 33 | "cp /home/work/rank_table/jobstart_hccl.json /home/work/sfs/xx; sudo chmod +777 /home/work/rank_table/jobstart_hccl.json") 34 | ret = os.system(f"cd {log_path} && bash {args.script} 2>&1 | tee output.log") 35 | if os.environ.get("RANK_ID") == 0: 36 | log_dir = os.path.join(args.work_dir, "logs", os.environ.get("JOB_ID")) 37 | os.system(f"sudo chmod +777 -R {tb_path}") 38 | os.system(f"sudo chmod +777 -R {log_dir}") 39 | print("==========ret code is: ", ret, flush=True) 40 | if ret != 0: 41 | raise RuntimeError("ret code is :" + str(ret)) 42 | -------------------------------------------------------------------------------- /codegeex/mindspore/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/codegeex/mindspore/src/__init__.py -------------------------------------------------------------------------------- /codegeex/mindspore/src/code_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | import numpy as np 4 | from transformers import AutoTokenizer 5 | from transformers.models.gpt2 import GPT2TokenizerFast 6 | 7 | 8 | def encode_whitespaces(text, start_extra_id: int, max_len: int): 9 | """ Encode whitespaces to extra tokens in GPT-J. 
10 | 11 | >>> encode_whitespaces('a\\n b\\n c', 10, 10) 12 | 'a\\n<|extratoken_10|>b\\n<|extratoken_11|>c' 13 | """ 14 | 15 | def push_acc_space(acc_len: int, text: str): 16 | if acc_len == 0: 17 | return text 18 | if acc_len == 1: 19 | return text + ' ' 20 | assert acc_len <= max_len, f'Max whitespace run length {max_len}, but found {acc_len}' 21 | extra_id = start_extra_id - 2 + acc_len 22 | extra_token = f'<|extratoken_{extra_id}|>' 23 | return text + extra_token 24 | 25 | acc_len = 0 26 | res = '' 27 | for ch in text: 28 | if ch == ' ': 29 | acc_len += 1 30 | if acc_len == max_len: 31 | res = push_acc_space(acc_len, res) 32 | acc_len = 0 33 | else: 34 | res = push_acc_space(acc_len, res) 35 | acc_len = 0 36 | res = res + ch 37 | 38 | res = push_acc_space(acc_len, res) 39 | 40 | return res 41 | 42 | 43 | def decode_whitespaces(text: str, start_extra_id: int, max_len: int): 44 | """ Decode the whitespace-encoded strings produced by encode_whitespace. 45 | 46 | >>> text = 'a\\n b\\n c' 47 | >>> s, l = 10, 10 48 | >>> text == decode_whitespaces(encode_whitespaces(text, s, l), s, l) 49 | True 50 | """ 51 | for l in range(2, max_len + 1): 52 | token_id = start_extra_id - 2 + l 53 | token = f'<|extratoken_{token_id}|>' 54 | text = text.replace(token, ' ' * l) 55 | return text 56 | 57 | 58 | class Code13BDictionary(object): 59 | def __init__( 60 | self, 61 | dict_file: str, 62 | extra_token_ids: List[str] = None, 63 | pad_to_vocab_size: int = -1, 64 | ): 65 | self._idx = dict() 66 | self._count = dict() 67 | self._num_symbols = 0 68 | self._symbols = [] 69 | 70 | self._add_symbol("", 0) 71 | self._add_symbol("", 0) 72 | self._add_symbol("", 0) 73 | self._add_symbol("", 0) 74 | self._load_dict(dict_file) 75 | 76 | if extra_token_ids is None: 77 | extra_token_ids = [ 78 | str(x) for x in range(50257, 50400) 79 | ] # follows GPT-J settings 80 | 81 | for token_id in extra_token_ids: 82 | self._add_symbol(token_id, 0) 83 | 84 | if pad_to_vocab_size > 0: 85 | self._pad_to_vocab_size(pad_to_vocab_size) 86 | 87 | def _pad_to_vocab_size(self, vocab_size: int): 88 | num_pad = vocab_size - len(self) 89 | if num_pad <= 0: 90 | return 91 | for i in range(1, num_pad + 1): 92 | self._add_symbol("vocab_pad_token{}".format(i), 0) 93 | 94 | def _load_dict(self, dict_file: str): 95 | with open(dict_file, "r") as f: 96 | for line in f: 97 | line = line.strip() 98 | if line == "" or line.startswith("#"): 99 | continue 100 | sym, count = line.split() 101 | self._add_symbol(sym, int(count)) 102 | 103 | def _add_symbol(self, sym: str, count: int): 104 | self._idx[sym] = self._num_symbols 105 | self._count[sym] = count 106 | self._symbols.append(sym) 107 | self._num_symbols += 1 108 | 109 | def __len__(self): 110 | return self._num_symbols 111 | 112 | def index(self, sym: str): 113 | return self._idx[sym] 114 | 115 | def string(self, idx: int): 116 | return self._symbols[idx] 117 | 118 | def map_token(self, token: Union[int, str]): 119 | if isinstance(token, int): 120 | token = str(token) 121 | return self.index(token) 122 | 123 | def map_tokens(self, tokens): 124 | return [self.map_token(token) for token in tokens] 125 | 126 | def decode_tokens(self, tokens): 127 | decoded = [self.string(token) for token in tokens] 128 | return [int(x) for x in decoded if not x.startswith("vocab_pad_token")] 129 | 130 | 131 | class CodeTokenizer(object): 132 | def __init__( 133 | self, 134 | tokenizer: GPT2TokenizerFast = None, 135 | start_extra_id: int = 10, 136 | max_len: int = 10, 137 | mode='13b', 138 | dict_file: str = None, 139 
| ): 140 | self.tokenizer = tokenizer if tokenizer is not None else AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B") 141 | if mode not in ['6b', '13b']: 142 | raise ValueError(f"Invalid mode {mode}, choose from ['6b', '13b']") 143 | self.start_extra_id = start_extra_id 144 | self.max_len = max_len 145 | self.mode = mode 146 | self.code_dict = Code13BDictionary(dict_file, pad_to_vocab_size=51200) if self.mode == '13b' else None 147 | self.eos_token_id = self.tokenizer.eos_token_id 148 | 149 | def encode_code(self, code: str): 150 | if self.mode == '6b': 151 | code = encode_whitespaces(code, self.start_extra_id, self.max_len) 152 | input_ids = self.tokenizer(code).input_ids 153 | 154 | elif self.mode == '13b': 155 | code = encode_whitespaces(code, self.start_extra_id, self.max_len) 156 | input_ids = self.code_dict.map_tokens(self.tokenizer.encode(code)) 157 | input_ids = np.array(input_ids, dtype=np.int64).reshape(1, -1) 158 | 159 | return input_ids 160 | 161 | def decode_code(self, input_ids): 162 | if self.mode == '6b': 163 | texts = self.tokenizer.batch_decode(input_ids) 164 | output_code = [decode_whitespaces(text, self.start_extra_id, self.max_len) for text in texts] 165 | 166 | elif self.mode == '13b': 167 | input_ids = [self.code_dict.decode_tokens(input_ids.tolist()[0])] 168 | texts = self.tokenizer.batch_decode(input_ids) 169 | output_code = [decode_whitespaces(text, self.start_extra_id, self.max_len) for text in texts] 170 | 171 | return output_code 172 | -------------------------------------------------------------------------------- /codegeex/mindspore/src/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Huawei Technologies Co., Ltd 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================ 15 | """ 16 | Eval metrics 17 | """ 18 | 19 | import math 20 | 21 | from mindspore import context 22 | from mindspore.communication.management import get_rank, get_group_size 23 | from mindspore.nn.metrics import Metric 24 | 25 | 26 | class PPLMetric(Metric): 27 | """ 28 | Ppl metric 29 | """ 30 | 31 | def __init__(self, data_length): 32 | super(PPLMetric, self).__init__() 33 | self.clear() 34 | self.data_length = data_length 35 | pipeline_stages = context.get_auto_parallel_context("pipeline_stages") 36 | per_stage_device_num = get_group_size() // pipeline_stages 37 | stage_id = get_rank() // per_stage_device_num 38 | self.is_last_stage = (stage_id == pipeline_stages - 1) 39 | 40 | def clear(self): 41 | """Clear the internal evaluation result.""" 42 | self.PPL = [] 43 | self.tokens_count = 0 44 | 45 | def update(self, *inputs): # inputs 46 | """Update list of ppl""" 47 | if not self.is_last_stage: 48 | return 49 | logits = inputs[0].asnumpy().flatten().tolist() # logits 50 | self.PPL.append(logits[0] * self.data_length) 51 | self.tokens_count += 1 52 | 53 | def eval(self): 54 | if not self.is_last_stage: 55 | return 0 56 | if self.tokens_count == 0: 57 | print("Warning: tokens_count is 0") 58 | return 0 59 | val_loss = sum(self.PPL) / (self.tokens_count * self.data_length) 60 | ppl = math.exp(min(20, val_loss)) 61 | # print("====" * 20 + " ppl end") 62 | # print("====" * 20 + " ppl: {}".format(ppl)) 63 | # return ppl 64 | return val_loss 65 | 66 | 67 | class ValidationLoss(Metric): 68 | def __init__(self, data_length): 69 | super(ValidationLoss, self).__init__() 70 | self.clear() 71 | self.data_length = data_length 72 | pipeline_stages = context.get_auto_parallel_context("pipeline_stages") 73 | per_stage_device_num = get_group_size() // pipeline_stages 74 | stage_id = get_rank() // per_stage_device_num 75 | self.is_last_stage = (stage_id == pipeline_stages - 1) 76 | 77 | def clear(self): 78 | """Clear the internal evaluation result.""" 79 | self.metric = [] 80 | self.tokens_count = 0 81 | 82 | def update(self, *inputs): # inputs 83 | """Update list of ppl""" 84 | # logits = inputs[0].asnumpy() 85 | # if self.rank % 8 == 0: 86 | # print("====" * 2 + " logits: {}".format(logits), flush=True) 87 | # self.metric.append(logits) 88 | if not self.is_last_stage: 89 | return 90 | logits = inputs[0].asnumpy().flatten().tolist() # logits 91 | self.metric.append(logits[0] * self.data_length) 92 | self.tokens_count += 1 93 | 94 | def eval(self): 95 | if not self.is_last_stage == 0: 96 | return 0 97 | val_loss = sum(self.metric) / (self.tokens_count * self.data_length) 98 | return val_loss 99 | -------------------------------------------------------------------------------- /codegeex/mindspore/src/tokenization_jieba.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for OpenAI GPT.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | from io import open 20 | 21 | import jieba 22 | import sentencepiece as spm 23 | 24 | 25 | class JIEBATokenizer(): 26 | r""" 27 | Jieba Tokenizer 28 | """ 29 | 30 | def __init__(self, vocab_file, model_file, max_len=None): 31 | self.max_len = max_len if max_len is not None else int(1e12) 32 | f = open(vocab_file, 'r') 33 | lines = f.readlines() 34 | self.encoder = {} 35 | for line in enumerate(lines): 36 | key = line[1].split('\t')[0] 37 | self.encoder[key] = line[0] 38 | 39 | self.decoder = {v: k for k, v in self.encoder.items()} 40 | 41 | self.sp = spm.SentencePieceProcessor(model_file=model_file) 42 | self.translator = str.maketrans(" \n", "\u2582\u2583") 43 | 44 | self.eod_id = self.encoder[''] 45 | self.eot_id = self.encoder[''] 46 | self.pad_id = self.encoder[''] 47 | 48 | @property 49 | def vocab_size(self): 50 | return len(self.encoder) 51 | 52 | def __len__(self): 53 | return len(self.encoder) + len(self.special_tokens) 54 | 55 | @property 56 | def eod(self): 57 | return self.eod_id 58 | 59 | def tokenize(self, text): 60 | """ Tokenize a string. """ 61 | seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)] 62 | new_seg = " ".join(seg_list) 63 | return self.sp.encode(new_seg) 64 | 65 | def convert_tokens_to_ids(self, tokens): 66 | return tokens 67 | 68 | def convert_ids_to_tokens(self, ids): 69 | return self.decode(ids) 70 | 71 | def encode(self, text): 72 | res = self.tokenize(text) 73 | return res 74 | 75 | def decode(self, tokens): 76 | text = self.sp.decode(tokens) 77 | text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n') 78 | return text 79 | -------------------------------------------------------------------------------- /codegeex/oneflow/__init__.py: -------------------------------------------------------------------------------- 1 | from .codegeex_model import CodeGeeXModel -------------------------------------------------------------------------------- /codegeex/paddle/__init__.py: -------------------------------------------------------------------------------- 1 | from .codegeex_model import CodeGeeXModel -------------------------------------------------------------------------------- /codegeex/paddle/pt_to_pdparams.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import paddle 3 | import torch 4 | 5 | linear_layer = [ 6 | "mlp.dense_h_to_4h", 7 | "mlp.dense_4h_to_h", 8 | "attention.query", 9 | "attention.key", 10 | "attention.value", 11 | "attention.dense", 12 | ] 13 | 14 | 15 | def WalkDict(x): 16 | for i in x: 17 | if isinstance(x[i], dict): 18 | WalkDict(x[i]) 19 | elif isinstance(x[i], torch.Tensor): 20 | print(f"Converting '{i}' from 'torch.Tensor' to 'numpy.ndarray'.") 21 | npy = x[i].cpu().numpy() 22 | if any([f".{layer}.weight" in i for layer in linear_layer]): 23 | print(f"Transposing linear layer weight '{i}'.") 24 | x[i] = npy.T 25 | else: 26 | x[i] = npy 27 | 28 | 29 | def parse_opt(): 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument( 32 | "--pt", 33 | type=str, 34 | required=True, 35 | help="Path to pt checkpoint." 36 | ) 37 | parser.add_argument( 38 | "--pdparams", 39 | type=str, 40 | required=True, 41 | help="Path to pdparams checkpoint." 
42 | ) 43 | opt = parser.parse_args() 44 | return opt 45 | 46 | 47 | def main(opt): 48 | state_dict = torch.load(opt.pt) 49 | WalkDict(state_dict) 50 | paddle.save(state_dict, opt.pdparams) 51 | 52 | 53 | if __name__ == "__main__": 54 | opt = parse_opt() 55 | main(opt) 56 | -------------------------------------------------------------------------------- /codegeex/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from .quantize import quantize 2 | try: 3 | from .quantize_oneflow import quantize_oneflow 4 | from .quantize_oneflow import QuantizedLinear 5 | except ModuleNotFoundError: 6 | pass 7 | -------------------------------------------------------------------------------- /codegeex/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .tokenizer import CodeGeeXTokenizer -------------------------------------------------------------------------------- /codegeex/tokenizer/added_tokens.json: -------------------------------------------------------------------------------- 1 | {"<|extratoken_14|>": 50270, "<|extratoken_121|>": 50377, "<|extratoken_3|>": 50259, "<|extratoken_25|>": 50281, "<|extratoken_101|>": 50357, "<|extratoken_138|>": 50394, "<|extratoken_10|>": 50266, "<|extratoken_21|>": 50277, "<|extratoken_32|>": 50288, "<|extratoken_46|>": 50302, "<|extratoken_22|>": 50278, "<|extratoken_40|>": 50296, "<|extratoken_96|>": 50352, "<|extratoken_92|>": 50348, "<|extratoken_95|>": 50351, "<|extratoken_141|>": 50397, "<|extratoken_78|>": 50334, "<|extratoken_86|>": 50342, "<|extratoken_56|>": 50312, "<|extratoken_124|>": 50380, "<|extratoken_127|>": 50383, "<|extratoken_122|>": 50378, "<|extratoken_123|>": 50379, "<|extratoken_111|>": 50367, "<|extratoken_93|>": 50349, "<|extratoken_130|>": 50386, "<|extratoken_113|>": 50369, "<|extratoken_50|>": 50306, "<|extratoken_97|>": 50353, "<|extratoken_1|>": 50257, "<|extratoken_55|>": 50311, "<|extratoken_34|>": 50290, "<|extratoken_143|>": 50399, "<|extratoken_62|>": 50318, "<|extratoken_74|>": 50330, "<|extratoken_136|>": 50392, "<|extratoken_117|>": 50373, "<|extratoken_38|>": 50294, "<|extratoken_120|>": 50376, "<|extratoken_39|>": 50295, "<|extratoken_65|>": 50321, "<|extratoken_29|>": 50285, "<|extratoken_104|>": 50360, "<|extratoken_13|>": 50269, "<|extratoken_5|>": 50261, "<|extratoken_107|>": 50363, "<|extratoken_19|>": 50275, "<|extratoken_84|>": 50340, "<|extratoken_77|>": 50333, "<|extratoken_135|>": 50391, "<|extratoken_24|>": 50280, "<|extratoken_134|>": 50390, "<|extratoken_15|>": 50271, "<|extratoken_67|>": 50323, "<|extratoken_89|>": 50345, "<|extratoken_2|>": 50258, "<|extratoken_73|>": 50329, "<|extratoken_129|>": 50385, "<|extratoken_126|>": 50382, "<|extratoken_30|>": 50286, "<|extratoken_41|>": 50297, "<|extratoken_28|>": 50284, "<|extratoken_114|>": 50370, "<|extratoken_128|>": 50384, "<|extratoken_118|>": 50374, "<|extratoken_131|>": 50387, "<|extratoken_68|>": 50324, "<|extratoken_125|>": 50381, "<|extratoken_103|>": 50359, "<|extratoken_8|>": 50264, "<|extratoken_64|>": 50320, "<|extratoken_52|>": 50308, "<|extratoken_45|>": 50301, "<|extratoken_43|>": 50299, "<|extratoken_18|>": 50274, "<|extratoken_139|>": 50395, "<|extratoken_85|>": 50341, "<|extratoken_88|>": 50344, "<|extratoken_63|>": 50319, "<|extratoken_4|>": 50260, "<|extratoken_48|>": 50304, "<|extratoken_112|>": 50368, "<|extratoken_17|>": 50273, "<|extratoken_49|>": 50305, "<|extratoken_108|>": 50364, "<|extratoken_110|>": 50366, 
"<|extratoken_42|>": 50298, "<|extratoken_70|>": 50326, "<|extratoken_6|>": 50262, "<|extratoken_35|>": 50291, "<|extratoken_23|>": 50279, "<|extratoken_66|>": 50322, "<|extratoken_60|>": 50316, "<|extratoken_71|>": 50327, "<|extratoken_51|>": 50307, "<|extratoken_133|>": 50389, "<|extratoken_20|>": 50276, "<|extratoken_76|>": 50332, "<|extratoken_81|>": 50337, "<|extratoken_142|>": 50398, "<|extratoken_116|>": 50372, "<|extratoken_57|>": 50313, "<|extratoken_75|>": 50331, "<|extratoken_37|>": 50293, "<|extratoken_33|>": 50289, "<|extratoken_16|>": 50272, "<|extratoken_61|>": 50317, "<|extratoken_7|>": 50263, "<|extratoken_12|>": 50268, "<|extratoken_36|>": 50292, "<|extratoken_80|>": 50336, "<|extratoken_98|>": 50354, "<|extratoken_105|>": 50361, "<|extratoken_91|>": 50347, "<|extratoken_53|>": 50309, "<|extratoken_137|>": 50393, "<|extratoken_9|>": 50265, "<|extratoken_79|>": 50335, "<|extratoken_83|>": 50339, "<|extratoken_109|>": 50365, "<|extratoken_99|>": 50355, "<|extratoken_140|>": 50396, "<|extratoken_72|>": 50328, "<|extratoken_11|>": 50267, "<|extratoken_94|>": 50350, "<|extratoken_26|>": 50282, "<|extratoken_59|>": 50315, "<|extratoken_106|>": 50362, "<|extratoken_115|>": 50371, "<|extratoken_58|>": 50314, "<|extratoken_90|>": 50346, "<|extratoken_31|>": 50287, "<|extratoken_102|>": 50358, "<|extratoken_47|>": 50303, "<|extratoken_100|>": 50356, "<|extratoken_82|>": 50338, "<|extratoken_44|>": 50300, "<|extratoken_69|>": 50325, "<|extratoken_54|>": 50310, "<|extratoken_132|>": 50388, "<|extratoken_27|>": 50283, "<|extratoken_87|>": 50343, "<|extratoken_119|>": 50375} 2 | -------------------------------------------------------------------------------- /codegeex/tokenizer/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}} 2 | -------------------------------------------------------------------------------- /codegeex/tokenizer/tokenizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import * 3 | from transformers import AutoTokenizer 4 | from transformers.models.gpt2 import GPT2TokenizerFast 5 | 6 | 7 | def encode_whitespaces(text: str, start_extra_id: int, max_len: int): 8 | """ Encode whitespaces to extra tokens. 9 | 10 | >>> encode_whitespaces('a\\n b\\n c', 10, 10) 11 | 'a\\n<|extratoken_10|>b\\n<|extratoken_11|>c' 12 | """ 13 | for i in np.arange(max_len, 1, -1): 14 | text = text.replace(" " * i, f"<|extratoken_{start_extra_id + i - 2}|>") 15 | return text 16 | 17 | 18 | def decode_whitespaces(text: str, start_extra_id: int, max_len: int): 19 | """ Decode the whitespace-encoded strings produced by encode_whitespace. 
20 | 21 | >>> text = 'a\\n b\\n c' 22 | >>> s, l = 10, 10 23 | >>> text == decode_whitespaces(encode_whitespaces(text, s, l), s, l) 24 | True 25 | """ 26 | for l in range(2, max_len + 1): 27 | token_id = start_extra_id - 2 + l 28 | token = f'<|extratoken_{token_id}|>' 29 | text = text.replace(token, ' ' * l) 30 | return text 31 | 32 | 33 | class CodeGeeXTokenizer(object): 34 | def __init__( 35 | self, 36 | tokenizer: GPT2TokenizerFast = None, 37 | tokenizer_path: str = "EleutherAI/gpt-j-6B", 38 | start_extra_id: int = 10, 39 | max_len : int = 10, 40 | mode='codegeex-13b', 41 | dict_file: str = None, 42 | ): 43 | self.tokenizer = tokenizer if tokenizer is not None else AutoTokenizer.from_pretrained(tokenizer_path) 44 | if mode not in ['codegeex-13b']: 45 | raise ValueError(f"Invalid mode {mode}, choose from ['codegeex-13b']") 46 | self.start_extra_id = start_extra_id 47 | self.max_len = max_len 48 | self.mode = mode 49 | self.eos_token_id = self.tokenizer.eos_token_id 50 | 51 | def encode_code(self, code: str): 52 | if self.mode == 'codegeex-13b': 53 | code = encode_whitespaces(code, self.start_extra_id, self.max_len) 54 | input_ids = self.tokenizer(code, is_split_into_words=False, verbose=False).input_ids 55 | 56 | return input_ids 57 | 58 | def decode_code(self, input_ids): 59 | if self.mode == 'codegeex-13b': 60 | text = self.tokenizer.decode(input_ids, skip_special_tokens=False, verbose=False) 61 | output_code = decode_whitespaces(text, self.start_extra_id, self.max_len) 62 | 63 | return output_code -------------------------------------------------------------------------------- /codegeex/tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "model_max_length": 2048, "special_tokens_map_file": null, "name_or_path": "gpt-j-6B", "from_slow": true, "tokenizer_class": "GPT2Tokenizer"} 2 | -------------------------------------------------------------------------------- /codegeex/torch/__init__.py: -------------------------------------------------------------------------------- 1 | from .codegeex_model import CodeGeeXModel -------------------------------------------------------------------------------- /codegeex/torch/get_ckpt_qkv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import random 5 | import argparse 6 | import numpy as np 7 | 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--load-path", 12 | type=str, 13 | default="/zhangpai24/workspace/ckpt_ms/ckpt_ms_213000_fp32_52224.pt") 14 | parser.add_argument("--save-path", 15 | type=str, 16 | default="/zhangpai24/workspace/ckpt_ms/ckpt_ms_213000_qkv.pt") 17 | 18 | args, _ = parser.parse_known_args() 19 | 20 | state_dict_path = args.load_path 21 | print("Loading state dict ...") 22 | sd = torch.load(state_dict_path, map_location="cpu") 23 | 24 | for i in range(40): 25 | if i < 39: 26 | query_weight = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.query.weight', 
None) 27 | query_bias = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.query.bias', None) 28 | key_weight = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.key.weight', None) 29 | key_bias = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.key.bias', None) 30 | value_weight = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.value.weight', None) 31 | value_bias = sd['module']['language_model']['transformer'].pop(f'layers.{i}.attention.value.bias', None) 32 | qkv_weight = torch.cat([query_weight, key_weight, value_weight], dim=0) 33 | qkv_bias = torch.cat([query_bias, key_bias, value_bias]) 34 | sd['module']['language_model']['transformer'][f'layers.{i}.attention.query_key_value.weight'] = qkv_weight 35 | sd['module']['language_model']['transformer'][f'layers.{i}.attention.query_key_value.bias'] = qkv_bias 36 | else: 37 | tq_key_weight = sd['module']['language_model']['transformer'].pop('topQueryLayer.attention.key.weight', None) 38 | tq_key_bias = sd['module']['language_model']['transformer'].pop('topQueryLayer.attention.key.bias', None) 39 | tq_value_weight = sd['module']['language_model']['transformer'].pop('topQueryLayer.attention.value.weight', None) 40 | tq_value_bias = sd['module']['language_model']['transformer'].pop('topQueryLayer.attention.value.bias', None) 41 | tq_kv_weight = torch.cat([tq_key_weight, tq_value_weight], dim=0) 42 | tq_kv_bias = torch.cat([tq_key_bias, tq_value_bias]) 43 | sd['module']['language_model']['transformer']['topQueryLayer.attention.key_value.weight'] = tq_kv_weight 44 | sd['module']['language_model']['transformer']['topQueryLayer.attention.key_value.bias'] = tq_kv_bias 45 | 46 | save_ckpt_path = args.save_path 47 | torch.save(sd, save_ckpt_path) 48 | 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /configs/codegeex_13b.sh: -------------------------------------------------------------------------------- 1 | # CodeGeeX-13B configuration 2 | 3 | CHECKPOINT_PATH="" 4 | 5 | MODEL_ARGS="--num-layers 39 \ 6 | --hidden-size 5120 \ 7 | --num-attention-heads 40 \ 8 | --max-position-embeddings 2048 \ 9 | --attention-softmax-in-fp32 \ 10 | --load "$CHECKPOINT_PATH" \ 11 | --layernorm-epsilon 1e-5 \ 12 | --fp16 \ 13 | --ws-encoding-start-id 10 \ 14 | --ws-encoding-length 10 \ 15 | --make-vocab-size-divisible-by 52224 \ 16 | --seq-length 2048" -------------------------------------------------------------------------------- /configs/codegeex_13b_paddle.sh: -------------------------------------------------------------------------------- 1 | # CodeGeeX-13B paddle configuration 2 | 3 | CHECKPOINT_PATH="" 4 | 5 | MODEL_ARGS="--num-layers 39 \ 6 | --hidden-size 5120 \ 7 | --num-attention-heads 40 \ 8 | --max-position-embeddings 2048 \ 9 | --attention-softmax-in-fp32 \ 10 | --load "$CHECKPOINT_PATH" \ 11 | --layernorm-epsilon 1e-5 \ 12 | --fp16 \ 13 | --ws-encoding-start-id 10 \ 14 | --ws-encoding-length 10 \ 15 | --make-vocab-size-divisible-by 52224 \ 16 | --seq-length 2048" -------------------------------------------------------------------------------- /configs/codegeex_13b_parallel.sh: -------------------------------------------------------------------------------- 1 | # CodeGeeX-13B parallel configuration 2 | # Parallel checkpoints are named under the format "mp_rank_0{i}_model_states.pt", where i is the rank, start from 0. 
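# Illustrative layout (annotation, not part of the original config), assuming a 2-way
# model-parallel checkpoint saved under CHECKPOINT_PATH:
#   $CHECKPOINT_PATH/mp_rank_00_model_states.pt
#   $CHECKPOINT_PATH/mp_rank_01_model_states.pt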
3 | 4 | CHECKPOINT_PATH="" 5 | 6 | MODEL_ARGS="--num-layers 39 \ 7 | --hidden-size 5120 \ 8 | --num-attention-heads 40 \ 9 | --max-position-embeddings 2048 \ 10 | --attention-softmax-in-fp32 \ 11 | --load "$CHECKPOINT_PATH" \ 12 | --layernorm-epsilon 1e-5 \ 13 | --fp16 \ 14 | --ws-encoding-start-id 10 \ 15 | --ws-encoding-length 10 \ 16 | --make-vocab-size-divisible-by 52224 \ 17 | --seq-length 2048" -------------------------------------------------------------------------------- /deployment/example_inputs.jsonl: -------------------------------------------------------------------------------- 1 | {"code": "# Write a function that returns the sum of the numbers from 1 to n.\n# For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\n# You may assume that n is a positive integer.\ndef sum_of_numbers(n):", "langauge": "Python"} 2 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\n#include \nusing namespace std;\nint sum_of_numbers(int n) {", "langauge": "C++"} 3 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\n#include \n#include \nint sum(int n)\n{", "langauge": "C"} 4 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\nprivate int sum(int n) {", "langauge": "C#"} 5 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\npublic class SumOfNumbers {", "langauge": "Java"} 6 | {"code": "\n\n
", "langauge": "HTML"} 7 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n// If n is 0, then the function should return 0.\n// If n is less than 0, then the function should return -1.\n/**\n * @param {number} n\n * @return {number}\n */\nfunction sum ($n) {", "langauge": "PHP"} 8 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\nfunction sum(n) {", "langauge": "JavaScript"} 9 | {"code": "// Write a function that returns the sum of the numbers from 1 to n,\n// but using a for loop instead of a while loop.\n\nfunction sumForLoop(n) {", "langauge": "TypeScript"} 10 | {"code": "// Write a function that returns the sum of the numbers from 1 to n,\n// but using a for loop instead of a while loop.\n\nfunc sumN(n int) int {", "langauge": "Go"} 11 | {"code": "// Write a function that returns the sum of the numbers from 1 to n,\n// but using a for loop instead of a while loop.\n\nfn sum_numbers(n: usize) -> usize {", "langauge": "Rust"} 12 | {"code": "-- Search all the records from the table CodeGeeX\n-- Delete iterms with odd indices", "langauge": "SQL"} 13 | {"code": "// Write a function that returns the sum of the numbers from 1 to n.\n// For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\nfun sum(n: Int): Int {", "langauge": "Kotlin"} 14 | {"code": "! Write a function that returns the sum of the numbers from 1 to n.\n! For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\n\n! Use the following header:\n! module sum_numbers\n! end\nmodule sum_numbers", "langauge": "Fortran"} 15 | {"code": "# Write a function that returns the sum of the numbers from 1 to n.\n# For example, if n is 5, then the function should return 1 + 2 + 3 + 4 + 5.\nsum_numbers <- function(n) {", "langauge": "R"} 16 | -------------------------------------------------------------------------------- /generations/humaneval_python_generations.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/generations/humaneval_python_generations.jsonl.gz -------------------------------------------------------------------------------- /generations/humaneval_rust_generations.jsonl.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/generations/humaneval_rust_generations.jsonl.gz -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fire>=0.4.0 2 | ipython>=8.4.0 3 | numpy>=1.22.0 4 | pandas>=1.3.5 5 | pyzmq>=23.2.1 6 | regex>=2022.3.15 7 | setuptools>=58.0.4 8 | transformers>=4.22.0 9 | torch>=1.10.0 10 | tqdm>=4.63.0 11 | cpm_kernels 12 | deepspeed>0.6.1 -------------------------------------------------------------------------------- /resources/api/api_step_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_1.png -------------------------------------------------------------------------------- /resources/api/api_step_2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_2.png -------------------------------------------------------------------------------- /resources/api/api_step_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_3.png -------------------------------------------------------------------------------- /resources/api/api_step_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_4.png -------------------------------------------------------------------------------- /resources/api/api_step_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/api/api_step_5.png -------------------------------------------------------------------------------- /resources/en/codegeex_training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/codegeex_training.png -------------------------------------------------------------------------------- /resources/en/hx_boxplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_boxplot.png -------------------------------------------------------------------------------- /resources/en/hx_examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_examples.png -------------------------------------------------------------------------------- /resources/en/hx_generattion_radar_horizon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_generattion_radar_horizon.png -------------------------------------------------------------------------------- /resources/en/hx_pass_rate_vs_language.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_pass_rate_vs_language.png -------------------------------------------------------------------------------- /resources/en/hx_tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_tasks.png -------------------------------------------------------------------------------- /resources/en/hx_translation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/en/hx_translation.png -------------------------------------------------------------------------------- /resources/logo/codegeex_logo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/logo/codegeex_logo.png -------------------------------------------------------------------------------- /resources/zh/hx_boxplot_zh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_boxplot_zh.png -------------------------------------------------------------------------------- /resources/zh/hx_generattion_radar_horizon_zh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_generattion_radar_horizon_zh.png -------------------------------------------------------------------------------- /resources/zh/hx_pass_rate_vs_language_zh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_pass_rate_vs_language_zh.png -------------------------------------------------------------------------------- /resources/zh/hx_tasks_zh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_tasks_zh.png -------------------------------------------------------------------------------- /resources/zh/hx_translation_zh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/hx_translation_zh.png -------------------------------------------------------------------------------- /resources/zh/join_wechat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/CodeGeeX/2838420b7b4492cf3d16bce5320e26e65960c9e2/resources/zh/join_wechat.png -------------------------------------------------------------------------------- /resources/zh/wechat.md: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | 扫码关注公众号加入「CodeGeeX交流群」
5 | Scan the QR code to join the "CodeGeeX WeChat Group"
6 |
7 | -------------------------------------------------------------------------------- /scripts/convert_ckpt_parallel.sh: -------------------------------------------------------------------------------- 1 | # This script is used to convert checkpoint model parallel partitions. 2 | 3 | LOAD_CKPT_PATH=$1 # Path to weights in .pt format. 4 | SAVE_CKPT_PATH=$2 # Path to save the output MP checkpoints. 5 | MP_SIZE=$3 # Model parallel size 6 | 7 | SCRIPT_PATH=$(realpath "$0") 8 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 9 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 10 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 11 | 12 | if [ -z "$MP_SIZE" ]; then 13 | MP_SIZE=1 14 | fi 15 | 16 | # export CUDA settings 17 | export CUDA_HOME=/usr/local/cuda-11.1/ 18 | export CUDA_VISIBLE_DEVICES=0,1 19 | 20 | 21 | CMD="python $MAIN_DIR/codegeex/megatron/convert_ckpt_parallel.py \ 22 | --load-ckpt-path $LOAD_CKPT_PATH \ 23 | --save-ckpt-path $SAVE_CKPT_PATH \ 24 | --tokenizer-path $TOKENIZER_PATH \ 25 | --target-tensor-model-parallel-size $MP_SIZE \ 26 | --num-layers 39 \ 27 | --hidden-size 5120 \ 28 | --num-attention-heads 40 \ 29 | --max-position-embeddings 2048 \ 30 | --attention-softmax-in-fp32 \ 31 | --fp16 \ 32 | --micro-batch-size 1 \ 33 | --make-vocab-size-divisible-by 52224 \ 34 | --seq-length 2048" 35 | 36 | echo "$CMD" 37 | eval "$CMD" -------------------------------------------------------------------------------- /scripts/convert_mindspore_to_megatron.sh: -------------------------------------------------------------------------------- 1 | # This script is used to convert mindspore checkpoint to the megatron format. 2 | 3 | NPY_CKPT_PATH=$1 # Path to Mindspore exported weights in .npy format. 4 | SAVE_CKPT_PATH=$2 # Path to save the output .pt checkpoint. 5 | GPU=$3 6 | 7 | SCRIPT_PATH=$(realpath "$0") 8 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 9 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 10 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 11 | 12 | # export CUDA settings 13 | if [ -z "$GPU" ]; then 14 | GPU=0 15 | fi 16 | 17 | export CUDA_HOME=/usr/local/cuda-11.1/ 18 | export CUDA_VISIBLE_DEVICES=$GPU 19 | 20 | 21 | CMD="python $MAIN_DIR/codegeex/megatron/mindspore_to_megatron.py \ 22 | --npy-ckpt-path $NPY_CKPT_PATH \ 23 | --save-ckpt-path $SAVE_CKPT_PATH \ 24 | --tokenizer-path $TOKENIZER_PATH \ 25 | $MODEL_ARGS" 26 | 27 | echo "$CMD" 28 | eval "$CMD" -------------------------------------------------------------------------------- /scripts/evaluate_humaneval_x.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from pathlib import Path 4 | from codegeex.benchmark.evaluate_humaneval_x import evaluate_functional_correctness 5 | #GLOBALS 6 | INPUT_FILE: str 7 | LANGUAGE: str 8 | N_WORKERS: int 9 | TIMEOUT: int 10 | 11 | 12 | parser = argparse.ArgumentParser("Debugging evaluate humaneval_x") 13 | # Path to the .jsonl file that contains the generated codes. 14 | parser.add_argument("-s","--samples", type=str) 15 | 16 | # Target programming language, currently support one of ["python", "java", "cpp", "js", "go"] 17 | parser.add_argument("-l","--language", default="python", type=str) 18 | 19 | # Number of parallel workers. 20 | parser.add_argument("-w","--workers", default=64, type=int) 21 | 22 | # Timeout in seconds. 
23 | parser.add_argument("-t","--timeout", default=5, type=int) 24 | 25 | args = parser.parse_args() 26 | 27 | INPUT_FILE = args.samples 28 | LANGUAGE = args.language 29 | N_WORKERS = args.workers 30 | TIMEOUT = args.timeout 31 | 32 | 33 | 34 | SCRIPT_PATH: str = Path(os.path.abspath(__file__)) 35 | print(SCRIPT_PATH) 36 | SCRIPT_DIR: str = os.path.dirname(SCRIPT_PATH) 37 | print(SCRIPT_DIR) 38 | MAIN_DIR: str = os.path.dirname(SCRIPT_DIR) 39 | print(MAIN_DIR) 40 | 41 | DATA_DIR=os.path.join(MAIN_DIR,"codegeex/benchmark/humaneval-x/" + LANGUAGE + "/data/humaneval_" + LANGUAGE + ".jsonl.gz") 42 | print(DATA_DIR) 43 | 44 | TMP_DIR=os.path.join(MAIN_DIR, "codegeex/benchmark/humaneval-x/") 45 | 46 | 47 | #Debugging 48 | INPUT_FILE='/home/rog0d/Escritorio/CodeGeeX/generations/humaneval_rust_generations.jsonl.gz' 49 | LANGUAGE='rust' 50 | DATA_DIR=os.path.join(MAIN_DIR,"codegeex/benchmark/humaneval-x/" + LANGUAGE + "/data/humaneval_" + LANGUAGE + ".jsonl.gz") 51 | 52 | """ 53 | input_file: str = None, 54 | tmp_dir: str = "./", 55 | n_workers: int = 32, 56 | timeout: float = 5.0, 57 | problem_file: str = "../data/humaneval_python.jsonl.gz", 58 | out_dir: str = None, 59 | k: List[int] = [1, 10, 100], 60 | test_groundtruth: bool = False, 61 | example_test: bool = False, 62 | 63 | """ 64 | 65 | evaluate_functional_correctness(input_file=INPUT_FILE, 66 | n_workers=N_WORKERS, 67 | tmp_dir=TMP_DIR, 68 | problem_file=DATA_DIR, 69 | timeout=300.0) 70 | 71 | 72 | -------------------------------------------------------------------------------- /scripts/evaluate_humaneval_x.sh: -------------------------------------------------------------------------------- 1 | # This script is for evaluating the functional correctness of the generated codes of HumanEval-X. 2 | 3 | INPUT_FILE=$1 # Path to the .jsonl file that contains the generated codes. 4 | LANGUAGE=$2 # Target programming language, currently support one of ["python", "java", "cpp", "js", "go"] 5 | N_WORKERS=$3 # Number of parallel workers. 6 | TIMEOUT=$4 # Timeout in seconds. 
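# Example invocation (illustrative values; $2-$4 fall back to the defaults set further below,
# and the sample generations file ships with the repository under generations/):
#   bash scripts/evaluate_humaneval_x.sh generations/humaneval_python_generations.jsonl.gz python 64 5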
7 | 8 | SCRIPT_PATH=$(realpath "$0") 9 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 10 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 11 | 12 | echo "$INPUT_FILE" 13 | 14 | if [ -z "$N_WORKERS" ] 15 | then 16 | N_WORKERS=64 17 | fi 18 | 19 | if [ -z "$LANGUAGE" ] 20 | then 21 | LANGUAGE=python 22 | fi 23 | 24 | if [ -z "$TIMEOUT" ] 25 | then 26 | TIMEOUT=5 27 | fi 28 | 29 | DATA_DIR=$MAIN_DIR/codegeex/benchmark/humaneval-x/$LANGUAGE/data/humaneval_$LANGUAGE.jsonl.gz 30 | 31 | if [ $LANGUAGE = go ]; then 32 | export PATH=$PATH:/usr/local/go/bin 33 | fi 34 | 35 | if [ $LANGUAGE = cpp ]; then 36 | export PATH=$PATH:/usr/bin/openssl 37 | fi 38 | 39 | CMD="python $MAIN_DIR/codegeex/benchmark/humaneval-x/evaluate_humaneval_x.py \ 40 | --input_file "$INPUT_FILE" \ 41 | --n_workers $N_WORKERS \ 42 | --tmp_dir $MAIN_DIR/codegeex/benchmark/humaneval-x/ \ 43 | --problem_file $DATA_DIR \ 44 | --timeout $TIMEOUT" 45 | 46 | echo "$CMD" 47 | eval "$CMD" -------------------------------------------------------------------------------- /scripts/finetune_codegeex.sh: -------------------------------------------------------------------------------- 1 | SCRIPT_PATH=$(realpath "$0") 2 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 3 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 4 | 5 | # ====== Environment ====== 6 | # - NCCL & IB 7 | export NCCL_DEBUG=info 8 | export NCCL_IB_DISABLE=0 9 | export NCCL_IB_GID_INDEX=3 10 | 11 | HOSTFILE="" 12 | MASTER_IP=$(cat $HOSTFILE | head -n 1) 13 | cat $HOSTFILE | awk '{print $1 " slots=8"}' > $SCRIPT_DIR/hostfile 14 | echo "MASTER_IP=$MASTER_IP" 15 | 16 | # ====== Parameters ====== 17 | DATA_PATH="" 18 | CKPT_PATH="" 19 | DS_CONFIG=ds_config.json 20 | # - 13b 21 | TP=1 22 | PP=1 23 | NLAYERS=39 24 | HIDDEN=5120 25 | NATTN_HEAD=40 26 | EMBED_VOCAB=52224 27 | GLOBAL_BATCH=560 28 | MICRO_BATCH=10 29 | NTRAIN_ITERS=100000 30 | EVAL_INT=10 31 | SAVE_INT=10 32 | TRIAL_TAG="13b-test" 33 | # - trial 34 | TRIAL_NAME="pretrain-codegeex" 35 | # - zero stage 36 | ZERO_STAGE=2 37 | # - logging & output 38 | NOW=$(date +"%Y%m%d_%H%M%S") 39 | OUTPUT_DIR="-$TRIAL_NAME-$TRIAL_TAG" 40 | TB_DIR=$OUTPUT_DIR/tb$NOW 41 | mkdir -p $OUTPUT_DIR 42 | mkdir -p $TB_DIR 43 | 44 | # Deepspeed config 45 | cat < $DS_CONFIG 46 | { 47 | "train_batch_size" : $GLOBAL_BATCH, 48 | "train_micro_batch_size_per_gpu": $MICRO_BATCH, 49 | "steps_per_print": 5, 50 | "zero_optimization": { 51 | "stage": $ZERO_STAGE, 52 | "reduce_bucket_size": 50000000, 53 | "allgather_bucket_size": 50000000, 54 | "overlap_comm": true, 55 | "contiguous_gradients": false 56 | }, 57 | "fp16": { 58 | "enabled": true, 59 | "loss_scale": 0, 60 | "loss_scale_window": 500, 61 | "hysteresis": 2, 62 | "min_loss_scale": 1, 63 | "initial_scale_power": 12 64 | }, 65 | "wall_clock_breakdown" : true 66 | } 67 | EOT 68 | 69 | ds_args="" 70 | ds_args=" --deepspeed ${ds_args}" 71 | ds_args=" --no-pipeline-parallel ${ds_args}" 72 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 73 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 74 | ds_args=" --deepspeed-activation-checkpointing ${ds_args}" 75 | 76 | echo "Launching deepspeed" 77 | deepspeed \ 78 | --hostfile hostfile \ 79 | --master_addr $MASTER_IP \ 80 | $MAIN_DIR/codegeex/megatron/tools/pretrain_codegeex.py \ 81 | --tensor-model-parallel-size $TP \ 82 | --pipeline-model-parallel-size $PP \ 83 | --no-pipeline-parallel \ 84 | --num-layers $NLAYERS \ 85 | --hidden-size $HIDDEN \ 86 | --make-vocab-size-divisible-by $EMBED_VOCAB \ 87 | --num-attention-heads $NATTN_HEAD \ 88 | --seq-length 512 \ 89 | --loss-scale 12 \ 90 | 
--max-position-embeddings 2048 \ 91 | --micro-batch-size $MICRO_BATCH \ 92 | --global-batch-size $GLOBAL_BATCH \ 93 | --train-iters $NTRAIN_ITERS \ 94 | --lr 1e-6 \ 95 | --min-lr 1e-7 \ 96 | --lr-decay-iters 100000 \ 97 | --lr-decay-style cosine \ 98 | --lr-warmup-iters 1000 \ 99 | --log-interval 1 \ 100 | --eval-iters 10 \ 101 | --eval-interval $EVAL_INT \ 102 | --data-path $DATA_PATH \ 103 | --vocab-file $MAIN_DIR/codegeex/tokenizer/vocab.json \ 104 | --merge-file $MAIN_DIR/codegeex/tokenizer/merges.txt \ 105 | --save-interval $SAVE_INT \ 106 | --save $OUTPUT_DIR \ 107 | --load $OUTPUT_DIR \ 108 | --load-state $CKPT_PATH \ 109 | --split 98,2,0 \ 110 | --clip-grad 1.0 \ 111 | --weight-decay 0.1 \ 112 | --adam-beta1 0.9 \ 113 | --adam-beta2 0.95 \ 114 | --fp16 \ 115 | --ln-fp16 \ 116 | --attention-softmax-in-fp32 \ 117 | --checkpoint-activations \ 118 | --override-lr-scheduler \ 119 | --tensorboard-dir $TB_DIR \ 120 | $ds_args |& tee ${OUTPUT_DIR}/$NOW.log -------------------------------------------------------------------------------- /scripts/gather_output.sh: -------------------------------------------------------------------------------- 1 | # This script is used to gather the distributed outputs of different ranks. 2 | 3 | OUTPUT_DIR=$1 4 | OUTPUT_PREFIX=$2 5 | IF_REMOVE_RANK_FILES=$3 6 | 7 | echo "$OUTPUT_DIR" 8 | echo "$OUTPUT_PREFIX" 9 | 10 | if [ -z "$IF_REMOVE_RANK_FILES" ] 11 | then 12 | IF_REMOVE_RANK_FILES=0 13 | fi 14 | 15 | SCRIPT_PATH=$(realpath "$0") 16 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 17 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 18 | 19 | 20 | CMD="python $MAIN_DIR/codegeex/benchmark/gather_output.py \ 21 | --output_dir $OUTPUT_DIR \ 22 | --output_prefix $OUTPUT_PREFIX \ 23 | --if_remove_rank_files $IF_REMOVE_RANK_FILES" 24 | 25 | echo "$CMD" 26 | eval "$CMD" -------------------------------------------------------------------------------- /scripts/generate_humaneval_x.sh: -------------------------------------------------------------------------------- 1 | # This script is used to generate solutions of HumanEval-X. 2 | 3 | LANGUAGE=$1 # Target programming language, currently support one of ["python", "java", "cpp", "js", "go"] 4 | OUTPUT_PATH=$2 # Output path of the generated programs. 
5 | HOSTLIST=$3 # Provide hostfile if generating distributedly 6 | 7 | SCRIPT_PATH=$(realpath "$0") 8 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 9 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 10 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 11 | 12 | # export CUDA settings 13 | export CUDA_HOME=/usr/local/cuda-11.1/ 14 | 15 | # import model configuration 16 | source "$MAIN_DIR/configs/codegeex_13b.sh" 17 | 18 | # nccl options 19 | OPTIONS_NCCL="export NCCL_DEBUG=warn; export NCCL_IB_DISABLE=0; export NCCL_IB_GID_INDEX=3" 20 | OPTIONS_PATH="export PATH=$PATH; export LD_LIBRARY_PATH=$LD_LIBRARY_PATH" 21 | CWD=$(pwd) 22 | 23 | # set master ip for zmq server 24 | if [ -z "$HOSTLIST" ]; then 25 | ZMQ_ADDR=$(hostname -i) 26 | echo "$ZMQ_ADDR" > "./hostfile" 27 | HOSTLIST="./hostfile" 28 | else 29 | ZMQ_ADDR=$(cat $HOSTLIST | head -n 1) 30 | fi 31 | echo "master_ip: $ZMQ_ADDR" 32 | 33 | NUM_SAMPLES=1 34 | MICRO_BSZ=1 35 | WORLD_SIZE=1 36 | TEMP=0.8 37 | TOPP=0.95 38 | SEED=42 39 | DATASET=humaneval 40 | TODAY=$(date +%y%m%d) 41 | CHANNEL_PORT=$(expr $RANDOM + 5000) 42 | MASTER_PORT=$(expr $RANDOM + 8000) 43 | 44 | # save log file 45 | LOG_DIR=$MAIN_DIR/log 46 | mkdir -p "$LOG_DIR" 47 | LOG_PATH="$LOG_DIR/$TODAY-generation.log" 48 | 49 | if [ -z "$LANGUAGE" ]; then 50 | LANGUAGE=python 51 | fi 52 | 53 | if [ -z "$INPUT_PATH" ]; then 54 | INPUT_PATH=$MAIN_DIR/codegeex/benchmark/humaneval-x/$LANGUAGE/data/humaneval_$LANGUAGE.jsonl.gz 55 | fi 56 | 57 | if [ -z "$OUTPUT_PATH" ]; then 58 | OUTPUT_PATH=$MAIN_DIR/codegeex/benchmark/output/humaneval-x/codegeex/ 59 | mkdir -p "$OUTPUT_PATH" 60 | fi 61 | 62 | JOB_ID=codegeex-ns$NUM_SAMPLES-t$TEMP-topp$TOPP-seed$SEED-$LANGUAGE 63 | 64 | RUN_CMD="python \ 65 | $MAIN_DIR/codegeex/benchmark/humaneval-x/generate_humaneval_x.py \ 66 | --hostfile $HOSTLIST \ 67 | --channel-ip $ZMQ_ADDR \ 68 | --channel-port $CHANNEL_PORT \ 69 | --master-port $MASTER_PORT \ 70 | --tokenizer-path $TOKENIZER_PATH \ 71 | --load-deepspeed \ 72 | --temperature $TEMP \ 73 | --top-p $TOPP \ 74 | --out-seq-length 1024 \ 75 | --micro-batch-size $MICRO_BSZ \ 76 | --samples-per-problem $NUM_SAMPLES \ 77 | --language-type $LANGUAGE \ 78 | --dataset $DATASET \ 79 | --input-path $INPUT_PATH \ 80 | --output-prefix $OUTPUT_PATH/$JOB_ID \ 81 | --gen-node-world-size $WORLD_SIZE \ 82 | --seed $SEED \ 83 | $MODEL_ARGS" 84 | 85 | RUN_CMD="$OPTIONS_NCCL; $OPTIONS_PATH; $RUN_CMD" 86 | RUN_CMD="cd $CWD; $RUN_CMD" 87 | 88 | if (( WORLD_SIZE != 1 )); then 89 | RUN_CMD="pdsh -R ssh -w ^$HOSTLIST \"$RUN_CMD\"" 90 | fi 91 | 92 | echo "$RUN_CMD" 93 | echo "Writing log to $LOG_PATH" 94 | eval "$RUN_CMD" > "$LOG_PATH" 95 | bash $MAIN_DIR/scripts/gather_output.sh $OUTPUT_PATH $JOB_ID 1 96 | -------------------------------------------------------------------------------- /scripts/pretrain_codegeex.sh: -------------------------------------------------------------------------------- 1 | SCRIPT_PATH=$(realpath "$0") 2 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 3 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 4 | 5 | # ====== Environment ====== 6 | # - NCCL & IB 7 | export NCCL_DEBUG=info 8 | export NCCL_IB_DISABLE=0 9 | export NCCL_IB_GID_INDEX=3 10 | 11 | HOSTFILE="" 12 | MASTER_IP=$(cat $HOSTFILE | head -n 1) 13 | cat $HOSTFILE | awk '{print $1 " slots=8"}' > $SCRIPT_DIR/hostfile 14 | echo "MASTER_IP=$MASTER_IP" 15 | 16 | # ====== Parameters ====== 17 | DATA_PATH="" 18 | CKPT_PATH="" 19 | DS_CONFIG=ds_config.json 20 | # - 13b 21 | TP=1 22 | PP=1 23 | NLAYERS=39 24 | HIDDEN=5120 25 | NATTN_HEAD=40 26 | EMBED_VOCAB=52224 27 | GLOBAL_BATCH=560 
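# Note on the batch-size settings above (assuming standard DeepSpeed semantics):
# train_batch_size = micro_batch_size_per_gpu x gradient_accumulation_steps x data-parallel world size,
# so GLOBAL_BATCH must be divisible by MICRO_BATCH times the number of GPUs in use.
# For example, on a single 8-GPU node, 560 = 10 x 7 x 8, i.e. 7 gradient-accumulation steps.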
28 | MICRO_BATCH=10 29 | NTRAIN_ITERS=100000 30 | EVAL_INT=10 31 | SAVE_INT=10 32 | TRIAL_TAG="13b-test" 33 | # - trial 34 | TRIAL_NAME="pretrain-codegeex" 35 | # - zero stage 36 | ZERO_STAGE=2 37 | # - logging & output 38 | NOW=$(date +"%Y%m%d_%H%M%S") 39 | OUTPUT_DIR="-$TRIAL_NAME-$TRIAL_TAG" 40 | TB_DIR=$OUTPUT_DIR/tb$NOW 41 | mkdir -p $OUTPUT_DIR 42 | mkdir -p $TB_DIR 43 | 44 | # Deepspeed config 45 | cat < $DS_CONFIG 46 | { 47 | "train_batch_size" : $GLOBAL_BATCH, 48 | "train_micro_batch_size_per_gpu": $MICRO_BATCH, 49 | "steps_per_print": 5, 50 | "zero_optimization": { 51 | "stage": $ZERO_STAGE, 52 | "reduce_bucket_size": 50000000, 53 | "allgather_bucket_size": 50000000, 54 | "overlap_comm": true, 55 | "contiguous_gradients": false 56 | }, 57 | "fp16": { 58 | "enabled": true, 59 | "loss_scale": 0, 60 | "loss_scale_window": 500, 61 | "hysteresis": 2, 62 | "min_loss_scale": 1, 63 | "initial_scale_power": 12 64 | }, 65 | "wall_clock_breakdown" : true 66 | } 67 | EOT 68 | 69 | ds_args="" 70 | ds_args=" --deepspeed ${ds_args}" 71 | ds_args=" --no-pipeline-parallel ${ds_args}" 72 | ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}" 73 | ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}" 74 | ds_args=" --deepspeed-activation-checkpointing ${ds_args}" 75 | 76 | echo "Launching deepspeed" 77 | deepspeed \ 78 | --hostfile hostfile \ 79 | --master_addr $MASTER_IP \ 80 | $MAIN_DIR/codegeex/megatron/tools/pretrain_codegeex.py \ 81 | --tensor-model-parallel-size $TP \ 82 | --pipeline-model-parallel-size $PP \ 83 | --no-pipeline-parallel \ 84 | --num-layers $NLAYERS \ 85 | --hidden-size $HIDDEN \ 86 | --make-vocab-size-divisible-by $EMBED_VOCAB \ 87 | --num-attention-heads $NATTN_HEAD \ 88 | --seq-length 512 \ 89 | --loss-scale 12 \ 90 | --max-position-embeddings 2048 \ 91 | --micro-batch-size $MICRO_BATCH \ 92 | --global-batch-size $GLOBAL_BATCH \ 93 | --train-iters $NTRAIN_ITERS \ 94 | --lr 2e-4 \ 95 | --min-lr 1e-7 \ 96 | --lr-decay-iters 100000 \ 97 | --lr-decay-style cosine \ 98 | --lr-warmup-iters 1500 \ 99 | --log-interval 1 \ 100 | --eval-iters 10 \ 101 | --eval-interval $EVAL_INT \ 102 | --data-path $DATA_PATH \ 103 | --vocab-file $MAIN_DIR/codegeex/tokenizer/vocab.json \ 104 | --merge-file $MAIN_DIR/codegeex/tokenizer/merges.txt \ 105 | --save-interval $SAVE_INT \ 106 | --save $OUTPUT_DIR \ 107 | --load $OUTPUT_DIR \ 108 | --load-state $CKPT_PATH \ 109 | --split 98,2,0 \ 110 | --clip-grad 1.0 \ 111 | --weight-decay 0.1 \ 112 | --adam-beta1 0.9 \ 113 | --adam-beta2 0.95 \ 114 | --fp16 \ 115 | --ln-fp16 \ 116 | --attention-softmax-in-fp32 \ 117 | --checkpoint-activations \ 118 | --override-lr-scheduler \ 119 | --tensorboard-dir $TB_DIR \ 120 | $ds_args |& tee ${OUTPUT_DIR}/$NOW.log 121 | 122 | -------------------------------------------------------------------------------- /scripts/process_pretrain_dataset.sh: -------------------------------------------------------------------------------- 1 | # Process dataset for CodeGeeX pretraining 2 | 3 | DATASET_PATH=$1 4 | OUTPUT_PATH=$2 5 | LANGUAGE=$3 6 | 7 | SCRIPT_PATH=$(realpath "$0") 8 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 9 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 10 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 11 | 12 | if [ -z "$LANGUAGE" ]; then 13 | LANGUAGE=python 14 | fi 15 | 16 | CMD="python $MAIN_DIR/codegeex/data/process_pretrain_dataset.py \ 17 | --dataset_path $DATASET_PATH \ 18 | --tokenizer_path $TOKENIZER_PATH \ 19 | --output_prefix $OUTPUT_PATH \ 20 | --language $LANGUAGE \ 21 | --mode pretrain \ 22 | --seq_len 2048" 23 | 24 | echo 
"$CMD" 25 | eval "$CMD" -------------------------------------------------------------------------------- /scripts/test_inference.sh: -------------------------------------------------------------------------------- 1 | # This script is used to test the inference of CodeGeeX. 2 | 3 | GPU=$1 4 | PROMPT_FILE=$2 5 | 6 | SCRIPT_PATH=$(realpath "$0") 7 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 8 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 9 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 10 | 11 | # import model configuration 12 | source "$MAIN_DIR/configs/codegeex_13b.sh" 13 | 14 | # export CUDA settings 15 | if [ -z "$GPU" ]; then 16 | GPU=0 17 | fi 18 | 19 | export CUDA_HOME=/usr/local/cuda-11.1/ 20 | export CUDA_VISIBLE_DEVICES=$GPU 21 | 22 | if [ -z "$PROMPT_FILE" ]; then 23 | PROMPT_FILE=$MAIN_DIR/tests/test_prompt.txt 24 | fi 25 | 26 | # remove --greedy if using sampling 27 | CMD="python $MAIN_DIR/tests/test_inference.py \ 28 | --prompt-file $PROMPT_FILE \ 29 | --tokenizer-path $TOKENIZER_PATH \ 30 | --micro-batch-size 1 \ 31 | --out-seq-length 1024 \ 32 | --temperature 0.8 \ 33 | --top-p 0.95 \ 34 | --top-k 0 \ 35 | --greedy \ 36 | $MODEL_ARGS" 37 | 38 | echo "$CMD" 39 | eval "$CMD" 40 | -------------------------------------------------------------------------------- /scripts/test_inference_oneflow.sh: -------------------------------------------------------------------------------- 1 | # This script is used to test the inference of CodeGeeX. 2 | 3 | GPU=$1 4 | PROMPT_FILE=$2 5 | 6 | SCRIPT_PATH=$(realpath "$0") 7 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 8 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 9 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 10 | 11 | # import model configuration 12 | source "$MAIN_DIR/configs/codegeex_13b.sh" 13 | 14 | # export CUDA settings 15 | if [ -z "$GPU" ]; then 16 | GPU=0 17 | fi 18 | 19 | export CUDA_HOME=/usr/local/cuda-11.1/ 20 | export CUDA_VISIBLE_DEVICES=$GPU 21 | 22 | if [ -z "$PROMPT_FILE" ]; then 23 | PROMPT_FILE=$MAIN_DIR/tests/test_prompt.txt 24 | fi 25 | 26 | # remove --greedy if using sampling 27 | CMD="python $MAIN_DIR/tests/test_inference_oneflow.py \ 28 | --prompt-file $PROMPT_FILE \ 29 | --tokenizer-path $TOKENIZER_PATH \ 30 | --micro-batch-size 1 \ 31 | --out-seq-length 1024 \ 32 | --temperature 0.8 \ 33 | --top-p 0.95 \ 34 | --top-k 0 \ 35 | --greedy \ 36 | $MODEL_ARGS" 37 | 38 | echo "$CMD" 39 | eval "$CMD" 40 | -------------------------------------------------------------------------------- /scripts/test_inference_oneflow_quantized.sh: -------------------------------------------------------------------------------- 1 | # This script is used to test the inference of CodeGeeX. 
2 | 3 | GPU=$1 4 | PROMPT_FILE=$2 5 | 6 | SCRIPT_PATH=$(realpath "$0") 7 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 8 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 9 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 10 | 11 | # import model configuration 12 | source "$MAIN_DIR/configs/codegeex_13b.sh" 13 | 14 | # export CUDA settings 15 | if [ -z "$GPU" ]; then 16 | GPU=1 17 | fi 18 | 19 | export CUDA_HOME=/usr/local/cuda-11.1/ 20 | export CUDA_VISIBLE_DEVICES=$GPU 21 | 22 | if [ -z "$PROMPT_FILE" ]; then 23 | PROMPT_FILE=$MAIN_DIR/tests/test_prompt.txt 24 | fi 25 | 26 | # remove --greedy if using sampling 27 | CMD="python $MAIN_DIR/tests/test_inference_oneflow.py \ 28 | --prompt-file $PROMPT_FILE \ 29 | --tokenizer-path $TOKENIZER_PATH \ 30 | --micro-batch-size 1 \ 31 | --out-seq-length 1024 \ 32 | --temperature 0.2 \ 33 | --top-p 0.95 \ 34 | --top-k 0 \ 35 | --quantize \ 36 | $MODEL_ARGS" 37 | 38 | echo "$CMD" 39 | eval "$CMD" 40 | -------------------------------------------------------------------------------- /scripts/test_inference_paddle.sh: -------------------------------------------------------------------------------- 1 | # This script is used to test the inference of CodeGeeX. 2 | 3 | GPU=$1 4 | PROMPT_FILE=$2 5 | 6 | SCRIPT_PATH=$(realpath "$0") 7 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 8 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 9 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 10 | 11 | # import model configuration 12 | source "$MAIN_DIR/configs/codegeex_13b_paddle.sh" 13 | 14 | # export CUDA settings 15 | if [ -z "$GPU" ]; then 16 | GPU=0 17 | fi 18 | 19 | export CUDA_HOME=/usr/local/cuda-11.1/ 20 | export CUDA_VISIBLE_DEVICES=$GPU 21 | 22 | if [ -z "$PROMPT_FILE" ]; then 23 | PROMPT_FILE=$MAIN_DIR/tests/test_prompt.txt 24 | fi 25 | 26 | # remove --greedy if using sampling 27 | CMD="python $MAIN_DIR/tests/test_inference_paddle.py \ 28 | --prompt-file $PROMPT_FILE \ 29 | --tokenizer-path $TOKENIZER_PATH \ 30 | --micro-batch-size 1 \ 31 | --out-seq-length 1024 \ 32 | --temperature 0.8 \ 33 | --top-p 0.95 \ 34 | --top-k 0 \ 35 | --greedy \ 36 | $MODEL_ARGS" 37 | 38 | echo "$CMD" 39 | eval "$CMD" 40 | -------------------------------------------------------------------------------- /scripts/test_inference_parallel.sh: -------------------------------------------------------------------------------- 1 | # This script is used to test the inference of CodeGeeX. 
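# Model-parallel variant: launches MP_SIZE processes via torchrun and sources the parallel config
# when MP_SIZE > 1. Illustrative invocation (assumes a checkpoint already split into MP_SIZE
# partitions, e.g. with scripts/convert_ckpt_parallel.sh):
#   bash scripts/test_inference_parallel.sh 2 tests/test_prompt.txt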
2 | 3 | MP_SIZE=$1 4 | PROMPT_FILE=$2 5 | 6 | SCRIPT_PATH=$(realpath "$0") 7 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 8 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 9 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 10 | 11 | if [ -z "$MP_SIZE" ]; then 12 | MP_SIZE=1 13 | fi 14 | 15 | if [ "$MP_SIZE" -eq 1 ]; then 16 | source "$MAIN_DIR/configs/codegeex_13b.sh" 17 | echo "Load config from $MAIN_DIR/configs/codegeex_13b.sh" 18 | else 19 | source "$MAIN_DIR/configs/codegeex_13b_parallel.sh" 20 | echo "Load config from $MAIN_DIR/configs/codegeex_13b_parallel.sh" 21 | fi 22 | 23 | # export CUDA settings 24 | export CUDA_HOME=/usr/local/cuda-11.1/ 25 | # export CUDA_VISIBLE_DEVICES=0,1 26 | 27 | if [ -z "$PROMPT_FILE" ]; then 28 | PROMPT_FILE=$MAIN_DIR/tests/test_prompt.txt 29 | fi 30 | 31 | # remove --greedy if using sampling 32 | CMD="torchrun --nproc_per_node $MP_SIZE $MAIN_DIR/tests/test_inference_megatron.py \ 33 | --tensor-model-parallel-size $MP_SIZE \ 34 | --prompt-file $PROMPT_FILE \ 35 | --tokenizer-path $TOKENIZER_PATH \ 36 | --micro-batch-size 1 \ 37 | --out-seq-length 1024 \ 38 | --temperature 0.8 \ 39 | --top-p 0.95 \ 40 | --top-k 0 \ 41 | --greedy \ 42 | --use-cpu-initialization \ 43 | --ln-fp16 \ 44 | $MODEL_ARGS" 45 | 46 | echo "$CMD" 47 | eval "$CMD" 48 | -------------------------------------------------------------------------------- /scripts/test_inference_quantized.sh: -------------------------------------------------------------------------------- 1 | # This script is used to test the inference of CodeGeeX. 2 | 3 | GPU=$1 4 | PROMPT_FILE=$2 5 | 6 | SCRIPT_PATH=$(realpath "$0") 7 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 8 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 9 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 10 | 11 | # import model configuration 12 | source "$MAIN_DIR/configs/codegeex_13b.sh" 13 | 14 | # export CUDA settings 15 | if [ -z "$GPU" ]; then 16 | GPU=0 17 | fi 18 | 19 | export CUDA_HOME=/usr/local/cuda-11.1/ 20 | export CUDA_VISIBLE_DEVICES=$GPU 21 | 22 | if [ -z "$PROMPT_FILE" ]; then 23 | PROMPT_FILE=$MAIN_DIR/tests/test_prompt.txt 24 | fi 25 | 26 | # remove --greedy if using sampling 27 | CMD="python $MAIN_DIR/tests/test_inference.py \ 28 | --prompt-file $PROMPT_FILE \ 29 | --tokenizer-path $TOKENIZER_PATH \ 30 | --micro-batch-size 1 \ 31 | --out-seq-length 1024 \ 32 | --temperature 0.2 \ 33 | --top-p 0.95 \ 34 | --top-k 0 \ 35 | --quantize \ 36 | $MODEL_ARGS" 37 | 38 | echo "$CMD" 39 | eval "$CMD" 40 | -------------------------------------------------------------------------------- /scripts/translate_humaneval_x.sh: -------------------------------------------------------------------------------- 1 | # This script is used to translate solutions of HumanEval-X. 2 | 3 | LANG_SRC_TYPE=$1 # Source programming language, currently support one of ["python", "java", "cpp", "js", "go"] 4 | LANG_TGT_TYPE=$2 # Target programming language, currently support one of ["python", "java", "cpp", "js", "go"] 5 | OUTPUT_PATH=$3 # Output path of the generated programs. 
6 | HOSTLIST=$4 # Provide hostfile if generating distributedly 7 | 8 | SCRIPT_PATH=$(realpath "$0") 9 | SCRIPT_DIR=$(dirname "$SCRIPT_PATH") 10 | MAIN_DIR=$(dirname "$SCRIPT_DIR") 11 | TOKENIZER_PATH="$MAIN_DIR/codegeex/tokenizer/" 12 | 13 | # export CUDA settings 14 | export CUDA_HOME=/usr/local/cuda-11.1/ 15 | 16 | # import model configuration 17 | source "$MAIN_DIR/configs/codegeex_13b.sh" 18 | 19 | # nccl options 20 | OPTIONS_NCCL="export NCCL_DEBUG=warn; export NCCL_IB_DISABLE=0; export NCCL_IB_GID_INDEX=3" 21 | OPTIONS_PATH="export PATH=$PATH; export LD_LIBRARY_PATH=$LD_LIBRARY_PATH" 22 | CWD=$(pwd) 23 | 24 | # set master ip for zmq server 25 | if [ -z "$HOSTLIST" ]; then 26 | ZMQ_ADDR=$(hostname -i) 27 | echo "$ZMQ_ADDR" > "./hostfile" 28 | HOSTLIST="./hostfile" 29 | else 30 | ZMQ_ADDR=$(cat $HOSTLIST | head -n 1) 31 | fi 32 | echo "master_ip: $ZMQ_ADDR" 33 | 34 | NUM_SAMPLES=1 35 | MICRO_BSZ=1 36 | WORLD_SIZE=1 37 | TEMP=0.8 38 | TOPP=0.95 39 | SEED=42 40 | DATASET=humaneval 41 | TODAY=$(date +%y%m%d) 42 | CHANNEL_PORT=$(expr $RANDOM + 5000) 43 | MASTER_PORT=$(expr $RANDOM + 8000) 44 | 45 | # save log file 46 | LOG_DIR=$MAIN_DIR/log 47 | mkdir -p "$LOG_DIR" 48 | LOG_PATH="$LOG_DIR/$TODAY-translation.log" 49 | 50 | if [ -z "$LANG_SRC_TYPE" ] 51 | then 52 | LANG_SRC_TYPE=python 53 | fi 54 | 55 | if [ -z "$LANG_TGT_TYPE" ] 56 | then 57 | LANG_TGT_TYPE=java 58 | fi 59 | 60 | if [ -z "$INPUT_SRC_PATH" ] 61 | then 62 | INPUT_SRC_PATH=$MAIN_DIR/codegeex/benchmark/humaneval-x/$LANG_SRC_TYPE/data/humaneval_$LANG_SRC_TYPE.jsonl.gz 63 | fi 64 | 65 | if [ -z "$INPUT_TGT_PATH" ] 66 | then 67 | INPUT_TGT_PATH=$MAIN_DIR/codegeex/benchmark/humaneval-x/$LANG_TGT_TYPE/data/humaneval_$LANG_TGT_TYPE.jsonl.gz 68 | fi 69 | 70 | if [ -z "$OUTPUT_PATH" ]; then 71 | OUTPUT_PATH=$MAIN_DIR/codegeex/benchmark/output/humaneval-x/codegeex/ 72 | mkdir -p "$OUTPUT_PATH" 73 | fi 74 | 75 | JOB_ID=codegeex-ns$NUM_SAMPLES-t$TEMP-topp$TOPP-seed$SEED-$LANGUAGE 76 | 77 | RUN_CMD="python \ 78 | $MAIN_DIR/codegeex/benchmark/humaneval-x/translate_humaneval_x.py \ 79 | --hostfile $HOSTLIST \ 80 | --channel-ip $ZMQ_ADDR \ 81 | --channel-port $CHANNEL_PORT \ 82 | --master-port $MASTER_PORT \ 83 | --tokenizer-path $TOKENIZER_PATH \ 84 | --load-deepspeed \ 85 | --temperature $TEMP \ 86 | --top-p $TOPP \ 87 | --out-seq-length 1024 \ 88 | --micro-batch-size $MICRO_BSZ \ 89 | --samples-per-problem $NUM_SAMPLES \ 90 | --language-src-type $LANG_SRC_TYPE \ 91 | --language-tgt-type $LANG_TGT_TYPE \ 92 | --src-path $INPUT_SRC_PATH \ 93 | --tgt-path $INPUT_TGT_PATH \ 94 | --dataset $DATASET \ 95 | --output-prefix $OUTPUT_PATH/$JOB_ID \ 96 | --gen-node-world-size $WORLD_SIZE \ 97 | --seed $SEED \ 98 | $MODEL_ARGS" 99 | 100 | RUN_CMD="$OPTIONS_NCCL; $OPTIONS_PATH; $RUN_CMD" 101 | RUN_CMD="cd $CWD; $RUN_CMD" 102 | 103 | if (( WORLD_SIZE != 1 )); then 104 | RUN_CMD="pdsh -R ssh -w ^$HOSTLIST \"$RUN_CMD\"" 105 | fi 106 | 107 | echo "$RUN_CMD" 108 | echo "Writing log to $LOG_PATH" 109 | eval "$RUN_CMD" > "$LOG_PATH" 110 | bash $MAIN_DIR/scripts/gather_output.sh $OUTPUT_PATH $JOB_ID 1 111 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="codegeex", 5 | py_modules=["codegeex"], 6 | version="1.0", 7 | description="CodeGeeX: A Open Multilingual Code Generation Model.", 8 | author="Qinkai Zheng", 9 | packages=find_packages(), 10 | install_requires=[ 11 
| "fire>=0.4.0", 12 | "ipython>=8.4.0", 13 | "numpy>=1.22.0", 14 | "pandas>=1.3.5", 15 | "pyzmq>=23.2.1", 16 | "regex>=2022.3.15", 17 | "setuptools>=58.0.4", 18 | "transformers>=4.22.0", 19 | "tokenizers>=0.11.0", 20 | "torch>=1.10.0", 21 | "tqdm>=4.63.0", 22 | "cpm_kernels", 23 | "deepspeed>0.6.1", 24 | ], 25 | entry_points={} 26 | ) 27 | -------------------------------------------------------------------------------- /tests/test_inference.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import argparse 4 | import numpy as np 5 | 6 | import codegeex 7 | from codegeex.torch import CodeGeeXModel 8 | from codegeex.tokenizer import CodeGeeXTokenizer 9 | from codegeex.quantization import quantize 10 | 11 | 12 | def model_provider(args): 13 | """Build the model.""" 14 | 15 | model = CodeGeeXModel( 16 | args.hidden_size, 17 | args.num_layers, 18 | args.num_attention_heads, 19 | args.padded_vocab_size, 20 | args.max_position_embeddings 21 | ) 22 | 23 | return model 24 | 25 | 26 | def add_code_generation_args(parser): 27 | group = parser.add_argument_group(title="code generation") 28 | group.add_argument( 29 | "--num-layers", 30 | type=int, 31 | default=39, 32 | ) 33 | group.add_argument( 34 | "--hidden-size", 35 | type=int, 36 | default=5120, 37 | ) 38 | group.add_argument( 39 | "--num-attention-heads", 40 | type=int, 41 | default=40, 42 | ) 43 | group.add_argument( 44 | "--padded-vocab-size", 45 | type=int, 46 | default=52224, 47 | ) 48 | group.add_argument( 49 | "--max-position-embeddings", 50 | type=int, 51 | default=2048, 52 | ) 53 | group.add_argument( 54 | "--temperature", 55 | type=float, 56 | default=1.0, 57 | help="Sampling temperature.", 58 | ) 59 | group.add_argument( 60 | "--greedy", 61 | action="store_true", 62 | default=False, 63 | help="Use greedy sampling.", 64 | ) 65 | group.add_argument( 66 | "--top-p", 67 | type=float, 68 | default=0.0, 69 | help="Top p sampling.", 70 | ) 71 | group.add_argument( 72 | "--top-k", 73 | type=int, 74 | default=0, 75 | help="Top k sampling.", 76 | ) 77 | group.add_argument( 78 | "--out-seq-length", 79 | type=int, 80 | default=2048, 81 | help="Size of the output generated text.", 82 | ) 83 | group.add_argument( 84 | "--prompt-file", 85 | type=str, 86 | default="./test_prompt.txt", 87 | ) 88 | group.add_argument( 89 | "--tokenizer-path", 90 | type=str, 91 | default="./tokenizer", 92 | ) 93 | group.add_argument( 94 | "--load", 95 | type=str, 96 | ) 97 | group.add_argument( 98 | "--state-dict-path", 99 | type=str, 100 | ) 101 | group.add_argument( 102 | "--micro-batch-size", 103 | type=int, 104 | default=1, 105 | ) 106 | group.add_argument( 107 | "--quantize", 108 | action="store_true", 109 | ) 110 | group.add_argument( 111 | "--interative", 112 | action="store_true", 113 | ) 114 | 115 | return parser 116 | 117 | 118 | def main(): 119 | parser = argparse.ArgumentParser() 120 | parser = add_code_generation_args(parser) 121 | args, _ = parser.parse_known_args() 122 | 123 | print("Loading tokenizer ...") 124 | tokenizer = CodeGeeXTokenizer( 125 | tokenizer_path=args.tokenizer_path, 126 | mode="codegeex-13b") 127 | 128 | print("Loading state dict ...") 129 | state_dict = torch.load(args.load, map_location="cpu") 130 | state_dict = state_dict["module"] 131 | 132 | print("Building CodeGeeX model ...") 133 | model = model_provider(args) 134 | model.load_state_dict(state_dict) 135 | model.eval() 136 | model.half() 137 | if args.quantize: 138 | model = quantize(model, weight_bit_width=8, 
backend="torch") 139 | model.cuda() 140 | torch.cuda.synchronize() 141 | 142 | with open(args.prompt_file, "r") as f: 143 | prompt = f.readlines() 144 | prompt = "".join(prompt) 145 | 146 | out_seq_lengths = [args.out_seq_length] 147 | for out_seq_length in out_seq_lengths: 148 | print(f"Generating with out_seq_len {out_seq_length}...") 149 | while True: 150 | print("\nPlease Input Query (Ctrl-D to save multiple lines, 'stop' to exit) >>> ") 151 | prompts = [] 152 | while True: 153 | try: 154 | line = input() 155 | except EOFError: 156 | break 157 | prompts.append(line) 158 | prompt = "\n".join(prompts) 159 | prompt = prompt.strip() 160 | if not prompt: 161 | print('Query should not be empty!') 162 | continue 163 | if prompt == "stop": 164 | return 165 | try: 166 | t0 = time.perf_counter() 167 | generated_code = codegeex.generate( 168 | model, 169 | tokenizer, 170 | prompt, 171 | out_seq_length=out_seq_length, 172 | seq_length=args.max_position_embeddings, 173 | top_k=args.top_k, 174 | top_p=args.top_p, 175 | temperature=args.temperature, 176 | micro_batch_size=args.micro_batch_size, 177 | backend="megatron", 178 | verbose=True, 179 | ) 180 | t1 = time.perf_counter() 181 | print("Total generation time:", t1 - t0) 182 | except (ValueError, FileNotFoundError) as e: 183 | print(e) 184 | continue 185 | 186 | print("Generation finished.") 187 | 188 | 189 | if __name__ == "__main__": 190 | main() -------------------------------------------------------------------------------- /tests/test_prompt.txt: -------------------------------------------------------------------------------- 1 | code translation 2 | Java: 3 | public class Solution { 4 | public static boolean hasCloseElements(int[] nums, int threshold) { 5 | for (int i = 0; i < nums.length - 1; i++) { 6 | for (int j = i + 1; j < nums.length; j++) { 7 | if (Math.abs(nums[i] - nums[j]) < threshold) { 8 | return true; 9 | } 10 | } 11 | } 12 | return false; 13 | } 14 | } 15 | Python: 16 | -------------------------------------------------------------------------------- /vscode-extension/README_zh.md: -------------------------------------------------------------------------------- 1 | ![codegeex_logo](../resources/logo/codegeex_logo.png) 2 | 3 | 🌐 English 4 | 5 | ![CodeGeeX vscode extension version](https://img.shields.io/visual-studio-marketplace/v/aminer.codegeex?colorA=0B9FE0&colorB=brightgreen) 6 | ![CodeGeeX vscode extension last update](https://img.shields.io/visual-studio-marketplace/last-updated/aminer.codegeex?colorA=0B9FE0&colorB=brightgreen) 7 | ![CodeGeeX download](https://img.shields.io/visual-studio-marketplace/d/aminer.codegeex?colorA=0B9FE0&colorB=brightgreen) 8 | ![CodeGeeX vscode extension rating](https://img.shields.io/visual-studio-marketplace/stars/aminer.codegeex?colorA=0B9FE0&colorB=brightgreen) 9 | ![CodeGeeX github stars](https://img.shields.io/github/stars/THUDM/CodeGeeX?style=social) 10 | 11 | CodeGeeX是一个具有130亿参数的多编程语言代码生成预训练模型,使用超过二十种编程语言训练得到。基于CodeGeeX开发的插件可以实现通过描述生成代码、补全代码、代码翻译等一系列功能。CodeGeeX同样提供可以定制的**提示模式(Prompt Mode)**,构建专属的编程助手。Happy Coding! 12 | 13 | VS Code插件市场搜索"codegeex"即可免费使用(需要VS Code版本不低于1.68.0),更多关于CodeGeeX信息请见我们的[主页](https://models.aminer.cn/codegeex/) and [GitHub仓库](https://github.com/THUDM/CodeGeeX)。 14 | 15 | 如使用过程中遇到问题或有任何改进意见,欢迎发送邮件到[codegeex@aminer.cn](mailto:codegeex@aminer.cn)反馈! 
16 | 17 | - [基本用法](#基本用法) 18 | - [隐私声明](#隐私声明) 19 | - [使用指南](#使用指南) 20 | - [隐匿模式](#隐匿模式) 21 | - [交互模式](#交互模式) 22 | - [翻译模式](#翻译模式) 23 | - [提示模式(实验功能)](#提示模式实验功能) 24 | 25 | ## 基本用法 26 | 需要保证VS Code版本 >= 1.68.0。安装插件并全局激活CodeGeeX,有以下四种使用模式: 27 | 28 | - **隐匿模式**: 保持CodeGeeX处于激活状态,当您停止输入时,会从当前光标处开始生成(右下角CodeGeeX图标转圈表示正在生成)。 生成完毕之后会以灰色显示,按``Tab``即可插入生成结果。 29 | - **交互模式**: 按``Ctrl+Enter``激活交互模式,CodeGeeX将生成``X``个候选,并显示在右侧窗口中(``X`` 数量可以在设置的``Candidate Num``中修改)。 点击候选代码上方的``use code``即可插入。 30 | - **翻译模式**: 选择代码,然后按下``Ctrl+Alt+T``激活翻译模式,CodeGeeX会把该代码翻译成匹配您当前编辑器语言的代码。点击翻译结果上方的``use code``插入。您还可以在设置中选择您希望插入的时候如何处理被翻译的代码,您可以选择注释它们或者覆盖它们。 31 | - **提示模式(实验功能)**: 选择需要作为输入的代码,按``Alt/Option+t``触发提示模式,会显示预定义模板列表,选择其中一个模板,即可将代码插入到模板中进行生成。 这个模式高度自定义,可以在设置中 ``Prompt Templates``修改或添加模板内容,为模型加入额外的提示。 32 | 33 | ## 隐私声明 34 | 35 | 我们高度尊重用户代码的隐私,代码仅用来辅助编程。在您第一次使用时,我们会询问您是否同意将生成的代码用于研究用途,帮助CodeGeeX变得更好(该选项默认**关闭**)。 36 | ## 使用指南 37 | 38 | 以下是CodeGeeX几种模式的详细用法: 39 | 40 | ### 隐匿模式 41 | 42 | 在该模式中,CodeGeeX将在您停止输入时,从光标处开始生成(右下角CodeGeeX图标转圈表示正在生成)。生成完毕之后会以灰色显示,按``Tab``即可插入生成结果。 在生成多个候选的情况下,可以使用``Alt/Option+[`` 或 ``]``在几个候选间进行切换。如果你对现有建议不满意,可以使用``Alt/Option+N``去获得新的候选。可以在设置中改变``Candidate Num``(增加个数会导致生成速度相对变慢)。**注意**:生成总是从当前光标位置开始,如果您在生成结束前移动光标位置,可能会导致一些bugs。我们正在努力使生成速度变得更快以提升用户体验。 43 | 44 | ![image](https://lfs.aminer.cn/misc/wangshan/pretrain/codegeex/bubble_sort_go.gif) 45 | 46 | ### 交互模式 47 | 48 | 在该模式中,按``Ctrl+Enter``激活交互模式,CodeGeeX将生成``X``个候选,并显示在右侧窗口中(``X`` 数量可以在设置的``Candidate Num``中修改)。 点击候选代码上方的``use code``即可插入结果到为当前光标位置。 49 | 50 | ![image](https://lfs.aminer.cn/misc/wangshan/pretrain/codegeex/interactive_mode2.gif) 51 | 52 | ### 翻译模式 53 | 54 | 在当前的语言的文本编辑器中输入或者粘贴其他语言的代码,您用鼠标选择这些代码,然后按下``Ctrl+Alt+T``激活翻译模式,您根据提示选择该代码的语言,然后CodeGeeX会帮您把该代码翻译成匹配您当前编辑器语言的代码。点击翻译结果上方的``use code``即可插入。您还可以在设置中选择您希望插入的时候如何处理被翻译的代码,您可以选择注释它们或者覆盖它们。 55 | 56 | ![image](https://lfs.aminer.cn/misc/wangshan/pretrain/codegeex/translation_cpp_to_python.gif) 57 | 58 | ### 提示模式(实验功能) 59 | 60 | 在该模式中,您可以在输入中添加额外的提示来实现一些有趣的功能,包括并不限于代码解释、概括、以特定风格生成等。该模式的原理是利用了CodeGeeX强大的少样本生成能力。当您在输入中提供一些例子时,CodeGeeX会模仿这些例子并实现相应的功能。比如,您可以自定义模板中提供一段逐行解释代码的例子。选择您想要解释的代码,按``Alt/Option+t``触发提示模式,选择您写好的模板(如``explanation``),CodeGeeX就会解释您输入的代码。以下我们会详细介绍如何制作模板。 61 | 62 | ![image](https://lfs.aminer.cn/misc/wangshan/pretrain/codegeex/explanation_python.gif) 63 | 64 | 上述例子中的模板如下图所示,由``[示例代码]``, ````, ``[带解释的示例代码]`` and ``[输出函数头]`` 。````表示您选中的代码将会插入的位置。 ```` 这一句用来保证模型解释的是同一个函数。当使用提示模式时,CodeGeeX会将您选择的代码(插入到部分)和模板代码相结合,一起作为模型的输入。 65 | 66 | ```python 67 | # language: Python 68 | 69 | def sum_squares(lst): 70 | sum = 0 71 | for i in range(len(lst)): 72 | if i % 3 == 0: 73 | lst[i] = lst[i]**2 74 | elif i % 4 == 0: 75 | lst[i] = lst[i]**3 76 | sum += lst[i] 77 | return sum 78 | 79 | 80 | 81 | # Explain the code line by line 82 | def sum_squares(lst): 83 | # initialize sum 84 | sum = 0 85 | # loop through the list 86 | for i in range(len(lst)): 87 | # if the index is a multiple of 3 88 | if i % 3 == 0: 89 | # square the entry 90 | lst[i] = lst[i]**2 91 | # if the index is a multiple of 4 92 | elif i % 4 == 0: 93 | # cube the entry 94 | lst[i] = lst[i]**3 95 | # add the entry to the sum 96 | sum += lst[i] 97 | # return the sum 98 | return sum 99 | 100 | # Explain the code line by line 101 | 102 | ``` 103 | 104 | 以下是另一个Python文档字符串生成的例子,CodeGeeX在您写新函数时会模仿该注释的格式: 105 | ```python 106 | def add_binary(a, b): 107 | ''' 108 | Returns the sum of two decimal numbers in binary digits. 
109 | 110 | Parameters: 111 | a (int): A decimal integer 112 | b (int): Another decimal integer 113 | 114 | Returns: 115 | binary_sum (str): Binary string of the sum of a and b 116 | ''' 117 | binary_sum = bin(a+b)[2:] 118 | return binary_sum 119 | 120 | 121 | ``` 122 | 123 | 模板文件是高度自定义化的,您可以将自定义模板添加到插件设置中的``Prompt Templates``中。 ``key``表示模板的名字, ``value``是模板文件的路径(可以是您电脑上的任一路径,``.txt``, ``.py``, ``.h``, 等格式文件均可)。通过该功能,您可以让CodeGeeX生成具有特定风格或功能的代码,快尝试定义自己的专属模板吧! --------------------------------------------------------------------------------