├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── missing-translation.md └── zh │ ├── docs │ ├── benchmark │ │ ├── hpcg.md │ │ ├── hpl.md │ │ ├── intro.md │ │ └── mlperf.md │ ├── communication │ │ ├── cost.md │ │ ├── deadlock.md │ │ ├── intro.md │ │ └── model.md │ ├── competition │ │ ├── asc.md │ │ ├── competition-intro.md │ │ ├── hpcgame.md │ │ ├── isc.md │ │ ├── other.md │ │ └── sc.md │ ├── contribute │ │ ├── before-contributing.md │ │ ├── docs-organization.md │ │ └── docs-style.md │ ├── gpu │ │ ├── arch.md │ │ ├── cuda-advanced.md │ │ ├── cuda.md │ │ ├── hip.md │ │ ├── images │ │ │ └── cuda │ │ │ │ ├── cpu-gpu-arch-diff.png │ │ │ │ ├── cuda-kernel-index.png │ │ │ │ ├── kernel-execution-on-gpu-1.png │ │ │ │ └── memory-hierarchy-in-gpus-1.png │ │ ├── intro.md │ │ ├── nccl.md │ │ ├── nsys.md │ │ ├── openacc.md │ │ └── opencl.md │ ├── hardware │ │ ├── fpga-and-asics.md │ │ ├── gpu.md │ │ ├── hardware-intro.md │ │ ├── interconnect.md │ │ ├── memory.md │ │ ├── processor.md │ │ └── storage.md │ ├── hpc-intro │ │ ├── hpc-history.md │ │ ├── modern-hpc.md │ │ └── what-is-hpc.md │ ├── index.md │ ├── memory-model │ │ ├── cache.md │ │ ├── consistency.md │ │ ├── intro.md │ │ └── numa.md │ ├── misc │ │ ├── faq.md │ │ ├── git.md │ │ └── shell.md │ ├── parallel-programming │ │ ├── images │ │ │ └── intro-cpuperf.svg │ │ ├── mpi.md │ │ ├── mpi4py.md │ │ ├── openmp.md │ │ └── parallel-programming-intro.md │ ├── performance-analysis │ │ ├── basics.md │ │ ├── intro.md │ │ ├── nsys.md │ │ └── vtune.md │ ├── platform │ │ ├── cloud.md │ │ ├── cluster.md │ │ ├── modules.md │ │ ├── platform-intro.md │ │ └── scheduling.md │ ├── power-management │ │ ├── intro.md │ │ └── perception.md │ ├── sci-mlsys │ │ ├── intro.md │ │ ├── parallelism.md │ │ ├── quantization.md │ │ └── sparsity.md │ ├── static │ │ ├── css │ │ │ └── extra.css │ │ ├── img │ │ │ └── logo.png │ │ └── js │ │ │ └── extra.js │ └── thread-process │ │ ├── intro.md │ │ ├── more.md │ │ ├── process.md │ │ └── 
thread.md │ ├── mkdocs.yml │ └── overrides │ └── .gitignore ├── requirements.txt └── scripts ├── createfiles.py └── docs.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _build/ 2 | docs_build/ 3 | site/ 4 | .vscode/ 5 | **/.DS_Store 6 | .env -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | # Attribution-NonCommercial-ShareAlike 4.0 International 2 | 3 | Creative Commons Corporation (“Creative Commons”) is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an “as-is” basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible. 
4 | 5 | ### Using Creative Commons Public Licenses 6 | 7 | Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses. 8 | 9 | * __Considerations for licensors:__ Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC-licensed material, or material used under an exception or limitation to copyright. [More considerations for licensors](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensors). 10 | 11 | * __Considerations for the public:__ By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor’s permission is not necessary for any reason–for example, because of any applicable exception or limitation to copyright–then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described. 
Although not required by our licenses, you are encouraged to respect those requests where reasonable. [More considerations for the public](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensees). 12 | 13 | ## Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License 14 | 15 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. 16 | 17 | ### Section 1 – Definitions. 18 | 19 | a. __Adapted Material__ means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image. 20 | 21 | b. __Adapter's License__ means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License. 22 | 23 | c. __BY-NC-SA Compatible License__ means a license listed at [creativecommons.org/compatiblelicenses](http://creativecommons.org/compatiblelicenses), approved by Creative Commons as essentially the equivalent of this Public License. 
24 | 25 | d. __Copyright and Similar Rights__ means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. 26 | 27 | e. __Effective Technological Measures__ means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements. 28 | 29 | f. __Exceptions and Limitations__ means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material. 30 | 31 | g. __License Elements__ means the license attributes listed in the name of a Creative Commons Public License. The License Elements of this Public License are Attribution, NonCommercial, and ShareAlike. 32 | 33 | h. __Licensed Material__ means the artistic or literary work, database, or other material to which the Licensor applied this Public License. 34 | 35 | i. __Licensed Rights__ means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license. 36 | 37 | j. __Licensor__ means the individual(s) or entity(ies) granting rights under this Public License. 38 | 39 | k. __NonCommercial__ means not primarily intended for or directed towards commercial advantage or monetary compensation. 
For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange. 40 | 41 | l. __Share__ means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them. 42 | 43 | m. __Sui Generis Database Rights__ means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world. 44 | 45 | n. __You__ means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning. 46 | 47 | ### Section 2 – Scope. 48 | 49 | a. ___License grant.___ 50 | 51 | 1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to: 52 | 53 | A. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and 54 | 55 | B. produce, reproduce, and Share Adapted Material for NonCommercial purposes only. 56 | 57 | 2. __Exceptions and Limitations.__ For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions. 58 | 59 | 3. __Term.__ The term of this Public License is specified in Section 6(a). 
60 | 61 | 4. __Media and formats; technical modifications allowed.__ The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material. 62 | 63 | 5. __Downstream recipients.__ 64 | 65 | A. __Offer from the Licensor – Licensed Material.__ Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License. 66 | 67 | B. __Additional offer from the Licensor – Adapted Material.__ Every recipient of Adapted Material from You automatically receives an offer from the Licensor to exercise the Licensed Rights in the Adapted Material under the conditions of the Adapter’s License You apply. 68 | 69 | C. __No downstream restrictions.__ You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material. 70 | 71 | 6. __No endorsement.__ Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i). 72 | 73 | b. ___Other rights.___ 74 | 75 | 1. 
Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise. 76 | 77 | 2. Patent and trademark rights are not licensed under this Public License. 78 | 79 | 3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes. 80 | 81 | ### Section 3 – License Conditions. 82 | 83 | Your exercise of the Licensed Rights is expressly made subject to the following conditions. 84 | 85 | a. ___Attribution.___ 86 | 87 | 1. If You Share the Licensed Material (including in modified form), You must: 88 | 89 | A. retain the following if it is supplied by the Licensor with the Licensed Material: 90 | 91 | i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated); 92 | 93 | ii. a copyright notice; 94 | 95 | iii. a notice that refers to this Public License; 96 | 97 | iv. a notice that refers to the disclaimer of warranties; 98 | 99 | v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable; 100 | 101 | B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and 102 | 103 | C. 
indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License. 104 | 105 | 2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information. 106 | 107 | 3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable. 108 | 109 | b. ___ShareAlike.___ 110 | 111 | In addition to the conditions in Section 3(a), if You Share Adapted Material You produce, the following conditions also apply. 112 | 113 | 1. The Adapter’s License You apply must be a Creative Commons license with the same License Elements, this version or later, or a BY-NC-SA Compatible License. 114 | 115 | 2. You must include the text of, or the URI or hyperlink to, the Adapter's License You apply. You may satisfy this condition in any reasonable manner based on the medium, means, and context in which You Share Adapted Material. 116 | 117 | 3. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, Adapted Material that restrict exercise of the rights granted under the Adapter's License You apply. 118 | 119 | ### Section 4 – Sui Generis Database Rights. 120 | 121 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: 122 | 123 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only; 124 | 125 | b. 
if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material, including for purposes of Section 3(b); and 126 | 127 | c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database. 128 | 129 | For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. 130 | 131 | ### Section 5 – Disclaimer of Warranties and Limitation of Liability. 132 | 133 | a. __Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.__ 134 | 135 | b. __To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.__ 136 | 137 | c. 
The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. 138 | 139 | ### Section 6 – Term and Termination. 140 | 141 | a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically. 142 | 143 | b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates: 144 | 145 | 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or 146 | 147 | 2. upon express reinstatement by the Licensor. 148 | 149 | For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License. 150 | 151 | c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License. 152 | 153 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License. 154 | 155 | ### Section 7 – Other Terms and Conditions. 156 | 157 | a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. 158 | 159 | b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License. 160 | 161 | ### Section 8 – Interpretation. 162 | 163 | a. 
For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License. 164 | 165 | b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions. 166 | 167 | c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor. 168 | 169 | d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority. 170 | 171 | > Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at [creativecommons.org/policies](http://creativecommons.org/policies), Creative Commons does not authorize the use of the trademark “Creative Commons” or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses. 
172 | > 173 | > Creative Commons may be contacted at creativecommons.org 174 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HPC Wiki 2 | 3 | HPC 比赛,通常指的是高性能计算相关的比赛,主要形式包括以 `SCC`、`ISC`和`ASC`为代表的学生集群竞赛,和以 `PKU HPCGame`为代表的高性能计算挑战赛。比赛要求选手在规定时间、规定功耗或成本范围内解决高性能计算相关问题,并尽可能提高问题的解决效率。比赛对选手在并行程序设计、软硬件协同优化、计算机体系结构理解与运用、临场问题处理以及团队协作等诸多方面都有很高的要求。 4 | 5 | 全国高校范围内,有大约200所学校建有超算队。大多数队伍都有自己的文档库用来培养新队员,这些文档库中的内容大多数是相同的,而且限于超算队的规模,文档库的内容也很难得到及时的更新。为此,我们共同建设 **HPC Wiki**,提高文档质量和内容丰富度,让更多的同学能够更快地学习到高性能计算相关的知识,从而更好地参与到 HPC 比赛中。希望能够减少“重复建造轮子”的现象,让大家能够更好地利用时间做更有意义的事情。 6 | 7 | **HPC Wiki** 源于社区,由北京大学学生 Linux 俱乐部长期运营和维护,将始终保持**独立自由**的性质,采取`cc-by-nc-sa`的知识共享许可协议,绝不会商业化。 8 | 9 | ## How to build? 10 | 11 | 本文档目前采用 [mkdocs](https://github.com/mkdocs/mkdocs) 部署在 [https://hpcwiki.io](https://hpcwiki.io)。 12 | 13 | 本项目可以直接部署在本地,具体方式如下: 14 | 15 | ```shell 16 | # 1. clone 17 | git clone https://github.com/lcpu-club/hpc-wiki.git 18 | # 2. requirements 19 | pip install -r requirements.txt 20 | # generate static file in site/ 21 | python3 scripts/docs.py build-all 22 | # deploy at http://127.0.0.1:8008 23 | python3 scripts/docs.py serve 24 | ``` 25 | 26 | **mkdocs 本地部署的网站是动态更新的,即当你修改并保存 md 文件后,刷新页面就能随之动态更新。** 27 | 28 | 29 | 在阅读 Wiki 之前,这里有一些小建议: 30 | 31 | - 学习 [提问的智慧](https://github.com/ryanhanwu/How-To-Ask-Questions-The-Smart-Way) 32 | - 善用 Google 搜索能帮助你更好地提升自己 33 | - 至少掌握一门编程语言,比如 Python 34 | - 动手实践比什么都要管用 35 | - 保持对技术的好奇与渴望并坚持下去 36 | 37 | ## 特别鸣谢 38 | 39 | 本项目受 [CTF Wiki](https://ctf-wiki.org/) 和 [OI Wiki](https://oi-wiki.org/) 的启发,同时在编写过程中参考了很多资料,特别鸣谢以下项目: 40 | - 上海科技大学 GeekPie 社区的 [GeekPie_HPC Wiki](https://hpc.geekpie.club/wiki/index.html) 41 | - 东南大学超算团队的 [asc-wiki](https://asc-wiki.com) 42 | - 北京大学学生 Linux 俱乐部的 [HPC from Scratch 项目](https://wiki.lcpu.dev/zh/hpc/from-scratch/arrange) 43 | 44 | ## Copyleft 45 | 知识共享许可协议
本作品采用知识共享署名-非商业性使用-相同方式共享 4.0 国际许可协议进行许可。 -------------------------------------------------------------------------------- /docs/missing-translation.md: -------------------------------------------------------------------------------- 1 | !!! warning 2 | The current page still doesn't have a translation for this language. 3 | 4 | You can read it through Google Translate. 5 | 6 | Besides, you can also help to translate it: [Contributing](https://hpcwiki.io/contribute/before-contributing/). 7 | 8 | -------------------------------------------------------------------------------- /docs/zh/docs/benchmark/hpcg.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/benchmark/hpl.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/benchmark/intro.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/benchmark/mlperf.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/communication/cost.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/communication/deadlock.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/communication/intro.md: -------------------------------------------------------------------------------- 1 | TODO 
-------------------------------------------------------------------------------- /docs/zh/docs/communication/model.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/competition/asc.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/competition/competition-intro.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/competition/hpcgame.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/competition/isc.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/competition/other.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/competition/sc.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/contribute/before-contributing.md: -------------------------------------------------------------------------------- 1 | # 贡献之前 2 | 3 | 在向项目提交贡献之前,请确保您已经阅读了以下内容: 4 | 5 | - [文档组织方式](./docs-organization.md):了解文档的组织方式,以及如何添加或修改文档。 6 | - [文档风格规范](./docs-style.md):了解文档的编写规范,以及编写文档的一些常见思路 7 | 8 | 同时,本 Wiki 使用 `知识共享署名-非商业性使用-相同方式共享 4.0 国际许可协议` 进行许可,所有提交到本仓库的内容均遵循该协议,包括但不限于文档、代码、图片等。在向项目提交贡献之前,请确保您已经阅读并同意该协议的内容。 9 | 10 | 
对于引用或改编自其他来源的内容,我们将在文档中进行明确标注,同时在文档末尾提供引用来源。如果您认为我们的文档中存在侵权行为,请通过 [issue](https://github.com/lcpu-club/hpc-wiki/issues/new)进行反馈,我们将在核实后尽快删除相关内容。 -------------------------------------------------------------------------------- /docs/zh/docs/contribute/docs-organization.md: -------------------------------------------------------------------------------- 1 | # 文档组织方式 2 | 3 | ## 部署方式 4 | 5 | HPC Wiki 使用 [MkDocs](https://www.mkdocs.org/) 作为文档生成工具,使用 [Material for MkDocs](https://squidfunk.github.io/mkdocs-material/) 作为主题。文档源码托管在 [GitHub](https://github.com/lcpu-club/hpc-wiki),使用 Cloudflare Pages 进行自动部署。所有对`main`分支的更改都会在几分钟内同步到网站上。 6 | 7 | 部署方式如下: 8 | 9 | ```bash 10 | # 1. clone 11 | git clone https://github.com/lcpu-club/hpc-wiki.git 12 | # 2. requirements 13 | pip install -r requirements.txt 14 | # generate static file in site/ 15 | python3 scripts/docs.py build-all 16 | # deploy at http://127.0.0.1:8008 17 | python3 scripts/docs.py serve # or just python3 -m http.server --directory site 18 | ``` 19 | 20 | ## 文档组织方式 21 | 22 | 本项目的仓库结构如下:(省略了不重要的部分) 23 | 24 | ``` 25 | -- docs 26 | |-- zh 27 | | |--overrides 28 | | |-- docs 29 | | | |-- mkdocs.yml 30 | | | |-- index.md 31 | | | |-- contribute 32 | | | | |-- docs-organization.md 33 | | | | |-- images 34 | | | | |-- docs-organization.png 35 | | | |-- other topics 36 | | | |-- ... 37 | |-- en 38 | |-- missing-translation.md 39 | -- scripts 40 | ``` 41 | 42 | 具体来说,只需要关注 `docs` 和 `scripts`两个文件夹。`docs`中是按语言分类的内容,`scripts`中是一些辅助脚本。我们目前没有多语言支持的计划,所以只有`zh`文件夹。每个语言文件夹中,`mkdocs.yml`是MkDocs的配置文件,`docs`文件夹中是文档源码,`overrides`文件夹中是一些覆盖文件,用于修改主题的一些默认设置,但我们目前也没有启用。 43 | 44 | `docs`中的文档按照主题分类,每个主题一个文件夹,每个文件夹下的一个`md`文件都是一篇文章。同时,我们约定,将所有文档的图片放置于同文件夹下`image`文件夹的`文档标题`子文件夹中。 45 | 46 | ## 举例说明 47 | 48 | ### 如何增加一篇新文章 49 | 50 | 假设我们要增加一篇关于 FPGA 硬件特性介绍的文章,我们需要做以下几步: 51 | 52 | 1. 分类与定位:我们需要将这篇文章放置于哪个主题下?我们可以将其放置于`Hardware`主题下,也可以新建一个主题,比如`FPGA`,然后将其放置于`FPGA`主题下。这里我们选择将其放置于`Hardware`主题下。 53 | 2. 
创建新文件:在`docs/zh/docs/hardware`文件夹下,创建一个新的`md`文件,命名为`fpga.md`。 54 | 3. 编写文章:在`fpga.md`中,编写文章内容。文章的格式使用Markdown语法,具体请参考[Markdown 语法说明](https://www.markdown.xyz/basic-syntax/)。文章在导航栏中的标题是文章第一个一级标题决定的,所以请在文章开头使用一级标题。 55 | 4. 添加图片:如果文章中需要插入图片,请将图片放置于`docs/zh/docs/hardware/image`文件夹下,并在文章中使用相对路径引用图片。例如,如果我们在`fpga.md`中需要引用`fpga.png`,则可以使用`![fpga](image/fpga.png)`来引用图片。 56 | 5. 添加索引:在`docs/zh/docs/hardware/index.md`中,添加一行`- fpga.md`,这样就可以在导航栏中添加对`fpga.md`的链接了。 57 | 6. 本地预览:在`docs`文件夹下,运行部署命令(参考上文),在本地预览效果。 58 | 7. 提交更改:将更改以PR的形式提交到仓库,在由两位同学审读后,即可合并到`main`分支,网站将在几分钟内自动更新。 59 | 8. 完成:至此,我们就完成了一篇新文章的添加。 60 | 61 | ### 如何修改一篇文章 62 | 63 | 假设我们要修改`docs/zh/docs/hardware/fpga.md`这篇文章,我们需要做以下几步: 64 | 65 | 1. 修改文章:在`fpga.md`中,修改文章内容。 66 | 2. 本地预览:在`docs`文件夹下,运行部署命令(参考上文),在本地预览效果。 67 | 3. 提交更改:将更改以PR的形式提交到仓库,在由两位同学审读后,即可合并到`main`分支,网站将在几分钟内自动更新。 68 | 4. 完成:至此,我们就完成了一篇文章的修改。 69 | 70 | 71 | -------------------------------------------------------------------------------- /docs/zh/docs/contribute/docs-style.md: -------------------------------------------------------------------------------- 1 | # 文档风格规范 2 | 3 | 每一个 Wiki 都有自己的文档风格,我们希望 Wiki 这一个整体在风格上是统一的,这样不仅方便读者阅读与学习,也可以帮助贡献者更好地编写文档。 4 | 5 | 以下是本文档的风格指南。好的风格指南需要在编写中不断积累、不断讨论,因此本文档也会不断更新。我们把[CUDA编程基础](https://hpcwiki.io/gpu/cuda/)作为本文的示例并定期修订,以便更好地说明风格指南,您可以参考该文档进行写作。 6 | 7 | ## 目标受众 8 | 9 | Wiki 的目标受众是刚刚接触 HPC 的新手,因此我们的文档应该尽可能地简单易懂,尽可能地展示 HPC 的 `Big Picture`。在覆盖必要的、不容易掌握的细节的同时,应鼓励读者自行探索其他细节。对于 HPC 应用方面的高级文档(如`MLSys`方面的文档),我们可以认为读者已经具备一定的 HPC 基础知识,可以以更加专业的方式进行阐述。 10 | 11 | ## 行文结构 12 | 13 | Wiki 的文档应该具有良好的行文结构,以便读者快速地获取信息。我们推荐使用以下结构: 14 | 15 | - **简介**:简介部分应该包含对该文档的简要介绍,以及该文档的目标受众。简介部分应该尽可能地简短,以便读者快速地了解该文档的内容。 16 | - **背景**:背景部分应该包含该文档所涉及的背景知识,以及该文档所涉及的相关概念。背景部分应该尽可能地简短,以便读者快速地了解该文档所涉及的背景知识。 17 | - **内容**:内容部分应该包含该文档的主要内容,以及该文档的主要内容的详细介绍。内容部分应该尽可能地详细。 18 | - **总结**:总结部分应简短地总结该文档的主要内容,以及该文档的主要内容的重点。 19 | - **任务**:对于教程类的文档,我们原创或者整理推荐一些适合练手的项目,以便读者巩固所学的知识。项目预计用时以2~4小时为宜。 
-------------------------------------------------------------------------------- /docs/zh/docs/gpu/arch.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/gpu/cuda-advanced.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/gpu/cuda.md: -------------------------------------------------------------------------------- 1 | # CUDA 编程入门 2 | 3 | Introduction to CUDA Programming: From Correctness to Performance 4 | 5 | 本文改编自北京大学超算队CUDA教程讲义,原作者为interestingLSY。 6 | 7 | # Overview 8 | 9 | 本文将从 GPU 的结构与 CUDA 的基本概念出发,带领大家写出自己的第一个正确的 CUDA 程序,并展示一些基本的优化技巧,带领大家优化自己的 CUDA 程序(正如标题所示,From Correctness to Performance)。 10 | 11 | 本文分为三部分:Part 0 简要介绍了为什么 GPU 能在许多任务上取得千倍的加速比;Part 1 介绍了 GPU 编程的基本概念,以及如何写出第一个 CUDA 程序;Part 2 则是一些基本的优化技巧。示例代码、课后作业所在的 git 仓库位于 https://github.com/interestingLSY/CUDA-From-Correctness-To-Performance-Code 12 | 13 | > Aside | 拓展内容:本文中标注了 "Aside" 的内容为拓展内容,我们认为他们比较有趣,但与本文主线关联不大。 14 | 15 | # Introduction 16 | 17 | ## Why GPU? 18 | 19 | 使用过 `PyTorch`等机器学习框架的同学应该体会过,同一个神经网络在 GPU 上的运行速度可能是 CPU 的数十倍甚至数百倍。因为GPU 底层的硬件设计使它特别擅长于执行逻辑简单但是并行程度极高的计算任务。这样的任务与机器学习的负载高度重合,包括: 20 | 21 | - 向量加法 22 | - 矩阵乘法 23 | - 有限元解偏微分方程 24 | - 求出一个向量中所有元素的和 25 | - ...
26 | 27 | GPU 能在此类任务上取得如此高的性能提升,主要是在设计哲学上与 CPU 有显著不同。CPU 的设计目标是让单个线程执行得更快,降低串行任务完成的延迟 (latency),所以CPU 把大量的晶体管耗费在了分支预测、乱序执行等控制单元上,分配给运算单元(ALU 等)的晶体管较少。 28 | 29 | 而 GPU 的设计哲学是吞吐量(throughput)比延迟更加重要。也就是说,GPU 的设计目标是让大量的数据可以同时被处理。GPU 使用大量的晶体管来堆砌大量的运算单元,通过同时让许多运算单元共享同一个控制单元,以节约晶体管。对于分支预测、乱序执行等控制单元,GPU 采用了“不预测、不乱序”的策略。这样的设计使得 GPU 在执行逻辑简单、并行度高的任务时能够发挥出其最大的性能。虽然 GPU 上单条数据处理的时间可能比用 CPU 更长,但 GPU 单位时间内处理完成的数据要要多得多。 30 | 31 | 我们打个比方:如果我们现在想证明一个复杂的数学命题,那么是一位陈景润解得快,还是 100 名大一学生解得快?大概率是前者,因为陈景润先生有着深厚的功底与丰富的经验,并且“证明命题”这一过程很难并行。 32 | 33 | 但如果我们现在想要计算 10000 道 100 以内的乘除法呢?那么大概率是 100 名大一学生算得快。因为,虽然一名大一学生计算一道 100 以内的乘除法的速度比不上陈景润先生计算一道 100 以内的乘除法的速度,但100 名大一学生一起工作,速度一定会比一位陈景润先生要快。 34 | 35 | CPU 与 GPU 的区别就好像上文中的一位陈景润先生与 100 名大一学生的区别。CPU 适合执行逻辑复杂、并行度低的任务,GPU 适合执行逻辑简单、并行度高的任务。以向量加法为例,它逻辑很简单(只需要把两个向量的对应位置加起来即可),且并行度极高(可以同时计算输出向量每个位置上的结果)。如果使用 CPU,那么我需要依次计算输出向量每个位置的结果;但如果使用 GPU,我可以同时计算输出向量每个位置的结果,进而大大提高了速度。 36 | 37 | ![CPU 与 GPU 的结构区别](images/cuda/cpu-gpu-arch-diff.png) 38 | 39 | *Image source: https://developer.nvidia.com/blog/cuda-refresher-reviewing-the-origins-of-gpu-computing/* 40 | 41 | > Aside | Intel Xeon Phi: CPU 是”少量大核心“,适合执行串行任务;GPU 是”大量小核心“,适合执行逻辑简单、并行度大的任务;那么”中量中核心“会有怎样的效果呢?感兴趣的同学可以搜索一下 Intel Xeon Phi。 42 | 43 | 这里有一个形象的解释 CPU 与 GPU 工作原理区别的视频:[Link](https://www.bilibili.com/video/BV1ry4y1y7KZ)。 44 | 45 | ## 为什么 GPU 会这样设计 46 | 47 | 可是... GPU,Graphic Processing Unit,“显卡”,原本不是用来处理图像的吗?为什么它会这么设计? 
48 | 49 | 我们先来考虑 3D 游戏画面的渲染管线。我们可以将这个管线分为三个部分:Vertex Mapping, Fragment Generation 与 Fragment Merging。 50 | 51 | 在很久之前,每一家 GPU(当时还叫“图形加速卡”)的厂商的做法都是:让一部分电路专门负责 Vertex Mapping,一部分电路专门负责 Fragment Generation,一部分电路专门负责 Fragment Merging。但是,这样做有一个问题:每一款游戏对不同的处理步骤的负载是不同的,比如游戏 A 可能给 Vertex Mapping 单元的负载较高,导致其成为瓶颈,同时其他两部分电路有空闲;游戏 B 可能给 Fragment Generation 的负载较高,导致其成为瓶颈。 52 | 53 | 在 2006 年,NVIDIA 推翻了传统而设计,发布了一个革命性的 GPU 架构 - Tesla。在 Tesla 架构中,没有了专门负责处理某一个步骤的硬件单元,取而代之的则是 Stream Multiprocessor (SM) 。每个 SM 都像一个小型 CPU 一样,可以执行其支持的指令集中的任何程序。这也就代表着,每一个 SM 都有能力执行渲染管线中的每个部分。这种设计避免了某一个步骤成为瓶颈而其他步骤的运算单元闲置的情况:我只需要根据不同的游戏负载,为每个步骤分配一定数量的 SM 即可。 54 | 55 | 同时,NVIDIA 发现,Tesla 架构不仅可以执行图像渲染方面的计算,其在通用计算方面也很有潜力(毕竟,SM 可以执行任何指令集支持的指令)。所以,Nvidia 也在同时发布了一套可以在 GPU 上执行通用计算的工具链 —— Compute Unified Device Architecture (CUDA)。从此,GPU 成为了 General-Purpose Graphics Processing Unit (GPGPU)。 56 | 57 | > Aside: 想要深入了解 NVIDIA 的 GPU 发展史的同学可以看[这篇文章](https://fabiensanglard.net/cuda/)与 Bilibili UP 主“极客湾”的“显卡发展史系列”:[1](https://www.bilibili.com/video/BV1Hb41177JB) [2](https://www.bilibili.com/video/BV1C4411J7cR) [3](https://www.bilibili.com/video/BV1YJ411h7aY)。 58 | 59 | ## Takeaway 60 | 61 | - CPU 的每个核心都很“大”,但核心数较少;GPU 每个核心都很“小”,但是核心数非常多。 62 | - CPU 和 GPU 的结构决定了:CPU 适合执行串行程序,GPU 适合执行并行度极高的程序。 63 | - GPU 之所以这样设计,与 GPU 的发展历史息息相关 64 | 65 | # Part 1. Correctness 66 | 67 | 那么,掌握了 GPU 的基础知识,我们接下来就要开始写代码啦! 68 | 69 | 接下来我将以矩阵乘法为例,带领大家逐渐编写一个正确性无误且性能勉强过关的矩阵乘法。 70 | 71 | > Aside | cuBLAS:作为一个十分常用的操作,市面上已经有很多成熟高效的矩阵乘法库,比如实现了线性代数中大部分计算的 [NVIDIA cuBLAS](https://developer.nvidia.com/cublas)。 72 | 73 | > Aside | Tensor Core:因为矩阵乘法这个操作太普遍了,所以除了我们即将使用的 CUDA Core 外,Nvidia 还在显卡中设计了另一种计算单元 - [Tensor Core](https://www.nvidia.com/en-us/data-center/tensor-cores)。它可以以比 CUDA Core 高数十倍的速率计算矩阵乘法,且支持 AI 中常用的混合精度计算。 74 | 75 | 首先,请确认你正在使用的机器上面有 NVIDIA 的显卡,且你能调用 CUDA 编译器(一般来说,直接在命令行中输入 `nvcc` 即可)。 76 | 77 | > Aside | CUDA C++:CUDA 使用的是经过 NVIDIA 魔改的 C++,其包含一些原生 C++ 不支持的语法,故不能使用 `g++` 等常规编译器。 78 | 79 | ## Step 0. 
部署示例代码 80 | 81 | 首先,我们先部署好的示例代码。示例代码中包含一个用来对照的 CPU 上的 GEMM 实现、用来做性能测试的代码、以及 GEMM 在 GPU 上的若干种实现。 82 | 83 | *注:GEMM 代表 GEneral Matrix Multiplication,矩阵乘法。* 84 | 85 | 先 clone [这个仓库](https://github.com/interestingLSY/CUDA-From-Correctness-To-Performance-Code)。随后使用 `make all && ./gemm_test 64 64 64` 来运行它。理想情况下它应该输出类似于这样的东西: 86 | 87 | ```plain 88 | Benchmarking gpu_mult_block... 89 | Warming up... 90 | Verification passed! 91 | Warming up (again)... 92 | Round 0: 12 us 93 | Round 1: 12 us 94 | Round 2: 12 us 95 | Round 3: 12 us 96 | Round 4: 12 us 97 | Round 5: 12 us 98 | Round 6: 12 us 99 | Round 7: 12 us 100 | Average time usage: 12.000000 us 101 | ``` 102 | 103 | 这个仓库中,`gemm_test.cc` 是主程序,其包含 `main` 函数以及与性能测试(benchmark)相关的逻辑。其他的 `gemm_XXX` 中包含了各种各样的 GEMM 的实现,比如 CPU 上的简单实现 `gemm_cpu_naive`、GPU 上的多 thread 单 block 实现 `gemm_gpu_mult_thread` 等。 104 | 105 | 你可以在编译后使用 `./gemm_test [implementation]` 来 benchmark 所有的或特定的 GEMM 实现。其中,`n`, `m`, `k` 分别代表矩阵的三个维度(假设我们要计算 $C = A \times B$,那么 $A$ 矩阵的大小为 $n \times k$,$B$ 矩阵的大小为 $k \times m$,$C$ 矩阵的大小为 $n \times m$),`implementation` 代表你要 benchmark 的 GEMM 实现的名字(留空以 benchmark 所有的 GEMM 实现)。你可以在 `gemm_test.cc` 的开头位置找到所有的 GEMM 实现及其名字。 106 | 107 | *注:如果你每次开始运行程序的时候,程序都要卡 $1 \sim 2$ 秒才有输出,那么可能是因为你没有开启 GPU 的 Persistent Mode。* 108 | 109 | ## Step 1. Your First CUDA Kernel 110 | 111 | 首先,我们需要掌握一个基本概念,CUDA Kernel: 112 | 113 | CUDA 的设计思想大致是:向显卡提交一个又一个任务,每一个任务都形如“给定一个函数,与调用它的参数,请在显卡上运行这个函数”。我们一般**称这种“在显卡上运行的函数”叫做 CUDA Kernel**。仔细想想,这种设计很合理嘛!毕竟现在 GPU 是“加速器”,其仅负责加速程序中的某一些部分,其他的控制流程与计算还是要由 CPU 来做的。 114 | 115 | 所以,现在的问题就是: 116 | 117 | - 如何定义(创建)一个 CUDA Kernel? 118 | - 如何调用这个 CUDA Kernel? 119 | 120 | 首先是如何定义 CUDA Kernel 的问题。CUDA C++ 中有三类函数: 121 | 122 | - `__host__`: 这类函数与正常的函数没有区别。其只能被 host 上执行的函数(`__host__`)调用,并在 host 上执行。 123 | - `__global__`: 这类函数可以被任何函数调用,并在 device 上执行。 124 | - `__device__`: 这类函数只能被 device 上执行的函数(`__device__` 或 `__global__`)调用,并在 device 上执行。 125 | 126 | 不难发现,CUDA Kernel 不就正属于 `__global__` 类嘛! 
127 | 128 | 在 CUDA 中,我们可以把一个函数的类别放在函数的返回值类型的前面,以告知编译器这个函数属于哪一类。没有定义属于哪一类的函数默认为 `__host__`。如下例: 129 | 130 | ```cpp 131 | // 下面这句话定义了一个 __global__ 类型的函数。该函数将在 GPU 上运行,并可以被任意函数调用。 132 | __global__ void gemm_gpu_1thread_kernel(int* C, const int* A, const int* B, int n, int m, int k) { 133 | } 134 | // 下面这句话定义了一个 __device__ 类型的函数。 135 | __device__ int mult_helper(int a, int b) { 136 | } 137 | // 下面这句话定义了一个 __host__ 类型的函数。 138 | __host__ void prepare_input() { 139 | } 140 | // 不加任何修饰的函数默认为 __host__ 类型。 141 | void func() { 142 | } 143 | ``` 144 | 145 | *注:在 CUDA 的编程模型中,一般称 CPU 为 Host,GPU 为 Device。* 146 | 147 | 因此,我们只要在一个函数的定义的最前面加上 `__global__`,它就是一个 Kernel 啦! 148 | 149 | 那么,如何调用 CUDA Kernel 呢?与 C++ 中调用函数的方式大同小异,不过要在函数名与参数列表的中间加上一个 `<<>>`(现阶段,先认为 `GRID_DIM` 与 `BLOCK_DIM` 均为 1)。举个例子: 150 | 151 | ```cpp 152 | // 下面这句话调用了一个名为 gemm_gpu_1thread_kernel 的 kernel 153 | gemm_gpu_1thread_kernel<<<1, 1>>>(C, A, B, n, m, k); 154 | ``` 155 | 156 | 现在,请打开示例代码中的 [gemm_gpu_1thread.cu](https://github.com/interestingLSY/CUDA-From-Correctness-To-Performance-Code/blob/master/gemm_gpu_1thread.cu),并阅读这份代码。试着找找:这份代码定义了哪个 CUDA Kernel?哪一行代码调用了这个 CUDA Kernel? 
157 | 158 | 那么你现在信心满满地写了一个 CUDA Kernel!它的功能是传入两个数组 `A` 和 `B`,将 `A + B`(点对点地加)的结果输出到数组 `C` 中。你写道: 159 | 160 | ```cpp 161 | #include 162 | #include 163 | #include 164 | 165 | __global__ void pointwise_add_kernel(int* C, const int* A, const int* B, int n) { 166 | for (int i = 0; i < n; ++i) 167 | C[i] = A[i] + B[i]; 168 | } 169 | 170 | int main() { 171 | const int n = 128; 172 | int* C = new int[n]; 173 | int* A = new int[n]; 174 | int* B = new int[n]; 175 | for (int i = 0; i < n; ++i) { 176 | A[i] = i; 177 | B[i] = i*i; 178 | } 179 | pointwise_add_kernel<<<1, 1>>>(C, A, B, n); 180 | cudaDeviceSynchronize(); // 见下方 Aside 181 | cudaError_t error = cudaGetLastError(); // 检查当前 CUDA 驱动是否返回了任何异常。调用这句话之前记得调用 cudaDeviceSynchronize() 182 | if (error != cudaSuccess) { 183 | printf("CUDA error: %s\n", cudaGetErrorString(error)); 184 | exit(1); 185 | } 186 | for (int i = 0; i < n; ++i) { 187 | assert(C[i] == A[i] + B[i]); 188 | } 189 | return 0; 190 | } 191 | ``` 192 | 193 | *注:记得将文件保存为 `.cu` 类型的,否则 NVCC 不会认为这是一份 CUDA C++ 代码从而导致编译错误,同时记得使用 nvcc 而不是 g++ 来编译。* 194 | 195 | > Aside | 异步执行:CUDA Kernel 是异步执行的,也就是说,所谓的“调用” Kernel 只不过是 CPU 向 GPU 的任务队列里提交了一个任务,随后 CPU 就会继续执行接下来的指令,并不会等待 GPU 将这个 Kernel 执行完。这样设计的目的是:可以同时让 CPU 与 GPU 有活干,同时发掘出二者的潜力。如果想让 CPU 等待 GPU 上的所有 Kernel 均执行完(即,让两个设备同步),请调用 `cudaDeviceSynchronize()`。 196 | 197 | 可惜的是,程序输出了:`CUDA error: an illegal memory access was encountered`。这是为什么呢?请见下一张:内存管理。 198 | 199 | ## Step 2. 
Memory Management 200 | 201 | 要想理解为什么上面的程序无法执行,我们需要先学习一下 CUDA 的内存模型。 202 | 203 | 在 CUDA 中,每一个设备都只能访问自己的那一块内存。可以理解为(这个理解并不严谨):整个系统的“内存空间”被分为了两个部分:“内存”与“显存”。CPU 只能访问内存而不能访问显存,GPU 只能访问显存而不能访问内存。上面的例子中,我们就让 GPU 试图访问处于 CPU 内存上的数组 `A`, `B`, `C` 从而导致了 `Illegal memory access` 错误。 204 | 205 | 那么怎么办呢?我们需要先认识几个函数: 206 | 207 | - `cudaMalloc()`: 在显存上申请一块存储空间,类似于 `malloc()`。 208 | - `cudaFree()`:释放一块之前使用 `cudaMalloc()` 申请的存储空间,类似于 `free()`。 209 | - `cudaMemcpy()`:在内存与显存之间拷贝数据,类似于 `memcpy()`。 210 | 211 | 这里有一个更加形象的理解方式:假设我们有两个仓库:A 和 B,分别代表 CPU 内存与 GPU 显存。CPU 只能访问 A 仓库中的数据,GPU 只能访问 B 仓库中的数据。常规的 `malloc()`、`free()` 与 `memcpy()` 都只会影响到 A 仓库, `cudaMalloc()`、`cudaFree()` 的操作对象则是 B 仓库,而 `cudaMemcpy()` 则是在 A 仓库与 B 仓库之间迁移数据。 212 | 213 | > Aside | 对显存进行操作的函数:还有很多函数也可以对显存进行操作,比如 `cudaMemset()`, `cudaMalloc2D()` 等。 214 | 215 | 216 | 所以,我们现在的思路就很清晰了: 217 | 218 | ```cpp 219 | #include 220 | #include 221 | #include 222 | 223 | __global__ void pointwise_add_kernel(int* C, const int* A, const int* B, int n) { 224 | for (int i = 0; i < n; ++i) 225 | C[i] = A[i] + B[i]; 226 | } 227 | 228 | int main() { 229 | const int n = 128; 230 | int* C = new int[n]; 231 | int* A = new int[n]; 232 | int* B = new int[n]; 233 | for (int i = 0; i < n; ++i) { 234 | A[i] = i; 235 | B[i] = i*i; 236 | } 237 | // Create 3 arrays on GPU 238 | int* A_gpu, *B_gpu, *C_gpu; 239 | cudaMalloc(&A_gpu, n * sizeof(int)); 240 | cudaMalloc(&B_gpu, n * sizeof(int)); 241 | cudaMalloc(&C_gpu, n * sizeof(int)); 242 | // Copy the content of A and B to A_gpu and B_gpu, respectively 243 | cudaMemcpy(A_gpu, A, n * sizeof(int), cudaMemcpyHostToDevice); 244 | cudaMemcpy(B_gpu, B, n * sizeof(int), cudaMemcpyHostToDevice); 245 | pointwise_add_kernel<<<1, 1>>>(C_gpu, A_gpu, B_gpu, n); 246 | cudaDeviceSynchronize(); // 见下方 Aside 247 | cudaError_t error = cudaGetLastError(); // 检查当前 CUDA 驱动是否返回了任何异常。调用这句话之前记得调用 cudaDeviceSynchronize() 248 | if (error != cudaSuccess) { 249 | printf("CUDA error: %s\n", cudaGetErrorString(error)); 250 | exit(1); 
251 | } 252 | // Copy the result from C_gpu to C 253 | cudaMemcpy(C, C_gpu, n * sizeof(int), cudaMemcpyDeviceToHost); 254 | for (int i = 0; i < n; ++i) { 255 | assert(C[i] == A[i] + B[i]); 256 | } 257 | return 0; 258 | } 259 | ``` 260 | 261 | 这份代码可以正常运行 262 | 263 | > Aside | CUDA Unified Memory: 有一种东西叫做 Unified Memory,其借助类似操作系统中的 Page Fault 的方式实现了在 CPU 与 GPU 之间无感地共享同一块内存(可以理解为,我有一个指针 `p`,CPU 与 GPU 均能访问 `*p`)。感兴趣的同学可以查看[这篇教程](https://developer.nvidia.com/blog/unified-memory-cuda-beginners/)。 264 | 265 | ## Step 3. Threads and Blocks 266 | 267 | 如果你运行一下 `./gemm_test 1024 1024 1024`,你会发现,我们刚刚的 [gemm_gpu_1thread.cu](https://github.com/interestingLSY/CUDA-From-Correctness-To-Performance-Code/blob/master/gemm_gpu_1thread.cu) 怎么比 CPU 的版本慢了好几个数量级!说好的“巨大幅度性能提升”呢? 268 | 269 | 而且,之前不是说,GPU 是“大量小核心”么?这也没体现出来呀!我就是写了一个串行版本的函数,怎么就能在“大量小核心“上面,顶多使用一个小核心吧!那性能肯定不行呀! 270 | 271 | 那么,如何用好 GPU 内部的大量小核心呢?这就要涉及到 GPU 内部的三个概念了:Thread, Block 以及 Grid: 272 | 273 | - Thread 是最基本的执行单位,**每一个 Thread 都会把你写的 CUDA Kernel 从头到尾完整地执行一遍**。 274 | - 每一个 Block 中包含若干个 Thread,每一个 Thread 都会有一个 `threadIdx`,代表这个 Thread 在它所在的 Block 中的 id。可以使用 `blockDim` 来获取 Block 中有多少个 Thread。 275 | - 每一个 Grid 包含若干个 Block,每一个 Thread 也有一个 `blockIdx`,代表这个 Thread 所在的 Block 在 Grid 中的 id。可以使用 `gridDim` 来获取 Grid 中有多少个 Block。每一次启动 CUDA Kernel 时都会生成一个 Grid(某种意义上可以理解为一个“执行上下文”。 276 | 277 | 三者的关系看上去大概就是这样的: 278 | 279 | ![kernel-execution-on-gpu-1.png](images/cuda/kernel-execution-on-gpu-1.png) 280 | 281 | *Image source: https://developer.nvidia.com/blog/cuda-refresher-cuda-programming-model/* 282 | 283 | ![cuda-kernel-index.png](images/cuda/cuda-kernel-index.png) 284 | 285 | *Image source: https://developer.nvidia.com/blog/even-easier-introduction-cuda* 286 | 287 | 在启动 CUDA Kernel 时,`<<<>>>` 中的第一个数字是每一个 Grid 中的 Block 数量,第二个数字是每一个 Block 中的 Thread 数量(每一个 Block 中含有的 Thread 数量是相等的)。比如: 288 | 289 | ```cpp 290 | #include 291 | #include 292 | 293 | __global__ void print_grid_block_info_kernel() { 294 | printf("Block id: %d. 
Number of blocks in one grid: %d. " 295 | "Thread id: %d. Number of threads in one block: %d\n", 296 | blockIdx.x, gridDim.x, threadIdx.x, blockDim.x); 297 | } 298 | 299 | int main() { 300 | const int GRID_SIZE = 4; 301 | const int BLOCK_SIZE = 3; 302 | print_grid_block_info_kernel<<>>(); 303 | cudaDeviceSynchronize(); 304 | cudaError_t error = cudaGetLastError(); // 检查当前 CUDA 驱动是否返回了任何异常。调用这句话之前记得调用 cudaDeviceSynchronize() 305 | if (error != cudaSuccess) { 306 | printf("CUDA error: %s\n", cudaGetErrorString(error)); 307 | exit(1); 308 | } 309 | return 0; 310 | } 311 | ``` 312 | 313 | 由于每一个 CUDA Kernel 都会被每个 Thread 完整地执行一遍,所以我们称 CUDA 是 SPMD (Single Program Multiple Data) Style 的。 314 | 315 | 让我们再完善一下我们的向量点对点加法程序: 316 | 317 | ```cpp 318 | #include 319 | #include 320 | #include 321 | 322 | __global__ void pointwise_add_kernel(int* C, const int* A, const int* B, int n) { 323 | // 别忘了,每一个 Thread 把整个 CUDA Kernel 都完整地执行一遍 324 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) 325 | C[i] = A[i] + B[i]; 326 | } 327 | 328 | int main() { 329 | const int n = 128; 330 | const int BLOCK_DIM = 4; 331 | const int GRID_DIM = 3; 332 | int* C = new int[n]; 333 | int* A = new int[n]; 334 | int* B = new int[n]; 335 | for (int i = 0; i < n; ++i) { 336 | A[i] = i; 337 | B[i] = i*i; 338 | } 339 | // Create 3 arrays on GPU 340 | int* A_gpu, *B_gpu, *C_gpu; 341 | cudaMalloc(&A_gpu, n * sizeof(int)); 342 | cudaMalloc(&B_gpu, n * sizeof(int)); 343 | cudaMalloc(&C_gpu, n * sizeof(int)); 344 | // Copy the content of A and B to A_gpu and B_gpu, respectively 345 | cudaMemcpy(A_gpu, A, n * sizeof(int), cudaMemcpyHostToDevice); 346 | cudaMemcpy(B_gpu, B, n * sizeof(int), cudaMemcpyHostToDevice); 347 | pointwise_add_kernel<<>>(C_gpu, A_gpu, B_gpu, n); 348 | cudaDeviceSynchronize(); // 见下方 Aside 349 | cudaError_t error = cudaGetLastError(); // 检查当前 CUDA 驱动是否返回了任何异常。调用这句话之前记得调用 cudaDeviceSynchronize() 350 | if (error != cudaSuccess) { 351 | printf("CUDA error: %s\n", 
cudaGetErrorString(error)); 352 | exit(1); 353 | } 354 | // Copy the result from C_gpu to C 355 | cudaMemcpy(C, C_gpu, n * sizeof(int), cudaMemcpyDeviceToHost); 356 | for (int i = 0; i < n; ++i) { 357 | assert(C[i] == A[i] + B[i]); 358 | } 359 | return 0; 360 | } 361 | ``` 362 | 363 | 接下来,请阅读并理解 [gemm_gpu_mult_thread.cu](https://github.com/interestingLSY/CUDA-From-Correctness-To-Performance-Code/blob/master/gemm_gpu_mult_thread.cu) 与 [gemm_gpu_mult_block.cu](https://github.com/interestingLSY/CUDA-From-Correctness-To-Performance-Code/blob/master/gemm_gpu_mult_block.cu)。请暂时忽略代码中的 `__restrict__`。 364 | 365 | 在 Ryzen 7700X CPU + RTX 4090 GPU 上,各个 GEMM 实现的耗时如下: 366 | 367 | ```plain 368 | Results: 369 | cpu_naive 348434.62 us 370 | cpu_simd 133023.88 us 371 | gpu_1thread 20276114.00 us 372 | gpu_mult_thread 410566.25 us 373 | gpu_mult_block 3988.38 us 374 | ``` 375 | 376 | 可以看到,在加入了多 Thread 与多 Block 之后,性能取得了明显的提升。 377 | 378 | ## Takeaway 379 | 380 | 那么,你现在已经可以写出性能基本过关的 CUDA 程序啦!我们再来回顾一下基本知识: 381 | 382 | - CUDA 的设计思想大致是:向显卡提交一个又一个任务,每一个任务都形如“给定一个函数,与调用它的参数,请在显卡上运行这个函数”。我们一般称这种“在显卡上运行的函数”为 CUDA Kernel。 383 | - 可以使用 `__global__`, `__host__` 和 `__device__` 来修饰函数。如果想写一个 CUDA Kernel(能被 CPU 上运行的其他函数调用,并在 GPU 上执行的函数),那么应当使用 `__global__`。 384 | - 启动 CUDA Kernel 时,请在函数名和参数列表之间加上 `<<<每个 Grid 中有多少 Block, 每个 Block 中有多少 Thread>>>`。 385 | - 启动 CUDA Kernel 的时候会创建一个 Grid。这个 Grid 里包含若干 Block,每个 Block 里包含若干 Thread。 386 | 387 | # Part 2. Performance 388 | 389 | 在上一章中,我们学习了如何写出正确的 CUDA Kernel,那么现在我们来学学如何利用好 GPU 的底层架构,优化 CUDA Kernel 的性能。 390 | 391 | ## 0.
算存比 392 | 393 | 想要优化 GPU 的性能,我们首先要知道“算存比”的概念。 394 | 395 | 在经典的冯诺依曼架构下,ALU (Arithmetic Logic Unit,计算逻辑单元,可以简单理解为加法器、乘法器等) 要从内存中取操作数,进行对应的计算(如乘法),并写回内存。所以,计算速度会受到两个因素的限制:ALU 进行计算的速度,与内存的存取速度。如果一个程序的运行速度瓶颈在于前者,那么称其为 Compute-bound 的;如果瓶颈在于后者,那么称其为 Memory-bound 的。 396 | 397 | 由于 CPU 中运算单元较少,且 CPU 具有多级缓存,所以空间连续性、时间连续性较好的程序在 CPU 上一般是 Compute-bound 的。而 GPU 则恰恰相反:GPU 的核心的规模一般很大,比如 RTX 4090 可以在一秒内做 82.58T 次 float16 运算(暂不考虑 Tensor core),但其内存带宽只有 1TB/s,每秒只能传输 0.5T 个 float16。这便导致 GPU 上的操作更可能会受到内存带宽的限制,成为 Memory-bound。 398 | 399 | 如何估测一个 CUDA Kernel 是 Compute-bound 还是 Memory-bound 呢?我们可以计算它的“算存比”,也即,$计算次数/访存次数$,并将其与 GPU 的 $每秒能做的运算次数/每秒能做的访存次数$ 做比较(这里其实不太严谨,仅能用来做粗略估计,严谨的计算还要考虑到 FMA、缓存、显存带宽利用率等等因素)。 400 | 401 | 比如,对于上面的 `pointwise_add_kernel`,其需要访问 $3N$ 次内存,同时做 $N$ 次加法,所以其存算比为 $N/3N = 1/3$,其远小于 $82.58T/0.5T = 165.16$,所以其为 Memory-bound。 402 | 403 | 我们的优化思路大体是:如果一个 Kernel 是 Memory-bound 的,那么就优化它的访存次数(哪怕这样可能需要多进行一些计算),反之则要减少其计算次数。一般来说,Compute-bound 的 Kernel 不太常见(毕竟算存比得过百才能达到 Compute-bound)(常见的 Compute-bound 的 Kernel 可能只有矩阵乘法与卷积核比较大的卷积),所以下面我们主要关注如何优化访存。 404 | 405 | > Aside | Fused Multiply-Add (FMA):现在的 NVIDIA GPU 可以在 1 个时钟周期内计算 `a*b+c`, (`a`, `b` 和 `c` 均为浮点数),而不是先花一个周期计算加法,再花一个周期计算乘法。这称为 Fused multiply-add (FMA)。 406 | 407 | ## 1. 
`__restrict__` 408 | 409 | 大家还记得什么是 Pointer aliasing 嘛?简单来说,下面两段代码并不是等价的: 410 | 411 | ```cpp 412 | void f1(int* x, int* y) { 413 | *x += *y; 414 | *x += *y; 415 | } 416 | ``` 417 | 418 | ```cpp 419 | void f2(int* x, int* y) { 420 | *x += 2*(*y); 421 | } 422 | ``` 423 | 424 | 这是因为,`x` 和 `y` 两个指针可能指向相同的内存。考虑 `f(x, x)`,第一段代码将把 `*x` 变为 `4(*x)`,而第二段代码则会把 `*x` 变为 `3(*x)`。 425 | 426 | Pointer aliasing 可能会抑制编译器做出某些优化。比如在上面的代码中,`f1()` 需要 5 次访存而 `f2()` 仅需三次,后者更优。但由于编译器并不能假设 `x` 和 `y` 不指向相同的内存,它不敢做这个优化。 427 | 428 | 所以,我们需要“显式地”告诉编译器,两个指针不会指向相同的内存地址(准确来说,应该是“改变一个指针指向的地址的数据,不会影响到通过其他指针读取的数据”),从而让编译器“放心地”做出优化。`nvcc` 支持一个关键字,叫做 `__restrict__`,加上它,编译器就可以放心地把指针指向的值存在寄存器里,而不是一次又一次地访存,进而提高了性能。 429 | 430 | 我们可以对比一下示例代码中的 `gemm_gpu_mult_block_no_restrict.cu` 与 `gemm_gpu_mult_block.cu` 的性能。在 4090 上,前者平均耗时 40420.75,后者平均耗时 3988.38。可以看出,性能提升幅度不容小觑。 431 | 432 | 为了验证性能下降确实是由于没有了 `__restrict__` 关键字后的额外访存带来的,我们可以对比 `gemm_gpu_mult_block.cu` 与 `gemm_gpu_mult_block_no_restrict_reg.cu` 的性能。后者虽然没有使用 `__restrict__` 关键字,但它把中间的累加结果存在了变量中,而不是每一次都写回 C 数组。在 4090 上,二者的性能非常相似。这说明,在缺少 `__restrict__` 关键字的时候,代码需要进行许多不必要的访存,进而拖慢了速度。 433 | 434 | ## 2.
Memory Coalescing 435 | 436 | 在学习 Memory coalescing 之前,我们需要先了解一下 GPU 内部的调度方式。 437 | 438 | 之前我们说过,Grid 里包含若干 Thread block,每个 Thread block 则又包含若干 Thread,那么这些 Thread 是如何被调度的呢?它们被按编号分成了若干组,每一组中有 32 个 Thread(即,线程 0 ~ 31 为第一组,32 ~ 63 为第二组,依次类推),这样的“组”便被叫做 Warp。 439 | 440 | GPU 的调度是以 Warp 作为基本单位的。每个时钟周期内,同一个 Warp 中的所有线程都会执行相同的指令。 441 | 442 | 那么访存呢?难道 Warp 中的 32 个 Thread 同时访存的话,GPU 核心会向显存发起 32 次请求嘛?显然不会。GPU 会把这些请求打包成尽可能少的 Transaction(可以把每一个 Transaction 理解为 GPU 核心向显存发起的一次访存操作),这个过程就叫做 Memory coalescing。Transaction 需要满足: 443 | 444 | - 长度为 32 个 Byte 445 | - 开始地址是 32 的倍数 446 | 447 | 也即:如果一个 Warp 中的第 i 个 Thread 要访问地址为 $4i \sim 4i+3$ 的内存,那么一共需要 4 个 Transaction 才能读完所有的数据;如果一个 Warp 中的第 i 个 Thread 要访问地址为 $4i+1 \sim 4i+4$ 的内存,那么需要 5 个 Transaction 才能读取所有的数据;如果第 i 个 Warp 要访问地址为 $32i \sim 32i+3$ 的内存,那么就需要 32 次 Transaction 才能完成读取了。 448 | 449 | 然而,内存带宽是有上限的,且每一个 Transaction 的大小都是 32 Byte,这注定了每一秒 GPU 核心可以发起的 Transaction 数量是有上限的。对于上述的最后一种情况,由于每一个 Transaction 中的 32 Byte 只有 4 Byte 是有用的,此时内存带宽的利用率仅有 $1/8$。 450 | 451 | 接下来请阅读 [CUDA Best Practices](https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#coalesced-access-to-global-memory),了解 Memory coalescing 在一个具体的例子中的优化效果。 452 | 453 | 总之,我们需要尽量保证同一个 Warp 中每一个 Thread 的访存是 coalesced 的,以充分利用内存带宽。 454 | 455 | ## 3. 
Shared Memory 456 | 457 | 在学习 Shared memory 之前,我们需要先了解一下 CUDA 的内存模型: 458 | 459 | CUDA 中大致有这几种内存: 460 | 461 | - Global Memory:俗称显存,位于 GPU 核心外部,很大(比如 A100 有 80GB),但是带宽很有限 462 | - L2 Cache:位于 GPU 核心内部,是显存的缓存,程序不能直接使用 463 | - Register:寄存器,位于 GPU 核心内部,Thread 可以直接调用 464 | - Shared memory:位于 GPU 核心内部,每个 Thread block 中的所有 Thread 共用同一块 Shared memory(因此,Shared memory 可以用来在同一个 Thread block 的不同 Thread 之间共享数据),并且带宽极高(因此,Shared memory 可以用来优化性能)。 465 | 466 | 正如上文所说,Shared memory 既可以用来在同一个 Thread block 的不同 Thread 之间共享数据(最常见的用法是 Reduction),也可以用来优化访存性能。我们现在主要关注后者。 467 | 468 | 我们还是以矩阵乘法为例。在上面的 `gemm_gpu_mult_block.cu` 中,为了计算大小分别为 $n \times k$ 与 $k \times m$ 的两个矩阵乘法,我们一共访问了大约 $2nmk$ 次内存。这十分不合算,因为三个矩阵加起来也就只有 $nk + km + nm$ 个元素。 469 | 470 | 我们尝试使用 Shared memory 来优化矩阵乘法。具体的,我们使用一种叫做 Tiling 的技术。接下来请阅读[这篇文章](https://penny-xu.github.io/blog/tiled-matrix-multiplication)(里面有很多好看又形象的动图)。 471 | 472 | 在阅读上面那篇文章之后,请阅读示例代码中的 `gemm_gpu_tiling.cu`,看看我如何实现 Tiling 版本的矩阵乘法。在 4090 上,`gemm_gpu_mult_block` 耗时 3988.38 us,`gemm_gpu_tiling` 耗时 311.38 us,性能提升约 10 倍。 473 | 474 | > Aside | CUDA Memory Hierarchy: ![memory-hierarchy-in-gpus-1.png](images/cuda/memory-hierarchy-in-gpus-1.png) 475 | > *Image source: https://developer.nvidia.com/blog/cuda-refresher-cuda-programming-model/* 476 | 477 | > Aside | Reduction: 对 Reduction 操作的优化比较感兴趣的同学可以阅读 [这篇文章](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf)。 478 | 479 | ## 4.
Profiling Tools 480 | 481 | 在优化 CUDA Kernel 的时候,除了依照经验与惯用套路(比如 Memory coalescing),我们也可以使用专业的 Profiling 工具来测试一个 Kernel 或者一个程序的性能瓶颈。常用的 Profiling 工具包括: 482 | 483 | - NVIDIA Nsight System: 它可以对整个应用程序进行 Profile,可以得到各个 Kernel 的耗时,以及究竟是 CPU 还是 GPU 拖慢了整体的执行速度。 484 | - NVIDIA Nsight Compute: 它可以对单个 CUDA Kernel 进行 Profiling,进而得到该 CUDA Kernel 的瓶颈所在。它会提供许多的详细信息(比如,内存带宽占用率、CUDA Core 活跃时间比、活跃的 SM 比例等等)来帮助你更加细致地优化 CUDA Kernel。 485 | 486 | ## Takeaway 487 | 488 | 总结一下,我们的优化技巧包括: 489 | 490 | - 使用 `__restrict__` 让编译器放心地优化指针访存 491 | - 想办法让同一个 Warp 中的线程的访存 Pattern 尽可能连续,以利用 Memory coalescing 492 | - 使用 Shared memory 493 | - 使用专业的 Profiling Tool -------------------------------------------------------------------------------- /docs/zh/docs/gpu/hip.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/gpu/images/cuda/cpu-gpu-arch-diff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcpu-club/hpc-wiki/8530f11d9f4a9c1389af487545bb22aa5f9f03d9/docs/zh/docs/gpu/images/cuda/cpu-gpu-arch-diff.png -------------------------------------------------------------------------------- /docs/zh/docs/gpu/images/cuda/cuda-kernel-index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcpu-club/hpc-wiki/8530f11d9f4a9c1389af487545bb22aa5f9f03d9/docs/zh/docs/gpu/images/cuda/cuda-kernel-index.png -------------------------------------------------------------------------------- /docs/zh/docs/gpu/images/cuda/kernel-execution-on-gpu-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcpu-club/hpc-wiki/8530f11d9f4a9c1389af487545bb22aa5f9f03d9/docs/zh/docs/gpu/images/cuda/kernel-execution-on-gpu-1.png 
-------------------------------------------------------------------------------- /docs/zh/docs/gpu/images/cuda/memory-hierarchy-in-gpus-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lcpu-club/hpc-wiki/8530f11d9f4a9c1389af487545bb22aa5f9f03d9/docs/zh/docs/gpu/images/cuda/memory-hierarchy-in-gpus-1.png -------------------------------------------------------------------------------- /docs/zh/docs/gpu/intro.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/gpu/nccl.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/gpu/nsys.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/gpu/openacc.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/gpu/opencl.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/hardware/fpga-and-asics.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/hardware/gpu.md: -------------------------------------------------------------------------------- 1 | # GPU 2 | 3 | Redirect to hardware/gpu 4 | Spaceholder for directory generations -------------------------------------------------------------------------------- /docs/zh/docs/hardware/hardware-intro.md: 
-------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/hardware/interconnect.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/hardware/memory.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/hardware/processor.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/hardware/storage.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/hpc-intro/hpc-history.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/hpc-intro/modern-hpc.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/hpc-intro/what-is-hpc.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/index.md: -------------------------------------------------------------------------------- 1 | # HPC Wiki 2 | 3 | HPC 比赛,通常指的是高性能计算相关的比赛,主要形式包括以 `SCC`、`ISC`和`ASC`为代表的学生集群竞赛,和以 `PKU HPCGame`为代表的高性能计算挑战赛。比赛要求选手在规定时间、规定功耗或成本范围内解决高性能计算相关问题,并尽可能提高问题的解决效率。比赛对选手在并行程序设计、软硬件协同优化、计算机体系结构理解与运用、临场问题处理以及团队协作等诸多方面都有很高的要求。 4 | 5 | 
全国高校范围内,有大约200所学校建有超算队。大多数队伍都有自己的文档库用来培养新队员,这些文档库中的内容大多数是相同的,而且限于超算队的规模,文档库的内容也很难得到及时的更新。为此,我们共同建设 **HPC Wiki**,提高文档质量和内容丰富度,让更多的同学能够更快地学习到高性能计算相关的知识,从而更好地参与到 HPC 比赛中。希望能够减少“重复建造轮子”的现象,让大家能够更好地利用时间做更有意义的事情。 6 | 7 | **HPC Wiki** 源于社区,由北京大学学生 Linux 俱乐部长期运营和维护,将始终保持**独立自由**的性质,采取`cc-by-nc-sa`的知识共享许可协议,绝不会商业化。 8 | 9 | ## Material color palette 颜色主题 10 | 11 | ### Color Scheme 配色方案 12 | 13 | 根据浏览器与系统设置自动切换明暗主题,也可手动切换 14 |
15 | 16 | 17 |
18 | 27 | 28 | ### Primary colors 主色 29 | 30 | 点击色块可更换主题的主色 31 |
32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 |
53 | 62 | 63 | ### Accent colors 辅助色 64 | 65 | 点击色块更换主题的辅助色 66 |
67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 |
84 | 93 | 94 | -------------------------------------------------------------------------------- /docs/zh/docs/memory-model/cache.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/memory-model/consistency.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/memory-model/intro.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/memory-model/numa.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/misc/faq.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/misc/git.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/misc/shell.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/parallel-programming/mpi.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/parallel-programming/mpi4py.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- 
/docs/zh/docs/parallel-programming/openmp.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/parallel-programming/parallel-programming-intro.md: -------------------------------------------------------------------------------- 1 | # 并行编程导论 2 | 3 | !!! info "Question" 4 | If you were plowing a field, which would you rather use? Two strong oxen or 1024 chickens? 5 | 6 | *Seymour Cray* 7 | 8 | 尽管我们要介绍的是并行编程,但是不得不说的是,程序员往往都是不愿意并行编程的。要知道,我们今天所熟知的MPI、OpenMP和CUDA编程都是20世纪90年代以后才逐渐发展起来,并在最近的20年逐渐走向热门的。 9 | 10 | 究其原因,是因为计算机性能的发展不再能够满足人们的需要。我们可以看到,在处理器遇到功耗墙(2003年前后)后,人们意识到不能通过一味提高处理器主频来获得性能的持续增长。 11 | 12 | ![40年来处理器性能进步示意图。](./images/intro-cpuperf.svg) 13 | 14 | 图:40年来处理器性能进步示意图。1978-2018年的数据由Patterson和Hennessy(2017)给出,之后几年的数据是综合了Geekbench5和其他几个基准测试给出的大致值(具体的细节在这里我们不作过多描述)。2023年的数据经过估算,将其转化到4个核总的的平均表现上得出。 15 | 16 | 所以,从单核转向多核,是人们迫不得已的选择。当前面临的情况是,处理器性能提升的速度放缓,根据Hennessy和Patterson等人的测算(2017),大约是每20年翻一番,而不是1986年到2003年期间的每1.5年翻一番。现实情况是 17 | 18 | - 摩尔定律放缓,登纳德缩放定律终结,晶体管的工艺不再在短时间内持续大幅进展; 19 | 20 | - 微处理器的功耗预算不变(在桌面计算机和个人移动设备上,人们的功耗预期甚至在下降); 21 | 22 | - 使用多个高能效的处理器代替单个功能强大但是功耗更大的处理器——然而这并不是没有代价的,这意味着掌握并行编程已经成为了程序员应当具备的十分重要的能力,我们所说的并行编程,应当说涵盖了指令级并行、数据级并行、线程(和进程)级并行和请求级并行(这在构建高并发的互联网应用中十分常见)。 23 | 24 | 最后,在本章将要结束的时候,我们会看到多重处理已经达到了Amdahl定律的上限。为了在能耗、成本、性能的三角中取得平衡,唯一的途径就是专用。我们在其他章节还会介绍Tensor Core的体系结构,它就是专用处理器的典型代表,能够大幅提升矩阵乘法的性能。 25 | 26 | ## 并行编程的基本概念 27 | 28 | 我们来看一个最简单的例子。 29 | 30 | === "C语言" 31 | ```c 32 | #include 33 | int main(){ 34 | printf("Hello, world!\n"); 35 | return 0; 36 | } 37 | ``` 38 | 39 | 使用gcc可以很容易地将这段代码编译为可执行文件。像这样的文件在现代操作系统上运行时,操作系统会提供一种假象,好像系统上只有这个程序在运行。程序看上去是独占地使用处理器、主存和I/O设备;处理器看上去就像是在一条接一条地执行程序中的指令——这个假象是通过**进程**(process)这一抽象概念来实现的,而并发运行则是指不同进程之间的指令在处理器上交错运行。 40 | 41 | 对于一个单处理器的系统,在任何时刻它都只能处理一个进程的代码。于是,需要依靠操作系统来将控制权从一个进程切换到另一个进程。在切换之前,操作系统需要保存这个进程的全部状态(被称作**上下文**,context),当控制权切换回先前的程序时,进程就会从它上次停止的地方继续运行。 42 | 43 | 
在现代的计算机系统中,一个进程实际上可以由多个更轻量级的单元组成,每一个单元被称为**线程**(thread)。每一个线程都运行在进程的上下文中,并共享同样的代码和全局数据。 44 | 45 | 既然希望并发编程,那我们当然需要不同的执行单元之间存在某种能够相互协同的机制。在曾经的某段时间里,人们倾向于将计算机系统设计为所有的存储器都使用相同的编址,所有的处理器都通过片间互联网络访问到存储器,执行单元通过向某个特定的内存地址写入或读出数据,可以实现与其他执行单元隐式的信息交换。Pthread和OpenMP正是采用了这样的思路,不同的线程共享了相同的地址空间,通过对共享变量的读写,不同的线程间得以实现通信。 46 | 47 | 当需要协同的执行单元分布在不同的计算机上(甚至是网络中的计算机),上面共享内存的方法就爱莫能助了。然而,这些执行单元可以通过网络在不同的主机之间传递消息。流行的消息传递库MPI(Message Passing Interface)提供了一组丰富的进程间通信原语,通过显式地在不同进程间发送或接收消息,来实现进程间的通信。特别地,这些进程还可以分布在遥远的世界角落中,通过网络可以完成分布式计算。 48 | 49 | ??? info "SETI@home:分布式计算分析天文观测结果" 50 | SETI@home是近几十年来影响最大的分布式计算应用。它从1999年5月17日开始运行,全世界的计算机都可以通过下载一个软件,参与到计算和分析射电望远镜的观测结果的工作中。 51 | 52 | ## 并行编程的一般方法 53 | 54 | 在这里我们介绍经典的Foster设计方法,它由lan Foster提出,并被广广泛应用到了并行程序的设计中。Foster方法由四部分构成:**划分**(partition)、**通信**(communication)、**整合**(agglomeration)和**映射**(mapping)。 55 | 56 | 划分阶段,我们需要将工作分解为若干小的部分。一方面,我们可以对数据进行划分,例如将计算$n$个粒子的运动参数划分为对$\frac{n}{2}$个粒子运动参数的计算。另一方面,我们可以对操作进行划分,这在大规模神经网络训练上很常见,可能一张加速卡只负责某些层的计算,然后将计算结果作为输入通过网络传递给下游的加速卡。 57 | 58 | ??? warning "什么是好的划分" 59 | 好的划分通常具有这些特征:尽可能减少由于划分带来的额外开销;原任务数是关于划分数量的某个函数;对于各个划分,它们的任务量应当是大致相同的。在之后我们会看到好的划分在负载均衡和提升总体性能表现上的优越性。 60 | 61 | 通信阶段主要包含局部通信和全局通信。通信往往会带来额外的开销,例如当进程A需要的数据还没有被进程B计算完时,进程A将不得不等待。通信的功能一方面是为了交换数据,另一方面也能起到同步的作用。 62 | 63 | ??? 
warning "通信是有代价的吗" 64 | 在一个并行程序中,通信的开销往往是很昂贵的(尤其是在分布式计算中,数据包不得不沿着长长的链路在两台主机间交互)。因此,一个性质良好的程序应当使得计算单元尽可能少地进行通信,尤其是要避免大量的全局通信出现。执行单元还应当可以独立地并行进行通信,为进行并行的计算提供便利。 65 | 66 | 整合的含义是研究划分好的子任务可能被合并的可能性,并加以合并。例如,当任务A所需要的数据仅仅来自任务B,而任务B的功能仅仅是产生这个数据时,我们完全可以将任务A和任务B合并成一个较大的任务,这样就能够减少通信带来的开销。 67 | 68 | 映射阶段,我们将任务映射到具体的处理器。在这个阶段,我们关注任务的负载均衡,并期望尽可能减少处理器之间的通信。 69 | 70 | ## 学习路径 71 | 72 | 首先,为了在支持并行计算的系统(往往是多核系统)上进行编程,我们首先需要了解对我们来说有用的有关系统的特征;然后,为了简化这个系统,我们将使用流行的并行编程API(application program interface),包括MPI(message passing interface)、OpenMP和CUDA(在其他大章节中涉及)。在每一个环节上,我们都会通过几个经典的例子来了解如何使用这些API来编程,主要包括 73 | 74 | - 圆周率计算 75 | - 矩阵乘法 76 | - 动力学参量计算 77 | 78 | 它们都来自于现代计算的实际需要。当然了,在编程实践中,我们将会碰到许多有意思的话题,例如我们会写出看上去是并行但实际上比串行程序将运行得更慢的代码。因此,除了计算,我们还会关注若干基本的性能的分析、评价和优化,我们相信对程序本身的分析和写出正确的程序同等重要。 79 | 80 | 或许你对上面的概念都还有些许陌生,但是没关系,我们暂时还不需要十分关心这些术语的确切含义,而是会在之后的学习中渐渐理解它们。现在,我们就要开始通过一些实例具体地着手并行编程了。你可以查看其他章节进行进一步的学习。 -------------------------------------------------------------------------------- /docs/zh/docs/performance-analysis/basics.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/performance-analysis/intro.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/performance-analysis/nsys.md: -------------------------------------------------------------------------------- 1 | # GPU性能分析--NSYS 2 | 3 | Redirect to gpu/nsys 4 | Spaceholder for directory generations -------------------------------------------------------------------------------- /docs/zh/docs/performance-analysis/vtune.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/platform/cloud.md: 
-------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/platform/cluster.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/platform/modules.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/platform/platform-intro.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/platform/scheduling.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/power-management/intro.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/power-management/perception.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/sci-mlsys/intro.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/sci-mlsys/parallelism.md: -------------------------------------------------------------------------------- 1 | TODO -------------------------------------------------------------------------------- /docs/zh/docs/sci-mlsys/quantization.md: -------------------------------------------------------------------------------- 1 | TODO 
// MathJax v2 configuration: use \( \) for inline math and \[ \] for display
// math, with AMS-style automatic equation numbering on the right.
window.MathJax = {
  tex2jax: {
    inlineMath: [ ["\\(","\\)"] ],
    displayMath: [ ["\\[","\\]"] ]
  },
  TeX: {
    TagSide: "right",
    TagIndent: ".8em",
    MultLineWidth: "85%",
    equationNumbers: {
      autoNumber: "AMS",
    },
    unicode: {
      fonts: "STIXGeneral,'Arial Unicode MS'"
    }
  },
  showProcessingMessages: false,
  messageStyle: "none"
};
// Restore the user's saved Material color palette (primary/accent/scheme)
// from localStorage once the page has loaded.
window.addEventListener('load', function() {
  var p=localStorage.getItem("data-md-color-primary");
  if (p){
    document.body.setAttribute('data-md-color-primary',p);
  }
  var a=localStorage.getItem('data-md-color-accent');
  if (a){
    document.body.setAttribute('data-md-color-accent',a);
  }
  var s = localStorage.getItem('data-md-color-scheme');
  if (s) {
    document.body.setAttribute('data-md-color-scheme', s);
  }

}, false);

// Insert spacing between CJK and Latin characters (pangu.js).
pangu.spacingPageBody();
plugins: 29 | - redirects: 30 | redirect_maps: 31 | performance-analysis/nsys.md: gpu/nsys.md 32 | hardware/gpu.md: gpu/arch.md 33 | - search 34 | - minify: 35 | minify_html: true 36 | nav: 37 | - Start: 38 | - index.md 39 | - 贡献指南: 40 | - contribute/before-contributing.md 41 | - contribute/docs-organization.md 42 | - contribute/docs-style.md 43 | - HPC简介: 44 | - hpc-intro/what-is-hpc.md 45 | - hpc-intro/hpc-history.md 46 | - hpc-intro/modern-hpc.md 47 | - HPC比赛: 48 | - 比赛简介: 49 | - competition/competition-intro.md 50 | - competition/sc.md 51 | - competition/isc.md 52 | - competition/asc.md 53 | - competition/hpcgame.md 54 | - competition/other.md 55 | - 功耗管理: 56 | - power-management/intro.md 57 | - power-management/perception.md 58 | - 超算平台: 59 | - platform/platform-intro.md 60 | - platform/cluster.md 61 | - platform/cloud.md 62 | - platform/scheduling.md 63 | - platform/modules.md 64 | - 硬件: 65 | - hardware/hardware-intro.md 66 | - hardware/processor.md 67 | - hardware/memory.md 68 | - hardware/gpu.md 69 | - hardware/fpga-and-asics.md 70 | - hardware/interconnect.md 71 | - hardware/storage.md 72 | - 并行编程: 73 | - parallel-programming/parallel-programming-intro.md 74 | - 基础:线程与进程模型: 75 | - thread-process/intro.md 76 | - thread-process/process.md 77 | - thread-process/thread.md 78 | - thread-process/more.md 79 | - 基础:内存模型: 80 | - memory-model/intro.md 81 | - memory-model/consistency.md 82 | - memory-model/cache.md 83 | - memory-model/numa.md 84 | - 基础:通信: 85 | - communication/intro.md 86 | - communication/model.md 87 | - communication/cost.md 88 | - communication/deadlock.md 89 | - 编程工具: 90 | - parallel-programming/mpi.md 91 | - parallel-programming/openmp.md 92 | - parallel-programming/mpi4py.md 93 | - 性能分析: 94 | - performance-analysis/intro.md 95 | - performance-analysis/basics.md 96 | - performance-analysis/vtune.md 97 | - performance-analysis/nsys.md 98 | - GPU编程: 99 | - gpu/intro.md 100 | - gpu/arch.md 101 | - CUDA专题: 102 | - gpu/cuda.md 103 | - 
gpu/cuda-advanced.md 104 | - gpu/nccl.md 105 | - gpu/nsys.md 106 | - gpu/openacc.md 107 | - gpu/opencl.md 108 | - gpu/hip.md 109 | - Benchmark: 110 | - benchmark/intro.md 111 | - benchmark/hpl.md 112 | - benchmark/hpcg.md 113 | - benchmark/mlperf.md 114 | - 科学计算与机器学习系统: 115 | - sci-mlsys/intro.md 116 | - sci-mlsys/parallelism.md 117 | - sci-mlsys/sparsity.md 118 | - sci-mlsys/quantization.md 119 | - Misc: 120 | - misc/faq.md 121 | - misc/shell.md 122 | - misc/git.md 123 | markdown_extensions: 124 | - admonition 125 | - md_in_html 126 | - codehilite: 127 | guess_lang: false 128 | - def_list 129 | - footnotes 130 | - meta 131 | - toc: 132 | permalink: true 133 | - pymdownx.arithmatex 134 | - pymdownx.caret 135 | - pymdownx.critic 136 | - pymdownx.details 137 | - pymdownx.emoji: 138 | emoji_index: !!python/name:material.extensions.emoji.twemoji '' 139 | emoji_generator: !!python/name:material.extensions.emoji.to_svg '' 140 | - pymdownx.highlight 141 | - pymdownx.inlinehilite 142 | - pymdownx.keys 143 | - pymdownx.magiclink 144 | - pymdownx.mark 145 | - pymdownx.smartsymbols 146 | - pymdownx.superfences: 147 | custom_fences: 148 | - name: mermaid 149 | class: mermaid 150 | format: !!python/name:pymdownx.superfences.fence_code_format '' 151 | - pymdownx.tabbed: 152 | alternate_style: true 153 | - pymdownx.tasklist: 154 | custom_checkbox: true 155 | - pymdownx.tilde 156 | extra: 157 | alternate: 158 | - link: / 159 | name: zh - 汉语 160 | copyright: CC BY-NC-SA 4.0 161 | extra_javascript: 162 | - https://cdnjs.loli.net/ajax/libs/pangu/3.3.0/pangu.min.js 163 | - static/js/extra.js 164 | - https://cdnjs.loli.net/ajax/libs/mathjax/2.7.2/MathJax.js?config=TeX-MML-AM_CHTML 165 | extra_css: 166 | - static/css/extra.css 167 | -------------------------------------------------------------------------------- /docs/zh/overrides/.gitignore: -------------------------------------------------------------------------------- 
# input: dir/filename.md
# if the parent directories do not exist, create them
# create filename.md with content "TODO"
def mkdir_touch_files(path):
    """Create *path*'s parent directories (if needed) and write a "TODO"
    placeholder file at *path*.

    Args:
        path: A file path such as ``dir/sub/filename.md``; surrounding
            whitespace is stripped first.
    """
    path = path.strip()
    # Bug fix: the previous code used path.split('/')[0], which created only
    # the FIRST path component, so nested targets like "a/b/c.md" crashed
    # when "a/b" did not exist. os.path.dirname + makedirs handles any depth
    # (and any OS path separator).
    dirname = os.path.dirname(path)
    if dirname:
        # exist_ok=True also removes the check-then-create race.
        os.makedirs(dirname, exist_ok=True)
    with open(path, 'w') as f:
        f.write('TODO')
    print('create file: ' + path)


# List the wiki pages to scaffold here, then run the script.
files = []
for file in files:
    mkdir_touch_files(file)
def get_default_config() -> dict:
    """Load and parse the default language's mkdocs.yml into a dict."""
    raw = default_config_path.read_text(encoding="utf-8")
    return mkdocs.utils.yaml_load(raw)


def get_lang_paths():
    """Return every entry directly under the docs root, sorted by path."""
    return sorted(docs_root_path.iterdir())


def lang_callback(lang: Optional[str]):
    """Typer callback: validate a 2-letter language code and lowercase it.

    Returns None when no language was given; aborts the CLI on bad input.
    """
    if lang is None:
        return None
    if len(lang) != 2 or not lang.isalpha():
        typer.echo("Use a 2 letter language code, like: es")
        raise typer.Abort()
    return lang.lower()


def complete_existing_lang(incomplete: str):
    """Shell completion: yield names of existing language directories
    whose name starts with the typed prefix."""
    for candidate in get_lang_paths():
        if candidate.is_dir() and candidate.name.startswith(incomplete):
            yield candidate.name


def get_base_lang_config(lang: str):
    """Derive a translation's mkdocs config from the default config.

    The site URL gains a trailing "<lang>/" segment, shared CSS/JS assets
    are rewritten to absolute URLs on the main site, and the nav is trimmed
    to its first entry (the index page).
    """
    base = get_default_config()
    root_url = base["site_url"]

    def _absolute(asset: str) -> str:
        # Keep external URLs untouched; prefix site-relative assets with
        # the default site's URL so translations reuse the same files.
        return asset if asset.startswith("http") else root_url + asset

    config = base.copy()  # NOTE: shallow copy — nested dicts stay shared
    config["site_url"] = root_url + f"{lang}/"
    config["theme"]["logo"] = root_url + base["theme"]["logo"]
    # A shared favicon could be wired up the same way as the logo.
    config["theme"]["language"] = lang
    config["nav"] = base["nav"][:1]
    config["extra_css"] = [_absolute(css) for css in base["extra_css"]]
    config["extra_javascript"] = [_absolute(js) for js in base["extra_javascript"]]
    return config
@app.command()
def new_lang(lang: str = typer.Argument(..., callback=lang_callback)):
    """
    Generate a new docs translation directory for the language LANG.

    LANG should be a 2-letter language code, like: en, es, de, pt, etc.
    """
    new_path: Path = docs_root_path / lang
    if new_path.exists():
        typer.echo(f"The language was already created: {lang}")
        raise typer.Abort()
    new_path.mkdir()
    # Write a language-specific mkdocs.yml derived from the default config.
    new_config = get_base_lang_config(lang)
    new_config_path: Path = Path(new_path) / mkdocs_name
    new_config_path.write_text(
        yaml.dump(new_config, sort_keys=False, width=200, allow_unicode=True),
        encoding="utf-8",
    )
    new_config_docs_path: Path = new_path / "docs"
    new_config_docs_path.mkdir()

    # Reuse the default language's theme overrides for the new language.
    new_overrides_path: Path = new_path / "overrides"
    shutil.copytree(docs_root_path/default_lang/"overrides", new_overrides_path)

    # Seed the new language with the default index.md plus the
    # "translation missing" banner.
    default_index_path: Path = default_docs_path / "docs" / "index.md"
    new_index_path: Path = new_config_docs_path / "index.md"

    default_text = default_index_path.read_text(encoding="utf-8")
    lang_text = get_text_with_translate_missing(default_text)
    new_index_path.write_text(lang_text, encoding="utf-8")
    typer.secho(
        f"Successfully initialized: {new_path}", color=typer.colors.GREEN)
    # Refresh the language switcher in every mkdocs.yml.
    update_languages(lang=None)


@app.command()
def build_lang(
    lang: str = typer.Argument(
        ..., callback=lang_callback, autocompletion=complete_existing_lang
    )
):
    """
    Build the docs for a language, filling missing pages with translation notifications.
    """
    lang_path: Path = docs_root_path / lang
    if not lang_path.is_dir():
        typer.echo(
            f"The language translation doesn't seem to exist yet: {lang}")
        raise typer.Abort()
    typer.echo(f"Building docs for: {lang}")
    build_dir_path = Path("docs_build")
    build_dir_path.mkdir(exist_ok=True)
    build_lang_path = build_dir_path / lang

    # The default language is published at the site root, every other
    # language under site/<lang>/.
    site_path = Path("site").absolute()
    if lang == default_lang:
        dist_path = site_path
    else:
        dist_path: Path = site_path / lang

    # Start from a fresh copy of the translation, then fill in any theme
    # overrides it does not provide from the default language.
    shutil.rmtree(build_lang_path, ignore_errors=True)
    shutil.copytree(lang_path, build_lang_path)
    overrides_src = default_docs_path / "overrides"
    overrides_dest = build_lang_path / "overrides"
    for path in overrides_src.iterdir():
        dest_path = overrides_dest / path.name
        if not dest_path.exists():
            shutil.copy(path, dest_path)

    # NOTE(review): these locals shadow the module-level default_config_path
    # constant — same value, but confusing when reading the function.
    default_lang_path = Path("docs/zh")
    default_config_path: Path = default_lang_path / mkdocs_name
    default_config: dict = mkdocs.utils.yaml_load(
        default_config_path.read_text(encoding="utf-8"))
    # Index the default nav: file -> tuple of section titles, and
    # section-title tuple -> first entry under that section.
    nav = default_config["nav"]
    use_nav = nav
    file_to_nav = get_file_to_nav_map(use_nav)
    sections = get_sections(use_nav)

    lang_config_path: Path = lang_path / mkdocs_name
    lang_config: dict = mkdocs.utils.yaml_load(
        lang_config_path.read_text(encoding="utf-8")
    )
    lang_nav = lang_config["nav"]
    lang_use_nav = lang_nav
    lang_file_to_nav = get_file_to_nav_map(lang_use_nav)

    use_lang_file_to_nav = get_file_to_nav_map(lang_use_nav)
    for file in file_to_nav:
        file_path = Path(file)
        build_lang_file_path: Path = build_lang_path / "docs" / file_path
        default_lang_file_path: Path = default_lang_path / "docs" / file_path
        build_lang_file_path.parent.mkdir(parents=True, exist_ok=True)
        if not build_lang_file_path.is_file():
            # generate the text with translation missing
            default_text = default_lang_file_path.read_text(encoding="utf-8")
            lang_text = get_text_with_translate_missing(default_text)
            build_lang_file_path.write_text(lang_text, encoding="utf-8")

        # Remap the file's section path: where the translated nav already
        # contains a section's first file, reuse the translated section
        # titles; otherwise fall back to the default-language titles.
        file_key = file_to_nav[file]
        use_lang_file_to_nav[file] = file_key
        if file_key:
            composite_key = ()
            new_key = ()
            for key_part in file_key:
                composite_key += (key_part,)
                key_first_file = sections[composite_key]
                if type(key_first_file)==dict:
                    new_key += (key_part,)
                elif key_first_file in lang_file_to_nav:
                    new_key = lang_file_to_nav[key_first_file]
                else:
                    new_key += (key_part,)
            use_lang_file_to_nav[file] = new_key
    # Rebuild the nav tree from the (possibly translated) section paths,
    # preserving the default language's ordering.
    key_to_section = {(): []}
    for file, orig_file_key in file_to_nav.items():
        if file in use_lang_file_to_nav:
            file_key = use_lang_file_to_nav[file]
        else:
            file_key = orig_file_key
        section = get_key_section(key_to_section=key_to_section, key=file_key)
        section.append(file)
    # copy figures
    # NOTE(review): the replace() below hard-codes "docs_build/en/", so
    # figure directories are only copied for the "en" build — confirm
    # before adding further languages.
    for path, dirs, files in os.walk(default_lang_path):
        for dir in dirs:
            tmp_default_dir: Path = Path(path) / Path(dir)
            tmp_build_dir:Path = Path(str(tmp_default_dir).replace("docs/zh/","docs_build/en/"))
            if str(tmp_build_dir).endswith("/figure"):
                if not tmp_build_dir.exists():
                    shutil.copytree(tmp_default_dir,tmp_build_dir)

    new_nav = key_to_section[()]
    export_lang_nav = new_nav
    lang_config["nav"] = export_lang_nav
    build_lang_config_path: Path = build_lang_path / mkdocs_name
    build_lang_config_path.write_text(
        yaml.dump(lang_config, sort_keys=False, width=200, allow_unicode=True),
        encoding="utf-8",
    )
    # mkdocs resolves paths relative to the cwd, so build from inside the
    # staged directory and restore the cwd afterwards.
    current_dir = os.getcwd()
    os.chdir(build_lang_path)
    subprocess.run(["mkdocs", "build", "--site-dir", dist_path], check=True)
    os.chdir(current_dir)
    typer.secho(
        f"Successfully built docs for: {lang}", color=typer.colors.GREEN)


def generate_readme_content():
    """Return the default language's index.md content, used as README.md."""
    default_index = default_docs_path / "docs" / "index.md"
    content = default_index.read_text("utf-8")
    return content


@app.command()
def generate_readme():
    """
    Generate README.md content from main index.md
    """
    typer.echo("Generating README")
    readme_path = Path("README.md")
    new_content = generate_readme_content()
    readme_path.write_text(new_content, encoding="utf-8")


@app.command()
def verify_readme():
    """
    Verify README.md content from main index.md
    """
    typer.echo("Verifying README")
    readme_path = Path("README.md")
    generated_content = generate_readme_content()
    readme_content = readme_path.read_text("utf-8")
    # CI-style check: README.md must match what generate_readme would emit.
    if generated_content != readme_content:
        typer.secho(
            "README.md outdated from the latest index.md", color=typer.colors.RED
        )
        raise typer.Abort()
    typer.echo("Valid README ✅")
@app.command()
def build_all():
    """
    Build mkdocs site for default language, and then build each language inside, end result is located
    at directory ./site/ with each language inside.
    """
    site_path = Path("site").absolute()
    # Make sure every language's mkdocs.yml (nav + language switcher) is
    # current before building.
    update_languages(lang=None)
    current_dir = os.getcwd()
    os.chdir(default_docs_path)
    typer.echo("Building docs for: " + default_lang)
    # The default language is built straight into site/ ...
    subprocess.run(["mkdocs", "build", "--site-dir", site_path], check=True)
    os.chdir(current_dir)
    # ... and every other language into site/<lang>/ via build_lang.
    langs = []
    for lang in get_lang_paths():
        if lang == default_docs_path or not lang.is_dir():
            continue
        langs.append(lang.name)
    cpu_count = os.cpu_count() or 1
    # The builds are subprocess/I/O bound, hence more workers than cores.
    with Pool(cpu_count * 2) as p:
        p.map(build_lang, langs)


def update_single_lang(lang: str):
    # Thin wrapper around update_config with a progress message.
    lang_path = docs_root_path / lang
    typer.echo(f"Updating {lang_path.name}")
    update_config(lang_path.name)


@app.command()
def update_languages(
    lang: str = typer.Argument(
        None, callback=lang_callback, autocompletion=complete_existing_lang
    )
):
    """
    Update the mkdocs.yml file Languages section including all the available languages.

    The LANG argument is a 2-letter language code. If it's not provided, update all the
    mkdocs.yml files (for all the languages).
    """
    if lang is None:
        # No language given: refresh every language directory under docs/.
        for lang_path in get_lang_paths():
            if lang_path.is_dir():
                update_single_lang(lang_path.name)
    else:
        update_single_lang(lang)


@app.command()
def serve():
    """
    A quick server to preview a built site with translations.

    For development, prefer the command live (or just mkdocs serve).

    This is here only to preview a site with translations already built.

    Make sure you run the build-all command first.
    """
    typer.echo("Warning: this is a very simple server.")
    typer.echo("For development, use the command live instead.")
    typer.echo(
        "This is here only to preview a site with translations already built.")
    typer.echo("Make sure you run the build-all command first.")
    # Serve the pre-built ./site directory over plain HTTP on port 8008.
    os.chdir("site")
    server_address = ("", 8008)
    server = HTTPServer(server_address, SimpleHTTPRequestHandler)
    # NOTE(review): the f-string below has no placeholders; the port is
    # duplicated from server_address — keep the two in sync.
    typer.echo(f"Serving at: http://127.0.0.1:8008")
    server.serve_forever()


@app.command()
def live(
    lang: str = typer.Argument(
        None, callback=lang_callback, autocompletion=complete_existing_lang
    )
):
    """
    Serve with livereload a docs site for a specific language.

    This only shows the actual translated files, not the placeholders created with
    build-all.

    Takes an optional LANG argument with the name of the language to serve, by default
    language.
    """
    if lang is None:
        lang = default_lang
    lang_path: Path = docs_root_path / lang
    # mkdocs' own dev server provides the livereload for the chosen language.
    os.chdir(lang_path)
    mkdocs.commands.serve.serve(dev_addr="127.0.0.1:8008")
def update_config(lang: str):
    """Rewrite docs/<lang>/mkdocs.yml from the default config, preserving the
    translation's own nav and theme language, and refresh the language
    switcher ("extra.alternate") so every existing language directory is
    listed.
    """
    lang_path: Path = docs_root_path / lang
    config_path = lang_path / mkdocs_name
    current_config: dict = mkdocs.utils.yaml_load(
        config_path.read_text(encoding="utf-8")
    )
    if lang == default_lang:
        config = get_default_config()
    else:
        # Non-default languages start from the derived base config but keep
        # their own translated nav and theme language.
        config = get_base_lang_config(lang)
        config["nav"] = current_config["nav"]
        config["theme"]["language"] = current_config["theme"]["language"]
    # The default language is served from "/", every other one from "/<name>/".
    languages = [{default_lang: "/"}]
    alternate: List[Dict[str, str]] = config["extra"].get("alternate", [])
    alternate_dict = {alt["link"]: alt["name"] for alt in alternate}
    new_alternate: List[Dict[str, str]] = []
    for lang_path in get_lang_paths():
        if lang_path.name == default_lang or not lang_path.is_dir():
            continue
        name = lang_path.name
        languages.append({name: f"/{name}/"})
    # Human-readable display names for the language switcher.
    tmp_map = dict()
    tmp_map["zh"] = "zh - 汉语"
    tmp_map["en"] = "en - English"
    for lang_dict in languages:
        name = list(lang_dict.keys())[0]
        url = lang_dict[name]
        if url not in alternate_dict:
            # Bug fix: tmp_map[name] raised KeyError for any language other
            # than zh/en that had no pre-existing alternate entry (new_lang
            # accepts arbitrary 2-letter codes); fall back to the bare
            # language code instead of crashing.
            new_alternate.append({"link": url, "name": tmp_map.get(name, name)})
        else:
            use_name = alternate_dict[url]
            new_alternate.append({"link": url, "name": use_name})

    config["extra"]["alternate"] = new_alternate
    config_path.write_text(
        yaml.dump(config, sort_keys=False, width=200, allow_unicode=True),
        encoding="utf-8",
    )


def get_key_section(
    *, key_to_section: Dict[Tuple[str, ...], list], key: Tuple[str, ...]
) -> list:
    """Return the nav section (a list) addressed by *key*, creating every
    missing ancestor section in *key_to_section* on the way.

    *key* is a tuple of section titles; the empty tuple addresses the root
    list. A created section is appended to its parent as {title: [...]}.
    """
    if key in key_to_section:
        return key_to_section[key]
    super_key = key[:-1]
    title = key[-1]
    # Recursively materialize the parent chain first.
    super_section = get_key_section(
        key_to_section=key_to_section, key=super_key)
    new_section = []
    super_section.append({title: new_section})
    key_to_section[key] = new_section
    return new_section
def get_sections(nav: list) -> Dict[Tuple[str, ...], str]:
    """Map every section title path in *nav* (a tuple of titles) to the
    first entry nested under that section."""
    found: Dict[Tuple[str, ...], str] = {}
    for entry in nav:
        # Plain strings are page paths, not sections; only {title: sub_nav}
        # mappings carry section structure.
        if type(entry) is not dict:
            continue
        title = list(entry)[0]
        sub_nav = entry[title]
        found[(title,)] = sub_nav[0]
        # Prefix this title onto every nested section path.
        for sub_path, first_entry in get_sections(sub_nav).items():
            found[(title,) + sub_path] = first_entry
    return found


if __name__ == "__main__":
    app()