├── .editorconfig ├── .github └── workflows │ └── main.yml ├── .gitignore ├── LICENSE.GPL2 ├── README.md ├── benchmarks ├── bench_helper.py ├── test_dispatch.py ├── test_gpu_clock.py └── test_tmu_performance.py ├── examples ├── memset.py ├── pctr_gpu_clock.py ├── scopy.py ├── sgemm.py └── summation.py ├── setup.py ├── tests ├── test_alu.py ├── test_branch.py ├── test_condition_codes.py ├── test_driver.py ├── test_drm.py ├── test_labels.py ├── test_parallel.py ├── test_sfu.py ├── test_signals.py ├── test_tmu.py ├── test_unifa.py └── test_v3d.py └── videocore6 ├── __init__.py ├── assembler.py ├── driver.py ├── drm_v3d.py ├── readwrite4.c └── v3d.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # .editorconfig -- Config file for EditorConfig. http://editorconfig.org/ 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | trim_trailing_whitespace = true 7 | insert_final_newline = true 8 | 9 | [*.py] 10 | indent_style = space 11 | indent_size = 4 12 | 13 | [*.c] 14 | indent_style = space 15 | indent_size = 4 16 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push] 4 | 5 | jobs: 6 | 7 | test: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v1 11 | - name: Set up Python 3.9 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: 3.9 15 | - name: Test code format 16 | run: | 17 | pip3 install autopep8 18 | autopep8 --diff --exit-code --max-line-length 128 --recursive videocore6 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Created by https://www.gitignore.io/api/git,vim,linux,emacs,python 4 | # Edit at https://www.gitignore.io/?templates=git,vim,linux,emacs,python 5 | 6 | 
### Emacs ### 7 | # -*- mode: gitignore; -*- 8 | *~ 9 | \#*\# 10 | /.emacs.desktop 11 | /.emacs.desktop.lock 12 | *.elc 13 | auto-save-list 14 | tramp 15 | .\#* 16 | 17 | # Org-mode 18 | .org-id-locations 19 | *_archive 20 | 21 | # flymake-mode 22 | *_flymake.* 23 | 24 | # eshell files 25 | /eshell/history 26 | /eshell/lastdir 27 | 28 | # elpa packages 29 | /elpa/ 30 | 31 | # reftex files 32 | *.rel 33 | 34 | # AUCTeX auto folder 35 | /auto/ 36 | 37 | # cask packages 38 | .cask/ 39 | dist/ 40 | 41 | # Flycheck 42 | flycheck_*.el 43 | 44 | # server auth directory 45 | /server/ 46 | 47 | # projectiles files 48 | .projectile 49 | 50 | # directory configuration 51 | .dir-locals.el 52 | 53 | # network security 54 | /network-security.data 55 | 56 | 57 | ### Git ### 58 | # Created by git for backups. To disable backups in Git: 59 | # $ git config --global mergetool.keepBackup false 60 | *.orig 61 | 62 | # Created by git when using merge tools for conflicts 63 | *.BACKUP.* 64 | *.BASE.* 65 | *.LOCAL.* 66 | *.REMOTE.* 67 | *_BACKUP_*.txt 68 | *_BASE_*.txt 69 | *_LOCAL_*.txt 70 | *_REMOTE_*.txt 71 | 72 | ### Linux ### 73 | 74 | # temporary files which can be created if a process still has a handle open of a deleted file 75 | .fuse_hidden* 76 | 77 | # KDE directory preferences 78 | .directory 79 | 80 | # Linux trash folder which might appear on any partition or disk 81 | .Trash-* 82 | 83 | # .nfs files are created when an open file is removed but is still being accessed 84 | .nfs* 85 | 86 | ### Python ### 87 | # Byte-compiled / optimized / DLL files 88 | __pycache__/ 89 | *.py[cod] 90 | *$py.class 91 | 92 | # C extensions 93 | *.so 94 | 95 | # Distribution / packaging 96 | .Python 97 | build/ 98 | develop-eggs/ 99 | downloads/ 100 | eggs/ 101 | .eggs/ 102 | lib/ 103 | lib64/ 104 | parts/ 105 | sdist/ 106 | var/ 107 | wheels/ 108 | pip-wheel-metadata/ 109 | share/python-wheels/ 110 | *.egg-info/ 111 | .installed.cfg 112 | *.egg 113 | MANIFEST 114 | 115 | # PyInstaller 116 | # 
Usually these files are written by a python script from a template 117 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 118 | *.manifest 119 | *.spec 120 | 121 | # Installer logs 122 | pip-log.txt 123 | pip-delete-this-directory.txt 124 | 125 | # Unit test / coverage reports 126 | htmlcov/ 127 | .tox/ 128 | .nox/ 129 | .coverage 130 | .coverage.* 131 | .cache 132 | nosetests.xml 133 | coverage.xml 134 | *.cover 135 | .hypothesis/ 136 | .pytest_cache/ 137 | 138 | # Translations 139 | *.mo 140 | *.pot 141 | 142 | # Scrapy stuff: 143 | .scrapy 144 | 145 | # Sphinx documentation 146 | docs/_build/ 147 | 148 | # PyBuilder 149 | target/ 150 | 151 | # pyenv 152 | .python-version 153 | 154 | # pipenv 155 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 156 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 157 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 158 | # install all needed dependencies. 
159 | #Pipfile.lock 160 | 161 | # celery beat schedule file 162 | celerybeat-schedule 163 | 164 | # SageMath parsed files 165 | *.sage.py 166 | 167 | # Spyder project settings 168 | .spyderproject 169 | .spyproject 170 | 171 | # Rope project settings 172 | .ropeproject 173 | 174 | # Mr Developer 175 | .mr.developer.cfg 176 | .project 177 | .pydevproject 178 | 179 | # mkdocs documentation 180 | /site 181 | 182 | # mypy 183 | .mypy_cache/ 184 | .dmypy.json 185 | dmypy.json 186 | 187 | # Pyre type checker 188 | .pyre/ 189 | 190 | ### Vim ### 191 | # Swap 192 | [._]*.s[a-v][a-z] 193 | [._]*.sw[a-p] 194 | [._]s[a-rt-v][a-z] 195 | [._]ss[a-gi-z] 196 | [._]sw[a-p] 197 | 198 | # Session 199 | Session.vim 200 | Sessionx.vim 201 | 202 | # Temporary 203 | .netrwhist 204 | # Auto-generated tag files 205 | tags 206 | # Persistent undo 207 | [._]*.un~ 208 | 209 | # End of https://www.gitignore.io/api/git,vim,linux,emacs,python 210 | -------------------------------------------------------------------------------- /LICENSE.GPL2: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. 
(Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. 
We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. 
You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 
113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. 
If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 
292 | 293 | <one line to give the program's name and a brief idea of what it does.> 294 | Copyright (C) <year> <name of author> 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 332 | <signature of Ty Coon>, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # py-videocore6 2 | 3 | A Python library for GPGPU programming on Raspberry Pi 4, which realizes 4 | assembling and running QPU programs. 5 | 6 | For Raspberry Pi Zero/1/2/3, use 7 | [nineties/py-videocore](https://github.com/nineties/py-videocore) instead. 8 | 9 | 10 | ## About VideoCore VI QPU 11 | 12 | Raspberry Pi 4 (BCM2711) has a GPU named VideoCore VI QPU in its SoC. 13 | The basic instruction set (add/mul ALU dual issue, three delay slots et al.) 14 | remains the same as VideoCore IV QPU of Raspberry Pi Zero/1/2/3, but some units 15 | now behave differently. 16 | For instance, the TMU can now write to memory in addition to reading, and it seems 17 | that the VPM DMA is no longer available. 18 | 19 | Theoretical peak performances of the QPUs are as follows. 20 | 21 | - VideoCore IV QPU @ 250MHz: 250 [MHz] x 3 [slice] x 4 [qpu/slice] x 4 [physical core/qpu] x 2 [op/cycle] = 24 [Gflop/s] 22 | - VideoCore IV QPU @ 300MHz: 300 [MHz] x 3 [slice] x 4 [qpu/slice] x 4 [physical core/qpu] x 2 [op/cycle] = 28.8 [Gflop/s] 23 | - VideoCore VI QPU @ 500MHz: 500 [MHz] x 2 [slice] x 4 [qpu/slice] x 4 [physical core/qpu] x 2 [op/cycle] = 32 [Gflop/s] 24 | 25 | 26 | ## Requirements 27 | 28 | `py-videocore6` communicates with the V3D hardware through `/dev/dri/card0`, 29 | which is exposed by the DRM V3D driver.
30 | To access the device, you need to belong to `video` group or be `root` user. 31 | If you choose the former, run `sudo usermod --append --groups video $USER` 32 | (re-login to take effect). 33 | 34 | 35 | ## Installation 36 | 37 | You can install `py-videocore6` directly using `pip`: 38 | 39 | ```console 40 | $ sudo apt update 41 | $ sudo apt upgrade 42 | $ sudo apt install python3-pip python3-numpy 43 | $ pip3 install --user --upgrade pip setuptools wheel 44 | $ pip3 install --user git+https://github.com/Idein/py-videocore6.git 45 | ``` 46 | 47 | If you are willing to run tests and examples, install `py-videocore6` after 48 | cloning it: 49 | 50 | ```console 51 | $ sudo apt update 52 | $ sudo apt upgrade 53 | $ sudo apt install python3-pip python3-numpy libatlas3-base 54 | $ python3 -m pip install --user --upgrade pip setuptools wheel 55 | $ git clone https://github.com/Idein/py-videocore6.git 56 | $ cd py-videocore6/ 57 | $ python3 -m pip install --target sandbox/ --upgrade . nose 58 | ``` 59 | 60 | 61 | ## Running tests and examples 62 | 63 | In the `py-videocore6` directory cloned above: 64 | 65 | ```console 66 | $ python3 setup.py build_ext --inplace 67 | $ PYTHONPATH=sandbox/ python3 -m nose -v -s 68 | ``` 69 | 70 | ```console 71 | $ PYTHONPATH=sandbox/ python3 examples/sgemm.py 72 | ==== sgemm example (1024x1024 times 1024x1024) ==== 73 | numpy: 0.6986 sec, 3.078 Gflop/s 74 | QPU: 0.5546 sec, 3.878 Gflop/s 75 | Minimum absolute error: 0.0 76 | Maximum absolute error: 0.0003814697265625 77 | Minimum relative error: 0.0 78 | Maximum relative error: 0.13375753164291382 79 | ``` 80 | 81 | ```console 82 | $ PYTHONPATH=sandbox/ python3 examples/summation.py 83 | ==== summaton example (32.0 Mi elements) ==== 84 | Preparing for buffers... 85 | Executing on QPU... 
86 | 0.01853448400004254 sec, 7241.514141947083 MB/s 87 | ``` 88 | 89 | ```console 90 | $ PYTHONPATH=sandbox/ python3 examples/memset.py 91 | ==== memset example (64.0 MiB) ==== 92 | Preparing for buffers... 93 | Executing on QPU... 94 | 0.01788834699993913 sec, 3751.5408215319367 MB/s 95 | ``` 96 | 97 | ```console 98 | $ PYTHONPATH=sandbox/ python3 examples/scopy.py 99 | ==== scopy example (16.0 Mi elements) ==== 100 | Preparing for buffers... 101 | Executing on QPU... 102 | 0.02768789600000332 sec, 2423.761776625857 MB/s 103 | ``` 104 | 105 | ```console 106 | $ sudo PYTHONPATH=sandbox/ python3 examples/pctr_gpu_clock.py 107 | ==== QPU clock measurement with performance counters ==== 108 | 500.529835 MHz 109 | ``` 110 | 111 | You may see lower performance without `force_turbo=1` in `/boot/config.txt`. 112 | 113 | 114 | ## References 115 | 116 | - DRM V3D driver which controls QPU via hardware V3D registers: [linux/drivers/gpu/drm/v3d](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/drivers/gpu/drm/v3d) 117 | - Mesa library which partially includes the QPU instruction set: [mesa/src/broadcom/qpu](https://gitlab.freedesktop.org/mesa/mesa/-/tree/main/src/broadcom/qpu) 118 | - Mesa also includes QPU program disassembler, which can be tested with: [Terminus-IMRC/vc6qpudisas](https://github.com/Terminus-IMRC/vc6qpudisas) 119 | -------------------------------------------------------------------------------- /benchmarks/bench_helper.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2019-2020 Idein Inc. 
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice (including the next 12 | # paragraph) shall be included in all copies or substantial portions of the 13 | # Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
class BenchHelper(object):
    """ctypes wrapper around a tiny C helper that busy-waits on a memory word.

    The helper library exports a single function, ``wait_address``, which
    spins until the 32-bit word it is given becomes non-zero.  The library is
    loaded from ``path``; if it cannot be loaded it is built with ``gcc`` from
    the embedded C source and then loaded.

    Args:
        path: Location of the shared library to load (and, if necessary,
            to create).

    Raises:
        subprocess.CalledProcessError: If the library had to be built and
            ``gcc`` exited with a non-zero status.
        OSError: If the library still cannot be loaded after building, or if
            ``gcc`` itself cannot be executed.
    """

    # C source of the helper library.  <stdint.h> is required for uint32_t.
    _C_SOURCE = '''
#include <stdint.h>
void wait_address(uint32_t volatile * p) {
    while(p[0] == 0){}
}
'''

    def __init__(self, path='./libbench_helper.so'):

        try:
            self.lib = cdll.LoadLibrary(path)
        except OSError:
            # The library is missing (or unloadable): compile it from the
            # embedded source, which gcc reads from stdin ("-xc -").
            # The command is an argv list so that a path containing
            # whitespace is passed through as a single argument, and
            # check=True surfaces a compiler failure directly instead of a
            # confusing second load error.
            subprocess.run(
                ['gcc', '-O2', '-shared', '-fPIC', '-o', path, '-xc', '-'],
                text=True,
                input=self._C_SOURCE,
                check=True,
            )
            self.lib = cdll.LoadLibrary(path)

        # wait_address takes a one-element, C-contiguous uint32 array and
        # returns void.
        self.lib.wait_address.argtypes = [
            np.ctypeslib.ndpointer(dtype=np.uint32, shape=(1,), flags="C_CONTIGUOUS"),
        ]
        self.lib.wait_address.restype = None

    def wait_address(self, done):
        """Block (busy-wait) until ``done[0]`` becomes non-zero.

        Args:
            done: One-element, C-contiguous ``uint32`` array, as required by
                the ``argtypes`` declared in ``__init__``.
        """
        self.lib.wait_address(done)
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import time
from videocore6.driver import Driver
from videocore6.assembler import qpu
import numpy as np
from bench_helper import BenchHelper

@qpu
def qpu_write_N(asm, N):
    """QPU program: store the constant N to a 16-word row, then raise a flag.

    Uniforms, in the order the host test below fills them:
      0: address of the destination row
      1: address of the one-word completion flag (loaded into rf0)
    """

    # r0 = element index * 4 (byte offset of each lane's word).
    # NOTE(review): the first uniform is consumed via r5 below; this relies on
    # the `ldunif` signal delivering it there - confirm against the assembler.
    eidx(r0, sig = ldunif)
    nop(sig = ldunifrf(rf0))
    shl(r0, r0, 2)
    mov(tmud, N)
    add(tmua, r5, r0)
    tmuwt()

    # Signal completion only after the data write has been waited for.
    mov(tmud, 1)
    mov(tmua, rf0)
    tmuwt()

    # Program-end sequence used by every kernel in this repository.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()

def test_multiple_dispatch_delay():
    """Compare three ways of waiting for a series of small dispatches.

    1. queue every dispatch and let the dispatcher context wait on exit;
    2. poll the completion flag after each dispatch;
    3. as 2, but sleep for one second before each dispatch.
    Each phase checks that row i of `data` ends up filled with i.
    """
    print()

    bench = BenchHelper('benchmarks/libbench_helper.so')

    with Driver() as drv:

        data = drv.alloc((10, 16), dtype = 'uint32')
        num = data.shape[0]
        # `i=i` binds the constant eagerly so each program gets its own value
        # regardless of when the lambda is invoked.
        code = [drv.program(lambda asm, i=i: qpu_write_N(asm, i)) for i in range(num)]
        unif = drv.alloc((num, 2), dtype = 'uint32')
        done = drv.alloc(1, dtype = 'uint32')
        expected = np.arange(num).reshape(num, 1)

        data[:] = 0
        unif[:,0] = data.addresses()[:,0]
        unif[:,1] = done.addresses()[0]

        # Phase 1: API-level wait.  time.monotonic() is used for all intervals
        # because time.time() may jump when the system clock is adjusted.
        ref_start = time.monotonic()
        with drv.compute_shader_dispatcher() as csd:
            for i in range(num):
                csd.dispatch(code[i], unif.addresses()[i,0])
        ref_end = time.monotonic()
        assert (data == expected).all()

        # Phase 2: poll the flag after every dispatch.
        data[:] = 0

        naive_results = np.zeros(num, dtype='float32')
        with drv.compute_shader_dispatcher() as csd:
            for i in range(num):
                done[:] = 0
                start = time.monotonic()
                csd.dispatch(code[i], unif.addresses()[i,0])
                bench.wait_address(done)
                end = time.monotonic()
                naive_results[i] = end - start
        assert (data == expected).all()

        # Phase 3: same, with an idle second before each dispatch.  The
        # buffer is cleared first; otherwise the values left by phase 2 would
        # make the final assertion pass without checking anything.
        data[:] = 0

        sleep_results = np.zeros(num, dtype='float32')
        with drv.compute_shader_dispatcher() as csd:
            for i in range(num):
                done[:] = 0
                time.sleep(1)
                start = time.monotonic()
                csd.dispatch(code[i], unif.addresses()[i,0])
                bench.wait_address(done)
                end = time.monotonic()
                sleep_results[i] = end - start
        assert (data == expected).all()

        print()
        print(f'API wait after {data.shape[0]} dispatch: {ref_end - ref_start:.6f} sec')
        print(f'polling wait for each {data.shape[0]} dispatch:')
        print(f' total: {np.sum(naive_results):.6f} sec')
        print(f' details: {" ".join([f"{t:.6f}" for t in naive_results])}')
        print(f'polling wait for each {data.shape[0]} dispatch with between sleep:')
        print(f' total: {np.sum(sleep_results):.6f} sec + sleep...')
        print(f' details: {" ".join([f"{t:.6f}" for t in sleep_results])}')


# --------------------------------------------------------------------------------
# /benchmarks/test_gpu_clock.py:
# --------------------------------------------------------------------------------
#
# Copyright (c) 2019-2020 Idein Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import time
from videocore6.driver import Driver
from videocore6.assembler import qpu
from bench_helper import BenchHelper

@qpu
def qpu_clock(asm):
    """QPU program: count r5 down in a five-instruction loop, then raise a flag.

    Uniforms, in the order the host test below fills them:
      0: iteration count
      1: address of the one-word completion flag (loaded into rf0)
    """

    # NOTE(review): the loop counter lives in r5, so the first uniform is
    # expected to arrive there via the `ldunif` signal - confirm.
    nop(sig = ldunif)
    nop(sig = ldunifrf(rf0))

    # Loop body: sub + branch + three delay-slot nops = 5 instructions, which
    # is the factor 5 in the MHz estimate printed by test_clock.
    with loop as l:
        sub(r5, r5, 1, cond = 'pushn')
        l.b(cond = 'anyna')
        nop()
        nop()
        nop()

    mov(tmud, 1)
    mov(tmua, rf0)
    tmuwt()

    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()


def test_clock():
    """Estimate the QPU clock from the run time of a fixed-length loop."""
    print()

    bench = BenchHelper('benchmarks/libbench_helper.so')

    with Driver() as drv:

        f = 2 ** 25  # loop iterations requested from the kernel

        code = drv.program(qpu_clock)
        unif = drv.alloc(2, dtype = 'uint32')
        done = drv.alloc(1, dtype = 'uint32')

        done[:] = 0

        unif[0] = f
        unif[1] = done.addresses()[0]

        with drv.compute_shader_dispatcher() as csd:
            # Monotonic clock: immune to wall-clock adjustments mid-measurement.
            start = time.monotonic()
            csd.dispatch(code, unif.addresses()[0])
            bench.wait_address(done)
            end = time.monotonic()

        elapsed = end - start
        print(f'{elapsed:.6f} sec')
        # 5 instructions per iteration; the trailing factor 4 is kept from the
        # original formula (presumably clock cycles per instruction - confirm).
        print(f'{f * 5 / elapsed / 1000 / 1000 * 4:.6f} MHz')


# --------------------------------------------------------------------------------
# /benchmarks/test_tmu_performance.py:
# --------------------------------------------------------------------------------
#
# Copyright (c) 2019-2020 Idein Inc.
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice (including the next 12 | # paragraph) shall be included in all copies or substantial portions of the 13 | # Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 


import time
from videocore6.driver import Driver
from videocore6.assembler import qpu
import numpy as np
import matplotlib.pyplot as plt
from bench_helper import BenchHelper


@qpu
def qpu_tmu_load_1_slot_1_qpu(asm, nops):
    """QPU program: sum a strided stream of TMU loads on a single QPU.

    `nops` no-op instructions are emitted between each TMU read request and
    the matching `ldtmu`, which is the quantity the benchmark sweeps.
    Threads whose (tidx >> 2) & 0b1111 is non-zero skip straight to `done`.
    """

    nop(sig = ldunifrf(rf0)) # X.shape[1]
    nop(sig = ldunifrf(rf1)) # X
    nop(sig = ldunifrf(rf2)) # X.stride[1]
    nop(sig = ldunifrf(rf3)) # X.stride[0]
    nop(sig = ldunifrf(rf4)) # Y
    nop(sig = ldunifrf(rf5)) # done

    barrierid(syncb, sig = thrsw)
    nop()
    nop()

    tidx(r0)
    shr(r0, r0, 2)
    band(r0, r0, 0b1111, cond = 'pushz')
    b(R.done, cond = 'allna')
    nop() # delay slot
    nop() # delay slot
    nop() # delay slot

    # Y address per lane: Y + 4 * element index.
    eidx(r0)
    shl(r0, r0, 2)
    add(rf4, rf4, r0)

    # X address per lane: X + element index * rf3.
    eidx(r0)
    umul24(r0, r0, rf3)
    add(rf1, rf1, r0)

    mov(r2, 0.0)
    with loop as l:
        mov(tmua, rf1).add(rf1, rf1, rf2)
        for i in range(nops):
            nop()
        nop(sig = ldtmu(r3))
        sub(rf0, rf0, 1, cond = 'pushz')
        l.b(cond = 'anyna')
        fadd(r2, r2, r3) # delay slot
        nop()            # delay slot
        nop()            # delay slot

    mov(tmud, r2)
    mov(tmua, rf4)
    tmuwt()

    mov(tmud, 1)
    mov(tmua, rf5)
    tmuwt()

    L.done
    barrierid(syncb, sig = thrsw)
    nop()
    nop()

    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()

def test_tmu_load_1_slot_1_qpu():
    """Plot dispatch time against the number of nops before `ldtmu` (1 QPU).

    Runs once per memory layout (`trans`), ten timed dispatches for each nop
    count, checks the sums against NumPy and saves a scatter plot as PNG.
    """

    bench = BenchHelper('benchmarks/libbench_helper.so')

    for trans in [False, True]:

        with Driver() as drv:

            loop = 2**15

            X = drv.alloc((16, loop) if trans else (loop, 16), dtype = 'float32')
            Y = drv.alloc(16, dtype = 'float32')
            unif = drv.alloc(6, dtype = 'uint32')
            done = drv.alloc(1, dtype = 'uint32')

            unif[0] = loop
            unif[1] = X.addresses()[0,0]
            unif[2] = X.strides[int(trans)]
            unif[3] = X.strides[1-int(trans)]
            unif[4] = Y.addresses()[0]
            unif[5] = done.addresses()[0]

            results = np.zeros((24, 10), dtype = 'float32')

            fig = plt.figure()
            ax = fig.add_subplot(1,1,1)
            ax.set_title(f'TMU load latency (1 slot, 1 qpu, stride=({unif[2]},{unif[3]}))')
            ax.set_xlabel('# of nop (between request and load signal)')
            ax.set_ylabel('sec')

            print()
            for nops in range(results.shape[0]):

                # `nops=nops` binds the current value eagerly.
                code = drv.program(lambda asm, nops=nops: qpu_tmu_load_1_slot_1_qpu(asm, nops))

                for i in range(results.shape[1]):

                    with drv.compute_shader_dispatcher() as csd:

                        X[:] = np.random.randn(*X.shape) / X.shape[int(trans)]
                        Y[:] = 0.0
                        done[:] = 0

                        # Monotonic clock: unaffected by wall-clock adjustments.
                        start = time.monotonic()
                        csd.dispatch(code, unif.addresses()[0], thread = 8)
                        bench.wait_address(done)
                        end = time.monotonic()

                    results[nops,i] = end - start

                    assert np.allclose(Y, np.sum(X, axis=int(trans)), atol = 1e-4)

                ax.scatter(np.zeros(results.shape[1])+nops, results[nops], s=1, c='blue')

                print('{:4}/{}\t{:.9f}'.format(nops, results.shape[0], np.sum(results[nops]) / results.shape[1]))

            ax.set_ylim(auto=True)
            ax.set_xlim(0, results.shape[0])
            fig.savefig(f'benchmarks/tmu_load_1_slot_1_qpu_{unif[2]}_{unif[3]}.png')
            # pyplot keeps every figure alive until it is closed explicitly.
            plt.close(fig)

@qpu
def qpu_tmu_load_2_slot_1_qpu(asm, nops):
    """QPU program: as the 1-slot kernel, but run by every fourth thread group.

    Threads whose (tidx >> 2) & 0b0011 is non-zero skip the measurement; the
    thread whose (tidx >> 2) & 0b1111 is zero writes the completion flag after
    the second barrier.
    """

    nop(sig = ldunifrf(rf0)) # X.shape[1]
    nop(sig = ldunifrf(rf1)) # X
    nop(sig = ldunifrf(rf2)) # X.stride[1]
    nop(sig = ldunifrf(rf3)) # X.stride[0]
    nop(sig = ldunifrf(rf4)) # Y
    nop(sig = ldunifrf(rf5)) # done

    barrierid(syncb, sig = thrsw)
    nop()
    nop()

    tidx(r0)
    shr(r0, r0, 2)
    band(r0, r0, 0b0011, cond = 'pushz')
    b(R.skip_bench, cond = 'allna')
    nop()
    nop()
    nop()

    # Y address: Y + 4 * element index + 64 * ((tidx >> 2) & 0b1111).
    eidx(r0)
    shl(r0, r0, 2)
    add(rf4, rf4, r0)
    tidx(r0)
    shr(r0, r0, 2)
    band(r0, r0, 0b1111)
    shl(r1, 4, 4)
    umul24(r0, r0, r1)
    add(rf4, rf4, r0)

    # X address: X + element index * rf3 + ((tidx >> 2) & 0b1111) * (rf0 << 6).
    eidx(r0)
    umul24(r0, r0, rf3)
    add(rf1, rf1, r0)
    tidx(r0)
    shr(r0, r0, 2)
    band(r0, r0, 0b1111)
    shl(r1, rf0, 6)
    umul24(r0, r0, r1)
    add(rf1, rf1, r0)

    mov(r2, 0.0)
    with loop as l:
        mov(tmua, rf1).add(rf1, rf1, rf2)
        for i in range(nops):
            nop()
        nop(sig = ldtmu(r3))
        sub(rf0, rf0, 1, cond = 'pushz')
        l.b(cond = 'anyna')
        fadd(r2, r2, r3) # delay slot
        nop()            # delay slot
        nop()            # delay slot

    mov(tmud, r2)
    mov(tmua, rf4)
    tmuwt()

    L.skip_bench

    barrierid(syncb, sig = thrsw)
    nop()
    nop()

    tidx(r0)
    shr(r0, r0, 2)
    band(r0, r0, 0b1111, cond = 'pushz')
    b(R.skip_done, cond = 'allna')
    nop()
    nop()
    nop()
    mov(tmud, 1)
    mov(tmua, rf5)
    tmuwt()
    L.skip_done

    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()

def test_tmu_load_2_slot_1_qpu():
    """Plot dispatch time against the number of nops before `ldtmu` (2 slots).

    For each layout a window [min_nops, max_nops) of nop counts is swept, ten
    timed dispatches each.  Rows 0 and 4 of Y must match NumPy's sums and the
    remaining rows must stay zero.
    """

    bench = BenchHelper('benchmarks/libbench_helper.so')

    for trans, min_nops, max_nops in [(False, 0, 64), (True, 128-32, 128+32)]:

        with Driver() as drv:

            loop = 2**13

            X = drv.alloc((8, 16, loop) if trans else (8, loop, 16), dtype = 'float32')
            Y = drv.alloc((8, 16), dtype = 'float32')
            unif = drv.alloc(6, dtype = 'uint32')
            done = drv.alloc(1, dtype = 'uint32')

            unif[0] = loop
            unif[1] = X.addresses()[0,0,0]
            unif[2] = X.strides[1+int(trans)]
            unif[3] = X.strides[2-int(trans)]
            unif[4] = Y.addresses()[0,0]
            unif[5] = done.addresses()[0]

            # Indexed by absolute nop count; rows below min_nops stay unused.
            results = np.zeros((max_nops, 10), dtype = 'float32')

            fig = plt.figure()
            ax = fig.add_subplot(1,1,1)
            ax.set_title(f'TMU load latency (2 slot, 1 qpu, stride=({unif[2]},{unif[3]}))')
            ax.set_xlabel('# of nop (between request and load signal)')
            ax.set_ylabel('sec')

            print()
            for nops in range(min_nops, results.shape[0]):

                # `nops=nops` binds the current value eagerly.
                code = drv.program(lambda asm, nops=nops: qpu_tmu_load_2_slot_1_qpu(asm, nops))

                for i in range(results.shape[1]):

                    with drv.compute_shader_dispatcher() as csd:

                        X[:] = np.random.randn(*X.shape) / X.shape[1+int(trans)]
                        Y[:] = 0.0
                        done[:] = 0

                        # Monotonic clock: unaffected by wall-clock adjustments.
                        start = time.monotonic()
                        csd.dispatch(code, unif.addresses()[0], thread = 8)
                        bench.wait_address(done)
                        end = time.monotonic()

                    results[nops,i] = end - start

                    assert np.allclose(Y[0::4], np.sum(X[0::4], axis=1+int(trans)), atol = 1e-4)
                    assert (Y[1:4] == 0).all()
                    assert (Y[5:8] == 0).all()

                ax.scatter(np.zeros(results.shape[1])+nops, results[nops], s=1, c='blue')

                print('{:4}/{}\t{:.9f}'.format(nops, results.shape[0], np.sum(results[nops]) / results.shape[1]))

            ax.set_ylim(auto=True)
            ax.set_xlim(min_nops, max_nops)
            fig.savefig(f'benchmarks/tmu_load_2_slot_1_qpu_{unif[2]}_{unif[3]}.png')
            # pyplot keeps every figure alive until it is closed explicitly.
            plt.close(fig)


# --------------------------------------------------------------------------------
# /examples/memset.py:
# --------------------------------------------------------------------------------
#
# Copyright (c) 2019-2020 Idein Inc.
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice (including the next 12 | # paragraph) shall be included in all copies or substantial portions of the 13 | # Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 


from time import monotonic

import numpy as np

from videocore6.assembler import qpu
from videocore6.driver import Driver


@qpu
def qpu_memset(asm, *, num_qpus, unroll_shift, code_offset,
               align_cond=lambda pos: pos % 512 == 0):
    """QPU program: fill a buffer with a 32-bit value through TMU writes.

    Args:
        num_qpus: 1 or 8; any other value raises ValueError.
        unroll_shift: log2 of the unroll factor of the main loop.
        code_offset: position of this program in the code area, in
            instructions; only used for alignment of the loop head.
        align_cond: predicate on the instruction position; nops are emitted
            until it holds for the first instruction of the loop.
    """

    # Give the register-file entries readable names (reg_dst, reg_fill, ...).
    g = globals()
    for i, v in enumerate(['dst', 'fill', 'length', 'qpu_num', 'stride']):
        g[f'reg_{v}'] = rf[i]

    nop(sig=ldunifrf(reg_dst))
    nop(sig=ldunifrf(reg_fill))
    nop(sig=ldunifrf(reg_length))

    if num_qpus == 1:
        num_qpus_shift = 0
        mov(reg_qpu_num, 0)
    elif num_qpus == 8:
        num_qpus_shift = 3
        tidx(r0)
        shr(r0, r0, 2)
        band(reg_qpu_num, r0, 0b1111)
    else:
        # ValueError is a subclass of Exception, so existing handlers still match.
        raise ValueError('num_qpus must be 1 or 8')

    # addr += 4 * 4 * (thread_num + 16 * qpu_num)
    shl(r0, reg_qpu_num, 4)
    eidx(r1)
    add(r0, r0, r1)
    shl(r0, r0, 4)
    add(reg_dst, reg_dst, r0)

    # stride = 4 * 4 * 16 * num_qpus
    mov(r0, 1)
    shl(reg_stride, r0, 8 + num_qpus_shift)

    # length /= 16 * num_qpus * unroll
    shr(reg_length, reg_length, 4 + num_qpus_shift + unroll_shift)

    # Pad with nops until the loop head satisfies the alignment predicate.
    while not align_cond(code_offset + len(asm)):
        nop()

    with loop as l:

        unroll = 1 << unroll_shift

        for i in range(unroll // 4 - 1):
            mov(tmud, reg_fill)
            mov(tmud, reg_fill)
            mov(tmud, reg_fill)
            mov(tmud, reg_fill)
            mov(tmuau if i % 4 == 0 else tmua, reg_dst).add(reg_dst, reg_dst, reg_stride)

        # Last group of four words; the loop counter update and the branch are
        # interleaved with it, the final three instructions being delay slots.
        mov(tmud, reg_fill).mov(r0, 1)
        mov(tmud, reg_fill).sub(reg_length, reg_length, r0, cond='pushz')

        l.b(cond='na0').unif_addr(absolute=False)
        mov(tmud, reg_fill)
        mov(tmud, reg_fill)
        mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_stride)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def memset(*, fill, length, num_qpus=8, unroll_shift=5):
    """Fill `length` uint32 words with `fill` on the QPUs and report throughput.

    Args:
        fill: 32-bit value to store.
        length: number of words; must be a positive multiple of
            16 * num_qpus * 2**unroll_shift.
        num_qpus: number of QPUs to use (1 or 8).
        unroll_shift: log2 of the kernel's unroll factor; at least 4.
    """

    # Check unroll_shift first: the divisibility test below shifts by it.
    assert unroll_shift >= 4
    assert length > 0
    assert length % (16 * num_qpus * (1 << unroll_shift)) == 0

    print(f'==== memset example ({length * 4 / 1024 / 1024} MiB) ====')

    with Driver(data_area_size=(length + 1024) * 4) as drv:

        code = drv.program(qpu_memset, num_qpus=num_qpus,
                           unroll_shift=unroll_shift,
                           code_offset=drv.code_pos // 8)

        print('Preparing for buffers...')

        X = drv.alloc(length, dtype='uint32')

        # Start from the bitwise complement so a no-op kernel cannot pass.
        X.fill(~fill & 0xFFFFFFFF)

        assert not np.array_equiv(X, fill)

        unif = drv.alloc(3 + (1 << (unroll_shift - 4)) + 1, dtype='uint32')
        unif[0] = X.addresses()[0]
        unif[1] = fill
        unif[2] = length
        # NOTE(review): the words from index 3 on are consumed inside the loop
        # (tmuau writes and the branch's relative uniform address); the last
        # one is a negative byte offset back to index 3 - confirm the encoding.
        unif[3: -1] = 0xfcfcfcfc
        unif[-1] = 4 * (-len(unif) + 3) & 0xFFFFFFFF

        print('Executing on QPU...')

        start = monotonic()
        drv.execute(code, unif.addresses()[0], thread=num_qpus)
        end = monotonic()

        assert np.array_equiv(X, fill)

        print(f'{end - start} sec, {length * 4 / (end - start) * 1e-6} MB/s')


def main():
    """Run the example on 16 Mi words."""

    memset(fill=0x5a5a5a5a, length=16 * 1024 * 1024)


if __name__ == '__main__':

    main()


# --------------------------------------------------------------------------------
# /examples/pctr_gpu_clock.py:
# --------------------------------------------------------------------------------
#
# Copyright (c) 2019-2020 Idein Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import time

# Explicit names instead of a star import, so it is clear where they come from.
from videocore6.v3d import CORE_PCTR_CYCLE_COUNT, PerformanceCounter, RegisterMapping

# Let the cycle counter run for about one second and derive the clock from it.
with RegisterMapping() as regmap:

    with PerformanceCounter(regmap, [CORE_PCTR_CYCLE_COUNT]) as pctr:

        # time.sleep() may sleep longer than requested, so the actual interval
        # is measured rather than assumed to be exactly one second.
        start = time.monotonic()
        time.sleep(1)
        result = pctr.result()
        elapsed = time.monotonic() - start

print('==== QPU clock measurement with performance counters ====')
print(f'{result[0] / elapsed * 1e-6} MHz')


# --------------------------------------------------------------------------------
# /examples/scopy.py:
# --------------------------------------------------------------------------------
#
# Copyright (c) 2019-2020 Idein Inc.
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice (including the next 12 | # paragraph) shall be included in all copies or substantial portions of the 13 | # Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 


from time import monotonic

import numpy as np

from videocore6.assembler import qpu
from videocore6.driver import Driver


@qpu
def qpu_scopy(asm, *, num_qpus, unroll_shift, code_offset,
              align_cond=lambda pos: pos % 512 == 259):
    """QPU program: copy a uint32 buffer with TMU reads and TMU writes.

    Args:
        num_qpus: 1 or 8; any other value raises ValueError.
        unroll_shift: log2 of the unroll factor of the main loop.
        code_offset: position of this program in the code area, in
            instructions; only used for alignment of the loop head.
        align_cond: predicate on the instruction position; nops are emitted
            until it holds for the first instruction of the loop.
    """

    # Give the register-file entries readable names (reg_length, reg_src, ...).
    g = globals()
    for i, v in enumerate(['length', 'src', 'dst', 'qpu_num', 'stride']):
        g[f'reg_{v}'] = rf[i]

    nop(sig=ldunifrf(reg_length))
    nop(sig=ldunifrf(reg_src))
    nop(sig=ldunifrf(reg_dst))

    if num_qpus == 1:
        num_qpus_shift = 0
        mov(reg_qpu_num, 0)
    elif num_qpus == 8:
        num_qpus_shift = 3
        tidx(r0)
        shr(r0, r0, 2)
        band(reg_qpu_num, r0, 0b1111)
    else:
        # ValueError is a subclass of Exception, so existing handlers still match.
        raise ValueError('num_qpus must be 1 or 8')

    # addr += 4 * 4 * (thread_num + 16 * qpu_num)
    shl(r0, reg_qpu_num, 4)
    eidx(r1)
    add(r0, r0, r1)
    shl(r0, r0, 4)
    add(reg_src, reg_src, r0).add(reg_dst, reg_dst, r0)

    # stride = 4 * 4 * 16 * num_qpus
    mov(reg_stride, 1)
    shl(reg_stride, reg_stride, 8 + num_qpus_shift)

    # Table mapping a shift amount 0..31 to the operand value that is passed
    # to shr (0..15 followed by -16..-1).
    num_shifts = [*range(16), *range(-16, 0)]

    # length /= 16 * 8 * num_qpus * unroll
    shr(reg_length, reg_length, num_shifts[7 + num_qpus_shift + unroll_shift])

    # This single thread switch and two nops just before the loop are really
    # important for TMU read to achieve a better performance.
    # This also enables TMU read requests without the thread switch signal, and
    # the eight-depth TMU read request queue.
    nop(sig=thrsw)
    nop()
    nop()

    # Pad with nops until the loop head satisfies the alignment predicate.
    while not align_cond(code_offset + len(asm)):
        nop()

    with loop as l:

        unroll = 1 << unroll_shift

        # A smaller number of instructions does not necessarily mean a faster
        # operation. Rather, complicated TMU manipulations may perform worse
        # and even cause a hardware bug.

        mov(tmuau, reg_src).add(reg_src, reg_src, reg_stride)
        mov(tmua, reg_src).add(reg_src, reg_src, reg_stride)

        for i in range(unroll - 1):
            nop(sig=ldtmu(r0))
            mov(tmud, r0, sig=ldtmu(r0))
            mov(tmud, r0, sig=ldtmu(r0))
            mov(tmud, r0)
            nop(sig=ldtmu(r0))
            mov(tmud, r0)
            mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_stride)
            mov(tmua, reg_src).add(reg_src, reg_src, reg_stride)
            nop(sig=ldtmu(r0))
            mov(tmud, r0, sig=ldtmu(r0))
            mov(tmud, r0, sig=ldtmu(r0))
            mov(tmud, r0)
            nop(sig=ldtmu(r0))
            mov(tmud, r0)
            mov(tmuau, reg_dst).add(reg_dst, reg_dst, reg_stride)
            mov(tmua, reg_src).add(reg_src, reg_src, reg_stride)

        if unroll == 1:
            # Prefetch the next source.
            mov(tmua, reg_src)

        nop(sig=ldtmu(r0))
        mov(tmud, r0, sig=ldtmu(r0))
        mov(tmud, r0, sig=ldtmu(r0))
        mov(tmud, r0)
        nop(sig=ldtmu(r0))
        sub(reg_length, reg_length, 1, cond='pushz').mov(tmud, r0)
        mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_stride)

        if unroll == 1:
            mov(tmuc, 0xfffffffc)
        nop(sig=ldtmu(r0))
        mov(tmud, r0, sig=ldtmu(r0))
        mov(tmud, r0, sig=ldtmu(r0))

        # The three instructions after the branch are its delay slots.
        l.b(cond='na0').unif_addr(absolute=False)
        mov(tmud, r0, sig=ldtmu(r0))
        mov(tmud, r0)
        mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_stride)

    # This synchronization is needed between the last TMU operation and the
    # program end with the thread switch just before the loop above.
    barrierid(syncb, sig=thrsw)
    nop()
    nop()

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def scopy(*, length, num_qpus=8, unroll_shift=0):
    """Copy `length` uint32 words on the QPUs, verify, and report throughput.

    Args:
        length: number of words; must be a positive multiple of
            16 * 8 * num_qpus * 2**unroll_shift.
        num_qpus: number of QPUs to use (1 or 8).
        unroll_shift: log2 of the kernel's unroll factor.
    """

    assert length > 0
    assert length % (16 * 8 * num_qpus * (1 << unroll_shift)) == 0

    print(f'==== scopy example ({length / 1024 / 1024} Mi elements) ====')

    with Driver(data_area_size=(length * 2 + 1024) * 4) as drv:

        code = drv.program(qpu_scopy, num_qpus=num_qpus,
                           unroll_shift=unroll_shift,
                           code_offset=drv.code_pos // 8)

        print('Preparing for buffers...')

        X = drv.alloc(length, dtype='uint32')
        Y = drv.alloc(length, dtype='uint32')

        # Make source and destination differ so a no-op kernel cannot pass.
        X[:] = np.arange(*X.shape, dtype=X.dtype)
        Y[:] = -X

        assert not np.array_equal(X, Y)

        unif = drv.alloc(3 + (1 << unroll_shift) + 1, dtype='uint32')
        unif[0] = length
        unif[1] = X.addresses()[0]
        unif[2] = Y.addresses()[0]
        # NOTE(review): the words from index 3 on are consumed inside the loop
        # (tmuau accesses and the branch's relative uniform address); the last
        # one is a negative byte offset back to index 3 - confirm the encoding.
        if unroll_shift == 0:
            unif[3] = 0xfc80fcfc
        else:
            unif[3: -1] = 0xfcfcfcfc
        unif[-1] = 4 * (-len(unif) + 3) & 0xFFFFFFFF

        print('Executing on QPU...')

        start = monotonic()
        drv.execute(code, unif.addresses()[0], thread=num_qpus)
        end = monotonic()

        assert np.array_equal(X, Y)

        print(f'{end - start} sec, {length * 4 / (end - start) * 1e-6} MB/s')


def main():
    """Run the example on 16 Mi words."""

    scopy(length=16 * 1024 * 1024)


if __name__ == '__main__':

    main()


# --------------------------------------------------------------------------------
# /examples/sgemm.py:
# --------------------------------------------------------------------------------
#
# Copyright (c) 2019-2020 Idein Inc.
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice (including the next 12 | # paragraph) shall be included in all copies or substantial portions of the 13 | # Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 


from time import clock_gettime, CLOCK_MONOTONIC
import numpy as np
from videocore6 import pack_unpack
from videocore6.driver import Driver
from videocore6.assembler import qpu


def getsec():
    """Return the current CLOCK_MONOTONIC reading in seconds."""
    return clock_gettime(CLOCK_MONOTONIC)


@qpu
def load_params(asm, thread, regs):
    """Emit code that loads this thread's parameter row into `regs`.

    Two uniforms are read: the base address of the parameter table (rf0) and
    the number of words per row (rf1).  The row is fetched through the TMU in
    chunks of 16 words and each word is broadcast into one register of `regs`.

    Args:
        thread: total number of threads; 1, 8 or 16.
        regs: destination registers, one per parameter word.

    Raises:
        AssertionError: if `thread` is not 1, 8 or 16.
    """

    if thread == 1:
        bxor(r0, r0, r0, sig = ldunifrf(rf0))
    elif thread == 8:
        # 8 threads (1 threads / qpu)
        tidx(r0, sig = ldunifrf(rf0))
        shr(r0, r0, 2)
        mov(r1, 0b1111)
    elif thread == 16:
        # 16 threads (2 threads / qpu)
        tidx(r0, sig = ldunifrf(rf0))
        shr(r0, r0, 1).mov(r1, 1)
        shl(r1, r1, 5)
        sub(r1, r1, 1)
    else:
        # Explicit raise (same exception type as the former `assert`) so the
        # check is not stripped when Python runs with -O.
        raise AssertionError('thread must be one of 1, 8, 16')

    band(r3, r0, r1, sig = ldunifrf(rf1))
    shl(r0, rf1, 2)
    umul24(r0, r0, r3)
    eidx(r1).add(r0, r0, rf0)
    shl(r1, r1, 2)
    shl(r3, 4, 4).add(r0, r0, r1)
    n = len(regs)
    mov(tmua, r0, sig = thrsw).add(r0, r0, r3)
    nop()
    nop()
    nop(sig = ldtmu(r1))
    for i in range(n):
        if i % 16 == 0:
            mov(r5rep, r1)
            mov(regs[i], r5)
        elif i % 16 == 15 and i != n - 1:
            # Last word of a chunk with more to come: request the next chunk.
            mov(tmua, r0, sig = thrsw).add(r0, r0, r3)
            rotate(r5rep, r1, - (i % 16))
            mov(regs[i], r5)
            nop(sig = ldtmu(r1))
        else:
            rotate(r5rep, r1, - (i % 16))
            mov(regs[i], r5)

@qpu
def qpu_sgemm_rnn_naive(asm, thread):
    """QPU program: C = alpha * A.dot(B) + beta * C on 16x16 blocks.

    Parameters are loaded by `load_params` into rf32 and up; rf0-rf15 hold
    the accumulators of the current block and rf16-rf31 the loaded C values.
    """

    params = [
        'P',
        'Q',
        'R',
        'A_base',
        'A_stride',
        'B_base',
        'B_stride',
        'C_base',
        'C_stride',
        'alpha',
        'beta',
    ]

    values = [
        'A_cur',
        'B_cur',
        'C_cur',
        'i', 'j', 'k',
    ]

    # Give rf32, rf33, ... readable names (reg_P, reg_Q, ...).
    g = globals()
    for i, reg in enumerate(params + values):
        g['reg_' + reg] = g['rf' + str(i+32)]

    load_params(asm, thread, [g['reg_' + reg] for reg in params])

    # Move the base pointers past the end of the (rounded-up) tile; the loops
    # below count i and j down and subtract offsets from these bases.
    add(r0, reg_P, 15)
    shr(r0, r0, 4)
    shl(r0, r0, 4)
    add(r1, reg_R, 15)
    shr(r1, r1, 4)
    shl(r1, r1, 6)
    umul24(r3, r0, reg_A_stride)
    add(reg_A_base, reg_A_base, r3)
    add(reg_B_base, reg_B_base, r1)
    umul24(r3, r0, reg_C_stride)
    add(reg_C_base, reg_C_base, r3)
    add(reg_C_base, reg_C_base, r1)

    for i in range(16):
        mov(rf[i], 0.0).mov(rf[i+16], 0.0)

    # i=(p+15)/16.
    add(r0, reg_P, 15)
    shr(reg_i, r0, 4)
    with loop as li:

        # j=(r+15)/16
        add(r0, reg_R, 15)
        shr(reg_j, r0, 4)
        with loop as lj:

            shl(r0, reg_i, 4)
            umul24(r3, r0, reg_C_stride)
            shl(r1, reg_j, 6)
            sub(reg_C_cur, reg_C_base, r3)
            sub(reg_C_cur, reg_C_cur, r1)
            umul24(r3, r0, reg_A_stride)
            sub(reg_A_cur, reg_A_base, r3)
            sub(reg_B_cur, reg_B_base, r1)

            mov(reg_k, reg_Q)
            with loop as lk:

                eidx(r0)
                umul24(r1, r0, reg_A_stride)
                add(r1, r1, reg_A_cur).add(reg_A_cur, reg_A_cur, 4)
                mov(tmua, r1, sig = thrsw)
                shl(r1, r0, 2)
                add(r1, r1, reg_B_cur).add(reg_B_cur, reg_B_cur, reg_B_stride)
                mov(tmua, r1, sig = thrsw)

                nop(sig = ldtmu(r0))
                mov(r5rep, r0)
                nop(sig = ldtmu(r4))
                nop().fmul(r3, r5, r4)
                for i in range(1,16):
                    rotate(r5rep, r0, -i)
                    fadd(rf[i-1], rf[i-1], r3).fmul(r3, r5, r4)
                fadd(rf15, rf15, r3)

                sub(reg_k, reg_k, 1, cond = 'pushz')
                lk.b(cond = 'anyna')
                nop() # delay slot
                nop() # delay slot
                nop() # delay slot

            # Load the 16 rows of C while scaling the accumulators by alpha.
            eidx(r0)
            shl(r0, r0, 2)
            add(r1, reg_C_cur, r0)
            mov(tmua, r1, sig = thrsw).add(r1, r1, reg_C_stride)
            fmul(rf[0], rf[0], reg_alpha)
            for i in range(1, 16):
                mov(tmua, r1, sig = thrsw).add(r1, r1, reg_C_stride)
                fmul(rf[i], rf[i], reg_alpha, sig = ldtmu(rf[i+15]))
            mov(r0, reg_beta).fmul(r3, rf[16], reg_beta, sig = ldtmu(rf[31]))
            for i in range(16):
                fadd(rf[i], rf[i], r3).fmul(r3, rf[i+17], r0)

            # Store the block and clear the accumulators for the next one.
            eidx(r0)
            shl(r0, r0, 2)
            add(r1, reg_C_cur, r0)
            for i in range(16):
                mov(tmud, rf[i])
                mov(tmua, r1).add(r1, r1, reg_C_stride)
                mov(rf[i], 0.0).mov(rf[i+16], 0.0)
            tmuwt()

            sub(reg_j, reg_j, 1, cond = 'pushz')
            lj.b(cond = 'anyna')
            nop() # delay slot
            nop() # delay slot
            nop() # delay slot

        sub(reg_i, reg_i, 1, cond = 'pushz')
        li.b(cond = 'anyna')
        nop()
        nop()
        nop()

    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()

def sgemm_rnn_naive():
    """Run a 1024x1024x1024 SGEMM on 8 QPU threads and compare it with NumPy.

    The output matrix is split into 2x4 tiles, one per thread.  Timings,
    Gflop/s and absolute/relative errors against the NumPy result are printed.
    """

    thread = 8

    P = 1024
    Q = 1024
    R = 1024

    assert P % (16 * 2) == 0
    assert R % (16 * 4) == 0

    with Driver() as drv:

        # `thread=thread` binds the value eagerly.
        code = drv.program(lambda asm, thread=thread: qpu_sgemm_rnn_naive(asm, thread))

        A = drv.alloc((P, Q), dtype = 'float32')
        B = drv.alloc((Q, R), dtype = 'float32')
        C = drv.alloc((P, R), dtype = 'float32')

        np.random.seed(0)
        alpha = np.random.randn()
        beta = np.random.randn()
        A_ref = np.random.randn(*A.shape).astype(A.dtype)
        B_ref = np.random.randn(*B.shape).astype(B.dtype)
        C_ref = np.random.randn(*C.shape).astype(C.dtype)

        A[:] = A_ref
        B[:] = B_ref
        C[:] = C_ref

        start = getsec()
        C_ref[:] = alpha * A_ref.dot(B_ref) + beta * C_ref
        time_ref = getsec() - start

        def block_2x4_params(i, j):
            """Return the parameter row for tile (i, j) of the 2x4 tiling."""
            tile_P = P // 2
            tile_R = R // 4
            return [
                tile_P, Q, tile_R,
                A.addresses()[tile_P*i, 0       ],
                A.strides[0],
                B.addresses()[0       , tile_R*j],
                B.strides[0],
                C.addresses()[tile_P*i, tile_R*j],
                C.strides[0],
                *pack_unpack('f', 'I', [alpha, beta]),
            ]

        unif_params = drv.alloc((thread, len(block_2x4_params(0,0))), dtype = 'uint32')
        for th in range(thread):
            unif_params[th] = block_2x4_params(th // 4, th % 4)

        # The kernel receives the table address and the row length.
        unif = drv.alloc(2, dtype = 'uint32')
        unif[0] = unif_params.addresses()[0,0]
        unif[1] = unif_params.shape[1]

        start = getsec()
        drv.execute(code, unif.addresses()[0], thread = thread)
        time_gpu = getsec() - start

        # Print arrays in full when C or C - C_ref is dumped for debugging.
        np.set_printoptions(threshold=np.inf)

        def Gflops(sec):
            """Return the Gflop/s achieved when the whole SGEMM takes `sec` seconds."""
            return (2 * P * Q * R + 3 * P * R) / sec * 1e-9

        print(f'==== sgemm example ({P}x{Q} times {Q}x{R}) ====')
        print(f'numpy: {time_ref:.4} sec, {Gflops(time_ref):.4} Gflop/s')
        print(f'QPU: {time_gpu:.4} sec, {Gflops(time_gpu):.4} Gflop/s')
        print(f'Minimum absolute error: {np.min(np.abs(C - C_ref))}')
        print(f'Maximum absolute error: {np.max(np.abs(C - C_ref))}')
        print(f'Minimum relative error: {np.min(np.abs((C - C_ref) / C_ref))}')
        print(f'Maximum relative error: {np.max(np.abs((C - C_ref) / C_ref))}')


def main():
    """Run the SGEMM example."""

    sgemm_rnn_naive()


if __name__ == '__main__':
    main()


# --------------------------------------------------------------------------------
# /examples/summation.py:
# --------------------------------------------------------------------------------
#
# Copyright (c) 2019-2020 Idein Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from time import monotonic

import numpy as np

from videocore6.assembler import qpu
from videocore6.driver import Driver


@qpu
def qpu_summation(asm, *, num_qpus, unroll_shift, code_offset,
                  align_cond=lambda pos: pos % 512 == 170):
    """Emit the QPU program that accumulates a buffer into per-lane sums.

    The program reads three uniforms (length, source address, destination
    address), advances the source by a fixed stride while adding every value
    returned by the TMU into ``reg_sum``, and finally writes ``reg_sum`` to
    the destination.

    Parameters:
        asm: the assembler object; only ``len(asm)`` is used directly here.
        num_qpus: must be 1 or 8, otherwise ``Exception`` is raised.
        unroll_shift: log2 of the unroll factor of the loop body.
        code_offset: added to ``len(asm)`` before calling ``align_cond``.
        align_cond: predicate on the instruction position; ``nop`` is
            emitted until it returns true, just before the loop.

    NOTE(review): the instruction mnemonics and register names (``rf``,
    ``nop``, ``tmua`` ...) are not defined in this file; presumably the
    ``@qpu`` decorator provides them -- confirm against videocore6.assembler.
    """

    # Give the first six rf registers readable module-level aliases
    # (reg_length, reg_src, ...), used by the rest of this function.
    g = globals()
    for i, v in enumerate(['length', 'src', 'dst', 'qpu_num', 'stride', 'sum']):
        g[f'reg_{v}'] = rf[i]

    nop(sig=ldunifrf(reg_length))
    nop(sig=ldunifrf(reg_src))
    nop(sig=ldunifrf(reg_dst))

    if num_qpus == 1:
        num_qpus_shift = 0
        mov(reg_qpu_num, 0)
    elif num_qpus == 8:
        num_qpus_shift = 3
        tidx(r0)
        shr(r0, r0, 2)
        band(reg_qpu_num, r0, 0b1111)
    else:
        raise Exception('num_qpus must be 1 or 8')

    # src += 4 * 4 * (thread_num + 16 * qpu_num)
    # dst += 4 * (thread_num + 16 * qpu_num)
    shl(r0, reg_qpu_num, 4)
    eidx(r1)
    add(r0, r0, r1)
    shl(r0, r0, 2)
    shl(r0, r0, 2).add(reg_dst, reg_dst, r0)
    add(reg_src, reg_src, r0)

    # stride = 4 * 4 * 16 * num_qpus
    mov(reg_stride, 1)
    shl(reg_stride, reg_stride, 8 + num_qpus_shift)

    # The QPU performs shifts and rotates modulo 32, so it actually supports
    # shift amounts [0, 31] only with small immediates.
    num_shifts = [*range(16), *range(-16, 0)]

    # length /= 16 * 8 * num_qpus * unroll
    shr(reg_length, reg_length, num_shifts[7 + num_qpus_shift + unroll_shift])

    # sum = 0
    # length -= 1
    # r2 = stride

    # This single thread switch and two instructions just before the loop are
    # really important for TMU read to achieve a better performance.
    # This also enables TMU read requests without the thread switch signal, and
    # the eight-depth TMU read request queue.
    nop(sig=thrsw)
    bxor(reg_sum, 1, 1).mov(r1, 1)
    sub(reg_length, reg_length, r1, cond='pushz').mov(r2, reg_stride)

    # Pad with nops until the loop head lands on the requested position.
    while not align_cond(code_offset + len(asm)):
        nop()

    with loop as l:

        unroll = 1 << unroll_shift

        mov(tmuau, reg_src).add(reg_src, reg_src, reg_stride)
        mov(tmua, reg_src, sig=ldtmu(r0))

        for i in range(unroll - 1):
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0)).add(reg_src, reg_src, r2)
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0)).mov(tmuau if i % 2 == 1 else tmua, reg_src)
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0)).add(reg_src, reg_src, r2)
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0)).mov(tmua, reg_src)

        add(reg_sum, reg_sum, r0, sig=ldtmu(r0)).add(reg_src, reg_src, r2)
        add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
        add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
        add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
        add(reg_sum, reg_sum, r0, sig=ldtmu(r0))

        # The three instructions after the branch still run before control
        # transfers (presumably branch delay slots -- confirm); the last one
        # also decrements the remaining length and refreshes the flags.
        l.b(cond='na0').unif_addr(absolute=False)
        add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
        add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
        add(reg_sum, reg_sum, r0).sub(reg_length, reg_length, r1, cond='pushz')

    mov(tmud, reg_sum)
    mov(tmua, reg_dst)

    # This synchronization is needed between the last TMU operation and the
    # program end with the thread switch just before the loop above.
    barrierid(syncb, sig=thrsw)
    nop()
    nop()

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def summation(*, length, num_qpus=8, unroll_shift=2):
    """Sum ``arange(length)`` on the QPU, verify the result, print throughput.

    Parameters:
        length: number of uint32 elements; must be positive and a multiple
            of ``16 * 8 * num_qpus * (1 << unroll_shift)``.
        num_qpus: forwarded to the kernel and used as the thread count.
        unroll_shift: forwarded to the kernel; also determines how many
            uniform words are allocated.

    Raises:
        AssertionError: if the length constraints are violated or the sum
            read back from the device does not match
            ``length * (length - 1) // 2`` modulo 2**32.
    """

    assert length > 0
    assert length % (16 * 8 * num_qpus * (1 << unroll_shift)) == 0

    print(f'==== summation example ({length / 1024 / 1024} Mi elements) ====')

    with Driver(data_area_size=(length + 1024) * 4) as drv:

        code = drv.program(qpu_summation, num_qpus=num_qpus,
                           unroll_shift=unroll_shift,
                           code_offset=drv.code_pos // 8)

        print('Preparing for buffers...')

        X = drv.alloc(length, dtype='uint32')
        Y = drv.alloc(16 * num_qpus, dtype='uint32')

        X[:] = np.arange(length, dtype=X.dtype)
        Y.fill(0)

        assert sum(Y) == 0

        # Uniform layout: [length, src, dst, <one or more config words>,
        # <relative offset consumed by the loop branch>].
        if unroll_shift == 0:
            unif = drv.alloc(3 + 1 + 1, dtype='uint32')
            unif[3] = 0xfffffcfc
        else:
            unif = drv.alloc(3 + (1 << (unroll_shift - 1)) + 1, dtype='uint32')
            unif[3: -1] = 0xfcfcfcfc
        unif[0] = length
        unif[1] = X.addresses()[0]
        unif[2] = Y.addresses()[0]
        # Negative byte offset back to unif[3], stored as an unsigned 32-bit
        # two's-complement value.
        unif[-1] = 4 * (-len(unif) + 3) & 0xFFFFFFFF

        print('Executing on QPU...')

        start = monotonic()
        drv.execute(code, unif.addresses()[0], thread=num_qpus)
        end = monotonic()

        assert int(sum(Y.astype(int))) % 2**32 == (length - 1) * length // 2 % 2**32

        print(f'{end - start} sec, {length * 4 / (end - start) * 1e-6} MB/s')


def main():
    """Run the summation example on 32 Mi elements."""

    summation(length=32 * 1024 * 1024)


if __name__ == '__main__':

    main()

# --------------------------------------------------------------------------------
# /setup.py:
-------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2019-2020 Idein Inc. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice (including the next 12 | # paragraph) shall be included in all copies or substantial portions of the 13 | # Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 


import platform

from setuptools import setup, Extension

from videocore6 import __version__ as version


# The readwrite4 C extension is only built on these machine types; on any
# other machine the package is installed as pure Python.
if platform.machine() in ['armv7l', 'aarch64']:
    ext_modules = [
        Extension('videocore6.readwrite4',
                  sources=['videocore6/readwrite4.c']),
    ]
else:
    ext_modules = []

setup(
    name='py-videocore6',
    version=version,
    description='Python library for GPGPU programming on Raspberry Pi 4',
    author='Sugizaki Yukimasa',
    author_email='ysugi@idein.jp',
    packages=['videocore6'],
    install_requires=['ioctl-opt >= 1.2', 'numpy'],
    ext_modules=ext_modules,
    python_requires='~= 3.7',  # for f-string.
)

# --------------------------------------------------------------------------------
# /tests/test_alu.py:
# --------------------------------------------------------------------------------

# Copyright (c) 2019-2020 Idein Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import time
from videocore6.driver import Driver
from videocore6.assembler import qpu
import numpy as np
import itertools

def rotate_right(n, s):
    """Rotate the 32-bit value `n` right by `s` bits, keeping the low 32 bits.

    The only caller in this file passes `s` already reduced modulo 32.
    """
    return ((n << (32-s)) | (n >> s)) & 0xffffffff

def count_leading_zeros(n):
    """Count the clear bits of `n` from bit 31 down to the first set bit.

    Returns 32 when none of the low 32 bits of `n` is set.
    """
    bit = 0x80000000
    count = 0
    # `&` binds tighter than `!=`: loop while the probed bit is clear in n.
    while bit != n & bit:
        count += 1
        bit >>= 1
    return count

# Host-side reference implementations, keyed by QPU instruction name or by
# pack/unpack modifier name.  The boilerplate_* helpers below look entries up
# here to compute the values the device output is compared against.
ops = {
    # binary ops
    'fadd' : lambda a,b: a + b,
    'faddnf' : lambda a,b: a + b,
    'fsub' : lambda a,b: a - b,
    'fmin' : np.minimum,
    'fmax' : np.maximum,
    'fmul' : lambda a,b: a * b,
    'fcmp' : lambda a,b: a - b,
    'vfpack' : lambda a,b: np.stack([a,b]).T.ravel(),
    'vfmin' : np.minimum,
    'vfmax' : np.maximum,
    'vfmul' : lambda a,b: a * b,

    'add' : lambda a,b: a + b,
    'sub' : lambda a,b: a - b,
    'imin' : np.minimum,
    'imax' : np.maximum,
    'umin' : np.minimum,
    'umax' : np.maximum,

    # shift/rotate amounts are reduced modulo 32 in the reference model
    'shl' : lambda a,b: a << (b % 32),
    'shr' : lambda a,b: a >> (b % 32),
    'asr' : lambda a,b: a.astype(np.int32) >> (b % 32),
    'ror' : lambda a,b: np.vectorize(rotate_right)(a, b % 32),

    'band' : lambda a,b: a & b,
    'bor' : lambda a,b: a | b,
    'bxor' : lambda a,b: a ^ b,

    # unary ops
    'fmov' : lambda x: x,
    'fround' : np.round,
    'ftrunc' : np.trunc,
    'ffloor' : np.floor,
    'fceil' : np.ceil,
    # difference of each adjacent pair, repeated for both elements of the pair
    'fdx' : lambda x: (x[1::2] - x[0::2]).repeat(2),
    'fdy' : lambda x: (lambda a: (a[1::2] - a[0::2]).ravel())(x.reshape(-1,2).repeat(2,axis=0).reshape(-1,4)),
    'ftoin': lambda x: x.round().astype(np.int32),
    'ftoiz': lambda x: np.float32(x).astype(np.int32),
    # inputs <= -1 map to 0 instead of being cast to an unsigned integer
    'ftouz': np.vectorize(lambda x: np.float32(x).astype(np.uint32) if x > -1 else 0),

    'bnot' : lambda x: ~x,
    'neg' : lambda x: -x,

    'itof' : lambda x: x.astype(np.float32),
    'clz' : np.vectorize(count_leading_zeros),
    'utof' : lambda x: x.astype(np.float32),

    # pack/unpack flags
    'l' : lambda x: x[0::2],
    'h' : lambda x: x[1::2],
    None : lambda x: x,
    'none' : lambda x: x,
    'abs' : np.abs,
    'r32' : lambda x: x.repeat(2),
    'rl2h' : lambda x: x[0::2].repeat(2),
    'rh2l' : lambda x: x[1::2].repeat(2),
    'swap' : lambda x: x.reshape(-1,2)[:,::-1].ravel(),
}


@qpu
def qpu_binary_ops(asm, bin_ops, dst_ops, src1_ops, src2_ops):
    """Emit a kernel applying every (op, pack, unpack1, unpack2) combination.

    Two operands are loaded once into r1 and r2; for each combination the
    instruction named `op` is looked up in globals(), executed into r0, and
    r0 is stored at the address in rf2, which then advances by r3 (4 << 4 =
    64 bytes).  The combination order is the same itertools.product order
    the host side uses to index the output rows.
    """

    eidx(r0, sig = ldunif)
    mov(rf0, r5, sig = ldunif) # first input address
    mov(rf1, r5, sig = ldunif) # second input address (used for a TMU load below)
    shl(r3, 4, 4).mov(rf2, r5) # rf2: output address; r3 = 64

    # Offset each base address by 4 * r0.
    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)
    add(rf2, rf2, r0)

    mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
    nop()
    mov(tmua, rf1, sig = thrsw).add(rf1, rf1, r3)
    nop(sig = ldtmu(r1))
    nop()
    nop(sig = ldtmu(r2))

    g = globals()
    for op, pack, unpack1, unpack2 in itertools.product(bin_ops, dst_ops, src1_ops, src2_ops):
        g[op](
            r0.pack(pack) if pack is not None else r0,
            r1.unpack(unpack1) if unpack1 is not None else r1,
            r2.unpack(unpack2) if unpack2 is not None else r2
        )
        mov(tmud, r0)
        mov(tmua, rf2)
        tmuwt().add(rf2, rf2, r3)

    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()

def boilerplate_binary_ops(bin_ops, dst, src1, src2):
    """Run `bin_ops` on the device and compare against the `ops` references.

    `dst`, `src1` and `src2` are each a (dtype name, list of pack/unpack
    modifier names) pair.  Inputs are random; their range is chosen from the
    destination dtype.  Float results are compared with np.allclose
    (rtol=1e-2), integer results exactly.
    """

    dst_dtype, dst_ops = dst
    src1_dtype, src1_ops = src1
    src2_dtype, src2_ops = src2

    with Driver() as drv:

        cases = list(itertools.product(bin_ops, dst_ops, src1_ops, src2_ops))

        code = drv.program(lambda asm: qpu_binary_ops(asm, bin_ops, dst_ops, src1_ops, src2_ops))
        # Each buffer row is 16*4 = 64 bytes, whatever the element size.
        X1 = drv.alloc((16*4//np.dtype(src1_dtype).itemsize, ), dtype = src1_dtype)
        X2 = drv.alloc((16*4//np.dtype(src2_dtype).itemsize, ), dtype = src2_dtype)
        Y = drv.alloc((len(cases), 16*4//np.dtype(dst_dtype).itemsize), dtype = dst_dtype)
        unif = drv.alloc(3, dtype = 'uint32')

        if np.dtype(dst_dtype).name.startswith('float'):
            X1[:] = np.random.uniform(-(2**7), 2**7, X1.shape).astype(src1_dtype)
            X2[:] = np.random.uniform(-(2**7), 2**7, X2.shape).astype(src2_dtype)
        elif np.dtype(dst_dtype).name.startswith('int'):
            X1[:] = np.random.randint(-(2**31), 2**31, X1.shape, dtype=src1_dtype)
            X2[:] = np.random.randint(-(2**31), 2**31, X2.shape, dtype=src2_dtype)
        elif np.dtype(dst_dtype).name.startswith('uint'):
            X1[:] = np.random.randint(0, 2**32, X1.shape, dtype=src1_dtype)
            X2[:] = np.random.randint(0, 2**32, X2.shape, dtype=src2_dtype)
        Y[:] = 0.0

        unif[0] = X1.addresses()[0]
        unif[1] = X2.addresses()[0]
        unif[2] = Y.addresses()[0,0]

        # start/end are recorded but not used afterwards.
        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        for ix, (bin_op, dst_op, src1_op, src2_op) in enumerate(cases):
            msg = '{}({}, {}, {})'.format(bin_op, dst_op, src1_op, src2_op)
            if np.dtype(dst_dtype).name.startswith('float'):
                assert np.allclose(ops[dst_op](Y[ix]), ops[bin_op](ops[src1_op](X1), ops[src2_op](X2)), rtol=1e-2), msg
            elif np.dtype(dst_dtype).name.startswith('int') or np.dtype(dst_dtype).name.startswith('uint'):
                assert np.all(ops[dst_op](Y[ix]) == ops[bin_op](ops[src1_op](X1), ops[src2_op](X2))), msg

def test_binary_ops():
    """Exercise the binary ALU instructions over dtype/modifier combinations."""
    packs = [('float32', [None, 'none']), ('float16', ['l', 'h'])]
    unpacks = [('float32', [None, 'none', 'abs']), ('float16', ['l', 'h'])]
    for dst, src1, src2 in itertools.product(packs, unpacks, unpacks):
        boilerplate_binary_ops(
            ['fadd', 'faddnf', 'fsub', 'fmin', 'fmax', 'fmul', 'fcmp'],
            dst, src1, src2,
        )
    packs = [('float16', [None, 'none'])]
    unpacks = [('float32', [None, 'none']), ('float16', ['l', 'h'])]
    for dst, src1, src2 in itertools.product(packs, unpacks, unpacks):
        boilerplate_binary_ops(
            ['vfpack'],
            dst, src1, src2,
        )
    packs = [('float16', [None, 'none'])]
    unpacks = [('float32', ['r32']), ('float16', ['rl2h', 'rh2l', 'swap'])]
    # NOTE(review): src2 is drawn from `packs` here, not `unpacks` --
    # presumably intentional for the vf* instructions; confirm.
    for dst, src1, src2 in itertools.product(packs, unpacks, packs):
        boilerplate_binary_ops(
            ['vfmin', 'vfmax', 'vfmul'],
            dst, src1, src2,
        )

    boilerplate_binary_ops(
        ['add', 'sub', 'imin', 'imax', 'asr'],
        ('int32', [None]), ('int32', [None]), ('int32', [None]),
    )
    boilerplate_binary_ops(
        ['add', 'sub', 'umin', 'umax'],
        ('uint32', [None]), ('uint32', [None]), ('uint32', [None]),
    )
    boilerplate_binary_ops(
        ['shl', 'shr', 'ror'],
        ('uint32', [None]), ('uint32', [None]), ('uint32', [None]),
    )
    boilerplate_binary_ops(
        ['band', 'bor', 'bxor'],
        ('uint32', [None]), ('uint32', [None]), ('uint32', [None]),
    )

@qpu
def qpu_unary_ops(asm, bin_ops, dst_ops, src_ops):
    """Emit a kernel applying every (op, pack, unpack) combination to r1.

    Same structure as qpu_binary_ops with a single source operand: each
    result in r0 is stored at the address in rf1, which then advances by r3
    (64 bytes).
    """

    eidx(r0, sig = ldunif)
    mov(rf0, r5, sig = ldunif) # in
    shl(r3, 4, 4).mov(rf1, r5) # rf1: output address; r3 = 64

    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
    nop()
    nop()
    nop(sig = ldtmu(r1))

    g = globals()
    for op, pack, unpack in itertools.product(bin_ops, dst_ops, src_ops):
        g[op](
            r0.pack(pack) if pack is not None else r0,
            r1.unpack(unpack) if unpack is not None else r1,
        )
        mov(tmud, r0)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()

def boilerplate_unary_ops(uni_ops, dst, src):
    """Run `uni_ops` on the device and compare against the `ops` references.

    `dst` and `src` are each a (dtype name, list of pack/unpack modifier
    names) pair.  The input is drawn from a uniform float distribution and
    cast to the source dtype.  Float results are compared with np.allclose
    (rtol=1e-2), integer results exactly.
    """

    dst_dtype, dst_ops = dst
    src_dtype, src_ops = src

    with Driver() as drv:

        cases = list(itertools.product(uni_ops, dst_ops, src_ops))

        code = drv.program(lambda asm: qpu_unary_ops(asm, uni_ops, dst_ops, src_ops))
        X = drv.alloc((16*4//np.dtype(src_dtype).itemsize, ), dtype = src_dtype)
        Y = drv.alloc((len(cases), 16*4//np.dtype(dst_dtype).itemsize), dtype = dst_dtype)
        # Three words are allocated although only unif[0] and unif[1] are set.
        unif = drv.alloc(3, dtype = 'uint32')

        X[:] = np.random.uniform(-(2**15), 2**15, X.shape).astype(src_dtype)
        Y[:] = 0.0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0,0]

        # start/end are recorded but not used afterwards.
        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        for ix, (uni_op, dst_op, src_op) in enumerate(cases):
            msg = '{}({}, {})'.format(uni_op, dst_op, src_op)
            if np.dtype(dst_dtype).name.startswith('float'):
                assert np.allclose(ops[dst_op](Y[ix]), ops[uni_op](ops[src_op](X)), rtol=1e-2), msg
            elif np.dtype(dst_dtype).name.startswith('int') or np.dtype(dst_dtype).name.startswith('uint'):
                assert np.all(ops[dst_op](Y[ix]) == ops[uni_op](ops[src_op](X))), msg

def test_unary_ops():
    """Exercise the unary ALU instructions over dtype/modifier combinations."""
    packs = [('float32', [None, 'none']), ('float16', ['l', 'h'])]
    unpacks = [('float32', [None, 'none', 'abs']), ('float16', ['l', 'h'])]
    for dst, src in itertools.product(packs, unpacks):
        boilerplate_unary_ops(
            ['fmov'],
            dst, src,
        )
    packs = [('float32', [None, 'none']), ('float16', ['l', 'h'])]
    unpacks = [('float32', [None, 'none']), ('float16', ['l', 'h'])]
    for dst, src in itertools.product(packs, unpacks):
        boilerplate_unary_ops(
            ['fround', 'ftrunc', 'ffloor', 'fceil', 'fdx', 'fdy'],
            dst, src,
        )
    packs = [('int32', [None, 'none'])]
    unpacks = [('float32', [None, 'none']), ('float16', ['l', 'h'])]
    for dst, src in itertools.product(packs, unpacks):
        boilerplate_unary_ops(
            ['ftoin', 'ftoiz'],
            dst, src,
        )
    packs = [('uint32', [None, 'none'])]
    unpacks = [('float32', [None, 'none']), ('float16', ['l', 'h'])]
    for dst, src in itertools.product(packs, unpacks):
        boilerplate_unary_ops(
            ['ftouz'],
            dst, src,
        )
    # TODO: 'ftoc': what is the meaning of this instruction ?
    # packs = [('int32', ['none'])]
    # unpacks = [('float32', ['none']), ('float16', ['l', 'h'])]
    # for dst, src in itertools.product(packs, unpacks):
    #     boilerplate_unary_ops(
    #         ['ftoc'],
    #         dst, src,
    #     )
    boilerplate_unary_ops(
        ['bnot', 'neg'],
        ('int32', [None]), ('int32', [None]),
    )
    boilerplate_unary_ops(
        ['itof'],
        ('float32', [None]), ('int32', [None]),
    )
    boilerplate_unary_ops(
        ['clz'],
        ('uint32', [None]), ('uint32', [None]),
    )
    boilerplate_unary_ops(
        ['utof'],
        ('float32', [None]), ('uint32', [None]),
    )

# --------------------------------------------------------------------------------
# /tests/test_branch.py:
# --------------------------------------------------------------------------------

# Copyright (c) 2019-2020 Idein Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import time
from videocore6.driver import Driver
from videocore6.assembler import qpu
import numpy as np

# branch (destination from relative imm)
@qpu
def qpu_branch_rel_imm(asm):
    """Kernel: load X into r1, branch by a relative immediate, store r1.

    The branch lands on the third of four `add(r1, r1, 1)` instructions, so
    only two increments run.
    """

    eidx(r0, sig = ldunifrf(rf0))
    nop(sig = ldunifrf(rf1))
    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig = thrsw)
    nop()
    nop()
    nop(sig = ldtmu(r1))

    # The three nops after the branch fill its delay slots (the link-register
    # kernel below labels them as such); the offset skips two adds after them.
    b(2*8, cond = 'always')
    nop()
    nop()
    nop()
    add(r1, r1, 1)
    add(r1, r1, 1)
    add(r1, r1, 1) # jump comes here
    add(r1, r1, 1)

    mov(tmud, r1)
    mov(tmua, rf1)
    tmuwt()

    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()

def test_branch_rel_imm():
    """Check that the relative-immediate branch skips two adds: Y == X + 2."""

    with Driver() as drv:

        code = drv.program(qpu_branch_rel_imm)
        X = drv.alloc((16, ), dtype = 'uint32')
        Y = drv.alloc((16, ), dtype = 'uint32')
        unif = drv.alloc(3, dtype = 'uint32')

        X[:] = np.arange(16)
        Y[:] = 0.0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0]

        # start/end are recorded but not used afterwards.
        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        assert (Y == X + 2).all()


# branch (destination from absolute imm)
@qpu
def qpu_branch_abs_imm(asm, absimm):
    """Kernel: same as qpu_branch_rel_imm but branching to address `absimm`.

    `absimm` is passed to `b(..., absolute = True)`; the caller computes it
    so that the branch lands on the third add.
    """

    eidx(r0, sig = ldunifrf(rf0))
    nop(sig = ldunifrf(rf1))
    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig = thrsw)
    nop()
    nop()
    nop(sig = ldtmu(r1))

    b(absimm, absolute = True, cond = 'always')
    nop()
    nop()
    nop()
    add(r1, r1, 1)
    add(r1, r1, 1)
    add(r1, r1, 1) # jump comes here
    add(r1, r1, 1)

    mov(tmud, r1)
    mov(tmua, rf1)
    tmuwt()

    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()

def test_branch_abs_imm():
    """Check the absolute-immediate branch: Y == X + 2."""

    with Driver() as drv:

        # A one-instruction program is assembled first and its address is
        # used to compute the branch target.
        # NOTE(review): this presumably relies on `code` being placed
        # directly after `dummy` with 8-byte instructions -- confirm.
        @qpu
        def qpu_dummy(asm):
            nop()
        dummy = drv.program(qpu_dummy)
        code = drv.program(lambda asm: qpu_branch_abs_imm(asm, int(dummy.addresses()[0]+16*8)))
        X = drv.alloc((16, ), dtype = 'uint32')
        Y = drv.alloc((16, ), dtype = 'uint32')
        unif = drv.alloc(3, dtype = 'uint32')

        X[:] = np.arange(16)
        Y[:] = 0.0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0]

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        assert (Y == X + 2).all()


# branch (destination from label)
@qpu
def qpu_branch_rel_label(asm):
    """Kernel: branch to label `foo`, which sits before the second add.

    Three of the four `add(r1, r1, 1)` instructions run after the jump.
    """

    eidx(r0, sig = ldunifrf(rf0))
    nop(sig = ldunifrf(rf1))
    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig = thrsw)
    nop()
    nop()
    nop(sig = ldtmu(r1))

    b(R.foo, cond = 'always')
    nop()
    nop()
    nop()
    add(r1, r1, 1)
    L.foo
    add(r1, r1, 1) # jump comes here
    L.bar
    add(r1, r1, 1)
    L.baz
    add(r1, r1, 1)

    mov(tmud, r1)
    mov(tmua, rf1)
    tmuwt()

    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()

def test_branch_rel_label():
    """Check the label-relative branch: Y == X + 3."""

    with Driver() as drv:

        code = drv.program(qpu_branch_rel_label)
        X = drv.alloc((16, ), dtype = 'uint32')
        Y = drv.alloc((16, ), dtype = 'uint32')
        unif = drv.alloc(3, dtype = 'uint32')

        X[:] = np.arange(16)
        Y[:] = 0.0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0]

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        assert (Y == X + 3).all()


# branch (destination from regfile)
@qpu
def qpu_branch_abs_reg(asm):
    """Kernel: branch to the address loaded from X into rf2.

    r1 is zeroed before the branch; the intended target is the last of four
    adds, so r1 ends up as 1.
    """

    eidx(r0, sig = ldunifrf(rf0))
    nop(sig = ldunifrf(rf1))
    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig = thrsw)
    nop()
    nop()
    nop(sig = ldtmu(rf2))

    mov(r1, 0)
    b(rf2, cond = 'always')
    nop()
    nop()
    nop()
    L.label
    add(r1, r1, 1)
    add(r1, r1, 1)
    add(r1, r1, 1)
    add(r1, r1, 1) # jump comes here

    mov(tmud, r1)
    mov(tmua, rf1)
    tmuwt()

    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()

def test_branch_abs_reg():
    """Check the register-absolute branch: only one add runs, so Y == 1."""

    with Driver() as drv:

        code = drv.program(qpu_branch_abs_reg)
        X = drv.alloc((16, ), dtype = 'uint32')
        Y = drv.alloc((16, ), dtype = 'uint32')
        unif = drv.alloc(3, dtype = 'uint32')

        # Target: the instruction at index 17 of the kernel (the last add),
        # presumably at 8 bytes per instruction.
        X[:] = code.addresses()[0] + 17*8
        Y[:] = 0.0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0]

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        assert (Y == 1).all()


# branch (destination from link_reg)
@qpu
def qpu_branch_link_reg(asm, set_subroutine_link, use_link_reg_direct):
    """Kernel: call a subroutine once and return through the link register.

    `set_subroutine_link` is passed as `set_link` on the call, which decides
    where the subroutine returns to (see the inline comments).
    `use_link_reg_direct` selects between branching to `link` directly and
    reading it into rf32 with `lr` first.  The subroutine stores r2 (the
    loaded X) and the main path stores rf3; rf1 advances by 64 bytes
    (4 << 4) after each store.
    """

    eidx(r0, sig = ldunifrf(rf0))
    nop(sig = ldunifrf(rf1))
    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig = thrsw)
    nop()
    nop()
    nop(sig = ldtmu(r2))

    mov(rf2, 0)
    mov(rf3, 0)
    b(R.init_link, cond = 'always', set_link = True)
    nop() # delay slot
    nop() # delay slot
    nop() # delay slot
    L.init_link

    # subroutine returns to here if set_subroutine_link is False.
    add(rf3, rf3, 1)

    # jump to subroutine once.
    mov(null, rf2, cond = 'pushz')
    b(R.subroutine, cond = 'alla', set_link = set_subroutine_link)
    mov(rf2, 1) # delay slot
    nop() # delay slot
    nop() # delay slot

    # subroutine returns to here if set_subroutine_link is True.
    shl(r1, 4, 4)
    mov(tmud, rf3) # rf3 will be 1 if set_subroutine_link, else 2.
    mov(tmua, rf1).add(rf1, rf1, r1)
    tmuwt()

    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()

    L.subroutine

    shl(r1, 4, 4)
    mov(tmud, r2)
    mov(tmua, rf1).add(rf1, rf1, r1)
    tmuwt()

    if use_link_reg_direct:
        b(link, cond = 'always')
    else:
        lr(rf32) # lr instruction reads link register
        b(rf32, cond = 'always')
    nop() # delay slot
    nop() # delay slot
    nop() # delay slot

def test_branch_link_reg():
    """Check subroutine call/return for every flag combination.

    Y[0] must equal X (written by the subroutine) and Y[1] must equal 2
    when the call does not set the link register, 1 when it does.
    """

    for set_subroutine_link, expected in [(False, 2), (True, 1)]:
        for use_link_reg_direct in [False, True]:
            with Driver() as drv:

                code = drv.program(lambda asm: qpu_branch_link_reg(asm, set_subroutine_link, use_link_reg_direct))
                X = drv.alloc(16, dtype = 'uint32')
                Y = drv.alloc((2, 16), dtype = 'uint32')
                unif = drv.alloc(2, dtype = 'uint32')

                X[:] = (np.random.randn(16) * 1024).astype('uint32')
                Y[:] = 0.0

                unif[0] = X.addresses()[0]
                unif[1] = Y.addresses()[0,0]

                start = time.time()
                drv.execute(code, unif.addresses()[0])
                end = time.time()

                assert (Y[0] == X).all()
                assert (Y[1] == expected).all()


# uniform branch (destination from uniform relative value)
@qpu
def qpu_uniform_branch_rel(asm):
    """Kernel: branch with `unif_addr()`, then store the next uniform read.

    The value written is whatever `ldunifrf(tmud)` reads after the branch.
    """

    eidx(r0, sig = ldunifrf(rf0))
    shl(r0, r0, 2)
    add(rf0, rf0, r0)

    b(R.label, cond = 'always').unif_addr()
    nop()
    nop()
    nop()
    L.label
    nop(sig = ldunifrf(tmud))
    mov(tmua, rf0)
    tmuwt()

    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()

def test_uniform_branch_rel():
    """Check the relative uniform branch: the kernel must read unif[4] (7)."""

    with Driver() as drv:

        code = drv.program(qpu_uniform_branch_rel)
        Y = drv.alloc((16, ), dtype = 'uint32')
        unif = drv.alloc(5, dtype = 'uint32')

        Y[:] = 0.0

        unif[0] = Y.addresses()[0]
        unif[1] = 8 # relative address for uniform branch
        unif[2] = 5
        unif[3] = 6
        unif[4] = 7 # uniform branch point here

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        assert (Y == 7).all()


# uniform branch (destination from uniform absolute value)
@qpu
def qpu_uniform_branch_abs(asm):
    """Kernel: like qpu_uniform_branch_rel but with `unif_addr(absolute = True)`."""

    eidx(r0, sig = ldunifrf(rf0))
    shl(r0, r0, 2)
    add(rf0, rf0, r0)

    b(R.label, cond = 'always').unif_addr(absolute = True)
    nop()
    nop()
    nop()
    L.label
    nop(sig = ldunifrf(tmud))
    mov(tmua, rf0)
    tmuwt()

    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()

def test_uniform_branch_abs():
    """Check the absolute uniform branch: the kernel must read unif[3] (6)."""

    with Driver() as drv:

        code = drv.program(qpu_uniform_branch_abs)
        Y = drv.alloc((16, ), dtype = 'uint32')
        unif = drv.alloc(5, dtype = 'uint32')

        Y[:] = 0.0

        unif[0] = Y.addresses()[0]
        unif[1] = unif.addresses()[3] # absolute address for uniform branch
        unif[2] = 5
        unif[3] = 6 # uniform branch point here
        unif[4] = 7

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        assert (Y == 6).all()


# uniform branch (destination from register)
@qpu
def qpu_uniform_branch_reg(asm):
    """Kernel: uniform branch whose target comes from rf2 (loaded from X).

    After the branch one uniform is read into rf3 and stored to Y.
    """


    eidx(r0, sig = ldunifrf(rf0))
    nop(sig = ldunifrf(rf1))
    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig = thrsw)
    nop()
    nop()
    nop(sig = ldtmu(rf2))

    b(R.label, cond = 'always').unif_addr(rf2)
    nop()
    nop()
    nop()
    L.label
    nop(sig = ldunifrf(rf3))
    mov(tmud, rf3)
    mov(tmua, rf1)
    tmuwt()

    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()

def test_uniform_branch_reg():
    """Check the register uniform branch: the kernel must read unif[4] (5)."""

    with Driver() as drv:

        code = drv.program(qpu_uniform_branch_reg)
        X = drv.alloc((16, ), dtype = 'uint32')
        Y = drv.alloc((16, ), dtype = 'uint32')
        unif = drv.alloc(6, dtype = 'uint32')

        # NOTE(review): only X[1] is initialised; presumably the branch takes
        # its uniform address from that element of rf2 -- confirm.
        X[1] = unif.addresses()[4] # absolute address for uniform branch
        Y[:] = 0.0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0]
        unif[2] = 3
        unif[3] = 4
        unif[4] = 5 # uniform branch point here
        unif[5] = 6

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        assert (Y == 5).all()

# --------------------------------------------------------------------------------
# /tests/test_condition_codes.py:
# --------------------------------------------------------------------------------

# Copyright (c) 2019-2020 Idein Inc.
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice (including the next 12 | # paragraph) shall be included in all copies or substantial portions of the 13 | # Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 


import time
from videocore6.driver import Driver
from videocore6.assembler import qpu
import numpy as np


# `cond = 'push*'` sets the conditional flag A
@qpu
def qpu_cond_push_a(asm):
    """Emit a kernel that records flag A after each `push*` condition.

    The first uniform is the output address.  For every (push, if) pair two
    rows of 16 words are stored: one where the conditional `mov` is issued on
    its own and one where it is chained onto a `nop()`.
    """

    eidx(r0, sig=ldunif)
    mov(r2, r5)            # r2 = output base address (uniform arrives in r5)
    shl(r0, r0, 2)
    add(r2, r2, r0)        # r2 += 4 * element index
    shl(r1, 4, 4)          # r1 = 4 << 4 = 64 bytes = one 16-word row

    cond_pairs = [
        ('pushz', 'ifa'),
        ('pushn', 'ifna'),
        ('pushc', 'ifa'),
    ]

    for cond_push, cond_if in cond_pairs:
        eidx(r0)
        sub(r0, r0, 10, cond=cond_push)
        mov(r0, 0)
        mov(r0, 1, cond=cond_if)
        mov(tmud, r0)
        mov(tmua, r2)
        tmuwt().add(r2, r2, r1)
        mov(r0, 0)
        nop().mov(r0, 1, cond=cond_if)
        mov(tmud, r0)
        mov(tmua, r2)
        tmuwt().add(r2, r2, r1)

    # Common kernel epilogue (the same sequence closes every kernel here).
    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_cond_push_a():
    """Check flag A produced by pushz / pushn / pushc on `eidx - 10`."""

    with Driver() as drv:

        code = drv.program(qpu_cond_push_a)
        data = drv.alloc((6, 16), dtype='uint32')
        unif = drv.alloc(1, dtype='uint32')

        data[:] = 0

        unif[0] = data.addresses()[0, 0]

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        # pushz / ifa: only element 10 gives eidx - 10 == 0.
        pushz_if_expected = np.zeros((16,), dtype='uint32')
        pushz_if_expected[10] = 1

        # pushn / ifna: elements >= 10 are the non-negative ones.
        pushn_ifn_expected = np.zeros((16,), dtype='uint32')
        pushn_ifn_expected[10:] = 1

        # pushc / ifa: the test expects the flag for elements < 10.
        pushc_if_expected = np.zeros((16,), dtype='uint32')
        pushc_if_expected[:10] = 1

        assert (data[0] == pushz_if_expected).all()
        assert (data[1] == pushz_if_expected).all()
        assert (data[2] == pushn_ifn_expected).all()
        assert (data[3] == pushn_ifn_expected).all()
        assert (data[4] == pushc_if_expected).all()
        assert (data[5] == pushc_if_expected).all()


# `cond = 'push*'` moves the old conditional flag A to B
@qpu
def qpu_cond_push_b(asm):
    """Emit a kernel showing that each push shifts the old flag A into B.

    Three `pushz` instructions are issued (against 10, 5 and 1).  After each
    one the element index is stored with the lanes selected by `ifa` (and,
    from the second push on, `ifb`) zeroed, giving five output rows.
    """

    eidx(r0, sig=ldunif)
    mov(r2, r5)            # r2 = output base address
    shl(r0, r0, 2)
    add(r2, r2, r0)
    shl(r1, 4, 4)          # r1 = row stride (64 bytes)

    eidx(r0)
    sub(null, r0, 10, cond='pushz')
    mov(r0, 0, cond='ifa')
    eidx(r0).mov(tmud, r0)
    mov(tmua, r2)
    tmuwt().add(r2, r2, r1)

    eidx(r0)
    sub(null, r0, 5, cond='pushz')
    mov(r0, 0, cond='ifa')
    eidx(r0).mov(tmud, r0)
    mov(tmua, r2)
    tmuwt().add(r2, r2, r1)
    mov(r0, 0, cond='ifb')
    eidx(r0).mov(tmud, r0)
    mov(tmua, r2)
    tmuwt().add(r2, r2, r1)

    eidx(r0)
    sub(null, r0, 1, cond='pushz')
    mov(r0, 0, cond='ifa')
    eidx(r0).mov(tmud, r0)
    mov(tmua, r2)
    tmuwt().add(r2, r2, r1)
    mov(r0, 0, cond='ifb')
    eidx(r0).mov(tmud, r0)
    mov(tmua, r2)
    tmuwt().add(r2, r2, r1)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_cond_push_b():
    """Check that flag B holds the flag A of the previous push."""

    with Driver() as drv:

        code = drv.program(qpu_cond_push_b)
        data = drv.alloc((5, 16), dtype='uint32')
        unif = drv.alloc(1, dtype='uint32')

        data[:] = 0

        unif[0] = data.addresses()[0, 0]

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        # Element index with the lane matched by each push zeroed.
        push0 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11, 12, 13, 14, 15]
        push1 = [0, 1, 2, 3, 4, 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        push2 = [0, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

        expected = np.array(
            [
                # pushz
                push0,  # ifa
                # pushz
                push1,  # ifa
                push0,  # ifb
                # pushz
                push2,  # ifa
                push1,  # ifb
            ],
            dtype='uint32',
        )

        assert (data == expected).all()


# `cond = '{and,nor}*'` updates the conditional flag A and does not affect flag B
@qpu
def qpu_cond_update(asm, cond_update_flags):
    """Emit a kernel that applies every flag-update condition to flag A.

    `cond_update_flags` is iterated twice: first with the updating `sub`
    issued on its own, then with the `sub` chained onto an `add`.  Each
    iteration stores one 16-word row holding 1 where flag A ends up set.
    """

    eidx(r0, sig=ldunif)
    mov(r2, r5)            # r2 = output base address
    shl(r0, r0, 2)
    add(r2, r2, r0)
    shl(r1, 4, 4)          # r1 = row stride (64 bytes)

    for cond_update_flag in cond_update_flags:
        eidx(r0)
        band(r0, r0, 1, cond='pushz')  # fla = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
        eidx(r0)
        sub(null, r0, 5, cond=cond_update_flag)
        mov(r0, 0)
        mov(r0, 1, cond='ifa')
        mov(tmud, r0)
        mov(tmua, r2)
        tmuwt().add(r2, r2, r1)

    for cond_update_flag in cond_update_flags:
        eidx(r0)
        band(r0, r0, 1, cond='pushz')
        eidx(r0)
        add(r3, r0, r0).sub(r0, r0, 5, cond=cond_update_flag)
        mov(r0, 0)
        mov(r0, 1, cond='ifa')
        mov(tmud, r0)
        mov(tmua, r2)
        tmuwt().add(r2, r2, r1)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_cond_update():
    """Check the `and*` / `nor*` flag updates against a NumPy model."""

    cond_update_flags = [
        'andz',
        'andnz',
        'nornz',
        'norz',
        'andn',
        'andnn',
        'nornn',
        'norn',
        'andc',
        'andnc',
        'nornc',
        'norc',
    ]

    def cond_update_op(cond_update_flag):
        """Return the model `f(a, b)` for one update condition name.

        The name is `and`/`nor`, an optional `n` (negate the test) and a
        final letter: `z` tests `b == 0`, anything else tests `b < 0`.
        """
        combine_with_and = cond_update_flag[:3] == 'and'
        test_zero = cond_update_flag[-1] == 'z'
        negate = cond_update_flag[3:-1] == 'n'

        def model(a, b):
            hit = (b == 0) if test_zero else (b < 0)
            if negate:
                hit = np.logical_not(hit)
            if combine_with_and:
                return np.logical_and(a, hit)
            return np.logical_not(np.logical_or(a, hit))

        return model

    with Driver() as drv:

        code = drv.program(lambda asm: qpu_cond_update(asm, cond_update_flags))
        data = drv.alloc((24, 16), dtype='uint32')
        unif = drv.alloc(1, dtype='uint32')

        data[:] = 0

        unif[0] = data.addresses()[0, 0]

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        a = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]) > 0
        b = np.arange(16) - 5

        # NOTE(review): only rows 0-11 (first kernel loop) are checked; the
        # kernel's second loop writes rows 12-23, which nothing asserts on.
        for ix, cond_update_flag in enumerate(cond_update_flags):
            assert np.all(data[ix] == cond_update_op(cond_update_flag)(a, b))


# dual `cond=''` instruction
@qpu
def qpu_cond_combination(asm):
    """Emit a kernel mixing conditions on both halves of a chained pair.

    Four cases are exercised (if/push, push/if, if/if, update/if); each
    stores two 16-word rows, eight rows in total.
    """

    eidx(r0, sig=ldunif)
    mov(r2, r5)            # r2 = output base address
    shl(r0, r0, 2)
    add(r2, r2, r0)
    shl(r1, 4, 4)          # r1 = row stride (64 bytes)

    # if / push
    eidx(r0)
    sub(r0, r0, 10, cond='pushz')
    eidx(r0)
    mov(r0, 5, cond='ifa').sub(r3, r0, 5, cond='pushn')
    mov(tmud, r0)
    mov(tmua, r2)
    tmuwt().add(r2, r2, r1)
    eidx(r0)
    mov(r0, 0, cond='ifa')
    mov(tmud, r0)
    mov(tmua, r2)
    tmuwt().add(r2, r2, r1)

    # push / if
    eidx(r0)
    sub(r0, r0, 10, cond='pushz')
    eidx(r0)
    sub(null, r0, 5, cond='pushn').mov(r0, 5, cond='ifa')
    mov(tmud, r0)
    mov(tmua, r2)
    tmuwt().add(r2, r2, r1)
    eidx(r0)
    mov(r0, 0, cond='ifa')
    mov(tmud, r0)
    mov(tmua, r2)
    tmuwt().add(r2, r2, r1)

    # if / if
    eidx(r0)
    sub(null, r0, 10, cond='pushn')
    eidx(r3)
    mov(r0, 0, cond='ifna').mov(r3, 0, cond='ifna')
    mov(tmud, r0)
    mov(tmua, r2)
    tmuwt().add(r2, r2, r1)
    mov(tmud, r3)
    mov(tmua, r2)
    tmuwt().add(r2, r2, r1)

    # update / if
    eidx(r0)
    sub(null, r0, 10, cond='pushn')
    eidx(r3)
    sub(null, r0, 5, cond='andn').mov(r3, 5, cond='ifa')
    eidx(r0)
    mov(r0, 0, cond='ifa')
    mov(tmud, r0)
    mov(tmua, r2)
    tmuwt().add(r2, r2, r1)
    mov(tmud, r3)
    mov(tmua, r2)
    tmuwt().add(r2, r2, r1)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_cond_combination():
    """Check the eight rows written by `qpu_cond_combination`."""

    with Driver() as drv:

        code = drv.program(qpu_cond_combination)
        data = drv.alloc((8, 16), dtype='uint32')
        unif = drv.alloc(1, dtype='uint32')

        data[:] = 0

        unif[0] = data.addresses()[0, 0]

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        expected = np.array(
            [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 5, 11, 12, 13, 14, 15],
             [0, 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
             [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 5, 11, 12, 13, 14, 15],
             [0, 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
             [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0],
             [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0],
             [0, 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
             [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10, 11, 12, 13, 14, 15]],
            dtype='uint32',
        )

        assert (data == expected).all()


# vflx instructions read a condition flag as int16
@qpu
def qpu_cond_vflx(asm, ops):
    """Emit a kernel that stores the result of each vfl* instruction in `ops`.

    `ops` holds instruction names (e.g. 'vfla'); each is looked up in the
    module globals, applied to r0, and r0 is stored as one 16-word row.
    """

    eidx(r0, sig=ldunif)
    mov(r2, r5)            # r2 = output base address
    shl(r0, r0, 2)
    add(r2, r2, r0)
    shl(r1, 4, 4)          # r1 = row stride (64 bytes)

    # init fla/flb
    bxor(rf0, rf0, rf0).sub(rf1, rf1, rf1)
    eidx(r0)
    band(null, r0, 1 << 0, cond='pushz')  # a = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
    # The second push moves the previous flag A into flag B:
    # a = [1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0], b = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
    band(null, r0, 1 << 1, cond='pushz')

    # vfla / vflna / vflb / vflnb, selected by name
    g = globals()
    for op in ops:
        g[op](r0)
        mov(tmud, r0)
        mov(tmua, r2)
        tmuwt().add(r2, r2, r1)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_cond_vflx():
    """Check vfla / vflna / vflb / vflnb against the flags set by the kernel."""

    flag_a = np.array([1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0], dtype='int16')
    flag_b = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype='int16')

    def expected(op):
        """Return the 32 int16 values expected for instruction name `op`."""
        flag = flag_b if op[-1] == 'b' else flag_a
        # Each 32-bit element is read back as two int16 values.
        result = flag.repeat(2)
        if op[3:-1] == 'n':
            result = 1 - result
        return result

    ops = [
        'vfla',
        'vflna',
        'vflb',
        'vflnb',
    ]

    with Driver() as drv:

        code = drv.program(lambda asm: qpu_cond_vflx(asm, ops))
        data = drv.alloc((len(ops), 32), dtype='int16')
        unif = drv.alloc(1, dtype='uint32')

        data[:] = 0

        unif[0] = data.addresses()[0, 0]

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        for ix, op in enumerate(ops):
            assert (data[ix] == expected(op)).all()


@qpu
def qpu_cond_flx(asm, ops):
    """Emit a kernel that stores the result of each fl* instruction in `ops`.

    Uniforms, in order: address of the value input, address of the flag
    input, address of the output.  Each name in `ops` is looked up in the
    module globals and applied to the value input with `tmud` as
    destination, producing one 16-word output row.
    """

    eidx(r0, sig=ldunif)
    mov(rf0, r5, sig=ldunif)  # rf0 = first uniform (value input)
    mov(rf1, r5, sig=ldunif)  # rf1 = second uniform (flag input)
    shl(r3, 4, 4).mov(rf2, r5)  # r3 = row stride, rf2 = third uniform (output)

    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)
    add(rf2, rf2, r0)

    mov(tmua, rf0, sig=thrsw).add(rf0, rf0, r3)
    nop()
    mov(tmua, rf1, sig=thrsw).add(rf1, rf1, r3)
    nop(sig=ldtmu(r1))
    nop()
    nop(sig=ldtmu(r2))

    # init fla/flb
    mov(null, r2, cond='pushn')
    band(null, r2, 1, cond='pushz')  # fla, flb = ~(r2 & 1), r2 < 0

    g = globals()
    for op in ops:
        g[op](tmud, r1)
        mov(tmua, rf2)
        tmuwt().add(rf2, rf2, r3)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_cond_flx():
    """Check flapush / flbpush / flpop against a NumPy model."""

    ops = [
        'flapush',
        'flbpush',
        'flpop',
    ]

    with Driver() as drv:

        code = drv.program(lambda asm: qpu_cond_flx(asm, ops))
        X1 = drv.alloc((16,), dtype='uint32')
        X2 = drv.alloc((16,), dtype='int32')
        Y = drv.alloc((len(ops), 16), dtype='uint32')
        unif = drv.alloc(3, dtype='uint32')

        # Take the magnitude before casting: converting a negative float to
        # an unsigned integer is undefined and warns on recent NumPy.
        X1[:] = np.abs(np.random.randn(*X1.shape) * (2**24)).astype('uint32')
        X2[:] = np.random.randn(*X2.shape).astype('int32')
        Y[:] = 0

        unif[0] = X1.addresses()[0]
        unif[1] = X2.addresses()[0]
        unif[2] = Y.addresses()[0, 0]

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        fla = 1 - (X2 & 1)   # 1 where X2 is even
        flb = X2 < 0

        for ix, op in enumerate(ops):
            if op[2:] == 'pop':
                expected = X1 >> 2
            else:
                flag = flb if op[2] == 'b' else fla
                expected = (X1 << 2) | (3 * flag)
            assert (Y[ix] == expected).all()
-------------------------------------------------------------------------------- /tests/test_driver.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2019-2020 Idein Inc. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice (including the next 12 | # paragraph) shall be included in all copies or substantial portions of the 13 | # Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 

from videocore6.driver import Driver


def test_mem():
    """Allocate several buffers and check that each keeps its own contents.

    Buffer `i` of `n` is filled with i, i + n, i + 2n, ... and then offset
    in place; all buffers are verified only after every one has been
    written, so a write to one buffer that corrupts another is detected.
    """
    print()

    with Driver() as drv:

        n = 4
        off = 42

        a = []
        for i in range(n):
            buf = drv.alloc((256 * 1024), dtype='uint32')
            buf[:] = range(i, buf.shape[0] * n, n)
            buf[:] += off
            a.append(buf)

        for i in range(n):
            # Reduce with the array's own `.all()` rather than the builtin,
            # which would walk 256K element results one by one in Python.
            assert (a[i][:] == range(i + off, a[i].shape[0] * n + off, n)).all()


# --------------------------------------------------------------------------------
# /tests/test_drm.py:
# --------------------------------------------------------------------------------
#
# Copyright (c) 2019-2020 Idein Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from videocore6.drm_v3d import DRM_V3D


def test_get_param():
    """Query every V3D parameter listed below and print the values in hex.

    Smoke test only: it passes as long as none of the queries raises.
    """
    print()

    with DRM_V3D() as drm:

        # (printed label, parameter id) in query order.
        queries = (
            ('uifcfg', DRM_V3D.V3D_PARAM_V3D_UIFCFG),
            ('hub_ident1', DRM_V3D.V3D_PARAM_V3D_HUB_IDENT1),
            ('hub_ident2', DRM_V3D.V3D_PARAM_V3D_HUB_IDENT2),
            ('hub_ident3', DRM_V3D.V3D_PARAM_V3D_HUB_IDENT3),
            ('core0_ident0', DRM_V3D.V3D_PARAM_V3D_CORE0_IDENT0),
            ('core0_ident1', DRM_V3D.V3D_PARAM_V3D_CORE0_IDENT1),
            ('core0_ident2', DRM_V3D.V3D_PARAM_V3D_CORE0_IDENT2),
            ('supports_tfu', DRM_V3D.V3D_PARAM_SUPPORTS_TFU),
            ('supports_csd', DRM_V3D.V3D_PARAM_SUPPORTS_CSD),
        )

        # Read everything first, then print, so a failing query prints nothing.
        readings = [(label, drm.v3d_get_param(param)) for label, param in queries]

        for label, value in readings:
            print(f'{label}: {value:#010x}')

        print('Consult /sys/kernel/debug/dri/0/v3d_regs for more information')


def test_alloc():
    """Create a 16 MiB buffer object, map it, and print what came back.

    Smoke test only: it passes as long as neither call raises.
    """
    print()

    size = 1 << 24

    with DRM_V3D() as drm:

        handle, phyaddr = drm.v3d_create_bo(size)
        offset = drm.v3d_mmap_bo(handle)

        for label, value in (('size', size), ('handle', handle),
                             ('phyaddr', phyaddr), ('offset', offset)):
            print(f'{label} = {value:#010x}')


# --------------------------------------------------------------------------------
# /tests/test_labels.py:
# --------------------------------------------------------------------------------
#
# Copyright (c) 2019-2020 Idein Inc.
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice (including the next 12 | # paragraph) shall be included in all copies or substantial portions of the 13 | # Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 

import time
from videocore6.driver import Driver
from videocore6.assembler import qpu
import numpy as np


@qpu
def qpu_label_with_namespace(asm):
    """Emit a kernel that reuses the label name `test` in several namespaces.

    The same pattern appears five times: branch to `test`, three `nop()`
    slots, an `add 10` the branch is meant to jump over, the label, then an
    `add 1`.  If every branch lands on the label of its own namespace, r0
    ends up as 5.  The result is stored to the address in the first uniform.
    """

    mov(r0, 0)

    with namespace('ns1'):
        b(R.test, cond='always')
        nop()
        nop()
        nop()
        add(r0, r0, 10)    # skipped when the branch resolves correctly
        L.test
        add(r0, r0, 1)

        with namespace('nested'):
            b(R.test, cond='always')
            nop()
            nop()
            nop()
            add(r0, r0, 10)
            L.test
            add(r0, r0, 1)

    with namespace('ns2'):
        b(R.test, cond='always')
        nop()
        nop()
        nop()
        add(r0, r0, 10)
        L.test
        add(r0, r0, 1)

    # Same label name again, this time outside any namespace.
    b(R.test, cond='always')
    nop()
    nop()
    nop()
    add(r0, r0, 10)
    L.test
    add(r0, r0, 1)

    with namespace('ns3'):
        b(R.test, cond='always')
        nop()
        nop()
        nop()
        add(r0, r0, 10)
        L.test
        add(r0, r0, 1)

    # Store r0 to (first uniform) + 4 * element index.
    eidx(r1, sig=ldunifrf(rf2))
    shl(r1, r1, 2)

    mov(tmud, r0)
    add(tmua, rf2, r1)
    tmuwt()

    # Common kernel epilogue.
    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_label_with_namespace():
    """Run the namespace kernel and expect 5 in every output element."""

    with Driver() as drv:

        program = drv.program(qpu_label_with_namespace)
        out = drv.alloc(16, dtype='uint32')
        uniforms = drv.alloc(1, dtype='uint32')

        # Sentinel, so elements the kernel never wrote are detected.
        out[:] = 1234

        uniforms[0] = out.addresses()[0]

        start = time.time()
        drv.execute(program, uniforms.addresses()[0])
        end = time.time()

        assert (out == 5).all()


# --------------------------------------------------------------------------------
# /tests/test_parallel.py:
# --------------------------------------------------------------------------------
#
# Copyright (c) 2019-2020 Idein Inc.
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice (including the next 12 | # paragraph) shall be included in all copies or substantial portions of the 13 | # Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 


import time
from videocore6.driver import Driver
from videocore6.assembler import qpu
import numpy as np


@qpu
def cost(asm):
    """Emit a countdown loop used as a fixed amount of busy work."""
    shl(r0, 8, 8)
    shl(r0, r0, 8)         # r0 = (8 << 8) << 8, the loop counter
    with loop as countdown:
        sub(r0, r0, 1, cond='pushn')
        countdown.b(cond='anyna')
        nop()
        nop()
        nop()


@qpu
def qpu_serial(asm):
    """Emit a single-thread kernel that copies 16 rows, then runs `cost`.

    Four uniforms are read; the third is the source address and the fourth
    the destination address.  Each of the 16 unrolled steps loads one
    16-word row and stores it, advancing both pointers by 64 bytes.
    """

    nop(sig=ldunifrf(rf0))
    nop(sig=ldunifrf(rf1))
    nop(sig=ldunifrf(rf2))
    nop(sig=ldunifrf(rf3))

    eidx(r0)
    shl(r0, r0, 2)
    add(rf2, rf2, r0)      # source      += 4 * element index
    add(rf3, rf3, r0)      # destination += 4 * element index
    shl(r3, 4, 4)          # r3 = 4 << 4 = 64 bytes = one row

    for _ in range(16):
        mov(tmua, rf2, sig=thrsw).add(rf2, rf2, r3)
        nop()
        nop()
        nop(sig=ldtmu(r0))
        mov(tmud, r0)
        mov(tmua, rf3, sig=thrsw).add(rf3, rf3, r3)
        tmuwt()

    cost(asm)

    # Common kernel epilogue.
    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


# This code requires 16 thread execution.
# If # of thread < 16, thread id (= (tidx & 0b111110) >> 1) could be discontiguous.
# If # of thread > 16, thread id (= (tidx & 0b111110) >> 1) could be duplicated.
@qpu
def qpu_parallel_16(asm):
    """Emit the per-thread kernel: copy one 16-word row, then run `cost`.

    Every thread starts from the same uniform pointer.  It derives its own
    index from `tidx`, uses it to find its row in the uniform table, reads
    the source and destination addresses from that row and copies 16 words.
    """

    tidx(r0, sig=ldunifrf(rf0))     # rf0 = unif[0,0]
    shr(r0, r0, 1).mov(r1, 1)
    shl(r1, r1, 5)
    sub(r1, r1, 1)                  # r1 = (1 << 5) - 1 = 0b11111
    band(rf31, r0, r1)              # rf31 = (qpu_id * 2) + (thread_id >> 1)

    # rf31 * unif[0,1] * 4 bytes + (unif.addresses[0,0] + 2 * 4 bytes)
    nop(sig=ldunifrf(rf1))          # rf1 = unif[0,1]
    shl(r0, rf1, 2)
    umul24(r0, r0, rf31)
    add(r1, rf0, 8)
    add(r0, r0, r1)
    eidx(r1)
    shl(r1, r1, 2)
    add(tmua, r0, r1, sig=thrsw)
    nop()
    nop()
    nop(sig=ldtmu(r0))              # unif[th,2:18]
    mov(r5rep, r0)
    mov(rf2, r5).rotate(r5rep, r0, -1)  # rf2 = unif[th,2]
    mov(rf3, r5)                    # rf3 = unif[th,3]

    # Load this thread's 16 source words.
    eidx(r2)
    shl(r2, r2, 2)
    add(tmua, rf2, r2, sig=thrsw)
    nop()
    nop()
    nop(sig=ldtmu(rf32))

    # Store them to this thread's destination row.
    eidx(r2)
    shl(r2, r2, 2)
    mov(tmud, rf32)
    add(tmua, rf3, r2)
    tmuwt()

    cost(asm)

    # Common kernel epilogue.
    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_parallel_16():
    """Copy a 16x16 array serially and with 16 threads; compare both runs.

    Both outputs must equal the input, and the 16-thread run must take less
    than twice the wall-clock time of the single-thread run.
    """

    with Driver() as drv:

        thread = 16

        serial_code = drv.program(qpu_serial)
        parallel_code = drv.program(qpu_parallel_16)
        X = drv.alloc((thread, 16), dtype='float32')
        Ys = drv.alloc((thread, 16), dtype='float32')
        Yp = drv.alloc((thread, 16), dtype='float32')
        # NOTE(review): the kernel loads unif[th, 2:18]; for the last row
        # that reaches past the end of this (thread, 4) table - confirm the
        # over-read is harmless on the target.
        unif = drv.alloc((thread, 4), dtype='uint32')

        X[:] = np.random.randn(*X.shape)
        Ys[:] = -1
        Yp[:] = -1

        # Row layout: [own address, words per row, source row, destination row]
        unif[:, 0] = unif.addresses()[:, 0]
        unif[:, 1] = unif.shape[1]
        unif[:, 2] = X.addresses()[:, 0]
        unif[:, 3] = Ys.addresses()[:, 0]

        start = time.time()
        drv.execute(serial_code, unif.addresses()[0, 0])
        end = time.time()
        serial_cost = end - start

        unif[:, 3] = Yp.addresses()[:, 0]

        start = time.time()
        drv.execute(parallel_code, unif.addresses()[0, 0], thread=thread)
        end = time.time()
        parallel_cost = end - start

        # Print arrays in full if an assertion fails, without changing the
        # global NumPy print options for the tests that run afterwards.
        with np.printoptions(threshold=np.inf):
            assert (X == Ys).all()
            assert (X == Yp).all()
            assert parallel_cost < serial_cost * 2


# If `barrierid` is removed from this code, `test_barrier` will fail.
@qpu
def qpu_barrier(asm):
    """Emit a kernel in which each thread reads back its neighbour's output.

    Thread `th` copies its input row to its own output row after a delay
    that grows with `th`, waits on the barrier, then loads the output row of
    thread `(th + 1) % 16` and stores that into its own output row.
    """

    tidx(r0, sig=ldunifrf(rf0))     # rf0 = unif[0,0]
    shr(r2, r0, 2)
    band(r1, r0, 0b11)              # thread_id
    band(r2, r2, 0b1111)            # qpu_id
    shr(r1, r1, 1)
    shl(r2, r2, 1)
    add(rf31, r1, r2)               # rf31 = (qpu_id * 2) + (thread_id >> 1)

    nop(sig=ldunifrf(rf1))          # rf1 = unif[0,1]

    # rf31 * unif[0,1] * 4 bytes + (unif.addresses[0,0] + 2 * 4 bytes)
    shl(r0, rf1, 2)
    umul24(r0, r0, rf31)
    add(r1, rf0, 8)
    add(r0, r0, r1)
    eidx(r1)
    shl(r1, r1, 2)
    add(tmua, r0, r1, sig=thrsw)
    nop()
    nop()
    nop(sig=ldtmu(r0))              # unif[th,2:18]
    mov(r5rep, r0)
    mov(rf2, r5).rotate(r5rep, r0, -1)  # rf2 = unif[th,2]
    mov(rf3, r5)                    # rf3 = unif[th,3]

    # Load this thread's input row.
    eidx(r2)
    shl(r2, r2, 2)
    add(tmua, rf2, r2, sig=thrsw)
    nop()
    nop()
    nop(sig=ldtmu(r0))

    # Delay proportional to the thread index (counter = rf31 << 8), so the
    # threads reach the store below at different times.
    mov(r1, rf31)
    shl(r1, r1, 8)
    L.loop
    sub(r1, r1, 1, cond='pushn')
    b(R.loop, cond='anyna')
    nop()
    nop()
    nop()

    # Store the input row to this thread's output row.
    eidx(r2)
    shl(r2, r2, 2)
    mov(tmud, r0)
    add(tmua, rf3, r2)
    tmuwt()

    barrierid(syncb, sig=thrsw)

    add(rf32, rf31, 1)
    band(rf32, rf32, 0b1111)        # rf32 = (rf31 + 1) mod 16

    # rf32 * unif[0,1] * 4 bytes + (unif.addresses[0,0] + 2 * 4 bytes)
    shl(r0, rf1, 2)
    umul24(r0, r0, rf32)
    add(r1, rf0, 8)
    add(r0, r0, r1)
    eidx(r1)
    shl(r1, r1, 2)
    add(tmua, r0, r1, sig=thrsw)
    nop()
    nop()
    nop(sig=ldtmu(r0))              # unif[(th+1)%16,2:18]
    mov(r5rep, r0)
    mov(rf4, r5).rotate(r5rep, r0, -1)  # rf4 = unif[(th+1)%16,2]
    mov(rf5, r5)                    # rf5 = unif[(th+1)%16,3]

    # Load the neighbour's output row ...
    eidx(r2)
    shl(r2, r2, 2)
    add(tmua, rf5, r2, sig=thrsw)
    nop()
    nop()
    nop(sig=ldtmu(r0))

    # ... and overwrite this thread's output row with it.
    eidx(r2)
    shl(r2, r2, 2)
    mov(tmud, r0)
    add(tmua, rf3, r2)
    tmuwt()

    # Common kernel epilogue.
    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_barrier():
    """Run `qpu_barrier` on 16 threads; expect the input rotated by one row."""

    with Driver() as drv:

        thread = 16

        code = drv.program(qpu_barrier)
        X = drv.alloc((thread, 16), dtype='float32')
        Y = drv.alloc((thread, 16), dtype='float32')
        # NOTE(review): the kernel loads unif[th, 2:18]; for the last row
        # that reaches past the end of this (thread, 4) table - confirm the
        # over-read is harmless on the target.
        unif = drv.alloc((thread, 4), dtype='uint32')

        X[:] = np.random.randn(*X.shape)
        Y[:] = -1

        # Row layout: [own address, words per row, source row, destination row]
        unif[:, 0] = unif.addresses()[:, 0]
        unif[:, 1] = unif.shape[1]
        unif[:, 2] = X.addresses()[:, 0]
        unif[:, 3] = Y.addresses()[:, 0]

        start = time.time()
        drv.execute(code, unif.addresses()[0, 0], thread=thread)
        end = time.time()

        # Print arrays in full if the assertion fails, without changing the
        # global NumPy print options for the tests that run afterwards.
        with np.printoptions(threshold=np.inf):
            assert (Y == np.concatenate([X[1:], X[:1]])).all()


# --------------------------------------------------------------------------------
# /tests/test_sfu.py:
# --------------------------------------------------------------------------------
#
# Copyright (c) 2019-2020 Idein Inc.
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice (including the next 12 | # paragraph) shall be included in all copies or substantial portions of the 13 | # Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 

import time
from videocore6.driver import Driver
from videocore6.assembler import qpu
import numpy as np


def sfu_sin(x):
    """Reference for the SFU ``sin``: sin(pi * x), pinned to -1 / 1 outside [-0.5, 0.5]."""
    expected = np.sin(x * np.pi)
    expected[x < -0.5] = -1
    expected[x > 0.5] = 1
    return expected


# NumPy reference implementation for each SFU register / op name under test.
ops = {
    # sfu regs/ops
    'recip': lambda x: 1 / x,
    'rsqrt': lambda x: 1 / np.sqrt(x),
    'exp': lambda x: 2 ** x,
    'log': np.log2,
    'sin': sfu_sin,
    'rsqrt2': lambda x: 1 / np.sqrt(x),
}


# SFU IO registers
@qpu
def qpu_sfu_regs(asm, sfu_regs):
    """For each name in ``sfu_regs``: write X to that register and store r4 as one row of Y."""

    eidx(r0, sig=ldunif)
    mov(rf0, r5, sig=ldunif)                    # in
    shl(r3, 4, 4).mov(rf1, r5)                  # r3 = 4 << 4, the per-row address step

    # Per-lane addresses: base + 4 * eidx.
    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig=thrsw).add(rf0, rf0, r3)
    nop()
    nop()
    nop(sig=ldtmu(r1))

    # Register objects are looked up by name in the module globals.
    registers = globals()
    for name in sfu_regs:
        mov(registers[name], r1)
        nop()                                   # required ? enough ?
        mov(tmud, r4)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    # Common thrsw/nop tail that closes every kernel in this module.
    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def boilerplate_sfu_regs(sfu_regs, domain_limitter):
    """Run ``qpu_sfu_regs`` on random input and compare each row with ``ops``.

    ``domain_limitter`` maps the random normal samples into the valid input
    domain of the functions being tested.
    """

    with Driver() as drv:

        code = drv.program(lambda asm: qpu_sfu_regs(asm, sfu_regs))
        X = drv.alloc((16, ), dtype='float32')
        Y = drv.alloc((len(sfu_regs), 16), dtype='float32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = domain_limitter(np.random.randn(*X.shape).astype('float32'))
        Y[:] = 0.0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        for row, name in enumerate(sfu_regs):
            msg = 'mov({}, None)'.format(name)
            assert np.allclose(Y[row], ops[name](X), rtol=1e-4), msg


def test_sfu_regs():
    """Exercise the SFU registers; the second group gets strictly positive input."""
    boilerplate_sfu_regs(['recip', 'exp', 'sin'], lambda x: x)
    boilerplate_sfu_regs(['rsqrt', 'log', 'rsqrt2'], lambda x: x ** 2 + 1e-6)


# SFU ops
@qpu
def qpu_sfu_ops(asm, sfu_ops):
    """For each name in ``sfu_ops``: apply that instruction to X and store the result as one row of Y."""

    eidx(r0, sig=ldunif)
    mov(rf0, r5, sig=ldunif)                    # in
    shl(r3, 4, 4).mov(rf1, r5)                  # r3 = 4 << 4, the per-row address step

    # Per-lane addresses: base + 4 * eidx.
    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig=thrsw).add(rf0, rf0, r3)
    nop()
    nop()
    nop(sig=ldtmu(r1))

    # Instruction emitters are looked up by name in the module globals.
    emitters = globals()
    for name in sfu_ops:
        emitters[name](rf2, r1)                 # ATTENTION: SFU ops requires rfN ?
        mov(tmud, rf2)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    # Common thrsw/nop tail that closes every kernel in this module.
    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def boilerplate_sfu_ops(sfu_ops, domain_limitter):
    """Run ``qpu_sfu_ops`` on random input and compare each row with ``ops``.

    ``domain_limitter`` maps the random normal samples into the valid input
    domain of the functions being tested.
    """

    with Driver() as drv:

        code = drv.program(lambda asm: qpu_sfu_ops(asm, sfu_ops))
        X = drv.alloc((16, ), dtype='float32')
        Y = drv.alloc((len(sfu_ops), 16), dtype='float32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = domain_limitter(np.random.randn(*X.shape).astype('float32'))
        Y[:] = 0.0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        for row, name in enumerate(sfu_ops):
            msg = '{}(None, None)'.format(name)
            assert np.allclose(Y[row], ops[name](X), rtol=1e-4), msg


def test_sfu_ops():
    """Exercise the SFU instructions; the second group gets strictly positive input."""
    boilerplate_sfu_ops(['recip', 'exp', 'sin'], lambda x: x)
    boilerplate_sfu_ops(['rsqrt', 'log', 'rsqrt2'], lambda x: x ** 2 + 1e-6)

# --------------------------------------------------------------------------------
# /tests/test_signals.py:
# --------------------------------------------------------------------------------

# Copyright (c) 2019-2020 Idein Inc.
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy 5 | # of this software and associated documentation files (the "Software"), to deal 6 | # in the Software without restriction, including without limitation the rights 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | # copies of the Software, and to permit persons to whom the Software is 9 | # furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice (including the next 12 | # paragraph) shall be included in all copies or substantial portions of the 13 | # Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 


import time
from videocore6.driver import Driver
from videocore6.assembler import qpu
import numpy as np


# Conventions shared by the kernels of this module (see each test for the
# uniform layout): unif[0] is the address of X and unif[1] the address of Y.
# After the prologue rf0 / rf1 hold those addresses plus 4 * eidx, and
# r3 = 4 << 4 = 64 is the byte step between consecutive 16-lane rows.
# Every kernel closes with the same thrsw/nop tail.


# ldtmu
@qpu
def qpu_signal_ldtmu(asm):
    """Issue ``ldtmu`` on an instruction that also runs an add-ALU and a mul-ALU op.

    Stores three rows to Y: the loaded X, r0 (1.0 + 1.0) and r1 (2.0 * 2.0).
    """

    eidx(r0, sig=ldunif)
    mov(rf0, r5, sig=ldunif)
    shl(r3, 4, 4).mov(rf1, r5)

    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig=thrsw).add(rf0, rf0, r3)         # start load X
    mov(r0, 1.0)                                        # r0 <- 1.0
    mov(r1, 2.0)                                        # r1 <- 2.0
    fadd(r0, r0, r0).fmul(r1, r1, r1, sig=ldtmu(rf31))  # r0 <- 2 * r0, r1 <- r1 ^ 2, rf31 <- X
    mov(tmud, rf31)
    mov(tmua, rf1)
    tmuwt().add(rf1, rf1, r3)
    mov(tmud, r0)
    mov(tmua, rf1)
    tmuwt().add(rf1, rf1, r3)
    mov(tmud, r1)
    mov(tmua, rf1)
    tmuwt().add(rf1, rf1, r3)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_signal_ldtmu():
    """Y[0] must equal X, Y[1] must be 2 and Y[2] must be 4."""

    with Driver() as drv:

        code = drv.program(qpu_signal_ldtmu)
        X = drv.alloc((16, ), dtype='float32')
        Y = drv.alloc((3, 16), dtype='float32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.random.randn(*X.shape).astype('float32')
        Y[:] = 0.0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        assert (Y[0] == X).all()
        assert (Y[1] == 2).all()
        assert (Y[2] == 4).all()


# rot signal with rN source performs as a full rotate
@qpu
def qpu_full_rotate(asm):
    """Rotate ``r0 + r0`` by every amount in [-15, 15], immediate form then r5 form.

    Each result is stored as one row of Y: 31 rows for the immediate amounts
    followed by 31 rows for the amounts taken from r5.
    """

    eidx(r0, sig=ldunif)
    mov(rf0, r5, sig=ldunif)
    shl(r3, 4, 4).mov(rf1, r5)

    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig=thrsw).add(rf0, rf0, r3)
    nop()
    nop()
    nop(sig=ldtmu(r0))
    nop()                                       # required before rotate

    # Immediate rotation amount.
    for i in range(-15, 16):
        nop().add(r1, r0, r0, sig=rot(i))
        mov(tmud, r1)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    # Rotation amount taken from r5. The signal must name r5, not the
    # immediate, otherwise this loop only repeats the one above.
    for i in range(-15, 16):
        mov(r5, i)
        nop()                                   # required
        nop().add(r1, r0, r0, sig=rot(r5))
        mov(tmud, r1)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_full_rotate():
    """Both rotation forms must rotate all 16 lanes of 2 * X."""

    with Driver() as drv:

        code = drv.program(qpu_full_rotate)
        X = drv.alloc((16, ), dtype='int32')
        Y = drv.alloc((2, len(range(-15, 16)), 16), dtype='int32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0, 0]

        drv.execute(code, unif.addresses()[0])

        # The kernel adds r0 to itself, hence the factor 2.
        expected = np.concatenate([X, X]) * 2
        for ix, shift in enumerate(range(-15, 16)):
            assert (Y[:, ix] == expected[(-shift % 16):(-shift % 16) + 16]).all()


# rotate alias
@qpu
def qpu_rotate_alias(asm):
    """Exercise the ``rotate`` alias: add/mul ALU forms, immediate and r5 amounts."""

    eidx(r0, sig=ldunif)
    mov(rf0, r5, sig=ldunif)
    shl(r3, 4, 4).mov(rf1, r5)

    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig=thrsw).add(rf0, rf0, r3)
    nop()
    nop()
    nop(sig=ldtmu(r0))
    nop()                                       # required before rotate

    for i in range(-15, 16):
        rotate(r1, r0, i)                       # add alias
        mov(tmud, r1)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    for i in range(-15, 16):
        nop().rotate(r1, r0, i)                 # mul alias
        mov(tmud, r1)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    for i in range(-15, 16):
        mov(r5, i)
        nop()                                   # required
        rotate(r1, r0, r5)                      # add alias
        mov(tmud, r1)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    for i in range(-15, 16):
        mov(r5, i)
        nop()                                   # required
        nop().rotate(r1, r0, r5)                # mul alias
        mov(tmud, r1)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_rotate_alias():
    """All four alias forms must rotate all 16 lanes of X."""

    with Driver() as drv:

        code = drv.program(qpu_rotate_alias)
        X = drv.alloc((16, ), dtype='int32')
        Y = drv.alloc((4, len(range(-15, 16)), 16), dtype='int32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0, 0]

        drv.execute(code, unif.addresses()[0])

        expected = np.concatenate([X, X])
        for ix, shift in enumerate(range(-15, 16)):
            assert (Y[:, ix] == expected[(-shift % 16):(-shift % 16) + 16]).all()


# rot signal with rfN source performs as a quad rotate
@qpu
def qpu_quad_rotate(asm):
    """Rotate ``rf32 + rf32`` by every amount in [-15, 15], immediate form then r5 form."""

    eidx(r0, sig=ldunif)
    mov(rf0, r5, sig=ldunif)
    shl(r3, 4, 4).mov(rf1, r5)

    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig=thrsw).add(rf0, rf0, r3)
    nop()
    nop()
    nop(sig=ldtmu(rf32))
    nop()                                       # required before rotate

    # Immediate rotation amount.
    for i in range(-15, 16):
        nop().add(r1, rf32, rf32, sig=rot(i))
        mov(tmud, r1)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    # Rotation amount taken from r5.
    for i in range(-15, 16):
        mov(r5, i)
        nop()                                   # required
        nop().add(r1, rf32, rf32, sig=rot(r5))
        mov(tmud, r1)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_quad_rotate():
    """Both rotation forms must rotate 2 * X within each group of four lanes."""

    with Driver() as drv:

        code = drv.program(qpu_quad_rotate)
        X = drv.alloc((16, ), dtype='int32')
        Y = drv.alloc((2, len(range(-15, 16)), 16), dtype='int32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0, 0]

        drv.execute(code, unif.addresses()[0])

        # One row per quad, doubled along the lane axis so a slice can wrap.
        expected = np.concatenate([X.reshape(4, 4)] * 2, axis=1) * 2
        for ix, shift in enumerate(range(-15, 16)):
            assert (Y[:, ix] == expected[:, (-shift % 4):(-shift % 4) + 4].ravel()).all()


# quad_rotate alias
@qpu
def qpu_quad_rotate_alias(asm):
    """Exercise the ``quad_rotate`` alias: add/mul ALU forms, immediate and r5 amounts."""

    eidx(r0, sig=ldunif)
    mov(rf0, r5, sig=ldunif)
    shl(r3, 4, 4).mov(rf1, r5)

    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig=thrsw).add(rf0, rf0, r3)
    nop()
    nop()
    nop(sig=ldtmu(rf32))
    nop()                                       # required before rotate

    for i in range(-15, 16):
        quad_rotate(r1, rf32, i)                # add alias
        mov(tmud, r1)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    for i in range(-15, 16):
        nop().quad_rotate(r1, rf32, i)          # mul alias
        mov(tmud, r1)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    for i in range(-15, 16):
        mov(r5, i)
        nop()                                   # required
        quad_rotate(r1, rf32, r5)               # add alias
        mov(tmud, r1)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    for i in range(-15, 16):
        mov(r5, i)
        nop()                                   # required
        nop().quad_rotate(r1, rf32, r5)         # mul alias
        mov(tmud, r1)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_quad_rotate_alias():
    """All four alias forms must rotate X within each group of four lanes."""

    with Driver() as drv:

        code = drv.program(qpu_quad_rotate_alias)
        X = drv.alloc((16, ), dtype='int32')
        Y = drv.alloc((4, len(range(-15, 16)), 16), dtype='int32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0, 0]

        drv.execute(code, unif.addresses()[0])

        expected = np.concatenate([X.reshape(4, 4)] * 2, axis=1)
        for ix, shift in enumerate(range(-15, 16)):
            assert (Y[:, ix] == expected[:, (-shift % 4):(-shift % 4) + 4].ravel()).all()


# instruction with r5rep dst performs as a full broadcast
@qpu
def qpu_full_broadcast(asm):
    """Write rotated X to ``r5rep`` for every amount in [-15, 15] and store r5."""

    eidx(r0, sig=ldunif)
    mov(rf0, r5, sig=ldunif)
    shl(r3, 4, 4).mov(rf1, r5)

    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig=thrsw).add(rf0, rf0, r3)
    nop()
    nop()
    nop(sig=ldtmu(r0))
    nop()                                       # required before rotate

    for i in range(-15, 16):
        # No rot signal is attached when the amount is 0.
        nop().mov(r5rep, r0, sig=[rot(i)] if i != 0 else [])
        mov(tmud, r5)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_full_broadcast():
    """Every lane of row ``ix`` must hold the element rotated into lane 0."""

    with Driver() as drv:

        code = drv.program(qpu_full_broadcast)
        X = drv.alloc((16, ), dtype='int32')
        Y = drv.alloc((len(range(-15, 16)), 16), dtype='int32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        expected = X
        for ix, shift in enumerate(range(-15, 16)):
            assert (Y[ix] == expected[(-shift % 16)].repeat(16)).all()


# broadcast alias
@qpu
def qpu_broadcast_alias(asm):
    """Same as ``qpu_full_broadcast`` but using the ``broadcast`` destination alias."""

    eidx(r0, sig=ldunif)
    mov(rf0, r5, sig=ldunif)
    shl(r3, 4, 4).mov(rf1, r5)

    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig=thrsw).add(rf0, rf0, r3)
    nop()
    nop()
    nop(sig=ldtmu(r0))
    nop()                                       # required before rotate

    for i in range(-15, 16):
        # No rot signal is attached when the amount is 0.
        nop().mov(broadcast, r0, sig=[rot(i)] if i != 0 else [])
        mov(tmud, r5)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_broadcast_alias():
    """Every lane of row ``ix`` must hold the element rotated into lane 0."""

    with Driver() as drv:

        code = drv.program(qpu_broadcast_alias)
        X = drv.alloc((16, ), dtype='int32')
        Y = drv.alloc((len(range(-15, 16)), 16), dtype='int32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        expected = X
        for ix, shift in enumerate(range(-15, 16)):
            assert (Y[ix] == expected[(-shift % 16)].repeat(16)).all()


# instruction with r5 dst performs as a quad broadcast
@qpu
def qpu_quad_broadcast(asm):
    """Write rotated X to ``r5`` for every amount in [-15, 15] and store r5."""

    eidx(r0, sig=ldunif)
    mov(rf0, r5, sig=ldunif)
    shl(r3, 4, 4).mov(rf1, r5)

    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig=thrsw).add(rf0, rf0, r3)
    nop()
    nop()
    nop(sig=ldtmu(r0))
    nop()                                       # required before rotate

    for i in range(-15, 16):
        # No rot signal is attached when the amount is 0.
        nop().mov(r5, r0, sig=[rot(i)] if i != 0 else [])
        mov(tmud, r5)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_quad_broadcast():
    """Each group of four lanes must hold the element rotated into its first lane."""

    with Driver() as drv:

        code = drv.program(qpu_quad_broadcast)
        X = drv.alloc((16, ), dtype='int32')
        Y = drv.alloc((len(range(-15, 16)), 16), dtype='int32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        expected = np.concatenate([X, X])
        for ix, shift in enumerate(range(-15, 16)):
            assert (Y[ix] == expected[(-shift % 16):(-shift % 16) + 16:4].repeat(4)).all()


# quad_broadcast alias
@qpu
def qpu_quad_broadcast_alias(asm):
    """Same as ``qpu_quad_broadcast`` but using the ``quad_broadcast`` destination alias."""

    eidx(r0, sig=ldunif)
    mov(rf0, r5, sig=ldunif)
    shl(r3, 4, 4).mov(rf1, r5)

    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig=thrsw).add(rf0, rf0, r3)
    nop()
    nop()
    nop(sig=ldtmu(r0))
    nop()                                       # required before rotate

    for i in range(-15, 16):
        # No rot signal is attached when the amount is 0.
        nop().mov(quad_broadcast, r0, sig=[rot(i)] if i != 0 else [])
        mov(tmud, r5)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_quad_broadcast_alias():
    """Each group of four lanes must hold the element rotated into its first lane."""

    with Driver() as drv:

        code = drv.program(qpu_quad_broadcast_alias)
        X = drv.alloc((16, ), dtype='int32')
        Y = drv.alloc((len(range(-15, 16)), 16), dtype='int32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        expected = np.concatenate([X, X])
        for ix, shift in enumerate(range(-15, 16)):
            assert (Y[ix] == expected[(-shift % 16):(-shift % 16) + 16:4].repeat(4)).all()

# --------------------------------------------------------------------------------
# /tests/test_tmu.py:
# --------------------------------------------------------------------------------

# Copyright (c) 2019-2020 Idein Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import time
from videocore6.driver import Driver
from videocore6.assembler import qpu
import numpy as np


@qpu
def qpu_tmu_write(asm):
    """Write n vectors of 16 consecutive integers through the TMU.

    Uniforms: the vector count, then the destination address.
    """

    nop(sig=ldunif)
    mov(r1, r5, sig=ldunif)

    # r2 = addr + eidx * 4
    # rf0 = eidx
    eidx(r0).mov(r2, r5)
    shl(r0, r0, 2).mov(rf0, r0)
    add(r2, r2, r0)

    with loop as l:

        # rf0: Data to be written.
        # r0: Overwritten.
        # r2: Address to write data to.

        sub(r1, r1, 1, cond='pushz').mov(tmud, rf0)
        l.b(cond='anyna')
        # rf0 += 16
        sub(rf0, rf0, -16).mov(tmua, r2)
        # r2 += 64
        shl(r0, 4, 4)
        tmuwt().add(r2, r2, r0)

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_tmu_write():
    """The buffer must end up holding 0, 1, ..., n * 16 - 1."""

    n = 4096

    with Driver(data_area_size=n * 16 * 4 + 2 * 4) as drv:

        code = drv.program(qpu_tmu_write)
        data = drv.alloc(n * 16, dtype='uint32')
        unif = drv.alloc(2, dtype='uint32')

        data[:] = 0xdeadbeaf
        unif[0] = n
        unif[1] = data.addresses()[0]

        drv.execute(code, unif.addresses()[0])

        assert all(data == range(n * 16))


@qpu
def qpu_tmu_vec_write(asm, configs, vec_offset):
    """Write per-lane vectors of 1 to 4 words through the TMU, n times.

    ``configs`` lists the vector length of each write in one loop iteration
    (1 to 4 entries, each 1 to 4). ``vec_offset`` (0 to 3) is the word offset
    added to every lane address. Uniforms: destination address, then the
    iteration count. The first write of each iteration uses ``tmuau``.
    """

    reg_addr = rf0
    reg_n = rf1

    nop(sig=ldunifrf(reg_addr))
    nop(sig=ldunifrf(reg_n))

    with loop as l:

        assert 1 <= len(configs) <= 4
        for i, config in enumerate(configs):

            # Component k of lane e is (e << 16) + k.
            eidx(r0)
            shl(r0, r0, 0xfffffff0)             # 0xfffffff0 % 32 = 16
            assert 1 <= config <= 4
            for j in range(config):
                mov(tmud, r0).add(r0, r0, 1)

            assert 0 <= vec_offset <= 3
            # addr + 4 * 4 * eidx + 4 * vec_offset
            eidx(r0)
            shl(r0, r0, 4)
            sub(r0, r0, -4 * vec_offset)
            add(tmuau if i == 0 else tmua, reg_addr, r0)

            # addr += 4 * len(configs) * 16
            shl(r0, 4, 4)
            umul24(r0, r0, len(configs))
            add(reg_addr, reg_addr, r0)

        sub(reg_n, reg_n, 1, cond='pushz')
        l.b(cond='na0')
        nop()
        nop()
        nop()

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_tmu_vec_write():
    """Check vector writes for every vector length, including the 16-byte wrap."""

    n = 123

    # The number of 32-bit values in a vector element per pixel is 1, 2, 3, or 4.
    # For example, with four 32-bit config:
    #   tmud <- r0
    #   tmud <- r1
    #   tmud <- r2
    #   tmud <- r3
    #   tmuau <- addr + 4 * 4 * eidx
    # results in:
    #   addr + 0x00: r0[ 0], r1[ 0], r2[ 0], r3[ 0], r0[ 1], r1[ 1], ..., r3[ 3]
    #   addr + 0x40: r0[ 4], r1[ 4], r2[ 4], r3[ 4], r0[ 5], r1[ 5], ..., r3[ 7]
    #   addr + 0x80: ...
    #   addr + 0xc0: r0[12], r1[12], r2[12], r3[12], r0[13], r1[13], ..., r3[15]
    # where rn[i] (0 <= i < 16) is the value in register rn of pixel (eidx) i.
    configs = [4, 3, 2, 1]

    # The element per pixel is wrapped modulo 16 bytes.
    # For example, if the above address setting is addr + 4 * 4 * eidx + 4, then
    #   addr + 0x00: r3[ 0], r0[ 0], r1[ 0], r2[ 0], r3[ 1], r0[ 1], ..., r2[ 3]
    #   addr + 0x40: r3[ 4], r0[ 4], r1[ 4], r2[ 4], r3[ 5], r0[ 5], ..., r2[ 7]
    #   addr + 0x80: ...
    #   addr + 0xc0: r3[12], r0[12], r1[12], r2[12], r3[13], r0[13], ..., r2[15]
    vec_offset = 3

    data_default = 0xdeadbeef

    with Driver() as drv:

        code = drv.program(qpu_tmu_vec_write, configs, vec_offset)
        data = drv.alloc(16 * 4 * len(configs) * n, dtype='uint32')
        unif = drv.alloc(2 + n, dtype='uint32')

        data[:] = data_default

        unif[0] = data.addresses()[0]
        unif[1] = n

        # One configuration byte per write, first write in the lowest byte;
        # unused upper bytes stay 0xff.
        conf = 0xffffffff
        for config in reversed(configs):
            conf <<= 8
            conf |= {1: 0xff, 2: 0xfa, 3: 0xfb, 4: 0xfc}[config]
        conf &= 0xffffffff
        unif[2:] = conf

        drv.execute(code, unif.addresses()[0])

        for i, row in enumerate(data.reshape(-1, 4 * 16)):
            config = configs[i % len(configs)]
            for j, vec in enumerate(row.reshape(-1, 4)):
                ref = list(range(j << 16, (j << 16) + config)) + [data_default] * (4 - config)
                assert all(np.roll(vec, -vec_offset) == ref)


@qpu
def qpu_tmu_read(asm):
    """Read n vectors through the TMU, add 1 to each word and write them back.

    Uniforms: the vector count, the source address, the destination address.
    """

    # r0: Number of vectors to read.
    # r1: Pointer to the read vectors + eidx * 4.
    # r2: Pointer to the write vectors + eidx * 4
    eidx(r2, sig=ldunif)
    mov(r0, r5, sig=ldunif)
    shl(r2, r2, 2).mov(r1, r5)
    add(r1, r1, r2, sig=ldunif)
    add(r2, r5, r2)

    with loop as l:

        mov(tmua, r1, sig=thrsw)
        nop()
        nop()
        nop(sig=ldtmu(rf0))

        sub(r0, r0, 1, cond='pushz').add(tmud, rf0, 1)
        l.b(cond='anyna')
        shl(r3, 4, 4).mov(tmua, r2)
        # r1 += 64
        # r2 += 64
        add(r1, r1, r3).add(r2, r2, r3)
        tmuwt()

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_tmu_read():
    """Incrementing the buffer in place must yield 1, 2, ..., n * 16."""

    n = 4096

    with Driver() as drv:

        code = drv.program(qpu_tmu_read)
        data = drv.alloc(n * 16, dtype='uint32')
        unif = drv.alloc(3, dtype='uint32')

        data[:] = range(len(data))
        unif[0] = n
        unif[1] = data.addresses()[0]
        unif[2] = data.addresses()[0]

        drv.execute(code, unif.addresses()[0])

        assert all(data == range(1, n * 16 + 1))


@qpu
def qpu_tmu_vec_read(asm, configs, vec_offset):
    """Read per-lane vectors of 1 to 4 words and store the per-lane sum.

    ``configs`` lists the vector length of each read in one loop iteration
    (1 to 4 entries, each 1 to 4). ``vec_offset`` (0 to 3) is the word offset
    added to every lane address. Components that are not loaded are replaced
    by ``eidx``. Uniforms: source address, destination address, iteration count.
    """

    reg_src = rf0
    reg_dst = rf1
    reg_n = rf2

    nop(sig=ldunifrf(reg_src))
    nop(sig=ldunifrf(reg_dst))
    nop(sig=ldunifrf(reg_n))

    # dst += 4 * eidx
    eidx(r0)
    shl(r0, r0, 2)
    add(reg_dst, reg_dst, r0)

    with loop as l:

        mov(r4, 0)

        assert 1 <= len(configs) <= 4
        for i, config in enumerate(configs):

            assert 1 <= config <= 4
            assert 0 <= vec_offset <= 3
            # addr + 4 * 4 * eidx + 4 * vec_offset
            eidx(r0)
            shl(r0, r0, 4)
            sub(r0, r0, -4 * vec_offset)
            add(tmuau if i == 0 else tmua, reg_src, r0, sig=thrsw)
            nop()
            nop()
            nop(sig=ldtmu(r0))
            nop(sig=ldtmu(r1)) if config >= 2 else eidx(r1)
            nop(sig=ldtmu(r2)) if config >= 3 else eidx(r2)
            nop(sig=ldtmu(r3)) if config >= 4 else eidx(r3)

            # r4 += r0 + r1 + r2 + r3
            add(r0, r0, r1).add(r2, r2, r3)
            add(r0, r0, r2)
            add(r4, r4, r0)
            # src += 4 * 4 * 16
            shl(r0, 4, 4)
            umul24(r0, r0, 4)
            add(reg_src, reg_src, r0)

        mov(tmud, r4)
        # If the configs are shifted out, then 0xff (per-pixel regular 32-bit
        # write) is filled in.
        mov(tmua, reg_dst)

        # dst += 4 * 16
        shl(r0, 4, 4)
        add(reg_dst, reg_dst, r0)

        sub(reg_n, reg_n, 1, cond='pushz')
        l.b(cond='na0')
        nop()
        nop()
        nop()

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_tmu_vec_read():
    """Check vector reads for every vector length, including the 16-byte wrap."""

    # The settings, the number of elements in a vector, and 16-byte wrapping are
    # the same as the vector writes.

    n = 123
    configs = [4, 3, 2, 1]
    vec_offset = 1

    with Driver() as drv:

        code = drv.program(qpu_tmu_vec_read, configs, vec_offset)
        src = drv.alloc((n, 16 * 4 * len(configs)), dtype='uint32')
        dst = drv.alloc((n, 16), dtype='uint32')
        unif = drv.alloc(3 + n, dtype='uint32')

        src[:, :] = np.arange(src.size, dtype=src.dtype).reshape(src.shape)
        dst[:, :] = 0

        unif[0] = src.addresses()[0, 0]
        unif[1] = dst.addresses()[0, 0]
        unif[2] = n

        # One configuration byte per read, first read in the lowest byte;
        # unused upper bytes stay 0xff.
        conf = 0xffffffff
        for config in reversed(configs):
            conf <<= 8
            conf |= {1: 0xff, 2: 0xfa, 3: 0xfb, 4: 0xfc}[config]
        conf &= 0xffffffff
        unif[3:] = conf

        drv.execute(code, unif.addresses()[0])

        for i, vec in enumerate(dst):
            data = src.shape[1] * i + np.arange(src.shape[1], dtype='uint32').reshape(len(configs), 16, 4)
            s = [0] * 16
            for j, config in enumerate(configs):
                for lane in range(16):
                    for k in range(config):
                        s[lane] += data[j, lane, (k + vec_offset) % 4]
                    # The kernel substitutes eidx for each component not loaded.
                    s[lane] += lane * (4 - config)
            assert all(vec == s)


# VC4 TMU cache & DMA break memory consistency.
# How about VC6 TMU ?
@qpu
def qpu_tmu_keeps_memory_consistency(asm):
    """Read-increment-write the same address twice through the TMU."""

    nop(sig=ldunifrf(r0))

    mov(tmua, r0, sig=thrsw)
    nop()
    nop()
    nop(sig=ldtmu(r1))

    add(tmud, r1, 1)
    mov(tmua, r0)
    tmuwt()

    mov(tmua, r0, sig=thrsw)
    nop()
    nop()
    nop(sig=ldtmu(r1))

    add(tmud, r1, 1)
    mov(tmua, r0)
    tmuwt()

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_tmu_keeps_memory_consistency():
    """The second read must see the first write: data[0] goes 1 -> 2 -> 3."""

    with Driver() as drv:

        code = drv.program(qpu_tmu_keeps_memory_consistency)
        data = drv.alloc(16, dtype='uint32')
        unif = drv.alloc(3, dtype='uint32')

        data[:] = 1
        unif[0] = data.addresses()[0]

        drv.execute(code, unif.addresses()[0])

        assert (data[0] == 3).all()
        assert (data[1:] == 1).all()


@qpu
def qpu_tmu_read_tmu_write_uniform_read(asm):
    """Increment ``data`` through the TMU, then read data[0] back as a uniform.

    The uniform value plus 1 is stored to ``result``.
    """

    eidx(r0, sig=ldunifrf(rf0))
    shl(r0, r0, 2)
    add(rf0, rf0, r0, sig=ldunifrf(rf1))
    add(rf1, rf1, r0)

    mov(tmua, rf0, sig=thrsw)
    nop()
    nop()
    nop(sig=ldtmu(r0))                          # r0 = [1,...,1]

    add(tmud, r0, 1)
    mov(tmua, rf0)                              # data = [2,...,2]
    tmuwt()

    b(R.set_unif_addr, cond='always').unif_addr(rf0)    # unif_addr = data.addresses()[0]
    nop()
    nop()
    nop()
    L.set_unif_addr

    # r0 = [data[0],...,data[0]]. This would be 2 if the uniform read observed
    # the TMU write above; the accompanying test asserts result == 2, i.e. the
    # value read here is still 1.
    nop(sig=ldunifrf(r0))

    add(tmud, r0, 1)
    mov(tmua, rf1)                              # result = r0 + 1
    tmuwt()

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_tmu_read_tmu_write_uniform_read():
    """Record what a uniform read returns right after a TMU write to the same address."""

    with Driver() as drv:

        code = drv.program(qpu_tmu_read_tmu_write_uniform_read)
        data = drv.alloc(16, dtype='uint32')
        result = drv.alloc(16, dtype='uint32')
        unif = drv.alloc(3, dtype='uint32')

        data[:] = 1
        unif[0] = data.addresses()[0]
        unif[1] = result.addresses()[0]

        drv.execute(code, unif.addresses()[0])

        assert (data == 2).all()
        assert (result == 2).all()  # !? not 3 ?

# --------------------------------------------------------------------------------
# /tests/test_unifa.py:
# --------------------------------------------------------------------------------

# Copyright (c) 2021 Idein Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# Kernel exercising the sideband uniform stream (unifa/ldunifa), alone and
# interleaved with the ordinary uniform stream (unif_addr/ldunif).
#
# Uniforms consumed, in order: n, address of src0, address of src1, address
# of dst. Every value read from a uniform stream is stored as one row of dst;
# the host test below spells out the expected row layout.
#
# NOTE(review): instruction semantics (delay slots after `b`, loaded uniforms
# arriving in r5, `cond='pushz'`/`'na0'` flag handling) are defined by the
# videocore6 assembler and hardware, not by this file; the notes below are
# assumptions to be confirmed.
@qpu
def qpu_unifa(asm):

    reg_n = rf0
    reg_src0 = rf1
    reg_src1 = rf2
    reg_dst = rf3
    reg_inc = rf4
    reg_tmp = rf5  # not referenced anywhere else in this kernel

    nop(sig=ldunifrf(reg_n))
    nop(sig=ldunifrf(reg_src0))
    nop(sig=ldunifrf(reg_src1))
    nop(sig=ldunifrf(reg_dst))

    # presumably turns each base address into a per-lane address
    # (base + 4 * element index) -- TODO confirm
    eidx(r0)
    shl(r0, r0, 2)
    add(reg_src0, reg_src0, r0)
    add(reg_src1, reg_src1, r0)
    add(reg_dst, reg_dst, r0)

    # presumably 4 << 4 = 64 bytes, i.e. one 16-word uint32 row of dst
    shl(reg_inc, 4, 4)

    # Phase 1: read n words through the sideband stream only.

    # Address is taken from element zero.
    mov(unifa, reg_src0)
    # Three delays are required for the data to be ready.
    nop()
    nop()
    sub(r0, reg_n, 1, cond='pushz')
    L.l0
    nop(sig=ldunifa)
    b(R.l0, cond='na0')
    mov(tmud, r5)
    mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_inc)
    sub(r0, r0, 1, cond='pushz')

    # Phase 2: ordinary stream reads src0 while the sideband stream reads
    # src1; each iteration stores one word from each.

    # Ordinary uniform and sideband uniform simultaneous reads.
    b(R.l1, cond='always').unif_addr(reg_src0)
    mov(unifa, reg_src1)
    sub(r0, reg_n, 1, cond='pushz')
    nop()
    L.l1
    nop(sig=ldunif)
    mov(tmud, r5, sig=ldunifa)
    mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_inc)
    b(R.l1, cond='na0')
    mov(tmud, r5)
    mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_inc)
    sub(r0, r0, 1, cond='pushz')

    # Phase 3: both streams are re-addressed inside the loop.

    # Check if the two uniform streams proceed mutually-exclusively.
    #
    # Timeline:
    #
    #  time  | unif     | unifa
    # -------+----------+----------
    #   T0   | set addr |
    #   T1   | load     |
    #   T2   |          | load
    #   T3   |          | set addr
    #   T4   |          | load
    #   T5   | load     |
    #   T0   | set addr |
    #   T1   | load     |
    #   T2   |          | load
    #   T3   |          | set addr
    #   T4   |          | load
    #   T5   | load     |
    #   ...  | ...      | ...

    # Branch takes the second element as a new uniform address.
    quad_rotate(reg_src1, reg_src1, 1)
    # n / 2 iterations; each iteration stores four rows.
    shr(r0, reg_n, 1)
    mov(unifa, reg_src0).add(reg_src0, reg_src0, 4)
    L.l2
    b(R.l3, cond='always').unif_addr(reg_src1)  # T0
    add(reg_src1, reg_src1, 8)
    sub(r0, r0, 1, cond='pushz')
    nop()
    L.l3
    nop(sig=ldunif)  # T1
    mov(tmud, r5)
    mov(tmua, reg_dst, sig=ldunifa).mov(unifa, reg_src0)  # T2, T3
    mov(tmud, r5)
    add(reg_dst, reg_dst, reg_inc)
    add(reg_src0, reg_src0, 8)
    mov(tmua, reg_dst, sig=ldunifa).add(reg_dst, reg_dst, reg_inc)  # T4
    mov(tmud, r5, sig=ldunif)  # T5
    b(R.l2, cond='na0')
    mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_inc)
    mov(tmud, r5)
    mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_inc)

    # Same eight-instruction tail as the other kernels in this repository's
    # tests -- presumably the required program-end sequence; TODO confirm.
    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()


def test_unifa():
    """Run `qpu_unifa` on random inputs and check every row written to dst.

    dst has 5 * n rows of 16 words; all 16 words of a row must be equal:

    * rows [0, n): row i holds src0[i];
    * rows [n, 3n): row n + 2i holds src0[i], row n + 2i + 1 holds src1[i];
    * rows [3n, 5n): within each pair 3n + 2i, 3n + 2i + 1 the src0/src1
      order alternates with the parity of i.
    """

    n = 548

    # The third phase of the kernel runs n / 2 iterations, so n must be even.
    assert n >= 2 and n % 2 == 0

    with Driver() as drv:

        code = drv.program(qpu_unifa)
        unif = drv.alloc(4, dtype='uint32')
        src0 = drv.alloc(n, dtype='uint32')
        src1 = drv.alloc(n, dtype='uint32')
        dst = drv.alloc((n * 5, 16), dtype='uint32')

        # Inputs are drawn from [1, 2**32 - 1), so they are never zero and
        # cannot be confused with the zero-filled dst.
        rng = np.random.default_rng()
        src0[:] = rng.integers(1, 2 ** 32 - 1, size=n)
        src1[:] = rng.integers(1, 2 ** 32 - 1, size=n)
        dst[:, :] = 0

        unif[0] = n
        unif[1] = src0.addresses()[0]
        unif[2] = src1.addresses()[0]
        unif[3] = dst.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        for i in range(n):
            # Phase 1.
            assert all(dst[i, :] == src0[i])
            # Phase 2.
            assert all(dst[n + i * 2 + 0, :] == src0[i])
            assert all(dst[n + i * 2 + 1, :] == src1[i])
            # Phase 3.
            assert all(dst[n * 3 + i * 2 + (i % 2), :] == src1[i])
            assert all(dst[n * 3 + i * 2 + (1 - i % 2), :] == src0[i])
def test_v3d_regs():
    """Compare memory-mapped V3D ident registers with the DRM parameters.

    Each register read through `RegisterMapping` must equal the value the
    kernel driver reports for the matching `V3D_PARAM_*` query. When the
    register mapping cannot be created for lack of privileges the
    comparison is skipped with a message instead of failing.
    """

    with DRM_V3D() as drm:

        try:

            with RegisterMapping() as regmap:

                # (register key, DRM parameter); per-core registers are
                # addressed as (register, core index).
                expected_pairs = (
                    (HUB_UIFCFG, DRM_V3D.V3D_PARAM_V3D_UIFCFG),
                    (HUB_IDENT1, DRM_V3D.V3D_PARAM_V3D_HUB_IDENT1),
                    (HUB_IDENT2, DRM_V3D.V3D_PARAM_V3D_HUB_IDENT2),
                    (HUB_IDENT3, DRM_V3D.V3D_PARAM_V3D_HUB_IDENT3),
                    ((CORE_IDENT0, 0), DRM_V3D.V3D_PARAM_V3D_CORE0_IDENT0),
                    ((CORE_IDENT1, 0), DRM_V3D.V3D_PARAM_V3D_CORE0_IDENT1),
                    ((CORE_IDENT2, 0), DRM_V3D.V3D_PARAM_V3D_CORE0_IDENT2),
                )

                for register_key, drm_param in expected_pairs:
                    assert regmap[register_key] == drm.v3d_get_param(drm_param)

        except PermissionError:

            print('Skipping tests because of a lack of root privilege')
def pack_unpack(pack, unpack, v):
    """Reinterpret the bytes of *v* from one `struct` format as another.

    *v* is packed with the format *pack* and the resulting bytes are
    unpacked with the format *unpack*; the first unpacked item is returned.
    When *v* is a list the conversion is applied to every element and a
    new list is returned.
    """

    def convert(value):
        raw = struct.pack(pack, value)
        return struct.unpack(unpack, raw)[0]

    if not isinstance(v, list):
        return convert(v)

    return list(map(convert, v))
DEFAULT_CODE_AREA_SIZE = 1024 * 1024
DEFAULT_DATA_AREA_SIZE = 32 * 1024 * 1024


class DriverError(Exception):
    """Raised when the code or data area cannot hold a requested object."""


class Array(np.ndarray):
    """`numpy.ndarray` that also records the device address of its data.

    The constructor takes the usual `ndarray` arguments plus the mandatory
    keyword `phyaddr`, stored as the `address` attribute. Only arrays built
    through the constructor get `address`: the class defines no
    `__array_finalize__`, so views and slices do not carry the attribute.
    """

    def __new__(cls, *args, **kwargs):

        # ndarray does not know about `phyaddr`; strip it before delegating.
        phyaddr = kwargs.pop('phyaddr')
        obj = super().__new__(cls, *args, **kwargs)
        obj.address = phyaddr
        return obj

    def addresses(self):
        """Return a uint32 array, shaped like `self`, of element addresses.

        Addresses start at `self.address` and advance by `itemsize` in
        flattened order.
        """

        return np.arange(
            self.address,
            self.address + self.nbytes,
            self.itemsize,
            np.uint32,
        ).reshape(self.shape)


class Memory(object):
    """A V3D buffer object created through DRM and mapped into the process."""

    def __init__(self, drm, size):
        """Create a buffer object of *size* bytes on *drm* and mmap it.

        Whatever was acquired so far is released before an exception from
        the creation or mapping steps is propagated.
        """

        self.drm = drm
        self.size = size
        self.handle = None  # Handle of BO for V3D DRM
        self.phyaddr = None  # Physical address used in QPU
        self.buffer = None  # mmap object of the memory area

        try:

            self.handle, self.phyaddr = drm.v3d_create_bo(size)
            offset = drm.v3d_mmap_bo(self.handle)
            self.buffer = mmap.mmap(fileno=drm.fd, length=size,
                                    flags=mmap.MAP_SHARED,
                                    prot=mmap.PROT_READ | mmap.PROT_WRITE,
                                    offset=offset)

        except Exception:

            self.close()
            # Bare raise keeps the original traceback intact.
            raise

    def close(self):
        """Unmap the buffer and close the GEM handle; safe to call twice."""

        if self.buffer is not None:
            self.buffer.close()

        if self.handle is not None:
            self.drm.gem_close(self.handle)

        self.drm = None
        self.size = None
        self.handle = None
        self.phyaddr = None
        self.buffer = None


class Dispatcher(object):
    """Context manager that submits compute jobs and waits for them on exit."""

    def __init__(self, drm, bo_handles, timeout_sec=10):
        self.drm = drm
        self.bo_handles = bo_handles
        self.timeout_sec = timeout_sec

    def __enter__(self):
        return self

    def __exit__(self, ex_type, ex_value, trace):
        # Multiply rather than divide by 1e-9: 1e-9 is not exactly
        # representable, so the division could land just below the intended
        # integer and be truncated one nanosecond short.
        timeout_ns = int(self.timeout_sec * 1_000_000_000)
        for bo_handle in self.bo_handles:
            self.drm.v3d_wait_bo(bo_handle, timeout_ns=timeout_ns)

    def dispatch(self, code, uniforms=None, workgroup=(16, 1, 1), wgs_per_sg=16, thread=1):
        """Submit one compute-shader job.

        code: `Array` holding the program; its first address is submitted.
        uniforms: address of the uniform stream, or None for 0.
        workgroup: (x, y, z) workgroup counts.
        wgs_per_sg: workgroups per supergroup.
        thread: number of batches (submitted as `thread - 1`).
        """

        wg_x, wg_y, wg_z = workgroup
        wg_size = wg_x * wg_y * wg_z

        def roundup(n, d):
            # Ceiling division.
            return (n + d - 1) // d

        self.drm.v3d_submit_csd(
            cfg=[
                # WGS X, Y, Z and settings
                wg_x << 16,
                wg_y << 16,
                wg_z << 16,
                ((roundup(wgs_per_sg * wg_size, 16) - 1) << 12) |
                (wgs_per_sg << 8) |
                (wg_size & 0xff),
                # Number of batches minus 1
                thread - 1,
                # Shader address, pnan, singleseg, threading
                code.addresses()[0],
                # Uniforms address
                uniforms if uniforms is not None else 0,
            ],
            # Not used in the driver.
            coef=[0, 0, 0, 0],
            bo_handles=self.bo_handles.ctypes.data,
            bo_handle_count=len(self.bo_handles),
            in_sync=0,
            out_sync=0,
        )


class Driver(object):
    """Owns one buffer object split into a code area followed by a data area.

    Programs and arrays are carved out of their area by a simple bump
    allocator; nothing is ever freed individually.
    """

    def __init__(self, *,
                 code_area_size=DEFAULT_CODE_AREA_SIZE,
                 data_area_size=DEFAULT_DATA_AREA_SIZE,
                 ):

        self.code_area_size = code_area_size
        self.data_area_size = data_area_size
        total_size = self.code_area_size + self.data_area_size
        self.code_area_base = 0
        self.data_area_base = self.code_area_base + self.code_area_size
        self.code_pos = self.code_area_base
        self.data_pos = self.data_area_base

        self.drm = None
        self.memory = None
        self.bo_handles = None

        try:

            self.drm = DRM_V3D()

            self.memory = Memory(self.drm, total_size)

            self.bo_handles = np.array([self.memory.handle], dtype=np.uint32)

        except Exception:

            self.close()
            raise

    def close(self):
        """Release the buffer object and the DRM device; safe to call twice."""

        if self.memory is not None:
            self.memory.close()

        if self.drm is not None:
            self.drm.close()

        self.drm = None
        self.memory = None
        self.bo_handles = None

    def __enter__(self):

        return self

    def __exit__(self, exc_type, value, traceback):

        self.close()
        # Falsy whenever an exception is in flight, so it always propagates.
        return exc_type is None

    @staticmethod
    def _requested_nbytes(args, kwargs):
        """Return the byte size implied by ndarray-style arguments.

        Returns None when the shape/dtype cannot be interpreted here, in
        which case the ndarray constructor is left to report the problem.
        """

        if args:
            shape = args[0]
        elif 'shape' in kwargs:
            shape = kwargs['shape']
        else:
            return None

        dtype = args[1] if len(args) > 1 else kwargs.get('dtype', float)

        try:
            count = int(np.prod(shape, dtype=np.int64))
            return count * np.dtype(dtype).itemsize
        except (TypeError, ValueError):
            return None

    def alloc(self, *args, **kwargs):
        """Allocate an `Array` in the data area.

        Arguments are those of `numpy.ndarray` (shape, dtype, ...);
        `buffer`, `offset` and `phyaddr` are supplied by the driver.

        Raises DriverError when the data area is exhausted. The allocation
        cursor only moves when the allocation succeeds.
        """

        offset = self.data_pos
        limit = self.data_area_base + self.data_area_size

        # The data area ends where the buffer ends, so an oversized request
        # would otherwise surface as numpy's "buffer is too small" TypeError.
        # Check the size up front to report it as DriverError.
        nbytes = self._requested_nbytes(args, kwargs)
        if nbytes is not None and offset + nbytes > limit:
            raise DriverError('Data too large')

        kwargs['phyaddr'] = self.memory.phyaddr + offset
        kwargs['buffer'] = self.memory.buffer
        kwargs['offset'] = offset

        arr = Array(*args, **kwargs)

        end = offset + arr.nbytes
        if end > limit:
            raise DriverError('Data too large')

        self.data_pos = end

        return arr

    def dump_code(self, code, *, file=None):
        """Print each instruction as a zero-padded 64-bit hex literal.

        file: destination stream; None means the current `sys.stdout`.
        """
        for insn in code:
            print(f'{insn:#018x}', file=file)

    def dump_program(self, prog, *args, file=None, **kwargs):
        """Assemble *prog* with the given arguments and dump the result."""
        self.dump_code(assemble(prog, *args, **kwargs), file=file)

    def program(self, prog, *args, **kwargs):
        """Place a program in the code area and return it as a uint64 `Array`.

        prog: a callable to be assembled with *args*/*kwargs*, or an already
        assembled sequence of 64-bit instructions.

        Raises DriverError when the code area is exhausted. The allocation
        cursor only moves when the program has been stored.
        """
        asm = assemble(prog, *args, **kwargs) if callable(prog) else prog

        offset = self.code_pos
        end = offset + len(asm) * np.dtype(np.uint64).itemsize
        if end > self.code_area_base + self.code_area_size:
            raise DriverError('Code too large')

        code = Array(
            shape=len(asm),
            dtype=np.uint64,
            phyaddr=self.memory.phyaddr + offset,
            buffer=self.memory.buffer,
            offset=offset,
        )

        code[:] = asm

        self.code_pos = end

        return code

    def compute_shader_dispatcher(self, timeout_sec=10):
        """Return a `Dispatcher` bound to this driver's buffer object."""
        return Dispatcher(self.drm, self.bo_handles, timeout_sec=timeout_sec)

    def execute(self, code, uniforms=None, timeout_sec=10, workgroup=(16, 1, 1), wgs_per_sg=16, thread=1):
        """Dispatch *code* once and wait (up to *timeout_sec*) for completion."""
        with self.compute_shader_dispatcher(timeout_sec) as csd:
            csd.dispatch(code, uniforms=uniforms, workgroup=workgroup, wgs_per_sg=wgs_per_sg, thread=thread)
class DRM_V3D(object):
    """Minimal ioctl binding for the V3D DRM device node.

    Holds one file descriptor for the device and exposes the handful of
    ioctls this package uses. Request structures mirror the kernel uapi
    headers named in the comments below.
    """

    def __init__(self, path='/dev/dri/by-path/platform-fec00000.v3d-card'):
        self.fd = os.open(path, os.O_RDWR)

    def close(self):
        """Close the device descriptor; does nothing when already closed."""
        if self.fd is None:
            return
        os.close(self.fd)
        self.fd = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Falsy whenever an exception is in flight, so it propagates.
        return exc_value is None

    # Derived from linux/include/uapi/drm/drm.h
    DRM_IOCTL_BASE = ord('d')
    DRM_COMMAND_BASE = 0x40
    DRM_GEM_CLOSE = 0x09

    # Derived from linux/include/uapi/drm/v3d_drm.h
    DRM_V3D_WAIT_BO = DRM_COMMAND_BASE + 0x01
    DRM_V3D_CREATE_BO = DRM_COMMAND_BASE + 0x02
    DRM_V3D_MMAP_BO = DRM_COMMAND_BASE + 0x03
    DRM_V3D_GET_PARAM = DRM_COMMAND_BASE + 0x04
    DRM_V3D_SUBMIT_CSD = DRM_COMMAND_BASE + 0x07

    # Values accepted by v3d_get_param().
    V3D_PARAM_V3D_UIFCFG = 0
    V3D_PARAM_V3D_HUB_IDENT1 = 1
    V3D_PARAM_V3D_HUB_IDENT2 = 2
    V3D_PARAM_V3D_HUB_IDENT3 = 3
    V3D_PARAM_V3D_CORE0_IDENT0 = 4
    V3D_PARAM_V3D_CORE0_IDENT1 = 5
    V3D_PARAM_V3D_CORE0_IDENT2 = 6
    V3D_PARAM_SUPPORTS_TFU = 7
    V3D_PARAM_SUPPORTS_CSD = 8

    class st_gem_close(Structure):
        _fields_ = [
            ('handle', c_uint32),
            ('pad', c_uint32),
        ]

    class st_v3d_wait_bo(Structure):
        _fields_ = [
            ('handle', c_uint32),
            ('pad', c_uint32),
            ('timeout_ns', c_uint64),
        ]

    class st_v3d_create_bo(Structure):
        _fields_ = [
            ('size', c_uint32),
            ('flags', c_uint32),
            ('handle', c_uint32),
            ('offset', c_uint32),
        ]

    class st_v3d_mmap_bo(Structure):
        _fields_ = [
            ('handle', c_uint32),
            ('flags', c_uint32),
            ('offset', c_uint64),
        ]

    class st_v3d_get_param(Structure):
        _fields_ = [
            ('param', c_uint32),
            ('pad', c_uint32),
            ('value', c_uint64),
        ]

    class st_v3d_submit_csd(Structure):
        _fields_ = [
            ('cfg', c_uint32 * 7),
            ('coef', c_uint32 * 4),
            ('bo_handles', c_uint64),
            ('bo_handle_count', c_uint32),
            ('in_sync', c_uint32),
            ('out_sync', c_uint32),
        ]

    # ioctl request codes, built from the command numbers and the size of
    # the matching request structure.
    IOCTL_GEM_CLOSE = IOW(DRM_IOCTL_BASE, DRM_GEM_CLOSE, st_gem_close)

    IOCTL_V3D_WAIT_BO = IOWR(DRM_IOCTL_BASE, DRM_V3D_WAIT_BO, st_v3d_wait_bo)
    IOCTL_V3D_CREATE_BO = IOWR(DRM_IOCTL_BASE, DRM_V3D_CREATE_BO,
                               st_v3d_create_bo)
    IOCTL_V3D_MMAP_BO = IOWR(DRM_IOCTL_BASE, DRM_V3D_MMAP_BO, st_v3d_mmap_bo)
    IOCTL_V3D_GET_PARAM = IOWR(DRM_IOCTL_BASE, DRM_V3D_GET_PARAM,
                               st_v3d_get_param)
    IOCTL_V3D_SUBMIT_CSD = IOW(DRM_IOCTL_BASE, DRM_V3D_SUBMIT_CSD,
                               st_v3d_submit_csd)

    def gem_close(self, handle):
        """Release the GEM *handle*."""
        request = self.st_gem_close(handle=handle, pad=0)
        ioctl(self.fd, self.IOCTL_GEM_CLOSE, request)

    def v3d_wait_bo(self, handle, timeout_ns):
        """Issue the WAIT_BO ioctl for *handle* with the given timeout."""
        request = self.st_v3d_wait_bo(handle=handle, pad=0,
                                      timeout_ns=timeout_ns)
        ioctl(self.fd, self.IOCTL_V3D_WAIT_BO, request)

    def v3d_create_bo(self, size, flags=0):
        """Create a buffer object; return its `(handle, offset)` fields."""
        request = self.st_v3d_create_bo(size=size, flags=flags,
                                        handle=0, offset=0)
        ioctl(self.fd, self.IOCTL_V3D_CREATE_BO, request)
        return request.handle, request.offset

    def v3d_mmap_bo(self, handle, flags=0):
        """Return the offset to pass to mmap() for the buffer *handle*."""
        request = self.st_v3d_mmap_bo(handle=handle, flags=flags, offset=0)
        ioctl(self.fd, self.IOCTL_V3D_MMAP_BO, request)
        return request.offset

    def v3d_get_param(self, param):
        """Return the value the driver reports for a `V3D_PARAM_*` id."""
        request = self.st_v3d_get_param(param=param, pad=0, value=0)
        ioctl(self.fd, self.IOCTL_V3D_GET_PARAM, request)
        return request.value

    def v3d_submit_csd(self, cfg, coef, bo_handles, bo_handle_count, in_sync,
                       out_sync):
        """Submit a compute-shader dispatch.

        cfg and coef are sequences of 7 and 4 integers; bo_handles is the
        address of an array of `bo_handle_count` uint32 handles.
        """
        # XXX: Dirty hack!
        cfg_array = (c_uint32 * 7)(*cfg)
        coef_array = (c_uint32 * 4)(*coef)
        request = self.st_v3d_submit_csd(
            cfg=cfg_array,
            coef=coef_array,
            bo_handles=bo_handles,
            bo_handle_count=bo_handle_count,
            in_sync=in_sync,
            out_sync=out_sync,
        )
        ioctl(self.fd, self.IOCTL_V3D_SUBMIT_CSD, request)
4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a copy 6 | * of this software and associated documentation files (the "Software"), to deal 7 | * in the Software without restriction, including without limitation the rights 8 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | * copies of the Software, and to permit persons to whom the Software is 10 | * furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice (including the next 13 | * paragraph) shall be included in all copies or substantial portions of the 14 | * Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | * SOFTWARE. 
23 | */ 24 | 25 | 26 | #if defined(__arm__) && defined(__aarch64__) 27 | #error "__arm__ and __aarch64__ are both defined" 28 | #elif !defined(__arm__) && !defined(__aarch64__) 29 | #error "__arm__ and __aarch64__ are both not defined" 30 | #endif 31 | 32 | 33 | #include 34 | 35 | 36 | uint32_t read4(void * const addr) 37 | { 38 | uint32_t value; 39 | 40 | asm volatile ( 41 | #if defined(__arm__) 42 | "ldr %[value], [%[addr]]\n\t" 43 | #elif defined(__aarch64__) 44 | "ldr %w[value], [%[addr]]\n\t" 45 | #endif 46 | : [value] "=r" (value) 47 | : [addr] "r" (addr) 48 | : "memory" 49 | ); 50 | 51 | return value; 52 | } 53 | 54 | 55 | void write4(void * const addr, const uint32_t value) 56 | { 57 | asm volatile ( 58 | #if defined(__arm__) 59 | "str %[value], [%[addr]]\n\t" 60 | #elif defined(__aarch64__) 61 | "str %w[value], [%[addr]]\n\t" 62 | #endif 63 | : 64 | : [value] "r" (value), 65 | [addr] "r" (addr) 66 | : "memory" 67 | ); 68 | } 69 | -------------------------------------------------------------------------------- /videocore6/v3d.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2014-2018 Broadcom 3 | # Copyright (c) 2019-2020 Idein Inc. 4 | # 5 | # This program is free software; you can redistribute it and/or modify it under 6 | # the terms of the GNU General Public License as published by the Free Software 7 | # Foundation; either version 2 of the License, or (at your option) any later 8 | # version. 9 | # 10 | # This program is distributed in the hope that it will be useful, but WITHOUT 11 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 12 | # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 13 | # details. 14 | # 15 | # You should have received a copy of the GNU General Public License along with 16 | # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin 17 | # Street, Fifth Floor, Boston, MA 02110-1301 USA. 
class HubRegister:
    """A register of the V3D hub block, identified by its byte offset."""

    def __init__(self, offset):

        self.offset = offset


class PerCoreRegister:
    """A register that exists once per V3D core, identified by its offset."""

    def __init__(self, offset):

        self.offset = offset


class HubField:
    """Bit field `high:low` (inclusive) inside a `HubRegister`."""

    def __init__(self, register, high, low):

        assert isinstance(register, HubRegister)
        self.register = register
        self.mask = ((1 << (high - low + 1)) - 1) << low
        self.shift = low


class PerCoreField:
    """Bit field `high:low` (inclusive) inside a `PerCoreRegister`."""

    def __init__(self, register, high, low):

        assert isinstance(register, PerCoreRegister)
        self.register = register
        self.mask = ((1 << (high - low + 1)) - 1) << low
        self.shift = low


# V3D register definitions derived from linux/drivers/gpu/drm/v3d/v3d_regs.h

HUB_AXICFG = HubRegister(0x00000)

HUB_UIFCFG = HubRegister(0x00004)

HUB_IDENT0 = HubRegister(0x00008)

HUB_IDENT1 = HubRegister(0x0000c)
HUB_IDENT1_WITH_MSO = HubField(HUB_IDENT1, 19, 19)
HUB_IDENT1_WITH_TSY = HubField(HUB_IDENT1, 18, 18)
HUB_IDENT1_WITH_TFU = HubField(HUB_IDENT1, 17, 17)
HUB_IDENT1_WITH_L3C = HubField(HUB_IDENT1, 16, 16)
HUB_IDENT1_NHOSTS = HubField(HUB_IDENT1, 15, 12)
HUB_IDENT1_NCORES = HubField(HUB_IDENT1, 11, 8)
HUB_IDENT1_REV = HubField(HUB_IDENT1, 7, 4)
HUB_IDENT1_TVER = HubField(HUB_IDENT1, 3, 0)

HUB_IDENT2 = HubRegister(0x00010)
HUB_IDENT2_WITH_MMU = HubField(HUB_IDENT2, 8, 8)
HUB_IDENT2_L3C_NKB = HubField(HUB_IDENT2, 7, 0)

HUB_IDENT3 = HubRegister(0x00014)
HUB_IDENT3_IPREV = HubField(HUB_IDENT3, 15, 8)
HUB_IDENT3_IPIDX = HubField(HUB_IDENT3, 7, 0)

HUB_TFU_CS = HubRegister(0x00400)


CORE_IDENT0 = PerCoreRegister(0x00000)
CORE_IDENT0_VER = PerCoreField(CORE_IDENT0, 31, 24)

CORE_IDENT1 = PerCoreRegister(0x00004)
CORE_IDENT1_VPM_SIZE = PerCoreField(CORE_IDENT1, 31, 28)
CORE_IDENT1_NSEM = PerCoreField(CORE_IDENT1, 23, 16)
CORE_IDENT1_NTMU = PerCoreField(CORE_IDENT1, 15, 12)
CORE_IDENT1_QUPS = PerCoreField(CORE_IDENT1, 11, 8)
CORE_IDENT1_NSLC = PerCoreField(CORE_IDENT1, 7, 4)
CORE_IDENT1_REV = PerCoreField(CORE_IDENT1, 3, 0)

CORE_IDENT2 = PerCoreRegister(0x00008)
CORE_IDENT2_BCG = PerCoreField(CORE_IDENT2, 28, 28)

CORE_MISCCFG = PerCoreRegister(0x00018)
CORE_MISCCFG_QRMAXCNT = PerCoreField(CORE_MISCCFG, 3, 1)
CORE_MISCCFG_OVRTMUOUT = PerCoreField(CORE_MISCCFG, 0, 0)

CORE_L2CACTL = PerCoreRegister(0x00020)
CORE_L2CACTL_L2CCLR = PerCoreField(CORE_L2CACTL, 2, 2)
CORE_L2CACTL_L2CDIS = PerCoreField(CORE_L2CACTL, 1, 1)
CORE_L2CACTL_L2CENA = PerCoreField(CORE_L2CACTL, 0, 0)

CORE_SLCACTL = PerCoreRegister(0x00024)
CORE_SLCACTL_TVCCS = PerCoreField(CORE_SLCACTL, 27, 24)
CORE_SLCACTL_TDCCS = PerCoreField(CORE_SLCACTL, 19, 16)
CORE_SLCACTL_UCC = PerCoreField(CORE_SLCACTL, 11, 8)
CORE_SLCACTL_ICC = PerCoreField(CORE_SLCACTL, 3, 0)

CORE_PCTR_0_EN = PerCoreRegister(0x00650)
CORE_PCTR_0_CLR = PerCoreRegister(0x00654)
CORE_PCTR_0_OVERFLOW = PerCoreRegister(0x00658)


def _define_pctr_registers(namespace):
    """Add the generated performance-counter definitions to *namespace*.

    Creates, for every group of four counters starting at `first`:

    * `CORE_PCTR_0_SRC_{first}_{first+3}`: the source-select register;
    * `CORE_PCTR_0_SRC_{first}_{first+3}_S{k}` and `CORE_PCTR_0_SRC_{k}`:
      the 7-bit source field of counter k, one per byte of that register;

    and `CORE_PCTR_0_PCTR{k}` for each of the 32 counter value registers.

    Done in a function so that no loop temporaries end up as module globals
    (and therefore in the namespace of `from videocore6.v3d import *`).
    """

    for first in range(0, 32, 4):
        group = f'CORE_PCTR_0_SRC_{first}_{first + 3}'
        register = PerCoreRegister(0x00660 + first)
        namespace[group] = register
        for lane in range(4):
            low = 8 * lane
            high = low + 6
            counter = first + lane
            namespace[f'{group}_S{counter}'] = PerCoreField(register, high, low)
            namespace[f'CORE_PCTR_0_SRC_{counter}'] = PerCoreField(register, high, low)

    for counter in range(32):
        namespace[f'CORE_PCTR_0_PCTR{counter}'] = PerCoreRegister(0x00680 + 4 * counter)


_define_pctr_registers(globals())

CORE_PCTR_CYCLE_COUNT = 32


class RegisterMapping:
    """Direct access to V3D registers through /dev/mem.

    Index with a hub register/field (`regmap[HUB_IDENT1]`) or with a
    `(per-core register/field, core)` tuple (`regmap[CORE_IDENT0, 0]`).
    Reading a field returns its value shifted down to bit 0; writing a field
    is a read-modify-write of the containing register.

    The mappings are not released by `__exit__`; they stay valid for as long
    as the object is alive.
    """

    def __init__(self):
        """Load the readwrite4 helper library and map the register windows.

        Raises Exception when the helper library cannot be found, and
        whatever `os.open`/`mmap.mmap` raise (for instance PermissionError)
        when /dev/mem cannot be opened or mapped.
        """

        stem = Path(__file__).parent / 'readwrite4'
        for suffix in EXTENSION_SUFFIXES:
            try:
                lib = cdll.LoadLibrary(stem.with_suffix(suffix))
            except OSError:
                continue
            else:
                break
        else:
            raise Exception('readwrite4 library is not found.'
                            + ' Your installation seems to be broken.')

        self.read4 = lib.read4
        self.write4 = lib.write4
        del stem, lib

        self.read4.argtypes = [c_void_p]
        self.read4.restype = c_uint32
        self.write4.argtypes = [c_void_p, c_uint32]
        self.write4.restype = None

        fd = os.open('/dev/mem', os.O_RDWR)

        # XXX: Should use bcm_host_get_peripheral_address for the base address
        # on userland, and consult /proc/device-tree/__symbols__/v3d and then
        # /proc/device-tree/v3dbus/v3d@7ec04000/{reg-names,reg} for the offsets
        # in the future.

        try:

            self.map_hub = mmap.mmap(offset=0xfec00000, length=0x4000, fileno=fd,
                                     flags=mmap.MAP_SHARED,
                                     prot=mmap.PROT_READ | mmap.PROT_WRITE)
            # The temporary array only serves to obtain the mapping's address.
            self.ptr_hub = np.frombuffer(self.map_hub).ctypes.data

            self.ncores = 1
            self.map_cores = [None] * self.ncores
            self.ptr_cores = [None] * self.ncores
            for core in range(self.ncores):
                self.map_cores[core] = mmap.mmap(offset=0xfec04000 + 0x4000 * core,
                                                 length=0x4000, fileno=fd,
                                                 flags=mmap.MAP_SHARED,
                                                 prot=mmap.PROT_READ | mmap.PROT_WRITE)
                self.ptr_cores[core] = \
                    np.frombuffer(self.map_cores[core]).ctypes.data

        finally:

            # The descriptor is only needed to create the mappings; close it
            # even when a mapping fails so that it is not leaked.
            os.close(fd)

    def __enter__(self):

        return self

    def __exit__(self, exc_type, exc_value, traceback):

        pass

    def _get_ptr(self, key, core):
        """Return the address of the register designated by *key*.

        key: a register or field object; a field resolves to its register.
        core: core index for per-core keys, None for hub keys.

        Raises TypeError for a per-core key without a core index and for a
        key of any other type; returning None instead would make
        read4/write4 access address 0.
        """

        if isinstance(key, (HubField, PerCoreField)):
            return self._get_ptr(key.register, core)
        elif isinstance(key, HubRegister):
            assert core is None
            return self.ptr_hub + key.offset
        elif isinstance(key, PerCoreRegister):
            if core is None:
                raise TypeError('a core index is required for per-core registers')
            return self.ptr_cores[core] + key.offset

        raise TypeError(f'unsupported register key: {key!r}')

    def __getitem__(self, key):

        core = None
        if isinstance(key, tuple):
            key, core = key

        v = self.read4(self._get_ptr(key, core))

        if isinstance(key, (HubField, PerCoreField)):
            v = (v & key.mask) >> key.shift

        return v

    def __setitem__(self, key, value):

        core = None
        if isinstance(key, tuple):
            key, core = key

        if isinstance(key, (HubField, PerCoreField)):
            # Merge the new field value into the current register contents.
            value = (self[key.register, core] & ~key.mask) \
                | ((value << key.shift) & key.mask)

        self.write4(self._get_ptr(key, core), value)


class PerformanceCounter:
    """Context manager that programs and enables V3D performance counters.

    srcs[k] is written to the source field of counter k on entry; `result()`
    returns the values of those counters in the same order. The counters
    are disabled and cleared again on exit.
    """

    _PCTR_SRCs = [globals()[f'CORE_PCTR_0_SRC_{n}'] for n in range(32)]
    _PCTRs = [globals()[f'CORE_PCTR_0_PCTR{n}'] for n in range(32)]

    def __init__(self, regmap, srcs):
        """
        regmap: the `RegisterMapping` to operate on.
        srcs: sequence of at most 32 counter source ids.

        Raises ValueError when more sources are given than counters exist,
        instead of failing midway through programming the registers.
        """

        if len(srcs) > len(self._PCTRs):
            raise ValueError(f'at most {len(self._PCTRs)} counter sources are supported')

        self.regmap = regmap
        self.srcs = srcs
        self.core = 0  # Sufficient for now.
        self.mask = (1 << len(self.srcs)) - 1

    def __enter__(self):

        # Stop the counters while their sources are being changed.
        self.regmap[CORE_PCTR_0_EN, self.core] = 0

        for field, src in zip(self._PCTR_SRCs, self.srcs):
            self.regmap[field, self.core] = src

        self.regmap[CORE_PCTR_0_CLR, self.core] = self.mask
        self.regmap[CORE_PCTR_0_OVERFLOW, self.core] = self.mask
        self.regmap[CORE_PCTR_0_EN, self.core] = self.mask

        return self

    def __exit__(self, exc_type, exc_value, traceback):

        self.regmap[CORE_PCTR_0_EN, self.core] = 0
        self.regmap[CORE_PCTR_0_CLR, self.core] = self.mask
        self.regmap[CORE_PCTR_0_OVERFLOW, self.core] = self.mask

    def result(self):
        """Return the current value of each configured counter as a list."""

        return [self.regmap[register, self.core]
                for register in self._PCTRs[:len(self.srcs)]]