├── .editorconfig
├── .github
    └── workflows
    │   └── main.yml
├── .gitignore
├── LICENSE.GPL2
├── README.md
├── benchmarks
    ├── bench_helper.py
    ├── test_dispatch.py
    ├── test_gpu_clock.py
    └── test_tmu_performance.py
├── examples
    ├── memset.py
    ├── pctr_gpu_clock.py
    ├── scopy.py
    ├── sgemm.py
    └── summation.py
├── setup.py
├── tests
    ├── test_alu.py
    ├── test_branch.py
    ├── test_condition_codes.py
    ├── test_driver.py
    ├── test_drm.py
    ├── test_labels.py
    ├── test_parallel.py
    ├── test_sfu.py
    ├── test_signals.py
    ├── test_tmu.py
    ├── test_unifa.py
    └── test_v3d.py
└── videocore6
    ├── __init__.py
    ├── assembler.py
    ├── driver.py
    ├── drm_v3d.py
    ├── readwrite4.c
    └── v3d.py


/.editorconfig:
--------------------------------------------------------------------------------
 1 | # .editorconfig -- Config file for EditorConfig. http://editorconfig.org/
 2 | 
 3 | [*]
 4 | charset = utf-8
 5 | end_of_line = lf
 6 | trim_trailing_whitespace = true
 7 | insert_final_newline = true
 8 | 
 9 | [*.py]
10 | indent_style = space
11 | indent_size = 4
12 | 
13 | [*.c]
14 | indent_style = space
15 | indent_size = 4
16 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on: [push]
 4 | 
 5 | jobs:
 6 | 
 7 |   test:
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |       - uses: actions/checkout@v1
11 |       - name: Set up Python 3.9
12 |         uses: actions/setup-python@v1
13 |         with:
14 |           python-version: 3.9
15 |       - name: Test code format
16 |         run: |
17 |           pip3 install autopep8
18 |           autopep8 --diff --exit-code --max-line-length 128 --recursive videocore6
19 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | # Created by https://www.gitignore.io/api/git,vim,linux,emacs,python
  4 | # Edit at https://www.gitignore.io/?templates=git,vim,linux,emacs,python
  5 | 
  6 | ### Emacs ###
  7 | # -*- mode: gitignore; -*-
  8 | *~
  9 | \#*\#
 10 | /.emacs.desktop
 11 | /.emacs.desktop.lock
 12 | *.elc
 13 | auto-save-list
 14 | tramp
 15 | .\#*
 16 | 
 17 | # Org-mode
 18 | .org-id-locations
 19 | *_archive
 20 | 
 21 | # flymake-mode
 22 | *_flymake.*
 23 | 
 24 | # eshell files
 25 | /eshell/history
 26 | /eshell/lastdir
 27 | 
 28 | # elpa packages
 29 | /elpa/
 30 | 
 31 | # reftex files
 32 | *.rel
 33 | 
 34 | # AUCTeX auto folder
 35 | /auto/
 36 | 
 37 | # cask packages
 38 | .cask/
 39 | dist/
 40 | 
 41 | # Flycheck
 42 | flycheck_*.el
 43 | 
 44 | # server auth directory
 45 | /server/
 46 | 
 47 | # projectiles files
 48 | .projectile
 49 | 
 50 | # directory configuration
 51 | .dir-locals.el
 52 | 
 53 | # network security
 54 | /network-security.data
 55 | 
 56 | 
 57 | ### Git ###
 58 | # Created by git for backups. To disable backups in Git:
 59 | # $ git config --global mergetool.keepBackup false
 60 | *.orig
 61 | 
 62 | # Created by git when using merge tools for conflicts
 63 | *.BACKUP.*
 64 | *.BASE.*
 65 | *.LOCAL.*
 66 | *.REMOTE.*
 67 | *_BACKUP_*.txt
 68 | *_BASE_*.txt
 69 | *_LOCAL_*.txt
 70 | *_REMOTE_*.txt
 71 | 
 72 | ### Linux ###
 73 | 
 74 | # temporary files which can be created if a process still has a handle open of a deleted file
 75 | .fuse_hidden*
 76 | 
 77 | # KDE directory preferences
 78 | .directory
 79 | 
 80 | # Linux trash folder which might appear on any partition or disk
 81 | .Trash-*
 82 | 
 83 | # .nfs files are created when an open file is removed but is still being accessed
 84 | .nfs*
 85 | 
 86 | ### Python ###
 87 | # Byte-compiled / optimized / DLL files
 88 | __pycache__/
 89 | *.py[cod]
 90 | *$py.class
 91 | 
 92 | # C extensions
 93 | *.so
 94 | 
 95 | # Distribution / packaging
 96 | .Python
 97 | build/
 98 | develop-eggs/
 99 | downloads/
100 | eggs/
101 | .eggs/
102 | lib/
103 | lib64/
104 | parts/
105 | sdist/
106 | var/
107 | wheels/
108 | pip-wheel-metadata/
109 | share/python-wheels/
110 | *.egg-info/
111 | .installed.cfg
112 | *.egg
113 | MANIFEST
114 | 
115 | # PyInstaller
116 | #  Usually these files are written by a python script from a template
117 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
118 | *.manifest
119 | *.spec
120 | 
121 | # Installer logs
122 | pip-log.txt
123 | pip-delete-this-directory.txt
124 | 
125 | # Unit test / coverage reports
126 | htmlcov/
127 | .tox/
128 | .nox/
129 | .coverage
130 | .coverage.*
131 | .cache
132 | nosetests.xml
133 | coverage.xml
134 | *.cover
135 | .hypothesis/
136 | .pytest_cache/
137 | 
138 | # Translations
139 | *.mo
140 | *.pot
141 | 
142 | # Scrapy stuff:
143 | .scrapy
144 | 
145 | # Sphinx documentation
146 | docs/_build/
147 | 
148 | # PyBuilder
149 | target/
150 | 
151 | # pyenv
152 | .python-version
153 | 
154 | # pipenv
155 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
156 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
157 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
158 | #   install all needed dependencies.
159 | #Pipfile.lock
160 | 
161 | # celery beat schedule file
162 | celerybeat-schedule
163 | 
164 | # SageMath parsed files
165 | *.sage.py
166 | 
167 | # Spyder project settings
168 | .spyderproject
169 | .spyproject
170 | 
171 | # Rope project settings
172 | .ropeproject
173 | 
174 | # Mr Developer
175 | .mr.developer.cfg
176 | .project
177 | .pydevproject
178 | 
179 | # mkdocs documentation
180 | /site
181 | 
182 | # mypy
183 | .mypy_cache/
184 | .dmypy.json
185 | dmypy.json
186 | 
187 | # Pyre type checker
188 | .pyre/
189 | 
190 | ### Vim ###
191 | # Swap
192 | [._]*.s[a-v][a-z]
193 | [._]*.sw[a-p]
194 | [._]s[a-rt-v][a-z]
195 | [._]ss[a-gi-z]
196 | [._]sw[a-p]
197 | 
198 | # Session
199 | Session.vim
200 | Sessionx.vim
201 | 
202 | # Temporary
203 | .netrwhist
204 | # Auto-generated tag files
205 | tags
206 | # Persistent undo
207 | [._]*.un~
208 | 
209 | # End of https://www.gitignore.io/api/git,vim,linux,emacs,python
210 | 


--------------------------------------------------------------------------------
/LICENSE.GPL2:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     <one line to give the program's name and a brief idea of what it does.>
294 |     Copyright (C) <year>  <name of author>
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   <signature of Ty Coon>, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # py-videocore6
  2 | 
  3 | A Python library for GPGPU programming on Raspberry Pi 4, which realizes
  4 | assembling and running QPU programs.
  5 | 
  6 | For Raspberry Pi Zero/1/2/3, use
  7 | [nineties/py-videocore](https://github.com/nineties/py-videocore) instead.
  8 | 
  9 | 
 10 | ## About VideoCore VI QPU
 11 | 
 12 | Raspberry Pi 4 (BCM2711) has a GPU named VideoCore VI QPU in its SoC.
 13 | The basic instruction set (add/mul ALU dual issue, three delay slots, etc.)
 14 | remains the same as VideoCore IV QPU of Raspberry Pi Zero/1/2/3, but some units
 15 | now behave differently.
 16 | For instance, the TMU can now write to memory in addition to read, and it seems
 17 | that the VPM DMA is no longer available.
 18 | 
 19 | The theoretical peak performance of the QPUs is as follows.
 20 | 
 21 | - VideoCore IV QPU @ 250MHz: 250 [MHz] x 3 [slice] x 4 [qpu/slice] x 4 [physical core/qpu] x 2 [op/cycle] = 24 [Gflop/s]
 22 | - VideoCore IV QPU @ 300MHz: 300 [MHz] x 3 [slice] x 4 [qpu/slice] x 4 [physical core/qpu] x 2 [op/cycle] = 28.8 [Gflop/s]
 23 | - VideoCore VI QPU @ 500MHz: 500 [MHz] x 2 [slice] x 4 [qpu/slice] x 4 [physical core/qpu] x 2 [op/cycle] = 32 [Gflop/s]
 24 | 
 25 | 
 26 | ## Requirements
 27 | 
 28 | `py-videocore6` communicates with the V3D hardware through `/dev/dri/card0`,
 29 | which is exposed by the DRM V3D driver.
 30 | To access the device, you need to belong to the `video` group or be the `root` user.
 31 | If you choose the former, run `sudo usermod --append --groups video $USER`
 32 | (re-login to take effect).
 33 | 
 34 | 
 35 | ## Installation
 36 | 
 37 | You can install `py-videocore6` directly using `pip`:
 38 | 
 39 | ```console
 40 | $ sudo apt update
 41 | $ sudo apt upgrade
 42 | $ sudo apt install python3-pip python3-numpy
 43 | $ pip3 install --user --upgrade pip setuptools wheel
 44 | $ pip3 install --user git+https://github.com/Idein/py-videocore6.git
 45 | ```
 46 | 
 47 | If you want to run the tests and examples, install `py-videocore6` after
 48 | cloning it:
 49 | 
 50 | ```console
 51 | $ sudo apt update
 52 | $ sudo apt upgrade
 53 | $ sudo apt install python3-pip python3-numpy libatlas3-base
 54 | $ python3 -m pip install --user --upgrade pip setuptools wheel
 55 | $ git clone https://github.com/Idein/py-videocore6.git
 56 | $ cd py-videocore6/
 57 | $ python3 -m pip install --target sandbox/ --upgrade . nose
 58 | ```
 59 | 
 60 | 
 61 | ## Running tests and examples
 62 | 
 63 | In the `py-videocore6` directory cloned above:
 64 | 
 65 | ```console
 66 | $ python3 setup.py build_ext --inplace
 67 | $ PYTHONPATH=sandbox/ python3 -m nose -v -s
 68 | ```
 69 | 
 70 | ```console
 71 | $ PYTHONPATH=sandbox/ python3 examples/sgemm.py
 72 | ==== sgemm example (1024x1024 times 1024x1024) ====
 73 | numpy: 0.6986 sec, 3.078 Gflop/s
 74 | QPU:   0.5546 sec, 3.878 Gflop/s
 75 | Minimum absolute error: 0.0
 76 | Maximum absolute error: 0.0003814697265625
 77 | Minimum relative error: 0.0
 78 | Maximum relative error: 0.13375753164291382
 79 | ```
 80 | 
 81 | ```console
 82 | $ PYTHONPATH=sandbox/ python3 examples/summation.py
 83 | ==== summaton example (32.0 Mi elements) ====
 84 | Preparing for buffers...
 85 | Executing on QPU...
 86 | 0.01853448400004254 sec, 7241.514141947083 MB/s
 87 | ```
 88 | 
 89 | ```console
 90 | $ PYTHONPATH=sandbox/ python3 examples/memset.py
 91 | ==== memset example (64.0 MiB) ====
 92 | Preparing for buffers...
 93 | Executing on QPU...
 94 | 0.01788834699993913 sec, 3751.5408215319367 MB/s
 95 | ```
 96 | 
 97 | ```console
 98 | $ PYTHONPATH=sandbox/ python3 examples/scopy.py
 99 | ==== scopy example (16.0 Mi elements) ====
100 | Preparing for buffers...
101 | Executing on QPU...
102 | 0.02768789600000332 sec, 2423.761776625857 MB/s
103 | ```
104 | 
105 | ```console
106 | $ sudo PYTHONPATH=sandbox/ python3 examples/pctr_gpu_clock.py
107 | ==== QPU clock measurement with performance counters ====
108 | 500.529835 MHz
109 | ```
110 | 
111 | You may see lower performance without `force_turbo=1` in `/boot/config.txt`.
112 | 
113 | 
114 | ## References
115 | 
116 | - DRM V3D driver which controls QPU via hardware V3D registers: [linux/drivers/gpu/drm/v3d](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/drivers/gpu/drm/v3d)
117 | - Mesa library which partially includes the QPU instruction set: [mesa/src/broadcom/qpu](https://gitlab.freedesktop.org/mesa/mesa/-/tree/main/src/broadcom/qpu)
118 | - Mesa also includes QPU program disassembler, which can be tested with: [Terminus-IMRC/vc6qpudisas](https://github.com/Terminus-IMRC/vc6qpudisas)
119 | 


--------------------------------------------------------------------------------
/benchmarks/bench_helper.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright (c) 2019-2020 Idein Inc.
 3 | #
 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | # of this software and associated documentation files (the "Software"), to deal
 6 | # in the Software without restriction, including without limitation the rights
 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | # copies of the Software, and to permit persons to whom the Software is
 9 | # furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice (including the next
12 | # paragraph) shall be included in all copies or substantial portions of the
13 | # Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | import subprocess
24 | from ctypes import cdll
25 | import numpy as np
26 | 
class BenchHelper(object):
    """Thin ctypes wrapper around a tiny C busy-wait helper.

    The helper library exports ``wait_address``, which spins until the 32-bit
    word it is given becomes non-zero.  If the shared object cannot be loaded
    from ``path``, it is built on the fly with gcc and then loaded again.
    """

    # C source of the helper; fed to gcc on stdin when the library is missing.
    _SOURCE = '''
#include <stdint.h>
void wait_address(uint32_t volatile * p) {
    while(p[0] == 0){}
}
'''

    def __init__(self, path = './libbench_helper.so'):
        """Load the helper library at ``path``, compiling it first if needed.

        Raises:
            OSError: if the library can neither be loaded nor built.
        """

        try:
            self.lib = cdll.LoadLibrary(path)
        except OSError:
            # Pass the arguments as a list so that a path containing
            # whitespace is not split into several gcc arguments.
            command = ['gcc', '-O2', '-shared', '-fPIC', '-o', path, '-xc', '-']
            try:
                subprocess.run(command, text=True, input=self._SOURCE, check=True)
            except subprocess.CalledProcessError as err:
                # Report the compiler failure itself rather than the less
                # helpful "cannot open shared object" error of the reload.
                raise OSError(
                    f'failed to build {path}: gcc exited with status {err.returncode}'
                ) from err
            self.lib = cdll.LoadLibrary(path)

        # wait_address takes a pointer to exactly one contiguous uint32.
        self.lib.wait_address.argtypes = [
            np.ctypeslib.ndpointer(dtype=np.uint32, shape=(1,), flags="C_CONTIGUOUS"),
        ]

    def wait_address(self, done):
        """Busy-wait until ``done[0]`` becomes non-zero."""
        self.lib.wait_address(done)
51 | 


--------------------------------------------------------------------------------
/benchmarks/test_dispatch.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2019-2020 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | import time
 24 | from videocore6.driver import Driver
 25 | from videocore6.assembler import qpu
 26 | import numpy as np
 27 | from bench_helper import BenchHelper
 28 | 
@qpu
def qpu_write_N(asm, N):
    """QPU kernel: store the constant ``N`` once per SIMD element, then set a flag.

    Two uniforms are consumed.  Judging from test_multiple_dispatch_delay,
    the first is the address of one row of ``data`` and the second is the
    address of the ``done`` word polled by the host.
    """

    # r0 = element index.  The ldunif signal fetches the first uniform;
    # presumably it lands in r5, which is used as the base address below.
    eidx(r0, sig = ldunif)
    # Second uniform -> rf0.
    nop(sig = ldunifrf(rf0))
    # r0 <<= 2: turn the element index into a byte offset of 4-byte words.
    shl(r0, r0, 2)
    # TMU write: data = N, address = base + per-element offset.
    mov(tmud, N)
    add(tmua, r5, r0)
    # Wait for the TMU write before signalling completion.
    tmuwt()

    # Write 1 to the address held in rf0 (the completion flag).
    mov(tmud, 1)
    mov(tmua, rf0)
    tmuwt()

    # Epilogue: the same thrsw/nop sequence closes every kernel in these
    # files -- presumably the required end-of-program signalling.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
 51 | 
 52 | def test_multiple_dispatch_delay():
 53 |     print()
 54 | 
 55 |     bench = BenchHelper('benchmarks/libbench_helper.so')
 56 | 
 57 |     with Driver() as drv:
 58 | 
 59 |         data = drv.alloc((10, 16), dtype = 'uint32')
 60 |         code = [drv.program(lambda asm: qpu_write_N(asm, i)) for i in range(data.shape[0])]
 61 |         unif = drv.alloc((data.shape[0], 2), dtype = 'uint32')
 62 |         done = drv.alloc(1, dtype = 'uint32')
 63 | 
 64 |         data[:] = 0
 65 |         unif[:,0] = data.addresses()[:,0]
 66 |         unif[:,1] = done.addresses()[0]
 67 | 
 68 |         ref_start = time.time()
 69 |         with drv.compute_shader_dispatcher() as csd:
 70 |             for i in range(data.shape[0]):
 71 |                 csd.dispatch(code[i], unif.addresses()[i,0])
 72 |         ref_end = time.time()
 73 |         assert (data == np.arange(data.shape[0]).reshape(data.shape[0],1)).all()
 74 | 
 75 |         data[:] = 0
 76 | 
 77 |         naive_results = np.zeros(data.shape[0], dtype='float32')
 78 |         with drv.compute_shader_dispatcher() as csd:
 79 |             for i in range(data.shape[0]):
 80 |                 done[:] = 0
 81 |                 start = time.time()
 82 |                 csd.dispatch(code[i], unif.addresses()[i,0])
 83 |                 bench.wait_address(done)
 84 |                 end = time.time()
 85 |                 naive_results[i] = end - start
 86 |         assert (data == np.arange(data.shape[0]).reshape(data.shape[0],1)).all()
 87 | 
 88 |         sleep_results = np.zeros(data.shape[0], dtype='float32')
 89 |         with drv.compute_shader_dispatcher() as csd:
 90 |             for i in range(data.shape[0]):
 91 |                 done[:] = 0
 92 |                 time.sleep(1)
 93 |                 start = time.time()
 94 |                 csd.dispatch(code[i], unif.addresses()[i,0])
 95 |                 bench.wait_address(done)
 96 |                 end = time.time()
 97 |                 sleep_results[i] = end - start
 98 |         assert (data == np.arange(data.shape[0]).reshape(data.shape[0],1)).all()
 99 | 
100 |         print
101 |         print(f'API wait after {data.shape[0]} dispatch: {ref_end - ref_start:.6f} sec')
102 |         print(f'polling wait for each {data.shape[0]} dispatch:')
103 |         print(f'    total: {np.sum(naive_results):.6f} sec')
104 |         print(f'    details: {" ".join([f"{t:.6f}" for t in naive_results])}')
105 |         print(f'polling wait for each {data.shape[0]} dispatch with between sleep:')
106 |         print(f'    total: {np.sum(sleep_results):.6f} sec + sleep...')
107 |         print(f'    details: {" ".join([f"{t:.6f}" for t in sleep_results])}')
108 | 


--------------------------------------------------------------------------------
/benchmarks/test_gpu_clock.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright (c) 2019-2020 Idein Inc.
 3 | #
 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | # of this software and associated documentation files (the "Software"), to deal
 6 | # in the Software without restriction, including without limitation the rights
 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | # copies of the Software, and to permit persons to whom the Software is
 9 | # furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice (including the next
12 | # paragraph) shall be included in all copies or substantial portions of the
13 | # Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | import time
24 | from videocore6.driver import Driver
25 | from videocore6.assembler import qpu
26 | from bench_helper import BenchHelper
27 | 
@qpu
def qpu_clock(asm):
    """QPU kernel: spin in a fixed-length countdown loop, then set a flag.

    Two uniforms are consumed.  In test_clock the first is the iteration
    count ``f`` and the second is the address of the ``done`` word.
    """

    # First uniform; presumably it lands in r5, the counter used below.
    nop(sig = ldunif)
    # Second uniform -> rf0 (address written after the loop).
    nop(sig = ldunifrf(rf0))

    # The loop body is five instructions (sub, branch, three nops), which
    # matches the ``f * 5`` factor in test_clock's clock estimate.
    with loop as l:
        # Decrement the counter and push the 'pushn' condition flag.
        sub(r5, r5, 1, cond = 'pushn')
        l.b(cond = 'anyna')
        # Presumably the three branch delay slots -- other kernels in this
        # repository label the instructions after a branch that way.
        nop()
        nop()
        nop()

    # Write 1 to the address held in rf0 and wait for the TMU write.
    mov(tmud, 1)
    mov(tmua, rf0)
    tmuwt()

    # Epilogue: the same thrsw/nop sequence closes every kernel in these
    # files -- presumably the required end-of-program signalling.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
53 | 
54 | 
def test_clock():
    """Estimate the QPU clock by timing a countdown loop of known length.

    Dispatches qpu_clock with ``f`` iterations, busy-polls the ``done`` word
    and prints the elapsed time together with the derived frequency.
    """
    print()

    bench = BenchHelper('benchmarks/libbench_helper.so')

    with Driver() as drv:

        f = pow(2, 25)

        code = drv.program(qpu_clock)
        unif = drv.alloc(2, dtype = 'uint32')
        done = drv.alloc(1, dtype = 'uint32')

        done[:] = 0

        unif[0] = f
        unif[1] = done.addresses()[0]

        with drv.compute_shader_dispatcher() as csd:
            # perf_counter is monotonic and high resolution; the wall clock
            # can jump while the measurement is in flight.
            start = time.perf_counter()
            csd.dispatch(code, unif.addresses()[0])
            bench.wait_address(done)
            end = time.perf_counter()

        elapsed = end - start

        print(f'{elapsed:.6f} sec')
        # f iterations of the 5-instruction loop body of qpu_clock; the
        # trailing factor 4 is kept from the original formula (presumably
        # clock cycles per instruction -- TODO confirm).
        print(f'{f * 5 / elapsed / 1000 / 1000 * 4:.6f} MHz')
81 | 


--------------------------------------------------------------------------------
/benchmarks/test_tmu_performance.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2019-2020 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | 
 24 | import time
 25 | from videocore6.driver import Driver
 26 | from videocore6.assembler import qpu
 27 | import numpy as np
 28 | import matplotlib.pyplot as plt
 29 | from bench_helper import BenchHelper
 30 | 
 31 | 
@qpu
def qpu_tmu_load_1_slot_1_qpu(asm, nops):
    """QPU kernel: sum TMU-loaded floats with ``nops`` nops of slack per load.

    Each loop iteration issues one TMU read, waits ``nops`` instructions,
    receives the value and accumulates it in r2.  After the loop the sum is
    stored to Y and 1 is written to the ``done`` word.  Only the thread whose
    ``(tidx >> 2) & 0b1111`` is zero does this work; all others branch
    straight to the ``done`` label.

    Uniform meanings below follow test_tmu_load_1_slot_1_qpu.
    """

    nop(sig = ldunifrf(rf0)) # loop count
    nop(sig = ldunifrf(rf1)) # address of X
    nop(sig = ldunifrf(rf2)) # byte stride added to the address every iteration
    nop(sig = ldunifrf(rf3)) # byte stride between SIMD elements
    nop(sig = ldunifrf(rf4)) # address of Y
    nop(sig = ldunifrf(rf5)) # address of done

    barrierid(syncb, sig = thrsw)
    nop()
    nop()

    # r0 = (tidx >> 2) & 0b1111 -- presumably the QPU number (memset.py
    # stores the same expression in reg_qpu_num).  Skip to 'done' unless it
    # is zero.
    tidx(r0)
    shr(r0, r0, 2)
    band(r0, r0, 0b1111, cond = 'pushz')
    b(R.done, cond = 'allna')
    nop() # delay slot
    nop() # delay slot
    nop() # delay slot

    # Y address += 4 * element index (one float per element).
    eidx(r0)
    shl(r0, r0, 2)
    add(rf4, rf4, r0)

    # X address += element index * per-element stride.
    eidx(r0)
    umul24(r0, r0, rf3)
    add(rf1, rf1, r0)

    # r2 is the running sum.
    mov(r2, 0.0)
    with loop as l:
        # Issue the TMU read and advance the address for the next iteration.
        mov(tmua, rf1).add(rf1, rf1, rf2)
        # Slack between the request and the ldtmu signal being benchmarked.
        for i in range(nops):
            nop()
        # Receive the loaded value into r3.
        nop(sig = ldtmu(r3))
        sub(rf0, rf0, 1, cond = 'pushz')
        l.b(cond = 'anyna')
        fadd(r2, r2, r3) # delay slot
        nop()            # delay slot
        nop()            # delay slot

    # Store the sum to Y.
    mov(tmud, r2)
    mov(tmua, rf4)
    tmuwt()

    # Signal completion: write 1 to done.
    mov(tmud, 1)
    mov(tmua, rf5)
    tmuwt()

    L.done
    barrierid(syncb, sig = thrsw)
    nop()
    nop()

    # Epilogue: the same thrsw/nop sequence closes every kernel in these
    # files -- presumably the required end-of-program signalling.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
 95 | 
 96 | def test_tmu_load_1_slot_1_qpu():
 97 | 
 98 |     bench = BenchHelper('benchmarks/libbench_helper.so')
 99 | 
100 |     for trans in [False, True]:
101 | 
102 |         with Driver() as drv:
103 | 
104 |             loop = 2**15
105 | 
106 |             X = drv.alloc((16, loop) if trans else (loop, 16), dtype = 'float32')
107 |             Y = drv.alloc(16, dtype = 'float32')
108 |             unif = drv.alloc(6, dtype = 'uint32')
109 |             done = drv.alloc(1, dtype = 'uint32')
110 | 
111 |             unif[0] = loop
112 |             unif[1] = X.addresses()[0,0]
113 |             unif[2] = X.strides[int(trans)]
114 |             unif[3] = X.strides[1-int(trans)]
115 |             unif[4] = Y.addresses()[0]
116 |             unif[5] = done.addresses()[0]
117 | 
118 |             results = np.zeros((24, 10), dtype = 'float32')
119 | 
120 |             fig = plt.figure()
121 |             ax = fig.add_subplot(1,1,1)
122 |             ax.set_title(f'TMU load latency (1 slot, 1 qpu, stride=({unif[2]},{unif[3]}))')
123 |             ax.set_xlabel('# of nop (between request and load signal)')
124 |             ax.set_ylabel('sec')
125 | 
126 |             print()
127 |             for nops in range(results.shape[0]):
128 | 
129 |                 code = drv.program(lambda asm: qpu_tmu_load_1_slot_1_qpu(asm, nops))
130 | 
131 |                 for i in range(results.shape[1]):
132 | 
133 |                     with drv.compute_shader_dispatcher() as csd:
134 | 
135 |                         X[:] = np.random.randn(*X.shape) / X.shape[int(trans)]
136 |                         Y[:] = 0.0
137 |                         done[:] = 0
138 | 
139 |                         start = time.time()
140 |                         csd.dispatch(code, unif.addresses()[0], thread = 8)
141 |                         bench.wait_address(done)
142 |                         end = time.time()
143 | 
144 |                         results[nops,i] = end - start
145 | 
146 |                         assert np.allclose(Y, np.sum(X, axis=int(trans)), atol = 1e-4)
147 | 
148 |                 ax.scatter(np.zeros(results.shape[1])+nops, results[nops], s=1, c='blue')
149 | 
150 |                 print('{:4}/{}\t{:.9f}'.format(nops, results.shape[0], np.sum(results[nops]) / results.shape[1]))
151 | 
152 |             ax.set_ylim(auto=True)
153 |             ax.set_xlim(0, results.shape[0])
154 |             fig.savefig(f'benchmarks/tmu_load_1_slot_1_qpu_{unif[2]}_{unif[3]}.png')
155 | 
@qpu
def qpu_tmu_load_2_slot_1_qpu(asm, nops):
    """QPU kernel: TMU load benchmark run by the threads with index % 4 == 0.

    Same summation loop as the 1-slot kernel (one TMU read, ``nops`` nops,
    one ldtmu, accumulate in r2), but every thread whose
    ``(tidx >> 2) & 0b0011`` is zero takes part, each working on its own
    slab of X and its own row of Y.  After a second barrier only the thread
    with ``(tidx >> 2) & 0b1111 == 0`` writes the ``done`` word.

    Uniform meanings below follow test_tmu_load_2_slot_1_qpu.
    """

    nop(sig = ldunifrf(rf0)) # loop count
    nop(sig = ldunifrf(rf1)) # address of X
    nop(sig = ldunifrf(rf2)) # byte stride added to the address every iteration
    nop(sig = ldunifrf(rf3)) # byte stride between SIMD elements
    nop(sig = ldunifrf(rf4)) # address of Y
    nop(sig = ldunifrf(rf5)) # address of done

    barrierid(syncb, sig = thrsw)
    nop()
    nop()

    # Skip the benchmark unless the low two bits of (tidx >> 2) are zero.
    # (tidx >> 2) & 0b1111 is presumably the QPU number, as in memset.py.
    tidx(r0)
    shr(r0, r0, 2)
    band(r0, r0, 0b0011, cond = 'pushz')
    b(R.skip_bench, cond = 'allna')
    nop()
    nop()
    nop()

    # Y address += 4 * element index ...
    eidx(r0)
    shl(r0, r0, 2)
    add(rf4, rf4, r0)
    # ... + 64 * ((tidx >> 2) & 0b1111); 64 bytes is one 16-float row of Y.
    tidx(r0)
    shr(r0, r0, 2)
    band(r0, r0, 0b1111)
    shl(r1, 4, 4)
    umul24(r0, r0, r1)
    add(rf4, rf4, r0)

    # X address += element index * per-element stride ...
    eidx(r0)
    umul24(r0, r0, rf3)
    add(rf1, rf1, r0)
    # ... + (loop count << 6) * ((tidx >> 2) & 0b1111); loop * 64 bytes is
    # one loop x 16 float slab of X.
    tidx(r0)
    shr(r0, r0, 2)
    band(r0, r0, 0b1111)
    shl(r1, rf0, 6)
    umul24(r0, r0, r1)
    add(rf1, rf1, r0)

    # r2 is the running sum.
    mov(r2, 0.0)
    with loop as l:
        # Issue the TMU read and advance the address for the next iteration.
        mov(tmua, rf1).add(rf1, rf1, rf2)
        # Slack between the request and the ldtmu signal being benchmarked.
        for i in range(nops):
            nop()
        # Receive the loaded value into r3.
        nop(sig = ldtmu(r3))
        sub(rf0, rf0, 1, cond = 'pushz')
        l.b(cond = 'anyna')
        fadd(r2, r2, r3) # delay slot
        nop()            # delay slot
        nop()            # delay slot

    # Store the sum to this thread's row of Y.
    mov(tmud, r2)
    mov(tmua, rf4)
    tmuwt()

    L.skip_bench

    barrierid(syncb, sig = thrsw)
    nop()
    nop()

    # Only the thread with (tidx >> 2) & 0b1111 == 0 signals completion.
    tidx(r0)
    shr(r0, r0, 2)
    band(r0, r0, 0b1111, cond = 'pushz')
    b(R.skip_done, cond = 'allna')
    nop()
    nop()
    nop()
    mov(tmud, 1)
    mov(tmua, rf5)
    tmuwt()
    L.skip_done

    # Epilogue: the same thrsw/nop sequence closes every kernel in these
    # files -- presumably the required end-of-program signalling.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
240 | 
def test_tmu_load_2_slot_1_qpu():
    """Benchmark TMU load latency with two threads issuing loads.

    For each layout of X a range of nop counts is swept; every kernel is run
    ten times, checked (rows 0 and 4 of Y hold the sums, all other rows stay
    zero) and timed.  The per-run timings are saved as a scatter plot under
    ``benchmarks/``.
    """

    bench = BenchHelper('benchmarks/libbench_helper.so')

    for trans, min_nops, max_nops in [(False, 0, 64), (True, 128-32, 128+32)]:

        with Driver() as drv:

            loop = 2**13

            X = drv.alloc((8, 16, loop) if trans else (8, loop, 16), dtype = 'float32')
            Y = drv.alloc((8, 16), dtype = 'float32')
            unif = drv.alloc(6, dtype = 'uint32')
            done = drv.alloc(1, dtype = 'uint32')

            unif[0] = loop
            unif[1] = X.addresses()[0,0,0]
            unif[2] = X.strides[1+int(trans)]
            unif[3] = X.strides[2-int(trans)]
            unif[4] = Y.addresses()[0,0]
            unif[5] = done.addresses()[0]

            # Indexed by nops directly; rows below min_nops stay unused.
            results = np.zeros((max_nops, 10), dtype = 'float32')

            fig = plt.figure()
            ax = fig.add_subplot(1,1,1)
            ax.set_title(f'TMU load latency (2 slot, 1 qpu, stride=({unif[2]},{unif[3]}))')
            ax.set_xlabel('# of nop (between request and load signal)')
            ax.set_ylabel('sec')

            print()
            for nops in range(min_nops, results.shape[0]):

                # Bind nops as a default so the program does not depend on
                # when drv.program() invokes the callable.
                code = drv.program(lambda asm, nops=nops: qpu_tmu_load_2_slot_1_qpu(asm, nops))

                for i in range(results.shape[1]):

                    with drv.compute_shader_dispatcher() as csd:

                        X[:] = np.random.randn(*X.shape) / X.shape[1+int(trans)]
                        Y[:] = 0.0
                        done[:] = 0

                        # perf_counter is monotonic, unlike the wall clock.
                        start = time.perf_counter()
                        csd.dispatch(code, unif.addresses()[0], thread = 8)
                        bench.wait_address(done)
                        end = time.perf_counter()

                        results[nops,i] = end - start

                        assert np.allclose(Y[0::4], np.sum(X[0::4], axis=1+int(trans)), atol = 1e-4)
                        assert (Y[1:4] == 0).all()
                        assert (Y[5:8] == 0).all()

                ax.scatter(np.zeros(results.shape[1])+nops, results[nops], s=1, c='blue')

                print('{:4}/{}\t{:.9f}'.format(nops, results.shape[0], np.sum(results[nops]) / results.shape[1]))

            ax.set_ylim(auto=True)
            ax.set_xlim(min_nops, max_nops)
            fig.savefig(f'benchmarks/tmu_load_2_slot_1_qpu_{unif[2]}_{unif[3]}.png')
            # pyplot keeps every figure alive until it is closed explicitly.
            plt.close(fig)
302 | 


--------------------------------------------------------------------------------
/examples/memset.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2019-2020 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | 
 24 | from time import monotonic
 25 | 
 26 | import numpy as np
 27 | 
 28 | from videocore6.assembler import qpu
 29 | from videocore6.driver import Driver
 30 | 
 31 | 
@qpu
def qpu_memset(asm, *, num_qpus, unroll_shift, code_offset,
               align_cond=lambda pos: pos % 512 == 0):
    """Emit a QPU program that fills a buffer with one 32-bit value.

    Uniforms are read in the order destination address, fill value, length
    (in words, per memset() below).

    Args:
        asm: assembler state supplied by the ``qpu`` decorator machinery.
        num_qpus: 1 or 8; selects how the buffer is split between QPUs.
        unroll_shift: log2 of the unroll factor; each loop iteration issues
            ``(1 << unroll_shift) // 4`` TMU writes of four data words each.
        code_offset: instruction index at which this program will be placed;
            added to ``len(asm)`` when aligning the loop start.
        align_cond: predicate on the instruction position; nops are emitted
            until it holds for the first instruction of the loop.

    Raises:
        Exception: if ``num_qpus`` is neither 1 nor 8.
    """

    # Publish readable aliases (reg_dst, reg_fill, ...) for rf[0]..rf[4] as
    # module globals so the instructions below can refer to them by name.
    g = globals()
    for i, v in enumerate(['dst', 'fill', 'length', 'qpu_num', 'stride']):
        g[f'reg_{v}'] = rf[i]

    nop(sig=ldunifrf(reg_dst))
    nop(sig=ldunifrf(reg_fill))
    nop(sig=ldunifrf(reg_length))

    if num_qpus == 1:
        num_qpus_shift = 0
        mov(reg_qpu_num, 0)
    elif num_qpus == 8:
        num_qpus_shift = 3
        # qpu_num = (tidx >> 2) & 0b1111
        tidx(r0)
        shr(r0, r0, 2)
        band(reg_qpu_num, r0, 0b1111)
    else:
        raise Exception('num_qpus must be 1 or 8')

    # addr += 4 * 4 * (thread_num + 16 * qpu_num)
    shl(r0, reg_qpu_num, 4)
    eidx(r1)
    add(r0, r0, r1)
    shl(r0, r0, 4)
    add(reg_dst, reg_dst, r0)

    # stride = 4 * 4 * 16 * num_qpus
    mov(r0, 1)
    shl(reg_stride, r0, 8 + num_qpus_shift)

    # length /= 16 * num_qpus * unroll
    shr(reg_length, reg_length, 4 + num_qpus_shift + unroll_shift)

    # Pad with nops until the loop start satisfies the alignment predicate.
    while not align_cond(code_offset + len(asm)):
        nop()

    with loop as l:

        unroll = 1 << unroll_shift

        # All but the last group of the iteration: four data words, then the
        # address write, which also advances dst by one stride.
        for i in range(unroll // 4 - 1):
            mov(tmud, reg_fill)
            mov(tmud, reg_fill)
            mov(tmud, reg_fill)
            mov(tmud, reg_fill)
            # Every fourth group writes tmuau instead of tmua.  The number of
            # tmuau writes per iteration equals the number of 0xfcfcfcfc
            # uniforms that memset() places after the length, so tmuau
            # presumably consumes one uniform each time -- TODO confirm.
            mov(tmuau if i % 4 == 0 else tmua, reg_dst).add(reg_dst, reg_dst, reg_stride)

        # Last group, interleaved with the loop bookkeeping: two data words
        # before the branch, the rest in the three instructions after it.
        mov(tmud, reg_fill).mov(r0, 1)
        mov(tmud, reg_fill).sub(reg_length, reg_length, r0, cond='pushz')

        # The branch also updates the uniform address (relative mode);
        # memset() stores a negative byte offset as the last uniform,
        # presumably to rewind the uniform stream for the next iteration.
        l.b(cond='na0').unif_addr(absolute=False)
        mov(tmud, reg_fill)
        mov(tmud, reg_fill)
        mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_stride)

    # Epilogue: the same thrsw/nop sequence closes every kernel in these
    # files -- presumably the required end-of-program signalling.
    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()
 99 | 
100 | 
def memset(*, fill, length, num_qpus=8, unroll_shift=5):
    """Fill ``length`` 32-bit words with ``fill`` on the QPU and time it.

    ``length`` must be a positive multiple of
    ``16 * num_qpus * (1 << unroll_shift)`` and ``unroll_shift`` must be at
    least 4.  The buffer is pre-filled with the bitwise complement of
    ``fill`` so that the final check proves the kernel actually wrote it.
    """

    assert length > 0
    words_per_iteration = 16 * num_qpus * (1 << unroll_shift)
    assert length % words_per_iteration == 0
    assert unroll_shift >= 4

    print(f'==== memset example ({length * 4 / 1024 / 1024} MiB) ====')

    with Driver(data_area_size=(length + 1024) * 4) as drv:

        # code_offset is given in instructions; code_pos is divided by 8 to
        # obtain it.
        code = drv.program(
            qpu_memset,
            num_qpus=num_qpus,
            unroll_shift=unroll_shift,
            code_offset=drv.code_pos // 8,
        )

        print('Preparing for buffers...')

        X = drv.alloc(length, dtype='uint32')
        X.fill(~fill & 0xFFFFFFFF)
        assert not np.array_equiv(X, fill)

        # Layout: dst, fill, length, a run of 0xfcfcfcfc words, and finally
        # a negative byte offset (as uint32) back to the first of that run.
        num_loop_unifs = 1 << (unroll_shift - 4)
        unif = drv.alloc(3 + num_loop_unifs + 1, dtype='uint32')
        unif[0] = X.addresses()[0]
        unif[1] = fill
        unif[2] = length
        unif[3: -1] = 0xfcfcfcfc
        unif[-1] = (4 * (3 - len(unif))) & 0xFFFFFFFF

        print('Executing on QPU...')

        start = monotonic()
        drv.execute(code, unif.addresses()[0], thread=num_qpus)
        end = monotonic()

        assert np.array_equiv(X, fill)

        elapsed = end - start
        print(f'{elapsed} sec, {length * 4 / elapsed * 1e-6} MB/s')
139 | 
140 | 
def main():
    """Run the memset example: 16 Mi words filled with 0x5a5a5a5a."""
    memset(fill=0x5a5a5a5a, length=16 << 20)


if __name__ == '__main__':
    main()
149 | 


--------------------------------------------------------------------------------
/examples/pctr_gpu_clock.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright (c) 2019-2020 Idein Inc.
 3 | #
 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | # of this software and associated documentation files (the "Software"), to deal
 6 | # in the Software without restriction, including without limitation the rights
 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | # copies of the Software, and to permit persons to whom the Software is
 9 | # furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice (including the next
12 | # paragraph) shall be included in all copies or substantial portions of the
13 | # Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | import time
24 | 
25 | from videocore6.v3d import *
26 | 
# Count core cycles for a one-second sleep; the count, scaled by 1e-6, is
# printed as the clock in MHz.
with RegisterMapping() as regmap, \
        PerformanceCounter(regmap, [CORE_PCTR_CYCLE_COUNT]) as pctr:
    time.sleep(1)
    result = pctr.result()

print('==== QPU clock measurement with performance counters ====')
print(f'{result[0] * 1e-6} MHz')
36 | 


--------------------------------------------------------------------------------
/examples/scopy.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2019-2020 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | 
 24 | from time import monotonic
 25 | 
 26 | import numpy as np
 27 | 
 28 | from videocore6.assembler import qpu
 29 | from videocore6.driver import Driver
 30 | 
 31 | 
@qpu
def qpu_scopy(asm, *, num_qpus, unroll_shift, code_offset,
              align_cond=lambda pos: pos % 512 == 259):
    """Emit a QPU program that copies 32-bit words from src to dst via the TMU.

    Uniforms are read in the order length, source address, destination
    address.

    Args:
        asm: assembler state supplied by the ``qpu`` decorator machinery.
        num_qpus: 1 or 8; selects how the buffers are split between QPUs.
        unroll_shift: log2 of the unroll factor of the copy loop.
        code_offset: instruction index at which this program will be placed;
            added to ``len(asm)`` when aligning the loop start.
        align_cond: predicate on the instruction position; nops are emitted
            until it holds for the first instruction of the loop.

    Raises:
        Exception: if ``num_qpus`` is neither 1 nor 8.
    """

    # Publish readable aliases (reg_length, reg_src, ...) for rf[0]..rf[4]
    # as module globals so the instructions below can refer to them by name.
    g = globals()
    for i, v in enumerate(['length', 'src', 'dst', 'qpu_num', 'stride']):
        g[f'reg_{v}'] = rf[i]

    nop(sig=ldunifrf(reg_length))
    nop(sig=ldunifrf(reg_src))
    nop(sig=ldunifrf(reg_dst))

    if num_qpus == 1:
        num_qpus_shift = 0
        mov(reg_qpu_num, 0)
    elif num_qpus == 8:
        num_qpus_shift = 3
        # qpu_num = (tidx >> 2) & 0b1111
        tidx(r0)
        shr(r0, r0, 2)
        band(reg_qpu_num, r0, 0b1111)
    else:
        raise Exception('num_qpus must be 1 or 8')

    # addr += 4 * 4 * (thread_num + 16 * qpu_num)
    shl(r0, reg_qpu_num, 4)
    eidx(r1)
    add(r0, r0, r1)
    shl(r0, r0, 4)
    add(reg_src, reg_src, r0).add(reg_dst, reg_dst, r0)

    # stride = 4 * 4 * 16 * num_qpus
    mov(reg_stride, 1)
    shl(reg_stride, reg_stride, 8 + num_qpus_shift)

    # Maps a shift amount 0..31 to the operand 0..15, -16..-1.  Presumably
    # the immediate operand cannot express 16..31 directly and only the low
    # five bits of the shift count are used -- TODO confirm.
    num_shifts = [*range(16), *range(-16, 0)]

    # length /= 16 * 8 * num_qpus * unroll
    shr(reg_length, reg_length, num_shifts[7 + num_qpus_shift + unroll_shift])

    # This single thread switch and two nops just before the loop are really
    # important for TMU read to achieve a better performance.
    # This also enables TMU read requests without the thread switch signal, and
    # the eight-depth TMU read request queue.
    nop(sig=thrsw)
    nop()
    nop()

    # Pad with nops until the loop start satisfies the alignment predicate.
    while not align_cond(code_offset + len(asm)):
        nop()

    with loop as l:

        unroll = 1 << unroll_shift

        # A smaller number of instructions does not necessarily mean a faster
        # operation.  Rather, complicated TMU manipulations may perform worse
        # and even cause a hardware bug.

        # Prime the pipeline with two read requests, advancing src each time.
        mov(tmuau, reg_src).add(reg_src, reg_src, reg_stride)
        mov(tmua, reg_src).add(reg_src, reg_src, reg_stride)

        # Each pass handles two read requests: receive four words with ldtmu,
        # forward them to tmud, issue the write at dst and queue the next
        # read at src.
        for i in range(unroll - 1):
            nop(sig=ldtmu(r0))
            mov(tmud, r0, sig=ldtmu(r0))
            mov(tmud, r0, sig=ldtmu(r0))
            mov(tmud, r0)
            nop(sig=ldtmu(r0))
            mov(tmud, r0)
            mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_stride)
            mov(tmua, reg_src).add(reg_src, reg_src, reg_stride)
            nop(sig=ldtmu(r0))
            mov(tmud, r0, sig=ldtmu(r0))
            mov(tmud, r0, sig=ldtmu(r0))
            mov(tmud, r0)
            nop(sig=ldtmu(r0))
            mov(tmud, r0)
            mov(tmuau, reg_dst).add(reg_dst, reg_dst, reg_stride)
            mov(tmua, reg_src).add(reg_src, reg_src, reg_stride)

        if unroll == 1:
            # Prefetch the next source.
            mov(tmua, reg_src)

        nop(sig=ldtmu(r0))
        mov(tmud, r0, sig=ldtmu(r0))
        mov(tmud, r0, sig=ldtmu(r0))
        mov(tmud, r0)
        nop(sig=ldtmu(r0))
        # Decrement the remaining iteration count while forwarding the word.
        sub(reg_length, reg_length, 1, cond='pushz').mov(tmud, r0)
        mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_stride)

        if unroll == 1:
            # NOTE(review): explicit TMU configuration write; the meaning of
            # 0xfffffffc is not visible here -- confirm against the hardware
            # documentation.
            mov(tmuc, 0xfffffffc)
        nop(sig=ldtmu(r0))
        mov(tmud, r0, sig=ldtmu(r0))
        mov(tmud, r0, sig=ldtmu(r0))

        # The three instructions after the branch finish the last write of
        # the iteration.  The branch also updates the uniform address in
        # relative mode.
        l.b(cond='na0').unif_addr(absolute=False)
        mov(tmud, r0, sig=ldtmu(r0))
        mov(tmud, r0)
        mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_stride)

    # This synchronization is needed between the last TMU operation and the
    # program end with the thread switch just before the loop above.
    barrierid(syncb, sig=thrsw)
    nop()
    nop()

    # Epilogue: the same thrsw/nop sequence closes every kernel in these
    # files -- presumably the required end-of-program signalling.
    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()
148 | 
149 | 
def scopy(*, length, num_qpus=8, unroll_shift=0):
    """Copy a uint32 buffer with the QPU kernel, verify it, and print the timing.

    Keyword arguments:
    length -- number of elements to copy; must be positive and a multiple of
              16 * 8 * num_qpus * (1 << unroll_shift).
    num_qpus -- forwarded to the kernel and used as the thread count.
    unroll_shift -- log2 of the kernel's unroll factor.
    """

    assert length > 0
    assert length % (16 * 8 * num_qpus * (1 << unroll_shift)) == 0

    print(f'==== scopy example ({length / 1024 / 1024} Mi elements) ====')

    with Driver(data_area_size=(length * 2 + 1024) * 4) as drv:

        code = drv.program(qpu_scopy, num_qpus=num_qpus,
                           unroll_shift=unroll_shift,
                           code_offset=drv.code_pos // 8)

        print('Preparing for buffers...')

        src = drv.alloc(length, dtype='uint32')
        dst = drv.alloc(length, dtype='uint32')

        # Fill the destination with different data so a successful copy is
        # observable.
        src[:] = np.arange(*src.shape, dtype=src.dtype)
        dst[:] = -src

        assert not np.array_equal(src, dst)

        # Uniforms: length, source address, destination address, then
        # (1 << unroll_shift) filler words and a final offset word.
        # NOTE(review): the last word looks like the relative uniform-address
        # offset consumed by the kernel's loop branch -- confirm.
        unif = drv.alloc(3 + (1 << unroll_shift) + 1, dtype='uint32')
        unif[0] = length
        unif[1] = src.addresses()[0]
        unif[2] = dst.addresses()[0]
        unif[3:-1] = 0xfc80fcfc if unroll_shift == 0 else 0xfcfcfcfc
        unif[-1] = 4 * (-len(unif) + 3) & 0xFFFFFFFF

        print('Executing on QPU...')

        began = monotonic()
        drv.execute(code, unif.addresses()[0], thread=num_qpus)
        elapsed = monotonic() - began

        assert np.array_equal(src, dst)

        print(f'{elapsed} sec, {length * 4 / elapsed * 1e-6} MB/s')
192 | 
193 | 
def main():
    """Run the scopy example on 16 Mi elements."""
    num_elements = 16 * 1024 * 1024
    scopy(length=num_elements)


if __name__ == '__main__':
    main()
202 | 


--------------------------------------------------------------------------------
/examples/sgemm.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2019-2020 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | 
 24 | from time import clock_gettime, CLOCK_MONOTONIC
 25 | import numpy as np
 26 | from videocore6 import pack_unpack
 27 | from videocore6.driver import Driver
 28 | from videocore6.assembler import qpu
 29 | 
 30 | 
 31 | def getsec():
 32 |     return clock_gettime(CLOCK_MONOTONIC)
 33 | 
 34 | 
@qpu
def load_params(asm, thread, regs):
    """Emit QPU code that fills each register in `regs` from a parameter table.

    thread -- number of threads the program is launched with; only 1, 8 and 16
              are accepted (anything else trips the assertion below).
    regs   -- destination registers, filled in order, one word each.

    Two uniforms are read: the first into rf0 and the second into rf1.
    NOTE(review): judging by the host code in this file these are the address
    of the per-thread parameter table and the number of words per row --
    confirm against the caller.
    """

    if thread == 1:
        # Single thread: the thread index is simply zero.
        bxor(r0, r0, r0, sig = ldunifrf(rf0))
    elif thread == 8:
        #  8 threads (1 thread / qpu)
        tidx(r0, sig = ldunifrf(rf0))
        shr(r0, r0, 2)
        mov(r1, 0b1111)
    elif thread == 16:
        # 16 threads (2 threads / qpu)
        tidx(r0, sig = ldunifrf(rf0))
        shr(r0, r0, 1).mov(r1, 1)
        # r1 = (1 << 5) - 1, built in two steps.
        shl(r1, r1, 5)
        sub(r1, r1, 1)
    else:
        assert thread in [1,8,16]

    # r3 = masked thread index; the second uniform lands in rf1.
    band(r3, r0, r1, sig = ldunifrf(rf1))
    # r0 = rf0 + 4 * rf1 * r3 + 4 * eidx
    shl(r0, rf1, 2)
    umul24(r0, r0, r3)
    eidx(r1).add(r0, r0, rf0)
    shl(r1, r1, 2)
    # r3 = 4 << 4 = 64: the address step applied after each TMU request.
    shl(r3, 4, 4).add(r0, r0, r1)
    n = len(regs)
    # Request the first group of words and advance the address.
    mov(tmua, r0, sig = thrsw).add(r0, r0, r3)
    nop()
    nop()
    nop(sig = ldtmu(r1))
    for i in range(n):
        if i % 16 == 0:
            # First word of a group: no rotation needed.
            mov(r5rep, r1)
            mov(regs[i], r5)
        elif i % 16 == 15 and i != n - 1:
            # Last word of a group with more to come: request the next group
            # before consuming the current one, then receive it into r1.
            mov(tmua, r0, sig = thrsw).add(r0, r0, r3)
            rotate(r5rep, r1, - (i % 16))
            mov(regs[i], r5)
            nop(sig = ldtmu(r1))
        else:
            # Rotate by -(i % 16) and write through r5rep; presumably this
            # broadcasts word i % 16 to every lane via r5.
            rotate(r5rep, r1, - (i % 16))
            mov(regs[i], r5)
 77 | 
@qpu
def qpu_sgemm_rnn_naive(asm, thread):
    """Emit the QPU program for a naive single-precision matrix multiply.

    thread -- number of threads the program is launched with; forwarded to
              load_params.

    The parameters listed in `params` are loaded into rf32 and up, followed by
    the scratch values in `values`.  rf0..rf15 serve as accumulators and
    rf16..rf31 receive the existing C values that are scaled by beta.
    NOTE(review): the addressing below suggests each pass handles a 16x16 tile
    of C, with strides given in bytes -- confirm against the host code.
    """

    params = [
        'P',
        'Q',
        'R',
        'A_base',
        'A_stride',
        'B_base',
        'B_stride',
        'C_base',
        'C_stride',
        'alpha',
        'beta',
    ]

    values = [
        'A_cur',
        'B_cur',
        'C_cur',
        'i', 'j', 'k',
    ]

    # Publish reg_<name> globals aliasing rf32, rf33, ... in declaration order.
    g = globals()
    for i, reg in enumerate(params + values):
        g['reg_' + reg] = g['rf' + str(i+32)]

    load_params(asm, thread, [g['reg_' + reg] for reg in params])

    # r0 = P rounded up to a multiple of 16; r1 = ceil(R / 16) * 64.
    add(r0, reg_P, 15)
    shr(r0, r0, 4)
    shl(r0, r0, 4)
    add(r1, reg_R, 15)
    shr(r1, r1, 4)
    shl(r1, r1, 6)
    # Move the base pointers past the end of the data; the loops below count
    # down and subtract offsets from these bases.
    umul24(r3, r0, reg_A_stride)
    add(reg_A_base, reg_A_base, r3)
    add(reg_B_base, reg_B_base, r1)
    umul24(r3, r0, reg_C_stride)
    add(reg_C_base, reg_C_base, r3)
    add(reg_C_base, reg_C_base, r1)

    # Clear rf0..rf31.
    for i in range(16):
        mov(rf[i], 0.0).mov(rf[i+16], 0.0)

    # i=(p+15)/16.
    add(r0, reg_P, 15)
    shr(reg_i, r0, 4)
    with loop as li:

        # j=(r+15)/16
        add(r0, reg_R, 15)
        shr(reg_j, r0, 4)
        with loop as lj:

            # Current pointers: base minus the offsets derived from the
            # remaining i and j counts.
            shl(r0, reg_i, 4)
            umul24(r3, r0, reg_C_stride)
            shl(r1, reg_j, 6)
            sub(reg_C_cur, reg_C_base, r3)
            sub(reg_C_cur, reg_C_cur, r1)
            umul24(r3, r0, reg_A_stride)
            sub(reg_A_cur, reg_A_base, r3)
            sub(reg_B_cur, reg_B_base, r1)

            mov(reg_k, reg_Q)
            with loop as lk:

                # Request A at A_cur + eidx * A_stride, then step A_cur by 4.
                eidx(r0)
                umul24(r1, r0, reg_A_stride)
                add(r1, r1, reg_A_cur).add(reg_A_cur, reg_A_cur, 4)
                mov(tmua, r1, sig = thrsw)
                # Request B at B_cur + eidx * 4, then step B_cur by B_stride.
                shl(r1, r0, 2)
                add(r1, r1, reg_B_cur).add(reg_B_cur, reg_B_cur, reg_B_stride)
                mov(tmua, r1, sig = thrsw)

                # r0 <- A values, r4 <- B values.
                nop(sig = ldtmu(r0))
                mov(r5rep, r0)
                nop(sig = ldtmu(r4))
                nop().fmul(r3, r5, r4)
                # Software-pipelined accumulate: while rf[i-1] absorbs the
                # previous product, the next product is formed from the next
                # rotation of r0.
                for i in range(1,16):
                    rotate(r5rep, r0, -i)
                    fadd(rf[i-1], rf[i-1], r3).fmul(r3, r5, r4)
                fadd(rf15, rf15, r3)

                sub(reg_k, reg_k, 1, cond = 'pushz')
                lk.b(cond = 'anyna')
                nop() # delay slot
                nop() # delay slot
                nop() # delay slot

            # Read the existing C values into rf16..rf31 while scaling the
            # accumulators by alpha.
            eidx(r0)
            shl(r0, r0, 2)
            add(r1, reg_C_cur, r0)
            mov(tmua, r1, sig = thrsw).add(r1, r1, reg_C_stride)
            fmul(rf[0], rf[0], reg_alpha)
            for i in range(1, 16):
                mov(tmua, r1, sig = thrsw).add(r1, r1, reg_C_stride)
                fmul(rf[i], rf[i], reg_alpha, sig = ldtmu(rf[i+15]))
            mov(r0, reg_beta).fmul(r3, rf[16], reg_beta, sig = ldtmu(rf[31]))
            # rf[i] += beta * (loaded C value), pipelined through r3.
            for i in range(16):
                fadd(rf[i], rf[i], r3).fmul(r3, rf[i+17], r0)

            # Write the results back and clear the registers for the next pass.
            eidx(r0)
            shl(r0, r0, 2)
            add(r1, reg_C_cur, r0)
            for i in range(16):
                mov(tmud, rf[i])
                mov(tmua, r1).add(r1, r1, reg_C_stride)
                mov(rf[i], 0.0).mov(rf[i+16], 0.0)
                tmuwt()

            sub(reg_j, reg_j, 1, cond = 'pushz')
            lj.b(cond = 'anyna')
            nop() # delay slot
            nop() # delay slot
            nop() # delay slot

        sub(reg_i, reg_i, 1, cond = 'pushz')
        li.b(cond = 'anyna')
        nop() # delay slot
        nop() # delay slot
        nop() # delay slot

    # Program end sequence.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
210 | 
def sgemm_rnn_naive():
    """Run C = alpha * A.dot(B) + beta * C on the QPU and compare with numpy.

    Uses fixed 1024x1024 matrices split into a 2x4 grid of tiles, one tile
    per thread, and prints timings and error statistics for both paths.
    """

    thread = 8

    P = Q = R = 1024

    assert P % (16 * 2) == 0
    assert R % (16 * 4) == 0

    with Driver() as drv:

        code = drv.program(lambda asm: qpu_sgemm_rnn_naive(asm, thread))

        A = drv.alloc((P, Q), dtype='float32')
        B = drv.alloc((Q, R), dtype='float32')
        C = drv.alloc((P, R), dtype='float32')

        # Deterministic inputs; the draw order matters for reproducibility.
        np.random.seed(0)
        alpha = np.random.randn()
        beta = np.random.randn()
        A_ref = np.random.randn(*A.shape).astype(A.dtype)
        B_ref = np.random.randn(*B.shape).astype(B.dtype)
        C_ref = np.random.randn(*C.shape).astype(C.dtype)

        A[:] = A_ref
        B[:] = B_ref
        C[:] = C_ref

        # Reference result on the CPU.
        t0 = getsec()
        C_ref[:] = alpha * A_ref.dot(B_ref) + beta * C_ref
        time_ref = getsec() - t0

        def tile_params(i, j):
            # Parameter words for tile (i, j) of the 2x4 grid, in the order
            # the kernel's parameter list expects.
            tile_P = P // 2
            tile_R = R // 4
            return [
                tile_P, Q, tile_R,
                A.addresses()[tile_P * i, 0],
                A.strides[0],
                B.addresses()[0, tile_R * j],
                B.strides[0],
                C.addresses()[tile_P * i, tile_R * j],
                C.strides[0],
                *pack_unpack('f', 'I', [alpha, beta]),
            ]

        unif_params = drv.alloc((thread, len(tile_params(0, 0))), dtype='uint32')
        for th in range(thread):
            row, col = divmod(th, 4)
            unif_params[th] = tile_params(row, col)

        unif = drv.alloc(2, dtype='uint32')
        unif[0] = unif_params.addresses()[0, 0]
        unif[1] = unif_params.shape[1]

        t0 = getsec()
        drv.execute(code, unif.addresses()[0], thread=thread)
        time_gpu = getsec() - t0

        np.set_printoptions(threshold=np.inf)

        def gflops(sec):
            return (2 * P * Q * R + 3 * P * R) / sec * 1e-9

        diff = C - C_ref
        abs_err = np.abs(diff)
        rel_err = np.abs(diff / C_ref)

        print(f'==== sgemm example ({P}x{Q} times {Q}x{R}) ====')
        print(f'numpy: {time_ref:.4} sec, {gflops(time_ref):.4} Gflop/s')
        print(f'QPU:   {time_gpu:.4} sec, {gflops(time_gpu):.4} Gflop/s')
        print(f'Minimum absolute error: {np.min(abs_err)}')
        print(f'Maximum absolute error: {np.max(abs_err)}')
        print(f'Minimum relative error: {np.min(rel_err)}')
        print(f'Maximum relative error: {np.max(rel_err)}')
285 | 
286 | 
def main():
    """Run the sgemm example."""
    sgemm_rnn_naive()


if __name__ == '__main__':

    main()
294 | 


--------------------------------------------------------------------------------
/examples/summation.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2019-2020 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | 
 24 | from time import monotonic
 25 | 
 26 | import numpy as np
 27 | 
 28 | from videocore6.assembler import qpu
 29 | from videocore6.driver import Driver
 30 | 
 31 | 
@qpu
def qpu_summation(asm, *, num_qpus, unroll_shift, code_offset,
                  align_cond=lambda pos: pos % 512 == 170):
    """Emit the QPU program that sums a uint32 buffer.

    num_qpus     -- 1 or 8; any other value raises Exception.
    unroll_shift -- log2 of the loop unroll factor.
    code_offset  -- position of this program in instructions, added to
                    len(asm) when evaluating `align_cond`.
    align_cond   -- predicate on the instruction position; nops are emitted
                    until it holds, just before the main loop.

    Three uniforms are read up front: length, source address, destination
    address.  Each lane accumulates into reg_sum and writes its partial sum to
    the destination at the end.
    """

    # Name rf0..rf5 as reg_length, reg_src, reg_dst, reg_qpu_num, reg_stride
    # and reg_sum.
    g = globals()
    for i, v in enumerate(['length', 'src', 'dst', 'qpu_num', 'stride', 'sum']):
        g[f'reg_{v}'] = rf[i]

    nop(sig=ldunifrf(reg_length))
    nop(sig=ldunifrf(reg_src))
    nop(sig=ldunifrf(reg_dst))

    if num_qpus == 1:
        num_qpus_shift = 0
        mov(reg_qpu_num, 0)
    elif num_qpus == 8:
        num_qpus_shift = 3
        # qpu_num = (tidx >> 2) & 0b1111
        tidx(r0)
        shr(r0, r0, 2)
        band(reg_qpu_num, r0, 0b1111)
    else:
        raise Exception('num_qpus must be 1 or 8')

    # src += 4 * 4 * (thread_num + 16 * qpu_num)
    # dst += 4 * (thread_num + 16 * qpu_num)
    shl(r0, reg_qpu_num, 4)
    eidx(r1)
    add(r0, r0, r1)
    shl(r0, r0, 2)
    shl(r0, r0, 2).add(reg_dst, reg_dst, r0)
    add(reg_src, reg_src, r0)

    # stride = 4 * 4 * 16 * num_qpus
    mov(reg_stride, 1)
    shl(reg_stride, reg_stride, 8 + num_qpus_shift)

    # The QPU performs shifts and rotates modulo 32, so it actually supports
    # shift amounts [0, 31] only with small immediates.
    num_shifts = [*range(16), *range(-16, 0)]

    # length /= 16 * 8 * num_qpus * unroll
    shr(reg_length, reg_length, num_shifts[7 + num_qpus_shift + unroll_shift])

    # sum = 0
    # length -= 1
    # r2 = stride

    # This single thread switch and two instructions just before the loop are
    # really important for TMU read to achieve a better performance.
    # This also enables TMU read requests without the thread switch signal, and
    # the eight-depth TMU read request queue.
    nop(sig=thrsw)
    bxor(reg_sum, 1, 1).mov(r1, 1)
    sub(reg_length, reg_length, r1, cond='pushz').mov(r2, reg_stride)

    # Pad with nops until the loop start satisfies the alignment predicate.
    while not align_cond(code_offset + len(asm)):
        nop()

    with loop as l:

        unroll = 1 << unroll_shift

        # Issue the first two read requests of this iteration.
        mov(tmuau, reg_src).add(reg_src, reg_src, reg_stride)
        mov(tmua, reg_src, sig=ldtmu(r0))

        # Each unrolled step accumulates eight loaded values and issues two
        # further read requests.
        for i in range(unroll - 1):
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0)).add(reg_src, reg_src, r2)
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0)).mov(tmuau if i % 2 == 1 else tmua, reg_src)
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0)).add(reg_src, reg_src, r2)
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
            add(reg_sum, reg_sum, r0, sig=ldtmu(r0)).mov(tmua, reg_src)

        add(reg_sum, reg_sum, r0, sig=ldtmu(r0)).add(reg_src, reg_src, r2)
        add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
        add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
        add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
        add(reg_sum, reg_sum, r0, sig=ldtmu(r0))

        # The three instructions after the branch still execute; the last one
        # also decrements the remaining length and updates the flags.
        l.b(cond='na0').unif_addr(absolute=False)
        add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
        add(reg_sum, reg_sum, r0, sig=ldtmu(r0))
        add(reg_sum, reg_sum, r0).sub(reg_length, reg_length, r1, cond='pushz')

    # Write the per-lane partial sums to the destination.
    mov(tmud, reg_sum)
    mov(tmua, reg_dst)

    # This synchronization is needed between the last TMU operation and the
    # program end with the thread switch just before the loop above.
    barrierid(syncb, sig=thrsw)
    nop()
    nop()

    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()
135 | 
136 | 
def summation(*, length, num_qpus=8, unroll_shift=2):
    """Sum `length` uint32 elements on the QPU, verify the result, and print timing.

    Keyword arguments:
    length -- number of elements; must be positive and a multiple of
              16 * 8 * num_qpus * (1 << unroll_shift).
    num_qpus -- forwarded to the kernel and used as the thread count.
    unroll_shift -- log2 of the kernel's unroll factor.
    """

    assert length > 0
    assert length % (16 * 8 * num_qpus * (1 << unroll_shift)) == 0

    print(f'==== summation example ({length / 1024 / 1024} Mi elements) ====')

    with Driver(data_area_size=(length + 1024) * 4) as drv:

        code = drv.program(qpu_summation, num_qpus=num_qpus,
                           unroll_shift=unroll_shift,
                           code_offset=drv.code_pos // 8)

        print('Preparing for buffers...')

        # X holds the input; Y receives one partial sum per lane per QPU.
        X = drv.alloc(length, dtype='uint32')
        Y = drv.alloc(16 * num_qpus, dtype='uint32')

        X[:] = np.arange(length, dtype=X.dtype)
        Y.fill(0)

        assert sum(Y) == 0

        # Uniforms: length, source address, destination address, filler
        # word(s), and a final offset word.
        # NOTE(review): the last word looks like the relative uniform-address
        # offset consumed by the kernel's loop branch -- confirm.
        if unroll_shift == 0:
            unif = drv.alloc(3 + 1 + 1, dtype='uint32')
            unif[3] = 0xfffffcfc
        else:
            unif = drv.alloc(3 + (1 << (unroll_shift - 1)) + 1, dtype='uint32')
            unif[3: -1] = 0xfcfcfcfc
        unif[0] = length
        unif[1] = X.addresses()[0]
        unif[2] = Y.addresses()[0]
        unif[-1] = 4 * (-len(unif) + 3) & 0xFFFFFFFF

        print('Executing on QPU...')

        start = monotonic()
        drv.execute(code, unif.addresses()[0], thread=num_qpus)
        end = monotonic()

        # The inputs are 0..length-1, so the expected total is
        # (length - 1) * length / 2, compared modulo 2**32.
        assert int(sum(Y.astype(int))) % 2**32 == (length - 1) * length // 2 % 2**32

        print(f'{end - start} sec, {length * 4 / (end - start) * 1e-6} MB/s')
180 | 
181 | 
def main():
    """Run the summation example on 32 Mi elements."""
    num_elements = 32 * 1024 * 1024
    summation(length=num_elements)


if __name__ == '__main__':
    main()
190 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright (c) 2019-2020 Idein Inc.
 3 | #
 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | # of this software and associated documentation files (the "Software"), to deal
 6 | # in the Software without restriction, including without limitation the rights
 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | # copies of the Software, and to permit persons to whom the Software is
 9 | # furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice (including the next
12 | # paragraph) shall be included in all copies or substantial portions of the
13 | # Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | 
24 | import platform
25 | 
26 | from setuptools import setup, Extension
27 | 
28 | from videocore6 import __version__ as version
29 | 
30 | 
# The C helper extension is only built on ARM machines.
ext_modules = (
    [Extension('videocore6.readwrite4', sources=['videocore6/readwrite4.c'])]
    if platform.machine() in ['armv7l', 'aarch64']
    else []
)

setup(
    name='py-videocore6',
    packages=['videocore6'],
    version=version,
    description='Python library for GPGPU programming on Raspberry Pi 4',
    author='Sugizaki Yukimasa',
    author_email='ysugi@idein.jp',
    install_requires=[
        'ioctl-opt >= 1.2',
        'numpy',
    ],
    ext_modules=ext_modules,
    python_requires='~= 3.7',  # for f-string.
)
53 | 


--------------------------------------------------------------------------------
/tests/test_alu.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2019-2020 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | import time
 24 | from videocore6.driver import Driver
 25 | from videocore6.assembler import qpu
 26 | import numpy as np
 27 | import itertools
 28 | 
 29 | def rotate_right(n, s):
 30 |     return ((n << (32-s)) | (n >> s)) & 0xffffffff
 31 | 
 32 | def count_leading_zeros(n):
 33 |     bit = 0x80000000
 34 |     count = 0
 35 |     while bit != n & bit:
 36 |         count += 1
 37 |         bit >>= 1
 38 |     return count
 39 | 
 40 | ops = {
 41 |     # binary ops
 42 |     'fadd' : lambda a,b: a + b,
 43 |     'faddnf' : lambda a,b: a + b,
 44 |     'fsub' : lambda a,b: a - b,
 45 |     'fmin' : np.minimum,
 46 |     'fmax' : np.maximum,
 47 |     'fmul' : lambda a,b: a * b,
 48 |     'fcmp' : lambda a,b: a - b,
 49 |     'vfpack' : lambda a,b: np.stack([a,b]).T.ravel(),
 50 |     'vfmin' : np.minimum,
 51 |     'vfmax' : np.maximum,
 52 |     'vfmul' : lambda a,b: a * b,
 53 | 
 54 |     'add' : lambda a,b: a + b,
 55 |     'sub' : lambda a,b: a - b,
 56 |     'imin' : np.minimum,
 57 |     'imax' : np.maximum,
 58 |     'umin' : np.minimum,
 59 |     'umax' : np.maximum,
 60 | 
 61 |     'shl' : lambda a,b: a << (b % 32),
 62 |     'shr' : lambda a,b: a >> (b % 32),
 63 |     'asr' : lambda a,b: a.astype(np.int32) >> (b % 32),
 64 |     'ror' : lambda a,b: np.vectorize(rotate_right)(a, b % 32),
 65 | 
 66 |     'band' : lambda a,b: a & b,
 67 |     'bor' : lambda a,b: a | b,
 68 |     'bxor' : lambda a,b: a ^ b,
 69 | 
 70 |     # unary ops
 71 |     'fmov' : lambda x: x,
 72 |     'fround' : np.round,
 73 |     'ftrunc' : np.trunc,
 74 |     'ffloor' : np.floor,
 75 |     'fceil' : np.ceil,
 76 |     'fdx' : lambda x: (x[1::2] - x[0::2]).repeat(2),
 77 |     'fdy' : lambda x: (lambda a: (a[1::2] - a[0::2]).ravel())(x.reshape(-1,2).repeat(2,axis=0).reshape(-1,4)),
 78 |     'ftoin': lambda x: x.round().astype(np.int32),
 79 |     'ftoiz': lambda x: np.float32(x).astype(np.int32),
 80 |     'ftouz': np.vectorize(lambda x: np.float32(x).astype(np.uint32) if x > -1 else 0),
 81 | 
 82 |     'bnot' : lambda x: ~x,
 83 |     'neg' : lambda x: -x,
 84 | 
 85 |     'itof' : lambda x: x.astype(np.float32),
 86 |     'clz' : np.vectorize(count_leading_zeros),
 87 |     'utof' : lambda x: x.astype(np.float32),
 88 | 
 89 |     # pack/unpack flags
 90 |     'l' : lambda x: x[0::2],
 91 |     'h' : lambda x: x[1::2],
 92 |     None : lambda x: x,
 93 |     'none' : lambda x: x,
 94 |     'abs' : np.abs,
 95 |     'r32' : lambda x: x.repeat(2),
 96 |     'rl2h' : lambda x: x[0::2].repeat(2),
 97 |     'rh2l' : lambda x: x[1::2].repeat(2),
 98 |     'swap' : lambda x: x.reshape(-1,2)[:,::-1].ravel(),
 99 | }
100 | 
101 | 
@qpu
def qpu_binary_ops(asm, bin_ops, dst_ops, src1_ops, src2_ops):
    """Emit a QPU program that applies every binary-op combination once.

    bin_ops  -- names of assembler instructions, looked up in globals().
    dst_ops  -- pack modifiers for the destination (None means unmodified).
    src1_ops -- unpack modifiers for the first operand (None means unmodified).
    src2_ops -- unpack modifiers for the second operand (None means unmodified).

    Three uniforms are consumed into rf0, rf1 and rf2; judging by the host
    code in this file they are the addresses of the two inputs and of the
    output.  One result vector is written per combination, in
    itertools.product order.
    """

    eidx(r0, sig = ldunif)
    mov(rf0, r5, sig = ldunif) # first input
    mov(rf1, r5, sig = ldunif)  # second input
    shl(r3, 4, 4).mov(rf2, r5)  # r3 = 64; rf2 = output

    # Offset every address by 4 * element index.
    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)
    add(rf2, rf2, r0)

    # Load the operands: r1 from rf0 and r2 from rf1.
    mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
    nop()
    mov(tmua, rf1, sig = thrsw).add(rf1, rf1, r3)
    nop(sig = ldtmu(r1))
    nop()
    nop(sig = ldtmu(r2))

    g = globals()
    for op, pack, unpack1, unpack2 in itertools.product(bin_ops, dst_ops, src1_ops, src2_ops):
        g[op](
            r0.pack(pack) if pack is not None else r0,
            r1.unpack(unpack1) if unpack1 is not None else r1,
            r2.unpack(unpack2) if unpack2 is not None else r2
        )
        # Store the result and advance the output address by r3.
        mov(tmud, r0)
        mov(tmua, rf2)
        tmuwt().add(rf2, rf2, r3)

    # Program end sequence.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
141 | 
def boilerplate_binary_ops(bin_ops, dst, src1, src2):
    """Run binary ops on the QPU and check them against the `ops` reference table.

    bin_ops -- list of instruction names to exercise.
    dst     -- (dtype, pack modifiers) for the destination.
    src1    -- (dtype, unpack modifiers) for the first operand.
    src2    -- (dtype, unpack modifiers) for the second operand.

    Every combination of op and modifiers is executed once.  Float results are
    compared with np.allclose (rtol=1e-2); integer results must match exactly.

    Raises ValueError if the destination dtype is not a float, int or uint
    type; previously such a dtype left the inputs uninitialised and silently
    skipped every check.
    """

    dst_dtype, dst_ops = dst
    src1_dtype, src1_ops = src1
    src2_dtype, src2_ops = src2

    # Classify the destination dtype once, before touching the device.
    dst_kind = np.dtype(dst_dtype).name
    is_float = dst_kind.startswith('float')
    is_int = dst_kind.startswith('int')
    is_uint = dst_kind.startswith('uint')
    if not (is_float or is_int or is_uint):
        raise ValueError('unsupported destination dtype: {}'.format(dst_kind))

    with Driver() as drv:

        cases = list(itertools.product(bin_ops, dst_ops, src1_ops, src2_ops))

        code = drv.program(lambda asm: qpu_binary_ops(asm, bin_ops, dst_ops, src1_ops, src2_ops))
        # Each buffer holds 16 * 4 bytes, so the element count depends on the
        # item size of its dtype.
        X1 = drv.alloc((16*4//np.dtype(src1_dtype).itemsize, ), dtype=src1_dtype)
        X2 = drv.alloc((16*4//np.dtype(src2_dtype).itemsize, ), dtype=src2_dtype)
        Y = drv.alloc((len(cases), 16*4//np.dtype(dst_dtype).itemsize), dtype=dst_dtype)
        unif = drv.alloc(3, dtype='uint32')

        if is_float:
            X1[:] = np.random.uniform(-(2**7), 2**7, X1.shape).astype(src1_dtype)
            X2[:] = np.random.uniform(-(2**7), 2**7, X2.shape).astype(src2_dtype)
        elif is_int:
            X1[:] = np.random.randint(-(2**31), 2**31, X1.shape, dtype=src1_dtype)
            X2[:] = np.random.randint(-(2**31), 2**31, X2.shape, dtype=src2_dtype)
        else:
            X1[:] = np.random.randint(0, 2**32, X1.shape, dtype=src1_dtype)
            X2[:] = np.random.randint(0, 2**32, X2.shape, dtype=src2_dtype)
        Y[:] = 0.0

        unif[0] = X1.addresses()[0]
        unif[1] = X2.addresses()[0]
        unif[2] = Y.addresses()[0,0]

        drv.execute(code, unif.addresses()[0])

        for ix, (bin_op, dst_op, src1_op, src2_op) in enumerate(cases):
            msg = '{}({}, {}, {})'.format(bin_op, dst_op, src1_op, src2_op)
            actual = ops[dst_op](Y[ix])
            expected = ops[bin_op](ops[src1_op](X1), ops[src2_op](X2))
            if is_float:
                assert np.allclose(actual, expected, rtol=1e-2), msg
            else:
                assert np.all(actual == expected), msg
183 | 
def test_binary_ops():
    """Exercise the binary ALU instructions over their pack/unpack variants."""

    # Scalar float ops: every destination pack with every pair of unpacks.
    float_ops = ['fadd', 'faddnf', 'fsub', 'fmin', 'fmax', 'fmul', 'fcmp']
    packs = [('float32', [None, 'none']), ('float16', ['l', 'h'])]
    unpacks = [('float32', [None, 'none', 'abs']), ('float16', ['l', 'h'])]
    for dst, src1, src2 in itertools.product(packs, unpacks, unpacks):
        boilerplate_binary_ops(float_ops, dst, src1, src2)

    # vfpack: float16 destination.
    packs = [('float16', [None, 'none'])]
    unpacks = [('float32', [None, 'none']), ('float16', ['l', 'h'])]
    for dst, src1, src2 in itertools.product(packs, unpacks, unpacks):
        boilerplate_binary_ops(['vfpack'], dst, src1, src2)

    # Vector float16 ops: only the first operand takes an unpack modifier.
    vector_ops = ['vfmin', 'vfmax', 'vfmul']
    packs = [('float16', [None, 'none'])]
    unpacks = [('float32', ['r32']), ('float16', ['rl2h', 'rh2l', 'swap'])]
    for dst, src1, src2 in itertools.product(packs, unpacks, packs):
        boilerplate_binary_ops(vector_ops, dst, src1, src2)

    # Integer ops: unmodified operands, one dtype throughout.
    integer_cases = [
        (['add', 'sub', 'imin', 'imax', 'asr'], 'int32'),
        (['add', 'sub', 'umin', 'umax'], 'uint32'),
        (['shl', 'shr', 'ror'], 'uint32'),
        (['band', 'bor', 'bxor'], 'uint32'),
    ]
    for op_names, dtype in integer_cases:
        boilerplate_binary_ops(
            op_names,
            (dtype, [None]), (dtype, [None]), (dtype, [None]),
        )
223 | 
224 | @qpu
225 | def qpu_unary_ops(asm, bin_ops, dst_ops, src_ops):
226 |     # Emits each op named in bin_ops (callers pass unary op names) once per (pack, unpack) pair.
227 |     eidx(r0, sig = ldunif)
228 |     mov(rf0, r5, sig = ldunif) # rf0 = input address (first uniform)
229 |     shl(r3, 4, 4).mov(rf1, r5) # r3 = 4 << 4 = 64 (bytes per output row); rf1 = output address (second uniform)
230 |     # Offset both addresses by element index * 4.
231 |     shl(r0, r0, 2)
232 |     add(rf0, rf0, r0)
233 |     add(rf1, rf1, r0)
234 |     # Read the input through the TMU; the ldtmu signal receives it in r1.
235 |     mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
236 |     nop()
237 |     nop()
238 |     nop(sig = ldtmu(r1))
239 |     # Per case: apply the op from r1 to r0, store r0, then advance the output address by r3.
240 |     g = globals()
241 |     for op, pack, unpack in itertools.product(bin_ops, dst_ops, src_ops):
242 |         g[op](
243 |             r0.pack(pack) if pack is not None else r0,
244 |             r1.unpack(unpack) if unpack is not None else r1,
245 |         )
246 |         mov(tmud, r0)
247 |         mov(tmua, rf1)
248 |         tmuwt().add(rf1, rf1, r3)
249 |     # End of program.
250 |     nop(sig = thrsw)
251 |     nop(sig = thrsw)
252 |     nop()
253 |     nop()
254 |     nop(sig = thrsw)
255 |     nop()
256 |     nop()
257 |     nop()
258 | 
259 | def boilerplate_unary_ops(uni_ops, dst, src):
260 | 
261 |     dst_dtype, dst_ops = dst
262 |     src_dtype, src_ops = src
263 | 
264 |     with Driver() as drv:
265 | 
266 |         cases = list(itertools.product(uni_ops, dst_ops, src_ops))
267 | 
268 |         code = drv.program(lambda asm: qpu_unary_ops(asm, uni_ops, dst_ops, src_ops))
269 |         X = drv.alloc((16*4//np.dtype(src_dtype).itemsize, ), dtype = src_dtype)
270 |         Y = drv.alloc((len(cases), 16*4//np.dtype(dst_dtype).itemsize), dtype = dst_dtype)
271 |         unif = drv.alloc(3, dtype = 'uint32')
272 | 
273 |         X[:] = np.random.uniform(-(2**15), 2**15, X.shape).astype(src_dtype)
274 |         Y[:] = 0.0
275 | 
276 |         unif[0] = X.addresses()[0]
277 |         unif[1] = Y.addresses()[0,0]
278 | 
279 |         start = time.time()
280 |         drv.execute(code, unif.addresses()[0])
281 |         end = time.time()
282 | 
283 |         for ix, (uni_op, dst_op, src_op) in enumerate(cases):
284 |             msg = '{}({}, {})'.format(uni_op, dst_op, src_op)
285 |             if np.dtype(dst_dtype).name.startswith('float'):
286 |                 assert np.allclose(ops[dst_op](Y[ix]), ops[uni_op](ops[src_op](X)), rtol=1e-2), msg
287 |             elif np.dtype(dst_dtype).name.startswith('int') or np.dtype(dst_dtype).name.startswith('uint'):
288 |                 assert np.all(ops[dst_op](Y[ix]) == ops[uni_op](ops[src_op](X))), msg
289 | 
290 | def test_unary_ops():
291 |     packs = [('float32', [None, 'none']), ('float16', ['l', 'h'])]
292 |     unpacks = [('float32', [None, 'none', 'abs']), ('float16', ['l', 'h'])]
293 |     for dst, src in itertools.product(packs, unpacks):
294 |         boilerplate_unary_ops(
295 |             ['fmov'],
296 |             dst, src,
297 |         )
298 |     packs = [('float32', [None, 'none']), ('float16', ['l', 'h'])]
299 |     unpacks = [('float32', [None, 'none']), ('float16', ['l', 'h'])]
300 |     for dst, src in itertools.product(packs, unpacks):
301 |         boilerplate_unary_ops(
302 |             ['fround', 'ftrunc', 'ffloor', 'fceil', 'fdx', 'fdy'],
303 |             dst, src,
304 |         )
305 |     packs = [('int32', [None, 'none'])]
306 |     unpacks = [('float32', [None, 'none']), ('float16', ['l', 'h'])]
307 |     for dst, src in itertools.product(packs, unpacks):
308 |         boilerplate_unary_ops(
309 |             ['ftoin', 'ftoiz'],
310 |             dst, src,
311 |         )
312 |     packs = [('uint32', [None, 'none'])]
313 |     unpacks = [('float32', [None, 'none']), ('float16', ['l', 'h'])]
314 |     for dst, src in itertools.product(packs, unpacks):
315 |         boilerplate_unary_ops(
316 |             ['ftouz'],
317 |             dst, src,
318 |         )
319 |     # TODO: 'ftoc': what is the meaning of this instruction ?
320 |     # packs = [('int32', ['none'])]
321 |     # unpacks = [('float32', ['none']), ('float16', ['l', 'h'])]
322 |     # for dst, src in itertools.product(packs, unpacks):
323 |     #     boilerplate_unary_ops(
324 |     #         ['ftoc'],
325 |     #         dst, src,
326 |     #     )
327 |     boilerplate_unary_ops(
328 |         ['bnot', 'neg'],
329 |         ('int32', [None]), ('int32', [None]),
330 |     )
331 |     boilerplate_unary_ops(
332 |         ['itof'],
333 |         ('float32', [None]), ('int32', [None]),
334 |     )
335 |     boilerplate_unary_ops(
336 |         ['clz'],
337 |         ('uint32', [None]), ('uint32', [None]),
338 |     )
339 |     boilerplate_unary_ops(
340 |         ['utof'],
341 |         ('float32', [None]), ('uint32', [None]),
342 |     )
343 | 


--------------------------------------------------------------------------------
/tests/test_branch.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2019-2020 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | import time
 24 | from videocore6.driver import Driver
 25 | from videocore6.assembler import qpu
 26 | import numpy as np
 27 | 
 28 | # branch (destination from relative imm)
 29 | @qpu
 30 | def qpu_branch_rel_imm(asm):
 31 |     # Loads X into r1, takes a relative branch over two of four increments, then stores r1 to Y.
 32 |     eidx(r0, sig = ldunifrf(rf0)) # r0 = element index; rf0 = first uniform (X address)
 33 |     nop(sig = ldunifrf(rf1)) # rf1 = second uniform (Y address)
 34 |     shl(r0, r0, 2) # byte offset of each lane: index * 4
 35 |     add(rf0, rf0, r0)
 36 |     add(rf1, rf1, r0)
 37 |     # Read X through the TMU; the ldtmu signal receives it in r1.
 38 |     mov(tmua, rf0, sig = thrsw)
 39 |     nop()
 40 |     nop()
 41 |     nop(sig = ldtmu(r1))
 42 |     # Branch by 2 instructions (8 bytes each), counted from the instruction after the three delay-slot nops.
 43 |     b(2*8, cond = 'always')
 44 |     nop()
 45 |     nop()
 46 |     nop()
 47 |     add(r1, r1, 1) # skipped
 48 |     add(r1, r1, 1) # skipped
 49 |     add(r1, r1, 1) # jump comes here
 50 |     add(r1, r1, 1)
 51 |     # Store r1 to Y through the TMU and wait for the write to finish.
 52 |     mov(tmud, r1)
 53 |     mov(tmua, rf1)
 54 |     tmuwt()
 55 |     # End of program.
 56 |     nop(sig = thrsw)
 57 |     nop(sig = thrsw)
 58 |     nop()
 59 |     nop()
 60 |     nop(sig = thrsw)
 61 |     nop()
 62 |     nop()
 63 |     nop()
 64 | 
 65 | def test_branch_rel_imm():
 66 | 
 67 |     with Driver() as drv:
 68 | 
 69 |         code = drv.program(qpu_branch_rel_imm)
 70 |         X = drv.alloc((16, ), dtype = 'uint32')
 71 |         Y = drv.alloc((16, ), dtype = 'uint32')
 72 |         unif = drv.alloc(3, dtype = 'uint32')
 73 | 
 74 |         X[:] = np.arange(16)
 75 |         Y[:] = 0.0
 76 | 
 77 |         unif[0] = X.addresses()[0]
 78 |         unif[1] = Y.addresses()[0]
 79 | 
 80 |         start = time.time()
 81 |         drv.execute(code, unif.addresses()[0])
 82 |         end = time.time()
 83 | 
 84 |         assert (Y == X + 2).all()
 85 | 
 86 | 
 87 | # branch (destination from absolute imm)
 88 | @qpu
 89 | def qpu_branch_abs_imm(asm, absimm):
 90 |     # Loads X into r1, branches to the absolute address absimm, then stores r1 to Y.
 91 |     eidx(r0, sig = ldunifrf(rf0)) # r0 = element index; rf0 = first uniform (X address)
 92 |     nop(sig = ldunifrf(rf1)) # rf1 = second uniform (Y address)
 93 |     shl(r0, r0, 2) # byte offset of each lane: index * 4
 94 |     add(rf0, rf0, r0)
 95 |     add(rf1, rf1, r0)
 96 |     # Read X through the TMU; the ldtmu signal receives it in r1.
 97 |     mov(tmua, rf0, sig = thrsw)
 98 |     nop()
 99 |     nop()
100 |     nop(sig = ldtmu(r1))
101 |     # Absolute branch; the caller is expected to pass the address of the marked add below.
102 |     b(absimm, absolute = True, cond = 'always')
103 |     nop()
104 |     nop()
105 |     nop()
106 |     add(r1, r1, 1)
107 |     add(r1, r1, 1)
108 |     add(r1, r1, 1) # jump comes here
109 |     add(r1, r1, 1)
110 |     # Store r1 to Y through the TMU and wait for the write to finish.
111 |     mov(tmud, r1)
112 |     mov(tmua, rf1)
113 |     tmuwt()
114 |     # End of program.
115 |     nop(sig = thrsw)
116 |     nop(sig = thrsw)
117 |     nop()
118 |     nop()
119 |     nop(sig = thrsw)
120 |     nop()
121 |     nop()
122 |     nop()
123 | 
124 | def test_branch_abs_imm():
125 | 
126 |     with Driver() as drv:
127 | 
128 |         @qpu
129 |         def qpu_dummy(asm):
130 |             nop()
131 |         dummy = drv.program(qpu_dummy)
132 |         code = drv.program(lambda asm: qpu_branch_abs_imm(asm, int(dummy.addresses()[0]+16*8)))
133 |         X = drv.alloc((16, ), dtype = 'uint32')
134 |         Y = drv.alloc((16, ), dtype = 'uint32')
135 |         unif = drv.alloc(3, dtype = 'uint32')
136 | 
137 |         X[:] = np.arange(16)
138 |         Y[:] = 0.0
139 | 
140 |         unif[0] = X.addresses()[0]
141 |         unif[1] = Y.addresses()[0]
142 | 
143 |         start = time.time()
144 |         drv.execute(code, unif.addresses()[0])
145 |         end = time.time()
146 | 
147 |         assert (Y == X + 2).all()
148 | 
149 | 
150 | # branch (destination from label)
151 | @qpu
152 | def qpu_branch_rel_label(asm):
153 |     # Loads X into r1, branches to label foo (skipping the first increment), then stores r1 to Y.
154 |     eidx(r0, sig = ldunifrf(rf0)) # r0 = element index; rf0 = first uniform (X address)
155 |     nop(sig = ldunifrf(rf1)) # rf1 = second uniform (Y address)
156 |     shl(r0, r0, 2) # byte offset of each lane: index * 4
157 |     add(rf0, rf0, r0)
158 |     add(rf1, rf1, r0)
159 |     # Read X through the TMU; the ldtmu signal receives it in r1.
160 |     mov(tmua, rf0, sig = thrsw)
161 |     nop()
162 |     nop()
163 |     nop(sig = ldtmu(r1))
164 |     # Branch to label foo; bar and baz are declared but never referenced in this kernel.
165 |     b(R.foo, cond = 'always')
166 |     nop()
167 |     nop()
168 |     nop()
169 |     add(r1, r1, 1) # skipped
170 |     L.foo
171 |     add(r1, r1, 1) # jump comes here
172 |     L.bar
173 |     add(r1, r1, 1)
174 |     L.baz
175 |     add(r1, r1, 1)
176 |     # Store r1 to Y through the TMU and wait for the write to finish.
177 |     mov(tmud, r1)
178 |     mov(tmua, rf1)
179 |     tmuwt()
180 |     # End of program.
181 |     nop(sig = thrsw)
182 |     nop(sig = thrsw)
183 |     nop()
184 |     nop()
185 |     nop(sig = thrsw)
186 |     nop()
187 |     nop()
188 |     nop()
189 | 
190 | def test_branch_rel_label():
191 | 
192 |     with Driver() as drv:
193 | 
194 |         code = drv.program(qpu_branch_rel_label)
195 |         X = drv.alloc((16, ), dtype = 'uint32')
196 |         Y = drv.alloc((16, ), dtype = 'uint32')
197 |         unif = drv.alloc(3, dtype = 'uint32')
198 | 
199 |         X[:] = np.arange(16)
200 |         Y[:] = 0.0
201 | 
202 |         unif[0] = X.addresses()[0]
203 |         unif[1] = Y.addresses()[0]
204 | 
205 |         start = time.time()
206 |         drv.execute(code, unif.addresses()[0])
207 |         end = time.time()
208 | 
209 |         assert (Y == X + 3).all()
210 | 
211 | 
212 | # branch (destination from regfile)
213 | @qpu
214 | def qpu_branch_abs_reg(asm):
215 |     # Loads a branch target from X into rf2, clears r1, branches to rf2, then stores r1 to Y.
216 |     eidx(r0, sig = ldunifrf(rf0)) # r0 = element index; rf0 = first uniform (X address)
217 |     nop(sig = ldunifrf(rf1)) # rf1 = second uniform (Y address)
218 |     shl(r0, r0, 2) # byte offset of each lane: index * 4
219 |     add(rf0, rf0, r0)
220 |     add(rf1, rf1, r0)
221 |     # Read X through the TMU; the ldtmu signal receives it in rf2.
222 |     mov(tmua, rf0, sig = thrsw)
223 |     nop()
224 |     nop()
225 |     nop(sig = ldtmu(rf2))
226 |     # Branch to the address held in rf2; r1 counts the increments executed after the branch.
227 |     mov(r1, 0)
228 |     b(rf2, cond = 'always')
229 |     nop()
230 |     nop()
231 |     nop()
232 |     L.label
233 |     add(r1, r1, 1)
234 |     add(r1, r1, 1)
235 |     add(r1, r1, 1)
236 |     add(r1, r1, 1) # jump comes here
237 |     # Store r1 to Y through the TMU and wait for the write to finish.
238 |     mov(tmud, r1)
239 |     mov(tmua, rf1)
240 |     tmuwt()
241 |     # End of program.
242 |     nop(sig = thrsw)
243 |     nop(sig = thrsw)
244 |     nop()
245 |     nop()
246 |     nop(sig = thrsw)
247 |     nop()
248 |     nop()
249 |     nop()
250 | 
251 | def test_branch_abs_reg():
252 | 
253 |     with Driver() as drv:
254 | 
255 |         code = drv.program(qpu_branch_abs_reg)
256 |         X = drv.alloc((16, ), dtype = 'uint32')
257 |         Y = drv.alloc((16, ), dtype = 'uint32')
258 |         unif = drv.alloc(3, dtype = 'uint32')
259 | 
260 |         X[:] = code.addresses()[0] + 17*8
261 |         Y[:] = 0.0
262 | 
263 |         unif[0] = X.addresses()[0]
264 |         unif[1] = Y.addresses()[0]
265 | 
266 |         start = time.time()
267 |         drv.execute(code, unif.addresses()[0])
268 |         end = time.time()
269 | 
270 |         assert (Y == 1).all()
271 | 
272 | 
273 | # branch (destination from link_reg)
274 | @qpu
275 | def qpu_branch_link_reg(asm, set_subroutine_link, use_link_reg_direct):
276 |     # Calls a subroutine once and returns through the link register; rf3 counts passes through init_link.
277 |     eidx(r0, sig = ldunifrf(rf0)) # r0 = element index; rf0 = first uniform (X address)
278 |     nop(sig = ldunifrf(rf1)) # rf1 = second uniform (Y address)
279 |     shl(r0, r0, 2) # byte offset of each lane: index * 4
280 |     add(rf0, rf0, r0)
281 |     add(rf1, rf1, r0)
282 |     # Read X through the TMU; the ldtmu signal receives it in r2.
283 |     mov(tmua, rf0, sig = thrsw)
284 |     nop()
285 |     nop()
286 |     nop(sig = ldtmu(r2))
287 |     # rf2 marks whether the subroutine was already called; the first branch only sets the link register.
288 |     mov(rf2, 0)
289 |     mov(rf3, 0)
290 |     b(R.init_link, cond = 'always', set_link = True)
291 |     nop() # delay slot
292 |     nop() # delay slot
293 |     nop() # delay slot
294 |     L.init_link
295 | 
296 |     # subroutine returns to here if set_subroutine_link is False.
297 |     add(rf3, rf3, 1)
298 | 
299 |     # jump to subroutine once.
300 |     mov(null, rf2, cond = 'pushz') # flag A is set while rf2 == 0, i.e. before the first call
301 |     b(R.subroutine, cond = 'alla', set_link = set_subroutine_link)
302 |     mov(rf2, 1) # delay slot
303 |     nop()       # delay slot
304 |     nop()       # delay slot
305 | 
306 |     # subroutine returns to here if set_subroutine_link is True.
307 |     shl(r1, 4, 4) # r1 = 4 << 4 = 64, the byte stride between output rows
308 |     mov(tmud, rf3) # rf3 will be 1 if set_subroutine_link, else 2.
309 |     mov(tmua, rf1).add(rf1, rf1, r1)
310 |     tmuwt()
311 |     # End of program.
312 |     nop(sig = thrsw)
313 |     nop(sig = thrsw)
314 |     nop()
315 |     nop()
316 |     nop(sig = thrsw)
317 |     nop()
318 |     nop()
319 |     nop()
320 | 
321 |     L.subroutine
322 |     # Store r2 (the loaded X values) to the current output row and advance rf1 to the next row.
323 |     shl(r1, 4, 4)
324 |     mov(tmud, r2)
325 |     mov(tmua, rf1).add(rf1, rf1, r1)
326 |     tmuwt()
327 |     # Return either by branching on the link register itself or on a copy read with lr.
328 |     if use_link_reg_direct:
329 |         b(link, cond = 'always')
330 |     else:
331 |         lr(rf32) # lr instruction reads link register
332 |         b(rf32, cond = 'always')
333 |     nop() # delay slot
334 |     nop() # delay slot
335 |     nop() # delay slot
336 | 
337 | def test_branch_link_reg():
338 | 
339 |     for set_subroutine_link, expected in [(False, 2), (True, 1)]:
340 |         for use_link_reg_direct in [False, True]:
341 |             with Driver() as drv:
342 | 
343 |                 code = drv.program(lambda asm: qpu_branch_link_reg(asm, set_subroutine_link, use_link_reg_direct))
344 |                 X = drv.alloc(16, dtype = 'uint32')
345 |                 Y = drv.alloc((2, 16), dtype = 'uint32')
346 |                 unif = drv.alloc(2, dtype = 'uint32')
347 | 
348 |                 X[:] = (np.random.randn(16) * 1024).astype('uint32')
349 |                 Y[:] = 0.0
350 | 
351 |                 unif[0] = X.addresses()[0]
352 |                 unif[1] = Y.addresses()[0,0]
353 | 
354 |                 start = time.time()
355 |                 drv.execute(code, unif.addresses()[0])
356 |                 end = time.time()
357 | 
358 |                 assert (Y[0] == X).all()
359 |                 assert (Y[1] == expected).all()
360 | 
361 | 
362 | # uniform branch (destination from uniform relative value)
363 | @qpu
364 | def qpu_uniform_branch_rel(asm):
365 |     # Branches to the next label while redirecting the uniform stream, then stores the next uniform to Y.
366 |     eidx(r0, sig = ldunifrf(rf0)) # r0 = element index; rf0 = first uniform (Y address)
367 |     shl(r0, r0, 2) # byte offset of each lane: index * 4
368 |     add(rf0, rf0, r0)
369 |     # unif_addr() without arguments: the host test supplies a relative uniform offset as the next uniform.
370 |     b(R.label, cond = 'always').unif_addr()
371 |     nop()
372 |     nop()
373 |     nop()
374 |     L.label
375 |     nop(sig = ldunifrf(tmud)) # the next uniform becomes the TMU write data
376 |     mov(tmua, rf0)
377 |     tmuwt()
378 |     # End of program.
379 |     nop(sig = thrsw)
380 |     nop(sig = thrsw)
381 |     nop()
382 |     nop()
383 |     nop(sig = thrsw)
384 |     nop()
385 |     nop()
386 |     nop()
387 | 
388 | def test_uniform_branch_rel():
389 | 
390 |     with Driver() as drv:
391 | 
392 |         code = drv.program(qpu_uniform_branch_rel)
393 |         Y = drv.alloc((16, ), dtype = 'uint32')
394 |         unif = drv.alloc(5, dtype = 'uint32')
395 | 
396 |         Y[:] = 0.0
397 | 
398 |         unif[0] = Y.addresses()[0]
399 |         unif[1] = 8 # relative address for uniform branch
400 |         unif[2] = 5
401 |         unif[3] = 6
402 |         unif[4] = 7 # uniform branch point here
403 | 
404 |         start = time.time()
405 |         drv.execute(code, unif.addresses()[0])
406 |         end = time.time()
407 | 
408 |         assert (Y == 7).all()
409 | 
410 | 
411 | # uniform branch (destination from uniform absolute value)
412 | @qpu
413 | def qpu_uniform_branch_abs(asm):
414 |     # Branches to the next label while redirecting the uniform stream, then stores the next uniform to Y.
415 |     eidx(r0, sig = ldunifrf(rf0)) # r0 = element index; rf0 = first uniform (Y address)
416 |     shl(r0, r0, 2) # byte offset of each lane: index * 4
417 |     add(rf0, rf0, r0)
418 |     # unif_addr(absolute = True): the host test supplies an absolute uniform address as the next uniform.
419 |     b(R.label, cond = 'always').unif_addr(absolute = True)
420 |     nop()
421 |     nop()
422 |     nop()
423 |     L.label
424 |     nop(sig = ldunifrf(tmud)) # the next uniform becomes the TMU write data
425 |     mov(tmua, rf0)
426 |     tmuwt()
427 |     # End of program.
428 |     nop(sig = thrsw)
429 |     nop(sig = thrsw)
430 |     nop()
431 |     nop()
432 |     nop(sig = thrsw)
433 |     nop()
434 |     nop()
435 |     nop()
436 | 
437 | def test_uniform_branch_abs():
438 | 
439 |     with Driver() as drv:
440 | 
441 |         code = drv.program(qpu_uniform_branch_abs)
442 |         Y = drv.alloc((16, ), dtype = 'uint32')
443 |         unif = drv.alloc(5, dtype = 'uint32')
444 | 
445 |         Y[:] = 0.0
446 | 
447 |         unif[0] = Y.addresses()[0]
448 |         unif[1] = unif.addresses()[3] # absolute address for uniform branch
449 |         unif[2] = 5
450 |         unif[3] = 6 # uniform branch point here
451 |         unif[4] = 7
452 | 
453 |         start = time.time()
454 |         drv.execute(code, unif.addresses()[0])
455 |         end = time.time()
456 | 
457 |         assert (Y == 6).all()
458 | 
459 | 
460 | # uniform branch (destination from register)
461 | @qpu
462 | def qpu_uniform_branch_reg(asm):
463 |     # Loads a uniform address from X into rf2, redirects the uniform stream to it, then stores the next uniform to Y.
464 | 
465 |     eidx(r0, sig = ldunifrf(rf0)) # r0 = element index; rf0 = first uniform (X address)
466 |     nop(sig = ldunifrf(rf1)) # rf1 = second uniform (Y address)
467 |     shl(r0, r0, 2) # byte offset of each lane: index * 4
468 |     add(rf0, rf0, r0)
469 |     add(rf1, rf1, r0)
470 |     # Read X through the TMU; the ldtmu signal receives it in rf2.
471 |     mov(tmua, rf0, sig = thrsw)
472 |     nop()
473 |     nop()
474 |     nop(sig = ldtmu(rf2))
475 |     # unif_addr(rf2): the new uniform address is taken from register rf2.
476 |     b(R.label, cond = 'always').unif_addr(rf2)
477 |     nop()
478 |     nop()
479 |     nop()
480 |     L.label
481 |     nop(sig = ldunifrf(rf3)) # rf3 = next uniform after the redirect
482 |     mov(tmud, rf3)
483 |     mov(tmua, rf1)
484 |     tmuwt()
485 |     # End of program.
486 |     nop(sig = thrsw)
487 |     nop(sig = thrsw)
488 |     nop()
489 |     nop()
490 |     nop(sig = thrsw)
491 |     nop()
492 |     nop()
493 |     nop()
494 | 
495 | def test_uniform_branch_reg():
496 | 
497 |     with Driver() as drv:
498 | 
499 |         code = drv.program(qpu_uniform_branch_reg)
500 |         X = drv.alloc((16, ), dtype = 'uint32')
501 |         Y = drv.alloc((16, ), dtype = 'uint32')
502 |         unif = drv.alloc(6, dtype = 'uint32')
503 | 
504 |         X[1] = unif.addresses()[4] # absolute address for uniform branch
505 |         Y[:] = 0.0
506 | 
507 |         unif[0] = X.addresses()[0]
508 |         unif[1] = Y.addresses()[0]
509 |         unif[2] = 3
510 |         unif[3] = 4
511 |         unif[4] = 5 # uniform branch point here
512 |         unif[5] = 6
513 | 
514 |         start = time.time()
515 |         drv.execute(code, unif.addresses()[0])
516 |         end = time.time()
517 | 
518 |         assert (Y == 5).all()
519 | 


--------------------------------------------------------------------------------
/tests/test_condition_codes.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2019-2020 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | 
 24 | import time
 25 | from videocore6.driver import Driver
 26 | from videocore6.assembler import qpu
 27 | import numpy as np
 28 | 
 29 | 
 30 | # `cond = 'push*'` sets the conditional flag A
 31 | @qpu
 32 | def qpu_cond_push_a(asm):
 33 |     # For each (push, if) pair: push a flag from eidx - 10, then store two rows of 0/1 selected by the if-condition.
 34 |     eidx(r0, sig = ldunif)
 35 |     mov(r2, r5) # r2 = output address (first uniform)
 36 |     shl(r0, r0, 2)
 37 |     add(r2, r2, r0)
 38 |     shl(r1, 4, 4) # r1 = 4 << 4 = 64, the byte stride between output rows
 39 |     # (condition that pushes the flag, condition used to read it back)
 40 |     cond_pairs = [
 41 |         ('pushz', 'ifa'),
 42 |         ('pushn', 'ifna'),
 43 |         ('pushc', 'ifa'),
 44 |     ]
 45 |     # Two rows per pair: the conditional move issued alone, then chained after a nop in one instruction.
 46 |     for cond_push, cond_if in cond_pairs:
 47 |         eidx(r0)
 48 |         sub(r0, r0, 10, cond = cond_push)
 49 |         mov(r0, 0)
 50 |         mov(r0, 1, cond = cond_if)
 51 |         mov(tmud, r0)
 52 |         mov(tmua, r2)
 53 |         tmuwt().add(r2, r2, r1)
 54 |         mov(r0, 0)
 55 |         nop().mov(r0, 1, cond = cond_if) # NOTE(review): presumably issues the move on the second (mul) ALU - confirm
 56 |         mov(tmud, r0)
 57 |         mov(tmua, r2)
 58 |         tmuwt().add(r2, r2, r1)
 59 |     # End of program.
 60 |     nop(sig = thrsw)
 61 |     nop(sig = thrsw)
 62 |     nop()
 63 |     nop()
 64 |     nop(sig = thrsw)
 65 |     nop()
 66 |     nop()
 67 |     nop()
 68 | 
 69 | def test_cond_push_a():
 70 | 
 71 |     with Driver() as drv:
 72 | 
 73 |         code = drv.program(qpu_cond_push_a)
 74 |         data = drv.alloc((6, 16), dtype = 'uint32')
 75 |         unif = drv.alloc(1, dtype = 'uint32')
 76 | 
 77 |         data[:] = 0
 78 | 
 79 |         unif[0] = data.addresses()[0,0]
 80 | 
 81 |         start = time.time()
 82 |         drv.execute(code, unif.addresses()[0])
 83 |         end = time.time()
 84 | 
 85 |         pushz_if_expected = np.zeros((16,), dtype = 'uint32')
 86 |         pushz_if_expected[10] = 1
 87 | 
 88 |         pushn_ifn_expected = np.zeros((16,), dtype = 'uint32')
 89 |         pushn_ifn_expected[10:] = 1
 90 | 
 91 |         pushc_if_expected = np.zeros((16,), dtype = 'uint32')
 92 |         pushc_if_expected[:10] = 1
 93 | 
 94 |         assert (data[0] == pushz_if_expected).all()
 95 |         assert (data[1] == pushz_if_expected).all()
 96 |         assert (data[2] == pushn_ifn_expected).all()
 97 |         assert (data[3] == pushn_ifn_expected).all()
 98 |         assert (data[4] == pushc_if_expected).all()
 99 |         assert (data[5] == pushc_if_expected).all()
100 | 
101 | # `cond = 'push*'` moves the old conditional flag A to B
102 | @qpu
103 | def qpu_cond_push_b(asm):
104 |     # Three successive pushz; after each, rows store eidx with the lanes selected by flag A (then flag B) zeroed.
105 |     eidx(r0, sig = ldunif)
106 |     mov(r2, r5) # r2 = output address (first uniform)
107 |     shl(r0, r0, 2)
108 |     add(r2, r2, r0)
109 |     shl(r1, 4, 4) # r1 = 4 << 4 = 64, the byte stride between output rows
110 |     # 1st push: flag A is set where eidx == 10.
111 |     eidx(r0)
112 |     sub(null, r0, 10, cond = 'pushz')
113 |     mov(r0, 0, cond = 'ifa')
114 |     eidx(r0).mov(tmud, r0) # stores the masked r0 while r0 is reset to the element index
115 |     mov(tmua, r2)
116 |     tmuwt().add(r2, r2, r1)
117 |     # 2nd push: flag A is set where eidx == 5; the host test expects flag B to hold the 1st push's result.
118 |     eidx(r0)
119 |     sub(null, r0, 5, cond = 'pushz')
120 |     mov(r0, 0, cond = 'ifa')
121 |     eidx(r0).mov(tmud, r0)
122 |     mov(tmua, r2)
123 |     tmuwt().add(r2, r2, r1)
124 |     mov(r0, 0, cond = 'ifb')
125 |     eidx(r0).mov(tmud, r0)
126 |     mov(tmua, r2)
127 |     tmuwt().add(r2, r2, r1)
128 |     # 3rd push: flag A is set where eidx == 1; the host test expects flag B to hold the 2nd push's result.
129 |     eidx(r0)
130 |     sub(null, r0, 1, cond = 'pushz')
131 |     mov(r0, 0, cond = 'ifa')
132 |     eidx(r0).mov(tmud, r0)
133 |     mov(tmua, r2)
134 |     tmuwt().add(r2, r2, r1)
135 |     mov(r0, 0, cond = 'ifb')
136 |     eidx(r0).mov(tmud, r0)
137 |     mov(tmua, r2)
138 |     tmuwt().add(r2, r2, r1)
139 |     # End of program.
140 |     nop(sig = thrsw)
141 |     nop(sig = thrsw)
142 |     nop()
143 |     nop()
144 |     nop(sig = thrsw)
145 |     nop()
146 |     nop()
147 |     nop()
148 | 
149 | def test_cond_push_b():
150 | 
151 |     with Driver() as drv:
152 | 
153 |         code = drv.program(qpu_cond_push_b)
154 |         data = drv.alloc((5, 16), dtype = 'uint32')
155 |         unif = drv.alloc(1, dtype = 'uint32')
156 | 
157 |         data[:] = 0
158 | 
159 |         unif[0] = data.addresses()[0,0]
160 | 
161 |         start = time.time()
162 |         drv.execute(code, unif.addresses()[0])
163 |         end = time.time()
164 | 
165 |         push0 = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0,11,12,13,14,15]
166 |         push1 = [ 0, 1, 2, 3, 4, 0, 6, 7, 8, 9,10,11,12,13,14,15]
167 |         push2 = [ 0, 0, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15]
168 | 
169 |         expected = np.array(
170 |             #  pushz
171 |             [push0,  # ifa
172 |              # pushz
173 |              push1,  # ifa
174 |              push0,  # ifb
175 |              # pushz
176 |              push2,  # ifa
177 |              push1], # ifb
178 |             dtype = 'uint32'
179 |         )
180 | 
181 |         assert (data == expected).all()
182 | 
183 | # `cond = '{and,nor}*'` updates the conditional flag A and does not affect flag B
184 | @qpu
185 | def qpu_cond_update(asm, cond_update_flags):
186 |     # For each flag name: push flag A from eidx & 1, apply the update with eidx - 5, store one row of 0/1 read via ifa.
187 |     eidx(r0, sig = ldunif)
188 |     mov(r2, r5) # r2 = output address (first uniform)
189 |     shl(r0, r0, 2)
190 |     add(r2, r2, r0)
191 |     shl(r1, 4, 4) # r1 = 4 << 4 = 64, the byte stride between output rows
192 |     # First pass: the update condition is attached to a sub issued on its own.
193 |     for cond_update_flag in cond_update_flags:
194 |         eidx(r0)
195 |         band(r0, r0, 1, cond = 'pushz') # fla = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
196 |         eidx(r0)
197 |         sub(null, r0, 5, cond = cond_update_flag)
198 |         mov(r0, 0)
199 |         mov(r0, 1, cond = 'ifa')
200 |         mov(tmud, r0)
201 |         mov(tmua, r2)
202 |         tmuwt().add(r2, r2, r1)
203 |     # Second pass: the same update attached to a sub chained onto an add in one instruction.
204 |     for cond_update_flag in cond_update_flags:
205 |         eidx(r0)
206 |         band(r0, r0, 1, cond = 'pushz')
207 |         eidx(r0)
208 |         add(r3, r0, r0).sub(r0, r0, 5, cond = cond_update_flag)
209 |         mov(r0, 0)
210 |         mov(r0, 1, cond = 'ifa')
211 |         mov(tmud, r0)
212 |         mov(tmua, r2)
213 |         tmuwt().add(r2, r2, r1)
214 |     # End of program.
215 |     nop(sig = thrsw)
216 |     nop(sig = thrsw)
217 |     nop()
218 |     nop()
219 |     nop(sig = thrsw)
220 |     nop()
221 |     nop()
222 |     nop()
223 | 
224 | def test_cond_update():
225 | 
226 |     cond_update_flags = [
227 |         'andz',
228 |         'andnz',
229 |         'nornz',
230 |         'norz',
231 |         'andn',
232 |         'andnn',
233 |         'nornn',
234 |         'norn',
235 |         'andc',
236 |         'andnc',
237 |         'nornc',
238 |         'norc',
239 |     ]
240 | 
241 |     def cond_update_op(cond_update_flag):
242 |         bin_op = [
243 |             lambda a,b: np.logical_not(np.logical_or(a, b)),
244 |             np.logical_and
245 |         ][cond_update_flag[:3] == 'and']
246 |         b_op = lambda b: [b < 0, b == 0][cond_update_flag[-1] == 'z']
247 |         not_op = [lambda x: x, np.logical_not][cond_update_flag[3:-1] == 'n']
248 |         return lambda a,b: bin_op(a, not_op(b_op(b)))
249 | 
250 |     with Driver() as drv:
251 | 
252 |         code = drv.program(lambda asm: qpu_cond_update(asm, cond_update_flags))
253 |         data = drv.alloc((24, 16), dtype = 'uint32')
254 |         unif = drv.alloc(1, dtype = 'uint32')
255 | 
256 |         data[:] = 0
257 | 
258 |         unif[0] = data.addresses()[0,0]
259 | 
260 |         start = time.time()
261 |         drv.execute(code, unif.addresses()[0])
262 |         end = time.time()
263 | 
264 |         a = np.array([1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]) > 0
265 |         b = np.arange(16) - 5
266 | 
267 |         for ix, cond_update_flag in enumerate(cond_update_flags):
268 |             assert np.all(data[ix] == cond_update_op(cond_update_flag)(a, b))
269 | 
270 | # dual `cond=''` instruction
271 | @qpu
272 | def qpu_cond_combination(asm):
273 |     # Four sections, two stored rows each, exercising instructions whose two operations both carry a condition.
274 |     eidx(r0, sig = ldunif)
275 |     mov(r2, r5) # r2 = output address (first uniform)
276 |     shl(r0, r0, 2)
277 |     add(r2, r2, r0)
278 |     shl(r1, 4, 4) # r1 = 4 << 4 = 64, the byte stride between output rows
279 | 
280 |     # if / push
281 |     eidx(r0)
282 |     sub(r0, r0, 10, cond = 'pushz')
283 |     eidx(r0)
284 |     mov(r0, 5, cond = 'ifa').sub(r3, r0, 5, cond = 'pushn') # per the host test, ifa still sees the earlier pushz
285 |     mov(tmud, r0)
286 |     mov(tmua, r2)
287 |     tmuwt().add(r2, r2, r1)
288 |     eidx(r0)
289 |     mov(r0, 0, cond = 'ifa')
290 |     mov(tmud, r0)
291 |     mov(tmua, r2)
292 |     tmuwt().add(r2, r2, r1)
293 | 
294 |     # push / if
295 |     eidx(r0)
296 |     sub(r0, r0, 10, cond = 'pushz')
297 |     eidx(r0)
298 |     sub(null, r0, 5, cond = 'pushn').mov(r0, 5, cond = 'ifa') # per the host test, ifa still sees the earlier pushz
299 |     mov(tmud, r0)
300 |     mov(tmua, r2)
301 |     tmuwt().add(r2, r2, r1)
302 |     eidx(r0)
303 |     mov(r0, 0, cond = 'ifa')
304 |     mov(tmud, r0)
305 |     mov(tmua, r2)
306 |     tmuwt().add(r2, r2, r1)
307 | 
308 |     # if / if
309 |     eidx(r0)
310 |     sub(null, r0, 10, cond = 'pushn')
311 |     eidx(r3)
312 |     mov(r0, 0, cond = 'ifna').mov(r3, 0, cond = 'ifna')
313 |     mov(tmud, r0)
314 |     mov(tmua, r2)
315 |     tmuwt().add(r2, r2, r1)
316 |     mov(tmud, r3)
317 |     mov(tmua, r2)
318 |     tmuwt().add(r2, r2, r1)
319 | 
320 |     # update / if
321 |     eidx(r0)
322 |     sub(null, r0, 10, cond = 'pushn')
323 |     eidx(r3)
324 |     sub(null, r0, 5, cond = 'andn').mov(r3, 5, cond = 'ifa')
325 |     eidx(r0)
326 |     mov(r0, 0, cond = 'ifa')
327 |     mov(tmud, r0)
328 |     mov(tmua, r2)
329 |     tmuwt().add(r2, r2, r1)
330 |     mov(tmud, r3)
331 |     mov(tmua, r2)
332 |     tmuwt().add(r2, r2, r1)
333 |     # End of program.
334 |     nop(sig = thrsw)
335 |     nop(sig = thrsw)
336 |     nop()
337 |     nop()
338 |     nop(sig = thrsw)
339 |     nop()
340 |     nop()
341 |     nop()
342 | 
343 | def test_cond_combination():
344 | 
345 |     with Driver() as drv:
346 | 
347 |         code = drv.program(qpu_cond_combination)
348 |         data = drv.alloc((8, 16), dtype = 'uint32')
349 |         unif = drv.alloc(1, dtype = 'uint32')
350 | 
351 |         data[:] = 0
352 | 
353 |         unif[0] = data.addresses()[0,0]
354 | 
355 |         start = time.time()
356 |         drv.execute(code, unif.addresses()[0])
357 |         end = time.time()
358 | 
359 |         expected = np.array(
360 |             [[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 5,11,12,13,14,15],
361 |              [ 0, 0, 0, 0, 0, 5, 6, 7, 8, 9,10,11,12,13,14,15],
362 |              [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 5,11,12,13,14,15],
363 |              [ 0, 0, 0, 0, 0, 5, 6, 7, 8, 9,10,11,12,13,14,15],
364 |              [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0],
365 |              [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0],
366 |              [ 0, 0, 0, 0, 0, 5, 6, 7, 8, 9,10,11,12,13,14,15],
367 |              [ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,10,11,12,13,14,15]],
368 |             dtype = 'uint32'
369 |         )
370 | 
371 |         assert (data == expected).all()
372 | 
373 | 
374 | # vflx instructions read a condition flag as int16
375 | @qpu
376 | def qpu_cond_vflx(asm, ops):
377 |     # Sets up known patterns in flags A and B, then stores one row per op named in ops.
378 |     eidx(r0, sig = ldunif)
379 |     mov(r2, r5) # r2 = output address (first uniform)
380 |     shl(r0, r0, 2)
381 |     add(r2, r2, r0)
382 |     shl(r1, 4, 4) # r1 = 4 << 4 = 64, the byte stride between output rows
383 | 
384 |     # init fla/flb
385 |     bxor(rf0, rf0, rf0).sub(rf1, rf1, rf1) # zeroes rf0 and rf1 (x ^ x, x - x)
386 |     eidx(r0)
387 |     band(null, r0, 1 << 0, cond = 'pushz') # a = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
388 |     band(null, r0, 1 << 1, cond = 'pushz') # a = [1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0], b = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
389 | 
390 |     # run each op named in ops (looked up by name) into r0 and store the result row
391 |     g = globals()
392 |     for op in ops:
393 |         g[op](r0)
394 |         mov(tmud, r0)
395 |         mov(tmua, r2)
396 |         tmuwt().add(r2, r2, r1)
397 |     # End of program.
398 |     nop(sig = thrsw)
399 |     nop(sig = thrsw)
400 |     nop()
401 |     nop()
402 |     nop(sig = thrsw)
403 |     nop()
404 |     nop()
405 |     nop()
406 | 
def test_cond_vflx():
    """Check that vfla/vflna/vflb/vflnb expose the condition flags as int16 pairs.

    The kernel stores one 16-lane vector of 32-bit words per instruction; the
    host views each row as 32 int16 values, so every flag appears twice.
    """

    def expected(op):
        # Flag patterns set up by the kernel: a = [1,1,0,0,...], b = [1,0,1,0,...].
        if op[-1] == 'b':
            pattern = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
        else:
            pattern = [1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0]
        result = np.array(pattern, dtype='int16').repeat(2)
        # 'vflna' / 'vflnb' read the negated flag.
        if op[3:-1] == 'n':
            result = 1 - result
        return result

    ops = [
        'vfla',
        'vflna',
        'vflb',
        'vflnb',
    ]

    with Driver() as drv:

        code = drv.program(lambda asm: qpu_cond_vflx(asm, ops))
        data = drv.alloc((len(ops), 32), dtype='int16')
        unif = drv.alloc(1, dtype='uint32')

        data[:] = 0

        unif[0] = data.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        for ix, op in enumerate(ops):
            assert (data[ix] == expected(op)).all()
441 | 
442 | 
# vflx instructions read a condition flag as int16
# NOTE(review): a function with this same name is already defined earlier in
# this file; this later definition replaces it when the module is imported.
@qpu
def qpu_cond_vflx(asm, ops):
    """Emit a kernel that stores the result of every vflx instruction named in `ops`.

    `ops` is an iterable of instruction names (strings).  Each name is looked
    up in this module's globals and called with `r0` as its only operand; the
    resulting 16-lane vector is then written to memory.  The kernel consumes a
    single uniform: the address of the output buffer.
    """

    eidx(r0, sig = ldunif)  # r0 = element index; the uniform requested here is read from r5 below
    mov(r2, r5)             # r2 = output base address
    shl(r0, r0, 2)          # r0 = element index * 4
    add(r2, r2, r0)         # r2 = per-lane store address
    shl(r1, 4, 4)           # r1 = 4 << 4 = 64, the amount r2 advances after each store

    # init fla/flb
    # NOTE(review): rf0/rf1 are zeroed here (x ^ x, x - x) but never read again in this kernel.
    bxor(rf0, rf0, rf0).sub(rf1, rf1, rf1)
    eidx(r0)
    band(null, r0, 1 << 0, cond = 'pushz') # a = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
    band(null, r0, 1 << 1, cond = 'pushz') # a = [1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0], b = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0] (matches test_cond_vflx)

    # vflx: emit each requested instruction and store its result
    g = globals()  # instruction functions are resolved by name; presumably made available by @qpu
    for op in ops:
        g[op](r0)
        mov(tmud, r0)            # data to store
        mov(tmua, r2)            # store address
        tmuwt().add(r2, r2, r1)  # step r2 to the next output row

    # Trailing thrsw/nop sequence used by every kernel in these tests;
    # presumably the required thread-end sequence -- confirm against the assembler.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
475 | 
def test_cond_vflx():
    """Check that vfla/vflna/vflb/vflnb expose the condition flags as int16 pairs.

    The kernel stores one 16-lane vector of 32-bit words per instruction; the
    host views each row as 32 int16 values, so every flag appears twice.

    NOTE(review): a test with this same name is defined earlier in this file;
    this later definition shadows it, so only this one is collected.
    """

    def expected(op):
        # Flag patterns set up by the kernel: a = [1,1,0,0,...], b = [1,0,1,0,...].
        if op[-1] == 'b':
            pattern = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
        else:
            pattern = [1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0]
        result = np.array(pattern, dtype='int16').repeat(2)
        # 'vflna' / 'vflnb' read the negated flag.
        if op[3:-1] == 'n':
            result = 1 - result
        return result

    ops = [
        'vfla',
        'vflna',
        'vflb',
        'vflnb',
    ]

    with Driver() as drv:

        code = drv.program(lambda asm: qpu_cond_vflx(asm, ops))
        data = drv.alloc((len(ops), 32), dtype='int16')
        unif = drv.alloc(1, dtype='uint32')

        data[:] = 0

        unif[0] = data.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        for ix, op in enumerate(ops):
            assert (data[ix] == expected(op)).all()
510 | 
511 | 
@qpu
def qpu_cond_flx(asm, ops):
    """Emit a kernel that applies each flag push/pop instruction named in `ops`.

    Three uniforms are consumed, in order: two load addresses and one store
    address.  A vector is loaded from each load address; the second one is
    used to set the condition flags and the first one is the source operand of
    every instruction in `ops`.  Each instruction writes straight to `tmud`
    and its result is stored as one row at the store address.
    """

    eidx(r0, sig = ldunif)
    mov(rf0, r5, sig = ldunif) # in: first uniform (X1 address in test_cond_flx)
    mov(rf1, r5, sig = ldunif) # in: second uniform (X2 address in test_cond_flx); rf1 is only ever used as a load address
    shl(r3, 4, 4).mov(rf2, r5) # r3 = 4 << 4 = 64 (address step); out: rf2 = third uniform (Y address in test_cond_flx)

    # Offset all three addresses by element index * 4 to get per-lane addresses.
    shl(r0, r0, 2)
    add(rf0, rf0, r0)
    add(rf1, rf1, r0)
    add(rf2, rf2, r0)

    # Issue both loads, then collect them; results are presumably returned in
    # request order, i.e. r1 <- data at rf0 and r2 <- data at rf1.
    mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
    nop()
    mov(tmua, rf1, sig = thrsw).add(rf1, rf1, r3)
    nop(sig = ldtmu(r1))
    nop()
    nop(sig = ldtmu(r2))

    # init fla/flb
    mov(null, r2, cond = 'pushn')
    band(null, r2, 1, cond = 'pushz') # fla, flb = ~(r2 & 1), r2 < 0

    g = globals()  # instruction functions are resolved by name; presumably made available by @qpu
    for op in ops:
        g[op](tmud, r1)            # result of the flag instruction is the data to store
        mov(tmua, rf2)             # store address
        tmuwt().add(rf2, rf2, r3)  # step to the next output row

    # Trailing thrsw/nop sequence used by every kernel in these tests;
    # presumably the required thread-end sequence -- confirm against the assembler.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
550 | 
def test_cond_flx():
    """Check flapush / flbpush / flpop against a NumPy model.

    Model used by the assertions:
      * flapush / flbpush: (X1 << 2) | (3 if the selected flag is set else 0)
      * flpop:             X1 >> 2
    where flag a is "X2 is even" and flag b is "X2 is negative", as set up by
    `qpu_cond_flx`.
    """

    ops = [
        'flapush',
        'flbpush',
        'flpop',
    ]

    with Driver() as drv:

        code = drv.program(lambda asm: qpu_cond_flx(asm, ops))
        X1 = drv.alloc((16,), dtype='uint32')
        X2 = drv.alloc((16,), dtype='int32')
        Y = drv.alloc((len(ops), 16), dtype='uint32')
        unif = drv.alloc(3, dtype='uint32')

        # Draw unsigned values directly.  The previous float -> uint32 cast of
        # normally distributed (i.e. often negative) numbers is an
        # out-of-range conversion whose result is platform dependent.
        X1[:] = np.random.randint(0, 2**24, size=X1.shape, dtype='uint32')
        X2[:] = np.random.randn(*X2.shape).astype('int32')
        Y[:] = 0

        unif[0] = X1.addresses()[0]
        unif[1] = X2.addresses()[0]
        unif[2] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        # Parenthesised for clarity; `1 - X2 & 1` parses as `(1 - X2) & 1`,
        # which yields the same 0/1 values.
        fla = 1 - (X2 & 1)
        flb = X2 < 0

        for ix, op in enumerate(ops):
            if op[2:] == 'pop':
                want = X1 >> 2
            else:
                flag = flb if op[2] == 'b' else fla
                want = (X1 << 2) | (3 * flag)
            assert (Y[ix] == want).all()
584 | 


--------------------------------------------------------------------------------
/tests/test_driver.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright (c) 2019-2020 Idein Inc.
 3 | #
 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | # of this software and associated documentation files (the "Software"), to deal
 6 | # in the Software without restriction, including without limitation the rights
 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | # copies of the Software, and to permit persons to whom the Software is
 9 | # furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice (including the next
12 | # paragraph) shall be included in all copies or substantial portions of the
13 | # Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | from videocore6.driver import Driver
24 | 
def test_mem():
    """Allocate several device buffers, fill them in place and read them back.

    Buffer k holds k, k + 4, k + 8, ... and is then shifted by a constant
    using an in-place add; all buffers are verified only after every one of
    them has been written.
    """
    print()

    with Driver() as drv:

        count = 4
        shift = 42
        buffers = []

        for k in range(count):
            buf = drv.alloc((256 * 1024), dtype='uint32')
            buf[:] = range(k, buf.shape[0] * count, count)
            buf[:] += shift
            buffers.append(buf)

        for k, buf in enumerate(buffers):
            assert all(buf[:] == range(k + shift, buf.shape[0] * count + shift, count))
41 | 


--------------------------------------------------------------------------------
/tests/test_drm.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright (c) 2019-2020 Idein Inc.
 3 | #
 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | # of this software and associated documentation files (the "Software"), to deal
 6 | # in the Software without restriction, including without limitation the rights
 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | # copies of the Software, and to permit persons to whom the Software is
 9 | # furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice (including the next
12 | # paragraph) shall be included in all copies or substantial portions of the
13 | # Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | 
24 | from videocore6.drm_v3d import DRM_V3D
25 | 
26 | 
def test_get_param():
    """Query each V3D parameter through the DRM interface and print it as hex.

    There are no assertions; the test passes when every query succeeds.
    """
    print()

    with DRM_V3D() as drm:

        queries = [
            ('uifcfg', DRM_V3D.V3D_PARAM_V3D_UIFCFG),
            ('hub_ident1', DRM_V3D.V3D_PARAM_V3D_HUB_IDENT1),
            ('hub_ident2', DRM_V3D.V3D_PARAM_V3D_HUB_IDENT2),
            ('hub_ident3', DRM_V3D.V3D_PARAM_V3D_HUB_IDENT3),
            ('core0_ident0', DRM_V3D.V3D_PARAM_V3D_CORE0_IDENT0),
            ('core0_ident1', DRM_V3D.V3D_PARAM_V3D_CORE0_IDENT1),
            ('core0_ident2', DRM_V3D.V3D_PARAM_V3D_CORE0_IDENT2),
            ('supports_tfu', DRM_V3D.V3D_PARAM_SUPPORTS_TFU),
            ('supports_csd', DRM_V3D.V3D_PARAM_SUPPORTS_CSD),
        ]

        # Read everything first, then print, so output only starts once all
        # queries have succeeded.
        values = [(label, drm.v3d_get_param(param)) for label, param in queries]

        for label, value in values:
            # Label plus colon is padded to 14 columns so the values line up.
            print(f'{label + ":":<14}{value:#010x}')

    print('Consult /sys/kernel/debug/dri/0/v3d_regs for more information')
53 | 
54 | 
def test_alloc():
    """Create a 16 MiB buffer object, request its mmap offset and print the details.

    There are no assertions; the test passes when both DRM calls succeed.
    """
    print()

    size = 1 << 24

    with DRM_V3D() as drm:

        handle, phyaddr = drm.v3d_create_bo(size)
        offset = drm.v3d_mmap_bo(handle)

        report = (
            ('size', size),
            ('handle', handle),
            ('phyaddr', phyaddr),
            ('offset', offset),
        )
        for label, value in report:
            print(f'{label:<7} = {value:#010x}')
69 | 


--------------------------------------------------------------------------------
/tests/test_labels.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2019-2020 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | import time
 24 | from videocore6.driver import Driver
 25 | from videocore6.assembler import qpu
 26 | import numpy as np
 27 | 
@qpu
def qpu_label_with_namespace(asm):
    """Emit a kernel that reuses the label name `test` in several namespaces.

    The same six-instruction pattern appears five times (in `ns1`,
    `ns1`/`nested`, `ns2`, outside any namespace, and `ns3`): an unconditional
    branch to `R.test`, three nops, an `add(r0, r0, 10)`, the label `L.test`,
    and an `add(r0, r0, 1)`.  If every branch lands on the label defined right
    after it, each `+10` is skipped and r0 ends up as 5, which is what
    `test_label_with_namespace` asserts.  The final r0 is stored to the address
    supplied as the only uniform.
    """

    mov(r0, 0)

    with namespace('ns1'):
        b(R.test, cond = 'always')
        nop()  # three nops follow every branch here; presumably branch delay slots
        nop()
        nop()
        add(r0, r0, 10)  # must be skipped by the branch
        L.test
        add(r0, r0, 1)

        with namespace('nested'):
            b(R.test, cond = 'always')
            nop()
            nop()
            nop()
            add(r0, r0, 10)
            L.test
            add(r0, r0, 1)

    with namespace('ns2'):
        b(R.test, cond = 'always')
        nop()
        nop()
        nop()
        add(r0, r0, 10)
        L.test
        add(r0, r0, 1)

    # Same pattern outside any namespace.
    b(R.test, cond = 'always')
    nop()
    nop()
    nop()
    add(r0, r0, 10)
    L.test
    add(r0, r0, 1)

    with namespace('ns3'):
        b(R.test, cond = 'always')
        nop()
        nop()
        nop()
        add(r0, r0, 10)
        L.test
        add(r0, r0, 1)

    eidx(r1, sig = ldunifrf(rf2))  # r1 = element index, rf2 = the uniform (output address)
    shl(r1, r1, 2)                 # r1 = element index * 4

    mov(tmud, r0)       # data to store
    add(tmua, rf2, r1)  # per-lane store address
    tmuwt()

    # Trailing thrsw/nop sequence used by every kernel in these tests;
    # presumably the required thread-end sequence -- confirm against the assembler.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
 92 | 
 93 | def test_label_with_namespace():
 94 | 
 95 |     with Driver() as drv:
 96 | 
 97 |         code = drv.program(qpu_label_with_namespace)
 98 |         data = drv.alloc(16, dtype = 'uint32')
 99 |         unif = drv.alloc(1, dtype = 'uint32')
100 | 
101 |         data[:] = 1234
102 | 
103 |         unif[0] = data.addresses()[0]
104 | 
105 |         start = time.time()
106 |         drv.execute(code, unif.addresses()[0])
107 |         end = time.time()
108 | 
109 |         assert (data == 5).all()
110 | 


--------------------------------------------------------------------------------
/tests/test_parallel.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2019-2020 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | 
 24 | import time
 25 | from videocore6.driver import Driver
 26 | from videocore6.assembler import qpu
 27 | import numpy as np
 28 | 
 29 | 
@qpu
def cost(asm):
    """Emit a countdown loop; both callers in this file place it after their stores.

    r0 is loaded with (8 << 8) << 8 = 2**19 and decremented once per
    iteration until the loop's branch condition stops holding.  r0 is
    clobbered.
    """
    shl(r0, 8, 8)
    shl(r0, r0, 8)
    with loop as l:
        sub(r0, r0, 1, cond = 'pushn')  # decrement and push the "negative" flag
        l.b(cond = 'anyna')             # branch back to the loop start; presumably while any lane is still non-negative
        nop()  # three nops after the branch; presumably branch delay slots
        nop()
        nop()
 40 | 
@qpu
def qpu_serial(asm):
    """Emit a single-thread kernel that copies 16 rows of 16 words, then runs `cost`.

    Four uniforms are loaded into rf0..rf3.  Only rf2 (source address) and
    rf3 (destination address) are used; in `test_parallel_16` these are the
    addresses of X and Ys.
    """

    nop(sig = ldunifrf(rf0))
    nop(sig = ldunifrf(rf1))
    nop(sig = ldunifrf(rf2))
    nop(sig = ldunifrf(rf3))

    eidx(r0)
    shl(r0, r0, 2)     # r0 = element index * 4
    add(rf2, rf2, r0)  # per-lane source address
    add(rf3, rf3, r0)  # per-lane destination address
    shl(r3, 4, 4)      # r3 = 4 << 4 = 64, the address step per row

    # Python-level loop: the copy sequence is emitted 16 times, once per row.
    for i in range(16):
        mov(tmua, rf2, sig = thrsw).add(rf2, rf2, r3)  # request load, advance source
        nop()
        nop()
        nop(sig = ldtmu(r0))                           # r0 = loaded row
        mov(tmud, r0)
        mov(tmua, rf3, sig = thrsw).add(rf3, rf3, r3)  # store, advance destination
        tmuwt()

    cost(asm)

    # Trailing thrsw/nop sequence used by every kernel in these tests;
    # presumably the required thread-end sequence -- confirm against the assembler.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
 74 | 
# This code requires execution with exactly 16 threads.
# With fewer than 16 threads, the thread id (= (tidx & 0b111110) >> 1) could be discontiguous.
# With more than 16 threads, the thread id (= (tidx & 0b111110) >> 1) could be duplicated.
@qpu
def qpu_parallel_16(asm):
    """Emit a kernel in which each thread copies one 16-word row, then runs `cost`.

    Each thread derives an id from `tidx`, uses it to find its own row in the
    uniform table (whose address and row length are the first two uniforms),
    reads its source and destination addresses from that row, and copies one
    vector from source to destination.
    """

    tidx(r0, sig = ldunifrf(rf0))  # rf0 = first uniform (in test_parallel_16: address of unif[0,0])
    shr(r0, r0, 1).mov(r1, 1)
    shl(r1, r1, 5)
    sub(r1, r1, 1)                 # r1 = (1 << 5) - 1 = 0b11111
    band(rf31, r0, r1) # rf31 = (qpu_id * 2) + (thread_id >> 1)

    # rf31 * unif[0,1] * sizeof(float) + (unif.addresses[0,0] + 2 * sizeof(float))
    nop(sig = ldunifrf(rf1))      # rf1 = unif[0,1]
    shl(r0, rf1, 2)
    umul24(r0, r0, rf31)
    add(r1, rf0, 8)
    add(r0, r0, r1)
    eidx(r1)
    shl(r1, r1, 2)
    add(tmua, r0, r1, sig = thrsw)
    nop()
    nop()
    nop(sig = ldtmu(r0))                          # unif[th,2:18]
    mov(r5rep, r0)
    mov(rf2, r5).rotate(r5rep, r0, -1)            # rf2 = unif[th,2]
    mov(rf3, r5)                                  # rf3 = unif[th,3]

    # Load one vector from the source address in rf2 ...
    eidx(r2)
    shl(r2, r2, 2)
    add(tmua, rf2, r2, sig = thrsw)
    nop()
    nop()
    nop(sig = ldtmu(rf32))

    # ... and store it to the destination address in rf3.
    eidx(r2)
    shl(r2, r2, 2)
    mov(tmud, rf32)
    add(tmua, rf3, r2)
    tmuwt()

    cost(asm)

    # Trailing thrsw/nop sequence used by every kernel in these tests;
    # presumably the required thread-end sequence -- confirm against the assembler.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
126 | 
def test_parallel_16():
    """Copy a 16x16 matrix with the serial and the 16-thread kernel and compare.

    Both kernels must reproduce X exactly, and the wall-clock time of the
    parallel run must be less than twice that of the serial run.
    """

    with Driver() as drv:

        n_threads = 16

        prog_serial = drv.program(qpu_serial)
        prog_parallel = drv.program(qpu_parallel_16)
        X = drv.alloc((n_threads, 16), dtype='float32')
        Ys = drv.alloc((n_threads, 16), dtype='float32')
        Yp = drv.alloc((n_threads, 16), dtype='float32')
        unif = drv.alloc((n_threads, 4), dtype='uint32')

        X[:] = np.random.randn(*X.shape)
        for out in (Ys, Yp):
            out[:] = -1

        # One uniform row per thread:
        # [row address, row length, source address, destination address].
        unif[:, 0] = unif.addresses()[:, 0]
        unif[:, 1] = unif.shape[1]
        unif[:, 2] = X.addresses()[:, 0]
        unif[:, 3] = Ys.addresses()[:, 0]

        def elapsed(program, **kwargs):
            # Wall-clock duration of one execute() call.
            began = time.time()
            drv.execute(program, unif.addresses()[0, 0], **kwargs)
            return time.time() - began

        serial_cost = elapsed(prog_serial)

        # Redirect the destination column to Yp for the parallel run.
        unif[:, 3] = Yp.addresses()[:, 0]

        parallel_cost = elapsed(prog_parallel, thread=n_threads)

        np.set_printoptions(threshold=np.inf)

        assert (X == Ys).all()
        assert (X == Yp).all()
        assert parallel_cost < serial_cost * 2
166 | 
# If `barrierid` is removed from this code, `test_barrier` will fail.
@qpu
def qpu_barrier(asm):
    """Emit a 16-thread kernel whose second phase reads what a neighbour wrote.

    Phase 1: each thread th copies its own X row to its own Y row, after a
    delay loop whose length grows with th.
    Barrier: `barrierid(syncb, sig = thrsw)`.
    Phase 2: each thread reads the Y row of thread (th + 1) % 16 and writes it
    to its own Y row.  `test_barrier` therefore expects Y to be X rotated by
    one row.
    """

    tidx(r0, sig = ldunifrf(rf0)) # rf0 = unif[0,0]
    shr(r2, r0, 2)
    band(r1, r0, 0b11)            # thread_id
    band(r2, r2, 0b1111)          # qpu_id
    shr(r1, r1, 1)
    shl(r2, r2, 1)
    add(rf31, r1, r2)             # rf31 = (qpu_id * 2) + (thread_id >> 1)

    nop(sig = ldunifrf(rf1))      # rf1 = unif[0,1]

    # rf31 * unif[0,1] * sizeof(float) + (unif.addresses[0,0] + 2 * sizeof(float))
    shl(r0, rf1, 2)
    umul24(r0, r0, rf31)
    add(r1, rf0, 8)
    add(r0, r0, r1)
    eidx(r1)
    shl(r1, r1, 2)
    add(tmua, r0, r1, sig = thrsw)
    nop()
    nop()
    nop(sig = ldtmu(r0))                          # unif[th,2:18]
    mov(r5rep, r0)
    mov(rf2, r5).rotate(r5rep, r0, -1)            # rf2 = unif[th,2]
    mov(rf3, r5)                                  # rf3 = unif[th,3]

    # Load this thread's source row (address in rf2) into r0.
    eidx(r2)
    shl(r2, r2, 2)
    add(tmua, rf2, r2, sig = thrsw)
    nop()
    nop()
    nop(sig = ldtmu(r0))

    # Countdown starting at rf31 << 8, so the delay differs per thread;
    # presumably to make threads reach the barrier at different times.
    mov(r1, rf31)
    shl(r1, r1, 8)
    L.loop
    sub(r1, r1, 1, cond = 'pushn')
    b(R.loop, cond = 'anyna')
    nop()
    nop()
    nop()

    # Phase 1 store: write the loaded row to this thread's destination (rf3).
    eidx(r2)
    shl(r2, r2, 2)
    mov(tmud, r0)
    add(tmua, rf3, r2)
    tmuwt()

    barrierid(syncb, sig = thrsw)

    add(rf32, rf31, 1)
    band(rf32, rf32, 0b1111) # rf32 = (rf31 + 1) mod 16

    # rf32 * unif[0,1] * sizeof(float) + (unif.addresses[0,0] + 2 * sizeof(float))
    shl(r0, rf1, 2)
    umul24(r0, r0, rf32)
    add(r1, rf0, 8)
    add(r0, r0, r1)
    eidx(r1)
    shl(r1, r1, 2)
    add(tmua, r0, r1, sig = thrsw)
    nop()
    nop()
    nop(sig = ldtmu(r0))                          # unif[(th+1)%16,2:18]
    mov(r5rep, r0)
    mov(rf4, r5).rotate(r5rep, r0, -1)            # rf4 = unif[(th+1)%16,2]
    mov(rf5, r5)                                  # rf5 = unif[(th+1)%16,3]

    # Phase 2 load: read the neighbour's destination row (address in rf5).
    eidx(r2)
    shl(r2, r2, 2)
    add(tmua, rf5, r2, sig = thrsw)
    nop()
    nop()
    nop(sig = ldtmu(r0))

    # Phase 2 store: overwrite this thread's own destination row (rf3).
    eidx(r2)
    shl(r2, r2, 2)
    mov(tmud, r0)
    add(tmua, rf3, r2)
    tmuwt()

    # Trailing thrsw/nop sequence used by every kernel in these tests;
    # presumably the required thread-end sequence -- confirm against the assembler.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
259 | 
def test_barrier():
    """Run `qpu_barrier` with 16 threads and check that Y is X rotated by one row.

    Row th of Y must equal row (th + 1) % 16 of X, which only holds if every
    thread finished its first store before any thread started its second load.
    """

    with Driver() as drv:

        thread = 16

        code = drv.program(qpu_barrier)
        X = drv.alloc((thread, 16), dtype='float32')
        Y = drv.alloc((thread, 16), dtype='float32')
        unif = drv.alloc((thread, 4), dtype='uint32')

        X[:] = np.random.randn(*X.shape)
        Y[:] = -1

        # One uniform row per thread:
        # [row address, row length, source address, destination address].
        unif[:, 0] = unif.addresses()[:, 0]
        unif[:, 1] = unif.shape[1]
        unif[:, 2] = X.addresses()[:, 0]
        unif[:, 3] = Y.addresses()[:, 0]

        drv.execute(code, unif.addresses()[0, 0], thread=thread)

        # Print arrays in full if the assertion below fails.
        np.set_printoptions(threshold=np.inf)

        assert (Y == np.concatenate([X[1:], X[:1]])).all()
286 | 


--------------------------------------------------------------------------------
/tests/test_sfu.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2019-2020 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | import time
 24 | from videocore6.driver import Driver
 25 | from videocore6.assembler import qpu
 26 | import numpy as np
 27 | 
 28 | def sfu_sin(x):
 29 |     result = np.sin(x * np.pi)
 30 |     result[x < -0.5] = -1
 31 |     result[x >  0.5] = 1
 32 |     return result
 33 | 
 34 | ops = {
 35 |     # sfu regs/ops
 36 |     'recip' : lambda x: 1 / x,
 37 |     'rsqrt' : lambda x: 1 / np.sqrt(x),
 38 |     'exp' : lambda x: 2 ** x,
 39 |     'log' : np.log2,
 40 |     'sin' : sfu_sin,
 41 |     'rsqrt2' : lambda x: 1 / np.sqrt(x),
 42 | }
 43 | 
 44 | 
 45 | 
# SFU IO registers
@qpu
def qpu_sfu_regs(asm, sfu_regs):
    """Emit a kernel that drives each SFU register named in `sfu_regs`.

    One input vector is loaded once.  For every name in `sfu_regs` the input is
    moved into the register of that name (looked up in this module's globals),
    and after one nop the value of r4 is stored as the next output row.
    Uniforms, in order: input address, output address.
    """

    eidx(r0, sig = ldunif)
    mov(rf0, r5, sig = ldunif) # in
    shl(r3, 4, 4).mov(rf1, r5) # r3 = 4 << 4 = 64 (address step); rf1 = out

    shl(r0, r0, 2)     # r0 = element index * 4
    add(rf0, rf0, r0)  # per-lane input address
    add(rf1, rf1, r0)  # per-lane output address

    mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
    nop()
    nop()
    nop(sig = ldtmu(r1))  # r1 = input vector

    g = globals()  # registers are resolved by name; presumably made available by @qpu
    for reg in sfu_regs:
        mov(g[reg], r1)  # writing the SFU register starts the operation
        nop() # required ? enough ?
        mov(tmud, r4)    # the result is read back from r4
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)  # step to the next output row

    # Trailing thrsw/nop sequence used by every kernel in these tests;
    # presumably the required thread-end sequence -- confirm against the assembler.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
 79 | 
 80 | def boilerplate_sfu_regs(sfu_regs, domain_limitter):
 81 | 
 82 |     with Driver() as drv:
 83 | 
 84 |         code = drv.program(lambda asm: qpu_sfu_regs(asm, sfu_regs))
 85 |         X = drv.alloc((16, ), dtype = 'float32')
 86 |         Y = drv.alloc((len(sfu_regs), 16), dtype = 'float32')
 87 |         unif = drv.alloc(3, dtype = 'uint32')
 88 | 
 89 |         X[:] = domain_limitter(np.random.randn(*X.shape).astype('float32'))
 90 |         Y[:] = 0.0
 91 | 
 92 |         unif[0] = X.addresses()[0]
 93 |         unif[1] = Y.addresses()[0,0]
 94 | 
 95 |         start = time.time()
 96 |         drv.execute(code, unif.addresses()[0])
 97 |         end = time.time()
 98 | 
 99 |         for ix, reg in enumerate(sfu_regs):
100 |             msg = 'mov({}, None)'.format(reg)
101 |             assert np.allclose(Y[ix], ops[reg](X), rtol=1e-4), msg
102 | 
def test_sfu_regs():
    """Exercise the SFU registers in two groups, each with a suitable input domain."""
    cases = (
        # Defined for any real input: feed the random values unchanged.
        (['recip','exp','sin'], lambda x: x),
        # Need strictly positive input: square and add a small offset.
        (['rsqrt','log','rsqrt2'], lambda x: x ** 2 + 1e-6),
    )
    for regs, limiter in cases:
        boilerplate_sfu_regs(regs, limiter)
106 | 
107 | 
# SFU ops
@qpu
def qpu_sfu_ops(asm, sfu_ops):
    """Emit a kernel that applies each SFU instruction named in `sfu_ops`.

    One input vector is loaded once.  For every name in `sfu_ops` the
    instruction of that name (looked up in this module's globals) is emitted
    with destination rf2 and the input as source, and rf2 is stored as the
    next output row.  Uniforms, in order: input address, output address.
    """

    eidx(r0, sig = ldunif)
    mov(rf0, r5, sig = ldunif) # in
    shl(r3, 4, 4).mov(rf1, r5) # r3 = 4 << 4 = 64 (address step); rf1 = out

    shl(r0, r0, 2)     # r0 = element index * 4
    add(rf0, rf0, r0)  # per-lane input address
    add(rf1, rf1, r0)  # per-lane output address

    mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
    nop()
    nop()
    nop(sig = ldtmu(r1))  # r1 = input vector

    g = globals()  # instructions are resolved by name; presumably made available by @qpu
    for op in sfu_ops:
        g[op](rf2, r1) # ATTENTION: SFU ops requires rfN ?
        mov(tmud, rf2)
        mov(tmua, rf1)
        tmuwt().add(rf1, rf1, r3)  # step to the next output row

    # Trailing thrsw/nop sequence used by every kernel in these tests;
    # presumably the required thread-end sequence -- confirm against the assembler.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
140 | 
def boilerplate_sfu_ops(sfu_ops, domain_limitter):
    """Run `qpu_sfu_ops` for `sfu_ops` and compare against the NumPy references.

    sfu_ops: list of SFU instruction names; each must be a key of `ops`.
    domain_limitter: callable applied to the random float32 input so that it
        lies in the domain of the functions under test.

    Raises AssertionError (with the offending instruction in the message) when
    a result row differs from the reference by more than rtol=1e-4.
    """

    with Driver() as drv:

        code = drv.program(lambda asm: qpu_sfu_ops(asm, sfu_ops))
        X = drv.alloc((16, ), dtype='float32')
        Y = drv.alloc((len(sfu_ops), 16), dtype='float32')
        unif = drv.alloc(3, dtype='uint32')  # only the first two slots are written below

        X[:] = domain_limitter(np.random.randn(*X.shape).astype('float32'))
        Y[:] = 0.0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        for ix, op in enumerate(sfu_ops):
            msg = '{}(None, None)'.format(op)
            assert np.allclose(Y[ix], ops[op](X), rtol=1e-4), msg
163 | 
def test_sfu_ops():
    """Exercise the SFU instructions in two groups, each with a suitable input domain."""
    cases = (
        # Defined for any real input: feed the random values unchanged.
        (['recip','exp','sin'], lambda x: x),
        # Need strictly positive input: square and add a small offset.
        (['rsqrt','log','rsqrt2'], lambda x: x ** 2 + 1e-6),
    )
    for names, limiter in cases:
        boilerplate_sfu_ops(names, limiter)
167 | 


--------------------------------------------------------------------------------
/tests/test_signals.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2019-2020 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | 
 24 | import time
 25 | from videocore6.driver import Driver
 26 | from videocore6.assembler import qpu
 27 | import numpy as np
 28 | 
 29 | 
# ldtmu
@qpu
def qpu_signal_ldtmu(asm):
    """Emit a kernel that issues `ldtmu` on an instruction that also uses both ALUs.

    The loaded vector and the two ALU results computed on the same instruction
    are stored as three consecutive output rows; `test_signal_ldtmu` expects
    X, 2.0 and 4.0 respectively.  Uniforms, in order: input address, output
    address.
    """

    eidx(r0, sig = ldunif)
    mov(rf0, r5, sig = ldunif) # rf0 = input address
    shl(r3, 4, 4).mov(rf1, r5) # r3 = 4 << 4 = 64 (address step); rf1 = output address

    shl(r0, r0, 2)     # r0 = element index * 4
    add(rf0, rf0, r0)  # per-lane input address
    add(rf1, rf1, r0)  # per-lane output address

    mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)        # start load X
    mov(r0, 1.0)                                         # r0 <- 1.0
    mov(r1, 2.0)                                         # r1 <- 2.0
    fadd(r0, r0, r0).fmul(r1, r1, r1, sig = ldtmu(rf31)) # r0 <- 2 * r0, r1 <- r1 ^ 2, rf31 <- X
    # Row 0: the loaded vector.
    mov(tmud, rf31)
    mov(tmua, rf1)
    tmuwt().add(rf1, rf1, r3)
    # Row 1: the fadd result.
    mov(tmud, r0)
    mov(tmua, rf1)
    tmuwt().add(rf1, rf1, r3)
    # Row 2: the fmul result.
    mov(tmud, r1)
    mov(tmua, rf1)
    tmuwt().add(rf1, rf1, r3)

    # Trailing thrsw/nop sequence used by every kernel in these tests;
    # presumably the required thread-end sequence -- confirm against the assembler.
    nop(sig = thrsw)
    nop(sig = thrsw)
    nop()
    nop()
    nop(sig = thrsw)
    nop()
    nop()
    nop()
 64 | 
 65 | def test_signal_ldtmu():
 66 |     # Runs qpu_signal_ldtmu on the device and checks the three rows it stores into Y.
 67 |     with Driver() as drv:
 68 | 
 69 |         code = drv.program(qpu_signal_ldtmu)
 70 |         X = drv.alloc((16, ), dtype = 'float32')
 71 |         Y = drv.alloc((3, 16), dtype = 'float32')
 72 |         unif = drv.alloc(3, dtype = 'uint32')  # only unif[0] and unif[1] are assigned below
 73 | 
 74 |         X[:] = np.random.randn(*X.shape).astype('float32')
 75 |         Y[:] = 0.0
 76 | 
 77 |         unif[0] = X.addresses()[0]
 78 |         unif[1] = Y.addresses()[0,0]
 79 | 
 80 |         start = time.time()  # start/end are recorded but never read afterwards
 81 |         drv.execute(code, unif.addresses()[0])
 82 |         end = time.time()
 83 | 
 84 |         assert (Y[0] == X).all()  # row 0: X loaded via ldtmu and stored back unchanged
 85 |         assert (Y[1] == 2).all()  # row 1: 1.0 + 1.0 from the add ALU
 86 |         assert (Y[2] == 4).all()  # row 2: 2.0 * 2.0 from the mul ALU
 87 | 
 88 | # rot signal with rN source performs as a full rotate
 89 | @qpu
 90 | def qpu_full_rotate(asm):
 91 | 
 92 |     eidx(r0, sig = ldunif)
 93 |     mov(rf0, r5, sig = ldunif)
 94 |     shl(r3, 4, 4).mov(rf1, r5)
 95 | 
 96 |     shl(r0, r0, 2)
 97 |     add(rf0, rf0, r0)
 98 |     add(rf1, rf1, r0)
 99 | 
100 |     mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
101 |     nop()
102 |     nop()
103 |     nop(sig = ldtmu(r0))
104 |     nop() # required before rotate
105 | 
106 |     for i in range(-15, 16):
107 |         nop().add(r1, r0, r0, sig = rot(i))
108 |         mov(tmud, r1)
109 |         mov(tmua, rf1)
110 |         tmuwt().add(rf1, rf1, r3)
111 | 
112 |     for i in range(-15, 16):
113 |         mov(r5, i)
114 |         nop() # require
115 |         nop().add(r1, r0, r0, sig = rot(i))
116 |         mov(tmud, r1)
117 |         mov(tmua, rf1)
118 |         tmuwt().add(rf1, rf1, r3)
119 | 
120 |     nop(sig = thrsw)
121 |     nop(sig = thrsw)
122 |     nop()
123 |     nop()
124 |     nop(sig = thrsw)
125 |     nop()
126 |     nop()
127 |     nop()
128 | 
129 | def test_full_rotate():
130 |     # Runs qpu_full_rotate and checks both banks of rotated rows (Y[0] and Y[1]) against a host-side rotation.
131 |     with Driver() as drv:
132 | 
133 |         code = drv.program(qpu_full_rotate)
134 |         X = drv.alloc((16, ), dtype = 'int32')
135 |         Y = drv.alloc((2, len(range(-15, 16)), 16), dtype = 'int32')  # (loop, rotation index, lane)
136 |         unif = drv.alloc(3, dtype = 'uint32')  # only unif[0] and unif[1] are assigned below
137 | 
138 |         X[:] = np.arange(16)
139 |         Y[:] = 0
140 | 
141 |         unif[0] = X.addresses()[0]
142 |         unif[1] = Y.addresses()[0,0,0]
143 | 
144 |         start = time.time()  # start/end are recorded but never read afterwards
145 |         drv.execute(code, unif.addresses()[0])
146 |         end = time.time()
147 | 
148 |         expected = np.concatenate([X,X]) * 2  # kernel computes r0 + r0; doubling X makes wrap-around a plain slice
149 |         for ix, rot in enumerate(range(-15, 16)):
150 |             assert (Y[:,ix] == expected[(-rot%16):(-rot%16)+16]).all()  # lane k holds 2 * X[(k - rot) % 16]
151 | 
152 | 
153 | # rotate alias
154 | @qpu
155 | def qpu_rotate_alias(asm):
156 |     # Uniforms (see test_rotate_alias): [0] address of X, [1] address of Y. Stores X rotated by -15..15, four times.
157 |     eidx(r0, sig = ldunif)
158 |     mov(rf0, r5, sig = ldunif)
159 |     shl(r3, 4, 4).mov(rf1, r5)  # r3 <- 4 << 4 = 64: bytes per 16-lane int32 row
160 | 
161 |     shl(r0, r0, 2)              # r0 <- eidx * 4: per-lane byte offset
162 |     add(rf0, rf0, r0)           # rf0 <- address of X + eidx * 4
163 |     add(rf1, rf1, r0)           # rf1 <- address of Y + eidx * 4
164 | 
165 |     mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
166 |     nop()
167 |     nop()
168 |     nop(sig = ldtmu(r0))        # r0 <- X
169 |     nop() # required before rotate
170 | 
171 |     for i in range(-15, 16):
172 |         rotate(r1, r0, i)       # add alias, immediate amount
173 |         mov(tmud, r1)
174 |         mov(tmua, rf1)
175 |         tmuwt().add(rf1, rf1, r3)
176 | 
177 |     for i in range(-15, 16):
178 |         nop().rotate(r1, r0, i) # mul alias, immediate amount
179 |         mov(tmud, r1)
180 |         mov(tmua, rf1)
181 |         tmuwt().add(rf1, rf1, r3)
182 | 
183 |     for i in range(-15, 16):
184 |         mov(r5, i)
185 |         nop() # required
186 |         rotate(r1, r0, r5)       # add alias, amount taken from r5
187 |         mov(tmud, r1)
188 |         mov(tmua, rf1)
189 |         tmuwt().add(rf1, rf1, r3)
190 | 
191 |     for i in range(-15, 16):
192 |         mov(r5, i)
193 |         nop() # required
194 |         nop().rotate(r1, r0, r5) # mul alias, amount taken from r5
195 |         mov(tmud, r1)
196 |         mov(tmua, rf1)
197 |         tmuwt().add(rf1, rf1, r3)
198 | 
199 |     nop(sig = thrsw)            # same closing sequence as every other kernel in this file
200 |     nop(sig = thrsw)
201 |     nop()
202 |     nop()
203 |     nop(sig = thrsw)
204 |     nop()
205 |     nop()
206 |     nop()
207 | 
208 | def test_rotate_alias():
209 |     # Runs qpu_rotate_alias and checks all four banks of rotated rows (Y[0..3]) against a host-side rotation.
210 |     with Driver() as drv:
211 | 
212 |         code = drv.program(qpu_rotate_alias)
213 |         X = drv.alloc((16, ), dtype = 'int32')
214 |         Y = drv.alloc((4, len(range(-15, 16)), 16), dtype = 'int32')  # (loop, rotation index, lane)
215 |         unif = drv.alloc(3, dtype = 'uint32')  # only unif[0] and unif[1] are assigned below
216 | 
217 |         X[:] = np.arange(16)
218 |         Y[:] = 0
219 | 
220 |         unif[0] = X.addresses()[0]
221 |         unif[1] = Y.addresses()[0,0,0]
222 | 
223 |         start = time.time()  # start/end are recorded but never read afterwards
224 |         drv.execute(code, unif.addresses()[0])
225 |         end = time.time()
226 | 
227 |         expected = np.concatenate([X,X])  # doubling X makes the wrap-around a plain slice
228 |         for ix, rot in enumerate(range(-15, 16)):
229 |             assert (Y[:,ix] == expected[(-rot%16):(-rot%16)+16]).all()  # lane k holds X[(k - rot) % 16]
230 | 
231 | 
232 | # rot signal with rfN source performs as a quad rotate
233 | @qpu
234 | def qpu_quad_rotate(asm):
235 |     # Uniforms (see test_quad_rotate): [0] address of X, [1] address of Y. Stores X + X rotated within each quad.
236 |     eidx(r0, sig = ldunif)
237 |     mov(rf0, r5, sig = ldunif)
238 |     shl(r3, 4, 4).mov(rf1, r5)  # r3 <- 4 << 4 = 64: bytes per 16-lane int32 row
239 | 
240 |     shl(r0, r0, 2)              # r0 <- eidx * 4: per-lane byte offset
241 |     add(rf0, rf0, r0)           # rf0 <- address of X + eidx * 4
242 |     add(rf1, rf1, r0)           # rf1 <- address of Y + eidx * 4
243 | 
244 |     mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
245 |     nop()
246 |     nop()
247 |     nop(sig = ldtmu(rf32))      # rf32 <- X (register-file source, unlike r0 in qpu_full_rotate)
248 |     nop() # required before rotate
249 | 
250 |     for i in range(-15, 16):    # first bank of rows: rotation amount given as an immediate
251 |         nop().add(r1, rf32, rf32, sig = rot(i))
252 |         mov(tmud, r1)
253 |         mov(tmua, rf1)
254 |         tmuwt().add(rf1, rf1, r3)
255 | 
256 |     for i in range(-15, 16):    # second bank of rows: rotation amount taken from r5
257 |         mov(r5, i)
258 |         nop() # required
259 |         nop().add(r1, rf32, rf32, sig = rot(r5))
260 |         mov(tmud, r1)
261 |         mov(tmua, rf1)
262 |         tmuwt().add(rf1, rf1, r3)
263 | 
264 |     nop(sig = thrsw)            # same closing sequence as every other kernel in this file
265 |     nop(sig = thrsw)
266 |     nop()
267 |     nop()
268 |     nop(sig = thrsw)
269 |     nop()
270 |     nop()
271 |     nop()
272 | 
273 | def test_quad_rotate():
274 |     # Runs qpu_quad_rotate and checks both banks of rows against a host-side rotation within each group of 4 lanes.
275 |     with Driver() as drv:
276 | 
277 |         code = drv.program(qpu_quad_rotate)
278 |         X = drv.alloc((16, ), dtype = 'int32')
279 |         Y = drv.alloc((2, len(range(-15, 16)), 16), dtype = 'int32')  # (loop, rotation index, lane)
280 |         unif = drv.alloc(3, dtype = 'uint32')  # only unif[0] and unif[1] are assigned below
281 | 
282 |         X[:] = np.arange(16)
283 |         Y[:] = 0
284 | 
285 |         unif[0] = X.addresses()[0]
286 |         unif[1] = Y.addresses()[0,0,0]
287 | 
288 |         start = time.time()  # start/end are recorded but never read afterwards
289 |         drv.execute(code, unif.addresses()[0])
290 |         end = time.time()
291 | 
292 |         expected = np.concatenate([X.reshape(4,4)]*2, axis=1)*2  # one row per quad, doubled in width; kernel adds X + X
293 |         for ix, rot in enumerate(range(-15, 16)):
294 |             assert (Y[:,ix] == expected[:,(-rot%4):(-rot%4)+4].ravel()).all()  # rotation wraps modulo 4 inside each quad
295 | 
296 | 
297 | # quad_rotate alias
298 | @qpu
299 | def qpu_quad_rotate_alias(asm):
300 |     # Uniforms (see test_quad_rotate_alias): [0] address of X, [1] address of Y. Stores X quad-rotated, four times.
301 |     eidx(r0, sig = ldunif)
302 |     mov(rf0, r5, sig = ldunif)
303 |     shl(r3, 4, 4).mov(rf1, r5)  # r3 <- 4 << 4 = 64: bytes per 16-lane int32 row
304 | 
305 |     shl(r0, r0, 2)              # r0 <- eidx * 4: per-lane byte offset
306 |     add(rf0, rf0, r0)           # rf0 <- address of X + eidx * 4
307 |     add(rf1, rf1, r0)           # rf1 <- address of Y + eidx * 4
308 | 
309 |     mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
310 |     nop()
311 |     nop()
312 |     nop(sig = ldtmu(rf32))      # rf32 <- X
313 |     nop() # required before rotate
314 | 
315 |     for i in range(-15, 16):
316 |         quad_rotate(r1, rf32, i)       # add alias, immediate amount
317 |         mov(tmud, r1)
318 |         mov(tmua, rf1)
319 |         tmuwt().add(rf1, rf1, r3)
320 | 
321 |     for i in range(-15, 16):
322 |         nop().quad_rotate(r1, rf32, i) # mul alias, immediate amount
323 |         mov(tmud, r1)
324 |         mov(tmua, rf1)
325 |         tmuwt().add(rf1, rf1, r3)
326 | 
327 |     for i in range(-15, 16):
328 |         mov(r5, i)
329 |         nop() # required
330 |         quad_rotate(r1, rf32, r5)       # add alias, amount taken from r5
331 |         mov(tmud, r1)
332 |         mov(tmua, rf1)
333 |         tmuwt().add(rf1, rf1, r3)
334 | 
335 |     for i in range(-15, 16):
336 |         mov(r5, i)
337 |         nop() # required
338 |         nop().quad_rotate(r1, rf32, r5) # mul alias, amount taken from r5
339 |         mov(tmud, r1)
340 |         mov(tmua, rf1)
341 |         tmuwt().add(rf1, rf1, r3)
342 | 
343 |     nop(sig = thrsw)            # same closing sequence as every other kernel in this file
344 |     nop(sig = thrsw)
345 |     nop()
346 |     nop()
347 |     nop(sig = thrsw)
348 |     nop()
349 |     nop()
350 |     nop()
351 | 
352 | def test_quad_rotate_alias():
353 |     # Runs qpu_quad_rotate_alias and checks all four banks of rows against a host-side per-quad rotation.
354 |     with Driver() as drv:
355 | 
356 |         code = drv.program(qpu_quad_rotate_alias)
357 |         X = drv.alloc((16, ), dtype = 'int32')
358 |         Y = drv.alloc((4, len(range(-15, 16)), 16), dtype = 'int32')  # (loop, rotation index, lane)
359 |         unif = drv.alloc(3, dtype = 'uint32')  # only unif[0] and unif[1] are assigned below
360 | 
361 |         X[:] = np.arange(16)
362 |         Y[:] = 0
363 | 
364 |         unif[0] = X.addresses()[0]
365 |         unif[1] = Y.addresses()[0,0,0]
366 | 
367 |         start = time.time()  # start/end are recorded but never read afterwards
368 |         drv.execute(code, unif.addresses()[0])
369 |         end = time.time()
370 | 
371 |         expected = np.concatenate([X.reshape(4,4)]*2, axis=1)  # one row per quad, doubled in width
372 |         for ix, rot in enumerate(range(-15, 16)):
373 |             assert (Y[:,ix] == expected[:,(-rot%4):(-rot%4)+4].ravel()).all()  # rotation wraps modulo 4 inside each quad
374 | 
375 | 
376 | # instruction with r5rep dst performs as a full broadcast
377 | @qpu
378 | def qpu_full_broadcast(asm):
379 |     # Uniforms (see test_full_broadcast): [0] address of X, [1] address of Y. Stores one broadcast row per rotation.
380 |     eidx(r0, sig = ldunif)
381 |     mov(rf0, r5, sig = ldunif)
382 |     shl(r3, 4, 4).mov(rf1, r5)  # r3 <- 4 << 4 = 64: bytes per 16-lane int32 row
383 | 
384 |     shl(r0, r0, 2)              # r0 <- eidx * 4: per-lane byte offset
385 |     add(rf0, rf0, r0)           # rf0 <- address of X + eidx * 4
386 |     add(rf1, rf1, r0)           # rf1 <- address of Y + eidx * 4
387 | 
388 |     mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
389 |     nop()
390 |     nop()
391 |     nop(sig = ldtmu(r0))        # r0 <- X
392 |     nop() # required before rotate
393 | 
394 |     for i in range(-15, 16):
395 |         nop().mov(r5rep, r0, sig = [rot(ix) for ix in [i] if ix != 0] )  # signal list is empty (no rot) when i == 0
396 |         mov(tmud, r5)           # the broadcast result is read back through r5
397 |         mov(tmua, rf1)
398 |         tmuwt().add(rf1, rf1, r3)
399 | 
400 |     nop(sig = thrsw)            # same closing sequence as every other kernel in this file
401 |     nop(sig = thrsw)
402 |     nop()
403 |     nop()
404 |     nop(sig = thrsw)
405 |     nop()
406 |     nop()
407 |     nop()
408 | 
409 | def test_full_broadcast():
410 |     # Runs qpu_full_broadcast and checks that every row of Y holds a single element of X in all 16 lanes.
411 |     with Driver() as drv:
412 | 
413 |         code = drv.program(qpu_full_broadcast)
414 |         X = drv.alloc((16, ), dtype = 'int32')
415 |         Y = drv.alloc((len(range(-15, 16)), 16), dtype = 'int32')  # (rotation index, lane)
416 |         unif = drv.alloc(3, dtype = 'uint32')  # only unif[0] and unif[1] are assigned below
417 | 
418 |         X[:] = np.arange(16)
419 |         Y[:] = 0
420 | 
421 |         unif[0] = X.addresses()[0]
422 |         unif[1] = Y.addresses()[0,0]
423 | 
424 |         start = time.time()  # start/end are recorded but never read afterwards
425 |         drv.execute(code, unif.addresses()[0])
426 |         end = time.time()
427 | 
428 |         expected = X
429 |         for ix, rot in enumerate(range(-15, 16)):
430 |             assert (Y[ix] == expected[(-rot%16)].repeat(16)).all()  # all lanes hold X[-rot % 16]
431 | 
432 | 
433 | # broadcast alias
434 | @qpu
435 | def qpu_broadcast_alias(asm):
436 |     # Same kernel as qpu_full_broadcast, but the destination is spelled `broadcast` instead of `r5rep`.
437 |     eidx(r0, sig = ldunif)
438 |     mov(rf0, r5, sig = ldunif)
439 |     shl(r3, 4, 4).mov(rf1, r5)  # r3 <- 4 << 4 = 64: bytes per 16-lane int32 row
440 | 
441 |     shl(r0, r0, 2)              # r0 <- eidx * 4: per-lane byte offset
442 |     add(rf0, rf0, r0)           # rf0 <- address of X + eidx * 4
443 |     add(rf1, rf1, r0)           # rf1 <- address of Y + eidx * 4
444 | 
445 |     mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
446 |     nop()
447 |     nop()
448 |     nop(sig = ldtmu(r0))        # r0 <- X
449 |     nop() # required before rotate
450 | 
451 |     for i in range(-15, 16):
452 |         nop().mov(broadcast, r0, sig = [rot(ix) for ix in [i] if ix != 0] )  # signal list is empty (no rot) when i == 0
453 |         mov(tmud, r5)           # the broadcast result is read back through r5
454 |         mov(tmua, rf1)
455 |         tmuwt().add(rf1, rf1, r3)
456 | 
457 |     nop(sig = thrsw)            # same closing sequence as every other kernel in this file
458 |     nop(sig = thrsw)
459 |     nop()
460 |     nop()
461 |     nop(sig = thrsw)
462 |     nop()
463 |     nop()
464 |     nop()
465 | 
466 | def test_broadcast_alias():
467 |     # Runs qpu_broadcast_alias and checks that every row of Y holds a single element of X in all 16 lanes.
468 |     with Driver() as drv:
469 | 
470 |         code = drv.program(qpu_broadcast_alias)
471 |         X = drv.alloc((16, ), dtype = 'int32')
472 |         Y = drv.alloc((len(range(-15, 16)), 16), dtype = 'int32')  # (rotation index, lane)
473 |         unif = drv.alloc(3, dtype = 'uint32')  # only unif[0] and unif[1] are assigned below
474 | 
475 |         X[:] = np.arange(16)
476 |         Y[:] = 0
477 | 
478 |         unif[0] = X.addresses()[0]
479 |         unif[1] = Y.addresses()[0,0]
480 | 
481 |         start = time.time()  # start/end are recorded but never read afterwards
482 |         drv.execute(code, unif.addresses()[0])
483 |         end = time.time()
484 | 
485 |         expected = X
486 |         for ix, rot in enumerate(range(-15, 16)):
487 |             assert (Y[ix] == expected[(-rot%16)].repeat(16)).all()  # all lanes hold X[-rot % 16]
488 | 
489 | 
490 | # instruction with r5 dst performs as a quad broadcast
491 | @qpu
492 | def qpu_quad_broadcast(asm):
493 |     # Uniforms (see test_quad_broadcast): [0] address of X, [1] address of Y. Stores one quad-broadcast row per rotation.
494 |     eidx(r0, sig = ldunif)
495 |     mov(rf0, r5, sig = ldunif)
496 |     shl(r3, 4, 4).mov(rf1, r5)  # r3 <- 4 << 4 = 64: bytes per 16-lane int32 row
497 | 
498 |     shl(r0, r0, 2)              # r0 <- eidx * 4: per-lane byte offset
499 |     add(rf0, rf0, r0)           # rf0 <- address of X + eidx * 4
500 |     add(rf1, rf1, r0)           # rf1 <- address of Y + eidx * 4
501 | 
502 |     mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
503 |     nop()
504 |     nop()
505 |     nop(sig = ldtmu(r0))        # r0 <- X
506 |     nop() # required before rotate
507 | 
508 |     for i in range(-15, 16):
509 |         nop().mov(r5, r0, sig = [rot(ix) for ix in [i] if ix != 0] )  # signal list is empty (no rot) when i == 0
510 |         mov(tmud, r5)
511 |         mov(tmua, rf1)
512 |         tmuwt().add(rf1, rf1, r3)
513 | 
514 |     nop(sig = thrsw)            # same closing sequence as every other kernel in this file
515 |     nop(sig = thrsw)
516 |     nop()
517 |     nop()
518 |     nop(sig = thrsw)
519 |     nop()
520 |     nop()
521 |     nop()
522 | 
523 | def test_quad_broadcast():
524 |     # Runs qpu_quad_broadcast and checks that each group of 4 lanes holds one element of the rotated X.
525 |     with Driver() as drv:
526 | 
527 |         code = drv.program(qpu_quad_broadcast)
528 |         X = drv.alloc((16, ), dtype = 'int32')
529 |         Y = drv.alloc((len(range(-15, 16)), 16), dtype = 'int32')  # (rotation index, lane)
530 |         unif = drv.alloc(3, dtype = 'uint32')  # only unif[0] and unif[1] are assigned below
531 | 
532 |         X[:] = np.arange(16)
533 |         Y[:] = 0
534 | 
535 |         unif[0] = X.addresses()[0]
536 |         unif[1] = Y.addresses()[0,0]
537 | 
538 |         start = time.time()  # start/end are recorded but never read afterwards
539 |         drv.execute(code, unif.addresses()[0])
540 |         end = time.time()
541 | 
542 |         expected = np.concatenate([X,X])  # doubling X makes the wrap-around a plain slice
543 |         for ix, rot in enumerate(range(-15, 16)):
544 |             assert (Y[ix] == expected[(-rot%16):(-rot%16)+16:4].repeat(4)).all()  # quad q holds X[(4 * q - rot) % 16]
545 | 
546 | 
547 | # quad_broadcast alias
548 | @qpu
549 | def qpu_quad_broadcast_alias(asm):
550 |     # Same kernel as qpu_quad_broadcast, but the destination is spelled `quad_broadcast` instead of `r5`.
551 |     eidx(r0, sig = ldunif)
552 |     mov(rf0, r5, sig = ldunif)
553 |     shl(r3, 4, 4).mov(rf1, r5)  # r3 <- 4 << 4 = 64: bytes per 16-lane int32 row
554 | 
555 |     shl(r0, r0, 2)              # r0 <- eidx * 4: per-lane byte offset
556 |     add(rf0, rf0, r0)           # rf0 <- address of X + eidx * 4
557 |     add(rf1, rf1, r0)           # rf1 <- address of Y + eidx * 4
558 | 
559 |     mov(tmua, rf0, sig = thrsw).add(rf0, rf0, r3)
560 |     nop()
561 |     nop()
562 |     nop(sig = ldtmu(r0))        # r0 <- X
563 |     nop() # required before rotate
564 | 
565 |     for i in range(-15, 16):
566 |         nop().mov(quad_broadcast, r0, sig = [rot(ix) for ix in [i] if ix != 0] )  # signal list is empty (no rot) when i == 0
567 |         mov(tmud, r5)           # the broadcast result is read back through r5
568 |         mov(tmua, rf1)
569 |         tmuwt().add(rf1, rf1, r3)
570 | 
571 |     nop(sig = thrsw)            # same closing sequence as every other kernel in this file
572 |     nop(sig = thrsw)
573 |     nop()
574 |     nop()
575 |     nop(sig = thrsw)
576 |     nop()
577 |     nop()
578 |     nop()
579 | 
580 | def test_quad_broadcast_alias():
581 |     # Runs qpu_quad_broadcast_alias and checks that each group of 4 lanes holds one element of the rotated X.
582 |     with Driver() as drv:
583 | 
584 |         code = drv.program(qpu_quad_broadcast_alias)
585 |         X = drv.alloc((16, ), dtype = 'int32')
586 |         Y = drv.alloc((len(range(-15, 16)), 16), dtype = 'int32')  # (rotation index, lane)
587 |         unif = drv.alloc(3, dtype = 'uint32')  # only unif[0] and unif[1] are assigned below
588 | 
589 |         X[:] = np.arange(16)
590 |         Y[:] = 0
591 | 
592 |         unif[0] = X.addresses()[0]
593 |         unif[1] = Y.addresses()[0,0]
594 | 
595 |         start = time.time()  # start/end are recorded but never read afterwards
596 |         drv.execute(code, unif.addresses()[0])
597 |         end = time.time()
598 | 
599 |         expected = np.concatenate([X,X])  # doubling X makes the wrap-around a plain slice
600 |         for ix, rot in enumerate(range(-15, 16)):
601 |             assert (Y[ix] == expected[(-rot%16):(-rot%16)+16:4].repeat(4)).all()  # quad q holds X[(4 * q - rot) % 16]
602 | 


--------------------------------------------------------------------------------
/tests/test_tmu.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2019-2020 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | 
 24 | import time
 25 | from videocore6.driver import Driver
 26 | from videocore6.assembler import qpu
 27 | import numpy as np
 28 | 
 29 | 
 30 | @qpu
 31 | def qpu_tmu_write(asm):
 32 |     # Uniforms (see test_tmu_write): [0] row count n, [1] destination address. Writes 0, 1, 2, ... as 16-lane rows.
 33 |     nop(sig = ldunif)
 34 |     mov(r1, r5, sig = ldunif)  # r1 <- n (loop counter)
 35 | 
 36 |     # r2 = addr + eidx * 4
 37 |     # rf0 = eidx
 38 |     eidx(r0).mov(r2, r5)
 39 |     shl(r0, r0, 2).mov(rf0, r0)
 40 |     add(r2, r2, r0)
 41 | 
 42 |     with loop as l:
 43 | 
 44 |         # rf0: Data to be written.
 45 |         # r0: Overwritten.
 46 |         # r2: Address to write data to.
 47 | 
 48 |         sub(r1, r1, 1, cond = 'pushz').mov(tmud, rf0)
 49 |         l.b(cond = 'anyna')
 50 |         # The three instructions after the branch run on every iteration (presumably branch delay slots; confirm).
 51 |         sub(rf0, rf0, -16).mov(tmua, r2)  # rf0 += 16
 52 |         # r2 += 64
 53 |         shl(r0, 4, 4)
 54 |         tmuwt().add(r2, r2, r0)
 55 | 
 56 |     nop(sig = thrsw)  # same closing sequence as every other kernel in this file
 57 |     nop(sig = thrsw)
 58 |     nop()
 59 |     nop()
 60 |     nop(sig = thrsw)
 61 |     nop()
 62 |     nop()
 63 |     nop()
 64 | 
 65 | 
 66 | def test_tmu_write():
 67 |     # Runs qpu_tmu_write for n rows and checks that the buffer ends up holding 0 .. n * 16 - 1.
 68 |     n = 4096
 69 | 
 70 |     with Driver(data_area_size = n * 16 * 4 + 2 * 4) as drv:  # bytes for `data` plus the two uniforms
 71 | 
 72 |         code = drv.program(qpu_tmu_write)
 73 |         data = drv.alloc(n * 16, dtype = 'uint32')
 74 |         unif = drv.alloc(2, dtype = 'uint32')
 75 | 
 76 |         data[:] = 0xdeadbeaf  # sentinel so that untouched words are detectable
 77 |         unif[0] = n
 78 |         unif[1] = data.addresses()[0]
 79 | 
 80 |         start = time.time()  # start/end are recorded but never read afterwards
 81 |         drv.execute(code, unif.addresses()[0])
 82 |         end = time.time()
 83 | 
 84 |         assert all(data == range(n * 16))
 85 | 
 86 | 
 87 | @qpu
 88 | def qpu_tmu_vec_write(asm, configs, vec_offset):
 89 | 
 90 |     reg_addr = rf0
 91 |     reg_n = rf1
 92 | 
 93 |     nop(sig=ldunifrf(reg_addr))
 94 |     nop(sig=ldunifrf(reg_n))
 95 | 
 96 |     with loop as l:
 97 | 
 98 |         assert 1 <= len(configs) <= 4
 99 |         for i, config in enumerate(configs):
100 | 
101 |             eidx(r0)
102 |             shl(r0, r0, 0xfffffff0)  # 0xfffffff0 % 32 = 16
103 |             assert 1 <= config <= 4
104 |             for j in range(config):
105 |                 mov(tmud, r0).add(r0, r0, 1)
106 | 
107 |             assert 0 <= vec_offset <= 3
108 |             # addr + 4 * 4 * eidx + 4 * vec_offset
109 |             eidx(r0)
110 |             shl(r0, r0, 4)
111 |             sub(r0, r0, -4 * vec_offset)
112 |             add(tmuau if i == 0 else tmua, reg_addr, r0)
113 | 
114 |             # addr += 4 * len(configs) * 16
115 |             shl(r0, 4, 4)
116 |             umul24(r0, r0, len(configs))
117 |             add(reg_addr, reg_addr, r0)
118 | 
119 |         sub(reg_n, reg_n, 1, cond='pushz')
120 |         l.b(cond='na0')
121 |         nop()
122 |         nop()
123 |         nop()
124 | 
125 |     nop(sig = thrsw)
126 |     nop(sig = thrsw)
127 |     nop()
128 |     nop()
129 |     nop(sig = thrsw)
130 |     nop()
131 |     nop()
132 |     nop()
133 | 
134 | 
135 | def test_tmu_vec_write():
136 |     # Runs qpu_tmu_vec_write n times over `configs` and checks every 4-word per-pixel slot of the output.
137 |     n = 123
138 | 
139 |     # The number of 32-bit values in a vector element per pixel is 1, 2, 3, or 4.
140 |     # For example, with four 32-bit config:
141 |     #     tmud <- r0
142 |     #     tmud <- r1
143 |     #     tmud <- r2
144 |     #     tmud <- r3
145 |     #     tmuau <- addr + 4 * 4 * eidx
146 |     # results in:
147 |     #     addr + 0x00: r0[ 0], r1[ 0], r2[ 0], r3[ 0], r0[ 1], r1[ 1], ..., r3[ 3]
148 |     #     addr + 0x40: r0[ 4], r1[ 4], r2[ 4], r3[ 4], r0[ 5], r1[ 5], ..., r3[ 7]
149 |     #     addr + 0x80: ...
150 |     #     addr + 0xc0: r0[12], r1[12], r2[12], r3[12], r0[13], r1[13], ..., r3[15]
151 |     # where rn[i] (0 <= i < 16) is the value in register rn of pixel (eidx) i.
152 |     configs = [4, 3, 2, 1]
153 | 
154 |     # The element per pixel is wrapped modulo 16 bytes.
155 |     # For example, if the above address setting is addr + 4 * 4 * eidx + 4, then
156 |     #     addr + 0x00: r3[ 0], r0[ 0], r1[ 0], r2[ 0], r3[ 1], r0[ 1], ..., r2[ 3]
157 |     #     addr + 0x40: r3[ 4], r0[ 4], r1[ 4], r2[ 4], r3[ 5], r0[ 5], ..., r2[ 7]
158 |     #     addr + 0x80: ...
159 |     #     addr + 0xc0: r3[12], r0[12], r1[12], r2[12], r3[13], r0[13], ..., r2[15]
160 |     vec_offset = 3
161 | 
162 |     data_default = 0xdeadbeef  # sentinel expected in the words a config does not write
163 | 
164 |     with Driver() as drv:
165 | 
166 |         code = drv.program(qpu_tmu_vec_write, configs, vec_offset)
167 |         data = drv.alloc(16 * 4 * len(configs) * n, dtype='uint32')
168 |         unif = drv.alloc(2 + n, dtype='uint32')  # address, n, then one config word per loop iteration
169 | 
170 |         data[:] = data_default
171 | 
172 |         unif[0] = data.addresses()[0]
173 |         unif[1] = n
174 | 
175 |         conf = 0xffffffff
176 |         for config in reversed(configs):  # pack one byte per config, configs[0] ending up in the lowest byte
177 |             conf <<= 8
178 |             conf |= {1: 0xff, 2: 0xfa, 3: 0xfb, 4: 0xfc}[config]
179 |         conf &= 0xffffffff
180 |         unif[2:] = conf
181 | 
182 |         drv.execute(code, unif.addresses()[0])
183 | 
184 |         for i, row in enumerate(data.reshape(-1, 4 * 16)):  # one row = 16 pixels x 4 words, written by one config
185 |             config = configs[i % len(configs)]
186 |             for j, vec in enumerate(row.reshape(-1, 4)):  # j is the pixel (eidx) index
187 |                 ref = list(range(j << 16, (j << 16) + config)) + [data_default] * (4 - config)
188 |                 assert all(np.roll(vec, -vec_offset) == ref)  # undo the vec_offset wrap before comparing
189 | 
190 | 
191 | @qpu
192 | def qpu_tmu_read(asm):
193 |     # Uniforms (see test_tmu_read): [0] vector count, [1] read address, [2] write address. Stores each value + 1.
194 |     # r0: Number of vectors to read.
195 |     # r1: Pointer to the read vectors + eidx * 4.
196 |     # r2: Pointer to the write vectors + eidx * 4
197 |     eidx(r2, sig = ldunif)
198 |     mov(r0, r5, sig = ldunif)
199 |     shl(r2, r2, 2).mov(r1, r5)
200 |     add(r1, r1, r2, sig = ldunif)
201 |     add(r2, r5, r2)
202 | 
203 |     with loop as l:
204 | 
205 |         mov(tmua, r1, sig = thrsw)  # start the read of the current vector
206 |         nop()
207 |         nop()
208 |         nop(sig = ldtmu(rf0))       # rf0 <- loaded vector
209 | 
210 |         sub(r0, r0, 1, cond = 'pushz').add(tmud, rf0, 1)  # count down; queue rf0 + 1 for writing
211 |         l.b(cond = 'anyna')
212 |         shl(r3, 4, 4).mov(tmua, r2)  # r3 <- 64; the three post-branch instructions run every iteration
213 |         # r1 += 64
214 |         # r2 += 64
215 |         add(r1, r1, r3).add(r2, r2, r3)
216 |         tmuwt()
217 | 
218 |     nop(sig = thrsw)  # same closing sequence as every other kernel in this file
219 |     nop(sig = thrsw)
220 |     nop()
221 |     nop()
222 |     nop(sig = thrsw)
223 |     nop()
224 |     nop()
225 |     nop()
226 | 
227 | 
228 | def test_tmu_read():
229 |     # Runs qpu_tmu_read in place (same buffer for input and output) and checks every word was incremented by 1.
230 |     n = 4096
231 | 
232 |     with Driver() as drv:
233 | 
234 |         code = drv.program(qpu_tmu_read)
235 |         data = drv.alloc(n * 16, dtype = 'uint32')
236 |         unif = drv.alloc(3, dtype = 'uint32')
237 | 
238 |         data[:] = range(len(data))
239 |         unif[0] = n
240 |         unif[1] = data.addresses()[0]  # read pointer
241 |         unif[2] = data.addresses()[0]  # write pointer: same buffer
242 | 
243 |         drv.execute(code, unif.addresses()[0])
244 | 
245 |         assert all(data == range(1, n * 16 + 1))
246 | 
247 | 
248 | @qpu
249 | def qpu_tmu_vec_read(asm, configs, vec_offset):
250 |     # Uniforms (see test_tmu_vec_read): [0] source address, [1] destination address, [2] loop count n, then configs.
251 |     reg_src = rf0
252 |     reg_dst = rf1
253 |     reg_n = rf2
254 | 
255 |     nop(sig=ldunifrf(reg_src))
256 |     nop(sig=ldunifrf(reg_dst))
257 |     nop(sig=ldunifrf(reg_n))
258 | 
259 |     # dst += 4 * eidx
260 |     eidx(r0)
261 |     shl(r0, r0, 2)
262 |     add(reg_dst, reg_dst, r0)
263 | 
264 |     with loop as l:
265 | 
266 |         mov(r4, 0)  # r4 accumulates the per-lane sum over all configs of this iteration
267 | 
268 |         assert 1 <= len(configs) <= 4
269 |         for i, config in enumerate(configs):
270 | 
271 |             assert 1 <= config <= 4
272 |             assert 0 <= vec_offset <= 3
273 |             # addr + 4 * 4 * eidx + 4 * vec_offset
274 |             eidx(r0)
275 |             shl(r0, r0, 4)
276 |             sub(r0, r0, -4 * vec_offset)
277 |             add(tmuau if i == 0 else tmua, reg_src, r0, sig=thrsw)  # tmuau only for the first read of each iteration
278 |             nop()
279 |             nop()
280 |             nop(sig=ldtmu(r0))
281 |             nop(sig=ldtmu(r1)) if config >= 2 else eidx(r1)  # components not loaded are replaced by eidx
282 |             nop(sig=ldtmu(r2)) if config >= 3 else eidx(r2)
283 |             nop(sig=ldtmu(r3)) if config >= 4 else eidx(r3)
284 | 
285 |             add(r0, r0, r1).add(r2, r2, r3)
286 |             add(r0, r0, r2)
287 |             add(r4, r4, r0)  # r4 += r0 + r1 + r2 + r3
288 |             # src += 4 * 4 * 16
289 |             shl(r0, 4, 4)
290 |             umul24(r0, r0, 4)
291 |             add(reg_src, reg_src, r0)
292 | 
293 |         mov(tmud, r4)
294 |         # If the configs are shifted out, then 0xff (per-pixel regular 32-bit
295 |         # write) is filled in.
296 |         mov(tmua, reg_dst)
297 | 
298 |         # dst += 4 * 16
299 |         shl(r0, 4, 4)
300 |         add(reg_dst, reg_dst, r0)
301 | 
302 |         sub(reg_n, reg_n, 1, cond='pushz')
303 |         l.b(cond='na0')
304 |         nop()
305 |         nop()
306 |         nop()
307 | 
308 |     nop(sig = thrsw)  # same closing sequence as every other kernel in this file
309 |     nop(sig = thrsw)
310 |     nop()
311 |     nop()
312 |     nop(sig = thrsw)
313 |     nop()
314 |     nop()
315 |     nop()
316 | 
317 | 
318 | def test_tmu_vec_read():
319 |     # Runs qpu_tmu_vec_read and checks each output row against a host-side sum of the components it should load.
320 |     # The settings, the number of elements in a vector, and 16-byte wrapping are
321 |     # the same as the vector writes.
322 | 
323 |     n = 123
324 |     configs = [4, 3, 2, 1]
325 |     vec_offset = 1
326 | 
327 |     with Driver() as drv:
328 | 
329 |         code = drv.program(qpu_tmu_vec_read, configs, vec_offset)
330 |         src = drv.alloc((n, 16 * 4 * len(configs)), dtype='uint32')
331 |         dst = drv.alloc((n, 16), dtype='uint32')
332 |         unif = drv.alloc(3 + n, dtype='uint32')  # src, dst, n, then one config word per loop iteration
333 | 
334 |         src[:, :] = np.arange(src.size, dtype=src.dtype).reshape(src.shape)
335 |         dst[:, :] = 0
336 | 
337 |         unif[0] = src.addresses()[0, 0]
338 |         unif[1] = dst.addresses()[0, 0]
339 |         unif[2] = n
340 | 
341 |         conf = 0xffffffff
342 |         for config in reversed(configs):  # pack one byte per config, configs[0] ending up in the lowest byte
343 |             conf <<= 8
344 |             conf |= {1: 0xff, 2: 0xfa, 3: 0xfb, 4: 0xfc}[config]
345 |         conf &= 0xffffffff
346 |         unif[3:] = conf
347 | 
348 |         drv.execute(code, unif.addresses()[0])
349 | 
350 |         for i, vec in enumerate(dst):
351 |             data = src.shape[1] * i + np.arange(src.shape[1], dtype='uint32').reshape(len(configs), 16, 4)
352 |             s = [0] * 16  # expected per-lane sum for output row i
353 |             for j, config in enumerate(configs):
354 |                 for eidx in range(16):
355 |                     for k in range(config):
356 |                         s[eidx] += data[j, eidx, (k + vec_offset) % 4]  # loaded components, wrapped within the slot
357 |                     s[eidx] += eidx * (4 - config)  # the kernel substitutes eidx for each component it does not load
358 |             assert all(vec == s)
359 | 
360 | 
361 | # VC4 TMU cache & DMA break memory consistency.
362 | # How about VC6 TMU? This kernel does read, +1, write twice on the same address.
363 | @qpu
364 | def qpu_tmu_keeps_memory_consistency(asm):
365 |     # Uniforms (see test_tmu_keeps_memory_consistency): [0] address of the word to increment twice.
366 |     nop(sig = ldunifrf(r0))
367 | 
368 |     mov(tmua, r0, sig = thrsw)  # first read
369 |     nop()
370 |     nop()
371 |     nop(sig = ldtmu(r1))
372 | 
373 |     add(tmud, r1, 1)            # first write: value + 1
374 |     mov(tmua, r0)
375 |     tmuwt()
376 | 
377 |     mov(tmua, r0, sig = thrsw)  # second read of the same address, issued after the write above
378 |     nop()
379 |     nop()
380 |     nop(sig = ldtmu(r1))
381 | 
382 |     add(tmud, r1, 1)            # second write: value + 1 again
383 |     mov(tmua, r0)
384 |     tmuwt()
385 | 
386 |     nop(sig = thrsw)            # same closing sequence as every other kernel in this file
387 |     nop(sig = thrsw)
388 |     nop()
389 |     nop()
390 |     nop(sig = thrsw)
391 |     nop()
392 |     nop()
393 |     nop()
394 | 
395 | def test_tmu_keeps_memory_consistency():
396 |     # Runs the kernel on data[0] and checks that the second read observed the first write (1 -> 2 -> 3).
397 |     with Driver() as drv:
398 | 
399 |         code = drv.program(qpu_tmu_keeps_memory_consistency)
400 |         data = drv.alloc(16, dtype = 'uint32')
401 |         unif = drv.alloc(3, dtype = 'uint32')  # only unif[0] is assigned below
402 | 
403 |         data[:] = 1
404 |         unif[0] = data.addresses()[0]
405 | 
406 |         drv.execute(code, unif.addresses()[0])
407 | 
408 |         assert (data[0] == 3).all()   # incremented twice
409 |         assert (data[1:] == 1).all()  # every other word is untouched
410 | 
411 | 
412 | @qpu
413 | def qpu_tmu_read_tmu_write_uniform_read(asm):
414 |     # Uniforms (see the test below): [0] address of data, [1] address of result. Reads data back as a uniform.
415 |     eidx(r0, sig = ldunifrf(rf0))
416 |     shl(r0, r0, 2)
417 |     add(rf0, rf0, r0, sig = ldunifrf(rf1))  # rf0 <- address of data + eidx * 4
418 |     add(rf1, rf1, r0)                       # rf1 <- address of result + eidx * 4
419 | 
420 |     mov(tmua, rf0, sig = thrsw)
421 |     nop()
422 |     nop()
423 |     nop(sig = ldtmu(r0)) # r0 = [1,...,1]
424 | 
425 |     add(tmud, r0, 1)
426 |     mov(tmua, rf0)       # data = [2,...,2]
427 |     tmuwt()
428 | 
429 |     b(R.set_unif_addr, cond = 'always').unif_addr(rf0) # unif_addr = data.addresses()[0]
430 |     nop()
431 |     nop()
432 |     nop()
433 |     L.set_unif_addr
434 | 
435 |     nop(sig = ldunifrf(r0)) # intended: r0 = [data[0],...,data[0]] = [2,...,2]; the test's assert implies 1 is read
436 | 
437 |     add(tmud, r0, 1)
438 |     mov(tmua, rf1)          # intended: result = [3,...,3]; the test asserts result == 2
439 |     tmuwt()
440 | 
441 |     nop(sig = thrsw)        # same closing sequence as every other kernel in this file
442 |     nop(sig = thrsw)
443 |     nop()
444 |     nop()
445 |     nop(sig = thrsw)
446 |     nop()
447 |     nop()
448 |     nop()
449 | 
450 | def test_tmu_read_tmu_write_uniform_read():
451 |     # Runs the kernel and records what a uniform read returns for memory that was just written through the TMU.
452 |     with Driver() as drv:
453 | 
454 |         code = drv.program(qpu_tmu_read_tmu_write_uniform_read)
455 |         data = drv.alloc(16, dtype = 'uint32')
456 |         result = drv.alloc(16, dtype = 'uint32')
457 |         unif = drv.alloc(3, dtype = 'uint32')  # only unif[0] and unif[1] are assigned below
458 | 
459 |         data[:] = 1
460 |         unif[0] = data.addresses()[0]
461 |         unif[1] = result.addresses()[0]
462 | 
463 |         drv.execute(code, unif.addresses()[0])
464 | 
465 |         assert (data == 2).all()    # the TMU write of data + 1 landed
466 |         assert (result == 2).all() # !? not 3 ? -- i.e. the uniform read returned 1, the value before the TMU write
467 | 


--------------------------------------------------------------------------------
/tests/test_unifa.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2021 Idein Inc.
  3 | #
  4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  5 | # of this software and associated documentation files (the "Software"), to deal
  6 | # in the Software without restriction, including without limitation the rights
  7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 | # copies of the Software, and to permit persons to whom the Software is
  9 | # furnished to do so, subject to the following conditions:
 10 | #
 11 | # The above copyright notice and this permission notice (including the next
 12 | # paragraph) shall be included in all copies or substantial portions of the
 13 | # Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | 
 24 | from videocore6.assembler import qpu
 25 | from videocore6.driver import Driver
 26 | 
 27 | import numpy as np
 28 | 
 29 | 
@qpu
def qpu_unifa(asm):
    # Kernel exercising the sideband uniform stream (unifa / ldunifa), first
    # on its own and then interleaved with the ordinary uniform stream
    # (unif_addr / ldunif).
    #
    # Uniforms, in load order: n, src0 address, src1 address, dst address.
    #
    # Every TMU store below writes r5 to the address in reg_dst and then
    # advances reg_dst by reg_inc.  NOTE(review): r5 is presumably where
    # ldunif / ldunifa deposit the loaded word, since each store follows one
    # of those signals -- confirm against the QPU documentation.
    #
    # NOTE(review): the three instructions that follow each b() look like
    # branch delay slots (they carry stores and the loop-counter update) --
    # confirm.

    reg_n = rf0
    reg_src0 = rf1
    reg_src1 = rf2
    reg_dst = rf3
    reg_inc = rf4
    reg_tmp = rf5

    nop(sig=ldunifrf(reg_n))
    nop(sig=ldunifrf(reg_src0))
    nop(sig=ldunifrf(reg_src1))
    nop(sig=ldunifrf(reg_dst))

    # r0 = element index * 4, added to all three base addresses.
    # Presumably this is the per-lane byte offset for 32-bit elements --
    # TODO confirm.
    eidx(r0)
    shl(r0, r0, 2)
    add(reg_src0, reg_src0, r0)
    add(reg_src1, reg_src1, r0)
    add(reg_dst, reg_dst, r0)

    # reg_inc = 4 << 4 = 64.  The host-side test allocates dst as rows of
    # 16 uint32 values, i.e. 64 bytes per row.
    shl(reg_inc, 4, 4)

    # Phase 1: read through unifa only.  The host-side test expects dst rows
    # [0, n) to equal src0.

    # Address is taken from element zero.
    mov(unifa, reg_src0)
    # Three delays are required for the data to be ready.
    nop()
    nop()
    # r0 = n - 1, pushing the zero flag that the loop branch tests.
    sub(r0, reg_n, 1, cond='pushz')
    L.l0
    nop(sig=ldunifa)
    b(R.l0, cond='na0')
    mov(tmud, r5)
    mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_inc)
    sub(r0, r0, 1, cond='pushz')

    # Phase 2: the ordinary uniform stream is pointed at src0 by the branch
    # and the sideband stream at src1.  The host-side test expects the next
    # 2 * n dst rows to alternate src0[i], src1[i].

    # Ordinary uniform and sideband uniform simultaneous reads.
    b(R.l1, cond='always').unif_addr(reg_src0)
    mov(unifa, reg_src1)
    sub(r0, reg_n, 1, cond='pushz')
    nop()
    L.l1
    nop(sig=ldunif)
    mov(tmud, r5, sig=ldunifa)
    mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_inc)
    b(R.l1, cond='na0')
    mov(tmud, r5)
    mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_inc)
    sub(r0, r0, 1, cond='pushz')

    # Phase 3: both streams are re-addressed on every iteration; each
    # iteration performs two ldunif and two ldunifa loads, and the loop
    # counter starts from n >> 1.

    # Check if the two uniform streams proceed mutually-exclusively.
    #
    # Timeline:
    #
    #  time | unif     | unifa
    # ------+----------+----------
    #  T0   | set addr |
    #  T1   | load     |
    #  T2   |          | load
    #  T3   |          | set addr
    #  T4   |          | load
    #  T5   | load     |
    #  T0   | set addr |
    #  T1   | load     |
    #  T2   |          | load
    #  T3   |          | set addr
    #  T4   |          | load
    #  T5   | load     |
    #  ...  | ...      | ...

    # Branch takes the second element as a new uniform address.
    quad_rotate(reg_src1, reg_src1, 1)
    shr(r0, reg_n, 1)
    mov(unifa, reg_src0).add(reg_src0, reg_src0, 4)
    L.l2
    b(R.l3, cond='always').unif_addr(reg_src1)                      # T0
    add(reg_src1, reg_src1, 8)
    sub(r0, r0, 1, cond='pushz')
    nop()
    L.l3
    nop(sig=ldunif)                                                 # T1
    mov(tmud, r5)
    mov(tmua, reg_dst, sig=ldunifa).mov(unifa, reg_src0)            # T2, T3
    mov(tmud, r5)
    add(reg_dst, reg_dst, reg_inc)
    add(reg_src0, reg_src0, 8)
    mov(tmua, reg_dst, sig=ldunifa).add(reg_dst, reg_dst, reg_inc)  # T4
    mov(tmud, r5, sig=ldunif)                                       # T5
    b(R.l2, cond='na0')
    mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_inc)
    mov(tmud, r5)
    mov(tmua, reg_dst).add(reg_dst, reg_dst, reg_inc)

    # Program epilogue: the same thrsw / nop sequence that ends the other
    # kernels in this test suite.
    nop(sig=thrsw)
    nop(sig=thrsw)
    nop()
    nop()
    nop(sig=thrsw)
    nop()
    nop()
    nop()
131 | 
132 | 
def test_unifa():
    """Run qpu_unifa on the device and verify the three output regions.

    dst is laid out as 5 * n rows of 16 uint32 values:
      * rows [0, n)       -- src0[i] in row i;
      * rows [n, 3n)      -- src0[i], src1[i] alternating;
      * rows [3n, 5n)     -- src0[i] and src1[i] per pair of rows, with the
                             order inside the pair depending on i's parity.
    """
    n = 548

    # The kernel halves n for its last loop, so n must be even and >= 2.
    assert n >= 2 and n % 2 == 0

    with Driver() as drv:

        code = drv.program(qpu_unifa)

        # Allocation order is preserved; it fixes the buffer addresses.
        unif = drv.alloc(4, dtype='uint32')
        src0 = drv.alloc(n, dtype='uint32')
        src1 = drv.alloc(n, dtype='uint32')
        dst = drv.alloc((n * 5, 16), dtype='uint32')

        rng = np.random.default_rng()
        src0[:] = rng.integers(1, 2 ** 32 - 1, size=n)
        src1[:] = rng.integers(1, 2 ** 32 - 1, size=n)
        dst[:, :] = 0

        unif[0] = n
        unif[1] = src0.addresses()[0]
        unif[2] = src1.addresses()[0]
        unif[3] = dst.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        second_region = n
        third_region = n * 3

        for i in range(n):
            from_src0 = src0[i]
            from_src1 = src1[i]
            parity = i % 2
            pair = i * 2

            assert (dst[i] == from_src0).all()
            assert (dst[second_region + pair] == from_src0).all()
            assert (dst[second_region + pair + 1] == from_src1).all()
            assert (dst[third_region + pair + parity] == from_src1).all()
            assert (dst[third_region + pair + (1 - parity)] == from_src0).all()
165 | 


--------------------------------------------------------------------------------
/tests/test_v3d.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright (c) 2019-2020 Idein Inc.
 3 | #
 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | # of this software and associated documentation files (the "Software"), to deal
 6 | # in the Software without restriction, including without limitation the rights
 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | # copies of the Software, and to permit persons to whom the Software is
 9 | # furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice (including the next
12 | # paragraph) shall be included in all copies or substantial portions of the
13 | # Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | 
24 | from videocore6.drm_v3d import DRM_V3D
25 | from videocore6.v3d import *
26 | 
27 | 
def test_v3d_regs():
    """Compare memory-mapped V3D registers with the values the DRM driver reports.

    Each entry pairs a RegisterMapping key with the DRM_V3D parameter that
    should return the same value.  If the register mapping cannot be opened
    for lack of privilege, the comparison is skipped with a message.
    """
    expectations = (
        (HUB_UIFCFG, DRM_V3D.V3D_PARAM_V3D_UIFCFG),
        (HUB_IDENT1, DRM_V3D.V3D_PARAM_V3D_HUB_IDENT1),
        (HUB_IDENT2, DRM_V3D.V3D_PARAM_V3D_HUB_IDENT2),
        (HUB_IDENT3, DRM_V3D.V3D_PARAM_V3D_HUB_IDENT3),
        # Per-core registers are indexed as (register, core number).
        ((CORE_IDENT0, 0), DRM_V3D.V3D_PARAM_V3D_CORE0_IDENT0),
        ((CORE_IDENT1, 0), DRM_V3D.V3D_PARAM_V3D_CORE0_IDENT1),
        ((CORE_IDENT2, 0), DRM_V3D.V3D_PARAM_V3D_CORE0_IDENT2),
    )

    with DRM_V3D() as drm:

        try:

            with RegisterMapping() as regmap:

                for key, param in expectations:
                    assert regmap[key] == drm.v3d_get_param(param)

        except PermissionError:

            print('Skipping tests because of a lack of root privilege')
60 | 


--------------------------------------------------------------------------------
/videocore6/__init__.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright (c) 2019-2020 Idein Inc.
 3 | #
 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 5 | # of this software and associated documentation files (the "Software"), to deal
 6 | # in the Software without restriction, including without limitation the rights
 7 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 | # copies of the Software, and to permit persons to whom the Software is
 9 | # furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice (including the next
12 | # paragraph) shall be included in all copies or substantial portions of the
13 | # Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | 
24 | __version__ = '0.0.0'
25 | 
26 | 
27 | import struct
28 | 
29 | 
def pack_unpack(pack, unpack, v):
    """Reinterpret the packed bytes of ``v`` under a different struct format.

    ``v`` is packed with the ``pack`` format and the resulting bytes are
    unpacked with the ``unpack`` format; the first unpacked field is
    returned.  When ``v`` is a list the conversion is applied to every
    element and a new list is returned.
    """

    def reinterpret(value):
        raw = struct.pack(pack, value)
        return struct.unpack(unpack, raw)[0]

    if not isinstance(v, list):
        return reinterpret(v)

    return list(map(reinterpret, v))
36 | 


--------------------------------------------------------------------------------
/videocore6/driver.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2014-2018 Broadcom
  3 | # Copyright (c) 2019-2020 Idein Inc.
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify it under
  6 | # the terms of the GNU General Public License as published by the Free Software
  7 | # Foundation; either version 2 of the License, or (at your option) any later
  8 | # version.
  9 | #
 10 | # This program is distributed in the hope that it will be useful, but WITHOUT
 11 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 12 | # FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 13 | # details.
 14 | #
 15 | # You should have received a copy of the GNU General Public License along with
 16 | # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
 17 | # Street, Fifth Floor, Boston, MA 02110-1301 USA.
 18 | 
 19 | 
 20 | import sys
 21 | import mmap
 22 | from videocore6.drm_v3d import DRM_V3D
 23 | from videocore6.assembler import assemble
 24 | import numpy as np
 25 | 
 26 | 
 27 | DEFAULT_CODE_AREA_SIZE = 1024 * 1024
 28 | DEFAULT_DATA_AREA_SIZE = 32 * 1024 * 1024
 29 | 
 30 | 
 31 | class DriverError(Exception):
 32 |     pass
 33 | 
 34 | 
 35 | class Array(np.ndarray):
 36 | 
 37 |     def __new__(cls, *args, **kwargs):
 38 | 
 39 |         phyaddr = kwargs.pop('phyaddr')
 40 |         obj = super().__new__(cls, *args, **kwargs)
 41 |         obj.address = phyaddr
 42 |         return obj
 43 | 
 44 |     def addresses(self):
 45 | 
 46 |         return np.arange(
 47 |             self.address,
 48 |             self.address + self.nbytes,
 49 |             self.itemsize,
 50 |             np.uint32,
 51 |         ).reshape(self.shape)
 52 | 
 53 | 
 54 | class Memory(object):
 55 | 
 56 |     def __init__(self, drm, size):
 57 | 
 58 |         self.drm = drm
 59 |         self.size = size
 60 |         self.handle = None  # Handle of BO for V3D DRM
 61 |         self.phyaddr = None  # Physical address used in QPU
 62 |         self.buffer = None  # mmap object of the memory area
 63 | 
 64 |         try:
 65 | 
 66 |             self.handle, self.phyaddr = drm.v3d_create_bo(size)
 67 |             offset = drm.v3d_mmap_bo(self.handle)
 68 |             self.buffer = mmap.mmap(fileno=drm.fd, length=size,
 69 |                                     flags=mmap.MAP_SHARED,
 70 |                                     prot=mmap.PROT_READ | mmap.PROT_WRITE,
 71 |                                     offset=offset)
 72 | 
 73 |         except Exception as e:
 74 | 
 75 |             self.close()
 76 |             raise e
 77 | 
 78 |     def close(self):
 79 | 
 80 |         if self.buffer is not None:
 81 |             self.buffer.close()
 82 | 
 83 |         if self.handle is not None:
 84 |             self.drm.gem_close(self.handle)
 85 | 
 86 |         self.drm = None
 87 |         self.size = None
 88 |         self.handle = None
 89 |         self.phyaddr = None
 90 |         self.buffer = None
 91 | 
 92 | 
 93 | class Dispatcher(object):
 94 | 
 95 |     def __init__(self, drm, bo_handles, timeout_sec=10):
 96 |         self.drm = drm
 97 |         self.bo_handles = bo_handles
 98 |         self.timeout_sec = timeout_sec
 99 | 
100 |     def __enter__(self):
101 |         return self
102 | 
103 |     def __exit__(self, ex_type, ex_value, trace):
104 |         for bo_handle in self.bo_handles:
105 |             self.drm.v3d_wait_bo(bo_handle,
106 |                                  timeout_ns=int(self.timeout_sec / 1e-9))
107 | 
108 |     def dispatch(self, code, uniforms=None, workgroup=(16, 1, 1), wgs_per_sg=16, thread=1):
109 | 
110 |         wg_x, wg_y, wg_z = workgroup
111 |         wg_size = wg_x * wg_y * wg_z
112 | 
113 |         def roundup(n, d):
114 |             return (n + d - 1) // d
115 | 
116 |         self.drm.v3d_submit_csd(
117 |             cfg=[
118 |                 # WGS X, Y, Z and settings
119 |                 wg_x << 16,
120 |                 wg_y << 16,
121 |                 wg_z << 16,
122 |                 ((roundup(wgs_per_sg * wg_size, 16) - 1) << 12) |
123 |                 (wgs_per_sg << 8) |
124 |                 (wg_size & 0xff),
125 |                 # Number of batches minus 1
126 |                 thread - 1,
127 |                 # Shader address, pnan, singleseg, threading
128 |                 code.addresses()[0],
129 |                 # Uniforms address
130 |                 uniforms if uniforms is not None else 0,
131 |             ],
132 |             # Not used in the driver.
133 |             coef=[0, 0, 0, 0],
134 |             bo_handles=self.bo_handles.ctypes.data,
135 |             bo_handle_count=len(self.bo_handles),
136 |             in_sync=0,
137 |             out_sync=0,
138 |         )
139 | 
140 | 
class Driver(object):
    """Owns one V3D buffer object and hands out code and data arrays from it.

    The buffer is split into a code area starting at offset 0, followed
    directly by a data area.  Both areas are filled by bump allocation:
    ``code_pos`` / ``data_pos`` only ever move forward and nothing is freed
    individually.  Use the instance as a context manager (or call close())
    to release the buffer object and the DRM file descriptor.
    """

    def __init__(self, *,
                 code_area_size=DEFAULT_CODE_AREA_SIZE,
                 data_area_size=DEFAULT_DATA_AREA_SIZE,
                 ):
        """Open the DRM device and allocate code_area_size + data_area_size bytes.

        Raises whatever the DRM device or buffer creation raises; anything
        acquired before the failure is released first.
        """
        self.code_area_size = code_area_size
        self.data_area_size = data_area_size
        total_size = self.code_area_size + self.data_area_size
        self.code_area_base = 0
        self.data_area_base = self.code_area_base + self.code_area_size
        self.code_pos = self.code_area_base
        self.data_pos = self.data_area_base

        # Initialised to None first so that close() is safe to call from the
        # error path below, whichever step failed.
        self.drm = None
        self.memory = None
        self.bo_handles = None

        try:

            self.drm = DRM_V3D()

            self.memory = Memory(self.drm, total_size)

            self.bo_handles = np.array([self.memory.handle], dtype=np.uint32)

        except Exception:

            self.close()
            raise

    def close(self):
        """Release the buffer object and the DRM device; safe to call twice."""
        if self.memory is not None:
            self.memory.close()

        if self.drm is not None:
            self.drm.close()

        self.drm = None
        self.memory = None
        self.bo_handles = None

    def __enter__(self):

        return self

    def __exit__(self, exc_type, value, traceback):
        # Returns False whenever an exception is in flight, so exceptions
        # raised inside the ``with`` block are never suppressed.
        self.close()
        return exc_type is None

    def alloc(self, *args, **kwargs):
        """Allocate an Array in the data area.

        Positional and keyword arguments are forwarded to Array (and so to
        ``np.ndarray``); ``phyaddr``, ``buffer`` and ``offset`` are filled in
        here and override any caller-supplied values.

        Raises:
            DriverError: if the array would extend past the data area.  The
                allocation position is left unchanged in that case, so the
                driver stays usable after a failed request.
        """
        offset = self.data_pos
        kwargs['phyaddr'] = self.memory.phyaddr + offset
        kwargs['buffer'] = self.memory.buffer
        kwargs['offset'] = offset

        # NOTE(review): the data area ends where the mapped buffer ends, so
        # numpy itself appears to reject an oversized request (TypeError)
        # before the check below is reached -- confirm and consider mapping
        # that to DriverError as well.
        arr = Array(*args, **kwargs)

        end = offset + arr.nbytes
        if end > self.data_area_base + self.data_area_size:
            raise DriverError('Data too large')

        # Commit only after the bounds check has passed.
        self.data_pos = end

        return arr

    def dump_code(self, code, *, file=sys.stdout):
        """Print every instruction of ``code`` as a 64-bit hexadecimal word."""
        for insn in code:
            print(f'{insn:#018x}', file=file)

    def dump_program(self, prog, *args, file=sys.stdout, **kwargs):
        """Assemble ``prog`` with the given arguments and dump the result."""
        self.dump_code(assemble(prog, *args, **kwargs), file=file)

    def program(self, prog, *args, **kwargs):
        """Place a program in the code area and return it as a uint64 Array.

        ``prog`` is either a callable, which is assembled with ``*args`` and
        ``**kwargs``, or an already assembled sequence of instructions.

        Raises:
            DriverError: if the program does not fit in the code area.  The
                code position is left unchanged in that case.
        """
        if hasattr(prog, '__call__'):
            asm = assemble(prog, *args, **kwargs)
        else:
            asm = prog

        offset = self.code_pos
        end = offset + len(asm) * np.dtype(np.uint64).itemsize

        # Validate before touching the buffer or the allocation position, so
        # a rejected program cannot leave the allocator advanced.
        if end > self.code_area_base + self.code_area_size:
            raise DriverError('Code too large')

        code = Array(
            shape=len(asm),
            dtype=np.uint64,
            phyaddr=self.memory.phyaddr + offset,
            buffer=self.memory.buffer,
            offset=offset,
        )

        code[:] = asm

        self.code_pos = end

        return code

    def compute_shader_dispatcher(self, timeout_sec=10):
        """Return a Dispatcher bound to this driver's device and buffer object."""
        return Dispatcher(self.drm, self.bo_handles, timeout_sec=timeout_sec)

    def execute(self, code, uniforms=None, timeout_sec=10, workgroup=(16, 1, 1), wgs_per_sg=16, thread=1):
        """Dispatch ``code`` once and wait for the buffer object to become idle."""
        with self.compute_shader_dispatcher(timeout_sec) as csd:
            csd.dispatch(code, uniforms=uniforms, workgroup=workgroup, wgs_per_sg=wgs_per_sg, thread=thread)
245 | 


--------------------------------------------------------------------------------
/videocore6/drm_v3d.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2014-2018 Broadcom
  3 | # Copyright (c) 2019-2020 Idein Inc.
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify it under
  6 | # the terms of the GNU General Public License as published by the Free Software
  7 | # Foundation; either version 2 of the License, or (at your option) any later
  8 | # version.
  9 | #
 10 | # This program is distributed in the hope that it will be useful, but WITHOUT
 11 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 12 | # FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 13 | # details.
 14 | #
 15 | # You should have received a copy of the GNU General Public License along with
 16 | # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
 17 | # Street, Fifth Floor, Boston, MA 02110-1301 USA.
 18 | 
 19 | 
 20 | import os
 21 | from fcntl import ioctl
 22 | from ctypes import Structure, c_uint32, c_uint64
 23 | from ioctl_opt import IOW, IOWR
 24 | 
 25 | 
class DRM_V3D(object):
    """Thin wrapper over the ioctl interface of the V3D DRM device node.

    Holds a single file descriptor (``self.fd``) and exposes the ioctls
    declared below as methods.  Usable as a context manager; the descriptor
    is closed on exit.
    """

    def __init__(self, path='/dev/dri/by-path/platform-fec00000.v3d-card'):
        """Open the device node at ``path`` for reading and writing."""
        self.fd = os.open(path, os.O_RDWR)

    def close(self):
        """Close the file descriptor if it is still open; safe to call twice."""
        if self.fd is not None:
            os.close(self.fd)
        self.fd = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Returns False when an exception is in flight, so it is not
        # suppressed.
        self.close()
        return exc_value is None

    # Derived from linux/include/uapi/drm/drm.h
    DRM_IOCTL_BASE = ord('d')
    DRM_COMMAND_BASE = 0x40
    DRM_GEM_CLOSE = 0x09

    # Derived from linux/include/uapi/drm/v3d_drm.h
    DRM_V3D_WAIT_BO = DRM_COMMAND_BASE + 0x01
    DRM_V3D_CREATE_BO = DRM_COMMAND_BASE + 0x02
    DRM_V3D_MMAP_BO = DRM_COMMAND_BASE + 0x03
    DRM_V3D_GET_PARAM = DRM_COMMAND_BASE + 0x04
    DRM_V3D_SUBMIT_CSD = DRM_COMMAND_BASE + 0x07

    # Parameter selectors accepted by v3d_get_param().
    V3D_PARAM_V3D_UIFCFG = 0
    V3D_PARAM_V3D_HUB_IDENT1 = 1
    V3D_PARAM_V3D_HUB_IDENT2 = 2
    V3D_PARAM_V3D_HUB_IDENT3 = 3
    V3D_PARAM_V3D_CORE0_IDENT0 = 4
    V3D_PARAM_V3D_CORE0_IDENT1 = 5
    V3D_PARAM_V3D_CORE0_IDENT2 = 6
    V3D_PARAM_SUPPORTS_TFU = 7
    V3D_PARAM_SUPPORTS_CSD = 8

    # ioctl argument blocks.  NOTE(review): field order and widths
    # presumably mirror the kernel UAPI structs named in the headers above;
    # do not reorder.

    # Argument of IOCTL_GEM_CLOSE.
    class st_gem_close(Structure):
        _fields_ = [
            ('handle', c_uint32),
            ('pad', c_uint32),
        ]

    # Argument of IOCTL_V3D_WAIT_BO.
    class st_v3d_wait_bo(Structure):
        _fields_ = [
            ('handle', c_uint32),
            ('pad', c_uint32),
            ('timeout_ns', c_uint64),
        ]

    # Argument of IOCTL_V3D_CREATE_BO; `handle` and `offset` are read back
    # after the call.
    class st_v3d_create_bo(Structure):
        _fields_ = [
            ('size', c_uint32),
            ('flags', c_uint32),
            ('handle', c_uint32),
            ('offset', c_uint32),
        ]

    # Argument of IOCTL_V3D_MMAP_BO; `offset` is read back after the call.
    class st_v3d_mmap_bo(Structure):
        _fields_ = [
            ('handle', c_uint32),
            ('flags', c_uint32),
            ('offset', c_uint64),
        ]

    # Argument of IOCTL_V3D_GET_PARAM; `value` is read back after the call.
    class st_v3d_get_param(Structure):
        _fields_ = [
            ('param', c_uint32),
            ('pad', c_uint32),
            ('value', c_uint64),
        ]

    # Argument of IOCTL_V3D_SUBMIT_CSD.
    class st_v3d_submit_csd(Structure):
        _fields_ = [
            ('cfg', c_uint32 * 7),
            ('coef', c_uint32 * 4),
            ('bo_handles', c_uint64),
            ('bo_handle_count', c_uint32),
            ('in_sync', c_uint32),
            ('out_sync', c_uint32),
        ]

    # ioctl request codes, built from the base, command number and argument
    # struct.
    IOCTL_GEM_CLOSE = IOW(DRM_IOCTL_BASE, DRM_GEM_CLOSE, st_gem_close)

    IOCTL_V3D_WAIT_BO = IOWR(DRM_IOCTL_BASE, DRM_V3D_WAIT_BO, st_v3d_wait_bo)
    IOCTL_V3D_CREATE_BO = IOWR(DRM_IOCTL_BASE, DRM_V3D_CREATE_BO,
                               st_v3d_create_bo)
    IOCTL_V3D_MMAP_BO = IOWR(DRM_IOCTL_BASE, DRM_V3D_MMAP_BO, st_v3d_mmap_bo)
    IOCTL_V3D_GET_PARAM = IOWR(DRM_IOCTL_BASE, DRM_V3D_GET_PARAM,
                               st_v3d_get_param)
    IOCTL_V3D_SUBMIT_CSD = IOW(DRM_IOCTL_BASE, DRM_V3D_SUBMIT_CSD,
                               st_v3d_submit_csd)

    def gem_close(self, handle):
        """Issue IOCTL_GEM_CLOSE for the buffer-object ``handle``."""
        st = self.st_gem_close(
            handle=handle,
            pad=0,
        )
        ioctl(self.fd, self.IOCTL_GEM_CLOSE, st)

    def v3d_wait_bo(self, handle, timeout_ns):
        """Issue IOCTL_V3D_WAIT_BO for ``handle`` with a timeout in nanoseconds."""
        st = self.st_v3d_wait_bo(
            handle=handle,
            pad=0,
            timeout_ns=timeout_ns,
        )
        ioctl(self.fd, self.IOCTL_V3D_WAIT_BO, st)

    def v3d_create_bo(self, size, flags=0):
        """Create a buffer object of ``size`` bytes.

        Returns:
            A ``(handle, offset)`` tuple read back from the ioctl argument.
            The driver module stores the second item as the address used by
            the QPU.
        """
        st = self.st_v3d_create_bo(
            size=size,
            flags=flags,
            handle=0,
            offset=0,
        )
        ioctl(self.fd, self.IOCTL_V3D_CREATE_BO, st)
        return st.handle, st.offset

    def v3d_mmap_bo(self, handle, flags=0):
        """Return the offset to pass to mmap() on this device for ``handle``."""
        st = self.st_v3d_mmap_bo(
            handle=handle,
            flags=flags,
            offset=0,
        )
        ioctl(self.fd, self.IOCTL_V3D_MMAP_BO, st)
        return st.offset

    def v3d_get_param(self, param):
        """Return the value of ``param`` (one of the V3D_PARAM_* constants)."""
        st = self.st_v3d_get_param(
            param=param,
            pad=0,
            value=0,
        )
        ioctl(self.fd, self.IOCTL_V3D_GET_PARAM, st)
        return st.value

    def v3d_submit_csd(self, cfg, coef, bo_handles, bo_handle_count, in_sync,
                       out_sync):
        """Submit a compute-shader dispatch.

        Args:
            cfg: sequence of 7 values stored as 32-bit words.
            coef: sequence of 4 values stored as 32-bit words.
            bo_handles: integer stored in a 64-bit field; the driver module
                passes the address of a uint32 array of handles.
            bo_handle_count: number of handles in that array.
            in_sync: stored as a 32-bit word (callers in this package pass 0).
            out_sync: stored as a 32-bit word (callers in this package pass 0).
        """
        st = self.st_v3d_submit_csd(
            # XXX: Dirty hack!
            cfg=(c_uint32 * 7)(*cfg),
            coef=(c_uint32 * 4)(*coef),
            bo_handles=bo_handles,
            bo_handle_count=bo_handle_count,
            in_sync=in_sync,
            out_sync=out_sync,
        )
        ioctl(self.fd, self.IOCTL_V3D_SUBMIT_CSD, st)
176 | 


--------------------------------------------------------------------------------
/videocore6/readwrite4.c:
--------------------------------------------------------------------------------
 1 | 
 2 | /*
 3 |  * Copyright (c) 2019-2020 Idein Inc.
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |  * of this software and associated documentation files (the "Software"), to deal
 7 |  * in the Software without restriction, including without limitation the rights
 8 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |  * copies of the Software, and to permit persons to whom the Software is
10 |  * furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice (including the next
13 |  * paragraph) shall be included in all copies or substantial portions of the
14 |  * Software.
15 |  *
16 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
19 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 |  * SOFTWARE.
23 |  */
24 | 
25 | 
26 | #if defined(__arm__) && defined(__aarch64__)
27 | #error "__arm__ and __aarch64__ are both defined"
28 | #elif !defined(__arm__) && !defined(__aarch64__)
29 | #error "__arm__ and __aarch64__ are both not defined"
30 | #endif
31 | 
32 | 
33 | #include <stdint.h>
34 | 
35 | 
/*
 * Load one 32-bit word from addr and return it.
 *
 * The load is written as a single explicit ldr instruction inside a
 * volatile asm statement with a "memory" clobber, presumably so that the
 * compiler can neither elide, split nor reorder the access.
 */
uint32_t read4(void * const addr)
{
    uint32_t value;

    asm volatile (
#if defined(__arm__)
            "ldr %[value], [%[addr]]\n\t"
#elif defined(__aarch64__)
            /* %w selects the 32-bit view of the destination register. */
            "ldr %w[value], [%[addr]]\n\t"
#endif
            : [value] "=r" (value)
            : [addr] "r" (addr)
            : "memory"
    );

    return value;
}
53 | 
54 | 
/*
 * Store the 32-bit word value at addr.
 *
 * The store is written as a single explicit str instruction inside a
 * volatile asm statement with a "memory" clobber, presumably so that the
 * compiler can neither elide, split nor reorder the access.
 */
void write4(void * const addr, const uint32_t value)
{
    asm volatile (
#if defined(__arm__)
            "str %[value], [%[addr]]\n\t"
#elif defined(__aarch64__)
            /* %w selects the 32-bit view of the source register. */
            "str %w[value], [%[addr]]\n\t"
#endif
            :
            : [value] "r" (value),
              [addr] "r" (addr)
            : "memory"
    );
}
69 | 


--------------------------------------------------------------------------------
/videocore6/v3d.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright (c) 2014-2018 Broadcom
  3 | # Copyright (c) 2019-2020 Idein Inc.
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify it under
  6 | # the terms of the GNU General Public License as published by the Free Software
  7 | # Foundation; either version 2 of the License, or (at your option) any later
  8 | # version.
  9 | #
 10 | # This program is distributed in the hope that it will be useful, but WITHOUT
 11 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 12 | # FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 13 | # details.
 14 | #
 15 | # You should have received a copy of the GNU General Public License along with
 16 | # this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
 17 | # Street, Fifth Floor, Boston, MA 02110-1301 USA.
 18 | 
 19 | 
 20 | from ctypes import cdll, c_uint32, c_void_p
 21 | from importlib.machinery import EXTENSION_SUFFIXES
 22 | from pathlib import Path
 23 | import mmap
 24 | import os
 25 | 
 26 | import numpy as np
 27 | 
 28 | 
 29 | class HubRegister:
 30 | 
 31 |     def __init__(self, offset):
 32 | 
 33 |         self.offset = offset
 34 | 
 35 | 
 36 | class PerCoreRegister:
 37 | 
 38 |     def __init__(self, offset):
 39 | 
 40 |         self.offset = offset
 41 | 
 42 | 
 43 | class HubField:
 44 | 
 45 |     def __init__(self, register, high, low):
 46 | 
 47 |         assert isinstance(register, HubRegister)
 48 |         self.register = register
 49 |         self.mask = ((1 << (high - low + 1)) - 1) << low
 50 |         self.shift = low
 51 | 
 52 | 
 53 | class PerCoreField:
 54 | 
 55 |     def __init__(self, register, high, low):
 56 | 
 57 |         assert isinstance(register, PerCoreRegister)
 58 |         self.register = register
 59 |         self.mask = ((1 << (high - low + 1)) - 1) << low
 60 |         self.shift = low
 61 | 
 62 | 
# V3D register definitions derived from linux/drivers/gpu/drm/v3d/v3d_regs.h
#
# Registers are declared by offset.  Fields are declared as
# (register, high bit, low bit) with both bit positions inclusive, which is
# how HubField / PerCoreField derive their mask and shift.

# Hub registers.

HUB_AXICFG = HubRegister(0x00000)

HUB_UIFCFG = HubRegister(0x00004)

HUB_IDENT0 = HubRegister(0x00008)

HUB_IDENT1 = HubRegister(0x0000c)
HUB_IDENT1_WITH_MSO = HubField(HUB_IDENT1, 19, 19)
HUB_IDENT1_WITH_TSY = HubField(HUB_IDENT1, 18, 18)
HUB_IDENT1_WITH_TFU = HubField(HUB_IDENT1, 17, 17)
HUB_IDENT1_WITH_L3C = HubField(HUB_IDENT1, 16, 16)
HUB_IDENT1_NHOSTS = HubField(HUB_IDENT1, 15, 12)
HUB_IDENT1_NCORES = HubField(HUB_IDENT1, 11, 8)
HUB_IDENT1_REV = HubField(HUB_IDENT1, 7, 4)
HUB_IDENT1_TVER = HubField(HUB_IDENT1, 3, 0)

HUB_IDENT2 = HubRegister(0x00010)
HUB_IDENT2_WITH_MMU = HubField(HUB_IDENT2, 8, 8)
HUB_IDENT2_L3C_NKB = HubField(HUB_IDENT2, 7, 0)

HUB_IDENT3 = HubRegister(0x00014)
HUB_IDENT3_IPREV = HubField(HUB_IDENT3, 15, 8)
HUB_IDENT3_IPIDX = HubField(HUB_IDENT3, 7, 0)

HUB_TFU_CS = HubRegister(0x00400)


# Per-core registers.

CORE_IDENT0 = PerCoreRegister(0x00000)
CORE_IDENT0_VER = PerCoreField(CORE_IDENT0, 31, 24)

CORE_IDENT1 = PerCoreRegister(0x00004)
CORE_IDENT1_VPM_SIZE = PerCoreField(CORE_IDENT1, 31, 28)
CORE_IDENT1_NSEM = PerCoreField(CORE_IDENT1, 23, 16, )
CORE_IDENT1_NTMU = PerCoreField(CORE_IDENT1, 15, 12)
CORE_IDENT1_QUPS = PerCoreField(CORE_IDENT1, 11, 8)
CORE_IDENT1_NSLC = PerCoreField(CORE_IDENT1, 7, 4)
CORE_IDENT1_REV = PerCoreField(CORE_IDENT1, 3, 0)

CORE_IDENT2 = PerCoreRegister(0x00008)
CORE_IDENT2_BCG = PerCoreField(CORE_IDENT2, 28, 28)

CORE_MISCCFG = PerCoreRegister(0x00018)
CORE_MISCCFG_QRMAXCNT = PerCoreField(CORE_MISCCFG, 3, 1)
CORE_MISCCFG_OVRTMUOUT = PerCoreField(CORE_MISCCFG, 0, 0)

CORE_L2CACTL = PerCoreRegister(0x00020)
CORE_L2CACTL_L2CCLR = PerCoreField(CORE_L2CACTL, 2, 2)
CORE_L2CACTL_L2CDIS = PerCoreField(CORE_L2CACTL, 1, 1)
CORE_L2CACTL_L2CENA = PerCoreField(CORE_L2CACTL, 0, 0)

CORE_SLCACTL = PerCoreRegister(0x00024)
CORE_SLCACTL_TVCCS = PerCoreField(CORE_SLCACTL, 27, 24)
CORE_SLCACTL_TDCCS = PerCoreField(CORE_SLCACTL, 19, 16)
CORE_SLCACTL_UCC = PerCoreField(CORE_SLCACTL, 11, 8)
CORE_SLCACTL_ICC = PerCoreField(CORE_SLCACTL, 3, 0)

# PCTR control registers (PCTR presumably stands for performance counter --
# confirm against v3d_regs.h).
CORE_PCTR_0_EN = PerCoreRegister(0x00650)
CORE_PCTR_0_CLR = PerCoreRegister(0x00654)
CORE_PCTR_0_OVERFLOW = PerCoreRegister(0x00658)
124 | 
# Generate the performance-counter source and value registers as module
# globals instead of spelling out all of them by hand.
g = globals()

# Eight source registers, CORE_PCTR_0_SRC_0_3 .. CORE_PCTR_0_SRC_28_31, four
# bytes apart starting at 0x00660.  Each one holds four source fields located
# at bits 6..0, 14..8, 22..16 and 30..24.
for i in range(0, 32, 4):
    name = f'CORE_PCTR_0_SRC_{i}_{i+3}'
    reg = g[name] = PerCoreRegister(0x00660 + i)
    # Long names, e.g. CORE_PCTR_0_SRC_0_3_S2.
    for j in (3, 2, 1, 0):
        g[f'{name}_S{i+j}'] = PerCoreField(reg, 8 * j + 6, 8 * j)
    # Short aliases, e.g. CORE_PCTR_0_SRC_2, covering the same bits.
    for j in (3, 2, 1, 0):
        g[f'CORE_PCTR_0_SRC_{i+j}'] = PerCoreField(reg, 8 * j + 6, 8 * j)

# 32 registers CORE_PCTR_0_PCTR0 .. CORE_PCTR_0_PCTR31, four bytes apart
# starting at 0x00680.
for i in range(32):
    g[f'CORE_PCTR_0_PCTR{i}'] = PerCoreRegister(0x00680 + 4 * i)

# Remove every loop temporary so that none of them leaks into the module
# namespace (and into ``from ... import *``).
del g, i, j, name, reg

CORE_PCTR_CYCLE_COUNT = 32
145 | 
146 | 
class RegisterMapping:
    """Memory-mapped access to the V3D hub and per-core registers.

    The register windows are mapped from ``/dev/mem`` and accessed one 32-bit
    word at a time through the ``read4`` / ``write4`` functions of the
    ``readwrite4`` shared library located next to this module.

    Indexing:
        ``regmap[hub_register_or_field]`` addresses the hub window and
        ``regmap[core_register_or_field, core]`` addresses the window of the
        given core.  Reading a field returns the field value shifted down to
        bit 0; writing a field is a read-modify-write of its register.
    """

    def __init__(self):
        """Load ``readwrite4`` and map the hub and core register windows.

        Raises:
            Exception: if ``readwrite4`` cannot be loaded with any of the
                suffixes in ``EXTENSION_SUFFIXES``.
            OSError: if ``/dev/mem`` cannot be opened.
        """

        stem = Path(__file__).parent / 'readwrite4'
        for suffix in EXTENSION_SUFFIXES:
            try:
                lib = cdll.LoadLibrary(stem.with_suffix(suffix))
            except OSError:
                continue
            else:
                break
        else:
            raise Exception('readwrite4 library is not found.'
                            + ' Your installation seems to be broken.')

        self.read4 = lib.read4
        self.write4 = lib.write4
        del stem, lib

        self.read4.argtypes = [c_void_p]
        self.read4.restype = c_uint32
        self.write4.argtypes = [c_void_p, c_uint32]
        self.write4.restype = None

        fd = os.open('/dev/mem', os.O_RDWR)

        # XXX: Should use bcm_host_get_peripheral_address for the base address
        # on userland, and consult /proc/device-tree/__symbols__/v3d and then
        # /proc/device-tree/v3dbus/v3d@7ec04000/{reg-names,reg} for the offsets
        # in the future.

        try:
            self.map_hub = mmap.mmap(offset=0xfec00000, length=0x4000,
                                     fileno=fd, flags=mmap.MAP_SHARED,
                                     prot=mmap.PROT_READ | mmap.PROT_WRITE)
            # Only the raw base address is kept; the temporary array is
            # dropped right away.
            self.ptr_hub = np.frombuffer(self.map_hub).ctypes.data

            self.ncores = 1
            self.map_cores = [None] * self.ncores
            self.ptr_cores = [None] * self.ncores
            for core in range(self.ncores):
                self.map_cores[core] = mmap.mmap(
                    offset=0xfec04000 + 0x4000 * core,
                    length=0x4000, fileno=fd,
                    flags=mmap.MAP_SHARED,
                    prot=mmap.PROT_READ | mmap.PROT_WRITE)
                self.ptr_cores[core] = \
                    np.frombuffer(self.map_cores[core]).ctypes.data
        finally:
            # The descriptor is only needed while creating the mappings, and
            # must not leak when one of the mmap calls fails.
            os.close(fd)

    def __enter__(self):
        """Return the mapping itself; nothing is acquired on entry."""

        return self

    def __exit__(self, type, value, traceback):
        """Do nothing; the mappings are left in place on exit."""

        pass

    def _get_ptr(self, key, core):
        """Return the address of the register that ``key`` refers to.

        Args:
            key: a hub/per-core register, or a field of one (resolved to its
                containing register).
            core: index into the per-core windows; must be ``None`` for hub
                registers.

        Raises:
            TypeError: if ``key`` is not a supported register or field type.
        """

        if isinstance(key, (HubField, PerCoreField)):
            return self._get_ptr(key.register, core)
        if isinstance(key, HubRegister):
            assert core is None
            return self.ptr_hub + key.offset
        if isinstance(key, PerCoreRegister):
            return self.ptr_cores[core] + key.offset

        # Falling through would hand None (a NULL pointer) to the C helpers.
        raise TypeError('unsupported register key type: '
                        + type(key).__name__)

    def __getitem__(self, key):
        """Read a register, or one field of it, as an integer.

        ``key`` is either a register/field or a ``(register/field, core)``
        tuple.
        """

        core = None
        if isinstance(key, tuple):
            key, core = key

        v = self.read4(self._get_ptr(key, core))

        if isinstance(key, (HubField, PerCoreField)):
            # Isolate the field and shift it down to bit 0.
            v = (v & key.mask) >> key.shift

        return v

    def __setitem__(self, key, value):
        """Write a register, or one field of it.

        ``key`` is either a register/field or a ``(register/field, core)``
        tuple.  For a field, the current register value is read first and
        only the bits inside the field mask are replaced.
        """

        core = None
        if isinstance(key, tuple):
            key, core = key

        if isinstance(key, (HubField, PerCoreField)):
            value = (self[key.register, core] & ~key.mask) \
                | ((value << key.shift) & key.mask)

        self.write4(self._get_ptr(key, core), value)
239 | 
240 | 
class PerformanceCounter:
    """Context manager that programs and runs core performance counters.

    Counter ``k`` is configured with source ``srcs[k]``.  On entry the
    counters are disabled, their sources are written, and the counter bit
    mask is written to the CLR, OVERFLOW and EN registers.  On exit EN is set
    to 0 and the mask is written to CLR and OVERFLOW again.

    NOTE(review): writing the mask to CLR / OVERFLOW presumably resets the
    counter values and overflow flags, which would mean ``result()`` has to be
    called inside the ``with`` block -- confirm against the hardware
    documentation.
    """

    _PCTR_SRCs = [globals()[f'CORE_PCTR_0_SRC_{_}'] for _ in range(32)]
    _PCTRs = [globals()[f'CORE_PCTR_0_PCTR{_}'] for _ in range(32)]

    def __init__(self, regmap, srcs):
        """Remember the register mapping and the counter sources.

        Args:
            regmap: object indexable as ``regmap[register_or_field, core]``
                for both reading and writing.
            srcs: sequence of source identifiers, one per counter to use.

        Raises:
            ValueError: if more sources are given than there are counters.
        """

        # Reject this early: otherwise the mask would not fit the control
        # registers and __enter__ would fail half-way through programming.
        if len(srcs) > len(self._PCTR_SRCs):
            raise ValueError(f'at most {len(self._PCTR_SRCs)} sources are '
                             f'supported, got {len(srcs)}')

        self.regmap = regmap
        self.srcs = srcs
        self.core = 0  # Sufficient for now.
        # One bit per counter in use, starting at bit 0.
        self.mask = (1 << len(self.srcs)) - 1

    def __enter__(self):
        """Program the sources and start the counters; return ``self``."""

        self.regmap[CORE_PCTR_0_EN, self.core] = 0

        for src_field, src in zip(self._PCTR_SRCs, self.srcs):
            self.regmap[src_field, self.core] = src

        self.regmap[CORE_PCTR_0_CLR, self.core] = self.mask
        self.regmap[CORE_PCTR_0_OVERFLOW, self.core] = self.mask
        self.regmap[CORE_PCTR_0_EN, self.core] = self.mask

        return self

    def __exit__(self, type, value, traceback):
        """Stop the counters and write the mask to CLR and OVERFLOW."""

        self.regmap[CORE_PCTR_0_EN, self.core] = 0
        self.regmap[CORE_PCTR_0_CLR, self.core] = self.mask
        self.regmap[CORE_PCTR_0_OVERFLOW, self.core] = self.mask

    def result(self):
        """Return the current counter values, one per entry of ``srcs``."""

        return [self.regmap[pctr, self.core]
                for pctr in self._PCTRs[:len(self.srcs)]]


--------------------------------------------------------------------------------