├── COPYING ├── INSTALL ├── README ├── ROADMAP ├── VERSION ├── core └── config │ ├── rtarch.h │ ├── rtarch_a32.h │ ├── rtarch_a32_128x1v1.h │ ├── rtarch_a32_128x2v1.h │ ├── rtarch_a32_SVEx1v1.h │ ├── rtarch_a32_SVEx2v1.h │ ├── rtarch_a64.h │ ├── rtarch_a64_128x1v1.h │ ├── rtarch_a64_128x2v1.h │ ├── rtarch_a64_SVEx1v1.h │ ├── rtarch_a64_SVEx2v1.h │ ├── rtarch_aHB.h │ ├── rtarch_aHB_128x1v1.h │ ├── rtarch_aHB_128x2v1.h │ ├── rtarch_aHB_SVEx1v1.h │ ├── rtarch_aHB_SVEx2v1.h │ ├── rtarch_aHF_128x1v2.h │ ├── rtarch_aHF_128x2v2.h │ ├── rtarch_aHF_SVEx1v1.h │ ├── rtarch_aHF_SVEx2v1.h │ ├── rtarch_arm.h │ ├── rtarch_arm_128x1v4.h │ ├── rtarch_m32.h │ ├── rtarch_m32_128x1v1.h │ ├── rtarch_m32_128x2v1.h │ ├── rtarch_m64.h │ ├── rtarch_m64_128x1v1.h │ ├── rtarch_m64_128x2v1.h │ ├── rtarch_mHB.h │ ├── rtarch_mHB_128x1v1.h │ ├── rtarch_mHB_128x2v1.h │ ├── rtarch_p32.h │ ├── rtarch_p32_128x1v1.h │ ├── rtarch_p32_128x1v2.h │ ├── rtarch_p32_128x1v4.h │ ├── rtarch_p32_128x2v1.h │ ├── rtarch_p32_128x2v2.h │ ├── rtarch_p32_128x2v4.h │ ├── rtarch_p32_128x2v8.h │ ├── rtarch_p32_128x2vG.h │ ├── rtarch_p32_128x4v1.h │ ├── rtarch_p32_128x4v2.h │ ├── rtarch_p64.h │ ├── rtarch_p64_128x1v1.h │ ├── rtarch_p64_128x1v2.h │ ├── rtarch_p64_128x2v1.h │ ├── rtarch_p64_128x2v2.h │ ├── rtarch_p64_128x2v4.h │ ├── rtarch_p64_128x2v8.h │ ├── rtarch_p64_128x4v1.h │ ├── rtarch_p64_128x4v2.h │ ├── rtarch_pHB.h │ ├── rtarch_pHB_128x1v1.h │ ├── rtarch_pHB_128x1v2.h │ ├── rtarch_pHB_128x1v4.h │ ├── rtarch_pHB_128x2v1.h │ ├── rtarch_pHB_128x2v2.h │ ├── rtarch_pHB_128x2vG.h │ ├── rtarch_pQF_128x1v2.h │ ├── rtarch_pQF_128x2v2.h │ ├── rtarch_x32.h │ ├── rtarch_x32_128x1v2.h │ ├── rtarch_x32_128x1v4.h │ ├── rtarch_x32_128x1v8.h │ ├── rtarch_x32_128x2v4.h │ ├── rtarch_x32_256x1v2.h │ ├── rtarch_x32_256x1v8.h │ ├── rtarch_x32_256x2v2.h │ ├── rtarch_x32_512x1v8.h │ ├── rtarch_x32_512x2v2.h │ ├── rtarch_x32_512x4v2.h │ ├── rtarch_x64.h │ ├── rtarch_x64_128x1v2.h │ ├── rtarch_x64_128x1v4.h │ ├── rtarch_x64_128x1v8.h │ 
├── rtarch_x64_128x2v4.h │ ├── rtarch_x64_256x1v2.h │ ├── rtarch_x64_256x1v8.h │ ├── rtarch_x64_256x2v2.h │ ├── rtarch_x64_512x1v8.h │ ├── rtarch_x64_512x2v2.h │ ├── rtarch_x64_512x4v2.h │ ├── rtarch_x86.h │ ├── rtarch_x86_128x1v4.h │ ├── rtarch_x86_128x1v8.h │ ├── rtarch_x86_256x1v2.h │ ├── rtarch_x86_512x1v2.h │ ├── rtarch_xHB.h │ ├── rtarch_xHB_128x1v2.h │ ├── rtarch_xHB_128x1v4.h │ ├── rtarch_xHB_128x1v8.h │ ├── rtarch_xHB_128x2v4.h │ ├── rtarch_xHB_256x1v2.h │ ├── rtarch_xHB_256x1v8.h │ ├── rtarch_xHB_256x2v2.h │ ├── rtarch_xHB_512x1v8.h │ ├── rtarch_xHB_512x2v2.h │ ├── rtarch_xHB_512x4v2.h │ ├── rtarch_xHF_128x1v2.h │ ├── rtarch_xHF_256x1v8.h │ ├── rtarch_xHF_512x1v8.h │ ├── rtarch_xHF_512x2v2.h │ ├── rtarch_xHF_512x4v2.h │ ├── rtbase.h │ ├── rtconf.h │ ├── rtdocs.h │ └── rtzero.h └── test ├── build_cross.sh ├── build_linux.sh ├── build_macM1.sh ├── build_macOS.sh ├── build_multi.sh ├── build_nokia.sh ├── build_raspi.sh ├── build_win64.bat ├── clean_cross.sh ├── clean_linux.sh ├── clean_macM1.sh ├── clean_macOS.sh ├── clean_multi.sh ├── clean_nokia.sh ├── clean_raspi.sh ├── clean_win64.bat ├── simd_make_a32.mk ├── simd_make_a64.mk ├── simd_make_arm.mk ├── simd_make_m32.mk ├── simd_make_m64.mk ├── simd_make_p32.mk ├── simd_make_p64.mk ├── simd_make_w64.bat ├── simd_make_w64.mk ├── simd_make_x32.mk ├── simd_make_x64.mk ├── simd_make_x86.mk ├── simd_qemu32.sh ├── simd_qemu64.sh ├── simd_test.cpp ├── simd_test64.sh ├── simd_test86.sh ├── simd_test_x64.sln ├── simd_test_x64.vcxproj ├── simd_test_x64.vcxproj.filters └── simd_test_x64.vcxproj.user /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013-2025 VectorChief (at github, bitbucket, sourceforge) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, 
copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | ================================================================================ 2 | 3 | To build SIMD test framework on Linux, open terminal window, 4 | go to UniSIMD's "test" subfolder, 5 | make sure necessary tools and libraries are installed 6 | sudo apt-get update 7 | for native builds (binary ABI matches host: x64, RISCs): 8 | sudo apt-get install make g++ 9 | for multilib builds (if libs are available: 32-bit x86): 10 | sudo apt-get install make g++-multilib 11 | run for x64 architecture: 12 | make -f simd_make_x64.mk -j4 13 | ./simd_test.x64f32 14 | run for x86 architecture: 15 | make -f simd_make_x86.mk -j4 16 | ./simd_test.x86 17 | run for *** architectures (on native host or QEMU linux-user mode): 18 | make -f simd_make_***.mk -j4 19 | ./simd_test.*** 20 | Prerequisites for building/emulating guest non-x86 architectures as well as 21 | configurations for particular hardware platforms are given in the makefiles. 
22 | By default, DEB-based distributions (Mint, Ubuntu, Debian) are implied, 23 | refer to the AArch64 Linux on RPi3 section down below for RPM-based options. 24 | 25 | To build SIMD test framework on macOS, open terminal window, 26 | go to UniSIMD's "test" subfolder, 27 | make sure necessary tools and libraries are installed: 28 | clang 29 | it will prompt to install Command Line Tools (will make alias to g++) 30 | run for x64 architecture: 31 | make -f simd_make_x64.mk -j4 32 | ./simd_test.x64f32 33 | run for a64 architecture: 34 | make -f simd_make_a64.mk clang -j4 35 | ./simd_test.a64f32 36 | Alternatively, use clang option for x64 (make -f simd_make_x64.mk clang -j4). 37 | 38 | To build SIMD test framework on Windows using Visual Studio, 39 | download and install Visual Studio 2022 or later (with clang option enabled), 40 | then open UniSIMD's "test" subfolder and click on VS2022 solution file: 41 | simd_test_x64.sln 42 | from within the Visual Studio press F5 key to build and run the binary. 43 | 44 | For maximum compatibility, always copy the binaries to *.sln's subfolder. 45 | 46 | To build SIMD test framework on Windows using TDM64-GCC, 47 | download and install TDM64-GCC toolchain (tdm64-gcc-10.3.0-2.exe) from github, 48 | then open UniSIMD's "test" subfolder and run from "cmd" or Windows Explorer: 49 | simd_make_w64.bat 50 | produced simd_test_w64f32.exe binary file will launch upon build completion. 
51 | 52 | ================================================================================ 53 | 54 | To build QEMU emulator from source on Linux, download the latest version from: 55 | http://wiki.qemu.org/Download 56 | unpack the archive, open terminal window, go to QEMU's root folder, 57 | make sure necessary tools and libraries are installed: 58 | sudo apt-get update 59 | sudo apt-get install make g++ ninja-build 60 | sudo apt-get install pkg-config libglib2.0-dev libpixman-1-dev zlib1g-dev 61 | when building QEMU on RPM-based systems like openSUSE: 62 | sudo zypper install make gcc-c++ ninja 63 | sudo zypper install patch glib2-devel libpixman-1-0-devel zlib-devel 64 | to build a reduced set of targets for linux-user mode emulation only, use: 65 | ./configure --target-list=arm-linux-user,aarch64-linux-user,\ 66 | mips-linux-user,mipsel-linux-user,mips64-linux-user,mips64el-linux-user,\ 67 | ppc-linux-user,ppc64-linux-user,ppc64le-linux-user,\ 68 | i386-linux-user,x86_64-linux-user 69 | (copy the whole multi-line command above without leading or trailing spaces) 70 | (paste into terminal and run after or instead of plain ./configure script) 71 | run multithreaded make (use -j8 or -j16 on machines with higher core count): 72 | make -j4 73 | run installation script: 74 | sudo make install 75 | 76 | When building QEMU from source isn't necessary, install the full binary package: 77 | sudo apt-get install qemu-user 78 | on RPM-based systems like openSUSE: 79 | sudo zypper install qemu 80 | 81 | QEMU 5.2.0 and beyond may require ninja-build package to build from source. 82 | Starting from QEMU 5.2.0 POWER9 lxvwsx instruction is supported properly. 83 | From QEMU 5.2.0 through QEMU 6.2.0 ppc64abi32 targets are marked as deprecated 84 | and will be removed in the following releases (Ubuntu 22.04 should have 6.2.0). 
85 | QEMU 6.2.0 contains some MIPS regressions and POWER bugs (exposed by gcc 11.3), 86 | consider an update: https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/2011832 87 | or build QEMU 7.2.0 from source (contains all the fixes, also in Ubuntu 23.04). 88 | All QEMU versions since 6.0.0 including 8.0.0 have MIPS bug reported here: 89 | https://gitlab.com/qemu-project/qemu/-/issues/1624 90 | Note that using standalone MIPS cross-compiler masks the issue with local QEMU. 91 | 92 | Ubuntu 20.04 is the first release where MIPS cross-compilers have caught up 93 | with the rest of the pack (standalone 2020.06-01 and mipsisa64r6* are 9.3.0). 94 | It is the only release which has full support for all the targets (ppc64abi32). 95 | 96 | ================================================================================ 97 | 98 | To emulate future x86 targets (AVX-512) on modern x86 Linux hosts use Intel SDE: 99 | https://software.intel.com/content/www/us/en/develop/articles/ 100 | /intel-software-development-emulator.html 101 | download and unpack the archive. 102 | 103 | In terminal window for 32-bit x86 run: 104 | path-to-kit/sde -snb -- ./simd_test.x86avx -c 1 105 | path-to-kit/sde -knl -- ./simd_test.x86avx512 -c 1 106 | for AVX (Sandy Bridge) and AVX512F (Knights Landing) respectively. 107 | 108 | In terminal window for 64-bit x64 run: 109 | path-to-kit/sde64 -hsw -- ./simd_test.x64f32avx -c 1 110 | path-to-kit/sde64 -skx -- ./simd_test.x64f32avx512 -c 1 111 | for AVX2 (Haswell) and AVX512DQ (Skylake-X) respectively. 112 | 113 | Intel's AVX512 fp16 subset (in Sapphire Rapids and in some Alder Lake configs) 114 | is now supported in Intel SDE 9.0 (with -spr option) and can be tested within 115 | the assembler by substituting regular 32/64-bit cmdps with 16-bit cmdms aliases 116 | and using direct ASM section output comparison method (with ARMv8.2 fp16). 117 | 118 | Use "-c 1" option to reduce test time when emulating with Intel SDE. 
119 | 120 | ================================================================================ 121 | 122 | To experiment with Linux in little-endian mode on POWER8 server consider: 123 | 124 | 1) Ubuntu Server 16.04.06 Xenial Xerus (ppc64el) from 27-Feb-2019: 125 | https://cdimage.ubuntu.com/releases/16.04/release/ 126 | ubuntu-16.04.6-server-ppc64el.iso 127 | (install most recent HWE kernel version) 128 | 129 | 2) Ubuntu Server 18.04.05 Bionic Beaver (ppc64el) from 10-Aug-2020: 130 | https://cdimage.ubuntu.com/releases/18.04/release/ 131 | ubuntu-18.04.5-server-ppc64el.iso 132 | (install the original kernel version, no HWE) 133 | 134 | 3) Ubuntu Server 20.04.02 Focal Fossa (ppc64el) from 01-Feb-2021: 135 | https://cdimage.ubuntu.com/releases/20.04/release/ 136 | ubuntu-20.04.2-live-server-ppc64el.iso 137 | (system installs, but won't boot, features new installer) 138 | 139 | The images were tested on Tyan Habanero TN71-BP012 10-core POWER8 server 140 | with installation instructions described here: 141 | https://www.phoronix.com/scan.php?page=article&item=tyan-power8-server&num=1 142 | 143 | Boot the system from USB flash drive without any ethernet cables attached, 144 | otherwise the boot menu won't allow any boot options to be activated properly. 145 | When presented with a blank screen and a cursor, read the next paragraph. 146 | 147 | Before an installation can proceed the following steps may need to be performed 148 | on the first boot. Switch to tty2 with Alt-F2, activate the tty as the message 149 | requests by pressing Enter, run /sbin/debian-installer (first two images). 150 | 151 | To setup networking on a freshly installed Ubuntu Server with ethernet cable 152 | use the farthest ethernet socket among the main four as seen from the PSU. 
153 | 154 | On Ubuntu Server 16.04 use ifconfig to configure the network: 155 | sudo nano /etc/network/interfaces 156 | Edit the file above to add the next two lines at the bottom: 157 | auto enP1p3s0 158 | iface enP1p3s0 inet dhcp 159 | Press Ctrl-O to save the changes and Ctrl-X to exit from the editor. 160 | sudo /etc/init.d/networking restart 161 | or 162 | sudo systemctl restart networking.service 163 | 164 | On Ubuntu Server 18.04 use netplan to configure the network: 165 | sudo nano /etc/netplan/01-netcfg.yaml 166 | Edit the file above so that it looks like this: 167 | network: 168 | ethernets: 169 | enP1p3s0: 170 | dhcp4: yes 171 | version: 2 172 | renderer: networkd 173 | Press Ctrl-O to save the changes and Ctrl-X to exit from the editor. 174 | sudo netplan apply 175 | 176 | When booting Ubuntu Server offline after the networking has been set up 177 | the login prompt is not immediately shown and a few minutes time 178 | needs to be taken before SSH authorization becomes workable. 179 | 180 | To install XFCE once the system is up and running: 181 | sudo apt-get update 182 | sudo apt-get install xfce4 183 | startx (choose "Use default config" on first start) 184 | (launch terminal window in XFCE from the bottom panel) 185 | sudo apt-get install firefox unzip leafpad 186 | sudo apt-get install make g++ clang 187 | Alternatively install Xubuntu desktop for more integrated experience: 188 | sudo apt-get update 189 | sudo apt-get install xubuntu-desktop 190 | sudo apt-get install make g++ clang 191 | reboot 192 | 193 | After installing XFCE Ubuntu may start ureadahead process which loads the CPU 194 | preventing normal user workflow for a few minutes until the process 195 | is complete and the system is back to normal (use top command to monitor). 
196 | 197 | ================================================================================ 198 | 199 | To experiment with Linux in AArch64 mode on Raspberry Pi 3 consider: 200 | 201 | 1) Devuan ASCII 2.0.0 plain (arm64 raspi3) image from 06-Jun-2018: 202 | https://devuan.org/ 203 | https://files.devuan.org/devuan_ascii/embedded/ 204 | devuan_ascii_2.0.0_arm64_raspi3.img.xz 205 | image boot credentials: 206 | login: root 207 | password: toor 208 | 209 | 2) openSUSE Leap15.0 XFCE (aarch64 raspi3) image from 02-Jul-2018: 210 | https://en.opensuse.org/HCL:Raspberry_Pi3 211 | http://download.opensuse.org/ports/aarch64/distribution/leap/15.0/appliances 212 | openSUSE-...-ARM-XFCE-raspberrypi3.aarch64-2018.07.02-Buildlp150.1.1.raw.xz 213 | image boot credentials: 214 | login: root 215 | password: linux 216 | 217 | Flashing images above to an SD card is similar to a Raspbian installation: 218 | http://www.raspberrypi.org/documentation/installation/installing-images/linux.md 219 | lsblk (before inserting SD card) 220 | lsblk (after inserting SD card to see its device ID, mmcblk0 in this case) 221 | sudo su 222 | umount /dev/mmcblk0p1 (if exists and mounted, unmount partition: p1) 223 | umount /dev/mmcblk0p2 (if exists and mounted, unmount partition: p2) 224 | (change to Downloads directory where image file is unpacked) 225 | dcfldd bs=4M if=devuan_ascii_2.0.0_arm64_raspi3.img of=/dev/mmcblk0 226 | sync (before extracting SD card from the slot) 227 | exit (from super-user mode) 228 | 229 | Devuan image requires partition resizing once booted (credentials above): 230 | http://elinux.org/RPi_Resize_Flash_Partitions (performed on RPi using fdisk) 231 | fdisk /dev/mmcblk0 232 | p (to see the current partition table) 233 | d (answer: 2, to delete partition 2) 234 | (for swap leave some space at the end by subtracting 1M from default sector) 235 | n (answer: p, for primary; answer: 2, for new partition 2; Enter; Enter) 236 | (when fdisk asks to remove ext4 signature at the end, answer: N, 
to keep it) 237 | (create new partition 3 as 2, from the space left at the end of the SD card) 238 | (t 3, to change partition type from 83 "Linux" to 82 "Linux-swap / Solaris") 239 | w (writes the changes and quits fdisk) 240 | shutdown -r now (login again after reboot) 241 | resize2fs /dev/mmcblk0p2 242 | df -h (to check the new partition size) 243 | (mkswap /dev/mmcblk0p3) 244 | (add "/dev/mmcblk0p3 none swap sw 0 0" to /etc/fstab to auto-swapon at boot) 245 | to install XFCE once the partition is resized: 246 | (commands below are not prefixed with "sudo" as image is booted into "root") 247 | (using "apt" command instead of "apt-get" allows saving space on SD card) 248 | apt-get update 249 | apt install xfce4 (choose keyboard layout) 250 | reboot (for XFCE to honour chosen keyboard layout) 251 | startx (choose "Use default config" on first start) 252 | (launch terminal window in XFCE from the bottom panel) 253 | apt install firefox-esr unzip 254 | reboot (login again and "startx" into XFCE) 255 | (firefox is now available under "Internet" section of the main menu) 256 | apt install make g++ clang 257 | to setup ARM Instruction Emulator for SVE on AArch64 hosts, install modules: 258 | apt install environment-modules 259 | reboot (login again and "startx" into XFCE) 260 | (modules only work outside of XFCE, use "Log Out" to configure modules) 261 | (once armie module is loaded, use startx again to work with it in XFCE) 262 | 263 | openSUSE image is RPM-based & boots directly to XFCE (credentials above): 264 | (commands below are not prefixed with "sudo" as image is booted into "root") 265 | (Raspberry Pi 3 Model B+ doesn't have networking with openSUSE, use old one) 266 | zypper install MozillaFirefox 267 | reboot (and login again) 268 | (firefox is now available under "Internet" section of the main menu) 269 | zypper install make gcc-c++ clang 270 | to setup ARM Instruction Emulator for SVE on AArch64 hosts, install modules: 271 | zypper install Modules 272 | 
reboot (and login again) 273 | adjust UniSIMD's makefiles to use g++ instead of triplet name, remove -static 274 | leafpad simd_make_a64.mk & (once archive is downloaded as shown below) 275 | 276 | Download the archive from github and unpack it (in terminal window): 277 | cd Downloads 278 | (as an alternative to using a browser for downloading, use wget from terminal) 279 | (wget https://github.com/VectorChief/UniSIMD-assembler/archive/master.zip) 280 | (mv master.zip UniSIMD-assembler-master.zip) 281 | unzip UniSIMD-assembler-master.zip 282 | cd UniSIMD-assembler-master/test 283 | make -f simd_make_a64.mk -j4 284 | ./simd_test.a64f32 285 | 286 | Download the ARM IE and install it (Ubuntu_16.04 for Devuan, SUSE_12 for SUSE): 287 | https://developer.arm.com/tools-and-software/server-and-hpc/ 288 | /compile/arm-instruction-emulator/get-software/download 289 | cd Downloads 290 | tar -xvzf ARM-Instruction-Emulator_20.1_AArch64_***_aarch64.tar.gz 291 | cd ARM-Instruction-Emulator_20.1_AArch64_***_aarch64 292 | ./arm-instruction-emulator-20.1_Generic-AArch64_***_aarch64-linux-***.sh 293 | (scroll down and type: yes Enter, when license shows up on the screen) 294 | reboot (and login again) 295 | module use /opt/arm/modulefiles 296 | module avail 297 | module load Generic-AArch64/***/arm-instruction-emulator/20.1 298 | (armie should now be available in the PATH variable, check vector lengths) 299 | armie -mlist-vector-lengths 300 | 301 | To test SVE targets with ARM Instruction Emulator run: 302 | armie -msve-vector-bits=512 -- ./simd_test.a64f32sve -c 1 303 | 304 | Use "-c 1" option to reduce test time when emulating with ARM IE. 
305 | 306 | Devuan ASCII 2.0.0 image has USB flash drives automount, but older clang 3.8.1, 307 | it also allows setting CPU frequency scaling governor for maximum performance: 308 | echo "performance" | tee /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor 309 | Current frequency (600MHz min-level, 1200MHz max-level) can be monitored using: 310 | cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq 311 | 312 | openSUSE Leap15.0 image has newer clang, but no automount for USB flash drives, 313 | it offers min-level CPU frequency by default and it lacks CPU scaling governor. 314 | It also doesn't support networking on newer Raspberry Pi 3 Model B+ from 2018. 315 | On the bonus side, modules can be configured from within XFCE as they should be. 316 | 317 | Both images feature a modern set of compilers sufficient for the build: 318 | g++ 6.3.0, clang 3.8.1 (Devuan ASCII 2.0.0, clang prior to 3.8 was much slower) 319 | g++ 7.3.1, clang 5.0.1 (openSUSE Leap15.0) 320 | fresh Firefox 60.2.2esr browser and a 32bpp display output. 321 | 322 | ================================================================================ 323 | 324 | To install Ubuntu MATE 20.04 LTS on Raspberry Pi 4 consider: 325 | 326 | 1) Ubuntu MATE 20.04.1 desktop (arm64 raspi) image from 29-Oct-2020: 327 | https://ubuntu-mate.org/download/arm64/focal/ 328 | https://releases.ubuntu-mate.org/focal/arm64/ 329 | ubuntu-mate-20.04.1-desktop-arm64+raspi.img.xz 330 | 331 | Flash image to an SD card using "Disks -> Restore Disk Image" utility with GUI 332 | from a regular Ubuntu desktop. It will unpack *.xz internally in the process. 333 | 334 | Once booted the image will automatically resize the root partition to fully 335 | utilize all space available on an SD card and proceed to install the system. 336 | 337 | When online Ubuntu may start unattended-upgr process which holds the lock 338 | preventing installation of other packages potentially for a few hours. 
339 | Install the system offline to postpone this process for a later time. 340 | 341 | To emulate SVE instruction subset install QEMU 4.2.1 from the repository: 342 | sudo apt-get update 343 | sudo apt-get install qemu-user 344 | 345 | ================================================================================ 346 | 347 | To experiment with Ubuntu Server on Raspberry Pi 4 consider: 348 | 349 | 1) Ubuntu Server 20.04.2 preinstalled (arm64 raspi) image from 01-Feb-2021: 350 | http://cdimage.ubuntu.com/releases/20.04/release/ 351 | ubuntu-20.04.2-preinstalled-server-arm64+raspi.img.xz 352 | image boot credentials: 353 | login: ubuntu 354 | password: ubuntu 355 | 356 | Flash image to an SD card using "Disks -> Restore Disk Image" utility with GUI 357 | from a regular Ubuntu desktop. It will unpack *.xz internally in the process. 358 | 359 | When booting Ubuntu Server offline for the first time the login prompt 360 | is often presented too early in the process and a few minutes time 361 | needs to be taken before SSH authorization becomes workable. 362 | 363 | Once booted the image will automatically resize the root partition to fully 364 | utilize all space available on an SD card and request to change the password. 365 | 366 | When online Ubuntu may start unattended-upgr process which holds the lock 367 | preventing installation of other packages for a few minutes until the process 368 | is complete and the lock is released (run top to monitor, q to quit monitoring). 
369 | 370 | To test SVE targets with ARM Instruction Emulator run: 371 | sudo apt-get update 372 | sudo apt-get install environment-modules 373 | reboot 374 | Download ARM-Instruction-Emulator_21.0_AArch64_Ubuntu-18.04_aarch64.tar.gz from: 375 | https://developer.arm.com/tools-and-software/server-and-hpc/ 376 | /compile/arm-instruction-emulator/get-software/download 377 | and follow installation instructions as presented here: 378 | https://developer.arm.com/documentation/102190/2100/Get-started/ 379 | /Install-Arm-Instruction-Emulator 380 | Note that module command is only available before installing the desktop. 381 | 382 | To setup networking on a freshly installed Ubuntu Server without ethernet cable 383 | consider using "USB tethering" from an Android phone or an iPhone. 384 | 385 | Plug in phone's USB cable to Raspberry Pi 4's USB slot and select 386 | "USB tethering" option from phone's menu or in "Settings -> Personal Hotspot". 387 | 388 | On Raspberry Pi 4 run the following command in the terminal: 389 | ip -c a 390 | to list all the network interfaces. USB tethered option should be called: 391 | usb0 392 | or 393 | eth1 394 | In any case it shows up on the list once tethering is activated on the phone. 395 | 396 | To complete setting up USB networking add interface names to the netplan: 397 | sudo nano /etc/netplan/50-cloud-init.yaml 398 | Edit the file above so that it looks like this: 399 | network: 400 | ethernets: 401 | eth0: 402 | dhcp4: true 403 | optional: true 404 | eth1: 405 | dhcp4: true 406 | usb0: 407 | dhcp4: true 408 | version: 2 409 | Press Ctrl-O to save the changes and Ctrl-X to exit from the editor. 410 | 411 | Apply the changes by typing: 412 | sudo netplan apply 413 | Check the connection with: 414 | ping google.com 415 | Press Ctrl-C to stop. 
416 | 417 | Once the networking is set up (only needs to be done once) install the desktop: 418 | sudo apt-get update 419 | sudo apt-get install ubuntu-mate-desktop 420 | (if MATE is not yet available the command installs GNOME 3 desktop instead) 421 | (select lightdm when prompted for more familiar MATE-themed login screen) 422 | reboot 423 | Alternatively install Xubuntu desktop on a clean system: 424 | sudo apt-get update 425 | sudo apt-get install xubuntu-desktop 426 | (if XFCE is not yet available the command installs GNOME 3 desktop instead) 427 | (select lightdm when prompted for more familiar XFCE-themed login screen) 428 | reboot 429 | 430 | ================================================================================ 431 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | UniSIMD assembler is a high-level C/C++ macro assembler framework unified across 2 | ARM, MIPS, POWER and x86 architectures. It establishes a subset of both BASE and 3 | SIMD instruction sets with clearly defined common API, so that application logic 4 | can be written and maintained in one place without code replication. 5 | The assembler itself isn't a separate tool, but rather a collection of C/C++ 6 | header files, which applications need to include directly in order to use. 7 | 8 | Initial documentation for the assembler is provided in core/config/rtdocs.h. 
9 | 10 | At present, Intel SSE/SSE2/SSE4 and AVX/AVX2/AVX-512 (32/64-bit x86 ISAs), 11 | ARMv7 NEON/NEONv2, ARMv8 AArch32 and AArch64 NEON, SVE (32/64-bit ARM ISAs), 12 | MIPS 32/64-bit r5/r6 MSA and POWER 32/64-bit VMX/VSX (little/big-endian ISAs) 13 | are mostly implemented (w/ horizontal reductions and byte/half SIMD+BASE ops) 14 | although scalar improvements, wider SIMD vectors with zeroing/merging predicates 15 | in 3/4-operand instructions, cross-precision fp-converters on modern CPU targets 16 | are planned as extensions to current 2/3-operand SPMD-driven vertical SIMD ISA. 17 | 18 | The project has a test framework for Linux/GCC/Clang and Windows/VC++/TDM64-GCC. 19 | Support for macOS is provided via Command Line Tools with GCC and Clang options. 20 | Instructions for resolving dependencies and building the binaries 21 | for supported platforms can be found in the accompanying INSTALL file. 22 | 23 | UniSIMD core features: 24 | - Unified, Universal, Portable, Compatible code 25 | - Explicit register allocation, predictable performance 26 | - Three register sets for code: 8, 16, 32 (free: 8, 15, 30) 27 | - High-level SIMD registers/ops as singles, pairs and quads 28 | - SIMD-aligned backend structures with offsets/factors 29 | - Vector-length agnostic vertical SIMD ISA, configurable 30 | - Simultaneous scalar + 128/256-bit + configurable SIMD ops 31 | - ISA implementation for fp16/fp128 (half/quad) SIMD ops 32 | - C/C++, Compute, SPMD on 4 major archs 33 | - Intel SSE/SSE2/SSE4 and AVX/AVX2/AVX-512 34 | - ARMv7 NEON/NEONv2, ARMv8 AArch32/AArch64 NEON, SVE 35 | - MIPS r5/r6 MSA (Warrior P5600, I6400/P6600) 36 | - POWER VMX/VSX (PowerPC G4/G5, POWER6/7/8/9) 37 | - CISC, RISC, CISC on RISC, little/big-endian ISA 38 | - Support for reg-reg, load/store, load-op instructions 39 | - Plain, indexed and scaled-indexed addressing modes 40 | - FMA3 support (native or higher-precision emulation) 41 | - 32/64-bit hybrid mode for native 64-bit ABI 42 | - 32/64-bit 
addressing for BASE and SIMD ops 43 | - 32/64-bit configurable SIMD elements (fp+int) 44 | - Simultaneous 32/64-bit BASE (bridges, rules) and SIMD ops 45 | - ISA implementation for int8/int16 (byte/half) BASE ops 46 | - Full control over code, compiler steps out of the way 47 | - Potential for bit-exact fp-compute across modern targets 48 | - Used in QuadRay engine 49 | -------------------------------------------------------------------------------- /ROADMAP: -------------------------------------------------------------------------------- 1 | ================================================================================ 2 | === >>> === tasks below are planned for the upcoming 1.2.0 milestone === <<< === 3 | ================================================================================ 4 | 5 | X) Task title: "implement predicated AVX-512/ARM-SVE backends (in *_RX slots)" 6 | 1) Add rtarch_***_***x*p*.h header files to core/config for predicated targets 7 | 2) Add predicate registers X1..X6 (merging) and Z1..Z6 (zeroing) as triplets 8 | 3) Add cmd**P** subset for "two-operand + predicate" instructions 9 | 4) Add cmd**4** subset for "three-operand + predicate" instructions 10 | 5) Predicate is placed 1st in cmp-ops, and right after dest-SIMD-reg otherwise 11 | 6) Predicated targets can be implemented as extension to current AVX-512/ARM-SVE 12 | 7) Use predicate registers X1..X6 where merging/zeroing is not applicable 13 | 8) Emulate zeroing and three-operand ops on ARM-SVE from fields in triplets 14 | 9) Paired predicated backends should expose half the predicates (and registers) 15 | 16 | ================================================================================ 17 | === >>> === tasks below are planned for the upcoming 1.3.0 milestone === <<< === 18 | ================================================================================ 19 | 20 | R) Task title: "implement basic runtime generation for existing ASM code-bases" 21 | 1) Rewrite ASM_ENTER macro to 
allocate temporary buffer with code-exec rights 22 | 2) Rewrite EMITB / EMITW emitters to write into a memory buffer at cur++ offset 23 | 3) Define M to (+/-) depending on static/dynamic code generation (+ clang check) 24 | 4) Rewrite j** to encode jump-label distances into binary form, track labels 25 | 5) Rewrite ASM_LEAVE to type-cast the buffer to a function-pointer, then call it 26 | 6) Implement proper buffer management for more advanced versions later 27 | 28 | ================================================================================ 29 | === >>> === tasks below are planned for the forthcoming 1.x.0 series === <<< === 30 | ================================================================================ 31 | 32 | K) Task title: "use configuration utils (autotools, CMake, etc) for building" 33 | 1) Use single build script for all host CPU architectures on Linux 34 | 2) Keep cross-compilation on x86-64 Linux hosts (targeting QEMU linux-user mode) 35 | 3) Consider adding continuous integration (CI) tests 36 | 37 | ================================================================================ 38 | 39 | E) Task title: "add 8 SIMD registers full-IEEE support for ARMv7 using VFP" 40 | 1) Implement 128-bit SIMD registers/instructions as 4x32-bit VFP (full-IEEE) 41 | 2) Emulate currently exposed NEON instructions using VFP variants/fallbacks 42 | 3) Use register-offloading to upper bank for 1 mem-arg in load-op instructions 43 | 4) Find place in SIMD target mask (RT_128=8), like legacy x86, ARMv7 is 8-regs 44 | 45 | ================================================================================ 46 | 47 | N) Task title: "implement new 128/256-bit 30-regs targets on top of current AVX" 48 | 1) Implement register-offloading to memory (SIMD structs) on top of current AVX 49 | 2) Add new SIMD compatibility flag RT_SIMD_COMPAT_256=1/2 for 30-regs with AVX1/2 50 | 3) Find place in SIMD target mask (RT_128=1/RT_256=4) for custom 30-regs support 51 | 4) Improve 
mask-jump (mkj*x) instructions for 64-bit SIMD elements (optional) 52 | 5) Target 128-bit version to SSE, RT_SIMD_COMPAT_128=2/4/8/16/32 for 30-regs 53 | 6) Add tests to check defined immediate/displacement limits (for BASE/SIMD ops) 54 | 55 | ================================================================================ 56 | 57 | G) Task title: "consider 64-bit SIMD emulation with FPRs on PowerPC G5/POWER6" 58 | 1) Implement 64-bit SIMD registers/instructions as 2x64-bit FPRs (full-IEEE) 59 | 2) Emulate currently exposed SIMD instructions using FPU variants/fallbacks 60 | 3) Emulate 64-bit integer SIMD ops using 64-bit BASE registers where possible 61 | 4) New 64-bit SIMD backend would complement 32-bit targets in existing slots 62 | 5) Expose 16x128/8x256 on PowerPC VMX (v4) instead of 15x128/8x256 for 32-bit 63 | 6) Consider implementing 8x256 mode with register-offloading to mem for 64-bit 64 | 65 | ================================================================================ 66 | 67 | P) Task title: "use RT_REGS to unload SIMD target mask for 256-bit on POWER11" 68 | (may require significant redesign of SIMD target mask handling in rtbase.h) 69 | (better schedule this task for the next major update, also check rtzero.h) 70 | (consider renaming SVE binaries to *.a*armSVE to match *.x*avx512 on x86) 71 | 72 | ================================================================================ 73 | 74 | O) Task title: "use 3-operand SIMD instructions in packed/scalar SIMD tests" 75 | 76 | ================================================================================ 77 | 78 | T) Task title: "improve SIMD test coverage, add tests for corner cases in ops" 79 | 80 | ================================================================================ 81 | 82 | C) Task title: "implement SIMD fp32/fp64 converters consistently across targets" 83 | 84 | ================================================================================ 85 | 86 | A) Task title: "implement 
SIMD fp16 converters as tier-1 extension, modern CPUs" 87 | 88 | ================================================================================ 89 | 90 | F) Task title: "implement scalar fp compare-to-flags, fp/fp & fp/int converters" 91 | 92 | ================================================================================ 93 | 94 | M) Task title: "add support for trigonometric/randomizer SIMD meta-instructions" 95 | (consider sleef library as an example of elementary math functions with SIMD) 96 | (https://github.com/shibatch/sleef) <- use this code snapshot as a base 97 | 98 | ================================================================================ 99 | 100 | L) Task title: "consider SoftFP library integration for full fp16/fp128 support" 101 | 102 | ================================================================================ 103 | 104 | V) Task title: "add support for various new and existing architectures" 105 | 1) Add support for RISC-V architecture with "vector extension proposal" 106 | (search the Web for "RISC-V vector extension proposal" also standard SIMD) 107 | 2) Add support for Sunway SW26010 with custom Chinese BASE/SIMD ISAs (64-bit) 108 | (https://en.wikipedia.org/wiki/SW26010) 109 | 3) Add support for Loongson 3 (GS464E) with LoongSIMD ops as well as MIPS64r3 110 | (https://en.wikipedia.org/wiki/Loongson) 111 | 4) Add support for SPARC64 VIIIfx HPC-ACE SIMD extensions as well as BASE ops 112 | (http://www.fujitsu.com/downloads/TC/sparc64viiifx-extensions.pdf) 113 | 5) Add support for ELBRUS architecture, emulate SIMD with VLIW (plus Itanium) 114 | (https://en.wikipedia.org/wiki/Elbrus_2000) 115 | 116 | ================================================================================ 117 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | v1.1.0d: UniSIMD, code name "ENsed+1d": macOS M1, VS2022, Ubuntu 22.04 2 | 
- switch to malloc for 64-bit pointer/address combo 3 | - use assembler-local labels to build on M1 macOS 4 | - add support for M1 macOS to makefiles 5 | - add VS2022 support for SIMD test 6 | - add notes for building on Windows with VS2022 and M1 macOS 7 | - update documentation and main header (add braces to ASM_INIT) 8 | - add double-precision logic/arithmetic to ARMv7, x86 9 | - add workarounds for POWER8 and POWER9 targets on Ubuntu 22.04 10 | - drop ppc64abi32 targets (since QEMU 5.2.0), also from QEMU build script 11 | - add notes for VS2022, QEMU 6.2.0 and 7.2.0, Ubuntu 23.04 12 | - swap 16-bit and SIMD integer compare test groups (30-37 <-> 38-44) 13 | - update copyright year to 2023 14 | 15 | v1.0.0g: UniSIMD, code name "ENsed+g", backports, VS2022, Ubuntu 22.04 16 | - switch to malloc for 64-bit pointer/address combo 17 | - require both SSE4.1 and SSE4.2 for SSE4 (v4) target slots 18 | - add DAZ support for flush-to-zero mode on x86 (makes on par with RISCs) 19 | - backport integer SIMD compare subset (min/max/ceq/cne/clt/cle/cgt/cge) 20 | - backport tests for integer SIMD compare (30-36) (signed/unsigned) 21 | - target slots AVX512DQ now include VL backends for 128/256-bit subsets 22 | - optimize SIMD compare and mask-jump instructions for AVX-512 23 | - add 64-bit sign/zero-extend bridges to existing 32/64-bit BASE subsets 24 | - optimize standalone remainder instructions on ARM and POWER 25 | - implement direct ASM section output comparison method (bypass C++ test) 26 | - extended 30-reg 256-bit and 15-reg 512-bit POWER backends are deprecated 27 | - extended POWER backends are still supported with v1.0.0f ASM feature set 28 | - add VS2022 support for SIMD test 29 | - update build scripts with TDM64-GCC 10.3.0-2 compiler reference 30 | - update documentation and main header (add braces to ASM_INIT) 31 | - add double-precision logic/arithmetic to ARMv7, x86 32 | - add workarounds for POWER8 and POWER9 targets on Ubuntu 22.04 33 | - drop ppc64abi32 
targets (since QEMU 5.2.0), also from QEMU build script 34 | - add notes for VS2022, QEMU 6.2.0 and 7.2.0, Ubuntu 23.04 35 | - clean up comments in BASE and SIMD headers 36 | - update copyright year to 2023 37 | 38 | v1.1.0c: UniSIMD, code name "ENsed+1c": full-stack SIMD/BASE 39 | - add 8-bit (byte) BASE instruction subset, redesign 16-bit BASE 40 | - add 8-bit elements SIMD subset (native on RISCs, mostly emulated on x86) 41 | - add 64-bit sign/zero-extend bridges to existing 32/64-bit BASE subsets 42 | - add 32-bit sign/zero-extend bridges to new 8/16-bit BASE subsets 43 | - add RT_BASE flag to limit addressing granularity, extend range on ARMv8 44 | - add mask-jump (mkj) SIMD instructions for 8/16-bit SIMD subsets 45 | - add DAZ support for flush-to-zero mode on x86 (makes on par with RISCs) 46 | - add support for AVX-512 fp16 subset to match existing ARMv8.2 + SVE 47 | - AVX-512 fp16 requires separate binary (no target slot or cap check) 48 | - implement direct ASM section output comparison method (bypass C++ test) 49 | - AVX-512 fp16 now provides validation for ARM's fp16 using above method 50 | - target slots AVX512DQ now include VL backends for 128/256-bit subsets 51 | - target slots AVX512DQ now require BW support to facilitate 8/16-bit SIMD 52 | - optimize SIMD compare and mask-jump instructions for AVX-512 53 | - optimize setting flags instructions in 8/16-bit BASE subsets (on RISCs) 54 | - optimize standalone remainder instructions on ARM and POWER 55 | - extended 30-reg 256-bit and 15-reg 512-bit POWER backends are deprecated 56 | - extended POWER backends are still supported with v1.0.0f ASM feature set 57 | - fix 16-bit (half-int) BASE addressing granularity on POWER 58 | - add notes for VS2022, QEMU 6.2.0, Intel SDE 9.0 59 | - clean up comments in BASE and SIMD headers 60 | - update copyright year to 2022 61 | 62 | v1.1.0b: UniSIMD, code name "ENsed+1b", second development release 63 | - implement integer SIMD compare subset (signed/unsigned) 64 | - 
add integer SIMD compare on MIPS32/64 (min/max/ceq/cne/clt/cle/cgt/cge) 65 | - add integer SIMD compare on ARMv8 (64-bit min/max emulated) and SVE 66 | - add integer SIMD compare on POWER (64-bit emulated on POWER7) 67 | - add integer SIMD compare on x86+SSE2/4 (64-bit emulated) 68 | - add integer SIMD compare on x86+AVX1/2 (emulated for full SIMD AVX1) 69 | - add integer SIMD compare on x86+AVX512 70 | - add integer SIMD compare on original legacy targets (ARMv7, x86, PPC G4) 71 | - add integer SIMD compare for half-int SIMD backends (16-bit elements) 72 | - require both SSE4.1 and SSE4.2 for SSE4 (v4) target slots 73 | - add tests for integer SIMD compare (38-51) 74 | 75 | v1.0.0f: UniSIMD, code name "ENsed+f", fixes and tests 76 | - fix displacement encodings on MIPS 77 | - add testing for displacement levels and types 78 | - update makefiles to support ancient HW (SSE2, SSE1 has issue with cvzps) 79 | - update SIMD test framework, add scripts for test automation 80 | - update comments for QEMU 5.2.0 and QEMU 6.0.0 (require ninja-build) 81 | 82 | v1.1.0a: UniSIMD, code name "ENsed+1a", first development release 83 | - add tests for half-int SIMD/BASE ops (run level 30-37) 84 | - drop extended POWER targets from SIMD testing (no half-int support) 85 | - add half-int SIMD arithmetic with saturate (except original SSE1) 86 | - add implementation for half-int BASE ops across modern targets 87 | - add BASE half-int support on legacy ARMv7 and x86 88 | - add SIMD half-int support on legacy ARMv7 and x86 89 | - adjust displacement types for BASE half-int on legacy ARMv7 and x86 90 | - adjust displacement types for BASE half-int on MIPS and POWER 91 | - adjust displacement types for BASE half-int on x86_64 92 | - adjust displacement types for scalar fp16 on ARMv8 93 | - split SIMD half-int subset from fp16 on ARMv8 94 | - add SIMD half-int support on x86_64, enable on ARMv8 95 | - add SIMD half-int support on MIPS and POWER 96 | - add preliminary support for POWER9 
fp128 SIMD ops (not tested) 97 | - add preliminary support for ARMv8.2 fp16 SIMD ops (not tested) 98 | 99 | v1.0.0e: UniSIMD, code name "ENsed+e", 2021 extended support 100 | - clarify instructions for POWER8 server, Raspberry Pi 3/4 101 | - update links and comments in project files 102 | - make comment for compiler swapping on MIPS more generic 103 | - update mappings for byte/char SIMD ops 104 | - update TDM64-GCC compiler reference to version 9.2.0 105 | - update copyright year to 2021 106 | - update comments for remainders and scaled addressing 107 | - optimize remainder ops on POWER9 108 | - add scaled-indexed addressing modes 109 | 110 | v1.0.0d: UniSIMD, code name "ENsed+d", documentation edition 111 | - clean up task descriptions in roadmap 112 | - add notes for Ubuntu, QEMU, MIPS cross-compilers 113 | - add Ubuntu (MATE) 20.04 LTS to makefile notes 114 | - update standalone MIPS compiler to 2020.06-01 115 | - change RUN_LEVEL to SUB_TEST for better wording 116 | - clean up comment about displacement values 117 | - add initial documentation for the assembler 118 | - add sin/cos and log/exp math definitions to rtbase 119 | 120 | v1.0.0c: UniSIMD, code name "ENsed++", celebration edition 121 | - celebrating C++ and its various compilers 122 | - add notes for Ubuntu Server on Raspberry Pi 4 123 | - add -mcpu=power8 compiler option to makefiles on POWER 124 | - fix RISC targets with clang after version 6.0 125 | - update copyright year to 2020 126 | 127 | v1.0.0b: UniSIMD, code name "ENsed+b", 2020-02-02 archive edition 128 | - all releases after 2020-01-01 have 2nd naming from their baseline: (ENsed) 129 | - letter from the update (b,c,..) 
appears concatenated after (+) in the name 130 | - future minor releases (v1.X.0a) will have digit and letter (+1a, +2a, +3a) 131 | - future major releases (v2.X.0a) will have the form: (2+, 2+1a, 2+2a, 2+3a) 132 | - clean up and update comments related to recent compiler and QEMU versions 133 | - fix comments for SIMD instructions in 3-operand forms, clarify for SIMD div 134 | - add SIMD fma3 aliases as 3-operand forms: fma**3** 135 | - fix SIMD fma3 emulation with fp32 elements on AVX1 136 | 137 | v1.0.0a: UniSIMD, code name "ENsed+", 2020-edition ("ENsed" + 2019 updates) 138 | - all new releases from now on will use *X.Y.Za(bc..) naming scheme 139 | - all branches start with letter (b), all tags start with letter (v) 140 | - first release (tag) on every new branch will be marked with letter (a) 141 | - all subsequent minor updates will have letters (b,c,..), tags aren't moving 142 | - add SIMD flag to replace VMX targets with VSX (on) 143 | - add signed BASE ops to combined-arithmetic-jump (arithmetic shift right) 144 | - add setting-flags BASE arithmetic shift right 145 | - make setting-flags BASE ops orthogonal to size/type (cmd**Z**) 146 | - add -mips64r6 compiler option to makefiles on MIPS 147 | - optimize 64-bit SIMD shifts on POWER9, clean up mkj** formatting 148 | - improve ARM/x86 compatibility in SIMD shifts 149 | - add SIMD integer multiply instruction (for 32/64-bit elements) 150 | - update copyright year to 2019 151 | - fix 32-bit BASE compare-to-mem on 64-bit POWER (backported down) 152 | - fix usage of non-persistent temp-register on POWER 153 | - update build instructions and makefile notes 154 | - add notes about QEMU 3.1.0 for SVE emulation 155 | - add SIMD flag to replace VMX targets with VSX (off) 156 | - fix and clean up SIMD target selection in headers 157 | - fix/add comments for SIMD/BASE shift count value 158 | - adjust build instructions for older HW compatibility 159 | - adjust Win64 release build script for lower core-count 160 | 161 
| v1.0.0: UniSIMD assembler, code name "ENsed", base for future SIMD enhancements 162 | - foundation for QuadRay engine 0.7.0 "GIzmo" with ARM-SVE, POWER9, new scheme 163 | - renewed directory structure, move BASE and SIMD header files to core/config 164 | - add new fp-compatibility and feature tasks, rename TASKS file to ROADMAP 165 | - add support for 30 SIMD register pairs (2x128) backend on POWER7/8 166 | - add support for 30 SIMD registers (scalar+128+256) backend on Skylake-X 167 | - drop standalone SSE2 target from x64, reuse SSE4 (v4) slot, add compat flag 168 | - add support for 128-bit AVX1+FMA3 (v16) and AVX2+FMA3 (v32) targets for AMD 169 | - compactify POWER7/8 targets into one slot, add new RT_SIMD_COMPAT_PW8 flag 170 | - swap legacy PowerPC G4/POWER6 VMX (now v4) with POWER7/8 VSX1/2 (now v1) 171 | - 64-bit POWER6 now matches 64-bit Nehalem target (both v4), 15x128/8x256-bit 172 | - add support for POWER9 backend (v2) with immediate vector loads/stores 173 | - move 128-bit 30 SIMD registers Skylake-X target from v1 to v2, match POWER9 174 | - reserve 128-bit v1 and 256-bit v4 for 30 SIMD registers emulation on AVX1/2 175 | - implement plain ARM-SVE backend (v4) for 256/512/1K4/2K8-bit vector lengths 176 | - implement paired ARM-SVE backend (v1) for 512/1K4/2K8-bit SIMD target slots 177 | - new scheme: RT_128=4+8, RT_256=1+2, RT_512=1+2, RT_1K4=1+2 are 15 registers 178 | - new scheme: RT_128=1+2, RT_256=4+8, RT_512=4+8, RT_1K4=4+8 are 30 registers 179 | - add elm*x_st instruction to detach scalar subset from vectors (via mem) 180 | - add support for horizontal pairwise/reductive add/mul/min/max instructions 181 | - patch system allocators to compile on macOS, widen OS support in makefiles 182 | - clean up SIMD tests to support PIE (also macOS) 183 | - separate 64-bit Linux from multilib build scripts, add for macOS 184 | - add VMX-compatible scalar SIMD subset on PPC G4 and POWER family of CPUs 185 | - add MSA/scalar compatibility on big-endian MIPS, 
support for fp32 11-bit DP 186 | - rename sections in target-specific headers to BASE, SIMD, ELEM (for scalar) 187 | - optimize long displacements for BASE, SIMD, ELEM on RISCs where applicable 188 | - implement proper SIMD-scaling for displacement types (as sliding in rtbase) 189 | - move common internal x87 FPU sections to BASE headers on x86 190 | - dedicate rtconf header for configurable instruction subsets on all targets 191 | - allow target-specific headers to redefine common instructions from rtbase 192 | - improve SIMD target reporting in tests, add -c n option to reduce test time 193 | - update notes for MIPS cross-compiler location, add -mnan=2008 to makefiles 194 | - update notes for AArch64 Linux, QEMU 3.0.0, Intel SDE, add ARM IE reference 195 | - add test for SIMD mask-move (mmv), run level 27 196 | - add test for 8/15/30 BASE/SIMD registers, run level 28 197 | - warning-free building with GCC/Clang and MSVC++ 198 | - fix BASE shifts with zero immediate arg on legacy ARMv7 (backported down) 199 | - convert all text files with unix2dos 200 | - always reserve maximum space for SIMD register file 201 | - save/restore temp predicate register on AVX512 202 | - fix SIMD registers save/restore for 15x128x2 on POWER7 203 | - fix temporary FPRs save/restore on POWER 204 | - fix scalar SIMD min/max on POWER7 205 | - fix BASE compare immediate encodings on POWER 206 | - fix location for 128/256-bit common SIMD instructions 207 | - fix for scalar SIMD alignment on ARMv7, POWER8 208 | - fix compilation in C++11 mode with RT_DEBUG=2 209 | - add comment for NaNs handling in floating point pipeline 210 | - clarify comments about SIMD fp round instructions 211 | - fix comment for SIMD shifts with count in memory 212 | - add comment for scalar/vector compatibility 213 | 214 | v0.9.1: Unified SIMD Assembler, 3-operand + basic scalar SIMD, extra backends 215 | - expose 128/256-bit SIMD subsets (cmd[i/j/l]*, cmd[c/d/f]*) simultaneously 216 | - add 3-operand SIMD
instructions to all targets, emulate where not present 217 | - implement basic scalar SIMD support (arithmetic + compare-to-mask-elem) 218 | - implement additional paired/quaded 8-register SIMD backends on x86_64 219 | - add 8-register makefile flags RT_256_R8, RT_512_R8, RT_1K4_R8, RT_2K8_R8 220 | - original 15-register makefile flags RT_128, RT_256, RT_512 remain 221 | - add new makefile flag RT_1K4 for 15-register code-bases on paired AVX-512 222 | - expose 30 registers as an extension to common baseline of 15 where present 223 | - each major architecture has at least one SIMD target with 30 registers 224 | - add new RT_SIMD selector flag to remap vector-length-agnostic subsets 225 | - add new RT_REGS selector flag to choose targets within given RT_SIMD width 226 | - rename SIMD target headers to reflect size-factor/sub-variant, move legacy 227 | - add new internal flags RT_128X*, RT_256X*, RT_512X* to match SIMD headers 228 | - new internal flags keep SIMD sub-variant value in format for native width 229 | - implement SIMD flags compatibility layer in rtzero to map makefile flags 230 | - rtarch main header selects appropriate BASE/SIMD target from flags above 231 | - implement SIMD target format converters in rtbase for runtime selection 232 | - change SIMD target reporting to native-size x size-factor v version format 233 | - reserve _RX slots in SIMD target mask for predicated backends (30+8 regs) 234 | - clean up (drop) legacy SSE(1) support from x32 headers/makefiles 235 | - move BASE sub-target selection to rtarch main header (ARM, x86) 236 | - add notes for AArch64 Linux on Raspberry Pi 3 to INSTALL file 237 | - add new TASKS file with description for future tasks 238 | - enforce full ARMv7 instruction set (32-bit words) in makefiles 239 | - fix LLVM's condition evaluation sign on all targets, define M -/+ 240 | - fix SIMD registers save/restore for 128-bit AVX targets (backported down) 241 | - fix buffer allocation in SIMD tests (for 64-bit elems) 242 | 
- fix stack alignment (now 16 bytes) on ARMv8/AArch64 (hardware) targets 243 | - allow external override (from makefiles) for SIMD compatibility modes 244 | - minor fixes in rtarch, accelerate release builds on multi-core machines 245 | 246 | v0.9.0: Unified SIMD Assembler, 256-bit SIMD on RISCs, basic AVX-512 support 247 | - adjust root rt_SIMD_INFO struct to contain both 32-bit and 64-bit constants 248 | - add new sign-mask and full-mask general purpose constants to rt_SIMD_INFO 249 | - expose 32/64-bit SIMD-element-size subsets (cmdo*, cmdq*) simultaneously 250 | - element size in existing cmdp* subset remains configurable with RT_ELEMENT 251 | - all three SIMD subsets (cmdo*, cmdp*, cmdq*) are still SIMD-width-agnostic 252 | - expose fixed 64-bit BASE subset cmdz* for 64-bit targets only 253 | - existing address-size cmdx*, element-size cmdy* and 32-bit cmdw* remain 254 | - add BASE move instructions for 64-bit immediates as pairs of 32-bit types 255 | - add new rotate-right and inverse-logic BASE instructions (ror, ann, orn) 256 | - add new BMI1/BMI2 implementations for existing BASE instructions on x86 257 | - implement non-portable x87 ISA subset for x86 targets internally 258 | - implement fused-multiply-accumulate (fma/fms) on all SIMD targets 259 | - add new mask-move SIMD instructions to common SIMD ISA (was x86 only) 260 | - add new fp-negate and inverse-logic SIMD instructions (neg, orn, not) 261 | - add new variable SIMD shifts with per-element count to all targets 262 | - implement 256-bit SIMD support (2x128-bit, 15 regs) on modern RISC targets 263 | - implement 512-bit SIMD support (4x128-bit, 15 regs) on modern POWER targets 264 | - implement 512-bit SIMD support (1x512-bit, 16 regs) on future x86 targets 265 | - AVX1/AVX2 256-bit SIMD for x86 (1x256-bit, 16 regs) remains supported 266 | - 256-bit SIMD with 15 regs becomes new common baseline for modern hardware 267 | - improve test coverage for BASE and SIMD load-op instructions 268 | - add tests 
for new rotate, logic, shifts, fma/fms instructions, run level 24 269 | - add rtzero header file to clean up assembler definitions after use 270 | - rename instruction parameters to better reflect their use as source/dest 271 | - add formulas for all BASE and SIMD instructions for better clarity 272 | - reserve the whole alphabet for future BASE and SIMD instruction subsets 273 | - add new SIMD compatibility flags for 128-bit AVX1/2, FMA/FMS/FMR, XMM regs 274 | - add wrappers for 64-bit literals to better support legacy 32-bit compilers 275 | - fix label_ld/label_st range on ARMv7/AArch64 to be on par with other targets 276 | - fix discrepancy in VMX/VSX vector-loads on POWER (from here backported down) 277 | - fix AVX-version of mmvpx_ld from zeroing to merging on x86 278 | 279 | v0.8.1: Unified SIMD Assembler, full 64-bit fp/int SIMD compute elements 280 | - add element-sized BASE ISA subset to fixed-32-bit and address-sized subsets 281 | - new instruction mnemonics introduced for element-sized BASE subset (cmdy*) 282 | - add new rtarch headers to house element-sized SIMD subset for 64-bit targets 283 | - support for 64-bit SIMD elements currently requires 64-bit addresses as well 284 | - enable full-precision SIMD rcpps/rsqps and rceps/rseps instructions 285 | - add new offset corrections for endianness related to element-sized subset 286 | - add new SIMD width short names for fixed and element-sized SIMD fields 287 | - add new custom-sized integer types (address, element) with printf mods 288 | - make current adjustable fp types follow SIMD element size (RT_ELEMENT) 289 | - adjust math macros and definitions to support double-precision arithmetic 290 | - add build/clean scripts, update makefiles with extra targets, MIPS notes 291 | - remove unnecessary limitation on SIMD masks (add AVX-512/ARM-SVE notes) 292 | - distinguish SIMD NEONv1/v2 vanilla ARM builds (cortex-a8/cortex-a15) 293 | - distinguish SIMD v2/v4 64-bit POWER builds (POWER7+VSX/POWER8+VSX2) 294 | 
- fix non-setting-flags instructions to not interfere with cmp on MIPS, POWER 295 | - fix full-precision IEEE-compat divps_ld on ARMv7 targets (backported down) 296 | 297 | v0.8.0: Unified SIMD Assembler, full 64-bit addressing for BASE and SIMD 298 | - double original 32-bit BASE ISA to fixed-32-bit and address-sized subsets 299 | - original instruction mnemonics follow in-heap/code-segment address size 300 | - new instruction mnemonics introduced for fixed-32-bit subset (cmdw*) 301 | - setting-flags instruction mnemonics remapped from (cmdz*) to (cmd*z) 302 | - add combined-arithmetic-jump wrapper for better API stability/efficiency 303 | - add new rtarch headers to house address-sized subset for 64-bit targets 304 | - move original (now address-sized) mappings to rtbase for 32-bit targets 305 | - add canonical forms for BASE div/rem and shifts (not always efficient) 306 | - add setting-flags versions for BASE orr/xor and unsigned shifts 307 | - remap one-operand instructions from cmd**_rr/mm to rx/mx and xr/xm 308 | - move stack instructions to their own section at the end of rtarch headers 309 | - move sregs instructions to their own section at the end of rtarch headers 310 | - add config flags for full-precision SIMD rcpps/rsqps instructions 311 | - add master flags for SIMD compatibility modes to rtarch main header 312 | - add new offset corrections for endianness (from here backported down) 313 | - add Win64 support via TDM64-GCC toolchain (tdm64-gcc-5.1.0-2.exe) 314 | - add NULL-ptr checks to custom allocators (Linux/mmap, Win64/VirtualAlloc) 315 | - fix setting-flags instructions for 64-bit POWER running 32-bit ISA 316 | - fix non-setting-flags instructions (neg*x) to not set flags on MIPS 317 | 318 | v0.7.1: Unified SIMD Assembler, 64/32-bit hybrid mode for native 64-bit ABI 319 | - use fixed-sized and adjustable integer types in rtbase and SIMD test 320 | - add a64 (AArch64 native ABI) and x64 (x86_64 native ABI) targets/makefiles 321 | - add m64 (MIPS64 
native ABI) and p64 (Power64 native ABI) targets/makefiles 322 | - most of the current ISA remains 32-bit for BASE and SIMD with few exceptions 323 | - adjust backend structures to support 64-bit pointer types in select places 324 | - move sys_alloc/sys_free to platform-specific sections in SIMD test 325 | - implement custom allocators (mmap) to limit address range to 32-bit (Linux) 326 | - limit address range to 2GB boundary as MIPS64 sign-extends 32-bit mem-loads 327 | - treat code labels as 64-bit in label_ld/st and jmpxx_mm instructions 328 | - implement 64-bit versions of stack_sa/la instructions on MIPS and POWER 329 | - fix variable SIMD shifts to support little-endian on POWER targets 330 | - fix ASM blocks to only use SIMD registers within VRSAVE segment on POWER 331 | - remove ASM block's zeroing of r15 as unnecessary on x32/x64 targets 332 | - reformat/rework ASM blocks to better respect internal register mapping 333 | - explicitly save/load SIMD registers in ASM blocks across all targets 334 | - drop ASM clobber lists for lack of consistency across targets/SIMD-widths 335 | - fix clang's ASM block l-value errors and other warnings, official support 336 | - add build instructions to makefiles for Ubuntu 16.04 LTS 64-bit Live CD 337 | - fix divps_ld instruction's encoding on ARM 338 | - use IEEE-compatible div/sqr on legacy ARM and POWER 339 | 340 | v0.7: Unified SIMD Assembler, additional 32-bit CPU architectures 341 | - add a32 (AArch64:ILP32 ABI) and x32 (x86_64:mx32 ABI) targets/makefiles 342 | - add m32 (MIPS32r5/r6 + MSA) and p32 (POWER + VMX/VSX) targets/makefiles 343 | - add yet another SIMD variant (v4) for x86/SSE4.1 and ARMv8/AArch32 344 | - separate ARMv7/ASIMDv2 (v2) and ARMv8/AArch32 (v4) SIMD variants on ARM 345 | - add ARM builds for Raspberry Pi 2 and 3 in addition to Nokia N900 346 | - use static linking in SIMD tests for QEMU emulation 347 | - add mmv (blendvps) to x86/x32 SSE4.1 for fast conditional loads 348 | - add 
combined-compare-jumps to rtarch for better efficiency (MIPS, POWER) 349 | - remove limitation for BASE instructions to only accept DP offsets 350 | - add new immediate/displacement types, add comment that they are unsigned 351 | - add comments throughout rtarch about instructions' set-flags behavior 352 | - implement full-range 32-bit integer divide on ARMv7 (v1) as 64-bit fp-div 353 | - add widening versions of integer multiply instructions to rtarch definitions 354 | - add remainder wrappers for integer divide instructions to rtarch definitions 355 | - add IEEE-compatible versions of fp div & sqr for ARMv7 and POWER targets 356 | - add "residual correction" to non-IEEE fp div on ARMv7 and POWER targets 357 | - add SIMD tests for fp-to-int round and int-div remainder, run level 18 358 | 359 | v0.6: Unified SIMD Assembler, additional SIMD targets 360 | - rename SIMD target files to reflect SIMD width 361 | - enable SIMD instructions definitions only if RT_SIMD_CODE is defined 362 | - add new SIMD targets for SSE1, AVX1, AVX2 with corresponding build flags 363 | - add float-to-integer convert with explicit mode parameter (x86, AArch32) 364 | - add signed-integer-divide native instruction for ARM's AArch32 mode 365 | - add SIMD test for shifts by runtime value & BASE register, run level 16 366 | - add ver (cpuid) instruction for runtime SIMD target selection (x86 only) 367 | - add mmv (vmaskmov) to AVX backend for fast conditional loads/stores 368 | - add BASE instructions sub-tests to SIMD test if RT_BASE_TEST is defined 369 | - drop set-flags bit (slow) from BASE mul instructions on ARM 370 | - add RT_SIMD_FAST_FCTRL to save 1 instruction on FCTRL blocks entry 371 | - clarify current and future targets in rtarch (from here backported down) 372 | - add xor & neg BASE instructions to rtarch 373 | - add shifts by fixed BASE register instructions 374 | - add register versions of BASE mul/div, remainder instructions 375 | - add SIMD cvzps instruction for fp-to-int 
round-towards-zero conversion 376 | - add ASM_ENTER_F/ASM_LEAVE_F/ROUND*_F for non-IEEE flush-to-zero SIMD mode 377 | - add RT_SIMD_FLUSH_ZERO to enable faster non-IEEE flush-to-zero SIMD mode 378 | - add ASM_INIT/ASM_DONE to manage root info structure 379 | - make stack pointer register architecturally invisible 380 | - replace non-standard malloc.h with stdlib.h for malloc/free 381 | - clean up rtarch whitespace formatting 382 | 383 | v0.5: Unified SIMD Assembler, API freeze for the engine 384 | - instruction naming scheme finalized 385 | - change ARM instructions to set flags 386 | - added framework for internal constants (used by reciprocals) 387 | - added SIMD instruction for cube root, reciprocal steps redesigned 388 | - additional SIMD tests, run level 15 389 | 390 | v0.4: SIMD test framework, macro assembler overhaul 391 | - macro expansion reworked for better compiler compatibility 392 | - immediate/displacement parameters handling redesigned 393 | - added reciprocal support for SSE, MPE support refined 394 | 395 | v0.3: SIMD test framework, run level 9 396 | - tests for integer mul, div, jmp instructions 397 | - SIMD tests for integer add, shl, shr instructions 398 | - SIMD tests for cvt, sqr, rsq instructions 399 | 400 | v0.2: SIMD test framework, run level 5 401 | - SIMD tests for mul, div, cmp instructions 402 | 403 | v0.1: SIMD test framework, run level 1 404 | - SIMD tests for add, sub instructions 405 | 406 | v0.0: Empty project 407 | - initial file set and directory structure 408 | -------------------------------------------------------------------------------- /core/config/rtarch_pQF_128x1v2.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* Copyright (c) 2013-2025 VectorChief (at github, bitbucket, sourceforge) */ 3 | /* Distributed under the MIT software license, see the accompanying */ 4 | /* file COPYING or 
http://www.opensource.org/licenses/mit-license.php */ 5 | /******************************************************************************/ 6 | 7 | #ifndef RT_RTARCH_PQF_128X1V2_H 8 | #define RT_RTARCH_PQF_128X1V2_H 9 | 10 | /******************************************************************************/ 11 | /********************************* LEGEND *********************************/ 12 | /******************************************************************************/ 13 | 14 | /* 15 | * rtarch_pQF_128x1v2.h: Implementation of POWER fp128 VSX3 instructions. 16 | * 17 | * This file is a part of the unified SIMD assembler framework (rtarch.h) 18 | * and contains architecture-specific extensions 19 | * outside of the common assembler core. 20 | * 21 | * Recommended naming scheme for instructions: 22 | * 23 | * cmdv*_rx - applies [cmd] to scalar-fp128: [r]egister (one operand) 24 | * cmdv*_rr - applies [cmd] to scalar-fp128: [r]egister from [r]egister 25 | * 26 | * cmdv*_rm - applies [cmd] to scalar-fp128: [r]egister from [m]emory 27 | * cmdv*_ld - applies [cmd] to scalar-fp128: as above (friendly alias) 28 | * 29 | * Note, when using fixed-data-size 128/256-bit SIMD subsets simultaneously 30 | * upper 128-bit halves of full 256-bit SIMD registers may end up undefined. 31 | * On RISC targets they remain unchanged, while on x86-AVX they are zeroed. 32 | * This happens when registers written in 128-bit subset are then used/read 33 | * from within 256-bit subset. The same rule applies to mixing with 512-bit 34 | * and wider vectors. Use of scalars may leave respective vector registers 35 | * undefined, as seen from the perspective of any particular vector subset. 36 | * 37 | * 256-bit vectors used with wider subsets may not be compatible with regards 38 | * to memory loads/stores when mixed in the code. It means that data loaded 39 | * with wider vector and stored within 256-bit subset at the same address may 40 | * result in changing the initial representation in memory. 
The same can be 41 | * said about mixing vector and scalar subsets. Scalars can be completely 42 | * detached on some architectures. Use elm*x_st to store 1st vector element. 43 | * 128-bit vectors should be memory-compatible with any wider vector subset. 44 | * 45 | * Handling of NaNs in the floating point pipeline may not be consistent 46 | * across different architectures. Avoid NaNs entering the data flow by using 47 | * masking or control flow instructions. Apply special care when dealing with 48 | * floating point compare and min/max input/output. The result of floating point 49 | * compare instructions can be considered a -QNaN, though it is also interpreted 50 | * as integer -1 and is often treated as a mask. Most arithmetic instructions 51 | * should propagate QNaNs unchanged, however this behavior hasn't been tested. 52 | * 53 | * Note, that instruction subsets operating on vectors of different length 54 | * may support different number of SIMD registers, therefore mixing them 55 | * in the same code needs to be done with register awareness in mind. 56 | * For example, AVX-512 supports 32 SIMD registers, while AVX2 only has 16, 57 | * as does 256-bit paired subset on ARMv8, while 128-bit and SVE have 32. 58 | * These numbers should be consistent across architectures if properly 59 | * mapped to SIMD target mask presented in rtzero.h (compatibility layer). 
60 | * 61 | * Interpretation of instruction parameters: 62 | * 63 | * upper-case params have triplet structure and require W to pass-forward 64 | * lower-case params are singular and can be used/passed as such directly 65 | * 66 | * XD - SIMD register serving as destination only, if present 67 | * XG - SIMD register serving as destination and first source 68 | * XS - SIMD register serving as second source (first if any) 69 | * XT - SIMD register serving as third source (second if any) 70 | * 71 | * RD - BASE register serving as destination only, if present 72 | * RG - BASE register serving as destination and first source 73 | * RS - BASE register serving as second source (first if any) 74 | * RT - BASE register serving as third source (second if any) 75 | * 76 | * MD - BASE addressing mode (Oeax, M***, I***) (memory-dest) 77 | * MG - BASE addressing mode (Oeax, M***, I***) (memory-dsrc) 78 | * MS - BASE addressing mode (Oeax, M***, I***) (memory-src2) 79 | * MT - BASE addressing mode (Oeax, M***, I***) (memory-src3) 80 | * 81 | * DD - displacement value (DP, DF, DG, DH, DV) (memory-dest) 82 | * DG - displacement value (DP, DF, DG, DH, DV) (memory-dsrc) 83 | * DS - displacement value (DP, DF, DG, DH, DV) (memory-src2) 84 | * DT - displacement value (DP, DF, DG, DH, DV) (memory-src3) 85 | * 86 | * IS - immediate value (is used as a second or first source) 87 | * IT - immediate value (is used as a third or second source) 88 | */ 89 | 90 | /******************************************************************************/ 91 | /******************************** INTERNAL ********************************/ 92 | /******************************************************************************/ 93 | 94 | #if (defined RT_SIMD_CODE) 95 | 96 | #if (RT_128X1 == 2 || RT_128X1 == 8) 97 | 98 | /******************************************************************************/ 99 | /******************************** EXTERNAL ********************************/ 100 | 
/******************************************************************************/ 101 | 102 | /******************************************************************************/ 103 | /********************************** ELEM **********************************/ 104 | /******************************************************************************/ 105 | 106 | /**************** scalar quad-precision generic move/logic ****************/ 107 | 108 | /* mov (D = S); _rr reuses the orrvx3rr opcode word with both sources = XS */ 109 | 110 | #define movvx_rr(XD, XS) \ 111 | EMITW(0xF0000497 | MXM(REG(XD), REG(XS), REG(XS))) 112 | 113 | #define movvx_ld(XD, MS, DS) \ 114 | AUW(SIB(MS), EMPTY, EMPTY, MOD(MS), VAL(DS), C2(DS), EMPTY2) \ 115 | EMITW(0x00000000 | MPM(REG(XD), MOD(MS), VAL(DS), B2(DS), P2(DS))) 116 | 117 | #define movvx_st(XS, MD, DD) \ 118 | AUW(SIB(MD), EMPTY, EMPTY, MOD(MD), VAL(DD), C2(DD), EMPTY2) \ 119 | EMITW(0x00000000 | MPM(REG(XS), MOD(MD), VAL(DD), B2(DD), O2(DD))) 120 | 121 | /* mmv (G = G mask-merge S) where (mask-elem: 0 keeps G, -1 picks S) 122 | * uses Xmm0 implicitly as a mask register, destroys Xmm0, 0-masked XS elems */ 123 | 124 | #define mmvvx_rr(XG, XS) \ 125 | EMITW(0xF000003F | MXM(REG(XG), REG(XG), REG(XS))) 126 | 127 | #define mmvvx_ld(XG, MS, DS) \ 128 | AUW(SIB(MS), EMPTY, EMPTY, MOD(MS), VAL(DS), C2(DS), EMPTY2) \ 129 | EMITW(0x00000000 | MPM(TmmM, MOD(MS), VAL(DS), B2(DS), P2(DS))) /* memory operand -> TmmM */ \ 130 | EMITW(0xF000003F | MXM(REG(XG), REG(XG), TmmM)) 131 | 132 | #define mmvvx_st(XS, MG, DG) \ 133 | AUW(SIB(MG), EMPTY, EMPTY, MOD(MG), VAL(DG), C2(DG), EMPTY2) \ 134 | EMITW(0x00000000 | MPM(TmmM, MOD(MG), VAL(DG), B2(DG), P2(DG))) /* P2 form (as in movvx_ld): [MG] -> TmmM */ \ 135 | EMITW(0xF000003F | MXM(TmmM, TmmM, REG(XS))) /* merge XS into TmmM */ \ 136 | EMITW(0x00000000 | MPM(TmmM, MOD(MG), VAL(DG), B2(DG), O2(DG))) /* O2 form (as in movvx_st): TmmM -> [MG] */ 137 | 138 | /* and (G = G & S), (D = S & T) if (#D != #T); _rr/_ld are in-place wrappers passing XG as both XD and XS */ 139 | 140 | #define andvx_rr(XG, XS) \ 141 | andvx3rr(W(XG), W(XG), W(XS)) 142 | 143 | #define andvx_ld(XG, MS, DS) \ 144 | andvx3ld(W(XG), W(XG), W(MS), W(DS)) 145 | 146 | #define andvx3rr(XD, XS, XT)
\ 147 | EMITW(0xF0000417 | MXM(REG(XD), REG(XS), REG(XT))) 148 | 149 | #define andvx3ld(XD, XS, MT, DT) \ 150 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 151 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 152 | EMITW(0xF0000417 | MXM(REG(XD), REG(XS), TmmM)) 153 | 154 | /* ann (G = ~G & S), (D = ~S & T) if (#D != #T) */ 155 | 156 | #define annvx_rr(XG, XS) \ 157 | annvx3rr(W(XG), W(XG), W(XS)) 158 | 159 | #define annvx_ld(XG, MS, DS) \ 160 | annvx3ld(W(XG), W(XG), W(MS), W(DS)) 161 | 162 | #define annvx3rr(XD, XS, XT) \ 163 | EMITW(0xF0000457 | MXM(REG(XD), REG(XT), REG(XS))) /* XT and XS are passed to MXM in swapped order */ 164 | 165 | #define annvx3ld(XD, XS, MT, DT) \ 166 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 167 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) \ 168 | EMITW(0xF0000457 | MXM(REG(XD), TmmM, REG(XS))) /* TmmM (memory operand) takes the XT slot, same swapped order */ 169 | 170 | /* orr (G = G | S), (D = S | T) if (#D != #T) */ 171 | 172 | #define orrvx_rr(XG, XS) \ 173 | orrvx3rr(W(XG), W(XG), W(XS)) 174 | 175 | #define orrvx_ld(XG, MS, DS) \ 176 | orrvx3ld(W(XG), W(XG), W(MS), W(DS)) 177 | 178 | #define orrvx3rr(XD, XS, XT) \ 179 | EMITW(0xF0000497 | MXM(REG(XD), REG(XS), REG(XT))) 180 | 181 | #define orrvx3ld(XD, XS, MT, DT) \ 182 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 183 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 184 | EMITW(0xF0000497 | MXM(REG(XD), REG(XS), TmmM)) 185 | 186 | /* orn (G = ~G | S), (D = ~S | T) if (#D != #T) */ 187 | 188 | #define ornvx_rr(XG, XS) \ 189 | ornvx3rr(W(XG), W(XG), W(XS)) 190 | 191 | #define ornvx_ld(XG, MS, DS) \ 192 | ornvx3ld(W(XG), W(XG), W(MS), W(DS)) 193 | 194 | #define ornvx3rr(XD, XS, XT) \ 195 | EMITW(0xF0000557 | MXM(REG(XD), REG(XT), REG(XS))) /* XT and XS are passed to MXM in swapped order */ 196 | 197 | #define ornvx3ld(XD, XS, MT, DT) \ 198 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 199 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) \ 200 | EMITW(0xF0000557 | MXM(REG(XD), TmmM, REG(XS))) /* TmmM (memory operand) takes the XT slot, same swapped order */
201 | 202 | /* xor (G = G ^ S), (D = S ^ T) if (#D != #T) */ 203 | 204 | #define xorvx_rr(XG, XS) \ 205 | xorvx3rr(W(XG), W(XG), W(XS)) 206 | 207 | #define xorvx_ld(XG, MS, DS) \ 208 | xorvx3ld(W(XG), W(XG), W(MS), W(DS)) 209 | 210 | #define xorvx3rr(XD, XS, XT) \ 211 | EMITW(0xF00004D7 | MXM(REG(XD), REG(XS), REG(XT))) 212 | 213 | #define xorvx3ld(XD, XS, MT, DT) \ 214 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 215 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 216 | EMITW(0xF00004D7 | MXM(REG(XD), REG(XS), TmmM)) 217 | 218 | /* not (G = ~G), (D = ~S) */ 219 | 220 | #define notvx_rx(XG) \ 221 | notvx_rr(W(XG), W(XG)) 222 | 223 | #define notvx_rr(XD, XS) \ 224 | EMITW(0xF0000517 | MXM(REG(XD), REG(XS), REG(XS))) /* single source: XS fills both MXM source fields */ 225 | 226 | /************* scalar quad-precision floating-point arithmetic ************/ 227 | 228 | /* neg (G = -G), (D = -S) */ 229 | 230 | #define negvs_rx(XG) \ 231 | negvs_rr(W(XG), W(XG)) 232 | 233 | #define negvs_rr(XD, XS) \ 234 | EMITW(0xFC000648 | MXM(REG(XD), 0x10, REG(XS))) /* constant 0x10 fills the middle MXM field; sqrvs_rr uses the same opcode word with 0x1B */ 235 | 236 | /* add (G = G + S), (D = S + T) if (#D != #T) */ 237 | 238 | #define addvs_rr(XG, XS) \ 239 | addvs3rr(W(XG), W(XG), W(XS)) 240 | 241 | #define addvs_ld(XG, MS, DS) \ 242 | addvs3ld(W(XG), W(XG), W(MS), W(DS)) 243 | 244 | #define addvs3rr(XD, XS, XT) \ 245 | EMITW(0xFC000008 | MXM(REG(XD), REG(XS), REG(XT))) 246 | 247 | #define addvs3ld(XD, XS, MT, DT) \ 248 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 249 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 250 | EMITW(0xFC000008 | MXM(REG(XD), REG(XS), TmmM)) 251 | 252 | /* sub (G = G - S), (D = S - T) if (#D != #T) */ 253 | 254 | #define subvs_rr(XG, XS) \ 255 | subvs3rr(W(XG), W(XG), W(XS)) 256 | 257 | #define subvs_ld(XG, MS, DS) \ 258 | subvs3ld(W(XG), W(XG), W(MS), W(DS)) 259 | 260 | #define subvs3rr(XD, XS, XT) \ 261 | EMITW(0xFC000408 | MXM(REG(XD), REG(XS), REG(XT))) 262 | 263 | #define subvs3ld(XD, XS, MT, DT) \ 264 |
AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 265 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 266 | EMITW(0xFC000408 | MXM(REG(XD), REG(XS), TmmM)) 267 | 268 | /* mul (G = G * S), (D = S * T) if (#D != #T) */ 269 | 270 | #define mulvs_rr(XG, XS) \ 271 | mulvs3rr(W(XG), W(XG), W(XS)) 272 | 273 | #define mulvs_ld(XG, MS, DS) \ 274 | mulvs3ld(W(XG), W(XG), W(MS), W(DS)) 275 | 276 | #define mulvs3rr(XD, XS, XT) \ 277 | EMITW(0xFC000048 | MXM(REG(XD), REG(XS), REG(XT))) 278 | 279 | #define mulvs3ld(XD, XS, MT, DT) \ 280 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 281 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 282 | EMITW(0xFC000048 | MXM(REG(XD), REG(XS), TmmM)) 283 | 284 | /* div (G = G / S), (D = S / T) if (#D != #T) and on ARMv7 if (#D != #S) */ 285 | 286 | #define divvs_rr(XG, XS) \ 287 | divvs3rr(W(XG), W(XG), W(XS)) 288 | 289 | #define divvs_ld(XG, MS, DS) \ 290 | divvs3ld(W(XG), W(XG), W(MS), W(DS)) 291 | 292 | #define divvs3rr(XD, XS, XT) \ 293 | EMITW(0xFC000448 | MXM(REG(XD), REG(XS), REG(XT))) 294 | 295 | #define divvs3ld(XD, XS, MT, DT) \ 296 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 297 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 298 | EMITW(0xFC000448 | MXM(REG(XD), REG(XS), TmmM)) 299 | 300 | /* sqr (D = sqrt S) */ 301 | 302 | #define sqrvs_rr(XD, XS) \ 303 | EMITW(0xFC000648 | MXM(REG(XD), 0x1B, REG(XS))) /* same opcode word as negvs_rr, middle MXM field 0x1B instead of 0x10 */ 304 | 305 | #define sqrvs_ld(XD, MS, DS) \ 306 | AUW(SIB(MS), EMPTY, EMPTY, MOD(MS), VAL(DS), C2(DS), EMPTY2) \ 307 | EMITW(0x00000000 | MPM(TmmM, MOD(MS), VAL(DS), B2(DS), P2(DS))) /* memory operand -> TmmM */ \ 308 | EMITW(0xFC000648 | MXM(REG(XD), 0x1B, TmmM)) 309 | 310 | /* fma (G = G + S * T) if (#G != #S && #G != #T); XG is accumulator and destination */ 311 | 312 | #define fmavs_rr(XG, XS, XT) \ 313 | EMITW(0xFC000308 | MXM(REG(XG), REG(XS), REG(XT))) 314 | 315 | #define fmavs_ld(XG, XS, MT, DT) \ 316 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 317 |
EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 318 | EMITW(0xFC000308 | MXM(REG(XG), REG(XS), TmmM)) 319 | 320 | /* fms (G = G - S * T) if (#G != #S && #G != #T) 321 | * NOTE(review): opcode word differs from fmavs only in 0x3C8 vs 0x308, presumably a negated multiply-subtract form - confirm against the ISA */ 322 | #define fmsvs_rr(XG, XS, XT) \ 323 | EMITW(0xFC0003C8 | MXM(REG(XG), REG(XS), REG(XT))) 324 | 325 | #define fmsvs_ld(XG, XS, MT, DT) \ 326 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 327 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 328 | EMITW(0xFC0003C8 | MXM(REG(XG), REG(XS), TmmM)) 329 | 330 | /************* scalar quad-precision integer arithmetic/shifts ************/ 331 | 332 | /* add (G = G + S), (D = S + T) if (#D != #T) */ 333 | 334 | #define addvx_rr(XG, XS) \ 335 | addvx3rr(W(XG), W(XG), W(XS)) 336 | 337 | #define addvx_ld(XG, MS, DS) \ 338 | addvx3ld(W(XG), W(XG), W(MS), W(DS)) 339 | 340 | #define addvx3rr(XD, XS, XT) \ 341 | EMITW(0x10000100 | MXM(REG(XD), REG(XS), REG(XT))) 342 | 343 | #define addvx3ld(XD, XS, MT, DT) \ 344 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 345 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 346 | EMITW(0x10000100 | MXM(REG(XD), REG(XS), TmmM)) 347 | 348 | /* sub (G = G - S), (D = S - T) if (#D != #T) */ 349 | 350 | #define subvx_rr(XG, XS) \ 351 | subvx3rr(W(XG), W(XG), W(XS)) 352 | 353 | #define subvx_ld(XG, MS, DS) \ 354 | subvx3ld(W(XG), W(XG), W(MS), W(DS)) 355 | 356 | #define subvx3rr(XD, XS, XT) \ 357 | EMITW(0x10000500 | MXM(REG(XD), REG(XS), REG(XT))) 358 | 359 | #define subvx3ld(XD, XS, MT, DT) \ 360 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 361 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 362 | EMITW(0x10000500 | MXM(REG(XD), REG(XS), TmmM)) 363 | 364 | /* shl (G = G << S), (D = S << T) if (#D != #T) - plain, unsigned 365 | * for maximum compatibility: shift count must be modulo elem-size 366 | * (the immediate count is masked with 0x7F inside shlvx3ri) */ 367 | #define shlvx_ri(XG, IS) /* in-place form: XG is passed as both destination and source */ \ 368 | shlvx3ri(W(XG), W(XG), W(IS)) 369 | 370 | #define
shlvx_ld(XG, MS, DS) /* loads SIMD, uses first elem, rest zeroed */ \ 371 | shlvx3ld(W(XG), W(XG), W(MS), W(DS)) 372 | 373 | #define shlvx3ri(XD, XS, IT) \ 374 | EMITW(0xF00002D1 | TmmM << 21 | (0x7F & VAL(IT)) << 11) /* splat 7-bit count into TmmM */ \ 375 | EMITW(0x1000040C | MXM(REG(XD), REG(XS), TmmM)) /* octet step: XD = XS << whole octets of count */ \ 376 | EMITW(0x100001C4 | MXM(REG(XD), REG(XD), TmmM)) /* bit step: continues from XD (not XS) so the octet step is kept when XD != XS */ 377 | 378 | #define shlvx3ld(XD, XS, MT, DT) \ 379 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 380 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 381 | EMITW(0x1000020C | MXM(TmmM, 0x0F, TmmM)) /* replicate count byte (index 0x0F) across TmmM */ \ 382 | EMITW(0x1000040C | MXM(REG(XD), REG(XS), TmmM)) /* octet step: XD from XS */ \ 383 | EMITW(0x100001C4 | MXM(REG(XD), REG(XD), TmmM)) /* bit step: continues from XD */ 384 | 385 | /* shr (G = G >> S), (D = S >> T) if (#D != #T) - plain, unsigned 386 | * for maximum compatibility: shift count must be modulo elem-size */ 387 | 388 | #define shrvx_ri(XG, IS) \ 389 | shrvx3ri(W(XG), W(XG), W(IS)) 390 | 391 | #define shrvx_ld(XG, MS, DS) /* loads SIMD, uses first elem, rest zeroed */ \ 392 | shrvx3ld(W(XG), W(XG), W(MS), W(DS)) 393 | 394 | #define shrvx3ri(XD, XS, IT) \ 395 | EMITW(0xF00002D1 | TmmM << 21 | (0x7F & VAL(IT)) << 11) /* splat 7-bit count into TmmM */ \ 396 | EMITW(0x1000044C | MXM(REG(XD), REG(XS), TmmM)) /* octet step: XD = XS >> whole octets of count */ \ 397 | EMITW(0x100002C4 | MXM(REG(XD), REG(XD), TmmM)) /* bit step: continues from XD (not XS) so the octet step is kept when XD != XS */ 398 | 399 | #define shrvx3ld(XD, XS, MT, DT) \ 400 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 401 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 402 | EMITW(0x1000020C | MXM(TmmM, 0x0F, TmmM)) /* replicate count byte (index 0x0F) across TmmM */ \ 403 | EMITW(0x1000044C | MXM(REG(XD), REG(XS), TmmM)) /* octet step: XD from XS */ \ 404 | EMITW(0x100002C4 | MXM(REG(XD), REG(XD), TmmM)) /* bit step: continues from XD */ 405 | 406 | /* svl (G = G << S), (D = S << T) if (#D != #T) - variable, unsigned 407 | * for maximum compatibility: shift count must be modulo elem-size */ 408 | 409 | #define svlvx_rr(XG, XS) /* variable shift with per-elem count */ \ 410 | svlvx3rr(W(XG), W(XG), W(XS)) 411 | 412 | #define svlvx_ld(XG, MS, DS) /* variable shift with per-elem count */ \ 413 | svlvx3ld(W(XG), W(XG), W(MS),
W(DS)) 414 | 415 | #define svlvx3rr(XD, XS, XT) \ 416 | EMITW(0x1000020C | MXM(TmmM, 0x0F, REG(XT))) /* replicate count byte (index 0x0F) of XT across TmmM */ \ 417 | EMITW(0x1000040C | MXM(REG(XD), REG(XS), TmmM)) /* octet step: XD from XS */ \ 418 | EMITW(0x100001C4 | MXM(REG(XD), REG(XD), TmmM)) /* bit step: continues from XD (not XS) so the octet step is kept when XD != XS */ 419 | 420 | #define svlvx3ld(XD, XS, MT, DT) \ 421 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 422 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 423 | EMITW(0x1000020C | MXM(TmmM, 0x0F, TmmM)) /* replicate count byte (index 0x0F) across TmmM */ \ 424 | EMITW(0x1000040C | MXM(REG(XD), REG(XS), TmmM)) /* octet step: XD from XS */ \ 425 | EMITW(0x100001C4 | MXM(REG(XD), REG(XD), TmmM)) /* bit step: continues from XD */ 426 | 427 | /* svr (G = G >> S), (D = S >> T) if (#D != #T) - variable, unsigned 428 | * for maximum compatibility: shift count must be modulo elem-size */ 429 | 430 | #define svrvx_rr(XG, XS) /* variable shift with per-elem count */ \ 431 | svrvx3rr(W(XG), W(XG), W(XS)) 432 | 433 | #define svrvx_ld(XG, MS, DS) /* variable shift with per-elem count */ \ 434 | svrvx3ld(W(XG), W(XG), W(MS), W(DS)) 435 | 436 | #define svrvx3rr(XD, XS, XT) \ 437 | EMITW(0x1000020C | MXM(TmmM, 0x0F, REG(XT))) /* replicate count byte (index 0x0F) of XT across TmmM */ \ 438 | EMITW(0x1000044C | MXM(REG(XD), REG(XS), TmmM)) /* octet step: XD from XS */ \ 439 | EMITW(0x100002C4 | MXM(REG(XD), REG(XD), TmmM)) /* bit step: continues from XD (not XS) so the octet step is kept when XD != XS */ 440 | 441 | #define svrvx3ld(XD, XS, MT, DT) \ 442 | AUW(SIB(MT), EMPTY, EMPTY, MOD(MT), VAL(DT), C2(DT), EMPTY2) \ 443 | EMITW(0x00000000 | MPM(TmmM, MOD(MT), VAL(DT), B2(DT), P2(DT))) /* memory operand -> TmmM */ \ 444 | EMITW(0x1000020C | MXM(TmmM, 0x0F, TmmM)) /* replicate count byte (index 0x0F) across TmmM */ \ 445 | EMITW(0x1000044C | MXM(REG(XD), REG(XS), TmmM)) /* octet step: XD from XS */ \ 446 | EMITW(0x100002C4 | MXM(REG(XD), REG(XD), TmmM)) /* bit step: continues from XD */ 447 | 448 | /******************************************************************************/ 449 | /******************************** INTERNAL ********************************/ 450 | /******************************************************************************/ 451 | 452 | #endif /* RT_128X1 */ 453 | 454 | #endif /* RT_SIMD_CODE */ 455 | 456 | #endif /* RT_RTARCH_PQF_128X1V2_H */ 457 | 458 | /******************************************************************************/ 459 |
/******************************************************************************/ 460 | /******************************************************************************/ 461 | -------------------------------------------------------------------------------- /core/config/rtdocs.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* Copyright (c) 2013-2025 VectorChief (at github, bitbucket, sourceforge) */ 3 | /* Distributed under the MIT software license, see the accompanying */ 4 | /* file COPYING or http://www.opensource.org/licenses/mit-license.php */ 5 | /******************************************************************************/ 6 | 7 | /******************************************************************************/ 8 | /********************************* LEGEND *********************************/ 9 | /******************************************************************************/ 10 | 11 | /* 12 | * rtdocs.h: Documentation on how to get started using the assembler. 13 | * Table of contents is provided below. 14 | * 15 | * Chapter 1 - Overview 16 | * Chapter 2 - Introduction 17 | * Chapter 3 - Application types 18 | * Chapter 4 - Initialization 19 | * Chapter 5 - Configuration 20 | * 21 | * It is recommended to read all chapters at least once 22 | * before getting started with the code. 23 | */ 24 | 25 | /******************************************************************************/ 26 | /************************** CHAPTER 1 - OVERVIEW ***************************/ 27 | /******************************************************************************/ 28 | 29 | /* 30 | * The general structure of the application using UniSIMD is given below. 31 | * It is usually a combination of standard C/C++ code with some inline assembler 32 | * parts. Something like this: 33 | * 34 | * void func(rt_SIMD_INFOX *inf) 35 | * { 36 | * ASM_ENTER(inf) 37 | * .. 
38 | * ASM_LEAVE(inf) 39 | * } 40 | * 41 | * The code above shows a C/C++ function with a parameter and ASM code section 42 | * within it. The parameter is a pointer to a SIMD-aligned structure 43 | * that is used to pass all the data to the ASM section and back if needed. 44 | * ASM section can read and write fields of that structure. 45 | * 46 | * There can be two types of instructions within the ASM section: BASE and SIMD. 47 | * UniSIMD also defines register sets that are common for all architectures 48 | * including variants of a single architecture. 49 | * 50 | * So, with UniSIMD there will always be: Reax/Rebx/Recx/... for BASE and 51 | * Xmm0/Xmm1/Xmm2/... for SIMD. However, these common definitions are then 52 | * mapped to actual architectural registers of ARM/MIPS/POWER and x86. 53 | * 54 | * Both BASE and SIMD register sizes depend on the type of instruction used. 55 | * UniSIMD defines a number of instruction subsets that are again common 56 | * across all architectures. 57 | * 58 | * Some instructions work with fixed register sizes, others have it configurable 59 | * with a flag. The full list of instruction subsets along with registers 60 | * and various addressing modes can be found in "core/config/rtzero.h", 61 | * while "test/simd_test.cpp" shows how these definitions can be used. 62 | * Refer to "c_test01" and "s_test01" for a start. 63 | * 64 | * Once the program is expressed with UniSIMD's syntax (C/C++ with ASM sections) 65 | * it can then be built for any supported architecture without having a need 66 | * to modify the source code again. Just pick the right makefile. 
67 | */ 68 | 69 | /******************************************************************************/ 70 | /************************ CHAPTER 2 - INTRODUCTION *************************/ 71 | /******************************************************************************/ 72 | 73 | /* 74 | * In order for UniSIMD to work as intended application source files need 75 | * to do a few things first. 76 | * 77 | * #define RT_SIMD_CODE // enable SIMD instruction definitions 78 | * #define RT_DATA 8 // define data load-level for backend structures 79 | * 80 | * #include "rtbase.h" // include UniSIMD's base header after the 2 flags above 81 | * 82 | * As some SIMD widths (128/256/512-bit) are limited to specific CPU generations 83 | * (SSE/AVX/AVX-512) the use of SIMD is turned off by default and needs to be 84 | * explicitly enabled with a flag to make the generic ASM sections (no SIMD) 85 | * portable across generations. This is mostly relevant when implementing 86 | * runtime detection of SIMD and subsequent multi-targeting. 87 | * 88 | * Once SIMD instructions are enabled ASM sections will preserve/restore full 89 | * SIMD registers they are configured to work with. The maximal SIMD width 90 | * for a build is set in a makefile with RT_128=a/RT_256=b/RT_512=c/... and 91 | * is defined internally as Q (=1/2/4/...), while a/b/c define variant 92 | * within a given SIMD width. 93 | * 94 | * As UniSIMD needs to adjust for changing SIMD widths when working with 95 | * backend structures it needs to know how much they are filled. 96 | * The next (second) flag then defines the data load-level common for all 97 | * backend structures and ASM sections within its scope: 98 | * 99 | * 1 - means full DP-level (12-bit displacements) is filled or exceeded (Q=1). 100 | * 2 - means 1/2 DP-level (11-bit displacements) has not been exceeded (Q=1). 101 | * 4 - means 1/4 DP-level (10-bit displacements) has not been exceeded (Q=1). 
102 | * 8 - means 1/8 DP-level (9-bit displacements) has not been exceeded (Q=1). 103 | * 16 means 1/16 DP-level (8-bit displacements) has not been exceeded (Q=1). 104 | * NOTE: the built-in rt_SIMD_INFO structure is already filled at full 1/16th. 105 | * 106 | * The load-level is measured at Q equal to 1 and UniSIMD then adjusts internal 107 | * displacement values as Q scales up. 108 | * 109 | * UniSIMD defines a lot of simple single-letter internal values which can 110 | * interfere with program's own variables, especially when adding UniSIMD to 111 | * an existing project. It is therefore recommended using a separate file 112 | * for ASM header and sections or adding them at the end of an existing file, 113 | * while keeping function declarations to be used in the program at the top. 114 | * 115 | * All applications need to include a single root header with base types and 116 | * definitions so that UniSIMD can do the rest of configuration based on 117 | * makefile flags. Depending on where the source files are located makefile 118 | * should specify a relative path to "core/config/" in order for UniSIMD headers 119 | * to become available. 120 | */ 121 | 122 | /******************************************************************************/ 123 | /********************** CHAPTER 3 - APPLICATION TYPES **********************/ 124 | /******************************************************************************/ 125 | 126 | /* 127 | * There can be two types of applications written with UniSIMD - single-target 128 | * and multi-target. In the first case the binary is configured and carries 129 | * the code for just one target (CPU generation or SIMD width and variant). 130 | * In the second case the binary carries multiple code sections for different 131 | * targets (CPU generations or SIMD widths and variants). 
132 | * 133 | * Some architectures like x86 allow for runtime target detection on 134 | * the application level (user-space), others like ARM/MIPS/POWER only provide 135 | * that information to an operating system (privileged), which makes producing 136 | * multi-target binaries for those architectures a bit cumbersome as 137 | * they become OS-specific. 138 | * 139 | * The test framework within UniSIMD is a single-target application, which means 140 | * only one CPU generation or SIMD width and variant per build. However, its use 141 | * of portable instruction subsets (cmdx*, cmdy* for BASE and cmdp*, cmds* for 142 | * SIMD and scalar) allows it to configure the same code-base for many different 143 | * targets and produce a separate binary for each target from a single source. 144 | * 145 | * Creating a proper multi-target binary requires use of C++ namespaces and 146 | * additional source-level target files, which would then include the same 147 | * portable code-base and wrap it into a target-specific namespace with a set of 148 | * flags for that target. In addition to that a generic ASM section should 149 | * determine the target at runtime and select appropriate code-path in a switch. 150 | * 151 | * A good example of how to build a multi-target binary with UniSIMD is provided 152 | * in the QuadRay engine (core/tracer). In that example backend structures are 153 | * always defined for the maximal SIMD width (Q internally) configured in 154 | * makefiles (RT_128/RT_256/RT_512/...). However, the portable ASM code-base 155 | * needs to be aware of the actual SIMD width selected at runtime 156 | * and currently running. 157 | * 158 | * This is handled with a target-specific RT_SIMD_QUADS definition, which is 159 | * expressed in the same terms as Q (1/2/4/8/16), but is different from Q as it 160 | * always reflects the currently active SIMD width and not the maximal 161 | * SIMD width defined for the build.
162 | */ 163 | 164 | /******************************************************************************/ 165 | /************************ CHAPTER 4 - INITIALIZATION ***********************/ 166 | /******************************************************************************/ 167 | 168 | /* 169 | * Some emulated instructions within ASM sections rely on general purpose 170 | * constants in rt_SIMD_INFO structure defined in "core/config/rtbase.h". 171 | * They need to be initialized before the pointer to this structure is passed to 172 | * the first ASM section and deinitialized after the last one. It's done with: 173 | * 174 | * ASM_INIT(inf, reg) 175 | * 176 | * .. 177 | * 178 | * ASM_DONE(inf) 179 | * 180 | * Here "reg" is a pointer to SIMD-aligned structure rt_SIMD_REGS intended 181 | * to keep the state of all SIMD registers (from C/C++ code) while ASM section 182 | * is doing some processing. It can be allocated separately or as a part of a 183 | * larger combined "inf+reg" structure. In any case both pointers should end up 184 | * SIMD-aligned (divisible by full SIMD-width they are pointing at in bytes). 185 | * 186 | * As was mentioned previously "inf" is a pointer to rt_SIMD_INFOX structure, 187 | * which is usually an extension of rt_SIMD_INFO. The extension of the initial 188 | * built-in rt_SIMD_INFO structure can be done with inheritance (in C++) or 189 | * embedding (in C). This step (extension) is necessary in order to pass 190 | * application-specific parameters into application-defined ASM sections, 191 | * something that generic rt_SIMD_INFO cannot provide. 192 | * 193 | * Once "inf" pointer of initialized structure is passed to the ASM section 194 | * it shows up as Rebp register and can be accessed via Mebp addressing mode 195 | * with corresponding displacements (offsets) defined in rt_SIMD_INFO and 196 | * rt_SIMD_INFOX (by extension). 
197 | * 198 | * Potential future improvement is to use an array instead of structure to avoid 199 | * possible paddings that compiler may introduce for its own needs (alignment), 200 | * in which case some parts of the assembler will need to be redesigned. 201 | * ASM_ENTER/LEAVE macros can be converted into just-in-time compilation along 202 | * with EMITW/EMITH/EMITB/LBL to avoid possible compiler issues with inline ASM. 203 | * The order of arithmetic and shifts within internal definitions can be 204 | * hardened by using extra parentheses (in a form of round brackets). 205 | * 206 | * Right shifts on signed/unsigned data types in C/C++ are not guaranteed by 207 | * the standard to produce arithmetic/logical shift instructions respectively, 208 | * therefore some tests within SIMD test framework may need to be rewritten. 209 | * Modern open-source compilers produce consistent results only with data sizes 210 | * above 8-bit (char). With 8-bit signed/unsigned char ARM and POWER compilers 211 | * show discrepancy in right shifts behavior relative to MIPS and x86. 212 | * 213 | * For every BASE register starting with R*** (like Rebx, Recx, Redx, ...) 214 | * there is a corresponding addressing mode starting with M*** (like Mebx, Mecx, 215 | * Medx, ...), which treats the register as a pointer and dereferences it with 216 | * additional displacement (offset) given as a separate parameter to cmd**_ld/st 217 | * instructions. 218 | * 219 | * The use of Reax is reserved for indexed addressing mode in the form of I*** 220 | * (like Iebx, Iecx, Iedx, ...) in which case the address is calculated as a sum 221 | * of R*** + Reax + displacement, where R*** is the BASE register encoded in the 222 | * addressing mode. Scaled indexed addressing modes are supported as J***, K***, 223 | * L*** (with Reax), while S***, T***, U***, V*** accept any BASE register index 224 | * maintaining the same built-in scaling factors 1x/2x/4x/8x respectively. 
225 | * Fully configurable N*** takes index register and scale (1,2,3) for 2x/4x/8x. 226 | * Reax is also used for plain addressing mode (Oeax) without displacement 227 | * in which case PLAIN is passed as a displacement to cmd**_ld/st instructions. 228 | */ 229 | 230 | /******************************************************************************/ 231 | /************************ CHAPTER 5 - CONFIGURATION ************************/ 232 | /******************************************************************************/ 233 | 234 | /* 235 | * The initialization of SIMD fields within SIMD-aligned backend structures 236 | * can be streamlined with RT_SIMD_SET(s, v) macros used from within C/C++ code. 237 | * In this case "s" represents SIMD field (usually an array of elements) and "v" 238 | * represents scalar value that is going to be replicated across all elements. 239 | * 240 | * The RT_SIMD_SET macro is the most generic form which is then mapped to 241 | * target-specific form depending on the configured SIMD element size and 242 | * the maximal SIMD width. The RT_SIMD_SET32 and RT_SIMD_SET64 always work with 243 | * 32-bit and 64-bit SIMD elements respectively regardless of configuration, 244 | * but they both still respect maximal SIMD width. 245 | * 246 | * The element size is configured with RT_ELEMENT=32/64 definition from within 247 | * makefiles, while RT_ADDRESS, RT_POINTER define the respective address and 248 | * pointer sizes. These definitions affect the size of configurable scalar and 249 | * vector types used within backend structures and throughout C/C++ code. 250 | * For example, rt_elem/rt_uelm, rt_real depend on RT_ELEMENT, rt_addr/rt_uadr 251 | * depend on RT_ADDRESS, while rt_pntr/rt_uptr and rt_cell/rt_word depend on 252 | * RT_POINTER (which is fixed for the chosen target and cannot be changed). 
253 | * 254 | * In addition to the already mentioned flags and definitions, "core/config/rtbase.h" 255 | * defines other useful constants, like R, T and S, to configure the size of 256 | * SIMD fields depending on the chosen SIMD element size and maximal SIMD width. 257 | * Short names P, A and L represent RT_POINTER, RT_ADDRESS and RT_ELEMENT 258 | * in base units: 1 for 32-bit, 2 for 64-bit. 259 | * 260 | * Note that logical cmdpx instructions are configured for floating-point 261 | * SIMD pipeline on x86 where applicable, while logical cmdmx instructions 262 | * are better suited for integer SIMD workloads. 263 | * 264 | * Constants like B/C/D/.../I define various displacement corrections for 265 | * endianness, when C/C++ and ASM sections work on different data sizes packed 266 | * within a single larger field. 267 | * 268 | * Similar to how displacements are defined and then passed to BASE and SIMD 269 | * cmd**_ld/st instructions, immediate values of various sizes can be passed to 270 | * BASE cmd**_ri/rj instructions. The assembler defines the following immediate 271 | * and displacement types: IC/IB/IM/IG/IH/IV/IW as 7/8/12/15/16/31/32-bit values 272 | * and DP/DE/DF/DG/DH/DV as 12/13/14/15/16/31-bit values respectively. 273 | * 274 | * Both displacement and immediate common types are defined in the corresponding 275 | * "core/config/rtarch_*32.h" files for each architecture individually. 276 | * Displacements are then additionally scaled with Q and RT_DATA expressed via the O 277 | * definition in "core/config/rtbase.h". Immediate arguments only apply to BASE 278 | * instructions and don't need any additional SIMD scaling. All displacement and 279 | * immediate values are always unsigned within the assembler.
280 | */ 281 | 282 | /******************************************************************************/ 283 | /******************************************************************************/ 284 | /******************************************************************************/ 285 | -------------------------------------------------------------------------------- /test/build_cross.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for x86_64 Linux build environment 3 | # with many g++ cross-compilers installed (64-bit Ubuntu MATE 20.04 LTS tested) 4 | # refer to individual makefiles for installation instructions 5 | # Build phase: one make run per target makefile; -jN sets the parallel job count. 6 | make -f simd_make_arm.mk build -j2 7 | make -f simd_make_m32.mk build -j2 8 | make -f simd_make_p32.mk build -j4 9 | make -f simd_make_a64.mk build -j8 10 | make -f simd_make_m64.mk build -j8 11 | make -f simd_make_p64.mk build -j8 12 | # Strip phase: runs unconditionally (no 'set -e'), even if a build above failed. 13 | make -f simd_make_arm.mk strip 14 | make -f simd_make_m32.mk strip 15 | make -f simd_make_p32.mk strip 16 | make -f simd_make_a64.mk strip 17 | make -f simd_make_m64.mk strip 18 | make -f simd_make_p64.mk strip 19 | -------------------------------------------------------------------------------- /test/build_linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for x86_64 Linux build environment 3 | # with native g++ compiler installed (64-bit Linux Mint 18 tested) 4 | # works on Ubuntu MATE 18.04/20.04 LTS (binaries aren't backward compatible) 5 | # Run the build target of simd_make_x64.mk with 8 parallel jobs, then its strip target. 6 | make -f simd_make_x64.mk build -j8 7 | 8 | make -f simd_make_x64.mk strip 9 | -------------------------------------------------------------------------------- /test/build_macM1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for AArch64 macOS build environment with Apple Silicon (M1 chip) 3 | # with Command Line Tools installed (macOS BigSur/Monterey tested) 4 | # build on the
least recent OS as binaries aren't always backward compatible 5 | # clang: native clang++ build of the simd_test.a64* binaries (see simd_make_a64.mk). 6 | make -f simd_make_a64.mk clang -j8 7 | # macRD: remove the simd_test.a64*.dSYM directories. 8 | make -f simd_make_a64.mk macRD 9 | # macST: strip the simd_test.a64* binaries. 10 | make -f simd_make_a64.mk macST 11 | # macOS: rename the simd_test.a64* outputs to simd_test.d64*. 12 | make -f simd_make_a64.mk macOS 13 | -------------------------------------------------------------------------------- /test/build_macOS.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for x86_64 Mac OS X / OS X / macOS build environment 3 | # with Command Line Tools installed (Mac OS X Lion / macOS High Sierra tested) 4 | # build on the least recent OS as binaries aren't always backward compatible 5 | # Build, then macRD / strip / macOS post-steps; presumably analogous to the same-named targets of simd_make_a64.mk - confirm in simd_make_x64.mk. 6 | make -f simd_make_x64.mk build -j8 7 | 8 | make -f simd_make_x64.mk macRD 9 | 10 | make -f simd_make_x64.mk strip 11 | 12 | make -f simd_make_x64.mk macOS 13 | -------------------------------------------------------------------------------- /test/build_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for x86_64 Linux build environment 3 | # with native g++ multilib-compiler installed (64-bit Linux Mint 18 tested) 4 | # refer to individual makefiles for installation instructions 5 | # NOTE(review): the x32 build below runs without -j (serial), unlike the x86 build - confirm this is intentional. 6 | make -f simd_make_x86.mk build -j4 7 | make -f simd_make_x32.mk build 8 | 9 | make -f simd_make_x86.mk strip 10 | make -f simd_make_x32.mk strip 11 | -------------------------------------------------------------------------------- /test/build_nokia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for scratchbox Linux build environment (32-bit Ubuntu 10.10 tested) 3 | # http://wiki.maemo.org/Documentation/Maemo_5_Final_SDK_Installation 4 | # build_n900 / strip_n900 build and strip simd_test.arm_n900 (see simd_make_arm.mk). 5 | make -f simd_make_arm.mk build_n900 6 | 7 | make -f simd_make_arm.mk strip_n900 8 | -------------------------------------------------------------------------------- /test/build_raspi.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for ARMv7 Linux build environment 3 | # with native g++ compiler installed (32-bit Raspbian 7 and 8 tested) 4 | # build_rpiX / strip_rpiX build and strip simd_test.arm_rpi2 and simd_test.arm_rpi3 (see simd_make_arm.mk). 5 | make -f simd_make_arm.mk build_rpiX -j4 6 | 7 | make -f simd_make_arm.mk strip_rpiX 8 | -------------------------------------------------------------------------------- /test/build_win64.bat: -------------------------------------------------------------------------------- 1 | :: Intended for x86_64 Windows build environment 2 | :: with TDM64-GCC compiler installed (64-bit Windows 7 SP1, Windows 10 tested) 3 | 4 | mingw32-make -f simd_make_w64.mk build -j4 5 | 6 | mingw32-make -f simd_make_w64.mk strip 7 | -------------------------------------------------------------------------------- /test/clean_cross.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for x86_64 Linux build environment 3 | # with many g++ cross-compilers installed (64-bit Ubuntu MATE 20.04 LTS tested) 4 | # refer to individual makefiles for installation instructions 5 | # Run the clean target of each makefile that build_cross.sh builds with. 6 | make -f simd_make_arm.mk clean 7 | make -f simd_make_m32.mk clean 8 | make -f simd_make_p32.mk clean 9 | make -f simd_make_a64.mk clean 10 | make -f simd_make_m64.mk clean 11 | make -f simd_make_p64.mk clean 12 | -------------------------------------------------------------------------------- /test/clean_linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for x86_64 Linux build environment 3 | # with native g++ compiler installed (64-bit Linux Mint 18 tested) 4 | # works on Ubuntu MATE 18.04/20.04 LTS (binaries aren't backward compatible) 5 | 6 | make -f simd_make_x64.mk clean 7 | -------------------------------------------------------------------------------- /test/clean_macM1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2
| # Intended for AArch64 macOS build environment with Apple Silicon (M1 chip) 3 | # with Command Line Tools installed (macOS BigSur/Monterey tested) 4 | # build on the least recent OS as binaries aren't always backward compatible 5 | # macRM removes the renamed simd_test.d64* binaries (see simd_make_a64.mk). 6 | make -f simd_make_a64.mk macRM 7 | -------------------------------------------------------------------------------- /test/clean_macOS.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for x86_64 Mac OS X / OS X / macOS build environment 3 | # with Command Line Tools installed (Mac OS X Lion / macOS High Sierra tested) 4 | # build on the least recent OS as binaries aren't always backward compatible 5 | 6 | make -f simd_make_x64.mk macRM 7 | -------------------------------------------------------------------------------- /test/clean_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for x86_64 Linux build environment 3 | # with native g++ multilib-compiler installed (64-bit Linux Mint 18 tested) 4 | # refer to individual makefiles for installation instructions 5 | 6 | make -f simd_make_x86.mk clean 7 | make -f simd_make_x32.mk clean 8 | -------------------------------------------------------------------------------- /test/clean_nokia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for scratchbox Linux build environment (32-bit Ubuntu 10.10 tested) 3 | # http://wiki.maemo.org/Documentation/Maemo_5_Final_SDK_Installation 4 | # clean_n900 removes simd_test.arm_n900* (see simd_make_arm.mk). 5 | make -f simd_make_arm.mk clean_n900 6 | -------------------------------------------------------------------------------- /test/clean_raspi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for ARMv7 Linux build environment 3 | # with native g++ compiler installed (32-bit Raspbian 7 and 8 tested) 4 | # clean_rpiX removes simd_test.arm_rpi* (see simd_make_arm.mk). 5 | make -f simd_make_arm.mk clean_rpiX 6 | 
-------------------------------------------------------------------------------- /test/clean_win64.bat: -------------------------------------------------------------------------------- 1 | :: Intended for x86_64 Windows build environment 2 | :: with TDM64-GCC compiler installed (64-bit Windows 7 SP1 tested) 3 | 4 | mingw32-make -f simd_make_w64.mk clean 5 | -------------------------------------------------------------------------------- /test/simd_make_a32.mk: -------------------------------------------------------------------------------- 1 | 2 | INC_PATH = \ 3 | -I../core/config/ 4 | 5 | SRC_LIST = \ 6 | simd_test.cpp 7 | 8 | LIB_PATH = 9 | 10 | LIB_LIST = \ 11 | -lm 12 | # build/strip/clean are commands, not files; .PHONY keeps a same-named file from masking them. 13 | .PHONY: build strip clean 14 | build: simd_test_a32 15 | 16 | strip: 17 | aarch64-linux-gnu-strip simd_test.a32* 18 | # -f: an already clean tree is not an error. 19 | clean: 20 | rm -f simd_test.a32* 21 | 22 | # ILP32 (-mabi=ilp32) static test binary built with the AArch64 cross g++. 23 | simd_test_a32: 24 | aarch64-linux-gnu-g++ -O3 -g -static -mabi=ilp32 \ 25 | -DRT_LINUX -DRT_A32 -DRT_128=1 -DRT_DEBUG=0 \ 26 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 27 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a32 28 | 29 | 30 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted" 31 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run: 32 | # sudo apt-get update 33 | # (Ubuntu MATE is set up for an update without a need to edit the file) 34 | # (extended repositories "universe multiverse" are only needed for clang) 35 | # 36 | # Prerequisites for the build: 37 | # (cross-)compiler for AArch64 is installed and in the PATH variable.
38 | # sudo apt-get install make g++-aarch64-linux-gnu 39 | # (recent upstream g++-5-aarch64 series may not fully support ILP32 ABI) 40 | # 41 | # Compiling/running SIMD test: 42 | # make -f simd_make_a32.mk 43 | 44 | # Clang native build should theoretically work too (not tested), use (replace): 45 | # clang++ (in place of ...-g++) on AArch64 host (Raspberry Pi 3/4) 46 | # sudo apt-get install clang 47 | 48 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h. 49 | # The 128-bit 15-reg targets are supported for compatibility with x86/POWER. 50 | 51 | # For 128-bit NEON build use (replace): RT_128=1 (30 SIMD registers) 52 | # For 128-bit ARMv8.2 build use (replace): RT_128=2 (adds new fp16 ops) (30 rs) 53 | # For 128-bit NEON build use (replace): RT_128=4 (15 SIMD registers) 54 | # For 128-bit ARMv8.2 build use (replace): RT_128=8 (adds new fp16 ops) (15 rs) 55 | # For 256-bit NEON build use (replace): RT_256=1 (15 SIMD reg-pairs) 56 | # For 256-bit ARMv8.2 build use (replace): RT_256=2 (adds new fp16 ops) (15 rp) 57 | 58 | # For 256-bit SVEx1 build use (replace): RT_256=4 (30 SIMD registers) 59 | # For 512-bit SVEx2 build use (replace): RT_512=1 (15 SIMD reg-pairs) 60 | # For 512-bit SVEx1 build use (replace): RT_512=4 (30 SIMD registers) 61 | # For 1024-bit SVEx2 build use (replace): RT_1K4=1 (15 SIMD reg-pairs) 62 | # For 1024-bit SVEx1 build use (replace): RT_1K4=4 (30 SIMD registers) 63 | # For 2048-bit SVEx2 build use (replace): RT_2K8_R8=1 (8 SIMD reg-pairs) 64 | # For 2048-bit SVEx1 build use (replace): RT_2K8_R8=4 (15 SIMD registers) 65 | # The last two slots are artificially reg-limited for compatibility with AVX512 66 | 67 | # 32-bit ABI hasn't been fully tested yet due to lack of available libs, 68 | # check out 64/32-bit (ptr/adr) hybrid mode for 64-bit ABI in simd_make_a64.mk 69 | -------------------------------------------------------------------------------- /test/simd_make_a64.mk: 
-------------------------------------------------------------------------------- 1 | 2 | INC_PATH = \ 3 | -I../core/config/ 4 | 5 | SRC_LIST = \ 6 | simd_test.cpp 7 | 8 | LIB_PATH = 9 | 10 | LIB_LIST = \ 11 | -lm 12 | # Command targets are not files; .PHONY keeps a same-named file from masking them. 13 | .PHONY: build clang strip clean macOS macRD macST macRM 14 | build: build_a64 build_a64sve 15 | clang: clang_a64 clang_a64sve 16 | 17 | strip: 18 | aarch64-linux-gnu-strip simd_test.a64* 19 | # -f: an already clean tree is not an error. 20 | clean: 21 | rm -f simd_test.a64* 22 | # macOS: rename every simd_test.a64* output to simd_test.d64*. 23 | macOS: 24 | mv simd_test.a64_32 simd_test.d64_32 25 | mv simd_test.a64_64 simd_test.d64_64 26 | mv simd_test.a64f32 simd_test.d64f32 27 | mv simd_test.a64f64 simd_test.d64f64 28 | mv simd_test.a64_32sve simd_test.d64_32sve 29 | mv simd_test.a64_64sve simd_test.d64_64sve 30 | mv simd_test.a64f32sve simd_test.d64f32sve 31 | mv simd_test.a64f64sve simd_test.d64f64sve 32 | 33 | macRD: 34 | rm -fr simd_test.a64*.dSYM/ 35 | 36 | macST: 37 | strip simd_test.a64* 38 | # macRM: remove the renamed simd_test.d64* binaries; -f as in clean. 39 | macRM: 40 | rm -f simd_test.d64* 41 | 42 | # g++ cross builds: _32/_64 pair RT_ADDRESS=32 with RT_ELEMENT=32/64 (RT_128=1), f32/f64 pair RT_ADDRESS=64 with RT_ELEMENT=32/64 (RT_256=1). 43 | build_a64: simd_test_a64_32 simd_test_a64_64 simd_test_a64f32 simd_test_a64f64 44 | 45 | simd_test_a64_32: 46 | aarch64-linux-gnu-g++ -O3 -g -static \ 47 | -DRT_LINUX -DRT_A64 -DRT_128=1 -DRT_DEBUG=0 \ 48 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 49 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64_32 50 | 51 | simd_test_a64_64: 52 | aarch64-linux-gnu-g++ -O3 -g -static \ 53 | -DRT_LINUX -DRT_A64 -DRT_128=1 -DRT_DEBUG=0 \ 54 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 55 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64_64 56 | 57 | simd_test_a64f32: 58 | aarch64-linux-gnu-g++ -O3 -g -static \ 59 | -DRT_LINUX -DRT_A64 -DRT_256=1 -DRT_DEBUG=0 \ 60 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 61 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64f32 62 | 63 | simd_test_a64f64: 64 | aarch64-linux-gnu-g++ -O3 -g -static \ 65 | -DRT_LINUX -DRT_A64 -DRT_256=1 -DRT_DEBUG=0 \ 66 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 67 | 
${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64f64 68 | 69 | # SVE variants of the same four address/element combinations (RT_512=4 and RT_1K4=1). 70 | build_a64sve: simd_test_a64_32sve simd_test_a64_64sve \ 71 | simd_test_a64f32sve simd_test_a64f64sve 72 | 73 | simd_test_a64_32sve: 74 | aarch64-linux-gnu-g++ -O3 -g -static \ 75 | -DRT_LINUX -DRT_A64 -DRT_512=4 -DRT_DEBUG=0 \ 76 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 77 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64_32sve 78 | 79 | simd_test_a64_64sve: 80 | aarch64-linux-gnu-g++ -O3 -g -static \ 81 | -DRT_LINUX -DRT_A64 -DRT_512=4 -DRT_DEBUG=0 \ 82 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 83 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64_64sve 84 | 85 | simd_test_a64f32sve: 86 | aarch64-linux-gnu-g++ -O3 -g -static \ 87 | -DRT_LINUX -DRT_A64 -DRT_1K4=1 -DRT_DEBUG=0 \ 88 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 89 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64f32sve 90 | 91 | simd_test_a64f64sve: 92 | aarch64-linux-gnu-g++ -O3 -g -static \ 93 | -DRT_LINUX -DRT_A64 -DRT_1K4=1 -DRT_DEBUG=0 \ 94 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 95 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64f64sve 96 | 97 | # Native clang++ variants. These targets are named after their output files and have no prerequisites, so make skips any whose file already exists. 98 | clang_a64: simd_test.a64_32 simd_test.a64_64 simd_test.a64f32 simd_test.a64f64 99 | 100 | simd_test.a64_32: 101 | clang++ -O3 -g \ 102 | -DRT_LINUX -DRT_A64 -DRT_128=1 -DRT_DEBUG=0 \ 103 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 104 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64_32 105 | 106 | simd_test.a64_64: 107 | clang++ -O3 -g \ 108 | -DRT_LINUX -DRT_A64 -DRT_128=1 -DRT_DEBUG=0 \ 109 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 110 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64_64 111 | 112 | simd_test.a64f32: 113 | clang++ -O3 -g \ 114 | -DRT_LINUX -DRT_A64 -DRT_256=1 -DRT_DEBUG=0 \ 115 | -DRT_POINTER=64
-DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 116 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64f32 117 | 118 | simd_test.a64f64: 119 | clang++ -O3 -g \ 120 | -DRT_LINUX -DRT_A64 -DRT_256=1 -DRT_DEBUG=0 \ 121 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 122 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64f64 123 | 124 | # clang++ SVE variants: same -D flags as the g++ SVE targets, built without -static. 125 | clang_a64sve: simd_test.a64_32sve simd_test.a64_64sve \ 126 | simd_test.a64f32sve simd_test.a64f64sve 127 | 128 | simd_test.a64_32sve: 129 | clang++ -O3 -g \ 130 | -DRT_LINUX -DRT_A64 -DRT_512=4 -DRT_DEBUG=0 \ 131 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 132 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64_32sve 133 | 134 | simd_test.a64_64sve: 135 | clang++ -O3 -g \ 136 | -DRT_LINUX -DRT_A64 -DRT_512=4 -DRT_DEBUG=0 \ 137 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 138 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64_64sve 139 | 140 | simd_test.a64f32sve: 141 | clang++ -O3 -g \ 142 | -DRT_LINUX -DRT_A64 -DRT_1K4=1 -DRT_DEBUG=0 \ 143 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 144 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64f32sve 145 | 146 | simd_test.a64f64sve: 147 | clang++ -O3 -g \ 148 | -DRT_LINUX -DRT_A64 -DRT_1K4=1 -DRT_DEBUG=0 \ 149 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 150 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64f64sve 151 | 152 | 153 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted" 154 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run: 155 | # sudo apt-get update 156 | # (Ubuntu MATE is set up for an update without a need to edit the file) 157 | # (extended repositories "universe multiverse" are only needed for clang) 158 | # 159 | # Prerequisites for the build: 160 | # (cross-)compiler for AArch64 is installed and in the PATH variable.
161 | # sudo apt-get install make g++-aarch64-linux-gnu 162 | # 163 | # Prerequisites for emulation: 164 | # recent QEMU(-2.5) is installed or built from source and in the PATH variable. 165 | # SVE targets require QEMU 3.x.y (or 3.0.0 with sve-max-vq cpu property patch). 166 | # recent QEMU 4.x.y work well with SVE, but only 4.2.0 is good for all targets. 167 | # sudo apt-get install qemu-user 168 | # 169 | # Compiling/running SIMD test: 170 | # make -f simd_make_a64.mk 171 | # qemu-aarch64 -cpu cortex-a57 simd_test.a64_32 -c 1 172 | # qemu-aarch64 -cpu cortex-a57 simd_test.a64_64 -c 1 173 | # qemu-aarch64 -cpu cortex-a57 simd_test.a64f32 -c 1 174 | # qemu-aarch64 -cpu cortex-a57 simd_test.a64f64 -c 1 175 | # qemu-aarch64 -cpu max,sve-max-vq=1 simd_test.a64_32sve -c 1 (RT_128=2/*_RX=2) 176 | # qemu-aarch64 -cpu max,sve-max-vq=1 simd_test.a64_64sve -c 1 (RT_128=2/*_RX=2) 177 | # qemu-aarch64 -cpu max,sve-max-vq=1 simd_test.a64f32sve -c 1 (RT_256=2/*_RX=2) 178 | # qemu-aarch64 -cpu max,sve-max-vq=1 simd_test.a64f64sve -c 1 (RT_256=2/*_RX=2) 179 | # qemu-aarch64 -cpu max,sve-max-vq=2 simd_test.a64_32sve -c 1 (for RT_256=4) 180 | # qemu-aarch64 -cpu max,sve-max-vq=2 simd_test.a64_64sve -c 1 (for RT_256=4) 181 | # qemu-aarch64 -cpu max,sve-max-vq=2 simd_test.a64f32sve -c 1 (for RT_512=1) 182 | # qemu-aarch64 -cpu max,sve-max-vq=2 simd_test.a64f64sve -c 1 (for RT_512=1) 183 | # qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64_32sve -c 1 (for RT_512=4) 184 | # qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64_64sve -c 1 (for RT_512=4) 185 | # qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64f32sve -c 1 (for RT_1K4=1) 186 | # qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64f64sve -c 1 (for RT_1K4=1) 187 | # qemu-aarch64 -cpu max,sve-max-vq=8 simd_test.a64_32sve -c 1 (for RT_1K4=4) 188 | # qemu-aarch64 -cpu max,sve-max-vq=8 simd_test.a64_64sve -c 1 (for RT_1K4=4) 189 | # qemu-aarch64 -cpu max,sve-max-vq=8 simd_test.a64f32sve -c 1 (for RT_2K8_R8=1) 190 | # qemu-aarch64 -cpu 
max,sve-max-vq=8 simd_test.a64f64sve -c 1 (for RT_2K8_R8=1) 191 | # qemu-aarch64 -cpu max,sve-max-vq=16 simd_test.a64_32sve -c 1 (for RT_2K8_R8=4) 192 | # qemu-aarch64 -cpu max,sve-max-vq=16 simd_test.a64_64sve -c 1 (for RT_2K8_R8=4) 193 | # Use "-c 1" option to reduce test time when emulating with QEMU 194 | 195 | # Clang native build works too (takes much longer prior to 3.8), use (replace): 196 | # clang++ (in place of ...-g++) on AArch64 host (Raspberry Pi 3/4) 197 | # sudo apt-get install clang 198 | 199 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h. 200 | # The 128-bit 15-reg targets are supported for compatibility with x86/POWER. 201 | 202 | # For 128-bit NEON build use (replace): RT_128=1 (30 SIMD registers) 203 | # For 128-bit ARMv8.2 build use (replace): RT_128=2 (adds new fp16 ops) (30 rs) 204 | # For 128-bit SVE2x1 build use (replace): RT_128=2 RT_128_RX=2 (X regs) (30 rs) 205 | # For 128-bit NEON build use (replace): RT_128=4 (15 SIMD registers) 206 | # For 128-bit ARMv8.2 build use (replace): RT_128=8 (adds new fp16 ops) (15 rs) 207 | # For 256-bit NEON build use (replace): RT_256=1 (15 SIMD reg-pairs) 208 | # For 256-bit ARMv8.2 build use (replace): RT_256=2 (adds new fp16 ops) (15 rp) 209 | # For 256-bit SVE2x2 build use (replace): RT_256=2 RT_256_RX=2 (X regs) (15 rp) 210 | 211 | # For 256-bit SVEx1 build use (replace): RT_256=4 (30 SIMD registers) 212 | # For 512-bit SVEx2 build use (replace): RT_512=1 (15 SIMD reg-pairs) 213 | # For 512-bit SVEx1 build use (replace): RT_512=4 (30 SIMD registers) 214 | # For 1024-bit SVEx2 build use (replace): RT_1K4=1 (15 SIMD reg-pairs) 215 | # For 1024-bit SVEx1 build use (replace): RT_1K4=4 (30 SIMD registers) 216 | # For 2048-bit SVEx2 build use (replace): RT_2K8_R8=1 (8 SIMD reg-pairs) 217 | # For 2048-bit SVEx1 build use (replace): RT_2K8_R8=4 (15 SIMD registers) 218 | # The last two slots are artificially reg-limited for compatibility with AVX512 219 | 220 | # 64/32-bit 
(ptr/adr) hybrid mode is compatible with native 64-bit ABI, 221 | # use (replace): RT_ADDRESS=32, rename the binary to simd_test.a64_** 222 | # 64-bit packed SIMD mode (fp64/int64) is supported on 64-bit targets, 223 | # use (replace): RT_ELEMENT=64, rename the binary to simd_test.a64*64 224 | -------------------------------------------------------------------------------- /test/simd_make_arm.mk: -------------------------------------------------------------------------------- 1 | 2 | INC_PATH = \ 3 | -I../core/config/ 4 | 5 | SRC_LIST = \ 6 | simd_test.cpp 7 | 8 | LIB_PATH = 9 | 10 | LIB_LIST = \ 11 | -lm 12 | # Command targets are not files; .PHONY keeps a same-named file from masking them. 13 | .PHONY: build strip clean build_n900 strip_n900 clean_n900 build_rpiX strip_rpiX clean_rpiX 14 | build: simd_test_arm_v1 simd_test_arm_v2 15 | 16 | strip: 17 | arm-linux-gnueabi-strip simd_test.arm_v* 18 | # -f (here and in clean_n900/clean_rpiX): an already clean tree is not an error. 19 | clean: 20 | rm -f simd_test.arm_v* 21 | 22 | 23 | simd_test_arm_v1: 24 | arm-linux-gnueabi-g++ -O3 -g -static -march=armv7-a -marm \ 25 | -DRT_LINUX -DRT_ARM -DRT_128=1 -DRT_DEBUG=0 \ 26 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 27 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.arm_v1 28 | 29 | simd_test_arm_v2: 30 | arm-linux-gnueabi-g++ -O3 -g -static -march=armv7-a -marm \ 31 | -DRT_LINUX -DRT_ARM -DRT_128=2 -DRT_DEBUG=0 \ 32 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 33 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.arm_v2 34 | 35 | # Nokia N900 target group (used by build_nokia.sh / clean_nokia.sh). 36 | build_n900: simd_test_arm_n900 37 | 38 | strip_n900: 39 | arm-linux-gnueabi-strip simd_test.arm_n900* 40 | 41 | clean_n900: 42 | rm -f simd_test.arm_n900* 43 | 44 | 45 | simd_test_arm_n900: 46 | arm-linux-gnueabi-g++ -O3 -g -static -march=armv7-a -marm \ 47 | -DRT_LINUX -DRT_ARM -DRT_128=1 -DRT_DEBUG=0 \ 48 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 49 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.arm_n900 50 | 51 | # Raspberry Pi target group (used by build_raspi.sh / clean_raspi.sh); gnueabihf toolchain. 52 | build_rpiX: simd_test_arm_rpi2 simd_test_arm_rpi3 53 | 54 | strip_rpiX: 55 | arm-linux-gnueabihf-strip simd_test.arm_rpi* 56 | 57 | clean_rpiX: 58 | rm -f simd_test.arm_rpi*
59 | 60 | # Raspberry Pi test binaries: same compiler flags as the arm_v* targets but built with the gnueabihf toolchain; rpi2 uses RT_128=2, rpi3 uses RT_128=4. 61 | simd_test_arm_rpi2: 62 | arm-linux-gnueabihf-g++ -O3 -g -static -march=armv7-a -marm \ 63 | -DRT_LINUX -DRT_ARM -DRT_128=2 -DRT_DEBUG=0 \ 64 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 65 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.arm_rpi2 66 | 67 | simd_test_arm_rpi3: 68 | arm-linux-gnueabihf-g++ -O3 -g -static -march=armv7-a -marm \ 69 | -DRT_LINUX -DRT_ARM -DRT_128=4 -DRT_DEBUG=0 \ 70 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 71 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.arm_rpi3 72 | 73 | 74 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted" 75 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run: 76 | # sudo apt-get update 77 | # (Ubuntu MATE is set up for an update without a need to edit the file) 78 | # (extended repositories "universe multiverse" are only needed for clang) 79 | # 80 | # Prerequisites for the build: 81 | # (cross-)compiler for ARMv7 is installed and in the PATH variable. 82 | # sudo apt-get install make g++-arm-linux-gnueabi 83 | # 84 | # Prerequisites for emulation: 85 | # recent QEMU(-2.5) is installed or built from source and in the PATH variable. 86 | # sudo apt-get install qemu-user 87 | # 88 | # Compiling/running SIMD test: 89 | # make -f simd_make_arm.mk 90 | # qemu-arm -cpu cortex-a8 simd_test.arm_v1 -c 1 91 | # qemu-arm -cpu cortex-a15 simd_test.arm_v2 -c 1 92 | # Use "-c 1" option to reduce test time when emulating with QEMU 93 | 94 | # Clang native build works too (takes much longer prior to 3.8), use (replace): 95 | # clang++ (in place of ...-g++) on ARMv7 host (Raspberry Pi 2) 96 | # sudo apt-get install clang 97 | 98 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h. 99 | # Original legacy 32-bit ARMv7/x86 targets only support 8 SIMD registers. 
100 | 101 | # 1) Nokia N900, Maemo 5 scratchbox: "vanilla" (-DRT_128=1) (8 SIMD registers) 102 | # 2) Raspberry Pi 2, Raspbian: arm-linux-gnueabihf-g++ -DRT_128=2 (8 SIMD regs) 103 | # 3) Raspberry Pi 3, Raspbian: arm-linux-gnueabihf-g++ -DRT_128=4 (8 SIMD regs) 104 | -------------------------------------------------------------------------------- /test/simd_make_m32.mk: -------------------------------------------------------------------------------- 1 | 2 | INC_PATH = \ 3 | -I../core/config/ 4 | 5 | SRC_LIST = \ 6 | simd_test.cpp 7 | 8 | LIB_PATH = 9 | 10 | LIB_LIST = \ 11 | -lm 12 | # build/strip/clean are commands, not files; .PHONY keeps a same-named file from masking them. 13 | .PHONY: build strip clean 14 | build: simd_test_m32Lr5 simd_test_m32Br5 15 | 16 | strip: 17 | mips-mti-linux-gnu-strip simd_test.m32?r5* 18 | # -f: an already clean tree is not an error. 19 | clean: 20 | rm -f simd_test.m32* 21 | 22 | # Little-endian (-EL, RT_ENDIAN=0) and big-endian (-EB, RT_ENDIAN=1) MIPS32r5+MSA builds. 23 | simd_test_m32Lr5: 24 | mips-mti-linux-gnu-g++ -O3 -g -static -EL -mips32r5 -mmsa -mnan=2008 \ 25 | -DRT_LINUX -DRT_M32 -DRT_128=1 -DRT_DEBUG=0 \ 26 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 27 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.m32Lr5 28 | 29 | simd_test_m32Br5: 30 | mips-mti-linux-gnu-g++ -O3 -g -static -EB -mips32r5 -mmsa -mnan=2008 \ 31 | -DRT_LINUX -DRT_M32 -DRT_128=1 -DRT_DEBUG=0 \ 32 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \ 33 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.m32Br5 34 | 35 | 36 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted" 37 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run: 38 | # sudo apt-get update 39 | # (Ubuntu MATE is set up for an update without a need to edit the file) 40 | # (extended repositories "universe multiverse" are only needed for clang) 41 | # 42 | # Download and unpack MIPS toolchain: 43 | # https://codescape.mips.com/components/toolchain/2020.06-01/downloads.html 44 | # 45 | # Prerequisites for the build: 46 | # (cross-)compiler for MIPSr5+MSA is installed and in the PATH variable.
47 | # Codescape.GNU.Tools.Package.2020.06-01.for.MIPS.MTI.Linux.CentOS-6.x86_64 48 | # is unpacked and folder mips-mti-linux-gnu/2020.06-01/bin is added to PATH: 49 | # PATH=/home/ubuntu/Downloads/mips-mti-linux-gnu/2020.06-01/bin:$PATH 50 | # PATH=/home/ubuntu-mate/Downloads/mips-mti-linux-gnu/2020.06-01/bin:$PATH 51 | # 52 | # Prerequisites for emulation: 53 | # recent QEMU(-2.5) is installed or built from source and in the PATH variable. 54 | # standalone toolchain from 2020.06-01 comes with QEMU 4.1.0 for MIPS in PATH. 55 | # sudo apt-get install qemu-user make 56 | # 57 | # Compiling/running SIMD test: 58 | # make -f simd_make_m32.mk 59 | # qemu-mipsel -cpu P5600 simd_test.m32Lr5 -c 1 60 | # qemu-mips -cpu P5600 simd_test.m32Br5 -c 1 61 | # Use "-c 1" option to reduce test time when emulating with QEMU 62 | 63 | # Clang native build should theoretically work too (not tested), use (replace): 64 | # clang++ -O0 (in place of ...-g++ -O3) on MIPS32r5 host (P5600) 65 | # sudo apt-get install clang 66 | 67 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h. 68 | # The 128-bit 15-reg targets are supported for compatibility with x86/POWER. 
69 | 70 | # For 128-bit SIMD build use (replace): RT_128=1 (30 SIMD registers) 71 | # For 128-bit SIMD build use (replace): RT_128=4 (15 SIMD registers) 72 | # For 256-bit SIMD build use (replace): RT_256=1 (15 SIMD reg-pairs) 73 | -------------------------------------------------------------------------------- /test/simd_make_m64.mk: -------------------------------------------------------------------------------- 1 | 2 | INC_PATH = \ 3 | -I../core/config/ 4 | 5 | SRC_LIST = \ 6 | simd_test.cpp 7 | 8 | LIB_PATH = 9 | 10 | LIB_LIST = \ 11 | -lm 12 | # Command targets are not files; .PHONY keeps a same-named file from masking them. 13 | .PHONY: build strip clean build_le build_be 14 | build: build_le build_be 15 | 16 | strip: 17 | mips-mti-linux-gnu-strip simd_test.m64???Lr6 18 | mips-mti-linux-gnu-strip simd_test.m64???Br6 19 | # -f: an already clean tree is not an error. 20 | clean: 21 | rm -f simd_test.m64* 22 | 23 | # Little-endian (-EL, RT_ENDIAN=0) MIPS64r6+MSA builds. 24 | build_le: simd_test_m64_32Lr6 simd_test_m64_64Lr6 \ 25 | simd_test_m64f32Lr6 simd_test_m64f64Lr6 26 | 27 | simd_test_m64_32Lr6: 28 | mips-mti-linux-gnu-g++ -O3 -g -static -EL -mips64r6 -mmsa -mabi=64 \ 29 | -DRT_LINUX -DRT_M64=6 -DRT_128=1 -DRT_DEBUG=0 \ 30 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 31 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.m64_32Lr6 32 | 33 | simd_test_m64_64Lr6: 34 | mips-mti-linux-gnu-g++ -O3 -g -static -EL -mips64r6 -mmsa -mabi=64 \ 35 | -DRT_LINUX -DRT_M64=6 -DRT_128=1 -DRT_DEBUG=0 \ 36 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 37 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.m64_64Lr6 38 | 39 | simd_test_m64f32Lr6: 40 | mips-mti-linux-gnu-g++ -O3 -g -static -EL -mips64r6 -mmsa -mabi=64 \ 41 | -DRT_LINUX -DRT_M64=6 -DRT_256=1 -DRT_DEBUG=0 \ 42 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 43 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.m64f32Lr6 44 | 45 | simd_test_m64f64Lr6: 46 | mips-mti-linux-gnu-g++ -O3 -g -static -EL -mips64r6 -mmsa -mabi=64 \ 47 | -DRT_LINUX -DRT_M64=6 -DRT_256=1 -DRT_DEBUG=0 \ 48 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 49 | 
${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.m64f64Lr6 50 | 51 | # Big-endian (-EB, RT_ENDIAN=1) MIPS64r6+MSA builds. 52 | build_be: simd_test_m64_32Br6 simd_test_m64_64Br6 \ 53 | simd_test_m64f32Br6 simd_test_m64f64Br6 54 | 55 | simd_test_m64_32Br6: 56 | mips-mti-linux-gnu-g++ -O3 -g -static -EB -mips64r6 -mmsa -mabi=64 \ 57 | -DRT_LINUX -DRT_M64=6 -DRT_128=1 -DRT_DEBUG=0 \ 58 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \ 59 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.m64_32Br6 60 | 61 | simd_test_m64_64Br6: 62 | mips-mti-linux-gnu-g++ -O3 -g -static -EB -mips64r6 -mmsa -mabi=64 \ 63 | -DRT_LINUX -DRT_M64=6 -DRT_128=1 -DRT_DEBUG=0 \ 64 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=1 \ 65 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.m64_64Br6 66 | 67 | simd_test_m64f32Br6: 68 | mips-mti-linux-gnu-g++ -O3 -g -static -EB -mips64r6 -mmsa -mabi=64 \ 69 | -DRT_LINUX -DRT_M64=6 -DRT_256=1 -DRT_DEBUG=0 \ 70 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \ 71 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.m64f32Br6 72 | 73 | simd_test_m64f64Br6: 74 | mips-mti-linux-gnu-g++ -O3 -g -static -EB -mips64r6 -mmsa -mabi=64 \ 75 | -DRT_LINUX -DRT_M64=6 -DRT_256=1 -DRT_DEBUG=0 \ 76 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=1 \ 77 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.m64f64Br6 78 | 79 | 80 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted" 81 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run: 82 | # sudo apt-get update 83 | # (Ubuntu MATE is set up for an update without a need to edit the file) 84 | # (extended repositories "universe multiverse" are only needed for clang) 85 | # 86 | # Download and unpack MIPS toolchain: 87 | # https://codescape.mips.com/components/toolchain/2020.06-01/downloads.html 88 | # 89 | # Prerequisites for the build: 90 | # (cross-)compiler for MIPSr6+MSA is installed and in the PATH variable.
91 | # Codescape.GNU.Tools.Package.2020.06-01.for.MIPS.MTI.Linux.CentOS-6.x86_64 92 | # is unpacked and folder mips-mti-linux-gnu/2020.06-01/bin is added to PATH: 93 | # PATH=/home/ubuntu/Downloads/mips-mti-linux-gnu/2020.06-01/bin:$PATH 94 | # PATH=/home/ubuntu-mate/Downloads/mips-mti-linux-gnu/2020.06-01/bin:$PATH 95 | # 96 | # Starting from Ubuntu (MATE) 19.10 upstream (cross-)compiler supports MSA. 97 | # sudo apt-get install make g++-mipsisa64r6el-linux-gnuabi64 98 | # sudo apt-get install make g++-mipsisa64r6-linux-gnuabi64 99 | # (replace mips-mti-linux-gnu with mipsisa64r6el-linux-gnuabi64 for LE) 100 | # (replace mips-mti-linux-gnu with mipsisa64r6-linux-gnuabi64 for BE) 101 | # 102 | # Prerequisites for emulation: 103 | # recent QEMU(-2.7) is installed or built from source and in the PATH variable. 104 | # standalone toolchain from 2020.06-01 comes with QEMU 4.1.0 for MIPS in PATH. 105 | # sudo apt-get install qemu-user make 106 | # 107 | # Compiling/running SIMD test: 108 | # make -f simd_make_m64.mk 109 | # qemu-mips64el -cpu I6400 simd_test.m64_32Lr6 -c 1 110 | # qemu-mips64el -cpu I6400 simd_test.m64_64Lr6 -c 1 111 | # qemu-mips64el -cpu I6400 simd_test.m64f32Lr6 -c 1 112 | # qemu-mips64el -cpu I6400 simd_test.m64f64Lr6 -c 1 113 | # qemu-mips64 -cpu I6400 simd_test.m64_32Br6 -c 1 114 | # qemu-mips64 -cpu I6400 simd_test.m64_64Br6 -c 1 115 | # qemu-mips64 -cpu I6400 simd_test.m64f32Br6 -c 1 116 | # qemu-mips64 -cpu I6400 simd_test.m64f64Br6 -c 1 117 | # Use "-c 1" option to reduce test time when emulating with QEMU 118 | 119 | # Clang native build should theoretically work too (not tested), use (replace): 120 | # clang++ -O0 (in place of ...-g++ -O3) on MIPS64r6 host (I6400/P6600) 121 | # sudo apt-get install clang 122 | 123 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h. 124 | # The 128-bit 15-reg targets are supported for compatibility with x86/POWER. 
125 | 126 | # For 128-bit SIMD build use (replace): RT_128=1 (30 SIMD registers) 127 | # For 128-bit SIMD build use (replace): RT_128=4 (15 SIMD registers) 128 | # For 256-bit SIMD build use (replace): RT_256=1 (15 SIMD reg-pairs) 129 | 130 | # 64/32-bit (ptr/adr) hybrid mode is compatible with native 64-bit ABI, 131 | # use (replace): RT_ADDRESS=32, rename the binary to simd_test.m64_** 132 | # 64-bit packed SIMD mode (fp64/int64) is supported on 64-bit targets, 133 | # use (replace): RT_ELEMENT=64, rename the binary to simd_test.m64*64 134 | -------------------------------------------------------------------------------- /test/simd_make_p32.mk: -------------------------------------------------------------------------------- 1 | 2 | INC_PATH = \ 3 | -I../core/config/ 4 | 5 | SRC_LIST = \ 6 | simd_test.cpp 7 | 8 | LIB_PATH = 9 | 10 | LIB_LIST = \ 11 | -lm 12 | # Targets are commands, not files: stop stray same-named files from masking them. 13 | .PHONY: build strip clean 14 | build: simd_test_p32Bg4 simd_test_p32Bp7 simd_test_p32Bp8 simd_test_p32Bp9 15 | 16 | strip: 17 | powerpc-linux-gnu-strip simd_test.p32* 18 | 19 | clean: 20 | rm -f simd_test.p32* 21 | 22 | 23 | simd_test_p32Bg4: 24 | powerpc-linux-gnu-g++ -O3 -g -static -DRT_SIMD_COMPAT_VSX=0 \ 25 | -DRT_LINUX -DRT_P32 -DRT_128=4 -DRT_DEBUG=0 \ 26 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \ 27 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p32Bg4 28 | 29 | simd_test_p32Bp7: 30 | powerpc-linux-gnu-g++ -O3 -g -static \ 31 | -DRT_LINUX -DRT_P32 -DRT_128=1 -DRT_DEBUG=0 \ 32 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \ 33 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p32Bp7 34 | 35 | simd_test_p32Bp8: 36 | powerpc-linux-gnu-g++ -O3 -g -static -DRT_SIMD_COMPAT_PW8=1 \ 37 | -DRT_LINUX -DRT_P32 -DRT_128=1 -DRT_DEBUG=0 \ 38 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \ 39 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p32Bp8 40 | 41 | simd_test_p32Bp9: 42 | powerpc-linux-gnu-g++ -O3 -g -static \ 43 | -DRT_LINUX
-DRT_P32 -DRT_128=2 -DRT_DEBUG=0 \ 44 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \ 45 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p32Bp9 46 | 47 | 48 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted" 49 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run: 50 | # sudo apt-get update 51 | # (Ubuntu MATE is set up for an update without a need to edit the file) 52 | # (extended repositories "universe multiverse" are only needed for clang) 53 | # 54 | # Prerequisites for the build: 55 | # (cross-)compiler for PowerPC is installed and in the PATH variable. 56 | # sudo apt-get install make g++-powerpc-linux-gnu 57 | # 58 | # Prerequisites for emulation: 59 | # recent QEMU(-2.5) is installed or built from source and in the PATH variable. 60 | # POWER9 target requires more recent QEMU, tested with 3.x.y series and 4.2.0. 61 | # QEMU versions 4.x.y prior to 4.2.0 show issues with POWER8/9 fp32 LE targets. 62 | # sudo apt-get install qemu-user 63 | # 64 | # Compiling/running SIMD test: 65 | # make -f simd_make_p32.mk 66 | # qemu-ppc -cpu G4 simd_test.p32Bg4 -c 1 67 | # qemu-ppc64abi32 -cpu POWER7 simd_test.p32Bp7 -c 1 68 | # qemu-ppc64abi32 -cpu POWER8 simd_test.p32Bp8 -c 1 69 | # qemu-ppc64abi32 -cpu POWER9 simd_test.p32Bp9 -c 1 70 | # Use "-c 1" option to reduce test time when emulating with QEMU 71 | 72 | # Clang native build should theoretically work too (not tested), use (replace): 73 | # clang++ -O0 (in place of ...-g++ -O3) on PowerPC host (G4) 74 | # sudo apt-get install clang 75 | 76 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h. 77 | # The RT_SIMD_COMPAT_PW8=1 flag below is redundant when building in LE mode.
78 | 79 | # For 128-bit VSX1 build use (replace): RT_128=1 (30 SIMD registers) 80 | # For 128-bit VSX2 build use (replace): RT_128=1 RT_SIMD_COMPAT_PW8=1 (30 regs) 81 | # For 128-bit VSX3 build use (replace): RT_128=2 (30 SIMD registers) 82 | # For 128-bit VMX build use (replace): RT_128=4 RT_SIMD_COMPAT_VSX=0 (15 regs) 83 | 84 | # For 256-bit VMX build use (replace): RT_256_R8=4 RT_SIMD_COMPAT_VSX=0 (8 rp) 85 | # For 256-bit VSX1 build use (replace): RT_256=1 (15 SIMD reg-pairs) 86 | # For 256-bit VSX2 build use (replace): RT_256=1 RT_SIMD_COMPAT_PW8=1 (15 rp) 87 | # For 256-bit VSX3 build use (replace): RT_256=2 (15 SIMD reg-pairs) 88 | # For 256-bit VSX1 build use (replace): RT_256=4 (<=test29) (30 SIMD reg-pairs) 89 | # For 256-bit VSX2 build use (replace): RT_256=4 RT_SIMD_COMPAT_PW8=1 (30 rp) 90 | # For 256-bit VSX3 build use (replace): RT_256=8 (<=test29) (30 SIMD reg-pairs) 91 | 92 | # For 512-bit VSX1 build use (replace): RT_512=1 (<=test29) (15 SIMD reg-quads) 93 | # For 512-bit VSX2 build use (replace): RT_512=1 RT_SIMD_COMPAT_PW8=1 (15 rq) 94 | # For 512-bit VSX3 build use (replace): RT_512=2 (<=test29) (15 SIMD reg-quads) 95 | -------------------------------------------------------------------------------- /test/simd_make_p64.mk: -------------------------------------------------------------------------------- 1 | 2 | INC_PATH = \ 3 | -I../core/config/ 4 | 5 | SRC_LIST = \ 6 | simd_test.cpp 7 | 8 | LIB_PATH = 9 | 10 | LIB_LIST = \ 11 | -lm 12 | # Targets are commands, not files: stop stray same-named files from masking them. 13 | .PHONY: build strip clean build_p9 build_pX build_le build_be 14 | build: build_p9 build_le build_be 15 | 16 | strip: 17 | powerpc64le-linux-gnu-strip simd_test.p64???L* 18 | powerpc64-linux-gnu-strip simd_test.p64???B* 19 | 20 | clean: 21 | rm -f simd_test.p64* 22 | 23 | 24 | # using -mcpu=power8 for power9 targets is a workaround for QEMU 6.2.0 bug 25 | # https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/2011832 26 | 27 | build_p9: simd_test_p64_32Lp9 simd_test_p64_64Lp9 \ 28 | simd_test_p64f32Lp9 simd_test_p64f64Lp9 29 | 30 | simd_test_p64_32Lp9: 31 |
powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \ 32 | -DRT_LINUX -DRT_P64 -DRT_128=2 -DRT_DEBUG=0 \ 33 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 34 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64_32Lp9 35 | 36 | simd_test_p64_64Lp9: 37 | powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \ 38 | -DRT_LINUX -DRT_P64 -DRT_128=2 -DRT_DEBUG=0 \ 39 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 40 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64_64Lp9 41 | 42 | simd_test_p64f32Lp9: 43 | powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \ 44 | -DRT_LINUX -DRT_P64 -DRT_256=2 -DRT_DEBUG=0 \ 45 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 46 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64f32Lp9 47 | 48 | simd_test_p64f64Lp9: 49 | powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \ 50 | -DRT_LINUX -DRT_P64 -DRT_256=2 -DRT_DEBUG=0 \ 51 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 52 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64f64Lp9 53 | 54 | 55 | build_pX: simd_test_p64_32LpX simd_test_p64_64LpX \ 56 | simd_test_p64f32LpX simd_test_p64f64LpX 57 | 58 | simd_test_p64_32LpX: 59 | powerpc64le-linux-gnu-g++ -O0 -g -static \ 60 | -DRT_LINUX -DRT_P64 -DRT_256=8 -DRT_DEBUG=0 \ 61 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 62 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64_32LpX 63 | 64 | simd_test_p64_64LpX: 65 | powerpc64le-linux-gnu-g++ -O0 -g -static \ 66 | -DRT_LINUX -DRT_P64 -DRT_256=8 -DRT_DEBUG=0 \ 67 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 68 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64_64LpX 69 | 70 | simd_test_p64f32LpX: 71 | powerpc64le-linux-gnu-g++ -O0 -g -static \ 72 | -DRT_LINUX -DRT_P64 -DRT_512=2 -DRT_DEBUG=0 \ 73 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 74 | ${INC_PATH}
${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64f32LpX 75 | 76 | simd_test_p64f64LpX: 77 | powerpc64le-linux-gnu-g++ -O0 -g -static \ 78 | -DRT_LINUX -DRT_P64 -DRT_512=2 -DRT_DEBUG=0 \ 79 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 80 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64f64LpX 81 | 82 | 83 | build_le: simd_test_p64_32Lp8 simd_test_p64_64Lp8 \ 84 | simd_test_p64f32Lp8 simd_test_p64f64Lp8 85 | 86 | simd_test_p64_32Lp8: 87 | powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \ 88 | -DRT_LINUX -DRT_P64 -DRT_128=1 -DRT_DEBUG=0 \ 89 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 90 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64_32Lp8 91 | 92 | simd_test_p64_64Lp8: 93 | powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \ 94 | -DRT_LINUX -DRT_P64 -DRT_128=1 -DRT_DEBUG=0 \ 95 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 96 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64_64Lp8 97 | 98 | simd_test_p64f32Lp8: 99 | powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \ 100 | -DRT_LINUX -DRT_P64 -DRT_256=1 -DRT_DEBUG=0 \ 101 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 102 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64f32Lp8 103 | 104 | simd_test_p64f64Lp8: 105 | powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \ 106 | -DRT_LINUX -DRT_P64 -DRT_256=1 -DRT_DEBUG=0 \ 107 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 108 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64f64Lp8 109 | 110 | 111 | build_be: simd_test_p64_32Bp7 simd_test_p64_64Bp7 \ 112 | simd_test_p64f32Bp7 simd_test_p64f64Bp7 113 | 114 | simd_test_p64_32Bp7: 115 | powerpc64-linux-gnu-g++ -O2 -g -static \ 116 | -DRT_LINUX -DRT_P64 -DRT_128=1 -DRT_DEBUG=0 \ 117 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \ 118 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o
simd_test.p64_32Bp7 119 | 120 | simd_test_p64_64Bp7: 121 | powerpc64-linux-gnu-g++ -O2 -g -static \ 122 | -DRT_LINUX -DRT_P64 -DRT_128=1 -DRT_DEBUG=0 \ 123 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=1 \ 124 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64_64Bp7 125 | 126 | simd_test_p64f32Bp7: 127 | powerpc64-linux-gnu-g++ -O2 -g -static \ 128 | -DRT_LINUX -DRT_P64 -DRT_256=1 -DRT_DEBUG=0 \ 129 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \ 130 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64f32Bp7 131 | 132 | simd_test_p64f64Bp7: 133 | powerpc64-linux-gnu-g++ -O2 -g -static \ 134 | -DRT_LINUX -DRT_P64 -DRT_256=1 -DRT_DEBUG=0 \ 135 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=1 \ 136 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.p64f64Bp7 137 | 138 | 139 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted" 140 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run: 141 | # sudo apt-get update 142 | # (Ubuntu MATE is set up for an update without a need to edit the file) 143 | # (extended repositories "universe multiverse" are only needed for clang) 144 | # 145 | # Prerequisites for the build: 146 | # (cross-)compiler for 64-bit POWER is installed and in the PATH variable. 147 | # sudo apt-get install make g++-powerpc64le-linux-gnu 148 | # sudo apt-get install make g++-powerpc64-linux-gnu 149 | # (recent g++-5-powerpc64le series target POWER8 and don't work well with -O3) 150 | # 151 | # Prerequisites for emulation: 152 | # recent QEMU(-2.5) is installed or built from source and in the PATH variable. 153 | # POWER9 target requires more recent QEMU, tested with 3.x.y series and 4.2.0. 154 | # QEMU versions 4.x.y prior to 4.2.0 show issues with POWER8/9 fp32 LE targets.
155 | # sudo apt-get install qemu-user 156 | # 157 | # Compiling/running SIMD test: 158 | # make -f simd_make_p64.mk 159 | # qemu-ppc64le -cpu POWER9 simd_test.p64_32Lp9 -c 1 160 | # qemu-ppc64le -cpu POWER9 simd_test.p64_64Lp9 -c 1 161 | # qemu-ppc64le -cpu POWER9 simd_test.p64f32Lp9 -c 1 162 | # qemu-ppc64le -cpu POWER9 simd_test.p64f64Lp9 -c 1 163 | # qemu-ppc64le -cpu POWER9 simd_test.p64_32LpX -c 1 164 | # qemu-ppc64le -cpu POWER9 simd_test.p64_64LpX -c 1 165 | # qemu-ppc64le -cpu POWER9 simd_test.p64f32LpX -c 1 166 | # qemu-ppc64le -cpu POWER9 simd_test.p64f64LpX -c 1 167 | # qemu-ppc64le -cpu POWER8 simd_test.p64_32Lp8 -c 1 (use POWER9 on Ubuntu 22.04) 168 | # qemu-ppc64le -cpu POWER8 simd_test.p64_64Lp8 -c 1 (use POWER9 on Ubuntu 22.04) 169 | # qemu-ppc64le -cpu POWER8 simd_test.p64f32Lp8 -c 1 (use POWER9 on Ubuntu 22.04) 170 | # qemu-ppc64le -cpu POWER8 simd_test.p64f64Lp8 -c 1 (use POWER9 on Ubuntu 22.04) 171 | # qemu-ppc64 -cpu POWER7 simd_test.p64_32Bp7 -c 1 172 | # qemu-ppc64 -cpu POWER7 simd_test.p64_64Bp7 -c 1 173 | # qemu-ppc64 -cpu POWER7 simd_test.p64f32Bp7 -c 1 174 | # qemu-ppc64 -cpu POWER7 simd_test.p64f64Bp7 -c 1 175 | # Use "-c 1" option to reduce test time when emulating with QEMU 176 | 177 | # Clang native build works too (takes much longer prior to 3.8), use (replace): 178 | # clang++ -O0 (in place of ...-g++ -O2) on 64-bit POWER host (Tyan TN71-BP012) 179 | # sudo apt-get install clang 180 | 181 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h. 182 | # The RT_SIMD_COMPAT_PW8=1 flag below is redundant when building in LE mode. 
183 | 184 | # For 128-bit VSX1 build use (replace): RT_128=1 (30 SIMD registers) 185 | # For 128-bit VSX2 build use (replace): RT_128=1 RT_SIMD_COMPAT_PW8=1 (30 regs) 186 | # For 128-bit VSX3 build use (replace): RT_128=2 (30 SIMD registers) 187 | # For 128-bit VMX build use (replace): RT_128=4 RT_SIMD_COMPAT_VSX=0 (15 regs) 188 | 189 | # For 256-bit VMX build use (replace): RT_256_R8=4 RT_SIMD_COMPAT_VSX=0 (8 rp) 190 | # For 256-bit VSX1 build use (replace): RT_256=1 (15 SIMD reg-pairs) 191 | # For 256-bit VSX2 build use (replace): RT_256=1 RT_SIMD_COMPAT_PW8=1 (15 rp) 192 | # For 256-bit VSX3 build use (replace): RT_256=2 (15 SIMD reg-pairs) 193 | # For 256-bit VSX1 build use (replace): RT_256=4 (<=test29) (30 SIMD reg-pairs) 194 | # For 256-bit VSX2 build use (replace): RT_256=4 RT_SIMD_COMPAT_PW8=1 (30 rp) 195 | # For 256-bit VSX3 build use (replace): RT_256=8 (<=test29) (30 SIMD reg-pairs) 196 | 197 | # For 512-bit VSX1 build use (replace): RT_512=1 (<=test29) (15 SIMD reg-quads) 198 | # For 512-bit VSX2 build use (replace): RT_512=1 RT_SIMD_COMPAT_PW8=1 (15 rq) 199 | # For 512-bit VSX3 build use (replace): RT_512=2 (<=test29) (15 SIMD reg-quads) 200 | 201 | # 64/32-bit (ptr/adr) hybrid mode is compatible with native 64-bit ABI, 202 | # use (replace): RT_ADDRESS=32, rename the binary to simd_test.p64_** 203 | # 64-bit packed SIMD mode (fp64/int64) is supported on 64-bit targets, 204 | # use (replace): RT_ELEMENT=64, rename the binary to simd_test.p64*64 205 | -------------------------------------------------------------------------------- /test/simd_make_w64.bat: -------------------------------------------------------------------------------- 1 | mingw32-make -f simd_make_w64.mk -j4 2 | simd_test_w64f32.exe 3 | -------------------------------------------------------------------------------- /test/simd_make_w64.mk: -------------------------------------------------------------------------------- 1 | 2 | INC_PATH = \ 3 | -I../core/config/ 4 | 5 | SRC_LIST = \ 6 |
simd_test.cpp 7 | 8 | LIB_PATH = 9 | 10 | LIB_LIST = \ 11 | -lm 12 | # Targets are commands, not files: stop stray same-named files from masking them. 13 | .PHONY: build strip clean build_w64 build_w64avx build_w64avx512 14 | build: build_w64 build_w64avx build_w64avx512 15 | 16 | strip: 17 | strip simd_test_w64*.exe 18 | 19 | clean: 20 | del simd_test_w64*.exe 21 | 22 | 23 | build_w64: simd_test_w64_32 simd_test_w64_64 simd_test_w64f32 simd_test_w64f64 24 | 25 | simd_test_w64_32: 26 | g++ -O3 -g -static -m64 \ 27 | -DRT_WIN64 -DRT_X64 -DRT_128=4 -DRT_SIMD_COMPAT_SSE=2 -DRT_DEBUG=0 \ 28 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 29 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64_32.exe 30 | 31 | simd_test_w64_64: 32 | g++ -O3 -g -static -m64 \ 33 | -DRT_WIN64 -DRT_X64 -DRT_128=4 -DRT_SIMD_COMPAT_SSE=2 -DRT_DEBUG=0 \ 34 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 35 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64_64.exe 36 | 37 | simd_test_w64f32: 38 | g++ -O3 -g -static -m64 \ 39 | -DRT_WIN64 -DRT_X64 -DRT_256_R8=4 -DRT_DEBUG=0 \ 40 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 41 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64f32.exe 42 | 43 | simd_test_w64f64: 44 | g++ -O3 -g -static -m64 \ 45 | -DRT_WIN64 -DRT_X64 -DRT_256_R8=4 -DRT_DEBUG=0 \ 46 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 47 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64f64.exe 48 | 49 | 50 | build_w64avx: simd_test_w64_32avx simd_test_w64_64avx \ 51 | simd_test_w64f32avx simd_test_w64f64avx 52 | 53 | simd_test_w64_32avx: 54 | g++ -O3 -g -static -m64 \ 55 | -DRT_WIN64 -DRT_X64 -DRT_256=1 -DRT_DEBUG=0 \ 56 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 57 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64_32avx.exe 58 | 59 | simd_test_w64_64avx: 60 | g++ -O3 -g -static -m64 \ 61 | -DRT_WIN64 -DRT_X64 -DRT_256=1 -DRT_DEBUG=0 \ 62 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 63 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST}
-o simd_test_w64_64avx.exe 64 | 65 | simd_test_w64f32avx: 66 | g++ -O3 -g -static -m64 \ 67 | -DRT_WIN64 -DRT_X64 -DRT_256=2 -DRT_DEBUG=0 \ 68 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 69 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64f32avx.exe 70 | 71 | simd_test_w64f64avx: 72 | g++ -O3 -g -static -m64 \ 73 | -DRT_WIN64 -DRT_X64 -DRT_256=2 -DRT_DEBUG=0 \ 74 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 75 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64f64avx.exe 76 | 77 | 78 | build_w64avx512: simd_test_w64_32avx512 simd_test_w64_64avx512 \ 79 | simd_test_w64f32avx512 simd_test_w64f64avx512 80 | 81 | simd_test_w64_32avx512: 82 | g++ -O3 -g -static -m64 \ 83 | -DRT_WIN64 -DRT_X64 -DRT_512=1 -DRT_DEBUG=0 \ 84 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 85 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64_32avx512.exe 86 | 87 | simd_test_w64_64avx512: 88 | g++ -O3 -g -static -m64 \ 89 | -DRT_WIN64 -DRT_X64 -DRT_512=1 -DRT_DEBUG=0 \ 90 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 91 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64_64avx512.exe 92 | 93 | simd_test_w64f32avx512: 94 | g++ -O3 -g -static -m64 \ 95 | -DRT_WIN64 -DRT_X64 -DRT_512=2 -DRT_DEBUG=0 \ 96 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 97 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64f32avx512.exe 98 | 99 | simd_test_w64f64avx512: 100 | g++ -O3 -g -static -m64 \ 101 | -DRT_WIN64 -DRT_X64 -DRT_512=2 -DRT_DEBUG=0 \ 102 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 103 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64f64avx512.exe 104 | 105 | 106 | # Prerequisites for the build: 107 | # TDM64-GCC compiler for Win32/64 is installed and in the PATH variable. 108 | # Download tdm64-gcc-5.1.0-2.exe from sourceforge and run the installer.
109 | # Alternatively download and install tdm64-gcc-10.3.0-2.exe from github. 110 | # 111 | # Compiling/running SIMD test: 112 | # run simd_make_w64.bat from Windows Explorer or 113 | # run the following from Command Prompt "cmd": 114 | # mingw32-make -f simd_make_w64.mk 115 | # simd_test_w64f32.exe 116 | # simd_test_w64f32avx.exe 117 | # simd_test_w64f32avx512.exe 118 | # Use "-c 1" option to reduce test time when emulating with Intel SDE 119 | 120 | # Clang native build should theoretically work too (not tested), use (replace): 121 | # clang++ (in place of g++) may require Visual Studio 122 | # once clang for Windows is installed and in the PATH variable. 123 | 124 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h. 125 | # The 30-reg targets on top of AVX1+2/SSEx below will require in-mem emulation. 126 | 127 | # For 128-bit 30-reg build use (replace): RT_128=1 (reserved for AVX1+2/SSEx) 128 | # For 128-bit 30-reg build use (replace): RT_128=2 (Skylake-X w/ AVX512DQ+VL) 129 | # For 128-bit SSE2 build use (replace): RT_128=4 RT_SIMD_COMPAT_SSE=2 (15 regs) 130 | # For 128-bit SSE4 build use (replace): RT_128=4 (15 SIMD registers) 131 | # For 128-bit AVX1 build use (replace): RT_128=8 (15 SIMD registers) 132 | # For 128-bit FMA3 build use (replace): RT_128=16 (AMD's AVX1+FMA3) (15 regs) 133 | # For 128-bit AVX2 build use (replace): RT_128=32 (AMD's AVX2+FMA3) (15 regs) 134 | 135 | # For 256-bit SSE2 build use (replace): RT_256_R8=4 RT_SIMD_COMPAT_SSE=2 (8 rp) 136 | # For 256-bit SSE4 build use (replace): RT_256_R8=4 (8 SIMD reg-pairs) 137 | # For 256-bit AVX1 build use (replace): RT_256=1 (15 SIMD registers) 138 | # For 256-bit AVX2 build use (replace): RT_256=2 (15 SIMD registers) 139 | # For 256-bit 30-reg build use (replace): RT_256=4 (reserved for AVX1+2/SSEx) 140 | # For 256-bit 30-reg build use (replace): RT_256=8 (Skylake-X w/ AVX512DQ+VL) 141 | 142 | # For 512-bit AVX1 build use (replace): RT_512_R8=1 (8 SIMD reg-pairs) 143 | 
# For 512-bit AVX2 build use (replace): RT_512_R8=2 (8 SIMD reg-pairs) 144 | # For 512-bit AVX512F build use (replace): RT_512=1 (15 SIMD registers) 145 | # For 512-bit AVX512DQ build use (replace): RT_512=2 (15 SIMD registers) 146 | # For 512-bit AVX512F build use (replace): RT_512=4 (30 SIMD registers) 147 | # For 512-bit AVX512DQ build use (replace): RT_512=8 (30 SIMD registers) 148 | 149 | # For 1024-bit AVX512F build use (replace): RT_1K4=1 (15 SIMD reg-pairs) 150 | # For 1024-bit AVX512DQ build use (replace): RT_1K4=2 (15 SIMD reg-pairs) 151 | # For 2048-bit AVX512F build use (replace): RT_2K8_R8=1 (8 SIMD reg-quads) 152 | # For 2048-bit AVX512DQ build use (replace): RT_2K8_R8=2 (8 SIMD reg-quads) 153 | 154 | # 64/32-bit (ptr/adr) hybrid mode is compatible with native 64-bit ABI, 155 | # use (replace): RT_ADDRESS=32, rename the binary to simd_test_w64_**.exe 156 | # 64-bit packed SIMD mode (fp64/int64) is supported on 64-bit targets, 157 | # use (replace): RT_ELEMENT=64, rename the binary to simd_test_w64*64.exe 158 | -------------------------------------------------------------------------------- /test/simd_make_x32.mk: -------------------------------------------------------------------------------- 1 | 2 | INC_PATH = \ 3 | -I../core/config/ 4 | 5 | SRC_LIST = \ 6 | simd_test.cpp 7 | 8 | LIB_PATH = 9 | 10 | LIB_LIST = \ 11 | -lm 12 | # Targets are commands, not files: stop stray same-named files from masking them. 13 | .PHONY: build strip clean 14 | build: simd_test_x32 15 | 16 | strip: 17 | strip simd_test.x32* 18 | 19 | clean: 20 | rm -f simd_test.x32* 21 | 22 | 23 | simd_test_x32: 24 | g++ -O3 -g -mx32 \ 25 | -DRT_LINUX -DRT_X32 -DRT_256_R8=4 -DRT_SIMD_COMPAT_SSE=2 -DRT_DEBUG=0 \ 26 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 27 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x32 28 | 29 | 30 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted" 31 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run: 32 | # sudo apt-get update 33 | # (Ubuntu MATE is set up for an update without a
need to edit the file) 34 | # (extended repositories "universe multiverse" are only needed for clang) 35 | # 36 | # Prerequisites for the build: 37 | # multilib-compiler for x86_64 is installed and in the PATH variable. 38 | # sudo apt-get install make g++-multilib 39 | # (installation of g++-multilib removes any g++ cross-compilers) 40 | # 41 | # Compiling/running SIMD test: 42 | # make -f simd_make_x32.mk 43 | # ./simd_test.x32 44 | 45 | # Clang native build works too (takes much longer prior to 3.8), use (replace): 46 | # clang++ (in place of g++) 47 | # sudo apt-get install clang (requires g++-multilib for non-native ABI) 48 | 49 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h. 50 | # The 30-reg targets on top of AVX1+2/SSEx below will require in-mem emulation. 51 | 52 | # For 128-bit 30-reg build use (replace): RT_128=1 (reserved for AVX1+2/SSEx) 53 | # For 128-bit 30-reg build use (replace): RT_128=2 (Skylake-X w/ AVX512DQ+VL) 54 | # For 128-bit SSE2 build use (replace): RT_128=4 RT_SIMD_COMPAT_SSE=2 (15 regs) 55 | # For 128-bit SSE4 build use (replace): RT_128=4 (15 SIMD registers) 56 | # For 128-bit AVX1 build use (replace): RT_128=8 (15 SIMD registers) 57 | # For 128-bit FMA3 build use (replace): RT_128=16 (AMD's AVX1+FMA3) (15 regs) 58 | # For 128-bit AVX2 build use (replace): RT_128=32 (AMD's AVX2+FMA3) (15 regs) 59 | 60 | # For 256-bit SSE2 build use (replace): RT_256_R8=4 RT_SIMD_COMPAT_SSE=2 (8 rp) 61 | # For 256-bit SSE4 build use (replace): RT_256_R8=4 (8 SIMD reg-pairs) 62 | # For 256-bit AVX1 build use (replace): RT_256=1 (15 SIMD registers) 63 | # For 256-bit AVX2 build use (replace): RT_256=2 (15 SIMD registers) 64 | # For 256-bit 30-reg build use (replace): RT_256=4 (reserved for AVX1+2/SSEx) 65 | # For 256-bit 30-reg build use (replace): RT_256=8 (Skylake-X w/ AVX512DQ+VL) 66 | 67 | # For 512-bit AVX1 build use (replace): RT_512_R8=1 (8 SIMD reg-pairs) 68 | # For 512-bit AVX2 build use (replace): RT_512_R8=2 (8
SIMD reg-pairs) 69 | # For 512-bit AVX512F build use (replace): RT_512=1 (15 SIMD registers) 70 | # For 512-bit AVX512DQ build use (replace): RT_512=2 (15 SIMD registers) 71 | # For 512-bit AVX512F build use (replace): RT_512=4 (30 SIMD registers) 72 | # For 512-bit AVX512DQ build use (replace): RT_512=8 (30 SIMD registers) 73 | 74 | # For 1024-bit AVX512F build use (replace): RT_1K4=1 (15 SIMD reg-pairs) 75 | # For 1024-bit AVX512DQ build use (replace): RT_1K4=2 (15 SIMD reg-pairs) 76 | # For 2048-bit AVX512F build use (replace): RT_2K8_R8=1 (8 SIMD reg-quads) 77 | # For 2048-bit AVX512DQ build use (replace): RT_2K8_R8=2 (8 SIMD reg-quads) 78 | -------------------------------------------------------------------------------- /test/simd_make_x64.mk: -------------------------------------------------------------------------------- 1 | 2 | INC_PATH = \ 3 | -I../core/config/ 4 | 5 | SRC_LIST = \ 6 | simd_test.cpp 7 | 8 | LIB_PATH = 9 | 10 | LIB_LIST = \ 11 | -lm 12 | 13 | 14 | build: build_x64 build_x64avx build_x64avx512 15 | clang: clang_x64 clang_x64avx clang_x64avx512 16 | 17 | strip: 18 | strip simd_test.x64* 19 | 20 | clean: 21 | rm simd_test.x64* 22 | 23 | macOS: 24 | mv simd_test.x64_32 simd_test.o64_32 25 | mv simd_test.x64_64 simd_test.o64_64 26 | mv simd_test.x64f32 simd_test.o64f32 27 | mv simd_test.x64f64 simd_test.o64f64 28 | mv simd_test.x64_32avx simd_test.o64_32avx 29 | mv simd_test.x64_64avx simd_test.o64_64avx 30 | mv simd_test.x64f32avx simd_test.o64f32avx 31 | mv simd_test.x64f64avx simd_test.o64f64avx 32 | mv simd_test.x64_32avx512 simd_test.o64_32avx512 33 | mv simd_test.x64_64avx512 simd_test.o64_64avx512 34 | mv simd_test.x64f32avx512 simd_test.o64f32avx512 35 | mv simd_test.x64f64avx512 simd_test.o64f64avx512 36 | 37 | macRD: 38 | rm -fr simd_test.x64*.dSYM/ 39 | 40 | macRM: 41 | rm simd_test.o64* 42 | 43 | 44 | build_x64: simd_test_x64_32 simd_test_x64_64 simd_test_x64f32 simd_test_x64f64 45 | 46 | simd_test_x64_32: 47 | g++ -O3 -g \ 48 | 
-DRT_LINUX -DRT_X64 -DRT_128=4 -DRT_SIMD_COMPAT_SSE=2 -DRT_DEBUG=0 \ 49 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 50 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_32 51 | 52 | simd_test_x64_64: 53 | g++ -O3 -g \ 54 | -DRT_LINUX -DRT_X64 -DRT_128=4 -DRT_SIMD_COMPAT_SSE=2 -DRT_DEBUG=0 \ 55 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 56 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_64 57 | 58 | simd_test_x64f32: 59 | g++ -O3 -g \ 60 | -DRT_LINUX -DRT_X64 -DRT_256_R8=4 -DRT_DEBUG=0 \ 61 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 62 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f32 63 | 64 | simd_test_x64f64: 65 | g++ -O3 -g \ 66 | -DRT_LINUX -DRT_X64 -DRT_256_R8=4 -DRT_DEBUG=0 \ 67 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 68 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f64 69 | 70 | 71 | build_x64avx: simd_test_x64_32avx simd_test_x64_64avx \ 72 | simd_test_x64f32avx simd_test_x64f64avx 73 | 74 | simd_test_x64_32avx: 75 | g++ -O3 -g \ 76 | -DRT_LINUX -DRT_X64 -DRT_256=1 -DRT_DEBUG=0 \ 77 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 78 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_32avx 79 | 80 | simd_test_x64_64avx: 81 | g++ -O3 -g \ 82 | -DRT_LINUX -DRT_X64 -DRT_256=1 -DRT_DEBUG=0 \ 83 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 84 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_64avx 85 | 86 | simd_test_x64f32avx: 87 | g++ -O3 -g \ 88 | -DRT_LINUX -DRT_X64 -DRT_256=2 -DRT_DEBUG=0 \ 89 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 90 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f32avx 91 | 92 | simd_test_x64f64avx: 93 | g++ -O3 -g \ 94 | -DRT_LINUX -DRT_X64 -DRT_256=2 -DRT_DEBUG=0 \ 95 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 96 | ${INC_PATH} 
${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f64avx 97 | 98 | 99 | build_x64avx512: simd_test_x64_32avx512 simd_test_x64_64avx512 \ 100 | simd_test_x64f32avx512 simd_test_x64f64avx512 101 | 102 | simd_test_x64_32avx512: 103 | g++ -O3 -g \ 104 | -DRT_LINUX -DRT_X64 -DRT_512=1 -DRT_DEBUG=0 \ 105 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 106 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_32avx512 107 | 108 | simd_test_x64_64avx512: 109 | g++ -O3 -g \ 110 | -DRT_LINUX -DRT_X64 -DRT_512=1 -DRT_DEBUG=0 \ 111 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 112 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_64avx512 113 | 114 | simd_test_x64f32avx512: 115 | g++ -O3 -g \ 116 | -DRT_LINUX -DRT_X64 -DRT_512=2 -DRT_DEBUG=0 \ 117 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 118 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f32avx512 119 | 120 | simd_test_x64f64avx512: 121 | g++ -O3 -g \ 122 | -DRT_LINUX -DRT_X64 -DRT_512=2 -DRT_DEBUG=0 \ 123 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 124 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f64avx512 125 | 126 | 127 | clang_x64: simd_test.x64_32 simd_test.x64_64 simd_test.x64f32 simd_test.x64f64 128 | 129 | simd_test.x64_32: 130 | clang++ -O3 -g \ 131 | -DRT_LINUX -DRT_X64 -DRT_128=4 -DRT_SIMD_COMPAT_SSE=2 -DRT_DEBUG=0 \ 132 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 133 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_32 134 | 135 | simd_test.x64_64: 136 | clang++ -O3 -g \ 137 | -DRT_LINUX -DRT_X64 -DRT_128=4 -DRT_SIMD_COMPAT_SSE=2 -DRT_DEBUG=0 \ 138 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 139 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_64 140 | 141 | simd_test.x64f32: 142 | clang++ -O3 -g \ 143 | -DRT_LINUX -DRT_X64 -DRT_256_R8=4 -DRT_DEBUG=0 \ 144 | -DRT_POINTER=64 
-DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 145 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f32 146 | 147 | simd_test.x64f64: 148 | clang++ -O3 -g \ 149 | -DRT_LINUX -DRT_X64 -DRT_256_R8=4 -DRT_DEBUG=0 \ 150 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 151 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f64 152 | 153 | 154 | clang_x64avx: simd_test.x64_32avx simd_test.x64_64avx \ 155 | simd_test.x64f32avx simd_test.x64f64avx 156 | 157 | simd_test.x64_32avx: 158 | clang++ -O3 -g \ 159 | -DRT_LINUX -DRT_X64 -DRT_256=1 -DRT_DEBUG=0 \ 160 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 161 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_32avx 162 | 163 | simd_test.x64_64avx: 164 | clang++ -O3 -g \ 165 | -DRT_LINUX -DRT_X64 -DRT_256=1 -DRT_DEBUG=0 \ 166 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 167 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_64avx 168 | 169 | simd_test.x64f32avx: 170 | clang++ -O3 -g \ 171 | -DRT_LINUX -DRT_X64 -DRT_256=2 -DRT_DEBUG=0 \ 172 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 173 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f32avx 174 | 175 | simd_test.x64f64avx: 176 | clang++ -O3 -g \ 177 | -DRT_LINUX -DRT_X64 -DRT_256=2 -DRT_DEBUG=0 \ 178 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 179 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f64avx 180 | 181 | 182 | clang_x64avx512: simd_test.x64_32avx512 simd_test.x64_64avx512 \ 183 | simd_test.x64f32avx512 simd_test.x64f64avx512 184 | 185 | simd_test.x64_32avx512: 186 | clang++ -O3 -g \ 187 | -DRT_LINUX -DRT_X64 -DRT_512=1 -DRT_DEBUG=0 \ 188 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 189 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_32avx512 190 | 191 | simd_test.x64_64avx512: 192 | clang++ -O3 -g \ 193 | -DRT_LINUX -DRT_X64 
-DRT_512=1 -DRT_DEBUG=0 \ 194 | -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 195 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_64avx512 196 | 197 | simd_test.x64f32avx512: 198 | clang++ -O3 -g \ 199 | -DRT_LINUX -DRT_X64 -DRT_512=2 -DRT_DEBUG=0 \ 200 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 201 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f32avx512 202 | 203 | simd_test.x64f64avx512: 204 | clang++ -O3 -g \ 205 | -DRT_LINUX -DRT_X64 -DRT_512=2 -DRT_DEBUG=0 \ 206 | -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \ 207 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f64avx512 208 | 209 | 210 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted" 211 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run: 212 | # sudo apt-get update 213 | # (Ubuntu MATE is set up for an update without a need to edit the file) 214 | # (extended repositories "universe multiverse" are only needed for clang) 215 | # 216 | # Prerequisites for the build: 217 | # native-compiler for x86_64 is installed and in the PATH variable. 218 | # sudo apt-get install make g++ 219 | # 220 | # When building on macOS install Command Line Tools first. 221 | # http://osxdaily.com/2014/02/12/install-command-line-tools-mac-os-x/ 222 | # 223 | # Prerequisites for emulation: 224 | # http://software.intel.com/en-us/articles/intel-software-development-emulator 225 | # Intel SDE is downloaded, unpacked and in the PATH variable. 
226 | # 227 | # Compiling/running SIMD test: 228 | # make -f simd_make_x64.mk 229 | # ./simd_test.x64f32 230 | # ./simd_test.x64f32avx 231 | # ./simd_test.x64f32avx512 232 | # sde64 -hsw -- ./simd_test.x64f32avx -c 1 233 | # sde64 -skx -- ./simd_test.x64f32avx512 -c 1 234 | # Use "-c 1" option to reduce test time when emulating with Intel SDE 235 | 236 | # Clang native build works too (takes much longer prior to 3.8), use (replace): 237 | # clang++ (in place of g++) 238 | # sudo apt-get install clang 239 | 240 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h. 241 | # The 30-reg targets on top of AVX1+2/SSEx below will require in-mem emulation. 242 | 243 | # For 128-bit 30-reg build use (replace): RT_128=1 (reserved for AVX1+2/SSEx) 244 | # For 128-bit 30-reg build use (replace): RT_128=2 (Skylake-X w/ AVX512DQ+VL) 245 | # For 128-bit SSE2 build use (replace): RT_128=4 RT_SIMD_COMPAT_SSE=2 (15 regs) 246 | # For 128-bit SSE4 build use (replace): RT_128=4 (15 SIMD registers) 247 | # For 128-bit AVX1 build use (replace): RT_128=8 (15 SIMD registers) 248 | # For 128-bit FMA3 build use (replace): RT_128=16 (AMD's AVX1+FMA3) (15 regs) 249 | # For 128-bit AVX2 build use (replace): RT_128=32 (AMD's AVX2+FMA3) (15 regs) 250 | 251 | # For 256-bit SSE2 build use (replace): RT_256_R8=4 RT_SIMD_COMPAT_SSE=2 (8 rp) 252 | # For 256-bit SSE4 build use (replace): RT_256_R8=4 (8 SIMD reg-pairs) 253 | # For 256-bit AVX1 build use (replace): RT_256=1 (15 SIMD registers) 254 | # For 256-bit AVX2 build use (replace): RT_256=2 (15 SIMD registers) 255 | # For 256-bit 30-reg build use (replace): RT_256=4 (reserved for AVX1+2/SSEx) 256 | # For 256-bit 30-reg build use (replace): RT_256=8 (Skylake-X w/ AVX512DQ+VL) 257 | 258 | # For 512-bit AVX1 build use (replace): RT_512_R8=1 (8 SIMD reg-pairs) 259 | # For 512-bit AVX2 build use (replace): RT_512_R8=2 (8 SIMD reg-pairs) 260 | # For 512-bit AVX512F build use (replace): RT_512=1 (15 SIMD registers) 261 | # For 
512-bit AVX512DQ build use (replace): RT_512=2 (15 SIMD registers) 262 | # For 512-bit AVX512F build use (replace): RT_512=4 (30 SIMD registers) 263 | # For 512-bit AVX512DQ build use (replace): RT_512=8 (30 SIMD registers) 264 | 265 | # For 1024-bit AVX512F build use (replace): RT_1K4=1 (15 SIMD reg-pairs) 266 | # For 1024-bit AVX512DQ build use (replace): RT_1K4=2 (15 SIMD reg-pairs) 267 | # For 2048-bit AVX512F build use (replace): RT_2K8_R8=1 (8 SIMD reg-quads) 268 | # For 2048-bit AVX512DQ build use (replace): RT_2K8_R8=2 (8 SIMD reg-quads) 269 | 270 | # 64/32-bit (ptr/adr) hybrid mode is compatible with native 64-bit ABI, 271 | # use (replace): RT_ADDRESS=32, rename the binary to simd_test.x64_** 272 | # 64-bit packed SIMD mode (fp64/int64) is supported on 64-bit targets, 273 | # use (replace): RT_ELEMENT=64, rename the binary to simd_test.x64*64 274 | -------------------------------------------------------------------------------- /test/simd_make_x86.mk: -------------------------------------------------------------------------------- 1 | 2 | INC_PATH = \ 3 | -I../core/config/ 4 | 5 | SRC_LIST = \ 6 | simd_test.cpp 7 | 8 | LIB_PATH = 9 | 10 | LIB_LIST = \ 11 | -lm 12 | 13 | 14 | build: simd_test_x86 simd_test_x86avx simd_test_x86avx512 15 | 16 | strip: 17 | strip simd_test.x86* 18 | 19 | clean: 20 | rm simd_test.x86* 21 | 22 | 23 | simd_test_x86: 24 | g++ -O3 -g -m32 \ 25 | -DRT_LINUX -DRT_X86 -DRT_128=2 -DRT_DEBUG=0 \ 26 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 27 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x86 28 | 29 | simd_test_x86avx: 30 | g++ -O3 -g -m32 \ 31 | -DRT_LINUX -DRT_X86 -DRT_256=1 -DRT_DEBUG=0 \ 32 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \ 33 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x86avx 34 | 35 | simd_test_x86avx512: 36 | g++ -O3 -g -m32 \ 37 | -DRT_LINUX -DRT_X86 -DRT_512=1 -DRT_DEBUG=0 \ 38 | -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 
-DRT_ENDIAN=0 \ 39 | ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x86avx512 40 | 41 | 42 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted" 43 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run: 44 | # sudo apt-get update 45 | # (Ubuntu MATE is set up for an update without a need to edit the file) 46 | # (extended repositories "universe multiverse" are only needed for clang) 47 | # 48 | # Prerequisites for the build: 49 | # native/multilib-compiler for x86/x86_64 is installed and in the PATH variable. 50 | # sudo apt-get install make g++ (for x86 host) 51 | # sudo apt-get install make g++-multilib (for x86_64 host) 52 | # (installation of g++-multilib removes any g++ cross-compilers) 53 | # 54 | # Prerequisites for emulation: 55 | # http://software.intel.com/en-us/articles/intel-software-development-emulator 56 | # Intel SDE is downloaded, unpacked and in the PATH variable. 57 | # 58 | # Compiling/running SIMD test: 59 | # make -f simd_make_x86.mk 60 | # ./simd_test.x86 61 | # ./simd_test.x86avx 62 | # ./simd_test.x86avx512 63 | # sde -snb -- ./simd_test.x86avx -c 1 64 | # sde -knl -- ./simd_test.x86avx512 -c 1 65 | # Use "-c 1" option to reduce test time when emulating with Intel SDE 66 | 67 | # Clang native build works too (takes much longer prior to 3.8), use (replace): 68 | # clang++ (in place of g++) 69 | # sudo apt-get install clang (requires g++-multilib for non-native ABI) 70 | 71 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h. 72 | # Original legacy 32-bit ARMv7/x86 targets only support 8 SIMD registers. 
73 | 74 | # For 128-bit SSE1 build use (replace): RT_128=1 (test36/37) (8 SIMD registers) 75 | # For 128-bit SSE2 build use (replace): RT_128=2 (8 SIMD registers) 76 | # For 128-bit SSE4 build use (replace): RT_128=4 (8 SIMD registers) 77 | # For 128-bit AVX1 build use (replace): RT_128=8 (AMD's AVX1-only) (8 regs) 78 | # For 128-bit FMA3 build use (replace): RT_128=16 (AMD's AVX1+FMA3) (8 regs) 79 | # For 128-bit AVX2 build use (replace): RT_128=32 (AMD's AVX2+FMA3) (8 regs) 80 | 81 | # For 256-bit AVX1 build use (replace): RT_256=1 (Intel's AVX1-only) (8 regs) 82 | # For 256-bit AVX2 build use (replace): RT_256=2 (Intel's AVX2+FMA3) (8 regs) 83 | # For 512-bit AVX512F build use (replace): RT_512=1 (8 SIMD registers) 84 | # For 512-bit AVX512DQ build use (replace): RT_512=2 (8 SIMD registers) 85 | -------------------------------------------------------------------------------- /test/simd_qemu32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for x86_64 Linux test environment 3 | # with QEMU linux-user mode installed (64-bit Ubuntu MATE 20.04 LTS tested) 4 | # run this script after build_cross.sh with 32-bit cross-compilers installed 5 | 6 | touch qemu32; rm qemu32 7 | 8 | # fully successful test pass results in qemu32 file of 41524 bytes (51 tests) 9 | # unlike simd_test64/86.sh the result is the same on all CPU types (51 tests) 10 | # check the output if qemu32 file size differs, look for printouts 11 | 12 | 13 | echo "========================================================" | tee -a qemu32 14 | echo "Testing arm_v1 target (ARMv7 Cortex-A8 NEON)" | tee -a qemu32 15 | echo "========================================================" | tee -a qemu32 16 | qemu-arm -cpu cortex-a8 simd_test.arm_v1 -c 1 | tee -a qemu32 17 | echo "========================================================" | tee -a qemu32 18 | echo "Testing arm_v2 target (ARMv7 Cortex-A15 NEON)" | tee -a qemu32 19 | echo
"========================================================" | tee -a qemu32 20 | qemu-arm -cpu cortex-a15 simd_test.arm_v2 -c 1 | tee -a qemu32 21 | 22 | 23 | echo "========================================================" | tee -a qemu32 24 | echo "Testing m32Lr5 target (MIPS32r5 MSA little-endian)" | tee -a qemu32 25 | echo "========================================================" | tee -a qemu32 26 | qemu-mipsel -cpu P5600 simd_test.m32Lr5 -c 1 | tee -a qemu32 27 | echo "========================================================" | tee -a qemu32 28 | echo "Testing m32Br5 target (MIPS32r5 MSA big-endian)" | tee -a qemu32 29 | echo "========================================================" | tee -a qemu32 30 | qemu-mips -cpu P5600 simd_test.m32Br5 -c 1 | tee -a qemu32 31 | 32 | 33 | # ppc64abi32 targets are deprecated since QEMU 5.2.0 (dropped in Ubuntu 22.04) 34 | # fully successful test pass writes 66442 bytes to qemu32 with ppc64abi32 runs 35 | 36 | echo "========================================================" | tee -a qemu32 37 | echo "Testing p32Bg4 target (PPC G4 VMX big-endian)" | tee -a qemu32 38 | echo "========================================================" | tee -a qemu32 39 | qemu-ppc -cpu G4 simd_test.p32Bg4 -c 1 | tee -a qemu32 40 | #echo "========================================================" | tee -a qemu32 41 | #echo "Testing p32Bp7 target (POWER7 VSX1 big-endian)" | tee -a qemu32 42 | #echo "========================================================" | tee -a qemu32 43 | #qemu-ppc64abi32 -cpu POWER7 simd_test.p32Bp7 -c 1 | tee -a qemu32 44 | #echo "========================================================" | tee -a qemu32 45 | #echo "Testing p32Bp8 target (POWER8 VSX2 big-endian)" | tee -a qemu32 46 | #echo "========================================================" | tee -a qemu32 47 | #qemu-ppc64abi32 -cpu POWER8 simd_test.p32Bp8 -c 1 | tee -a qemu32 48 | #echo "========================================================" | tee -a qemu32 49 | 
#echo "Testing p32Bp9 target (POWER9 VSX3 big-endian)" | tee -a qemu32 50 | #echo "========================================================" | tee -a qemu32 51 | #qemu-ppc64abi32 -cpu POWER9 simd_test.p32Bp9 -c 1 | tee -a qemu32 52 | 53 | 54 | echo "========================================================" 55 | echo "fully successful test pass writes 41524 bytes to qemu32" 56 | echo "the result doesn't depend on CPU type (unlike test64/86)" 57 | echo "check the output if qemu32 size differs, check printouts" 58 | echo "========================================================" 59 | echo "the actual file size after the test run is listed below:" 60 | ls -al qemu32 61 | echo "========================================================" 62 | 63 | 64 | -------------------------------------------------------------------------------- /test/simd_qemu64.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for x86_64 Linux test environment 3 | # with QEMU linux-user mode installed (64-bit Ubuntu MATE 20.04 LTS tested) 4 | # run this script after build_cross.sh with 64-bit cross-compilers installed 5 | 6 | touch qemu64; rm qemu64 7 | 8 | # fully successful test pass results in qemu64 file of 232524 bytes (51 tests) 9 | # unlike simd_test64/86.sh the result is the same on all CPU types (51 tests) 10 | # check the output if qemu64 file size differs, look for printouts 11 | 12 | 13 | echo "========================================================" | tee -a qemu64 14 | echo "Testing a64_32 target (ARMv8 NEON)" | tee -a qemu64 15 | echo "========================================================" | tee -a qemu64 16 | qemu-aarch64 -cpu cortex-a57 simd_test.a64_32 -c 1 | tee -a qemu64 17 | echo "========================================================" | tee -a qemu64 18 | echo "Testing a64_64 target (ARMv8 NEON)" | tee -a qemu64 19 | echo "========================================================" | tee -a qemu64 20 |
qemu-aarch64 -cpu cortex-a57 simd_test.a64_64 -c 1 | tee -a qemu64 21 | echo "========================================================" | tee -a qemu64 22 | echo "Testing a64f32 target (ARMv8 NEON)" | tee -a qemu64 23 | echo "========================================================" | tee -a qemu64 24 | qemu-aarch64 -cpu cortex-a57 simd_test.a64f32 -c 1 | tee -a qemu64 25 | echo "========================================================" | tee -a qemu64 26 | echo "Testing a64f64 target (ARMv8 NEON)" | tee -a qemu64 27 | echo "========================================================" | tee -a qemu64 28 | qemu-aarch64 -cpu cortex-a57 simd_test.a64f64 -c 1 | tee -a qemu64 29 | 30 | echo "========================================================" | tee -a qemu64 31 | echo "Testing a64_32sve target (ARMv8 SVE)" | tee -a qemu64 32 | echo "========================================================" | tee -a qemu64 33 | qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64_32sve -c 1 | tee -a qemu64 34 | echo "========================================================" | tee -a qemu64 35 | echo "Testing a64_64sve target (ARMv8 SVE)" | tee -a qemu64 36 | echo "========================================================" | tee -a qemu64 37 | qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64_64sve -c 1 | tee -a qemu64 38 | echo "========================================================" | tee -a qemu64 39 | echo "Testing a64f32sve target (ARMv8 SVE)" | tee -a qemu64 40 | echo "========================================================" | tee -a qemu64 41 | qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64f32sve -c 1 | tee -a qemu64 42 | echo "========================================================" | tee -a qemu64 43 | echo "Testing a64f64sve target (ARMv8 SVE)" | tee -a qemu64 44 | echo "========================================================" | tee -a qemu64 45 | qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64f64sve -c 1 | tee -a qemu64 46 | 47 | 48 | echo 
"========================================================" | tee -a qemu64 49 | echo "Testing m64_32Lr6 target (MIPS64r6 MSA little-endian)" | tee -a qemu64 50 | echo "========================================================" | tee -a qemu64 51 | qemu-mips64el -cpu I6400 simd_test.m64_32Lr6 -c 1 | tee -a qemu64 52 | echo "========================================================" | tee -a qemu64 53 | echo "Testing m64_64Lr6 target (MIPS64r6 MSA little-endian)" | tee -a qemu64 54 | echo "========================================================" | tee -a qemu64 55 | qemu-mips64el -cpu I6400 simd_test.m64_64Lr6 -c 1 | tee -a qemu64 56 | echo "========================================================" | tee -a qemu64 57 | echo "Testing m64f32Lr6 target (MIPS64r6 MSA little-endian)" | tee -a qemu64 58 | echo "========================================================" | tee -a qemu64 59 | qemu-mips64el -cpu I6400 simd_test.m64f32Lr6 -c 1 | tee -a qemu64 60 | echo "========================================================" | tee -a qemu64 61 | echo "Testing m64f64Lr6 target (MIPS64r6 MSA little-endian)" | tee -a qemu64 62 | echo "========================================================" | tee -a qemu64 63 | qemu-mips64el -cpu I6400 simd_test.m64f64Lr6 -c 1 | tee -a qemu64 64 | 65 | echo "========================================================" | tee -a qemu64 66 | echo "Testing m64_32Br6 target (MIPS64r6 MSA big-endian)" | tee -a qemu64 67 | echo "========================================================" | tee -a qemu64 68 | qemu-mips64 -cpu I6400 simd_test.m64_32Br6 -c 1 | tee -a qemu64 69 | echo "========================================================" | tee -a qemu64 70 | echo "Testing m64_64Br6 target (MIPS64r6 MSA big-endian)" | tee -a qemu64 71 | echo "========================================================" | tee -a qemu64 72 | qemu-mips64 -cpu I6400 simd_test.m64_64Br6 -c 1 | tee -a qemu64 73 | echo "========================================================" | tee 
-a qemu64 74 | echo "Testing m64f32Br6 target (MIPS64r6 MSA big-endian)" | tee -a qemu64 75 | echo "========================================================" | tee -a qemu64 76 | qemu-mips64 -cpu I6400 simd_test.m64f32Br6 -c 1 | tee -a qemu64 77 | echo "========================================================" | tee -a qemu64 78 | echo "Testing m64f64Br6 target (MIPS64r6 MSA big-endian)" | tee -a qemu64 79 | echo "========================================================" | tee -a qemu64 80 | qemu-mips64 -cpu I6400 simd_test.m64f64Br6 -c 1 | tee -a qemu64 81 | 82 | 83 | echo "========================================================" | tee -a qemu64 84 | echo "Testing p64_32Bp7 target (POWER7 VSX1 big-endian)" | tee -a qemu64 85 | echo "========================================================" | tee -a qemu64 86 | qemu-ppc64 -cpu POWER7 simd_test.p64_32Bp7 -c 1 | tee -a qemu64 87 | echo "========================================================" | tee -a qemu64 88 | echo "Testing p64_64Bp7 target (POWER7 VSX1 big-endian)" | tee -a qemu64 89 | echo "========================================================" | tee -a qemu64 90 | qemu-ppc64 -cpu POWER7 simd_test.p64_64Bp7 -c 1 | tee -a qemu64 91 | echo "========================================================" | tee -a qemu64 92 | echo "Testing p64f32Bp7 target (POWER7 VSX1 big-endian)" | tee -a qemu64 93 | echo "========================================================" | tee -a qemu64 94 | qemu-ppc64 -cpu POWER7 simd_test.p64f32Bp7 -c 1 | tee -a qemu64 95 | echo "========================================================" | tee -a qemu64 96 | echo "Testing p64f64Bp7 target (POWER7 VSX1 big-endian)" | tee -a qemu64 97 | echo "========================================================" | tee -a qemu64 98 | qemu-ppc64 -cpu POWER7 simd_test.p64f64Bp7 -c 1 | tee -a qemu64 99 | 100 | # using -cpu power9 for power8 targets is a workaround for Ubuntu 22.04 LTS 101 | # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109007 102 | 103 | 
echo "========================================================" | tee -a qemu64 104 | echo "Testing p64_32Lp8 target (POWER8 VSX2 little-endian)" | tee -a qemu64 105 | echo "========================================================" | tee -a qemu64 106 | qemu-ppc64le -cpu POWER9 simd_test.p64_32Lp8 -c 1 | tee -a qemu64 107 | echo "========================================================" | tee -a qemu64 108 | echo "Testing p64_64Lp8 target (POWER8 VSX2 little-endian)" | tee -a qemu64 109 | echo "========================================================" | tee -a qemu64 110 | qemu-ppc64le -cpu POWER9 simd_test.p64_64Lp8 -c 1 | tee -a qemu64 111 | echo "========================================================" | tee -a qemu64 112 | echo "Testing p64f32Lp8 target (POWER8 VSX2 little-endian)" | tee -a qemu64 113 | echo "========================================================" | tee -a qemu64 114 | qemu-ppc64le -cpu POWER9 simd_test.p64f32Lp8 -c 1 | tee -a qemu64 115 | echo "========================================================" | tee -a qemu64 116 | echo "Testing p64f64Lp8 target (POWER8 VSX2 little-endian)" | tee -a qemu64 117 | echo "========================================================" | tee -a qemu64 118 | qemu-ppc64le -cpu POWER9 simd_test.p64f64Lp8 -c 1 | tee -a qemu64 119 | 120 | echo "========================================================" | tee -a qemu64 121 | echo "Testing p64_32Lp9 target (POWER9 VSX3 little-endian)" | tee -a qemu64 122 | echo "========================================================" | tee -a qemu64 123 | qemu-ppc64le -cpu POWER9 simd_test.p64_32Lp9 -c 1 | tee -a qemu64 124 | echo "========================================================" | tee -a qemu64 125 | echo "Testing p64_64Lp9 target (POWER9 VSX3 little-endian)" | tee -a qemu64 126 | echo "========================================================" | tee -a qemu64 127 | qemu-ppc64le -cpu POWER9 simd_test.p64_64Lp9 -c 1 | tee -a qemu64 128 | echo
"========================================================" | tee -a qemu64 129 | echo "Testing p64f32Lp9 target (POWER9 VSX3 little-endian)" | tee -a qemu64 130 | echo "========================================================" | tee -a qemu64 131 | qemu-ppc64le -cpu POWER9 simd_test.p64f32Lp9 -c 1 | tee -a qemu64 132 | echo "========================================================" | tee -a qemu64 133 | echo "Testing p64f64Lp9 target (POWER9 VSX3 little-endian)" | tee -a qemu64 134 | echo "========================================================" | tee -a qemu64 135 | qemu-ppc64le -cpu POWER9 simd_test.p64f64Lp9 -c 1 | tee -a qemu64 136 | 137 | 138 | echo "========================================================" 139 | echo "fully successful test pass writes 232524 bytes to qemu64" 140 | echo "the result doesn't depend on CPU type (unlike test64/86)" 141 | echo "check the output if qemu64 size differs, check printouts" 142 | echo "========================================================" 143 | echo "the actual file size after the test run is listed below:" 144 | ls -al qemu64 145 | echo "========================================================" 146 | 147 | 148 | -------------------------------------------------------------------------------- /test/simd_test64.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for x86_64 Linux test environment 3 | # tested on 64-bit Linux Mint 18, 64-bit Ubuntu MATE 18.04/20.04 LTS 4 | # run this script after build_linux.sh with native compiler installed 5 | 6 | touch test64; rm test64 7 | 8 | # fully successful test pass results in test64 file of 99666 bytes (51 tests) 9 | # test pass on AVX2-only CPU results in test64 file of 69286 bytes (51 tests) 10 | # for any other CPU check the output or use Intel SDE within script 11 | 12 | 13 | echo "========================================================" | tee -a test64 14 | echo "Testing x64_32 target (Intel Core 2 Duo
SSE2)" | tee -a test64 15 | echo "========================================================" | tee -a test64 16 | ./simd_test.x64_32 -c 1 | tee -a test64 17 | echo "========================================================" | tee -a test64 18 | echo "Testing x64_64 target (Intel Core 2 Duo SSE2)" | tee -a test64 19 | echo "========================================================" | tee -a test64 20 | ./simd_test.x64_64 -c 1 | tee -a test64 21 | echo "========================================================" | tee -a test64 22 | echo "Testing x64f32 target (Intel Nehalem SSE4)" | tee -a test64 23 | echo "========================================================" | tee -a test64 24 | ./simd_test.x64f32 -c 1 | tee -a test64 25 | echo "========================================================" | tee -a test64 26 | echo "Testing x64f64 target (Intel Nehalem SSE4)" | tee -a test64 27 | echo "========================================================" | tee -a test64 28 | ./simd_test.x64f64 -c 1 | tee -a test64 29 | 30 | echo "========================================================" | tee -a test64 31 | echo "Testing x64_32avx target (Intel Sandy Bridge AVX1)" | tee -a test64 32 | echo "========================================================" | tee -a test64 33 | ./simd_test.x64_32avx -c 1 | tee -a test64 34 | echo "========================================================" | tee -a test64 35 | echo "Testing x64_64avx target (Intel Sandy Bridge AVX1)" | tee -a test64 36 | echo "========================================================" | tee -a test64 37 | ./simd_test.x64_64avx -c 1 | tee -a test64 38 | echo "========================================================" | tee -a test64 39 | echo "Testing x64f32avx target (Intel Haswell AVX2)" | tee -a test64 40 | echo "========================================================" | tee -a test64 41 | ./simd_test.x64f32avx -c 1 | tee -a test64 42 | echo "========================================================" | tee -a test64 43 | echo 
"Testing x64f64avx target (Intel Haswell AVX2)" | tee -a test64 44 | echo "========================================================" | tee -a test64 45 | ./simd_test.x64f64avx -c 1 | tee -a test64 46 | 47 | echo "========================================================" | tee -a test64 48 | echo "Testing x64_32avx512 target (Intel Xeon Phi KNL AVX512)" | tee -a test64 49 | echo "========================================================" | tee -a test64 50 | ./simd_test.x64_32avx512 -c 1 | tee -a test64 51 | echo "========================================================" | tee -a test64 52 | echo "Testing x64_64avx512 target (Intel Xeon Phi KNL AVX512)" | tee -a test64 53 | echo "========================================================" | tee -a test64 54 | ./simd_test.x64_64avx512 -c 1 | tee -a test64 55 | echo "========================================================" | tee -a test64 56 | echo "Testing x64f32avx512 target (Intel Rocket Lake AVX512)" | tee -a test64 57 | echo "========================================================" | tee -a test64 58 | ./simd_test.x64f32avx512 -c 1 | tee -a test64 59 | echo "========================================================" | tee -a test64 60 | echo "Testing x64f64avx512 target (Intel Rocket Lake AVX512)" | tee -a test64 61 | echo "========================================================" | tee -a test64 62 | ./simd_test.x64f64avx512 -c 1 | tee -a test64 63 | 64 | 65 | echo "========================================================" 66 | echo "fully successful test pass writes 99666 bytes to test64" 67 | echo "test pass on AVX2-only CPU writes 69286 bytes to test64" 68 | echo "for other CPUs check the output, use Intel SDE in script" 69 | echo "========================================================" 70 | echo "the actual file size after the test run is listed below:" 71 | ls -al test64 72 | echo "========================================================" 73 | 74 | 75 | 
-------------------------------------------------------------------------------- /test/simd_test86.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Intended for x86_64 Linux test environment 3 | # with multilib capabilities (64-bit Linux Mint 18 tested) 4 | # run this script after build_multi.sh with multilib-compiler installed 5 | 6 | touch test86; rm test86 7 | 8 | # fully successful test pass results in test86 file of 35482 bytes (51 tests) 9 | # test pass on AVX2-only CPU results in test86 file of 25616 bytes (51 tests) 10 | # for any other CPU check the output or use Intel SDE within script 11 | 12 | 13 | echo "========================================================" | tee -a test86 14 | echo "Testing x86 target (Intel Core 2 Duo SSE2)" | tee -a test86 15 | echo "========================================================" | tee -a test86 16 | ./simd_test.x86 -c 1 | tee -a test86 17 | echo "========================================================" | tee -a test86 18 | echo "Testing x86avx target (Intel Sandy Bridge AVX1)" | tee -a test86 19 | echo "========================================================" | tee -a test86 20 | ./simd_test.x86avx -c 1 | tee -a test86 21 | echo "========================================================" | tee -a test86 22 | echo "Testing x86avx512 target (Intel Xeon Phi KNL AVX512)" | tee -a test86 23 | echo "========================================================" | tee -a test86 24 | ./simd_test.x86avx512 -c 1 | tee -a test86 25 | echo "========================================================" | tee -a test86 26 | echo "Testing x32 target (Intel Core 2 Duo SSE2)" | tee -a test86 27 | echo "========================================================" | tee -a test86 28 | ./simd_test.x32 -c 1 | tee -a test86 29 | 30 | 31 | echo "========================================================" 32 | echo "fully successful test pass writes 35482 bytes to test86" 33 | echo "test pass on
AVX2-only CPU writes 25616 bytes to test86" 34 | echo "for other CPUs check the output, use Intel SDE in script" 35 | echo "========================================================" 36 | echo "the actual file size after the test run is listed below:" 37 | ls -al test86 38 | echo "========================================================" 39 | 40 | 41 | -------------------------------------------------------------------------------- /test/simd_test_x64.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.1.32328.378 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simd_test_x64", "simd_test_x64.vcxproj", "{3CDB5A0F-6E4A-45F8-A234-25B9491748D9}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Release|x64 = Release|x64 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {3CDB5A0F-6E4A-45F8-A234-25B9491748D9}.Debug|x64.ActiveCfg = Debug|x64 15 | {3CDB5A0F-6E4A-45F8-A234-25B9491748D9}.Debug|x64.Build.0 = Debug|x64 16 | {3CDB5A0F-6E4A-45F8-A234-25B9491748D9}.Release|x64.ActiveCfg = Release|x64 17 | {3CDB5A0F-6E4A-45F8-A234-25B9491748D9}.Release|x64.Build.0 = Release|x64 18 | EndGlobalSection 19 | GlobalSection(SolutionProperties) = preSolution 20 | HideSolutionNode = FALSE 21 | EndGlobalSection 22 | GlobalSection(ExtensibilityGlobals) = postSolution 23 | SolutionGuid = {11D5CE55-10D6-484B-AEE7-878D57BF69CE} 24 | EndGlobalSection 25 | EndGlobal 26 | -------------------------------------------------------------------------------- /test/simd_test_x64.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | 
Release 18 | x64 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 16.0 71 | Win32Proj 72 | {3cdb5a0f-6e4a-45f8-a234-25b9491748d9} 73 | simdtestx64 74 | 10.0 75 | 76 | 77 | 78 | Application 79 | true 80 | v143 81 | Unicode 82 | 83 | 84 | Application 85 | false 86 | v143 87 | true 88 | Unicode 89 | 90 | 91 | Application 92 | true 93 | ClangCL 94 | Unicode 95 | 96 | 97 | Application 98 | false 99 | ClangCL 100 | true 101 | Unicode 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | true 123 | 124 | 125 | false 126 | 127 | 128 | true 129 | 130 | 131 | false 132 | 133 | 134 | 135 | Level3 136 | true 137 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 138 | true 139 | 140 | 141 | Console 142 | true 143 | 144 | 145 | 146 | 147 | Level3 148 | true 149 | true 150 | true 151 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 152 | true 153 | 154 | 155 | Console 156 | true 157 | true 158 | true 159 | 160 | 161 | 162 | 163 | Level3 164 | true 165 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 166 | true 167 | /I "../core/config/" -Wno-unused-function -Wno-missing-braces -Wno-deprecated-declarations /D "RT_WIN64" /D "RT_X64" /D RT_128=4 /D RT_SIMD_COMPAT_SSE=2 /D RT_POINTER=64 /D RT_ADDRESS=64 /D RT_ELEMENT=32 /D RT_ENDIAN=0 /D RT_DEBUG=1 %(AdditionalOptions) 168 | 169 | 170 | Console 171 | true 172 | 173 | 174 | 175 | 176 | Level3 177 | true 178 | true 179 | true 180 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 181 | true 182 | /I "../core/config/" -Wno-unused-function -Wno-missing-braces -Wno-deprecated-declarations /D "RT_WIN64" /D "RT_X64" /D RT_128=4 /D RT_SIMD_COMPAT_SSE=2 /D RT_POINTER=64 /D RT_ADDRESS=64 /D RT_ELEMENT=32 /D RT_ENDIAN=0 /D RT_DEBUG=0 %(AdditionalOptions) 183 | 184 | 185 | 
Console 186 | true 187 | true 188 | true 189 | 190 | 191 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /test/simd_test_x64.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {0a255df8-ee9e-45d2-b928-0b1efb68dc8e} 14 | 15 | 16 | {b0a101d1-d784-48f3-8e7b-ed50858362a2} 17 | 18 | 19 | 20 | 21 | Source Files 22 | 23 | 24 | 25 | 26 | core\config 27 | 28 | 29 | core\config 30 | 31 | 32 | core\config 33 | 34 | 35 | core\config 36 | 37 | 38 | core\config 39 | 40 | 41 | core\config 42 | 43 | 44 | core\config 45 | 46 | 47 | core\config 48 | 49 | 50 | core\config 51 | 52 | 53 | core\config 54 | 55 | 56 | core\config 57 | 58 | 59 | core\config 60 | 61 | 62 | core\config 63 | 64 | 65 | core\config 66 | 67 | 68 | core\config 69 | 70 | 71 | core\config 72 | 73 | 74 | core\config 75 | 76 | 77 | core\config 78 | 79 | 80 | core\config 81 | 82 | 83 | core\config 84 | 85 | 86 | core\config 87 | 88 | 89 | core\config 90 | 91 | 92 | core\config 93 | 94 | 95 | core\config 96 | 97 | 98 | core\config 99 | 100 | 101 | core\config 102 | 103 | 104 | core\config 105 | 106 | 107 | core\config 108 | 109 | 110 | core\config 111 | 112 | 113 | core\config 114 | 115 | 116 | core\config 117 | 118 | 119 | core\config 120 | 121 | 122 | core\config 123 | 124 | 125 | core\config 126 | 127 | 128 | core\config 129 | 130 | 131 | core\config 132 | 133 | 134 | core\config 135 | 136 | 137 | core\config 138 | 139 | 140 | core\config 141 | 142 | 143 | core\config 144 | 145 | 146 | core\config 147 | 148 | 149 | core\config 150 | 151 | 152 | core\config 153 | 154 | 155 | -------------------------------------------------------------------------------- /test/simd_test_x64.vcxproj.user: 
-------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | --------------------------------------------------------------------------------