├── COPYING
├── INSTALL
├── README
├── ROADMAP
├── VERSION
├── core
    └── config
    │   ├── rtarch.h
    │   ├── rtarch_a32.h
    │   ├── rtarch_a32_128x1v1.h
    │   ├── rtarch_a32_128x2v1.h
    │   ├── rtarch_a32_SVEx1v1.h
    │   ├── rtarch_a32_SVEx2v1.h
    │   ├── rtarch_a64.h
    │   ├── rtarch_a64_128x1v1.h
    │   ├── rtarch_a64_128x2v1.h
    │   ├── rtarch_a64_SVEx1v1.h
    │   ├── rtarch_a64_SVEx2v1.h
    │   ├── rtarch_aHB.h
    │   ├── rtarch_aHB_128x1v1.h
    │   ├── rtarch_aHB_128x2v1.h
    │   ├── rtarch_aHB_SVEx1v1.h
    │   ├── rtarch_aHB_SVEx2v1.h
    │   ├── rtarch_aHF_128x1v2.h
    │   ├── rtarch_aHF_128x2v2.h
    │   ├── rtarch_aHF_SVEx1v1.h
    │   ├── rtarch_aHF_SVEx2v1.h
    │   ├── rtarch_arm.h
    │   ├── rtarch_arm_128x1v4.h
    │   ├── rtarch_m32.h
    │   ├── rtarch_m32_128x1v1.h
    │   ├── rtarch_m32_128x2v1.h
    │   ├── rtarch_m64.h
    │   ├── rtarch_m64_128x1v1.h
    │   ├── rtarch_m64_128x2v1.h
    │   ├── rtarch_mHB.h
    │   ├── rtarch_mHB_128x1v1.h
    │   ├── rtarch_mHB_128x2v1.h
    │   ├── rtarch_p32.h
    │   ├── rtarch_p32_128x1v1.h
    │   ├── rtarch_p32_128x1v2.h
    │   ├── rtarch_p32_128x1v4.h
    │   ├── rtarch_p32_128x2v1.h
    │   ├── rtarch_p32_128x2v2.h
    │   ├── rtarch_p32_128x2v4.h
    │   ├── rtarch_p32_128x2v8.h
    │   ├── rtarch_p32_128x2vG.h
    │   ├── rtarch_p32_128x4v1.h
    │   ├── rtarch_p32_128x4v2.h
    │   ├── rtarch_p64.h
    │   ├── rtarch_p64_128x1v1.h
    │   ├── rtarch_p64_128x1v2.h
    │   ├── rtarch_p64_128x2v1.h
    │   ├── rtarch_p64_128x2v2.h
    │   ├── rtarch_p64_128x2v4.h
    │   ├── rtarch_p64_128x2v8.h
    │   ├── rtarch_p64_128x4v1.h
    │   ├── rtarch_p64_128x4v2.h
    │   ├── rtarch_pHB.h
    │   ├── rtarch_pHB_128x1v1.h
    │   ├── rtarch_pHB_128x1v2.h
    │   ├── rtarch_pHB_128x1v4.h
    │   ├── rtarch_pHB_128x2v1.h
    │   ├── rtarch_pHB_128x2v2.h
    │   ├── rtarch_pHB_128x2vG.h
    │   ├── rtarch_pQF_128x1v2.h
    │   ├── rtarch_pQF_128x2v2.h
    │   ├── rtarch_x32.h
    │   ├── rtarch_x32_128x1v2.h
    │   ├── rtarch_x32_128x1v4.h
    │   ├── rtarch_x32_128x1v8.h
    │   ├── rtarch_x32_128x2v4.h
    │   ├── rtarch_x32_256x1v2.h
    │   ├── rtarch_x32_256x1v8.h
    │   ├── rtarch_x32_256x2v2.h
    │   ├── rtarch_x32_512x1v8.h
    │   ├── rtarch_x32_512x2v2.h
    │   ├── rtarch_x32_512x4v2.h
    │   ├── rtarch_x64.h
    │   ├── rtarch_x64_128x1v2.h
    │   ├── rtarch_x64_128x1v4.h
    │   ├── rtarch_x64_128x1v8.h
    │   ├── rtarch_x64_128x2v4.h
    │   ├── rtarch_x64_256x1v2.h
    │   ├── rtarch_x64_256x1v8.h
    │   ├── rtarch_x64_256x2v2.h
    │   ├── rtarch_x64_512x1v8.h
    │   ├── rtarch_x64_512x2v2.h
    │   ├── rtarch_x64_512x4v2.h
    │   ├── rtarch_x86.h
    │   ├── rtarch_x86_128x1v4.h
    │   ├── rtarch_x86_128x1v8.h
    │   ├── rtarch_x86_256x1v2.h
    │   ├── rtarch_x86_512x1v2.h
    │   ├── rtarch_xHB.h
    │   ├── rtarch_xHB_128x1v2.h
    │   ├── rtarch_xHB_128x1v4.h
    │   ├── rtarch_xHB_128x1v8.h
    │   ├── rtarch_xHB_128x2v4.h
    │   ├── rtarch_xHB_256x1v2.h
    │   ├── rtarch_xHB_256x1v8.h
    │   ├── rtarch_xHB_256x2v2.h
    │   ├── rtarch_xHB_512x1v8.h
    │   ├── rtarch_xHB_512x2v2.h
    │   ├── rtarch_xHB_512x4v2.h
    │   ├── rtarch_xHF_128x1v2.h
    │   ├── rtarch_xHF_256x1v8.h
    │   ├── rtarch_xHF_512x1v8.h
    │   ├── rtarch_xHF_512x2v2.h
    │   ├── rtarch_xHF_512x4v2.h
    │   ├── rtbase.h
    │   ├── rtconf.h
    │   ├── rtdocs.h
    │   └── rtzero.h
└── test
    ├── build_cross.sh
    ├── build_linux.sh
    ├── build_macM1.sh
    ├── build_macOS.sh
    ├── build_multi.sh
    ├── build_nokia.sh
    ├── build_raspi.sh
    ├── build_win64.bat
    ├── clean_cross.sh
    ├── clean_linux.sh
    ├── clean_macM1.sh
    ├── clean_macOS.sh
    ├── clean_multi.sh
    ├── clean_nokia.sh
    ├── clean_raspi.sh
    ├── clean_win64.bat
    ├── simd_make_a32.mk
    ├── simd_make_a64.mk
    ├── simd_make_arm.mk
    ├── simd_make_m32.mk
    ├── simd_make_m64.mk
    ├── simd_make_p32.mk
    ├── simd_make_p64.mk
    ├── simd_make_w64.bat
    ├── simd_make_w64.mk
    ├── simd_make_x32.mk
    ├── simd_make_x64.mk
    ├── simd_make_x86.mk
    ├── simd_qemu32.sh
    ├── simd_qemu64.sh
    ├── simd_test.cpp
    ├── simd_test64.sh
    ├── simd_test86.sh
    ├── simd_test_x64.sln
    ├── simd_test_x64.vcxproj
    ├── simd_test_x64.vcxproj.filters
    └── simd_test_x64.vcxproj.user


/COPYING:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2013-2025 VectorChief (at github, bitbucket, sourceforge)
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/INSTALL:
--------------------------------------------------------------------------------
  1 | ================================================================================
  2 | 
  3 | To build SIMD test framework on Linux, open terminal window,
  4 | go to UniSIMD's "test" subfolder,
  5 | make sure necessary tools and libraries are installed
  6 |     sudo apt-get update
  7 | for native builds (binary ABI matches host: x64, RISCs):
  8 |     sudo apt-get install make g++
  9 | for multilib builds (if libs are available: 32-bit x86):
 10 |     sudo apt-get install make g++-multilib
 11 | run for x64 architecture:
 12 |     make -f simd_make_x64.mk -j4
 13 |     ./simd_test.x64f32
 14 | run for x86 architecture:
 15 |     make -f simd_make_x86.mk -j4
 16 |     ./simd_test.x86
 17 | run for *** architectures (on native host or QEMU linux-user mode):
 18 |     make -f simd_make_***.mk -j4
 19 |     ./simd_test.***
 20 | Prerequisites for building/emulating guest non-x86 architectures as well as
 21 | configurations for particular hardware platforms are given in the makefiles.
 22 | By default, DEB-based distributions (Mint, Ubuntu, Debian) are implied,
 23 | refer to the AArch64 Linux on RPi3 section down below for RPM-based options.
 24 | 
 25 | To build SIMD test framework on macOS, open terminal window,
 26 | go to UniSIMD's "test" subfolder,
 27 | make sure necessary tools and libraries are installed:
 28 |     clang
 29 | it will prompt to install Command Line Tools (will make alias to g++)
 30 | run for x64 architecture:
 31 |     make -f simd_make_x64.mk -j4
 32 |     ./simd_test.x64f32
 33 | run for a64 architecture:
 34 |     make -f simd_make_a64.mk clang -j4
 35 |     ./simd_test.a64f32
 36 | Alternatively, use clang option for x64 (make -f simd_make_x64.mk clang -j4).
 37 | 
 38 | To build SIMD test framework on Windows using Visual Studio,
 39 | download and install Visual Studio 2022 or later (with clang option enabled),
 40 | then open UniSIMD's "test" subfolder and click on VS2022 solution file:
 41 |     simd_test_x64.sln
 42 | from within the Visual Studio press F5 key to build and run the binary.
 43 | 
 44 | For maximum compatibility, always copy the binaries to *.sln's subfolder.
 45 | 
 46 | To build SIMD test framework on Windows using TDM64-GCC,
 47 | download and install TDM64-GCC toolchain (tdm64-gcc-10.3.0-2.exe) from github,
 48 | then open UniSIMD's "test" subfolder and run from "cmd" or Windows Explorer:
 49 |     simd_make_w64.bat
 50 | produced simd_test_w64f32.exe binary file will launch upon build completion.
 51 | 
 52 | ================================================================================
 53 | 
 54 | To build QEMU emulator from source on Linux, download the latest version from:
 55 |     http://wiki.qemu.org/Download
 56 | unpack the archive, open terminal window, go to QEMU's root folder,
 57 | make sure necessary tools and libraries are installed:
 58 |     sudo apt-get update
 59 |     sudo apt-get install make g++ ninja-build
 60 |     sudo apt-get install pkg-config libglib2.0-dev libpixman-1-dev zlib1g-dev
 61 | when building QEMU on RPM-based systems like openSUSE:
 62 |     sudo zypper install make gcc-c++ ninja
 63 |     sudo zypper install patch glib2-devel libpixman-1-0-devel zlib-devel
 64 | to build a reduced set of targets for linux-user mode emulation only, use:
 65 |     ./configure --target-list=arm-linux-user,aarch64-linux-user,\
 66 | mips-linux-user,mipsel-linux-user,mips64-linux-user,mips64el-linux-user,\
 67 | ppc-linux-user,ppc64-linux-user,ppc64le-linux-user,\
 68 | i386-linux-user,x86_64-linux-user
 69 |     (copy the whole multi-line command above without leading or trailing spaces)
 70 |     (paste into terminal and run after or instead of plain ./configure script)
 71 | run multithreaded make (use -j8 or -j16 on machines with higher core count):
 72 |     make -j4
 73 | run installation script:
 74 |     sudo make install
 75 | 
 76 | When building QEMU from source isn't necessary, install full binary package:
 77 |     sudo apt-get install qemu-user
 78 | on RPM-based systems like openSUSE:
 79 |     sudo zypper install qemu
 80 | 
 81 | QEMU 5.2.0 and beyond may require ninja-build package to build from source.
 82 | Starting from QEMU 5.2.0 POWER9 lxvwsx instruction is supported properly.
 83 | From QEMU 5.2.0 through QEMU 6.2.0 ppc64abi32 targets are marked as deprecated
 84 | and will be removed in the following releases (Ubuntu 22.04 should have 6.2.0).
 85 | QEMU 6.2.0 contains some MIPS regressions and POWER bugs (exposed by gcc 11.3),
 86 | consider an update: https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/2011832
 87 | or build QEMU 7.2.0 from source (contains all the fixes, also in Ubuntu 23.04).
 88 | All QEMU versions since 6.0.0 including 8.0.0 have a MIPS bug reported here:
 89 | https://gitlab.com/qemu-project/qemu/-/issues/1624
 90 | Note that using standalone MIPS cross-compiler masks the issue with local QEMU.
 91 | 
 92 | Ubuntu 20.04 is the first release where MIPS cross-compilers have caught up
 93 | with the rest of the pack (standalone 2020.06-01 and mipsisa64r6* are 9.3.0).
 94 | It is the only release which has full support for all the targets (ppc64abi32).
 95 | 
 96 | ================================================================================
 97 | 
 98 | To emulate future x86 targets (AVX-512) on modern x86 Linux hosts use Intel SDE:
 99 |     https://software.intel.com/content/www/us/en/develop/articles/
100 |                                        /intel-software-development-emulator.html
101 | download and unpack the archive.
102 | 
103 | In terminal window for 32-bit x86 run:
104 |     path-to-kit/sde   -snb -- ./simd_test.x86avx -c 1
105 |     path-to-kit/sde   -knl -- ./simd_test.x86avx512 -c 1
106 | for AVX (Sandy Bridge) and AVX512F (Knights Landing) respectively.
107 | 
108 | In terminal window for 64-bit x64 run:
109 |     path-to-kit/sde64 -hsw -- ./simd_test.x64f32avx -c 1
110 |     path-to-kit/sde64 -skx -- ./simd_test.x64f32avx512 -c 1
111 | for AVX2 (Haswell) and AVX512DQ (Skylake-X) respectively.
112 | 
113 | Intel's AVX512 fp16 subset (in Sapphire Rapids and in some Alder Lake configs)
114 | is now supported in Intel SDE 9.0 (with -spr option) and can be tested within
115 | the assembler by substituting regular 32/64-bit cmdps with 16-bit cmdms aliases
116 | and using direct ASM section output comparison method (with ARMv8.2 fp16).
117 | 
118 | Use "-c 1" option to reduce test time when emulating with Intel SDE.
119 | 
120 | ================================================================================
121 | 
122 | To experiment with Linux in little-endian mode on POWER8 server consider:
123 | 
124 | 1) Ubuntu Server 16.04.06 Xenial Xerus (ppc64el) from 27-Feb-2019:
125 |     https://cdimage.ubuntu.com/releases/16.04/release/
126 |     ubuntu-16.04.6-server-ppc64el.iso
127 |     (install most recent HWE kernel version)
128 | 
129 | 2) Ubuntu Server 18.04.05 Bionic Beaver (ppc64el) from 10-Aug-2020:
130 |     https://cdimage.ubuntu.com/releases/18.04/release/
131 |     ubuntu-18.04.5-server-ppc64el.iso
132 |     (install the original kernel version, no HWE)
133 | 
134 | 3) Ubuntu Server 20.04.02 Focal Fossa (ppc64el) from 01-Feb-2021:
135 |     https://cdimage.ubuntu.com/releases/20.04/release/
136 |     ubuntu-20.04.2-live-server-ppc64el.iso
137 |     (system installs, but won't boot, features new installer)
138 | 
139 | The images were tested on Tyan Habanero TN71-BP012 10-core POWER8 server
140 | with installation instructions described here:
141 | https://www.phoronix.com/scan.php?page=article&item=tyan-power8-server&num=1
142 | 
143 | Boot the system from USB flash drive without any ethernet cables attached,
144 | otherwise the boot menu won't allow any boot options to be activated properly.
145 | When presented with a blank screen and a cursor, read the next paragraph.
146 | 
147 | Before an installation can proceed the following steps may need to be performed
148 | on the first boot. Switch to tty2 with Alt-F2, activate the tty as the message
149 | requests by pressing <enter>, run /sbin/debian-installer (first two images).
150 | 
151 | To set up networking on a freshly installed Ubuntu Server with ethernet cable
152 | use the farthest ethernet socket among the main four as seen from the PSU.
153 | 
154 | On Ubuntu Server 16.04 use ifupdown to configure the network:
155 |     sudo nano /etc/network/interfaces
156 | Edit the file above to add the next two lines at the bottom:
157 |     auto enP1p3s0
158 |     iface enP1p3s0 inet dhcp
159 | Press Ctrl-O <enter> to save the changes and Ctrl-X to exit from the editor.
160 |     sudo /etc/init.d/networking restart
161 | or
162 |     sudo systemctl restart networking.service
163 | 
164 | On Ubuntu Server 18.04 use netplan to configure the network:
165 |     sudo nano /etc/netplan/01-netcfg.yaml
166 | Edit the file above so that it looks like this:
167 | network:
168 |     ethernets:
169 |         enP1p3s0:
170 |             dhcp4: yes
171 |     version: 2
172 |     renderer: networkd
173 | Press Ctrl-O <enter> to save the changes and Ctrl-X to exit from the editor.
174 |     sudo netplan apply
175 | 
176 | When booting Ubuntu Server offline after the networking has been set up
177 | the login prompt is not immediately shown and a few minutes time
178 | needs to be taken before SSH authorization becomes workable.
179 | 
180 | To install XFCE once the system is up and running:
181 |     sudo apt-get update
182 |     sudo apt-get install xfce4
183 |     startx (choose "Use default config" on first start)
184 |     (launch terminal window in XFCE from the bottom panel)
185 |     sudo apt-get install firefox unzip leafpad
186 |     sudo apt-get install make g++ clang
187 | Alternatively install Xubuntu desktop for more integrated experience:
188 |     sudo apt-get update
189 |     sudo apt-get install xubuntu-desktop
190 |     sudo apt-get install make g++ clang
191 |     reboot
192 | 
193 | After installing XFCE Ubuntu may start ureadahead process which loads the CPU
194 | preventing normal user workflow for a few minutes until the process
195 | is complete and the system is back to normal (use top command to monitor).
196 | 
197 | ================================================================================
198 | 
199 | To experiment with Linux in AArch64 mode on Raspberry Pi 3 consider:
200 | 
201 | 1) Devuan ASCII 2.0.0 plain (arm64 raspi3) image from 06-Jun-2018:
202 |     https://devuan.org/
203 |     https://files.devuan.org/devuan_ascii/embedded/
204 |     devuan_ascii_2.0.0_arm64_raspi3.img.xz
205 | image boot credentials:
206 |     login: root
207 |     password: toor
208 | 
209 | 2) openSUSE Leap15.0 XFCE (aarch64 raspi3) image from 02-Jul-2018:
210 |     https://en.opensuse.org/HCL:Raspberry_Pi3
211 |     http://download.opensuse.org/ports/aarch64/distribution/leap/15.0/appliances
212 |     openSUSE-...-ARM-XFCE-raspberrypi3.aarch64-2018.07.02-Buildlp150.1.1.raw.xz
213 | image boot credentials:
214 |     login: root
215 |     password: linux
216 | 
217 | Flashing images above to an SD card is similar to a Raspbian installation:
218 | http://www.raspberrypi.org/documentation/installation/installing-images/linux.md
219 |     lsblk (before inserting SD card)
220 |     lsblk (after inserting SD card to see its device ID, mmcblk0 in this case)
221 |     sudo su
222 |     umount /dev/mmcblk0p1 (if exists and mounted, unmount partition: p1)
223 |     umount /dev/mmcblk0p2 (if exists and mounted, unmount partition: p2)
224 |     (change to Downloads directory where image file is unpacked)
225 |     dcfldd bs=4M if=devuan_ascii_2.0.0_arm64_raspi3.img of=/dev/mmcblk0
226 |     sync (before extracting SD card from the slot)
227 |     exit (from super-user mode)
228 | 
229 | Devuan image requires partition resizing once booted (credentials above):
230 | http://elinux.org/RPi_Resize_Flash_Partitions (performed on RPi using fdisk)
231 |     fdisk /dev/mmcblk0
232 |     p (to see the current partition table)
233 |     d (answer: 2, to delete partition 2)
234 |     (for swap leave some space at the end by subtracting 1M from default sector)
235 |     n (answer: p, for primary; answer: 2, for new partition 2; <enter>; <enter>)
236 |     (when fdisk asks to remove ext4 signature at the end, answer: N, to keep it)
237 |     (create new partition 3 as 2, from the space left at the end of the SD card)
238 |     (t 3, to change partition type from 83 "Linux" to 82 "Linux-swap / Solaris")
239 |     w (writes the changes and quits fdisk)
240 |     shutdown -r now (login again after reboot)
241 |     resize2fs /dev/mmcblk0p2
242 |     df -h (to check the new partition size)
243 |     (mkswap /dev/mmcblk0p3)
244 |     (add "/dev/mmcblk0p3 none swap sw 0 0" to /etc/fstab to auto-swapon at boot)
245 | to install XFCE once the partition is resized:
246 |     (commands below are not prefixed with "sudo" as image is booted into "root")
247 |     (using "apt" command instead of "apt-get" helps to save space on SD card)
248 |     apt-get update
249 |     apt install xfce4 (choose keyboard layout)
250 |     reboot (for XFCE to honour chosen keyboard layout)
251 |     startx (choose "Use default config" on first start)
252 |     (launch terminal window in XFCE from the bottom panel)
253 |     apt install firefox-esr unzip
254 |     reboot (login again and "startx" into XFCE)
255 |     (firefox is now available under "Internet" section of the main menu)
256 |     apt install make g++ clang
257 | to set up ARM Instruction Emulator for SVE on AArch64 hosts, install modules:
258 |     apt install environment-modules
259 |     reboot (login again and "startx" into XFCE)
260 |     (modules only work outside of XFCE, use "Log Out" to configure modules)
261 |     (once armie module is loaded, use startx again to work with it in XFCE)
262 | 
263 | openSUSE image is RPM-based & boots directly to XFCE (credentials above):
264 |     (commands below are not prefixed with "sudo" as image is booted into "root")
265 |     (Raspberry Pi 3 Model B+ doesn't have networking with openSUSE, use old one)
266 |     zypper install MozillaFirefox
267 |     reboot (and login again)
268 |     (firefox is now available under "Internet" section of the main menu)
269 |     zypper install make gcc-c++ clang
270 | to set up ARM Instruction Emulator for SVE on AArch64 hosts, install modules:
271 |     zypper install Modules
272 |     reboot (and login again)
273 | adjust UniSIMD's makefiles to use g++ instead of triplet name, remove -static
274 |     leafpad simd_make_a64.mk & (once archive is downloaded as shown below)
275 | 
276 | Download the archive from github and unpack it (in terminal window):
277 |     cd Downloads
278 |     (alternatively to using a browser for downloading, use wget from terminal)
279 |     (wget https://github.com/VectorChief/UniSIMD-assembler/archive/master.zip)
280 |     (mv master.zip UniSIMD-assembler-master.zip)
281 |     unzip UniSIMD-assembler-master.zip
282 |     cd UniSIMD-assembler-master/test
283 |     make -f simd_make_a64.mk -j4
284 |     ./simd_test.a64f32
285 | 
286 | Download the ARM IE and install it (Ubuntu_16.04 for Devuan, SUSE_12 for SUSE):
287 | https://developer.arm.com/tools-and-software/server-and-hpc/
288 |                        /compile/arm-instruction-emulator/get-software/download
289 |     cd Downloads
290 |     tar -xvzf ARM-Instruction-Emulator_20.1_AArch64_***_aarch64.tar.gz
291 |     cd ARM-Instruction-Emulator_20.1_AArch64_***_aarch64
292 |     ./arm-instruction-emulator-20.1_Generic-AArch64_***_aarch64-linux-***.sh
293 |     (scroll down and type: yes <enter>, when license shows up on the screen)
294 |     reboot (and login again)
295 |     module use /opt/arm/modulefiles
296 |     module avail
297 |     module load Generic-AArch64/***/arm-instruction-emulator/20.1
298 |     (armie should now be available in the PATH variable, check vector lengths)
299 |     armie -mlist-vector-lengths
300 | 
301 | To test SVE targets with ARM Instruction Emulator run:
302 |     armie -msve-vector-bits=512 -- ./simd_test.a64f32sve -c 1
303 | 
304 | Use "-c 1" option to reduce test time when emulating with ARM IE.
305 | 
306 | Devuan ASCII 2.0.0 image has USB flash drives automount, but older clang 3.8.1,
307 | it also allows setting CPU frequency scaling governor for maximum performance:
308 | echo "performance" | tee /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
309 | Current frequency (600MHz min-level, 1200MHz max-level) can be monitored using:
310 | cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq
311 | 
312 | openSUSE Leap15.0 image has newer clang, but no automount for USB flash drives,
313 | it offers min-level CPU frequency by default and it lacks CPU scaling governor.
314 | It also doesn't support networking on newer Raspberry Pi 3 Model B+ from 2018.
315 | On the bonus side, modules can be configured from within XFCE as they should be.
316 | 
317 | Both images feature a modern set of compilers sufficient for the build:
318 | g++ 6.3.0, clang 3.8.1 (Devuan ASCII 2.0.0, clang prior to 3.8 was much slower)
319 | g++ 7.3.1, clang 5.0.1 (openSUSE Leap15.0)
320 | fresh Firefox 60.2.2esr browser and a 32bpp display output.
321 | 
322 | ================================================================================
323 | 
324 | To install Ubuntu MATE 20.04 LTS on Raspberry Pi 4 consider:
325 | 
326 | 1) Ubuntu MATE 20.04.1 desktop (arm64 raspi) image from 29-Oct-2020:
327 |     https://ubuntu-mate.org/download/arm64/focal/
328 |     https://releases.ubuntu-mate.org/focal/arm64/
329 |     ubuntu-mate-20.04.1-desktop-arm64+raspi.img.xz
330 | 
331 | Flash image to an SD card using "Disks -> Restore Disk Image" utility with GUI
332 | from a regular Ubuntu desktop. It will unpack *.xz internally in the process.
333 | 
334 | Once booted the image will automatically resize the root partition to fully
335 | utilize all space available on an SD card and proceed to install the system.
336 | 
337 | When online Ubuntu may start unattended-upgr process which holds the lock
338 | preventing installation of other packages potentially for a few hours.
339 | Install the system offline to postpone this process for a later time.
340 | 
341 | To emulate SVE instruction subset install QEMU 4.2.1 from the repository:
342 |     sudo apt-get update
343 |     sudo apt-get install qemu-user
344 | 
345 | ================================================================================
346 | 
347 | To experiment with Ubuntu Server on Raspberry Pi 4 consider:
348 | 
349 | 1) Ubuntu Server 20.04.2 preinstalled (arm64 raspi) image from 01-Feb-2021:
350 |     http://cdimage.ubuntu.com/releases/20.04/release/
351 |     ubuntu-20.04.2-preinstalled-server-arm64+raspi.img.xz
352 | image boot credentials:
353 |     login: ubuntu
354 |     password: ubuntu
355 | 
356 | Flash image to an SD card using "Disks -> Restore Disk Image" utility with GUI
357 | from a regular Ubuntu desktop. It will unpack *.xz internally in the process.
358 | 
359 | When booting Ubuntu Server offline for the first time the login prompt
360 | is often presented too early in the process and a few minutes time
361 | needs to be taken before SSH authorization becomes workable.
362 | 
363 | Once booted the image will automatically resize the root partition to fully
364 | utilize all space available on an SD card and request to change the password.
365 | 
366 | When online Ubuntu may start unattended-upgr process which holds the lock
367 | preventing installation of other packages for a few minutes until the process
368 | is complete and the lock is released (run top to monitor, q to quit monitoring).
369 | 
370 | To test SVE targets with ARM Instruction Emulator run:
371 |     sudo apt-get update
372 |     sudo apt-get install environment-modules
373 |     reboot
374 | Download ARM-Instruction-Emulator_21.0_AArch64_Ubuntu-18.04_aarch64.tar.gz from:
375 | https://developer.arm.com/tools-and-software/server-and-hpc/
376 |                        /compile/arm-instruction-emulator/get-software/download
377 | and follow installation instructions as presented here:
378 | https://developer.arm.com/documentation/102190/2100/Get-started/
379 |                                              /Install-Arm-Instruction-Emulator
380 | Note that module command is only available before installing the desktop.
381 | 
382 | To set up networking on a freshly installed Ubuntu Server without ethernet cable
383 | consider using "USB tethering" from an Android phone or an iPhone.
384 | 
385 | Plug in phone's USB cable to Raspberry Pi 4's USB slot and select
386 | "USB tethering" option from phone's menu or in "Settings -> Personal Hotspot".
387 | 
388 | On Raspberry Pi 4 run the following command in the terminal:
389 |     ip -c a
390 | to list all the network interfaces. USB tethered option should be called:
391 |     usb0
392 | or
393 |     eth1
394 | In any case it shows up on the list once tethering is activated on the phone.
395 | 
396 | To complete setting up USB networking add interface names to the netplan:
397 |     sudo nano /etc/netplan/50-cloud-init.yaml
398 | Edit the file above so that it looks like this:
399 | network:
400 |     ethernets:
401 |         eth0:
402 |             dhcp4: true
403 |             optional: true
404 |         eth1:
405 |             dhcp4: true
406 |         usb0:
407 |             dhcp4: true
408 |     version: 2
409 | Press Ctrl-O <enter> to save the changes and Ctrl-X to exit from the editor.
410 | 
411 | Apply the changes by typing:
412 |     sudo netplan apply
413 | Check the connection with:
414 |     ping google.com
415 | Press Ctrl-C to stop.
416 | 
417 | Once the networking is set up (only needs to be done once) install the desktop:
418 |     sudo apt-get update
419 |     sudo apt-get install ubuntu-mate-desktop
420 |     (if MATE is not yet available the command installs GNOME 3 desktop instead)
421 |     (select lightdm when prompted for more familiar MATE-themed login screen)
422 |     reboot
423 | Alternatively install Xubuntu desktop on a clean system:
424 |     sudo apt-get update
425 |     sudo apt-get install xubuntu-desktop
426 |     (if XFCE is not yet available the command installs GNOME 3 desktop instead)
427 |     (select lightdm when prompted for more familiar XFCE-themed login screen)
428 |     reboot
429 | 
430 | ================================================================================
431 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | UniSIMD assembler is a high-level C/C++ macro assembler framework unified across
 2 | ARM, MIPS, POWER and x86 architectures. It establishes a subset of both BASE and
 3 | SIMD instruction sets with clearly defined common API, so that application logic
 4 | can be written and maintained in one place without code replication.
 5 | The assembler itself isn't a separate tool, but rather a collection of C/C++
 6 | header files, which applications need to include directly in order to use.
 7 | 
 8 | Initial documentation for the assembler is provided in core/config/rtdocs.h.
 9 | 
10 | At present, Intel SSE/SSE2/SSE4 and AVX/AVX2/AVX-512 (32/64-bit x86 ISAs),
11 | ARMv7 NEON/NEONv2, ARMv8 AArch32 and AArch64 NEON, SVE (32/64-bit ARM ISAs),
12 | MIPS 32/64-bit r5/r6 MSA and POWER 32/64-bit VMX/VSX (little/big-endian ISAs)
13 | are mostly implemented (w/ horizontal reductions and byte/half SIMD+BASE ops)
14 | although scalar improvements, wider SIMD vectors with zeroing/merging predicates
15 | in 3/4-operand instructions, cross-precision fp-converters on modern CPU targets
16 | are planned as extensions to current 2/3-operand SPMD-driven vertical SIMD ISA.
17 | 
18 | The project has a test framework for Linux/GCC/Clang and Windows/VC++/TDM64-GCC.
19 | Support for macOS is provided via Command Line Tools with GCC and Clang options.
20 | Instructions for resolving dependencies and building the binaries
21 | for supported platforms can be found in the accompanying INSTALL file.
22 | 
23 | UniSIMD core features:
24 |  - Unified, Universal, Portable, Compatible code
25 |  - Explicit register allocation, predictable performance
26 |  - Three register sets for code: 8, 16, 32 (free: 8, 15, 30)
27 |  - High-level SIMD registers/ops as singles, pairs and quads
28 |  - SIMD-aligned backend structures with offsets/factors
29 |  - Vector-length agnostic vertical SIMD ISA, configurable
30 |  - Simultaneous scalar + 128/256-bit + configurable SIMD ops
31 |  - ISA implementation for fp16/fp128 (half/quad) SIMD ops
32 |  - C/C++, Compute, SPMD on 4 major archs
33 |  - Intel SSE/SSE2/SSE4 and AVX/AVX2/AVX-512
34 |  - ARMv7 NEON/NEONv2, ARMv8 AArch32/AArch64 NEON, SVE
35 |  - MIPS r5/r6 MSA (Warrior P5600, I6400/P6600)
36 |  - POWER VMX/VSX (PowerPC G4/G5, POWER6/7/8/9)
37 |  - CISC, RISC, CISC on RISC, little/big-endian ISA
38 |  - Support for reg-reg, load/store, load-op instructions
39 |  - Plain, indexed and scaled-indexed addressing modes
40 |  - FMA3 support (native or higher-precision emulation)
41 |  - 32/64-bit hybrid mode for native 64-bit ABI
42 |  - 32/64-bit addressing for BASE and SIMD ops
43 |  - 32/64-bit configurable SIMD elements (fp+int)
44 |  - Simultaneous 32/64-bit BASE (bridges, rules) and SIMD ops
45 |  - ISA implementation for int8/int16 (byte/half) BASE ops
46 |  - Full control over code, compiler steps out of the way
47 |  - Potential for bit-exact fp-compute across modern targets
48 |  - Used in QuadRay engine
49 | 


--------------------------------------------------------------------------------
/ROADMAP:
--------------------------------------------------------------------------------
  1 | ================================================================================
  2 | === >>> === tasks below are planned for the upcoming 1.2.0 milestone === <<< ===
  3 | ================================================================================
  4 | 
  5 | X) Task title: "implement predicated AVX-512/ARM-SVE backends (in *_RX slots)"
  6 | 1) Add rtarch_***_***x*p*.h header files to core/config for predicated targets
  7 | 2) Add predicate registers X1..X6 (merging) and Z1..Z6 (zeroing) as triplets
  8 | 3) Add cmd**P** subset for "two-operand + predicate" instructions
  9 | 4) Add cmd**4** subset for "three-operand + predicate" instructions
 10 | 5) Predicate is placed 1st in cmp-ops, and right after dest-SIMD-reg otherwise
 11 | 6) Predicated targets can be implemented as extension to current AVX-512/ARM-SVE
 12 | 7) Use predicate registers X1..X6 where merging/zeroing is not applicable
 13 | 8) Emulate zeroing and three-operand ops on ARM-SVE from fields in triplets
 14 | 9) Paired predicated backends should expose half the predicates (and registers)
 15 | 
 16 | ================================================================================
 17 | === >>> === tasks below are planned for the upcoming 1.3.0 milestone === <<< ===
 18 | ================================================================================
 19 | 
 20 | R) Task title: "implement basic runtime generation for existing ASM code-bases"
 21 | 1) Rewrite ASM_ENTER macro to allocate temporary buffer with code-exec rights
 22 | 2) Rewrite EMITB / EMITW emitters to write into a memory buffer at cur++ offset
 23 | 3) Define M to (+/-) depending on static/dynamic code generation (+ clang check)
 24 | 4) Rewrite j** to encode jump-label distances into binary form, track labels
 25 | 5) Rewrite ASM_LEAVE to type-cast the buffer to a function-pointer, then call it
 26 | 6) Implement proper buffer management for more advanced versions later
 27 | 
 28 | ================================================================================
 29 | === >>> === tasks below are planned for the forthcoming 1.x.0 series === <<< ===
 30 | ================================================================================
 31 | 
 32 | K) Task title: "use configuration utils (autotools, CMake, etc) for building"
 33 | 1) Use single build script for all host CPU architectures on Linux
 34 | 2) Keep cross-compilation on x86-64 Linux hosts (targeting QEMU linux-user mode)
 35 | 3) Consider adding continuous integration (CI) tests
 36 | 
 37 | ================================================================================
 38 | 
 39 | E) Task title: "add 8 SIMD registers full-IEEE support for ARMv7 using VFP"
 40 | 1) Implement 128-bit SIMD registers/instructions as 4x32-bit VFP (full-IEEE)
 41 | 2) Emulate currently exposed NEON instructions using VFP variants/fallbacks
 42 | 3) Use register-offloading to upper bank for 1 mem-arg in load-op instructions
 43 | 4) Find place in SIMD target mask (RT_128=8), like legacy x86, ARMv7 is 8-regs
 44 | 
 45 | ================================================================================
 46 | 
 47 | N) Task title: "implement new 128/256-bit 30-regs targets on top of current AVX"
 48 | 1) Implement register-offloading to memory (SIMD structs) on top of current AVX
 49 | 2) Add new SIMD compatibility flag RT_SIMD_COMPAT_256=1/2 for 30-regs with AVX1/2
 50 | 3) Find place in SIMD target mask (RT_128=1/RT_256=4) for custom 30-regs support
 51 | 4) Improve mask-jump (mkj*x) instructions for 64-bit SIMD elements (optional)
 52 | 5) Target 128-bit version to SSE, RT_SIMD_COMPAT_128=2/4/8/16/32 for 30-regs
 53 | 6) Add tests to check defined immediate/displacement limits (for BASE/SIMD ops)
 54 | 
 55 | ================================================================================
 56 | 
 57 | G) Task title: "consider 64-bit SIMD emulation with FPRs on PowerPC G5/POWER6"
 58 | 1) Implement 64-bit SIMD registers/instructions as 2x64-bit FPRs (full-IEEE)
 59 | 2) Emulate currently exposed SIMD instructions using FPU variants/fallbacks
 60 | 3) Emulate 64-bit integer SIMD ops using 64-bit BASE registers where possible
 61 | 4) New 64-bit SIMD backend would complement 32-bit targets in existing slots
 62 | 5) Expose 16x128/8x256 on PowerPC VMX (v4) instead of 15x128/8x256 for 32-bit
 63 | 6) Consider implementing 8x256 mode with register-offloading to mem for 64-bit
 64 | 
 65 | ================================================================================
 66 | 
 67 | P) Task title: "use RT_REGS to unload SIMD target mask for 256-bit on POWER11"
 68 |    (may require significant redesign of SIMD target mask handling in rtbase.h)
 69 |    (better schedule this task for the next major update, also check rtzero.h)
 70 |    (consider renaming SVE binaries to *.a*armSVE to match *.x*avx512 on x86)
 71 | 
 72 | ================================================================================
 73 | 
 74 | O) Task title: "use 3-operand SIMD instructions in packed/scalar SIMD tests"
 75 | 
 76 | ================================================================================
 77 | 
 78 | T) Task title: "improve SIMD test coverage, add tests for corner cases in ops"
 79 | 
 80 | ================================================================================
 81 | 
 82 | C) Task title: "implement SIMD fp32/fp64 converters consistently across targets"
 83 | 
 84 | ================================================================================
 85 | 
 86 | A) Task title: "implement SIMD fp16 converters as tier-1 extension, modern CPUs"
 87 | 
 88 | ================================================================================
 89 | 
 90 | F) Task title: "implement scalar fp compare-to-flags, fp/fp & fp/int converters"
 91 | 
 92 | ================================================================================
 93 | 
 94 | M) Task title: "add support for trigonometric/randomizer SIMD meta-instructions"
 95 |    (consider sleef library as an example of elementary math functions with SIMD)
 96 |    (https://github.com/shibatch/sleef)      <- use this code snapshot as a base
 97 | 
 98 | ================================================================================
 99 | 
100 | L) Task title: "consider SoftFP library integration for full fp16/fp128 support"
101 | 
102 | ================================================================================
103 | 
104 | V) Task title: "add support for various new and existing architectures"
105 | 1) Add support for RISC-V architecture with "vector extension proposal"
106 |    (search the Web for "RISC-V vector extension proposal" also standard SIMD)
107 | 2) Add support for Sunway SW26010 with custom Chinese BASE/SIMD ISAs (64-bit)
108 |    (https://en.wikipedia.org/wiki/SW26010)
109 | 3) Add support for Loongson 3 (GS464E) with LoongSIMD ops as well as MIPS64r3
110 |    (https://en.wikipedia.org/wiki/Loongson)
111 | 4) Add support for SPARC64 VIIIfx HPC-ACE SIMD extensions as well as BASE ops
112 |    (http://www.fujitsu.com/downloads/TC/sparc64viiifx-extensions.pdf)
113 | 5) Add support for ELBRUS architecture, emulate SIMD with VLIW (plus Itanium)
114 |    (https://en.wikipedia.org/wiki/Elbrus_2000)
115 | 
116 | ================================================================================
117 | 


--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
  1 | v1.1.0d: UniSIMD, code name "ENsed+1d": macOS M1, VS2022, Ubuntu 22.04
  2 |   - switch to malloc for 64-bit pointer/address combo
  3 |   - use assembler-local labels to build on M1 macOS
  4 |   - add support for M1 macOS to makefiles
  5 |   - add VS2022 support for SIMD test
  6 |   - add notes for building on Windows with VS2022 and M1 macOS
  7 |   - update documentation and main header (add braces to ASM_INIT)
  8 |   - add double-precision logic/arithmetic to ARMv7, x86
  9 |   - add workarounds for POWER8 and POWER9 targets on Ubuntu 22.04
 10 |   - drop ppc64abi32 targets (since QEMU 5.2.0), also from QEMU build script
 11 |   - add notes for VS2022, QEMU 6.2.0 and 7.2.0, Ubuntu 23.04
 12 |   - swap 16-bit and SIMD integer compare test groups (30-37 <-> 38-44)
 13 |   - update copyright year to 2023
 14 | 
 15 | v1.0.0g: UniSIMD, code name "ENsed+g", backports, VS2022, Ubuntu 22.04
 16 |   - switch to malloc for 64-bit pointer/address combo
 17 |   - require both SSE4.1 and SSE4.2 for SSE4 (v4) target slots
 18 |   - add DAZ support for flush-to-zero mode on x86 (makes on par with RISCs)
 19 |   - backport integer SIMD compare subset (min/max/ceq/cne/clt/cle/cgt/cge)
 20 |   - backport tests for integer SIMD compare (30-36) (signed/unsigned)
 21 |   - target slots AVX512DQ now include VL backends for 128/256-bit subsets
 22 |   - optimize SIMD compare and mask-jump instructions for AVX-512
 23 |   - add 64-bit sign/zero-extend bridges to existing 32/64-bit BASE subsets
 24 |   - optimize standalone remainder instructions on ARM and POWER
 25 |   - implement direct ASM section output comparison method (bypass C++ test)
 26 |   - extended 30-reg 256-bit and 15-reg 512-bit POWER backends are deprecated
 27 |   - extended POWER backends are still supported with v1.0.0f ASM feature set
 28 |   - add VS2022 support for SIMD test
 29 |   - update build scripts with TDM64-GCC 10.3.0-2 compiler reference
 30 |   - update documentation and main header (add braces to ASM_INIT)
 31 |   - add double-precision logic/arithmetic to ARMv7, x86
 32 |   - add workarounds for POWER8 and POWER9 targets on Ubuntu 22.04
 33 |   - drop ppc64abi32 targets (since QEMU 5.2.0), also from QEMU build script
 34 |   - add notes for VS2022, QEMU 6.2.0 and 7.2.0, Ubuntu 23.04
 35 |   - clean up comments in BASE and SIMD headers
 36 |   - update copyright year to 2023
 37 | 
 38 | v1.1.0c: UniSIMD, code name "ENsed+1c": full-stack SIMD/BASE
 39 |   - add 8-bit (byte) BASE instruction subset, redesign 16-bit BASE
 40 |   - add 8-bit elements SIMD subset (native on RISCs, mostly emulated on x86)
 41 |   - add 64-bit sign/zero-extend bridges to existing 32/64-bit BASE subsets
 42 |   - add 32-bit sign/zero-extend bridges to new 8/16-bit BASE subsets
 43 |   - add RT_BASE flag to limit addressing granularity, extend range on ARMv8
 44 |   - add mask-jump (mkj) SIMD instructions for 8/16-bit SIMD subsets
 45 |   - add DAZ support for flush-to-zero mode on x86 (makes on par with RISCs)
 46 |   - add support for AVX-512 fp16 subset to match existing ARMv8.2 + SVE
 47 |   - AVX-512 fp16 requires separate binary (no target slot or cap check)
 48 |   - implement direct ASM section output comparison method (bypass C++ test)
 49 |   - AVX-512 fp16 now provides validation for ARM's fp16 using above method
 50 |   - target slots AVX512DQ now include VL backends for 128/256-bit subsets
 51 |   - target slots AVX512DQ now require BW support to facilitate 8/16-bit SIMD
 52 |   - optimize SIMD compare and mask-jump instructions for AVX-512
 53 |   - optimize setting flags instructions in 8/16-bit BASE subsets (on RISCs)
 54 |   - optimize standalone remainder instructions on ARM and POWER
 55 |   - extended 30-reg 256-bit and 15-reg 512-bit POWER backends are deprecated
 56 |   - extended POWER backends are still supported with v1.0.0f ASM feature set
 57 |   - fix 16-bit (half-int) BASE addressing granularity on POWER
 58 |   - add notes for VS2022, QEMU 6.2.0, Intel SDE 9.0
 59 |   - clean up comments in BASE and SIMD headers
 60 |   - update copyright year to 2022
 61 | 
 62 | v1.1.0b: UniSIMD, code name "ENsed+1b", second development release
 63 |   - implement integer SIMD compare subset (signed/unsigned)
 64 |   - add integer SIMD compare on MIPS32/64 (min/max/ceq/cne/clt/cle/cgt/cge)
 65 |   - add integer SIMD compare on ARMv8 (64-bit min/max emulated) and SVE
 66 |   - add integer SIMD compare on POWER (64-bit emulated on POWER7)
 67 |   - add integer SIMD compare on x86+SSE2/4 (64-bit emulated)
 68 |   - add integer SIMD compare on x86+AVX1/2 (emulated for full SIMD AVX1)
 69 |   - add integer SIMD compare on x86+AVX512
 70 |   - add integer SIMD compare on original legacy targets (ARMv7, x86, PPC G4)
 71 |   - add integer SIMD compare for half-int SIMD backends (16-bit elements)
 72 |   - require both SSE4.1 and SSE4.2 for SSE4 (v4) target slots
 73 |   - add tests for integer SIMD compare (38-51)
 74 | 
 75 | v1.0.0f: UniSIMD, code name "ENsed+f", fixes and tests
 76 |   - fix displacement encodings on MIPS
 77 |   - add testing for displacement levels and types
 78 |   - update makefiles to support ancient HW (SSE2, SSE1 has issue with cvzps)
 79 |   - update SIMD test framework, add scripts for test automation
 80 |   - update comments for QEMU 5.2.0 and QEMU 6.0.0 (require ninja-build)
 81 | 
 82 | v1.1.0a: UniSIMD, code name "ENsed+1a", first development release
 83 |   - add tests for half-int SIMD/BASE ops (run level 30-37)
 84 |   - drop extended POWER targets from SIMD testing (no half-int support)
 85 |   - add half-int SIMD arithmetic with saturate (except original SSE1)
 86 |   - add implementation for half-int BASE ops across modern targets
 87 |   - add BASE half-int support on legacy ARMv7 and x86
 88 |   - add SIMD half-int support on legacy ARMv7 and x86
 89 |   - adjust displacement types for BASE half-int on legacy ARMv7 and x86
 90 |   - adjust displacement types for BASE half-int on MIPS and POWER
 91 |   - adjust displacement types for BASE half-int on x86_64
 92 |   - adjust displacement types for scalar fp16 on ARMv8
 93 |   - split SIMD half-int subset from fp16 on ARMv8
 94 |   - add SIMD half-int support on x86_64, enable on ARMv8
 95 |   - add SIMD half-int support on MIPS and POWER
 96 |   - add preliminary support for POWER9 fp128 SIMD ops (not tested)
 97 |   - add preliminary support for ARMv8.2 fp16 SIMD ops (not tested)
 98 | 
 99 | v1.0.0e: UniSIMD, code name "ENsed+e", 2021 extended support
100 |   - clarify instructions for POWER8 server, Raspberry Pi 3/4
101 |   - update links and comments in project files
102 |   - make comment for compiler swapping on MIPS more generic
103 |   - update mappings for byte/char SIMD ops
104 |   - update TDM64-GCC compiler reference to version 9.2.0
105 |   - update copyright year to 2021
106 |   - update comments for remainders and scaled addressing
107 |   - optimize remainder ops on POWER9
108 |   - add scaled-indexed addressing modes
109 | 
110 | v1.0.0d: UniSIMD, code name "ENsed+d", documentation edition
111 |   - clean up task descriptions in roadmap
112 |   - add notes for Ubuntu, QEMU, MIPS cross-compilers
113 |   - add Ubuntu (MATE) 20.04 LTS to makefile notes
114 |   - update standalone MIPS compiler to 2020.06-01
115 |   - change RUN_LEVEL to SUB_TEST for better wording
116 |   - clean up comment about displacement values
117 |   - add initial documentation for the assembler
118 |   - add sin/cos and log/exp math definitions to rtbase
119 | 
120 | v1.0.0c: UniSIMD, code name "ENsed++", celebration edition
121 |   - celebrating C++ and its various compilers
122 |   - add notes for Ubuntu Server on Raspberry Pi 4
123 |   - add -mcpu=power8 compiler option to makefiles on POWER
124 |   - fix RISC targets with clang after version 6.0
125 |   - update copyright year to 2020
126 | 
127 | v1.0.0b: UniSIMD, code name "ENsed+b", 2020-02-02 archive edition
128 |   - all releases after 2020-01-01 have 2nd naming from their baseline: (ENsed)
129 |   - letter from the update (b,c,..) appears concatenated after (+) in the name
130 |   - future minor releases (v1.X.0a) will have digit and letter (+1a, +2a, +3a)
131 |   - future major releases (v2.X.0a) will have the form: (2+, 2+1a, 2+2a, 2+3a)
132 |   - clean up and update comments related to recent compiler and QEMU versions
133 |   - fix comments for SIMD instructions in 3-operand forms, clarify for SIMD div
134 |   - add SIMD fma3 aliases as 3-operand forms: fma**3**
135 |   - fix SIMD fma3 emulation with fp32 elements on AVX1
136 | 
137 | v1.0.0a: UniSIMD, code name "ENsed+", 2020-edition ("ENsed" + 2019 updates)
138 |   - all new releases from now on will use *X.Y.Za(bc..) naming scheme
139 |   - all branches start with letter (b), all tags start with letter (v)
140 |   - first release (tag) on every new branch will be marked with letter (a)
141 |   - all subsequent minor updates will have letters (b,c,..), tags aren't moving
142 |   - add SIMD flag to replace VMX targets with VSX (on)
143 |   - add signed BASE ops to combined-arithmetic-jump (arithmetic shift right)
144 |   - add setting-flags BASE arithmetic shift right
145 |   - make setting-flags BASE ops orthogonal to size/type (cmd**Z**)
146 |   - add -mips64r6 compiler option to makefiles on MIPS
147 |   - optimize 64-bit SIMD shifts on POWER9, clean up mkj** formatting
148 |   - improve ARM/x86 compatibility in SIMD shifts
149 |   - add SIMD integer multiply instruction (for 32/64-bit elements)
150 |   - update copyright year to 2019
151 |   - fix 32-bit BASE compare-to-mem on 64-bit POWER (backported down)
152 |   - fix usage of non-persistent temp-register on POWER
153 |   - update build instructions and makefile notes
154 |   - add notes about QEMU 3.1.0 for SVE emulation
155 |   - add SIMD flag to replace VMX targets with VSX (off)
156 |   - fix and clean up SIMD target selection in headers
157 |   - fix/add comments for SIMD/BASE shift count value
158 |   - adjust build instructions for older HW compatibility
159 |   - adjust Win64 release build script for lower core-count
160 | 
161 | v1.0.0: UniSIMD assembler, code name "ENsed", base for future SIMD enhancements
162 |   - foundation for QuadRay engine 0.7.0 "GIzmo" with ARM-SVE, POWER9, new scheme
163 |   - renewed directory structure, move BASE and SIMD header files to core/config
164 |   - add new fp-compatibility and feature tasks, rename TASKS file to ROADMAP
165 |   - add support for 30 SIMD register pairs (2x128) backend on POWER7/8
166 |   - add support for 30 SIMD registers (scalar+128+256) backend on Skylake-X
167 |   - drop standalone SSE2 target from x64, reuse SSE4 (v4) slot, add compat flag
168 |   - add support for 128-bit AVX1+FMA3 (v16) and AVX2+FMA3 (v32) targets for AMD
169 |   - compactify POWER7/8 targets into one slot, add new RT_SIMD_COMPAT_PW8 flag
170 |   - swap legacy PowerPC G4/POWER6 VMX (now v4) with POWER7/8 VSX1/2 (now v1)
171 |   - 64-bit POWER6 now matches 64-bit Nehalem target (both v4), 15x128/8x256-bit
172 |   - add support for POWER9 backend (v2) with immediate vector loads/stores
173 |   - move 128-bit 30 SIMD registers Skylake-X target from v1 to v2, match POWER9
174 |   - reserve 128-bit v1 and 256-bit v4 for 30 SIMD registers emulation on AVX1/2
175 |   - implement plain ARM-SVE backend (v4) for 256/512/1K4/2K8-bit vector lengths
176 |   - implement paired ARM-SVE backend (v1) for 512/1K4/2K8-bit SIMD target slots
177 |   - new scheme: RT_128=4+8, RT_256=1+2, RT_512=1+2, RT_1K4=1+2 are 15 registers
178 |   - new scheme: RT_128=1+2, RT_256=4+8, RT_512=4+8, RT_1K4=4+8 are 30 registers
179 |   - add elm*x_st instruction to detach scalar subset from vectors (via mem)
180 |   - add support for horizontal pairwise/reductive add/mul/min/max instructions
181 |   - patch system allocators to compile on macOS, widen OS support in makefiles
182 |   - clean up SIMD tests to support PIE (also macOS)
183 |   - separate 64-bit Linux from multilib build scripts, add for macOS
184 |   - add VMX-compatible scalar SIMD subset on PPC G4 and POWER family of CPUs
185 |   - add MSA/scalar compatibility on big-endian MIPS, support for fp32 11-bit DP
186 |   - rename sections in target-specific headers to BASE, SIMD, ELEM (for scalar)
187 |   - optimize long displacements for BASE, SIMD, ELEM on RISCs where applicable
188 |   - implement proper SIMD-scaling for displacement types (as sliding in rtbase)
189 |   - move common internal x87 FPU sections to BASE headers on x86
190 |   - dedicate rtconf header for configurable instruction subsets on all targets
191 |   - allow target-specific headers to redefine common instructions from rtbase
192 |   - improve SIMD target reporting in tests, add -c n option to reduce test time
193 |   - update notes for MIPS cross-compiler location, add -mnan=2008 to makefiles
194 |   - update notes for AArch64 Linux, QEMU 3.0.0, Intel SDE, add ARM IE reference
195 |   - add test for SIMD mask-move (mmv), run level 27
196 |   - add test for 8/15/30 BASE/SIMD registers, run level 28
197 |   - warning-free building with GCC/Clang and MSVC++
198 |   - fix BASE shifts with zero immediate arg on legacy ARMv7 (backported down)
199 |   - convert all text files with unix2dos
200 |   - always reserve maximum space for SIMD register file
201 |   - save/restore temp predicate register on AVX512
202 |   - fix SIMD registers save/restore for 15x128x2 on POWER7
203 |   - fix temporary FPRs save/restore on POWER
204 |   - fix scalar SIMD min/max on POWER7
205 |   - fix BASE compare immediate encodings on POWER
206 |   - fix location for 128/256-bit common SIMD instructions
207 |   - fix for scalar SIMD alignment on ARMv7, POWER8
208 |   - fix compilation in C++11 mode with RT_DEBUG=2
209 |   - add comment for NaNs handling in floating point pipeline
210 |   - clarify comments about SIMD fp round instructions
211 |   - fix comment for SIMD shifts with count in memory
212 |   - add comment for scalar/vector compatibility
213 | 
214 | v0.9.1: Unified SIMD Assembler, 3-operand + basic scalar SIMD, extra backends
215 |   - expose 128/256-bit SIMD subsets (cmd[i/j/l]*, cmd[c/d/f]*) simultaneously
216 |   - add 3-operand SIMD instructions to all targets, emulate where not present
217 |   - implement basic scalar SIMD support (arithmetic + compare-to-mask-elem)
218 |   - implement additional paired/quaded 8-register SIMD backends on x86_64
219 |   - add 8-register makefile flags RT_256_R8, RT_512_R8, RT_1K4_R8, RT_2K8_R8
220 |   - original 15-register makefile flags RT_128, RT_256, RT_512 remain
221 |   - add new makefile flag RT_1K4 for 15-register code-bases on paired AVX-512
222 |   - expose 30 registers as an extension to common baseline of 15 where present
223 |   - each major architecture has at least one SIMD target with 30 registers
224 |   - add new RT_SIMD selector flag to remap vector-length-agnostic subsets
225 |   - add new RT_REGS selector flag to choose targets within given RT_SIMD width
226 |   - rename SIMD target headers to reflect size-factor/sub-variant, move legacy
227 |   - add new internal flags RT_128X*, RT_256X*, RT_512X* to match SIMD headers
228 |   - new internal flags keep SIMD sub-variant value in format for native width
229 |   - implement SIMD flags compatibility layer in rtzero to map makefile flags
230 |   - rtarch main header selects appropriate BASE/SIMD target from flags above
231 |   - implement SIMD target format converters in rtbase for runtime selection
232 |   - change SIMD target reporting to native-size x size-factor v version format
233 |   - reserve _RX slots in SIMD target mask for predicated backends (30+8 regs)
234 |   - clean up (drop) legacy SSE(1) support from x32 headers/makefiles
235 |   - move BASE sub-target selection to rtarch main header (ARM, x86)
236 |   - add notes for AArch64 Linux on Raspberry Pi 3 to INSTALL file
237 |   - add new TASKS file with description for future tasks
238 |   - enforce full ARMv7 instruction set (32-bit words) in makefiles
239 |   - fix LLVM's condition evaluation sign on all targets, define M -/+
240 |   - fix SIMD registers save/restore for 128-bit AVX targets (backported down)
241 |   - fix buffer allocation in SIMD tests (for 64-bit elems)
242 |   - fix stack alignment (now 16 bytes) on ARMv8/AArch64 (hardware) targets
243 |   - allow external override (from makefiles) for SIMD compatibility modes
244 |   - minor fixes in rtarch, accelerate release builds on multi-core machines
245 | 
246 | v0.9.0: Unified SIMD Assembler, 256-bit SIMD on RISCs, basic AVX-512 support
247 |   - adjust root rt_SIMD_INFO struct to contain both 32-bit and 64-bit constants
248 |   - add new sign-mask and full-mask general purpose constants to rt_SIMD_INFO
249 |   - expose 32/64-bit SIMD-element-size subsets (cmdo*, cmdq*) simultaneously
250 |   - element size in existing cmdp* subset remains configurable with RT_ELEMENT
251 |   - all three SIMD subsets (cmdo*, cmdp*, cmdq*) are still SIMD-width-agnostic
252 |   - expose fixed 64-bit BASE subset cmdz* for 64-bit targets only
253 |   - existing address-size cmdx*, element-size cmdy* and 32-bit cmdw* remain
254 |   - add BASE move instructions for 64-bit immediates as pairs of 32-bit types
255 |   - add new rotate-right and inverse-logic BASE instructions (ror, ann, orn)
256 |   - add new BMI1/BMI2 implementations for existing BASE instructions on x86
257 |   - implement non-portable x87 ISA subset for x86 targets internally
258 |   - implement fused-multiply-accumulate (fma/fms) on all SIMD targets
259 |   - add new mask-move SIMD instructions to common SIMD ISA (was x86 only)
260 |   - add new fp-negate and inverse-logic SIMD instructions (neg, orn, not)
261 |   - add new variable SIMD shifts with per-element count to all targets
262 |   - implement 256-bit SIMD support (2x128-bit, 15 regs) on modern RISC targets
263 |   - implement 512-bit SIMD support (4x128-bit, 15 regs) on modern POWER targets
264 |   - implement 512-bit SIMD support (1x512-bit, 16 regs) on future x86 targets
265 |   - AVX1/AVX2 256-bit SIMD for x86 (1x256-bit, 16 regs) remains supported
266 |   - 256-bit SIMD with 15 regs becomes new common baseline for modern hardware
267 |   - improve test coverage for BASE and SIMD load-op instructions
268 |   - add tests for new rotate, logic, shifts, fma/fms instructions, run level 24
269 |   - add rtzero header file to clean up assembler definitions after use
270 |   - rename instruction parameters to better reflect their use as source/dest
271 |   - add formulas for all BASE and SIMD instructions for better clarity
272 |   - reserve the whole alphabet for future BASE and SIMD instruction subsets
273 |   - add new SIMD compatibility flags for 128-bit AVX1/2, FMA/FMS/FMR, XMM regs
274 |   - add wrappers for 64-bit literals to better support legacy 32-bit compilers
275 |   - fix label_ld/label_st range on ARMv7/AArch64 to be on par with other targets
276 |   - fix discrepancy in VMX/VSX vector-loads on POWER (from here backported down)
277 |   - fix AVX-version of mmvpx_ld from zeroing to merging on x86
278 | 
279 | v0.8.1: Unified SIMD Assembler, full 64-bit fp/int SIMD compute elements
280 |   - add element-sized BASE ISA subset to fixed-32-bit and address-sized subsets
281 |   - new instruction mnemonics introduced for element-sized BASE subset (cmdy*)
282 |   - add new rtarch headers to house element-sized SIMD subset for 64-bit targets
283 |   - support for 64-bit SIMD elements currently requires 64-bit addresses as well
284 |   - enable full-precision SIMD rcpps/rsqps and rceps/rseps instructions
285 |   - add new offset corrections for endianness related to element-sized subset
286 |   - add new SIMD width short names for fixed and element-sized SIMD fields
287 |   - add new custom-sized integer types (address, element) with printf mods
288 |   - make current adjustable fp types follow SIMD element size (RT_ELEMENT)
289 |   - adjust math macros and definitions to support double-precision arithmetic
290 |   - add build/clean scripts, update makefiles with extra targets, MIPS notes
291 |   - remove unnecessary limitation on SIMD masks (add AVX-512/ARM-SVE notes)
292 |   - distinguish SIMD NEONv1/v2 vanilla ARM builds (cortex-a8/cortex-a15)
293 |   - distinguish SIMD v2/v4 64-bit POWER builds (POWER7+VSX/POWER8+VSX2)
294 |   - fix non-setting-flags instructions to not interfere with cmp on MIPS, POWER
295 |   - fix full-precision IEEE-compat divps_ld on ARMv7 targets (backported down)
296 | 
297 | v0.8.0: Unified SIMD Assembler, full 64-bit addressing for BASE and SIMD
298 |   - double original 32-bit BASE ISA to fixed-32-bit and address-sized subsets
299 |   - original instruction mnemonics follow in-heap/code-segment address size
300 |   - new instruction mnemonics introduced for fixed-32-bit subset (cmdw*)
301 |   - setting-flags instruction mnemonics remapped from (cmdz*) to (cmd*z)
302 |   - add combined-arithmetic-jump wrapper for better API stability/efficiency
303 |   - add new rtarch headers to house address-sized subset for 64-bit targets
304 |   - move original (now address-sized) mappings to rtbase for 32-bit targets
305 |   - add canonical forms for BASE div/rem and shifts (not always efficient)
306 |   - add setting-flags versions for BASE orr/xor and unsigned shifts
307 |   - remap one-operand instructions from cmd**_rr/mm to rx/mx and xr/xm
308 |   - move stack instructions to their own section at the end of rtarch headers
309 |   - move sregs instructions to their own section at the end of rtarch headers
310 |   - add config flags for full-precision SIMD rcpps/rsqps instructions
311 |   - add master flags for SIMD compatibility modes to rtarch main header
312 |   - add new offset corrections for endianness (from here backported down)
313 |   - add Win64 support via TDM64-GCC toolchain (tdm64-gcc-5.1.0-2.exe)
314 |   - add NULL-ptr checks to custom allocators (Linux/mmap, Win64/VirtualAlloc)
315 |   - fix setting-flags instructions for 64-bit POWER running 32-bit ISA
316 |   - fix non-setting-flags instructions (neg*x) to not set flags on MIPS
317 | 
318 | v0.7.1: Unified SIMD Assembler, 64/32-bit hybrid mode for native 64-bit ABI
319 |   - use fixed-sized and adjustable integer types in rtbase and SIMD test
320 |   - add a64 (AArch64 native ABI) and x64 (x86_64 native ABI) targets/makefiles
321 |   - add m64 (MIPS64 native ABI) and p64 (Power64 native ABI) targets/makefiles
322 |   - most of the current ISA remains 32-bit for BASE and SIMD with few exceptions
323 |   - adjust backend structures to support 64-bit pointer types in select places
324 |   - move sys_alloc/sys_free to platform-specific sections in SIMD test
325 |   - implement custom allocators (mmap) to limit address range to 32-bit (Linux)
326 |   - limit address range to 2GB boundary as MIPS64 sign-extends 32-bit mem-loads
327 |   - treat code labels as 64-bit in label_ld/st and jmpxx_mm instructions
328 |   - implement 64-bit versions of stack_sa/la instructions on MIPS and POWER
329 |   - fix variable SIMD shifts to support little-endian on POWER targets
330 |   - fix ASM blocks to only use SIMD registers within VRSAVE segment on POWER
331 |   - remove ASM block's zeroing of r15 as unnecessary on x32/x64 targets
332 |   - reformat/rework ASM blocks to better respect internal register mapping
333 |   - explicitly save/load SIMD registers in ASM blocks across all targets
334 |   - drop ASM clobber lists for lack of consistency across targets/SIMD-widths
335 |   - fix clang's ASM block l-value errors and other warnings, official support
336 |   - add build instructions to makefiles for Ubuntu 16.04 LTS 64-bit Live CD
337 |   - fix divps_ld instruction's encoding on ARM
338 |   - use IEEE-compatible div/sqr on legacy ARM and POWER
339 | 
340 | v0.7: Unified SIMD Assembler, additional 32-bit CPU architectures
341 |   - add a32 (AArch64:ILP32 ABI) and x32 (x86_64:mx32 ABI) targets/makefiles
342 |   - add m32 (MIPS32r5/r6 + MSA) and p32 (POWER + VMX/VSX) targets/makefiles
343 |   - add yet another SIMD variant (v4) for x86/SSE4.1 and ARMv8/AArch32
344 |   - separate ARMv7/ASIMDv2 (v2) and ARMv8/AArch32 (v4) SIMD variants on ARM
345 |   - add ARM builds for Raspberry Pi 2 and 3 in addition to Nokia N900
346 |   - use static linking in SIMD tests for QEMU emulation
347 |   - add mmv (blendvps) to x86/x32 SSE4.1 for fast conditional loads
348 |   - add combined-compare-jumps to rtarch for better efficiency (MIPS, POWER)
349 |   - remove limitation for BASE instructions to only accept DP offsets
350 |   - add new immediate/displacement types, add comment that they are unsigned
351 |   - add comments throughout rtarch about instructions' set-flags behavior
352 |   - implement full-range 32-bit integer divide on ARMv7 (v1) as 64-bit fp-div
353 |   - add widening versions of integer multiply instructions to rtarch definitions
354 |   - add remainder wrappers for integer divide instructions to rtarch definitions
355 |   - add IEEE-compatible versions of fp div & sqr for ARMv7 and POWER targets
356 |   - add "residual correction" to non-IEEE fp div on ARMv7 and POWER targets
357 |   - add SIMD tests for fp-to-int round and int-div remainder, run level 18
358 | 
359 | v0.6: Unified SIMD Assembler, additional SIMD targets
360 |   - rename SIMD target files to reflect SIMD width
361 |   - enable SIMD instructions definitions only if RT_SIMD_CODE is defined
362 |   - add new SIMD targets for SSE1, AVX1, AVX2 with corresponding build flags
363 |   - add float-to-integer convert with explicit mode parameter (x86, AArch32)
364 |   - add signed-integer-divide native instruction for ARM's AArch32 mode
365 |   - add SIMD test for shifts by runtime value & BASE register, run level 16
366 |   - add ver (cpuid) instruction for runtime SIMD target selection (x86 only)
367 |   - add mmv (vmaskmov) to AVX backend for fast conditional loads/stores
368 |   - add BASE instructions sub-tests to SIMD test if RT_BASE_TEST is defined
369 |   - drop set-flags bit (slow) from BASE mul instructions on ARM
370 |   - add RT_SIMD_FAST_FCTRL to save 1 instruction on FCTRL blocks entry
371 |   - clarify current and future targets in rtarch (from here backported down)
372 |   - add xor & neg BASE instructions to rtarch
373 |   - add shifts by fixed BASE register instructions
374 |   - add register versions of BASE mul/div, remainder instructions
375 |   - add SIMD cvzps instruction for fp-to-int round-towards-zero conversion
376 |   - add ASM_ENTER_F/ASM_LEAVE_F/ROUND*_F for non-IEEE flush-to-zero SIMD mode
377 |   - add RT_SIMD_FLUSH_ZERO to enable faster non-IEEE flush-to-zero SIMD mode
378 |   - add ASM_INIT/ASM_DONE to manage root info structure
379 |   - make stack pointer register architecturally invisible
380 |   - replace non-standard malloc.h with stdlib.h for malloc/free
381 |   - clean up rtarch whitespace formatting
382 | 
383 | v0.5: Unified SIMD Assembler, API freeze for the engine
384 |   - instruction naming scheme finalized
385 |   - change ARM instructions to set flags
386 |   - added framework for internal constants (used by reciprocals)
387 |   - added SIMD instruction for cube root, reciprocal steps redesigned
388 |   - additional SIMD tests, run level 15
389 | 
390 | v0.4: SIMD test framework, macro assembler overhaul
391 |   - macro expansion reworked for better compiler compatibility
392 |   - immediate/displacement parameters handling redesigned
393 |   - added reciprocal support for SSE, MPE support refined
394 | 
395 | v0.3: SIMD test framework, run level 9
396 |   - tests for integer mul, div, jmp instructions
397 |   - SIMD tests for integer add, shl, shr instructions
398 |   - SIMD tests for cvt, sqr, rsq instructions
399 | 
400 | v0.2: SIMD test framework, run level 5
401 |   - SIMD tests for mul, div, cmp instructions
402 | 
403 | v0.1: SIMD test framework, run level 1
404 |   - SIMD tests for add, sub instructions
405 | 
406 | v0.0: Empty project
407 |   - initial file set and directory structure
408 | 


--------------------------------------------------------------------------------
/core/config/rtarch_pQF_128x1v2.h:
--------------------------------------------------------------------------------
  1 | /******************************************************************************/
  2 | /* Copyright (c) 2013-2025 VectorChief (at github, bitbucket, sourceforge)    */
  3 | /* Distributed under the MIT software license, see the accompanying           */
  4 | /* file COPYING or http://www.opensource.org/licenses/mit-license.php         */
  5 | /******************************************************************************/
  6 | 
  7 | #ifndef RT_RTARCH_PQF_128X1V2_H
  8 | #define RT_RTARCH_PQF_128X1V2_H
  9 | 
 10 | /******************************************************************************/
 11 | /*********************************   LEGEND   *********************************/
 12 | /******************************************************************************/
 13 | 
 14 | /*
 15 |  * rtarch_pQF_128x1v2.h: Implementation of POWER fp128 VSX3 instructions.
 16 |  *
 17 |  * This file is a part of the unified SIMD assembler framework (rtarch.h)
 18 |  * and contains architecture-specific extensions
 19 |  * outside of the common assembler core.
 20 |  *
 21 |  * Recommended naming scheme for instructions:
 22 |  *
 23 |  * cmdv*_rx - applies [cmd] to scalar-fp128: [r]egister (one operand)
 24 |  * cmdv*_rr - applies [cmd] to scalar-fp128: [r]egister from [r]egister
 25 |  *
 26 |  * cmdv*_rm - applies [cmd] to scalar-fp128: [r]egister from [m]emory
 27 |  * cmdv*_ld - applies [cmd] to scalar-fp128: as above (friendly alias)
 28 |  *
 29 |  * Note, when using fixed-data-size 128/256-bit SIMD subsets simultaneously
 30 |  * upper 128-bit halves of full 256-bit SIMD registers may end up undefined.
 31 |  * On RISC targets they remain unchanged, while on x86-AVX they are zeroed.
 32 |  * This happens when registers written in 128-bit subset are then used/read
 33 |  * from within 256-bit subset. The same rule applies to mixing with 512-bit
 34 |  * and wider vectors. Use of scalars may leave respective vector registers
 35 |  * undefined, as seen from the perspective of any particular vector subset.
 36 |  *
 37 |  * 256-bit vectors used with wider subsets may not be compatible with regards
 38 |  * to memory loads/stores when mixed in the code. It means that data loaded
 39 |  * with wider vector and stored within 256-bit subset at the same address may
 40 |  * result in changing the initial representation in memory. The same can be
 41 |  * said about mixing vector and scalar subsets. Scalars can be completely
 42 |  * detached on some architectures. Use elm*x_st to store 1st vector element.
 43 |  * 128-bit vectors should be memory-compatible with any wider vector subset.
 44 |  *
 45 |  * Handling of NaNs in the floating point pipeline may not be consistent
 46 |  * across different architectures. Avoid NaNs entering the data flow by using
 47 |  * masking or control flow instructions. Apply special care when dealing with
 48 |  * floating point compare and min/max input/output. The result of floating point
 49 |  * compare instructions can be considered a -QNaN, though it is also interpreted
 50 |  * as integer -1 and is often treated as a mask. Most arithmetic instructions
 51 |  * should propagate QNaNs unchanged, however this behavior hasn't been tested.
 52 |  *
 53 |  * Note that instruction subsets operating on vectors of different lengths
 54 |  * may support a different number of SIMD registers, therefore mixing them
 55 |  * in the same code needs to be done with register awareness in mind.
 56 |  * For example, AVX-512 supports 32 SIMD registers, while AVX2 only has 16,
 57 |  * as does 256-bit paired subset on ARMv8, while 128-bit and SVE have 32.
 58 |  * These numbers should be consistent across architectures if properly
 59 |  * mapped to SIMD target mask presented in rtzero.h (compatibility layer).
 60 |  *
 61 |  * Interpretation of instruction parameters:
 62 |  *
 63 |  * upper-case params have triplet structure and require W to pass-forward
 64 |  * lower-case params are singular and can be used/passed as such directly
 65 |  *
 66 |  * XD - SIMD register serving as destination only, if present
 67 |  * XG - SIMD register serving as destination and first source
 68 |  * XS - SIMD register serving as second source (first if any)
 69 |  * XT - SIMD register serving as third source (second if any)
 70 |  *
 71 |  * RD - BASE register serving as destination only, if present
 72 |  * RG - BASE register serving as destination and first source
 73 |  * RS - BASE register serving as second source (first if any)
 74 |  * RT - BASE register serving as third source (second if any)
 75 |  *
 76 |  * MD - BASE addressing mode (Oeax, M***, I***) (memory-dest)
 77 |  * MG - BASE addressing mode (Oeax, M***, I***) (memory-dsrc)
 78 |  * MS - BASE addressing mode (Oeax, M***, I***) (memory-src2)
 79 |  * MT - BASE addressing mode (Oeax, M***, I***) (memory-src3)
 80 |  *
 81 |  * DD - displacement value (DP, DF, DG, DH, DV) (memory-dest)
 82 |  * DG - displacement value (DP, DF, DG, DH, DV) (memory-dsrc)
 83 |  * DS - displacement value (DP, DF, DG, DH, DV) (memory-src2)
 84 |  * DT - displacement value (DP, DF, DG, DH, DV) (memory-src3)
 85 |  *
 86 |  * IS - immediate value (is used as a second or first source)
 87 |  * IT - immediate value (is used as a third or second source)
 88 |  */
 89 | 
 90 | /******************************************************************************/
 91 | /********************************   INTERNAL   ********************************/
 92 | /******************************************************************************/
 93 | 
 94 | #if (defined RT_SIMD_CODE)
 95 | 
 96 | #if (RT_128X1 == 2 || RT_128X1 == 8)
 97 | 
 98 | /******************************************************************************/
 99 | /********************************   EXTERNAL   ********************************/
100 | /******************************************************************************/
101 | 
102 | /******************************************************************************/
103 | /**********************************   ELEM   **********************************/
104 | /******************************************************************************/
105 | 
106 | /****************   scalar quad-precision generic move/logic   ****************/
107 | 
108 | /* mov (D = S) */
109 | 
110 | #define movvx_rr(XD, XS) /* same opcode word as orr, both sources XS */     \
111 |         EMITW(0xF0000497 | MXM(REG(XD), REG(XS), REG(XS)))
112 | 
113 | #define movvx_ld(XD, MS, DS) /* presumably loads XD; confirm MPM/P2 */      \
114 |         AUW(SIB(MS),  EMPTY,  EMPTY,    MOD(MS), VAL(DS), C2(DS), EMPTY2)   \
115 |         EMITW(0x00000000 | MPM(REG(XD), MOD(MS), VAL(DS), B2(DS), P2(DS)))
116 | 
117 | #define movvx_st(XS, MD, DD) /* presumably stores XS; confirm MPM/O2 */     \
118 |         AUW(SIB(MD),  EMPTY,  EMPTY,    MOD(MD), VAL(DD), C2(DD), EMPTY2)   \
119 |         EMITW(0x00000000 | MPM(REG(XS), MOD(MD), VAL(DD), B2(DD), O2(DD)))
120 | 
121 | /* mmv (G = G mask-merge S) where (mask-elem: 0 keeps G, -1 picks S)
122 |  * uses Xmm0 implicitly as a mask register, destroys Xmm0, 0-masked XS elems */
123 | 
124 | #define mmvvx_rr(XG, XS) /* presumably xxsel; confirm vs ISA */             \
125 |         EMITW(0xF000003F | MXM(REG(XG), REG(XG), REG(XS)))
126 | 
127 | #define mmvvx_ld(XG, MS, DS) /* operand from memory staged in TmmM */       \
128 |         AUW(SIB(MS),  EMPTY,  EMPTY,    MOD(MS), VAL(DS), C2(DS), EMPTY2)   \
129 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MS), VAL(DS), B2(DS), P2(DS)))  \
130 |         EMITW(0xF000003F | MXM(REG(XG), REG(XG), TmmM))
131 | 
132 | #define mmvvx_st(XS, MG, DG) /* merge done in TmmM, then written back */    \
133 |         AUW(SIB(MG),  EMPTY,  EMPTY,    MOD(MG), VAL(DG), C2(DG), EMPTY2)   \
134 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MG), VAL(DG), B2(DG), P2(DG)))  \
135 |         EMITW(0xF000003F | MXM(TmmM,    TmmM,    REG(XS)))                  \
136 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MG), VAL(DG), B2(DG), O2(DG)))
137 | 
138 | /* and (G = G & S), (D = S & T) if (#D != #T) */
139 | 
140 | #define andvx_rr(XG, XS) /* G is both destination and first source */       \
141 |         andvx3rr(W(XG), W(XG), W(XS))
142 | 
143 | #define andvx_ld(XG, MS, DS) /* G is both destination and first source */   \
144 |         andvx3ld(W(XG), W(XG), W(MS), W(DS))
145 | 
146 | #define andvx3rr(XD, XS, XT) /* presumably xxland; confirm vs ISA */        \
147 |         EMITW(0xF0000417 | MXM(REG(XD), REG(XS), REG(XT)))
148 | 
149 | #define andvx3ld(XD, XS, MT, DT) /* operand from memory staged in TmmM */   \
150 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
151 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
152 |         EMITW(0xF0000417 | MXM(REG(XD), REG(XS), TmmM))
153 | 
154 | /* ann (G = ~G & S), (D = ~S & T) if (#D != #T) */
155 | 
156 | #define annvx_rr(XG, XS) /* G is both destination and first source */       \
157 |         annvx3rr(W(XG), W(XG), W(XS))
158 | 
159 | #define annvx_ld(XG, MS, DS) /* G is both destination and first source */   \
160 |         annvx3ld(W(XG), W(XG), W(MS), W(DS))
161 | 
162 | #define annvx3rr(XD, XS, XT) /* presumably xxlandc; S and T swapped */      \
163 |         EMITW(0xF0000457 | MXM(REG(XD), REG(XT), REG(XS)))
164 | 
165 | #define annvx3ld(XD, XS, MT, DT) /* operand from memory staged in TmmM */   \
166 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
167 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
168 |         EMITW(0xF0000457 | MXM(REG(XD), TmmM,    REG(XS)))
169 | 
170 | /* orr (G = G | S), (D = S | T) if (#D != #T) */
171 | 
172 | #define orrvx_rr(XG, XS) /* G is both destination and first source */       \
173 |         orrvx3rr(W(XG), W(XG), W(XS))
174 | 
175 | #define orrvx_ld(XG, MS, DS) /* G is both destination and first source */   \
176 |         orrvx3ld(W(XG), W(XG), W(MS), W(DS))
177 | 
178 | #define orrvx3rr(XD, XS, XT) /* presumably xxlor; confirm vs ISA */         \
179 |         EMITW(0xF0000497 | MXM(REG(XD), REG(XS), REG(XT)))
180 | 
181 | #define orrvx3ld(XD, XS, MT, DT) /* operand from memory staged in TmmM */   \
182 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
183 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
184 |         EMITW(0xF0000497 | MXM(REG(XD), REG(XS), TmmM))
185 | 
186 | /* orn (G = ~G | S), (D = ~S | T) if (#D != #T) */
187 | 
188 | #define ornvx_rr(XG, XS) /* G is both destination and first source */       \
189 |         ornvx3rr(W(XG), W(XG), W(XS))
190 | 
191 | #define ornvx_ld(XG, MS, DS) /* G is both destination and first source */   \
192 |         ornvx3ld(W(XG), W(XG), W(MS), W(DS))
193 | 
194 | #define ornvx3rr(XD, XS, XT) /* presumably xxlorc; S and T swapped */       \
195 |         EMITW(0xF0000557 | MXM(REG(XD), REG(XT), REG(XS)))
196 | 
197 | #define ornvx3ld(XD, XS, MT, DT) /* operand from memory staged in TmmM */   \
198 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
199 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
200 |         EMITW(0xF0000557 | MXM(REG(XD), TmmM,    REG(XS)))
201 | 
202 | /* xor (G = G ^ S), (D = S ^ T) if (#D != #T) */
203 | 
204 | #define xorvx_rr(XG, XS) /* G is both destination and first source */       \
205 |         xorvx3rr(W(XG), W(XG), W(XS))
206 | 
207 | #define xorvx_ld(XG, MS, DS) /* G is both destination and first source */   \
208 |         xorvx3ld(W(XG), W(XG), W(MS), W(DS))
209 | 
210 | #define xorvx3rr(XD, XS, XT) /* presumably xxlxor; confirm vs ISA */        \
211 |         EMITW(0xF00004D7 | MXM(REG(XD), REG(XS), REG(XT)))
212 | 
213 | #define xorvx3ld(XD, XS, MT, DT) /* operand from memory staged in TmmM */   \
214 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
215 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
216 |         EMITW(0xF00004D7 | MXM(REG(XD), REG(XS), TmmM))
217 | 
218 | /* not (G = ~G), (D = ~S) */
219 | 
220 | #define notvx_rx(XG) /* in-place form of notvx_rr */                        \
221 |         notvx_rr(W(XG), W(XG))
222 | 
223 | #define notvx_rr(XD, XS) /* presumably xxlnor; confirm vs ISA */            \
224 |         EMITW(0xF0000517 | MXM(REG(XD), REG(XS), REG(XS)))
225 | 
226 | /*************   scalar quad-precision floating-point arithmetic   ************/
227 | 
228 | /* neg (G = -G), (D = -S) */
229 | 
230 | #define negvs_rx(XG) /* in-place form of negvs_rr */                        \
231 |         negvs_rr(W(XG), W(XG))
232 | 
233 | #define negvs_rr(XD, XS) /* presumably xsnegqp; confirm vs ISA */           \
234 |         EMITW(0xFC000648 | MXM(REG(XD), 0x10,    REG(XS)))
235 | 
236 | /* add (G = G + S), (D = S + T) if (#D != #T) */
237 | 
238 | #define addvs_rr(XG, XS) /* G is both destination and first source */       \
239 |         addvs3rr(W(XG), W(XG), W(XS))
240 | 
241 | #define addvs_ld(XG, MS, DS) /* G is both destination and first source */   \
242 |         addvs3ld(W(XG), W(XG), W(MS), W(DS))
243 | 
244 | #define addvs3rr(XD, XS, XT) /* presumably xsaddqp; confirm vs ISA */       \
245 |         EMITW(0xFC000008 | MXM(REG(XD), REG(XS), REG(XT)))
246 | 
247 | #define addvs3ld(XD, XS, MT, DT) /* operand from memory staged in TmmM */   \
248 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
249 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
250 |         EMITW(0xFC000008 | MXM(REG(XD), REG(XS), TmmM))
251 | 
252 | /* sub (G = G - S), (D = S - T) if (#D != #T) */
253 | 
254 | #define subvs_rr(XG, XS) /* G is both destination and first source */       \
255 |         subvs3rr(W(XG), W(XG), W(XS))
256 | 
257 | #define subvs_ld(XG, MS, DS) /* G is both destination and first source */   \
258 |         subvs3ld(W(XG), W(XG), W(MS), W(DS))
259 | 
260 | #define subvs3rr(XD, XS, XT) /* presumably xssubqp; confirm vs ISA */       \
261 |         EMITW(0xFC000408 | MXM(REG(XD), REG(XS), REG(XT)))
262 | 
263 | #define subvs3ld(XD, XS, MT, DT) /* operand from memory staged in TmmM */   \
264 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
265 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
266 |         EMITW(0xFC000408 | MXM(REG(XD), REG(XS), TmmM))
267 | 
268 | /* mul (G = G * S), (D = S * T) if (#D != #T) */
269 | 
270 | #define mulvs_rr(XG, XS) /* G is both destination and first source */       \
271 |         mulvs3rr(W(XG), W(XG), W(XS))
272 | 
273 | #define mulvs_ld(XG, MS, DS) /* G is both destination and first source */   \
274 |         mulvs3ld(W(XG), W(XG), W(MS), W(DS))
275 | 
276 | #define mulvs3rr(XD, XS, XT) /* presumably xsmulqp; confirm vs ISA */       \
277 |         EMITW(0xFC000048 | MXM(REG(XD), REG(XS), REG(XT)))
278 | 
279 | #define mulvs3ld(XD, XS, MT, DT) /* operand from memory staged in TmmM */   \
280 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
281 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
282 |         EMITW(0xFC000048 | MXM(REG(XD), REG(XS), TmmM))
283 | 
284 | /* div (G = G / S), (D = S / T) if (#D != #T) and on ARMv7 if (#D != #S) */
285 | 
286 | #define divvs_rr(XG, XS) /* G is both destination and first source */       \
287 |         divvs3rr(W(XG), W(XG), W(XS))
288 | 
289 | #define divvs_ld(XG, MS, DS) /* G is both destination and first source */   \
290 |         divvs3ld(W(XG), W(XG), W(MS), W(DS))
291 | 
292 | #define divvs3rr(XD, XS, XT) /* presumably xsdivqp; confirm vs ISA */       \
293 |         EMITW(0xFC000448 | MXM(REG(XD), REG(XS), REG(XT)))
294 | 
295 | #define divvs3ld(XD, XS, MT, DT) /* operand from memory staged in TmmM */   \
296 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
297 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
298 |         EMITW(0xFC000448 | MXM(REG(XD), REG(XS), TmmM))
299 | 
300 | /* sqr (D = sqrt S) */
301 | 
302 | #define sqrvs_rr(XD, XS) /* presumably xssqrtqp; confirm vs ISA */          \
303 |         EMITW(0xFC000648 | MXM(REG(XD), 0x1B,    REG(XS)))
304 | 
305 | #define sqrvs_ld(XD, MS, DS) /* operand from memory staged in TmmM */       \
306 |         AUW(SIB(MS),  EMPTY,  EMPTY,    MOD(MS), VAL(DS), C2(DS), EMPTY2)   \
307 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MS), VAL(DS), B2(DS), P2(DS)))  \
308 |         EMITW(0xFC000648 | MXM(REG(XD), 0x1B,    TmmM))
309 | 
310 | /* fma (G = G + S * T) if (#G != #S && #G != #T) */
311 | 
312 | #define fmavs_rr(XG, XS, XT) /* presumably xsmaddqp; confirm vs ISA */      \
313 |         EMITW(0xFC000308 | MXM(REG(XG), REG(XS), REG(XT)))
314 | 
315 | #define fmavs_ld(XG, XS, MT, DT) /* operand from memory staged in TmmM */   \
316 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
317 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
318 |         EMITW(0xFC000308 | MXM(REG(XG), REG(XS), TmmM))
319 | 
320 | /* fms (G = G - S * T) if (#G != #S && #G != #T) */
321 | 
322 | #define fmsvs_rr(XG, XS, XT) /* presumably xsnmsubqp; confirm vs ISA */     \
323 |         EMITW(0xFC0003C8 | MXM(REG(XG), REG(XS), REG(XT)))
324 | 
325 | #define fmsvs_ld(XG, XS, MT, DT) /* operand from memory staged in TmmM */   \
326 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
327 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
328 |         EMITW(0xFC0003C8 | MXM(REG(XG), REG(XS), TmmM))
329 | 
330 | /*************   scalar quad-precision integer arithmetic/shifts   ************/
331 | 
332 | /* add (G = G + S), (D = S + T) if (#D != #T) */
333 | 
334 | #define addvx_rr(XG, XS) /* G is both destination and first source */       \
335 |         addvx3rr(W(XG), W(XG), W(XS))
336 | 
337 | #define addvx_ld(XG, MS, DS) /* G is both destination and first source */   \
338 |         addvx3ld(W(XG), W(XG), W(MS), W(DS))
339 | 
340 | #define addvx3rr(XD, XS, XT) /* presumably vadduqm; confirm vs ISA */       \
341 |         EMITW(0x10000100 | MXM(REG(XD), REG(XS), REG(XT)))
342 | 
343 | #define addvx3ld(XD, XS, MT, DT) /* operand from memory staged in TmmM */   \
344 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
345 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
346 |         EMITW(0x10000100 | MXM(REG(XD), REG(XS), TmmM))
347 | 
348 | /* sub (G = G - S), (D = S - T) if (#D != #T) */
349 | 
350 | #define subvx_rr(XG, XS) /* G is both destination and first source */       \
351 |         subvx3rr(W(XG), W(XG), W(XS))
352 | 
353 | #define subvx_ld(XG, MS, DS) /* G is both destination and first source */   \
354 |         subvx3ld(W(XG), W(XG), W(MS), W(DS))
355 | 
356 | #define subvx3rr(XD, XS, XT) /* presumably vsubuqm; confirm vs ISA */       \
357 |         EMITW(0x10000500 | MXM(REG(XD), REG(XS), REG(XT)))
358 | 
359 | #define subvx3ld(XD, XS, MT, DT) /* operand from memory staged in TmmM */   \
360 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
361 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
362 |         EMITW(0x10000500 | MXM(REG(XD), REG(XS), TmmM))
363 | 
364 | /* shl (G = G << S), (D = S << T) if (#D != #T) - plain, unsigned
365 |  * for maximum compatibility: shift count must be modulo elem-size */
366 | 
367 | #define shlvx_ri(XG, IS) /* G is both destination and first source */       \
368 |         shlvx3ri(W(XG), W(XG), W(IS))
369 | 
370 | #define shlvx_ld(XG, MS, DS) /* loads SIMD, uses first elem, rest zeroed */ \
371 |         shlvx3ld(W(XG), W(XG), W(MS), W(DS))
372 | 
373 | #define shlvx3ri(XD, XS, IT) /* byte shift S->D, then bit shift D->D */     \
374 |         EMITW(0xF00002D1 | TmmM << 21 | (0x7F & VAL(IT)) << 11)             \
375 |         EMITW(0x1000040C | MXM(REG(XD), REG(XS), TmmM))                     \
376 |         EMITW(0x100001C4 | MXM(REG(XD), REG(XD), TmmM))
377 | 
378 | #define shlvx3ld(XD, XS, MT, DT) /* byte shift S->D, then bit shift D->D */ \
379 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
380 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
381 |         EMITW(0x1000020C | MXM(TmmM,    0x0F,    TmmM))                     \
382 |         EMITW(0x1000040C | MXM(REG(XD), REG(XS), TmmM))                     \
383 |         EMITW(0x100001C4 | MXM(REG(XD), REG(XD), TmmM))
384 | 
385 | /* shr (G = G >> S), (D = S >> T) if (#D != #T) - plain, unsigned
386 |  * for maximum compatibility: shift count must be modulo elem-size */
387 | 
388 | #define shrvx_ri(XG, IS) /* G is both destination and first source */       \
389 |         shrvx3ri(W(XG), W(XG), W(IS))
390 | 
391 | #define shrvx_ld(XG, MS, DS) /* loads SIMD, uses first elem, rest zeroed */ \
392 |         shrvx3ld(W(XG), W(XG), W(MS), W(DS))
393 | 
394 | #define shrvx3ri(XD, XS, IT) /* byte shift S->D, then bit shift D->D */     \
395 |         EMITW(0xF00002D1 | TmmM << 21 | (0x7F & VAL(IT)) << 11)             \
396 |         EMITW(0x1000044C | MXM(REG(XD), REG(XS), TmmM))                     \
397 |         EMITW(0x100002C4 | MXM(REG(XD), REG(XD), TmmM))
398 | 
399 | #define shrvx3ld(XD, XS, MT, DT) /* byte shift S->D, then bit shift D->D */ \
400 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
401 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
402 |         EMITW(0x1000020C | MXM(TmmM,    0x0F,    TmmM))                     \
403 |         EMITW(0x1000044C | MXM(REG(XD), REG(XS), TmmM))                     \
404 |         EMITW(0x100002C4 | MXM(REG(XD), REG(XD), TmmM))
405 | 
406 | /* svl (G = G << S), (D = S << T) if (#D != #T) - variable, unsigned
407 |  * for maximum compatibility: shift count must be modulo elem-size */
408 | 
409 | #define svlvx_rr(XG, XS)     /* variable shift with per-elem count */       \
410 |         svlvx3rr(W(XG), W(XG), W(XS))
411 | 
412 | #define svlvx_ld(XG, MS, DS) /* variable shift with per-elem count */       \
413 |         svlvx3ld(W(XG), W(XG), W(MS), W(DS))
414 | 
415 | #define svlvx3rr(XD, XS, XT) /* byte shift S->D, then bit shift D->D */     \
416 |         EMITW(0x1000020C | MXM(TmmM,    0x0F,    REG(XT)))                  \
417 |         EMITW(0x1000040C | MXM(REG(XD), REG(XS), TmmM))                     \
418 |         EMITW(0x100001C4 | MXM(REG(XD), REG(XD), TmmM))
419 | 
420 | #define svlvx3ld(XD, XS, MT, DT) /* byte shift S->D, then bit shift D->D */ \
421 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
422 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
423 |         EMITW(0x1000020C | MXM(TmmM,    0x0F,    TmmM))                     \
424 |         EMITW(0x1000040C | MXM(REG(XD), REG(XS), TmmM))                     \
425 |         EMITW(0x100001C4 | MXM(REG(XD), REG(XD), TmmM))
426 | 
427 | /* svr (G = G >> S), (D = S >> T) if (#D != #T) - variable, unsigned
428 |  * for maximum compatibility: shift count must be modulo elem-size */
429 | 
430 | #define svrvx_rr(XG, XS)     /* variable shift with per-elem count */       \
431 |         svrvx3rr(W(XG), W(XG), W(XS))
432 | 
433 | #define svrvx_ld(XG, MS, DS) /* variable shift with per-elem count */       \
434 |         svrvx3ld(W(XG), W(XG), W(MS), W(DS))
435 | 
436 | #define svrvx3rr(XD, XS, XT) /* byte shift S->D, then bit shift D->D */     \
437 |         EMITW(0x1000020C | MXM(TmmM,    0x0F,    REG(XT)))                  \
438 |         EMITW(0x1000044C | MXM(REG(XD), REG(XS), TmmM))                     \
439 |         EMITW(0x100002C4 | MXM(REG(XD), REG(XD), TmmM))
440 | 
441 | #define svrvx3ld(XD, XS, MT, DT) /* byte shift S->D, then bit shift D->D */ \
442 |         AUW(SIB(MT),  EMPTY,  EMPTY,    MOD(MT), VAL(DT), C2(DT), EMPTY2)   \
443 |         EMITW(0x00000000 | MPM(TmmM,    MOD(MT), VAL(DT), B2(DT), P2(DT)))  \
444 |         EMITW(0x1000020C | MXM(TmmM,    0x0F,    TmmM))                     \
445 |         EMITW(0x1000044C | MXM(REG(XD), REG(XS), TmmM))                     \
446 |         EMITW(0x100002C4 | MXM(REG(XD), REG(XD), TmmM))
447 | 
448 | /******************************************************************************/
449 | /********************************   INTERNAL   ********************************/
450 | /******************************************************************************/
451 | 
452 | #endif /* RT_128X1 */
453 | 
454 | #endif /* RT_SIMD_CODE */
455 | 
456 | #endif /* RT_RTARCH_PQF_128X1V2_H */
457 | 
458 | /******************************************************************************/
459 | /******************************************************************************/
460 | /******************************************************************************/
461 | 


--------------------------------------------------------------------------------
/core/config/rtdocs.h:
--------------------------------------------------------------------------------
  1 | /******************************************************************************/
  2 | /* Copyright (c) 2013-2025 VectorChief (at github, bitbucket, sourceforge)    */
  3 | /* Distributed under the MIT software license, see the accompanying           */
  4 | /* file COPYING or http://www.opensource.org/licenses/mit-license.php         */
  5 | /******************************************************************************/
  6 | 
  7 | /******************************************************************************/
  8 | /*********************************   LEGEND   *********************************/
  9 | /******************************************************************************/
 10 | 
 11 | /*
 12 |  * rtdocs.h: Documentation on how to get started using the assembler.
 13 |  * Table of contents is provided below.
 14 |  *
 15 |  * Chapter 1 - Overview
 16 |  * Chapter 2 - Introduction
 17 |  * Chapter 3 - Application types
 18 |  * Chapter 4 - Initialization
 19 |  * Chapter 5 - Configuration
 20 |  *
 21 |  * It is recommended to read all chapters at least once
 22 |  * before getting started with the code.
 23 |  */
 24 | 
 25 | /******************************************************************************/
 26 | /**************************   CHAPTER 1 - OVERVIEW  ***************************/
 27 | /******************************************************************************/
 28 | 
 29 | /*
 30 |  * The general structure of the application using UniSIMD is given below.
 31 |  * It is usually a combination of standard C/C++ code with some inline assembler
 32 |  * parts. Something like this:
 33 |  *
 34 |  * void func(rt_SIMD_INFOX *inf)
 35 |  * {
 36 |  *     ASM_ENTER(inf)
 37 |  *     ..
 38 |  *     ASM_LEAVE(inf)
 39 |  * }
 40 |  *
 41 |  * The code above shows a C/C++ function with a parameter and an ASM code
 42 |  * section within it. The parameter is a pointer to a SIMD-aligned structure
 43 |  * that is used to pass all the data to the ASM section and back if needed.
 44 |  * The ASM section can read and write fields of that structure.
 45 |  *
 46 |  * There can be two types of instructions within the ASM section: BASE and SIMD.
 47 |  * UniSIMD also defines register sets that are common for all architectures
 48 |  * including variants of a single architecture.
 49 |  *
 50 |  * So, with UniSIMD there will always be: Reax/Rebx/Recx/... for BASE and
 51 |  * Xmm0/Xmm1/Xmm2/... for SIMD. However, these common definitions are then
 52 |  * mapped to actual architectural registers of ARM/MIPS/POWER and x86.
 53 |  *
 54 |  * Both BASE and SIMD register sizes depend on the type of instruction used.
 55 |  * UniSIMD defines a number of instruction subsets that are again common
 56 |  * across all architectures.
 57 |  *
 58 |  * Some instructions work with fixed register sizes, others have it configurable
 59 |  * with a flag. The full list of instruction subsets along with registers
 60 |  * and various addressing modes can be found in "core/config/rtzero.h",
 61 |  * while "test/simd_test.cpp" shows how these definitions can be used.
 62 |  * Refer to "c_test01" and "s_test01" for a start.
 63 |  *
 64 |  * Once the program is expressed with UniSIMD's syntax (C/C++ with ASM sections)
 65 |  * it can then be built for any supported architecture without having a need
 66 |  * to modify the source code again. Just pick the right makefile.
 67 |  */
 68 | 
 69 | /******************************************************************************/
 70 | /************************   CHAPTER 2 - INTRODUCTION  *************************/
 71 | /******************************************************************************/
 72 | 
 73 | /*
 74 |  * In order for UniSIMD to work as intended, application source files need
 75 |  * to do a few things first.
 76 |  *
 77 |  * #define RT_SIMD_CODE // enable SIMD instruction definitions
 78 |  * #define RT_DATA 8 // define data load-level for backend structures
 79 |  *
 80 |  * #include "rtbase.h" // include UniSIMD's base header after the 2 flags above
 81 |  *
 82 |  * As some SIMD widths (128/256/512-bit) are limited to specific CPU generations
 83 |  * (SSE/AVX/AVX-512) the use of SIMD is turned off by default and needs to be
 84 |  * explicitly enabled with a flag to make the generic ASM sections (no SIMD)
 85 |  * portable across generations. This is mostly relevant when implementing
 86 |  * runtime detection of SIMD and subsequent multi-targeting.
 87 |  *
 88 |  * Once SIMD instructions are enabled ASM sections will preserve/restore full
 89 |  * SIMD registers they are configured to work with. The maximal SIMD width
 90 |  * for a build is set in a makefile with RT_128=a/RT_256=b/RT_512=c/... and
 91 |  * is defined internally as Q (=1/2/4/...), while a/b/c define variant
 92 |  * within a given SIMD width.
 93 |  *
 94 |  * As UniSIMD needs to adjust for changing SIMD widths when working with
 95 |  * backend structures it needs to know how much they are filled.
 96 |  * The next (second) flag then defines the data load-level common for all
 97 |  * backend structures and ASM sections within its scope:
 98 |  *
 99 |  * 1 - means full DP-level (12-bit displacements) is filled or exceeded (Q=1).
100 |  * 2 - means 1/2  DP-level (11-bit displacements) has not been exceeded (Q=1).
101 |  * 4 - means 1/4  DP-level (10-bit displacements) has not been exceeded (Q=1).
102 |  * 8 - means 1/8  DP-level  (9-bit displacements) has not been exceeded (Q=1).
103 |  * 16  means 1/16 DP-level  (8-bit displacements) has not been exceeded (Q=1).
104 |  * NOTE: the built-in rt_SIMD_INFO structure is already filled at full 1/16th.
105 |  *
106 |  * The load-level is measured at Q equal to 1 and UniSIMD then adjusts internal
107 |  * displacement values as Q scales up.
108 |  *
109 |  * UniSIMD defines a lot of simple single-letter internal values which can
110 |  * interfere with program's own variables, especially when adding UniSIMD to
111 |  * an existing project. It is therefore recommended to use a separate file
112 |  * for ASM header and sections or adding them at the end of an existing file,
113 |  * while keeping function declarations to be used in the program at the top.
114 |  *
115 |  * All applications need to include a single root header with base types and
116 |  * definitions so that UniSIMD can do the rest of configuration based on
117 |  * makefile flags. Depending on where the source files are located makefile
118 |  * should specify a relative path to "core/config/" in order for UniSIMD headers
119 |  * to become available.
120 |  */
121 | 
122 | /******************************************************************************/
123 | /**********************   CHAPTER 3 - APPLICATION TYPES  **********************/
124 | /******************************************************************************/
125 | 
126 | /*
127 |  * There can be two types of applications written with UniSIMD - single-target
128 |  * and multi-target. In the first case the binary is configured and carries
129 |  * the code for just one target (CPU generation or SIMD width and variant).
130 |  * In the second case the binary carries multiple code sections for different
131 |  * targets (CPU generations or SIMD widths and variants).
132 |  *
133 |  * Some architectures like x86 allow for runtime target detection on
134 |  * the application level (user-space), others like ARM/MIPS/POWER only provide
135 |  * that information to an operating system (privileged), which makes producing
136 |  * multi-target binaries for those architectures a bit cumbersome as
137 |  * they become OS-specific.
138 |  *
139 |  * The test framework within UniSIMD is a single-target application, which means
140 |  * only one CPU generation or SIMD width and variant per build. However, its use
141 |  * of portable instruction subsets (cmdx*, cmdy* for BASE and cmdp*, cmds* for
142 |  * SIMD and scalar) allows it to configure the same code-base for many different
143 |  * targets and produce a separate binary for each target from a single source.
144 |  *
145 |  * Creating a proper multi-target binary requires use of C++ namespaces and
146 |  * additional source-level target files, which would then include the same
147 |  * portable code-base and wrap it into a target-specific namespace with a set of
148 |  * flags for that target. In addition to that a generic ASM section should
149 |  * determine the target at runtime and select appropriate code-path in a switch.
150 |  *
151 |  * A good example of how to build a multi-target binary with UniSIMD is provided
152 |  * in the QuadRay engine (core/tracer). In that example backend structures are
153 |  * always defined for the maximal SIMD width (Q internally) configured in
154 |  * makefiles (RT_128/RT_256/RT_512/...). However, the portable ASM code-base
155 |  * needs to be aware of the actual SIMD width selected at runtime
156 |  * and currently running.
157 |  *
158 |  * This is handled with a target-specific RT_SIMD_QUADS definition, which is
159 |  * expressed in the same terms as Q (1/2/4/8/16), but is different from Q as it
160 |  * always reflects the currently active SIMD width and not the maximal
161 |  * SIMD width defined for the build.
162 |  */
163 | 
164 | /******************************************************************************/
165 | /************************   CHAPTER 4 - INITIALIZATION  ***********************/
166 | /******************************************************************************/
167 | 
168 | /*
169 |  * Some emulated instructions within ASM sections rely on general purpose
170 |  * constants in rt_SIMD_INFO structure defined in "core/config/rtbase.h".
171 |  * They need to be initialized before the pointer to this structure is passed to
172 |  * the first ASM section and deinitialized after the last one. It's done with:
173 |  *
174 |  * ASM_INIT(inf, reg)
175 |  *
176 |  * ..
177 |  *
178 |  * ASM_DONE(inf)
179 |  *
180 |  * Here "reg" is a pointer to SIMD-aligned structure rt_SIMD_REGS intended
181 |  * to keep the state of all SIMD registers (from C/C++ code) while ASM section
182 |  * is doing some processing. It can be allocated separately or as a part of a
183 |  * larger combined "inf+reg" structure. In any case both pointers should end up
184 |  * SIMD-aligned (divisible by full SIMD-width they are pointing at in bytes).
185 |  *
186 |  * As was mentioned previously "inf" is a pointer to rt_SIMD_INFOX structure,
187 |  * which is usually an extension of rt_SIMD_INFO. The extension of the initial
188 |  * built-in rt_SIMD_INFO structure can be done with inheritance (in C++) or
189 |  * embedding (in C). This step (extension) is necessary in order to pass
190 |  * application-specific parameters into application-defined ASM sections,
191 |  * something that generic rt_SIMD_INFO cannot provide.
192 |  *
193 |  * Once "inf" pointer of initialized structure is passed to the ASM section
194 |  * it shows up as Rebp register and can be accessed via Mebp addressing mode
195 |  * with corresponding displacements (offsets) defined in rt_SIMD_INFO and
196 |  * rt_SIMD_INFOX (by extension).
197 |  *
198 |  * Potential future improvement is to use an array instead of structure to avoid
199 |  * possible paddings that compiler may introduce for its own needs (alignment),
200 |  * in which case some parts of the assembler will need to be redesigned.
201 |  * ASM_ENTER/LEAVE macros can be converted into just-in-time compilation along
202 |  * with EMITW/EMITH/EMITB/LBL to avoid possible compiler issues with inline ASM.
203 |  * The order of arithmetic and shifts within internal definitions can be
204 |  * hardened by using extra parentheses (in a form of round brackets).
205 |  *
206 |  * Right shifts on signed/unsigned data types in C/C++ are not guaranteed by
207 |  * the standard to produce arithmetic/logical shift instructions respectively,
208 |  * therefore some tests within SIMD test framework may need to be rewritten.
209 |  * Modern open-source compilers produce consistent results only with data sizes
210 |  * above 8-bit (char). With 8-bit signed/unsigned char ARM and POWER compilers
211 |  * show discrepancy in right shifts behavior relative to MIPS and x86.
212 |  *
213 |  * For every BASE register starting with R*** (like Rebx, Recx, Redx, ...)
214 |  * there is a corresponding addressing mode starting with M*** (like Mebx, Mecx,
215 |  * Medx, ...), which treats the register as a pointer and dereferences it with
216 |  * additional displacement (offset) given as a separate parameter to cmd**_ld/st
217 |  * instructions.
218 |  *
219 |  * The use of Reax is reserved for indexed addressing mode in the form of I***
220 |  * (like Iebx, Iecx, Iedx, ...) in which case the address is calculated as a sum
221 |  * of R*** + Reax + displacement, where R*** is the BASE register encoded in the
222 |  * addressing mode. Scaled indexed addressing modes are supported as J***, K***,
223 |  * L*** (with Reax), while S***, T***, U***, V*** accept any BASE register index
224 |  * maintaining the same built-in scaling factors 1x/2x/4x/8x respectively.
225 |  * Fully configurable N*** takes index register and scale (1,2,3) for 2x/4x/8x.
226 |  * Reax is also used for plain addressing mode (Oeax) without displacement
227 |  * in which case PLAIN is passed as a displacement to cmd**_ld/st instructions.
228 |  */
229 | 
230 | /******************************************************************************/
231 | /************************   CHAPTER 5 - CONFIGURATION  ************************/
232 | /******************************************************************************/
233 | 
234 | /*
235 |  * The initialization of SIMD fields within SIMD-aligned backend structures
236 |  * can be streamlined with RT_SIMD_SET(s, v) macros used from within C/C++ code.
237 |  * In this case "s" represents SIMD field (usually an array of elements) and "v"
238 |  * represents scalar value that is going to be replicated across all elements.
239 |  *
240 |  * The RT_SIMD_SET macro is the most generic form which is then mapped to
241 |  * target-specific form depending on the configured SIMD element size and
242 |  * the maximal SIMD width. The RT_SIMD_SET32 and RT_SIMD_SET64 always work with
243 |  * 32-bit and 64-bit SIMD elements respectively regardless of configuration,
244 |  * but they both still respect maximal SIMD width.
245 |  *
246 |  * The element size is configured with RT_ELEMENT=32/64 definition from within
247 |  * makefiles, while RT_ADDRESS, RT_POINTER define the respective address and
248 |  * pointer sizes. These definitions affect the size of configurable scalar and
249 |  * vector types used within backend structures and throughout C/C++ code.
250 |  * For example, rt_elem/rt_uelm, rt_real depend on RT_ELEMENT, rt_addr/rt_uadr
251 |  * depend on RT_ADDRESS, while rt_pntr/rt_uptr and rt_cell/rt_word depend on
252 |  * RT_POINTER (which is fixed for the chosen target and cannot be changed).
253 |  *
254 |  * In addition to already mentioned flags and definitions "core/config/rtbase.h"
255 |  * defines other useful constants, like R, T and S to configure the size of
256 |  * SIMD fields depending on the chosen SIMD element size and maximal SIMD width.
257 |  * Short names P, A and L represent RT_POINTER, RT_ADDRESS and RT_ELEMENT
258 |  * in base units: 1 for 32-bit, 2 for 64-bit.
259 |  *
260 |  * Note that logical cmdpx instructions are configured for floating-point
261 |  * SIMD pipeline on x86 where applicable, while logical cmdmx instructions
262 |  * are better suited for integer SIMD workloads.
263 |  *
264 |  * Constants like B/C/D/.../I define various displacement corrections for
265 |  * endianness, when C/C++ and ASM sections work on different data sizes packed
266 |  * within a single larger field.
267 |  *
268 |  * Similar to how displacements are defined and then passed to BASE and SIMD
269 |  * cmd**_ld/st instructions, immediate values of various sizes can be passed to
270 |  * BASE cmd**_ri/rj instructions. The assembler defines the following immediate
271 |  * and displacement types: IC/IB/IM/IG/IH/IV/IW as 7/8/12/15/16/31/32-bit values
272 |  * and DP/DE/DF/DG/DH/DV as 12/13/14/15/16/31-bit values respectively.
273 |  *
274 |  * Both displacement and immediate common types are defined in corresponding
275 |  * "core/config/rtarch_*32.h" files for each architecture individually.
276 |  * Displacements are then additionally scaled with Q and RT_DATA expressed via O
277 |  * definition in "core/config/rtbase.h". Immediate arguments only apply to BASE
278 |  * instructions and don't need any additional SIMD scaling. All displacement and
279 |  * immediate values are always unsigned within the assembler.
280 |  */
281 | 
282 | /******************************************************************************/
283 | /******************************************************************************/
284 | /******************************************************************************/
285 | 


--------------------------------------------------------------------------------
/test/build_cross.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Intended for x86_64 Linux build environment
 3 | # with many g++ cross-compilers installed (64-bit Ubuntu MATE 20.04 LTS tested)
 4 | # refer to individual makefiles for installation instructions
 5 | 
 6 | make -f simd_make_arm.mk build -j2
 7 | make -f simd_make_m32.mk build -j2
 8 | make -f simd_make_p32.mk build -j4
 9 | make -f simd_make_a64.mk build -j8
10 | make -f simd_make_m64.mk build -j8
11 | make -f simd_make_p64.mk build -j8
12 | 
13 | make -f simd_make_arm.mk strip
14 | make -f simd_make_m32.mk strip
15 | make -f simd_make_p32.mk strip
16 | make -f simd_make_a64.mk strip
17 | make -f simd_make_m64.mk strip
18 | make -f simd_make_p64.mk strip
19 | 


--------------------------------------------------------------------------------
/test/build_linux.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Intended for x86_64 Linux build environment
3 | # with native g++ compiler installed (64-bit Linux Mint 18 tested)
4 | # works on Ubuntu MATE 18.04/20.04 LTS (binaries aren't backward compatible)
5 | 
6 | make -f simd_make_x64.mk build -j8
7 | 
8 | make -f simd_make_x64.mk strip
9 | 


--------------------------------------------------------------------------------
/test/build_macM1.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Intended for AArch64 macOS build environment with Apple Silicon (M1 chip)
 3 | # with Command Line Tools installed (macOS BigSur/Monterey tested)
 4 | # build on the least recent OS as binaries aren't always backward compatible
 5 | 
 6 | make -f simd_make_a64.mk clang -j8
 7 | 
 8 | make -f simd_make_a64.mk macRD
 9 | 
10 | make -f simd_make_a64.mk macST
11 | 
12 | make -f simd_make_a64.mk macOS
13 | 


--------------------------------------------------------------------------------
/test/build_macOS.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Intended for x86_64 Mac OS X / OS X / macOS build environment
 3 | # with Command Line Tools installed (Mac OS X Lion / macOS High Sierra tested)
 4 | # build on the least recent OS as binaries aren't always backward compatible
 5 | 
 6 | make -f simd_make_x64.mk build -j8
 7 | 
 8 | make -f simd_make_x64.mk macRD
 9 | 
10 | make -f simd_make_x64.mk strip
11 | 
12 | make -f simd_make_x64.mk macOS
13 | 


--------------------------------------------------------------------------------
/test/build_multi.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Intended for x86_64 Linux build environment
 3 | # with native g++ multilib-compiler installed (64-bit Linux Mint 18 tested)
 4 | # refer to individual makefiles for installation instructions
 5 | 
 6 | make -f simd_make_x86.mk build -j4
 7 | make -f simd_make_x32.mk build
 8 | 
 9 | make -f simd_make_x86.mk strip
10 | make -f simd_make_x32.mk strip
11 | 


--------------------------------------------------------------------------------
/test/build_nokia.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Intended for scratchbox Linux build environment (32-bit Ubuntu 10.10 tested)
3 | # http://wiki.maemo.org/Documentation/Maemo_5_Final_SDK_Installation
4 | 
5 | make -f simd_make_arm.mk build_n900
6 | 
7 | make -f simd_make_arm.mk strip_n900
8 | 


--------------------------------------------------------------------------------
/test/build_raspi.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Intended for ARMv7 Linux build environment
3 | # with native g++ compiler installed (32-bit Raspbian 7 and 8 tested)
4 | 
5 | make -f simd_make_arm.mk build_rpiX -j4
6 | 
7 | make -f simd_make_arm.mk strip_rpiX
8 | 


--------------------------------------------------------------------------------
/test/build_win64.bat:
--------------------------------------------------------------------------------
1 | :: Intended for x86_64 Windows build environment
2 | :: with TDM64-GCC compiler installed (64-bit Windows 7 SP1, Windows 10 tested)
3 | 
4 | mingw32-make -f simd_make_w64.mk build -j4
5 | 
6 | mingw32-make -f simd_make_w64.mk strip
7 | 


--------------------------------------------------------------------------------
/test/clean_cross.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Intended for x86_64 Linux build environment
 3 | # with many g++ cross-compilers installed (64-bit Ubuntu MATE 20.04 LTS tested)
 4 | # refer to individual makefiles for installation instructions
 5 | 
 6 | make -f simd_make_arm.mk clean
 7 | make -f simd_make_m32.mk clean
 8 | make -f simd_make_p32.mk clean
 9 | make -f simd_make_a64.mk clean
10 | make -f simd_make_m64.mk clean
11 | make -f simd_make_p64.mk clean
12 | 


--------------------------------------------------------------------------------
/test/clean_linux.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Intended for x86_64 Linux build environment
3 | # with native g++ compiler installed (64-bit Linux Mint 18 tested)
4 | # works on Ubuntu MATE 18.04/20.04 LTS (binaries aren't backward compatible)
5 | 
6 | make -f simd_make_x64.mk clean
7 | 


--------------------------------------------------------------------------------
/test/clean_macM1.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Intended for AArch64 macOS build environment with Apple Silicon (M1 chip)
3 | # with Command Line Tools installed (macOS BigSur/Monterey tested)
4 | # build on the least recent OS as binaries aren't always backward compatible
5 | 
6 | make -f simd_make_a64.mk macRM
7 | 


--------------------------------------------------------------------------------
/test/clean_macOS.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Intended for x86_64 Mac OS X / OS X / macOS build environment
3 | # with Command Line Tools installed (Mac OS X Lion / macOS High Sierra tested)
4 | # build on the least recent OS as binaries aren't always backward compatible
5 | 
6 | make -f simd_make_x64.mk macRM
7 | 


--------------------------------------------------------------------------------
/test/clean_multi.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Intended for x86_64 Linux build environment
3 | # with native g++ multilib-compiler installed (64-bit Linux Mint 18 tested)
4 | # refer to individual makefiles for installation instructions
5 | 
6 | make -f simd_make_x86.mk clean
7 | make -f simd_make_x32.mk clean
8 | 


--------------------------------------------------------------------------------
/test/clean_nokia.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Intended for scratchbox Linux build environment (32-bit Ubuntu 10.10 tested)
3 | # http://wiki.maemo.org/Documentation/Maemo_5_Final_SDK_Installation
4 | 
5 | make -f simd_make_arm.mk clean_n900
6 | 


--------------------------------------------------------------------------------
/test/clean_raspi.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Intended for ARMv7 Linux build environment
3 | # with native g++ compiler installed (32-bit Raspbian 7 and 8 tested)
4 | 
5 | make -f simd_make_arm.mk clean_rpiX
6 | 


--------------------------------------------------------------------------------
/test/clean_win64.bat:
--------------------------------------------------------------------------------
1 | :: Intended for x86_64 Windows build environment
2 | :: with TDM64-GCC compiler installed (64-bit Windows 7 SP1 tested)
3 | 
4 | mingw32-make -f simd_make_w64.mk clean
5 | 


--------------------------------------------------------------------------------
/test/simd_make_a32.mk:
--------------------------------------------------------------------------------
 1 | 
 2 | INC_PATH =                              \
 3 |         -I../core/config/
 4 | 
 5 | SRC_LIST =                              \
 6 |         simd_test.cpp
 7 | 
 8 | LIB_PATH =
 9 | 
10 | LIB_LIST =                              \
11 |         -lm
12 | 
13 | 
14 | build: simd_test_a32
15 | 
16 | strip:
17 | 	aarch64-linux-gnu-strip simd_test.a32*
18 | 
19 | clean: # remove simd_test.a32* outputs; -f avoids an error when none exist
20 | 	rm -f simd_test.a32*
21 | 
22 | 
23 | simd_test_a32:
24 | 	aarch64-linux-gnu-g++ -O3 -g -static -mabi=ilp32 \
25 |         -DRT_LINUX -DRT_A32 -DRT_128=1 -DRT_DEBUG=0 \
26 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
27 |         ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a32
28 | 
29 | 
30 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted"
31 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run:
32 | # sudo apt-get update
33 | # (Ubuntu MATE is set up for an update without a need to edit the file)
34 | # (extended repositories "universe multiverse" are only needed for clang)
35 | #
36 | # Prerequisites for the build:
37 | # (cross-)compiler for AArch64 is installed and in the PATH variable.
38 | # sudo apt-get install make g++-aarch64-linux-gnu
39 | # (recent upstream g++-5-aarch64 series may not fully support ILP32 ABI)
40 | #
41 | # Compiling/running SIMD test:
42 | # make -f simd_make_a32.mk
43 | 
44 | # Clang native build should theoretically work too (not tested), use (replace):
45 | # clang++ (in place of ...-g++) on AArch64 host (Raspberry Pi 3/4)
46 | # sudo apt-get install clang
47 | 
48 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h.
49 | # The 128-bit 15-reg targets are supported for compatibility with x86/POWER.
50 | 
51 | # For 128-bit NEON build use (replace): RT_128=1            (30 SIMD registers)
52 | # For 128-bit ARMv8.2 build use (replace): RT_128=2 (adds new fp16 ops) (30 rs)
53 | # For 128-bit NEON build use (replace): RT_128=4            (15 SIMD registers)
54 | # For 128-bit ARMv8.2 build use (replace): RT_128=8 (adds new fp16 ops) (15 rs)
55 | # For 256-bit NEON build use (replace): RT_256=1            (15 SIMD reg-pairs)
56 | # For 256-bit ARMv8.2 build use (replace): RT_256=2 (adds new fp16 ops) (15 rp)
57 | 
58 | # For 256-bit  SVEx1 build use (replace): RT_256=4          (30 SIMD registers)
59 | # For 512-bit  SVEx2 build use (replace): RT_512=1          (15 SIMD reg-pairs)
60 | # For 512-bit  SVEx1 build use (replace): RT_512=4          (30 SIMD registers)
61 | # For 1024-bit SVEx2 build use (replace): RT_1K4=1          (15 SIMD reg-pairs)
62 | # For 1024-bit SVEx1 build use (replace): RT_1K4=4          (30 SIMD registers)
63 | # For 2048-bit SVEx2 build use (replace): RT_2K8_R8=1        (8 SIMD reg-pairs)
64 | # For 2048-bit SVEx1 build use (replace): RT_2K8_R8=4       (15 SIMD registers)
65 | # The last two slots are artificially reg-limited for compatibility with AVX512
66 | 
67 | # 32-bit ABI hasn't been fully tested yet due to lack of available libs,
68 | # check out 64/32-bit (ptr/adr) hybrid mode for 64-bit ABI in simd_make_a64.mk
69 | 


--------------------------------------------------------------------------------
/test/simd_make_a64.mk:
--------------------------------------------------------------------------------
  1 | 
  2 | INC_PATH =                              \
  3 |         -I../core/config/
  4 | 
  5 | SRC_LIST =                              \
  6 |         simd_test.cpp
  7 | 
  8 | LIB_PATH =
  9 | 
 10 | LIB_LIST =                              \
 11 |         -lm
 12 | 
 13 | 
 14 | build: build_a64 build_a64sve
 15 | clang: clang_a64 clang_a64sve
 16 | 
 17 | strip:
 18 | 	aarch64-linux-gnu-strip simd_test.a64*
 19 | 
 20 | clean: # remove simd_test.a64* outputs; -f avoids an error when none exist
 21 | 	rm -f simd_test.a64*
 22 | 
 23 | macOS:
 24 | 	mv simd_test.a64_32 simd_test.d64_32
 25 | 	mv simd_test.a64_64 simd_test.d64_64
 26 | 	mv simd_test.a64f32 simd_test.d64f32
 27 | 	mv simd_test.a64f64 simd_test.d64f64
 28 | 	mv simd_test.a64_32sve simd_test.d64_32sve
 29 | 	mv simd_test.a64_64sve simd_test.d64_64sve
 30 | 	mv simd_test.a64f32sve simd_test.d64f32sve
 31 | 	mv simd_test.a64f64sve simd_test.d64f64sve
 32 | 
 33 | macRD:
 34 | 	rm -fr simd_test.a64*.dSYM/
 35 | 
 36 | macST:
 37 | 	strip simd_test.a64*
 38 | 
 39 | macRM: # remove renamed simd_test.d64* files; -f avoids an error if absent
 40 | 	rm -f simd_test.d64*
 41 | 
 42 | 
 43 | build_a64: simd_test_a64_32 simd_test_a64_64 simd_test_a64f32 simd_test_a64f64
 44 | 
 45 | simd_test_a64_32:
 46 | 	aarch64-linux-gnu-g++ -O3 -g -static \
 47 |         -DRT_LINUX -DRT_A64 -DRT_128=1 -DRT_DEBUG=0 \
 48 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 49 |         ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64_32
 50 | 
 51 | simd_test_a64_64:
 52 | 	aarch64-linux-gnu-g++ -O3 -g -static \
 53 |         -DRT_LINUX -DRT_A64 -DRT_128=1 -DRT_DEBUG=0 \
 54 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 55 |         ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64_64
 56 | 
 57 | simd_test_a64f32:
 58 | 	aarch64-linux-gnu-g++ -O3 -g -static \
 59 |         -DRT_LINUX -DRT_A64 -DRT_256=1 -DRT_DEBUG=0 \
 60 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 61 |         ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64f32
 62 | 
 63 | simd_test_a64f64:
 64 | 	aarch64-linux-gnu-g++ -O3 -g -static \
 65 |         -DRT_LINUX -DRT_A64 -DRT_256=1 -DRT_DEBUG=0 \
 66 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 67 |         ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64f64
 68 | 
 69 | 
 70 | build_a64sve: simd_test_a64_32sve simd_test_a64_64sve \
 71 |               simd_test_a64f32sve simd_test_a64f64sve
 72 | 
 73 | simd_test_a64_32sve:
 74 | 	aarch64-linux-gnu-g++ -O3 -g -static \
 75 |         -DRT_LINUX -DRT_A64 -DRT_512=4 -DRT_DEBUG=0 \
 76 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 77 |         ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64_32sve
 78 | 
 79 | simd_test_a64_64sve:
 80 | 	aarch64-linux-gnu-g++ -O3 -g -static \
 81 |         -DRT_LINUX -DRT_A64 -DRT_512=4 -DRT_DEBUG=0 \
 82 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 83 |         ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64_64sve
 84 | 
 85 | simd_test_a64f32sve:
 86 | 	aarch64-linux-gnu-g++ -O3 -g -static \
 87 |         -DRT_LINUX -DRT_A64 -DRT_1K4=1 -DRT_DEBUG=0 \
 88 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 89 |         ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64f32sve
 90 | 
 91 | simd_test_a64f64sve:
 92 | 	aarch64-linux-gnu-g++ -O3 -g -static \
 93 |         -DRT_LINUX -DRT_A64 -DRT_1K4=1 -DRT_DEBUG=0 \
 94 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 95 |         ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.a64f64sve
 96 | 
 97 | 
 98 | clang_a64: simd_test.a64_32 simd_test.a64_64 simd_test.a64f32 simd_test.a64f64 # clang++ builds (RT_128=1 / RT_256=1)
 99 | 
100 | simd_test.a64_32: # target is the output file itself; 64/32-bit ptr/adr hybrid, 32-bit elements
101 | 	clang++ -O3 -g \
102 |         -DRT_LINUX -DRT_A64 -DRT_128=1 -DRT_DEBUG=0 \
103 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
104 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.a64_32
105 | 
106 | simd_test.a64_64: # 64/32-bit ptr/adr hybrid, 64-bit elements
107 | 	clang++ -O3 -g \
108 |         -DRT_LINUX -DRT_A64 -DRT_128=1 -DRT_DEBUG=0 \
109 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
110 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.a64_64
111 | 
112 | simd_test.a64f32: # full 64-bit addresses, 32-bit elements
113 | 	clang++ -O3 -g \
114 |         -DRT_LINUX -DRT_A64 -DRT_256=1 -DRT_DEBUG=0 \
115 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
116 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.a64f32
117 | 
118 | simd_test.a64f64: # full 64-bit addresses, 64-bit elements
119 | 	clang++ -O3 -g \
120 |         -DRT_LINUX -DRT_A64 -DRT_256=1 -DRT_DEBUG=0 \
121 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
122 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.a64f64
123 | 
124 | 
125 | clang_a64sve: simd_test.a64_32sve simd_test.a64_64sve \
126 |               simd_test.a64f32sve simd_test.a64f64sve # clang++ SVE builds (RT_512=4 / RT_1K4=1)
127 | 
128 | simd_test.a64_32sve: # target is the output file itself; 64/32-bit ptr/adr hybrid, 32-bit elements
129 | 	clang++ -O3 -g \
130 |         -DRT_LINUX -DRT_A64 -DRT_512=4 -DRT_DEBUG=0 \
131 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
132 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.a64_32sve
133 | 
134 | simd_test.a64_64sve: # 64/32-bit ptr/adr hybrid, 64-bit elements
135 | 	clang++ -O3 -g \
136 |         -DRT_LINUX -DRT_A64 -DRT_512=4 -DRT_DEBUG=0 \
137 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
138 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.a64_64sve
139 | 
140 | simd_test.a64f32sve: # full 64-bit addresses, 32-bit elements
141 | 	clang++ -O3 -g \
142 |         -DRT_LINUX -DRT_A64 -DRT_1K4=1 -DRT_DEBUG=0 \
143 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
144 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.a64f32sve
145 | 
146 | simd_test.a64f64sve: # full 64-bit addresses, 64-bit elements
147 | 	clang++ -O3 -g \
148 |         -DRT_LINUX -DRT_A64 -DRT_1K4=1 -DRT_DEBUG=0 \
149 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
150 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.a64f64sve
151 | 
152 | 
153 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted"
154 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run:
155 | # sudo apt-get update
156 | # (Ubuntu MATE is set up for an update without a need to edit the file)
157 | # (extended repositories "universe multiverse" are only needed for clang)
158 | #
159 | # Prerequisites for the build:
160 | # (cross-)compiler for AArch64 is installed and in the PATH variable.
161 | # sudo apt-get install make g++-aarch64-linux-gnu
162 | #
163 | # Prerequisites for emulation:
164 | # recent QEMU(-2.5) is installed or built from source and in the PATH variable.
165 | # SVE targets require QEMU 3.x.y (or 3.0.0 with sve-max-vq cpu property patch).
166 | # recent QEMU 4.x.y works well with SVE, but only 4.2.0 is good for all targets.
167 | # sudo apt-get install qemu-user
168 | #
169 | # Compiling/running SIMD test:
170 | # make -f simd_make_a64.mk
171 | # qemu-aarch64 -cpu cortex-a57 simd_test.a64_32 -c 1
172 | # qemu-aarch64 -cpu cortex-a57 simd_test.a64_64 -c 1
173 | # qemu-aarch64 -cpu cortex-a57 simd_test.a64f32 -c 1
174 | # qemu-aarch64 -cpu cortex-a57 simd_test.a64f64 -c 1
175 | # qemu-aarch64 -cpu max,sve-max-vq=1 simd_test.a64_32sve -c 1  (RT_128=2/*_RX=2)
176 | # qemu-aarch64 -cpu max,sve-max-vq=1 simd_test.a64_64sve -c 1  (RT_128=2/*_RX=2)
177 | # qemu-aarch64 -cpu max,sve-max-vq=1 simd_test.a64f32sve -c 1  (RT_256=2/*_RX=2)
178 | # qemu-aarch64 -cpu max,sve-max-vq=1 simd_test.a64f64sve -c 1  (RT_256=2/*_RX=2)
179 | # qemu-aarch64 -cpu max,sve-max-vq=2 simd_test.a64_32sve -c 1  (for RT_256=4)
180 | # qemu-aarch64 -cpu max,sve-max-vq=2 simd_test.a64_64sve -c 1  (for RT_256=4)
181 | # qemu-aarch64 -cpu max,sve-max-vq=2 simd_test.a64f32sve -c 1  (for RT_512=1)
182 | # qemu-aarch64 -cpu max,sve-max-vq=2 simd_test.a64f64sve -c 1  (for RT_512=1)
183 | # qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64_32sve -c 1  (for RT_512=4)
184 | # qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64_64sve -c 1  (for RT_512=4)
185 | # qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64f32sve -c 1  (for RT_1K4=1)
186 | # qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64f64sve -c 1  (for RT_1K4=1)
187 | # qemu-aarch64 -cpu max,sve-max-vq=8 simd_test.a64_32sve -c 1  (for RT_1K4=4)
188 | # qemu-aarch64 -cpu max,sve-max-vq=8 simd_test.a64_64sve -c 1  (for RT_1K4=4)
189 | # qemu-aarch64 -cpu max,sve-max-vq=8 simd_test.a64f32sve -c 1  (for RT_2K8_R8=1)
190 | # qemu-aarch64 -cpu max,sve-max-vq=8 simd_test.a64f64sve -c 1  (for RT_2K8_R8=1)
191 | # qemu-aarch64 -cpu max,sve-max-vq=16 simd_test.a64_32sve -c 1 (for RT_2K8_R8=4)
192 | # qemu-aarch64 -cpu max,sve-max-vq=16 simd_test.a64_64sve -c 1 (for RT_2K8_R8=4)
193 | # Use "-c 1" option to reduce test time when emulating with QEMU
194 | 
195 | # Clang native build works too (takes much longer prior to 3.8), use (replace):
196 | # clang++ (in place of ...-g++) on AArch64 host (Raspberry Pi 3/4)
197 | # sudo apt-get install clang
198 | 
199 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h.
200 | # The 128-bit 15-reg targets are supported for compatibility with x86/POWER.
201 | 
202 | # For 128-bit NEON build use (replace): RT_128=1            (30 SIMD registers)
203 | # For 128-bit ARMv8.2 build use (replace): RT_128=2 (adds new fp16 ops) (30 rs)
204 | # For 128-bit SVE2x1 build use (replace): RT_128=2 RT_128_RX=2 (X regs) (30 rs)
205 | # For 128-bit NEON build use (replace): RT_128=4            (15 SIMD registers)
206 | # For 128-bit ARMv8.2 build use (replace): RT_128=8 (adds new fp16 ops) (15 rs)
207 | # For 256-bit NEON build use (replace): RT_256=1            (15 SIMD reg-pairs)
208 | # For 256-bit ARMv8.2 build use (replace): RT_256=2 (adds new fp16 ops) (15 rp)
209 | # For 256-bit SVE2x2 build use (replace): RT_256=2 RT_256_RX=2 (X regs) (15 rp)
210 | 
211 | # For 256-bit  SVEx1 build use (replace): RT_256=4          (30 SIMD registers)
212 | # For 512-bit  SVEx2 build use (replace): RT_512=1          (15 SIMD reg-pairs)
213 | # For 512-bit  SVEx1 build use (replace): RT_512=4          (30 SIMD registers)
214 | # For 1024-bit SVEx2 build use (replace): RT_1K4=1          (15 SIMD reg-pairs)
215 | # For 1024-bit SVEx1 build use (replace): RT_1K4=4          (30 SIMD registers)
216 | # For 2048-bit SVEx2 build use (replace): RT_2K8_R8=1        (8 SIMD reg-pairs)
217 | # For 2048-bit SVEx1 build use (replace): RT_2K8_R8=4       (15 SIMD registers)
218 | # The last two slots are artificially reg-limited for compatibility with AVX512
219 | 
220 | # 64/32-bit (ptr/adr) hybrid mode is compatible with native 64-bit ABI,
221 | # use (replace): RT_ADDRESS=32, rename the binary to simd_test.a64_**
222 | # 64-bit packed SIMD mode (fp64/int64) is supported on 64-bit targets,
223 | # use (replace): RT_ELEMENT=64, rename the binary to simd_test.a64*64
224 | 


--------------------------------------------------------------------------------
/test/simd_make_arm.mk:
--------------------------------------------------------------------------------
  1 | 
  2 | INC_PATH =                              \
  3 |         -I../core/config/
  4 | 
  5 | SRC_LIST =                              \
  6 |         simd_test.cpp
  7 | 
  8 | LIB_PATH =
  9 | 
 10 | LIB_LIST =                              \
 11 |         -lm
 12 | 
 13 | .PHONY: build strip clean
 14 | build: simd_test_arm_v1 simd_test_arm_v2 # default goal: both generic ARMv7 variants
 15 | 
 16 | strip: # strips every simd_test.arm_v* binary in place
 17 | 	arm-linux-gnueabi-strip simd_test.arm_v*
 18 | 
 19 | clean: # -f: an already-clean tree is not an error
 20 | 	rm -f simd_test.arm_v*
 21 | 
 22 | 
 23 | simd_test_arm_v1: # soft-float (gnueabi) static build with RT_128=1
 24 | 	arm-linux-gnueabi-g++ -O3 -g -static -march=armv7-a -marm \
 25 |         -DRT_LINUX -DRT_ARM -DRT_128=1 -DRT_DEBUG=0 \
 26 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 27 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.arm_v1
 28 | 
 29 | simd_test_arm_v2: # same toolchain and flags, RT_128=2
 30 | 	arm-linux-gnueabi-g++ -O3 -g -static -march=armv7-a -marm \
 31 |         -DRT_LINUX -DRT_ARM -DRT_128=2 -DRT_DEBUG=0 \
 32 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 33 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.arm_v2
 34 | 
 35 | .PHONY: build_n900 strip_n900 clean_n900
 36 | build_n900: simd_test_arm_n900 # Nokia N900 variant only
 37 | 
 38 | strip_n900: # strips every simd_test.arm_n900* binary in place
 39 | 	arm-linux-gnueabi-strip simd_test.arm_n900*
 40 | 
 41 | clean_n900: # -f: an already-clean tree is not an error
 42 | 	rm -f simd_test.arm_n900*
 43 | 
 44 | 
 45 | simd_test_arm_n900: # same flags as simd_test_arm_v1 (RT_128=1), different output name
 46 | 	arm-linux-gnueabi-g++ -O3 -g -static -march=armv7-a -marm \
 47 |         -DRT_LINUX -DRT_ARM -DRT_128=1 -DRT_DEBUG=0 \
 48 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 49 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.arm_n900
 50 | 
 51 | .PHONY: build_rpiX strip_rpiX clean_rpiX
 52 | build_rpiX: simd_test_arm_rpi2 simd_test_arm_rpi3 # both Raspberry Pi variants
 53 | 
 54 | strip_rpiX: # hard-float (gnueabihf) strip, matching the rpi compiler
 55 | 	arm-linux-gnueabihf-strip simd_test.arm_rpi*
 56 | 
 57 | clean_rpiX: # -f: an already-clean tree is not an error
 58 | 	rm -f simd_test.arm_rpi*
 59 | 
 60 | 
 61 | simd_test_arm_rpi2: # hard-float (gnueabihf) static build with RT_128=2
 62 | 	arm-linux-gnueabihf-g++ -O3 -g -static -march=armv7-a -marm \
 63 |         -DRT_LINUX -DRT_ARM -DRT_128=2 -DRT_DEBUG=0 \
 64 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 65 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.arm_rpi2
 66 | 
 67 | simd_test_arm_rpi3: # same toolchain and flags, RT_128=4
 68 | 	arm-linux-gnueabihf-g++ -O3 -g -static -march=armv7-a -marm \
 69 |         -DRT_LINUX -DRT_ARM -DRT_128=4 -DRT_DEBUG=0 \
 70 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 71 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.arm_rpi3
 72 | 
 73 | 
 74 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted"
 75 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run:
 76 | # sudo apt-get update
 77 | # (Ubuntu MATE is set up for an update without a need to edit the file)
 78 | # (extended repositories "universe multiverse" are only needed for clang)
 79 | #
 80 | # Prerequisites for the build:
 81 | # (cross-)compiler for ARMv7 is installed and in the PATH variable.
 82 | # sudo apt-get install make g++-arm-linux-gnueabi
 83 | #
 84 | # Prerequisites for emulation:
 85 | # recent QEMU(-2.5) is installed or built from source and in the PATH variable.
 86 | # sudo apt-get install qemu-user
 87 | #
 88 | # Compiling/running SIMD test:
 89 | # make -f simd_make_arm.mk
 90 | # qemu-arm -cpu cortex-a8  simd_test.arm_v1 -c 1
 91 | # qemu-arm -cpu cortex-a15 simd_test.arm_v2 -c 1
 92 | # Use "-c 1" option to reduce test time when emulating with QEMU
 93 | 
 94 | # Clang native build works too (takes much longer prior to 3.8), use (replace):
 95 | # clang++ (in place of ...-g++) on ARMv7 host (Raspberry Pi 2)
 96 | # sudo apt-get install clang
 97 | 
 98 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h.
 99 | # Original legacy 32-bit ARMv7/x86 targets only support 8 SIMD registers.
100 | 
101 | # 1) Nokia N900, Maemo 5 scratchbox: "vanilla" (-DRT_128=1)  (8 SIMD registers)
102 | # 2) Raspberry Pi 2, Raspbian: arm-linux-gnueabihf-g++ -DRT_128=2 (8 SIMD regs)
103 | # 3) Raspberry Pi 3, Raspbian: arm-linux-gnueabihf-g++ -DRT_128=4 (8 SIMD regs)
104 | 


--------------------------------------------------------------------------------
/test/simd_make_m32.mk:
--------------------------------------------------------------------------------
 1 | 
 2 | INC_PATH =                              \
 3 |         -I../core/config/
 4 | 
 5 | SRC_LIST =                              \
 6 |         simd_test.cpp
 7 | 
 8 | LIB_PATH =
 9 | 
10 | LIB_LIST =                              \
11 |         -lm
12 | 
13 | .PHONY: build strip clean
14 | build: simd_test_m32Lr5 simd_test_m32Br5 # default goal: little- and big-endian binaries
15 | 
16 | strip: # the ? wildcard covers both the L and B endianness letters
17 | 	mips-mti-linux-gnu-strip simd_test.m32?r5*
18 | 
19 | clean: # -f: an already-clean tree is not an error
20 | 	rm -f simd_test.m32*
21 | 
22 | 
23 | simd_test_m32Lr5: # little-endian: -EL paired with RT_ENDIAN=0
24 | 	mips-mti-linux-gnu-g++ -O3 -g -static -EL -mips32r5 -mmsa -mnan=2008 \
25 |         -DRT_LINUX -DRT_M32 -DRT_128=1 -DRT_DEBUG=0 \
26 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
27 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.m32Lr5
28 | 
29 | simd_test_m32Br5: # big-endian: -EB paired with RT_ENDIAN=1
30 | 	mips-mti-linux-gnu-g++ -O3 -g -static -EB -mips32r5 -mmsa -mnan=2008 \
31 |         -DRT_LINUX -DRT_M32 -DRT_128=1 -DRT_DEBUG=0 \
32 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \
33 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.m32Br5
34 | 
35 | 
36 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted"
37 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run:
38 | # sudo apt-get update
39 | # (Ubuntu MATE is set up for an update without a need to edit the file)
40 | # (extended repositories "universe multiverse" are only needed for clang)
41 | #
42 | # Download and unpack MIPS toolchain:
43 | # https://codescape.mips.com/components/toolchain/2020.06-01/downloads.html
44 | #
45 | # Prerequisites for the build:
46 | # (cross-)compiler for MIPSr5+MSA is installed and in the PATH variable.
47 | # Codescape.GNU.Tools.Package.2020.06-01.for.MIPS.MTI.Linux.CentOS-6.x86_64
48 | # is unpacked and folder mips-mti-linux-gnu/2020.06-01/bin is added to PATH:
49 | # PATH=/home/ubuntu/Downloads/mips-mti-linux-gnu/2020.06-01/bin:$PATH
50 | # PATH=/home/ubuntu-mate/Downloads/mips-mti-linux-gnu/2020.06-01/bin:$PATH
51 | #
52 | # Prerequisites for emulation:
53 | # recent QEMU(-2.5) is installed or built from source and in the PATH variable.
54 | # standalone toolchain from 2020.06-01 comes with QEMU 4.1.0 for MIPS in PATH.
55 | # sudo apt-get install qemu-user make
56 | #
57 | # Compiling/running SIMD test:
58 | # make -f simd_make_m32.mk
59 | # qemu-mipsel -cpu P5600 simd_test.m32Lr5 -c 1
60 | # qemu-mips   -cpu P5600 simd_test.m32Br5 -c 1
61 | # Use "-c 1" option to reduce test time when emulating with QEMU
62 | 
63 | # Clang native build should theoretically work too (not tested), use (replace):
64 | # clang++ -O0 (in place of ...-g++ -O3) on MIPS32r5 host (P5600)
65 | # sudo apt-get install clang
66 | 
67 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h.
68 | # The 128-bit 15-reg targets are supported for compatibility with x86/POWER.
69 | 
70 | # For 128-bit SIMD build use (replace): RT_128=1            (30 SIMD registers)
71 | # For 128-bit SIMD build use (replace): RT_128=4            (15 SIMD registers)
72 | # For 256-bit SIMD build use (replace): RT_256=1            (15 SIMD reg-pairs)
73 | 


--------------------------------------------------------------------------------
/test/simd_make_m64.mk:
--------------------------------------------------------------------------------
  1 | 
  2 | INC_PATH =                              \
  3 |         -I../core/config/
  4 | 
  5 | SRC_LIST =                              \
  6 |         simd_test.cpp
  7 | 
  8 | LIB_PATH =
  9 | 
 10 | LIB_LIST =                              \
 11 |         -lm
 12 | 
 13 | .PHONY: build strip clean
 14 | build: build_le build_be # default goal: all little- and big-endian binaries
 15 | 
 16 | strip: # one pass per endianness suffix (Lr6, then Br6)
 17 | 	mips-mti-linux-gnu-strip simd_test.m64???Lr6
 18 | 	mips-mti-linux-gnu-strip simd_test.m64???Br6
 19 | 
 20 | clean: # -f: an already-clean tree is not an error
 21 | 	rm -f simd_test.m64*
 22 | 
 23 | 
 24 | build_le: simd_test_m64_32Lr6 simd_test_m64_64Lr6 \
 25 |           simd_test_m64f32Lr6 simd_test_m64f64Lr6 # little-endian set: -EL with RT_ENDIAN=0
 26 | 
 27 | simd_test_m64_32Lr6: # RT_128=1, 64/32-bit ptr/adr hybrid, 32-bit elements
 28 | 	mips-mti-linux-gnu-g++ -O3 -g -static -EL -mips64r6 -mmsa -mabi=64 \
 29 |         -DRT_LINUX -DRT_M64=6 -DRT_128=1 -DRT_DEBUG=0 \
 30 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 31 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.m64_32Lr6
 32 | 
 33 | simd_test_m64_64Lr6: # RT_128=1, 64/32-bit ptr/adr hybrid, 64-bit elements
 34 | 	mips-mti-linux-gnu-g++ -O3 -g -static -EL -mips64r6 -mmsa -mabi=64 \
 35 |         -DRT_LINUX -DRT_M64=6 -DRT_128=1 -DRT_DEBUG=0 \
 36 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 37 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.m64_64Lr6
 38 | 
 39 | simd_test_m64f32Lr6: # RT_256=1, full 64-bit addresses, 32-bit elements
 40 | 	mips-mti-linux-gnu-g++ -O3 -g -static -EL -mips64r6 -mmsa -mabi=64 \
 41 |         -DRT_LINUX -DRT_M64=6 -DRT_256=1 -DRT_DEBUG=0 \
 42 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 43 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.m64f32Lr6
 44 | 
 45 | simd_test_m64f64Lr6: # RT_256=1, full 64-bit addresses, 64-bit elements
 46 | 	mips-mti-linux-gnu-g++ -O3 -g -static -EL -mips64r6 -mmsa -mabi=64 \
 47 |         -DRT_LINUX -DRT_M64=6 -DRT_256=1 -DRT_DEBUG=0 \
 48 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 49 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.m64f64Lr6
 50 | 
 51 | 
 52 | build_be: simd_test_m64_32Br6 simd_test_m64_64Br6 \
 53 |           simd_test_m64f32Br6 simd_test_m64f64Br6 # big-endian set: -EB with RT_ENDIAN=1
 54 | 
 55 | simd_test_m64_32Br6: # RT_128=1, 64/32-bit ptr/adr hybrid, 32-bit elements
 56 | 	mips-mti-linux-gnu-g++ -O3 -g -static -EB -mips64r6 -mmsa -mabi=64 \
 57 |         -DRT_LINUX -DRT_M64=6 -DRT_128=1 -DRT_DEBUG=0 \
 58 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \
 59 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.m64_32Br6
 60 | 
 61 | simd_test_m64_64Br6: # RT_128=1, 64/32-bit ptr/adr hybrid, 64-bit elements
 62 | 	mips-mti-linux-gnu-g++ -O3 -g -static -EB -mips64r6 -mmsa -mabi=64 \
 63 |         -DRT_LINUX -DRT_M64=6 -DRT_128=1 -DRT_DEBUG=0 \
 64 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=1 \
 65 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.m64_64Br6
 66 | 
 67 | simd_test_m64f32Br6: # RT_256=1, full 64-bit addresses, 32-bit elements
 68 | 	mips-mti-linux-gnu-g++ -O3 -g -static -EB -mips64r6 -mmsa -mabi=64 \
 69 |         -DRT_LINUX -DRT_M64=6 -DRT_256=1 -DRT_DEBUG=0 \
 70 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \
 71 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.m64f32Br6
 72 | 
 73 | simd_test_m64f64Br6: # RT_256=1, full 64-bit addresses, 64-bit elements
 74 | 	mips-mti-linux-gnu-g++ -O3 -g -static -EB -mips64r6 -mmsa -mabi=64 \
 75 |         -DRT_LINUX -DRT_M64=6 -DRT_256=1 -DRT_DEBUG=0 \
 76 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=1 \
 77 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.m64f64Br6
 78 | 
 79 | 
 80 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted"
 81 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run:
 82 | # sudo apt-get update
 83 | # (Ubuntu MATE is set up for an update without a need to edit the file)
 84 | # (extended repositories "universe multiverse" are only needed for clang)
 85 | #
 86 | # Download and unpack MIPS toolchain:
 87 | # https://codescape.mips.com/components/toolchain/2020.06-01/downloads.html
 88 | #
 89 | # Prerequisites for the build:
 90 | # (cross-)compiler for MIPSr6+MSA is installed and in the PATH variable.
 91 | # Codescape.GNU.Tools.Package.2020.06-01.for.MIPS.MTI.Linux.CentOS-6.x86_64
 92 | # is unpacked and folder mips-mti-linux-gnu/2020.06-01/bin is added to PATH:
 93 | # PATH=/home/ubuntu/Downloads/mips-mti-linux-gnu/2020.06-01/bin:$PATH
 94 | # PATH=/home/ubuntu-mate/Downloads/mips-mti-linux-gnu/2020.06-01/bin:$PATH
 95 | #
 96 | # Starting from Ubuntu (MATE) 19.10 upstream (cross-)compiler supports MSA.
 97 | # sudo apt-get install make g++-mipsisa64r6el-linux-gnuabi64
 98 | # sudo apt-get install make g++-mipsisa64r6-linux-gnuabi64
 99 | # (replace mips-mti-linux-gnu with mipsisa64r6el-linux-gnuabi64 for LE)
100 | # (replace mips-mti-linux-gnu with mipsisa64r6-linux-gnuabi64 for BE)
101 | #
102 | # Prerequisites for emulation:
103 | # recent QEMU(-2.7) is installed or built from source and in the PATH variable.
104 | # standalone toolchain from 2020.06-01 comes with QEMU 4.1.0 for MIPS in PATH.
105 | # sudo apt-get install qemu-user make
106 | #
107 | # Compiling/running SIMD test:
108 | # make -f simd_make_m64.mk
109 | # qemu-mips64el -cpu I6400 simd_test.m64_32Lr6 -c 1
110 | # qemu-mips64el -cpu I6400 simd_test.m64_64Lr6 -c 1
111 | # qemu-mips64el -cpu I6400 simd_test.m64f32Lr6 -c 1
112 | # qemu-mips64el -cpu I6400 simd_test.m64f64Lr6 -c 1
113 | # qemu-mips64   -cpu I6400 simd_test.m64_32Br6 -c 1
114 | # qemu-mips64   -cpu I6400 simd_test.m64_64Br6 -c 1
115 | # qemu-mips64   -cpu I6400 simd_test.m64f32Br6 -c 1
116 | # qemu-mips64   -cpu I6400 simd_test.m64f64Br6 -c 1
117 | # Use "-c 1" option to reduce test time when emulating with QEMU
118 | 
119 | # Clang native build should theoretically work too (not tested), use (replace):
120 | # clang++ -O0 (in place of ...-g++ -O3) on MIPS64r6 host (I6400/P6600)
121 | # sudo apt-get install clang
122 | 
123 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h.
124 | # The 128-bit 15-reg targets are supported for compatibility with x86/POWER.
125 | 
126 | # For 128-bit SIMD build use (replace): RT_128=1            (30 SIMD registers)
127 | # For 128-bit SIMD build use (replace): RT_128=4            (15 SIMD registers)
128 | # For 256-bit SIMD build use (replace): RT_256=1            (15 SIMD reg-pairs)
129 | 
130 | # 64/32-bit (ptr/adr) hybrid mode is compatible with native 64-bit ABI,
131 | # use (replace): RT_ADDRESS=32, rename the binary to simd_test.m64_**
132 | # 64-bit packed SIMD mode (fp64/int64) is supported on 64-bit targets,
133 | # use (replace): RT_ELEMENT=64, rename the binary to simd_test.m64*64
134 | 


--------------------------------------------------------------------------------
/test/simd_make_p32.mk:
--------------------------------------------------------------------------------
 1 | 
 2 | INC_PATH =                              \
 3 |         -I../core/config/
 4 | 
 5 | SRC_LIST =                              \
 6 |         simd_test.cpp
 7 | 
 8 | LIB_PATH =
 9 | 
10 | LIB_LIST =                              \
11 |         -lm
12 | 
13 | .PHONY: build strip clean
14 | build: simd_test_p32Bg4 simd_test_p32Bp7 simd_test_p32Bp8 simd_test_p32Bp9 # default goal: all four variants
15 | 
16 | strip: # strips every simd_test.p32* binary in place
17 | 	powerpc-linux-gnu-strip simd_test.p32*
18 | 
19 | clean: # -f: an already-clean tree is not an error
20 | 	rm -f simd_test.p32*
21 | 
22 | 
23 | simd_test_p32Bg4: # 128-bit VMX build: RT_128=4 with RT_SIMD_COMPAT_VSX=0
24 | 	powerpc-linux-gnu-g++ -O3 -g -static -DRT_SIMD_COMPAT_VSX=0 \
25 |         -DRT_LINUX -DRT_P32 -DRT_128=4 -DRT_DEBUG=0 \
26 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \
27 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p32Bg4
28 | 
29 | simd_test_p32Bp7: # 128-bit VSX1 build: RT_128=1
30 | 	powerpc-linux-gnu-g++ -O3 -g -static \
31 |         -DRT_LINUX -DRT_P32 -DRT_128=1 -DRT_DEBUG=0 \
32 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \
33 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p32Bp7
34 | 
35 | simd_test_p32Bp8: # 128-bit VSX2 build: RT_128=1 with RT_SIMD_COMPAT_PW8=1
36 | 	powerpc-linux-gnu-g++ -O3 -g -static -DRT_SIMD_COMPAT_PW8=1 \
37 |         -DRT_LINUX -DRT_P32 -DRT_128=1 -DRT_DEBUG=0 \
38 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \
39 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p32Bp8
40 | 
41 | simd_test_p32Bp9: # 128-bit VSX3 build: RT_128=2
42 | 	powerpc-linux-gnu-g++ -O3 -g -static \
43 |         -DRT_LINUX -DRT_P32 -DRT_128=2 -DRT_DEBUG=0 \
44 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \
45 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p32Bp9
46 | 
47 | 
48 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted"
49 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run:
50 | # sudo apt-get update
51 | # (Ubuntu MATE is set up for an update without a need to edit the file)
52 | # (extended repositories "universe multiverse" are only needed for clang)
53 | #
54 | # Prerequisites for the build:
55 | # (cross-)compiler for PowerPC is installed and in the PATH variable.
56 | # sudo apt-get install make g++-powerpc-linux-gnu
57 | #
58 | # Prerequisites for emulation:
59 | # recent QEMU(-2.5) is installed or built from source and in the PATH variable.
60 | # POWER9 target requires more recent QEMU, tested with 3.x.y series and 4.2.0.
61 | # QEMU versions 4.x.y prior to 4.2.0 show issues with POWER8/9 fp32 LE targets.
62 | # sudo apt-get install qemu-user
63 | #
64 | # Compiling/running SIMD test:
65 | # make -f simd_make_p32.mk
66 | # qemu-ppc        -cpu G4     simd_test.p32Bg4 -c 1
67 | # qemu-ppc64abi32 -cpu POWER7 simd_test.p32Bp7 -c 1
68 | # qemu-ppc64abi32 -cpu POWER8 simd_test.p32Bp8 -c 1
69 | # qemu-ppc64abi32 -cpu POWER9 simd_test.p32Bp9 -c 1
70 | # Use "-c 1" option to reduce test time when emulating with QEMU
71 | 
72 | # Clang native build should theoretically work too (not tested), use (replace):
73 | # clang++ -O0 (in place of ...-g++ -O3) on PowerPC host (G4)
74 | # sudo apt-get install clang
75 | 
76 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h.
77 | # The RT_SIMD_COMPAT_PW8=1 flag below is redundant when building in LE mode.
78 | 
79 | # For 128-bit VSX1 build use (replace): RT_128=1            (30 SIMD registers)
80 | # For 128-bit VSX2 build use (replace): RT_128=1 RT_SIMD_COMPAT_PW8=1 (30 regs)
81 | # For 128-bit VSX3 build use (replace): RT_128=2            (30 SIMD registers)
82 | # For 128-bit VMX  build use (replace): RT_128=4 RT_SIMD_COMPAT_VSX=0 (15 regs)
83 | 
84 | # For 256-bit VMX  build use (replace): RT_256_R8=4 RT_SIMD_COMPAT_VSX=0 (8 rp)
85 | # For 256-bit VSX1 build use (replace): RT_256=1            (15 SIMD reg-pairs)
86 | # For 256-bit VSX2 build use (replace): RT_256=1 RT_SIMD_COMPAT_PW8=1   (15 rp)
87 | # For 256-bit VSX3 build use (replace): RT_256=2            (15 SIMD reg-pairs)
88 | # For 256-bit VSX1 build use (replace): RT_256=4 (<=test29) (30 SIMD reg-pairs)
89 | # For 256-bit VSX2 build use (replace): RT_256=4 RT_SIMD_COMPAT_PW8=1   (30 rp)
90 | # For 256-bit VSX3 build use (replace): RT_256=8 (<=test29) (30 SIMD reg-pairs)
91 | 
92 | # For 512-bit VSX1 build use (replace): RT_512=1 (<=test29) (15 SIMD reg-quads)
93 | # For 512-bit VSX2 build use (replace): RT_512=1 RT_SIMD_COMPAT_PW8=1   (15 rq)
94 | # For 512-bit VSX3 build use (replace): RT_512=2 (<=test29) (15 SIMD reg-quads)
95 | 


--------------------------------------------------------------------------------
/test/simd_make_p64.mk:
--------------------------------------------------------------------------------
  1 | 
  2 | INC_PATH =                              \
  3 |         -I../core/config/
  4 | 
  5 | SRC_LIST =                              \
  6 |         simd_test.cpp
  7 | 
  8 | LIB_PATH =
  9 | 
 10 | LIB_LIST =                              \
 11 |         -lm
 12 | 
 13 | .PHONY: build strip clean
 14 | build: build_p9 build_le build_be # default goal; build_pX is not included here
 15 | 
 16 | strip: # powerpc64le tool for the ...L* binaries, powerpc64 tool for the ...B* ones
 17 | 	powerpc64le-linux-gnu-strip simd_test.p64???L*
 18 | 	powerpc64-linux-gnu-strip simd_test.p64???B*
 19 | 
 20 | clean: # -f: an already-clean tree is not an error
 21 | 	rm -f simd_test.p64*
 22 | 
 23 | 
 24 | # using -mcpu=power8 for power9 targets is a workaround for QEMU 6.2.0 bug
 25 | # https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/2011832
 26 | 
 27 | build_p9: simd_test_p64_32Lp9 simd_test_p64_64Lp9 \
 28 |           simd_test_p64f32Lp9 simd_test_p64f64Lp9 # little-endian RT_128=2 / RT_256=2 set
 29 | 
 30 | simd_test_p64_32Lp9: # RT_128=2, 64/32-bit ptr/adr hybrid, 32-bit elements
 31 | 	powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \
 32 |         -DRT_LINUX -DRT_P64 -DRT_128=2 -DRT_DEBUG=0 \
 33 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 34 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64_32Lp9
 35 | 
 36 | simd_test_p64_64Lp9: # RT_128=2, 64/32-bit ptr/adr hybrid, 64-bit elements
 37 | 	powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \
 38 |         -DRT_LINUX -DRT_P64 -DRT_128=2 -DRT_DEBUG=0 \
 39 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 40 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64_64Lp9
 41 | 
 42 | simd_test_p64f32Lp9: # RT_256=2, full 64-bit addresses, 32-bit elements
 43 | 	powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \
 44 |         -DRT_LINUX -DRT_P64 -DRT_256=2 -DRT_DEBUG=0 \
 45 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 46 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64f32Lp9
 47 | 
 48 | simd_test_p64f64Lp9: # RT_256=2, full 64-bit addresses, 64-bit elements
 49 | 	powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \
 50 |         -DRT_LINUX -DRT_P64 -DRT_256=2 -DRT_DEBUG=0 \
 51 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 52 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64f64Lp9
 53 | 
 54 | 
 55 | build_pX: simd_test_p64_32LpX simd_test_p64_64LpX \
 56 |           simd_test_p64f32LpX simd_test_p64f64LpX # little-endian RT_256=8 / RT_512=2 set, compiled at -O0
 57 | 
 58 | simd_test_p64_32LpX: # RT_256=8, 64/32-bit ptr/adr hybrid, 32-bit elements
 59 | 	powerpc64le-linux-gnu-g++ -O0 -g -static \
 60 |         -DRT_LINUX -DRT_P64 -DRT_256=8 -DRT_DEBUG=0 \
 61 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 62 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64_32LpX
 63 | 
 64 | simd_test_p64_64LpX: # RT_256=8, 64/32-bit ptr/adr hybrid, 64-bit elements
 65 | 	powerpc64le-linux-gnu-g++ -O0 -g -static \
 66 |         -DRT_LINUX -DRT_P64 -DRT_256=8 -DRT_DEBUG=0 \
 67 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 68 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64_64LpX
 69 | 
 70 | simd_test_p64f32LpX: # RT_512=2, full 64-bit addresses, 32-bit elements
 71 | 	powerpc64le-linux-gnu-g++ -O0 -g -static \
 72 |         -DRT_LINUX -DRT_P64 -DRT_512=2 -DRT_DEBUG=0 \
 73 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 74 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64f32LpX
 75 | 
 76 | simd_test_p64f64LpX: # RT_512=2, full 64-bit addresses, 64-bit elements
 77 | 	powerpc64le-linux-gnu-g++ -O0 -g -static \
 78 |         -DRT_LINUX -DRT_P64 -DRT_512=2 -DRT_DEBUG=0 \
 79 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 80 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64f64LpX
 81 | 
 82 | 
 83 | build_le: simd_test_p64_32Lp8 simd_test_p64_64Lp8 \
 84 |           simd_test_p64f32Lp8 simd_test_p64f64Lp8 # little-endian RT_128=1 / RT_256=1 set, -mcpu=power8
 85 | 
 86 | simd_test_p64_32Lp8: # RT_128=1, 64/32-bit ptr/adr hybrid, 32-bit elements
 87 | 	powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \
 88 |         -DRT_LINUX -DRT_P64 -DRT_128=1 -DRT_DEBUG=0 \
 89 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 90 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64_32Lp8
 91 | 
 92 | simd_test_p64_64Lp8: # RT_128=1, 64/32-bit ptr/adr hybrid, 64-bit elements
 93 | 	powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \
 94 |         -DRT_LINUX -DRT_P64 -DRT_128=1 -DRT_DEBUG=0 \
 95 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 96 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64_64Lp8
 97 | 
 98 | simd_test_p64f32Lp8: # RT_256=1, full 64-bit addresses, 32-bit elements
 99 | 	powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \
100 |         -DRT_LINUX -DRT_P64 -DRT_256=1 -DRT_DEBUG=0 \
101 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
102 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64f32Lp8
103 | 
104 | simd_test_p64f64Lp8: # RT_256=1, full 64-bit addresses, 64-bit elements
105 | 	powerpc64le-linux-gnu-g++ -O2 -g -static -mcpu=power8 \
106 |         -DRT_LINUX -DRT_P64 -DRT_256=1 -DRT_DEBUG=0 \
107 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
108 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64f64Lp8
109 | 
110 | 
111 | build_be: simd_test_p64_32Bp7 simd_test_p64_64Bp7 \
112 |           simd_test_p64f32Bp7 simd_test_p64f64Bp7 # big-endian RT_128=1 / RT_256=1 set (RT_ENDIAN=1)
113 | 
114 | simd_test_p64_32Bp7: # RT_128=1, 64/32-bit ptr/adr hybrid, 32-bit elements
115 | 	powerpc64-linux-gnu-g++ -O2 -g -static \
116 |         -DRT_LINUX -DRT_P64 -DRT_128=1 -DRT_DEBUG=0 \
117 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \
118 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64_32Bp7
119 | 
120 | simd_test_p64_64Bp7: # RT_128=1, 64/32-bit ptr/adr hybrid, 64-bit elements
121 | 	powerpc64-linux-gnu-g++ -O2 -g -static \
122 |         -DRT_LINUX -DRT_P64 -DRT_128=1 -DRT_DEBUG=0 \
123 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=1 \
124 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64_64Bp7
125 | 
126 | simd_test_p64f32Bp7: # RT_256=1, full 64-bit addresses, 32-bit elements
127 | 	powerpc64-linux-gnu-g++ -O2 -g -static \
128 |         -DRT_LINUX -DRT_P64 -DRT_256=1 -DRT_DEBUG=0 \
129 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=1 \
130 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64f32Bp7
131 | 
132 | simd_test_p64f64Bp7: # RT_256=1, full 64-bit addresses, 64-bit elements
133 | 	powerpc64-linux-gnu-g++ -O2 -g -static \
134 |         -DRT_LINUX -DRT_P64 -DRT_256=1 -DRT_DEBUG=0 \
135 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=1 \
136 |         $(INC_PATH) $(SRC_LIST) $(LIB_PATH) $(LIB_LIST) -o simd_test.p64f64Bp7
137 | 
138 | 
139 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted"
140 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run:
141 | # sudo apt-get update
142 | # (Ubuntu MATE is set up for an update without a need to edit the file)
143 | # (extended repositories "universe multiverse" are only needed for clang)
144 | #
145 | # Prerequisites for the build:
146 | # (cross-)compiler for 64-bit POWER is installed and in the PATH variable.
147 | # sudo apt-get install make g++-powerpc64le-linux-gnu
148 | # sudo apt-get install make g++-powerpc64-linux-gnu
149 | # (recent g++-5-powerpc64le series target POWER8 and don't work well with -O3)
150 | #
151 | # Prerequisites for emulation:
152 | # recent QEMU(-2.5) is installed or built from source and in the PATH variable.
153 | # POWER9 target requires more recent QEMU, tested with 3.x.y series and 4.2.0.
154 | # QEMU versions 4.x.y prior to 4.2.0 show issues with POWER8/9 fp32 LE targets.
155 | # sudo apt-get install qemu-user
156 | #
157 | # Compiling/running SIMD test:
158 | # make -f simd_make_p64.mk
159 | # qemu-ppc64le -cpu POWER9 simd_test.p64_32Lp9 -c 1
160 | # qemu-ppc64le -cpu POWER9 simd_test.p64_64Lp9 -c 1
161 | # qemu-ppc64le -cpu POWER9 simd_test.p64f32Lp9 -c 1
162 | # qemu-ppc64le -cpu POWER9 simd_test.p64f64Lp9 -c 1
163 | # qemu-ppc64le -cpu POWER9 simd_test.p64_32LpX -c 1
164 | # qemu-ppc64le -cpu POWER9 simd_test.p64_64LpX -c 1
165 | # qemu-ppc64le -cpu POWER9 simd_test.p64f32LpX -c 1
166 | # qemu-ppc64le -cpu POWER9 simd_test.p64f64LpX -c 1
167 | # qemu-ppc64le -cpu POWER8 simd_test.p64_32Lp8 -c 1 (use POWER9 on Ubuntu 22.04)
168 | # qemu-ppc64le -cpu POWER8 simd_test.p64_64Lp8 -c 1 (use POWER9 on Ubuntu 22.04)
169 | # qemu-ppc64le -cpu POWER8 simd_test.p64f32Lp8 -c 1 (use POWER9 on Ubuntu 22.04)
170 | # qemu-ppc64le -cpu POWER8 simd_test.p64f64Lp8 -c 1 (use POWER9 on Ubuntu 22.04)
171 | # qemu-ppc64   -cpu POWER7 simd_test.p64_32Bp7 -c 1
172 | # qemu-ppc64   -cpu POWER7 simd_test.p64_64Bp7 -c 1
173 | # qemu-ppc64   -cpu POWER7 simd_test.p64f32Bp7 -c 1
174 | # qemu-ppc64   -cpu POWER7 simd_test.p64f64Bp7 -c 1
175 | # Use "-c 1" option to reduce test time when emulating with QEMU
176 | 
177 | # Clang native build works too (takes much longer prior to 3.8), use (replace):
178 | # clang++ -O0 (in place of ...-g++ -O2) on 64-bit POWER host (Tyan TN71-BP012)
179 | # sudo apt-get install clang
180 | 
181 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h.
182 | # The RT_SIMD_COMPAT_PW8=1 flag below is redundant when building in LE mode.
183 | 
184 | # For 128-bit VSX1 build use (replace): RT_128=1            (30 SIMD registers)
185 | # For 128-bit VSX2 build use (replace): RT_128=1 RT_SIMD_COMPAT_PW8=1 (30 regs)
186 | # For 128-bit VSX3 build use (replace): RT_128=2            (30 SIMD registers)
187 | # For 128-bit VMX  build use (replace): RT_128=4 RT_SIMD_COMPAT_VSX=0 (15 regs)
188 | 
189 | # For 256-bit VMX  build use (replace): RT_256_R8=4 RT_SIMD_COMPAT_VSX=0 (8 rp)
190 | # For 256-bit VSX1 build use (replace): RT_256=1            (15 SIMD reg-pairs)
191 | # For 256-bit VSX2 build use (replace): RT_256=1 RT_SIMD_COMPAT_PW8=1   (15 rp)
192 | # For 256-bit VSX3 build use (replace): RT_256=2            (15 SIMD reg-pairs)
193 | # For 256-bit VSX1 build use (replace): RT_256=4 (<=test29) (30 SIMD reg-pairs)
194 | # For 256-bit VSX2 build use (replace): RT_256=4 RT_SIMD_COMPAT_PW8=1   (30 rp)
195 | # For 256-bit VSX3 build use (replace): RT_256=8 (<=test29) (30 SIMD reg-pairs)
196 | 
197 | # For 512-bit VSX1 build use (replace): RT_512=1 (<=test29) (15 SIMD reg-quads)
198 | # For 512-bit VSX2 build use (replace): RT_512=1 RT_SIMD_COMPAT_PW8=1   (15 rq)
199 | # For 512-bit VSX3 build use (replace): RT_512=2 (<=test29) (15 SIMD reg-quads)
200 | 
201 | # 64/32-bit (ptr/adr) hybrid mode is compatible with native 64-bit ABI,
202 | # use (replace): RT_ADDRESS=32, rename the binary to simd_test.p64_**
203 | # 64-bit packed SIMD mode (fp64/int64) is supported on 64-bit targets,
204 | # use (replace): RT_ELEMENT=64, rename the binary to simd_test.p64*64
205 | 


--------------------------------------------------------------------------------
/test/simd_make_w64.bat:
--------------------------------------------------------------------------------
1 | mingw32-make -f simd_make_w64.mk -j4
2 | simd_test_w64f32.exe
3 | 


--------------------------------------------------------------------------------
/test/simd_make_w64.mk:
--------------------------------------------------------------------------------
  1 | 
  2 | INC_PATH =                              \
  3 |         -I../core/config/
  4 | 
  5 | SRC_LIST =                              \
  6 |         simd_test.cpp
  7 | 
  8 | LIB_PATH =
  9 | 
 10 | LIB_LIST =                              \
 11 |         -lm
 12 | 
 13 | 
 14 | build: build_w64 build_w64avx build_w64avx512  # default goal: builds all three target groups
 15 | 
 16 | strip:  # strip symbols from the built executables
 17 | 	strip simd_test_w64*.exe
 18 | 
 19 | clean:  # delete the built executables (uses the Windows "del" command)
 20 | 	del simd_test_w64*.exe
 21 | 
 22 | 
 23 | build_w64: simd_test_w64_32 simd_test_w64_64 simd_test_w64f32 simd_test_w64f64  # SSE2/SSE4 builds
 24 | 
 25 | simd_test_w64_32:  # 128-bit SSE2 (RT_128=4 + RT_SIMD_COMPAT_SSE=2), 32-bit addresses, 32-bit elements
 26 | 	g++ -O3 -g -static -m64 \
 27 |         -DRT_WIN64 -DRT_X64 -DRT_128=4 -DRT_SIMD_COMPAT_SSE=2 -DRT_DEBUG=0 \
 28 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 29 |   ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64_32.exe
 30 | 
 31 | simd_test_w64_64:  # 128-bit SSE2 (RT_128=4 + RT_SIMD_COMPAT_SSE=2), 32-bit addresses, 64-bit elements
 32 | 	g++ -O3 -g -static -m64 \
 33 |         -DRT_WIN64 -DRT_X64 -DRT_128=4 -DRT_SIMD_COMPAT_SSE=2 -DRT_DEBUG=0 \
 34 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 35 |   ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64_64.exe
 36 | 
 37 | simd_test_w64f32:  # 256-bit SSE4 (RT_256_R8=4, 8 reg-pairs), 64-bit addresses, 32-bit elements
 38 | 	g++ -O3 -g -static -m64 \
 39 |         -DRT_WIN64 -DRT_X64 -DRT_256_R8=4 -DRT_DEBUG=0 \
 40 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 41 |   ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64f32.exe
 42 | 
 43 | simd_test_w64f64:  # 256-bit SSE4 (RT_256_R8=4, 8 reg-pairs), 64-bit addresses, 64-bit elements
 44 | 	g++ -O3 -g -static -m64 \
 45 |         -DRT_WIN64 -DRT_X64 -DRT_256_R8=4 -DRT_DEBUG=0 \
 46 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 47 |   ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64f64.exe
 48 | 
 49 | 
 50 | build_w64avx: simd_test_w64_32avx simd_test_w64_64avx \
 51 |               simd_test_w64f32avx simd_test_w64f64avx  # 256-bit AVX1/AVX2 builds
 52 | 
 53 | simd_test_w64_32avx:  # 256-bit AVX1 (RT_256=1), 32-bit addresses, 32-bit elements
 54 | 	g++ -O3 -g -static -m64 \
 55 |         -DRT_WIN64 -DRT_X64 -DRT_256=1 -DRT_DEBUG=0 \
 56 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 57 |   ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64_32avx.exe
 58 | 
 59 | simd_test_w64_64avx:  # 256-bit AVX1 (RT_256=1), 32-bit addresses, 64-bit elements
 60 | 	g++ -O3 -g -static -m64 \
 61 |         -DRT_WIN64 -DRT_X64 -DRT_256=1 -DRT_DEBUG=0 \
 62 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 63 |   ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64_64avx.exe
 64 | 
 65 | simd_test_w64f32avx:  # 256-bit AVX2 (RT_256=2), 64-bit addresses, 32-bit elements
 66 | 	g++ -O3 -g -static -m64 \
 67 |         -DRT_WIN64 -DRT_X64 -DRT_256=2 -DRT_DEBUG=0 \
 68 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 69 |   ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64f32avx.exe
 70 | 
 71 | simd_test_w64f64avx:  # 256-bit AVX2 (RT_256=2), 64-bit addresses, 64-bit elements
 72 | 	g++ -O3 -g -static -m64 \
 73 |         -DRT_WIN64 -DRT_X64 -DRT_256=2 -DRT_DEBUG=0 \
 74 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 75 |   ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64f64avx.exe
 76 | 
 77 | 
 78 | build_w64avx512: simd_test_w64_32avx512 simd_test_w64_64avx512 \
 79 |                  simd_test_w64f32avx512 simd_test_w64f64avx512  # 512-bit AVX512F/AVX512DQ builds
 80 | 
 81 | simd_test_w64_32avx512:  # 512-bit AVX512F (RT_512=1), 32-bit addresses, 32-bit elements
 82 | 	g++ -O3 -g -static -m64 \
 83 |         -DRT_WIN64 -DRT_X64 -DRT_512=1 -DRT_DEBUG=0 \
 84 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 85 |   ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64_32avx512.exe
 86 | 
 87 | simd_test_w64_64avx512:  # 512-bit AVX512F (RT_512=1), 32-bit addresses, 64-bit elements
 88 | 	g++ -O3 -g -static -m64 \
 89 |         -DRT_WIN64 -DRT_X64 -DRT_512=1 -DRT_DEBUG=0 \
 90 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 91 |   ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64_64avx512.exe
 92 | 
 93 | simd_test_w64f32avx512:  # 512-bit AVX512DQ (RT_512=2), 64-bit addresses, 32-bit elements
 94 | 	g++ -O3 -g -static -m64 \
 95 |         -DRT_WIN64 -DRT_X64 -DRT_512=2 -DRT_DEBUG=0 \
 96 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 97 |   ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64f32avx512.exe
 98 | 
 99 | simd_test_w64f64avx512:  # 512-bit AVX512DQ (RT_512=2), 64-bit addresses, 64-bit elements
100 | 	g++ -O3 -g -static -m64 \
101 |         -DRT_WIN64 -DRT_X64 -DRT_512=2 -DRT_DEBUG=0 \
102 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
103 |   ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test_w64f64avx512.exe
104 | 
105 | 
106 | # Prerequisites for the build:
107 | # TDM64-GCC compiler for Win32/64 is installed and in the PATH variable.
108 | # Download tdm64-gcc-5.1.0-2.exe from sourceforge and run the installer.
109 | # Alternatively download and install tdm64-gcc-10.3.0-2.exe from github.
110 | #
111 | # Compiling/running SIMD test:
112 | # run simd_make_w64.bat from Windows Explorer or
113 | # run the following from Command Prompt "cmd":
114 | # mingw32-make -f simd_make_w64.mk
115 | # simd_test_w64f32.exe
116 | # simd_test_w64f32avx.exe
117 | # simd_test_w64f32avx512.exe
118 | # Use "-c 1" option to reduce test time when emulating with Intel SDE
119 | 
120 | # Clang native build should theoretically work too (not tested): once clang
121 | # for Windows is installed and in the PATH variable, use (replace) clang++
122 | # (in place of g++); note that clang on Windows may require Visual Studio.
123 | 
124 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h.
125 | # The 30-reg targets on top of AVX1+2/SSEx below will require in-mem emulation.
126 | 
127 | # For 128-bit 30-reg build use (replace): RT_128=1   (reserved for AVX1+2/SSEx)
128 | # For 128-bit 30-reg build use (replace): RT_128=2   (Skylake-X w/ AVX512DQ+VL)
129 | # For 128-bit SSE2 build use (replace): RT_128=4 RT_SIMD_COMPAT_SSE=2 (15 regs)
130 | # For 128-bit SSE4 build use (replace): RT_128=4            (15 SIMD registers)
131 | # For 128-bit AVX1 build use (replace): RT_128=8            (15 SIMD registers)
132 | # For 128-bit FMA3 build use (replace): RT_128=16   (AMD's AVX1+FMA3) (15 regs)
133 | # For 128-bit AVX2 build use (replace): RT_128=32   (AMD's AVX2+FMA3) (15 regs)
134 | 
135 | # For 256-bit SSE2 build use (replace): RT_256_R8=4 RT_SIMD_COMPAT_SSE=2 (8 rp)
136 | # For 256-bit SSE4 build use (replace): RT_256_R8=4          (8 SIMD reg-pairs)
137 | # For 256-bit AVX1 build use (replace): RT_256=1            (15 SIMD registers)
138 | # For 256-bit AVX2 build use (replace): RT_256=2            (15 SIMD registers)
139 | # For 256-bit 30-reg build use (replace): RT_256=4   (reserved for AVX1+2/SSEx)
140 | # For 256-bit 30-reg build use (replace): RT_256=8   (Skylake-X w/ AVX512DQ+VL)
141 | 
142 | # For 512-bit AVX1 build use (replace): RT_512_R8=1          (8 SIMD reg-pairs)
143 | # For 512-bit AVX2 build use (replace): RT_512_R8=2          (8 SIMD reg-pairs)
144 | # For 512-bit AVX512F  build use (replace): RT_512=1        (15 SIMD registers)
145 | # For 512-bit AVX512DQ build use (replace): RT_512=2        (15 SIMD registers)
146 | # For 512-bit AVX512F  build use (replace): RT_512=4        (30 SIMD registers)
147 | # For 512-bit AVX512DQ build use (replace): RT_512=8        (30 SIMD registers)
148 | 
149 | # For 1024-bit AVX512F  build use (replace): RT_1K4=1       (15 SIMD reg-pairs)
150 | # For 1024-bit AVX512DQ build use (replace): RT_1K4=2       (15 SIMD reg-pairs)
151 | # For 2048-bit AVX512F  build use (replace): RT_2K8_R8=1     (8 SIMD reg-quads)
152 | # For 2048-bit AVX512DQ build use (replace): RT_2K8_R8=2     (8 SIMD reg-quads)
153 | 
154 | # 64/32-bit (ptr/adr) hybrid mode is compatible with native 64-bit ABI,
155 | # use (replace): RT_ADDRESS=32, rename the binary to simd_test_w64_**.exe
156 | # 64-bit packed SIMD mode (fp64/int64) is supported on 64-bit targets,
157 | # use (replace): RT_ELEMENT=64, rename the binary to simd_test_w64*64.exe
158 | 


--------------------------------------------------------------------------------
/test/simd_make_x32.mk:
--------------------------------------------------------------------------------
 1 | 
 2 | INC_PATH =                              \
 3 |         -I../core/config/
 4 | 
 5 | SRC_LIST =                              \
 6 |         simd_test.cpp
 7 | 
 8 | LIB_PATH =
 9 | 
10 | LIB_LIST =                              \
11 |         -lm
12 | 
13 | 
14 | build: simd_test_x32  # default goal: builds the single x32 test binary
15 | 
16 | strip:  # strip symbols from the built binary
17 | 	strip simd_test.x32*
18 | 
19 | clean:  # remove built binaries; -f keeps this from failing when none exist
20 | 	rm -f simd_test.x32*
21 | 
22 | 
23 | simd_test_x32:  # x32 ABI (-mx32): 256-bit SSE2 (RT_256_R8=4 + RT_SIMD_COMPAT_SSE=2, 8 reg-pairs), 32-bit pointers and elements
24 | 	g++ -O3 -g -mx32 \
25 |         -DRT_LINUX -DRT_X32 -DRT_256_R8=4 -DRT_SIMD_COMPAT_SSE=2 -DRT_DEBUG=0 \
26 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
27 |         ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x32
28 | 
29 | 
30 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted"
31 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run:
32 | # sudo apt-get update
33 | # (Ubuntu MATE is set up for an update without a need to edit the file)
34 | # (extended repositories "universe multiverse" are only needed for clang)
35 | #
36 | # Prerequisites for the build:
37 | # multilib-compiler for x86_64 is installed and in the PATH variable.
38 | # sudo apt-get install make g++-multilib
39 | # (installation of g++-multilib removes any g++ cross-compilers)
40 | #
41 | # Compiling/running SIMD test:
42 | # make -f simd_make_x32.mk
43 | # ./simd_test.x32
44 | 
45 | # Clang native build works too (takes much longer prior to 3.8), use (replace):
46 | # clang++ (in place of g++)
47 | # sudo apt-get install clang (requires g++-multilib for non-native ABI)
48 | 
49 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h.
50 | # The 30-reg targets on top of AVX1+2/SSEx below will require in-mem emulation.
51 | 
52 | # For 128-bit 30-reg build use (replace): RT_128=1   (reserved for AVX1+2/SSEx)
53 | # For 128-bit 30-reg build use (replace): RT_128=2   (Skylake-X w/ AVX512DQ+VL)
54 | # For 128-bit SSE2 build use (replace): RT_128=4 RT_SIMD_COMPAT_SSE=2 (15 regs)
55 | # For 128-bit SSE4 build use (replace): RT_128=4            (15 SIMD registers)
56 | # For 128-bit AVX1 build use (replace): RT_128=8            (15 SIMD registers)
57 | # For 128-bit FMA3 build use (replace): RT_128=16   (AMD's AVX1+FMA3) (15 regs)
58 | # For 128-bit AVX2 build use (replace): RT_128=32   (AMD's AVX2+FMA3) (15 regs)
59 | 
60 | # For 256-bit SSE2 build use (replace): RT_256_R8=4 RT_SIMD_COMPAT_SSE=2 (8 rp)
61 | # For 256-bit SSE4 build use (replace): RT_256_R8=4          (8 SIMD reg-pairs)
62 | # For 256-bit AVX1 build use (replace): RT_256=1            (15 SIMD registers)
63 | # For 256-bit AVX2 build use (replace): RT_256=2            (15 SIMD registers)
64 | # For 256-bit 30-reg build use (replace): RT_256=4   (reserved for AVX1+2/SSEx)
65 | # For 256-bit 30-reg build use (replace): RT_256=8   (Skylake-X w/ AVX512DQ+VL)
66 | 
67 | # For 512-bit AVX1 build use (replace): RT_512_R8=1          (8 SIMD reg-pairs)
68 | # For 512-bit AVX2 build use (replace): RT_512_R8=2          (8 SIMD reg-pairs)
69 | # For 512-bit AVX512F  build use (replace): RT_512=1        (15 SIMD registers)
70 | # For 512-bit AVX512DQ build use (replace): RT_512=2        (15 SIMD registers)
71 | # For 512-bit AVX512F  build use (replace): RT_512=4        (30 SIMD registers)
72 | # For 512-bit AVX512DQ build use (replace): RT_512=8        (30 SIMD registers)
73 | 
74 | # For 1024-bit AVX512F  build use (replace): RT_1K4=1       (15 SIMD reg-pairs)
75 | # For 1024-bit AVX512DQ build use (replace): RT_1K4=2       (15 SIMD reg-pairs)
76 | # For 2048-bit AVX512F  build use (replace): RT_2K8_R8=1     (8 SIMD reg-quads)
77 | # For 2048-bit AVX512DQ build use (replace): RT_2K8_R8=2     (8 SIMD reg-quads)
78 | 


--------------------------------------------------------------------------------
/test/simd_make_x64.mk:
--------------------------------------------------------------------------------
  1 | 
  2 | INC_PATH =                              \
  3 |         -I../core/config/
  4 | 
  5 | SRC_LIST =                              \
  6 |         simd_test.cpp
  7 | 
  8 | LIB_PATH =
  9 | 
 10 | LIB_LIST =                              \
 11 |         -lm
 12 | 
 13 | 
 14 | build: build_x64 build_x64avx build_x64avx512  # default goal: all g++ builds
 15 | clang: clang_x64 clang_x64avx clang_x64avx512  # same output binaries, built with clang++
 16 | 
 17 | strip:  # strip symbols from the built binaries
 18 | 	strip simd_test.x64*
 19 | 
 20 | clean:  # remove built binaries; -f keeps this from failing when none exist
 21 | 	rm -f simd_test.x64*
 22 | 
 23 | macOS:  # rename every simd_test.x64* binary to its simd_test.o64* counterpart
 24 | 	mv simd_test.x64_32 simd_test.o64_32
 25 | 	mv simd_test.x64_64 simd_test.o64_64
 26 | 	mv simd_test.x64f32 simd_test.o64f32
 27 | 	mv simd_test.x64f64 simd_test.o64f64
 28 | 	mv simd_test.x64_32avx simd_test.o64_32avx
 29 | 	mv simd_test.x64_64avx simd_test.o64_64avx
 30 | 	mv simd_test.x64f32avx simd_test.o64f32avx
 31 | 	mv simd_test.x64f64avx simd_test.o64f64avx
 32 | 	mv simd_test.x64_32avx512 simd_test.o64_32avx512
 33 | 	mv simd_test.x64_64avx512 simd_test.o64_64avx512
 34 | 	mv simd_test.x64f32avx512 simd_test.o64f32avx512
 35 | 	mv simd_test.x64f64avx512 simd_test.o64f64avx512
 36 | 
 37 | macRD:  # remove the simd_test.x64*.dSYM directories
 38 | 	rm -fr simd_test.x64*.dSYM/
 39 | 
 40 | macRM:  # remove the renamed .o64 binaries; -f matches macRD and tolerates a clean tree
 41 | 	rm -f simd_test.o64*
 42 | 
 43 | 
 44 | build_x64: simd_test_x64_32 simd_test_x64_64 simd_test_x64f32 simd_test_x64f64  # g++ SSE2/SSE4 builds
 45 | 
 46 | simd_test_x64_32:  # 128-bit SSE2 (RT_128=4 + RT_SIMD_COMPAT_SSE=2), 32-bit addresses, 32-bit elements
 47 | 	g++ -O3 -g \
 48 |         -DRT_LINUX -DRT_X64 -DRT_128=4 -DRT_SIMD_COMPAT_SSE=2 -DRT_DEBUG=0 \
 49 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 50 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_32
 51 | 
 52 | simd_test_x64_64:  # 128-bit SSE2 (RT_128=4 + RT_SIMD_COMPAT_SSE=2), 32-bit addresses, 64-bit elements
 53 | 	g++ -O3 -g \
 54 |         -DRT_LINUX -DRT_X64 -DRT_128=4 -DRT_SIMD_COMPAT_SSE=2 -DRT_DEBUG=0 \
 55 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 56 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_64
 57 | 
 58 | simd_test_x64f32:  # 256-bit SSE4 (RT_256_R8=4, 8 reg-pairs), 64-bit addresses, 32-bit elements
 59 | 	g++ -O3 -g \
 60 |         -DRT_LINUX -DRT_X64 -DRT_256_R8=4 -DRT_DEBUG=0 \
 61 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 62 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f32
 63 | 
 64 | simd_test_x64f64:  # 256-bit SSE4 (RT_256_R8=4, 8 reg-pairs), 64-bit addresses, 64-bit elements
 65 | 	g++ -O3 -g \
 66 |         -DRT_LINUX -DRT_X64 -DRT_256_R8=4 -DRT_DEBUG=0 \
 67 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 68 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f64
 69 | 
 70 | 
 71 | build_x64avx: simd_test_x64_32avx simd_test_x64_64avx \
 72 |               simd_test_x64f32avx simd_test_x64f64avx  # g++ 256-bit AVX1/AVX2 builds
 73 | 
 74 | simd_test_x64_32avx:  # 256-bit AVX1 (RT_256=1), 32-bit addresses, 32-bit elements
 75 | 	g++ -O3 -g \
 76 |         -DRT_LINUX -DRT_X64 -DRT_256=1 -DRT_DEBUG=0 \
 77 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 78 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_32avx
 79 | 
 80 | simd_test_x64_64avx:  # 256-bit AVX1 (RT_256=1), 32-bit addresses, 64-bit elements
 81 | 	g++ -O3 -g \
 82 |         -DRT_LINUX -DRT_X64 -DRT_256=1 -DRT_DEBUG=0 \
 83 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 84 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_64avx
 85 | 
 86 | simd_test_x64f32avx:  # 256-bit AVX2 (RT_256=2), 64-bit addresses, 32-bit elements
 87 | 	g++ -O3 -g \
 88 |         -DRT_LINUX -DRT_X64 -DRT_256=2 -DRT_DEBUG=0 \
 89 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
 90 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f32avx
 91 | 
 92 | simd_test_x64f64avx:  # 256-bit AVX2 (RT_256=2), 64-bit addresses, 64-bit elements
 93 | 	g++ -O3 -g \
 94 |         -DRT_LINUX -DRT_X64 -DRT_256=2 -DRT_DEBUG=0 \
 95 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
 96 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f64avx
 97 | 
 98 | 
 99 | build_x64avx512: simd_test_x64_32avx512 simd_test_x64_64avx512 \
100 |                  simd_test_x64f32avx512 simd_test_x64f64avx512  # g++ 512-bit AVX512F/AVX512DQ builds
101 | 
102 | simd_test_x64_32avx512:  # 512-bit AVX512F (RT_512=1), 32-bit addresses, 32-bit elements
103 | 	g++ -O3 -g \
104 |         -DRT_LINUX -DRT_X64 -DRT_512=1 -DRT_DEBUG=0 \
105 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
106 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_32avx512
107 | 
108 | simd_test_x64_64avx512:  # 512-bit AVX512F (RT_512=1), 32-bit addresses, 64-bit elements
109 | 	g++ -O3 -g \
110 |         -DRT_LINUX -DRT_X64 -DRT_512=1 -DRT_DEBUG=0 \
111 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
112 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_64avx512
113 | 
114 | simd_test_x64f32avx512:  # 512-bit AVX512DQ (RT_512=2), 64-bit addresses, 32-bit elements
115 | 	g++ -O3 -g \
116 |         -DRT_LINUX -DRT_X64 -DRT_512=2 -DRT_DEBUG=0 \
117 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
118 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f32avx512
119 | 
120 | simd_test_x64f64avx512:  # 512-bit AVX512DQ (RT_512=2), 64-bit addresses, 64-bit elements
121 | 	g++ -O3 -g \
122 |         -DRT_LINUX -DRT_X64 -DRT_512=2 -DRT_DEBUG=0 \
123 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
124 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f64avx512
125 | 
126 | 
127 | clang_x64: simd_test.x64_32 simd_test.x64_64 simd_test.x64f32 simd_test.x64f64  # NOTE: targets are named after their output files and have no prerequisites, so make skips any whose binary already exists (e.g. after a g++ build)
128 | 
129 | simd_test.x64_32:  # clang++: 128-bit SSE2 (RT_128=4 + RT_SIMD_COMPAT_SSE=2), 32-bit addresses, 32-bit elements
130 | 	clang++ -O3 -g \
131 |         -DRT_LINUX -DRT_X64 -DRT_128=4 -DRT_SIMD_COMPAT_SSE=2 -DRT_DEBUG=0 \
132 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
133 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_32
134 | 
135 | simd_test.x64_64:  # clang++: 128-bit SSE2 (RT_128=4 + RT_SIMD_COMPAT_SSE=2), 32-bit addresses, 64-bit elements
136 | 	clang++ -O3 -g \
137 |         -DRT_LINUX -DRT_X64 -DRT_128=4 -DRT_SIMD_COMPAT_SSE=2 -DRT_DEBUG=0 \
138 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
139 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_64
140 | 
141 | simd_test.x64f32:  # clang++: 256-bit SSE4 (RT_256_R8=4, 8 reg-pairs), 64-bit addresses, 32-bit elements
142 | 	clang++ -O3 -g \
143 |         -DRT_LINUX -DRT_X64 -DRT_256_R8=4 -DRT_DEBUG=0 \
144 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
145 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f32
146 | 
147 | simd_test.x64f64:  # clang++: 256-bit SSE4 (RT_256_R8=4, 8 reg-pairs), 64-bit addresses, 64-bit elements
148 | 	clang++ -O3 -g \
149 |         -DRT_LINUX -DRT_X64 -DRT_256_R8=4 -DRT_DEBUG=0 \
150 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
151 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f64
152 | 
153 | 
154 | clang_x64avx: simd_test.x64_32avx simd_test.x64_64avx \
155 |               simd_test.x64f32avx simd_test.x64f64avx  # NOTE: file-named targets without prerequisites; make skips any whose binary already exists
156 | 
157 | simd_test.x64_32avx:  # clang++: 256-bit AVX1 (RT_256=1), 32-bit addresses, 32-bit elements
158 | 	clang++ -O3 -g \
159 |         -DRT_LINUX -DRT_X64 -DRT_256=1 -DRT_DEBUG=0 \
160 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
161 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_32avx
162 | 
163 | simd_test.x64_64avx:  # clang++: 256-bit AVX1 (RT_256=1), 32-bit addresses, 64-bit elements
164 | 	clang++ -O3 -g \
165 |         -DRT_LINUX -DRT_X64 -DRT_256=1 -DRT_DEBUG=0 \
166 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
167 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_64avx
168 | 
169 | simd_test.x64f32avx:  # clang++: 256-bit AVX2 (RT_256=2), 64-bit addresses, 32-bit elements
170 | 	clang++ -O3 -g \
171 |         -DRT_LINUX -DRT_X64 -DRT_256=2 -DRT_DEBUG=0 \
172 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
173 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f32avx
174 | 
175 | simd_test.x64f64avx:  # clang++: 256-bit AVX2 (RT_256=2), 64-bit addresses, 64-bit elements
176 | 	clang++ -O3 -g \
177 |         -DRT_LINUX -DRT_X64 -DRT_256=2 -DRT_DEBUG=0 \
178 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
179 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f64avx
180 | 
181 | 
182 | clang_x64avx512: simd_test.x64_32avx512 simd_test.x64_64avx512 \
183 |                  simd_test.x64f32avx512 simd_test.x64f64avx512  # NOTE: file-named targets without prerequisites; make skips any whose binary already exists
184 | 
185 | simd_test.x64_32avx512:  # clang++: 512-bit AVX512F (RT_512=1), 32-bit addresses, 32-bit elements
186 | 	clang++ -O3 -g \
187 |         -DRT_LINUX -DRT_X64 -DRT_512=1 -DRT_DEBUG=0 \
188 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
189 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_32avx512
190 | 
191 | simd_test.x64_64avx512:  # clang++: 512-bit AVX512F (RT_512=1), 32-bit addresses, 64-bit elements
192 | 	clang++ -O3 -g \
193 |         -DRT_LINUX -DRT_X64 -DRT_512=1 -DRT_DEBUG=0 \
194 |         -DRT_POINTER=64 -DRT_ADDRESS=32 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
195 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64_64avx512
196 | 
197 | simd_test.x64f32avx512:  # clang++: 512-bit AVX512DQ (RT_512=2), 64-bit addresses, 32-bit elements
198 | 	clang++ -O3 -g \
199 |         -DRT_LINUX -DRT_X64 -DRT_512=2 -DRT_DEBUG=0 \
200 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
201 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f32avx512
202 | 
203 | simd_test.x64f64avx512:  # clang++: 512-bit AVX512DQ (RT_512=2), 64-bit addresses, 64-bit elements
204 | 	clang++ -O3 -g \
205 |         -DRT_LINUX -DRT_X64 -DRT_512=2 -DRT_DEBUG=0 \
206 |         -DRT_POINTER=64 -DRT_ADDRESS=64 -DRT_ELEMENT=64 -DRT_ENDIAN=0 \
207 |       ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x64f64avx512
208 | 
209 | 
210 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted"
211 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run:
212 | # sudo apt-get update
213 | # (Ubuntu MATE is set up for an update without a need to edit the file)
214 | # (extended repositories "universe multiverse" are only needed for clang)
215 | #
216 | # Prerequisites for the build:
217 | # native-compiler for x86_64 is installed and in the PATH variable.
218 | # sudo apt-get install make g++
219 | #
220 | # When building on macOS install Command Line Tools first.
221 | # http://osxdaily.com/2014/02/12/install-command-line-tools-mac-os-x/
222 | #
223 | # Prerequisites for emulation:
224 | # http://software.intel.com/en-us/articles/intel-software-development-emulator
225 | # Intel SDE is downloaded, unpacked and in the PATH variable.
226 | #
227 | # Compiling/running SIMD test:
228 | # make -f simd_make_x64.mk
229 | # ./simd_test.x64f32
230 | # ./simd_test.x64f32avx
231 | # ./simd_test.x64f32avx512
232 | # sde64 -hsw -- ./simd_test.x64f32avx -c 1
233 | # sde64 -skx -- ./simd_test.x64f32avx512 -c 1
234 | # Use "-c 1" option to reduce test time when emulating with Intel SDE
235 | 
236 | # Clang native build works too (takes much longer prior to 3.8), use (replace):
237 | # clang++ (in place of g++)
238 | # sudo apt-get install clang
239 | 
240 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h.
241 | # The 30-reg targets on top of AVX1+2/SSEx below will require in-mem emulation.
242 | 
243 | # For 128-bit 30-reg build use (replace): RT_128=1   (reserved for AVX1+2/SSEx)
244 | # For 128-bit 30-reg build use (replace): RT_128=2   (Skylake-X w/ AVX512DQ+VL)
245 | # For 128-bit SSE2 build use (replace): RT_128=4 RT_SIMD_COMPAT_SSE=2 (15 regs)
246 | # For 128-bit SSE4 build use (replace): RT_128=4            (15 SIMD registers)
247 | # For 128-bit AVX1 build use (replace): RT_128=8            (15 SIMD registers)
248 | # For 128-bit FMA3 build use (replace): RT_128=16   (AMD's AVX1+FMA3) (15 regs)
249 | # For 128-bit AVX2 build use (replace): RT_128=32   (AMD's AVX2+FMA3) (15 regs)
250 | 
251 | # For 256-bit SSE2 build use (replace): RT_256_R8=4 RT_SIMD_COMPAT_SSE=2 (8 rp)
252 | # For 256-bit SSE4 build use (replace): RT_256_R8=4          (8 SIMD reg-pairs)
253 | # For 256-bit AVX1 build use (replace): RT_256=1            (15 SIMD registers)
254 | # For 256-bit AVX2 build use (replace): RT_256=2            (15 SIMD registers)
255 | # For 256-bit 30-reg build use (replace): RT_256=4   (reserved for AVX1+2/SSEx)
256 | # For 256-bit 30-reg build use (replace): RT_256=8   (Skylake-X w/ AVX512DQ+VL)
257 | 
258 | # For 512-bit AVX1 build use (replace): RT_512_R8=1          (8 SIMD reg-pairs)
259 | # For 512-bit AVX2 build use (replace): RT_512_R8=2          (8 SIMD reg-pairs)
260 | # For 512-bit AVX512F  build use (replace): RT_512=1        (15 SIMD registers)
261 | # For 512-bit AVX512DQ build use (replace): RT_512=2        (15 SIMD registers)
262 | # For 512-bit AVX512F  build use (replace): RT_512=4        (30 SIMD registers)
263 | # For 512-bit AVX512DQ build use (replace): RT_512=8        (30 SIMD registers)
264 | 
265 | # For 1024-bit AVX512F  build use (replace): RT_1K4=1       (15 SIMD reg-pairs)
266 | # For 1024-bit AVX512DQ build use (replace): RT_1K4=2       (15 SIMD reg-pairs)
267 | # For 2048-bit AVX512F  build use (replace): RT_2K8_R8=1     (8 SIMD reg-quads)
268 | # For 2048-bit AVX512DQ build use (replace): RT_2K8_R8=2     (8 SIMD reg-quads)
269 | 
270 | # 64/32-bit (ptr/adr) hybrid mode is compatible with native 64-bit ABI,
271 | # use (replace): RT_ADDRESS=32, rename the binary to simd_test.x64_**
272 | # 64-bit packed SIMD mode (fp64/int64) is supported on 64-bit targets,
273 | # use (replace): RT_ELEMENT=64, rename the binary to simd_test.x64*64
274 | 


--------------------------------------------------------------------------------
/test/simd_make_x86.mk:
--------------------------------------------------------------------------------
 1 | 
 2 | INC_PATH =                              \
 3 |         -I../core/config/
 4 | 
 5 | SRC_LIST =                              \
 6 |         simd_test.cpp
 7 | 
 8 | LIB_PATH =
 9 | 
10 | LIB_LIST =                              \
11 |         -lm
12 | 
13 | 
14 | build: simd_test_x86 simd_test_x86avx simd_test_x86avx512  # default goal: builds all three x86 binaries
15 | 
16 | strip:  # strip symbols from the built binaries
17 | 	strip simd_test.x86*
18 | 
19 | clean:  # remove built binaries; -f keeps this from failing when none exist
20 | 	rm -f simd_test.x86*
21 | 
22 | 
23 | simd_test_x86:  # 32-bit (-m32) build: 128-bit SSE2 (RT_128=2, 8 SIMD registers)
24 | 	g++ -O3 -g -m32 \
25 |         -DRT_LINUX -DRT_X86 -DRT_128=2 -DRT_DEBUG=0 \
26 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
27 |         ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x86
28 | 
29 | simd_test_x86avx:  # 32-bit (-m32) build: 256-bit AVX1 (RT_256=1, 8 SIMD registers)
30 | 	g++ -O3 -g -m32 \
31 |         -DRT_LINUX -DRT_X86 -DRT_256=1 -DRT_DEBUG=0 \
32 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
33 |         ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x86avx
34 | 
35 | simd_test_x86avx512:  # 32-bit (-m32) build: 512-bit AVX512F (RT_512=1, 8 SIMD registers)
36 | 	g++ -O3 -g -m32 \
37 |         -DRT_LINUX -DRT_X86 -DRT_512=1 -DRT_DEBUG=0 \
38 |         -DRT_POINTER=32 -DRT_ADDRESS=32 -DRT_ELEMENT=32 -DRT_ENDIAN=0 \
39 |         ${INC_PATH} ${SRC_LIST} ${LIB_PATH} ${LIB_LIST} -o simd_test.x86avx512
40 | 
41 | 
42 | # On Ubuntu (MATE) 16.04-22.04 add "universe multiverse" to "main restricted"
43 | # in /etc/apt/sources.list (sudo nano /etc/apt/sources.list) then run:
44 | # sudo apt-get update
45 | # (Ubuntu MATE is set up for an update without a need to edit the file)
46 | # (extended repositories "universe multiverse" are only needed for clang)
47 | #
48 | # Prerequisites for the build:
49 | # native/multilib-compiler for x86/x86_64 is installed and in the PATH variable.
50 | # sudo apt-get install make g++ (for x86 host)
51 | # sudo apt-get install make g++-multilib (for x86_64 host)
52 | # (installation of g++-multilib removes any g++ cross-compilers)
53 | #
54 | # Prerequisites for emulation:
55 | # http://software.intel.com/en-us/articles/intel-software-development-emulator
56 | # Intel SDE is downloaded, unpacked and in the PATH variable.
57 | #
58 | # Compiling/running SIMD test:
59 | # make -f simd_make_x86.mk
60 | # ./simd_test.x86
61 | # ./simd_test.x86avx
62 | # ./simd_test.x86avx512
63 | # sde -snb -- ./simd_test.x86avx -c 1
64 | # sde -knl -- ./simd_test.x86avx512 -c 1
65 | # Use "-c 1" option to reduce test time when emulating with Intel SDE
66 | 
67 | # Clang native build works too (takes much longer prior to 3.8), use (replace):
68 | # clang++ (in place of g++)
69 | # sudo apt-get install clang (requires g++-multilib for non-native ABI)
70 | 
71 | # For interpretation of SIMD build flags check compatibility layer in rtzero.h.
72 | # Original legacy 32-bit ARMv7/x86 targets only support 8 SIMD registers.
73 | 
74 | # For 128-bit SSE1 build use (replace): RT_128=1 (test36/37) (8 SIMD registers)
75 | # For 128-bit SSE2 build use (replace): RT_128=2             (8 SIMD registers)
76 | # For 128-bit SSE4 build use (replace): RT_128=4             (8 SIMD registers)
77 | # For 128-bit AVX1 build use (replace): RT_128=8     (AMD's AVX1-only) (8 regs)
78 | # For 128-bit FMA3 build use (replace): RT_128=16    (AMD's AVX1+FMA3) (8 regs)
79 | # For 128-bit AVX2 build use (replace): RT_128=32    (AMD's AVX2+FMA3) (8 regs)
80 | 
81 | # For 256-bit AVX1 build use (replace): RT_256=1   (Intel's AVX1-only) (8 regs)
82 | # For 256-bit AVX2 build use (replace): RT_256=2   (Intel's AVX2+FMA3) (8 regs)
83 | # For 512-bit AVX512F  build use (replace): RT_512=1         (8 SIMD registers)
84 | # For 512-bit AVX512DQ build use (replace): RT_512=2         (8 SIMD registers)
85 | 


--------------------------------------------------------------------------------
/test/simd_qemu32.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Intended for x86_64 Linux test environment
 3 | # with QEMU linux-user mode installed (64-bit Ubuntu MATE 20.04 LTS tested)
 4 | # run this script after build_cross.sh with 32-bit cross-compilers installed
 5 | # start from an empty log: drop any previous result (-f: no error if it is absent)
 6 | rm -f qemu32
 7 | 
 8 | # fully successful test pass results in qemu32 file of  41524 bytes (51 tests)
 9 | # unlike simd_test64/86.sh the result is the same on all CPU types  (51 tests)
10 | # check the output if qemu32 file size differs, look for printouts
11 | 
12 | # ARMv7 NEON binaries, run under qemu-arm; banners and test output are appended to qemu32
13 | echo "========================================================" | tee -a qemu32
14 | echo "Testing arm_v1 target (ARMv7 Cortex-A8  NEON)" | tee -a qemu32
15 | echo "========================================================" | tee -a qemu32
16 | qemu-arm -cpu cortex-a8  simd_test.arm_v1 -c 1 | tee -a qemu32
17 | echo "========================================================" | tee -a qemu32
18 | echo "Testing arm_v2 target (ARMv7 Cortex-A15 NEON)" | tee -a qemu32
19 | echo "========================================================" | tee -a qemu32
20 | qemu-arm -cpu cortex-a15 simd_test.arm_v2 -c 1 | tee -a qemu32
21 | 
22 | # MIPS32r5 MSA binaries: little-endian under qemu-mipsel, big-endian under qemu-mips
23 | echo "========================================================" | tee -a qemu32
24 | echo "Testing m32Lr5 target (MIPS32r5 MSA little-endian)" | tee -a qemu32
25 | echo "========================================================" | tee -a qemu32
26 | qemu-mipsel -cpu P5600 simd_test.m32Lr5 -c 1 | tee -a qemu32
27 | echo "========================================================" | tee -a qemu32
28 | echo "Testing m32Br5 target (MIPS32r5 MSA    big-endian)" | tee -a qemu32
29 | echo "========================================================" | tee -a qemu32
30 | qemu-mips   -cpu P5600 simd_test.m32Br5 -c 1 | tee -a qemu32
31 | 
32 | # 32-bit PowerPC binaries: only the G4 run is active, the ppc64abi32 runs are commented out
33 | # ppc64abi32 targets are deprecated since QEMU 5.2.0 (dropped in Ubuntu 22.04)
34 | # fully successful test pass writes 66442 bytes to qemu32 with ppc64abi32 runs
35 | 
36 | echo "========================================================" | tee -a qemu32
37 | echo "Testing p32Bg4 target (PPC G4 VMX     big-endian)" | tee -a qemu32
38 | echo "========================================================" | tee -a qemu32
39 | qemu-ppc        -cpu G4     simd_test.p32Bg4 -c 1 | tee -a qemu32
40 | #echo "========================================================" | tee -a qemu32
41 | #echo "Testing p32Bp7 target (POWER7 VSX1    big-endian)" | tee -a qemu32
42 | #echo "========================================================" | tee -a qemu32
43 | #qemu-ppc64abi32 -cpu POWER7 simd_test.p32Bp7 -c 1 | tee -a qemu32
44 | #echo "========================================================" | tee -a qemu32
45 | #echo "Testing p32Bp8 target (POWER8 VSX2    big-endian)" | tee -a qemu32
46 | #echo "========================================================" | tee -a qemu32
47 | #qemu-ppc64abi32 -cpu POWER8 simd_test.p32Bp8 -c 1 | tee -a qemu32
48 | #echo "========================================================" | tee -a qemu32
49 | #echo "Testing p32Bp9 target (POWER9 VSX3    big-endian)" | tee -a qemu32
50 | #echo "========================================================" | tee -a qemu32
51 | #qemu-ppc64abi32 -cpu POWER9 simd_test.p32Bp9 -c 1 | tee -a qemu32
52 | 
53 | # summary is printed to the terminal only (no tee), so it does not change the size of qemu32
54 | echo "========================================================"
55 | echo "fully successful test pass writes  41524 bytes to qemu32"
56 | echo "the result doesn't depend on CPU type (unlike test64/86)"
57 | echo "check the output if qemu32 size differs, check printouts"
58 | echo "========================================================"
59 | echo "the actual file size after the test run is listed below:"
60 | ls -al qemu32
61 | echo "========================================================"
62 | 
63 | 
64 | 


--------------------------------------------------------------------------------
/test/simd_qemu64.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | # Intended for x86_64 Linux test environment
  3 | # with QEMU linux-user mode installed (64-bit Ubuntu MATE 20.04 LTS tested)
  4 | # run this script after build_cross.sh with 64-bit cross-compilers installed
  5 | # start from an empty log: drop any previous result (-f: no error if it is absent)
  6 | rm -f qemu64
  7 | 
  8 | # fully successful test pass results in qemu64 file of 232524 bytes (51 tests)
  9 | # unlike simd_test64/86.sh the result is the same on all CPU types  (51 tests)
 10 | # check the output if qemu64 file size differs, look for printouts
 11 | 
 12 | # AArch64 NEON binaries, run under qemu-aarch64 with the cortex-a57 CPU model
 13 | echo "========================================================" | tee -a qemu64
 14 | echo "Testing a64_32 target (ARMv8 NEON)" | tee -a qemu64
 15 | echo "========================================================" | tee -a qemu64
 16 | qemu-aarch64 -cpu cortex-a57 simd_test.a64_32 -c 1 | tee -a qemu64
 17 | echo "========================================================" | tee -a qemu64
 18 | echo "Testing a64_64 target (ARMv8 NEON)" | tee -a qemu64
 19 | echo "========================================================" | tee -a qemu64
 20 | qemu-aarch64 -cpu cortex-a57 simd_test.a64_64 -c 1 | tee -a qemu64
 21 | echo "========================================================" | tee -a qemu64
 22 | echo "Testing a64f32 target (ARMv8 NEON)" | tee -a qemu64
 23 | echo "========================================================" | tee -a qemu64
 24 | qemu-aarch64 -cpu cortex-a57 simd_test.a64f32 -c 1 | tee -a qemu64
 25 | echo "========================================================" | tee -a qemu64
 26 | echo "Testing a64f64 target (ARMv8 NEON)" | tee -a qemu64
 27 | echo "========================================================" | tee -a qemu64
 28 | qemu-aarch64 -cpu cortex-a57 simd_test.a64f64 -c 1 | tee -a qemu64
 29 | # AArch64 SVE binaries, run under qemu-aarch64 with -cpu max,sve-max-vq=4
 30 | echo "========================================================" | tee -a qemu64
 31 | echo "Testing a64_32sve target (ARMv8 SVE)" | tee -a qemu64
 32 | echo "========================================================" | tee -a qemu64
 33 | qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64_32sve -c 1 | tee -a qemu64
 34 | echo "========================================================" | tee -a qemu64
 35 | echo "Testing a64_64sve target (ARMv8 SVE)" | tee -a qemu64
 36 | echo "========================================================" | tee -a qemu64
 37 | qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64_64sve -c 1 | tee -a qemu64
 38 | echo "========================================================" | tee -a qemu64
 39 | echo "Testing a64f32sve target (ARMv8 SVE)" | tee -a qemu64
 40 | echo "========================================================" | tee -a qemu64
 41 | qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64f32sve -c 1 | tee -a qemu64
 42 | echo "========================================================" | tee -a qemu64
 43 | echo "Testing a64f64sve target (ARMv8 SVE)" | tee -a qemu64
 44 | echo "========================================================" | tee -a qemu64
 45 | qemu-aarch64 -cpu max,sve-max-vq=4 simd_test.a64f64sve -c 1 | tee -a qemu64
 46 | 
 47 | # MIPS64r6 MSA little-endian binaries, run under qemu-mips64el with the I6400 CPU model
 48 | echo "========================================================" | tee -a qemu64
 49 | echo "Testing m64_32Lr6 target (MIPS64r6 MSA little-endian)" | tee -a qemu64
 50 | echo "========================================================" | tee -a qemu64
 51 | qemu-mips64el -cpu I6400 simd_test.m64_32Lr6 -c 1 | tee -a qemu64
 52 | echo "========================================================" | tee -a qemu64
 53 | echo "Testing m64_64Lr6 target (MIPS64r6 MSA little-endian)" | tee -a qemu64
 54 | echo "========================================================" | tee -a qemu64
 55 | qemu-mips64el -cpu I6400 simd_test.m64_64Lr6 -c 1 | tee -a qemu64
 56 | echo "========================================================" | tee -a qemu64
 57 | echo "Testing m64f32Lr6 target (MIPS64r6 MSA little-endian)" | tee -a qemu64
 58 | echo "========================================================" | tee -a qemu64
 59 | qemu-mips64el -cpu I6400 simd_test.m64f32Lr6 -c 1 | tee -a qemu64
 60 | echo "========================================================" | tee -a qemu64
 61 | echo "Testing m64f64Lr6 target (MIPS64r6 MSA little-endian)" | tee -a qemu64
 62 | echo "========================================================" | tee -a qemu64
 63 | qemu-mips64el -cpu I6400 simd_test.m64f64Lr6 -c 1 | tee -a qemu64
 64 | # MIPS64r6 MSA big-endian binaries, run under qemu-mips64 with the I6400 CPU model
 65 | echo "========================================================" | tee -a qemu64
 66 | echo "Testing m64_32Br6 target (MIPS64r6 MSA    big-endian)" | tee -a qemu64
 67 | echo "========================================================" | tee -a qemu64
 68 | qemu-mips64   -cpu I6400 simd_test.m64_32Br6 -c 1 | tee -a qemu64
 69 | echo "========================================================" | tee -a qemu64
 70 | echo "Testing m64_64Br6 target (MIPS64r6 MSA    big-endian)" | tee -a qemu64
 71 | echo "========================================================" | tee -a qemu64
 72 | qemu-mips64   -cpu I6400 simd_test.m64_64Br6 -c 1 | tee -a qemu64
 73 | echo "========================================================" | tee -a qemu64
 74 | echo "Testing m64f32Br6 target (MIPS64r6 MSA    big-endian)" | tee -a qemu64
 75 | echo "========================================================" | tee -a qemu64
 76 | qemu-mips64   -cpu I6400 simd_test.m64f32Br6 -c 1 | tee -a qemu64
 77 | echo "========================================================" | tee -a qemu64
 78 | echo "Testing m64f64Br6 target (MIPS64r6 MSA    big-endian)" | tee -a qemu64
 79 | echo "========================================================" | tee -a qemu64
 80 | qemu-mips64   -cpu I6400 simd_test.m64f64Br6 -c 1 | tee -a qemu64
 81 | 
 82 | # POWER7 big-endian binaries, run under qemu-ppc64 with the POWER7 CPU model
 83 | echo "========================================================" | tee -a qemu64
 84 | echo "Testing p64_32Bp7 target (POWER7 VSX1    big-endian)" | tee -a qemu64
 85 | echo "========================================================" | tee -a qemu64
 86 | qemu-ppc64   -cpu POWER7 simd_test.p64_32Bp7 -c 1 | tee -a qemu64
 87 | echo "========================================================" | tee -a qemu64
 88 | echo "Testing p64_64Bp7 target (POWER7 VSX1    big-endian)" | tee -a qemu64
 89 | echo "========================================================" | tee -a qemu64
 90 | qemu-ppc64   -cpu POWER7 simd_test.p64_64Bp7 -c 1 | tee -a qemu64
 91 | echo "========================================================" | tee -a qemu64
 92 | echo "Testing p64f32Bp7 target (POWER7 VSX1    big-endian)" | tee -a qemu64
 93 | echo "========================================================" | tee -a qemu64
 94 | qemu-ppc64   -cpu POWER7 simd_test.p64f32Bp7 -c 1 | tee -a qemu64
 95 | echo "========================================================" | tee -a qemu64
 96 | echo "Testing p64f64Bp7 target (POWER7 VSX1    big-endian)" | tee -a qemu64
 97 | echo "========================================================" | tee -a qemu64
 98 | qemu-ppc64   -cpu POWER7 simd_test.p64f64Bp7 -c 1 | tee -a qemu64
 99 | # POWER8 little-endian binaries, run under qemu-ppc64le (note the -cpu POWER9 below)
100 | # using -cpu power9 for power8 targets is a workaround for Ubuntu 22.04 LTS
101 | # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109007
102 | 
103 | echo "========================================================" | tee -a qemu64
104 | echo "Testing p64_32Lp8 target (POWER8 VSX2 little-endian)" | tee -a qemu64
105 | echo "========================================================" | tee -a qemu64
106 | qemu-ppc64le -cpu POWER9 simd_test.p64_32Lp8 -c 1 | tee -a qemu64
107 | echo "========================================================" | tee -a qemu64
108 | echo "Testing p64_64Lp8 target (POWER8 VSX2 little-endian)" | tee -a qemu64
109 | echo "========================================================" | tee -a qemu64
110 | qemu-ppc64le -cpu POWER9 simd_test.p64_64Lp8 -c 1 | tee -a qemu64
111 | echo "========================================================" | tee -a qemu64
112 | echo "Testing p64f32Lp8 target (POWER8 VSX2 little-endian)" | tee -a qemu64
113 | echo "========================================================" | tee -a qemu64
114 | qemu-ppc64le -cpu POWER9 simd_test.p64f32Lp8 -c 1 | tee -a qemu64
115 | echo "========================================================" | tee -a qemu64
116 | echo "Testing p64f64Lp8 target (POWER8 VSX2 little-endian)" | tee -a qemu64
117 | echo "========================================================" | tee -a qemu64
118 | qemu-ppc64le -cpu POWER9 simd_test.p64f64Lp8 -c 1 | tee -a qemu64
119 | # POWER9 little-endian binaries, run under qemu-ppc64le with the POWER9 CPU model
120 | echo "========================================================" | tee -a qemu64
121 | echo "Testing p64_32Lp9 target (POWER9 VSX3 little-endian)" | tee -a qemu64
122 | echo "========================================================" | tee -a qemu64
123 | qemu-ppc64le -cpu POWER9 simd_test.p64_32Lp9 -c 1 | tee -a qemu64
124 | echo "========================================================" | tee -a qemu64
125 | echo "Testing p64_64Lp9 target (POWER9 VSX3 little-endian)" | tee -a qemu64
126 | echo "========================================================" | tee -a qemu64
127 | qemu-ppc64le -cpu POWER9 simd_test.p64_64Lp9 -c 1 | tee -a qemu64
128 | echo "========================================================" | tee -a qemu64
129 | echo "Testing p64f32Lp9 target (POWER9 VSX3 little-endian)" | tee -a qemu64
130 | echo "========================================================" | tee -a qemu64
131 | qemu-ppc64le -cpu POWER9 simd_test.p64f32Lp9 -c 1 | tee -a qemu64
132 | echo "========================================================" | tee -a qemu64
133 | echo "Testing p64f64Lp9 target (POWER9 VSX3 little-endian)" | tee -a qemu64
134 | echo "========================================================" | tee -a qemu64
135 | qemu-ppc64le -cpu POWER9 simd_test.p64f64Lp9 -c 1 | tee -a qemu64
136 | 
137 | # summary is printed to the terminal only (no tee), so it does not change the size of qemu64
138 | echo "========================================================"
139 | echo "fully successful test pass writes 232524 bytes to qemu64"
140 | echo "the result doesn't depend on CPU type (unlike test64/86)"
141 | echo "check the output if qemu64 size differs, check printouts"
142 | echo "========================================================"
143 | echo "the actual file size after the test run is listed below:"
144 | ls -al qemu64
145 | echo "========================================================"
146 | 
147 | 
148 | 


--------------------------------------------------------------------------------
/test/simd_test64.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Intended for x86_64 Linux test environment
 3 | # tested on 64-bit Linux Mint 18, 64-bit Ubuntu MATE 18.04/20.04 LTS
 4 | # run this script after build_linux.sh with native compiler installed
 5 | # start from an empty log: drop any previous result (-f: no error if it is absent)
 6 | rm -f test64
 7 | 
 8 | # fully successful test pass results in test64 file of  99666 bytes (51 tests)
 9 | # test pass on AVX2-only CPU results in test64 file of  69286 bytes (51 tests)
10 | # for any other CPU check the output or use Intel SDE within script
11 | 
12 | # SSE binaries (SSE2 and SSE4 per the labels), run natively; output is appended to test64
13 | echo "========================================================" | tee -a test64
14 | echo "Testing x64_32 target (Intel Core 2 Duo SSE2)" | tee -a test64
15 | echo "========================================================" | tee -a test64
16 | ./simd_test.x64_32 -c 1 | tee -a test64
17 | echo "========================================================" | tee -a test64
18 | echo "Testing x64_64 target (Intel Core 2 Duo SSE2)" | tee -a test64
19 | echo "========================================================" | tee -a test64
20 | ./simd_test.x64_64 -c 1 | tee -a test64
21 | echo "========================================================" | tee -a test64
22 | echo "Testing x64f32 target (Intel Nehalem SSE4)" | tee -a test64
23 | echo "========================================================" | tee -a test64
24 | ./simd_test.x64f32 -c 1 | tee -a test64
25 | echo "========================================================" | tee -a test64
26 | echo "Testing x64f64 target (Intel Nehalem SSE4)" | tee -a test64
27 | echo "========================================================" | tee -a test64
28 | ./simd_test.x64f64 -c 1 | tee -a test64
29 | # AVX binaries (AVX1 and AVX2 per the labels)
30 | echo "========================================================" | tee -a test64
31 | echo "Testing x64_32avx target (Intel Sandy Bridge AVX1)" | tee -a test64
32 | echo "========================================================" | tee -a test64
33 | ./simd_test.x64_32avx -c 1 | tee -a test64
34 | echo "========================================================" | tee -a test64
35 | echo "Testing x64_64avx target (Intel Sandy Bridge AVX1)" | tee -a test64
36 | echo "========================================================" | tee -a test64
37 | ./simd_test.x64_64avx -c 1 | tee -a test64
38 | echo "========================================================" | tee -a test64
39 | echo "Testing x64f32avx target (Intel Haswell AVX2)" | tee -a test64
40 | echo "========================================================" | tee -a test64
41 | ./simd_test.x64f32avx -c 1 | tee -a test64
42 | echo "========================================================" | tee -a test64
43 | echo "Testing x64f64avx target (Intel Haswell AVX2)" | tee -a test64
44 | echo "========================================================" | tee -a test64
45 | ./simd_test.x64f64avx -c 1 | tee -a test64
46 | # AVX512 binaries
47 | echo "========================================================" | tee -a test64
48 | echo "Testing x64_32avx512 target (Intel Xeon Phi KNL AVX512)" | tee -a test64
49 | echo "========================================================" | tee -a test64
50 | ./simd_test.x64_32avx512 -c 1 | tee -a test64
51 | echo "========================================================" | tee -a test64
52 | echo "Testing x64_64avx512 target (Intel Xeon Phi KNL AVX512)" | tee -a test64
53 | echo "========================================================" | tee -a test64
54 | ./simd_test.x64_64avx512 -c 1 | tee -a test64
55 | echo "========================================================" | tee -a test64
56 | echo "Testing x64f32avx512 target (Intel Rocket Lake AVX512)" | tee -a test64
57 | echo "========================================================" | tee -a test64
58 | ./simd_test.x64f32avx512 -c 1 | tee -a test64
59 | echo "========================================================" | tee -a test64
60 | echo "Testing x64f64avx512 target (Intel Rocket Lake AVX512)" | tee -a test64
61 | echo "========================================================" | tee -a test64
62 | ./simd_test.x64f64avx512 -c 1 | tee -a test64
63 | 
64 | # summary is printed to the terminal only (no tee), so it does not change the size of test64
65 | echo "========================================================"
66 | echo "fully successful test pass writes  99666 bytes to test64"
67 | echo "test pass on AVX2-only CPU writes  69286 bytes to test64"
68 | echo "for other CPUs check the output, use Intel SDE in script"
69 | echo "========================================================"
70 | echo "the actual file size after the test run is listed below:"
71 | ls -al test64
72 | echo "========================================================"
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/test/simd_test86.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Intended for x86_64 Linux test environment
 3 | # with multilib capabilities (64-bit Linux Mint 18 tested)
 4 | # run this script after build_multi.sh with multilib-compiler installed
 5 | # start from an empty log: drop any previous result (-f: no error if it is absent)
 6 | rm -f test86
 7 | 
 8 | # fully successful test pass results in test86 file of  35482 bytes (51 tests)
 9 | # test pass on AVX2-only CPU results in test86 file of  25616 bytes (51 tests)
10 | # for any other CPU check the output or use Intel SDE within script
11 | 
12 | # x86 and x32 binaries, run natively; NOTE(review): simd_test.x32 presumably comes from a separate makefile - confirm
13 | echo "========================================================" | tee -a test86
14 | echo "Testing x86 target (Intel Core 2 Duo SSE2)" | tee -a test86
15 | echo "========================================================" | tee -a test86
16 | ./simd_test.x86 -c 1 | tee -a test86
17 | echo "========================================================" | tee -a test86
18 | echo "Testing x86avx target (Intel Sandy Bridge AVX1)" | tee -a test86
19 | echo "========================================================" | tee -a test86
20 | ./simd_test.x86avx -c 1 | tee -a test86
21 | echo "========================================================" | tee -a test86
22 | echo "Testing x86avx512 target (Intel Xeon Phi KNL AVX512)" | tee -a test86
23 | echo "========================================================" | tee -a test86
24 | ./simd_test.x86avx512 -c 1 | tee -a test86
25 | echo "========================================================" | tee -a test86
26 | echo "Testing x32 target (Intel Core 2 Duo SSE2)" | tee -a test86
27 | echo "========================================================" | tee -a test86
28 | ./simd_test.x32 -c 1 | tee -a test86
29 | 
30 | # summary is printed to the terminal only (no tee), so it does not change the size of test86
31 | echo "========================================================"
32 | echo "fully successful test pass writes  35482 bytes to test86"
33 | echo "test pass on AVX2-only CPU writes  25616 bytes to test86"
34 | echo "for other CPUs check the output, use Intel SDE in script"
35 | echo "========================================================"
36 | echo "the actual file size after the test run is listed below:"
37 | ls -al test86
38 | echo "========================================================"
39 | 
40 | 
41 | 


--------------------------------------------------------------------------------
/test/simd_test_x64.sln:
--------------------------------------------------------------------------------
 1 | ﻿
 2 | Microsoft Visual Studio Solution File, Format Version 12.00
 3 | # Visual Studio Version 17
 4 | VisualStudioVersion = 17.1.32328.378
 5 | MinimumVisualStudioVersion = 10.0.40219.1
 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simd_test_x64", "simd_test_x64.vcxproj", "{3CDB5A0F-6E4A-45F8-A234-25B9491748D9}"
 7 | EndProject
 8 | Global
 9 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | 		Debug|x64 = Debug|x64
11 | 		Release|x64 = Release|x64
12 | 	EndGlobalSection
13 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | 		{3CDB5A0F-6E4A-45F8-A234-25B9491748D9}.Debug|x64.ActiveCfg = Debug|x64
15 | 		{3CDB5A0F-6E4A-45F8-A234-25B9491748D9}.Debug|x64.Build.0 = Debug|x64
16 | 		{3CDB5A0F-6E4A-45F8-A234-25B9491748D9}.Release|x64.ActiveCfg = Release|x64
17 | 		{3CDB5A0F-6E4A-45F8-A234-25B9491748D9}.Release|x64.Build.0 = Release|x64
18 | 	EndGlobalSection
19 | 	GlobalSection(SolutionProperties) = preSolution
20 | 		HideSolutionNode = FALSE
21 | 	EndGlobalSection
22 | 	GlobalSection(ExtensibilityGlobals) = postSolution
23 | 		SolutionGuid = {11D5CE55-10D6-484B-AEE7-878D57BF69CE}
24 | 	EndGlobalSection
25 | EndGlobal
26 | 


--------------------------------------------------------------------------------
/test/simd_test_x64.vcxproj:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="utf-8"?>
  2 | <Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  3 |   <ItemGroup Label="ProjectConfigurations">
  4 |     <ProjectConfiguration Include="Debug|Win32">
  5 |       <Configuration>Debug</Configuration>
  6 |       <Platform>Win32</Platform>
  7 |     </ProjectConfiguration>
  8 |     <ProjectConfiguration Include="Release|Win32">
  9 |       <Configuration>Release</Configuration>
 10 |       <Platform>Win32</Platform>
 11 |     </ProjectConfiguration>
 12 |     <ProjectConfiguration Include="Debug|x64">
 13 |       <Configuration>Debug</Configuration>
 14 |       <Platform>x64</Platform>
 15 |     </ProjectConfiguration>
 16 |     <ProjectConfiguration Include="Release|x64">
 17 |       <Configuration>Release</Configuration>
 18 |       <Platform>x64</Platform>
 19 |     </ProjectConfiguration>
 20 |   </ItemGroup>
 21 |   <ItemGroup>
 22 |     <ClCompile Include="simd_test.cpp" />
 23 |   </ItemGroup>
 24 |   <ItemGroup>
 25 |     <ClInclude Include="..\core\config\rtarch.h" />
 26 |     <ClInclude Include="..\core\config\rtarch_x32.h" />
 27 |     <ClInclude Include="..\core\config\rtarch_x32_128x1v2.h" />
 28 |     <ClInclude Include="..\core\config\rtarch_x32_128x1v4.h" />
 29 |     <ClInclude Include="..\core\config\rtarch_x32_128x1v8.h" />
 30 |     <ClInclude Include="..\core\config\rtarch_x32_128x2v4.h" />
 31 |     <ClInclude Include="..\core\config\rtarch_x32_256x1v2.h" />
 32 |     <ClInclude Include="..\core\config\rtarch_x32_256x1v8.h" />
 33 |     <ClInclude Include="..\core\config\rtarch_x32_256x2v2.h" />
 34 |     <ClInclude Include="..\core\config\rtarch_x32_512x1v8.h" />
 35 |     <ClInclude Include="..\core\config\rtarch_x32_512x2v2.h" />
 36 |     <ClInclude Include="..\core\config\rtarch_x32_512x4v2.h" />
 37 |     <ClInclude Include="..\core\config\rtarch_x64.h" />
 38 |     <ClInclude Include="..\core\config\rtarch_x64_128x1v2.h" />
 39 |     <ClInclude Include="..\core\config\rtarch_x64_128x1v4.h" />
 40 |     <ClInclude Include="..\core\config\rtarch_x64_128x1v8.h" />
 41 |     <ClInclude Include="..\core\config\rtarch_x64_128x2v4.h" />
 42 |     <ClInclude Include="..\core\config\rtarch_x64_256x1v2.h" />
 43 |     <ClInclude Include="..\core\config\rtarch_x64_256x1v8.h" />
 44 |     <ClInclude Include="..\core\config\rtarch_x64_256x2v2.h" />
 45 |     <ClInclude Include="..\core\config\rtarch_x64_512x1v8.h" />
 46 |     <ClInclude Include="..\core\config\rtarch_x64_512x2v2.h" />
 47 |     <ClInclude Include="..\core\config\rtarch_x64_512x4v2.h" />
 48 |     <ClInclude Include="..\core\config\rtarch_xHB.h" />
 49 |     <ClInclude Include="..\core\config\rtarch_xHB_128x1v2.h" />
 50 |     <ClInclude Include="..\core\config\rtarch_xHB_128x1v4.h" />
 51 |     <ClInclude Include="..\core\config\rtarch_xHB_128x1v8.h" />
 52 |     <ClInclude Include="..\core\config\rtarch_xHB_128x2v4.h" />
 53 |     <ClInclude Include="..\core\config\rtarch_xHB_256x1v2.h" />
 54 |     <ClInclude Include="..\core\config\rtarch_xHB_256x1v8.h" />
 55 |     <ClInclude Include="..\core\config\rtarch_xHB_256x2v2.h" />
 56 |     <ClInclude Include="..\core\config\rtarch_xHB_512x1v8.h" />
 57 |     <ClInclude Include="..\core\config\rtarch_xHB_512x2v2.h" />
 58 |     <ClInclude Include="..\core\config\rtarch_xHB_512x4v2.h" />
 59 |     <ClInclude Include="..\core\config\rtarch_xHF_128x1v2.h" />
 60 |     <ClInclude Include="..\core\config\rtarch_xHF_256x1v8.h" />
 61 |     <ClInclude Include="..\core\config\rtarch_xHF_512x1v8.h" />
 62 |     <ClInclude Include="..\core\config\rtarch_xHF_512x2v2.h" />
 63 |     <ClInclude Include="..\core\config\rtarch_xHF_512x4v2.h" />
 64 |     <ClInclude Include="..\core\config\rtbase.h" />
 65 |     <ClInclude Include="..\core\config\rtconf.h" />
 66 |     <ClInclude Include="..\core\config\rtdocs.h" />
 67 |     <ClInclude Include="..\core\config\rtzero.h" />
 68 |   </ItemGroup>
 69 |   <PropertyGroup Label="Globals">
 70 |     <VCProjectVersion>16.0</VCProjectVersion>
 71 |     <Keyword>Win32Proj</Keyword>
 72 |     <ProjectGuid>{3cdb5a0f-6e4a-45f8-a234-25b9491748d9}</ProjectGuid>
 73 |     <RootNamespace>simdtestx64</RootNamespace>
 74 |     <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
 75 |   </PropertyGroup>
 76 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
 77 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
 78 |     <ConfigurationType>Application</ConfigurationType>
 79 |     <UseDebugLibraries>true</UseDebugLibraries>
 80 |     <PlatformToolset>v143</PlatformToolset>
 81 |     <CharacterSet>Unicode</CharacterSet>
 82 |   </PropertyGroup>
 83 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
 84 |     <ConfigurationType>Application</ConfigurationType>
 85 |     <UseDebugLibraries>false</UseDebugLibraries>
 86 |     <PlatformToolset>v143</PlatformToolset>
 87 |     <WholeProgramOptimization>true</WholeProgramOptimization>
 88 |     <CharacterSet>Unicode</CharacterSet>
 89 |   </PropertyGroup>
 90 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
 91 |     <ConfigurationType>Application</ConfigurationType>
 92 |     <UseDebugLibraries>true</UseDebugLibraries>
 93 |     <PlatformToolset>ClangCL</PlatformToolset>
 94 |     <CharacterSet>Unicode</CharacterSet>
 95 |   </PropertyGroup>
 96 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
 97 |     <ConfigurationType>Application</ConfigurationType>
 98 |     <UseDebugLibraries>false</UseDebugLibraries>
 99 |     <PlatformToolset>ClangCL</PlatformToolset>
100 |     <WholeProgramOptimization>true</WholeProgramOptimization>
101 |     <CharacterSet>Unicode</CharacterSet>
102 |   </PropertyGroup>
103 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
104 |   <ImportGroup Label="ExtensionSettings">
105 |   </ImportGroup>
106 |   <ImportGroup Label="Shared">
107 |   </ImportGroup>
108 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <!-- This and the three following PropertySheets groups (one per configuration/platform pair) each import the per-user Microsoft.Cpp.$(Platform).user.props sheet, guarded by an exists() check so a missing sheet is skipped. -->
109 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
110 |   </ImportGroup>
111 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
112 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
113 |   </ImportGroup>
114 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
115 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
116 |   </ImportGroup>
117 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
118 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
119 |   </ImportGroup>
120 |   <PropertyGroup Label="UserMacros" />
121 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <!-- Incremental linking per configuration: this and the three following groups set LinkIncremental to true for both Debug configurations and to false for both Release configurations. -->
122 |     <LinkIncremental>true</LinkIncremental>
123 |   </PropertyGroup>
124 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
125 |     <LinkIncremental>false</LinkIncremental>
126 |   </PropertyGroup>
127 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
128 |     <LinkIncremental>true</LinkIncremental>
129 |   </PropertyGroup>
130 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
131 |     <LinkIncremental>false</LinkIncremental>
132 |   </PropertyGroup>
133 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <!-- Compiler and linker defaults for Debug|Win32: warning level 3, SDL checks, conformance mode, console subsystem, debug information. NOTE(review): unlike the x64 groups in this project, no AdditionalOptions (/I include path, RT_* defines) are set here; confirm whether Win32 builds of this x64 test project are expected to work. -->
134 |     <ClCompile>
135 |       <WarningLevel>Level3</WarningLevel>
136 |       <SDLCheck>true</SDLCheck>
137 |       <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
138 |       <ConformanceMode>true</ConformanceMode>
139 |     </ClCompile>
140 |     <Link>
141 |       <SubSystem>Console</SubSystem>
142 |       <GenerateDebugInformation>true</GenerateDebugInformation>
143 |     </Link>
144 |   </ItemDefinitionGroup>
145 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <!-- Compiler and linker defaults for Release|Win32: adds function-level linking and intrinsics on the compiler side, COMDAT folding and reference optimization on the linker side. NOTE(review): unlike the x64 groups in this project, no AdditionalOptions (/I include path, RT_* defines) are set here; confirm whether Win32 builds of this x64 test project are expected to work. -->
146 |     <ClCompile>
147 |       <WarningLevel>Level3</WarningLevel>
148 |       <FunctionLevelLinking>true</FunctionLevelLinking>
149 |       <IntrinsicFunctions>true</IntrinsicFunctions>
150 |       <SDLCheck>true</SDLCheck>
151 |       <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
152 |       <ConformanceMode>true</ConformanceMode>
153 |     </ClCompile>
154 |     <Link>
155 |       <SubSystem>Console</SubSystem>
156 |       <EnableCOMDATFolding>true</EnableCOMDATFolding>
157 |       <OptimizeReferences>true</OptimizeReferences>
158 |       <GenerateDebugInformation>true</GenerateDebugInformation>
159 |     </Link>
160 |   </ItemDefinitionGroup>
161 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <!-- Compiler and linker settings for Debug|x64: warning level 3, SDL checks, conformance mode, console subsystem, debug information. -->
162 |     <ClCompile>
163 |       <WarningLevel>Level3</WarningLevel>
164 |       <SDLCheck>true</SDLCheck>
165 |       <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
166 |       <ConformanceMode>true</ConformanceMode>
167 |       <AdditionalOptions>/I "../core/config/" -Wno-unused-function -Wno-missing-braces -Wno-deprecated-declarations /D "RT_WIN64" /D "RT_X64" /D RT_128=4 /D RT_SIMD_COMPAT_SSE=2 /D RT_POINTER=64 /D RT_ADDRESS=64  /D RT_ELEMENT=32 /D RT_ENDIAN=0 /D RT_DEBUG=1 %(AdditionalOptions)</AdditionalOptions> <!-- Raw command-line flags: /I adds ../core/config/ to the include path, the three -Wno-* switches turn off the named warning groups, and each /D defines an RT_* macro (RT_DEBUG=1 in this Debug configuration). NOTE(review): the RT_* macros are presumably read by the rtarch headers under core/config; confirm their meaning there. -->
168 |     </ClCompile>
169 |     <Link>
170 |       <SubSystem>Console</SubSystem>
171 |       <GenerateDebugInformation>true</GenerateDebugInformation>
172 |     </Link>
173 |   </ItemDefinitionGroup>
174 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <!-- Compiler and linker settings for Release|x64: warning level 3, function-level linking, intrinsics, SDL checks and conformance mode for the compiler; console subsystem, COMDAT folding, reference optimization and debug information for the linker. -->
175 |     <ClCompile>
176 |       <WarningLevel>Level3</WarningLevel>
177 |       <FunctionLevelLinking>true</FunctionLevelLinking>
178 |       <IntrinsicFunctions>true</IntrinsicFunctions>
179 |       <SDLCheck>true</SDLCheck>
180 |       <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
181 |       <ConformanceMode>true</ConformanceMode>
182 |       <AdditionalOptions>/I "../core/config/" -Wno-unused-function -Wno-missing-braces -Wno-deprecated-declarations /D "RT_WIN64" /D "RT_X64" /D RT_128=4 /D RT_SIMD_COMPAT_SSE=2 /D RT_POINTER=64 /D RT_ADDRESS=64  /D RT_ELEMENT=32 /D RT_ENDIAN=0 /D RT_DEBUG=0 %(AdditionalOptions)</AdditionalOptions> <!-- Raw command-line flags: /I adds ../core/config/ to the include path, the three -Wno-* switches turn off the named warning groups, and each /D defines an RT_* macro (RT_DEBUG=0 in this Release configuration). NOTE(review): the RT_* macros are presumably read by the rtarch headers under core/config; confirm their meaning there. -->
183 |     </ClCompile>
184 |     <Link>
185 |       <SubSystem>Console</SubSystem>
186 |       <EnableCOMDATFolding>true</EnableCOMDATFolding>
187 |       <OptimizeReferences>true</OptimizeReferences>
188 |       <GenerateDebugInformation>true</GenerateDebugInformation>
189 |     </Link>
190 |   </ItemDefinitionGroup>
191 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
192 |   <ImportGroup Label="ExtensionTargets">
193 |   </ImportGroup>
194 | </Project>


--------------------------------------------------------------------------------
/test/simd_test_x64.vcxproj.filters:
--------------------------------------------------------------------------------
  1 | ﻿<?xml version="1.0" encoding="utf-8"?>
  2 | <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  3 |   <ItemGroup>
  4 |     <Filter Include="Source Files">
  5 |       <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
  6 |       <Extensions>cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
  7 |     </Filter>
  8 |     <Filter Include="Header Files">
  9 |       <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
 10 |       <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>
 11 |     </Filter>
 12 |     <Filter Include="core">
 13 |       <UniqueIdentifier>{0a255df8-ee9e-45d2-b928-0b1efb68dc8e}</UniqueIdentifier>
 14 |     </Filter>
 15 |     <Filter Include="core\config">
 16 |       <UniqueIdentifier>{b0a101d1-d784-48f3-8e7b-ed50858362a2}</UniqueIdentifier>
 17 |     </Filter>
 18 |   </ItemGroup>
 19 |   <ItemGroup>
 20 |     <ClCompile Include="simd_test.cpp">
 21 |       <Filter>Source Files</Filter>
 22 |     </ClCompile>
 23 |   </ItemGroup>
 24 |   <ItemGroup>
 25 |     <ClInclude Include="..\core\config\rtarch.h">
 26 |       <Filter>core\config</Filter>
 27 |     </ClInclude>
 28 |     <ClInclude Include="..\core\config\rtarch_x32.h">
 29 |       <Filter>core\config</Filter>
 30 |     </ClInclude>
 31 |     <ClInclude Include="..\core\config\rtarch_x32_128x1v2.h">
 32 |       <Filter>core\config</Filter>
 33 |     </ClInclude>
 34 |     <ClInclude Include="..\core\config\rtarch_x32_128x1v4.h">
 35 |       <Filter>core\config</Filter>
 36 |     </ClInclude>
 37 |     <ClInclude Include="..\core\config\rtarch_x32_128x1v8.h">
 38 |       <Filter>core\config</Filter>
 39 |     </ClInclude>
 40 |     <ClInclude Include="..\core\config\rtarch_x32_128x2v4.h">
 41 |       <Filter>core\config</Filter>
 42 |     </ClInclude>
 43 |     <ClInclude Include="..\core\config\rtarch_x32_256x1v2.h">
 44 |       <Filter>core\config</Filter>
 45 |     </ClInclude>
 46 |     <ClInclude Include="..\core\config\rtarch_x32_256x1v8.h">
 47 |       <Filter>core\config</Filter>
 48 |     </ClInclude>
 49 |     <ClInclude Include="..\core\config\rtarch_x32_256x2v2.h">
 50 |       <Filter>core\config</Filter>
 51 |     </ClInclude>
 52 |     <ClInclude Include="..\core\config\rtarch_x32_512x1v8.h">
 53 |       <Filter>core\config</Filter>
 54 |     </ClInclude>
 55 |     <ClInclude Include="..\core\config\rtarch_x32_512x2v2.h">
 56 |       <Filter>core\config</Filter>
 57 |     </ClInclude>
 58 |     <ClInclude Include="..\core\config\rtarch_x32_512x4v2.h">
 59 |       <Filter>core\config</Filter>
 60 |     </ClInclude>
 61 |     <ClInclude Include="..\core\config\rtarch_x64.h">
 62 |       <Filter>core\config</Filter>
 63 |     </ClInclude>
 64 |     <ClInclude Include="..\core\config\rtarch_x64_128x1v2.h">
 65 |       <Filter>core\config</Filter>
 66 |     </ClInclude>
 67 |     <ClInclude Include="..\core\config\rtarch_x64_128x1v4.h">
 68 |       <Filter>core\config</Filter>
 69 |     </ClInclude>
 70 |     <ClInclude Include="..\core\config\rtarch_x64_128x1v8.h">
 71 |       <Filter>core\config</Filter>
 72 |     </ClInclude>
 73 |     <ClInclude Include="..\core\config\rtarch_x64_128x2v4.h">
 74 |       <Filter>core\config</Filter>
 75 |     </ClInclude>
 76 |     <ClInclude Include="..\core\config\rtarch_x64_256x1v2.h">
 77 |       <Filter>core\config</Filter>
 78 |     </ClInclude>
 79 |     <ClInclude Include="..\core\config\rtarch_x64_256x1v8.h">
 80 |       <Filter>core\config</Filter>
 81 |     </ClInclude>
 82 |     <ClInclude Include="..\core\config\rtarch_x64_256x2v2.h">
 83 |       <Filter>core\config</Filter>
 84 |     </ClInclude>
 85 |     <ClInclude Include="..\core\config\rtarch_x64_512x1v8.h">
 86 |       <Filter>core\config</Filter>
 87 |     </ClInclude>
 88 |     <ClInclude Include="..\core\config\rtarch_x64_512x2v2.h">
 89 |       <Filter>core\config</Filter>
 90 |     </ClInclude>
 91 |     <ClInclude Include="..\core\config\rtarch_x64_512x4v2.h">
 92 |       <Filter>core\config</Filter>
 93 |     </ClInclude>
 94 |     <ClInclude Include="..\core\config\rtarch_xHB.h">
 95 |       <Filter>core\config</Filter>
 96 |     </ClInclude>
 97 |     <ClInclude Include="..\core\config\rtarch_xHB_128x1v2.h">
 98 |       <Filter>core\config</Filter>
 99 |     </ClInclude>
100 |     <ClInclude Include="..\core\config\rtarch_xHB_128x1v4.h">
101 |       <Filter>core\config</Filter>
102 |     </ClInclude>
103 |     <ClInclude Include="..\core\config\rtarch_xHB_128x1v8.h">
104 |       <Filter>core\config</Filter>
105 |     </ClInclude>
106 |     <ClInclude Include="..\core\config\rtarch_xHB_128x2v4.h">
107 |       <Filter>core\config</Filter>
108 |     </ClInclude>
109 |     <ClInclude Include="..\core\config\rtarch_xHB_256x1v2.h">
110 |       <Filter>core\config</Filter>
111 |     </ClInclude>
112 |     <ClInclude Include="..\core\config\rtarch_xHB_256x1v8.h">
113 |       <Filter>core\config</Filter>
114 |     </ClInclude>
115 |     <ClInclude Include="..\core\config\rtarch_xHB_256x2v2.h">
116 |       <Filter>core\config</Filter>
117 |     </ClInclude>
118 |     <ClInclude Include="..\core\config\rtarch_xHB_512x1v8.h">
119 |       <Filter>core\config</Filter>
120 |     </ClInclude>
121 |     <ClInclude Include="..\core\config\rtarch_xHB_512x2v2.h">
122 |       <Filter>core\config</Filter>
123 |     </ClInclude>
124 |     <ClInclude Include="..\core\config\rtarch_xHB_512x4v2.h">
125 |       <Filter>core\config</Filter>
126 |     </ClInclude>
127 |     <ClInclude Include="..\core\config\rtarch_xHF_128x1v2.h">
128 |       <Filter>core\config</Filter>
129 |     </ClInclude>
130 |     <ClInclude Include="..\core\config\rtarch_xHF_256x1v8.h">
131 |       <Filter>core\config</Filter>
132 |     </ClInclude>
133 |     <ClInclude Include="..\core\config\rtarch_xHF_512x1v8.h">
134 |       <Filter>core\config</Filter>
135 |     </ClInclude>
136 |     <ClInclude Include="..\core\config\rtarch_xHF_512x2v2.h">
137 |       <Filter>core\config</Filter>
138 |     </ClInclude>
139 |     <ClInclude Include="..\core\config\rtarch_xHF_512x4v2.h">
140 |       <Filter>core\config</Filter>
141 |     </ClInclude>
142 |     <ClInclude Include="..\core\config\rtbase.h">
143 |       <Filter>core\config</Filter>
144 |     </ClInclude>
145 |     <ClInclude Include="..\core\config\rtconf.h">
146 |       <Filter>core\config</Filter>
147 |     </ClInclude>
148 |     <ClInclude Include="..\core\config\rtdocs.h">
149 |       <Filter>core\config</Filter>
150 |     </ClInclude>
151 |     <ClInclude Include="..\core\config\rtzero.h">
152 |       <Filter>core\config</Filter>
153 |     </ClInclude>
154 |   </ItemGroup>
155 | </Project>


--------------------------------------------------------------------------------
/test/simd_test_x64.vcxproj.user:
--------------------------------------------------------------------------------
1 | ﻿<?xml version="1.0" encoding="utf-8"?>
2 | <Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3 |   <PropertyGroup />
4 | </Project>


--------------------------------------------------------------------------------