├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── disassemble ├── asm │ ├── int16_128.asm │ ├── int16_16.asm │ ├── int16_32.asm │ ├── int16_64.asm │ ├── int16_8.asm │ ├── int32_128.asm │ ├── int32_16.asm │ ├── int32_32.asm │ ├── int32_64.asm │ ├── int32_8.asm │ ├── int8_128.asm │ ├── int8_16.asm │ ├── int8_32.asm │ ├── int8_64.asm │ └── int8_8.asm ├── int16_128.asm ├── int16_16.asm ├── int16_32.asm ├── int16_64.asm ├── int16_8.asm ├── int32_128.asm ├── int32_16.asm ├── int32_32.asm ├── int32_64.asm ├── int32_8.asm ├── int64_128.asm ├── int64_16.asm ├── int64_32.asm ├── int64_64.asm ├── int64_8.asm ├── int8_128.asm ├── int8_16.asm ├── int8_32.asm ├── int8_64.asm ├── int8_8.asm ├── main_int16_128.mojo ├── main_int16_16.mojo ├── main_int16_32.mojo ├── main_int16_64.mojo ├── main_int16_8.mojo ├── main_int32_128.mojo ├── main_int32_16.mojo ├── main_int32_32.mojo ├── main_int32_64.mojo ├── main_int32_8.mojo ├── main_int64_128.mojo ├── main_int64_16.mojo ├── main_int64_32.mojo ├── main_int64_64.mojo ├── main_int64_8.mojo ├── main_int8_128.mojo ├── main_int8_16.mojo ├── main_int8_32.mojo ├── main_int8_64.mojo ├── main_int8_8.mojo └── run_all.sh ├── img └── sort-network-16.png ├── main.mojo └── sort_network ├── Layer.mojo ├── SwapData.mojo ├── __init__.mojo ├── crash3.mojo ├── crash4.mojo ├── mllr_examples.mojo ├── nan_check.mojo ├── performance.mojo ├── shuffle_test.mojo ├── sort_network.mojo ├── sort_network_data.mojo ├── sort_network_ml.mojo ├── sort_tools.mojo ├── test_individual.mojo ├── test_tools.mojo ├── tests.mojo └── timing_test.mojo /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 
6 | *.asm text 7 | *.mojo text 8 | 9 | # Declare files that will always have CRLF line endings on checkout. 10 | *.sln text eol=crlf 11 | 12 | # Declare files that will always have LF line endings on checkout. 13 | *.sh text eol=lf 14 | 15 | # Denote all files that are truly binary and should not be modified. 16 | *.png binary 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /main 2 | /disassemble/main 3 | /disassemble/sort_network.mojopkg 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Henk-Jan Lebbink 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # High Performance Sorting in Mojo 2 | 3 | Efficient sorting in Modular Mojo optimized for small datasets (with a number of elements less than or equal to 128). 4 | 5 | The primary objective is to create a drop-in replacement for the `sort[type: DType](inout v: DynamicVector[SIMD[type, 1]])` 6 | function, using sorting networks when the dataset is 128 elements or fewer. However, there are still a few areas that need refinement. 7 | 8 | The sorting networks are shamelessly borrowed from the work of [Bert Dobbelaere](https://bertdobbelaere.github.io/sorting_networks_extended.html) who did all the hard searching! 9 | 10 | ## Performance compared to stdlib sort 11 | 12 | I would love to present comprehensive scientific results, complete with boxplots, once a proper statistics library is available for computing standard deviations and confidence intervals. If you find yourself in need of ideas for a useful Mojo project, please consider it. In the meantime, all I can present is the humble minimum time taken (in ns) over a total of 1_000_000 runs. 13 | 14 | 1. In the `mojo` column, you'll find a call to: `sort[type: DType](inout v: DynamicVector[SIMD[type, 1]])` with the specified type and vector size. Each number is the average time (in ns) over a run of 1000 samples; this is repeated 10000 times and only the minimum is reported. 15 | 16 | 2. Under the `netw_SIMD` column is a call to the sorting network: `fn sort_network[T: DType, width: Int, ascending: Bool = True](v: SIMD[T, width]) -> SIMD[T, width]`. If you are sceptical (and you should be), please take a look at the code in the `test_performance` function. 17 | 18 | 3. 
In column `netw_vec` is a similar function that uses a DTypePointer instead of a SIMD registers, `fn sort_network[type: DType, ascending: Bool = True](inout v: DTypePointer[type], size: Int)`. Note that mojo is able to cheat (a bit) by optimizing over multiple sample steps. 19 | 20 | 21 | Results from Sapphire Rapids (Intel(R) Xeon(R) w5-2455X 3.19 GHz) 22 | ``` 23 | size mojo netw_SIMD netw_vec 24 | uint64 8 20.273000717163086 13.581000328063965 4.6510000228881836 25 | uint64 16 23.427999496459961 22.229000091552734 17.906999588012695 26 | uint64 32 89.584999084472656 52.423000335693359 53.023998260498047 27 | uint64 64 147.78300476074219 144.59100341796875 141.26400756835938 28 | uint64 128 312.09100341796875 341.37298583984375 339.8699951171875 29 | 30 | int64 8 20.659999847412109 13.581000328063965 4.8870000839233398 31 | int64 16 24.968999862670898 22.239999771118164 18.802000045776367 32 | int64 32 85.327003479003906 55.11199951171875 53.051998138427734 33 | int64 64 150.22099304199219 144.59100341796875 144.75700378417969 34 | int64 128 322.68798828125 341.3909912109375 340.40399169921875 35 | 36 | float64 8 22.27400016784668 18.808000564575195 6.7069997787475586 37 | float64 16 24.620000839233398 31.35099983215332 20.356000900268555 38 | float64 32 81.494003295898438 67.047996520996094 66.035003662109375 39 | float64 64 150.78700256347656 172.62100219726562 166.50799560546875 40 | float64 128 331.7349853515625 404.89401245117188 411.49600219726562 41 | 42 | uint32 8 20.590000152587891 7.3559999465942383 2.3519999980926514 43 | uint32 16 23.315000534057617 11.532999992370605 4.7659997940063477 44 | uint32 32 81.68499755859375 22.245000839233398 14.359000205993652 45 | uint32 64 135.843994140625 53.701999664306641 53.279998779296875 46 | uint32 128 286.82101440429688 116.65699768066406 117.87899780273438 47 | 48 | int32 8 21.63800048828125 7.3579998016357422 2.4739999771118164 49 | int32 16 23.562999725341797 11.534000396728516 5.0029997825622559 50 | int32 
32 82.912002563476562 22.243000030517578 14.378999710083008 51 | int32 64 143.15400695800781 56.402999877929688 52.959999084472656 52 | int32 128 288.4530029296875 122.76100158691406 117.63999938964844 53 | 54 | float32 8 21.847000122070312 15.956999778747559 4.9879999160766602 55 | float32 16 24.704999923706055 28.111000061035156 11.850000381469727 56 | float32 32 83.470001220703125 50.549999237060547 37.381000518798828 57 | float32 64 144.34100341796875 107.47599792480469 100.47000122070312 58 | float32 128 329.67401123046875 219.9219970703125 221.28599548339844 59 | 60 | uint16 8 23.617000579833984 5.3600001335144043 2.3429999351501465 61 | uint16 16 27.139999389648438 23.76300048828125 6.9120001792907715 62 | uint16 32 95.140998840332031 23.583999633789062 10.407999992370605 63 | uint16 64 172.92799377441406 55.355998992919922 43.13800048828125 64 | uint16 128 349.23599243164062 92.987998962402344 86.794998168945312 65 | 66 | int16 8 22.202999114990234 5.3649997711181641 2.3369998931884766 67 | int16 16 27.21299934387207 24.965999603271484 6.9099998474121094 68 | int16 32 100.19100189208984 23.583999633789062 10.407999992370605 69 | int16 64 184.5469970703125 55.359001159667969 41.124000549316406 70 | int16 128 404.385009765625 92.977996826171875 86.78399658203125 71 | 72 | float16 8 20.812999725341797 16.593999862670898 5.2340002059936523 73 | float16 16 25.663999557495117 41.544998168945312 15.77400016784668 74 | float16 32 83.755996704101562 52.127998352050781 23.106000900268555 75 | float16 64 153.06500244140625 103.88200378417969 90.084999084472656 76 | float16 128 351.635009765625 167.30299377441406 167.4010009765625 77 | 78 | uint8 8 20.913999557495117 6.5199999809265137 2.0220000743865967 79 | uint8 16 25.128000259399414 11.020999908447266 3.7279999256134033 80 | uint8 32 93.383003234863281 32.431999206542969 11.883999824523926 81 | uint8 64 167.77999877929688 34.359001159667969 12.085000038146973 82 | uint8 128 380.62701416015625 53.615001678466797 
34.665000915527344 83 | 84 | int8 8 20.586000442504883 6.5199999809265137 2.0190000534057617 85 | int8 16 25.600000381469727 11.022000312805176 3.562000036239624 86 | int8 32 85.268997192382812 32.435001373291016 11.329000473022461 87 | int8 64 141.73300170898438 32.689998626708984 12.116000175476074 88 | int8 128 308.25698852539062 53.620998382568359 34.612998962402344 89 | ``` 90 | 91 | ``` 92 | Results from Emerald Rapids (Intel(R) Xeon(R) ?? 1.7 GHz) 93 | size mojo netw_SIMD netw_vec 94 | uint64 8 27.791000366210938 20.422000885009766 7.3550000190734863 95 | uint64 16 32.803001403808594 33.422000885009766 27.797000885009766 96 | uint64 32 122.26399993896484 82.86199951171875 83.290000915527344 97 | uint64 64 223.39799499511719 228.552001953125 227.84500122070312 98 | uint64 128 478.69100952148438 538.88897705078125 536.98602294921875 99 | 100 | int64 8 26.641000747680664 20.422000885009766 7.3610000610351562 101 | int64 16 32.807998657226562 33.424999237060547 27.790000915527344 102 | int64 32 117.85600280761719 82.860000610351562 83.291999816894531 103 | int64 64 223.14300537109375 228.55099487304688 227.82200622558594 104 | int64 128 484.18499755859375 538.8900146484375 536.98199462890625 105 | 106 | float64 8 30.49799919128418 27.63599967956543 10.373000144958496 107 | float64 16 35.863998413085938 48.244998931884766 31.976999282836914 108 | float64 32 145.55900573730469 102.96800231933594 101.15399932861328 109 | float64 64 268.17498779296875 252.70399475097656 260.26901245117188 110 | float64 128 596.2239990234375 657.43402099609375 667.83599853515625 111 | 112 | uint32 8 27.006999969482422 11.060999870300293 3.7309999465942383 113 | uint32 16 34.424999237060547 17.33799934387207 7.0060000419616699 114 | uint32 32 116.31300354003906 33.462001800537109 22.275999069213867 115 | uint32 64 204.51199340820312 84.58599853515625 82.709999084472656 116 | uint32 128 442.9119873046875 179.36799621582031 179.79100036621094 117 | 118 | int32 8 28.940999984741211 
10.868000030517578 3.7190001010894775 119 | int32 16 38.169998168945312 17.339000701904297 6.9850001335144043 120 | int32 32 147.83000183105469 33.451999664306641 22.235000610351562 121 | int32 64 284.40301513671875 84.636001586914062 82.153999328613281 122 | int32 128 641.84600830078125 179.36399841308594 179.90400695800781 123 | 124 | float32 8 28.200000762939453 25.253999710083008 7.9130001068115234 125 | float32 16 34.761001586914062 41.203998565673828 17.174999237060547 126 | float32 32 120.22899627685547 75.650001525878906 59.050998687744141 127 | float32 64 221.822998046875 157.47200012207031 154.09199523925781 128 | float32 128 518.968994140625 339.24099731445312 340.75 129 | 130 | uint16 8 28.165000915527344 8.0579996109008789 3.5130000114440918 131 | uint16 16 32.837001800537109 22.895000457763672 6.3600001335144043 132 | uint16 32 114.80500030517578 35.465000152587891 15.109000205993652 133 | uint16 64 204.48300170898438 83.19000244140625 61.066001892089844 134 | uint16 128 453.56900024414062 139.95399475097656 132.66499328613281 135 | 136 | int16 8 28.25200080871582 8.0649995803833008 3.5039999485015869 137 | int16 16 33.259998321533203 22.892999649047852 6.3569998741149902 138 | int16 32 114.947998046875 35.465999603271484 15.116999626159668 139 | int16 64 207.40299987792969 83.19000244140625 61.062000274658203 140 | int16 128 452.95498657226562 139.83700561523438 132.86399841308594 141 | 142 | float16 8 27.378999710083008 24.954999923706055 7.8639998435974121 143 | float16 16 33.935001373291016 49.456001281738281 16.621000289916992 144 | float16 32 117.83300018310547 80.332000732421875 36.534999847412109 145 | float16 64 221.02099609375 152.37399291992188 129.11700439453125 146 | float16 128 522.46002197265625 245.54800415039062 243.36000061035156 147 | 148 | uint8 8 27.99799919128418 9.7910003662109375 3.0350000858306885 149 | uint8 16 32.863998413085938 16.569999694824219 5.6020002365112305 150 | uint8 32 113.04599761962891 32.797000885009766 
9.6079998016357422 151 | uint8 64 205.45799255371094 51.645000457763672 18.542999267578125 152 | uint8 128 447.67498779296875 80.572998046875 56.722000122070312 153 | 154 | int8 8 28.150999069213867 9.7969999313354492 3.0339999198913574 155 | int8 16 32.794998168945312 16.569999694824219 5.5999999046325684 156 | int8 32 118.37799835205078 32.793998718261719 9.6129999160766602 157 | int8 64 212.61599731445312 51.645000457763672 18.607000350952148 158 | int8 128 457.52200317382812 80.58599853515625 56.761001586914062 159 | ``` 160 | 161 | 162 | 163 | Overall, a sorting network is about 4 times faster. 164 | 165 | Note that sorts of size 64 are currently not reported due to a bug. If you are in a position to address this issue, please take a look at https://github.com/modularml/mojo/issues/1505. 166 | 167 | Note that the performance of float code is notably different compared to sorts with integer of the same size. I think it can be attributed to nan checking, as explained later on. 168 | 169 | ## How does it work? 170 | 171 | A sorting network represents the smallest number of comparisons and swaps required to sort an array. For instance, the sorting network for 16 inputs has 61 compare/exchange elements (CEs) organized into 9 layers. Layers consist of parallel CE operations, allowing them to be executed in any order. However, the order of the layers remains fixed. The big advantage of sorting networks is that they can be implemented without any data-dependent control flow. Thus, a single sorting network is just a linear branch-free sequence of instructions. Just what we need. For some interesting details see [here](https://jix.one/proving-50-year-old-sorting-networks-optimal-part-1/). 172 | 173 | ![net16](https://github.com/HJLebbink/sort-networks-mojo/blob/main/img/sort-network-16.png "Sorting Network 16") 174 | 175 | The above sorting network has been proven to be minimal [https://arxiv.org/abs/1310.6271], no need to worry about that. 
What remains is our quest to find the most efficient method to implement this on our current hardware. 176 | 177 | ## Is the code efficient? 178 | I would like to restrict this question to code generated by the Mojo compiler (version 0.7.0) for AVX-512 capable architectures. 179 | 180 | Next is the assembly code of one of the nine layers in a network that sorts 16 uint32 elements. 181 | 182 | ```asm 183 | vmovdqa64 zmm0, ZMMWORD PTR [r13+rax*1+0x0] 184 | vpermd zmm3, zmm0, zmm1 185 | vpminud zmm2, zmm1, zmm3 186 | mov ax, 0xb552 187 | kmovd k1, eax 188 | vpmaxud zmm2{k1}, zmm1, zmm3 189 | ``` 190 | 191 | To start, `zmm0` is loaded with permutation indices, which hold the static information in the layer indicating how elements should be exchanged. 192 | In the subsequent `vpermd` instruction, the data in `zmm1` is permuted and stored in `zmm3`. 193 | 194 | We then obtain the minimum (`vpminud`) between the original data (`zmm1`) and the permuted data (`zmm3`), storing the result in `zmm2`. 195 | Here comes a clever trick – we also compute the maximum values (`vpmaxud`), and only overwrite the minimum values based on a static 196 | mask (`k1`) that indicates the lower side of the compare/exchange element. 197 | 198 | Repeat this for all layers and you have sorted the data without any branches, and with minimal memory access. For sorting 16 uint32 values, I can't think of anything more efficient. 199 | 200 | ## Why Mojo? 201 | 202 | I view Mojo as a smart assembler. While I would love to manually write all the sorting functionality in assembly, the myriad combinations of array 203 | lengths and data types make it somewhat impractical. Luckily, Mojo diligently generates similarly efficient code for int32, int16, sorting in ascending or descending order, and more. 204 | 205 | Is the Mojo code flawless? 
No, you could blame LLVM for the following unnecessary nan check: 206 | 207 | ```asm 208 | vmovaps zmm0, ZMMWORD PTR [r15+rax*1] 209 | vpermps zmm0, zmm0, zmm1 210 | vminps zmm2, zmm0, zmm1 211 | vcmpunordps k1, zmm1, zmm1 212 | vmovaps zmm2{k1}, zmm0 213 | vmaxps zmm1, zmm0, zmm1 214 | vmovaps zmm1{k1}, zmm0 215 | mov ax, 0xb552 216 | kmovd k1, eax 217 | vmovaps zmm2{k1}, zmm1 218 | ``` 219 | 220 | Compared to the code for sorting 16 uint32 values, the first three instructions are unchanged (but are now for float32 instead of uint32). 221 | The [`vcmpunordps`](https://github.com/HJLebbink/asm-dude/wiki/CMPPS) instruction is new, which stores in mask `k1` the values in the data 222 | (`zmm1`) that are nan. However, there are several reasons why there cannot be any nans in `zmm1`. The simplest reason is that the previous layer 223 | already includes the exact same nan tests. 224 | 225 | Next, the minimum and maximum values, which happen to contain no nan values, are overwritten with the permuted data 226 | (which could also contain nan values, but that doesn't seem to be of interest). Removing the nan tests would result 227 | in the same optimal code. If there were a way to toy with the strictness of floating points, perhaps this unnecessary code could be trimmed. If you know a way, let me know! 
-------------------------------------------------------------------------------- /disassemble/asm/int16_16.asm: -------------------------------------------------------------------------------- 1 | 0x00000000000060a6 <+582>: call 0x5470 2 | 0x00000000000060ab <+587>: mov rbx,QWORD PTR [rsp+0xc0] 3 | 0x00000000000060b3 <+595>: mov rax,QWORD PTR [rsp+0xc8] 4 | 0x00000000000060bb <+603>: mov QWORD PTR [rsp+0x150],rax 5 | 0x00000000000060c3 <+611>: vmovdqa ymm0,YMMWORD PTR [rip+0x57035] # 0x5d100 6 | 0x00000000000060cb <+619>: vmovdqu ymm2,YMMWORD PTR [rsp+0x10] 7 | 0x00000000000060d1 <+625>: vpermw ymm0,ymm0,ymm2 8 | 0x00000000000060d7 <+631>: vpminsw ymm1,ymm2,ymm0 9 | 0x00000000000060db <+635>: mov ax,0xf2b0 10 | 0x00000000000060df <+639>: kmovd k1,eax 11 | 0x00000000000060e3 <+643>: vpmaxsw ymm1{k1},ymm2,ymm0 12 | 0x00000000000060e9 <+649>: vmovdqa ymm0,YMMWORD PTR [rip+0x5702f] # 0x5d120 13 | 0x00000000000060f1 <+657>: vpermw ymm0,ymm0,ymm1 14 | 0x00000000000060f7 <+663>: vpminsw ymm2,ymm1,ymm0 15 | 0x00000000000060fb <+667>: mov ax,0xdcc4 16 | 0x00000000000060ff <+671>: kmovd k1,eax 17 | 0x0000000000006103 <+675>: vpmaxsw ymm2{k1},ymm1,ymm0 18 | 0x0000000000006109 <+681>: vmovdqa ymm0,YMMWORD PTR [rip+0x5702f] # 0x5d140 19 | 0x0000000000006111 <+689>: vpermw ymm0,ymm0,ymm2 20 | 0x0000000000006117 <+695>: vpminsw ymm1,ymm2,ymm0 21 | 0x000000000000611b <+699>: mov ax,0xef08 22 | 0x000000000000611f <+703>: kmovd k1,eax 23 | 0x0000000000006123 <+707>: vpmaxsw ymm1{k1},ymm2,ymm0 24 | 0x0000000000006129 <+713>: vmovdqa ymm0,YMMWORD PTR [rip+0x5702f] # 0x5d160 25 | 0x0000000000006131 <+721>: vpermw ymm0,ymm0,ymm1 26 | 0x0000000000006137 <+727>: vpminsw ymm2,ymm1,ymm0 27 | 0x000000000000613b <+731>: mov ax,0xb552 28 | 0x000000000000613f <+735>: kmovd k1,eax 29 | 0x0000000000006143 <+739>: vpmaxsw ymm2{k1},ymm1,ymm0 30 | 0x0000000000006149 <+745>: vmovdqa ymm0,YMMWORD PTR [rip+0x5702f] # 0x5d180 31 | 0x0000000000006151 <+753>: vpermw ymm0,ymm0,ymm2 32 | 0x0000000000006157 
<+759>: vpmaxsw ymm1,ymm2,ymm0 33 | 0x000000000000615b <+763>: mov ax,0x14d6 34 | 0x000000000000615f <+767>: kmovd k1,eax 35 | 0x0000000000006163 <+771>: vpminsw ymm1{k1},ymm2,ymm0 36 | 0x0000000000006169 <+777>: vmovdqa ymm0,YMMWORD PTR [rip+0x5702f] # 0x5d1a0 37 | 0x0000000000006171 <+785>: vpermw ymm0,ymm0,ymm1 38 | 0x0000000000006177 <+791>: vpmaxsw ymm2,ymm1,ymm0 39 | 0x000000000000617b <+795>: mov ax,0x24da 40 | 0x000000000000617f <+799>: kmovd k1,eax 41 | 0x0000000000006183 <+803>: vpminsw ymm2{k1},ymm1,ymm0 42 | 0x0000000000006189 <+809>: vpshufb ymm0,ymm2,YMMWORD PTR [rip+0x5702e] # 0x5d1c0 43 | 0x0000000000006192 <+818>: vpmaxsw ymm1,ymm2,ymm0 44 | 0x0000000000006196 <+822>: mov ax,0x1554 45 | 0x000000000000619a <+826>: kmovd k1,eax 46 | 0x000000000000619e <+830>: vpminsw ymm1{k1},ymm2,ymm0 47 | 0x00000000000061a4 <+836>: vmovdqa ymm0,YMMWORD PTR [rip+0x57034] # 0x5d1e0 48 | 0x00000000000061ac <+844>: vpermd ymm0,ymm0,ymm1 49 | 0x00000000000061b1 <+849>: vpminsw ymm2,ymm1,ymm0 50 | 0x00000000000061b5 <+853>: vpmaxsw ymm0,ymm1,ymm0 51 | 0x00000000000061b9 <+857>: vpblendd ymm1,ymm0,ymm2,0x14 52 | 0x00000000000061bf <+863>: vmovdqa ymm0,YMMWORD PTR [rip+0x57039] # 0x5d200 53 | 0x00000000000061c7 <+871>: vmovdqu YMMWORD PTR [rsp+0x130],ymm1 54 | 0x00000000000061d0 <+880>: vpermw ymm0,ymm0,ymm1 55 | 0x00000000000061d6 <+886>: vmovdqu YMMWORD PTR [rsp+0x280],ymm0 56 | 0x00000000000061df <+895>: mov bp,0xaa8 57 | 0x00000000000061e3 <+899>: vpxor xmm0,xmm0,xmm0 58 | 0x00000000000061e7 <+903>: vmovdqa XMMWORD PTR [rsp+0xe0],xmm0 59 | 0x00000000000061f0 <+912>: lea rsi,[rsp+0xe0] 60 | 0x00000000000061f8 <+920>: mov edi,0x1 61 | 0x00000000000061fd <+925>: vzeroupper 62 | 0x0000000000006200 <+928>: call 0x5470 -------------------------------------------------------------------------------- /disassemble/asm/int16_32.asm: -------------------------------------------------------------------------------- 1 | 0x00000000000060bb <+603>: call 0x5470 2 | 0x00000000000060c0 
<+608>: mov rbx,QWORD PTR [rsp+0xa0] 3 | 0x00000000000060c8 <+616>: mov rax,QWORD PTR [rsp+0xa8] 4 | 0x00000000000060d0 <+624>: mov QWORD PTR [rsp+0x150],rax 5 | 0x00000000000060d8 <+632>: vmovdqu64 zmm2,ZMMWORD PTR [rsp+0x110] 6 | 0x00000000000060e3 <+643>: vprold zmm0,zmm2,0x10 7 | 0x00000000000060ea <+650>: vpminsw zmm1,zmm2,zmm0 8 | 0x00000000000060f0 <+656>: mov eax,0xaaaaaaaa 9 | 0x00000000000060f5 <+661>: kmovd k1,eax 10 | 0x00000000000060f9 <+665>: vpmaxsw zmm1{k1},zmm2,zmm0 11 | 0x00000000000060ff <+671>: vpshufd zmm0,zmm1,0xb1 12 | 0x0000000000006106 <+678>: vpminsw zmm2,zmm1,zmm0 13 | 0x000000000000610c <+684>: mov eax,0xcccccccc 14 | 0x0000000000006111 <+689>: kmovd k1,eax 15 | 0x0000000000006115 <+693>: vpmaxsw zmm2{k1},zmm1,zmm0 16 | 0x000000000000611b <+699>: vpshufd zmm0,zmm2,0x4e 17 | 0x0000000000006122 <+706>: vpminsw zmm1,zmm2,zmm0 18 | 0x0000000000006128 <+712>: mov eax,0xf0f0f0f0 19 | 0x000000000000612d <+717>: kmovd k1,eax 20 | 0x0000000000006131 <+721>: vpmaxsw zmm1{k1},zmm2,zmm0 21 | 0x0000000000006137 <+727>: vpxor xmm0,xmm0,xmm0 22 | 0x000000000000613b <+731>: vpermq zmm0,zmm1,0x4e 23 | 0x0000000000006142 <+738>: vpminsw zmm2,zmm1,zmm0 24 | 0x0000000000006148 <+744>: mov eax,0xff00ff00 25 | 0x000000000000614d <+749>: kmovd k1,eax 26 | 0x0000000000006151 <+753>: vpmaxsw zmm2{k1},zmm1,zmm0 27 | 0x0000000000006157 <+759>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56fdf] # 0x5d140 28 | 0x0000000000006161 <+769>: vpermw zmm0,zmm0,zmm2 29 | 0x0000000000006167 <+775>: vpminsw zmm1,zmm2,zmm0 30 | 0x000000000000616d <+781>: mov eax,0xf7117710 31 | 0x0000000000006172 <+786>: kmovd k1,eax 32 | 0x0000000000006176 <+790>: vpmaxsw zmm1{k1},zmm2,zmm0 33 | 0x000000000000617c <+796>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56ffa] # 0x5d180 34 | 0x0000000000006186 <+806>: vpermw zmm0,zmm0,zmm1 35 | 0x000000000000618c <+812>: vpmaxsw zmm2,zmm1,zmm0 36 | 0x0000000000006192 <+818>: mov eax,0x249a26da 37 | 0x0000000000006197 <+823>: kmovd k1,eax 38 | 0x000000000000619b 
<+827>: vpminsw zmm2{k1},zmm1,zmm0 39 | 0x00000000000061a1 <+833>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57015] # 0x5d1c0 40 | 0x00000000000061ab <+843>: vpermw zmm0,zmm0,zmm2 41 | 0x00000000000061b1 <+849>: vpmaxsw zmm1,zmm2,zmm0 42 | 0x00000000000061b7 <+855>: mov eax,0x2079be 43 | 0x00000000000061bc <+860>: kmovd k1,eax 44 | 0x00000000000061c0 <+864>: vpminsw zmm1{k1},zmm2,zmm0 45 | 0x00000000000061c6 <+870>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57030] # 0x5d200 46 | 0x00000000000061d0 <+880>: vpermw zmm0,zmm0,zmm1 47 | 0x00000000000061d6 <+886>: vpmaxsw zmm2,zmm1,zmm0 48 | 0x00000000000061dc <+892>: mov eax,0x40edf8 49 | 0x00000000000061e1 <+897>: kmovd k1,eax 50 | 0x00000000000061e5 <+901>: vpminsw zmm2{k1},zmm1,zmm0 51 | 0x00000000000061eb <+907>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x5704b] # 0x5d240 52 | 0x00000000000061f5 <+917>: vpermw zmm0,zmm0,zmm2 53 | 0x00000000000061fb <+923>: vpmaxsw zmm1,zmm2,zmm0 54 | 0x0000000000006201 <+929>: mov eax,0x880deaa 55 | 0x0000000000006206 <+934>: kmovd k1,eax 56 | 0x000000000000620a <+938>: vpminsw zmm1{k1},zmm2,zmm0 57 | 0x0000000000006210 <+944>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57066] # 0x5d280 58 | 0x000000000000621a <+954>: vpermw zmm0,zmm0,zmm1 59 | 0x0000000000006220 <+960>: vpmaxsw zmm2,zmm1,zmm0 60 | 0x0000000000006226 <+966>: mov eax,0x480fa84 61 | 0x000000000000622b <+971>: kmovd k1,eax 62 | 0x000000000000622f <+975>: vpminsw zmm2{k1},zmm1,zmm0 63 | 0x0000000000006235 <+981>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57081] # 0x5d2c0 64 | 0x000000000000623f <+991>: vpermw zmm0,zmm0,zmm2 65 | 0x0000000000006245 <+997>: vpmaxsw zmm1,zmm2,zmm0 66 | 0x000000000000624b <+1003>: mov eax,0x818e644 67 | 0x0000000000006250 <+1008>: kmovd k1,eax 68 | 0x0000000000006254 <+1012>: vpminsw zmm1{k1},zmm2,zmm0 69 | 0x000000000000625a <+1018>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x5709c] # 0x5d300 70 | 0x0000000000006264 <+1028>: vpermw zmm0,zmm0,zmm1 71 | 0x000000000000626a <+1034>: vpmaxsw zmm2,zmm1,zmm0 72 | 0x0000000000006270 <+1040>: mov 
eax,0x22ccb20 73 | 0x0000000000006275 <+1045>: kmovd k1,eax 74 | 0x0000000000006279 <+1049>: vpminsw zmm2{k1},zmm1,zmm0 75 | 0x000000000000627f <+1055>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x570b7] # 0x5d340 76 | 0x0000000000006289 <+1065>: vpermw zmm0,zmm0,zmm2 77 | 0x000000000000628f <+1071>: vpmaxsw zmm1,zmm2,zmm0 78 | 0x0000000000006295 <+1077>: mov eax,0x54aad48 79 | 0x000000000000629a <+1082>: kmovd k1,eax 80 | 0x000000000000629e <+1086>: vpminsw zmm1{k1},zmm2,zmm0 81 | 0x00000000000062a4 <+1092>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x570d2] # 0x5d380 82 | 0x00000000000062ae <+1102>: vmovdqu64 ZMMWORD PTR [rsp+0x270],zmm1 83 | 0x00000000000062b9 <+1113>: vpermw zmm0,zmm0,zmm1 84 | 0x00000000000062bf <+1119>: vmovdqu64 ZMMWORD PTR [rsp+0x2c0],zmm0 85 | 0x00000000000062c7 <+1127>: mov ebp,0xaaaaaa8 86 | 0x00000000000062cc <+1132>: vpxor xmm0,xmm0,xmm0 87 | 0x00000000000062d0 <+1136>: vmovdqa XMMWORD PTR [rsp+0xc0],xmm0 88 | 0x00000000000062d9 <+1145>: lea rsi,[rsp+0xc0] 89 | 0x00000000000062e1 <+1153>: mov edi,0x1 90 | 0x00000000000062e6 <+1158>: vzeroupper 91 | 0x00000000000062e9 <+1161>: call 0x5470 -------------------------------------------------------------------------------- /disassemble/asm/int16_64.asm: -------------------------------------------------------------------------------- 1 | 0x0000000000006100 <+640>: call 0x5470 2 | 0x0000000000006105 <+645>: vmovdqa64 zmm5,ZMMWORD PTR [rsp+0x140] 3 | 0x000000000000610d <+653>: vpshufd zmm0,zmm5,0xb1 4 | 0x0000000000006114 <+660>: vmovdqa64 zmm4,ZMMWORD PTR [rsp+0x180] 5 | 0x000000000000611c <+668>: vpshufd zmm1,zmm4,0xb1 6 | 0x0000000000006123 <+675>: vpminsw zmm2,zmm4,zmm1 7 | 0x0000000000006129 <+681>: vpminsw zmm3,zmm5,zmm0 8 | 0x000000000000612f <+687>: mov eax,0xcccccccc 9 | 0x0000000000006134 <+692>: kmovd k1,eax 10 | 0x0000000000006138 <+696>: vpmaxsw zmm3{k1},zmm5,zmm0 11 | 0x000000000000613e <+702>: vpmaxsw zmm2{k1},zmm4,zmm1 12 | 0x0000000000006144 <+708>: vprold zmm0,zmm2,0x10 13 | 0x000000000000614b 
<+715>: vprold zmm1,zmm3,0x10 14 | 0x0000000000006152 <+722>: vpminsw zmm4,zmm3,zmm1 15 | 0x0000000000006158 <+728>: vpminsw zmm5,zmm2,zmm0 16 | 0x000000000000615e <+734>: mov eax,0xaaaaaaaa 17 | 0x0000000000006163 <+739>: kmovd k1,eax 18 | 0x0000000000006167 <+743>: vpmaxsw zmm5{k1},zmm2,zmm0 19 | 0x000000000000616d <+749>: vpmaxsw zmm4{k1},zmm3,zmm1 20 | 0x0000000000006173 <+755>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56f83] # 0x5d100 21 | 0x000000000000617d <+765>: vpermi2w zmm0,zmm5,zmm4 22 | 0x0000000000006183 <+771>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56fb3] # 0x5d140 23 | 0x000000000000618d <+781>: vpermi2w zmm1,zmm4,zmm5 24 | 0x0000000000006193 <+787>: vpmaxsw zmm2,zmm4,zmm1 25 | 0x0000000000006199 <+793>: mov eax,0x2222bb2b 26 | 0x000000000000619e <+798>: kmovd k1,eax 27 | 0x00000000000061a2 <+802>: vpminsw zmm2{k1},zmm4,zmm1 28 | 0x00000000000061a8 <+808>: vpminsw zmm1,zmm5,zmm0 29 | 0x00000000000061ae <+814>: mov eax,0xd4dd4444 30 | 0x00000000000061b3 <+819>: kmovd k1,eax 31 | 0x00000000000061b7 <+823>: vpmaxsw zmm1{k1},zmm5,zmm0 32 | 0x00000000000061bd <+829>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56fb9] # 0x5d180 33 | 0x00000000000061c7 <+839>: vpermi2w zmm0,zmm1,zmm2 34 | 0x00000000000061cd <+845>: vmovdqa64 zmm3,ZMMWORD PTR [rip+0x56fe9] # 0x5d1c0 35 | 0x00000000000061d7 <+855>: vpermi2w zmm3,zmm2,zmm1 36 | 0x00000000000061dd <+861>: vpmaxsw zmm4,zmm2,zmm3 37 | 0x00000000000061e3 <+867>: mov eax,0x90669f 38 | 0x00000000000061e8 <+872>: kmovd k1,eax 39 | 0x00000000000061ec <+876>: vpminsw zmm4{k1},zmm2,zmm3 40 | 0x00000000000061f2 <+882>: vpminsw zmm2,zmm1,zmm0 41 | 0x00000000000061f8 <+888>: mov eax,0xf9660900 42 | 0x00000000000061fd <+893>: kmovd k1,eax 43 | 0x0000000000006201 <+897>: vpmaxsw zmm2{k1},zmm1,zmm0 44 | 0x0000000000006207 <+903>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56fef] # 0x5d200 45 | 0x0000000000006211 <+913>: vpermi2w zmm0,zmm2,zmm4 46 | 0x0000000000006217 <+919>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x5701f] # 0x5d240 47 | 0x0000000000006221 
<+929>: vpermi2w zmm1,zmm2,zmm4 48 | 0x0000000000006227 <+935>: vpmaxsw zmm3,zmm4,zmm1 49 | 0x000000000000622d <+941>: mov eax,0x690066 50 | 0x0000000000006232 <+946>: kmovd k1,eax 51 | 0x0000000000006236 <+950>: vpminsw zmm3{k1},zmm4,zmm1 52 | 0x000000000000623c <+956>: vpminsw zmm1,zmm2,zmm0 53 | 0x0000000000006242 <+962>: vmovdqa64 zmm4,ZMMWORD PTR [rip+0x57074] # 0x5d2c0 54 | 0x000000000000624c <+972>: vpermi2w zmm4,zmm1,zmm3 55 | 0x0000000000006252 <+978>: mov eax,0x66009600 56 | 0x0000000000006257 <+983>: kmovd k1,eax 57 | 0x000000000000625b <+987>: vpmaxsw zmm1{k1},zmm2,zmm0 58 | 0x0000000000006261 <+993>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57015] # 0x5d280 59 | 0x000000000000626b <+1003>: vpermi2w zmm0,zmm1,zmm3 60 | 0x0000000000006271 <+1009>: vpmaxsw zmm2,zmm3,zmm0 61 | 0x0000000000006277 <+1015>: mov eax,0x9069090 62 | 0x000000000000627c <+1020>: kmovd k1,eax 63 | 0x0000000000006280 <+1024>: vpminsw zmm2{k1},zmm3,zmm0 64 | 0x0000000000006286 <+1030>: vpminsw zmm0,zmm1,zmm4 65 | 0x000000000000628c <+1036>: mov eax,0x9096090 66 | 0x0000000000006291 <+1041>: kmovd k1,eax 67 | 0x0000000000006295 <+1045>: vpmaxsw zmm0{k1},zmm1,zmm4 68 | 0x000000000000629b <+1051>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x5705b] # 0x5d300 69 | 0x00000000000062a5 <+1061>: vpermi2w zmm1,zmm0,zmm2 70 | 0x00000000000062ab <+1067>: vmovdqa64 zmm3,ZMMWORD PTR [rip+0x5708b] # 0x5d340 71 | 0x00000000000062b5 <+1077>: vpermi2w zmm3,zmm2,zmm0 72 | 0x00000000000062bb <+1083>: vpmaxsw zmm4,zmm2,zmm3 73 | 0x00000000000062c1 <+1089>: mov eax,0x6096960 74 | 0x00000000000062c6 <+1094>: kmovd k1,eax 75 | 0x00000000000062ca <+1098>: vpminsw zmm4{k1},zmm2,zmm3 76 | 0x00000000000062d0 <+1104>: vpminsw zmm2,zmm0,zmm1 77 | 0x00000000000062d6 <+1110>: mov eax,0x6969069 78 | 0x00000000000062db <+1115>: kmovd k1,eax 79 | 0x00000000000062df <+1119>: vpmaxsw zmm2{k1},zmm0,zmm1 80 | 0x00000000000062e5 <+1125>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57091] # 0x5d380 81 | 0x00000000000062ef <+1135>: vpermi2w 
zmm0,zmm2,zmm4 82 | 0x00000000000062f5 <+1141>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x570c1] # 0x5d3c0 83 | 0x00000000000062ff <+1151>: vpermi2w zmm1,zmm4,zmm2 84 | 0x0000000000006305 <+1157>: vpmaxsw zmm3,zmm4,zmm1 85 | 0x000000000000630b <+1163>: vpminsw zmm5,zmm2,zmm0 86 | 0x0000000000006311 <+1169>: mov eax,0xf0690f 87 | 0x0000000000006316 <+1174>: kmovd k1,eax 88 | 0x000000000000631a <+1178>: vpmaxsw zmm5{k1},zmm2,zmm0 89 | 0x0000000000006320 <+1184>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57116] # 0x5d440 90 | 0x000000000000632a <+1194>: vpermi2w zmm0,zmm5,zmm3 91 | 0x0000000000006330 <+1200>: mov eax,0x960f00 92 | 0x0000000000006335 <+1205>: kmovd k1,eax 93 | 0x0000000000006339 <+1209>: vpminsw zmm3{k1},zmm4,zmm1 94 | 0x000000000000633f <+1215>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x570b7] # 0x5d400 95 | 0x0000000000006349 <+1225>: vpermi2w zmm1,zmm3,zmm5 96 | 0x000000000000634f <+1231>: vpmaxsw zmm2,zmm3,zmm1 97 | 0x0000000000006355 <+1237>: mov eax,0x690f09 98 | 0x000000000000635a <+1242>: kmovd k1,eax 99 | 0x000000000000635e <+1246>: vpminsw zmm2{k1},zmm3,zmm1 100 | 0x0000000000006364 <+1252>: vpmaxsw zmm1,zmm5,zmm0 101 | 0x000000000000636a <+1258>: mov eax,0x6f0f6960 102 | 0x000000000000636f <+1263>: kmovd k1,eax 103 | 0x0000000000006373 <+1267>: vpminsw zmm1{k1},zmm5,zmm0 104 | 0x0000000000006379 <+1273>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x570fd] # 0x5d480 105 | 0x0000000000006383 <+1283>: vpermw zmm0,zmm0,zmm2 106 | 0x0000000000006389 <+1289>: vmovdqa64 zmm3,ZMMWORD PTR [rip+0x5712d] # 0x5d4c0 107 | 0x0000000000006393 <+1299>: vpermw zmm3,zmm3,zmm1 108 | 0x0000000000006399 <+1305>: vpminsw zmm4,zmm2,zmm0 109 | 0x000000000000639f <+1311>: vpmaxsw zmm0,zmm2,zmm0 110 | 0x00000000000063a5 <+1317>: mov eax,0x6069f 111 | 0x00000000000063aa <+1322>: kmovd k1,eax 112 | 0x00000000000063ae <+1326>: vmovdqu16 zmm0{k1},zmm4 113 | 0x00000000000063b4 <+1332>: vpmaxsw zmm2,zmm1,zmm3 114 | 0x00000000000063ba <+1338>: mov eax,0x69f0600 115 | 0x00000000000063bf <+1343>: kmovd k1,eax 
116 | 0x00000000000063c3 <+1347>: vpminsw zmm2{k1},zmm1,zmm3 117 | 0x00000000000063c9 <+1353>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x5712d] # 0x5d500 118 | 0x00000000000063d3 <+1363>: vpermi2w zmm1,zmm0,zmm2 119 | 0x00000000000063d9 <+1369>: vmovdqa64 zmm3,ZMMWORD PTR [rip+0x5715d] # 0x5d540 120 | 0x00000000000063e3 <+1379>: vpermi2w zmm3,zmm2,zmm4 121 | 0x00000000000063e9 <+1385>: vpmaxsw zmm4,zmm0,zmm1 122 | 0x00000000000063ef <+1391>: mov eax,0x90f6 123 | 0x00000000000063f4 <+1396>: kmovd k1,eax 124 | 0x00000000000063f8 <+1400>: vpminsw zmm4{k1},zmm0,zmm1 125 | 0x00000000000063fe <+1406>: vpmaxsw zmm0,zmm2,zmm3 126 | 0x0000000000006404 <+1412>: mov eax,0x90f69000 127 | 0x0000000000006409 <+1417>: kmovd k1,eax 128 | 0x000000000000640d <+1421>: vpminsw zmm0{k1},zmm2,zmm3 129 | 0x0000000000006413 <+1427>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x57163] # 0x5d580 130 | 0x000000000000641d <+1437>: vpermi2w zmm1,zmm0,zmm4 131 | 0x0000000000006423 <+1443>: vmovdqa64 zmm2,ZMMWORD PTR [rip+0x57193] # 0x5d5c0 132 | 0x000000000000642d <+1453>: vpermi2w zmm2,zmm4,zmm0 133 | 0x0000000000006433 <+1459>: vpmaxsw zmm3,zmm4,zmm2 134 | 0x0000000000006439 <+1465>: mov eax,0xe8e0 135 | 0x000000000000643e <+1470>: kmovd k1,eax 136 | 0x0000000000006442 <+1474>: vpminsw zmm3{k1},zmm4,zmm2 137 | 0x0000000000006448 <+1480>: vpmaxsw zmm2,zmm0,zmm1 138 | 0x000000000000644e <+1486>: mov eax,0xe8e06666 139 | 0x0000000000006453 <+1491>: kmovd k1,eax 140 | 0x0000000000006457 <+1495>: vpminsw zmm2{k1},zmm0,zmm1 141 | 0x000000000000645d <+1501>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57199] # 0x5d600 142 | 0x0000000000006467 <+1511>: vpermi2w zmm0,zmm2,zmm3 143 | 0x000000000000646d <+1517>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x571c9] # 0x5d640 144 | 0x0000000000006477 <+1527>: vpermi2w zmm1,zmm3,zmm2 145 | 0x000000000000647d <+1533>: vpmaxsw zmm4,zmm3,zmm1 146 | 0x0000000000006483 <+1539>: vpminsw zmm5,zmm2,zmm0 147 | 0x0000000000006489 <+1545>: mov eax,0xb3931331 148 | 0x000000000000648e <+1550>: kmovd k1,eax 
149 | 0x0000000000006492 <+1554>: vpmaxsw zmm5{k1},zmm2,zmm0 150 | 0x0000000000006498 <+1560>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x5721e] # 0x5d6c0 151 | 0x00000000000064a2 <+1570>: vpermi2w zmm0,zmm5,zmm4 152 | 0x00000000000064a8 <+1576>: mov eax,0x8880088 153 | 0x00000000000064ad <+1581>: kmovd k1,eax 154 | 0x00000000000064b1 <+1585>: vpminsw zmm4{k1},zmm3,zmm1 155 | 0x00000000000064b7 <+1591>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x571bf] # 0x5d680 156 | 0x00000000000064c1 <+1601>: vpermi2w zmm1,zmm4,zmm5 157 | 0x00000000000064c7 <+1607>: vpmaxsw zmm2,zmm4,zmm1 158 | 0x00000000000064cd <+1613>: mov eax,0xa00ca4c 159 | 0x00000000000064d2 <+1618>: kmovd k1,eax 160 | 0x00000000000064d6 <+1622>: vpminsw zmm2{k1},zmm4,zmm1 161 | 0x00000000000064dc <+1628>: vpmaxsw zmm1,zmm5,zmm0 162 | 0x00000000000064e2 <+1634>: mov eax,0xc48cd9ac 163 | 0x00000000000064e7 <+1639>: kmovd k1,eax 164 | 0x00000000000064eb <+1643>: vpminsw zmm1{k1},zmm5,zmm0 165 | 0x00000000000064f1 <+1649>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57205] # 0x5d700 166 | 0x00000000000064fb <+1659>: vpermi2w zmm0,zmm2,zmm1 167 | 0x0000000000006501 <+1665>: vmovdqa64 zmm3,ZMMWORD PTR [rip+0x57235] # 0x5d740 168 | 0x000000000000650b <+1675>: vpermi2w zmm3,zmm1,zmm2 169 | 0x0000000000006511 <+1681>: vpmaxsw zmm4,zmm1,zmm3 170 | 0x0000000000006517 <+1687>: mov eax,0x88ca8888 171 | 0x000000000000651c <+1692>: kmovd k1,eax 172 | 0x0000000000006520 <+1696>: vpminsw zmm4{k1},zmm1,zmm3 173 | 0x0000000000006526 <+1702>: vpmaxsw zmm1,zmm2,zmm0 174 | 0x000000000000652c <+1708>: mov eax,0x2466 175 | 0x0000000000006531 <+1713>: kmovd k1,eax 176 | 0x0000000000006535 <+1717>: vpminsw zmm1{k1},zmm2,zmm0 177 | 0x000000000000653b <+1723>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x5723b] # 0x5d780 178 | 0x0000000000006545 <+1733>: vpermi2w zmm0,zmm1,zmm4 179 | 0x000000000000654b <+1739>: vmovdqa64 zmm2,ZMMWORD PTR [rip+0x5726b] # 0x5d7c0 180 | 0x0000000000006555 <+1749>: vpermi2w zmm2,zmm4,zmm1 181 | 0x000000000000655b <+1755>: vpmaxsw 
zmm3,zmm4,zmm2 182 | 0x0000000000006561 <+1761>: mov eax,0xeeca8888 183 | 0x0000000000006566 <+1766>: kmovd k1,eax 184 | 0x000000000000656a <+1770>: vpminsw zmm3{k1},zmm4,zmm2 185 | 0x0000000000006570 <+1776>: vpmaxsw zmm2,zmm1,zmm0 186 | 0x0000000000006576 <+1782>: vmovdqa64 zmm4,ZMMWORD PTR [rip+0x572c0] # 0x5d840 187 | 0x0000000000006580 <+1792>: vpermi2w zmm4,zmm3,zmm2 188 | 0x0000000000006586 <+1798>: mov eax,0xac88 189 | 0x000000000000658b <+1803>: kmovd k1,eax 190 | 0x000000000000658f <+1807>: vpminsw zmm2{k1},zmm1,zmm0 191 | 0x0000000000006595 <+1813>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57261] # 0x5d800 192 | 0x000000000000659f <+1823>: vpermi2w zmm0,zmm2,zmm3 193 | 0x00000000000065a5 <+1829>: vpmaxsw zmm1,zmm2,zmm0 194 | 0x00000000000065ab <+1835>: mov eax,0x44caaa 195 | 0x00000000000065b0 <+1840>: kmovd k1,eax 196 | 0x00000000000065b4 <+1844>: vpminsw zmm1{k1},zmm2,zmm0 197 | 0x00000000000065ba <+1850>: vpmaxsw zmm0,zmm3,zmm4 198 | 0x00000000000065c0 <+1856>: mov eax,0xaaaccc88 199 | 0x00000000000065c5 <+1861>: kmovd k1,eax 200 | 0x00000000000065c9 <+1865>: vpminsw zmm0{k1},zmm3,zmm4 201 | 0x00000000000065cf <+1871>: vmovdqa64 zmm2,ZMMWORD PTR [rip+0x572a7] # 0x5d880 202 | 0x00000000000065d9 <+1881>: vpermi2w zmm2,zmm1,zmm0 203 | 0x00000000000065df <+1887>: vmovdqa64 zmm3,ZMMWORD PTR [rip+0x572d7] # 0x5d8c0 204 | 0x00000000000065e9 <+1897>: vpermi2w zmm3,zmm0,zmm1 205 | 0x00000000000065ef <+1903>: vpmaxsw zmm4,zmm0,zmm3 206 | 0x00000000000065f5 <+1909>: mov eax,0xcaacaa88 207 | 0x00000000000065fa <+1914>: kmovd k1,eax 208 | 0x00000000000065fe <+1918>: vpminsw zmm4{k1},zmm0,zmm3 209 | 0x0000000000006604 <+1924>: vpmaxsw zmm0,zmm1,zmm2 210 | 0x000000000000660a <+1930>: mov eax,0xaacaac 211 | 0x000000000000660f <+1935>: kmovd k1,eax 212 | 0x0000000000006613 <+1939>: vpminsw zmm0{k1},zmm1,zmm2 213 | 0x0000000000006619 <+1945>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x572dd] # 0x5d900 214 | 0x0000000000006623 <+1955>: vpermi2w zmm1,zmm0,zmm4 215 | 0x0000000000006629 
<+1961>: vmovdqa64 zmm2,ZMMWORD PTR [rip+0x5730d] # 0x5d940 216 | 0x0000000000006633 <+1971>: vpermi2w zmm2,zmm4,zmm0 217 | 0x0000000000006639 <+1977>: vpmaxsw zmm3,zmm4,zmm2 218 | 0x000000000000663f <+1983>: mov eax,0xaccaccc8 219 | 0x0000000000006644 <+1988>: kmovd k1,eax 220 | 0x0000000000006648 <+1992>: vpminsw zmm3{k1},zmm4,zmm2 221 | 0x000000000000664e <+1998>: vpmaxsw zmm2,zmm0,zmm1 222 | 0x0000000000006654 <+2004>: vmovdqa64 zmm4,ZMMWORD PTR [rip+0x57362] # 0x5d9c0 223 | 0x000000000000665e <+2014>: vpermi2w zmm4,zmm3,zmm2 224 | 0x0000000000006664 <+2020>: vmovdqa64 ZMMWORD PTR [rsp+0x380],zmm4 225 | 0x000000000000666c <+2028>: mov eax,0x4ccacca 226 | 0x0000000000006671 <+2033>: kmovd k1,eax 227 | 0x0000000000006675 <+2037>: vpminsw zmm2{k1},zmm0,zmm1 228 | 0x000000000000667b <+2043>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x572fb] # 0x5d980 229 | 0x0000000000006685 <+2053>: vmovdqa64 ZMMWORD PTR [rsp+0x3c0],zmm3 230 | 0x000000000000668d <+2061>: vmovdqa64 ZMMWORD PTR [rsp+0x340],zmm2 231 | 0x0000000000006695 <+2069>: vpermi2w zmm0,zmm2,zmm3 232 | 0x000000000000669b <+2075>: vmovdqa64 ZMMWORD PTR [rsp+0x300],zmm0 233 | 0x00000000000066a3 <+2083>: mov rbx,QWORD PTR [rsp+0xd0] 234 | 0x00000000000066ab <+2091>: mov rax,QWORD PTR [rsp+0xd8] 235 | 0x00000000000066b3 <+2099>: mov QWORD PTR [rsp+0x1d0],rax 236 | 0x00000000000066bb <+2107>: vpxor xmm0,xmm0,xmm0 237 | 0x00000000000066bf <+2111>: vmovdqa XMMWORD PTR [rsp+0xf0],xmm0 238 | 0x00000000000066c8 <+2120>: lea rsi,[rsp+0xf0] 239 | 0x00000000000066d0 <+2128>: mov edi,0x1 240 | 0x00000000000066d5 <+2133>: vzeroupper 241 | 0x00000000000066d8 <+2136>: call 0x5470 242 | -------------------------------------------------------------------------------- /disassemble/asm/int16_8.asm: -------------------------------------------------------------------------------- 1 | 0x0000000000006072 <+562>: call 0x5470 2 | 0x0000000000006077 <+567>: mov rbx,QWORD PTR [rsp+0xb0] 3 | 0x000000000000607f <+575>: mov r12,QWORD PTR [rsp+0xb8] 4 
| 0x0000000000006087 <+583>: vmovdqa xmm2,XMMWORD PTR [rsp] 5 | 0x000000000000608c <+588>: vpshufd xmm0,xmm2,0xb1 6 | 0x0000000000006091 <+593>: vpminsw xmm1,xmm2,xmm0 7 | 0x0000000000006095 <+597>: vpmaxsw xmm0,xmm2,xmm0 8 | 0x0000000000006099 <+601>: vpblendd xmm0,xmm1,xmm0,0xa 9 | 0x000000000000609f <+607>: vpshufd xmm1,xmm0,0x4e 10 | 0x00000000000060a4 <+612>: vpminsw xmm2,xmm0,xmm1 11 | 0x00000000000060a8 <+616>: vpmaxsw xmm0,xmm0,xmm1 12 | 0x00000000000060ac <+620>: vpblendd xmm0,xmm2,xmm0,0xc 13 | 0x00000000000060b2 <+626>: vprold xmm1,xmm0,0x10 14 | 0x00000000000060b9 <+633>: vpminsw xmm2,xmm0,xmm1 15 | 0x00000000000060bd <+637>: vpmaxsw xmm0,xmm0,xmm1 16 | 0x00000000000060c1 <+641>: vpblendw xmm0,xmm2,xmm0,0xaa 17 | 0x00000000000060c7 <+647>: vpshufd xmm1,xmm0,0xd8 18 | 0x00000000000060cc <+652>: vpminsw xmm2,xmm0,xmm1 19 | 0x00000000000060d0 <+656>: vpmaxsw xmm0,xmm0,xmm1 20 | 0x00000000000060d4 <+660>: vpblendd xmm0,xmm0,xmm2,0x2 21 | 0x00000000000060da <+666>: vpshufb xmm1,xmm0,XMMWORD PTR [rip+0x5701d] # 0x5d100 22 | 0x00000000000060e3 <+675>: vpminsw xmm2,xmm0,xmm1 23 | 0x00000000000060e7 <+679>: vpmaxsw xmm0,xmm0,xmm1 24 | 0x00000000000060eb <+683>: vpblendw xmm0,xmm0,xmm2,0xa 25 | 0x00000000000060f1 <+689>: vpshufb xmm1,xmm0,XMMWORD PTR [rip+0x57016] # 0x5d110 26 | 0x00000000000060fa <+698>: vpminsw xmm2,xmm0,xmm1 27 | 0x00000000000060fe <+702>: vmovdqa XMMWORD PTR [rsp],xmm2 28 | 0x0000000000006103 <+707>: vpmaxsw xmm0,xmm0,xmm1 29 | 0x0000000000006107 <+711>: vmovdqa XMMWORD PTR [rsp+0x230],xmm0 30 | 0x0000000000006110 <+720>: vpxor xmm0,xmm0,xmm0 31 | 0x0000000000006114 <+724>: vmovdqa XMMWORD PTR [rsp+0xd0],xmm0 32 | 0x000000000000611d <+733>: lea rsi,[rsp+0xd0] 33 | 0x0000000000006125 <+741>: mov edi,0x1 34 | 0x000000000000612a <+746>: call 0x5470 -------------------------------------------------------------------------------- /disassemble/asm/int32_16.asm: -------------------------------------------------------------------------------- 1 | 
0x00000000000060ab <+603>: call 0x5470 2 | 0x00000000000060b0 <+608>: mov rbx,QWORD PTR [rsp+0xa0] 3 | 0x00000000000060b8 <+616>: mov rax,QWORD PTR [rsp+0xa8] 4 | 0x00000000000060c0 <+624>: mov QWORD PTR [rsp+0x150],rax 5 | 0x00000000000060c8 <+632>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x5706e] # 0x5d140 6 | 0x00000000000060d2 <+642>: vmovdqu64 zmm2,ZMMWORD PTR [rsp+0x110] 7 | 0x00000000000060dd <+653>: vpermd zmm0,zmm0,zmm2 8 | 0x00000000000060e3 <+659>: vpminsd zmm1,zmm2,zmm0 9 | 0x00000000000060e9 <+665>: mov ax,0xf2b0 10 | 0x00000000000060ed <+669>: kmovd k1,eax 11 | 0x00000000000060f1 <+673>: vpmaxsd zmm1{k1},zmm2,zmm0 12 | 0x00000000000060f7 <+679>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x5707f] # 0x5d180 13 | 0x0000000000006101 <+689>: vpermd zmm0,zmm0,zmm1 14 | 0x0000000000006107 <+695>: vpminsd zmm2,zmm1,zmm0 15 | 0x000000000000610d <+701>: mov ax,0xdcc4 16 | 0x0000000000006111 <+705>: kmovd k1,eax 17 | 0x0000000000006115 <+709>: vpmaxsd zmm2{k1},zmm1,zmm0 18 | 0x000000000000611b <+715>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x5709b] # 0x5d1c0 19 | 0x0000000000006125 <+725>: vpermd zmm0,zmm0,zmm2 20 | 0x000000000000612b <+731>: vpminsd zmm1,zmm2,zmm0 21 | 0x0000000000006131 <+737>: mov ax,0xef08 22 | 0x0000000000006135 <+741>: kmovd k1,eax 23 | 0x0000000000006139 <+745>: vpmaxsd zmm1{k1},zmm2,zmm0 24 | 0x000000000000613f <+751>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x570b7] # 0x5d200 25 | 0x0000000000006149 <+761>: vpermd zmm0,zmm0,zmm1 26 | 0x000000000000614f <+767>: vpminsd zmm2,zmm1,zmm0 27 | 0x0000000000006155 <+773>: mov ax,0xb552 28 | 0x0000000000006159 <+777>: kmovd k1,eax 29 | 0x000000000000615d <+781>: vpmaxsd zmm2{k1},zmm1,zmm0 30 | 0x0000000000006163 <+787>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x570d3] # 0x5d240 31 | 0x000000000000616d <+797>: vpermd zmm0,zmm0,zmm2 32 | 0x0000000000006173 <+803>: vpmaxsd zmm1,zmm2,zmm0 33 | 0x0000000000006179 <+809>: mov ax,0x14d6 34 | 0x000000000000617d <+813>: kmovd k1,eax 35 | 0x0000000000006181 <+817>: vpminsd zmm1{k1},zmm2,zmm0 36 
| 0x0000000000006187 <+823>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x570ef] # 0x5d280 37 | 0x0000000000006191 <+833>: vpermd zmm0,zmm0,zmm1 38 | 0x0000000000006197 <+839>: vpmaxsd zmm2,zmm1,zmm0 39 | 0x000000000000619d <+845>: mov ax,0x24da 40 | 0x00000000000061a1 <+849>: kmovd k1,eax 41 | 0x00000000000061a5 <+853>: vpminsd zmm2{k1},zmm1,zmm0 42 | 0x00000000000061ab <+859>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x5710b] # 0x5d2c0 43 | 0x00000000000061b5 <+869>: vpermd zmm0,zmm0,zmm2 44 | 0x00000000000061bb <+875>: vpmaxsd zmm1,zmm2,zmm0 45 | 0x00000000000061c1 <+881>: mov ax,0x1554 46 | 0x00000000000061c5 <+885>: kmovd k1,eax 47 | 0x00000000000061c9 <+889>: vpminsd zmm1{k1},zmm2,zmm0 48 | 0x00000000000061cf <+895>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57127] # 0x5d300 49 | 0x00000000000061d9 <+905>: vpermq zmm0,zmm0,zmm1 50 | 0x00000000000061df <+911>: vpmaxsd zmm2,zmm1,zmm0 51 | 0x00000000000061e5 <+917>: mov ax,0x330 52 | 0x00000000000061e9 <+921>: kmovd k1,eax 53 | 0x00000000000061ed <+925>: vpminsd zmm2{k1},zmm1,zmm0 54 | 0x00000000000061f3 <+931>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57143] # 0x5d340 55 | 0x00000000000061fd <+941>: vmovdqu64 ZMMWORD PTR [rsp+0x270],zmm2 56 | 0x0000000000006208 <+952>: vpermd zmm0,zmm0,zmm2 57 | 0x000000000000620e <+958>: vmovdqu64 ZMMWORD PTR [rsp+0x2c0],zmm0 58 | 0x0000000000006216 <+966>: mov bp,0xaa8 59 | 0x000000000000621a <+970>: vpxor xmm0,xmm0,xmm0 60 | 0x000000000000621e <+974>: vmovdqa XMMWORD PTR [rsp+0xc0],xmm0 61 | 0x0000000000006227 <+983>: lea rsi,[rsp+0xc0] 62 | 0x000000000000622f <+991>: mov edi,0x1 63 | 0x0000000000006234 <+996>: vzeroupper 64 | 0x0000000000006237 <+999>: call 0x5470 -------------------------------------------------------------------------------- /disassemble/asm/int32_32.asm: -------------------------------------------------------------------------------- 1 | 0x0000000000006100 <+640>: call 0x5470 2 | 0x0000000000006105 <+645>: mov rbx,QWORD PTR [rsp+0xd0] 3 | 0x000000000000610d <+653>: mov rax,QWORD PTR 
[rsp+0xd8] 4 | 0x0000000000006115 <+661>: mov QWORD PTR [rsp+0x1d0],rax 5 | 0x000000000000611d <+669>: vmovdqa64 zmm5,ZMMWORD PTR [rsp+0x140] 6 | 0x0000000000006125 <+677>: vpshufd zmm0,zmm5,0xb1 7 | 0x000000000000612c <+684>: vmovdqa64 zmm4,ZMMWORD PTR [rsp+0x180] 8 | 0x0000000000006134 <+692>: vpshufd zmm1,zmm4,0xb1 9 | 0x000000000000613b <+699>: vpminsd zmm2,zmm4,zmm1 10 | 0x0000000000006141 <+705>: vpminsd zmm3,zmm5,zmm0 11 | 0x0000000000006147 <+711>: mov ax,0xaaaa 12 | 0x000000000000614b <+715>: kmovd k1,eax 13 | 0x000000000000614f <+719>: vpmaxsd zmm3{k1},zmm5,zmm0 14 | 0x0000000000006155 <+725>: vpmaxsd zmm2{k1},zmm4,zmm1 15 | 0x000000000000615b <+731>: vpshufd zmm0,zmm2,0x4e 16 | 0x0000000000006162 <+738>: vpshufd zmm1,zmm3,0x4e 17 | 0x0000000000006169 <+745>: vpminsd zmm4,zmm3,zmm1 18 | 0x000000000000616f <+751>: vpminsd zmm5,zmm2,zmm0 19 | 0x0000000000006175 <+757>: vpmaxsd zmm1,zmm3,zmm1 20 | 0x000000000000617b <+763>: vshufps zmm1,zmm4,zmm1,0xe4 21 | 0x0000000000006182 <+770>: vpmaxsd zmm0,zmm2,zmm0 22 | 0x0000000000006188 <+776>: vshufps zmm0,zmm5,zmm0,0xe4 23 | 0x000000000000618f <+783>: vpxor xmm2,xmm2,xmm2 24 | 0x0000000000006193 <+787>: vpermpd zmm2,zmm1,0x4e 25 | 0x000000000000619a <+794>: vpxor xmm3,xmm3,xmm3 26 | 0x000000000000619e <+798>: vpermpd zmm3,zmm0,0x4e 27 | 0x00000000000061a5 <+805>: vpminsd zmm4,zmm0,zmm3 28 | 0x00000000000061ab <+811>: vpminsd zmm5,zmm1,zmm2 29 | 0x00000000000061b1 <+817>: mov ax,0xf0f0 30 | 0x00000000000061b5 <+821>: kmovd k1,eax 31 | 0x00000000000061b9 <+825>: vpmaxsd zmm5{k1},zmm1,zmm2 32 | 0x00000000000061bf <+831>: vpmaxsd zmm4{k1},zmm0,zmm3 33 | 0x00000000000061c5 <+837>: vshufi64x2 zmm0,zmm4,zmm4,0x4e 34 | 0x00000000000061cc <+844>: vshufi64x2 zmm1,zmm5,zmm5,0x4e 35 | 0x00000000000061d3 <+851>: vpminsd zmm2,zmm5,zmm1 36 | 0x00000000000061d9 <+857>: vpminsd zmm3,zmm4,zmm0 37 | 0x00000000000061df <+863>: mov ax,0xff00 38 | 0x00000000000061e3 <+867>: kmovd k1,eax 39 | 0x00000000000061e7 <+871>: vpmaxsd 
zmm3{k1},zmm4,zmm0 40 | 0x00000000000061ed <+877>: vpmaxsd zmm2{k1},zmm5,zmm1 41 | 0x00000000000061f3 <+883>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56f03] # 0x5d100 42 | 0x00000000000061fd <+893>: vmovdqa64 zmm1,zmm3 43 | 0x0000000000006203 <+899>: vpermt2d zmm1,zmm0,zmm2 44 | 0x0000000000006209 <+905>: vpermi2d zmm0,zmm2,zmm3 45 | 0x000000000000620f <+911>: vpmaxsd zmm4,zmm2,zmm0 46 | 0x0000000000006215 <+917>: mov ax,0x8ee 47 | 0x0000000000006219 <+921>: kmovd k1,eax 48 | 0x000000000000621d <+925>: vpminsd zmm4{k1},zmm2,zmm0 49 | 0x0000000000006223 <+931>: vpminsd zmm0,zmm3,zmm1 50 | 0x0000000000006229 <+937>: mov ax,0x7710 51 | 0x000000000000622d <+941>: kmovd k1,eax 52 | 0x0000000000006231 <+945>: vpmaxsd zmm0{k1},zmm3,zmm1 53 | 0x0000000000006237 <+951>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56eff] # 0x5d140 54 | 0x0000000000006241 <+961>: vmovdqa64 zmm2,zmm0 55 | 0x0000000000006247 <+967>: vpermt2d zmm2,zmm1,zmm4 56 | 0x000000000000624d <+973>: vpermi2d zmm1,zmm4,zmm0 57 | 0x0000000000006253 <+979>: vpmaxsd zmm3,zmm4,zmm1 58 | 0x0000000000006259 <+985>: mov ax,0x249a 59 | 0x000000000000625d <+989>: kmovd k1,eax 60 | 0x0000000000006261 <+993>: vpminsd zmm3{k1},zmm4,zmm1 61 | 0x0000000000006267 <+999>: vpminsd zmm1,zmm0,zmm2 62 | 0x000000000000626d <+1005>: mov ax,0xd925 63 | 0x0000000000006271 <+1009>: kmovd k1,eax 64 | 0x0000000000006275 <+1013>: vpmaxsd zmm1{k1},zmm0,zmm2 65 | 0x000000000000627b <+1019>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56efb] # 0x5d180 66 | 0x0000000000006285 <+1029>: vmovdqa64 zmm2,zmm3 67 | 0x000000000000628b <+1035>: vpermt2d zmm2,zmm0,zmm1 68 | 0x0000000000006291 <+1041>: vpermi2d zmm0,zmm1,zmm3 69 | 0x0000000000006297 <+1047>: vpmaxsd zmm4,zmm3,zmm0 70 | 0x000000000000629d <+1053>: mov ax,0x20 71 | 0x00000000000062a1 <+1057>: kmovd k1,eax 72 | 0x00000000000062a5 <+1061>: vpminsd zmm4{k1},zmm3,zmm0 73 | 0x00000000000062ab <+1067>: vpminsd zmm0,zmm1,zmm2 74 | 0x00000000000062b1 <+1073>: mov ax,0x8641 75 | 0x00000000000062b5 <+1077>: kmovd 
k1,eax 76 | 0x00000000000062b9 <+1081>: vpmaxsd zmm0{k1},zmm1,zmm2 77 | 0x00000000000062bf <+1087>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56ef7] # 0x5d1c0 78 | 0x00000000000062c9 <+1097>: vpermi2d zmm1,zmm4,zmm0 79 | 0x00000000000062cf <+1103>: vmovdqa64 zmm2,ZMMWORD PTR [rip+0x56f27] # 0x5d200 80 | 0x00000000000062d9 <+1113>: vpermi2d zmm2,zmm0,zmm4 81 | 0x00000000000062df <+1119>: vpmaxsd zmm3,zmm4,zmm2 82 | 0x00000000000062e5 <+1125>: mov ax,0x40 83 | 0x00000000000062e9 <+1129>: kmovd k1,eax 84 | 0x00000000000062ed <+1133>: vpminsd zmm3{k1},zmm4,zmm2 85 | 0x00000000000062f3 <+1139>: vpminsd zmm2,zmm0,zmm1 86 | 0x00000000000062f9 <+1145>: mov ax,0x1207 87 | 0x00000000000062fd <+1149>: kmovd k1,eax 88 | 0x0000000000006301 <+1153>: vpmaxsd zmm2{k1},zmm0,zmm1 89 | 0x0000000000006307 <+1159>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56f2f] # 0x5d240 90 | 0x0000000000006311 <+1169>: vpermi2d zmm0,zmm2,zmm3 91 | 0x0000000000006317 <+1175>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56f5f] # 0x5d280 92 | 0x0000000000006321 <+1185>: vpermi2d zmm1,zmm2,zmm3 93 | 0x0000000000006327 <+1191>: vpmaxsd zmm4,zmm3,zmm1 94 | 0x000000000000632d <+1197>: mov ax,0x880 95 | 0x0000000000006331 <+1201>: vpminsd zmm5,zmm2,zmm0 96 | 0x0000000000006337 <+1207>: mov cx,0x2155 97 | 0x000000000000633b <+1211>: kmovd k1,ecx 98 | 0x000000000000633f <+1215>: vpmaxsd zmm5{k1},zmm2,zmm0 99 | 0x0000000000006345 <+1221>: kmovd k1,eax 100 | 0x0000000000006349 <+1225>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56fad] # 0x5d300 101 | 0x0000000000006353 <+1235>: vpermi2d zmm0,zmm5,zmm4 102 | 0x0000000000006359 <+1241>: vpminsd zmm4{k1},zmm3,zmm1 103 | 0x000000000000635f <+1247>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56f57] # 0x5d2c0 104 | 0x0000000000006369 <+1257>: vpermi2d zmm1,zmm4,zmm5 105 | 0x000000000000636f <+1263>: vpmaxsd zmm2,zmm4,zmm1 106 | 0x0000000000006375 <+1269>: mov ax,0x480 107 | 0x0000000000006379 <+1273>: vpmaxsd zmm3,zmm5,zmm0 108 | 0x000000000000637f <+1279>: mov cx,0xfa84 109 | 0x0000000000006383 <+1283>: kmovd 
k1,ecx 110 | 0x0000000000006387 <+1287>: vpminsd zmm3{k1},zmm5,zmm0 111 | 0x000000000000638d <+1293>: kmovd k1,eax 112 | 0x0000000000006391 <+1297>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56fe5] # 0x5d380 113 | 0x000000000000639b <+1307>: vpermi2d zmm0,zmm3,zmm2 114 | 0x00000000000063a1 <+1313>: vpminsd zmm2{k1},zmm4,zmm1 115 | 0x00000000000063a7 <+1319>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56f8f] # 0x5d340 116 | 0x00000000000063b1 <+1329>: vpermi2d zmm1,zmm2,zmm3 117 | 0x00000000000063b7 <+1335>: vpmaxsd zmm4,zmm3,zmm0 118 | 0x00000000000063bd <+1341>: mov ax,0xe644 119 | 0x00000000000063c1 <+1345>: kmovd k1,eax 120 | 0x00000000000063c5 <+1349>: vpminsd zmm4{k1},zmm3,zmm0 121 | 0x00000000000063cb <+1355>: vpmaxsd zmm0,zmm2,zmm1 122 | 0x00000000000063d1 <+1361>: mov ax,0x818 123 | 0x00000000000063d5 <+1365>: kmovd k1,eax 124 | 0x00000000000063d9 <+1369>: vmovdqa64 zmm3,ZMMWORD PTR [rip+0x5701d] # 0x5d400 125 | 0x00000000000063e3 <+1379>: vpermi2d zmm3,zmm4,zmm0 126 | 0x00000000000063e9 <+1385>: vpminsd zmm0{k1},zmm2,zmm1 127 | 0x00000000000063ef <+1391>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56fc7] # 0x5d3c0 128 | 0x00000000000063f9 <+1401>: vpermi2d zmm1,zmm0,zmm4 129 | 0x00000000000063ff <+1407>: vpmaxsd zmm2,zmm4,zmm3 130 | 0x0000000000006405 <+1413>: mov ax,0xcb20 131 | 0x0000000000006409 <+1417>: kmovd k1,eax 132 | 0x000000000000640d <+1421>: vpminsd zmm2{k1},zmm4,zmm3 133 | 0x0000000000006413 <+1427>: vpmaxsd zmm3,zmm0,zmm1 134 | 0x0000000000006419 <+1433>: mov ax,0x22c 135 | 0x000000000000641d <+1437>: kmovd k1,eax 136 | 0x0000000000006421 <+1441>: vpminsd zmm3{k1},zmm0,zmm1 137 | 0x0000000000006427 <+1447>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x5700f] # 0x5d440 138 | 0x0000000000006431 <+1457>: vpermi2d zmm0,zmm3,zmm2 139 | 0x0000000000006437 <+1463>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x5703f] # 0x5d480 140 | 0x0000000000006441 <+1473>: vpermi2d zmm1,zmm2,zmm3 141 | 0x0000000000006447 <+1479>: vpmaxsd zmm4,zmm2,zmm1 142 | 0x000000000000644d <+1485>: mov ax,0xad48 143 | 
0x0000000000006451 <+1489>: kmovd k1,eax 144 | 0x0000000000006455 <+1493>: vpminsd zmm4{k1},zmm2,zmm1 145 | 0x000000000000645b <+1499>: vpmaxsd zmm1,zmm3,zmm0 146 | 0x0000000000006461 <+1505>: mov ax,0x54a 147 | 0x0000000000006465 <+1509>: kmovd k1,eax 148 | 0x0000000000006469 <+1513>: vmovdqa64 zmm2,ZMMWORD PTR [rip+0x5708d] # 0x5d500 149 | 0x0000000000006473 <+1523>: vpermi2d zmm2,zmm4,zmm1 150 | 0x0000000000006479 <+1529>: vmovdqa64 ZMMWORD PTR [rsp+0x380],zmm2 151 | 0x0000000000006481 <+1537>: vpminsd zmm1{k1},zmm3,zmm0 152 | 0x0000000000006487 <+1543>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x5702f] # 0x5d4c0 153 | 0x0000000000006491 <+1553>: vmovdqa64 ZMMWORD PTR [rsp+0x3c0],zmm4 154 | 0x0000000000006499 <+1561>: vmovdqa64 ZMMWORD PTR [rsp+0x340],zmm1 155 | 0x00000000000064a1 <+1569>: vpermi2d zmm0,zmm1,zmm4 156 | 0x00000000000064a7 <+1575>: vmovdqa64 ZMMWORD PTR [rsp+0x300],zmm0 157 | 0x00000000000064af <+1583>: vpxor xmm0,xmm0,xmm0 158 | 0x00000000000064b3 <+1587>: vmovdqa XMMWORD PTR [rsp+0xf0],xmm0 159 | 0x00000000000064bc <+1596>: lea rsi,[rsp+0xf0] 160 | 0x00000000000064c4 <+1604>: mov edi,0x1 161 | 0x00000000000064c9 <+1609>: vzeroupper 162 | 0x00000000000064cc <+1612>: call 0x5470 -------------------------------------------------------------------------------- /disassemble/asm/int32_8.asm: -------------------------------------------------------------------------------- 1 | 0x0000000000006086 <+566>: call 0x5470 2 | 0x000000000000608b <+571>: mov rbx,QWORD PTR [rsp+0xc0] 3 | 0x0000000000006093 <+579>: mov r12,QWORD PTR [rsp+0xc8] 4 | 0x000000000000609b <+587>: vmovdqu ymm2,YMMWORD PTR [rsp+0x10] 5 | 0x00000000000060a1 <+593>: vpshufd ymm0,ymm2,0x4e 6 | 0x00000000000060a6 <+598>: vpminsd ymm1,ymm2,ymm0 7 | 0x00000000000060ab <+603>: vpmaxsd ymm0,ymm2,ymm0 8 | 0x00000000000060b0 <+608>: vpblendd ymm0,ymm1,ymm0,0xcc 9 | 0x00000000000060b6 <+614>: vxorps xmm1,xmm1,xmm1 10 | 0x00000000000060ba <+618>: vpermq ymm1,ymm0,0x4e 11 | 0x00000000000060c0 <+624>: vpminsd 
ymm2,ymm0,ymm1 12 | 0x00000000000060c5 <+629>: vpmaxsd ymm0,ymm0,ymm1 13 | 0x00000000000060ca <+634>: vpblendd ymm0,ymm2,ymm0,0xf0 14 | 0x00000000000060d0 <+640>: vpshufd ymm1,ymm0,0xb1 15 | 0x00000000000060d5 <+645>: vpminsd ymm2,ymm0,ymm1 16 | 0x00000000000060da <+650>: vpmaxsd ymm0,ymm0,ymm1 17 | 0x00000000000060df <+655>: vpblendd ymm0,ymm2,ymm0,0xaa 18 | 0x00000000000060e5 <+661>: vxorps xmm1,xmm1,xmm1 19 | 0x00000000000060e9 <+665>: vpermq ymm1,ymm0,0xd8 20 | 0x00000000000060ef <+671>: vpminsd ymm2,ymm0,ymm1 21 | 0x00000000000060f4 <+676>: vpmaxsd ymm0,ymm0,ymm1 22 | 0x00000000000060f9 <+681>: vpblendd ymm0,ymm0,ymm2,0xc 23 | 0x00000000000060ff <+687>: vmovdqa ymm1,YMMWORD PTR [rip+0x56ff9] # 0x5d100 24 | 0x0000000000006107 <+695>: vpermd ymm1,ymm1,ymm0 25 | 0x000000000000610c <+700>: vpminsd ymm2,ymm0,ymm1 26 | 0x0000000000006111 <+705>: vpmaxsd ymm0,ymm0,ymm1 27 | 0x0000000000006116 <+710>: vpblendd ymm0,ymm0,ymm2,0xa 28 | 0x000000000000611c <+716>: vmovdqa ymm1,YMMWORD PTR [rip+0x56ffc] # 0x5d120 29 | 0x0000000000006124 <+724>: vpermd ymm1,ymm1,ymm0 30 | 0x0000000000006129 <+729>: vpminsd ymm2,ymm0,ymm1 31 | 0x000000000000612e <+734>: vmovdqu YMMWORD PTR [rsp+0x10],ymm2 32 | 0x0000000000006134 <+740>: vpmaxsd ymm0,ymm0,ymm1 33 | 0x0000000000006139 <+745>: vmovdqu YMMWORD PTR [rsp+0x240],ymm0 34 | 0x0000000000006142 <+754>: vpxor xmm0,xmm0,xmm0 35 | 0x0000000000006146 <+758>: vmovdqa XMMWORD PTR [rsp+0xe0],xmm0 36 | 0x000000000000614f <+767>: lea rsi,[rsp+0xe0] 37 | 0x0000000000006157 <+775>: mov edi,0x1 38 | 0x000000000000615c <+780>: vzeroupper 39 | 0x000000000000615f <+783>: call 0x5470 -------------------------------------------------------------------------------- /disassemble/asm/int8_128.asm: -------------------------------------------------------------------------------- 1 | 0x0000000000006110 <+640>: call 0x5470 2 | 0x0000000000006115 <+645>: vmovdqa64 zmm5,ZMMWORD PTR [rsp+0x100] 3 | 0x000000000000611d <+653>: vprold zmm0,zmm5,0x10 4 | 
0x0000000000006124 <+660>: vmovdqa64 zmm4,ZMMWORD PTR [rsp+0x140] 5 | 0x000000000000612c <+668>: vprold zmm1,zmm4,0x10 6 | 0x0000000000006133 <+675>: vpminsb zmm2,zmm4,zmm1 7 | 0x0000000000006139 <+681>: vpminsb zmm3,zmm5,zmm0 8 | 0x000000000000613f <+687>: movabs rax,0xcccccccccccccccc 9 | 0x0000000000006149 <+697>: kmovq k1,rax 10 | 0x000000000000614e <+702>: vpmaxsb zmm3{k1},zmm5,zmm0 11 | 0x0000000000006154 <+708>: vpmaxsb zmm2{k1},zmm4,zmm1 12 | 0x000000000000615a <+714>: vbroadcasti32x4 zmm0,XMMWORD PTR [rip+0x5779c] # 0x5d900 13 | 0x0000000000006164 <+724>: vpshufb zmm1,zmm2,zmm0 14 | 0x000000000000616a <+730>: vpshufb zmm0,zmm3,zmm0 15 | 0x0000000000006170 <+736>: vpminsb zmm4,zmm3,zmm0 16 | 0x0000000000006176 <+742>: vpminsb zmm5,zmm2,zmm1 17 | 0x000000000000617c <+748>: movabs rax,0xaaaaaaaaaaaaaaaa 18 | 0x0000000000006186 <+758>: kmovq k2,rax 19 | 0x000000000000618b <+763>: vpmaxsb zmm5{k2},zmm2,zmm1 20 | 0x0000000000006191 <+769>: kmovq QWORD PTR [rsp+0x190],k2 21 | 0x000000000000619b <+779>: vpmaxsb zmm4{k2},zmm3,zmm0 22 | 0x00000000000061a1 <+785>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56f95] # 0x5d140 23 | 0x00000000000061ab <+795>: vpermb zmm1,zmm0,zmm4 24 | 0x00000000000061b1 <+801>: vpermb zmm0,zmm0,zmm5 25 | 0x00000000000061b7 <+807>: vpminsb zmm2,zmm5,zmm0 26 | 0x00000000000061bd <+813>: vpminsb zmm3,zmm4,zmm1 27 | 0x00000000000061c3 <+819>: movabs rax,0xdddd44d4d4dd4444 28 | 0x00000000000061cd <+829>: kmovq k2,rax 29 | 0x00000000000061d2 <+834>: vpmaxsb zmm3{k2},zmm4,zmm1 30 | 0x00000000000061d8 <+840>: vpmaxsb zmm2{k2},zmm5,zmm0 31 | 0x00000000000061de <+846>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56f98] # 0x5d180 32 | 0x00000000000061e8 <+856>: vpermb zmm1,zmm0,zmm2 33 | 0x00000000000061ee <+862>: vpermb zmm0,zmm0,zmm3 34 | 0x00000000000061f4 <+868>: vpminsb zmm4,zmm3,zmm0 35 | 0x00000000000061fa <+874>: vpminsb zmm5,zmm2,zmm1 36 | 0x0000000000006200 <+880>: movabs rax,0xff6f9960f9660900 37 | 0x000000000000620a <+890>: kmovq k2,rax 38 | 
0x000000000000620f <+895>: vpmaxsb zmm5{k2},zmm2,zmm1 39 | 0x0000000000006215 <+901>: vpmaxsb zmm4{k2},zmm3,zmm0 40 | 0x000000000000621b <+907>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56f9b] # 0x5d1c0 41 | 0x0000000000006225 <+917>: vpermb zmm1,zmm0,zmm4 42 | 0x000000000000622b <+923>: vpermb zmm0,zmm0,zmm5 43 | 0x0000000000006231 <+929>: vpminsb zmm2,zmm5,zmm0 44 | 0x0000000000006237 <+935>: vpminsb zmm3,zmm4,zmm1 45 | 0x000000000000623d <+941>: movabs rax,0xff96ff9966009600 46 | 0x0000000000006247 <+951>: kmovq k2,rax 47 | 0x000000000000624c <+956>: vpmaxsb zmm3{k2},zmm4,zmm1 48 | 0x0000000000006252 <+962>: vpmaxsb zmm2{k2},zmm5,zmm0 49 | 0x0000000000006258 <+968>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56f9e] # 0x5d200 50 | 0x0000000000006262 <+978>: vpermb zmm1,zmm0,zmm2 51 | 0x0000000000006268 <+984>: vpermb zmm0,zmm0,zmm3 52 | 0x000000000000626e <+990>: vpminsb zmm4,zmm3,zmm0 53 | 0x0000000000006274 <+996>: vpminsb zmm5,zmm2,zmm1 54 | 0x000000000000627a <+1002>: movabs rax,0xf6f96f6f09096090 55 | 0x0000000000006284 <+1012>: kmovq k2,rax 56 | 0x0000000000006289 <+1017>: vpmaxsb zmm5{k2},zmm2,zmm1 57 | 0x000000000000628f <+1023>: vpmaxsb zmm4{k2},zmm3,zmm0 58 | 0x0000000000006295 <+1029>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56fa1] # 0x5d240 59 | 0x000000000000629f <+1039>: vmovdqa64 zmm1,zmm5 60 | 0x00000000000062a5 <+1045>: vpermt2b zmm1,zmm0,zmm4 61 | 0x00000000000062ab <+1051>: vpermi2b zmm0,zmm4,zmm5 62 | 0x00000000000062b1 <+1057>: vpmaxsb zmm2,zmm4,zmm0 63 | 0x00000000000062b7 <+1063>: movabs rax,0x6096960f9696f96 64 | 0x00000000000062c1 <+1073>: kmovq k2,rax 65 | 0x00000000000062c6 <+1078>: vpminsb zmm2{k2},zmm4,zmm0 66 | 0x00000000000062cc <+1084>: vpminsb zmm0,zmm5,zmm1 67 | 0x00000000000062d2 <+1090>: movabs rax,0x79f6969f06969068 68 | 0x00000000000062dc <+1100>: kmovq k2,rax 69 | 0x00000000000062e1 <+1105>: vpmaxsb zmm0{k2},zmm5,zmm1 70 | 0x00000000000062e7 <+1111>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56f8f] # 0x5d280 71 | 0x00000000000062f1 <+1121>: vpermb 
zmm3,zmm1,zmm2 72 | 0x00000000000062f7 <+1127>: vpermb zmm1,zmm1,zmm0 73 | 0x00000000000062fd <+1133>: vpmaxsb zmm4,zmm0,zmm1 74 | 0x0000000000006303 <+1139>: movabs rax,0x960f00ff0f96f0 75 | 0x000000000000630d <+1149>: kmovq k2,rax 76 | 0x0000000000006312 <+1154>: vpminsb zmm4{k2},zmm0,zmm1 77 | 0x0000000000006318 <+1160>: vpmaxsb zmm0,zmm2,zmm3 78 | 0x000000000000631e <+1166>: vpminsb zmm0{k2},zmm2,zmm3 79 | 0x0000000000006324 <+1172>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56f92] # 0x5d2c0 80 | 0x000000000000632e <+1182>: vpermb zmm2,zmm1,zmm0 81 | 0x0000000000006334 <+1188>: vpermb zmm1,zmm1,zmm4 82 | 0x000000000000633a <+1194>: vpmaxsb zmm3,zmm4,zmm1 83 | 0x0000000000006340 <+1200>: movabs rax,0x690f096f0f6960 84 | 0x000000000000634a <+1210>: kmovq k2,rax 85 | 0x000000000000634f <+1215>: vpminsb zmm3{k2},zmm4,zmm1 86 | 0x0000000000006355 <+1221>: vpmaxsb zmm1,zmm0,zmm2 87 | 0x000000000000635b <+1227>: vpminsb zmm1{k2},zmm0,zmm2 88 | 0x0000000000006361 <+1233>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56f95] # 0x5d300 89 | 0x000000000000636b <+1243>: vpshufb zmm2,zmm1,zmm0 90 | 0x0000000000006371 <+1249>: vpshufb zmm0,zmm3,zmm0 91 | 0x0000000000006377 <+1255>: vpmaxsb zmm4,zmm3,zmm0 92 | 0x000000000000637d <+1261>: movabs rax,0x6069f069f0600 93 | 0x0000000000006387 <+1271>: kmovq k2,rax 94 | 0x000000000000638c <+1276>: vpminsb zmm4{k2},zmm3,zmm0 95 | 0x0000000000006392 <+1282>: vpmaxsb zmm0,zmm1,zmm2 96 | 0x0000000000006398 <+1288>: vpminsb zmm0{k2},zmm1,zmm2 97 | 0x000000000000639e <+1294>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56f98] # 0x5d340 98 | 0x00000000000063a8 <+1304>: vpermb zmm2,zmm1,zmm0 99 | 0x00000000000063ae <+1310>: vpermb zmm1,zmm1,zmm4 100 | 0x00000000000063b4 <+1316>: vpmaxsb zmm3,zmm4,zmm1 101 | 0x00000000000063ba <+1322>: movabs rax,0x90f690f69000 102 | 0x00000000000063c4 <+1332>: kmovq k2,rax 103 | 0x00000000000063c9 <+1337>: vpminsb zmm3{k2},zmm4,zmm1 104 | 0x00000000000063cf <+1343>: vpmaxsb zmm1,zmm0,zmm2 105 | 0x00000000000063d5 <+1349>: vpminsb 
zmm1{k2},zmm0,zmm2 106 | 0x00000000000063db <+1355>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56f9b] # 0x5d380 107 | 0x00000000000063e5 <+1365>: vpermb zmm2,zmm0,zmm1 108 | 0x00000000000063eb <+1371>: vpermb zmm0,zmm0,zmm3 109 | 0x00000000000063f1 <+1377>: vpmaxsb zmm4,zmm3,zmm0 110 | 0x00000000000063f7 <+1383>: movabs rax,0xe8e0e8e06666 111 | 0x0000000000006401 <+1393>: kmovq k2,rax 112 | 0x0000000000006406 <+1398>: vpminsb zmm4{k2},zmm3,zmm0 113 | 0x000000000000640c <+1404>: vpmaxsb zmm0,zmm1,zmm2 114 | 0x0000000000006412 <+1410>: vpminsb zmm0{k2},zmm1,zmm2 115 | 0x0000000000006418 <+1416>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56f9e] # 0x5d3c0 116 | 0x0000000000006422 <+1426>: vpermb zmm2,zmm1,zmm0 117 | 0x0000000000006428 <+1432>: vpermb zmm1,zmm1,zmm4 118 | 0x000000000000642e <+1438>: vpmaxsb zmm3,zmm4,zmm1 119 | 0x0000000000006434 <+1444>: movabs rax,0x88800884c6cecce 120 | 0x000000000000643e <+1454>: kmovq k2,rax 121 | 0x0000000000006443 <+1459>: vpminsb zmm3{k2},zmm4,zmm1 122 | 0x0000000000006449 <+1465>: vpmaxsb zmm1,zmm0,zmm2 123 | 0x000000000000644f <+1471>: vpminsb zmm1{k2},zmm0,zmm2 124 | 0x0000000000006455 <+1477>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56fa1] # 0x5d400 125 | 0x000000000000645f <+1487>: vmovdqa64 zmm2,zmm1 126 | 0x0000000000006465 <+1493>: vpermt2b zmm2,zmm0,zmm3 127 | 0x000000000000646b <+1499>: vpermi2b zmm0,zmm3,zmm1 128 | 0x0000000000006471 <+1505>: vpmaxsb zmm4,zmm3,zmm0 129 | 0x0000000000006477 <+1511>: movabs rax,0x4a00ca4cc48cd9ae 130 | 0x0000000000006481 <+1521>: kmovq k2,rax 131 | 0x0000000000006486 <+1526>: vpminsb zmm4{k2},zmm3,zmm0 132 | 0x000000000000648c <+1532>: vpmaxsb zmm0,zmm1,zmm2 133 | 0x0000000000006492 <+1538>: movabs rax,0xa00ca4cc48cd9ac 134 | 0x000000000000649c <+1548>: kmovq k2,rax 135 | 0x00000000000064a1 <+1553>: vpminsb zmm0{k2},zmm1,zmm2 136 | 0x00000000000064a7 <+1559>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56f8f] # 0x5d440 137 | 0x00000000000064b1 <+1569>: vmovdqa64 zmm2,zmm0 138 | 0x00000000000064b7 <+1575>: vpermt2b 
zmm2,zmm1,zmm4 139 | 0x00000000000064bd <+1581>: vpermi2b zmm1,zmm4,zmm0 140 | 0x00000000000064c3 <+1587>: vpmaxsb zmm3,zmm4,zmm1 141 | 0x00000000000064c9 <+1593>: movabs rax,0x2000246688ca888c 142 | 0x00000000000064d3 <+1603>: kmovq k2,rax 143 | 0x00000000000064d8 <+1608>: vpminsb zmm3{k2},zmm4,zmm1 144 | 0x00000000000064de <+1614>: vpmaxsb zmm1,zmm0,zmm2 145 | 0x00000000000064e4 <+1620>: movabs rax,0x246688ca8888 146 | 0x00000000000064ee <+1630>: kmovq k2,rax 147 | 0x00000000000064f3 <+1635>: vpminsb zmm1{k2},zmm0,zmm2 148 | 0x00000000000064f9 <+1641>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56f7d] # 0x5d480 149 | 0x0000000000006503 <+1651>: vpermb zmm2,zmm0,zmm1 150 | 0x0000000000006509 <+1657>: vpermb zmm0,zmm0,zmm3 151 | 0x000000000000650f <+1663>: vpmaxsb zmm4,zmm3,zmm0 152 | 0x0000000000006515 <+1669>: movabs rax,0xac88eeca8888 153 | 0x000000000000651f <+1679>: kmovq k2,rax 154 | 0x0000000000006524 <+1684>: vpminsb zmm4{k2},zmm3,zmm0 155 | 0x000000000000652a <+1690>: vpmaxsb zmm0,zmm1,zmm2 156 | 0x0000000000006530 <+1696>: vpminsb zmm0{k2},zmm1,zmm2 157 | 0x0000000000006536 <+1702>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56f80] # 0x5d4c0 158 | 0x0000000000006540 <+1712>: vpermb zmm2,zmm1,zmm0 159 | 0x0000000000006546 <+1718>: vpermb zmm1,zmm1,zmm4 160 | 0x000000000000654c <+1724>: vpmaxsb zmm3,zmm4,zmm1 161 | 0x0000000000006552 <+1730>: movabs rax,0x44caaaaaaccc88 162 | 0x000000000000655c <+1740>: kmovq k2,rax 163 | 0x0000000000006561 <+1745>: vpminsb zmm3{k2},zmm4,zmm1 164 | 0x0000000000006567 <+1751>: vpmaxsb zmm1,zmm0,zmm2 165 | 0x000000000000656d <+1757>: vpminsb zmm1{k2},zmm0,zmm2 166 | 0x0000000000006573 <+1763>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56f83] # 0x5d500 167 | 0x000000000000657d <+1773>: vpermb zmm2,zmm0,zmm1 168 | 0x0000000000006583 <+1779>: vpermb zmm0,zmm0,zmm3 169 | 0x0000000000006589 <+1785>: vpmaxsb zmm4,zmm3,zmm0 170 | 0x000000000000658f <+1791>: movabs rax,0xaacaaccaacaa88 171 | 0x0000000000006599 <+1801>: kmovq k2,rax 172 | 0x000000000000659e 
<+1806>: vpminsb zmm4{k2},zmm3,zmm0 173 | 0x00000000000065a4 <+1812>: vpmaxsb zmm0,zmm1,zmm2 174 | 0x00000000000065aa <+1818>: vpminsb zmm0{k2},zmm1,zmm2 175 | 0x00000000000065b0 <+1824>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56f86] # 0x5d540 176 | 0x00000000000065ba <+1834>: vpermb zmm2,zmm1,zmm0 177 | 0x00000000000065c0 <+1840>: vpermb zmm1,zmm1,zmm4 178 | 0x00000000000065c6 <+1846>: vpmaxsb zmm3,zmm4,zmm1 179 | 0x00000000000065cc <+1852>: movabs rax,0x4ccaccaaccaccc8 180 | 0x00000000000065d6 <+1862>: kmovq k2,rax 181 | 0x00000000000065db <+1867>: vpminsb zmm3{k2},zmm4,zmm1 182 | 0x00000000000065e1 <+1873>: vpmaxsb zmm1,zmm0,zmm2 183 | 0x00000000000065e7 <+1879>: vpminsb zmm1{k2},zmm0,zmm2 184 | 0x00000000000065ed <+1885>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56f89] # 0x5d580 185 | 0x00000000000065f7 <+1895>: vpermb zmm2,zmm0,zmm1 186 | 0x00000000000065fd <+1901>: vpermb zmm0,zmm0,zmm3 187 | 0x0000000000006603 <+1907>: vpmaxsb zmm4,zmm3,zmm0 188 | 0x0000000000006609 <+1913>: movabs rax,0xaaaaaaaaaaaaaa8 189 | 0x0000000000006613 <+1923>: kmovq k2,rax 190 | 0x0000000000006618 <+1928>: vpmaxsb zmm5,zmm1,zmm2 191 | 0x000000000000661e <+1934>: vpminsb zmm5{k2},zmm1,zmm2 192 | 0x0000000000006624 <+1940>: movabs rax,0xe000000000000007 193 | 0x000000000000662e <+1950>: kmovq k3,rax 194 | 0x0000000000006633 <+1955>: vpblendmb zmm1{k3},zmm5,zmm4 195 | 0x0000000000006639 <+1961>: vpminsb zmm4{k2},zmm3,zmm0 196 | 0x000000000000663f <+1967>: vpblendmb zmm0{k3},zmm4,zmm5 197 | 0x0000000000006645 <+1973>: vpminsb zmm2,zmm4,zmm1 198 | 0x000000000000664b <+1979>: vpmaxsb zmm2{k3},zmm4,zmm1 199 | 0x0000000000006651 <+1985>: vpmaxsb zmm0,zmm5,zmm0 200 | 0x0000000000006657 <+1991>: vshufi64x2 zmm1,zmm2,zmm0,0xee 201 | 0x000000000000665e <+1998>: vinserti64x4 zmm3,zmm2,ymm0,0x1 202 | 0x0000000000006665 <+2005>: vpmaxsb zmm4,zmm2,zmm3 203 | 0x000000000000666b <+2011>: movabs rax,0xffffffff00000000 204 | 0x0000000000006675 <+2021>: kmovq k2,rax 205 | 0x000000000000667a <+2026>: vpminsb 
zmm4{k2},zmm2,zmm3 206 | 0x0000000000006680 <+2032>: vpmaxsb zmm0,zmm0,zmm1 207 | 0x0000000000006686 <+2038>: vmovdqa64 zmm1,ZMMWORD PTR [rip+0x56f30] # 0x5d5c0 208 | 0x0000000000006690 <+2048>: vpermi2q zmm1,zmm0,zmm4 209 | 0x0000000000006696 <+2054>: vmovdqa64 zmm2,ZMMWORD PTR [rip+0x56f60] # 0x5d600 210 | 0x00000000000066a0 <+2064>: vpermi2q zmm2,zmm4,zmm0 211 | 0x00000000000066a6 <+2070>: vpmaxsb zmm3,zmm4,zmm2 212 | 0x00000000000066ac <+2076>: movabs rax,0xffff0000ffff0000 213 | 0x00000000000066b6 <+2086>: kmovq k2,rax 214 | 0x00000000000066bb <+2091>: vpminsb zmm3{k2},zmm4,zmm2 215 | 0x00000000000066c1 <+2097>: vpmaxsb zmm2,zmm0,zmm1 216 | 0x00000000000066c7 <+2103>: mov eax,0xffff0000 217 | 0x00000000000066cc <+2108>: kmovq k2,rax 218 | 0x00000000000066d1 <+2113>: vmovdqa64 zmm4,ZMMWORD PTR [rip+0x56fa5] # 0x5d680 219 | 0x00000000000066db <+2123>: vpermi2q zmm4,zmm3,zmm2 220 | 0x00000000000066e1 <+2129>: vpminsb zmm2{k2},zmm0,zmm1 221 | 0x00000000000066e7 <+2135>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56f4f] # 0x5d640 222 | 0x00000000000066f1 <+2145>: vpermi2q zmm0,zmm2,zmm3 223 | 0x00000000000066f7 <+2151>: vpmaxsb zmm1,zmm2,zmm0 224 | 0x00000000000066fd <+2157>: movabs rax,0xff00ff00ff00 225 | 0x0000000000006707 <+2167>: vpmaxsb zmm5,zmm3,zmm4 226 | 0x000000000000670d <+2173>: movabs rcx,0xff00ff00ff00ff00 227 | 0x0000000000006717 <+2183>: kmovq k2,rcx 228 | 0x000000000000671c <+2188>: vpminsb zmm5{k2},zmm3,zmm4 229 | 0x0000000000006722 <+2194>: kmovq k2,rax 230 | 0x0000000000006727 <+2199>: vmovdqa64 zmm3,ZMMWORD PTR [rip+0x56fcf] # 0x5d700 231 | 0x0000000000006731 <+2209>: vpermi2d zmm3,zmm5,zmm1 232 | 0x0000000000006737 <+2215>: vpminsb zmm1{k2},zmm2,zmm0 233 | 0x000000000000673d <+2221>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56f79] # 0x5d6c0 234 | 0x0000000000006747 <+2231>: vpermi2d zmm0,zmm1,zmm5 235 | 0x000000000000674d <+2237>: vpmaxsb zmm2,zmm5,zmm3 236 | 0x0000000000006753 <+2243>: movabs rax,0xf0f0f0f0f0f0f0f0 237 | 0x000000000000675d <+2253>: kmovq 
k2,rax 238 | 0x0000000000006762 <+2258>: vpminsb zmm2{k2},zmm5,zmm3 239 | 0x0000000000006768 <+2264>: vpmaxsb zmm3,zmm1,zmm0 240 | 0x000000000000676e <+2270>: vmovdqa64 zmm4,ZMMWORD PTR [rip+0x57008] # 0x5d780 241 | 0x0000000000006778 <+2280>: vpermi2w zmm4,zmm2,zmm3 242 | 0x000000000000677e <+2286>: movabs rax,0xf0f0f0f0f0f0f0 243 | 0x0000000000006788 <+2296>: kmovq k2,rax 244 | 0x000000000000678d <+2301>: vpminsb zmm3{k2},zmm1,zmm0 245 | 0x0000000000006793 <+2307>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56fa3] # 0x5d740 246 | 0x000000000000679d <+2317>: vpermi2w zmm0,zmm3,zmm2 247 | 0x00000000000067a3 <+2323>: mov rbx,QWORD PTR [rsp+0x90] 248 | 0x00000000000067ab <+2331>: mov rax,QWORD PTR [rsp+0x98] 249 | 0x00000000000067b3 <+2339>: mov QWORD PTR [rsp+0x188],rax 250 | 0x00000000000067bb <+2347>: vpmaxsb zmm5,zmm2,zmm4 251 | 0x00000000000067c1 <+2353>: vpminsb zmm5{k1},zmm2,zmm4 252 | 0x00000000000067c7 <+2359>: vpmaxsb zmm1,zmm3,zmm0 253 | 0x00000000000067cd <+2365>: movabs rax,0xccccccccccccccc 254 | 0x00000000000067d7 <+2375>: kmovq k1,rax 255 | 0x00000000000067dc <+2380>: vmovdqa64 zmm2,ZMMWORD PTR [rip+0x5701a] # 0x5d800 256 | 0x00000000000067e6 <+2390>: vpermi2b zmm2,zmm5,zmm1 257 | 0x00000000000067ec <+2396>: vmovdqa64 ZMMWORD PTR [rsp+0x340],zmm2 258 | 0x00000000000067f4 <+2404>: vpminsb zmm1{k1},zmm3,zmm0 259 | 0x00000000000067fa <+2410>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x56fbc] # 0x5d7c0 260 | 0x0000000000006804 <+2420>: vmovdqa64 ZMMWORD PTR [rsp+0x380],zmm5 261 | 0x000000000000680c <+2428>: vmovdqa64 ZMMWORD PTR [rsp+0x300],zmm1 262 | 0x0000000000006814 <+2436>: vpermi2b zmm0,zmm1,zmm5 263 | 0x000000000000681a <+2442>: vmovdqa64 ZMMWORD PTR [rsp+0x2c0],zmm0 264 | 0x0000000000006822 <+2450>: vpxor xmm0,xmm0,xmm0 265 | 0x0000000000006826 <+2454>: vmovdqa XMMWORD PTR [rsp+0xb0],xmm0 266 | 0x000000000000682f <+2463>: lea rsi,[rsp+0xb0] 267 | 0x0000000000006837 <+2471>: mov edi,0x1 268 | 0x000000000000683c <+2476>: vzeroupper 269 | 0x000000000000683f <+2479>: 
call 0x5470 -------------------------------------------------------------------------------- /disassemble/asm/int8_16.asm: -------------------------------------------------------------------------------- 1 | 0x0000000000006092 <+562>: call 0x5470 2 | 0x0000000000006097 <+567>: vmovdqa xmm2,XMMWORD PTR [rsp] 3 | 0x000000000000609c <+572>: vpshufb xmm0,xmm2,XMMWORD PTR [rip+0x5705b] # 0x5d100 4 | 0x00000000000060a5 <+581>: vpminsb xmm1,xmm2,xmm0 5 | 0x00000000000060aa <+586>: mov ax,0xf2b0 6 | 0x00000000000060ae <+590>: kmovd k1,eax 7 | 0x00000000000060b2 <+594>: vpmaxsb xmm1{k1},xmm2,xmm0 8 | 0x00000000000060b8 <+600>: vpshufb xmm0,xmm1,XMMWORD PTR [rip+0x5704f] # 0x5d110 9 | 0x00000000000060c1 <+609>: vpminsb xmm2,xmm1,xmm0 10 | 0x00000000000060c6 <+614>: mov ax,0xdcc4 11 | 0x00000000000060ca <+618>: kmovd k1,eax 12 | 0x00000000000060ce <+622>: vpmaxsb xmm2{k1},xmm1,xmm0 13 | 0x00000000000060d4 <+628>: vpshufb xmm0,xmm2,XMMWORD PTR [rip+0x57043] # 0x5d120 14 | 0x00000000000060dd <+637>: vpminsb xmm1,xmm2,xmm0 15 | 0x00000000000060e2 <+642>: mov ax,0xef08 16 | 0x00000000000060e6 <+646>: kmovd k1,eax 17 | 0x00000000000060ea <+650>: vpmaxsb xmm1{k1},xmm2,xmm0 18 | 0x00000000000060f0 <+656>: vpshufb xmm0,xmm1,XMMWORD PTR [rip+0x57037] # 0x5d130 19 | 0x00000000000060f9 <+665>: mov rbx,QWORD PTR [rsp+0xc0] 20 | 0x0000000000006101 <+673>: vpminsb xmm2,xmm1,xmm0 21 | 0x0000000000006106 <+678>: mov ax,0xb552 22 | 0x000000000000610a <+682>: kmovd k1,eax 23 | 0x000000000000610e <+686>: vpmaxsb xmm2{k1},xmm1,xmm0 24 | 0x0000000000006114 <+692>: vpshufb xmm0,xmm2,XMMWORD PTR [rip+0x57023] # 0x5d140 25 | 0x000000000000611d <+701>: mov r12,QWORD PTR [rsp+0xc8] 26 | 0x0000000000006125 <+709>: vpmaxsb xmm1,xmm2,xmm0 27 | 0x000000000000612a <+714>: mov ax,0x14d6 28 | 0x000000000000612e <+718>: kmovd k1,eax 29 | 0x0000000000006132 <+722>: vpminsb xmm1{k1},xmm2,xmm0 30 | 0x0000000000006138 <+728>: vpshufb xmm0,xmm1,XMMWORD PTR [rip+0x5700f] # 0x5d150 31 | 0x0000000000006141 <+737>: 
vpmaxsb xmm2,xmm1,xmm0 32 | 0x0000000000006146 <+742>: mov ax,0x24da 33 | 0x000000000000614a <+746>: kmovd k1,eax 34 | 0x000000000000614e <+750>: vpminsb xmm2{k1},xmm1,xmm0 35 | 0x0000000000006154 <+756>: vpshufb xmm0,xmm2,XMMWORD PTR [rip+0x57003] # 0x5d160 36 | 0x000000000000615d <+765>: vpmaxsb xmm1,xmm2,xmm0 37 | 0x0000000000006162 <+770>: mov ax,0x1554 38 | 0x0000000000006166 <+774>: kmovd k1,eax 39 | 0x000000000000616a <+778>: vpminsb xmm1{k1},xmm2,xmm0 40 | 0x0000000000006170 <+784>: vpshufb xmm0,xmm1,XMMWORD PTR [rip+0x56ff7] # 0x5d170 41 | 0x0000000000006179 <+793>: vpminsb xmm2,xmm1,xmm0 42 | 0x000000000000617e <+798>: vpmaxsb xmm0,xmm1,xmm0 43 | 0x0000000000006183 <+803>: vpblendw xmm0,xmm0,xmm2,0x14 44 | 0x0000000000006189 <+809>: vmovdqa XMMWORD PTR [rsp+0x20],xmm0 45 | 0x000000000000618f <+815>: vpshufb xmm0,xmm0,XMMWORD PTR [rip+0x56fe8] # 0x5d180 46 | 0x0000000000006198 <+824>: vmovdqa XMMWORD PTR [rsp+0x220],xmm0 47 | 0x00000000000061a1 <+833>: vpxor xmm0,xmm0,xmm0 48 | 0x00000000000061a5 <+837>: vmovdqa XMMWORD PTR [rsp+0xe0],xmm0 49 | 0x00000000000061ae <+846>: lea rsi,[rsp+0xe0] 50 | 0x00000000000061b6 <+854>: mov edi,0x1 51 | 0x00000000000061bb <+859>: call 0x5470 -------------------------------------------------------------------------------- /disassemble/asm/int8_32.asm: -------------------------------------------------------------------------------- 1 | 0x00000000000060b5 <+581>: call 0x5470 2 | 0x00000000000060ba <+586>: mov rbx,QWORD PTR [rsp+0xc0] 3 | 0x00000000000060c2 <+594>: vmovdqu ymm2,YMMWORD PTR [rsp] 4 | 0x00000000000060c7 <+599>: vpshufb ymm0,ymm2,YMMWORD PTR [rip+0x57030] # 0x5d100 5 | 0x00000000000060d0 <+608>: mov rax,QWORD PTR [rsp+0xc8] 6 | 0x00000000000060d8 <+616>: mov QWORD PTR [rsp+0x150],rax 7 | 0x00000000000060e0 <+624>: vpminsb ymm1,ymm2,ymm0 8 | 0x00000000000060e5 <+629>: mov eax,0xaaaaaaaa 9 | 0x00000000000060ea <+634>: kmovd k1,eax 10 | 0x00000000000060ee <+638>: vpmaxsb ymm1{k1},ymm2,ymm0 11 | 0x00000000000060f4 
<+644>: vprold ymm0,ymm1,0x10 12 | 0x00000000000060fb <+651>: vpminsb ymm2,ymm1,ymm0 13 | 0x0000000000006100 <+656>: vpmaxsb ymm0,ymm1,ymm0 14 | 0x0000000000006105 <+661>: vpblendw ymm0,ymm2,ymm0,0xaa 15 | 0x000000000000610b <+667>: vpshufd ymm1,ymm0,0xb1 16 | 0x0000000000006110 <+672>: vpminsb ymm2,ymm0,ymm1 17 | 0x0000000000006115 <+677>: vpmaxsb ymm0,ymm0,ymm1 18 | 0x000000000000611a <+682>: vpblendd ymm0,ymm2,ymm0,0xaa 19 | 0x0000000000006120 <+688>: vpshufd ymm1,ymm0,0x4e 20 | 0x0000000000006125 <+693>: vpminsb ymm2,ymm0,ymm1 21 | 0x000000000000612a <+698>: vpmaxsb ymm0,ymm0,ymm1 22 | 0x000000000000612f <+703>: vpblendd ymm0,ymm2,ymm0,0xcc 23 | 0x0000000000006135 <+709>: vmovdqa ymm1,YMMWORD PTR [rip+0x56fe3] # 0x5d120 24 | 0x000000000000613d <+717>: vpermb ymm1,ymm1,ymm0 25 | 0x0000000000006143 <+723>: vpminsb ymm2,ymm0,ymm1 26 | 0x0000000000006148 <+728>: mov eax,0xf7117710 27 | 0x000000000000614d <+733>: kmovd k1,eax 28 | 0x0000000000006151 <+737>: vpmaxsb ymm2{k1},ymm0,ymm1 29 | 0x0000000000006157 <+743>: vmovdqa ymm0,YMMWORD PTR [rip+0x56fe1] # 0x5d140 30 | 0x000000000000615f <+751>: vpermb ymm0,ymm0,ymm2 31 | 0x0000000000006165 <+757>: vpmaxsb ymm1,ymm2,ymm0 32 | 0x000000000000616a <+762>: mov eax,0x249a26da 33 | 0x000000000000616f <+767>: kmovd k1,eax 34 | 0x0000000000006173 <+771>: vpminsb ymm1{k1},ymm2,ymm0 35 | 0x0000000000006179 <+777>: vmovdqa ymm0,YMMWORD PTR [rip+0x56fdf] # 0x5d160 36 | 0x0000000000006181 <+785>: vpermb ymm0,ymm0,ymm1 37 | 0x0000000000006187 <+791>: vpmaxsb ymm2,ymm1,ymm0 38 | 0x000000000000618c <+796>: mov eax,0x2079be 39 | 0x0000000000006191 <+801>: kmovd k1,eax 40 | 0x0000000000006195 <+805>: vpminsb ymm2{k1},ymm1,ymm0 41 | 0x000000000000619b <+811>: vmovdqa ymm0,YMMWORD PTR [rip+0x56fdd] # 0x5d180 42 | 0x00000000000061a3 <+819>: vpermb ymm0,ymm0,ymm2 43 | 0x00000000000061a9 <+825>: vpmaxsb ymm1,ymm2,ymm0 44 | 0x00000000000061ae <+830>: mov eax,0x40edf8 45 | 0x00000000000061b3 <+835>: kmovd k1,eax 46 | 0x00000000000061b7 
<+839>: vpminsb ymm1{k1},ymm2,ymm0 47 | 0x00000000000061bd <+845>: vmovdqa ymm0,YMMWORD PTR [rip+0x56fdb] # 0x5d1a0 48 | 0x00000000000061c5 <+853>: vpermb ymm0,ymm0,ymm1 49 | 0x00000000000061cb <+859>: vpmaxsb ymm2,ymm1,ymm0 50 | 0x00000000000061d0 <+864>: mov eax,0x880deaa 51 | 0x00000000000061d5 <+869>: kmovd k1,eax 52 | 0x00000000000061d9 <+873>: vpminsb ymm2{k1},ymm1,ymm0 53 | 0x00000000000061df <+879>: vmovdqa ymm0,YMMWORD PTR [rip+0x56fd9] # 0x5d1c0 54 | 0x00000000000061e7 <+887>: vpermb ymm0,ymm0,ymm2 55 | 0x00000000000061ed <+893>: vpmaxsb ymm1,ymm2,ymm0 56 | 0x00000000000061f2 <+898>: mov eax,0x480fa84 57 | 0x00000000000061f7 <+903>: kmovd k1,eax 58 | 0x00000000000061fb <+907>: vpminsb ymm1{k1},ymm2,ymm0 59 | 0x0000000000006201 <+913>: vmovdqa ymm0,YMMWORD PTR [rip+0x56fd7] # 0x5d1e0 60 | 0x0000000000006209 <+921>: vpermb ymm0,ymm0,ymm1 61 | 0x000000000000620f <+927>: vpmaxsb ymm2,ymm1,ymm0 62 | 0x0000000000006214 <+932>: mov eax,0x818e644 63 | 0x0000000000006219 <+937>: kmovd k1,eax 64 | 0x000000000000621d <+941>: vpminsb ymm2{k1},ymm1,ymm0 65 | 0x0000000000006223 <+947>: vmovdqa ymm0,YMMWORD PTR [rip+0x56fd5] # 0x5d200 66 | 0x000000000000622b <+955>: vpermb ymm0,ymm0,ymm2 67 | 0x0000000000006231 <+961>: vpmaxsb ymm1,ymm2,ymm0 68 | 0x0000000000006236 <+966>: mov eax,0x22ccb20 69 | 0x000000000000623b <+971>: kmovd k1,eax 70 | 0x000000000000623f <+975>: vpminsb ymm1{k1},ymm2,ymm0 71 | 0x0000000000006245 <+981>: vmovdqa ymm0,YMMWORD PTR [rip+0x56fd3] # 0x5d220 72 | 0x000000000000624d <+989>: vpermb ymm0,ymm0,ymm1 73 | 0x0000000000006253 <+995>: vpmaxsb ymm2,ymm1,ymm0 74 | 0x0000000000006258 <+1000>: mov eax,0x54aad48 75 | 0x000000000000625d <+1005>: kmovd k1,eax 76 | 0x0000000000006261 <+1009>: vpminsb ymm2{k1},ymm1,ymm0 77 | 0x0000000000006267 <+1015>: vmovdqa ymm0,YMMWORD PTR [rip+0x56fd1] # 0x5d240 78 | 0x000000000000626f <+1023>: vmovdqu YMMWORD PTR [rsp+0x130],ymm2 79 | 0x0000000000006278 <+1032>: vpermb ymm0,ymm0,ymm2 80 | 0x000000000000627e <+1038>: 
vmovdqu YMMWORD PTR [rsp+0x280],ymm0 81 | 0x0000000000006287 <+1047>: mov ebp,0xaaaaaa8 82 | 0x000000000000628c <+1052>: vpxor xmm0,xmm0,xmm0 83 | 0x0000000000006290 <+1056>: vmovdqa XMMWORD PTR [rsp+0xe0],xmm0 84 | 0x0000000000006299 <+1065>: lea rsi,[rsp+0xe0] 85 | 0x00000000000062a1 <+1073>: mov edi,0x1 86 | 0x00000000000062a6 <+1078>: vzeroupper 87 | 0x00000000000062a9 <+1081>: call 0x5470 -------------------------------------------------------------------------------- /disassemble/asm/int8_64.asm: -------------------------------------------------------------------------------- 1 | 0x00000000000060b8 <+584>: call 0x5470 2 | 0x00000000000060bd <+589>: mov rbx,QWORD PTR [rsp+0x90] 3 | 0x00000000000060c5 <+597>: vmovdqu64 zmm2,ZMMWORD PTR [rsp+0x100] 4 | 0x00000000000060cd <+605>: vprold zmm0,zmm2,0x10 5 | 0x00000000000060d4 <+612>: vpminsb zmm1,zmm2,zmm0 6 | 0x00000000000060da <+618>: movabs rax,0xcccccccccccccccc 7 | 0x00000000000060e4 <+628>: kmovq k1,rax 8 | 0x00000000000060e9 <+633>: vpmaxsb zmm1{k1},zmm2,zmm0 9 | 0x00000000000060ef <+639>: vpshufb zmm0,zmm1,ZMMWORD PTR [rip+0x57047] # 0x5d140 10 | 0x00000000000060f9 <+649>: vpminsb zmm2,zmm1,zmm0 11 | 0x00000000000060ff <+655>: movabs rax,0xaaaaaaaaaaaaaaaa 12 | 0x0000000000006109 <+665>: kmovq k1,rax 13 | 0x000000000000610e <+670>: vpmaxsb zmm2{k1},zmm1,zmm0 14 | 0x0000000000006114 <+676>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57062] # 0x5d180 15 | 0x000000000000611e <+686>: vpermb zmm0,zmm0,zmm2 16 | 0x0000000000006124 <+692>: vpminsb zmm1,zmm2,zmm0 17 | 0x000000000000612a <+698>: movabs rax,0xdddd44d4d4dd4444 18 | 0x0000000000006134 <+708>: kmovq k1,rax 19 | 0x0000000000006139 <+713>: vpmaxsb zmm1{k1},zmm2,zmm0 20 | 0x000000000000613f <+719>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57077] # 0x5d1c0 21 | 0x0000000000006149 <+729>: vpermb zmm0,zmm0,zmm1 22 | 0x000000000000614f <+735>: vpminsb zmm2,zmm1,zmm0 23 | 0x0000000000006155 <+741>: movabs rax,0xff6f9960f9660900 24 | 0x000000000000615f <+751>: kmovq k1,rax 25 | 
0x0000000000006164 <+756>: vpmaxsb zmm2{k1},zmm1,zmm0 26 | 0x000000000000616a <+762>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x5708c] # 0x5d200 27 | 0x0000000000006174 <+772>: vpermb zmm0,zmm0,zmm2 28 | 0x000000000000617a <+778>: vpminsb zmm1,zmm2,zmm0 29 | 0x0000000000006180 <+784>: movabs rax,0xff96ff9966009600 30 | 0x000000000000618a <+794>: kmovq k1,rax 31 | 0x000000000000618f <+799>: vpmaxsb zmm1{k1},zmm2,zmm0 32 | 0x0000000000006195 <+805>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x570a1] # 0x5d240 33 | 0x000000000000619f <+815>: vpermb zmm0,zmm0,zmm1 34 | 0x00000000000061a5 <+821>: vpminsb zmm2,zmm1,zmm0 35 | 0x00000000000061ab <+827>: movabs rax,0xf6f96f6f09096090 36 | 0x00000000000061b5 <+837>: kmovq k1,rax 37 | 0x00000000000061ba <+842>: vpmaxsb zmm2{k1},zmm1,zmm0 38 | 0x00000000000061c0 <+848>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x570b6] # 0x5d280 39 | 0x00000000000061ca <+858>: vpermb zmm0,zmm0,zmm2 40 | 0x00000000000061d0 <+864>: vpmaxsb zmm1,zmm2,zmm0 41 | 0x00000000000061d6 <+870>: movabs rax,0x6096960f9696f96 42 | 0x00000000000061e0 <+880>: kmovq k1,rax 43 | 0x00000000000061e5 <+885>: vpminsb zmm1{k1},zmm2,zmm0 44 | 0x00000000000061eb <+891>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x570cb] # 0x5d2c0 45 | 0x00000000000061f5 <+901>: vpermb zmm0,zmm0,zmm1 46 | 0x00000000000061fb <+907>: vpmaxsb zmm2,zmm1,zmm0 47 | 0x0000000000006201 <+913>: movabs rax,0x960f00ff0f96f0 48 | 0x000000000000620b <+923>: kmovq k1,rax 49 | 0x0000000000006210 <+928>: vpminsb zmm2{k1},zmm1,zmm0 50 | 0x0000000000006216 <+934>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x570e0] # 0x5d300 51 | 0x0000000000006220 <+944>: vpermb zmm0,zmm0,zmm2 52 | 0x0000000000006226 <+950>: vpmaxsb zmm1,zmm2,zmm0 53 | 0x000000000000622c <+956>: movabs rax,0x690f096f0f6960 54 | 0x0000000000006236 <+966>: kmovq k1,rax 55 | 0x000000000000623b <+971>: vpminsb zmm1{k1},zmm2,zmm0 56 | 0x0000000000006241 <+977>: vpshufb zmm0,zmm1,ZMMWORD PTR [rip+0x570f5] # 0x5d340 57 | 0x000000000000624b <+987>: mov rax,QWORD PTR [rsp+0x98] 58 | 
0x0000000000006253 <+995>: mov QWORD PTR [rsp+0x150],rax 59 | 0x000000000000625b <+1003>: vpmaxsb zmm2,zmm1,zmm0 60 | 0x0000000000006261 <+1009>: movabs rax,0x6069f069f0600 61 | 0x000000000000626b <+1019>: kmovq k1,rax 62 | 0x0000000000006270 <+1024>: vpminsb zmm2{k1},zmm1,zmm0 63 | 0x0000000000006276 <+1030>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57100] # 0x5d380 64 | 0x0000000000006280 <+1040>: vpermb zmm0,zmm0,zmm2 65 | 0x0000000000006286 <+1046>: vpmaxsb zmm1,zmm2,zmm0 66 | 0x000000000000628c <+1052>: movabs rax,0x90f690f69000 67 | 0x0000000000006296 <+1062>: kmovq k1,rax 68 | 0x000000000000629b <+1067>: vpminsb zmm1{k1},zmm2,zmm0 69 | 0x00000000000062a1 <+1073>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57115] # 0x5d3c0 70 | 0x00000000000062ab <+1083>: vpermb zmm0,zmm0,zmm1 71 | 0x00000000000062b1 <+1089>: vpmaxsb zmm2,zmm1,zmm0 72 | 0x00000000000062b7 <+1095>: movabs rax,0xe8e0e8e06666 73 | 0x00000000000062c1 <+1105>: kmovq k1,rax 74 | 0x00000000000062c6 <+1110>: vpminsb zmm2{k1},zmm1,zmm0 75 | 0x00000000000062cc <+1116>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x5712a] # 0x5d400 76 | 0x00000000000062d6 <+1126>: vpermb zmm0,zmm0,zmm2 77 | 0x00000000000062dc <+1132>: vpmaxsb zmm1,zmm2,zmm0 78 | 0x00000000000062e2 <+1138>: movabs rax,0x88800884c6cecce 79 | 0x00000000000062ec <+1148>: kmovq k1,rax 80 | 0x00000000000062f1 <+1153>: vpminsb zmm1{k1},zmm2,zmm0 81 | 0x00000000000062f7 <+1159>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x5713f] # 0x5d440 82 | 0x0000000000006301 <+1169>: vpermb zmm0,zmm0,zmm1 83 | 0x0000000000006307 <+1175>: vpmaxsb zmm2,zmm1,zmm0 84 | 0x000000000000630d <+1181>: movabs rax,0xa00ca4cc48cd9ac 85 | 0x0000000000006317 <+1191>: kmovq k1,rax 86 | 0x000000000000631c <+1196>: vpminsb zmm2{k1},zmm1,zmm0 87 | 0x0000000000006322 <+1202>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57154] # 0x5d480 88 | 0x000000000000632c <+1212>: vpermb zmm0,zmm0,zmm2 89 | 0x0000000000006332 <+1218>: vpmaxsb zmm1,zmm2,zmm0 90 | 0x0000000000006338 <+1224>: movabs rax,0x246688ca8888 91 | 
0x0000000000006342 <+1234>: kmovq k1,rax 92 | 0x0000000000006347 <+1239>: vpminsb zmm1{k1},zmm2,zmm0 93 | 0x000000000000634d <+1245>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57169] # 0x5d4c0 94 | 0x0000000000006357 <+1255>: vpermb zmm0,zmm0,zmm1 95 | 0x000000000000635d <+1261>: vpmaxsb zmm2,zmm1,zmm0 96 | 0x0000000000006363 <+1267>: movabs rax,0xac88eeca8888 97 | 0x000000000000636d <+1277>: kmovq k1,rax 98 | 0x0000000000006372 <+1282>: vpminsb zmm2{k1},zmm1,zmm0 99 | 0x0000000000006378 <+1288>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x5717e] # 0x5d500 100 | 0x0000000000006382 <+1298>: vpermb zmm0,zmm0,zmm2 101 | 0x0000000000006388 <+1304>: vpmaxsb zmm1,zmm2,zmm0 102 | 0x000000000000638e <+1310>: movabs rax,0x44caaaaaaccc88 103 | 0x0000000000006398 <+1320>: kmovq k1,rax 104 | 0x000000000000639d <+1325>: vpminsb zmm1{k1},zmm2,zmm0 105 | 0x00000000000063a3 <+1331>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x57193] # 0x5d540 106 | 0x00000000000063ad <+1341>: vpermb zmm0,zmm0,zmm1 107 | 0x00000000000063b3 <+1347>: vpmaxsb zmm2,zmm1,zmm0 108 | 0x00000000000063b9 <+1353>: movabs rax,0xaacaaccaacaa88 109 | 0x00000000000063c3 <+1363>: kmovq k1,rax 110 | 0x00000000000063c8 <+1368>: vpminsb zmm2{k1},zmm1,zmm0 111 | 0x00000000000063ce <+1374>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x571a8] # 0x5d580 112 | 0x00000000000063d8 <+1384>: vpermb zmm0,zmm0,zmm2 113 | 0x00000000000063de <+1390>: vpmaxsb zmm1,zmm2,zmm0 114 | 0x00000000000063e4 <+1396>: movabs rax,0x4ccaccaaccaccc8 115 | 0x00000000000063ee <+1406>: kmovq k1,rax 116 | 0x00000000000063f3 <+1411>: vpminsb zmm1{k1},zmm2,zmm0 117 | 0x00000000000063f9 <+1417>: vmovdqa64 zmm0,ZMMWORD PTR [rip+0x571bd] # 0x5d5c0 118 | 0x0000000000006403 <+1427>: vmovdqu64 ZMMWORD PTR [rsp+0x270],zmm1 119 | 0x000000000000640e <+1438>: vpermb zmm0,zmm0,zmm1 120 | 0x0000000000006414 <+1444>: vmovdqu64 ZMMWORD PTR [rsp+0x2c0],zmm0 121 | 0x000000000000641c <+1452>: movabs r13,0xaaaaaaaaaaaaaa8 122 | 0x0000000000006426 <+1462>: vpxor xmm0,xmm0,xmm0 123 | 0x000000000000642a 
<+1466>: vmovdqa XMMWORD PTR [rsp+0xb0],xmm0 124 | 0x0000000000006433 <+1475>: lea rsi,[rsp+0xb0] 125 | 0x000000000000643b <+1483>: mov edi,0x1 126 | 0x0000000000006440 <+1488>: vzeroupper 127 | 0x0000000000006443 <+1491>: call 0x5470 -------------------------------------------------------------------------------- /disassemble/asm/int8_8.asm: -------------------------------------------------------------------------------- 1 | 0x000000000005bd27 <+775>: call r14 2 | 0x000000000005bd2a <+778>: mov r15,QWORD PTR [rsp+0xc0] 3 | 0x000000000005bd32 <+786>: mov rax,QWORD PTR [rsp+0xc8] 4 | 0x000000000005bd3a <+794>: mov QWORD PTR [rsp+0x128],rax 5 | 0x000000000005bd42 <+802>: vmovdqa xmm2,XMMWORD PTR [rsp] 6 | 0x000000000005bd47 <+807>: vprold xmm0,xmm2,0x10 7 | 0x000000000005bd4e <+814>: vpminsb xmm1,xmm2,xmm0 8 | 0x000000000005bd53 <+819>: vpmaxsb xmm0,xmm2,xmm0 9 | 0x000000000005bd58 <+824>: vpblendw xmm0,xmm1,xmm0,0xa 10 | 0x000000000005bd5e <+830>: vpshufd xmm1,xmm0,0xe1 11 | 0x000000000005bd63 <+835>: vpminsb xmm2,xmm0,xmm1 12 | 0x000000000005bd68 <+840>: vpmaxsb xmm0,xmm0,xmm1 13 | 0x000000000005bd6d <+845>: vpblendd xmm0,xmm2,xmm0,0x2 14 | 0x000000000005bd73 <+851>: movabs rax,0xfffffffffffd56d8 15 | 0x000000000005bd7d <+861>: vpshufb xmm1,xmm0,XMMWORD PTR [r13+rax*1+0x0] 16 | 0x000000000005bd84 <+868>: vpminsb xmm2,xmm0,xmm1 17 | 0x000000000005bd89 <+873>: mov ax,0xaa 18 | 0x000000000005bd8d <+877>: kmovd k1,eax 19 | 0x000000000005bd91 <+881>: vpmaxsb xmm2{k1},xmm0,xmm1 20 | 0x000000000005bd97 <+887>: vpshuflw xmm0,xmm2,0xd8 21 | 0x000000000005bd9c <+892>: vpminsb xmm1,xmm2,xmm0 22 | 0x000000000005bda1 <+897>: vpmaxsb xmm0,xmm2,xmm0 23 | 0x000000000005bda6 <+902>: vpblendw xmm0,xmm0,xmm1,0x2 24 | 0x000000000005bdac <+908>: movabs rax,0xfffffffffffd56e8 25 | 0x000000000005bdb6 <+918>: vpshufb xmm1,xmm0,XMMWORD PTR [r13+rax*1+0x0] 26 | 0x000000000005bdbd <+925>: vpmaxsb xmm2,xmm0,xmm1 27 | 0x000000000005bdc2 <+930>: mov ax,0xa 28 | 0x000000000005bdc6 <+934>: kmovd 
k1,eax 29 | 0x000000000005bdca <+938>: vpminsb xmm2{k1},xmm0,xmm1 30 | 0x000000000005bdd0 <+944>: movabs rax,0xfffffffffffd56f8 31 | 0x000000000005bdda <+954>: vmovdqa XMMWORD PTR [rsp+0x240],xmm2 32 | 0x000000000005bde3 <+963>: vpshufb xmm0,xmm2,XMMWORD PTR [r13+rax*1+0x0] 33 | 0x000000000005bdea <+970>: vmovdqa XMMWORD PTR [rsp+0x230],xmm0 34 | 0x000000000005bdf3 <+979>: vpxor xmm0,xmm0,xmm0 35 | 0x000000000005bdf7 <+983>: vmovdqa XMMWORD PTR [rsp+0xe0],xmm0 36 | 0x000000000005be00 <+992>: lea rsi,[rsp+0xe0] 37 | 0x000000000005be08 <+1000>: mov edi,0x1 38 | 0x000000000005be0d <+1005>: call r14 -------------------------------------------------------------------------------- /disassemble/int16_8.asm: -------------------------------------------------------------------------------- 1 | GNU gdb (Ubuntu 12.1-0ubuntu1~22.04) 12.1 2 | Copyright (C) 2022 Free Software Foundation, Inc. 3 | License GPLv3+: GNU GPL version 3 or later 4 | This is free software: you are free to change and redistribute it. 5 | There is NO WARRANTY, to the extent permitted by law. 6 | Type "show copying" and "show warranty" for details. 7 | This GDB was configured as "x86_64-linux-gnu". 8 | Type "show configuration" for configuration details. 9 | For bug reporting instructions, please see: 10 | . 11 | Find the GDB manual and other documentation resources online at: 12 | . 13 | 14 | For help, type "help". 15 | Type "apropos word" to search for commands related to "word"... 16 | Reading symbols from main... 
17 | Dump of assembler code for function main: 18 | 0x0000000000005d10 <+0>: push rbp 19 | 0x0000000000005d11 <+1>: push r15 20 | 0x0000000000005d13 <+3>: push r14 21 | 0x0000000000005d15 <+5>: push r13 22 | 0x0000000000005d17 <+7>: push r12 23 | 0x0000000000005d19 <+9>: push rbx 24 | 0x0000000000005d1a <+10>: sub rsp,0x1e8 25 | 0x0000000000005d21 <+17>: call 0x2eef0 26 | 0x0000000000005d26 <+22>: vpxor xmm0,xmm0,xmm0 27 | 0x0000000000005d2a <+26>: mov ebx,0x9 28 | 0x0000000000005d2f <+31>: xor r14d,r14d 29 | 0x0000000000005d32 <+34>: data16 data16 data16 data16 cs nop WORD PTR [rax+rax*1+0x0] 30 | 0x0000000000005d40 <+48>: vmovdqa XMMWORD PTR [rsp+0x10],xmm0 31 | 0x0000000000005d46 <+54>: call 0x2de40 32 | 0x0000000000005d4b <+59>: mov edx,0x64 33 | 0x0000000000005d50 <+64>: mov rdi,rax 34 | 0x0000000000005d53 <+67>: xor esi,esi 35 | 0x0000000000005d55 <+69>: call 0x2e250 36 | 0x0000000000005d5a <+74>: vpbroadcastw xmm0,r14d 37 | 0x0000000000005d60 <+80>: vpcmpeqw k1,xmm0,XMMWORD PTR [rip+0x56386] # 0x5c0f0 38 | 0x0000000000005d6a <+90>: vmovdqa xmm0,XMMWORD PTR [rsp+0x10] 39 | 0x0000000000005d70 <+96>: vpbroadcastw xmm0{k1},eax 40 | 0x0000000000005d76 <+102>: dec rbx 41 | 0x0000000000005d79 <+105>: inc r14 42 | 0x0000000000005d7c <+108>: cmp rbx,0x1 43 | 0x0000000000005d80 <+112>: ja 0x5d40 44 | 0x0000000000005d82 <+114>: vmovdqa XMMWORD PTR [rsp+0x10],xmm0 45 | 0x0000000000005d88 <+120>: mov edi,0x8 46 | 0x0000000000005d8d <+125>: call 0x7dd0 <$stdlib::$builtin::$string::_calc_initial_buffer_size($stdlib::$builtin::$int::Int)> 47 | 0x0000000000005d92 <+130>: mov rbx,rax 48 | 0x0000000000005d95 <+133>: test rax,rax 49 | 0x0000000000005d98 <+136>: jle 0x5daf 50 | 0x0000000000005d9a <+138>: mov edi,0x1 51 | 0x0000000000005d9f <+143>: mov rsi,rbx 52 | 0x0000000000005da2 <+146>: call 0x2d320 53 | 0x0000000000005da7 <+151>: mov r14,rax 54 | 0x0000000000005daa <+154>: mov r15,rbx 55 | 0x0000000000005dad <+157>: jmp 0x5db5 56 | 0x0000000000005daf <+159>: xor r14d,r14d 
57 | 0x0000000000005db2 <+162>: xor r15d,r15d 58 | 0x0000000000005db5 <+165>: lea rdx,[rip+0x56374] # 0x5c130 59 | 0x0000000000005dbc <+172>: mov ecx,0x8 60 | 0x0000000000005dc1 <+177>: mov rdi,r14 61 | 0x0000000000005dc4 <+180>: mov rsi,rbx 62 | 0x0000000000005dc7 <+183>: xor eax,eax 63 | 0x0000000000005dc9 <+185>: call 0x57c0 64 | 0x0000000000005dce <+190>: cdqe 65 | 0x0000000000005dd0 <+192>: inc rax 66 | 0x0000000000005dd3 <+195>: mov QWORD PTR [rsp+0x50],r14 67 | 0x0000000000005dd8 <+200>: mov QWORD PTR [rsp+0x58],rax 68 | 0x0000000000005ddd <+205>: mov QWORD PTR [rsp+0x60],r15 69 | 0x0000000000005de2 <+210>: lea rdx,[rip+0x56357] # 0x5c140 70 | 0x0000000000005de9 <+217>: lea rdi,[rsp+0xf8] 71 | 0x0000000000005df1 <+225>: lea rsi,[rsp+0x50] 72 | 0x0000000000005df6 <+230>: mov ecx,0x7 73 | 0x0000000000005dfb <+235>: call 0xd420 <$stdlib::$builtin::$string::String::__radd__(,$stdlib::$builtin::$string::String,$stdlib::$builtin::$string_literal::StringLiteral)> 74 | 0x0000000000005e00 <+240>: mov rdi,QWORD PTR [rsp+0x50] 75 | 0x0000000000005e05 <+245>: test rdi,rdi 76 | 0x0000000000005e08 <+248>: je 0x5e0f 77 | 0x0000000000005e0a <+250>: call 0x2d340 78 | 0x0000000000005e0f <+255>: mov edi,0x1 79 | 0x0000000000005e14 <+260>: mov esi,0x3 80 | 0x0000000000005e19 <+265>: call 0x2d320 81 | 0x0000000000005e1e <+270>: xor ecx,ecx 82 | 0x0000000000005e20 <+272>: mov BYTE PTR [rax+rcx*1],0x0 83 | 0x0000000000005e24 <+276>: inc rcx 84 | 0x0000000000005e27 <+279>: cmp rcx,0x3 85 | 0x0000000000005e2b <+283>: jne 0x5e20 86 | 0x0000000000005e2d <+285>: mov WORD PTR [rax],0x203a 87 | 0x0000000000005e32 <+290>: mov BYTE PTR [rax+0x2],0x0 88 | 0x0000000000005e36 <+294>: mov QWORD PTR [rsp+0x68],rax 89 | 0x0000000000005e3b <+299>: mov QWORD PTR [rsp+0x70],0x3 90 | 0x0000000000005e44 <+308>: mov QWORD PTR [rsp+0x78],0x3 91 | 0x0000000000005e4d <+317>: lea rdi,[rsp+0x110] 92 | 0x0000000000005e55 <+325>: lea rsi,[rsp+0xf8] 93 | 0x0000000000005e5d <+333>: lea rdx,[rsp+0x68] 94 | 
0x0000000000005e62 <+338>: call 0xcfe0 <$stdlib::$builtin::$string::String::__add__(,$stdlib::$builtin::$string::String,$stdlib::$builtin::$string::String)> 95 | 0x0000000000005e67 <+343>: mov rdi,QWORD PTR [rsp+0x68] 96 | 0x0000000000005e6c <+348>: test rdi,rdi 97 | 0x0000000000005e6f <+351>: je 0x5e76 98 | 0x0000000000005e71 <+353>: call 0x2d340 99 | 0x0000000000005e76 <+358>: mov rdi,QWORD PTR [rsp+0xf8] 100 | 0x0000000000005e7e <+366>: test rdi,rdi 101 | 0x0000000000005e81 <+369>: je 0x5e88 102 | 0x0000000000005e83 <+371>: call 0x2d340 103 | 0x0000000000005e88 <+376>: lea rbx,[rsp+0x1b8] 104 | 0x0000000000005e90 <+384>: mov rdi,rbx 105 | 0x0000000000005e93 <+387>: vmovaps xmm0,XMMWORD PTR [rsp+0x10] 106 | 0x0000000000005e99 <+393>: call 0x5ae0 <$stdlib::$builtin::$simd::SIMD::__str__(,$stdlib::$builtin::$simd::SIMD[type, size]),_74x13_type=si16,_74x26_size=8> 107 | 0x0000000000005e9e <+398>: lea rdi,[rsp+0x128] 108 | 0x0000000000005ea6 <+406>: lea rsi,[rsp+0x110] 109 | 0x0000000000005eae <+414>: mov rdx,rbx 110 | 0x0000000000005eb1 <+417>: call 0xcfe0 <$stdlib::$builtin::$string::String::__add__(,$stdlib::$builtin::$string::String,$stdlib::$builtin::$string::String)> 111 | 0x0000000000005eb6 <+422>: mov rdi,QWORD PTR [rsp+0x1b8] 112 | 0x0000000000005ebe <+430>: test rdi,rdi 113 | 0x0000000000005ec1 <+433>: je 0x5ec8 114 | 0x0000000000005ec3 <+435>: call 0x2d340 115 | 0x0000000000005ec8 <+440>: mov rdi,QWORD PTR [rsp+0x110] 116 | 0x0000000000005ed0 <+448>: test rdi,rdi 117 | 0x0000000000005ed3 <+451>: je 0x5eda 118 | 0x0000000000005ed5 <+453>: call 0x2d340 119 | 0x0000000000005eda <+458>: lea rdi,[rsp+0x128] 120 | 0x0000000000005ee2 <+466>: call 0x8880 <$stdlib::$builtin::$io::print($stdlib::$builtin::$string::String)> 121 | 0x0000000000005ee7 <+471>: mov rdi,QWORD PTR [rsp+0x128] 122 | 0x0000000000005eef <+479>: test rdi,rdi 123 | 0x0000000000005ef2 <+482>: je 0x5ef9 124 | 0x0000000000005ef4 <+484>: call 0x2d340 125 | 0x0000000000005ef9 <+489>: vxorps 
xmm0,xmm0,xmm0 126 | 0x0000000000005efd <+493>: vmovaps XMMWORD PTR [rsp+0x20],xmm0 127 | 0x0000000000005f03 <+499>: lea rsi,[rsp+0x20] 128 | 0x0000000000005f08 <+504>: mov edi,0x1 129 | 0x0000000000005f0d <+509>: call 0x5470 130 | 0x0000000000005f12 <+514>: mov rbx,QWORD PTR [rsp+0x20] 131 | 0x0000000000005f17 <+519>: mov r12,QWORD PTR [rsp+0x28] 132 | 0x0000000000005f1c <+524>: vmovdqa xmm2,XMMWORD PTR [rsp+0x10] 133 | 0x0000000000005f22 <+530>: vpshufd xmm0,xmm2,0xb1 134 | 0x0000000000005f27 <+535>: vpminsw xmm1,xmm2,xmm0 135 | 0x0000000000005f2b <+539>: vpmaxsw xmm0,xmm2,xmm0 136 | 0x0000000000005f2f <+543>: vpblendd xmm0,xmm1,xmm0,0xa 137 | 0x0000000000005f35 <+549>: vpshufd xmm1,xmm0,0x4e 138 | 0x0000000000005f3a <+554>: vpminsw xmm2,xmm0,xmm1 139 | 0x0000000000005f3e <+558>: vpmaxsw xmm0,xmm0,xmm1 140 | 0x0000000000005f42 <+562>: vpblendd xmm0,xmm2,xmm0,0xc 141 | 0x0000000000005f48 <+568>: vprold xmm1,xmm0,0x10 142 | 0x0000000000005f4f <+575>: vpminsw xmm2,xmm0,xmm1 143 | 0x0000000000005f53 <+579>: vpmaxsw xmm0,xmm0,xmm1 144 | 0x0000000000005f57 <+583>: vpblendw xmm0,xmm2,xmm0,0xaa 145 | 0x0000000000005f5d <+589>: vpshufd xmm1,xmm0,0xd8 146 | 0x0000000000005f62 <+594>: vpminsw xmm2,xmm0,xmm1 147 | 0x0000000000005f66 <+598>: vpmaxsw xmm0,xmm0,xmm1 148 | 0x0000000000005f6a <+602>: vpblendd xmm0,xmm0,xmm2,0x2 149 | 0x0000000000005f70 <+608>: vpshufb xmm1,xmm0,XMMWORD PTR [rip+0x56187] # 0x5c100 150 | 0x0000000000005f79 <+617>: vpminsw xmm2,xmm0,xmm1 151 | 0x0000000000005f7d <+621>: vpmaxsw xmm0,xmm0,xmm1 152 | 0x0000000000005f81 <+625>: vpblendw xmm0,xmm0,xmm2,0xa 153 | 0x0000000000005f87 <+631>: vpshufb xmm1,xmm0,XMMWORD PTR [rip+0x56180] # 0x5c110 154 | 0x0000000000005f90 <+640>: vpminsw xmm2,xmm0,xmm1 155 | 0x0000000000005f94 <+644>: vmovdqa XMMWORD PTR [rsp+0x10],xmm2 156 | 0x0000000000005f9a <+650>: vpmaxsw xmm0,xmm0,xmm1 157 | 0x0000000000005f9e <+654>: vmovdqa XMMWORD PTR [rsp+0xe0],xmm0 158 | 0x0000000000005fa7 <+663>: vpxor xmm0,xmm0,xmm0 159 | 
0x0000000000005fab <+667>: vmovdqa XMMWORD PTR [rsp+0x30],xmm0 160 | 0x0000000000005fb1 <+673>: lea rsi,[rsp+0x30] 161 | 0x0000000000005fb6 <+678>: mov edi,0x1 162 | 0x0000000000005fbb <+683>: call 0x5470 163 | 0x0000000000005fc0 <+688>: mov r13,QWORD PTR [rsp+0x30] 164 | 0x0000000000005fc5 <+693>: sub r13,rbx 165 | 0x0000000000005fc8 <+696>: mov rbx,QWORD PTR [rsp+0x38] 166 | 0x0000000000005fcd <+701>: mov edi,0x8 167 | 0x0000000000005fd2 <+706>: call 0x7dd0 <$stdlib::$builtin::$string::_calc_initial_buffer_size($stdlib::$builtin::$int::Int)> 168 | 0x0000000000005fd7 <+711>: mov r14,rax 169 | 0x0000000000005fda <+714>: test rax,rax 170 | 0x0000000000005fdd <+717>: jle 0x5ff4 171 | 0x0000000000005fdf <+719>: mov edi,0x1 172 | 0x0000000000005fe4 <+724>: mov rsi,r14 173 | 0x0000000000005fe7 <+727>: call 0x2d320 174 | 0x0000000000005fec <+732>: mov r15,rax 175 | 0x0000000000005fef <+735>: mov rbp,r14 176 | 0x0000000000005ff2 <+738>: jmp 0x5ff9 177 | 0x0000000000005ff4 <+740>: xor r15d,r15d 178 | 0x0000000000005ff7 <+743>: xor ebp,ebp 179 | 0x0000000000005ff9 <+745>: vmovdqa xmm0,XMMWORD PTR [rsp+0xe0] 180 | 0x0000000000006002 <+754>: vpblendw xmm0,xmm0,XMMWORD PTR [rsp+0x10],0x2a 181 | 0x000000000000600a <+762>: vmovdqa XMMWORD PTR [rsp+0x10],xmm0 182 | 0x0000000000006010 <+768>: imul r13,r13,0x3b9aca00 183 | 0x0000000000006017 <+775>: sub rbx,r12 184 | 0x000000000000601a <+778>: lea rdx,[rip+0x5610f] # 0x5c130 185 | 0x0000000000006021 <+785>: mov ecx,0x8 186 | 0x0000000000006026 <+790>: mov rdi,r15 187 | 0x0000000000006029 <+793>: mov rsi,r14 188 | 0x000000000000602c <+796>: xor eax,eax 189 | 0x000000000000602e <+798>: call 0x57c0 190 | 0x0000000000006033 <+803>: cdqe 191 | 0x0000000000006035 <+805>: inc rax 192 | 0x0000000000006038 <+808>: mov QWORD PTR [rsp+0x80],r15 193 | 0x0000000000006040 <+816>: mov QWORD PTR [rsp+0x88],rax 194 | 0x0000000000006048 <+824>: mov QWORD PTR [rsp+0x90],rbp 195 | 0x0000000000006050 <+832>: lea rdx,[rip+0x560f9] # 0x5c150 196 | 
0x0000000000006057 <+839>: lea rdi,[rsp+0x140] 197 | 0x000000000000605f <+847>: lea rsi,[rsp+0x80] 198 | 0x0000000000006067 <+855>: mov ecx,0x6 199 | 0x000000000000606c <+860>: call 0xd420 <$stdlib::$builtin::$string::String::__radd__(,$stdlib::$builtin::$string::String,$stdlib::$builtin::$string_literal::StringLiteral)> 200 | 0x0000000000006071 <+865>: mov rdi,QWORD PTR [rsp+0x80] 201 | 0x0000000000006079 <+873>: test rdi,rdi 202 | 0x000000000000607c <+876>: je 0x6083 203 | 0x000000000000607e <+878>: call 0x2d340 204 | 0x0000000000006083 <+883>: add rbx,r13 205 | 0x0000000000006086 <+886>: mov edi,0x1 206 | 0x000000000000608b <+891>: mov esi,0x3 207 | 0x0000000000006090 <+896>: call 0x2d320 208 | 0x0000000000006095 <+901>: xor ecx,ecx 209 | 0x0000000000006097 <+903>: nop WORD PTR [rax+rax*1+0x0] 210 | 0x00000000000060a0 <+912>: mov BYTE PTR [rax+rcx*1],0x0 211 | 0x00000000000060a4 <+916>: inc rcx 212 | 0x00000000000060a7 <+919>: cmp rcx,0x3 213 | 0x00000000000060ab <+923>: jne 0x60a0 214 | 0x00000000000060ad <+925>: mov WORD PTR [rax],0x203a 215 | 0x00000000000060b2 <+930>: mov BYTE PTR [rax+0x2],0x0 216 | 0x00000000000060b6 <+934>: mov QWORD PTR [rsp+0x98],rax 217 | 0x00000000000060be <+942>: mov QWORD PTR [rsp+0xa0],0x3 218 | 0x00000000000060ca <+954>: mov QWORD PTR [rsp+0xa8],0x3 219 | 0x00000000000060d6 <+966>: lea rdi,[rsp+0x158] 220 | 0x00000000000060de <+974>: lea rsi,[rsp+0x140] 221 | 0x00000000000060e6 <+982>: lea rdx,[rsp+0x98] 222 | 0x00000000000060ee <+990>: call 0xcfe0 <$stdlib::$builtin::$string::String::__add__(,$stdlib::$builtin::$string::String,$stdlib::$builtin::$string::String)> 223 | 0x00000000000060f3 <+995>: mov rdi,QWORD PTR [rsp+0x98] 224 | 0x00000000000060fb <+1003>: test rdi,rdi 225 | 0x00000000000060fe <+1006>: je 0x6105 226 | 0x0000000000006100 <+1008>: call 0x2d340 227 | 0x0000000000006105 <+1013>: mov rdi,QWORD PTR [rsp+0x140] 228 | 0x000000000000610d <+1021>: test rdi,rdi 229 | 0x0000000000006110 <+1024>: je 0x6117 230 | 
0x0000000000006112 <+1026>: call 0x2d340 231 | 0x0000000000006117 <+1031>: lea r14,[rsp+0x1d0] 232 | 0x000000000000611f <+1039>: mov rdi,r14 233 | 0x0000000000006122 <+1042>: vmovaps xmm0,XMMWORD PTR [rsp+0x10] 234 | 0x0000000000006128 <+1048>: call 0x5ae0 <$stdlib::$builtin::$simd::SIMD::__str__(,$stdlib::$builtin::$simd::SIMD[type, size]),_74x13_type=si16,_74x26_size=8> 235 | 0x000000000000612d <+1053>: lea rdi,[rsp+0x170] 236 | 0x0000000000006135 <+1061>: lea rsi,[rsp+0x158] 237 | 0x000000000000613d <+1069>: mov rdx,r14 238 | 0x0000000000006140 <+1072>: call 0xcfe0 <$stdlib::$builtin::$string::String::__add__(,$stdlib::$builtin::$string::String,$stdlib::$builtin::$string::String)> 239 | 0x0000000000006145 <+1077>: mov rdi,QWORD PTR [rsp+0x1d0] 240 | 0x000000000000614d <+1085>: test rdi,rdi 241 | 0x0000000000006150 <+1088>: je 0x6157 242 | 0x0000000000006152 <+1090>: call 0x2d340 243 | 0x0000000000006157 <+1095>: mov rdi,QWORD PTR [rsp+0x158] 244 | 0x000000000000615f <+1103>: test rdi,rdi 245 | 0x0000000000006162 <+1106>: je 0x6169 246 | 0x0000000000006164 <+1108>: call 0x2d340 247 | 0x0000000000006169 <+1113>: lea rdi,[rsp+0x170] 248 | 0x0000000000006171 <+1121>: call 0x8880 <$stdlib::$builtin::$io::print($stdlib::$builtin::$string::String)> 249 | 0x0000000000006176 <+1126>: mov rdi,QWORD PTR [rsp+0x170] 250 | 0x000000000000617e <+1134>: test rdi,rdi 251 | 0x0000000000006181 <+1137>: je 0x6188 252 | 0x0000000000006183 <+1139>: call 0x2d340 253 | 0x0000000000006188 <+1144>: vmovdqa xmm1,XMMWORD PTR [rsp+0x10] 254 | 0x000000000000618e <+1150>: vpshufd xmm0,xmm1,0xee 255 | 0x0000000000006193 <+1155>: vpaddw xmm0,xmm1,xmm0 256 | 0x0000000000006197 <+1159>: vpshufd xmm1,xmm0,0x55 257 | 0x000000000000619c <+1164>: vpaddw xmm0,xmm0,xmm1 258 | 0x00000000000061a0 <+1168>: vpsrld xmm1,xmm0,0x10 259 | 0x00000000000061a5 <+1173>: vpaddw xmm0,xmm0,xmm1 260 | 0x00000000000061a9 <+1177>: vmovw eax,xmm0 261 | 0x00000000000061af <+1183>: vmovw WORD PTR [rsp+0xe],xmm0 262 | 
0x00000000000061b7 <+1191>: lea rcx,[rsp+0xe] 263 | 0x00000000000061bc <+1196>: mov QWORD PTR [rsp+0x48],rcx 264 | 0x00000000000061c1 <+1201>: mov rdi,rbx 265 | 0x00000000000061c4 <+1204>: call 0x7dd0 <$stdlib::$builtin::$string::_calc_initial_buffer_size($stdlib::$builtin::$int::Int)> 266 | 0x00000000000061c9 <+1209>: mov r14,rax 267 | 0x00000000000061cc <+1212>: test rax,rax 268 | 0x00000000000061cf <+1215>: jle 0x61e6 269 | 0x00000000000061d1 <+1217>: mov edi,0x1 270 | 0x00000000000061d6 <+1222>: mov rsi,r14 271 | 0x00000000000061d9 <+1225>: call 0x2d320 272 | 0x00000000000061de <+1230>: mov r15,rax 273 | 0x00000000000061e1 <+1233>: mov r12,r14 274 | 0x00000000000061e4 <+1236>: jmp 0x61ec 275 | 0x00000000000061e6 <+1238>: xor r15d,r15d 276 | 0x00000000000061e9 <+1241>: xor r12d,r12d 277 | 0x00000000000061ec <+1244>: lea rdx,[rip+0x55f3d] # 0x5c130 278 | 0x00000000000061f3 <+1251>: mov rdi,r15 279 | 0x00000000000061f6 <+1254>: mov rsi,r14 280 | 0x00000000000061f9 <+1257>: mov rcx,rbx 281 | 0x00000000000061fc <+1260>: xor eax,eax 282 | 0x00000000000061fe <+1262>: call 0x57c0 283 | 0x0000000000006203 <+1267>: cdqe 284 | 0x0000000000006205 <+1269>: inc rax 285 | 0x0000000000006208 <+1272>: mov QWORD PTR [rsp+0xb0],r15 286 | 0x0000000000006210 <+1280>: mov QWORD PTR [rsp+0xb8],rax 287 | 0x0000000000006218 <+1288>: mov QWORD PTR [rsp+0xc0],r12 288 | 0x0000000000006220 <+1296>: lea rdx,[rip+0x55f39] # 0x5c160 289 | 0x0000000000006227 <+1303>: lea rdi,[rsp+0x188] 290 | 0x000000000000622f <+1311>: lea rsi,[rsp+0xb0] 291 | 0x0000000000006237 <+1319>: mov ecx,0xb 292 | 0x000000000000623c <+1324>: call 0xd420 <$stdlib::$builtin::$string::String::__radd__(,$stdlib::$builtin::$string::String,$stdlib::$builtin::$string_literal::StringLiteral)> 293 | 0x0000000000006241 <+1329>: mov rdi,QWORD PTR [rsp+0xb0] 294 | 0x0000000000006249 <+1337>: test rdi,rdi 295 | 0x000000000000624c <+1340>: je 0x6253 296 | 0x000000000000624e <+1342>: call 0x2d340 297 | 0x0000000000006253 <+1347>: 
mov edi,0x1 298 | 0x0000000000006258 <+1352>: mov esi,0x4 299 | 0x000000000000625d <+1357>: call 0x2d320 300 | 0x0000000000006262 <+1362>: xor ecx,ecx 301 | 0x0000000000006264 <+1364>: data16 data16 cs nop WORD PTR [rax+rax*1+0x0] 302 | 0x0000000000006270 <+1376>: mov BYTE PTR [rax+rcx*1],0x0 303 | 0x0000000000006274 <+1380>: inc rcx 304 | 0x0000000000006277 <+1383>: cmp rcx,0x4 305 | 0x000000000000627b <+1387>: jne 0x6270 306 | 0x000000000000627d <+1389>: mov DWORD PTR [rax],0x736e20 307 | 0x0000000000006283 <+1395>: mov QWORD PTR [rsp+0xc8],rax 308 | 0x000000000000628b <+1403>: mov QWORD PTR [rsp+0xd0],0x4 309 | 0x0000000000006297 <+1415>: mov QWORD PTR [rsp+0xd8],0x4 310 | 0x00000000000062a3 <+1427>: lea rdi,[rsp+0x1a0] 311 | 0x00000000000062ab <+1435>: lea rsi,[rsp+0x188] 312 | 0x00000000000062b3 <+1443>: lea rdx,[rsp+0xc8] 313 | 0x00000000000062bb <+1451>: call 0xcfe0 <$stdlib::$builtin::$string::String::__add__(,$stdlib::$builtin::$string::String,$stdlib::$builtin::$string::String)> 314 | 0x00000000000062c0 <+1456>: mov rdi,QWORD PTR [rsp+0xc8] 315 | 0x00000000000062c8 <+1464>: test rdi,rdi 316 | 0x00000000000062cb <+1467>: je 0x62d2 317 | 0x00000000000062cd <+1469>: call 0x2d340 318 | 0x00000000000062d2 <+1474>: mov rdi,QWORD PTR [rsp+0x188] 319 | 0x00000000000062da <+1482>: test rdi,rdi 320 | 0x00000000000062dd <+1485>: je 0x62e4 321 | 0x00000000000062df <+1487>: call 0x2d340 322 | 0x00000000000062e4 <+1492>: lea rdi,[rsp+0x1a0] 323 | 0x00000000000062ec <+1500>: call 0x8880 <$stdlib::$builtin::$io::print($stdlib::$builtin::$string::String)> 324 | 0x00000000000062f1 <+1505>: mov rdi,QWORD PTR [rsp+0x1a0] 325 | 0x00000000000062f9 <+1513>: test rdi,rdi 326 | 0x00000000000062fc <+1516>: je 0x6303 327 | 0x00000000000062fe <+1518>: call 0x2d340 328 | 0x0000000000006303 <+1523>: call 0x29740 329 | 0x0000000000006308 <+1528>: xor eax,eax 330 | 0x000000000000630a <+1530>: add rsp,0x1e8 331 | 0x0000000000006311 <+1537>: pop rbx 332 | 0x0000000000006312 <+1538>: pop 
r12 333 | 0x0000000000006314 <+1540>: pop r13 334 | 0x0000000000006316 <+1542>: pop r14 335 | 0x0000000000006318 <+1544>: pop r15 336 | 0x000000000000631a <+1546>: pop rbp 337 | 0x000000000000631b <+1547>: ret 338 | End of assembler dump. 339 | -------------------------------------------------------------------------------- /disassemble/int8_8.asm: -------------------------------------------------------------------------------- 1 | GNU gdb (Ubuntu 12.1-0ubuntu1~22.04) 12.1 2 | Copyright (C) 2022 Free Software Foundation, Inc. 3 | License GPLv3+: GNU GPL version 3 or later 4 | This is free software: you are free to change and redistribute it. 5 | There is NO WARRANTY, to the extent permitted by law. 6 | Type "show copying" and "show warranty" for details. 7 | This GDB was configured as "x86_64-linux-gnu". 8 | Type "show configuration" for configuration details. 9 | For bug reporting instructions, please see: 10 | . 11 | Find the GDB manual and other documentation resources online at: 12 | . 13 | 14 | For help, type "help". 15 | Type "apropos word" to search for commands related to "word"... 16 | Reading symbols from main... 
17 | Dump of assembler code for function main: 18 | 0x000000000005b890 <+0>: push rbp 19 | 0x000000000005b891 <+1>: push r15 20 | 0x000000000005b893 <+3>: push r14 21 | 0x000000000005b895 <+5>: push r13 22 | 0x000000000005b897 <+7>: push r12 23 | 0x000000000005b899 <+9>: push rbx 24 | 0x000000000005b89a <+10>: sub rsp,0x218 25 | 0x000000000005b8a1 <+17>: lea rax,[rip+0xfffffffffffffff9] # 0x5b8a1 26 | 0x000000000005b8a8 <+24>: movabs r13,0x2c197 27 | 0x000000000005b8b2 <+34>: add r13,rax 28 | 0x000000000005b8b5 <+37>: movabs rax,0x568 29 | 0x000000000005b8bf <+47>: call QWORD PTR [r13+rax*1+0x0] 30 | 0x000000000005b8c4 <+52>: vpxor xmm0,xmm0,xmm0 31 | 0x000000000005b8c8 <+56>: mov ebx,0x9 32 | 0x000000000005b8cd <+61>: xor r14d,r14d 33 | 0x000000000005b8d0 <+64>: movabs rax,0x538 34 | 0x000000000005b8da <+74>: mov r15,QWORD PTR [r13+rax*1+0x0] 35 | 0x000000000005b8df <+79>: movabs rax,0x560 36 | 0x000000000005b8e9 <+89>: mov r12,QWORD PTR [r13+rax*1+0x0] 37 | 0x000000000005b8ee <+94>: movabs rax,0xfffffffffffd56b8 38 | 0x000000000005b8f8 <+104>: vmovaps xmm1,XMMWORD PTR [r13+rax*1+0x0] 39 | 0x000000000005b8ff <+111>: vmovaps XMMWORD PTR [rsp+0x20],xmm1 40 | 0x000000000005b905 <+117>: data16 cs nop WORD PTR [rax+rax*1+0x0] 41 | 0x000000000005b910 <+128>: vmovdqa XMMWORD PTR [rsp+0x10],xmm0 42 | 0x000000000005b916 <+134>: call r15 43 | 0x000000000005b919 <+137>: mov edx,0x64 44 | 0x000000000005b91e <+142>: mov rdi,rax 45 | 0x000000000005b921 <+145>: xor esi,esi 46 | 0x000000000005b923 <+147>: call r12 47 | 0x000000000005b926 <+150>: vpbroadcastb xmm0,r14d 48 | 0x000000000005b92c <+156>: vpcmpeqb k1,xmm0,XMMWORD PTR [rsp+0x20] 49 | 0x000000000005b934 <+164>: vmovdqa xmm0,XMMWORD PTR [rsp+0x10] 50 | 0x000000000005b93a <+170>: vpbroadcastb xmm0{k1},eax 51 | 0x000000000005b940 <+176>: dec rbx 52 | 0x000000000005b943 <+179>: inc r14 53 | 0x000000000005b946 <+182>: cmp rbx,0x1 54 | 0x000000000005b94a <+186>: ja 0x5b910 55 | 0x000000000005b94c <+188>: vmovdqa XMMWORD PTR 
[rsp+0x10],xmm0 56 | 0x000000000005b952 <+194>: movabs rax,0x548 57 | 0x000000000005b95c <+204>: mov edi,0x8 58 | 0x000000000005b961 <+209>: call QWORD PTR [r13+rax*1+0x0] 59 | 0x000000000005b966 <+214>: mov r14,rax 60 | 0x000000000005b969 <+217>: movabs r12,0x518 61 | 0x000000000005b973 <+227>: test rax,rax 62 | 0x000000000005b976 <+230>: jle 0x5b98d 63 | 0x000000000005b978 <+232>: mov edi,0x1 64 | 0x000000000005b97d <+237>: mov rsi,r14 65 | 0x000000000005b980 <+240>: call QWORD PTR [r13+r12*1+0x0] 66 | 0x000000000005b985 <+245>: mov r15,rax 67 | 0x000000000005b988 <+248>: mov rbx,r14 68 | 0x000000000005b98b <+251>: jmp 0x5b992 69 | 0x000000000005b98d <+253>: xor r15d,r15d 70 | 0x000000000005b990 <+256>: xor ebx,ebx 71 | 0x000000000005b992 <+258>: movabs rdx,0x2588 72 | 0x000000000005b99c <+268>: add rdx,r13 73 | 0x000000000005b99f <+271>: movabs r8,0x580 74 | 0x000000000005b9a9 <+281>: mov ecx,0x8 75 | 0x000000000005b9ae <+286>: mov rdi,r15 76 | 0x000000000005b9b1 <+289>: mov rsi,r14 77 | 0x000000000005b9b4 <+292>: mov QWORD PTR [rsp+0x20],rdx 78 | 0x000000000005b9b9 <+297>: xor eax,eax 79 | 0x000000000005b9bb <+299>: call QWORD PTR [r13+r8*1+0x0] 80 | 0x000000000005b9c0 <+304>: cdqe 81 | 0x000000000005b9c2 <+306>: inc rax 82 | 0x000000000005b9c5 <+309>: mov QWORD PTR [rsp+0x70],r15 83 | 0x000000000005b9ca <+314>: mov QWORD PTR [rsp+0x78],rax 84 | 0x000000000005b9cf <+319>: mov QWORD PTR [rsp+0x80],rbx 85 | 0x000000000005b9d7 <+327>: movabs rdx,0x2598 86 | 0x000000000005b9e1 <+337>: add rdx,r13 87 | 0x000000000005b9e4 <+340>: movabs rax,0x500 88 | 0x000000000005b9ee <+350>: lea rdi,[rsp+0x128] 89 | 0x000000000005b9f6 <+358>: lea rsi,[rsp+0x70] 90 | 0x000000000005b9fb <+363>: mov ecx,0x7 91 | 0x000000000005ba00 <+368>: call QWORD PTR [r13+rax*1+0x0] 92 | 0x000000000005ba05 <+373>: mov rdi,QWORD PTR [rsp+0x70] 93 | 0x000000000005ba0a <+378>: movabs rbx,0x4f8 94 | 0x000000000005ba14 <+388>: test rdi,rdi 95 | 0x000000000005ba17 <+391>: je 0x5ba1e 96 | 
0x000000000005ba19 <+393>: call QWORD PTR [r13+rbx*1+0x0] 97 | 0x000000000005ba1e <+398>: mov edi,0x1 98 | 0x000000000005ba23 <+403>: mov esi,0x3 99 | 0x000000000005ba28 <+408>: call QWORD PTR [r13+r12*1+0x0] 100 | 0x000000000005ba2d <+413>: xor ecx,ecx 101 | 0x000000000005ba2f <+415>: nop 102 | 0x000000000005ba30 <+416>: mov BYTE PTR [rax+rcx*1],0x0 103 | 0x000000000005ba34 <+420>: inc rcx 104 | 0x000000000005ba37 <+423>: cmp rcx,0x3 105 | 0x000000000005ba3b <+427>: jne 0x5ba30 106 | 0x000000000005ba3d <+429>: mov WORD PTR [rax],0x203a 107 | 0x000000000005ba42 <+434>: mov BYTE PTR [rax+0x2],0x0 108 | 0x000000000005ba46 <+438>: mov QWORD PTR [rsp+0x88],rax 109 | 0x000000000005ba4e <+446>: mov QWORD PTR [rsp+0x90],0x3 110 | 0x000000000005ba5a <+458>: mov QWORD PTR [rsp+0x98],0x3 111 | 0x000000000005ba66 <+470>: movabs r15,0x578 112 | 0x000000000005ba70 <+480>: lea rdi,[rsp+0x140] 113 | 0x000000000005ba78 <+488>: lea rsi,[rsp+0x128] 114 | 0x000000000005ba80 <+496>: lea rdx,[rsp+0x88] 115 | 0x000000000005ba88 <+504>: call QWORD PTR [r13+r15*1+0x0] 116 | 0x000000000005ba8d <+509>: mov rdi,QWORD PTR [rsp+0x88] 117 | 0x000000000005ba95 <+517>: test rdi,rdi 118 | 0x000000000005ba98 <+520>: je 0x5ba9f 119 | 0x000000000005ba9a <+522>: call QWORD PTR [r13+rbx*1+0x0] 120 | 0x000000000005ba9f <+527>: mov rdi,QWORD PTR [rsp+0x128] 121 | 0x000000000005baa7 <+535>: test rdi,rdi 122 | 0x000000000005baaa <+538>: je 0x5bab1 123 | 0x000000000005baac <+540>: call QWORD PTR [r13+rbx*1+0x0] 124 | 0x000000000005bab1 <+545>: movabs rax,0xfffffffffffd3b58 125 | 0x000000000005babb <+555>: add rax,r13 126 | 0x000000000005babe <+558>: lea r14,[rsp+0x1e8] 127 | 0x000000000005bac6 <+566>: mov rdi,r14 128 | 0x000000000005bac9 <+569>: vmovaps xmm0,XMMWORD PTR [rsp+0x10] 129 | 0x000000000005bacf <+575>: mov QWORD PTR [rsp+0x60],rax 130 | 0x000000000005bad4 <+580>: call rax 131 | 0x000000000005bad6 <+582>: lea rdi,[rsp+0x158] 132 | 0x000000000005bade <+590>: lea rsi,[rsp+0x140] 133 | 
0x000000000005bae6 <+598>: mov rdx,r14 134 | 0x000000000005bae9 <+601>: call QWORD PTR [r13+r15*1+0x0] 135 | 0x000000000005baee <+606>: mov rdi,QWORD PTR [rsp+0x1e8] 136 | 0x000000000005baf6 <+614>: test rdi,rdi 137 | 0x000000000005baf9 <+617>: je 0x5bb00 138 | 0x000000000005bafb <+619>: call QWORD PTR [r13+rbx*1+0x0] 139 | 0x000000000005bb00 <+624>: mov rdi,QWORD PTR [rsp+0x140] 140 | 0x000000000005bb08 <+632>: test rdi,rdi 141 | 0x000000000005bb0b <+635>: je 0x5bb12 142 | 0x000000000005bb0d <+637>: call QWORD PTR [r13+rbx*1+0x0] 143 | 0x000000000005bb12 <+642>: movabs rax,0x4e8 144 | 0x000000000005bb1c <+652>: lea rdi,[rsp+0x158] 145 | 0x000000000005bb24 <+660>: call QWORD PTR [r13+rax*1+0x0] 146 | 0x000000000005bb29 <+665>: mov rdi,QWORD PTR [rsp+0x158] 147 | 0x000000000005bb31 <+673>: test rdi,rdi 148 | 0x000000000005bb34 <+676>: je 0x5bb3b 149 | 0x000000000005bb36 <+678>: call QWORD PTR [r13+rbx*1+0x0] 150 | 0x000000000005bb3b <+683>: vxorps xmm0,xmm0,xmm0 151 | 0x000000000005bb3f <+687>: vmovaps XMMWORD PTR [rsp+0x30],xmm0 152 | 0x000000000005bb45 <+693>: movabs rax,0x540 153 | 0x000000000005bb4f <+703>: mov r14,QWORD PTR [r13+rax*1+0x0] 154 | 0x000000000005bb54 <+708>: lea rsi,[rsp+0x30] 155 | 0x000000000005bb59 <+713>: mov edi,0x1 156 | 0x000000000005bb5e <+718>: call r14 157 | 0x000000000005bb61 <+721>: mov r15,QWORD PTR [rsp+0x30] 158 | 0x000000000005bb66 <+726>: mov rax,QWORD PTR [rsp+0x38] 159 | 0x000000000005bb6b <+731>: mov QWORD PTR [rsp+0x58],rax 160 | 0x000000000005bb70 <+736>: vmovdqa xmm2,XMMWORD PTR [rsp+0x10] 161 | 0x000000000005bb76 <+742>: vprold xmm0,xmm2,0x10 162 | 0x000000000005bb7d <+749>: vpminsb xmm1,xmm2,xmm0 163 | 0x000000000005bb82 <+754>: vpmaxsb xmm0,xmm2,xmm0 164 | 0x000000000005bb87 <+759>: vpblendw xmm0,xmm1,xmm0,0xa 165 | 0x000000000005bb8d <+765>: vpshufd xmm1,xmm0,0xe1 166 | 0x000000000005bb92 <+770>: vpminsb xmm2,xmm0,xmm1 167 | 0x000000000005bb97 <+775>: vpmaxsb xmm0,xmm0,xmm1 168 | 0x000000000005bb9c <+780>: vpblendd 
xmm0,xmm2,xmm0,0x2 169 | 0x000000000005bba2 <+786>: movabs rax,0xfffffffffffd56c8 170 | 0x000000000005bbac <+796>: vpshufb xmm1,xmm0,XMMWORD PTR [r13+rax*1+0x0] 171 | 0x000000000005bbb3 <+803>: vpminsb xmm2,xmm0,xmm1 172 | 0x000000000005bbb8 <+808>: mov ax,0xaa 173 | 0x000000000005bbbc <+812>: kmovd k1,eax 174 | 0x000000000005bbc0 <+816>: vpmaxsb xmm2{k1},xmm0,xmm1 175 | 0x000000000005bbc6 <+822>: vpshuflw xmm0,xmm2,0xd8 176 | 0x000000000005bbcb <+827>: vpminsb xmm1,xmm2,xmm0 177 | 0x000000000005bbd0 <+832>: vpmaxsb xmm0,xmm2,xmm0 178 | 0x000000000005bbd5 <+837>: vpblendw xmm0,xmm0,xmm1,0x2 179 | 0x000000000005bbdb <+843>: movabs rax,0xfffffffffffd56d8 180 | 0x000000000005bbe5 <+853>: vpshufb xmm1,xmm0,XMMWORD PTR [r13+rax*1+0x0] 181 | 0x000000000005bbec <+860>: vpmaxsb xmm2,xmm0,xmm1 182 | 0x000000000005bbf1 <+865>: mov ax,0xa 183 | 0x000000000005bbf5 <+869>: kmovd k1,eax 184 | 0x000000000005bbf9 <+873>: vpminsb xmm2{k1},xmm0,xmm1 185 | 0x000000000005bbff <+879>: movabs rax,0xfffffffffffd56e8 186 | 0x000000000005bc09 <+889>: vmovdqa XMMWORD PTR [rsp+0x110],xmm2 187 | 0x000000000005bc12 <+898>: vpshufb xmm0,xmm2,XMMWORD PTR [r13+rax*1+0x0] 188 | 0x000000000005bc19 <+905>: vmovdqa XMMWORD PTR [rsp+0x100],xmm0 189 | 0x000000000005bc22 <+914>: vpxor xmm0,xmm0,xmm0 190 | 0x000000000005bc26 <+918>: vmovdqa XMMWORD PTR [rsp+0x40],xmm0 191 | 0x000000000005bc2c <+924>: lea rsi,[rsp+0x40] 192 | 0x000000000005bc31 <+929>: mov edi,0x1 193 | 0x000000000005bc36 <+934>: call r14 194 | 0x000000000005bc39 <+937>: mov rbp,QWORD PTR [rsp+0x40] 195 | 0x000000000005bc3e <+942>: sub rbp,r15 196 | 0x000000000005bc41 <+945>: mov r14,QWORD PTR [rsp+0x48] 197 | 0x000000000005bc46 <+950>: mov edi,0x8 198 | 0x000000000005bc4b <+955>: movabs rax,0x548 199 | 0x000000000005bc55 <+965>: call QWORD PTR [r13+rax*1+0x0] 200 | 0x000000000005bc5a <+970>: mov r15,rax 201 | 0x000000000005bc5d <+973>: test rax,rax 202 | 0x000000000005bc60 <+976>: jle 0x5bc81 203 | 0x000000000005bc62 <+978>: mov edi,0x1 
204 | 0x000000000005bc67 <+983>: mov rsi,r15 205 | 0x000000000005bc6a <+986>: movabs rax,0x518 206 | 0x000000000005bc74 <+996>: call QWORD PTR [r13+rax*1+0x0] 207 | 0x000000000005bc79 <+1001>: mov r12,rax 208 | 0x000000000005bc7c <+1004>: mov rbx,r15 209 | 0x000000000005bc7f <+1007>: jmp 0x5bc86 210 | 0x000000000005bc81 <+1009>: xor r12d,r12d 211 | 0x000000000005bc84 <+1012>: xor ebx,ebx 212 | 0x000000000005bc86 <+1014>: vmovdqa xmm0,XMMWORD PTR [rsp+0x100] 213 | 0x000000000005bc8f <+1023>: vpmaxsb xmm0,xmm0,XMMWORD PTR [rsp+0x110] 214 | 0x000000000005bc99 <+1033>: vmovdqa XMMWORD PTR [rsp+0x10],xmm0 215 | 0x000000000005bc9f <+1039>: mov ax,0x2a 216 | 0x000000000005bca3 <+1043>: kmovd k1,eax 217 | 0x000000000005bca7 <+1047>: kmovw WORD PTR [rsp+0xe],k1 218 | 0x000000000005bcad <+1053>: imul rbp,rbp,0x3b9aca00 219 | 0x000000000005bcb4 <+1060>: sub r14,QWORD PTR [rsp+0x58] 220 | 0x000000000005bcb9 <+1065>: mov ecx,0x8 221 | 0x000000000005bcbe <+1070>: mov rdi,r12 222 | 0x000000000005bcc1 <+1073>: mov rsi,r15 223 | 0x000000000005bcc4 <+1076>: mov rdx,QWORD PTR [rsp+0x20] 224 | 0x000000000005bcc9 <+1081>: xor eax,eax 225 | 0x000000000005bccb <+1083>: movabs r8,0x580 226 | 0x000000000005bcd5 <+1093>: call QWORD PTR [r13+r8*1+0x0] 227 | 0x000000000005bcda <+1098>: cdqe 228 | 0x000000000005bcdc <+1100>: inc rax 229 | 0x000000000005bcdf <+1103>: mov QWORD PTR [rsp+0xa0],r12 230 | 0x000000000005bce7 <+1111>: mov QWORD PTR [rsp+0xa8],rax 231 | 0x000000000005bcef <+1119>: mov QWORD PTR [rsp+0xb0],rbx 232 | 0x000000000005bcf7 <+1127>: movabs rdx,0x25a8 233 | 0x000000000005bd01 <+1137>: add rdx,r13 234 | 0x000000000005bd04 <+1140>: lea rdi,[rsp+0x170] 235 | 0x000000000005bd0c <+1148>: lea rsi,[rsp+0xa0] 236 | 0x000000000005bd14 <+1156>: mov ecx,0x6 237 | 0x000000000005bd19 <+1161>: movabs rax,0x500 238 | 0x000000000005bd23 <+1171>: call QWORD PTR [r13+rax*1+0x0] 239 | 0x000000000005bd28 <+1176>: mov rdi,QWORD PTR [rsp+0xa0] 240 | 0x000000000005bd30 <+1184>: test rdi,rdi 241 | 
0x000000000005bd33 <+1187>: movabs rbx,0x4f8 242 | 0x000000000005bd3d <+1197>: je 0x5bd44 243 | 0x000000000005bd3f <+1199>: call QWORD PTR [r13+rbx*1+0x0] 244 | 0x000000000005bd44 <+1204>: vmovdqa xmm0,XMMWORD PTR [rsp+0x10] 245 | 0x000000000005bd4a <+1210>: vmovdqa xmm1,XMMWORD PTR [rsp+0x100] 246 | 0x000000000005bd53 <+1219>: kmovw k1,WORD PTR [rsp+0xe] 247 | 0x000000000005bd59 <+1225>: vpminsb xmm0{k1},xmm1,XMMWORD PTR [rsp+0x110] 248 | 0x000000000005bd61 <+1233>: vmovdqa XMMWORD PTR [rsp+0x10],xmm0 249 | 0x000000000005bd67 <+1239>: add r14,rbp 250 | 0x000000000005bd6a <+1242>: mov edi,0x1 251 | 0x000000000005bd6f <+1247>: mov esi,0x3 252 | 0x000000000005bd74 <+1252>: movabs rbp,0x518 253 | 0x000000000005bd7e <+1262>: call QWORD PTR [r13+rbp*1+0x0] 254 | 0x000000000005bd83 <+1267>: xor ecx,ecx 255 | 0x000000000005bd85 <+1269>: movabs r12,0x578 256 | 0x000000000005bd8f <+1279>: nop 257 | 0x000000000005bd90 <+1280>: mov BYTE PTR [rax+rcx*1],0x0 258 | 0x000000000005bd94 <+1284>: inc rcx 259 | 0x000000000005bd97 <+1287>: cmp rcx,0x3 260 | 0x000000000005bd9b <+1291>: jne 0x5bd90 261 | 0x000000000005bd9d <+1293>: mov WORD PTR [rax],0x203a 262 | 0x000000000005bda2 <+1298>: mov BYTE PTR [rax+0x2],0x0 263 | 0x000000000005bda6 <+1302>: mov QWORD PTR [rsp+0xb8],rax 264 | 0x000000000005bdae <+1310>: mov QWORD PTR [rsp+0xc0],0x3 265 | 0x000000000005bdba <+1322>: mov QWORD PTR [rsp+0xc8],0x3 266 | 0x000000000005bdc6 <+1334>: lea rdi,[rsp+0x188] 267 | 0x000000000005bdce <+1342>: lea rsi,[rsp+0x170] 268 | 0x000000000005bdd6 <+1350>: lea rdx,[rsp+0xb8] 269 | 0x000000000005bdde <+1358>: call QWORD PTR [r13+r12*1+0x0] 270 | 0x000000000005bde3 <+1363>: mov rdi,QWORD PTR [rsp+0xb8] 271 | 0x000000000005bdeb <+1371>: test rdi,rdi 272 | 0x000000000005bdee <+1374>: je 0x5bdf5 273 | 0x000000000005bdf0 <+1376>: call QWORD PTR [r13+rbx*1+0x0] 274 | 0x000000000005bdf5 <+1381>: mov rdi,QWORD PTR [rsp+0x170] 275 | 0x000000000005bdfd <+1389>: test rdi,rdi 276 | 0x000000000005be00 <+1392>: je 
0x5be07 277 | 0x000000000005be02 <+1394>: call QWORD PTR [r13+rbx*1+0x0] 278 | 0x000000000005be07 <+1399>: lea r15,[rsp+0x200] 279 | 0x000000000005be0f <+1407>: mov rdi,r15 280 | 0x000000000005be12 <+1410>: vmovdqa xmm0,XMMWORD PTR [rsp+0x10] 281 | 0x000000000005be18 <+1416>: call QWORD PTR [rsp+0x60] 282 | 0x000000000005be1c <+1420>: lea rdi,[rsp+0x1a0] 283 | 0x000000000005be24 <+1428>: lea rsi,[rsp+0x188] 284 | 0x000000000005be2c <+1436>: mov rdx,r15 285 | 0x000000000005be2f <+1439>: call QWORD PTR [r13+r12*1+0x0] 286 | 0x000000000005be34 <+1444>: mov rdi,QWORD PTR [rsp+0x200] 287 | 0x000000000005be3c <+1452>: test rdi,rdi 288 | 0x000000000005be3f <+1455>: je 0x5be46 289 | 0x000000000005be41 <+1457>: call QWORD PTR [r13+rbx*1+0x0] 290 | 0x000000000005be46 <+1462>: mov rdi,QWORD PTR [rsp+0x188] 291 | 0x000000000005be4e <+1470>: test rdi,rdi 292 | 0x000000000005be51 <+1473>: je 0x5be58 293 | 0x000000000005be53 <+1475>: call QWORD PTR [r13+rbx*1+0x0] 294 | 0x000000000005be58 <+1480>: lea rdi,[rsp+0x1a0] 295 | 0x000000000005be60 <+1488>: movabs rax,0x4e8 296 | 0x000000000005be6a <+1498>: call QWORD PTR [r13+rax*1+0x0] 297 | 0x000000000005be6f <+1503>: mov rdi,QWORD PTR [rsp+0x1a0] 298 | 0x000000000005be77 <+1511>: test rdi,rdi 299 | 0x000000000005be7a <+1514>: je 0x5be81 300 | 0x000000000005be7c <+1516>: call QWORD PTR [r13+rbx*1+0x0] 301 | 0x000000000005be81 <+1521>: vpxor xmm0,xmm0,xmm0 302 | 0x000000000005be85 <+1525>: vpsadbw xmm0,xmm0,XMMWORD PTR [rsp+0x10] 303 | 0x000000000005be8b <+1531>: vmovd eax,xmm0 304 | 0x000000000005be8f <+1535>: mov BYTE PTR [rsp+0xd],al 305 | 0x000000000005be93 <+1539>: lea rcx,[rsp+0xd] 306 | 0x000000000005be98 <+1544>: mov QWORD PTR [rsp+0x68],rcx 307 | 0x000000000005be9d <+1549>: mov rdi,r14 308 | 0x000000000005bea0 <+1552>: movabs rax,0x548 309 | 0x000000000005beaa <+1562>: call QWORD PTR [r13+rax*1+0x0] 310 | 0x000000000005beaf <+1567>: mov r15,rax 311 | 0x000000000005beb2 <+1570>: test rax,rax 312 | 0x000000000005beb5 <+1573>: 
jle 0x5becc 313 | 0x000000000005beb7 <+1575>: mov edi,0x1 314 | 0x000000000005bebc <+1580>: mov rsi,r15 315 | 0x000000000005bebf <+1583>: call QWORD PTR [r13+rbp*1+0x0] 316 | 0x000000000005bec4 <+1588>: mov r12,rax 317 | 0x000000000005bec7 <+1591>: mov rbx,r15 318 | 0x000000000005beca <+1594>: jmp 0x5bed1 319 | 0x000000000005becc <+1596>: xor r12d,r12d 320 | 0x000000000005becf <+1599>: xor ebx,ebx 321 | 0x000000000005bed1 <+1601>: mov rdi,r12 322 | 0x000000000005bed4 <+1604>: mov rsi,r15 323 | 0x000000000005bed7 <+1607>: mov rdx,QWORD PTR [rsp+0x20] 324 | 0x000000000005bedc <+1612>: mov rcx,r14 325 | 0x000000000005bedf <+1615>: xor eax,eax 326 | 0x000000000005bee1 <+1617>: movabs r8,0x580 327 | 0x000000000005beeb <+1627>: call QWORD PTR [r13+r8*1+0x0] 328 | 0x000000000005bef0 <+1632>: cdqe 329 | 0x000000000005bef2 <+1634>: inc rax 330 | 0x000000000005bef5 <+1637>: mov QWORD PTR [rsp+0xd0],r12 331 | 0x000000000005befd <+1645>: mov QWORD PTR [rsp+0xd8],rax 332 | 0x000000000005bf05 <+1653>: mov QWORD PTR [rsp+0xe0],rbx 333 | 0x000000000005bf0d <+1661>: movabs rdx,0x25b8 334 | 0x000000000005bf17 <+1671>: add rdx,r13 335 | 0x000000000005bf1a <+1674>: lea rdi,[rsp+0x1b8] 336 | 0x000000000005bf22 <+1682>: lea rsi,[rsp+0xd0] 337 | 0x000000000005bf2a <+1690>: mov ecx,0xb 338 | 0x000000000005bf2f <+1695>: movabs rax,0x500 339 | 0x000000000005bf39 <+1705>: call QWORD PTR [r13+rax*1+0x0] 340 | 0x000000000005bf3e <+1710>: mov rdi,QWORD PTR [rsp+0xd0] 341 | 0x000000000005bf46 <+1718>: test rdi,rdi 342 | 0x000000000005bf49 <+1721>: movabs rbx,0x4f8 343 | 0x000000000005bf53 <+1731>: je 0x5bf5a 344 | 0x000000000005bf55 <+1733>: call QWORD PTR [r13+rbx*1+0x0] 345 | 0x000000000005bf5a <+1738>: mov edi,0x1 346 | 0x000000000005bf5f <+1743>: mov esi,0x4 347 | 0x000000000005bf64 <+1748>: call QWORD PTR [r13+rbp*1+0x0] 348 | 0x000000000005bf69 <+1753>: xor ecx,ecx 349 | 0x000000000005bf6b <+1755>: movabs r8,0x578 350 | 0x000000000005bf75 <+1765>: data16 cs nop WORD PTR [rax+rax*1+0x0] 351 
| 0x000000000005bf80 <+1776>: mov BYTE PTR [rax+rcx*1],0x0 352 | 0x000000000005bf84 <+1780>: inc rcx 353 | 0x000000000005bf87 <+1783>: cmp rcx,0x4 354 | 0x000000000005bf8b <+1787>: jne 0x5bf80 355 | 0x000000000005bf8d <+1789>: mov DWORD PTR [rax],0x736e20 356 | 0x000000000005bf93 <+1795>: mov QWORD PTR [rsp+0xe8],rax 357 | 0x000000000005bf9b <+1803>: mov QWORD PTR [rsp+0xf0],0x4 358 | 0x000000000005bfa7 <+1815>: mov QWORD PTR [rsp+0xf8],0x4 359 | 0x000000000005bfb3 <+1827>: lea rdi,[rsp+0x1d0] 360 | 0x000000000005bfbb <+1835>: lea rsi,[rsp+0x1b8] 361 | 0x000000000005bfc3 <+1843>: lea rdx,[rsp+0xe8] 362 | 0x000000000005bfcb <+1851>: call QWORD PTR [r13+r8*1+0x0] 363 | 0x000000000005bfd0 <+1856>: mov rdi,QWORD PTR [rsp+0xe8] 364 | 0x000000000005bfd8 <+1864>: test rdi,rdi 365 | 0x000000000005bfdb <+1867>: je 0x5bfe2 366 | 0x000000000005bfdd <+1869>: call QWORD PTR [r13+rbx*1+0x0] 367 | 0x000000000005bfe2 <+1874>: mov rdi,QWORD PTR [rsp+0x1b8] 368 | 0x000000000005bfea <+1882>: test rdi,rdi 369 | 0x000000000005bfed <+1885>: je 0x5bff4 370 | 0x000000000005bfef <+1887>: call QWORD PTR [r13+rbx*1+0x0] 371 | 0x000000000005bff4 <+1892>: lea rdi,[rsp+0x1d0] 372 | 0x000000000005bffc <+1900>: movabs rax,0x4e8 373 | 0x000000000005c006 <+1910>: call QWORD PTR [r13+rax*1+0x0] 374 | 0x000000000005c00b <+1915>: mov rdi,QWORD PTR [rsp+0x1d0] 375 | 0x000000000005c013 <+1923>: test rdi,rdi 376 | 0x000000000005c016 <+1926>: je 0x5c01d 377 | 0x000000000005c018 <+1928>: call QWORD PTR [r13+rbx*1+0x0] 378 | 0x000000000005c01d <+1933>: movabs rax,0x570 379 | 0x000000000005c027 <+1943>: call QWORD PTR [r13+rax*1+0x0] 380 | 0x000000000005c02c <+1948>: xor eax,eax 381 | 0x000000000005c02e <+1950>: add rsp,0x218 382 | 0x000000000005c035 <+1957>: pop rbx 383 | 0x000000000005c036 <+1958>: pop r12 384 | 0x000000000005c038 <+1960>: pop r13 385 | 0x000000000005c03a <+1962>: pop r14 386 | 0x000000000005c03c <+1964>: pop r15 387 | 0x000000000005c03e <+1966>: pop rbp 388 | 0x000000000005c03f <+1967>: 
ret 389 | End of assembler dump. 390 | -------------------------------------------------------------------------------- /disassemble/main_int16_128.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int16, 128, True]() -------------------------------------------------------------------------------- /disassemble/main_int16_16.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int16, 16, True]() -------------------------------------------------------------------------------- /disassemble/main_int16_32.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int16, 32, True]() -------------------------------------------------------------------------------- /disassemble/main_int16_64.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int16, 64, True]() -------------------------------------------------------------------------------- /disassemble/main_int16_8.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int16, 8, True]() -------------------------------------------------------------------------------- /disassemble/main_int32_128.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int32, 128, True]() -------------------------------------------------------------------------------- /disassemble/main_int32_16.mojo: -------------------------------------------------------------------------------- 1 | import 
sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int32, 16, True]() -------------------------------------------------------------------------------- /disassemble/main_int32_32.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int32, 32, True]() -------------------------------------------------------------------------------- /disassemble/main_int32_64.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int32, 64, True]() -------------------------------------------------------------------------------- /disassemble/main_int32_8.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int32, 8, True]() -------------------------------------------------------------------------------- /disassemble/main_int64_128.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int64, 128, True]() -------------------------------------------------------------------------------- /disassemble/main_int64_16.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int64, 16, True]() -------------------------------------------------------------------------------- /disassemble/main_int64_32.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int64, 32, True]() -------------------------------------------------------------------------------- /disassemble/main_int64_64.mojo: 
-------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int64, 64, True]() -------------------------------------------------------------------------------- /disassemble/main_int64_8.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int64, 8, True]() -------------------------------------------------------------------------------- /disassemble/main_int8_128.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int8, 128, True]() -------------------------------------------------------------------------------- /disassemble/main_int8_16.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int8, 16, True]() -------------------------------------------------------------------------------- /disassemble/main_int8_32.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int8, 32, True]() -------------------------------------------------------------------------------- /disassemble/main_int8_64.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int8, 64, True]() -------------------------------------------------------------------------------- /disassemble/main_int8_8.mojo: -------------------------------------------------------------------------------- 1 | import sort_network as sn 2 | fn main(): 3 | sn.test_netw_SIMD_sort[DType.int8, 8, True]() -------------------------------------------------------------------------------- 
#!/bin/sh
# Rebuild the sort_network package, then build every main_<type>_<channels>.mojo
# driver in this directory and dump the Intel-syntax disassembly of its `main`
# symbol to <type>_<channels>.asm.
#
# All paths are relative, so run this from the directory that contains the
# main_*.mojo drivers.

# Abort on the first failing command (and on unset variables). Without this, a
# failed `mojo build` would leave the previous iteration's `main` binary in
# place and gdb would save its disassembly under the wrong $ID.asm name.
set -eu

echo "making sort_network package"
mojo package ../sort_network -o sort_network.mojopkg

# Same order as the original hand-written stanzas: all channel counts for
# int8, then int16, int32 and int64.
for TYPE in int8 int16 int32 int64; do
    for CHANNELS in 8 16 32 64 128; do
        ID="${TYPE}_${CHANNELS}"
        echo "disassembling $ID"
        mojo build "main_$ID.mojo" -o main
        gdb main -ex 'set disassembly-flavor intel' -ex 'disassemble main' -ex q > "$ID.asm"
    done
done
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HJLebbink/sort-networks-mojo/a1b05c307a2b0ddd3a7eb9f5629c746e1d0cbab0/img/sort-network-16.png -------------------------------------------------------------------------------- /main.mojo: -------------------------------------------------------------------------------- 1 | from collections.vector import DynamicVector, InlinedFixedVector 2 | from algorithm.sort import sort 3 | from time import now 4 | from benchmark import keep 5 | 6 | import sort_network as sn 7 | 8 | 9 | fn main(): 10 | let start_time_ns = now() 11 | 12 | # sn.test_perm_code() 13 | # sn.test_sort() 14 | # sn.test_sort_X(0xFFFF) 15 | 16 | # sn.test_netw_SIMD_sort_multi_layer[DType.uint8, True]() 17 | 18 | sn.test_netw_SIMD_sort[DType.uint64, 8, True]() 19 | # sn.test_netw_SIMD_sort[DType.uint64, 16, True]() 20 | # sn.test_netw_SIMD_sort[DType.uint64, 32, True]() 21 | # sn.test_netw_SIMD_sort[DType.uint64, 64, True]() 22 | # sn.test_netw_SIMD_sort[DType.uint64, 128, True]() 23 | 24 | # sn.test_netw_SIMD_sort[DType.int64, 8, True]() 25 | # sn.test_netw_SIMD_sort[DType.int64, 16, True]() 26 | # sn.test_netw_SIMD_sort[DType.int64, 32, True]() 27 | # sn.test_netw_SIMD_sort[DType.int64, 64, True]() 28 | # sn.test_netw_SIMD_sort[DType.int64, 128, True]() 29 | 30 | # sn.test_netw_SIMD_sort[DType.float32, 8, True]() 31 | # sn.test_netw_SIMD_sort[DType.float32, 16, True]() 32 | # sn.test_netw_SIMD_sort[DType.float32, 32, True]() 33 | # sn.test_netw_SIMD_sort[DType.float32, 64, True]() 34 | # sn.test_netw_SIMD_sort[DType.float32, 128, True]() 35 | 36 | # sn.test_netw_SIMD_sort[DType.float16, 16, True]() 37 | 38 | # sn.test_netw_SIMD_sort[DType.bfloat16, 8, True]() # Error: 0.7.0 "JIT session error: Symbols not found: [ __truncsfbf2 ]" 39 | # sn.test_netw_SIMD_sort[DType.bfloat16, 16, True]() # Error: 0.7.0 "JIT session error: Symbols not found: [ __truncsfbf2 ]" 40 | # 
sn.test_netw_SIMD_sort[DType.bfloat16, 32, True]() # Error: 0.7.0 "JIT session error: Symbols not found: [ __truncsfbf2 ]" 41 | # sn.test_netw_SIMD_sort[DType.bfloat16, 64, True]() # Error: 0.7.0 "JIT session error: Symbols not found: [ __truncsfbf2 ]" 42 | # sn.test_netw_SIMD_sort[DType.bfloat16, 128, True]() 43 | 44 | # sn.test_netw_SIMD_sort[DType.int32, 8, True]() 45 | # sn.test_netw_SIMD_sort[DType.int32, 16, True]() 46 | # sn.test_netw_SIMD_sort[DType.int32, 32, True]() 47 | # sn.test_netw_SIMD_sort[DType.int32, 64, True]() 48 | # sn.test_netw_SIMD_sort[DType.int32, 128, True]() 49 | 50 | # sn.test_netw_SIMD_sort[DType.int16, 8, True]() 51 | # sn.test_netw_SIMD_sort[DType.int16, 16, True]() 52 | # sn.test_netw_SIMD_sort[DType.int16, 32, True]() 53 | # sn.test_netw_SIMD_sort[DType.int16, 64, True]() 54 | # sn.test_netw_SIMD_sort[DType.int16, 128, True]() 55 | 56 | # sn.test_netw_SIMD_sort[DType.int8, 8, True]() #6.5ns 57 | # sn.test_netw_SIMD_sort[DType.int8, 16, True]() #11 ns 58 | # sn.test_netw_SIMD_sort[DType.int8, 32, True]() #32 ns XX 59 | # sn.test_netw_SIMD_sort[DType.int8, 64, True]() #32 ns 60 | # sn.test_netw_SIMD_sort[DType.int8, 128, True]() #53 ns 61 | 62 | # sn.test_netw_SIMD_sort_2x_B[DType.int32, DType.uint32, True, True]() 63 | # sn.test_netw_SIMD_sort_idx[DType.int32, DType.uint32, 32, False]() 64 | 65 | # sn.test_netw_SIMD_sort_2x_A[DType.int8, DType.int8, 16]() 66 | # sn.test_netw_SIMD_sort_2x_B[DType.uint8, DType.uint8]() 67 | 68 | # sn.test_performance1(10000, 100) 69 | # sn.test_performance2(10000, 100) 70 | # print(measure_time_netw_sort_generic[DType.int8](10000, 100, 15)) 71 | 72 | # sn.test_netw_SIMD_sort[DType.uint32, 16, True]() 73 | # sn.test_netw_SIMD_sort_2x_C[DType.uint16, 16, True]() 74 | # sn.test_netw_SIMD_sort[DType.uint16, 16, True]() 75 | 76 | @parameter 77 | if False: 78 | alias sd1 = sn.swap_data[8]() 79 | print(str(sd1)) 80 | alias sd2 = sn.join_swap_data[sd1, sd1]() 81 | print(str(sd2)) 82 | 83 | @parameter 84 | if 
False: 85 | let sd = sn.swap_data[64]() 86 | print(str(sd)) 87 | let sd_2x = sn.join_swap_data(sd, sd) 88 | print(sd_2x.to_code()) 89 | 90 | @parameter 91 | if False: # print a network as a sequence of CE's 92 | let sd = sn.swap_data[128]() 93 | for i in range(sd.n_layers): 94 | for j in range(len(sd[i])): 95 | print_no_newline("(") 96 | print_no_newline(sd[i].get_min(j)) 97 | print_no_newline(",") 98 | print_no_newline(sd[i].get_max(j)) 99 | print_no_newline("),") 100 | 101 | @parameter 102 | if False: # print code 103 | for i in range(26): 104 | # fmt: off 105 | print(" let v"+str(i+1)+"a = swap_n[T1, channels, sd["+str(i+1)+"], ascending1](v"+str(i)+"a)") 106 | print(" let v"+str(i+1)+"b = swap_n[T2, channels, sd["+str(i+1)+"], ascending2](v"+str(i)+"b)") 107 | print(" @parameter") 108 | print(" if n_layers == " + str(i + 2) + ":") 109 | print(" return (v" + str(i + 1) + "a, v" + str(i + 1) + "b)") 110 | # fmt: on 111 | 112 | @parameter 113 | if False: # print code 114 | for i in range(26): 115 | # fmt: off 116 | print(" let t"+str(i+1)+" = swap_idx[T1, T2, channels, sd["+str(i+1)+"], ascending](t"+str(i)+")") 117 | print(" @parameter") 118 | print(" if n_layers == " + str(i + 2) + ":") 119 | print(" return t" + str(i + 1)) 120 | # fmt: on 121 | 122 | @parameter 123 | if False: # print code 124 | for i in range(26): 125 | # fmt: off 126 | print(" let v"+str(i+1)+" = swap_n[T, 2*channels, sd["+str(i+1)+"], ascending](v"+str(i)+")") 127 | print(" @parameter") 128 | print(" if n_layers == " + str(i + 2) + ":") 129 | print(" return v"+str(i+1)+".slice[channels](0), v"+str(i+1)+".slice[channels](channels)") 130 | # fmt: on 131 | 132 | let elapsed_time_ns = now() - start_time_ns 133 | print_no_newline("Elapsed time " + str(elapsed_time_ns) + " ns") 134 | print_no_newline(" = " + str(Float32(elapsed_time_ns) / 1_000) + " μs") 135 | print_no_newline(" = " + str(Float32(elapsed_time_ns) / 1_000_000) + " ms") 136 | print_no_newline(" = " + str(Float32(elapsed_time_ns) / 
struct Layer(CollectionElement, Sized, Stringable):
    """One layer of compare/exchange elements of a sorting network.

    Every element is a pair of channel indices packed into a 2-lane uint16
    SIMD value: lane 0 holds the smaller index, lane 1 the larger one.
    """

    alias LayerData = SIMD[DType.uint16, 2]
    var data: DynamicVector[Self.LayerData]

    @staticmethod
    @always_inline("nodebug")
    fn merge(layer1: Self, layer2: Self, width1: Int) -> Self:
        """Concatenate two layers into a new one.

        Args:
            layer1: Layer whose pairs are copied unchanged.
            layer2: Layer whose pairs are appended with both indices shifted.
            width1: Offset added to every index taken from `layer2`.

        Returns:
            A new layer holding the pairs of `layer1` followed by the shifted
            pairs of `layer2`.
        """
        var result = Self()
        for i in range(len(layer1.data)):
            result.data.push_back(layer1.data[i])
        for i in range(len(layer2.data)):
            let lo: SIMD[DType.uint16, 1] = layer2.get_min(i) + width1
            let hi: SIMD[DType.uint16, 1] = layer2.get_max(i) + width1
            result.data.push_back(SIMD[DType.uint16, 2](lo, hi))
        return result ^

    @always_inline("nodebug")
    fn __init__(inout self, v: VariadicList[Tuple[Int, Int]]):
        """Build a layer from (a, b) index pairs; each pair is stored as (min, max)."""
        self.data = DynamicVector[Self.LayerData]()
        for i in range(v.__len__()):
            let v1 = v[i].get[0, Int]()
            let v2 = v[i].get[1, Int]()
            # Normalise the pair so that lane 0 is always the smaller index.
            if v1 < v2:
                self.data.push_back(SIMD[DType.uint16, 2](v1, v2))
            else:
                self.data.push_back(SIMD[DType.uint16, 2](v2, v1))

    @always_inline("nodebug")
    fn __init__(inout self):
        """Create an empty layer."""
        self.data = DynamicVector[Self.LayerData]()

    # trait CollectionElement
    @always_inline("nodebug")
    fn __copyinit__(inout self, existing: Self):
        """Copy-construct from `existing` by copying its pair vector."""
        self.data = existing.data

    # trait CollectionElement
    @always_inline("nodebug")
    fn __moveinit__(inout self, owned existing: Self):
        """Move-construct from `existing`, taking over its pair vector."""
        # Transfer ownership with `^` rather than copying the vector.
        self.data = existing.data ^

    # trait CollectionElement
    @always_inline("nodebug")
    fn __del__(owned self: Self):
        """Explicit destructor with an empty body."""
        pass

    # trait Stringable
    @always_inline("nodebug")
    fn __str__(self) -> String:
        """Format the layer as "(min,max),(min,max),..." with no trailing comma.

        An empty layer yields the empty string.
        """
        var result: String = ""
        for i in range(len(self.data)):
            if i > 0:
                result += ","
            result += "(" + str(self.get_min(i)) + "," + str(self.get_max(i)) + ")"
        return result

    # trait Sized
    @always_inline("nodebug")
    fn __len__(self) -> Int:
        """Number of compare/exchange pairs stored in this layer."""
        return len(self.data)

    @always_inline("nodebug")
    fn get_min(self, idx: Int) -> Int:
        """Smaller channel index (lane 0) of the pair at position `idx`."""
        return self.data[idx][0].to_int()

    @always_inline("nodebug")
    fn get_max(self, idx: Int) -> Int:
        """Larger channel index (lane 1) of the pair at position `idx`."""
        return self.data[idx][1].to_int()
^) 39 | 40 | @always_inline("nodebug") 41 | fn add_layer_l(inout self, layer: Layer): 42 | self.data.push_back(layer) 43 | 44 | # trait Stringable 45 | @always_inline("nodebug") 46 | fn __str__(self) -> String: 47 | let n_layer = str(self.count_layers()) 48 | let n_ce = str(self.count_ce()) 49 | var result: String = "Sorting network for ? inputs, " + n_ce + " CEs, " + n_layer + " layers:\n" 50 | for i in range(len(self.data)): 51 | result += str(self.data[i]) + "\n" 52 | return result 53 | 54 | fn to_code(self) -> String: 55 | let n_layers = self.count_layers() 56 | # fmt: off 57 | var result: String = "var result = SwapData(" + str(self.channels) + ", " + str(n_layers) + ")\n" 58 | for i in range(n_layers): 59 | result += "result.add_layer(" + str(i) + ", VariadicList(" + str(self.data[i]) + "))\n" 60 | return result 61 | # fmt: on 62 | 63 | # count number of compare/exchange elements 64 | @always_inline("nodebug") 65 | fn count_ce(self) -> Int: 66 | var result = 0 67 | for i in range(len(self.data)): 68 | result += len(self.data[i]) 69 | return result 70 | 71 | # count number of layers 72 | @always_inline("nodebug") 73 | fn count_layers(self) -> Int: 74 | return len(self.data) 75 | -------------------------------------------------------------------------------- /sort_network/__init__.mojo: -------------------------------------------------------------------------------- 1 | from .sort_network import sn, sn_merge, swap_data 2 | from .sort_network_data import join_swap_data 3 | from .performance import test_performance1, test_performance2 4 | from .test_individual import test_netw_SIMD_sort -------------------------------------------------------------------------------- /sort_network/crash3.mojo: -------------------------------------------------------------------------------- 1 | # A sorting network consists of a collection of compare/exchange elements (tuples) ordered in layers 2 | struct SwapData(Stringable): 3 | var data: DynamicVector[Layer] 4 | 5 | 
@always_inline("nodebug") 6 | fn __init__(inout self): 7 | self.data = DynamicVector[Layer]() 8 | 9 | @always_inline("nodebug") 10 | fn __copyinit__(inout self, existing: Self): 11 | self.data.__copyinit__(existing.data) 12 | 13 | # get the i-th layer 14 | @always_inline("nodebug") 15 | fn __getitem__(self, idx: Int) -> Layer: 16 | return self.data[idx] 17 | 18 | @always_inline("nodebug") 19 | fn get[idx: Int](self) -> Layer: 20 | return self.data[idx] 21 | 22 | # add a layer of swaps 23 | @always_inline("nodebug") 24 | fn add(inout self, layer: VariadicList[Tuple[Int, Int]]): 25 | self.data.push_back(Layer(layer)) 26 | 27 | @always_inline("nodebug") 28 | fn add(inout self, layer: Layer): 29 | self.data.push_back(layer) 30 | 31 | # trait Stringable 32 | @always_inline("nodebug") 33 | fn __str__(self) -> String: 34 | let n_layer = str(self.count_layers()) 35 | let n_ce = str(self.count_ce()) 36 | var result: String = "Sorting network for ? inputs, " + n_ce + " CEs, " + n_layer + " layers:\n" 37 | for i in range(len(self.data)): 38 | result += str(self.data[i]) + "\n" 39 | return result 40 | 41 | # count number of compare/exchange elements 42 | @always_inline("nodebug") 43 | fn count_ce(self) -> Int: 44 | var result = 0 45 | for i in range(len(self.data)): 46 | result += len(self.data[i]) 47 | return result 48 | 49 | # count number of layers 50 | @always_inline("nodebug") 51 | fn count_layers(self) -> Int: 52 | return len(self.data) 53 | 54 | @always_inline("nodebug") 55 | fn get_width(self) -> Int: 56 | var result: Int = -1 57 | for i in range(len(self.data)): 58 | for j in range(len(self.data[i])): 59 | let m = self.data[i].get_max(j) 60 | if m > result: 61 | result = m 62 | return result + 1 # plus one because we start counting at zero 63 | 64 | 65 | struct Layer(CollectionElement, Sized, Stringable): 66 | var data: DynamicVector[Int] 67 | 68 | @staticmethod 69 | @always_inline("nodebug") 70 | fn pack(t: Tuple[Int, Int]) -> Int: 71 | let v1 = t.get[0, Int]() 72 | 
let v2 = t.get[1, Int]() 73 | if v1 < v2: 74 | return (v1 << 16) | v2 75 | else: 76 | return (v2 << 16) | v1 77 | 78 | # unpack Tuple[Minimum, Maximum] 79 | @staticmethod 80 | @always_inline("nodebug") 81 | fn unpack(v: Int) -> Tuple[Int, Int]: 82 | return (v >> 16, v & 0xFFFF) 83 | 84 | @always_inline("nodebug") 85 | fn __init__(inout self): 86 | self.data = DynamicVector[Int]() 87 | 88 | @always_inline("nodebug") 89 | fn __init__(inout self, v: VariadicList[Tuple[Int, Int]]): 90 | self.data = DynamicVector[Int](v.__len__()) 91 | for i in range(v.__len__()): 92 | self.data.push_back(Layer.pack(v[i])) 93 | 94 | fn __init__(inout self, v: DynamicVector[Tuple[Int, Int]]): 95 | self.data = DynamicVector[Int](v.__len__()) 96 | for i in range(v.__len__()): 97 | self.data.push_back(Layer.pack(v[i])) 98 | 99 | # trait CollectionElement 100 | @always_inline("nodebug") 101 | fn __copyinit__(inout self, existing: Self): 102 | self.data.__copyinit__(existing.data) 103 | 104 | # trait CollectionElement 105 | @always_inline("nodebug") 106 | fn __moveinit__(inout self, owned existing: Self): 107 | self.data = existing.data ^ 108 | 109 | # trait CollectionElement 110 | @always_inline("nodebug") 111 | fn __del__(owned self: Self): 112 | pass 113 | 114 | # trait Stringable 115 | @always_inline("nodebug") 116 | fn __str__(self) -> String: 117 | var result: String = "[" 118 | let size = len(self.data) 119 | if size > 0: 120 | for i in range(size - 1): 121 | result += "(" + str(self.get_min(i)) + "," + str(self.get_max(i)) + ")," 122 | result += ( 123 | "(" 124 | + str(self.get_min(size - 1)) 125 | + "," 126 | + str(self.get_max(size - 1)) 127 | + ")" 128 | ) 129 | return result + "]" 130 | 131 | # trait Sized 132 | @always_inline("nodebug") 133 | fn __len__(self) -> Int: 134 | return len(self.data) 135 | 136 | @always_inline("nodebug") 137 | fn get_min(self, idx: Int) -> Int: 138 | return Layer.unpack(self.data[idx]).get[0, Int]() 139 | 140 | @always_inline("nodebug") 141 | fn 
get_max(self, idx: Int) -> Int: 142 | return Layer.unpack(self.data[idx]).get[1, Int]() 143 | 144 | 145 | fn join_swap_data2[sd1: SwapData, sd2: SwapData]() -> SwapData: 146 | var result = SwapData() 147 | alias width1 = sd1.get_width() 148 | alias width2 = sd2.get_width() 149 | if width1 != width2: 150 | print("ERROR join_swap_data: currently only equal widths are supported") 151 | return result 152 | for i in range(sd1.count_layers()): 153 | var x = DynamicVector[Tuple[Int, Int]](len(sd1[i].data) + len(sd2[i].data)) 154 | for j in range(len(sd1[i].data)): 155 | x.push_back(Tuple[Int, Int](sd1[i].get_min(j), sd1[i].get_max(j))) 156 | for j in range(len(sd2[i].data)): 157 | x.push_back( 158 | Tuple[Int, Int](sd2[i].get_min(j) + width1, sd2[i].get_max(j) + width1) 159 | ) 160 | result.add(x) 161 | return result 162 | 163 | 164 | fn swap_data() -> SwapData: 165 | var result = SwapData() 166 | result.add(VariadicList((0, 2), (1, 3), (4, 6), (5, 7))) 167 | result.add(VariadicList((0, 4), (1, 5), (2, 6), (3, 7))) 168 | result.add(VariadicList((0, 1), (2, 3), (4, 5), (6, 7))) 169 | result.add(VariadicList((2, 4), (3, 5))) 170 | result.add(VariadicList((1, 4), (3, 6))) 171 | result.add(VariadicList((1, 2), (3, 4), (5, 6))) 172 | return result 173 | 174 | 175 | fn main(): 176 | alias sd1 = swap_data() 177 | print(str(sd1)) 178 | alias sd2 = join_swap_data2[sd1, sd1]() 179 | print(str(sd2)) 180 | -------------------------------------------------------------------------------- /sort_network/crash4.mojo: -------------------------------------------------------------------------------- 1 | fn main(): 2 | let data = SIMD[DType.uint16, 16](0) 3 | alias SD1 = swap_data() 4 | let d2 = xyzzy[SD1[0]](data) 5 | print(d2) 6 | 7 | 8 | fn swap_data() -> DynamicVector[Layer]: 9 | var result = DynamicVector[Layer]() 10 | result.push_back(VariadicList((0, 2), (1, 3), (4, 6), (5, 7))) 11 | return result 12 | 13 | 14 | fn xyzzy[swaps: Layer](v: SIMD[DType.uint16, 16]) -> SIMD[DType.uint16, 
16]: 15 | fn gen_perm[swaps: Layer]() -> StaticIntTuple[16]: 16 | let result = StaticIntTuple[16]() 17 | for i in range(len(swaps.data)): 18 | let from_ = swaps.data[i] # removing this line removes the crash 19 | return result 20 | 21 | alias permutations = gen_perm[swaps]() # changing alias to let removes the crash 22 | return v # just do nothing, removing gen_perm removes the crash 23 | 24 | 25 | struct Layer(CollectionElement): 26 | var data: DynamicVector[Int] 27 | 28 | fn __init__(inout self, v: VariadicList[Tuple[Int, Int]]): 29 | self.data = DynamicVector[Int]() 30 | for i in range(v.__len__()): 31 | self.data.push_back(v[i].get[0, Int]()) 32 | 33 | fn __copyinit__(inout self, existing: Self): 34 | self.data = existing.data 35 | 36 | fn __moveinit__(inout self, owned existing: Self): 37 | self.data = existing.data ^ 38 | 39 | fn __del__(owned self: Self): 40 | pass 41 | -------------------------------------------------------------------------------- /sort_network/mllr_examples.mojo: -------------------------------------------------------------------------------- 1 | from benchmark import keep 2 | 3 | 4 | fn main(): 5 | 6 | # does nothing... 
7 | let tmp1: NoneType = rebind[NoneType](__mlir_op.`llvm.debugtrap`) 8 | 9 | @parameter 10 | if False: 11 | # --------------- 12 | let x0 = __mlir_op.`llvm.mlir.constant`[value = __mlir_attr.`42: i32`, _type = __mlir_type.i32]() 13 | let y0 = rebind[Scalar[DType.int32]](x0) 14 | print("y0="+str(y0)) 15 | 16 | let x1 = __mlir_op.`llvm.mlir.constant`[value = __mlir_attr.`42.: f32`, _type = __mlir_type.f32]() 17 | let y1 = rebind[Scalar[DType.float32]](x1) 18 | print("y1="+str(y1)) 19 | 20 | @parameter 21 | if False: 22 | # https://mlir.llvm.org/docs/Dialects/LLVM/#llvmadd-llvmaddop 23 | let d2a: Scalar[DType.int32] = 10 24 | let d2b: Scalar[DType.int32] = 20 25 | 26 | let z2a = rebind[__mlir_type.i32](d2a) 27 | let z2b = rebind[__mlir_type.i32](d2b) 28 | 29 | var x2 = __mlir_op.`llvm.add`[_type = __mlir_type.i32](z2a, z2b) 30 | let y2 = rebind[Scalar[DType.int32]](x2) 31 | print("y2="+str(y2)) 32 | 33 | @parameter 34 | if False: 35 | # https://mlir.llvm.org/docs/Dialects/LLVM/#llvmfadd-llvmfaddop 36 | let d3a: Scalar[DType.float32] = 10. 37 | let d3b: Scalar[DType.float32] = 20. 38 | 39 | let z3a = rebind[__mlir_type.f32](d3a) 40 | let z3b = rebind[__mlir_type.f32](d3b) 41 | 42 | var x3 = __mlir_op.`llvm.fadd`[_type = __mlir_type.f32](z3a, z3b) 43 | let y3 = rebind[Scalar[DType.float32]](x3) 44 | print("y3="+str(y3)) 45 | 46 | @parameter 47 | if False: 48 | # https://mlir.llvm.org/docs/Dialects/LLVM/#llvmadd-llvmaddop 49 | let d4a: SIMD[DType.int32, 16] = 10 50 | let d4b: SIMD[DType.int32, 16] = 20 51 | 52 | let z4a = rebind[__mlir_type.`vector<16xi32>`](d4a) 53 | let z4b = rebind[__mlir_type.`vector<16xi32>`](d4b) 54 | 55 | var x4 = __mlir_op.`llvm.add`[_type = __mlir_type.`vector<16xi32>`](z4a, z4b) 56 | let y4 = rebind[SIMD[DType.int32, 16]](x4) 57 | print("y4="+str(y4)) 58 | 59 | @parameter 60 | if False: 61 | # https://mlir.llvm.org/docs/Dialects/LLVM/#llvmfadd-llvmfaddop 62 | let d5a: SIMD[DType.float32, 16] = 10. 63 | let d5b: SIMD[DType.float32, 16] = 20. 
64 | 65 | let z5a = rebind[__mlir_type.`vector<16xf32>`](d5a) 66 | let z5b = rebind[__mlir_type.`vector<16xf32>`](d5b) 67 | 68 | # https://llvm.org/docs/LangRef.html#fastmath 69 | var x5 = __mlir_op.`llvm.fadd`[_type = __mlir_type.`vector<16xf32>`, _flag = 'fast'](z5a, z5b) 70 | let y5 = rebind[SIMD[DType.float32, 16]](x5) 71 | print("y5="+str(y5)) 72 | 73 | @parameter 74 | if True: 75 | # https://mlir.llvm.org/docs/Dialects/LLVM/#llvmintrmaximum-llvmmaximumop 76 | # https://mlir.llvm.org/docs/Dialects/LLVM/#llvmintrmaxnum-llvmmaxnumop 77 | let d6a: SIMD[DType.float32, 16] = 10. 78 | let d6b: SIMD[DType.float32, 16] = 20. 79 | 80 | let z6a = rebind[__mlir_type.`vector<16xf32>`](d6a) 81 | let z6b = rebind[__mlir_type.`vector<16xf32>`](d6b) 82 | 83 | #var x6 = __mlir_op.`llvm.intr.maxnum`[_type = __mlir_type.`vector<16xf32>`, _flag = 'fast'](z6a, z6b) 84 | var x6 = __mlir_op.`llvm.intr.maximum`[_type = __mlir_type.`vector<16xf32>`, _flag = 'fast'](z6a, z6b) 85 | let y6 = rebind[SIMD[DType.float32, 16]](x6) 86 | print("y6="+str(y6)) 87 | 88 | # https://llvm.org/docs/LangRef.html#llvm-maxnum-intrinsic 89 | # let data2: T2 = rebind[SIMD[T, channels]](llvm_intrinsic["llvm.maxnum", T2, T2, T2](data0, data1)) 90 | 91 | 92 | 93 | 94 | #var x2 = __mlir_op.`llvm.maxnum`[_type = __mlir_type.`!pop.scalar`](d0, d1) 95 | #let y2 = rebind[SIMD[DType.uint32, 1]](x1.cast[DType.uint32]) 96 | #print("y="+str(y2)) 97 | -------------------------------------------------------------------------------- /sort_network/nan_check.mojo: -------------------------------------------------------------------------------- 1 | from random import random_ui64 2 | from time import now 3 | from sys.intrinsics import llvm_intrinsic 4 | 5 | 6 | fn gen_random_SIMD[T: DType, width: Int]() -> SIMD[T, width]: 7 | var result = SIMD[T, width]() 8 | for i in range(width): 9 | result[i] = random_ui64(0, 100).cast[T]() 10 | return result 11 | 12 | 13 | fn test_float32_16(): 14 | alias T = DType.float32 15 | alias 
channels = 16 16 | alias T2 = SIMD[T, channels] 17 | 18 | let data0: T2 = gen_random_SIMD[T, channels]() 19 | let data1: T2 = gen_random_SIMD[T, channels]() 20 | 21 | let start_time_ns = now() 22 | let data2: T2 = data0.max(data1) 23 | 24 | #0x000000000005b6fe <+366>: call rbx 25 | #0x000000000005b700 <+368>: mov r14,QWORD PTR [rsp+0x20] 26 | #0x000000000005b705 <+373>: mov rax,QWORD PTR [rsp+0x28] 27 | #0x000000000005b70a <+378>: mov QWORD PTR [rsp+0x58],rax 28 | #0x000000000005b70f <+383>: vmovaps zmm1,ZMMWORD PTR [rsp+0xc0] 29 | #0x000000000005b717 <+391>: vmovaps zmm2,ZMMWORD PTR [rsp+0x100] 30 | #0x000000000005b71f <+399>: vmaxps zmm0,zmm2,zmm1 31 | #0x000000000005b725 <+405>: vcmpunordps k1,zmm1,zmm1 32 | #0x000000000005b72c <+412>: vmovaps zmm0{k1},zmm2 33 | #0x000000000005b732 <+418>: vmovaps ZMMWORD PTR [rsp+0xc0],zmm0 34 | #0x000000000005b73a <+426>: vxorps xmm0,xmm0,xmm0 35 | #0x000000000005b73e <+430>: vmovaps XMMWORD PTR [rsp+0x30],xmm0 36 | #0x000000000005b744 <+436>: lea rsi,[rsp+0x30] 37 | #0x000000000005b749 <+441>: mov edi,0x1 38 | #0x000000000005b74e <+446>: vzeroupper 39 | #0x000000000005b751 <+449>: call rbx 40 | 41 | let elapsed_time_ns = now() - start_time_ns 42 | 43 | print(data2) 44 | print("Elapsed time " + str(elapsed_time_ns) + " ns") 45 | 46 | 47 | fn test_float64_8(): 48 | alias T = DType.float64 49 | alias channels = 8 50 | alias T2 = SIMD[T, channels] 51 | let data0: T2 = gen_random_SIMD[T, channels]() 52 | let data1: T2 = gen_random_SIMD[T, channels]() 53 | 54 | let start_time_ns = now() 55 | let data2: T2 = data0.max(data1) 56 | 57 | # it turn out that max is equal to "llvm.maxnum.f64" 58 | 59 | #0x0000000000005bf6 <+278>: call 0x5470 60 | #0x0000000000005bfb <+283>: mov rbx,QWORD PTR [rsp+0x40] 61 | #0x0000000000005c00 <+288>: mov rax,QWORD PTR [rsp+0x48] 62 | #0x0000000000005c05 <+293>: mov QWORD PTR [rsp+0x70],rax 63 | #0x0000000000005c0a <+298>: vmovapd zmm0,ZMMWORD PTR [rsp+0xc0] 64 | #0x0000000000005c12 <+306>: vmovapd 
zmm2,ZMMWORD PTR [rsp+0x100] 65 | #0x0000000000005c1a <+314>: vmaxpd zmm1,zmm2,zmm0 66 | #0x0000000000005c20 <+320>: vcmpunordpd k1,zmm0,zmm0 67 | #0x0000000000005c27 <+327>: vmovapd zmm1{k1},zmm2 68 | #0x0000000000005c2d <+333>: vmovapd ZMMWORD PTR [rsp+0xc0],zmm1 69 | #0x0000000000005c35 <+341>: vxorpd xmm0,xmm0,xmm0 70 | #0x0000000000005c39 <+345>: vmovapd XMMWORD PTR [rsp+0x50],xmm0 71 | #0x0000000000005c3f <+351>: lea rsi,[rsp+0x50] 72 | #0x0000000000005c44 <+356>: mov edi,0x1 73 | #0x0000000000005c49 <+361>: vzeroupper 74 | #0x0000000000005c4c <+364>: call 0x5470 75 | 76 | let elapsed_time_ns = now() - start_time_ns 77 | 78 | #print(data2) 79 | print("Elapsed time " + str(elapsed_time_ns) + " ns") 80 | 81 | 82 | ## llvm.maxnum.f64: 83 | # This intrinsic computes the maximum value between two floating-point numbers (f64), but 84 | # if one of the inputs is NaN, it returns the other input value. In other words, if one 85 | # of the operands is NaN, the result will be the non-NaN operand. 86 | # 87 | # This behavior is often referred to as "maximum number" semantics. 88 | 89 | ## llvm.maximum.f64: 90 | # This intrinsic computes the maximum value between two floating-point numbers (f64) 91 | # according to IEEE 754 floating-point arithmetic rules. If one of the inputs is NaN, the 92 | # result will be NaN, regardless of the other operand. 93 | # 94 | # This NaN-propagating behavior corresponds to the `maximum` operation of IEEE 754-2019: 95 | # unlike maxnum, a NaN in either operand always produces a NaN result.
# Times one `llvm.maximum` call on two random 8-lane float64 vectors, then
# prints the result vector and the elapsed time.
fn test_intrinsic_1():
    alias T = DType.float64
    alias channels = 8
    alias T2 = SIMD[T, channels]
    let data0: T2 = gen_random_SIMD[T, channels]()
    let data1: T2 = gen_random_SIMD[T, channels]()

    let start_time_ns = now()
    #let data2: T2 = data0.max(data1)

    # https://llvm.org/docs/LangRef.html#llvm-maximum-intrinsic
    let data2: T2 = rebind[SIMD[T, channels]](llvm_intrinsic["llvm.maximum", T2, T2, T2](data0, data1))

    # Disassembly recorded by the author (kept verbatim):
    #0x000000000005b6fe <+366>: call rbx
    #0x000000000005b700 <+368>: mov r14,QWORD PTR [rsp+0x20]
    #0x000000000005b705 <+373>: mov rax,QWORD PTR [rsp+0x28]
    #0x000000000005b70a <+378>: mov QWORD PTR [rsp+0x58],rax
    #0x000000000005b70f <+383>: vmovdqa64 zmm1,ZMMWORD PTR [rsp+0xc0]
    #0x000000000005b717 <+391>: vpmovq2m k1,zmm1
    #0x000000000005b71d <+397>: vmovapd zmm2,ZMMWORD PTR [rsp+0x100]
    #0x000000000005b725 <+405>: vblendmpd zmm0{k1},zmm1,zmm2
    #0x000000000005b72b <+411>: vmovapd zmm2{k1},zmm1
    #0x000000000005b731 <+417>: vmaxpd zmm0,zmm2,zmm0
    #0x000000000005b737 <+423>: vcmpunordpd k1,zmm2,zmm2
    #0x000000000005b73e <+430>: vmovapd zmm0{k1},zmm2
    #0x000000000005b744 <+436>: vmovapd ZMMWORD PTR [rsp+0xc0],zmm0
    #0x000000000005b74c <+444>: vxorpd xmm0,xmm0,xmm0
    #0x000000000005b750 <+448>: vmovapd XMMWORD PTR [rsp+0x30],xmm0
    #0x000000000005b756 <+454>: lea rsi,[rsp+0x30]
    #0x000000000005b75b <+459>: mov edi,0x1
    #0x000000000005b760 <+464>: vzeroupper
    #0x000000000005b763 <+467>: call rbx

    let elapsed_time_ns = now() - start_time_ns
    print(data2)
    print("Elapsed time " + str(elapsed_time_ns) + " ns")


# Same experiment as test_intrinsic_1, but calling `llvm.maxnum` instead of
# `llvm.maximum`.
fn test_intrinsic_2():
    alias T = DType.float64
    alias channels = 8
    alias T2 = SIMD[T, channels]
    let data0: T2 = gen_random_SIMD[T, channels]()
    let data1: T2 = gen_random_SIMD[T, channels]()

    let start_time_ns = now()
    #let data2: T2 = data0.max(data1)

    # https://llvm.org/docs/LangRef.html#llvm-maxnum-intrinsic
    let data2: T2 = rebind[SIMD[T, channels]](llvm_intrinsic["llvm.maxnum", T2, T2, T2](data0, data1))

    # Disassembly recorded by the author (kept verbatim):
    #0x000000000005b6fb <+363>: vzeroupper
    #0x000000000005b6fe <+366>: call rbx
    #0x000000000005b700 <+368>: mov r14,QWORD PTR [rsp+0x20]
    #0x000000000005b705 <+373>: mov rax,QWORD PTR [rsp+0x28]
    #0x000000000005b70a <+378>: mov QWORD PTR [rsp+0x58],rax
    #0x000000000005b70f <+383>: vmovapd zmm1,ZMMWORD PTR [rsp+0xc0]
    #0x000000000005b717 <+391>: vmovapd zmm2,ZMMWORD PTR [rsp+0x100]
    #0x000000000005b71f <+399>: vmaxpd zmm0,zmm2,zmm1
    #0x000000000005b725 <+405>: vcmpunordpd k1,zmm1,zmm1
    #0x000000000005b72c <+412>: vmovapd zmm0{k1},zmm2
    #0x000000000005b732 <+418>: vmovapd ZMMWORD PTR [rsp+0xc0],zmm0
    #0x000000000005b73a <+426>: vxorpd xmm0,xmm0,xmm0
    #0x000000000005b73e <+430>: vmovapd XMMWORD PTR [rsp+0x30],xmm0
    #0x000000000005b744 <+436>: lea rsi,[rsp+0x30]
    #0x000000000005b749 <+441>: mov edi,0x1
    #0x000000000005b74e <+446>: vzeroupper
    #0x000000000005b751 <+449>: call rbx

    let elapsed_time_ns = now() - start_time_ns
    print(data2)
    print("Elapsed time " + str(elapsed_time_ns) + " ns")


# Times the raw MLIR op `llvm.intr.maxnum` on two random 16-lane float32
# vectors (rebound to `vector<16xf32>`), then prints the result and the
# elapsed time.
fn test_llvm_1():
    let data0: SIMD[DType.float32, 16] = gen_random_SIMD[DType.float32, 16]()
    let data1: SIMD[DType.float32, 16] = gen_random_SIMD[DType.float32, 16]()
    #let data0: SIMD[DType.float32, 16] = 10.
    #let data1: SIMD[DType.float32, 16] = 20.

    let data0x = rebind[__mlir_type.`vector<16xf32>`](data0)
    let data1x = rebind[__mlir_type.`vector<16xf32>`](data1)


    let start_time_ns = now()
    #var tmp = __mlir_op.`llvm.fadd`[_type = __mlir_type.`vector<16xf32>`, _flag = 'fast'](data0x, data1x)
    #var tmp = __mlir_op.`llvm.intr.maxnum`[_type = __mlir_type.`vector<16xf32>`, _flag = 'fast'](data0x, data1x)
    #var tmp = __mlir_op.`llvm.intr.maximum`[_type = __mlir_type.`vector<16xf32>`, _flag = 'fast'](data0x, data1x)
    var tmp = __mlir_op.`llvm.intr.maxnum`[_type = __mlir_type.`vector<16xf32>`](data0x, data1x)

    let elapsed_time_ns = now() - start_time_ns

    # %vec5 = "llvm.maxnum.v16f32"(%vec3, %vec4) :
    # (vector<16xf32>, vector<16xf32>) -> vector<16xf32>
    # "llvm.fastmath" = { flags = ["fast"] }




    let data2 = rebind[SIMD[DType.float32, 16]](tmp)
    print(data2)
    print("Elapsed time " + str(elapsed_time_ns) + " ns")


# Entry point of nan_check.mojo: runs only the raw-MLIR experiment.
fn main():
    test_llvm_1()
--------------------------------------------------------------------------------
/sort_network/performance.mojo:
--------------------------------------------------------------------------------
from benchmark import keep
from algorithm.sort import sort
from time import time_function, now
from random import random_ui64

from sort_network.sort_network import (
    sn,
    sn_2x_interleave,
    sn_2x_parallel,
)
from sort_network.test_tools import (
    gen_random_SIMD,
    gen_random_vec,
    gen_random_pointer,
    gen_random_DTypePointer,
)
from sort_network.sort_network_ml import sn_ml_4n, sn_ml_8n


# Reads the whole file `filename` and returns its contents.
# On any error the error is printed and an empty string is returned.
fn load_file(filename: StringLiteral) -> String:
    try:
        with open(filename, "r") as f:
            return f.read()
    except e:
        print("Error " + str(e))
        return ""


# Benchmarks five sorting strategies for every dtype / channel-count
# combination and prints one tab-separated row per combination.
#
# n_samples:    number of timed repetitions; the fastest one is reported.
# n_iterations: number of sorts per timed repetition; the reported value is
#               the best repetition time divided by n_iterations.
#
# NOTE(review): the local `*_ms` names hold differences of `now()`; the sibling
# nan_check.mojo labels the same quantity "ns", so the values are presumably
# nanoseconds, not milliseconds -- confirm before relabelling.
fn test_performance1(n_samples: Int, n_iterations: Int):
    alias sep = "\t"

    # Builds one result row: name, channels, then the five measurements
    # (mojo sort, network SIMD, network generic, network ml4, network ml8).
    fn experiment1[
        T: DType, channels: Int
    ](n_samples: Int, n_iterations: Int, name: String, sep: String) -> String:
        # Standard-library `sort` on n_iterations consecutive slices of
        # `channels` elements each.
        fn measure_time_mojo_sort[
            T: DType
        ](n_samples: Int, n_iterations: Int, channels: Int) -> Float32:
            var best_time_ms: Int = 1 << 62
            let buff: Pointer[SIMD[T, 1], 0] = Pointer[SIMD[T, 1]].aligned_alloc(
                16, channels * n_iterations
            )

            for sample in range(n_samples):
                # Refill with fresh random data for every sample (outside the
                # timed region), exactly as measure_time_netw_sort_generic
                # does. Previously the buffer was filled only once, so every
                # sample after the first timed the sort on already-sorted
                # input and the reported best time was not comparable.
                for iteration in range(channels * n_iterations):
                    buff[iteration] = random_ui64(0, 100).cast[T]()

                var ptr = buff
                let start_time_ms = now()

                for iteration in range(n_iterations):
                    # sort[type: DType](inout buff: Pointer[SIMD[type, 1], 0], len: Int)
                    sort[T](ptr, channels)
                    ptr += channels

                let elapsed_time_ms = now() - start_time_ms

                if elapsed_time_ms < best_time_ms:
                    best_time_ms = elapsed_time_ms
            keep(buff)
            buff.free()
            return Float32(best_time_ms) / n_iterations

        # Sorting network on a SIMD register; 256 and 512 channels are
        # skipped at compile time and reported as -1.
        fn measure_time_netw_sort_SIMD[
            T: DType, channels: Int
        ](n_samples: Int, n_iterations: Int) -> Float32:
            @parameter
            if channels == 256 or channels == 512:
                return -1
            else:
                var data2 = gen_random_SIMD[T, channels]()
                var best_time_ms: Int = 1 << 62
                for sample in range(n_samples):
                    let start_time_ms = now()
                    for i in range(n_iterations):
                        data2 = sn[T, channels](data2)

                    let elapsed_time_ms = now() - start_time_ms
                    keep(data2)

                    if elapsed_time_ms < best_time_ms:
                        best_time_ms = elapsed_time_ms

                return Float32(best_time_ms) / n_iterations

        # Multi-layer network built on the 4-input network; 8 and 512
        # channels are skipped at compile time and reported as -1.
        fn measure_time_netw_sort_SIMD_ml4[
            T: DType, channels: Int
        ](n_samples: Int, n_iterations: Int) -> Float32:
            @parameter
            if channels == 8 or channels == 512:
                return -1
            else:
                var data2 = gen_random_SIMD[T, channels]()
                var best_time_ms: Int = 1 << 62
                for sample in range(n_samples):
                    let start_time_ms = now()
                    for i in range(n_iterations):
                        data2 = sn_ml_4n[T, channels, True](data2)

                    let elapsed_time_ms = now() - start_time_ms
                    keep(data2)

                    if elapsed_time_ms < best_time_ms:
                        best_time_ms = elapsed_time_ms

                return Float32(best_time_ms) / n_iterations

        # Multi-layer network built on the 8-input network; 8 and 16
        # channels are skipped at compile time and reported as -1.
        fn measure_time_netw_sort_SIMD_ml8[
            T: DType, channels: Int
        ](n_samples: Int, n_iterations: Int) -> Float32:
            @parameter
            if channels == 8 or channels == 16:
                return -1
            else:
                var data2 = gen_random_SIMD[T, channels]()
                var best_time_ms: Int = 1 << 62
                for sample in range(n_samples):
                    let start_time_ms = now()
                    for i in range(n_iterations):
                        data2 = sn_ml_8n[T, channels, True](data2)

                    let elapsed_time_ms = now() - start_time_ms
                    keep(data2)

                    if elapsed_time_ms < best_time_ms:
                        best_time_ms = elapsed_time_ms

                return Float32(best_time_ms) / n_iterations

        # Pointer-based (non-SIMD-register) sorting network; 256 and 512
        # channels are skipped (run-time check) and reported as -1.
        fn measure_time_netw_sort_generic[
            T: DType
        ](n_samples: Int, n_iterations: Int, channels: Int) -> Float32:
            if channels == 256 or channels == 512:
                return -1

            var best_time_ms: Int = 1 << 62
            let buff = DTypePointer[T].aligned_alloc(16, channels * n_iterations)

            for sample in range(n_samples):
                for iteration in range(channels * n_iterations):
                    buff[iteration] = random_ui64(0, 100).cast[T]()

                var ptr = buff
                let start_time_ms = now()

                for iteration in range(n_iterations):
                    # sort[type: DType](inout buff: Pointer[SIMD[type, 1], 0], len: Int)
                    sn[T](ptr, channels)
                    ptr += channels

                let elapsed_time_ms = now() - start_time_ms

                if elapsed_time_ms < best_time_ms:
                    best_time_ms = elapsed_time_ms

            keep(buff)
            buff.free()
            return Float32(best_time_ms) / n_iterations

        var result = name
        result += sep
        result += str(channels)
        result += sep
        result += str(measure_time_mojo_sort[T](n_samples, n_iterations, channels))
        result += sep
        result += str(measure_time_netw_sort_SIMD[T, channels](n_samples, n_iterations))
        result += sep
        result += str(measure_time_netw_sort_generic[T](n_samples, n_iterations, channels))
        result += sep
        result += str(measure_time_netw_sort_SIMD_ml4[T, channels](n_samples, n_iterations))
        result += sep
        result += str(measure_time_netw_sort_SIMD_ml8[T, channels](n_samples, n_iterations))
        return result

    # Prints one row per channel count (8 .. 512) for dtype T, followed by
    # an empty line.
    fn test_perf[T: DType](n_samples: Int, n_iterations: Int, name: String):
        print(experiment1[T, 8](n_samples, n_iterations, name, sep))
        print(experiment1[T, 16](n_samples, n_iterations, name, sep))
        print(experiment1[T, 32](n_samples, n_iterations, name, sep))
        print(experiment1[T, 64](n_samples, n_iterations, name, sep))
        print(experiment1[T, 128](n_samples, n_iterations, name, sep))
        print(experiment1[T, 256](n_samples, n_iterations, name, sep))
        print(experiment1[T, 512](n_samples, n_iterations, name, sep))
        print("")

    # Header row: one label per column that experiment1 emits. The two
    # multi-layer columns were missing, leaving the last two values of every
    # row unlabelled.
    print(
        sep + "channels" + sep + "mojo" + sep + "netw_SIMD" + sep + "netw_vec"
        + sep + "netw_SIMD_ml4" + sep + "netw_SIMD_ml8"
    )
    test_perf[DType.uint8](n_samples, n_iterations, "uint8")
    test_perf[DType.int8](n_samples, n_iterations, "int8")
    test_perf[DType.uint16](n_samples, n_iterations, "uint16")
    test_perf[DType.int16](n_samples, n_iterations, "int16")
    test_perf[DType.float16](n_samples, n_iterations, "float16")
    # test_perf[DType.bfloat16](n_samples, n_iterations, "bfloat16")
    test_perf[DType.uint32](n_samples, n_iterations, "uint32")
    test_perf[DType.int32](n_samples, n_iterations, "int32")
    test_perf[DType.float32](n_samples, n_iterations, "float32")
    test_perf[DType.uint64](n_samples, n_iterations, "uint64")
    test_perf[DType.int64](n_samples, n_iterations, "int64")
    test_perf[DType.float64](n_samples, n_iterations, "float64")

| 200 | fn test_performance2(n_samples: Int, n_iterations: Int): 201 | alias sep = "\t" 202 | 203 | fn experiment2[ 204 | T: DType, channels: Int 205 | ](n_samples: Int, n_iterations: Int, name: String, sep: String) -> String: 206 | fn measure_time_2x_sequential[ 207 | T: DType, channels: Int 208 | ](samples: Int, n_iterations: Int) -> Float32: 209 | var data3 = gen_random_SIMD[T, channels]() 210 | var best_time_ms: Int = 1 << 62 211 | for sample in range(samples): 212 | var data4 = data3 213 | let start_time_ms = now() 214 | for i in range(n_iterations): 215 | data3 = sn[T, channels, True](data3) 216 | data4 = sn[T, channels, True](data4) 217 | 218 | let elapsed_time_ms = now() - start_time_ms 219 | keep(data3) 220 | keep(data4) 221 | 222 | if elapsed_time_ms < best_time_ms: 223 | best_time_ms = elapsed_time_ms 224 | 225 | return Float32(best_time_ms) / n_iterations 226 | 227 | fn measure_time_2x_interleaved[ 228 | T: DType, channels: Int 229 | ](samples: Int, n_iterations: Int) -> Float32: 230 | var data3 = gen_random_SIMD[T, channels]() 231 | var best_time_ms: Int = 1 << 62 232 | for sample in range(samples): 233 | var data4 = data3 234 | let start_time_ms = now() 235 | for i in range(n_iterations): 236 | data3, data4 = sn_2x_interleave[T, T, channels, True, True]( 237 | data3, data4 238 | ) 239 | 240 | let elapsed_time_ms = now() - start_time_ms 241 | keep(data3) 242 | keep(data4) 243 | 244 | if elapsed_time_ms < best_time_ms: 245 | best_time_ms = elapsed_time_ms 246 | 247 | return Float32(best_time_ms) / n_iterations 248 | 249 | fn measure_time_2x_parallel[ 250 | T: DType, channels: Int 251 | ](samples: Int, n_iterations: Int) -> Float32: 252 | var data3 = gen_random_SIMD[T, channels]() 253 | var best_time_ms: Int = 1 << 62 254 | for sample in range(samples): 255 | var data4 = data3 256 | let start_time_ms = now() 257 | for i in range(n_iterations): 258 | data3, data4 = sn_2x_parallel[T, channels, True](data3, data4) 259 | 260 | let elapsed_time_ms = now() - 
start_time_ms 261 | keep(data3) 262 | keep(data4) 263 | 264 | if elapsed_time_ms < best_time_ms: 265 | best_time_ms = elapsed_time_ms 266 | 267 | return Float32(best_time_ms) / n_iterations 268 | 269 | var result = name 270 | result += sep 271 | result += str(channels) 272 | result += sep 273 | result += str(measure_time_2x_sequential[T, channels](n_samples, n_iterations)) 274 | result += sep 275 | result += str(measure_time_2x_interleaved[T, channels](n_samples, n_iterations)) 276 | result += sep 277 | result += str(measure_time_2x_parallel[T, channels](n_samples, n_iterations)) 278 | return result 279 | 280 | fn test_perf[T: DType](n_samples: Int, n_iterations: Int, name: String): 281 | print(experiment2[T, 8](n_samples, n_iterations, name, sep)) 282 | print(experiment2[T, 16](n_samples, n_iterations, name, sep)) 283 | print(experiment2[T, 32](n_samples, n_iterations, name, sep)) 284 | print(experiment2[T, 64](n_samples, n_iterations, name, sep)) 285 | # print(experiment2[T, 128](n_samples, n_iterations, name, sep)) 286 | # print(experiment2[T, 256](n_samples, n_iterations, name, sep)) 287 | print("") 288 | 289 | print( 290 | sep + "channels" + sep + "2x seq" + sep + "2x interleaved" + sep + "2x parallel" 291 | ) 292 | test_perf[DType.uint8](n_samples, n_iterations, "uint8") 293 | test_perf[DType.int8](n_samples, n_iterations, "int8") 294 | test_perf[DType.uint16](n_samples, n_iterations, "uint16") 295 | test_perf[DType.int16](n_samples, n_iterations, "int16") 296 | test_perf[DType.float16](n_samples, n_iterations, "float16") 297 | # test_perf[DType.bfloat16](n_samples, n_iterations, "bfloat16") 298 | test_perf[DType.uint32](n_samples, n_iterations, "uint32") 299 | test_perf[DType.int32](n_samples, n_iterations, "int32") 300 | test_perf[DType.float32](n_samples, n_iterations, "float32") 301 | test_perf[DType.uint64](n_samples, n_iterations, "uint64") 302 | test_perf[DType.int64](n_samples, n_iterations, "int64") 303 | test_perf[DType.float64](n_samples, 
n_iterations, "float64") 304 | -------------------------------------------------------------------------------- /sort_network/shuffle_test.mojo: -------------------------------------------------------------------------------- 1 | from random import random_ui64 2 | from time import now 3 | 4 | from sort_network.sort_tools import my_shuffle, gen_perm 5 | from sort_network.sort_network_data import swap_data 6 | from sort_network.SwapData import Layer, SwapData 7 | 8 | fn gen_random_SIMD[T: DType, width: Int]() -> SIMD[T, width]: 9 | var result = SIMD[T, width]() 10 | for i in range(width): 11 | result[i] = random_ui64(0, 100).cast[T]() 12 | return result 13 | 14 | 15 | @always_inline 16 | fn my_shuffle2[T: DType, width: Int, p: StaticIntTuple[width]](v: SIMD[T, width]) -> SIMD[T, width]: 17 | @parameter 18 | if width == 8: 19 | alias x = VariadicList[Int](p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]) 20 | return v._shuffle_list[x, width](v) 21 | else: 22 | constrained[False]() 23 | return v 24 | 25 | 26 | fn main(): 27 | alias T = DType.int16 28 | alias channels = 32 29 | alias sd: SwapData = swap_data[channels]() 30 | alias perm = gen_perm[sd[2], channels]() 31 | 32 | let x: SIMD[T, channels] = gen_random_SIMD[T, channels]() 33 | 34 | let start_time_ns = now() 35 | let y = my_shuffle[T, channels, perm](x) 36 | #let y = my_shuffle2[T, 8, perm](x) 37 | let elapsed_time_ns = now() - start_time_ns 38 | 39 | print(y) 40 | print("Elapsed time " + str(elapsed_time_ns) + " ns") 41 | 42 | -------------------------------------------------------------------------------- /sort_network/sort_network_ml.mojo: -------------------------------------------------------------------------------- 1 | from sort_network.sort_network import ( 2 | sn, 3 | sn_merge, 4 | sn_2x_interleave, 5 | sn_2x_parallel, 6 | ) 7 | from sort_network.SwapData import SwapData 8 | 9 | 10 | fn sort_2x[T: DType, sub_chan: Int, sequential: Bool, ascending: Bool]( 11 | d0: SIMD[T, sub_chan], 12 | d1: SIMD[T, 
sub_chan], 13 | ) -> (SIMD[T, sub_chan], SIMD[T, sub_chan]): 14 | @parameter 15 | if sequential: 16 | return (sn[T, sub_chan, ascending](d0), sn[T, sub_chan, ascending](d1)) 17 | else: 18 | return sn_2x_interleave[T, T, sub_chan, ascending](d0, d1) 19 | #return sn_2x_parallel[T, sub_size, ascending](d0, d1) 20 | 21 | 22 | 23 | # sorting network multi-layer 4N: divide the width into four sub-vectors and combine them with the 4-input sorting network 24 | fn sn_ml_4n[ 25 | T: DType, channels: Int, ascending: Bool 26 | ](data: SIMD[T, channels]) -> SIMD[T, channels]: 27 | alias sub_size: Int = channels >> 2 28 | alias sequential: Bool = False 29 | # Sorting network for 4 inputs, 5 CEs, 3 layers: 30 | # [(0,2),(1,3)] 31 | # [(0,1),(2,3)] 32 | # [(1,2)] 33 | 34 | var d0: SIMD[T, sub_size] = data.slice[sub_size](0 * sub_size) 35 | var d1: SIMD[T, sub_size] = data.slice[sub_size](1 * sub_size) 36 | var d2: SIMD[T, sub_size] = data.slice[sub_size](2 * sub_size) 37 | var d3: SIMD[T, sub_size] = data.slice[sub_size](3 * sub_size) 38 | 39 | let d02: SIMD[T, 2 * sub_size] 40 | let d13: SIMD[T, 2 * sub_size] 41 | d02, d13 = sort_2x[T, 2*sub_size, sequential, ascending](d0.join(d2), d1.join(d3)) 42 | 43 | d0 = d02.slice[sub_size](0) 44 | d1 = d13.slice[sub_size](0) 45 | d2 = d02.slice[sub_size](sub_size) 46 | d3 = d13.slice[sub_size](sub_size) 47 | 48 | let d01: SIMD[T, 2 * sub_size] = sn_merge[T, 2 * sub_size, ascending](d0.join(d1)) 49 | let d23: SIMD[T, 2 * sub_size] = sn_merge[T, 2 * sub_size, ascending](d2.join(d3)) 50 | 51 | d0 = d01.slice[sub_size](0) 52 | d1 = d01.slice[sub_size](sub_size) 53 | d2 = d23.slice[sub_size](0) 54 | d3 = d23.slice[sub_size](sub_size) 55 | 56 | let d12: SIMD[T, 2 * sub_size] = sn_merge[T, 2 * sub_size, ascending](d1.join(d2)) 57 | 58 | d1 = d12.slice[sub_size](0) 59 | d2 = d12.slice[sub_size](sub_size) 60 | 61 | let d0123 = d0.join(d1).join(d2.join(d3)) 62 | return rebind[SIMD[T, channels]](d0123) 63 | 64 | 65 | fn sn_ml_8n[ 66 | T: DType, channels: Int, ascending: Bool 67 | ](data: 
SIMD[T, channels]) -> SIMD[T, channels]: 68 | alias sub_size: Int = channels >> 3 69 | 70 | # Sorting network for 8 inputs, 19 CEs, 6 layers: 71 | # [(0,2),(1,3),(4,6),(5,7)] 72 | # [(0,4),(1,5),(2,6),(3,7)] 73 | # [(0,1),(2,3),(4,5),(6,7)] 74 | # [(2,4),(3,5)] 75 | # [(1,4),(3,6)] 76 | # [(1,2),(3,4),(5,6)] 77 | 78 | var d0: SIMD[T, sub_size] = data.slice[sub_size](0 * sub_size) 79 | var d1: SIMD[T, sub_size] = data.slice[sub_size](1 * sub_size) 80 | var d2: SIMD[T, sub_size] = data.slice[sub_size](2 * sub_size) 81 | var d3: SIMD[T, sub_size] = data.slice[sub_size](3 * sub_size) 82 | var d4: SIMD[T, sub_size] = data.slice[sub_size](4 * sub_size) 83 | var d5: SIMD[T, sub_size] = data.slice[sub_size](5 * sub_size) 84 | var d6: SIMD[T, sub_size] = data.slice[sub_size](6 * sub_size) 85 | var d7: SIMD[T, sub_size] = data.slice[sub_size](7 * sub_size) 86 | 87 | # [(0,2),(1,3),(4,6),(5,7)] 88 | @parameter # just a hack to create a local scope 89 | if True: 90 | let d02 = sn[T, 2 * sub_size, ascending](d0.join(d2)) 91 | let d13 = sn[T, 2 * sub_size, ascending](d1.join(d3)) 92 | let d46 = sn[T, 2 * sub_size, ascending](d4.join(d6)) 93 | let d57 = sn[T, 2 * sub_size, ascending](d5.join(d7)) 94 | 95 | d0 = d02.slice[sub_size](0) 96 | d1 = d13.slice[sub_size](0) 97 | d2 = d02.slice[sub_size](sub_size) 98 | d3 = d13.slice[sub_size](sub_size) 99 | d4 = d46.slice[sub_size](0) 100 | d5 = d57.slice[sub_size](0) 101 | d6 = d46.slice[sub_size](sub_size) 102 | d7 = d57.slice[sub_size](sub_size) 103 | 104 | # [(0,4),(1,5),(2,6),(3,7)] 105 | @parameter # just a hack to create a local scope 106 | if True: 107 | let d04 = sn_merge[T, 2 * sub_size, ascending](d0.join(d4)) 108 | let d15 = sn_merge[T, 2 * sub_size, ascending](d1.join(d5)) 109 | let d26 = sn_merge[T, 2 * sub_size, ascending](d2.join(d6)) 110 | let d37 = sn_merge[T, 2 * sub_size, ascending](d3.join(d7)) 111 | 112 | d0 = d04.slice[sub_size](0) 113 | d1 = d15.slice[sub_size](0) 114 | d2 = d26.slice[sub_size](0) 115 | d3 = 
d37.slice[sub_size](0) 116 | d4 = d04.slice[sub_size](sub_size) 117 | d5 = d15.slice[sub_size](sub_size) 118 | d6 = d26.slice[sub_size](sub_size) 119 | d7 = d37.slice[sub_size](sub_size) 120 | 121 | # [(0,1),(2,3),(4,5),(6,7)] 122 | @parameter # just a hack to create a local scope 123 | if True: 124 | let d01 = sn_merge[T, 2 * sub_size, ascending](d0.join(d1)) 125 | let d23 = sn_merge[T, 2 * sub_size, ascending](d2.join(d3)) 126 | let d45 = sn_merge[T, 2 * sub_size, ascending](d4.join(d5)) 127 | let d67 = sn_merge[T, 2 * sub_size, ascending](d6.join(d7)) 128 | 129 | d0 = d01.slice[sub_size](0) 130 | d1 = d01.slice[sub_size](sub_size) 131 | d2 = d23.slice[sub_size](0) 132 | d3 = d23.slice[sub_size](sub_size) 133 | d4 = d45.slice[sub_size](0) 134 | d5 = d45.slice[sub_size](sub_size) 135 | d6 = d67.slice[sub_size](0) 136 | d7 = d67.slice[sub_size](sub_size) 137 | 138 | # [(2,4),(3,5)] 139 | @parameter # just a hack to create a local scope 140 | if True: 141 | let d24 = sn_merge[T, 2 * sub_size, ascending](d2.join(d4)) 142 | let d35 = sn_merge[T, 2 * sub_size, ascending](d3.join(d5)) 143 | 144 | d2 = d24.slice[sub_size](0) 145 | d3 = d35.slice[sub_size](0) 146 | d4 = d24.slice[sub_size](sub_size) 147 | d5 = d35.slice[sub_size](sub_size) 148 | 149 | # [(1,4),(3,6)] 150 | @parameter # just a hack to create a local scope 151 | if True: 152 | let d14 = sn_merge[T, 2 * sub_size, ascending](d1.join(d4)) 153 | let d36 = sn_merge[T, 2 * sub_size, ascending](d3.join(d6)) 154 | 155 | d1 = d14.slice[sub_size](0) 156 | d3 = d36.slice[sub_size](0) 157 | d4 = d14.slice[sub_size](sub_size) 158 | d6 = d36.slice[sub_size](sub_size) 159 | 160 | # [(1,2),(3,4),(5,6)] 161 | @parameter # just a hack to create a local scope 162 | if True: 163 | let d12 = sn_merge[T, 2 * sub_size, ascending](d1.join(d2)) 164 | let d34 = sn_merge[T, 2 * sub_size, ascending](d3.join(d4)) 165 | let d56 = sn_merge[T, 2 * sub_size, ascending](d5.join(d6)) 166 | 167 | d1 = d12.slice[sub_size](0) 168 | d2 = 
d12.slice[sub_size](sub_size) 169 | d3 = d34.slice[sub_size](0) 170 | d4 = d34.slice[sub_size](sub_size) 171 | d5 = d56.slice[sub_size](0) 172 | d6 = d56.slice[sub_size](sub_size) 173 | 174 | let d0123 = d0.join(d1).join(d2.join(d3)) 175 | let d4567 = d4.join(d5).join(d6.join(d7)) 176 | return rebind[SIMD[T, channels]](d0123.join(d4567)) 177 | -------------------------------------------------------------------------------- /sort_network/sort_tools.mojo: -------------------------------------------------------------------------------- 1 | from testing import assert_true 2 | 3 | from sort_network.sort_network_data import swap_data 4 | from sort_network.SwapData import Layer, SwapData 5 | 6 | 7 | fn gen_merge_mask[ 8 | swaps: Layer, width: Int, ascending: Bool 9 | ]() -> SIMD[DType.bool, width]: 10 | var result = SIMD[DType.bool, width]() 11 | for i in range(len(swaps)): 12 | if ascending: 13 | # set the minimum of the comparison to true to get ascending 14 | result[swaps.get_min(i)] = True 15 | else: 16 | # set the maximum of the comparison to true to get descending 17 | result[swaps.get_max(i)] = True 18 | return result 19 | 20 | 21 | # generate a index permutation (of size width) from the provided swaps in Layer 22 | fn gen_perm[swaps: Layer, width: Int]() -> StaticIntTuple[width]: 23 | var result = StaticIntTuple[width]() 24 | for i in range(width): 25 | result[i] = i 26 | 27 | for i in range(len(swaps)): 28 | let from_ = swaps.get_min(i) 29 | let to_ = swaps.get_max(i) 30 | 31 | let tmp = result[to_] 32 | result[to_] = result[from_] 33 | result[from_] = tmp 34 | 35 | return result 36 | 37 | 38 | @always_inline 39 | fn swap_n[ 40 | T: DType, width: Int, swaps: Layer, ascending: Bool 41 | ](v: SIMD[T, width]) -> SIMD[T, width]: 42 | alias permutations = gen_perm[swaps, width]() 43 | constrained[len(permutations) == width, "invalid number of permutations"]() 44 | alias merge_mask = gen_merge_mask[swaps, width, ascending]() 45 | let v2 = my_shuffle[T, width, 
permutations](v) 46 | return merge_mask.select(v.min(v2), v.max(v2)) 47 | 48 | 49 | @always_inline 50 | fn swap_idx[ 51 | T1: DType, T2: DType, width: Int, swaps: Layer, ascending: Bool 52 | ](t: Tuple[SIMD[T1, width], SIMD[T2, width]]) -> (SIMD[T1, width], SIMD[T2, width]): 53 | alias permutations = gen_perm[swaps, width]() 54 | let data = t.get[0, SIMD[T1, width]]() 55 | let idx = t.get[1, SIMD[T2, width]]() 56 | let data_sorted = swap_n[T1, width, swaps, ascending](data) 57 | let change_mask = data_sorted != data 58 | let idx_shuffled = my_shuffle[T2, width, permutations](idx) 59 | return (data_sorted, change_mask.select(idx_shuffled, idx)) 60 | 61 | 62 | fn test_perm_code(): 63 | alias swap16 = swap_data[16]() 64 | 65 | alias l1_obs = gen_perm[swap16[0], 16]() 66 | alias l1_exp = StaticIntTuple[16]( 67 | 5, 4, 12, 13, 1, 0, 7, 6, 9, 8, 15, 14, 2, 3, 11, 10 68 | ) 69 | constrained[l1_obs == l1_exp, "l1"]() 70 | 71 | alias l2_obs = gen_perm[swap16[1], 16]() 72 | alias l2_exp = StaticIntTuple[16]( 73 | 2, 10, 0, 6, 7, 14, 3, 4, 11, 12, 1, 8, 9, 15, 5, 13 74 | ) 75 | constrained[l2_obs == l2_exp, "l2"]() 76 | 77 | alias l3_obs = gen_perm[swap16[2], 16]() 78 | alias l3_exp = StaticIntTuple[16]( 79 | 8, 3, 11, 1, 13, 9, 10, 15, 0, 5, 6, 2, 14, 4, 12, 7 80 | ) 81 | constrained[l3_obs == l3_exp, "l3"]() 82 | 83 | alias l4_obs = gen_perm[swap16[3], 16]() 84 | alias l4_exp = StaticIntTuple[16]( 85 | 1, 0, 4, 8, 2, 6, 5, 12, 3, 10, 9, 13, 7, 11, 15, 14 86 | ) 87 | constrained[l4_obs == l4_exp, "l4"]() 88 | 89 | alias l5_obs = gen_perm[swap16[4], 16]() 90 | alias l5_exp = StaticIntTuple[16]( 91 | 0, 3, 5, 1, 8, 2, 9, 11, 4, 6, 13, 7, 14, 10, 12, 15 92 | ) 93 | constrained[l5_obs == l5_exp, "l5"]() 94 | 95 | alias l6_obs = gen_perm[swap16[5], 16]() 96 | alias l6_exp = StaticIntTuple[16]( 97 | 0, 2, 1, 5, 11, 3, 8, 9, 6, 7, 12, 4, 10, 14, 13, 15 98 | ) 99 | constrained[l6_obs == l6_exp, "l6"]() 100 | 101 | alias l7_obs = gen_perm[swap16[6], 16]() 102 | alias l7_exp = 
StaticIntTuple[16]( 103 | 0, 1, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 15 104 | ) 105 | constrained[l7_obs == l7_exp, "l7"]() 106 | 107 | alias l8_obs = gen_perm[swap16[7], 16]() 108 | alias l8_exp = StaticIntTuple[16]( 109 | 0, 1, 2, 3, 6, 7, 4, 5, 10, 11, 8, 9, 12, 13, 14, 15 110 | ) 111 | constrained[l8_obs == l8_exp, "l8"]() 112 | 113 | alias l9_obs = gen_perm[swap16[8], 16]() 114 | alias l9_exp = StaticIntTuple[16]( 115 | 0, 1, 2, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 13, 14, 15 116 | ) 117 | constrained[l9_obs == l9_exp, "l9"]() 118 | 119 | print("test_perm_code: DONE") 120 | 121 | 122 | @always_inline 123 | fn my_shuffle[ 124 | T: DType, width: Int, p: StaticIntTuple[width] 125 | ](v: SIMD[T, width]) -> SIMD[T, width]: 126 | @parameter 127 | if width == 8: 128 | return v.shuffle[ 129 | p[0], 130 | p[1], 131 | p[2], 132 | p[3], 133 | p[4], 134 | p[5], 135 | p[6], 136 | p[7], 137 | ]() 138 | elif width == 16: 139 | return v.shuffle[ 140 | p[0], 141 | p[1], 142 | p[2], 143 | p[3], 144 | p[4], 145 | p[5], 146 | p[6], 147 | p[7], 148 | p[8], 149 | p[9], 150 | p[10], 151 | p[11], 152 | p[12], 153 | p[13], 154 | p[14], 155 | p[15], 156 | ]() 157 | elif width == 32: 158 | return v.shuffle[ 159 | p[0], 160 | p[1], 161 | p[2], 162 | p[3], 163 | p[4], 164 | p[5], 165 | p[6], 166 | p[7], 167 | p[8], 168 | p[9], 169 | p[10], 170 | p[11], 171 | p[12], 172 | p[13], 173 | p[14], 174 | p[15], 175 | p[16], 176 | p[17], 177 | p[18], 178 | p[19], 179 | p[20], 180 | p[21], 181 | p[22], 182 | p[23], 183 | p[24], 184 | p[25], 185 | p[26], 186 | p[27], 187 | p[28], 188 | p[29], 189 | p[30], 190 | p[31], 191 | ]() 192 | elif width == 64: 193 | return v.shuffle[ 194 | p[0], 195 | p[1], 196 | p[2], 197 | p[3], 198 | p[4], 199 | p[5], 200 | p[6], 201 | p[7], 202 | p[8], 203 | p[9], 204 | p[10], 205 | p[11], 206 | p[12], 207 | p[13], 208 | p[14], 209 | p[15], 210 | p[16], 211 | p[17], 212 | p[18], 213 | p[19], 214 | p[20], 215 | p[21], 216 | p[22], 217 | p[23], 218 | p[24], 219 | p[25], 
220 | p[26], 221 | p[27], 222 | p[28], 223 | p[29], 224 | p[30], 225 | p[31], 226 | p[32], 227 | p[33], 228 | p[34], 229 | p[35], 230 | p[36], 231 | p[37], 232 | p[38], 233 | p[39], 234 | p[40], 235 | p[41], 236 | p[42], 237 | p[43], 238 | p[44], 239 | p[45], 240 | p[46], 241 | p[47], 242 | p[48], 243 | p[49], 244 | p[50], 245 | p[51], 246 | p[52], 247 | p[53], 248 | p[54], 249 | p[55], 250 | p[56], 251 | p[57], 252 | p[58], 253 | p[59], 254 | p[60], 255 | p[61], 256 | p[62], 257 | p[63], 258 | ]() 259 | elif width == 128: 260 | return v.shuffle[ 261 | p[0], 262 | p[1], 263 | p[2], 264 | p[3], 265 | p[4], 266 | p[5], 267 | p[6], 268 | p[7], 269 | p[8], 270 | p[9], 271 | p[10], 272 | p[11], 273 | p[12], 274 | p[13], 275 | p[14], 276 | p[15], 277 | p[16], 278 | p[17], 279 | p[18], 280 | p[19], 281 | p[20], 282 | p[21], 283 | p[22], 284 | p[23], 285 | p[24], 286 | p[25], 287 | p[26], 288 | p[27], 289 | p[28], 290 | p[29], 291 | p[30], 292 | p[31], 293 | p[32], 294 | p[33], 295 | p[34], 296 | p[35], 297 | p[36], 298 | p[37], 299 | p[38], 300 | p[39], 301 | p[40], 302 | p[41], 303 | p[42], 304 | p[43], 305 | p[44], 306 | p[45], 307 | p[46], 308 | p[47], 309 | p[48], 310 | p[49], 311 | p[50], 312 | p[51], 313 | p[52], 314 | p[53], 315 | p[54], 316 | p[55], 317 | p[56], 318 | p[57], 319 | p[58], 320 | p[59], 321 | p[60], 322 | p[61], 323 | p[62], 324 | p[63], 325 | p[64], 326 | p[65], 327 | p[66], 328 | p[67], 329 | p[68], 330 | p[69], 331 | p[70], 332 | p[71], 333 | p[72], 334 | p[73], 335 | p[74], 336 | p[75], 337 | p[76], 338 | p[77], 339 | p[78], 340 | p[79], 341 | p[80], 342 | p[81], 343 | p[82], 344 | p[83], 345 | p[84], 346 | p[85], 347 | p[86], 348 | p[87], 349 | p[88], 350 | p[89], 351 | p[90], 352 | p[91], 353 | p[92], 354 | p[93], 355 | p[94], 356 | p[95], 357 | p[96], 358 | p[97], 359 | p[98], 360 | p[99], 361 | p[100], 362 | p[101], 363 | p[102], 364 | p[103], 365 | p[104], 366 | p[105], 367 | p[106], 368 | p[107], 369 | p[108], 370 | p[109], 371 | p[110], 
372 | p[111], 373 | p[112], 374 | p[113], 375 | p[114], 376 | p[115], 377 | p[116], 378 | p[117], 379 | p[118], 380 | p[119], 381 | p[120], 382 | p[121], 383 | p[122], 384 | p[123], 385 | p[124], 386 | p[125], 387 | p[126], 388 | p[127], 389 | ]() 390 | elif width == 256: 391 | return v.shuffle[ 392 | p[0], 393 | p[1], 394 | p[2], 395 | p[3], 396 | p[4], 397 | p[5], 398 | p[6], 399 | p[7], 400 | p[8], 401 | p[9], 402 | p[10], 403 | p[11], 404 | p[12], 405 | p[13], 406 | p[14], 407 | p[15], 408 | p[16], 409 | p[17], 410 | p[18], 411 | p[19], 412 | p[20], 413 | p[21], 414 | p[22], 415 | p[23], 416 | p[24], 417 | p[25], 418 | p[26], 419 | p[27], 420 | p[28], 421 | p[29], 422 | p[30], 423 | p[31], 424 | p[32], 425 | p[33], 426 | p[34], 427 | p[35], 428 | p[36], 429 | p[37], 430 | p[38], 431 | p[39], 432 | p[40], 433 | p[41], 434 | p[42], 435 | p[43], 436 | p[44], 437 | p[45], 438 | p[46], 439 | p[47], 440 | p[48], 441 | p[49], 442 | p[50], 443 | p[51], 444 | p[52], 445 | p[53], 446 | p[54], 447 | p[55], 448 | p[56], 449 | p[57], 450 | p[58], 451 | p[59], 452 | p[60], 453 | p[61], 454 | p[62], 455 | p[63], 456 | p[64], 457 | p[65], 458 | p[66], 459 | p[67], 460 | p[68], 461 | p[69], 462 | p[70], 463 | p[71], 464 | p[72], 465 | p[73], 466 | p[74], 467 | p[75], 468 | p[76], 469 | p[77], 470 | p[78], 471 | p[79], 472 | p[80], 473 | p[81], 474 | p[82], 475 | p[83], 476 | p[84], 477 | p[85], 478 | p[86], 479 | p[87], 480 | p[88], 481 | p[89], 482 | p[90], 483 | p[91], 484 | p[92], 485 | p[93], 486 | p[94], 487 | p[95], 488 | p[96], 489 | p[97], 490 | p[98], 491 | p[99], 492 | p[100], 493 | p[101], 494 | p[102], 495 | p[103], 496 | p[104], 497 | p[105], 498 | p[106], 499 | p[107], 500 | p[108], 501 | p[109], 502 | p[110], 503 | p[111], 504 | p[112], 505 | p[113], 506 | p[114], 507 | p[115], 508 | p[116], 509 | p[117], 510 | p[118], 511 | p[119], 512 | p[120], 513 | p[121], 514 | p[122], 515 | p[123], 516 | p[124], 517 | p[125], 518 | p[126], 519 | p[127], 520 | p[128], 521 | 
p[129], 522 | p[130], 523 | p[131], 524 | p[132], 525 | p[133], 526 | p[134], 527 | p[135], 528 | p[136], 529 | p[137], 530 | p[138], 531 | p[139], 532 | p[140], 533 | p[141], 534 | p[142], 535 | p[143], 536 | p[144], 537 | p[145], 538 | p[146], 539 | p[147], 540 | p[148], 541 | p[149], 542 | p[150], 543 | p[151], 544 | p[152], 545 | p[153], 546 | p[154], 547 | p[155], 548 | p[156], 549 | p[157], 550 | p[158], 551 | p[159], 552 | p[160], 553 | p[161], 554 | p[162], 555 | p[163], 556 | p[164], 557 | p[165], 558 | p[166], 559 | p[167], 560 | p[168], 561 | p[169], 562 | p[170], 563 | p[171], 564 | p[172], 565 | p[173], 566 | p[174], 567 | p[175], 568 | p[176], 569 | p[177], 570 | p[178], 571 | p[179], 572 | p[180], 573 | p[181], 574 | p[182], 575 | p[183], 576 | p[184], 577 | p[185], 578 | p[186], 579 | p[187], 580 | p[188], 581 | p[189], 582 | p[190], 583 | p[191], 584 | p[192], 585 | p[193], 586 | p[194], 587 | p[195], 588 | p[196], 589 | p[197], 590 | p[198], 591 | p[199], 592 | p[200], 593 | p[201], 594 | p[202], 595 | p[203], 596 | p[204], 597 | p[205], 598 | p[206], 599 | p[207], 600 | p[208], 601 | p[209], 602 | p[210], 603 | p[211], 604 | p[212], 605 | p[213], 606 | p[214], 607 | p[215], 608 | p[216], 609 | p[217], 610 | p[218], 611 | p[219], 612 | p[220], 613 | p[221], 614 | p[222], 615 | p[223], 616 | p[224], 617 | p[225], 618 | p[226], 619 | p[227], 620 | p[228], 621 | p[229], 622 | p[230], 623 | p[231], 624 | p[232], 625 | p[233], 626 | p[234], 627 | p[235], 628 | p[236], 629 | p[237], 630 | p[238], 631 | p[239], 632 | p[240], 633 | p[241], 634 | p[242], 635 | p[243], 636 | p[244], 637 | p[245], 638 | p[246], 639 | p[247], 640 | p[248], 641 | p[249], 642 | p[250], 643 | p[251], 644 | p[252], 645 | p[253], 646 | p[254], 647 | p[255], 648 | ]() 649 | else: 650 | constrained[False]() 651 | return v 652 | -------------------------------------------------------------------------------- /sort_network/test_individual.mojo: 
--------------------------------------------------------------------------------
from algorithm.sort import sort
from time import now
from benchmark import keep

from sort_network.sort_network import (
    sn,
    sn_idx,
    sn_2x_interleave,
    sn_2x_parallel,
)

from sort_network.sort_network_ml import sn_ml_4n

from sort_network.performance import (
    gen_random_SIMD,
    gen_random_vec,
    gen_random_pointer,
    gen_random_DTypePointer,
)


fn test_mojo_sort[T: DType](size: Int):
    """Sort `size` random values with the standard-library `sort` and print
    the values before and after, followed by the elapsed time.

    Args:
        size: Number of elements to generate and sort.
    """
    let values = gen_random_pointer[T](size)

    for k in range(size):
        print_no_newline(str(values[k]) + " ")
    print("")

    # `sort` is handed a mutable copy of the pointer; it addresses the same memory
    var work = values
    let t0_ns = now()
    sort[T](work, size)
    let elapsed_ns = now() - t0_ns

    for k in range(size):
        print_no_newline(str(work[k]) + " ")
    print("\ntime spend " + str(elapsed_ns) + " ns")
    values.free()


fn test_netw_vec_sort[T: DType](size: Int):
    """Sort `size` random values in memory with `sn` and print the values
    before and after, followed by the elapsed time.

    Args:
        size: Number of elements to generate and sort.
    """
    let values = gen_random_DTypePointer[T](size)

    for k in range(size):
        print_no_newline(str(values[k]) + " ")
    print("")

    # mutable copy of the pointer; it addresses the same memory as `values`
    var work = values
    let t0_ns = now()
    sn[T](work, size)
    let elapsed_ns = now() - t0_ns

    for k in range(size):
        print_no_newline(str(work[k]) + " ")
    print("\ntime spend " + str(elapsed_ns) + " ns")
    values.free()


fn test_netw_SIMD_sort[T: DType, channels: Int, ascending: Bool]():
    """Sort one random SIMD vector with `sn` and print input, output and the
    elapsed time.
    """
    let unsorted = gen_random_SIMD[T, channels]()
    print("before " + str(channels) + ": " + str(unsorted))
    let t0_ns = now()
    let sorted_ = sn[T, channels, ascending](unsorted)
    # let data2 = sort_by_counting[T, channels, ascending](data1)
    let elapsed_ns = now() - t0_ns
    print("after " + str(channels) + ": " + str(sorted_))
    # hand the result to `keep` so the sort is not optimised away
    keep(sorted_.reduce_add())
    print("time spend " + str(elapsed_ns) + " ns")


fn test_netw_SIMD_sort_multi_layer[T: DType, ascending: Bool]():
    """Sort one random 128-lane SIMD vector with `sn_ml_4n` and print input,
    output and the elapsed time.
    """
    alias channels: Int = 128
    let unsorted = gen_random_SIMD[T, channels]()
    print("before " + str(channels) + ": " + str(unsorted))
    let t0_ns = now()
    let sorted_ = sn_ml_4n[T, channels, ascending](unsorted)
    let elapsed_ns = now() - t0_ns
    print("after " + str(channels) + ": " + str(sorted_))
    # hand the result to `keep` so the sort is not optimised away
    keep(sorted_.reduce_add())
    print("time spend " + str(elapsed_ns) + " ns")


fn test_netw_SIMD_sort_idx[T1: DType, T2: DType, channels: Int, ascending: Bool]():
    """Sort a random SIMD vector together with the index vector
    `0, 1, ..., channels - 1` using `sn_idx`, printing both vectors before
    and after.
    """
    let values = gen_random_SIMD[T1, channels]()
    var lanes = SIMD[T2, channels]()
    for k in range(channels):
        lanes[k] = k
    print("before: " + String(values))
    print("before: " + String(lanes))
    let sorted_pair = sn_idx[T1, T2, channels, ascending](values, lanes)
    let values_out = sorted_pair.get[0, SIMD[T1, channels]]()
    let lanes_out = sorted_pair.get[1, SIMD[T2, channels]]()
    print("after: " + String(values_out))
    print("after: " + String(lanes_out))


# conclusion comparing test_netw_SIMD_sort_2x_A with test_netw_SIMD_sort_2x_B:
# sort_16element_2x is slightly more efficient, but not much


fn test_netw_SIMD_sort_2x_A[
    T1: DType,
    T2: DType,
    channels: Int,
    ascending1: Bool = True,
    ascending2: Bool = True,
]():
    """Sort two independent random SIMD vectors with two separate `sn` calls
    and print both vectors before and after.
    """
    let first_in = gen_random_SIMD[T1, channels]()
    let second_in = gen_random_SIMD[T2, channels]()

    print("before: " + String(first_in))
    print("before: " + String(second_in))
    let first_out = sn[T1, channels, ascending1](first_in)
    let second_out = sn[T2, channels, ascending2](second_in)
    print("after: " + String(first_out))
    print("after: " + String(second_out))


fn test_netw_SIMD_sort_2x_B[
    T1: DType, T2: DType, ascending1: Bool = True, ascending2: Bool = True
]():
    """Sort two independent random 16-lane SIMD vectors with a single
    `sn_2x_interleave` call and print both vectors before and after.
    """
    alias channels: Int = 16

    let first_in = gen_random_SIMD[T1, channels]()
    let second_in = gen_random_SIMD[T2, channels]()

    print("before: " + String(first_in))
    print("before: " + String(second_in))
    let both_out = sn_2x_interleave[T1, T2, channels, ascending1, ascending2](
        first_in, second_in
    )
    let first_out = both_out.get[0, SIMD[T1, channels]]()
    let second_out = both_out.get[1, SIMD[T2, channels]]()
    print("after: " + String(first_out))
    print("after: " + String(second_out))

    # let data3 = sn_2x_parallel[T1, channels, ascending1](data1a, data1b)
    # let data3a = data2.get[0, SIMD[T1, channels]]()
    # let data3b = data2.get[1, SIMD[T2, channels]]()
    # print("after: " + String(data3a))
    # print("after: " + String(data3b))

--------------------------------------------------------------------------------
/sort_network/test_tools.mojo:
--------------------------------------------------------------------------------
from random import random_ui64


fn gen_random_SIMD[T: DType, width: Int]() -> SIMD[T, width]:
    """Return a SIMD vector whose lanes are filled one by one with
    `random_ui64(0, 100)` cast to `T`.
    """
    var lanes = SIMD[T, width]()
    # TODO: use faster methods
    for k in range(width):
        lanes[k] = random_ui64(0, 100).cast[T]()
    return lanes


fn gen_random_vec[T: DType](size: Int) -> DynamicVector[SIMD[T, 1]]:
    """Return a `DynamicVector` to which `size` values of
    `random_ui64(0, 100)` cast to `T` have been appended.

    Args:
        size: Number of values to append; also passed to the vector's
            constructor.
    """
    var values = DynamicVector[SIMD[T, 1]](size)
    # TODO: use faster methods
    for k in range(size):
        values.push_back(random_ui64(0, 100).cast[T]())
    return values


fn gen_random_pointer[T: DType](size: Int) -> Pointer[SIMD[T, 1]]:
    """Allocate `size` elements with `aligned_alloc(16, size)` and fill them
    with `random_ui64(0, 100)` cast to `T`.

    The allocation is not released here.

    Args:
        size: Number of elements to allocate and fill.
    """
    let block = Pointer[SIMD[T, 1]].aligned_alloc(16, size)
    # TODO: use faster methods
    for k in range(size):
        block[k] = random_ui64(0, 100).cast[T]()
    return block


fn gen_random_DTypePointer[T: DType](size: Int) -> DTypePointer[T, 0]:
    """Allocate `size` elements with `DTypePointer[T].alloc` and fill them
    with `random_ui64(0, 100)` cast to `T`.

    The allocation is not released here.

    Args:
        size: Number of elements to allocate and fill.
    """
    let block = DTypePointer[T].alloc(size)
    # TODO: use faster methods
    for k in range(size):
        block[k] = random_ui64(0, 100).cast[T]()
    return block

--------------------------------------------------------------------------------
/sort_network/tests.mojo:
--------------------------------------------------------------------------------
from algorithm.sort import sort

from sort_network.sort_network import sn_idx, sn
from sort_network.test_tools import gen_random_SIMD


fn test_sort():
    """Check `sn_idx` on one fixed 16-lane input, ascending and descending.

    The sort is evaluated at compile time (`alias`). For each of the four
    observed vectors (data and indices, both directions) a mismatch with the
    hard-coded expectation prints the original, expected and observed
    vectors. "test_sort: DONE" is printed in every case.
    """

    fn eq[T: DType, s: Int](v1: SIMD[T, s], v2: SIMD[T, s]) -> Bool:
        # lane-by-lane equality of two SIMD vectors
        for i in range(s):
            if v1[i] != v2[i]:
                return False
        return True

    # note: lanes 1 and 2 (13, 14) are deliberately not in descending order
    alias data_1 = SIMD[DType.int32, 16](
        15, 13, 14, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
    )
    alias idx_1 = SIMD[DType.int32, 16](
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    )

    alias t1a = sn_idx[channels=16, ascending=True](data_1, idx_1)
    alias data_1a_obs = t1a.get[0, SIMD[DType.int32, 16]]()
    alias data_1a_exp = SIMD[DType.int32, 16](
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    )

    @parameter
    if not eq(data_1a_obs, data_1a_exp):
        print("data_1 org " + str(data_1))
        print("data_1a exp " + str(data_1a_exp))
        print("data_1a obs " + str(data_1a_obs))

    alias idx_1a_obs = t1a.get[1, SIMD[DType.int32, 16]]()
    alias idx_1a_exp = SIMD[DType.int32, 16](
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 1, 2, 0
    )

    @parameter
    if not eq(idx_1a_obs, idx_1a_exp):
        print("idx_1 org " + str(idx_1))
        print("idx_1a exp " + str(idx_1a_exp))
        print("idx_1a obs " + str(idx_1a_obs))

    alias t1b = sn_idx[channels=16, ascending=False](data_1, idx_1)
    alias data_1b_obs = t1b.get[0, SIMD[DType.int32, 16]]()
    alias data_1b_exp = SIMD[DType.int32, 16](
        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
    )

    @parameter
    if not eq(data_1b_obs, data_1b_exp):
        print("data_1 org " + str(data_1))
        print("data_1b exp " + str(data_1b_exp))
        print("data_1b obs " + str(data_1b_obs))

    alias idx_1b_obs = t1b.get[1, SIMD[DType.int32, 16]]()
    alias idx_1b_exp = SIMD[DType.int32, 16](
        0, 2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    )

    @parameter
    if not eq(idx_1b_obs, idx_1b_exp):
        print("idx_1 org " + str(idx_1))
        print("idx_1b exp " + str(idx_1b_exp))
        print("idx_1b obs " + str(idx_1b_obs))

    print("test_sort: DONE")


fn test_sort_N[T: DType, size: Int](n_experiments: Int):
    """Compare `sn` against the standard-library `sort` on random inputs.

    For every experiment a random `size`-lane vector is generated, sorted
    once with `sort` (reference) and once with `sn`, and the results are
    compared lane by lane. The first mismatch prints "NOT equal!" and ends
    the run. The scratch buffer is released on every exit path.

    Args:
        n_experiments: Number of random vectors to test.
    """
    var buff: Pointer[SIMD[T, 1], 0] = Pointer[SIMD[T, 1]].alloc(size)
    for i in range(n_experiments):
        # progress output: header once, then an "x" every 0x10000 experiments
        if i == 0:
            print_no_newline("test_sort_N " + str(size) + ": ")
        elif (i & 0xFFFF) == 0:
            print_no_newline("x")

        let data = gen_random_SIMD[T, size]()
        for j in range(size):
            buff[j] = data[j]

        # sort with Mojo as reference impl
        sort[T](buff, size)

        # sort with SortingNetwork
        let sorted_data = sn[T](data)

        # check if reference and SortingNetwork yield equal results
        for j in range(size):
            if sorted_data[j] != buff[j]:
                print("NOT equal!")
                # release the scratch buffer before bailing out
                buff.free()
                return

    buff.free()
    print(" " + str(n_experiments) + " tests successes")


fn test_sort_X(n_experiments: Int):
    """Run `test_sort_N` for `uint8` vectors of 8, 16, 32, 64 and 128 lanes.

    Args:
        n_experiments: Number of random vectors tested per width.
    """
    test_sort_N[DType.uint8, 8](n_experiments)
    test_sort_N[DType.uint8, 16](n_experiments)
    test_sort_N[DType.uint8, 32](n_experiments)
    test_sort_N[DType.uint8, 64](n_experiments)
    test_sort_N[DType.uint8, 128](n_experiments)

--------------------------------------------------------------------------------
/sort_network/timing_test.mojo:
--------------------------------------------------------------------------------
from benchmark import keep
from time import time_function, now
from random import random_ui64


fn gen_random_SIMD[T: DType, width: Int]() -> SIMD[T, width]:
    """Return a SIMD vector whose lanes are filled one by one with
    `random_ui64(0, 100)` cast to `T`.
    """
    var result = SIMD[T, width]()
    # TODO: use faster methods
    for i in range(width):
        result[i] = random_ui64(0, 100).cast[T]()
    return result


fn main():
    """Time a single `reduce_add` over a random 16-lane `uint32` vector and
    print the difference of the two `now()` readings taken around it.
    """
    let a = gen_random_SIMD[DType.uint32, 16]()
    var b: UInt32 = 0
    let start_time_ns = now()
    b = a.reduce_add()
    let elapsed_time_ns = now() - start_time_ns
    # hand the result to `keep` so the reduction is not optimised away
    keep(b)

    # @parameter
    # fn runner():
    #     b = a.reduce_add()
    #     keep(b)

    # let elapsed_time_ms = time_function[runner]()

    print(elapsed_time_ns)
    # print(b)

--------------------------------------------------------------------------------