├── .envrc ├── .github └── workflows │ └── default.yaml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── go.mod ├── go.sum ├── pkg ├── decode │ ├── decode.go │ ├── decode_amd64.go │ ├── decode_amd64.s │ ├── decode_base.go │ ├── decode_test.go │ ├── gen.go │ └── main │ │ └── asm.go ├── encode │ ├── encode.go │ ├── encode_amd64.go │ ├── encode_amd64.s │ ├── encode_base.go │ ├── encode_test.go │ ├── gen.go │ └── main │ │ └── asm.go ├── pkg_test.go ├── shared │ ├── asm.go │ ├── gen.go │ ├── main │ │ └── gentables.go │ ├── mode.go │ └── tables.go ├── stream │ ├── reader │ │ ├── reader.go │ │ ├── reader_amd64.go │ │ ├── reader_base.go │ │ └── reader_test.go │ └── writer │ │ ├── writer.go │ │ ├── writer_amd64.go │ │ ├── writer_base.go │ │ └── writer_test.go └── util │ ├── rand.go │ ├── util.go │ ├── varint.go │ └── varint_test.go └── tools ├── generate_and_check.sh ├── parse_and_write_bench.go └── update_bench.sh /.envrc: -------------------------------------------------------------------------------- 1 | THIS_DIR="$( realpath "$( dirname "${BASH_SOURCE[0]}" )" )" 2 | export GOPATH=$THIS_DIR/go 3 | export GOBIN=$GOPATH/bin 4 | export SBYTE_HOME=$THIS_DIR 5 | 6 | PATH_add $GOPATH/bin 7 | -------------------------------------------------------------------------------- /.github/workflows/default.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | branches: 8 | - main 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Test 15 | run: make test 16 | generate: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Generate check 21 | run: ./tools/generate_and_check.sh 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs 
and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, built with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | 14 | # Dependency directories (remove the comment below to include it) 15 | # vendor/ 16 | 17 | .idea 18 | go -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 Milan Patel 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | generate: 3 | go generate ./pkg/... 4 | 5 | test: 6 | go test -v ./pkg/... 7 | 8 | update-bench: 9 | ./tools/update_bench.sh 10 | 11 | fmtgo: 12 | find ./pkg -type f -iname "*.go" | xargs gofmt -w -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stream VByte SIMD Go 2 | 3 | ![Tests](https://github.com/theMPatel/streamvbyte-simdgo/actions/workflows/default.yaml/badge.svg) 4 | 5 | This is a repository that contains a port of Stream VByte to Go. Notably, this repo takes extra care 6 | to leverage SIMD techniques to achieve better performance. Currently, there is support for x86_64 architectures 7 | that have AVX and AVX2 hardware instructions. In cases where that is not available, or on non x86_64 architectures 8 | there is a portable scalar implementation. 
We also perform a runtime check to make sure that the necessary 9 | ISA is available and if not fallback to the scalar approach. 10 | 11 | There are several existing implementations: 12 | 13 | 1. [Reference C/C++](https://github.com/lemire/streamvbyte) 14 | 2. [Rust](https://bitbucket.org/marshallpierce/stream-vbyte-rust) 15 | 3. [Go](https://github.com/nelz9999/stream-vbyte-go) 16 | * Note: only has a scalar implementation which prompted this implementation with SIMD techniques. 17 | 18 | ## Benchmarks 19 | 20 | ```text 21 | goos: darwin 22 | goarch: amd64 23 | pkg: github.com/theMPatel/streamvbyte-simdgo/pkg 24 | cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz 25 | -- 26 | MemCopy8Uint32-12 463986302 2.608 ns/op 12269.03 MB/s 27 | 28 | goos: darwin 29 | goarch: amd64 30 | pkg: github.com/theMPatel/streamvbyte-simdgo/pkg/decode 31 | cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz 32 | -- 33 | Get8uint32Fast-12 377839186 3.170 ns/op 10095.99 MB/s 34 | Get8uint32DeltaFast-12 298522095 4.455 ns/op 7183.20 MB/s 35 | Get8uint32Scalar-12 63384603 19.28 ns/op 1659.36 MB/s 36 | Get8uint32DeltaScalar-12 58705828 20.04 ns/op 1596.46 MB/s 37 | Get8uint32Varint-12 27369775 43.77 ns/op 731.10 MB/s 38 | Get8uint32DeltaVarint-12 20924770 57.30 ns/op 558.46 MB/s 39 | 40 | goos: darwin 41 | goarch: amd64 42 | pkg: github.com/theMPatel/streamvbyte-simdgo/pkg/encode 43 | cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz 44 | -- 45 | Put8uint32Fast-12 297620898 3.864 ns/op 8281.18 MB/s 46 | Put8uint32DeltaFast-12 276545827 4.350 ns/op 7356.59 MB/s 47 | Put8uint32Scalar-12 41200776 28.59 ns/op 1119.30 MB/s 48 | Put8uint32DeltaScalar-12 37773458 30.65 ns/op 1044.11 MB/s 49 | Put8uint32Varint-12 58668867 17.20 ns/op 1860.67 MB/s 50 | Put8uint32DeltaVarint-12 61446153 22.88 ns/op 1398.80 MB/s 51 | 52 | goos: darwin 53 | goarch: amd64 54 | pkg: github.com/theMPatel/streamvbyte-simdgo/pkg/stream/reader 55 | cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz 56 | -- 57 | ReadAllFast/Count_1e0-12 
99354789 12.24 ns/op 326.80 MB/s 58 | ReadAllFast/Count_1e1-12 28076071 42.81 ns/op 934.43 MB/s 59 | ReadAllFast/Count_1e2-12 11041639 107.2 ns/op 3730.16 MB/s 60 | ReadAllFast/Count_1e3-12 1645387 729.9 ns/op 5480.00 MB/s 61 | ReadAllFast/Count_1e4-12 170894 7034 ns/op 5686.52 MB/s 62 | ReadAllFast/Count_1e5-12 16848 70969 ns/op 5636.29 MB/s 63 | ReadAllFast/Count_1e6-12 1513 728516 ns/op 5490.62 MB/s 64 | ReadAllFast/Count_1e7-12 152 7835111 ns/op 5105.22 MB/s 65 | ReadAllDeltaFast/Count_1e0-12 92727970 13.10 ns/op 305.44 MB/s 66 | ReadAllDeltaFast/Count_1e1-12 26164140 45.89 ns/op 871.61 MB/s 67 | ReadAllDeltaFast/Count_1e2-12 9458992 128.5 ns/op 3113.55 MB/s 68 | ReadAllDeltaFast/Count_1e3-12 1277408 934.4 ns/op 4280.69 MB/s 69 | ReadAllDeltaFast/Count_1e4-12 144405 8318 ns/op 4808.88 MB/s 70 | ReadAllDeltaFast/Count_1e5-12 14444 83151 ns/op 4810.55 MB/s 71 | ReadAllDeltaFast/Count_1e6-12 1426 846305 ns/op 4726.43 MB/s 72 | ReadAllDeltaFast/Count_1e7-12 127 9337355 ns/op 4283.87 MB/s 73 | ReadAllScalar/Count_1e0-12 122650209 9.770 ns/op 409.43 MB/s 74 | ReadAllScalar/Count_1e1-12 38012136 31.63 ns/op 1264.64 MB/s 75 | ReadAllScalar/Count_1e2-12 4999376 241.6 ns/op 1655.30 MB/s 76 | ReadAllScalar/Count_1e3-12 500337 2459 ns/op 1626.38 MB/s 77 | ReadAllScalar/Count_1e4-12 50247 24034 ns/op 1664.34 MB/s 78 | ReadAllScalar/Count_1e5-12 5032 238354 ns/op 1678.17 MB/s 79 | ReadAllScalar/Count_1e6-12 499 2405669 ns/op 1662.74 MB/s 80 | ReadAllScalar/Count_1e7-12 46 24533207 ns/op 1630.44 MB/s 81 | ReadAllDeltaScalar/Count_1e0-12 100000000 10.32 ns/op 387.49 MB/s 82 | ReadAllDeltaScalar/Count_1e1-12 36915704 32.52 ns/op 1230.08 MB/s 83 | ReadAllDeltaScalar/Count_1e2-12 4818140 249.8 ns/op 1601.58 MB/s 84 | ReadAllDeltaScalar/Count_1e3-12 512492 2374 ns/op 1685.20 MB/s 85 | ReadAllDeltaScalar/Count_1e4-12 51004 23639 ns/op 1692.11 MB/s 86 | ReadAllDeltaScalar/Count_1e5-12 3568 333168 ns/op 1200.60 MB/s 87 | ReadAllDeltaScalar/Count_1e6-12 520 2304864 ns/op 1735.46 MB/s 
88 | ReadAllDeltaScalar/Count_1e7-12 48 24810555 ns/op 1612.22 MB/s 89 | ReadAllVarint/Count_1e0-12 121348074 9.967 ns/op 401.34 MB/s 90 | ReadAllVarint/Count_1e1-12 21056739 57.34 ns/op 697.64 MB/s 91 | ReadAllVarint/Count_1e2-12 2025081 589.0 ns/op 679.15 MB/s 92 | ReadAllVarint/Count_1e3-12 205881 5851 ns/op 683.69 MB/s 93 | ReadAllVarint/Count_1e4-12 20906 57446 ns/op 696.31 MB/s 94 | ReadAllVarint/Count_1e5-12 2037 580620 ns/op 688.92 MB/s 95 | ReadAllVarint/Count_1e6-12 208 5755083 ns/op 695.04 MB/s 96 | ReadAllVarint/Count_1e7-12 20 57872736 ns/op 691.17 MB/s 97 | ReadAllDeltaVarint/Count_1e0-12 139763250 8.318 ns/op 480.87 MB/s 98 | ReadAllDeltaVarint/Count_1e1-12 19199100 62.49 ns/op 640.11 MB/s 99 | ReadAllDeltaVarint/Count_1e2-12 2149660 556.6 ns/op 718.65 MB/s 100 | ReadAllDeltaVarint/Count_1e3-12 207122 5810 ns/op 688.41 MB/s 101 | ReadAllDeltaVarint/Count_1e4-12 22680 53200 ns/op 751.88 MB/s 102 | ReadAllDeltaVarint/Count_1e5-12 2145 500177 ns/op 799.72 MB/s 103 | ReadAllDeltaVarint/Count_1e6-12 228 5262741 ns/op 760.06 MB/s 104 | ReadAllDeltaVarint/Count_1e7-12 27 42000722 ns/op 952.36 MB/s 105 | 106 | goos: darwin 107 | goarch: amd64 108 | pkg: github.com/theMPatel/streamvbyte-simdgo/pkg/stream/writer 109 | cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz 110 | -- 111 | WriteAllFast/Count_1e0-12 54152408 22.05 ns/op 181.36 MB/s 112 | WriteAllFast/Count_1e1-12 27681948 43.17 ns/op 926.49 MB/s 113 | WriteAllFast/Count_1e2-12 7136480 167.0 ns/op 2395.79 MB/s 114 | WriteAllFast/Count_1e3-12 928952 1273 ns/op 3141.14 MB/s 115 | WriteAllFast/Count_1e4-12 96117 12012 ns/op 3329.93 MB/s 116 | WriteAllFast/Count_1e5-12 9718 114260 ns/op 3500.80 MB/s 117 | WriteAllFast/Count_1e6-12 879 1242927 ns/op 3218.21 MB/s 118 | WriteAllFast/Count_1e7-12 100 10368754 ns/op 3857.74 MB/s 119 | WriteAllDeltaFast/Count_1e0-12 50489378 23.38 ns/op 171.06 MB/s 120 | WriteAllDeltaFast/Count_1e1-12 26866423 45.03 ns/op 888.33 MB/s 121 | WriteAllDeltaFast/Count_1e2-12 6695125 175.8 
ns/op 2275.37 MB/s 122 | WriteAllDeltaFast/Count_1e3-12 899895 1391 ns/op 2875.71 MB/s 123 | WriteAllDeltaFast/Count_1e4-12 90394 12958 ns/op 3086.82 MB/s 124 | WriteAllDeltaFast/Count_1e5-12 10000 122319 ns/op 3270.13 MB/s 125 | WriteAllDeltaFast/Count_1e6-12 945 1249546 ns/op 3201.16 MB/s 126 | WriteAllDeltaFast/Count_1e7-12 100 11461852 ns/op 3489.84 MB/s 127 | WriteAllScalar/Count_1e0-12 56106489 21.72 ns/op 184.18 MB/s 128 | WriteAllScalar/Count_1e1-12 18309972 65.09 ns/op 614.51 MB/s 129 | WriteAllScalar/Count_1e2-12 2776918 433.5 ns/op 922.63 MB/s 130 | WriteAllScalar/Count_1e3-12 289309 4209 ns/op 950.38 MB/s 131 | WriteAllScalar/Count_1e4-12 29497 40884 ns/op 978.38 MB/s 132 | WriteAllScalar/Count_1e5-12 3027 399959 ns/op 1000.10 MB/s 133 | WriteAllScalar/Count_1e6-12 296 4010161 ns/op 997.47 MB/s 134 | WriteAllScalar/Count_1e7-12 28 38753790 ns/op 1032.16 MB/s 135 | WriteAllDeltaScalar/Count_1e0-12 54981757 21.90 ns/op 182.65 MB/s 136 | WriteAllDeltaScalar/Count_1e1-12 17823349 67.10 ns/op 596.14 MB/s 137 | WriteAllDeltaScalar/Count_1e2-12 2711672 442.4 ns/op 904.09 MB/s 138 | WriteAllDeltaScalar/Count_1e3-12 292664 4130 ns/op 968.62 MB/s 139 | WriteAllDeltaScalar/Count_1e4-12 29340 41014 ns/op 975.28 MB/s 140 | WriteAllDeltaScalar/Count_1e5-12 2289 516113 ns/op 775.02 MB/s 141 | WriteAllDeltaScalar/Count_1e6-12 302 3930860 ns/op 1017.59 MB/s 142 | WriteAllDeltaScalar/Count_1e7-12 30 41357670 ns/op 967.17 MB/s 143 | WriteAllVarint/Count_1e0-12 208214545 5.720 ns/op 699.32 MB/s 144 | WriteAllVarint/Count_1e1-12 43083270 28.02 ns/op 1427.34 MB/s 145 | WriteAllVarint/Count_1e2-12 4972045 242.8 ns/op 1647.67 MB/s 146 | WriteAllVarint/Count_1e3-12 499011 2409 ns/op 1660.60 MB/s 147 | WriteAllVarint/Count_1e4-12 51022 23590 ns/op 1695.67 MB/s 148 | WriteAllVarint/Count_1e5-12 5216 231741 ns/op 1726.07 MB/s 149 | WriteAllVarint/Count_1e6-12 518 2305364 ns/op 1735.08 MB/s 150 | WriteAllVarint/Count_1e7-12 50 24905825 ns/op 1606.05 MB/s 151 | 
WriteAllDeltaVarint/Count_1e0-12 175269966 6.792 ns/op 588.93 MB/s 152 | WriteAllDeltaVarint/Count_1e1-12 51799438 23.38 ns/op 1710.63 MB/s 153 | WriteAllDeltaVarint/Count_1e2-12 5417458 221.3 ns/op 1807.60 MB/s 154 | WriteAllDeltaVarint/Count_1e3-12 539414 2243 ns/op 1783.48 MB/s 155 | WriteAllDeltaVarint/Count_1e4-12 52717 22753 ns/op 1757.99 MB/s 156 | WriteAllDeltaVarint/Count_1e5-12 5716 210456 ns/op 1900.63 MB/s 157 | WriteAllDeltaVarint/Count_1e6-12 495 2453672 ns/op 1630.21 MB/s 158 | WriteAllDeltaVarint/Count_1e7-12 70 17491186 ns/op 2286.87 MB/s 159 | ``` 160 | 161 | A note on the benchmarks: An array of random uint32's is generated and then encoded/decoded over 162 | and over again. An attempt is made to ensure that some of these benchmarks reflect the most probable 163 | real world performance metrics. 164 | 165 | --- 166 | Stream VByte uses the same underlying format as Google's Group Varint approach. Lemire et al. wanted 167 | to see if there was a way to improve the performance even more and introduced a clever twist to enable 168 | better performance via SIMD techniques. The basic goal of the Group Varint format is to be able to 169 | achieve similar compression characteristics as the VByte format for integers and also be able to load 170 | and process them really quickly. 171 | 172 | ## VByte format 173 | 174 | The insight that backs the VByte encoding is noticing that you oftentimes don't need 32 bits to 175 | encode a 32-bit integer. Take for example an unsigned integer that is less than 2^8 (256). This 176 | integer will have bits set in the lowest byte of a 32-bit integer, while the remaining 3 bytes will 177 | simply be zeros. 178 | 179 | ``` 180 | 111 in binary: 181 | 182 | 00000000 00000000 00000000 01101111 183 | ``` 184 | 185 | An approach you can take to compress this integer is to encode the integer using a variable 186 | number of bytes. For example, you can use the lower 7 bits to store data, i.e. 
bits 187 | from the original integer, and then use the MSB as a continuation bit. If the MSB is on, i.e. 188 | is 1, then more bytes are needed to decode this particular integer. Below is an example where 189 | you might need 2 bytes to store the number 1234. 190 | 191 | ``` 192 | 1234 in binary: 193 | 194 | 00000000 00000000 00000100 11010010 195 | 196 | Num compressed: 197 | 198 | v v Continuation bits 199 | 0|0001001| 1|1010010| 200 | ^ ^ Data bits 201 | ``` 202 | 203 | If you want to decode this integer, you simply build up the number iteratively. That is, you OR the 204 | low 7 bits of every byte, shifted by the appropriate amount, into your 32-bit integer until you 205 | find a byte that doesn't have a continuation bit set. Note that this works the same for 64-bit 206 | numbers. 207 | 208 | The problem with this approach is that it can introduce a lot of branch mis-predictions during encoding/decoding. 209 | During the decoding phase, you don't know ahead of time the number of bytes that were used to encode the integer 210 | you are currently processing and so you need to iterate until you find a byte without a continuation bit on. 211 | If you have integers that are nonuniform, i.e. integers that require random numbers of bytes to encode relative 212 | to one another, this can pose a challenge to the processor's branch predictor. These mis-predictions can cause 213 | major slowdowns in processor pipelines and so was born the Group Varint format. 214 | 215 | ## Group Varint format 216 | 217 | The Group Varint (varint-GB) format assumes that everything you hope to achieve, you can do with 32-bit integers. 218 | It introduces the concept of a control byte which is simply a byte that stores the encoded 219 | lengths of a group of 4 32-bit integers, hence Group Varint. 32-bit integers only require up to 4 bytes 220 | to properly encode. This means that you can represent their lengths with 2 bits using a zero-indexed length 221 | i.e.
0, 1, 2, and 3 to represent integers that require 1, 2, 3 and 4 bytes to encode, respectively. 222 | 223 | ``` 224 | 00000000 00000000 00000000 01101111 = 111 225 | 00000000 00000000 00000100 11010010 = 1234 226 | 00000000 00001100 00001010 10000011 = 789123 227 | 01000000 00000000 00000000 00000000 = 1073741824 228 | 229 | Num Len 2-bit control 230 | ---------------------------------- 231 | 111 1 0b00 232 | 1234 2 0b01 233 | 789123 3 0b10 234 | 1073741824 4 0b11 235 | 236 | Final Control byte 237 | 0b11100100 238 | 239 | Encoded data (little endian right-to-left bottom-to-top) 240 | 0b01000000 0b00000000 0b00000000 0b00000000 0b00001100 241 | 0b00001010 0b10000011 0b00000100 0b11010010 0b01101111 242 | ``` 243 | 244 | You can then prefix every group of 4 encoded 32-bit integers with their control byte and then use it during decoding. 245 | The obvious downside is that you pay a storage cost of one byte for every 4 integers you want to encode. For 2^20 246 | encoded integers, that's an extra 256 KB of space: totally marginal. The great upside, though, is that 247 | you've now removed almost all branches from your decoding phase. You know exactly how many data bytes you need 248 | to read from a buffer for a particular number and then can use branchless decoding.
249 | 250 | ```go 251 | package foo 252 | 253 | import ( 254 | "encoding/binary" 255 | ) 256 | 257 | func decodeOne(input []byte, size uint8) uint32 { 258 | buf := make([]byte, 4) 259 | copy(buf, input[:size]) 260 | 261 | // func (littleEndian) Uint32(b []byte) uint32 { 262 | // _ = b[3] 263 | // return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 264 | // } 265 | return binary.LittleEndian.Uint32(buf) 266 | } 267 | 268 | func main() { 269 | ctrl := uint8(0b11_10_01_00) 270 | data := []byte{ 271 | 0b01101111, 0b11010010, 0b00000100, 272 | 0b10000011, 0b00001010, 0b00001100, 273 | 0b00000000, 0b00000000, 0b00000000, 274 | 0b01000000, 275 | } 276 | 277 | len0 := (ctrl & 3) + 1 // 1 278 | len1 := (ctrl >> 2 & 3) + 1 // 2 279 | len2 := (ctrl >> 4 & 3) + 1 // 3 280 | len3 := (ctrl >> 6 & 3) + 1 // 4 281 | 282 | _ = decodeOne(data, len0) // 111 283 | _ = decodeOne(data[len0:], len1) // 1234 284 | _ = decodeOne(data[len0+len1:], len2) // 789_123 285 | _ = decodeOne(data[len0+len1+len2:], len3) // 1_073_741_824 286 | } 287 | ``` 288 | 289 | ## Stream VByte format 290 | 291 | Unfortunately, accelerating decoding of the varint-GB format with only SIMD techniques 292 | has proven unsuccessful. The below excerpt from the paper outlines why. 293 | 294 | > To understand why it might be difficult to accelerate the decoding of data compressed in the VARINT-GB 295 | > format compared to the VARINT-G8IU format, consider that we cannot decode faster than we can access the 296 | > control bytes. In VARINT-G8IU, the control bytes are conveniently always located nine compressed bytes 297 | > apart. Thus while a control byte is being processed, or even before, our superscalar processor can load 298 | > and start processing upcoming control bytes, as their locations are predictable. Instructions depending 299 | > on these control bytes can be reordered by the processor for best performance. 
However, in the VARINT-GB 300 | > format, there is a strong data dependency: the location of the next control byte depends on the current 301 | > control byte. This increases the risk that the processor remains underutilized, delayed by the latency 302 | > between issuing the load for the next control byte and waiting for it to be ready. 303 | 304 | Additionally, they prove that decoding 4 integers at a time using 128-bit registers is faster than trying 305 | to decode a variable number of integers that fit into an 8-byte register, i.e. the varint-G8IU approach. 306 | 307 | ### SIMD control byte generation algorithm 308 | 309 | Lemire et al. have devised a brilliant SIMD algorithm for simultaneously generating two control bytes 310 | for a group of 8 integers. The best way to understand this algorithm is to understand how it works on 311 | a single integer and then assume it works in a vectorized form (it does). Going forward we'll use 312 | *control bits stream* to represent these control bytes we are building. 313 | 314 | ``` 315 | 00000000 00000000 00000100 11010010 // 1234 316 | ``` 317 | 318 | Let's take one of the previous integers that we were looking at, `1234`, and walk through an example 319 | of how the 2-bit control is generated for it using SIMD techniques. The goal is to be able to, for 320 | any 32-bit integer, generate a 2-bit zero indexed length value. For example, if you have an integer 321 | that requires 2 bytes to be encoded, we want for the algorithm to generate `0b01`. 322 | 323 | ``` 324 | 00000000 00000000 00000100 11010010 // 1234 325 | 00000001 00000001 00000001 00000001 // 0x0101 mask 326 | ----------------------------------- // byte-min(1234, 0x0101) 327 | 00000000 00000000 00000001 00000001 328 | ``` 329 | 330 | The algorithm first uses a mask where every byte is equal to 1. 
If you perform a per-byte min operation 331 | on our integer and the 1's mask, the result will have a 1 at every byte that had a value in the original 332 | integer. 333 | 334 | ``` 335 | 00000000 00000000 00000001 00000001 336 | ----------------------------------- // pack unsigned saturating 16-bit to 8-bit 337 | 00000000 00000000 00000000 11111111 338 | ``` 339 | 340 | Now you perform a 16-bit to 8-bit unsigned saturating pack operation. Practically this means that you're 341 | taking every 16-bit value and trying to shove that into 8 bits. If the 16-bit integer is larger than 342 | the largest unsigned integer 8 bits can support, the pack saturates to the largest unsigned 8-bit value. 343 | 344 | Why this is performed will become more clear in the subsequent steps, however, at a high level, for every 345 | integer you want to encode, you want for the MSB of two consecutive bytes in the control bits stream 346 | to be representative of the final 2-bit control. For example, if you have a 3-byte integer, you want the 347 | MSB of two consecutive bytes to be 1 and 0, in that order. The reason you would want this is that 348 | there is a vector pack instruction that takes the MSB from every byte in the control bits stream 349 | and packs it into the lowest byte. This would thus represent the value `0b10` in the final byte for 350 | this 3-byte integer, which is what we want. 351 | 352 | Performing a 16-bit to 8-bit unsigned saturating pack has the effect that you can use the saturation 353 | behavior to conditionally turn on the MSB of these bytes depending on which bytes have values in the 354 | original 32-bit integer. 355 | 356 | ``` 357 | 00000000 00000000 00000000 11111111 // control bits stream 358 | 00000001 00000001 00000001 00000001 // 0x0101 mask 359 | ----------------------------------- // signed 16-bit min 360 | 00000000 00000000 00000000 11111111 361 | ``` 362 | 363 | We then take the 1's mask we used before and perform a __signed 16-bit__ min operation. 
The reason for this 364 | is more clear if you look at an example using a 3-byte integer. 365 | 366 | ``` 367 | 00000000 00001100 00001010 10000011 // 789123 368 | 00000001 00000001 00000001 00000001 // 0x0101 mask 369 | ----------------------------------- // byte-min(789123, 0x0101) 370 | 00000000 00000001 00000001 00000001 371 | ----------------------------------- // pack unsigned saturating 16-bit to 8-bit 372 | 00000000 00000000 00000001 11111111 373 | 00000001 00000001 00000001 00000001 // 0x0101 mask 374 | ----------------------------------- // signed 16-bit min 375 | 00000000 00000000 00000001 00000001 376 | ``` 377 | 378 | The signed 16-bit min operation has three important effects. 379 | 380 | First, for 3-byte integers, it has the effect of turning off the MSB of the lowest byte. This is necessary 381 | because a 3-byte integer should have a 2-bit control that is `0b10` and without this step using the MSB pack 382 | operation would result in a 2-bit control that looks something like `0b_1`, where the lowest bit is on. 383 | Obviously this is wrong, since only integers that require 2 or 4 bytes to encode should have that lower bit 384 | on, i.e. 1 or 3 as a zero-indexed length. 385 | 386 | Second, for 4-byte integers, the signed aspect has the effect of leaving both MSBs of the 2 bytes on. When using the 387 | MSB pack operation later on, it will result in a 2-bit control value of `0b11`, which is what we want. 388 | 389 | Third, for 1 and 2 byte integers, it has no effect. This is great for 2-byte values since the MSB will remain on 390 | and 1 byte values will not have any MSB on anyways, so it is effectively a noop in both scenarios. 
391 | 392 | ``` 393 | 00000000 00000000 00000000 11111111 // control bits stream (original 1234) 394 | 01111111 00000000 01111111 00000000 // 0x7F00 mask 395 | ----------------------------------- // add unsigned saturating 16-bit 396 | 01111111 00000000 01111111 11111111 397 | ``` 398 | 399 | Next, we take a mask with the value `0x7F00` and perform an unsigned saturating add to the control bits stream. 400 | In the case for the integer `1234` this has no real effect. We maintain the MSB in the lowest byte. You'll note, 401 | however, that the only byte that has its MSB on is the last one, so performing an MSB pack operation would result 402 | in a value of `0b0001`, which is what we want. An example of this step on the integer `789123` might paint a clearer 403 | picture. 404 | 405 | ``` 406 | 00000000 00000000 00000001 00000001 // control bits stream (789123) 407 | 01111111 00000000 01111111 00000000 // 0x7F00 mask 408 | ----------------------------------- // add unsigned saturating 16-bit 409 | 01111111 00000000 10000000 00000001 410 | ``` 411 | 412 | You'll note here that the addition of `0x01` with `0x7F` in the upper byte results in the MSB of the resulting upper 413 | byte turning on. The MSB in the lower byte remains off and now an MSB pack operation will resolve to `0b0010`, 414 | which is what we want. The unsigned saturation behavior is really important for 4-byte numbers that only have 415 | bits in the most significant byte on.
An example below: 416 | 417 | ``` 418 | 01000000 00000000 00000000 00000000 // 1073741824 419 | 00000001 00000001 00000001 00000001 // 0x0101 mask 420 | ----------------------------------- // byte-min(1073741824, 0x0101) 421 | 00000001 00000000 00000000 00000000 422 | ----------------------------------- // pack unsigned saturating 16-bit to 8-bit 423 | 00000000 00000000 11111111 00000000 424 | 00000001 00000001 00000001 00000001 // 0x0101 mask 425 | ----------------------------------- // signed 16-bit min 426 | 00000000 00000000 11111111 00000000 427 | 01111111 00000000 01111111 00000000 // 0x7F00 mask 428 | ----------------------------------- // add unsigned saturating 16-bit 429 | 01111111 00000000 11111111 11111111 430 | ``` 431 | 432 | Note here that because only the upper byte had a value in it, the lowest byte in the control bits stream remains 433 | zero for the duration of the algorithm. This poses an issue, since for a 4-byte value, we want for the 2-bit 434 | control to result in a value of `0b11`. Performing a 16-bit unsigned *saturating* addition has the effect of 435 | turning on all bits in the lower byte, and thus we get a result with the MSB in the lower byte on. 436 | 437 | ``` 438 | 01111111 00000000 10000000 00000001 // control bits stream (789123) 439 | ----------------------------------- // move byte mask 440 | 00000000 00000000 00000000 00000010 // 2-bit control 441 | ``` 442 | 443 | The final move byte mask is performed on the control bits stream, and we now have the result we wanted. Now that you 444 | see that this works for 1 integer, you know how it can work for 8 integers simultaneously, since we use vector 445 | instructions that operate on 128 bit registers. 446 | 447 | ### SIMD integer packing/unpacking 448 | 449 | The next problem to be solved is how to take a group of 4 integers, and compress it by removing extraneous/unused 450 | bytes so that all you're left with is a stream of data bytes with real information.
Let's take two numbers from 451 | our examples above. 452 | 453 | ``` 454 | 789123 1234 455 | 00000000 00001100 00001010 10000011 | 00000000 00000000 00000100 11010010 456 | ------------------------------------------------------------------------- 457 | 00001100 00001010 10000011 00000100 11010010 // packed 458 | ``` 459 | 460 | Here, we can use a shuffle operation. Vector shuffle operations rearrange the bytes in an input register according 461 | to some provided mask into a destination register. Every position in the mask stores an offset into the source 462 | vector stream that represents the data byte that should go into that position. 463 | 464 | ``` 465 | input [1234, 789123] (little endian R-to-L) 466 | 00000000 00001100 00001010 10000011 00000000 00000000 00000100 11010010 467 | | | | | | 468 | | | |____________________ | | 469 | | |_____________________ | | | 470 | |____________________ | | | | 471 | v v v v v 472 | 0xff 0xff 0xff 0x06 0x05 0x04 0x01 0x00 // mask in hex 473 | ----------------------------------------------------------------------- 474 | 00000000 00000000 00000000 00001100 00001010 10000011 00000100 11010010 // packed 475 | ``` 476 | 477 | We keep a prebuilt lookup table that contains a mapping from control byte to the necessary mask and simply 478 | load that after we construct the control byte above. In addition, we keep a lookup table for a mapping from 479 | control bytes to total encoded length. This allows us to know by how much to increment the output pointer and 480 | overwrite, for example, the redundant upper 3 bytes in the above shuffle example. 481 | 482 | Unpacking during decoding is the same as the above, but in reverse. We need to go from a packed format 483 | to an unpacked memory format. We keep lookup tables to maintain a mapping from control byte to the reverse 484 | shuffle mask, and then perform a shuffle operation to output to an `uint32` array. 
485 | 486 | # References 487 | 488 | [Stream VByte: Faster Byte-Oriented Integer Compression](https://arxiv.org/pdf/1709.08990.pdf) 489 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/theMPatel/streamvbyte-simdgo 2 | 3 | go 1.16 4 | 5 | require ( 6 | github.com/mmcloughlin/avo v0.2.0 7 | github.com/pkg/errors v0.9.1 8 | golang.org/x/sys v0.0.0-20210510120138-977fb7262007 9 | golang.org/x/tools v0.1.5 // indirect 10 | ) 11 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/mmcloughlin/avo v0.2.0 h1:6vhoSaKtxb6f4RiH+LK2qL6GSMpFzhEwJYTTSZNy09w= 2 | github.com/mmcloughlin/avo v0.2.0/go.mod h1:5tidO2Z9Z7N6X7UMcGg+1KTj51O8OxYDCMHxCZTVpEA= 3 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 4 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 5 | github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 6 | github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= 7 | golang.org/x/arch v0.0.0-20210405154355-08b684f594a5/go.mod h1:flIaEI6LNU6xOCD5PaJvn9wGP0agmIOqjrtsKGRguv4= 8 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 9 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= 10 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 11 | golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 12 | golang.org/x/mod v0.4.2 h1:Gz96sIWK3OalVv/I/qNygP42zyoKp3xptRVCWRFEBvo= 13 | golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= 14 | golang.org/x/net 
v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 15 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 16 | golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 17 | golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= 18 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 19 | golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 20 | golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 21 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 22 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 23 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 24 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 25 | golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 26 | golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 27 | golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 28 | golang.org/x/sys v0.0.0-20210510120138-977fb7262007 h1:gG67DSER+11cZvqIMb8S8bt0vZtiN6xWYARwirrOSfE= 29 | golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 30 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 31 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 32 | golang.org/x/text 
v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 33 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 34 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 35 | golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= 36 | golang.org/x/tools v0.1.5 h1:ouewzE6p+/VEB31YYnTbEJdi8pFqKp4P4n85vwo3DHA= 37 | golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= 38 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 39 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 40 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= 41 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 42 | rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= 43 | -------------------------------------------------------------------------------- /pkg/decode/decode.go: -------------------------------------------------------------------------------- 1 | package decode 2 | 3 | import ( 4 | "github.com/theMPatel/streamvbyte-simdgo/pkg/shared" 5 | ) 6 | 7 | var ( 8 | getImpl Get8Impl 9 | getDeltaImpl Get8DeltaImpl 10 | ) 11 | 12 | type Get8Impl func(in []byte, out []uint32, ctrl uint16) 13 | type Get8DeltaImpl func(in []byte, out []uint32, ctrl uint16, prev uint32) 14 | 15 | func init() { 16 | if GetMode() == shared.Fast { 17 | getImpl = Get8uint32Fast 18 | getDeltaImpl = Get8uint32DeltaFast 19 | } else { 20 | getImpl = Get8uint32Scalar 21 | getDeltaImpl = Get8uint32DeltaScalar 22 | } 23 | } 24 | 25 | // Get8uint32 is a general func you can use to decode 8 uint32's at a time. 
26 | // It will use the fastest implementation available determined during 27 | // package initialization. If your CPU supports special hardware instructions 28 | // then it will use an accelerated version of Stream VByte. Otherwise, the 29 | // scalar implementation will be used as the fallback. 30 | func Get8uint32(in []byte, out []uint32, ctrl uint16) { 31 | getImpl(in, out, ctrl) 32 | } 33 | 34 | // Get8uint32Delta is a general func you can use to decode 8 differentially coded 35 | // uint32's at a time. It will use the fastest implementation available determined 36 | // during package initialization. If your CPU supports special hardware instructions 37 | // then it will use an accelerated version of Stream VByte. Otherwise, the 38 | // scalar implementation will be used as the fallback. 39 | func Get8uint32Delta(in []byte, out []uint32, ctrl uint16, prev uint32) { 40 | getDeltaImpl(in, out, ctrl, prev) 41 | } 42 | 43 | // Get8uint32Scalar will decode 8 uint32 values from in into out using the 44 | // Stream VByte format. The low byte of ctrl describes the first four values 45 | // and the high byte the last four. Nothing is returned. 46 | // 47 | // Note: It is your responsibility to ensure that the incoming slices have 48 | // the appropriate sizes and data otherwise this func will panic. 49 | func Get8uint32Scalar(in []byte, out []uint32, ctrl uint16) { 50 | lower := uint8(ctrl & 0xff) 51 | upper := uint8(ctrl >> 8) 52 | lowerSize := shared.ControlByteToSize(lower) 53 | Get4uint32Scalar(in, out, lower) 54 | Get4uint32Scalar(in[lowerSize:], out[4:], upper) 55 | } 56 | 57 | // Get4uint32Scalar will decode 4 uint32 values from in into out using the 58 | // Stream VByte format. The byte length of each value is looked up from ctrl 59 | // in shared.PerNumLenTable. Nothing is returned. 60 | // 61 | // Note: It is your responsibility to ensure that the incoming slices have 62 | // the appropriate sizes and data otherwise this func will panic.
63 | func Get4uint32Scalar(in []byte, out []uint32, ctrl uint8) { 64 | sizes := shared.PerNumLenTable[ctrl] 65 | 66 | len3 := sizes[3] 67 | len2 := sizes[2] 68 | len1 := sizes[1] 69 | len0 := sizes[0] 70 | 71 | out[3] = decodeOne(in[len0+len1+len2:], len3) 72 | out[2] = decodeOne(in[len0+len1:], len2) 73 | out[1] = decodeOne(in[len0:], len1) 74 | out[0] = decodeOne(in, len0) 75 | } 76 | 77 | // GetUint32Scalar decodes up to 4 integers from in into out using the 78 | // Stream VByte format. 79 | // 80 | // Note: It is your responsibility to ensure that the incoming slices have 81 | // the appropriate sizes and data otherwise this func will panic. 82 | func GetUint32Scalar(in []byte, out []uint32, ctrl uint8, count int) int { 83 | if count == 0 { 84 | return 0 85 | } 86 | 87 | if count > 4 { 88 | count = 4 89 | } 90 | 91 | shift := 0 92 | total := 0 93 | for i := 0; i < count; i++ { 94 | size := ((ctrl >> shift) & 0x3) + 1 95 | out[i] = decodeOne(in[total:], size) 96 | total += int(size) 97 | shift += 2 98 | } 99 | 100 | return total 101 | } 102 | 103 | // GetUint32DeltaScalar decodes up to 4 integers from in into out using the 104 | // Stream VByte format. It will reconstruct the original non differentially 105 | // encoded values. 106 | // 107 | // Note: It is your responsibility to ensure that the incoming slices have 108 | // the appropriate sizes and data otherwise this func will panic. 
109 | func GetUint32DeltaScalar(in []byte, out []uint32, ctrl uint8, count int, prev uint32) int { 110 | if count == 0 { 111 | return 0 112 | } 113 | 114 | if count > 4 { 115 | count = 4 116 | } 117 | 118 | shift := 0 119 | total := 0 120 | for i := 0; i < count; i++ { 121 | size := ((ctrl >> shift) & 0x3) + 1 122 | num := decodeOne(in[total:], size) + prev 123 | out[i] = num 124 | prev = num 125 | total += int(size) 126 | shift += 2 127 | } 128 | 129 | return total 130 | } 131 | 132 | // Get8uint32DeltaScalar will decode 8 uint32 values from in into out and reconstruct 133 | // the original values via differential coding. Prev provides a way for you to 134 | // indicate the base value for this batch of 8. For example, when decoding the second 135 | // batch of 8 integers out of, e.g. 16, you would provide a prev value of the last value 136 | // in the first batch of 8 you decoded. This is done to ensure that the integers are 137 | // correctly resolved to the correct diff. An example below. 138 | // 139 | // Input: [ 10, 10, 10, 10, 10, 10, 10, 10 ] [ 10, 10, 10, 10, 10, 10, 10, 10 ] 140 | // Output: [ 10, 20, 30, 40, 50, 60, 70, 80 ] [ 90, 100, 110, 120, 130, 140, 150, 160 ] 141 | // Prev: 80 142 | func Get8uint32DeltaScalar(in []byte, out []uint32, ctrl uint16, prev uint32) { 143 | lower := uint8(ctrl & 0xff) 144 | upper := uint8(ctrl >> 8) 145 | lowerSize := shared.ControlByteToSize(lower) 146 | Get4uint32DeltaScalar(in, out, lower, prev) 147 | Get4uint32DeltaScalar(in[lowerSize:], out[4:], upper, out[3]) 148 | } 149 | 150 | // Get4uint32DeltaScalar will decode 4 uint32 values from in into out and reconstruct 151 | // the original values via differential coding. Prev provides a way for you to 152 | // indicate the base value for this batch of 4. For example, when decoding the second 153 | // batch of 4 integers out of, e.g. 8, you would provide a prev value of the last value 154 | // in the first batch of 4 you decoded. 
This is done to ensure that the integers are 155 | // correctly resolved to the correct diff. An example below. 156 | // 157 | // Input: [ 10, 10, 10, 10 ] [ 10, 10, 10, 10 ] 158 | // Output: [ 10, 20, 30, 40 ] [ 50, 60, 70, 80 ] 159 | // Prev: 40 160 | func Get4uint32DeltaScalar(in []byte, out []uint32, ctrl uint8, prev uint32) { 161 | sizes := shared.PerNumLenTable[ctrl] 162 | 163 | len0 := sizes[0] 164 | len1 := sizes[1] 165 | len2 := sizes[2] 166 | len3 := sizes[3] 167 | 168 | // bounds check hint to compiler 169 | _ = out[3] 170 | out[0] = decodeOne(in, len0) + prev 171 | out[1] = decodeOne(in[len0:], len1) + out[0] 172 | out[2] = decodeOne(in[len0+len1:], len2) + out[1] 173 | out[3] = decodeOne(in[len0+len1+len2:], len3) + out[2] 174 | } 175 | 176 | func decodeOne(b []byte, size uint8) uint32 { 177 | switch size { 178 | case 4: 179 | return uint32(b[3])<<24 | uint32(b[2])<<16 | uint32(b[1])<<8 | uint32(b[0]) 180 | case 3: 181 | return uint32(b[2])<<16 | uint32(b[1])<<8 | uint32(b[0]) 182 | case 2: 183 | return uint32(b[1])<<8 | uint32(b[0]) 184 | case 1: 185 | return uint32(b[0]) 186 | } 187 | panic("impossible") 188 | } 189 | -------------------------------------------------------------------------------- /pkg/decode/decode_amd64.go: -------------------------------------------------------------------------------- 1 | // +build amd64 2 | 3 | // Package decode provides an x86_64 implementation of two 4 | // Stream VByte decoding algorithms, a normal decoding approach 5 | // and one that incorporates differential coding. 6 | package decode 7 | 8 | import ( 9 | "github.com/theMPatel/streamvbyte-simdgo/pkg/shared" 10 | "golang.org/x/sys/cpu" 11 | ) 12 | 13 | // GetMode performs a check to see if the current ISA supports 14 | // the below decoding funcs. 
15 | func GetMode() shared.PerformanceMode { 16 | if cpu.X86.HasAVX { 17 | return shared.Fast 18 | } 19 | return shared.Normal 20 | } 21 | 22 | // Get8uint32Fast binds to Get8uint32FastAsm which is implemented in 23 | // assembly. 24 | func Get8uint32Fast(in []byte, out []uint32, ctrl uint16) { 25 | Get8uint32FastAsm(in, out, ctrl, 26 | shared.DecodeShuffleTable, 27 | shared.PerControlLenTable, 28 | ) 29 | } 30 | 31 | // Get8uint32DeltaFast binds to Get8uint32DeltaFastAsm which is implemented 32 | // in assembly. 33 | func Get8uint32DeltaFast(in []byte, out []uint32, ctrl uint16, prev uint32) { 34 | Get8uint32DeltaFastAsm( 35 | in, out, ctrl, prev, 36 | shared.DecodeShuffleTable, 37 | shared.PerControlLenTable, 38 | ) 39 | } 40 | 41 | // Get8uint32FastAsm uses the provided 16-bit control to load the 42 | // appropriate decoding shuffle masks and performs a shuffle 43 | // operation on the provided input bytes. This in effect decompresses 44 | // the input byte stream to uint32s. The result is written to 45 | // the provided output slice. 46 | //go:noescape 47 | func Get8uint32FastAsm( 48 | in []byte, out []uint32, ctrl uint16, 49 | shuffle *[256][16]uint8, lenTable *[256]uint8, 50 | ) 51 | 52 | // Get8uint32DeltaFastAsm works similarly to Get8uint32FastAsm with the 53 | // exception that prior to writing the uncompressed integers out 54 | // to the output slice, the original values are reconstructed from 55 | // the diffs.
The basic reconstruction algorithm is as follows: 56 | // 57 | // Input: [A B C D] 58 | // Input Shifted: [- A B C] 59 | // Add above two: [A AB BC CD] 60 | // Add Prev: [PA PAB PBC PCD] 61 | // Input Shifted: [- - A AB] 62 | // Add Shifted: [PA PAB PABC PABCD] 63 | //go:noescape 64 | func Get8uint32DeltaFastAsm( 65 | in []byte, out []uint32, ctrl uint16, prev uint32, 66 | shuffle *[256][16]uint8, lenTable *[256]uint8, 67 | ) 68 | -------------------------------------------------------------------------------- /pkg/decode/decode_amd64.s: -------------------------------------------------------------------------------- 1 | // Code generated by command: go run asm.go -out ./decode_amd64.s. DO NOT EDIT. 2 | 3 | #include "textflag.h" 4 | 5 | // func Get8uint32FastAsm(in []byte, out []uint32, ctrl uint16, shuffle *[256][16]uint8, lenTable *[256]uint8) 6 | // Requires: AVX 7 | TEXT ·Get8uint32FastAsm(SB), NOSPLIT, $0-72 8 | MOVWQZX ctrl+48(FP), AX 9 | MOVQ shuffle+56(FP), CX 10 | MOVBQZX AL, DX 11 | SHLQ $0x04, DX 12 | ADDQ CX, DX 13 | MOVWQZX AX, BX 14 | SHRQ $0x08, BX 15 | SHLQ $0x04, BX 16 | ADDQ CX, BX 17 | MOVQ in_base+0(FP), CX 18 | MOVQ CX, SI 19 | MOVQ lenTable+64(FP), DI 20 | MOVBQZX AL, AX 21 | ADDQ DI, AX 22 | MOVBQZX (AX), AX 23 | ADDQ AX, SI 24 | VLDDQU (CX), X0 25 | VLDDQU (SI), X1 26 | VPSHUFB (DX), X0, X0 27 | VPSHUFB (BX), X1, X1 28 | MOVQ out_base+24(FP), AX 29 | VMOVDQU X0, (AX) 30 | VMOVDQU X1, 16(AX) 31 | RET 32 | 33 | // func Get8uint32DeltaFastAsm(in []byte, out []uint32, ctrl uint16, prev uint32, shuffle *[256][16]uint8, lenTable *[256]uint8) 34 | // Requires: AVX 35 | TEXT ·Get8uint32DeltaFastAsm(SB), NOSPLIT, $0-72 36 | MOVWQZX ctrl+48(FP), AX 37 | MOVQ shuffle+56(FP), CX 38 | MOVBQZX AL, DX 39 | SHLQ $0x04, DX 40 | ADDQ CX, DX 41 | MOVWQZX AX, BX 42 | SHRQ $0x08, BX 43 | SHLQ $0x04, BX 44 | ADDQ CX, BX 45 | MOVQ in_base+0(FP), CX 46 | MOVQ CX, SI 47 | MOVQ lenTable+64(FP), DI 48 | MOVBQZX AL, AX 49 | ADDQ DI, AX 50 | MOVBQZX (AX), AX 51 | ADDQ 
AX, SI 52 | VLDDQU (CX), X0 53 | VLDDQU (SI), X1 54 | VPSHUFB (DX), X0, X0 55 | VPSHUFB (BX), X1, X1 56 | VBROADCASTSS prev+52(FP), X2 57 | VPSLLDQ $0x04, X0, X3 58 | VPADDD X0, X3, X0 59 | VPSLLDQ $0x08, X0, X3 60 | VPADDD X0, X2, X0 61 | VPADDD X0, X3, X0 62 | VPSHUFD $0xff, X0, X2 63 | VPSLLDQ $0x04, X1, X3 64 | VPADDD X1, X3, X1 65 | VPSLLDQ $0x08, X1, X3 66 | VPADDD X1, X2, X1 67 | VPADDD X1, X3, X1 68 | MOVQ out_base+24(FP), AX 69 | VMOVDQU X0, (AX) 70 | VMOVDQU X1, 16(AX) 71 | RET 72 | -------------------------------------------------------------------------------- /pkg/decode/decode_base.go: -------------------------------------------------------------------------------- 1 | // +build !amd64 2 | 3 | package decode 4 | 5 | import ( 6 | "github.com/theMPatel/streamvbyte-simdgo/pkg/shared" 7 | ) 8 | 9 | // GetMode always reports shared.Normal on non-amd64 builds, so the package 10 | // selects the scalar decoders. 11 | func GetMode() shared.PerformanceMode { 12 | return shared.Normal 13 | } 14 | 15 | // Get8uint32Fast is a placeholder that lets decode.go compile on non-amd64 16 | // builds. Its signature matches Get8Impl (no return value); it panics if called. 17 | func Get8uint32Fast(in []byte, out []uint32, ctrl uint16) { 18 | panic("unreachable") 19 | } 20 | 21 | // Get8uint32DeltaFast is a placeholder that lets decode.go compile on non-amd64 22 | // builds. Its signature matches Get8DeltaImpl (no return value); it panics if called. 23 | func Get8uint32DeltaFast(in []byte, out []uint32, ctrl uint16, prev uint32) { 24 | panic("unreachable") 25 | } 26 | -------------------------------------------------------------------------------- /pkg/decode/decode_test.go: -------------------------------------------------------------------------------- 1 | package decode 2 | 3 | import ( 4 | "encoding/binary" 5 | "math/rand" 6 | "reflect" 7 | "testing" 8 | "time" 9 | 10 | "github.com/theMPatel/streamvbyte-simdgo/pkg/encode" 11 | "github.com/theMPatel/streamvbyte-simdgo/pkg/shared" 12 | "github.com/theMPatel/streamvbyte-simdgo/pkg/util" 13 | ) 14 | 15 | func init() { 16 | rand.Seed(time.Now().UnixNano()) 17 | } 18 | 19 | func TestGet8uint32Scalar(t *testing.T) { 20 | count := 8 21 | expected := util.GenUint32(count) 22 | in := make([]byte, count*encode.MaxBytesPerNum) 23 | ctrl := encode.Put8uint32Scalar(expected, in) 24 | out := make([]uint32, 8) 25 | 26 | Get8uint32Scalar(in, out, ctrl) 27 | if
!reflect.DeepEqual(expected, out) { 28 | t.Fatalf("expected %+v, got %+v", expected, out) 29 | } 30 | } 31 | 32 | func TestGet8uint32DeltaScalar(t *testing.T) { 33 | count := 8 34 | expected := util.GenUint32(count) 35 | util.SortUint32(expected) 36 | in := make([]byte, count*encode.MaxBytesPerNum) 37 | ctrl := encode.Put8uint32DeltaScalar(expected, in, 0) 38 | out := make([]uint32, 8) 39 | 40 | Get8uint32DeltaScalar(in, out, ctrl, 0) 41 | if !reflect.DeepEqual(expected, out) { 42 | t.Fatalf("expected %+v, got %+v", expected, out) 43 | } 44 | } 45 | 46 | func TestGet8uint32Fast(t *testing.T) { 47 | if GetMode() == shared.Normal { 48 | t.Skipf("Testing environment doesn't support this test") 49 | } 50 | 51 | count := 8 52 | expected := util.GenUint32(count) 53 | in := make([]byte, count*encode.MaxBytesPerNum) 54 | ctrl := encode.Put8uint32Scalar(expected, in) 55 | out := make([]uint32, 8) 56 | 57 | Get8uint32Fast(in, out, ctrl) 58 | if !reflect.DeepEqual(expected, out) { 59 | t.Fatalf("expected %+v, got %+v", expected, out) 60 | } 61 | } 62 | 63 | func TestGet8uint32DeltaFast(t *testing.T) { 64 | if GetMode() == shared.Normal { 65 | t.Skipf("Testing environment doesn't support this test") 66 | } 67 | 68 | count := 8 69 | expected := util.GenUint32(count) 70 | util.SortUint32(expected) 71 | in := make([]byte, count*encode.MaxBytesPerNum) 72 | ctrl := encode.Put8uint32DeltaScalar(expected, in, 0) 73 | out := make([]uint32, 8) 74 | 75 | Get8uint32DeltaFast(in, out, ctrl, 0) 76 | if !reflect.DeepEqual(expected, out) { 77 | t.Fatalf("expected %+v, got %+v", expected, out) 78 | } 79 | } 80 | 81 | func TestGetUint32Scalar(t *testing.T) { 82 | count := rand.Intn(4) + 1 83 | expected := util.GenUint32(count) 84 | in := make([]byte, count*encode.MaxBytesPerNum) 85 | ctrl := encode.PutUint32Scalar(expected, in, count) 86 | out := make([]uint32, count) 87 | 88 | GetUint32Scalar(in, out, ctrl, count) 89 | if !reflect.DeepEqual(expected, out) { 90 | t.Fatalf("expected %+v, got 
%+v", expected, out) 91 | } 92 | } 93 | 94 | func TestGetUint32DeltaScalar(t *testing.T) { 95 | count := rand.Intn(4) + 1 96 | expected := util.GenUint32(count) 97 | util.SortUint32(expected) 98 | in := make([]byte, count*encode.MaxBytesPerNum) 99 | deltas := make([]uint32, count) 100 | util.Delta(expected, deltas) 101 | ctrl := encode.PutUint32Scalar(deltas, in, count) 102 | 103 | out := make([]uint32, count) 104 | GetUint32DeltaScalar(in, out, ctrl, count, 0) 105 | if !reflect.DeepEqual(expected, out) { 106 | t.Fatalf("expected %+v, got %+v", expected, out) 107 | } 108 | } 109 | 110 | var readSinkA []uint32 111 | 112 | func BenchmarkGet8uint32Fast(b *testing.B) { 113 | if GetMode() == shared.Normal { 114 | b.Skipf("Testing environment doesn't support this test") 115 | } 116 | 117 | count := 8 118 | out := make([]uint32, count) 119 | 120 | nums := util.GenUint32(count) 121 | in := make([]byte, count*encode.MaxBytesPerNum) 122 | ctrl := encode.Put8uint32Scalar(nums, in) 123 | 124 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 125 | b.ResetTimer() 126 | for i := 0; i < b.N; i++ { 127 | Get8uint32Fast(in, out, ctrl) 128 | } 129 | readSinkA = out 130 | } 131 | 132 | var readSinkB []uint32 133 | 134 | func BenchmarkGet8uint32DeltaFast(b *testing.B) { 135 | if GetMode() == shared.Normal { 136 | b.Skipf("Testing environment doesn't support this test") 137 | } 138 | 139 | count := 8 140 | out := make([]uint32, count) 141 | nums := util.GenUint32(count) 142 | util.SortUint32(nums) 143 | in := make([]byte, count*encode.MaxBytesPerNum) 144 | ctrl := encode.Put8uint32DeltaScalar(nums, in, 0) 145 | 146 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 147 | b.ResetTimer() 148 | for i := 0; i < b.N; i++ { 149 | Get8uint32DeltaFast(in, out, ctrl, 0) 150 | } 151 | readSinkB = out 152 | } 153 | 154 | var readSinkC []uint32 155 | 156 | func BenchmarkGet8uint32Scalar(b *testing.B) { 157 | count := 8 158 | out := make([]uint32, count) 159 | nums := util.GenUint32(count) 160 | in 
:= make([]byte, count*encode.MaxBytesPerNum) 161 | ctrl := encode.Put8uint32Scalar(nums, in) 162 | 163 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 164 | b.ResetTimer() 165 | for i := 0; i < b.N; i++ { 166 | Get8uint32Scalar(in, out, ctrl) 167 | } 168 | readSinkC = out 169 | } 170 | 171 | var readSinkD []uint32 172 | 173 | func BenchmarkGet8uint32DeltaScalar(b *testing.B) { 174 | count := 8 175 | out := make([]uint32, count) 176 | nums := util.GenUint32(count) 177 | util.SortUint32(nums) 178 | in := make([]byte, count*encode.MaxBytesPerNum) 179 | ctrl := encode.Put8uint32DeltaScalar(nums, in, 0) 180 | 181 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 182 | b.ResetTimer() 183 | for i := 0; i < b.N; i++ { 184 | Get8uint32DeltaScalar(in, out, ctrl, 0) 185 | } 186 | readSinkD = out 187 | } 188 | 189 | var readSinkE []uint32 190 | 191 | func BenchmarkGet8uint32Varint(b *testing.B) { 192 | count := 8 193 | out := make([]uint32, count) 194 | data := make([]byte, binary.MaxVarintLen32*count) 195 | written := util.PutVarint(util.GenUint32(count), data) 196 | data = data[:written] 197 | 198 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 199 | b.ResetTimer() 200 | for i := 0; i < b.N; i++ { 201 | util.GetVarint(data, out) 202 | } 203 | readSinkE = out 204 | } 205 | 206 | var readSinkF []uint32 207 | 208 | func BenchmarkGet8uint32DeltaVarint(b *testing.B) { 209 | count := 8 210 | out := make([]uint32, count) 211 | data := make([]byte, binary.MaxVarintLen32*count) 212 | written := util.PutDeltaVarint(util.GenUint32(count), data, 0) 213 | data = data[:written] 214 | 215 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 216 | b.ResetTimer() 217 | for i := 0; i < b.N; i++ { 218 | util.GetDeltaVarint(data, out, 0) 219 | } 220 | readSinkF = out 221 | } 222 | -------------------------------------------------------------------------------- /pkg/decode/gen.go: -------------------------------------------------------------------------------- 1 | package decode 2 | 3 | 
//go:generate go run ./main/asm.go -out ./decode_amd64.s 4 | -------------------------------------------------------------------------------- /pkg/decode/main/asm.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | . "github.com/mmcloughlin/avo/build" 8 | "github.com/mmcloughlin/avo/operand" 9 | "github.com/mmcloughlin/avo/reg" 10 | "github.com/theMPatel/streamvbyte-simdgo/pkg/shared" 11 | ) 12 | 13 | const ( 14 | name = "Get8uint32FastAsm" 15 | nameDelta = "Get8uint32DeltaFastAsm" 16 | 17 | pIn = "in" 18 | pOut = "out" 19 | pCtrl = "ctrl" 20 | pShuffle = "shuffle" 21 | pLenTable = "lenTable" 22 | pPrev = "prev" 23 | ) 24 | 25 | var ( 26 | signature = fmt.Sprintf( 27 | "func(%s []byte, %s []uint32, %s uint16, %s *[256][16]uint8, %s *[256]uint8)", 28 | pIn, pOut, pCtrl, pShuffle, pLenTable) 29 | 30 | signatureDelta = fmt.Sprintf( 31 | "func(%s []byte, %s []uint32, %s uint16, %s uint32, %s *[256][16]uint8, %s *[256]uint8)", 32 | pIn, pOut, pCtrl, pPrev, pShuffle, pLenTable) 33 | ) 34 | 35 | func main() { 36 | regular() 37 | differential() 38 | Generate() 39 | } 40 | 41 | func regular() { 42 | TEXT(name, NOSPLIT, signature) 43 | 44 | firstFour, secondFour := coreAlgorithm() 45 | outBase := operand.Mem{Base: Load(Param(pOut).Base(), GP64())} 46 | 47 | VMOVDQU(firstFour, outBase) 48 | VMOVDQU(secondFour, outBase.Offset(16)) 49 | 50 | RET() 51 | } 52 | 53 | func differential() { 54 | TEXT(nameDelta, NOSPLIT, signatureDelta) 55 | 56 | firstFour, secondFour := coreAlgorithm() // [A B C D] [E F G H] 57 | prevSingular, err := Param(pPrev).Resolve() 58 | if err != nil { 59 | log.Fatalf("failed to get addr of prev") 60 | } 61 | 62 | prev := XMM() 63 | VBROADCASTSS(prevSingular.Addr, prev) // [P P P P] 64 | undoDelta(firstFour, prev) 65 | 66 | VPSHUFD(operand.Imm(0xff), firstFour, prev) // [A B C D] -> [D D D D] 67 | undoDelta(secondFour, prev) 68 | 69 | outBase := operand.Mem{Base: 
Load(Param(pOut).Base(), GP64())} 70 | 71 | VMOVDQU(firstFour, outBase) 72 | VMOVDQU(secondFour, outBase.Offset(16)) 73 | 74 | RET() 75 | } 76 | 77 | func undoDelta(four, prev reg.VecVirtual) { 78 | adder := XMM() // [A B C D] 79 | VPSLLDQ(operand.Imm(4), four, adder) // [- A B C] 80 | VPADDD(four, adder, four) // [A AB BC CD] 81 | VPSLLDQ(operand.Imm(8), four, adder) // [- - A AB] 82 | VPADDD(four, prev, four) // [PA PAB PBC PCD] 83 | VPADDD(four, adder, four) // [PA PAB PABC PABCD] 84 | } 85 | 86 | func coreAlgorithm() (reg.VecVirtual, reg.VecVirtual) { 87 | ctrl := GP64() 88 | Load(Param(pCtrl), ctrl) 89 | 90 | shuffleBase := Load(Param(pShuffle), GP64()) 91 | shuffleA := shared.CalculateShuffleAddrFromCtrl(shuffleBase, ctrl, false) 92 | shuffleB := shared.CalculateShuffleAddrFromCtrl(shuffleBase, ctrl, true) 93 | 94 | firstBlock := Load(Param(pIn).Base(), GP64()) 95 | secondBlock := GP64() 96 | MOVQ(firstBlock, secondBlock) 97 | lowerAddr, lowerSize := shared.LenValueAddr(ctrl, false, pLenTable) 98 | 99 | MOVBQZX(lowerAddr, lowerSize) 100 | ADDQ(lowerSize, secondBlock) 101 | 102 | firstFour := XMM() 103 | secondFour := XMM() 104 | VLDDQU(operand.Mem{Base: firstBlock}, firstFour) 105 | VLDDQU(operand.Mem{Base: secondBlock}, secondFour) 106 | 107 | VPSHUFB(shuffleA, firstFour, firstFour) 108 | VPSHUFB(shuffleB, secondFour, secondFour) 109 | 110 | return firstFour, secondFour 111 | } 112 | -------------------------------------------------------------------------------- /pkg/encode/encode.go: -------------------------------------------------------------------------------- 1 | package encode 2 | 3 | import ( 4 | "math/bits" 5 | 6 | "github.com/theMPatel/streamvbyte-simdgo/pkg/shared" 7 | ) 8 | 9 | const ( 10 | MaxBytesPerNum = 4 11 | ) 12 | 13 | var ( 14 | putImpl Put8Impl 15 | putDeltaImpl Put8DeltaImpl 16 | ) 17 | 18 | type Put8Impl func(in []uint32, out []byte) (ctrl uint16) 19 | type Put8DeltaImpl func(in []uint32, out []byte, prev uint32) (ctrl uint16) 20 | 21 
| func init() { 22 | if GetMode() == shared.Fast { 23 | putImpl = Put8uint32Fast 24 | putDeltaImpl = Put8uint32DeltaFast 25 | } else { 26 | putImpl = Put8uint32Scalar 27 | putDeltaImpl = Put8uint32DeltaScalar 28 | } 29 | } 30 | 31 | // Put8uint32 is a general func you can use to encode 8 uint32's at a time. 32 | // It will use the fastest implementation available determined during 33 | // package initialization. If your CPU supports special hardware instructions 34 | // then it will use an accelerated version of Stream VByte. Otherwise, the 35 | // scalar implementation will be used as the fallback. 36 | func Put8uint32(in []uint32, out []byte) uint16 { 37 | return putImpl(in, out) 38 | } 39 | 40 | // Put8uint32Delta is a general func you can use to encode 8 differentially coded 41 | // uint32's with at a time. It will use the fastest implementation available 42 | // determined during package initialization. If your CPU supports special hardware 43 | // instructions then it will use an accelerated version of Stream VByte. Otherwise, 44 | // the scalar implementation will be used as the fallback. 45 | func Put8uint32Delta(in []uint32, out []byte, prev uint32) uint16 { 46 | return putDeltaImpl(in, out, prev) 47 | } 48 | 49 | // PutUint32Scalar encodes up to 4 integers from in into out using the 50 | // Stream VByte format. 51 | // 52 | // Note: It is your responsibility to ensure that the incoming slices have 53 | // the appropriate sizes and data otherwise this func will panic. 
54 | func PutUint32Scalar(in []uint32, out []byte, count int) uint8 { 55 | if count == 0 { 56 | return 0 57 | } 58 | 59 | if count > 4 { 60 | count = 4 61 | } 62 | 63 | var ( 64 | ctrl uint8 65 | shift = 0 66 | total = 0 67 | ) 68 | for i := 0; i < count; i++ { 69 | size := encodeOne(in[i], out[total:]) 70 | total += size 71 | ctrl |= uint8(size-1) << shift 72 | shift += 2 73 | } 74 | 75 | return ctrl 76 | } 77 | 78 | // PutUint32DeltaScalar encodes up to 4 integers from in into out using the 79 | // Stream VByte format. 80 | // 81 | // Note: It is your responsibility to ensure that the incoming slices have 82 | // the appropriate sizes and data otherwise this func will panic. 83 | func PutUint32DeltaScalar(in []uint32, out []byte, count int, prev uint32) uint8 { 84 | if count == 0 { 85 | return 0 86 | } 87 | 88 | if count > 4 { 89 | count = 4 90 | } 91 | 92 | var ( 93 | ctrl uint8 94 | shift = 0 95 | total = 0 96 | ) 97 | for i := 0; i < count; i++ { 98 | size := encodeOne(in[i]-prev, out[total:]) 99 | total += size 100 | ctrl |= uint8(size-1) << shift 101 | shift += 2 102 | prev = in[i] 103 | } 104 | 105 | return ctrl 106 | } 107 | 108 | // Put8uint32Scalar will encode 8 uint32 values from in into out using the 109 | // Stream VByte format. Returns an 16-bit control value produced from the 110 | // encoding. 111 | // 112 | // Note: It is your responsibility to ensure that the incoming slices have 113 | // the appropriate sizes and data otherwise this func will panic. 114 | func Put8uint32Scalar(in []uint32, out []byte) uint16 { 115 | var ctrl uint16 116 | first := Put4uint32Scalar(in, out) 117 | ctrl |= uint16(first) 118 | encoded := shared.ControlByteToSize(first) 119 | second := Put4uint32Scalar(in[4:], out[encoded:]) 120 | return ctrl | uint16(second)<<8 121 | } 122 | 123 | // Put4uint32Scalar will encode 4 uint32 values from in into out using the 124 | // Stream VByte format. Returns an 8-bit control value produced from the 125 | // encoding. 
Every incoming number is variably encoded, and an 8-bit control 126 | // is constructed from the 2-bit len of each uint32. Below is an example of 127 | // 4 uint32's and how they are encoded. 128 | // 129 | // 00000000 00000000 00000000 01101111 = 111 130 | // 00000000 00000000 00000100 11010010 = 1234 131 | // 00000000 00001100 00001010 10000011 = 789123 132 | // 01000000 00000000 00000000 00000000 = 1073741824 133 | // 134 | // Num Len 2-bit control 135 | // ---------------------------------- 136 | // 111 1 0b00 137 | // 1234 2 0b01 138 | // 789123 3 0b10 139 | // 1073741824 4 0b11 140 | // 141 | // Final Control byte 142 | // 0b11100100 143 | // 144 | // Encoded data (little endian right-to-left bottom-to-top) 145 | // 0b01000000 0b00000000 0b00000000 0b00000000 0b00001100 146 | // 0b00001010 0b10000011 0b00000100 0b11010010 0b01101111 147 | // 148 | // Note: It is your responsibility to ensure that the incoming slices have 149 | // the appropriate sizes and data otherwise this func will panic. 150 | func Put4uint32Scalar(in []uint32, out []byte) uint8 { 151 | // bounds check hint to compiler 152 | _ = in[3] 153 | 154 | num0 := in[0] 155 | num1 := in[1] 156 | num2 := in[2] 157 | num3 := in[3] 158 | 159 | len0 := encodeOne(num0, out) 160 | len1 := encodeOne(num1, out[len0:]) 161 | len2 := encodeOne(num2, out[len0+len1:]) 162 | len3 := encodeOne(num3, out[len0+len1+len2:]) 163 | 164 | return uint8((len0 - 1) | (len1-1)<<2 | (len2-1)<<4 | (len3-1)<<6) 165 | } 166 | 167 | // Put8uint32DeltaScalar will differentially encode 8 uint32 values from in into out. 168 | // Prev provides a way for you to indicate the base value for this batch of 8. 169 | // For example, when encoding the second batch of 8 integers out of, e.g. 16, you would 170 | // provide a prev value of the last value in the first batch of 8 you encoded. This 171 | // is done to ensure that the integers are correctly resolved to the correct diff. An 172 | // example below. 
Note that this func assumes that the input integers are already sorted. 173 | // 174 | // Input: [ 10, 20, 30, 40, 50, 60, 70, 80 ] [ 90, 100, 110, 120, 130, 140, 150, 160 ] 175 | // Output: [ 10, 10, 10, 10, 10, 10, 10, 10 ] [ 10, 10, 10, 10, 10, 10, 10, 10 ] 176 | // Prev: 80 177 | func Put8uint32DeltaScalar(in []uint32, out []byte, prev uint32) uint16 { 178 | var ctrl uint16 179 | first := Put4uint32DeltaScalar(in, out, prev) 180 | ctrl |= uint16(first) 181 | encoded := shared.ControlByteToSize(first) 182 | second := Put4uint32DeltaScalar(in[4:], out[encoded:], in[3]) 183 | return ctrl | uint16(second)<<8 184 | } 185 | 186 | // Put4uint32DeltaScalar will differentially encode 4 uint32 values from in into out. 187 | // Prev provides a way for you to indicate the base value for this batch of 4. 188 | // For example, when encoding the second batch of 4 integers out of, e.g. 8, you would 189 | // provide a prev value of the last value in the first batch of 4 you encoded. This 190 | // is done to ensure that the integers are correctly resolved to the correct diff. An 191 | // example below. Note that this func assumes that the input integers are already sorted. 
// Put4uint32DeltaScalar differentially encodes in[0:4] into out and returns
// the 8-bit control value. The first value is encoded as in[0]-prev and each
// later one as its difference from its predecessor, so prev is the value
// that precedes in[0]: when encoding the second group of four out of eight,
// pass the last value of the first group. The input is assumed to be sorted
// already.
//
//	Input:  [ 10, 20, 30, 40 ] [ 50, 60, 70, 80 ]
//	Output: [ 10, 10, 10, 10 ] [ 10, 10, 10, 10 ]
//	Prev for the second group: 40
func Put4uint32DeltaScalar(in []uint32, out []byte, prev uint32) uint8 {
	// bounds check hint to compiler
	_ = in[3]

	// Encode one delta at a time, moving the output window past whatever
	// the previous delta occupied.
	n0 := encodeOne(in[0]-prev, out)
	out = out[n0:]
	n1 := encodeOne(in[1]-in[0], out)
	out = out[n1:]
	n2 := encodeOne(in[2]-in[1], out)
	out = out[n2:]
	n3 := encodeOne(in[3]-in[2], out)

	ctrl := (n0 - 1) | (n1-1)<<2 | (n2-1)<<4 | (n3-1)<<6
	return uint8(ctrl)
}

// encodeOne writes num to the front of out in little-endian order using the
// fewest whole bytes that hold it (at least one, so zero still takes a byte)
// and returns that byte count.
func encodeOne(num uint32, out []byte) int {
	// Significant bits rounded up to whole bytes.
	size := max(1, (bits.Len32(num)+7)/8)

	// Highest byte first: the first store doubles as the bounds check for
	// the ones below it.
	if size == 4 {
		out[3] = byte(num >> 24)
	}
	if size >= 3 {
		out[2] = byte(num >> 16)
	}
	if size >= 2 {
		out[1] = byte(num >> 8)
	}
	out[0] = byte(num)
	return size
}

// max returns the larger of a and b.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}
15 | func GetMode() shared.PerformanceMode { 16 | if cpu.X86.HasAVX && cpu.X86.HasAVX2 { 17 | return shared.Fast 18 | } 19 | return shared.Normal 20 | } 21 | 22 | // Put8uint32Fast binds to put8uint32Fast which is implemented 23 | // in assembly. 24 | func Put8uint32Fast(in []uint32, out []byte) uint16 { 25 | return Put8uint32FastAsm(in, out, 26 | shared.EncodeShuffleTable, 27 | shared.PerControlLenTable, 28 | ) 29 | } 30 | 31 | // Put8uint32DeltaFast binds to put8uint32DeltaFast which is implemented 32 | // in assembly. 33 | func Put8uint32DeltaFast(in []uint32, out []byte, prev uint32) uint16 { 34 | return Put8uint32DeltaFastAsm( 35 | in, out, prev, 36 | shared.EncodeShuffleTable, 37 | shared.PerControlLenTable, 38 | ) 39 | } 40 | 41 | // Put8uint32FastAsm has three core phases. First a 16-bit control is 42 | // generated for the incoming 8 uint32s. Then, the calculated control 43 | // is used to index into shared.EncodeShuffleTable to fetch the 44 | // correct shuffle mask to compress the incoming integers. Finally, 45 | // the calculated control is used to index into shared.PerControlLenTable 46 | // to determine the offsets in the output array to write to. 47 | // 48 | // Based on the algorithm devised by Lemire et al., the SIMD control 49 | // byte generation algorithm proceeds as follows. Note that here we 50 | // are using 1234 as our first example integer. 51 | // 52 | // 00000000 00000000 00000100 11010010 // 1234 53 | // 00000001 00000001 00000001 00000001 // 0x0101 mask 54 | // ----------------------------------- // byte-min(1234, 0x0101) 55 | // 00000000 00000000 00000001 00000001 56 | // 57 | // The algorithm first uses a mask where every byte is equal to 1. If 58 | // you perform a per-byte min operation on our integer and the 1's mask, 59 | // the result will have a 1 at every byte that had a value in the original 60 | // integer. 
61 | // 62 | // 00000000 00000000 00000001 00000001 63 | // ----------------------------------- // pack unsigned saturating 64 | // 00000000 00000000 00000000 11111111 // 16-bit to 8-bit 65 | // 66 | // Now you perform a 16-bit to 8-bit unsigned saturating pack operation. 67 | // Practically this means that you're taking every 16-bit value and trying 68 | // to shove that into 8 bits. If the 16-bit integer is larger than the 69 | // largest unsigned integer 8 bits can support, the pack saturates to the 70 | // largest unsigned 8-bit value. 71 | // 72 | // Why this is performed will become more clear in the subsequent steps, 73 | // however, at a high level, for every integer you want to encode, you 74 | // want for the MSB of two consecutive bytes in the control bits stream 75 | // to be representative of the final 2-bit control. For example, if you 76 | // have a 3-byte integer, you want the MSB of two consecutive bytes to be 77 | // 1 and 0, in that order. The reason you would want this is that there 78 | // is a vector pack instruction that takes the MSB from every byte in the 79 | // control bits stream and packs it into the lowest byte. This would thus 80 | // represent the value `0b10` in the final byte for this 3-byte integer, 81 | // which is what we want. 82 | // 83 | // Performing a 16-bit to 8-bit unsigned saturating pack has the effect 84 | // that you can use the saturation behavior to conditionally turn on the 85 | // MSB of these bytes depending on which bytes have values in the original 86 | // 32-bit integer. 87 | // 88 | // 00000000 00000000 00000000 11111111 // control bits stream 89 | // 00000001 00000001 00000001 00000001 // 0x0101 mask 90 | // ----------------------------------- // signed 16-bit min 91 | // 00000000 00000000 00000000 11111111 92 | // 93 | // We then take the 1's mask we used before and perform a signed 16-bit 94 | // min operation. The reason for this is more clear if you look at an 95 | // example using a 3-byte integer. 
96 | // 97 | // 00000000 00001100 00001010 10000011 // 789123 98 | // 00000001 00000001 00000001 00000001 // 0x0101 mask 99 | // ----------------------------------- // byte-min(789123, 0x0101) 100 | // 00000000 00000001 00000001 00000001 101 | // ----------------------------------- // pack unsigned saturating 16-bit to 8-bit 102 | // 00000000 00000000 00000001 11111111 103 | // 00000001 00000001 00000001 00000001 // 0x0101 mask 104 | // ----------------------------------- // signed 16-bit min 105 | // 00000000 00000000 00000001 00000001 106 | // 107 | // The signed 16-bit min operation has three important effects. 108 | // 109 | // First, for 3-byte integers, it has the effect of turning off the 110 | // MSB of the lowest byte. This is necessary because a 3-byte integer 111 | // should have a 2-bit control that is `0b10` and without this step 112 | // using the MSB pack operation would result in a 2-bit control that 113 | // looks something like `0b_1`, where the lowest bit is on. Obviously 114 | // this is wrong, since only integers that require 2 or 4 bytes to 115 | // encode should have that lower bit on, i.e. 1 or 3 as a zero-indexed 116 | // length. 117 | // 118 | // Second, for 4-byte integers, the signed aspect has the effect of 119 | // leaving both MSBs of the 2 bytes on. When using the MSB pack 120 | // operation later on, it will result in a 2-bit control value of 121 | // `0b11`, which is what we want. 122 | // 123 | // Third, for 1 and 2 byte integers, it has no effect. This is great 124 | // for 2-byte values since the MSB will remain on and 1 byte values 125 | // will not have any MSB on anyways, so it is effectively a noop in 126 | // both scenarios. 
127 | // 128 | // 00000000 00000000 00000000 11111111 // control bits stream (original 1234) 129 | // 01111111 00000000 01111111 00000000 // 0x7F00 mask 130 | // ----------------------------------- // add unsigned saturating 16-bit 131 | // 01111111 00000000 01111111 11111111 132 | // 133 | // Next, we take a mask with the value `0x7F00` and perform an unsigned 134 | // saturating add to the control bits stream. In the case for the integer 135 | // `1234` this has no real effect. We maintain the MSB in the lowest byte. 136 | // You'll note, however, that the only byte that has its MSB on is the last 137 | // one, so performing an MSB pack operation would result in a value of 138 | // `0b0001`, which is what we want. An example of this step on the integer 139 | // `789123` might paint a clearer picture. 140 | // 141 | // 00000000 00000000 00000001 00000001 // control bits stream (789123) 142 | // 01111111 00000000 01111111 00000000 // 0x7F00 mask 143 | // ----------------------------------- // add unsigned saturating 16-bit 144 | // 01111111 00000000 11111111 00000001 145 | // 146 | // You'll note here that the addition of `0x01` with `0x7F` in the upper 147 | // byte results in the MSB of the resulting upper byte turning on. The MSB 148 | // in the lower byte remains off and now an MSB pack operation will resolve 149 | // to `0b0010`, which is what we want. The unsigned saturation behavior is 150 | // really important for 4-byte numbers that only have bits in the most 151 | // significant byte on. 
An example below: 152 | // 153 | // 01000000 00000000 00000000 00000000 // 1073741824 154 | // 00000001 00000001 00000001 00000001 // 0x0101 mask 155 | // ----------------------------------- // byte-min(1073741824, 0x0101) 156 | // 00000001 00000000 00000000 00000000 157 | // ----------------------------------- // pack unsigned saturating 16-bit to 8-bit 158 | // 00000000 00000000 11111111 00000000 159 | // 00000001 00000001 00000001 00000001 // 0x0101 mask 160 | // ----------------------------------- // signed 16-bit min 161 | // 00000000 00000000 11111111 00000000 162 | // 01111111 00000000 01111111 00000000 // 0x7F00 mask 163 | // ----------------------------------- // add unsigned saturating 16-bit 164 | // 01111111 00000000 11111111 11111111 165 | // 166 | // Note here that because only the upper byte had a value in it, the lowest 167 | // byte in the control bits stream remains zero for the duration of the 168 | // algorithm. This poses an issue, since for a 4-byte value, we want for the 169 | // 2-bit control to result in a value of `0b11`. Performing a 16-bit unsigned 170 | // saturating addition has the effect of turning on all bits in the lower 171 | // byte, and thus we get a result with the MSB in the lower byte on. 172 | // 173 | // 01111111 00000000 11111111 00000001 // control bits stream (789123) 174 | // ----------------------------------- // move byte mask 175 | // 00000000 00000000 00000000 00000010 // 2-bit control 176 | // 177 | // The final move byte mask is performed on the control bits stream, and we 178 | // now have the result we wanted. 179 | // 180 | // We then use the above control bits we generated to get the appropriate 181 | // shuffle masks. Below is an example of how the shuffle operation and a 182 | // mask allows for us to tightly pack two integers into the output buffer. 
183 | // 184 | // input [1234, 789123] (little endian R-to-L) 185 | // 00000000 00001100 00001010 10000011 00000000 00000000 00000100 11010010 186 | // | | | | | 187 | // | | |____________________ | | 188 | // | |_____________________ | | | 189 | // |____________________ | | | | 190 | // v v v v v 191 | // 0xff 0xff 0xff 0x06 0x05 0x04 0x01 0x00 // mask in hex 192 | // ----------------------------------------------------------------------- 193 | // 00000000 00000000 00000000 00001100 00001010 10000011 00000100 11010010 // packed 194 | //go:noescape 195 | func Put8uint32FastAsm( 196 | in []uint32, outBytes []byte, 197 | shuffle *[256][16]uint8, lenTable *[256]uint8, 198 | ) (r uint16) 199 | 200 | // Put8uint32DeltaFastAsm works similarly to put8uint32Fast except 201 | // that prior to encoding the 8 uint32s, we first use differential 202 | // coding to change the original numbers into deltas using SIMD 203 | // techniques. Afterwards, the encoding algorithm follows the same 204 | // flow as put8uint32Fast. The basic differential coding algorithm 205 | // is as follows: 206 | // 207 | // Prev: [P P P P] 208 | // Input: [A B C D] 209 | // Concat-shift: [P A B C] 210 | // Subtract: [A-P B-A C-B D-C] 211 | //go:noescape 212 | func Put8uint32DeltaFastAsm( 213 | in []uint32, outBytes []byte, prev uint32, 214 | shuffle *[256][16]uint8, lenTable *[256]uint8, 215 | ) (r uint16) 216 | -------------------------------------------------------------------------------- /pkg/encode/encode_amd64.s: -------------------------------------------------------------------------------- 1 | // Code generated by command: go run asm.go -out ./encode_amd64.s. DO NOT EDIT. 
2 | 3 | #include "textflag.h" 4 | 5 | DATA mask0101<>+0(SB)/2, $0x0101 6 | GLOBL mask0101<>(SB), RODATA|NOPTR, $2 7 | 8 | DATA mask7F00<>+0(SB)/2, $0x7f00 9 | GLOBL mask7F00<>(SB), RODATA|NOPTR, $2 10 | 11 | // func Put8uint32FastAsm(in []uint32, outBytes []byte, shuffle *[256][16]uint8, lenTable *[256]uint8) (r uint16) 12 | // Requires: AVX, AVX2 13 | TEXT ·Put8uint32FastAsm(SB), NOSPLIT, $0-66 14 | MOVQ in_base+0(FP), AX 15 | VLDDQU (AX), X0 16 | VLDDQU 16(AX), X1 17 | VPBROADCASTW mask0101<>+0(SB), X2 18 | VPBROADCASTW mask7F00<>+0(SB), X3 19 | VPMINUB X2, X0, X4 20 | VPMINUB X2, X1, X5 21 | VPACKUSWB X5, X4, X4 22 | VPMINSW X2, X4, X4 23 | VPADDUSW X3, X4, X4 24 | VPMOVMSKB X4, AX 25 | MOVW AX, r+64(FP) 26 | MOVQ shuffle+48(FP), CX 27 | MOVBQZX AL, DX 28 | SHLQ $0x04, DX 29 | ADDQ CX, DX 30 | MOVWQZX AX, BX 31 | SHRQ $0x08, BX 32 | SHLQ $0x04, BX 33 | ADDQ CX, BX 34 | VPSHUFB (DX), X0, X0 35 | VPSHUFB (BX), X1, X1 36 | MOVQ outBytes_base+24(FP), CX 37 | MOVQ CX, DX 38 | MOVQ lenTable+56(FP), BX 39 | MOVBQZX AL, AX 40 | ADDQ BX, AX 41 | MOVBQZX (AX), AX 42 | ADDQ AX, DX 43 | VMOVDQU X0, (CX) 44 | VMOVDQU X1, (DX) 45 | RET 46 | 47 | // func Put8uint32DeltaFastAsm(in []uint32, outBytes []byte, prev uint32, shuffle *[256][16]uint8, lenTable *[256]uint8) (r uint16) 48 | // Requires: AVX, AVX2 49 | TEXT ·Put8uint32DeltaFastAsm(SB), NOSPLIT, $0-74 50 | MOVQ in_base+0(FP), AX 51 | VLDDQU (AX), X0 52 | VLDDQU 16(AX), X1 53 | VPALIGNR $0x0c, X0, X1, X2 54 | VPSUBD X2, X1, X1 55 | VBROADCASTSS prev+48(FP), X2 56 | VPALIGNR $0x0c, X2, X0, X2 57 | VPSUBD X2, X0, X0 58 | VPBROADCASTW mask0101<>+0(SB), X2 59 | VPBROADCASTW mask7F00<>+0(SB), X3 60 | VPMINUB X2, X0, X4 61 | VPMINUB X2, X1, X5 62 | VPACKUSWB X5, X4, X4 63 | VPMINSW X2, X4, X4 64 | VPADDUSW X3, X4, X4 65 | VPMOVMSKB X4, AX 66 | MOVW AX, r+72(FP) 67 | MOVQ shuffle+56(FP), CX 68 | MOVBQZX AL, DX 69 | SHLQ $0x04, DX 70 | ADDQ CX, DX 71 | MOVWQZX AX, BX 72 | SHRQ $0x08, BX 73 | SHLQ $0x04, BX 74 | ADDQ CX, BX 75 | 
VPSHUFB (DX), X0, X0 76 | VPSHUFB (BX), X1, X1 77 | MOVQ outBytes_base+24(FP), CX 78 | MOVQ CX, DX 79 | MOVQ lenTable+64(FP), BX 80 | MOVBQZX AL, AX 81 | ADDQ BX, AX 82 | MOVBQZX (AX), AX 83 | ADDQ AX, DX 84 | VMOVDQU X0, (CX) 85 | VMOVDQU X1, (DX) 86 | RET 87 | -------------------------------------------------------------------------------- /pkg/encode/encode_base.go: -------------------------------------------------------------------------------- 1 | // +build !amd64 2 | 3 | package encode 4 | 5 | import ( 6 | "github.com/theMPatel/streamvbyte-simdgo/pkg/shared" 7 | ) 8 | 9 | func GetMode() shared.PerformanceMode { 10 | return shared.Normal 11 | } 12 | 13 | func Put8uint32Fast(in []uint32, out []byte) uint16 { 14 | panic("unreachable") 15 | } 16 | 17 | func Put8uint32DeltaFast(in []uint32, out []byte, prev uint32) uint16 { 18 | panic("unreachable") 19 | } 20 | -------------------------------------------------------------------------------- /pkg/encode/encode_test.go: -------------------------------------------------------------------------------- 1 | package encode 2 | 3 | import ( 4 | "encoding/binary" 5 | "math/rand" 6 | "reflect" 7 | "testing" 8 | "time" 9 | 10 | "github.com/theMPatel/streamvbyte-simdgo/pkg/shared" 11 | "github.com/theMPatel/streamvbyte-simdgo/pkg/util" 12 | ) 13 | 14 | func init() { 15 | rand.Seed(time.Now().UnixNano()) 16 | } 17 | 18 | func TestPut8uint32Scalar(t *testing.T) { 19 | in := []uint32{1024, 3, 2, 1, 1_073_741_824, 10, 12, 1024} 20 | expectedData := []byte{ 21 | 0x00, 0x04, 0x03, 0x02, 0x01, 0x00, 0x00, 0x00, 0x40, 22 | 0x0a, 0x0c, 0x00, 0x04, 23 | } 24 | 25 | expectedCtrl := uint16(0b01_00_00_11_00_00_00_01) 26 | out := make([]byte, 32) 27 | actualCtrl := Put8uint32Scalar(in, out) 28 | if actualCtrl != expectedCtrl { 29 | t.Fatalf("expected: %#016b, got %#016b, %+v", expectedCtrl, actualCtrl, in) 30 | } 31 | 32 | actualData := out[:13] 33 | if !reflect.DeepEqual(expectedData, actualData) { 34 | t.Fatalf("expected %+v, got %+v, 
%+v", expectedData, actualData, in) 35 | } 36 | } 37 | 38 | func TestPut8uint32DeltaScalar(t *testing.T) { 39 | count := 8 40 | nums := util.GenUint32(count) 41 | util.SortUint32(nums) 42 | diffed := make([]uint32, count) 43 | util.Delta(nums, diffed) 44 | 45 | expectedData := make([]byte, count*MaxBytesPerNum) 46 | expectedCtrl := Put8uint32Scalar(diffed, expectedData) 47 | expectedData = expectedData[:shared.ControlByteToSizeTwo(expectedCtrl)] 48 | 49 | out := make([]byte, count*MaxBytesPerNum) 50 | actualCtrl := Put8uint32DeltaScalar(nums, out, 0) 51 | if actualCtrl != expectedCtrl { 52 | t.Fatalf("expected: %#016b, got %#016b, %+v", expectedCtrl, actualCtrl, nums) 53 | } 54 | 55 | actualData := out[:shared.ControlByteToSizeTwo(actualCtrl)] 56 | if !reflect.DeepEqual(expectedData, actualData) { 57 | t.Fatalf("expected %+v, got %+v, %+v", expectedData, actualData, nums) 58 | } 59 | } 60 | 61 | func TestPut8uint32Fast(t *testing.T) { 62 | if GetMode() == shared.Normal { 63 | t.Skipf("Testing environment doesn't support this test") 64 | } 65 | 66 | count := 8 67 | nums := util.GenUint32(count) 68 | 69 | out := make([]byte, MaxBytesPerNum*count) 70 | scalarCtrl := Put8uint32Scalar(nums, out) 71 | out = out[:shared.ControlByteToSizeTwo(scalarCtrl)] 72 | 73 | fastOut := make([]byte, MaxBytesPerNum*count) 74 | fastCtrl := Put8uint32Fast(nums, fastOut) 75 | fastOut = fastOut[:shared.ControlByteToSizeTwo(fastCtrl)] 76 | 77 | if scalarCtrl != fastCtrl { 78 | t.Fatalf("expected %#04x, actual %#04x, %+v", scalarCtrl, fastCtrl, nums) 79 | } 80 | 81 | if !reflect.DeepEqual(out, fastOut) { 82 | t.Fatalf("expected %+v, got %+v, %+v", out, fastOut, nums) 83 | } 84 | } 85 | 86 | func TestPut8uint32DeltaFast(t *testing.T) { 87 | if GetMode() == shared.Normal { 88 | t.Skipf("Testing environment doesn't support this test") 89 | } 90 | 91 | count := 8 92 | nums := util.GenUint32(count) 93 | util.SortUint32(nums) 94 | 95 | expectedData := make([]byte, MaxBytesPerNum*count) 96 | 
scalarCtrl := Put8uint32DeltaScalar(nums, expectedData, 0) 97 | expectedData = expectedData[:shared.ControlByteToSizeTwo(scalarCtrl)] 98 | 99 | fastOut := make([]byte, MaxBytesPerNum*count) 100 | fastCtrl := Put8uint32DeltaFast(nums, fastOut, 0) 101 | fastOut = fastOut[:shared.ControlByteToSizeTwo(fastCtrl)] 102 | 103 | if scalarCtrl != fastCtrl { 104 | t.Fatalf("expected %#04x, actual %#04x, %+v", scalarCtrl, fastCtrl, nums) 105 | } 106 | 107 | if !reflect.DeepEqual(expectedData, fastOut) { 108 | t.Fatalf("expected %+v, got %+v, %+v", expectedData, fastOut, nums) 109 | } 110 | } 111 | 112 | func TestPutUint32Scalar(t *testing.T) { 113 | count := rand.Intn(4) + 1 114 | nums := util.GenUint32(count) 115 | for i := 4 - count; i > 0; i-- { 116 | nums = append(nums, 0) 117 | } 118 | 119 | expected := make([]byte, 4*MaxBytesPerNum) 120 | ctrl := Put4uint32Scalar(nums, expected) 121 | size := shared.ControlByteToSize(ctrl) 122 | size -= 4 - count 123 | expected = expected[:size] 124 | 125 | out := make([]byte, count*MaxBytesPerNum) 126 | ctrl = PutUint32Scalar(nums[:count], out, count) 127 | size = shared.ControlByteToSize(ctrl) 128 | size -= 4 - count 129 | out = out[:size] 130 | if !reflect.DeepEqual(expected, out) { 131 | t.Fatalf("expected %+v, got %+v", expected, out) 132 | } 133 | } 134 | 135 | func TestPutUint32DeltaScalar(t *testing.T) { 136 | count := rand.Intn(4) + 1 137 | nums := util.GenUint32(count) 138 | util.SortUint32(nums) 139 | for i := 4 - count; i > 0; i-- { 140 | nums = append(nums, nums[count-1]) 141 | } 142 | 143 | deltas := make([]uint32, 4) 144 | util.Delta(nums, deltas) 145 | 146 | expected := make([]byte, 4*MaxBytesPerNum) 147 | ctrl := Put4uint32Scalar(deltas, expected) 148 | size := shared.ControlByteToSize(ctrl) 149 | size -= 4 - count 150 | expected = expected[:size] 151 | 152 | out := make([]byte, count*MaxBytesPerNum) 153 | ctrl = PutUint32DeltaScalar(nums[:count], out, count, 0) 154 | size = shared.ControlByteToSize(ctrl) 155 | size -= 4 
- count 156 | out = out[:size] 157 | if !reflect.DeepEqual(expected, out) { 158 | t.Fatalf("expected %+v, got %+v", expected, out) 159 | } 160 | } 161 | 162 | var writeSinkA uint16 163 | 164 | func BenchmarkPut8uint32Fast(b *testing.B) { 165 | if GetMode() == shared.Normal { 166 | b.Skipf("Testing environment doesn't support this test") 167 | } 168 | 169 | count := 8 170 | out := make([]byte, count*MaxBytesPerNum) 171 | nums := util.GenUint32(count) 172 | 173 | var ctrl uint16 174 | b.SetBytes(int64(count * MaxBytesPerNum)) 175 | b.ResetTimer() 176 | for i := 0; i < b.N; i++ { 177 | ctrl = Put8uint32Fast(nums, out) 178 | } 179 | writeSinkA = ctrl 180 | } 181 | 182 | var writeSinkB uint16 183 | 184 | func BenchmarkPut8uint32DeltaFast(b *testing.B) { 185 | if GetMode() == shared.Normal { 186 | b.Skipf("Testing environment doesn't support this test") 187 | } 188 | 189 | count := 8 190 | out := make([]byte, count*MaxBytesPerNum) 191 | nums := util.GenUint32(count) 192 | util.SortUint32(nums) 193 | 194 | var ctrl uint16 195 | b.SetBytes(int64(count * MaxBytesPerNum)) 196 | b.ResetTimer() 197 | for i := 0; i < b.N; i++ { 198 | ctrl = Put8uint32DeltaFast(nums, out, 0) 199 | } 200 | writeSinkB = ctrl 201 | } 202 | 203 | var writeSinkC uint16 204 | 205 | func BenchmarkPut8uint32Scalar(b *testing.B) { 206 | count := 8 207 | out := make([]byte, count*MaxBytesPerNum) 208 | nums := util.GenUint32(count) 209 | 210 | var ctrl uint16 211 | b.SetBytes(int64(count * MaxBytesPerNum)) 212 | b.ResetTimer() 213 | for i := 0; i < b.N; i++ { 214 | ctrl = Put8uint32Scalar(nums, out) 215 | } 216 | writeSinkC = ctrl 217 | } 218 | 219 | var writeSinkD uint16 220 | 221 | func BenchmarkPut8uint32DeltaScalar(b *testing.B) { 222 | count := 8 223 | out := make([]byte, count*MaxBytesPerNum) 224 | nums := util.GenUint32(count) 225 | util.SortUint32(nums) 226 | 227 | var ctrl uint16 228 | b.SetBytes(int64(count * MaxBytesPerNum)) 229 | b.ResetTimer() 230 | for i := 0; i < b.N; i++ { 231 | ctrl = 
Put8uint32DeltaScalar(nums, out, 0) 232 | } 233 | writeSinkD = ctrl 234 | } 235 | 236 | var writeSinkE int 237 | 238 | func BenchmarkPut8uint32Varint(b *testing.B) { 239 | count := 8 240 | out := make([]byte, count*binary.MaxVarintLen32) 241 | nums := util.GenUint32(count) 242 | written := 0 243 | 244 | b.SetBytes(int64(count * MaxBytesPerNum)) 245 | b.ResetTimer() 246 | for i := 0; i < b.N; i++ { 247 | written = util.PutVarint(nums, out) 248 | } 249 | writeSinkE = written 250 | } 251 | 252 | var writeSinkF int 253 | 254 | func BenchmarkPut8uint32DeltaVarint(b *testing.B) { 255 | count := 8 256 | out := make([]byte, count*binary.MaxVarintLen32) 257 | nums := util.GenUint32(count) 258 | util.SortUint32(nums) 259 | written := 0 260 | 261 | b.SetBytes(int64(count * MaxBytesPerNum)) 262 | b.ResetTimer() 263 | for i := 0; i < b.N; i++ { 264 | written = util.PutDeltaVarint(nums, out, 0) 265 | } 266 | writeSinkF = written 267 | } 268 | -------------------------------------------------------------------------------- /pkg/encode/gen.go: -------------------------------------------------------------------------------- 1 | package encode 2 | 3 | //go:generate go run ./main/asm.go -out ./encode_amd64.s 4 | -------------------------------------------------------------------------------- /pkg/encode/main/asm.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | . 
"github.com/mmcloughlin/avo/build" 8 | "github.com/mmcloughlin/avo/operand" 9 | "github.com/mmcloughlin/avo/reg" 10 | "github.com/theMPatel/streamvbyte-simdgo/pkg/shared" 11 | ) 12 | 13 | const ( 14 | name = "Put8uint32FastAsm" 15 | nameDelta = "Put8uint32DeltaFastAsm" 16 | pIn = "in" 17 | pOut = "outBytes" 18 | pShuffle = "shuffle" 19 | pLenTable = "lenTable" 20 | pPrev = "prev" 21 | pR = "r" 22 | ) 23 | 24 | var ( 25 | signature = fmt.Sprintf( 26 | "func(%s []uint32, %s []byte, %s *[256][16]uint8, %s *[256]uint8) (%s uint16)", 27 | pIn, pOut, pShuffle, pLenTable, pR) 28 | 29 | signatureDelta = fmt.Sprintf( 30 | "func(%s []uint32, %s []byte, %s uint32, %s *[256][16]uint8, %s *[256]uint8) (%s uint16)", 31 | pIn, pOut, pPrev, pShuffle, pLenTable, pR) 32 | 33 | mask1111R = ConstData("mask0101", operand.U16(0x0101)) 34 | mask7F00R = ConstData("mask7F00", operand.U16(0x7F00)) 35 | ) 36 | 37 | func main() { 38 | regular() 39 | differential() 40 | Generate() 41 | } 42 | 43 | func differential() { 44 | TEXT(nameDelta, NOSPLIT, signatureDelta) 45 | 46 | prevSingular, err := Param(pPrev).Resolve() 47 | if err != nil { 48 | log.Fatalf("failed to get addr of prev") 49 | } 50 | 51 | firstFour, secondFour := shared.Load8(pIn) 52 | prev := XMM() 53 | VPALIGNR(operand.Imm(12), firstFour, secondFour, prev) 54 | VPSUBD(prev, secondFour, secondFour) 55 | 56 | VBROADCASTSS(prevSingular.Addr, prev) 57 | VPALIGNR(operand.Imm(12), prev, firstFour, prev) 58 | VPSUBD(prev, firstFour, firstFour) 59 | 60 | coreAlgorithm(firstFour, secondFour) 61 | } 62 | 63 | func regular() { 64 | TEXT(name, NOSPLIT, signature) 65 | coreAlgorithm(shared.Load8(pIn)) 66 | } 67 | 68 | func coreAlgorithm(firstFour, secondFour reg.VecVirtual) { 69 | onesMask := XMM() 70 | sevenFzerozero := XMM() 71 | VPBROADCASTW(mask1111R, onesMask) 72 | VPBROADCASTW(mask7F00R, sevenFzerozero) 73 | 74 | minFirstFour := XMM() 75 | minSecondFour := XMM() 76 | VPMINUB(onesMask, firstFour, minFirstFour) 77 | VPMINUB(onesMask, 
secondFour, minSecondFour) 78 | 79 | // Re-use minFirstFour register 80 | VPACKUSWB(minSecondFour, minFirstFour, minFirstFour) 81 | VPMINSW(onesMask, minFirstFour, minFirstFour) 82 | VPADDUSW(sevenFzerozero, minFirstFour, minFirstFour) 83 | 84 | ctrl := GP32() 85 | VPMOVMSKB(minFirstFour, ctrl) 86 | Store(ctrl.As16(), Return(pR)) 87 | 88 | shuffleBase := Load(Param(pShuffle), GP64()) 89 | firstShuffle := shared.CalculateShuffleAddrFromCtrl(shuffleBase, ctrl, false) 90 | secondShuffle := shared.CalculateShuffleAddrFromCtrl(shuffleBase, ctrl, true) 91 | 92 | VPSHUFB(firstShuffle, firstFour, firstFour) 93 | VPSHUFB(secondShuffle, secondFour, secondFour) 94 | 95 | firstAddr := Load(Param(pOut).Base(), GP64()) 96 | secondAddr := GP64() 97 | MOVQ(firstAddr, secondAddr) 98 | 99 | lenAddr, lenValue := shared.LenValueAddr(ctrl, false, pLenTable) 100 | 101 | MOVBQZX(lenAddr, lenValue) 102 | ADDQ(lenValue, secondAddr) 103 | 104 | VMOVDQU(firstFour, operand.Mem{Base: firstAddr}) 105 | VMOVDQU(secondFour, operand.Mem{Base: secondAddr}) 106 | 107 | RET() 108 | } 109 | -------------------------------------------------------------------------------- /pkg/pkg_test.go: -------------------------------------------------------------------------------- 1 | package pkg 2 | 3 | import ( 4 | "math/rand" 5 | "reflect" 6 | "testing" 7 | "time" 8 | 9 | "github.com/theMPatel/streamvbyte-simdgo/pkg/decode" 10 | "github.com/theMPatel/streamvbyte-simdgo/pkg/encode" 11 | "github.com/theMPatel/streamvbyte-simdgo/pkg/util" 12 | ) 13 | 14 | func init() { 15 | rand.Seed(time.Now().UnixNano()) 16 | } 17 | 18 | func TestRoundTripScalar(t *testing.T) { 19 | in := []uint32{1024, 3, 2, 1, 1_073_741_824, 10, 12, 1024} 20 | expectedData := []byte{ 21 | 0x00, 0x04, 0x03, 0x02, 0x01, 0x00, 0x00, 0x00, 0x40, 22 | 0x0a, 0x0c, 0x00, 0x04, 23 | } 24 | 25 | expectedCtrl := uint16(0b01_00_00_11_00_00_00_01) 26 | out := make([]byte, 32) 27 | actualCtrl := encode.Put8uint32Scalar(in, out) 28 | if actualCtrl != 
expectedCtrl { 29 | t.Fatalf("expected: %#016b, got %#016b", expectedCtrl, actualCtrl) 30 | } 31 | 32 | actualData := out[:13] 33 | if !reflect.DeepEqual(expectedData, actualData) { 34 | t.Fatalf("expected %+v, got %+v", expectedData, actualData) 35 | } 36 | 37 | decoded := make([]uint32, 8) 38 | decode.Get8uint32Scalar(actualData, decoded, actualCtrl) 39 | 40 | if !reflect.DeepEqual(in, decoded) { 41 | t.Fatalf("expected %+v, actual %+v", in, decoded) 42 | } 43 | } 44 | 45 | func BenchmarkMemCopy8Uint32(b *testing.B) { 46 | count := 8 47 | nums := make([]uint32, count) 48 | for i := 0; i < count; i++ { 49 | nums[i] = util.RandUint32() 50 | } 51 | 52 | out := make([]uint32, count) 53 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 54 | b.ResetTimer() 55 | for i := 0; i < b.N; i++ { 56 | copy(out, nums) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /pkg/shared/asm.go: -------------------------------------------------------------------------------- 1 | package shared 2 | 3 | import ( 4 | . 
"github.com/mmcloughlin/avo/build" 5 | "github.com/mmcloughlin/avo/operand" 6 | "github.com/mmcloughlin/avo/reg" 7 | ) 8 | 9 | func CalculateShuffleAddrFromCtrl(shuffleBase reg.Register, ctrl reg.GPVirtual, upper bool) operand.Mem { 10 | addr := GP64() 11 | if upper { 12 | MOVWQZX(ctrl.As16(), addr) 13 | SHRQ(operand.Imm(8), addr) 14 | } else { 15 | MOVBQZX(ctrl.As8(), addr) 16 | } 17 | 18 | // Left shift by 4 to get the byte level offset for the shuffle table 19 | SHLQ(operand.Imm(4), addr) 20 | ADDQ(shuffleBase, addr) 21 | 22 | return operand.Mem{Base: addr} 23 | } 24 | 25 | func LenValueAddr(ctrl reg.GPVirtual, upper bool, lenTableParam string) (operand.Mem, reg.GPVirtual) { 26 | lenTableBase := Load(Param(lenTableParam), GP64()) 27 | lenValueAddr := GP64() 28 | if upper { 29 | MOVWQZX(ctrl.As16(), lenValueAddr) 30 | SHRQ(operand.Imm(8), lenValueAddr) 31 | } else { 32 | MOVBQZX(ctrl.As8L(), lenValueAddr) 33 | } 34 | ADDQ(lenTableBase, lenValueAddr) 35 | 36 | return operand.Mem{Base: lenValueAddr}, lenValueAddr 37 | } 38 | 39 | func Load8(paramName string) (reg.VecVirtual, reg.VecVirtual) { 40 | arrBase := operand.Mem{ 41 | Base: Load(Param(paramName).Base(), GP64()), 42 | } 43 | firstFour := XMM() 44 | secondFour := XMM() 45 | VLDDQU(arrBase, firstFour) 46 | VLDDQU(arrBase.Offset(16), secondFour) 47 | 48 | return firstFour, secondFour 49 | } 50 | -------------------------------------------------------------------------------- /pkg/shared/gen.go: -------------------------------------------------------------------------------- 1 | package shared 2 | 3 | //go:generate go run ./main/gentables.go -out ./tables.go -package shared 4 | 5 | func ControlByteToSize(in uint8) int { 6 | return int(PerControlLenTable[in]) 7 | } 8 | 9 | func ControlByteToSizeTwo(in uint16) int { 10 | return int(PerControlLenTable[in&0xff] + PerControlLenTable[in>>8]) 11 | } 12 | -------------------------------------------------------------------------------- /pkg/shared/main/gentables.go: 
-------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "flag" 6 | "fmt" 7 | "go/format" 8 | "io" 9 | "log" 10 | "os" 11 | 12 | "github.com/pkg/errors" 13 | "github.com/theMPatel/streamvbyte-simdgo/pkg/util" 14 | ) 15 | 16 | var ( 17 | fOut = flag.String("out", "", "path to output") 18 | fPackage = flag.String("package", "shared", "package name") 19 | ) 20 | 21 | const MaxControlByte = 1 << 8 22 | 23 | func main() { 24 | flag.Parse() 25 | 26 | if *fOut == "" { 27 | log.Fatalf("outfile cannot be empty") 28 | } 29 | 30 | out := &bytes.Buffer{} 31 | 32 | _, _ = fmt.Fprintln(out, "// Code generated by gentables. DO NOT EDIT.") 33 | _, _ = fmt.Fprintf(out, "\npackage %s\n", *fPackage) 34 | 35 | if err := genPerNumLengthTable(out); err != nil { 36 | log.Fatalf("failed to gen per num length table") 37 | } 38 | 39 | if err := genPerQuadLengthTable(out); err != nil { 40 | log.Fatalf("failed to gen sum length table") 41 | } 42 | 43 | if err := genEncodeShuffleTable(out); err != nil { 44 | log.Fatalf("failed to gen encode shuffle table") 45 | } 46 | 47 | if err := genDecodeShuffleTable(out); err != nil { 48 | log.Fatalf("failed to gen decode shuffle table") 49 | } 50 | 51 | final, err := format.Source(out.Bytes()) 52 | if err != nil { 53 | log.Fatalf("failed to go fmt output") 54 | } 55 | 56 | fileOut, err := os.Create(*fOut) 57 | if err != nil { 58 | log.Fatalf("failed to open: %s %s", *fOut, err) 59 | } 60 | defer util.SilentClose(fileOut) 61 | 62 | _, err = fileOut.Write(final) 63 | if err != nil { 64 | log.Fatalf("failed to write generated tables to file") 65 | } 66 | } 67 | 68 | func newLineAfter(countPerLine int) func(out io.Writer) { 69 | count := 1 70 | return func(out io.Writer) { 71 | if count%countPerLine == 0 { 72 | _, _ = fmt.Fprintln(out, "") 73 | } else { 74 | _, _ = fmt.Fprintf(out, " ") 75 | } 76 | count++ 77 | } 78 | } 79 | 80 | func genPerNumLengthTable(out io.Writer) error { 81 | _, 
_ = fmt.Fprintf(out, "\nvar PerNumLenTable *[256][4]uint8 = &[256][4]uint8{\n") 82 | tabber := newLineAfter(4) 83 | for i := 0; i < MaxControlByte; i++ { 84 | one, two, three, four := sizes(uint8(i)) 85 | _, err := fmt.Fprintf(out, "\t{%d, %d, %d, %d},", one, two, three, four) 86 | if err != nil { 87 | return errors.Wrapf(err, "failed to write per num len: %d", i) 88 | } 89 | tabber(out) 90 | } 91 | _, _ = fmt.Fprintln(out, "}") 92 | return nil 93 | } 94 | 95 | func genPerQuadLengthTable(out io.Writer) error { 96 | _, _ = fmt.Fprintf(out, "\nvar PerControlLenTable *[256]uint8 = &[256]uint8{\n") 97 | tabber := newLineAfter(8) 98 | for i := 0; i < MaxControlByte; i++ { 99 | one, two, three, four := sizes(uint8(i)) 100 | _, err := fmt.Fprintf(out, "\t%d,", one+two+three+four) 101 | if err != nil { 102 | return errors.Wrapf(err, "failed to write summed len: %d", i) 103 | } 104 | tabber(out) 105 | } 106 | _, _ = fmt.Fprintln(out, "}") 107 | return nil 108 | } 109 | 110 | const ( 111 | shuffleFmtStr = "%#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x}," 112 | commentStr = "\t// %d\t%#02x\t%08b\tlen\t%d\t%d\t%d\t%d\n" 113 | ) 114 | 115 | func genEncodeShuffleTable(out io.Writer) error { 116 | _, _ = fmt.Fprintf(out, "\nvar EncodeShuffleTable *[256][16]uint8 = &[256][16]uint8{\n") 117 | tabber := newLineAfter(1) 118 | for i := 0; i < MaxControlByte; i++ { 119 | one, two, three, four := sizes(uint8(i)) 120 | _, _ = fmt.Fprintf(out, commentStr, i, i, i, one, two, three, four) 121 | _, err := fmt.Fprintf(out, "\t{") 122 | if err != nil { 123 | return errors.Wrapf(err, "failed to write encode shuffle table") 124 | } 125 | 126 | var positions []interface{} 127 | var base uint8 128 | for _, size := range []uint8{one, two, three, four} { 129 | for j := uint8(0); j < size; j++ { 130 | positions = append(positions, base+j) 131 | } 132 | base += 4 133 | } 134 | 135 | for len(positions) < 16 { 136 | positions = 
append(positions, 0xff) 137 | } 138 | _, err = fmt.Fprintf(out, shuffleFmtStr, positions...) 139 | if err != nil { 140 | return errors.Wrapf(err, "failed to write per num len: %d", i) 141 | } 142 | tabber(out) 143 | } 144 | _, _ = fmt.Fprintln(out, "}") 145 | return nil 146 | } 147 | 148 | func genDecodeShuffleTable(out io.Writer) error { 149 | _, _ = fmt.Fprintf(out, "\nvar DecodeShuffleTable *[256][16]uint8 = &[256][16]uint8{\n") 150 | tabber := newLineAfter(1) 151 | for i := 0; i < MaxControlByte; i++ { 152 | one, two, three, four := sizes(uint8(i)) 153 | _, _ = fmt.Fprintf(out, commentStr, i, i, i, one, two, three, four) 154 | _, err := fmt.Fprintf(out, "\t{") 155 | if err != nil { 156 | return errors.Wrapf(err, "failed to write encode shuffle table") 157 | } 158 | 159 | var positions []interface{} 160 | var pos uint8 161 | for _, size := range []uint8{one, two, three, four} { 162 | for j := 0; j < 4; j++ { 163 | if size > 0 { 164 | positions = append(positions, pos) 165 | pos++ 166 | size-- 167 | } else { 168 | positions = append(positions, 0xff) 169 | } 170 | } 171 | } 172 | 173 | _, err = fmt.Fprintf(out, shuffleFmtStr, positions...) 174 | if err != nil { 175 | return errors.Wrapf(err, "failed to write per num len: %d", i) 176 | } 177 | tabber(out) 178 | } 179 | _, _ = fmt.Fprintln(out, "}") 180 | return nil 181 | } 182 | 183 | // sizes returns the length in bytes for each of the four numbers 184 | // represented by the provided control byte. 185 | func sizes(control uint8) (one uint8, two uint8, three uint8, four uint8) { 186 | one = (control & 3) + 1 187 | two = (control >> 2 & 3) + 1 188 | three = (control >> 4 & 3) + 1 189 | four = (control >> 6 & 3) + 1 190 | return 191 | } 192 | -------------------------------------------------------------------------------- /pkg/shared/mode.go: -------------------------------------------------------------------------------- 1 | package shared 2 | 3 | // PerformanceMode indicates which mode the code is operating under. 
// PerformanceMode indicates which mode the code is operating under. If Normal,
// then the code is NOT using special hardware instructions and is instead relying
// on portable Go code. If Fast, then the code IS using special hardware instructions
// that are platform dependent. Each package exports a func that can be used to debug
// or inspect the configuration at runtime.
type PerformanceMode int

const (
	// Normal selects the portable Go implementation.
	Normal PerformanceMode = iota
	// Fast selects the implementation that uses special hardware instructions.
	Fast
)

// CheckMode is the signature of a function that reports the active
// PerformanceMode.
type CheckMode func() PerformanceMode
36 | func ReadAllDelta(count int, stream []byte, out []uint32, prev uint32) { 37 | if decode.GetMode() == shared.Fast { 38 | ReadAllDeltaFast(count, stream, out, prev) 39 | } else { 40 | ReadAllDeltaScalar(count, stream, out, prev) 41 | } 42 | } 43 | 44 | // ReadAllScalar will read the entire input stream into out according to the 45 | // Stream VByte format. 46 | // 47 | // Note: It is your responsibility to ensure that the incoming slices are 48 | // appropriately sized as well as tracking the count of integers in the 49 | // stream. 50 | func ReadAllScalar(count int, stream []byte, out []uint32) { 51 | var ( 52 | ctrlLen = (count + 3) / 4 53 | 54 | dataPos = ctrlLen 55 | ctrlPos = 0 56 | decoded = 0 57 | lowestJump = count &^ (jump - 1) 58 | lowest4 = count &^ 3 59 | ) 60 | 61 | for ; decoded < lowestJump; decoded += jump { 62 | data := stream[dataPos:] 63 | ctrls := stream[ctrlPos : ctrlPos+jumpCtrl] 64 | nums := out[decoded : decoded+jump] 65 | 66 | ctrl := ctrls[0] 67 | decode.Get4uint32Scalar(data, nums, ctrl) 68 | sizeA := shared.ControlByteToSize(ctrl) 69 | 70 | ctrl = ctrls[1] 71 | decode.Get4uint32Scalar(data[sizeA:], nums[4:], ctrl) 72 | sizeB := shared.ControlByteToSize(ctrl) 73 | 74 | ctrl = ctrls[2] 75 | decode.Get4uint32Scalar(data[sizeA+sizeB:], nums[8:], ctrl) 76 | sizeC := shared.ControlByteToSize(ctrl) 77 | 78 | ctrl = ctrls[3] 79 | decode.Get4uint32Scalar(data[sizeA+sizeB+sizeC:], nums[12:], ctrl) 80 | sizeD := shared.ControlByteToSize(ctrl) 81 | 82 | dataPos += sizeA + sizeB + sizeC + sizeD 83 | ctrlPos += jumpCtrl 84 | } 85 | 86 | for ; decoded < lowest4; decoded += 4 { 87 | ctrl := stream[ctrlPos] 88 | decode.Get4uint32Scalar(stream[dataPos:], out[decoded:], ctrl) 89 | size := shared.ControlByteToSize(ctrl) 90 | dataPos += size 91 | ctrlPos++ 92 | } 93 | 94 | if lowest4 != count { 95 | decode.GetUint32Scalar(stream[dataPos:], out[decoded:], stream[ctrlPos], count-lowest4) 96 | } 97 | } 98 | 99 | // ReadAllDeltaScalar will read the entire 
input stream into out according to the 100 | // Stream VByte format. It will reconstruct the original non differentially 101 | // encoded values. 102 | // 103 | // Note: It is your responsibility to ensure that the incoming slices are 104 | // appropriately sized as well as tracking the count of integers in the 105 | // stream. 106 | func ReadAllDeltaScalar(count int, stream []byte, out []uint32, prev uint32) { 107 | var ( 108 | ctrlLen = (count + 3) / 4 109 | 110 | dataPos = ctrlLen 111 | ctrlPos = 0 112 | decoded = 0 113 | lowestJump = count &^ (jump - 1) 114 | lowest4 = count &^ 3 115 | ) 116 | 117 | for ; decoded < lowestJump; decoded += jump { 118 | data := stream[dataPos:] 119 | ctrls := stream[ctrlPos : ctrlPos+jumpCtrl] 120 | nums := out[decoded : decoded+jump] 121 | 122 | ctrl := ctrls[0] 123 | decode.Get4uint32DeltaScalar(data, nums, ctrl, prev) 124 | sizeA := shared.ControlByteToSize(ctrl) 125 | 126 | ctrl = ctrls[1] 127 | decode.Get4uint32DeltaScalar(data[sizeA:], nums[4:], ctrl, nums[3]) 128 | sizeB := shared.ControlByteToSize(ctrl) 129 | 130 | ctrl = ctrls[2] 131 | decode.Get4uint32DeltaScalar(data[sizeA+sizeB:], nums[8:], ctrl, nums[7]) 132 | sizeC := shared.ControlByteToSize(ctrl) 133 | 134 | ctrl = ctrls[3] 135 | decode.Get4uint32DeltaScalar(data[sizeA+sizeB+sizeC:], nums[12:], ctrl, nums[11]) 136 | sizeD := shared.ControlByteToSize(ctrl) 137 | 138 | dataPos += sizeA + sizeB + sizeC + sizeD 139 | ctrlPos += jumpCtrl 140 | prev = nums[15] 141 | } 142 | 143 | for ; decoded < lowest4; decoded += 4 { 144 | ctrl := stream[ctrlPos] 145 | decode.Get4uint32DeltaScalar(stream[dataPos:], out[decoded:], ctrl, prev) 146 | size := shared.ControlByteToSize(ctrl) 147 | dataPos += size 148 | ctrlPos++ 149 | prev = out[decoded+3] 150 | } 151 | 152 | if lowest4 != count { 153 | decode.GetUint32DeltaScalar(stream[dataPos:], out[decoded:], stream[ctrlPos], count-lowest4, prev) 154 | } 155 | } 156 | 
// ReadAllFast will read the entire input stream into out according to the
// Stream VByte format using special hardware instructions.
//
// Decoding runs in three phases: a bulk loop handling 32 integers (eight
// control bytes) per iteration, a loop handling 8 integers (two control bytes)
// per iteration, and a scalar loop for the final control bytes, which are kept
// away from the assembly routine's 16-byte loads.
//
// Note: It is your responsibility to ensure that the incoming slices are
// appropriately sized as well as tracking the count of integers in the
// stream.
func ReadAllFast(count int, stream []byte, out []uint32) {
	var (
		ctrlPos = 0
		decoded = 0
		// One control byte per four integers; the data bytes start right
		// after the control bytes.
		dataPos = (count + 3) / 4
		ctrlLen = dataPos
		// lowest32 is the limit for the count of integers we'll read in
		// bulk 8 at a time directly from the input stream. We subtract 3
		// here since we load 16 bytes at a time in the assembly code. If
		// you attempt to load the last few control bytes worth of data,
		// it's possible there won't be enough bytes in the data stream to
		// support it, which can lead to loading from uninitialized memory.
		//
		// [ _ _ _ _ | _ _ _ _ | _ _ _ _ | _ _ _ _ ]
		// works fine --^         ^-- bad things here
		//
		// Imagine the last group of 16 in the above array is all encoded with
		// 4 bytes. Decoding the first 4 integers in that group will work fine,
		// since it will load the last 3 (unused) bytes. However, when attempting
		// to decode the last three groups of 4, each load will need an extra
		// 1, 2, or 3 bytes (respectively) in order to be considered safe.
		lowest32 = ((ctrlLen - 3) * 4) &^ 31
	)

	// Bulk phase: four assembly calls of eight integers each per iteration.
	for ; decoded < lowest32; decoded += 32 {
		data := stream[dataPos:]
		ctrls := stream[ctrlPos : ctrlPos+8]
		nums := out[decoded : decoded+32]

		// Two consecutive control bytes form one 16-bit control value, with
		// the earlier byte in the low half.
		ctrl := uint16(ctrls[0]) | uint16(ctrls[1])<<8
		decode.Get8uint32FastAsm(
			data,
			nums,
			ctrl,
			shared.DecodeShuffleTable,
			shared.PerControlLenTable,
		)
		sizeA := shared.ControlByteToSize(ctrls[0]) + shared.ControlByteToSize(ctrls[1])

		ctrl = uint16(ctrls[2]) | uint16(ctrls[3])<<8
		decode.Get8uint32FastAsm(
			data[sizeA:],
			nums[8:],
			ctrl,
			shared.DecodeShuffleTable,
			shared.PerControlLenTable,
		)
		sizeB := shared.ControlByteToSize(ctrls[2]) + shared.ControlByteToSize(ctrls[3])

		ctrl = uint16(ctrls[4]) | uint16(ctrls[5])<<8
		decode.Get8uint32FastAsm(
			data[sizeA+sizeB:],
			nums[16:],
			ctrl,
			shared.DecodeShuffleTable,
			shared.PerControlLenTable,
		)
		sizeC := shared.ControlByteToSize(ctrls[4]) + shared.ControlByteToSize(ctrls[5])

		ctrl = uint16(ctrls[6]) | uint16(ctrls[7])<<8
		decode.Get8uint32FastAsm(
			data[sizeA+sizeB+sizeC:],
			nums[24:],
			ctrl,
			shared.DecodeShuffleTable,
			shared.PerControlLenTable,
		)
		sizeD := shared.ControlByteToSize(ctrls[6]) + shared.ControlByteToSize(ctrls[7])

		dataPos += sizeA + sizeB + sizeC + sizeD
		ctrlPos += 8
	}

	// Must be strictly less than the last 4 blocks of integers, since we can't safely
	// decode 8 if our ctrl pos starts at the first 4 in the block.
	for ; ctrlPos < ctrlLen-4; ctrlPos += 2 {
		ctrl := uint16(stream[ctrlPos]) | uint16(stream[ctrlPos+1])<<8
		decode.Get8uint32FastAsm(
			stream[dataPos:],
			out[decoded:],
			ctrl,
			shared.DecodeShuffleTable,
			shared.PerControlLenTable,
		)
		dataPos += shared.ControlByteToSize(stream[ctrlPos]) + shared.ControlByteToSize(stream[ctrlPos+1])
		decoded += 8
	}

	// Scalar tail: one control byte at a time, decoding at most four integers
	// and fewer for a trailing partial group.
	for ; ctrlPos < ctrlLen; ctrlPos += 1 {
		nums := count - decoded
		if nums > 4 {
			nums = 4
		}
		// The scalar decoder's return value is used as the number of data
		// bytes to advance by.
		dataPos += decode.GetUint32Scalar(
			stream[dataPos:],
			out[decoded:],
			stream[ctrlPos],
			nums,
		)
		decoded += nums
	}
}

// ReadAllDeltaFast will read the entire input stream into out according to the
// Stream VByte format using special hardware instructions. It will reconstruct
// the original non differentially encoded values.
//
// The phases mirror ReadAllFast; in addition, every decode call is seeded with
// the last value produced by the call before it (prev for the first call).
//
// Note: It is your responsibility to ensure that the incoming slices are
// appropriately sized as well as tracking the count of integers in the
// stream.
func ReadAllDeltaFast(count int, stream []byte, out []uint32, prev uint32) {
	var (
		ctrlPos = 0
		decoded = 0
		// One control byte per four integers; the data bytes start right
		// after the control bytes.
		dataPos = (count + 3) / 4
		ctrlLen = dataPos
		// lowest32 is the limit for the count of integers we'll read in
		// bulk 8 at a time directly from the input stream. We subtract 3
		// here since we load 16 bytes at a time in the assembly code. If
		// you attempt to load the last few control bytes worth of data,
		// it's possible there won't be enough bytes in the data stream to
		// support it, which can lead to loading from uninitialized memory.
		//
		// [ _ _ _ _ | _ _ _ _ | _ _ _ _ | _ _ _ _ ]
		// works fine --^         ^-- bad things here
		//
		// Imagine the last group of 16 in the above array is all encoded with
		// 4 bytes. Decoding the first 4 integers in that group will work fine,
		// since it will load the last 3 (unused) bytes. However, when attempting
		// to decode the last three groups of 4, each load will need an extra
		// 1, 2, or 3 bytes (respectively) in order to be considered safe.
		lowest32 = ((ctrlLen - 3) * 4) &^ 31
	)

	// Bulk phase: four assembly calls of eight integers each per iteration.
	for ; decoded < lowest32; decoded += 32 {
		data := stream[dataPos:]
		ctrls := stream[ctrlPos : ctrlPos+8]
		nums := out[decoded : decoded+32]

		ctrl := uint16(ctrls[0]) | uint16(ctrls[1])<<8
		decode.Get8uint32DeltaFastAsm(
			data,
			nums,
			ctrl,
			prev,
			shared.DecodeShuffleTable,
			shared.PerControlLenTable,
		)
		sizeA := shared.ControlByteToSize(ctrls[0]) + shared.ControlByteToSize(ctrls[1])

		ctrl = uint16(ctrls[2]) | uint16(ctrls[3])<<8
		decode.Get8uint32DeltaFastAsm(
			data[sizeA:],
			nums[8:],
			ctrl,
			nums[7],
			shared.DecodeShuffleTable,
			shared.PerControlLenTable,
		)
		sizeB := shared.ControlByteToSize(ctrls[2]) + shared.ControlByteToSize(ctrls[3])

		ctrl = uint16(ctrls[4]) | uint16(ctrls[5])<<8
		decode.Get8uint32DeltaFastAsm(
			data[sizeA+sizeB:],
			nums[16:],
			ctrl,
			nums[15],
			shared.DecodeShuffleTable,
			shared.PerControlLenTable,
		)
		sizeC := shared.ControlByteToSize(ctrls[4]) + shared.ControlByteToSize(ctrls[5])

		ctrl = uint16(ctrls[6]) | uint16(ctrls[7])<<8
		decode.Get8uint32DeltaFastAsm(
			data[sizeA+sizeB+sizeC:],
			nums[24:],
			ctrl,
			nums[23],
			shared.DecodeShuffleTable,
			shared.PerControlLenTable,
		)
		sizeD := shared.ControlByteToSize(ctrls[6]) + shared.ControlByteToSize(ctrls[7])

		dataPos += sizeA + sizeB + sizeC + sizeD
		ctrlPos += 8
		// Carry the last decoded value into the next iteration.
		prev = nums[31]
	}

	// Must be strictly less than the last 4 blocks of integers, since we can't safely
	// decode 8 if our ctrl pos starts at the first 4 in the block.
	for ; ctrlPos < ctrlLen-4; ctrlPos += 2 {
		ctrl := uint16(stream[ctrlPos]) | uint16(stream[ctrlPos+1])<<8
		decode.Get8uint32DeltaFastAsm(
			stream[dataPos:],
			out[decoded:],
			ctrl,
			prev,
			shared.DecodeShuffleTable,
			shared.PerControlLenTable,
		)
		dataPos += shared.ControlByteToSize(stream[ctrlPos]) + shared.ControlByteToSize(stream[ctrlPos+1])
		decoded += 8
		prev = out[decoded-1]
	}

	// Scalar tail: one control byte at a time, decoding at most four integers
	// and fewer for a trailing partial group.
	for ; ctrlPos < ctrlLen; ctrlPos += 1 {
		nums := count - decoded
		if nums > 4 {
			nums = 4
		}
		dataPos += decode.GetUint32DeltaScalar(
			stream[dataPos:],
			out[decoded:],
			stream[ctrlPos],
			nums,
			prev,
		)
		decoded += nums
		prev = out[decoded-1]
	}
}
// ReadAllFast is the placeholder used on platforms without the assembly
// decoder. ReadAll only calls it when decode.GetMode reports shared.Fast,
// which is presumably never the case on these platforms, so it panics if
// reached.
func ReadAllFast(count int, stream []byte, out []uint32) {
	panic("unreachable")
}

// ReadAllDeltaFast is the placeholder used on platforms without the assembly
// decoder. ReadAllDelta references it unconditionally, so it must exist for
// the package to build on every architecture; it panics if reached.
func ReadAllDeltaFast(count int, stream []byte, out []uint32, prev uint32) {
	panic("unreachable")
}
count := int(util.RandUint32() % 1e6) 26 | nums := util.GenUint32(count) 27 | stream := writer.WriteAllScalar(nums) 28 | t.Run(fmt.Sprintf("ReadAll: %d", count), func(t *testing.T) { 29 | out := make([]uint32, count) 30 | ReadAllScalar(count, stream, out) 31 | if !reflect.DeepEqual(nums, out) { 32 | t.Fatalf("decoded wrong nums") 33 | } 34 | }) 35 | } 36 | } 37 | 38 | func TestReadAllDeltaScalar(t *testing.T) { 39 | for i := 0; i < 6; i++ { 40 | count := int(util.RandUint32() % 1e6) 41 | nums := util.GenUint32(count) 42 | util.SortUint32(nums) 43 | stream := writer.WriteAllDeltaScalar(nums, 0) 44 | t.Run(fmt.Sprintf("ReadAll: %d", count), func(t *testing.T) { 45 | out := make([]uint32, count) 46 | ReadAllDeltaScalar(count, stream, out, 0) 47 | if !reflect.DeepEqual(nums, out) { 48 | t.Fatalf("decoded wrong nums") 49 | } 50 | }) 51 | } 52 | } 53 | 54 | func TestReadAllFast(t *testing.T) { 55 | if decode.GetMode() == shared.Normal { 56 | t.Skipf("Testing environment doesn't support this test") 57 | } 58 | 59 | for i := 0; i < 6; i++ { 60 | count := int(util.RandUint32() % 1e6) 61 | nums := util.GenUint32(count) 62 | stream := writer.WriteAllScalar(nums) 63 | t.Run(fmt.Sprintf("ReadAll: %d", count), func(t *testing.T) { 64 | out := make([]uint32, count) 65 | ReadAllFast(count, stream, out) 66 | if !reflect.DeepEqual(nums, out) { 67 | t.Fatalf("decoded wrong nums") 68 | } 69 | }) 70 | } 71 | } 72 | 73 | func TestReadAllDeltaFast(t *testing.T) { 74 | if decode.GetMode() == shared.Normal { 75 | t.Skipf("Testing environment doesn't support this test") 76 | } 77 | 78 | for i := 0; i < 6; i++ { 79 | count := int(util.RandUint32() % 1e6) 80 | nums := util.GenUint32(count) 81 | util.SortUint32(nums) 82 | diffed := make([]uint32, count) 83 | util.Delta(nums, diffed) 84 | 85 | stream := writer.WriteAllScalar(diffed) 86 | t.Run(fmt.Sprintf("ReadAll: %d", count), func(t *testing.T) { 87 | out := make([]uint32, count) 88 | ReadAllDeltaFast(count, stream, out, 0) 89 | if 
!reflect.DeepEqual(nums, out) { 90 | t.Fatalf("decoded wrong nums") 91 | } 92 | }) 93 | } 94 | } 95 | 96 | var readSinkA []uint32 97 | 98 | func BenchmarkReadAllFast(b *testing.B) { 99 | if decode.GetMode() == shared.Normal { 100 | b.Skipf("Testing environment doesn't support this test") 101 | } 102 | 103 | for i := 0; i < 8; i++ { 104 | count := int(math.Pow10(i)) 105 | nums := util.GenUint32(count) 106 | stream := writer.WriteAllScalar(nums) 107 | out := make([]uint32, count) 108 | b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) { 109 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 110 | b.ResetTimer() 111 | for i := 0; i < b.N; i++ { 112 | ReadAllFast(count, stream, out) 113 | } 114 | readSinkA = out 115 | }) 116 | } 117 | } 118 | 119 | var readSinkB []uint32 120 | 121 | func BenchmarkReadAllDeltaFast(b *testing.B) { 122 | if decode.GetMode() == shared.Normal { 123 | b.Skipf("Testing environment doesn't support this test") 124 | } 125 | 126 | for i := 0; i < 8; i++ { 127 | count := int(math.Pow10(i)) 128 | nums := util.GenUint32(count) 129 | util.SortUint32(nums) 130 | stream := writer.WriteAllDeltaScalar(nums, 0) 131 | out := make([]uint32, count) 132 | b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) { 133 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 134 | b.ResetTimer() 135 | for i := 0; i < b.N; i++ { 136 | ReadAllDeltaFast(count, stream, out, 0) 137 | } 138 | readSinkB = out 139 | }) 140 | } 141 | } 142 | 143 | var readSinkC []uint32 144 | 145 | func BenchmarkReadAllScalar(b *testing.B) { 146 | for i := 0; i < 8; i++ { 147 | count := int(math.Pow10(i)) 148 | nums := util.GenUint32(count) 149 | util.SortUint32(nums) 150 | stream := writer.WriteAllScalar(nums) 151 | out := make([]uint32, count) 152 | b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) { 153 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 154 | b.ResetTimer() 155 | for i := 0; i < b.N; i++ { 156 | ReadAllScalar(count, stream, out) 157 | } 158 | readSinkC = out 159 | 
}) 160 | } 161 | } 162 | 163 | var readSinkD []uint32 164 | 165 | func BenchmarkReadAllDeltaScalar(b *testing.B) { 166 | for i := 0; i < 8; i++ { 167 | count := int(math.Pow10(i)) 168 | nums := util.GenUint32(count) 169 | util.SortUint32(nums) 170 | stream := writer.WriteAllDeltaScalar(nums, 0) 171 | out := make([]uint32, count) 172 | b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) { 173 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 174 | b.ResetTimer() 175 | for i := 0; i < b.N; i++ { 176 | ReadAllDeltaScalar(count, stream, out, 0) 177 | } 178 | readSinkD = out 179 | }) 180 | } 181 | } 182 | 183 | var readSinkE []uint32 184 | 185 | func BenchmarkReadAllVarint(b *testing.B) { 186 | for i := 0; i < 8; i++ { 187 | count := int(math.Pow10(i)) 188 | out := make([]uint32, count) 189 | data := make([]byte, binary.MaxVarintLen32*count) 190 | nums := util.GenUint32(count) 191 | util.SortUint32(nums) 192 | written := util.PutVarint(nums, data) 193 | data = data[:written] 194 | b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) { 195 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 196 | b.ResetTimer() 197 | for i := 0; i < b.N; i++ { 198 | util.GetVarint(data, out) 199 | } 200 | readSinkB = out 201 | }) 202 | } 203 | } 204 | 205 | var readSinkF []uint32 206 | 207 | func BenchmarkReadAllDeltaVarint(b *testing.B) { 208 | for i := 0; i < 8; i++ { 209 | count := int(math.Pow10(i)) 210 | out := make([]uint32, count) 211 | data := make([]byte, binary.MaxVarintLen32*count) 212 | nums := util.GenUint32(count) 213 | util.SortUint32(nums) 214 | written := util.PutDeltaVarint(nums, data, 0) 215 | data = data[:written] 216 | b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) { 217 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 218 | b.ResetTimer() 219 | for i := 0; i < b.N; i++ { 220 | util.GetDeltaVarint(data, out, 0) 221 | } 222 | readSinkB = out 223 | }) 224 | } 225 | } 226 | -------------------------------------------------------------------------------- 
/pkg/stream/writer/writer.go: -------------------------------------------------------------------------------- 1 | package writer 2 | 3 | import ( 4 | "github.com/theMPatel/streamvbyte-simdgo/pkg/encode" 5 | "github.com/theMPatel/streamvbyte-simdgo/pkg/shared" 6 | ) 7 | 8 | const ( 9 | jump = 16 10 | jumpCtrl = jump / 4 11 | ) 12 | 13 | // WriteAll will encode all the integers from in using the Stream VByte 14 | // format and will return the byte array holding the encoded data. It will 15 | // select the best implementation depending on the presence of special 16 | // hardware instructions. 17 | func WriteAll(in []uint32) []byte { 18 | if encode.GetMode() == shared.Fast { 19 | return WriteAllFast(in) 20 | } else { 21 | return WriteAllScalar(in) 22 | } 23 | } 24 | 25 | // WriteAllDelta will differentially encode all the integers from in using 26 | // the Stream VByte format and will return the byte array holding the encoded 27 | // data. It will select the best implementation depending on the presence of 28 | // special hardware instructions. 29 | func WriteAllDelta(in []uint32, prev uint32) []byte { 30 | if encode.GetMode() == shared.Fast { 31 | return WriteAllDeltaFast(in, prev) 32 | } else { 33 | return WriteAllDeltaScalar(in, prev) 34 | } 35 | } 36 | 37 | // WriteAllScalar will encode all the integers from in using the Stream VByte 38 | // format and will return the byte array holding the encoded data. 
39 | func WriteAllScalar(in []uint32) []byte { 40 | var ( 41 | count = len(in) 42 | ctrlLen = (count + 3) / 4 43 | stream = make([]byte, ctrlLen+(encode.MaxBytesPerNum*count)) 44 | 45 | dataPos = ctrlLen 46 | ctrlPos = 0 47 | encoded = 0 48 | lowestJump = count &^ (jump - 1) 49 | lowest4 = count &^ 3 50 | ) 51 | 52 | for ; encoded < lowestJump; encoded += jump { 53 | nums := in[encoded : encoded+jump] 54 | data := stream[dataPos:] 55 | ctrls := stream[ctrlPos : ctrlPos+jumpCtrl] 56 | 57 | ctrl := encode.Put4uint32Scalar(nums, data) 58 | ctrls[0] = ctrl 59 | sizeA := shared.ControlByteToSize(ctrl) 60 | 61 | ctrl = encode.Put4uint32Scalar(nums[4:], data[sizeA:]) 62 | ctrls[1] = ctrl 63 | sizeB := shared.ControlByteToSize(ctrl) 64 | 65 | ctrl = encode.Put4uint32Scalar(nums[8:], data[sizeA+sizeB:]) 66 | ctrls[2] = ctrl 67 | sizeC := shared.ControlByteToSize(ctrl) 68 | 69 | ctrl = encode.Put4uint32Scalar(nums[12:], data[sizeA+sizeB+sizeC:]) 70 | ctrls[3] = ctrl 71 | sizeD := shared.ControlByteToSize(ctrl) 72 | 73 | dataPos += sizeA + sizeB + sizeC + sizeD 74 | ctrlPos += jumpCtrl 75 | } 76 | 77 | for ; encoded < lowest4; encoded += 4 { 78 | ctrl := encode.Put4uint32Scalar(in[encoded:], stream[dataPos:]) 79 | stream[ctrlPos] = ctrl 80 | size := shared.ControlByteToSize(ctrl) 81 | dataPos += size 82 | ctrlPos++ 83 | } 84 | 85 | if lowest4 != count { 86 | nums := count - lowest4 87 | ctrl := encode.PutUint32Scalar(in[encoded:], stream[dataPos:], nums) 88 | size := shared.ControlByteToSize(ctrl) 89 | size -= 4 - nums 90 | dataPos += size 91 | stream[ctrlPos] = ctrl 92 | } 93 | 94 | return stream[:dataPos] 95 | } 96 | 97 | // WriteAllDeltaScalar will differentially encode all the integers from in using 98 | // the Stream VByte format and will return the byte array holding the encoded data. 
99 | func WriteAllDeltaScalar(in []uint32, prev uint32) []byte { 100 | var ( 101 | count = len(in) 102 | ctrlLen = (count + 3) / 4 103 | stream = make([]byte, ctrlLen+(encode.MaxBytesPerNum*count)) 104 | 105 | dataPos = ctrlLen 106 | ctrlPos = 0 107 | encoded = 0 108 | lowestJump = count &^ (jump - 1) 109 | lowest4 = count &^ 3 110 | ) 111 | 112 | for ; encoded < lowestJump; encoded += jump { 113 | nums := in[encoded : encoded+jump] 114 | data := stream[dataPos:] 115 | ctrls := stream[ctrlPos : ctrlPos+jumpCtrl] 116 | 117 | ctrl := encode.Put4uint32DeltaScalar(nums, data, prev) 118 | ctrls[0] = ctrl 119 | sizeA := shared.ControlByteToSize(ctrl) 120 | 121 | ctrl = encode.Put4uint32DeltaScalar(nums[4:], data[sizeA:], nums[3]) 122 | ctrls[1] = ctrl 123 | sizeB := shared.ControlByteToSize(ctrl) 124 | 125 | ctrl = encode.Put4uint32DeltaScalar(nums[8:], data[sizeA+sizeB:], nums[7]) 126 | ctrls[2] = ctrl 127 | sizeC := shared.ControlByteToSize(ctrl) 128 | 129 | ctrl = encode.Put4uint32DeltaScalar(nums[12:], data[sizeA+sizeB+sizeC:], nums[11]) 130 | ctrls[3] = ctrl 131 | sizeD := shared.ControlByteToSize(ctrl) 132 | 133 | dataPos += sizeA + sizeB + sizeC + sizeD 134 | ctrlPos += jumpCtrl 135 | prev = nums[15] 136 | } 137 | 138 | for ; encoded < lowest4; encoded += 4 { 139 | ctrl := encode.Put4uint32DeltaScalar(in[encoded:], stream[dataPos:], prev) 140 | stream[ctrlPos] = ctrl 141 | size := shared.ControlByteToSize(ctrl) 142 | dataPos += size 143 | ctrlPos++ 144 | prev = in[encoded+3] 145 | } 146 | 147 | if lowest4 != count { 148 | nums := count - lowest4 149 | ctrl := encode.PutUint32DeltaScalar(in[encoded:], stream[dataPos:], nums, prev) 150 | size := shared.ControlByteToSize(ctrl) 151 | size -= 4 - nums 152 | dataPos += size 153 | stream[ctrlPos] = ctrl 154 | } 155 | 156 | return stream[:dataPos] 157 | } 158 | -------------------------------------------------------------------------------- /pkg/stream/writer/writer_amd64.go: 
-------------------------------------------------------------------------------- 1 | // +build amd64 2 | 3 | package writer 4 | 5 | import ( 6 | "github.com/theMPatel/streamvbyte-simdgo/pkg/encode" 7 | "github.com/theMPatel/streamvbyte-simdgo/pkg/shared" 8 | ) 9 | 10 | // WriteAllFast will encode all the integers from in using the Stream VByte 11 | // format using special hardware instructions and will return the byte array 12 | // holding the encoded data. 13 | func WriteAllFast(in []uint32) []byte { 14 | var ( 15 | count = len(in) 16 | ctrlLen = (count + 3) / 4 17 | stream = make([]byte, ctrlLen+(encode.MaxBytesPerNum*count)) 18 | 19 | dataPos = ctrlLen 20 | ctrlPos = 0 21 | encoded = 0 22 | lowest32 = ((ctrlLen - 3) * 4) &^ 31 23 | ) 24 | 25 | for ; encoded < lowest32; encoded += 32 { 26 | ctrls := stream[ctrlPos : ctrlPos+8] 27 | nums := in[encoded : encoded+32] 28 | out := stream[dataPos:] 29 | 30 | ctrl := encode.Put8uint32FastAsm( 31 | nums[0:8], 32 | out, 33 | shared.EncodeShuffleTable, 34 | shared.PerControlLenTable, 35 | ) 36 | 37 | ctrls[0] = uint8(ctrl & 0xff) 38 | ctrls[1] = uint8(ctrl >> 8) 39 | sizeA := shared.ControlByteToSizeTwo(ctrl) 40 | 41 | ctrl = encode.Put8uint32FastAsm( 42 | nums[8:16], 43 | out[sizeA:], 44 | shared.EncodeShuffleTable, 45 | shared.PerControlLenTable, 46 | ) 47 | 48 | ctrls[2] = uint8(ctrl & 0xff) 49 | ctrls[3] = uint8(ctrl >> 8) 50 | sizeB := shared.ControlByteToSizeTwo(ctrl) 51 | 52 | ctrl = encode.Put8uint32FastAsm( 53 | nums[16:24], 54 | out[sizeA+sizeB:], 55 | shared.EncodeShuffleTable, 56 | shared.PerControlLenTable, 57 | ) 58 | 59 | ctrls[4] = uint8(ctrl & 0xff) 60 | ctrls[5] = uint8(ctrl >> 8) 61 | sizeC := shared.ControlByteToSizeTwo(ctrl) 62 | 63 | ctrl = encode.Put8uint32FastAsm( 64 | nums[24:], 65 | out[sizeA+sizeB+sizeC:], 66 | shared.EncodeShuffleTable, 67 | shared.PerControlLenTable, 68 | ) 69 | 70 | ctrls[6] = uint8(ctrl & 0xff) 71 | ctrls[7] = uint8(ctrl >> 8) 72 | sizeD := shared.ControlByteToSizeTwo(ctrl) 
73 | 74 | ctrlPos += 8 75 | dataPos += sizeA + sizeB + sizeC + sizeD 76 | } 77 | 78 | for ; ctrlPos < ctrlLen-2; ctrlPos += 2 { 79 | ctrl := encode.Put8uint32FastAsm( 80 | in[encoded:], 81 | stream[dataPos:], 82 | shared.EncodeShuffleTable, 83 | shared.PerControlLenTable, 84 | ) 85 | 86 | stream[ctrlPos] = uint8(ctrl & 0xff) 87 | stream[ctrlPos+1] = uint8(ctrl >> 8) 88 | encoded += 8 89 | dataPos += shared.ControlByteToSizeTwo(ctrl) 90 | } 91 | 92 | for ; ctrlPos < ctrlLen; ctrlPos += 1 { 93 | nums := count - encoded 94 | if nums > 4 { 95 | nums = 4 96 | } 97 | ctrl := encode.PutUint32Scalar(in[encoded:], stream[dataPos:], nums) 98 | size := shared.ControlByteToSize(ctrl) 99 | stream[ctrlPos] = ctrl 100 | size -= 4 - nums 101 | dataPos += size 102 | encoded += nums 103 | } 104 | 105 | return stream[:dataPos] 106 | } 107 | 108 | // WriteAllDeltaFast will differentially encode all the integers from in using 109 | // the Stream VByte format using special hardware instructions and will return 110 | // the byte array holding the encoded data. 
111 | func WriteAllDeltaFast(in []uint32, prev uint32) []byte { 112 | var ( 113 | count = len(in) 114 | ctrlLen = (count + 3) / 4 115 | stream = make([]byte, ctrlLen+(encode.MaxBytesPerNum*count)) 116 | 117 | dataPos = ctrlLen 118 | ctrlPos = 0 119 | encoded = 0 120 | lowest32 = ((ctrlLen - 3) * 4) &^ 31 121 | ) 122 | 123 | for ; encoded < lowest32; encoded += 32 { 124 | ctrls := stream[ctrlPos : ctrlPos+8] 125 | nums := in[encoded : encoded+32] 126 | out := stream[dataPos:] 127 | 128 | ctrl := encode.Put8uint32DeltaFastAsm( 129 | nums[0:8], 130 | out, 131 | prev, 132 | shared.EncodeShuffleTable, 133 | shared.PerControlLenTable, 134 | ) 135 | 136 | ctrls[0] = uint8(ctrl & 0xff) 137 | ctrls[1] = uint8(ctrl >> 8) 138 | sizeA := shared.ControlByteToSizeTwo(ctrl) 139 | 140 | ctrl = encode.Put8uint32DeltaFastAsm( 141 | nums[8:16], 142 | out[sizeA:], 143 | nums[7], 144 | shared.EncodeShuffleTable, 145 | shared.PerControlLenTable, 146 | ) 147 | 148 | ctrls[2] = uint8(ctrl & 0xff) 149 | ctrls[3] = uint8(ctrl >> 8) 150 | sizeB := shared.ControlByteToSizeTwo(ctrl) 151 | 152 | ctrl = encode.Put8uint32DeltaFastAsm( 153 | nums[16:24], 154 | out[sizeA+sizeB:], 155 | nums[15], 156 | shared.EncodeShuffleTable, 157 | shared.PerControlLenTable, 158 | ) 159 | 160 | ctrls[4] = uint8(ctrl & 0xff) 161 | ctrls[5] = uint8(ctrl >> 8) 162 | sizeC := shared.ControlByteToSizeTwo(ctrl) 163 | 164 | ctrl = encode.Put8uint32DeltaFastAsm( 165 | nums[24:], 166 | out[sizeA+sizeB+sizeC:], 167 | nums[23], 168 | shared.EncodeShuffleTable, 169 | shared.PerControlLenTable, 170 | ) 171 | 172 | ctrls[6] = uint8(ctrl & 0xff) 173 | ctrls[7] = uint8(ctrl >> 8) 174 | sizeD := shared.ControlByteToSizeTwo(ctrl) 175 | 176 | ctrlPos += 8 177 | dataPos += sizeA + sizeB + sizeC + sizeD 178 | prev = nums[31] 179 | } 180 | 181 | for ; ctrlPos < ctrlLen-2; ctrlPos += 2 { 182 | ctrl := encode.Put8uint32DeltaFastAsm( 183 | in[encoded:], 184 | stream[dataPos:], 185 | prev, 186 | shared.EncodeShuffleTable, 187 | 
shared.PerControlLenTable, 188 | ) 189 | 190 | stream[ctrlPos] = uint8(ctrl & 0xff) 191 | stream[ctrlPos+1] = uint8(ctrl >> 8) 192 | encoded += 8 193 | dataPos += shared.ControlByteToSizeTwo(ctrl) 194 | prev = in[encoded-1] 195 | } 196 | 197 | for ; ctrlPos < ctrlLen; ctrlPos += 1 { 198 | nums := count - encoded 199 | if nums > 4 { 200 | nums = 4 201 | } 202 | ctrl := encode.PutUint32DeltaScalar(in[encoded:], stream[dataPos:], nums, prev) 203 | size := shared.ControlByteToSize(ctrl) 204 | stream[ctrlPos] = ctrl 205 | size -= 4 - nums 206 | dataPos += size 207 | encoded += nums 208 | prev = in[encoded-1] 209 | } 210 | 211 | return stream[:dataPos] 212 | } 213 | -------------------------------------------------------------------------------- /pkg/stream/writer/writer_base.go: -------------------------------------------------------------------------------- 1 | // +build !amd64 2 | 3 | package writer 4 | 5 | func WriteAllFast(in []uint32) []byte { 6 | panic("unreachable") 7 | } 8 | -------------------------------------------------------------------------------- /pkg/stream/writer/writer_test.go: -------------------------------------------------------------------------------- 1 | package writer 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "math" 7 | "math/rand" 8 | "reflect" 9 | "testing" 10 | "time" 11 | 12 | "github.com/theMPatel/streamvbyte-simdgo/pkg/encode" 13 | "github.com/theMPatel/streamvbyte-simdgo/pkg/shared" 14 | "github.com/theMPatel/streamvbyte-simdgo/pkg/stream/reader" 15 | "github.com/theMPatel/streamvbyte-simdgo/pkg/util" 16 | ) 17 | 18 | func init() { 19 | rand.Seed(time.Now().UnixNano()) 20 | } 21 | 22 | func TestWriteAllScalar(t *testing.T) { 23 | for i := 0; i < 6; i++ { 24 | count := int(util.RandUint32() % 1e6) 25 | nums := util.GenUint32(count) 26 | stream := WriteAllScalar(nums) 27 | t.Run(fmt.Sprintf("WriteAll: %d", count), func(t *testing.T) { 28 | out := make([]uint32, count) 29 | reader.ReadAllScalar(count, stream, out) 30 | if 
!reflect.DeepEqual(nums, out) { 31 | t.Fatalf("decoded wrong nums") 32 | } 33 | }) 34 | } 35 | } 36 | 37 | func TestWriteAllDeltaScalar(t *testing.T) { 38 | for i := 0; i < 6; i++ { 39 | count := int(util.RandUint32() % 1e6) 40 | nums := util.GenUint32(count) 41 | util.SortUint32(nums) 42 | diffed := make([]uint32, count) 43 | util.Delta(nums, diffed) 44 | 45 | stream := WriteAllScalar(diffed) 46 | t.Run(fmt.Sprintf("WriteAll: %d", count), func(t *testing.T) { 47 | actual := WriteAllDeltaScalar(nums, 0) 48 | if !reflect.DeepEqual(stream, actual) { 49 | t.Fatalf("bad encoding") 50 | } 51 | }) 52 | } 53 | } 54 | 55 | func TestWriteAllFast(t *testing.T) { 56 | if encode.GetMode() == shared.Normal { 57 | t.Skipf("Testing environment doesn't support this test") 58 | } 59 | 60 | for i := 0; i < 6; i++ { 61 | count := int(util.RandUint32() % 1e6) 62 | nums := util.GenUint32(count) 63 | stream := WriteAllScalar(nums) 64 | t.Run(fmt.Sprintf("WriteAll: %d", count), func(t *testing.T) { 65 | actual := WriteAllFast(nums) 66 | if !reflect.DeepEqual(stream, actual) { 67 | t.Fatalf("bad encoding") 68 | } 69 | }) 70 | } 71 | } 72 | 73 | func TestWriteAllDeltaFast(t *testing.T) { 74 | if encode.GetMode() == shared.Normal { 75 | t.Skipf("Testing environment doesn't support this test") 76 | } 77 | 78 | for i := 0; i < 6; i++ { 79 | count := int(util.RandUint32() % 1e6) 80 | nums := util.GenUint32(count) 81 | util.SortUint32(nums) 82 | diffed := make([]uint32, count) 83 | util.Delta(nums, diffed) 84 | 85 | stream := WriteAllScalar(diffed) 86 | t.Run(fmt.Sprintf("WriteAll: %d", count), func(t *testing.T) { 87 | actual := WriteAllDeltaFast(nums, 0) 88 | if !reflect.DeepEqual(stream, actual) { 89 | t.Fatalf("bad encoding") 90 | } 91 | }) 92 | } 93 | } 94 | 95 | var readSinkA []byte 96 | 97 | func BenchmarkWriteAllFast(b *testing.B) { 98 | if encode.GetMode() == shared.Normal { 99 | b.Skipf("Testing environment doesn't support this test") 100 | } 101 | 102 | for i := 0; i < 8; i++ { 103 | 
count := int(math.Pow10(i)) 104 | nums := util.GenUint32(count) 105 | b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) { 106 | var stream []byte 107 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 108 | b.ResetTimer() 109 | for i := 0; i < b.N; i++ { 110 | stream = WriteAllFast(nums) 111 | } 112 | readSinkA = stream 113 | }) 114 | } 115 | } 116 | 117 | var readSinkB []byte 118 | 119 | func BenchmarkWriteAllDeltaFast(b *testing.B) { 120 | if encode.GetMode() == shared.Normal { 121 | b.Skipf("Testing environment doesn't support this test") 122 | } 123 | 124 | for i := 0; i < 8; i++ { 125 | count := int(math.Pow10(i)) 126 | nums := util.GenUint32(count) 127 | util.SortUint32(nums) 128 | b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) { 129 | var stream []byte 130 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 131 | b.ResetTimer() 132 | for i := 0; i < b.N; i++ { 133 | stream = WriteAllDeltaFast(nums, 0) 134 | } 135 | readSinkB = stream 136 | }) 137 | } 138 | } 139 | 140 | var readSinkC []byte 141 | 142 | func BenchmarkWriteAllScalar(b *testing.B) { 143 | for i := 0; i < 8; i++ { 144 | count := int(math.Pow10(i)) 145 | nums := util.GenUint32(count) 146 | util.SortUint32(nums) 147 | b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) { 148 | var stream []byte 149 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 150 | b.ResetTimer() 151 | for i := 0; i < b.N; i++ { 152 | stream = WriteAllScalar(nums) 153 | } 154 | readSinkC = stream 155 | }) 156 | } 157 | } 158 | 159 | var readSinkD []byte 160 | 161 | func BenchmarkWriteAllDeltaScalar(b *testing.B) { 162 | for i := 0; i < 8; i++ { 163 | count := int(math.Pow10(i)) 164 | nums := util.GenUint32(count) 165 | util.SortUint32(nums) 166 | b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) { 167 | var stream []byte 168 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 169 | b.ResetTimer() 170 | for i := 0; i < b.N; i++ { 171 | stream = WriteAllDeltaScalar(nums, 0) 172 | } 173 | readSinkD = stream 
174 | }) 175 | } 176 | } 177 | 178 | var readSinkE int 179 | 180 | func BenchmarkWriteAllVarint(b *testing.B) { 181 | for i := 0; i < 8; i++ { 182 | count := int(math.Pow10(i)) 183 | nums := util.GenUint32(count) 184 | util.SortUint32(nums) 185 | out := make([]byte, count*binary.MaxVarintLen32) 186 | written := 0 187 | b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) { 188 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 189 | b.ResetTimer() 190 | for i := 0; i < b.N; i++ { 191 | written = util.PutVarint(nums, out) 192 | } 193 | readSinkE = written 194 | }) 195 | } 196 | } 197 | 198 | var readSinkF int 199 | 200 | func BenchmarkWriteAllDeltaVarint(b *testing.B) { 201 | for i := 0; i < 8; i++ { 202 | count := int(math.Pow10(i)) 203 | nums := util.GenUint32(count) 204 | util.SortUint32(nums) 205 | out := make([]byte, count*binary.MaxVarintLen32) 206 | written := 0 207 | b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) { 208 | b.SetBytes(int64(count * encode.MaxBytesPerNum)) 209 | b.ResetTimer() 210 | for i := 0; i < b.N; i++ { 211 | written = util.PutDeltaVarint(nums, out, 0) 212 | } 213 | readSinkE = written 214 | }) 215 | } 216 | } 217 | -------------------------------------------------------------------------------- /pkg/util/rand.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | // This file provides a more uniform random number generator that creates 4 | // numbers that have a more normal distribution along the number of bytes 5 | // required to encode them. This is needed because the larger encoded bytes 6 | // i.e. 3 and 4 bytes have more numbers to pick from versus those that require 7 | // just 1 or 2. Thus, using a normally generated number is more likely to produce 8 | // a number that requires 3 or 4 bytes to encode. 
9 | 10 | import ( 11 | "math" 12 | "math/rand" 13 | ) 14 | 15 | type generator func() uint32 16 | 17 | func randUint32Range(low, high uint32) uint32 { 18 | return (rand.Uint32() % (high - low + 1)) + low 19 | } 20 | 21 | var ( 22 | generators = []generator{ 23 | // 1 byte, 24 | func() uint32 { 25 | return randUint32Range(0, 1<<8) 26 | }, 27 | // 2 byte, 28 | func() uint32 { 29 | return randUint32Range(1<<8, 1<<16) 30 | }, 31 | // 3 byte, 32 | func() uint32 { 33 | return randUint32Range(1<<16, 1<<24) 34 | }, 35 | // 4 byte, 36 | func() uint32 { 37 | return randUint32Range(1<<24, math.MaxUint32) 38 | }, 39 | } 40 | ) 41 | 42 | // RandUint32 generates a random number that is also uniformly random 43 | // on the axis for the number of bytes required to encode it. It first 44 | // randomly chooses a byte length, i.e. 1, 2, 3 or 4 and then randomly 45 | // generates a number whose encoded length would be that length. 46 | func RandUint32() uint32 { 47 | return generators[rand.Int()%4]() 48 | } 49 | -------------------------------------------------------------------------------- /pkg/util/util.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "io" 5 | "sort" 6 | ) 7 | 8 | func SilentClose(closer io.Closer) { 9 | _ = closer.Close() 10 | } 11 | 12 | func GenUint32(n int) []uint32 { 13 | nums := make([]uint32, n) 14 | for i := 0; i < n; i++ { 15 | nums[i] = RandUint32() 16 | } 17 | 18 | return nums 19 | } 20 | 21 | func SortUint32(in []uint32) { 22 | sort.Slice(in, func(i, j int) bool { 23 | return in[i] < in[j] 24 | }) 25 | } 26 | 27 | func Delta(in []uint32, out []uint32) { 28 | for i := range in { 29 | if i > 0 { 30 | out[i] = in[i] - in[i-1] 31 | } else { 32 | out[i] = in[i] 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /pkg/util/varint.go: -------------------------------------------------------------------------------- 1 | package util 2 
| 3 | import "encoding/binary" 4 | 5 | func PutVarint(nums []uint32, out []byte) int { 6 | pos := 0 7 | for i := range nums { 8 | size := binary.PutUvarint(out[pos:], uint64(nums[i])) 9 | pos += size 10 | } 11 | 12 | return pos 13 | } 14 | 15 | func GetVarint(data []byte, out []uint32) int { 16 | pos := 0 17 | i := 0 18 | for pos < len(data) { 19 | num, read := binary.Uvarint(data[pos:]) 20 | pos += read 21 | out[i] = uint32(num) 22 | i++ 23 | } 24 | return pos 25 | } 26 | 27 | func PutDeltaVarint(nums []uint32, out []byte, prev uint32) int { 28 | pos := 0 29 | for i := range nums { 30 | size := binary.PutUvarint(out[pos:], uint64(nums[i]-prev)) 31 | pos += size 32 | prev = nums[i] 33 | } 34 | 35 | return pos 36 | } 37 | 38 | func GetDeltaVarint(in []byte, out []uint32, prev uint32) int { 39 | pos := 0 40 | i := 0 41 | for pos < len(in) { 42 | num, size := binary.Uvarint(in[pos:]) 43 | pos += size 44 | res := uint32(num) + prev 45 | out[i] = res 46 | prev = res 47 | i++ 48 | } 49 | 50 | return pos 51 | } 52 | -------------------------------------------------------------------------------- /pkg/util/varint_test.go: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import ( 4 | "encoding/binary" 5 | "reflect" 6 | "testing" 7 | ) 8 | 9 | func TestVarintRoundTrip(t *testing.T) { 10 | count := 8 11 | nums := GenUint32(count) 12 | out := make([]byte, count*binary.MaxVarintLen32) 13 | written := PutVarint(nums, out) 14 | out = out[:written] 15 | 16 | actual := make([]uint32, count) 17 | read := GetVarint(out, actual) 18 | 19 | if written != read { 20 | t.Fatalf("expected to read %d, got %d", written, read) 21 | } 22 | 23 | if !reflect.DeepEqual(nums, actual) { 24 | t.Fatalf("expected %+v, got %+v", nums, actual) 25 | } 26 | } 27 | 28 | func TestVarintDeltaRoundTrip(t *testing.T) { 29 | count := 8 30 | nums := GenUint32(count) 31 | SortUint32(nums) 32 | out := make([]byte, count*binary.MaxVarintLen32) 33 | written := 
PutDeltaVarint(nums, out, 0) 34 | out = out[:written] 35 | 36 | actual := make([]uint32, count) 37 | read := GetDeltaVarint(out, actual, 0) 38 | 39 | if written != read { 40 | t.Fatalf("expected to read %d, got %d", written, read) 41 | } 42 | 43 | if !reflect.DeepEqual(nums, actual) { 44 | t.Fatalf("expected %+v, got %+v", nums, actual) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tools/generate_and_check.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | BEFORE_DIFF=$(git diff | sha1sum ) 6 | BEFORE_STATUS=$(git status --porcelain | sha1sum) 7 | 8 | make generate 9 | 10 | AFTER_DIFF=$(git diff | sha1sum ) 11 | AFTER_STATUS=$(git status --porcelain | sha1sum) 12 | 13 | if [[ $BEFORE_DIFF != $AFTER_DIFF || $BEFORE_STATUS != $AFTER_STATUS ]]; then 14 | echo "Unstable generate. Make sure to generate and check in changed files." 15 | exit 1 16 | fi -------------------------------------------------------------------------------- /tools/parse_and_write_bench.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "flag" 7 | "io" 8 | "log" 9 | "os" 10 | "path/filepath" 11 | "strings" 12 | 13 | "github.com/theMPatel/streamvbyte-simdgo/pkg/util" 14 | ) 15 | 16 | const ( 17 | goos = "goos" 18 | goarch = "goarch" 19 | pkg = "pkg" 20 | cpu = "cpu" 21 | benchmark = "Benchmark" 22 | dashes = "--" 23 | startSentinel = "## Benchmarks\n\n```text\n" 24 | endSentinel = "```\n" 25 | ) 26 | 27 | var ( 28 | validPrefixes = []string{ 29 | goos, 30 | goarch, 31 | pkg, 32 | cpu, 33 | benchmark, 34 | } 35 | 36 | readmeFile = filepath.Join(os.Getenv("SBYTE_HOME"), "README.md") 37 | fWriteOut = flag.Bool("w", false, "write out to readme") 38 | ) 39 | 40 | func anyPrefix(in string) bool { 41 | for _, p := range validPrefixes { 42 | if strings.HasPrefix(in, p) { 43 | 
return true 44 | } 45 | } 46 | 47 | return false 48 | } 49 | 50 | func main() { 51 | flag.Parse() 52 | var ( 53 | lines []string 54 | hasBench = false 55 | ) 56 | 57 | scanner := bufio.NewScanner(os.Stdin) 58 | for scanner.Scan() { 59 | line := scanner.Text() 60 | if anyPrefix(line) { 61 | emitDash := strings.HasPrefix(line, cpu) 62 | hasBench = hasBench || strings.HasPrefix(line, benchmark) 63 | emitNewline := hasBench && strings.HasPrefix(line, goos) 64 | 65 | if emitNewline { 66 | lines = append(lines, "") 67 | } 68 | if hasBench { 69 | line = strings.TrimPrefix(line, benchmark) 70 | } 71 | lines = append(lines, line) 72 | if emitDash { 73 | lines = append(lines, dashes) 74 | } 75 | } 76 | } 77 | 78 | if err := scanner.Err(); err != nil { 79 | log.Fatalf("failed to read input: %s", err) 80 | } 81 | 82 | outputFile, err := os.Open(readmeFile) 83 | if err != nil { 84 | log.Fatalf("failed to open file: %s, %s", readmeFile, err) 85 | } 86 | 87 | allData, err := io.ReadAll(outputFile) 88 | if err != nil { 89 | log.Fatalf("failed to read file: %s, %s", readmeFile, err) 90 | } 91 | 92 | util.SilentClose(outputFile) 93 | 94 | bStart := []byte(startSentinel) 95 | bEnd := []byte(endSentinel) 96 | 97 | start := bytes.Index(allData, bStart) 98 | if start < 0 { 99 | log.Fatalf("couldn't find start sentinel") 100 | } 101 | 102 | restStart := bytes.Index(allData, bEnd) 103 | 104 | var final []byte 105 | final = append(final, allData[:start]...) 106 | final = append(final, bStart...) 107 | final = append(final, []byte(strings.Join(lines, "\n"))...) 108 | final = append(final, '\n') 109 | final = append(final, allData[restStart:]...) 
110 | 111 | var out io.Writer 112 | if *fWriteOut { 113 | outputFile, err = os.Create(readmeFile) 114 | if err != nil { 115 | log.Fatalf("failed to open file: %s, %s", readmeFile, err) 116 | } 117 | defer util.SilentClose(outputFile) 118 | out = outputFile 119 | } else { 120 | out = os.Stdout 121 | } 122 | 123 | _, err = out.Write(final) 124 | if err != nil { 125 | log.Fatalf("failed to write to file: %s, %s", readmeFile, err) 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /tools/update_bench.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | TMP_FILE=$(mktemp) 6 | go test -bench . ./pkg/... | tee $TMP_FILE 7 | cat $TMP_FILE | go run $SBYTE_HOME/tools/parse_and_write_bench.go -w 8 | rm $TMP_FILE --------------------------------------------------------------------------------