├── .envrc
├── .github
    └── workflows
    │   └── default.yaml
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── go.mod
├── go.sum
├── pkg
    ├── decode
    │   ├── decode.go
    │   ├── decode_amd64.go
    │   ├── decode_amd64.s
    │   ├── decode_base.go
    │   ├── decode_test.go
    │   ├── gen.go
    │   └── main
    │   │   └── asm.go
    ├── encode
    │   ├── encode.go
    │   ├── encode_amd64.go
    │   ├── encode_amd64.s
    │   ├── encode_base.go
    │   ├── encode_test.go
    │   ├── gen.go
    │   └── main
    │   │   └── asm.go
    ├── pkg_test.go
    ├── shared
    │   ├── asm.go
    │   ├── gen.go
    │   ├── main
    │   │   └── gentables.go
    │   ├── mode.go
    │   └── tables.go
    ├── stream
    │   ├── reader
    │   │   ├── reader.go
    │   │   ├── reader_amd64.go
    │   │   ├── reader_base.go
    │   │   └── reader_test.go
    │   └── writer
    │   │   ├── writer.go
    │   │   ├── writer_amd64.go
    │   │   ├── writer_base.go
    │   │   └── writer_test.go
    └── util
    │   ├── rand.go
    │   ├── util.go
    │   ├── varint.go
    │   └── varint_test.go
└── tools
    ├── generate_and_check.sh
    ├── parse_and_write_bench.go
    └── update_bench.sh


/.envrc:
--------------------------------------------------------------------------------
1 | THIS_DIR="$( realpath "$( dirname "${BASH_SOURCE[0]}" )" )"
2 | export GOPATH=$THIS_DIR/go
3 | export GOBIN=$GOPATH/bin
4 | export SBYTE_HOME=$THIS_DIR
5 | 
6 | PATH_add $GOPATH/bin
7 | 


--------------------------------------------------------------------------------
/.github/workflows/default.yaml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - main
 6 |   pull_request:
 7 |     branches:
 8 |       - main
 9 | jobs:
10 |   test:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: actions/checkout@v2
14 |       - name: Test
15 |         run: make test
16 |   generate:
17 |     runs-on: ubuntu-latest
18 |     steps:
19 |       - uses: actions/checkout@v2
20 |       - name: Generate check
21 |         run: ./tools/generate_and_check.sh
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Binaries for programs and plugins
 2 | *.exe
 3 | *.exe~
 4 | *.dll
 5 | *.so
 6 | *.dylib
 7 | 
 8 | # Test binary, built with `go test -c`
 9 | *.test
10 | 
11 | # Output of the go coverage tool, specifically when used with LiteIDE
12 | *.out
13 | 
14 | # Dependency directories (remove the comment below to include it)
15 | # vendor/
16 | 
17 | .idea
18 | go


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2021 Milan Patel
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | generate:
 3 | 	go generate ./pkg/...
 4 | 
 5 | test:
 6 | 	go test -v ./pkg/...
 7 | 
 8 | update-bench:
 9 | 	./tools/update_bench.sh
10 | 
11 | fmtgo:
12 | 	find ./pkg -type f -iname "*.go" | xargs gofmt -w


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Stream VByte SIMD Go
  2 | 
  3 | ![Tests](https://github.com/theMPatel/streamvbyte-simdgo/actions/workflows/default.yaml/badge.svg)
  4 | 
  5 | This is a repository that contains a port of Stream VByte to Go. Notably, this repo takes extra care
  6 | to leverage SIMD techniques to achieve better performance. Currently, there is support for x86_64 architectures
  7 | that have AVX and AVX2 hardware instructions. In cases where those are not available, or on non-x86_64 architectures,
  8 | there is a portable scalar implementation. We also perform a runtime check to make sure that the necessary
  9 | ISA is available and, if it is not, fall back to the scalar approach.
 10 | 
 11 | There are several existing implementations:
 12 | 
 13 | 1. [Reference C/C++](https://github.com/lemire/streamvbyte)
 14 | 2. [Rust](https://bitbucket.org/marshallpierce/stream-vbyte-rust)
 15 | 3. [Go](https://github.com/nelz9999/stream-vbyte-go)
 16 |    * Note: only has a scalar implementation which prompted this implementation with SIMD techniques.
 17 | 
 18 | ## Benchmarks
 19 | 
 20 | ```text
 21 | goos: darwin
 22 | goarch: amd64
 23 | pkg: github.com/theMPatel/streamvbyte-simdgo/pkg
 24 | cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
 25 | --
 26 | MemCopy8Uint32-12    	463986302	         2.608 ns/op	12269.03 MB/s
 27 | 
 28 | goos: darwin
 29 | goarch: amd64
 30 | pkg: github.com/theMPatel/streamvbyte-simdgo/pkg/decode
 31 | cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
 32 | --
 33 | Get8uint32Fast-12           	377839186	         3.170 ns/op	10095.99 MB/s
 34 | Get8uint32DeltaFast-12      	298522095	         4.455 ns/op	7183.20 MB/s
 35 | Get8uint32Scalar-12         	63384603	        19.28 ns/op	1659.36 MB/s
 36 | Get8uint32DeltaScalar-12    	58705828	        20.04 ns/op	1596.46 MB/s
 37 | Get8uint32Varint-12         	27369775	        43.77 ns/op	 731.10 MB/s
 38 | Get8uint32DeltaVarint-12    	20924770	        57.30 ns/op	 558.46 MB/s
 39 | 
 40 | goos: darwin
 41 | goarch: amd64
 42 | pkg: github.com/theMPatel/streamvbyte-simdgo/pkg/encode
 43 | cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
 44 | --
 45 | Put8uint32Fast-12           	297620898	         3.864 ns/op	8281.18 MB/s
 46 | Put8uint32DeltaFast-12      	276545827	         4.350 ns/op	7356.59 MB/s
 47 | Put8uint32Scalar-12         	41200776	        28.59 ns/op	1119.30 MB/s
 48 | Put8uint32DeltaScalar-12    	37773458	        30.65 ns/op	1044.11 MB/s
 49 | Put8uint32Varint-12         	58668867	        17.20 ns/op	1860.67 MB/s
 50 | Put8uint32DeltaVarint-12    	61446153	        22.88 ns/op	1398.80 MB/s
 51 | 
 52 | goos: darwin
 53 | goarch: amd64
 54 | pkg: github.com/theMPatel/streamvbyte-simdgo/pkg/stream/reader
 55 | cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
 56 | --
 57 | ReadAllFast/Count_1e0-12 	99354789	        12.24 ns/op	 326.80 MB/s
 58 | ReadAllFast/Count_1e1-12 	28076071	        42.81 ns/op	 934.43 MB/s
 59 | ReadAllFast/Count_1e2-12 	11041639	       107.2 ns/op	3730.16 MB/s
 60 | ReadAllFast/Count_1e3-12 	 1645387	       729.9 ns/op	5480.00 MB/s
 61 | ReadAllFast/Count_1e4-12 	  170894	      7034 ns/op	5686.52 MB/s
 62 | ReadAllFast/Count_1e5-12 	   16848	     70969 ns/op	5636.29 MB/s
 63 | ReadAllFast/Count_1e6-12 	    1513	    728516 ns/op	5490.62 MB/s
 64 | ReadAllFast/Count_1e7-12 	     152	   7835111 ns/op	5105.22 MB/s
 65 | ReadAllDeltaFast/Count_1e0-12         	92727970	        13.10 ns/op	 305.44 MB/s
 66 | ReadAllDeltaFast/Count_1e1-12         	26164140	        45.89 ns/op	 871.61 MB/s
 67 | ReadAllDeltaFast/Count_1e2-12         	 9458992	       128.5 ns/op	3113.55 MB/s
 68 | ReadAllDeltaFast/Count_1e3-12         	 1277408	       934.4 ns/op	4280.69 MB/s
 69 | ReadAllDeltaFast/Count_1e4-12         	  144405	      8318 ns/op	4808.88 MB/s
 70 | ReadAllDeltaFast/Count_1e5-12         	   14444	     83151 ns/op	4810.55 MB/s
 71 | ReadAllDeltaFast/Count_1e6-12         	    1426	    846305 ns/op	4726.43 MB/s
 72 | ReadAllDeltaFast/Count_1e7-12         	     127	   9337355 ns/op	4283.87 MB/s
 73 | ReadAllScalar/Count_1e0-12            	122650209	         9.770 ns/op	 409.43 MB/s
 74 | ReadAllScalar/Count_1e1-12            	38012136	        31.63 ns/op	1264.64 MB/s
 75 | ReadAllScalar/Count_1e2-12            	 4999376	       241.6 ns/op	1655.30 MB/s
 76 | ReadAllScalar/Count_1e3-12            	  500337	      2459 ns/op	1626.38 MB/s
 77 | ReadAllScalar/Count_1e4-12            	   50247	     24034 ns/op	1664.34 MB/s
 78 | ReadAllScalar/Count_1e5-12            	    5032	    238354 ns/op	1678.17 MB/s
 79 | ReadAllScalar/Count_1e6-12            	     499	   2405669 ns/op	1662.74 MB/s
 80 | ReadAllScalar/Count_1e7-12            	      46	  24533207 ns/op	1630.44 MB/s
 81 | ReadAllDeltaScalar/Count_1e0-12       	100000000	        10.32 ns/op	 387.49 MB/s
 82 | ReadAllDeltaScalar/Count_1e1-12       	36915704	        32.52 ns/op	1230.08 MB/s
 83 | ReadAllDeltaScalar/Count_1e2-12       	 4818140	       249.8 ns/op	1601.58 MB/s
 84 | ReadAllDeltaScalar/Count_1e3-12       	  512492	      2374 ns/op	1685.20 MB/s
 85 | ReadAllDeltaScalar/Count_1e4-12       	   51004	     23639 ns/op	1692.11 MB/s
 86 | ReadAllDeltaScalar/Count_1e5-12       	    3568	    333168 ns/op	1200.60 MB/s
 87 | ReadAllDeltaScalar/Count_1e6-12       	     520	   2304864 ns/op	1735.46 MB/s
 88 | ReadAllDeltaScalar/Count_1e7-12       	      48	  24810555 ns/op	1612.22 MB/s
 89 | ReadAllVarint/Count_1e0-12            	121348074	         9.967 ns/op	 401.34 MB/s
 90 | ReadAllVarint/Count_1e1-12            	21056739	        57.34 ns/op	 697.64 MB/s
 91 | ReadAllVarint/Count_1e2-12            	 2025081	       589.0 ns/op	 679.15 MB/s
 92 | ReadAllVarint/Count_1e3-12            	  205881	      5851 ns/op	 683.69 MB/s
 93 | ReadAllVarint/Count_1e4-12            	   20906	     57446 ns/op	 696.31 MB/s
 94 | ReadAllVarint/Count_1e5-12            	    2037	    580620 ns/op	 688.92 MB/s
 95 | ReadAllVarint/Count_1e6-12            	     208	   5755083 ns/op	 695.04 MB/s
 96 | ReadAllVarint/Count_1e7-12            	      20	  57872736 ns/op	 691.17 MB/s
 97 | ReadAllDeltaVarint/Count_1e0-12       	139763250	         8.318 ns/op	 480.87 MB/s
 98 | ReadAllDeltaVarint/Count_1e1-12       	19199100	        62.49 ns/op	 640.11 MB/s
 99 | ReadAllDeltaVarint/Count_1e2-12       	 2149660	       556.6 ns/op	 718.65 MB/s
100 | ReadAllDeltaVarint/Count_1e3-12       	  207122	      5810 ns/op	 688.41 MB/s
101 | ReadAllDeltaVarint/Count_1e4-12       	   22680	     53200 ns/op	 751.88 MB/s
102 | ReadAllDeltaVarint/Count_1e5-12       	    2145	    500177 ns/op	 799.72 MB/s
103 | ReadAllDeltaVarint/Count_1e6-12       	     228	   5262741 ns/op	 760.06 MB/s
104 | ReadAllDeltaVarint/Count_1e7-12       	      27	  42000722 ns/op	 952.36 MB/s
105 | 
106 | goos: darwin
107 | goarch: amd64
108 | pkg: github.com/theMPatel/streamvbyte-simdgo/pkg/stream/writer
109 | cpu: Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
110 | --
111 | WriteAllFast/Count_1e0-12 	54152408	        22.05 ns/op	 181.36 MB/s
112 | WriteAllFast/Count_1e1-12 	27681948	        43.17 ns/op	 926.49 MB/s
113 | WriteAllFast/Count_1e2-12 	 7136480	       167.0 ns/op	2395.79 MB/s
114 | WriteAllFast/Count_1e3-12 	  928952	      1273 ns/op	3141.14 MB/s
115 | WriteAllFast/Count_1e4-12 	   96117	     12012 ns/op	3329.93 MB/s
116 | WriteAllFast/Count_1e5-12 	    9718	    114260 ns/op	3500.80 MB/s
117 | WriteAllFast/Count_1e6-12 	     879	   1242927 ns/op	3218.21 MB/s
118 | WriteAllFast/Count_1e7-12 	     100	  10368754 ns/op	3857.74 MB/s
119 | WriteAllDeltaFast/Count_1e0-12         	50489378	        23.38 ns/op	 171.06 MB/s
120 | WriteAllDeltaFast/Count_1e1-12         	26866423	        45.03 ns/op	 888.33 MB/s
121 | WriteAllDeltaFast/Count_1e2-12         	 6695125	       175.8 ns/op	2275.37 MB/s
122 | WriteAllDeltaFast/Count_1e3-12         	  899895	      1391 ns/op	2875.71 MB/s
123 | WriteAllDeltaFast/Count_1e4-12         	   90394	     12958 ns/op	3086.82 MB/s
124 | WriteAllDeltaFast/Count_1e5-12         	   10000	    122319 ns/op	3270.13 MB/s
125 | WriteAllDeltaFast/Count_1e6-12         	     945	   1249546 ns/op	3201.16 MB/s
126 | WriteAllDeltaFast/Count_1e7-12         	     100	  11461852 ns/op	3489.84 MB/s
127 | WriteAllScalar/Count_1e0-12            	56106489	        21.72 ns/op	 184.18 MB/s
128 | WriteAllScalar/Count_1e1-12            	18309972	        65.09 ns/op	 614.51 MB/s
129 | WriteAllScalar/Count_1e2-12            	 2776918	       433.5 ns/op	 922.63 MB/s
130 | WriteAllScalar/Count_1e3-12            	  289309	      4209 ns/op	 950.38 MB/s
131 | WriteAllScalar/Count_1e4-12            	   29497	     40884 ns/op	 978.38 MB/s
132 | WriteAllScalar/Count_1e5-12            	    3027	    399959 ns/op	1000.10 MB/s
133 | WriteAllScalar/Count_1e6-12            	     296	   4010161 ns/op	 997.47 MB/s
134 | WriteAllScalar/Count_1e7-12            	      28	  38753790 ns/op	1032.16 MB/s
135 | WriteAllDeltaScalar/Count_1e0-12       	54981757	        21.90 ns/op	 182.65 MB/s
136 | WriteAllDeltaScalar/Count_1e1-12       	17823349	        67.10 ns/op	 596.14 MB/s
137 | WriteAllDeltaScalar/Count_1e2-12       	 2711672	       442.4 ns/op	 904.09 MB/s
138 | WriteAllDeltaScalar/Count_1e3-12       	  292664	      4130 ns/op	 968.62 MB/s
139 | WriteAllDeltaScalar/Count_1e4-12       	   29340	     41014 ns/op	 975.28 MB/s
140 | WriteAllDeltaScalar/Count_1e5-12       	    2289	    516113 ns/op	 775.02 MB/s
141 | WriteAllDeltaScalar/Count_1e6-12       	     302	   3930860 ns/op	1017.59 MB/s
142 | WriteAllDeltaScalar/Count_1e7-12       	      30	  41357670 ns/op	 967.17 MB/s
143 | WriteAllVarint/Count_1e0-12            	208214545	         5.720 ns/op	 699.32 MB/s
144 | WriteAllVarint/Count_1e1-12            	43083270	        28.02 ns/op	1427.34 MB/s
145 | WriteAllVarint/Count_1e2-12            	 4972045	       242.8 ns/op	1647.67 MB/s
146 | WriteAllVarint/Count_1e3-12            	  499011	      2409 ns/op	1660.60 MB/s
147 | WriteAllVarint/Count_1e4-12            	   51022	     23590 ns/op	1695.67 MB/s
148 | WriteAllVarint/Count_1e5-12            	    5216	    231741 ns/op	1726.07 MB/s
149 | WriteAllVarint/Count_1e6-12            	     518	   2305364 ns/op	1735.08 MB/s
150 | WriteAllVarint/Count_1e7-12            	      50	  24905825 ns/op	1606.05 MB/s
151 | WriteAllDeltaVarint/Count_1e0-12       	175269966	         6.792 ns/op	 588.93 MB/s
152 | WriteAllDeltaVarint/Count_1e1-12       	51799438	        23.38 ns/op	1710.63 MB/s
153 | WriteAllDeltaVarint/Count_1e2-12       	 5417458	       221.3 ns/op	1807.60 MB/s
154 | WriteAllDeltaVarint/Count_1e3-12       	  539414	      2243 ns/op	1783.48 MB/s
155 | WriteAllDeltaVarint/Count_1e4-12       	   52717	     22753 ns/op	1757.99 MB/s
156 | WriteAllDeltaVarint/Count_1e5-12       	    5716	    210456 ns/op	1900.63 MB/s
157 | WriteAllDeltaVarint/Count_1e6-12       	     495	   2453672 ns/op	1630.21 MB/s
158 | WriteAllDeltaVarint/Count_1e7-12       	      70	  17491186 ns/op	2286.87 MB/s
159 | ```
160 | 
161 | A note on the benchmarks: an array of random uint32 values is generated and then encoded/decoded over
162 | and over again. An attempt is made to ensure that some of these benchmarks reflect the most probable
163 | real-world performance metrics.
164 | 
165 | ---
166 | Stream VByte uses the same underlying format as Google's Group Varint approach. Lemire et al. wanted
167 | to see if there was a way to improve the performance even more and introduced a clever twist to enable
168 | better performance via SIMD techniques. The basic goal of the Group Varint format is to be able to
169 | achieve similar compression characteristics as the VByte format for integers and also be able to load
170 | and process them really quickly.
171 | 
172 | ## VByte format
173 | 
174 | The insight that backs the VByte encoding is noticing that you oftentimes don't need 32 bits to
175 | encode a 32-bit integer. Take for example an unsigned integer that is less than 2^8 (256). This
176 | integer will have bits set in the lowest byte of a 32-bit integer, while the remaining 3 bytes will
177 | simply be zeros.
178 | 
179 | ```
180 | 111 in binary:
181 | 
182 | 00000000 00000000 00000000 01101111
183 | ```
184 | 
185 | An approach you can take to compress this integer is to encode the integer using a variable
186 | number of bytes. For example, you can use the lower 7 bits to store data, i.e. bits
187 | from the original integer, and then use the MSB as a continuation bit. If the MSB is on, i.e.
188 | is 1, then more bytes are needed to decode this particular integer. Below is an example where
189 | you might need 2 bytes to store the number 1234.
190 | 
191 | ```
192 | 1234 in binary:
193 | 
194 | 00000000 00000000 00000100 11010010
195 | 
196 | Num compressed:
197 | 
198 | v          v          Continuation bits
199 | 0|0001001| 1|1010010|
200 |     ^           ^     Data bits
201 | ```
202 | 
203 | If you want to decode this integer, you simply build up the number iteratively. I.e. you OR the
204 | lower 7 bits of every byte, shifted to the appropriate position, into your 32-bit integer until you
205 | find a byte that doesn't have a continuation bit set. Note that this works the same for 64-bit
206 | numbers.
207 | 
208 | The problem with this approach is that it can introduce a lot of branch mispredictions during encoding/decoding.
209 | During the decoding phase, you don't know ahead of time the number of bytes that were used to encode the integer
210 | you are currently processing, and so you need to iterate until you find a byte without a continuation bit on.
211 | If you have integers that are nonuniform, i.e. integers that require random numbers of bytes to encode relative
212 | to one another, this can pose a challenge to the processor's branch predictor. These mispredictions can cause
213 | major slowdowns in processor pipelines, and so was born the Group Varint format.
214 | 
215 | ## Group Varint format
216 | 
217 | The Group Varint (varint-GB) format assumes that everything you hope to achieve, you can do with 32-bit integers.
218 | It introduces the concept of a control byte which is simply a byte that stores the encoded
219 | lengths of a group of four 32-bit integers, hence Group Varint. 32-bit integers only require up to 4 bytes
220 | to properly encode. This means that you can represent their lengths with 2 bits using a zero-indexed length
221 | i.e. 0, 1, 2, and 3 to represent integers that require 1, 2, 3 and 4 bytes to encode, respectively.
222 | 
223 | ```
224 | 00000000 00000000 00000000 01101111  =        111 
225 | 00000000 00000000 00000100 11010010  =       1234
226 | 00000000 00001100 00001010 10000011  =     789123
227 | 01000000 00000000 00000000 00000000  = 1073741824
228 | 
229 | Num         Len      2-bit control
230 | ----------------------------------
231 | 111          1                0b00 
232 | 1234         2                0b01
233 | 789123       3                0b10
234 | 1073741824   4                0b11
235 | 
236 | Final Control byte
237 | 0b11100100
238 | 
239 | Encoded data (little endian right-to-left bottom-to-top) 
240 | 0b01000000 0b00000000 0b00000000 0b00000000 0b00001100
241 | 0b00001010 0b10000011 0b00000100 0b11010010 0b01101111
242 | ```
243 | 
244 | You can then prefix every group of 4 encoded 32-bit integers with their control byte and then use it during decoding.
245 | The obvious downside is that you pay a storage cost of one byte for every 4 integers you want to encode. For 2^20
246 | encoded integers, that's an extra 256 KB of space: totally marginal. The great upside, though, is that
247 | you've now removed almost all branches from your decoding phase. You know exactly how many data bytes you need
248 | to read from a buffer for a particular number and then can use branchless decoding.
249 | 
250 | ```go
251 | package foo
252 | 
253 | import (
254 | 	"encoding/binary"
255 | )
256 | 
257 | func decodeOne(input []byte, size uint8) uint32 {
258 | 	buf := make([]byte, 4)
259 | 	copy(buf, input[:size])
260 | 
261 | 	// func (littleEndian) Uint32(b []byte) uint32 {
262 | 	//  	_ = b[3]
263 | 	//  	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
264 | 	// }
265 | 	return binary.LittleEndian.Uint32(buf)
266 | }
267 | 
268 | func main() {
269 | 	ctrl := uint8(0b11_10_01_00)
270 | 	data := []byte{
271 | 		0b01101111, 0b11010010, 0b00000100,
272 | 		0b10000011, 0b00001010, 0b00001100,
273 | 		0b00000000, 0b00000000, 0b00000000,
274 | 		0b01000000,
275 | 	}
276 | 
277 | 	len0 := (ctrl & 3) + 1      // 1
278 | 	len1 := (ctrl >> 2 & 3) + 1 // 2
279 | 	len2 := (ctrl >> 4 & 3) + 1 // 3
280 | 	len3 := (ctrl >> 6 & 3) + 1 // 4
281 | 	
282 | 	_ = decodeOne(data, len0)                   // 111
283 | 	_ = decodeOne(data[len0:], len1)            // 1234
284 | 	_ = decodeOne(data[len0+len1:], len2)       // 789_123
285 | 	_ = decodeOne(data[len0+len1+len2:], len3)  // 1_073_741_824
286 | }
287 | ```
288 | 
289 | ## Stream VByte format
290 | 
291 | Unfortunately, accelerating decoding of the varint-GB format with only SIMD techniques
292 | has proven unsuccessful. The below excerpt from the paper outlines why. 
293 | 
294 | > To understand why it might be difficult to accelerate the decoding of data compressed in the VARINT-GB
295 | > format compared to the VARINT-G8IU format, consider that we cannot decode faster than we can access the
296 | > control bytes. In VARINT-G8IU, the control bytes are conveniently always located nine compressed bytes
297 | > apart. Thus while a control byte is being processed, or even before, our superscalar processor can load
298 | > and start processing upcoming control bytes, as their locations are predictable. Instructions depending
299 | > on these control bytes can be reordered by the processor for best performance. However, in the VARINT-GB
300 | > format, there is a strong data dependency: the location of the next control byte depends on the current
301 | > control byte. This increases the risk that the processor remains underutilized, delayed by the latency
302 | > between issuing the load for the next control byte and waiting for it to be ready.
303 | 
304 | Additionally, they prove that decoding 4 integers at a time using 128-bit registers is faster than trying
305 | to decode a variable number of integers that fit into an 8-byte register, i.e. the varint-G8IU approach.
306 | 
307 | ### SIMD control byte generation algorithm
308 | 
309 | Lemire et al. have devised a brilliant SIMD algorithm for simultaneously generating two control bytes
310 | for a group of 8 integers. The best way to understand this algorithm is to understand how it works on 
311 | a single integer and then assume it works in a vectorized form (it does). Going forward we'll use
312 | *control bits stream* to represent these control bytes we are building. 
313 | 
314 | ```
315 | 00000000 00000000 00000100 11010010 // 1234
316 | ```
317 | 
318 | Let's take one of the previous integers that we were looking at, `1234`, and walk through an example
319 | of how the 2-bit control is generated for it using SIMD techniques. The goal is to be able to, for
320 | any 32-bit integer, generate a 2-bit zero indexed length value. For example, if you have an integer
321 | that requires 2 bytes to be encoded, we want for the algorithm to generate `0b01`.
322 | 
323 | ```
324 | 00000000 00000000 00000100 11010010 // 1234
325 | 00000001 00000001 00000001 00000001 // 0x0101 mask
326 | ----------------------------------- // byte-min(1234, 0x0101)
327 | 00000000 00000000 00000001 00000001
328 | ```
329 | 
330 | The algorithm first uses a mask where every byte is equal to 1. If you perform a per-byte min operation
331 | on our integer and the 1's mask, the result will have a 1 at every byte that had a value in the original
332 | integer. 
333 | 
334 | ```
335 | 00000000 00000000 00000001 00000001
336 | ----------------------------------- // pack unsigned saturating 16-bit to 8-bit
337 | 00000000 00000000 00000000 11111111
338 | ```
339 | 
340 | Now you perform a 16-bit to 8-bit unsigned saturating pack operation. Practically this means that you're
341 | taking every 16-bit value and trying to shove that into 8 bits. If the 16-bit integer is larger than
342 | the largest unsigned integer 8 bits can support, the pack saturates to the largest unsigned 8-bit value. 
343 | 
344 | Why this is performed will become more clear in the subsequent steps, however, at a high level, for every
345 | integer you want to encode, you want for the MSB of two consecutive bytes in the control bits stream
346 | to be representative of the final 2-bit control. For example, if you have a 3-byte integer, you want the
347 | MSB of two consecutive bytes to be 1 and 0, in that order. The reason you would want this is that
348 | there is a vector pack instruction that takes the MSB from every byte in the control bits stream
349 | and packs it into the lowest byte. This would thus represent the value `0b10` in the final byte for
350 | this 3-byte integer, which is what we want.
351 | 
352 | Performing a 16-bit to 8-bit unsigned saturating pack has the effect that you can use the saturation
353 | behavior to conditionally turn on the MSB of these bytes depending on which bytes have values in the
354 | original 32-bit integer.
355 | 
356 | ```
357 | 00000000 00000000 00000000 11111111 // control bits stream
358 | 00000001 00000001 00000001 00000001 // 0x0101 mask
359 | ----------------------------------- // signed 16-bit min
360 | 00000000 00000000 00000000 11111111
361 | ```
362 | 
363 | We then take the 1's mask we used before and perform a __signed 16-bit__ min operation. The reason for this
364 | is more clear if you look at an example using a 3-byte integer.
365 | 
366 | ```
367 | 00000000 00001100 00001010 10000011 // 789123
368 | 00000001 00000001 00000001 00000001 // 0x0101 mask
369 | ----------------------------------- // byte-min(789123, 0x0101)
370 | 00000000 00000001 00000001 00000001
371 | ----------------------------------- // pack unsigned saturating 16-bit to 8-bit
372 | 00000000 00000000 00000001 11111111
373 | 00000001 00000001 00000001 00000001 // 0x0101 mask
374 | ----------------------------------- // signed 16-bit min
375 | 00000000 00000000 00000001 00000001
376 | ```
377 | 
378 | The signed 16-bit min operation has three important effects.
379 | 
380 | First, for 3-byte integers, it has the effect of turning off the MSB of the lowest byte. This is necessary
381 | because a 3-byte integer should have a 2-bit control that is `0b10` and without this step using the MSB pack
382 | operation would result in a 2-bit control that looks something like `0b_1`, where the lowest bit is on.
383 | Obviously this is wrong, since only integers that require 2 or 4 bytes to encode should have that lower bit
384 | on, i.e. 1 or 3 as a zero-indexed length.
385 | 
386 | Second, for 4-byte integers, the signed aspect has the effect of leaving both MSBs of the 2 bytes on. When using the
387 | MSB pack operation later on, it will result in a 2-bit control value of `0b11`, which is what we want.
388 | 
389 | Third, for 1 and 2 byte integers, it has no effect. This is great for 2-byte values since the MSB will remain on
390 | and 1 byte values will not have any MSB on anyways, so it is effectively a noop in both scenarios.
391 | 
392 | ```
393 | 00000000 00000000 00000000 11111111 // control bits stream (original 1234)
394 | 01111111 00000000 01111111 00000000 // 0x7F00 mask
395 | ----------------------------------- // add unsigned saturating 16-bit
396 | 01111111 00000000 01111111 11111111
397 | ```
398 | 
399 | Next, we take a mask with the value `0x7F00` and perform an unsigned saturating add to the control bits stream.
400 | In the case for the integer `1234` this has no real effect. We maintain the MSB in the lowest byte. You'll note,
401 | however, that the only byte that has its MSB on is the last one, so performing an MSB pack operation would result
402 | in a value of `0b0001`, which is what we want. An example of this step on the integer `789123` might paint a clearer
403 | picture.
404 | 
405 | ```
406 | 00000000 00000000 00000001 00000001 // control bits stream (789123)
407 | 01111111 00000000 01111111 00000000 // 0x7F00 mask
408 | ----------------------------------- // add unsigned saturating 16-bit
409 | 01111111 00000000 10000000 00000001
410 | ```
411 | 
412 | You'll note here that the addition of `0x01` with `0x7F` in the upper byte results in the MSB of the resulting upper
413 | byte turning on. The MSB in the lower byte remains off and now an MSB pack operation will resolve to `0b0010`,
414 | which is what we want. The unsigned saturation behavior is really important for 4-byte numbers that only have
415 | bits in the most significant byte on. An example below:
416 | 
417 | ```
418 | 01000000 00000000 00000000 00000000 // 1073741824
419 | 00000001 00000001 00000001 00000001 // 0x0101 mask
420 | ----------------------------------- // byte-min(1073741824, 0x0101)
421 | 00000001 00000000 00000000 00000000
422 | ----------------------------------- // pack unsigned saturating 16-bit to 8-bit
423 | 00000000 00000000 11111111 00000000
424 | 00000001 00000001 00000001 00000001 // 0x0101 mask
425 | ----------------------------------- // signed 16-bit min
426 | 00000000 00000000 11111111 00000000
427 | 01111111 00000000 01111111 00000000 // 0x7F00 mask
428 | ----------------------------------- // add unsigned saturating 16-bit
429 | 01111111 00000000 11111111 11111111
430 | ```
431 | 
432 | Note here that because only the upper byte had a value in it, the lowest byte in the control bits stream remains
433 | zero for the duration of the algorithm. This poses an issue, since for a 4-byte value, we want for the 2-bit
434 | control to result in a value of `0b11`. Performing a 16-bit unsigned *saturating* addition has the effect of
435 | turning on all bits in the lower byte, and thus we get a result with the MSB in the lower byte on. 
436 | 
437 | ```
438 | 01111111 00000000 10000000 00000001 // control bits stream (789123)
439 | ----------------------------------- // move byte mask 
440 | 00000000 00000000 00000000 00000010 // 2-bit control 
441 | ```
442 | 
443 | The final move byte mask is performed on the control bits stream, and we now have the result we wanted. Now that you
444 | see that this works for 1 integer, you know how it can work for 8 integers simultaneously, since we use vector
445 | instructions that operate on 128 bit registers.
446 | 
447 | ### SIMD integer packing/unpacking
448 | 
449 | The next problem to be solved is how to take a group of 4 integers, and compress it by removing extraneous/unused
450 | bytes so that all you're left with is a stream of data bytes with real information. Let's take two numbers from
451 | our examples above.
452 | 
453 | ```
454 |                789123                                 1234
455 | 00000000 00001100 00001010 10000011 | 00000000 00000000 00000100 11010010
456 | -------------------------------------------------------------------------
457 |          00001100 00001010 10000011   00000100 11010010      // packed
458 | ```
459 | 
460 | Here, we can use a shuffle operation. Vector shuffle operations rearrange the bytes in an input register according
461 | to some provided mask into a destination register. Every position in the mask stores an offset into the source
462 | vector stream that represents the data byte that should go into that position.
463 | 
464 | ```
465 | input [1234, 789123] (little endian R-to-L)
466 | 00000000 00001100 00001010 10000011 00000000 00000000 00000100 11010010
467 |             |       |         |                             |        |
468 |             |       |         |____________________         |        |
469 |             |       |_____________________         |        |        |
470 |             |____________________         |        |        |        |
471 |                                  v        v        v        v        v
472 |     0xff     0xff     0xff     0x06     0x05     0x04     0x01     0x00 // mask in hex
473 | -----------------------------------------------------------------------
474 | 00000000 00000000 00000000 00001100 00001010 10000011 00000100 11010010 // packed
475 | ```
476 | 
477 | We keep a prebuilt lookup table that contains a mapping from control byte to the necessary mask and simply
478 | load that after we construct the control byte above. In addition, we keep a lookup table for a mapping from
479 | control bytes to total encoded length. This allows us to know by how much to increment the output pointer and
480 | overwrite, for example, the redundant upper 3 bytes in the above shuffle example.
481 | 
482 | Unpacking during decoding is the same as the above, but in reverse. We need to go from a packed format
483 | to an unpacked memory format. We keep lookup tables to maintain a mapping from control byte to the reverse
484 | shuffle mask, and then perform a shuffle operation to output to a `uint32` array.
485 | 
486 | # References
487 | 
488 | [Stream VByte: Faster Byte-Oriented Integer Compression](https://arxiv.org/pdf/1709.08990.pdf)
489 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/theMPatel/streamvbyte-simdgo
 2 | 
 3 | go 1.16
 4 | 
 5 | require (
 6 | 	github.com/mmcloughlin/avo v0.2.0
 7 | 	github.com/pkg/errors v0.9.1
 8 | 	golang.org/x/sys v0.0.0-20210510120138-977fb7262007
 9 | 	golang.org/x/tools v0.1.5 // indirect
10 | )
11 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/mmcloughlin/avo v0.2.0 h1:6vhoSaKtxb6f4RiH+LK2qL6GSMpFzhEwJYTTSZNy09w=
 2 | github.com/mmcloughlin/avo v0.2.0/go.mod h1:5tidO2Z9Z7N6X7UMcGg+1KTj51O8OxYDCMHxCZTVpEA=
 3 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
 4 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 5 | github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 6 | github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
 7 | golang.org/x/arch v0.0.0-20210405154355-08b684f594a5/go.mod h1:flIaEI6LNU6xOCD5PaJvn9wGP0agmIOqjrtsKGRguv4=
 8 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 9 | golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
10 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
11 | golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
12 | golang.org/x/mod v0.4.2 h1:Gz96sIWK3OalVv/I/qNygP42zyoKp3xptRVCWRFEBvo=
13 | golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
14 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
15 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
16 | golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
17 | golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
18 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
19 | golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
20 | golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
21 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
22 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
23 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
24 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
25 | golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
26 | golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
27 | golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
28 | golang.org/x/sys v0.0.0-20210510120138-977fb7262007 h1:gG67DSER+11cZvqIMb8S8bt0vZtiN6xWYARwirrOSfE=
29 | golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
30 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
31 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
32 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
33 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
34 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
35 | golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0=
36 | golang.org/x/tools v0.1.5 h1:ouewzE6p+/VEB31YYnTbEJdi8pFqKp4P4n85vwo3DHA=
37 | golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
38 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
39 | golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
40 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
41 | golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
42 | rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
43 | 


--------------------------------------------------------------------------------
/pkg/decode/decode.go:
--------------------------------------------------------------------------------
  1 | package decode
  2 | 
  3 | import (
  4 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
  5 | )
  6 | 
// getImpl and getDeltaImpl hold the decoding implementations chosen once by
// init; Get8uint32 and Get8uint32Delta dispatch through them.
var (
	getImpl      Get8Impl
	getDeltaImpl Get8DeltaImpl
)

// Get8Impl is the signature of a func that decodes 8 uint32 values from in
// into out, driven by the 16-bit control value ctrl. It has no result.
type Get8Impl func(in []byte, out []uint32, ctrl uint16)

// Get8DeltaImpl is the differential-coding counterpart of Get8Impl; prev is
// the base value the first decoded delta is added to. It has no result.
type Get8DeltaImpl func(in []byte, out []uint32, ctrl uint16, prev uint32)
 14 | 
 15 | func init() {
 16 | 	if GetMode() == shared.Fast {
 17 | 		getImpl = Get8uint32Fast
 18 | 		getDeltaImpl = Get8uint32DeltaFast
 19 | 	} else {
 20 | 		getImpl = Get8uint32Scalar
 21 | 		getDeltaImpl = Get8uint32DeltaScalar
 22 | 	}
 23 | }
 24 | 
// Get8uint32 is a general func you can use to decode 8 uint32's at a time
// from in into out, with ctrl carrying the 16-bit control for the batch.
// It calls whichever implementation was selected during package
// initialization: the accelerated version of Stream VByte when GetMode
// reported shared.Fast, and the scalar implementation otherwise.
//
// Nothing is returned, and the arguments are handed to the selected
// implementation unchanged.
func Get8uint32(in []byte, out []uint32, ctrl uint16) {
	getImpl(in, out, ctrl)
}
 33 | 
// Get8uint32Delta is a general func you can use to decode 8 differentially
// coded uint32's at a time from in into out. ctrl carries the 16-bit control
// for the batch and prev is the base value for the first delta. It calls
// whichever implementation was selected during package initialization: the
// accelerated version of Stream VByte when GetMode reported shared.Fast, and
// the scalar implementation otherwise.
//
// Nothing is returned, and the arguments are handed to the selected
// implementation unchanged.
func Get8uint32Delta(in []byte, out []uint32, ctrl uint16, prev uint32) {
	getDeltaImpl(in, out, ctrl, prev)
}
 42 | 
 43 | // Get8uint32Scalar will decode 8 uint32 values from in into out using the
 44 | // Stream VByte format. Returns the number of bytes read from the input
 45 | // buffer.
 46 | //
 47 | // Note: It is your responsibility to ensure that the incoming slices have
 48 | // the appropriate sizes and data otherwise this func will panic.
 49 | func Get8uint32Scalar(in []byte, out []uint32, ctrl uint16) {
 50 | 	lower := uint8(ctrl & 0xff)
 51 | 	upper := uint8(ctrl >> 8)
 52 | 	lowerSize := shared.ControlByteToSize(lower)
 53 | 	Get4uint32Scalar(in, out, lower)
 54 | 	Get4uint32Scalar(in[lowerSize:], out[4:], upper)
 55 | }
 56 | 
 57 | // Get4uint32Scalar will decode 4 uint32 values from in into out using the
 58 | // Stream VByte format. Returns the number of bytes read from the input
 59 | // buffer.
 60 | //
 61 | // Note: It is your responsibility to ensure that the incoming slices have
 62 | // the appropriate sizes and data otherwise this func will panic.
 63 | func Get4uint32Scalar(in []byte, out []uint32, ctrl uint8) {
 64 | 	sizes := shared.PerNumLenTable[ctrl]
 65 | 
 66 | 	len3 := sizes[3]
 67 | 	len2 := sizes[2]
 68 | 	len1 := sizes[1]
 69 | 	len0 := sizes[0]
 70 | 
 71 | 	out[3] = decodeOne(in[len0+len1+len2:], len3)
 72 | 	out[2] = decodeOne(in[len0+len1:], len2)
 73 | 	out[1] = decodeOne(in[len0:], len1)
 74 | 	out[0] = decodeOne(in, len0)
 75 | }
 76 | 
 77 | // GetUint32Scalar decodes up to 4 integers from in into out using the
 78 | // Stream VByte format.
 79 | //
 80 | // Note: It is your responsibility to ensure that the incoming slices have
 81 | // the appropriate sizes and data otherwise this func will panic.
 82 | func GetUint32Scalar(in []byte, out []uint32, ctrl uint8, count int) int {
 83 | 	if count == 0 {
 84 | 		return 0
 85 | 	}
 86 | 
 87 | 	if count > 4 {
 88 | 		count = 4
 89 | 	}
 90 | 
 91 | 	shift := 0
 92 | 	total := 0
 93 | 	for i := 0; i < count; i++ {
 94 | 		size := ((ctrl >> shift) & 0x3) + 1
 95 | 		out[i] = decodeOne(in[total:], size)
 96 | 		total += int(size)
 97 | 		shift += 2
 98 | 	}
 99 | 
100 | 	return total
101 | }
102 | 
103 | // GetUint32DeltaScalar decodes up to 4 integers from in into out using the
104 | // Stream VByte format. It will reconstruct the original non differentially
105 | // encoded values.
106 | //
107 | // Note: It is your responsibility to ensure that the incoming slices have
108 | // the appropriate sizes and data otherwise this func will panic.
109 | func GetUint32DeltaScalar(in []byte, out []uint32, ctrl uint8, count int, prev uint32) int {
110 | 	if count == 0 {
111 | 		return 0
112 | 	}
113 | 
114 | 	if count > 4 {
115 | 		count = 4
116 | 	}
117 | 
118 | 	shift := 0
119 | 	total := 0
120 | 	for i := 0; i < count; i++ {
121 | 		size := ((ctrl >> shift) & 0x3) + 1
122 | 		num := decodeOne(in[total:], size) + prev
123 | 		out[i] = num
124 | 		prev = num
125 | 		total += int(size)
126 | 		shift += 2
127 | 	}
128 | 
129 | 	return total
130 | }
131 | 
132 | // Get8uint32DeltaScalar will decode 8 uint32 values from in into out and reconstruct
133 | // the original values via differential coding. Prev provides a way for you to
134 | // indicate the base value for this batch of 8. For example, when decoding the second
135 | // batch of 8 integers out of, e.g. 16, you would provide a prev value of the last value
136 | // in the first batch of 8 you decoded. This is done to ensure that the integers are
137 | // correctly resolved to the correct diff. An example below.
138 | //
139 | // Input:	[ 10, 10, 10, 10, 10, 10, 10, 10 ] [ 10, 10, 10, 10, 10, 10, 10, 10 ]
140 | // Output:	[ 10, 20, 30, 40, 50, 60, 70, 80 ] [ 90, 100, 110, 120, 130, 140, 150, 160 ]
141 | // Prev: 80
142 | func Get8uint32DeltaScalar(in []byte, out []uint32, ctrl uint16, prev uint32) {
143 | 	lower := uint8(ctrl & 0xff)
144 | 	upper := uint8(ctrl >> 8)
145 | 	lowerSize := shared.ControlByteToSize(lower)
146 | 	Get4uint32DeltaScalar(in, out, lower, prev)
147 | 	Get4uint32DeltaScalar(in[lowerSize:], out[4:], upper, out[3])
148 | }
149 | 
150 | // Get4uint32DeltaScalar will decode 4 uint32 values from in into out and reconstruct
151 | // the original values via differential coding. Prev provides a way for you to
152 | // indicate the base value for this batch of 4. For example, when decoding the second
153 | // batch of 4 integers out of, e.g. 8, you would provide a prev value of the last value
154 | // in the first batch of 4 you decoded. This is done to ensure that the integers are
155 | // correctly resolved to the correct diff. An example below.
156 | //
157 | // Input:	[ 10, 10, 10, 10 ] [ 10, 10, 10, 10 ]
158 | // Output:	[ 10, 20, 30, 40 ] [ 50, 60, 70, 80 ]
159 | // Prev: 40
160 | func Get4uint32DeltaScalar(in []byte, out []uint32, ctrl uint8, prev uint32) {
161 | 	sizes := shared.PerNumLenTable[ctrl]
162 | 
163 | 	len0 := sizes[0]
164 | 	len1 := sizes[1]
165 | 	len2 := sizes[2]
166 | 	len3 := sizes[3]
167 | 
168 | 	// bounds check hint to compiler
169 | 	_ = out[3]
170 | 	out[0] = decodeOne(in, len0) + prev
171 | 	out[1] = decodeOne(in[len0:], len1) + out[0]
172 | 	out[2] = decodeOne(in[len0+len1:], len2) + out[1]
173 | 	out[3] = decodeOne(in[len0+len1+len2:], len3) + out[2]
174 | }
175 | 
// decodeOne assembles a little-endian uint32 from the first size bytes of b.
// size must be 1, 2, 3 or 4; any other value panics with "impossible". b must
// hold at least size bytes, otherwise the index expression panics.
func decodeOne(b []byte, size uint8) uint32 {
	switch size {
	case 1:
		return uint32(b[0])
	case 2:
		_ = b[1] // touch the highest index first
		return uint32(b[0]) | uint32(b[1])<<8
	case 3:
		_ = b[2] // touch the highest index first
		return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16
	case 4:
		_ = b[3] // touch the highest index first
		return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
	default:
		panic("impossible")
	}
}
189 | 


--------------------------------------------------------------------------------
/pkg/decode/decode_amd64.go:
--------------------------------------------------------------------------------
 1 | // +build amd64
 2 | 
 3 | // Package decode provides an x86_64 implementation of two
 4 | // Stream VByte decoding algorithms, a normal decoding approach
 5 | // and one that incorporates differential coding.
 6 | package decode
 7 | 
 8 | import (
 9 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
10 | 	"golang.org/x/sys/cpu"
11 | )
12 | 
13 | // GetMode performs a check to see if the current ISA supports
14 | // the below decoding funcs.
15 | func GetMode() shared.PerformanceMode {
16 | 	if cpu.X86.HasAVX {
17 | 		return shared.Fast
18 | 	}
19 | 	return shared.Normal
20 | }
21 | 
// Get8uint32Fast decodes 8 uint32 values from in into out by calling
// Get8uint32FastAsm, which is implemented in assembly, passing it the shared
// decode shuffle table and the per-control-byte length table.
//
// Note: the slices are forwarded as-is; nothing here checks their lengths.
func Get8uint32Fast(in []byte, out []uint32, ctrl uint16) {
	Get8uint32FastAsm(in, out, ctrl,
		shared.DecodeShuffleTable,
		shared.PerControlLenTable,
	)
}
30 | 
// Get8uint32DeltaFast decodes 8 differentially coded uint32 values from in
// into out by calling Get8uint32DeltaFastAsm, which is implemented in
// assembly, passing it the shared decode shuffle table and the
// per-control-byte length table. prev is the base value for the first delta.
//
// Note: the slices are forwarded as-is; nothing here checks their lengths.
func Get8uint32DeltaFast(in []byte, out []uint32, ctrl uint16, prev uint32) {
	Get8uint32DeltaFastAsm(
		in, out, ctrl, prev,
		shared.DecodeShuffleTable,
		shared.PerControlLenTable,
	)
}
40 | 
// Get8uint32FastAsm uses the provided 16-bit control to load the
// appropriate decoding shuffle masks and performs a shuffle
// operation on the provided input bytes. This in effect decompresses
// the input byte stream to uint32s. The result is written to
// the provided output slice.
//
// shuffle supplies one 16-byte mask per control byte; the low byte of ctrl
// selects the mask for the first 4 values and the high byte the mask for the
// next 4. lenTable is indexed by the low control byte to find the offset of
// the second group within in. The body lives in decode_amd64.s.
//
//go:noescape
func Get8uint32FastAsm(
	in []byte, out []uint32, ctrl uint16,
	shuffle *[256][16]uint8, lenTable *[256]uint8,
)
51 | 
// Get8uint32DeltaFastAsm works similarly to Get8uint32FastAsm with the
// exception that prior to writing the uncompressed integers out
// to the output slice, the original values are reconstructed from
// the diffs. prev is the base value for the first group of 4; the second
// group uses the last reconstructed value of the first group. The basic
// reconstruction algorithm for one group is as follows (P is prev, and
// juxtaposed letters denote a sum):
//
// Input:           [A B C D]
// Input Shifted:   [- A  B  C]
// Add above two:   [A AB BC CD]
// Add Prev:        [PA PAB PBC PCD]
// Input Shifted:   [- - A AB]
// Add Shifted:     [PA PAB PABC PABCD]
//
//go:noescape
func Get8uint32DeltaFastAsm(
	in []byte, out []uint32, ctrl uint16, prev uint32,
	shuffle *[256][16]uint8, lenTable *[256]uint8,
)
68 | 


--------------------------------------------------------------------------------
/pkg/decode/decode_amd64.s:
--------------------------------------------------------------------------------
 1 | // Code generated by command: go run asm.go -out ./decode_amd64.s. DO NOT EDIT.
 2 | 
 3 | #include "textflag.h"
 4 | 
// func Get8uint32FastAsm(in []byte, out []uint32, ctrl uint16, shuffle *[256][16]uint8, lenTable *[256]uint8)
// Requires: AVX
//
// Decodes two groups of 4 uint32 values. The low byte of ctrl selects the
// shuffle mask for the first group and the high byte the mask for the second;
// lenTable[low byte] is the byte offset of the second group within in.
// NOTE(review): in_len and out_len are never read and each group is fetched
// with an unconditional 16-byte load, so callers presumably must guarantee 16
// readable bytes at both group offsets and 32 writable bytes at out - confirm.
TEXT ·Get8uint32FastAsm(SB), NOSPLIT, $0-72
	// DX = shuffle + 16*(ctrl & 0xff): mask address for the first group.
	MOVWQZX ctrl+48(FP), AX
	MOVQ    shuffle+56(FP), CX
	MOVBQZX AL, DX
	SHLQ    $0x04, DX
	ADDQ    CX, DX
	// BX = shuffle + 16*(ctrl >> 8): mask address for the second group.
	MOVWQZX AX, BX
	SHRQ    $0x08, BX
	SHLQ    $0x04, BX
	ADDQ    CX, BX
	// CX = in base; SI = in base + lenTable[ctrl & 0xff] (second group start).
	MOVQ    in_base+0(FP), CX
	MOVQ    CX, SI
	MOVQ    lenTable+64(FP), DI
	MOVBQZX AL, AX
	ADDQ    DI, AX
	MOVBQZX (AX), AX
	ADDQ    AX, SI
	// Load 16 bytes per group and expand them with the selected masks.
	VLDDQU  (CX), X0
	VLDDQU  (SI), X1
	VPSHUFB (DX), X0, X0
	VPSHUFB (BX), X1, X1
	// Store the 8 decoded values (2 x 16 bytes) to out.
	MOVQ    out_base+24(FP), AX
	VMOVDQU X0, (AX)
	VMOVDQU X1, 16(AX)
	RET
32 | 
// func Get8uint32DeltaFastAsm(in []byte, out []uint32, ctrl uint16, prev uint32, shuffle *[256][16]uint8, lenTable *[256]uint8)
// Requires: AVX
//
// Decodes two groups of 4 deltas as in Get8uint32FastAsm, then turns each
// group into running sums: the first group is based on prev, the second on the
// last reconstructed value of the first group.
// NOTE(review): in_len and out_len are never read and each group is fetched
// with an unconditional 16-byte load, so callers presumably must guarantee 16
// readable bytes at both group offsets and 32 writable bytes at out - confirm.
TEXT ·Get8uint32DeltaFastAsm(SB), NOSPLIT, $0-72
	// DX = shuffle + 16*(ctrl & 0xff): mask address for the first group.
	MOVWQZX      ctrl+48(FP), AX
	MOVQ         shuffle+56(FP), CX
	MOVBQZX      AL, DX
	SHLQ         $0x04, DX
	ADDQ         CX, DX
	// BX = shuffle + 16*(ctrl >> 8): mask address for the second group.
	MOVWQZX      AX, BX
	SHRQ         $0x08, BX
	SHLQ         $0x04, BX
	ADDQ         CX, BX
	// CX = in base; SI = in base + lenTable[ctrl & 0xff] (second group start).
	MOVQ         in_base+0(FP), CX
	MOVQ         CX, SI
	MOVQ         lenTable+64(FP), DI
	MOVBQZX      AL, AX
	ADDQ         DI, AX
	MOVBQZX      (AX), AX
	ADDQ         AX, SI
	// Load 16 bytes per group and expand them with the selected masks.
	VLDDQU       (CX), X0
	VLDDQU       (SI), X1
	VPSHUFB      (DX), X0, X0
	VPSHUFB      (BX), X1, X1
	// X2 = prev in all four lanes.
	VBROADCASTSS prev+52(FP), X2
	// First group [A B C D]: add a copy shifted by one lane, add prev, then
	// add the pre-prev sums shifted by two lanes.
	VPSLLDQ      $0x04, X0, X3
	VPADDD       X0, X3, X0
	VPSLLDQ      $0x08, X0, X3
	VPADDD       X0, X2, X0
	VPADDD       X0, X3, X0
	// X2 = highest lane of X0 in all four lanes: the base for the second group.
	VPSHUFD      $0xff, X0, X2
	// Second group: same running-sum steps with the new base.
	VPSLLDQ      $0x04, X1, X3
	VPADDD       X1, X3, X1
	VPSLLDQ      $0x08, X1, X3
	VPADDD       X1, X2, X1
	VPADDD       X1, X3, X1
	// Store the 8 reconstructed values (2 x 16 bytes) to out.
	MOVQ         out_base+24(FP), AX
	VMOVDQU      X0, (AX)
	VMOVDQU      X1, 16(AX)
	RET
72 | 


--------------------------------------------------------------------------------
/pkg/decode/decode_base.go:
--------------------------------------------------------------------------------
 1 | // +build !amd64
 2 | 
 3 | package decode
 4 | 
 5 | import (
 6 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
 7 | )
 8 | 
// GetMode always reports shared.Normal in this build (the file is compiled
// only when the target is not amd64), so the scalar decoders are used.
func GetMode() shared.PerformanceMode {
	return shared.Normal
}
12 | 
// Get8uint32Fast is the placeholder for the accelerated decoder on targets
// without an assembly implementation. GetMode always reports shared.Normal in
// this build, so package init never selects it; calling it directly panics.
//
// It deliberately declares no result. Package init assigns this func to a
// Get8Impl variable, whose signature has no result, so declaring one makes
// every non-amd64 build fail to compile. The signature now matches the amd64
// implementation.
func Get8uint32Fast(in []byte, out []uint32, ctrl uint16) {
	panic("unreachable")
}
16 | 
// Get8uint32DeltaFast is the placeholder for the accelerated differential
// decoder on targets without an assembly implementation. GetMode always
// reports shared.Normal in this build, so package init never selects it;
// calling it directly panics.
//
// It deliberately declares no result. Package init assigns this func to a
// Get8DeltaImpl variable, whose signature has no result, so declaring one
// makes every non-amd64 build fail to compile. The signature now matches the
// amd64 implementation.
func Get8uint32DeltaFast(in []byte, out []uint32, ctrl uint16, prev uint32) {
	panic("unreachable")
}
20 | 


--------------------------------------------------------------------------------
/pkg/decode/decode_test.go:
--------------------------------------------------------------------------------
  1 | package decode
  2 | 
  3 | import (
  4 | 	"encoding/binary"
  5 | 	"math/rand"
  6 | 	"reflect"
  7 | 	"testing"
  8 | 	"time"
  9 | 
 10 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/encode"
 11 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
 12 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/util"
 13 | )
 14 | 
// init seeds math/rand's global source from the current wall-clock time so
// that values drawn from it in these tests (e.g. via rand.Intn) differ
// between runs.
func init() {
	rand.Seed(time.Now().UnixNano())
}
 18 | 
 19 | func TestGet8uint32Scalar(t *testing.T) {
 20 | 	count := 8
 21 | 	expected := util.GenUint32(count)
 22 | 	in := make([]byte, count*encode.MaxBytesPerNum)
 23 | 	ctrl := encode.Put8uint32Scalar(expected, in)
 24 | 	out := make([]uint32, 8)
 25 | 
 26 | 	Get8uint32Scalar(in, out, ctrl)
 27 | 	if !reflect.DeepEqual(expected, out) {
 28 | 		t.Fatalf("expected %+v, got %+v", expected, out)
 29 | 	}
 30 | }
 31 | 
 32 | func TestGet8uint32DeltaScalar(t *testing.T) {
 33 | 	count := 8
 34 | 	expected := util.GenUint32(count)
 35 | 	util.SortUint32(expected)
 36 | 	in := make([]byte, count*encode.MaxBytesPerNum)
 37 | 	ctrl := encode.Put8uint32DeltaScalar(expected, in, 0)
 38 | 	out := make([]uint32, 8)
 39 | 
 40 | 	Get8uint32DeltaScalar(in, out, ctrl, 0)
 41 | 	if !reflect.DeepEqual(expected, out) {
 42 | 		t.Fatalf("expected %+v, got %+v", expected, out)
 43 | 	}
 44 | }
 45 | 
 46 | func TestGet8uint32Fast(t *testing.T) {
 47 | 	if GetMode() == shared.Normal {
 48 | 		t.Skipf("Testing environment doesn't support this test")
 49 | 	}
 50 | 
 51 | 	count := 8
 52 | 	expected := util.GenUint32(count)
 53 | 	in := make([]byte, count*encode.MaxBytesPerNum)
 54 | 	ctrl := encode.Put8uint32Scalar(expected, in)
 55 | 	out := make([]uint32, 8)
 56 | 
 57 | 	Get8uint32Fast(in, out, ctrl)
 58 | 	if !reflect.DeepEqual(expected, out) {
 59 | 		t.Fatalf("expected %+v, got %+v", expected, out)
 60 | 	}
 61 | }
 62 | 
 63 | func TestGet8uint32DeltaFast(t *testing.T) {
 64 | 	if GetMode() == shared.Normal {
 65 | 		t.Skipf("Testing environment doesn't support this test")
 66 | 	}
 67 | 
 68 | 	count := 8
 69 | 	expected := util.GenUint32(count)
 70 | 	util.SortUint32(expected)
 71 | 	in := make([]byte, count*encode.MaxBytesPerNum)
 72 | 	ctrl := encode.Put8uint32DeltaScalar(expected, in, 0)
 73 | 	out := make([]uint32, 8)
 74 | 
 75 | 	Get8uint32DeltaFast(in, out, ctrl, 0)
 76 | 	if !reflect.DeepEqual(expected, out) {
 77 | 		t.Fatalf("expected %+v, got %+v", expected, out)
 78 | 	}
 79 | }
 80 | 
 81 | func TestGetUint32Scalar(t *testing.T) {
 82 | 	count := rand.Intn(4) + 1
 83 | 	expected := util.GenUint32(count)
 84 | 	in := make([]byte, count*encode.MaxBytesPerNum)
 85 | 	ctrl := encode.PutUint32Scalar(expected, in, count)
 86 | 	out := make([]uint32, count)
 87 | 
 88 | 	GetUint32Scalar(in, out, ctrl, count)
 89 | 	if !reflect.DeepEqual(expected, out) {
 90 | 		t.Fatalf("expected %+v, got %+v", expected, out)
 91 | 	}
 92 | }
 93 | 
 94 | func TestGetUint32DeltaScalar(t *testing.T) {
 95 | 	count := rand.Intn(4) + 1
 96 | 	expected := util.GenUint32(count)
 97 | 	util.SortUint32(expected)
 98 | 	in := make([]byte, count*encode.MaxBytesPerNum)
 99 | 	deltas := make([]uint32, count)
100 | 	util.Delta(expected, deltas)
101 | 	ctrl := encode.PutUint32Scalar(deltas, in, count)
102 | 
103 | 	out := make([]uint32, count)
104 | 	GetUint32DeltaScalar(in, out, ctrl, count, 0)
105 | 	if !reflect.DeepEqual(expected, out) {
106 | 		t.Fatalf("expected %+v, got %+v", expected, out)
107 | 	}
108 | }
109 | 
110 | var readSinkA []uint32
111 | 
112 | func BenchmarkGet8uint32Fast(b *testing.B) {
113 | 	if GetMode() == shared.Normal {
114 | 		b.Skipf("Testing environment doesn't support this test")
115 | 	}
116 | 
117 | 	count := 8
118 | 	out := make([]uint32, count)
119 | 
120 | 	nums := util.GenUint32(count)
121 | 	in := make([]byte, count*encode.MaxBytesPerNum)
122 | 	ctrl := encode.Put8uint32Scalar(nums, in)
123 | 
124 | 	b.SetBytes(int64(count * encode.MaxBytesPerNum))
125 | 	b.ResetTimer()
126 | 	for i := 0; i < b.N; i++ {
127 | 		Get8uint32Fast(in, out, ctrl)
128 | 	}
129 | 	readSinkA = out
130 | }
131 | 
132 | var readSinkB []uint32
133 | 
134 | func BenchmarkGet8uint32DeltaFast(b *testing.B) {
135 | 	if GetMode() == shared.Normal {
136 | 		b.Skipf("Testing environment doesn't support this test")
137 | 	}
138 | 
139 | 	count := 8
140 | 	out := make([]uint32, count)
141 | 	nums := util.GenUint32(count)
142 | 	util.SortUint32(nums)
143 | 	in := make([]byte, count*encode.MaxBytesPerNum)
144 | 	ctrl := encode.Put8uint32DeltaScalar(nums, in, 0)
145 | 
146 | 	b.SetBytes(int64(count * encode.MaxBytesPerNum))
147 | 	b.ResetTimer()
148 | 	for i := 0; i < b.N; i++ {
149 | 		Get8uint32DeltaFast(in, out, ctrl, 0)
150 | 	}
151 | 	readSinkB = out
152 | }
153 | 
154 | var readSinkC []uint32
155 | 
156 | func BenchmarkGet8uint32Scalar(b *testing.B) {
157 | 	count := 8
158 | 	out := make([]uint32, count)
159 | 	nums := util.GenUint32(count)
160 | 	in := make([]byte, count*encode.MaxBytesPerNum)
161 | 	ctrl := encode.Put8uint32Scalar(nums, in)
162 | 
163 | 	b.SetBytes(int64(count * encode.MaxBytesPerNum))
164 | 	b.ResetTimer()
165 | 	for i := 0; i < b.N; i++ {
166 | 		Get8uint32Scalar(in, out, ctrl)
167 | 	}
168 | 	readSinkC = out
169 | }
170 | 
171 | var readSinkD []uint32
172 | 
173 | func BenchmarkGet8uint32DeltaScalar(b *testing.B) {
174 | 	count := 8
175 | 	out := make([]uint32, count)
176 | 	nums := util.GenUint32(count)
177 | 	util.SortUint32(nums)
178 | 	in := make([]byte, count*encode.MaxBytesPerNum)
179 | 	ctrl := encode.Put8uint32DeltaScalar(nums, in, 0)
180 | 
181 | 	b.SetBytes(int64(count * encode.MaxBytesPerNum))
182 | 	b.ResetTimer()
183 | 	for i := 0; i < b.N; i++ {
184 | 		Get8uint32DeltaScalar(in, out, ctrl, 0)
185 | 	}
186 | 	readSinkD = out
187 | }
188 | 
189 | var readSinkE []uint32
190 | 
191 | func BenchmarkGet8uint32Varint(b *testing.B) {
192 | 	count := 8
193 | 	out := make([]uint32, count)
194 | 	data := make([]byte, binary.MaxVarintLen32*count)
195 | 	written := util.PutVarint(util.GenUint32(count), data)
196 | 	data = data[:written]
197 | 
198 | 	b.SetBytes(int64(count * encode.MaxBytesPerNum))
199 | 	b.ResetTimer()
200 | 	for i := 0; i < b.N; i++ {
201 | 		util.GetVarint(data, out)
202 | 	}
203 | 	readSinkE = out
204 | }
205 | 
206 | var readSinkF []uint32
207 | 
208 | func BenchmarkGet8uint32DeltaVarint(b *testing.B) {
209 | 	count := 8
210 | 	out := make([]uint32, count)
211 | 	data := make([]byte, binary.MaxVarintLen32*count)
212 | 	written := util.PutDeltaVarint(util.GenUint32(count), data, 0)
213 | 	data = data[:written]
214 | 
215 | 	b.SetBytes(int64(count * encode.MaxBytesPerNum))
216 | 	b.ResetTimer()
217 | 	for i := 0; i < b.N; i++ {
218 | 		util.GetDeltaVarint(data, out, 0)
219 | 	}
220 | 	readSinkF = out
221 | }
222 | 


--------------------------------------------------------------------------------
/pkg/decode/gen.go:
--------------------------------------------------------------------------------
1 | package decode
2 | 
3 | //go:generate go run ./main/asm.go -out ./decode_amd64.s
4 | 


--------------------------------------------------------------------------------
/pkg/decode/main/asm.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"log"
  6 | 
  7 | 	. "github.com/mmcloughlin/avo/build"
  8 | 	"github.com/mmcloughlin/avo/operand"
  9 | 	"github.com/mmcloughlin/avo/reg"
 10 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
 11 | )
 12 | 
const (
	// Symbol names passed to TEXT for the two generated routines.
	name      = "Get8uint32FastAsm"
	nameDelta = "Get8uint32DeltaFastAsm"

	// Parameter names. They are interpolated into the Go signatures below
	// and used with Param to look up each argument.
	pIn       = "in"
	pOut      = "out"
	pCtrl     = "ctrl"
	pShuffle  = "shuffle"
	pLenTable = "lenTable"
	pPrev     = "prev"
)
 24 | 
var (
	// signature is the Go prototype handed to TEXT for the plain decoder.
	signature = fmt.Sprintf(
		"func(%s []byte, %s []uint32, %s uint16, %s *[256][16]uint8, %s *[256]uint8)",
		pIn, pOut, pCtrl, pShuffle, pLenTable)

	// signatureDelta is the Go prototype handed to TEXT for the differential
	// decoder; it adds the prev argument after ctrl.
	signatureDelta = fmt.Sprintf(
		"func(%s []byte, %s []uint32, %s uint16, %s uint32, %s *[256][16]uint8, %s *[256]uint8)",
		pIn, pOut, pCtrl, pPrev, pShuffle, pLenTable)
)
 34 | 
// main defines both decode routines and then calls avo's Generate to emit
// them (the go:generate directive in gen.go passes -out ./decode_amd64.s).
func main() {
	regular()
	differential()
	Generate()
}
 40 | 
// regular defines Get8uint32FastAsm: it runs the shared load-and-shuffle
// sequence and stores the two resulting vectors to out.
func regular() {
	TEXT(name, NOSPLIT, signature)

	firstFour, secondFour := coreAlgorithm()
	outBase := operand.Mem{Base: Load(Param(pOut).Base(), GP64())}

	// Two unaligned 16-byte stores at byte offsets 0 and 16 of out.
	VMOVDQU(firstFour, outBase)
	VMOVDQU(secondFour, outBase.Offset(16))

	RET()
}
 52 | 
 53 | func differential() {
 54 | 	TEXT(nameDelta, NOSPLIT, signatureDelta)
 55 | 
 56 | 	firstFour, secondFour := coreAlgorithm() // [A B C D] [E F G H]
 57 | 	prevSingular, err := Param(pPrev).Resolve()
 58 | 	if err != nil {
 59 | 		log.Fatalf("failed to get addr of prev")
 60 | 	}
 61 | 
 62 | 	prev := XMM()
 63 | 	VBROADCASTSS(prevSingular.Addr, prev) // [P P P P]
 64 | 	undoDelta(firstFour, prev)
 65 | 
 66 | 	VPSHUFD(operand.Imm(0xff), firstFour, prev) // [A B C D] -> [D D D D]
 67 | 	undoDelta(secondFour, prev)
 68 | 
 69 | 	outBase := operand.Mem{Base: Load(Param(pOut).Base(), GP64())}
 70 | 
 71 | 	VMOVDQU(firstFour, outBase)
 72 | 	VMOVDQU(secondFour, outBase.Offset(16))
 73 | 
 74 | 	RET()
 75 | }
 76 | 
// undoDelta emits the instructions that replace the four 32-bit deltas in
// four with their running sum, offset by the value broadcast in prev. The
// lane diagrams on the right use juxtaposition for addition (AB means A+B).
func undoDelta(four, prev reg.VecVirtual) {
	adder := XMM()                       // [A B C D]
	VPSLLDQ(operand.Imm(4), four, adder) // [- A  B  C]
	VPADDD(four, adder, four)            // [A AB BC CD]
	VPSLLDQ(operand.Imm(8), four, adder) // [- - A AB]
	VPADDD(four, prev, four)             // [PA PAB PBC PCD]
	VPADDD(four, adder, four)            // [PA PAB PABC PABCD]
}
 85 | 
// coreAlgorithm emits the instruction sequence shared by both decoders and
// returns the two vector registers that hold the shuffled first and second
// groups of input bytes.
func coreAlgorithm() (reg.VecVirtual, reg.VecVirtual) {
	// Load the control argument into a 64-bit general purpose register.
	ctrl := GP64()
	Load(Param(pCtrl), ctrl)

	// Resolve one shuffle-mask operand per group.
	// NOTE(review): the boolean presumably selects the upper control byte;
	// confirm against shared.CalculateShuffleAddrFromCtrl.
	shuffleBase := Load(Param(pShuffle), GP64())
	shuffleA := shared.CalculateShuffleAddrFromCtrl(shuffleBase, ctrl, false)
	shuffleB := shared.CalculateShuffleAddrFromCtrl(shuffleBase, ctrl, true)

	// The first group starts at the base of in. The second group starts
	// after it, at an offset read as a single byte through lowerAddr.
	// NOTE(review): lowerAddr presumably indexes lenTable by the lower
	// control byte; confirm against shared.LenValueAddr.
	firstBlock := Load(Param(pIn).Base(), GP64())
	secondBlock := GP64()
	MOVQ(firstBlock, secondBlock)
	lowerAddr, lowerSize := shared.LenValueAddr(ctrl, false, pLenTable)

	MOVBQZX(lowerAddr, lowerSize)
	ADDQ(lowerSize, secondBlock)

	// Unaligned vector loads from the start of each group.
	firstFour := XMM()
	secondFour := XMM()
	VLDDQU(operand.Mem{Base: firstBlock}, firstFour)
	VLDDQU(operand.Mem{Base: secondBlock}, secondFour)

	// Byte-shuffle each loaded vector in place using its mask.
	VPSHUFB(shuffleA, firstFour, firstFour)
	VPSHUFB(shuffleB, secondFour, secondFour)

	return firstFour, secondFour
}
112 | 


--------------------------------------------------------------------------------
/pkg/encode/encode.go:
--------------------------------------------------------------------------------
  1 | package encode
  2 | 
  3 | import (
  4 | 	"math/bits"
  5 | 
  6 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
  7 | )
  8 | 
const (
	// MaxBytesPerNum is the largest number of bytes a single encoded uint32
	// can occupy.
	MaxBytesPerNum = 4
)
 12 | 
// Implementations chosen once by init and used by Put8uint32 and
// Put8uint32Delta.
var (
	putImpl      Put8Impl
	putDeltaImpl Put8DeltaImpl
)
 17 | 
// Put8Impl is the signature shared by the encoders that process 8 uint32
// values per call and return a 16-bit control value.
type Put8Impl func(in []uint32, out []byte) (ctrl uint16)

// Put8DeltaImpl is the signature shared by the differential encoders that
// process 8 uint32 values per call; prev is the base for the first delta.
type Put8DeltaImpl func(in []uint32, out []byte, prev uint32) (ctrl uint16)
 20 | 
 21 | func init() {
 22 | 	if GetMode() == shared.Fast {
 23 | 		putImpl = Put8uint32Fast
 24 | 		putDeltaImpl = Put8uint32DeltaFast
 25 | 	} else {
 26 | 		putImpl = Put8uint32Scalar
 27 | 		putDeltaImpl = Put8uint32DeltaScalar
 28 | 	}
 29 | }
 30 | 
// Put8uint32 encodes 8 uint32 values from in into out and returns the 16-bit
// control value. It dispatches to the implementation selected during package
// initialization: the accelerated Stream VByte encoder when GetMode reports
// shared.Fast, and the scalar encoder otherwise.
func Put8uint32(in []uint32, out []byte) uint16 {
	return putImpl(in, out)
}
 39 | 
// Put8uint32Delta differentially encodes 8 uint32 values from in into out
// and returns the 16-bit control value; prev is the base for the first
// delta. It dispatches to the implementation selected during package
// initialization: the accelerated Stream VByte encoder when GetMode reports
// shared.Fast, and the scalar encoder otherwise.
func Put8uint32Delta(in []uint32, out []byte, prev uint32) uint16 {
	return putDeltaImpl(in, out, prev)
}
 48 | 
 49 | // PutUint32Scalar encodes up to 4 integers from in into out using the
 50 | // Stream VByte format.
 51 | //
 52 | // Note: It is your responsibility to ensure that the incoming slices have
 53 | // the appropriate sizes and data otherwise this func will panic.
 54 | func PutUint32Scalar(in []uint32, out []byte, count int) uint8 {
 55 | 	if count == 0 {
 56 | 		return 0
 57 | 	}
 58 | 
 59 | 	if count > 4 {
 60 | 		count = 4
 61 | 	}
 62 | 
 63 | 	var (
 64 | 		ctrl  uint8
 65 | 		shift = 0
 66 | 		total = 0
 67 | 	)
 68 | 	for i := 0; i < count; i++ {
 69 | 		size := encodeOne(in[i], out[total:])
 70 | 		total += size
 71 | 		ctrl |= uint8(size-1) << shift
 72 | 		shift += 2
 73 | 	}
 74 | 
 75 | 	return ctrl
 76 | }
 77 | 
 78 | // PutUint32DeltaScalar encodes up to 4 integers from in into out using the
 79 | // Stream VByte format.
 80 | //
 81 | // Note: It is your responsibility to ensure that the incoming slices have
 82 | // the appropriate sizes and data otherwise this func will panic.
 83 | func PutUint32DeltaScalar(in []uint32, out []byte, count int, prev uint32) uint8 {
 84 | 	if count == 0 {
 85 | 		return 0
 86 | 	}
 87 | 
 88 | 	if count > 4 {
 89 | 		count = 4
 90 | 	}
 91 | 
 92 | 	var (
 93 | 		ctrl  uint8
 94 | 		shift = 0
 95 | 		total = 0
 96 | 	)
 97 | 	for i := 0; i < count; i++ {
 98 | 		size := encodeOne(in[i]-prev, out[total:])
 99 | 		total += size
100 | 		ctrl |= uint8(size-1) << shift
101 | 		shift += 2
102 | 		prev = in[i]
103 | 	}
104 | 
105 | 	return ctrl
106 | }
107 | 
108 | // Put8uint32Scalar will encode 8 uint32 values from in into out using the
109 | // Stream VByte format. Returns an 16-bit control value produced from the
110 | // encoding.
111 | //
112 | // Note: It is your responsibility to ensure that the incoming slices have
113 | // the appropriate sizes and data otherwise this func will panic.
114 | func Put8uint32Scalar(in []uint32, out []byte) uint16 {
115 | 	var ctrl uint16
116 | 	first := Put4uint32Scalar(in, out)
117 | 	ctrl |= uint16(first)
118 | 	encoded := shared.ControlByteToSize(first)
119 | 	second := Put4uint32Scalar(in[4:], out[encoded:])
120 | 	return ctrl | uint16(second)<<8
121 | }
122 | 
123 | // Put4uint32Scalar will encode 4 uint32 values from in into out using the
124 | // Stream VByte format. Returns an 8-bit control value produced from the
125 | // encoding. Every incoming number is variably encoded, and an 8-bit control
126 | // is constructed from the 2-bit len of each uint32. Below is an example of
127 | // 4 uint32's and how they are encoded.
128 | //
129 | // 00000000 00000000 00000000 01101111  =        111
130 | // 00000000 00000000 00000100 11010010  =       1234
131 | // 00000000 00001100 00001010 10000011  =     789123
132 | // 01000000 00000000 00000000 00000000  = 1073741824
133 | //
134 | // Num         Len      2-bit control
135 | // ----------------------------------
136 | // 111          1                0b00
137 | // 1234         2                0b01
138 | // 789123       3                0b10
139 | // 1073741824   4                0b11
140 | //
141 | // Final Control byte
142 | // 0b11100100
143 | //
144 | // Encoded data (little endian right-to-left bottom-to-top)
145 | // 0b01000000 0b00000000 0b00000000 0b00000000 0b00001100
146 | // 0b00001010 0b10000011 0b00000100 0b11010010 0b01101111
147 | //
148 | // Note: It is your responsibility to ensure that the incoming slices have
149 | // the appropriate sizes and data otherwise this func will panic.
150 | func Put4uint32Scalar(in []uint32, out []byte) uint8 {
151 | 	// bounds check hint to compiler
152 | 	_ = in[3]
153 | 
154 | 	num0 := in[0]
155 | 	num1 := in[1]
156 | 	num2 := in[2]
157 | 	num3 := in[3]
158 | 
159 | 	len0 := encodeOne(num0, out)
160 | 	len1 := encodeOne(num1, out[len0:])
161 | 	len2 := encodeOne(num2, out[len0+len1:])
162 | 	len3 := encodeOne(num3, out[len0+len1+len2:])
163 | 
164 | 	return uint8((len0 - 1) | (len1-1)<<2 | (len2-1)<<4 | (len3-1)<<6)
165 | }
166 | 
167 | // Put8uint32DeltaScalar will differentially encode 8 uint32 values from in into out.
168 | // Prev provides a way for you to indicate the base value for this batch of 8.
169 | // For example, when encoding the second batch of 8 integers out of, e.g. 16, you would
170 | // provide a prev value of the last value in the first batch of 8 you encoded. This
171 | // is done to ensure that the integers are correctly resolved to the correct diff. An
172 | // example below. Note that this func assumes that the input integers are already sorted.
173 | //
174 | // Input:	[ 10, 20, 30, 40, 50, 60, 70, 80 ] [ 90, 100, 110, 120, 130, 140, 150, 160 ]
175 | // Output:	[ 10, 10, 10, 10, 10, 10, 10, 10 ] [ 10, 10, 10, 10, 10, 10, 10, 10 ]
176 | // Prev: 80
177 | func Put8uint32DeltaScalar(in []uint32, out []byte, prev uint32) uint16 {
178 | 	var ctrl uint16
179 | 	first := Put4uint32DeltaScalar(in, out, prev)
180 | 	ctrl |= uint16(first)
181 | 	encoded := shared.ControlByteToSize(first)
182 | 	second := Put4uint32DeltaScalar(in[4:], out[encoded:], in[3])
183 | 	return ctrl | uint16(second)<<8
184 | }
185 | 
186 | // Put4uint32DeltaScalar will differentially encode 4 uint32 values from in into out.
187 | // Prev provides a way for you to indicate the base value for this batch of 4.
188 | // For example, when encoding the second batch of 4 integers out of, e.g. 8, you would
189 | // provide a prev value of the last value in the first batch of 4 you encoded. This
190 | // is done to ensure that the integers are correctly resolved to the correct diff. An
191 | // example below. Note that this func assumes that the input integers are already sorted.
192 | //
193 | // Input:	[ 10, 20, 30, 40 ] [ 50, 60, 70, 80 ]
194 | // Output:	[ 10, 10, 10, 10 ] [ 10, 10, 10, 10 ]
195 | // Prev: 40
196 | func Put4uint32DeltaScalar(in []uint32, out []byte, prev uint32) uint8 {
197 | 	// bounds check hint to compiler
198 | 	_ = in[3]
199 | 
200 | 	num0 := in[0] - prev
201 | 	num1 := in[1] - in[0]
202 | 	num2 := in[2] - in[1]
203 | 	num3 := in[3] - in[2]
204 | 
205 | 	len0 := encodeOne(num0, out)
206 | 	len1 := encodeOne(num1, out[len0:])
207 | 	len2 := encodeOne(num2, out[len0+len1:])
208 | 	len3 := encodeOne(num3, out[len0+len1+len2:])
209 | 
210 | 	return uint8((len0 - 1) | (len1-1)<<2 | (len2-1)<<4 | (len3-1)<<6)
211 | }
212 | 
// encodeOne writes num to the front of out in little-endian order using the
// fewest whole bytes that hold it (always at least one, so zero takes a
// single byte) and returns that byte count. Bytes of out beyond the returned
// count are left untouched.
func encodeOne(num uint32, out []byte) int {
	// OR-ing in the low bit gives zero a bit length of 1; for every other
	// value the bit length is unchanged.
	size := (bits.Len32(num|1) + 7) / 8

	// Check the highest index first so a short buffer panics before any byte
	// has been written.
	_ = out[size-1]
	for i := 0; i < size; i++ {
		out[i] = byte(num >> (8 * i))
	}
	return size
}
230 | 
// max returns the larger of a and b.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}
237 | 


--------------------------------------------------------------------------------
/pkg/encode/encode_amd64.go:
--------------------------------------------------------------------------------
  1 | // +build amd64
  2 | 
  3 | // Package encode provides an x86_64 implementation of two
  4 | // Stream VByte encoding algorithms, a normal encoding approach
  5 | // and one that incorporates differential coding.
  6 | package encode
  7 | 
  8 | import (
  9 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
 10 | 	"golang.org/x/sys/cpu"
 11 | )
 12 | 
 13 | // GetMode performs a check to see if the current ISA supports
 14 | // the below encoding funcs.
 15 | func GetMode() shared.PerformanceMode {
 16 | 	if cpu.X86.HasAVX && cpu.X86.HasAVX2 {
 17 | 		return shared.Fast
 18 | 	}
 19 | 	return shared.Normal
 20 | }
 21 | 
// Put8uint32Fast encodes 8 uint32 values by calling Put8uint32FastAsm, which
// is implemented in assembly, with the shared encode shuffle table and
// per-control length table.
//
// The assembly performs no bounds checks: it loads 32 bytes from in and
// stores two full 16-byte vectors into out, so the caller must supply
// buffers large enough for those accesses.
func Put8uint32Fast(in []uint32, out []byte) uint16 {
	return Put8uint32FastAsm(in, out,
		shared.EncodeShuffleTable,
		shared.PerControlLenTable,
	)
}
 30 | 
// Put8uint32DeltaFast differentially encodes 8 uint32 values by calling
// Put8uint32DeltaFastAsm, which is implemented in assembly, with the shared
// encode shuffle table and per-control length table. prev is the base for
// the first delta.
//
// The assembly performs no bounds checks: it loads 32 bytes from in and
// stores two full 16-byte vectors into out, so the caller must supply
// buffers large enough for those accesses.
func Put8uint32DeltaFast(in []uint32, out []byte, prev uint32) uint16 {
	return Put8uint32DeltaFastAsm(
		in, out, prev,
		shared.EncodeShuffleTable,
		shared.PerControlLenTable,
	)
}
 40 | 
 41 | // Put8uint32FastAsm has three core phases. First a 16-bit control is
 42 | // generated for the incoming 8 uint32s. Then, the calculated control
 43 | // is used to index into shared.EncodeShuffleTable to fetch the
 44 | // correct shuffle mask to compress the incoming integers. Finally,
 45 | // the calculated control is used to index into shared.PerControlLenTable
 46 | // to determine the offsets in the output array to write to.
 47 | //
 48 | // Based on the algorithm devised by Lemire et al., the SIMD control
 49 | // byte generation algorithm proceeds as follows. Note that here we
 50 | // are using 1234 as our first example integer.
 51 | //
 52 | // 00000000 00000000 00000100 11010010 // 1234
 53 | // 00000001 00000001 00000001 00000001 // 0x0101 mask
 54 | // ----------------------------------- // byte-min(1234, 0x0101)
 55 | // 00000000 00000000 00000001 00000001
 56 | //
 57 | // The algorithm first uses a mask where every byte is equal to 1. If
 58 | // you perform a per-byte min operation on our integer and the 1's mask,
 59 | // the result will have a 1 at every byte that had a value in the original
 60 | // integer.
 61 | //
 62 | // 00000000 00000000 00000001 00000001
 63 | // ----------------------------------- // pack unsigned saturating
 64 | // 00000000 00000000 00000000 11111111 // 16-bit to 8-bit
 65 | //
 66 | // Now you perform a 16-bit to 8-bit unsigned saturating pack operation.
 67 | // Practically this means that you're taking every 16-bit value and trying
 68 | // to shove that into 8 bits. If the 16-bit integer is larger than the
 69 | // largest unsigned integer 8 bits can support, the pack saturates to the
 70 | // largest unsigned 8-bit value.
 71 | //
 72 | // Why this is performed will become more clear in the subsequent steps,
 73 | // however, at a high level, for every integer you want to encode, you
 74 | // want for the MSB of two consecutive bytes in the control bits stream
 75 | // to be representative of the final 2-bit control. For example, if you
 76 | // have a 3-byte integer, you want the MSB of two consecutive bytes to be
 77 | // 1 and 0, in that order. The reason you would want this is that there
 78 | // is a vector pack instruction that takes the MSB from every byte in the
 79 | // control bits stream and packs it into the lowest byte. This would thus
 80 | // represent the value `0b10` in the final byte for this 3-byte integer,
 81 | // which is what we want.
 82 | //
 83 | // Performing a 16-bit to 8-bit unsigned saturating pack has the effect
 84 | // that you can use the saturation behavior to conditionally turn on the
 85 | // MSB of these bytes depending on which bytes have values in the original
 86 | // 32-bit integer.
 87 | //
 88 | // 00000000 00000000 00000000 11111111 // control bits stream
 89 | // 00000001 00000001 00000001 00000001 // 0x0101 mask
 90 | // ----------------------------------- // signed 16-bit min
 91 | // 00000000 00000000 00000000 11111111
 92 | //
 93 | // We then take the 1's mask we used before and perform a signed 16-bit
 94 | // min operation. The reason for this is more clear if you look at an
 95 | // example using a 3-byte integer.
 96 | //
 97 | // 00000000 00001100 00001010 10000011 // 789123
 98 | // 00000001 00000001 00000001 00000001 // 0x0101 mask
 99 | // ----------------------------------- // byte-min(789123, 0x0101)
100 | // 00000000 00000001 00000001 00000001
101 | // ----------------------------------- // pack unsigned saturating 16-bit to 8-bit
102 | // 00000000 00000000 00000001 11111111
103 | // 00000001 00000001 00000001 00000001 // 0x0101 mask
104 | // ----------------------------------- // signed 16-bit min
105 | // 00000000 00000000 00000001 00000001
106 | //
107 | // The signed 16-bit min operation has three important effects.
108 | //
109 | // First, for 3-byte integers, it has the effect of turning off the
110 | // MSB of the lowest byte. This is necessary because a 3-byte integer
111 | // should have a 2-bit control that is `0b10` and without this step
112 | // using the MSB pack operation would result in a 2-bit control that
113 | // looks something like `0b_1`, where the lowest bit is on. Obviously
114 | // this is wrong, since only integers that require 2 or 4 bytes to
115 | // encode should have that lower bit on, i.e. 1 or 3 as a zero-indexed
116 | // length.
117 | //
118 | // Second, for 4-byte integers, the signed aspect has the effect of
119 | // leaving both MSBs of the 2 bytes on. When using the MSB pack
120 | // operation later on, it will result in a 2-bit control value of
121 | // `0b11`, which is what we want.
122 | //
123 | // Third, for 1 and 2 byte integers, it has no effect. This is great
124 | // for 2-byte values since the MSB will remain on and 1 byte values
125 | // will not have any MSB on anyways, so it is effectively a noop in
126 | // both scenarios.
127 | //
128 | // 00000000 00000000 00000000 11111111 // control bits stream (original 1234)
129 | // 01111111 00000000 01111111 00000000 // 0x7F00 mask
130 | // ----------------------------------- // add unsigned saturating 16-bit
131 | // 01111111 00000000 01111111 11111111
132 | //
133 | // Next, we take a mask with the value `0x7F00` and perform an unsigned
134 | // saturating add to the control bits stream. In the case for the integer
135 | // `1234` this has no real effect. We maintain the MSB in the lowest byte.
136 | // You'll note, however, that the only byte that has its MSB on is the last
137 | // one, so performing an MSB pack operation would result in a value of
138 | // `0b0001`, which is what we want. An example of this step on the integer
139 | // `789123` might paint a clearer picture.
140 | //
141 | // 00000000 00000000 00000001 00000001 // control bits stream (789123)
142 | // 01111111 00000000 01111111 00000000 // 0x7F00 mask
143 | // ----------------------------------- // add unsigned saturating 16-bit
144 | // 01111111 00000000 10000000 00000001
145 | //
146 | // You'll note here that the addition of `0x01` with `0x7F` in the upper
147 | // byte results in the MSB of the resulting upper byte turning on. The MSB
148 | // in the lower byte remains off and now an MSB pack operation will resolve
149 | // to `0b0010`, which is what we want. The unsigned saturation behavior is
150 | // really important for 4-byte numbers that only have bits in the most
151 | // significant byte on. An example below:
152 | //
153 | // 01000000 00000000 00000000 00000000 // 1073741824
154 | // 00000001 00000001 00000001 00000001 // 0x0101 mask
155 | // ----------------------------------- // byte-min(1073741824, 0x0101)
156 | // 00000001 00000000 00000000 00000000
157 | // ----------------------------------- // pack unsigned saturating 16-bit to 8-bit
158 | // 00000000 00000000 11111111 00000000
159 | // 00000001 00000001 00000001 00000001 // 0x0101 mask
160 | // ----------------------------------- // signed 16-bit min
161 | // 00000000 00000000 11111111 00000000
162 | // 01111111 00000000 01111111 00000000 // 0x7F00 mask
163 | // ----------------------------------- // add unsigned saturating 16-bit
164 | // 01111111 00000000 11111111 11111111
165 | //
166 | // Note here that because only the upper byte had a value in it, the lowest
167 | // byte in the control bits stream remains zero for the duration of the
168 | // algorithm. This poses an issue, since for a 4-byte value, we want for the
169 | // 2-bit control to result in a value of `0b11`. Performing a 16-bit unsigned
170 | // saturating addition has the effect of turning on all bits in the lower
171 | // byte, and thus we get a result with the MSB in the lower byte on.
172 | //
173 | // 01111111 00000000 10000000 00000001 // control bits stream (789123)
174 | // ----------------------------------- // move byte mask
175 | // 00000000 00000000 00000000 00000010 // 2-bit control
176 | //
177 | // The final move byte mask is performed on the control bits stream, and we
178 | // now have the result we wanted.
179 | //
180 | // We then use the above control bits we generated to get the appropriate
181 | // shuffle masks. Below is an example of how the shuffle operation and a
182 | // mask allows for us to tightly pack two integers into the output buffer.
183 | //
184 | // input [1234, 789123] (little endian R-to-L)
185 | // 00000000 00001100 00001010 10000011 00000000 00000000 00000100 11010010
186 | //            |       |         |                             |        |
187 | //            |       |         |____________________         |        |
188 | //            |       |_____________________         |        |        |
189 | //            |____________________         |        |        |        |
190 | //                                 v        v        v        v        v
191 | //    0xff     0xff     0xff     0x06     0x05     0x04     0x01     0x00 // mask in hex
192 | // -----------------------------------------------------------------------
193 | // 00000000 00000000 00000000 00001100 00001010 10000011 00000100 11010010 // packed
194 | //go:noescape
195 | func Put8uint32FastAsm(
196 | 	in []uint32, outBytes []byte,
197 | 	shuffle *[256][16]uint8, lenTable *[256]uint8,
198 | ) (r uint16)
199 | 
// Put8uint32DeltaFastAsm works similarly to Put8uint32FastAsm except
// that prior to encoding the 8 uint32s, we first use differential
// coding to change the original numbers into deltas using SIMD
// techniques. Afterwards, the encoding algorithm follows the same
// flow as Put8uint32FastAsm. The basic differential coding algorithm
// is as follows:
//
// Prev:            [P P P P]
// Input:           [A B C D]
// Concat-shift:    [P A B C]
// Subtract:        [A-P B-A C-B D-C]
//go:noescape
func Put8uint32DeltaFastAsm(
	in []uint32, outBytes []byte, prev uint32,
	shuffle *[256][16]uint8, lenTable *[256]uint8,
) (r uint16)
216 | 


--------------------------------------------------------------------------------
/pkg/encode/encode_amd64.s:
--------------------------------------------------------------------------------
 1 | // Code generated by command: go run asm.go -out ./encode_amd64.s. DO NOT EDIT.
 2 | 
 3 | #include "textflag.h"
 4 | 
 5 | DATA mask0101<>+0(SB)/2, $0x0101
 6 | GLOBL mask0101<>(SB), RODATA|NOPTR, $2
 7 | 
 8 | DATA mask7F00<>+0(SB)/2, $0x7f00
 9 | GLOBL mask7F00<>(SB), RODATA|NOPTR, $2
10 | 
11 | // func Put8uint32FastAsm(in []uint32, outBytes []byte, shuffle *[256][16]uint8, lenTable *[256]uint8) (r uint16)
12 | // Requires: AVX, AVX2
13 | TEXT ·Put8uint32FastAsm(SB), NOSPLIT, $0-66
14 | 	MOVQ         in_base+0(FP), AX
15 | 	VLDDQU       (AX), X0
16 | 	VLDDQU       16(AX), X1
17 | 	VPBROADCASTW mask0101<>+0(SB), X2
18 | 	VPBROADCASTW mask7F00<>+0(SB), X3
19 | 	VPMINUB      X2, X0, X4
20 | 	VPMINUB      X2, X1, X5
21 | 	VPACKUSWB    X5, X4, X4
22 | 	VPMINSW      X2, X4, X4
23 | 	VPADDUSW     X3, X4, X4
24 | 	VPMOVMSKB    X4, AX
25 | 	MOVW         AX, r+64(FP)
26 | 	MOVQ         shuffle+48(FP), CX
27 | 	MOVBQZX      AL, DX
28 | 	SHLQ         $0x04, DX
29 | 	ADDQ         CX, DX
30 | 	MOVWQZX      AX, BX
31 | 	SHRQ         $0x08, BX
32 | 	SHLQ         $0x04, BX
33 | 	ADDQ         CX, BX
34 | 	VPSHUFB      (DX), X0, X0
35 | 	VPSHUFB      (BX), X1, X1
36 | 	MOVQ         outBytes_base+24(FP), CX
37 | 	MOVQ         CX, DX
38 | 	MOVQ         lenTable+56(FP), BX
39 | 	MOVBQZX      AL, AX
40 | 	ADDQ         BX, AX
41 | 	MOVBQZX      (AX), AX
42 | 	ADDQ         AX, DX
43 | 	VMOVDQU      X0, (CX)
44 | 	VMOVDQU      X1, (DX)
45 | 	RET
46 | 
// func Put8uint32DeltaFastAsm(in []uint32, outBytes []byte, prev uint32, shuffle *[256][16]uint8, lenTable *[256]uint8) (r uint16)
// Requires: AVX, AVX2
//
// Loads 32 bytes from in, replaces every 32-bit lane with its difference from
// the preceding lane (the first lane uses prev), then derives a 16-bit control
// word from the differences, shuffles them with entries of shuffle, and stores
// the two shuffled vectors into outBytes. The control word is returned in r.
TEXT ·Put8uint32DeltaFastAsm(SB), NOSPLIT, $0-74
	// X0 = bytes 0..15 of in, X1 = bytes 16..31 of in.
	MOVQ         in_base+0(FP), AX
	VLDDQU       (AX), X0
	VLDDQU       16(AX), X1
	// X2 = [X0 lane 3, X1 lanes 0..2] (X1:X0 shifted right by 12 bytes);
	// X1 = X1 - X2 per 32-bit lane.
	VPALIGNR     $0x0c, X0, X1, X2
	VPSUBD       X2, X1, X1
	// X2 = prev in every lane, then X2 = [prev, X0 lanes 0..2];
	// X0 = X0 - X2 per 32-bit lane.
	VBROADCASTSS prev+48(FP), X2
	VPALIGNR     $0x0c, X2, X0, X2
	VPSUBD       X2, X0, X0
	// X2 = 0x0101 in every 16-bit lane, X3 = 0x7f00 in every 16-bit lane.
	VPBROADCASTW mask0101<>+0(SB), X2
	VPBROADCASTW mask7F00<>+0(SB), X3
	// Clamp every byte of the differences to at most 1 (unsigned byte minimum).
	VPMINUB      X2, X0, X4
	VPMINUB      X2, X1, X5
	// Pack the 16-bit lanes of X4 and X5 down to bytes with unsigned
	// saturation, take the signed word minimum against 0x0101, then add
	// 0x7f00 with unsigned word saturation.
	VPACKUSWB    X5, X4, X4
	VPMINSW      X2, X4, X4
	VPADDUSW     X3, X4, X4
	// Gather the top bit of each byte of X4 into AX and store the low 16
	// bits as the result r.
	VPMOVMSKB    X4, AX
	MOVW         AX, r+72(FP)
	// DX = shuffle + 16*(low byte of AX); BX = shuffle + 16*(bits 8..15 of AX).
	MOVQ         shuffle+56(FP), CX
	MOVBQZX      AL, DX
	SHLQ         $0x04, DX
	ADDQ         CX, DX
	MOVWQZX      AX, BX
	SHRQ         $0x08, BX
	SHLQ         $0x04, BX
	ADDQ         CX, BX
	// Rearrange the bytes of each difference vector using its selected entry.
	VPSHUFB      (DX), X0, X0
	VPSHUFB      (BX), X1, X1
	// CX = outBytes base; DX = outBytes base + lenTable[low byte of AX].
	MOVQ         outBytes_base+24(FP), CX
	MOVQ         CX, DX
	MOVQ         lenTable+64(FP), BX
	MOVBQZX      AL, AX
	ADDQ         BX, AX
	MOVBQZX      (AX), AX
	ADDQ         AX, DX
	// Each store writes a full 16 bytes: X0 at the start of outBytes and X1
	// at the offset computed above.
	VMOVDQU      X0, (CX)
	VMOVDQU      X1, (DX)
	RET
87 | 


--------------------------------------------------------------------------------
/pkg/encode/encode_base.go:
--------------------------------------------------------------------------------
 1 | // +build !amd64
 2 | 
 3 | package encode
 4 | 
 5 | import (
 6 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
 7 | )
 8 | 
// GetMode reports the performance mode of the encode package. This file is
// only compiled when the build constraint at its top (!amd64) holds, and this
// implementation always returns shared.Normal.
func GetMode() shared.PerformanceMode {
	return shared.Normal
}
12 | 
// Put8uint32Fast is a placeholder in this build: it ignores its arguments and
// unconditionally panics with "unreachable".
func Put8uint32Fast(in []uint32, out []byte) uint16 {
	panic("unreachable")
}
16 | 
// Put8uint32DeltaFast is a placeholder in this build: it ignores its arguments
// and unconditionally panics with "unreachable".
func Put8uint32DeltaFast(in []uint32, out []byte, prev uint32) uint16 {
	panic("unreachable")
}
20 | 


--------------------------------------------------------------------------------
/pkg/encode/encode_test.go:
--------------------------------------------------------------------------------
  1 | package encode
  2 | 
  3 | import (
  4 | 	"encoding/binary"
  5 | 	"math/rand"
  6 | 	"reflect"
  7 | 	"testing"
  8 | 	"time"
  9 | 
 10 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
 11 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/util"
 12 | )
 13 | 
 14 | func init() {
 15 | 	rand.Seed(time.Now().UnixNano())
 16 | }
 17 | 
 18 | func TestPut8uint32Scalar(t *testing.T) {
 19 | 	in := []uint32{1024, 3, 2, 1, 1_073_741_824, 10, 12, 1024}
 20 | 	expectedData := []byte{
 21 | 		0x00, 0x04, 0x03, 0x02, 0x01, 0x00, 0x00, 0x00, 0x40,
 22 | 		0x0a, 0x0c, 0x00, 0x04,
 23 | 	}
 24 | 
 25 | 	expectedCtrl := uint16(0b01_00_00_11_00_00_00_01)
 26 | 	out := make([]byte, 32)
 27 | 	actualCtrl := Put8uint32Scalar(in, out)
 28 | 	if actualCtrl != expectedCtrl {
 29 | 		t.Fatalf("expected: %#016b, got %#016b, %+v", expectedCtrl, actualCtrl, in)
 30 | 	}
 31 | 
 32 | 	actualData := out[:13]
 33 | 	if !reflect.DeepEqual(expectedData, actualData) {
 34 | 		t.Fatalf("expected %+v, got %+v, %+v", expectedData, actualData, in)
 35 | 	}
 36 | }
 37 | 
 38 | func TestPut8uint32DeltaScalar(t *testing.T) {
 39 | 	count := 8
 40 | 	nums := util.GenUint32(count)
 41 | 	util.SortUint32(nums)
 42 | 	diffed := make([]uint32, count)
 43 | 	util.Delta(nums, diffed)
 44 | 
 45 | 	expectedData := make([]byte, count*MaxBytesPerNum)
 46 | 	expectedCtrl := Put8uint32Scalar(diffed, expectedData)
 47 | 	expectedData = expectedData[:shared.ControlByteToSizeTwo(expectedCtrl)]
 48 | 
 49 | 	out := make([]byte, count*MaxBytesPerNum)
 50 | 	actualCtrl := Put8uint32DeltaScalar(nums, out, 0)
 51 | 	if actualCtrl != expectedCtrl {
 52 | 		t.Fatalf("expected: %#016b, got %#016b, %+v", expectedCtrl, actualCtrl, nums)
 53 | 	}
 54 | 
 55 | 	actualData := out[:shared.ControlByteToSizeTwo(actualCtrl)]
 56 | 	if !reflect.DeepEqual(expectedData, actualData) {
 57 | 		t.Fatalf("expected %+v, got %+v, %+v", expectedData, actualData, nums)
 58 | 	}
 59 | }
 60 | 
 61 | func TestPut8uint32Fast(t *testing.T) {
 62 | 	if GetMode() == shared.Normal {
 63 | 		t.Skipf("Testing environment doesn't support this test")
 64 | 	}
 65 | 
 66 | 	count := 8
 67 | 	nums := util.GenUint32(count)
 68 | 
 69 | 	out := make([]byte, MaxBytesPerNum*count)
 70 | 	scalarCtrl := Put8uint32Scalar(nums, out)
 71 | 	out = out[:shared.ControlByteToSizeTwo(scalarCtrl)]
 72 | 
 73 | 	fastOut := make([]byte, MaxBytesPerNum*count)
 74 | 	fastCtrl := Put8uint32Fast(nums, fastOut)
 75 | 	fastOut = fastOut[:shared.ControlByteToSizeTwo(fastCtrl)]
 76 | 
 77 | 	if scalarCtrl != fastCtrl {
 78 | 		t.Fatalf("expected %#04x, actual %#04x, %+v", scalarCtrl, fastCtrl, nums)
 79 | 	}
 80 | 
 81 | 	if !reflect.DeepEqual(out, fastOut) {
 82 | 		t.Fatalf("expected %+v, got %+v, %+v", out, fastOut, nums)
 83 | 	}
 84 | }
 85 | 
 86 | func TestPut8uint32DeltaFast(t *testing.T) {
 87 | 	if GetMode() == shared.Normal {
 88 | 		t.Skipf("Testing environment doesn't support this test")
 89 | 	}
 90 | 
 91 | 	count := 8
 92 | 	nums := util.GenUint32(count)
 93 | 	util.SortUint32(nums)
 94 | 
 95 | 	expectedData := make([]byte, MaxBytesPerNum*count)
 96 | 	scalarCtrl := Put8uint32DeltaScalar(nums, expectedData, 0)
 97 | 	expectedData = expectedData[:shared.ControlByteToSizeTwo(scalarCtrl)]
 98 | 
 99 | 	fastOut := make([]byte, MaxBytesPerNum*count)
100 | 	fastCtrl := Put8uint32DeltaFast(nums, fastOut, 0)
101 | 	fastOut = fastOut[:shared.ControlByteToSizeTwo(fastCtrl)]
102 | 
103 | 	if scalarCtrl != fastCtrl {
104 | 		t.Fatalf("expected %#04x, actual %#04x, %+v", scalarCtrl, fastCtrl, nums)
105 | 	}
106 | 
107 | 	if !reflect.DeepEqual(expectedData, fastOut) {
108 | 		t.Fatalf("expected %+v, got %+v, %+v", expectedData, fastOut, nums)
109 | 	}
110 | }
111 | 
112 | func TestPutUint32Scalar(t *testing.T) {
113 | 	count := rand.Intn(4) + 1
114 | 	nums := util.GenUint32(count)
115 | 	for i := 4 - count; i > 0; i-- {
116 | 		nums = append(nums, 0)
117 | 	}
118 | 
119 | 	expected := make([]byte, 4*MaxBytesPerNum)
120 | 	ctrl := Put4uint32Scalar(nums, expected)
121 | 	size := shared.ControlByteToSize(ctrl)
122 | 	size -= 4 - count
123 | 	expected = expected[:size]
124 | 
125 | 	out := make([]byte, count*MaxBytesPerNum)
126 | 	ctrl = PutUint32Scalar(nums[:count], out, count)
127 | 	size = shared.ControlByteToSize(ctrl)
128 | 	size -= 4 - count
129 | 	out = out[:size]
130 | 	if !reflect.DeepEqual(expected, out) {
131 | 		t.Fatalf("expected %+v, got %+v", expected, out)
132 | 	}
133 | }
134 | 
135 | func TestPutUint32DeltaScalar(t *testing.T) {
136 | 	count := rand.Intn(4) + 1
137 | 	nums := util.GenUint32(count)
138 | 	util.SortUint32(nums)
139 | 	for i := 4 - count; i > 0; i-- {
140 | 		nums = append(nums, nums[count-1])
141 | 	}
142 | 
143 | 	deltas := make([]uint32, 4)
144 | 	util.Delta(nums, deltas)
145 | 
146 | 	expected := make([]byte, 4*MaxBytesPerNum)
147 | 	ctrl := Put4uint32Scalar(deltas, expected)
148 | 	size := shared.ControlByteToSize(ctrl)
149 | 	size -= 4 - count
150 | 	expected = expected[:size]
151 | 
152 | 	out := make([]byte, count*MaxBytesPerNum)
153 | 	ctrl = PutUint32DeltaScalar(nums[:count], out, count, 0)
154 | 	size = shared.ControlByteToSize(ctrl)
155 | 	size -= 4 - count
156 | 	out = out[:size]
157 | 	if !reflect.DeepEqual(expected, out) {
158 | 		t.Fatalf("expected %+v, got %+v", expected, out)
159 | 	}
160 | }
161 | 
162 | var writeSinkA uint16
163 | 
164 | func BenchmarkPut8uint32Fast(b *testing.B) {
165 | 	if GetMode() == shared.Normal {
166 | 		b.Skipf("Testing environment doesn't support this test")
167 | 	}
168 | 
169 | 	count := 8
170 | 	out := make([]byte, count*MaxBytesPerNum)
171 | 	nums := util.GenUint32(count)
172 | 
173 | 	var ctrl uint16
174 | 	b.SetBytes(int64(count * MaxBytesPerNum))
175 | 	b.ResetTimer()
176 | 	for i := 0; i < b.N; i++ {
177 | 		ctrl = Put8uint32Fast(nums, out)
178 | 	}
179 | 	writeSinkA = ctrl
180 | }
181 | 
182 | var writeSinkB uint16
183 | 
184 | func BenchmarkPut8uint32DeltaFast(b *testing.B) {
185 | 	if GetMode() == shared.Normal {
186 | 		b.Skipf("Testing environment doesn't support this test")
187 | 	}
188 | 
189 | 	count := 8
190 | 	out := make([]byte, count*MaxBytesPerNum)
191 | 	nums := util.GenUint32(count)
192 | 	util.SortUint32(nums)
193 | 
194 | 	var ctrl uint16
195 | 	b.SetBytes(int64(count * MaxBytesPerNum))
196 | 	b.ResetTimer()
197 | 	for i := 0; i < b.N; i++ {
198 | 		ctrl = Put8uint32DeltaFast(nums, out, 0)
199 | 	}
200 | 	writeSinkB = ctrl
201 | }
202 | 
203 | var writeSinkC uint16
204 | 
205 | func BenchmarkPut8uint32Scalar(b *testing.B) {
206 | 	count := 8
207 | 	out := make([]byte, count*MaxBytesPerNum)
208 | 	nums := util.GenUint32(count)
209 | 
210 | 	var ctrl uint16
211 | 	b.SetBytes(int64(count * MaxBytesPerNum))
212 | 	b.ResetTimer()
213 | 	for i := 0; i < b.N; i++ {
214 | 		ctrl = Put8uint32Scalar(nums, out)
215 | 	}
216 | 	writeSinkC = ctrl
217 | }
218 | 
219 | var writeSinkD uint16
220 | 
221 | func BenchmarkPut8uint32DeltaScalar(b *testing.B) {
222 | 	count := 8
223 | 	out := make([]byte, count*MaxBytesPerNum)
224 | 	nums := util.GenUint32(count)
225 | 	util.SortUint32(nums)
226 | 
227 | 	var ctrl uint16
228 | 	b.SetBytes(int64(count * MaxBytesPerNum))
229 | 	b.ResetTimer()
230 | 	for i := 0; i < b.N; i++ {
231 | 		ctrl = Put8uint32DeltaScalar(nums, out, 0)
232 | 	}
233 | 	writeSinkD = ctrl
234 | }
235 | 
236 | var writeSinkE int
237 | 
238 | func BenchmarkPut8uint32Varint(b *testing.B) {
239 | 	count := 8
240 | 	out := make([]byte, count*binary.MaxVarintLen32)
241 | 	nums := util.GenUint32(count)
242 | 	written := 0
243 | 
244 | 	b.SetBytes(int64(count * MaxBytesPerNum))
245 | 	b.ResetTimer()
246 | 	for i := 0; i < b.N; i++ {
247 | 		written = util.PutVarint(nums, out)
248 | 	}
249 | 	writeSinkE = written
250 | }
251 | 
252 | var writeSinkF int
253 | 
254 | func BenchmarkPut8uint32DeltaVarint(b *testing.B) {
255 | 	count := 8
256 | 	out := make([]byte, count*binary.MaxVarintLen32)
257 | 	nums := util.GenUint32(count)
258 | 	util.SortUint32(nums)
259 | 	written := 0
260 | 
261 | 	b.SetBytes(int64(count * MaxBytesPerNum))
262 | 	b.ResetTimer()
263 | 	for i := 0; i < b.N; i++ {
264 | 		written = util.PutDeltaVarint(nums, out, 0)
265 | 	}
266 | 	writeSinkF = written
267 | }
268 | 


--------------------------------------------------------------------------------
/pkg/encode/gen.go:
--------------------------------------------------------------------------------
1 | package encode
2 | 
3 | //go:generate go run ./main/asm.go -out ./encode_amd64.s
4 | 


--------------------------------------------------------------------------------
/pkg/encode/main/asm.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"log"
  6 | 
  7 | 	. "github.com/mmcloughlin/avo/build"
  8 | 	"github.com/mmcloughlin/avo/operand"
  9 | 	"github.com/mmcloughlin/avo/reg"
 10 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
 11 | )
 12 | 
// Names used when emitting the assembly: name and nameDelta are the two
// generated function symbols; the p-prefixed constants are the parameter and
// result names spliced into the signatures below and looked up again via
// Param/Return when emitting instructions.
const (
	name      = "Put8uint32FastAsm"
	nameDelta = "Put8uint32DeltaFastAsm"
	pIn       = "in"
	pOut      = "outBytes"
	pShuffle  = "shuffle"
	pLenTable = "lenTable"
	pPrev     = "prev"
	pR        = "r"
)

var (
	// signature is the Go signature of the non-delta function.
	signature = fmt.Sprintf(
		"func(%s []uint32, %s []byte, %s *[256][16]uint8, %s *[256]uint8) (%s uint16)",
		pIn, pOut, pShuffle, pLenTable, pR)

	// signatureDelta is the Go signature of the delta function; it differs
	// from signature only by the extra uint32 parameter named by pPrev.
	signatureDelta = fmt.Sprintf(
		"func(%s []uint32, %s []byte, %s uint32, %s *[256][16]uint8, %s *[256]uint8) (%s uint16)",
		pIn, pOut, pPrev, pShuffle, pLenTable, pR)

	// 16-bit constants placed in the generated file's data section.
	// NOTE(review): the identifier mask1111R does not match its symbol name
	// and value ("mask0101", 0x0101).
	mask1111R = ConstData("mask0101", operand.U16(0x0101))
	mask7F00R = ConstData("mask7F00", operand.U16(0x7F00))
)
 36 | 
// main emits the non-delta function, then the delta function, and finally
// calls avo's Generate (presumably writing the assembly to the path passed
// with -out in the go:generate directive; confirm against avo's docs).
func main() {
	regular()
	differential()
	Generate()
}
 42 | 
 43 | func differential() {
 44 | 	TEXT(nameDelta, NOSPLIT, signatureDelta)
 45 | 
 46 | 	prevSingular, err := Param(pPrev).Resolve()
 47 | 	if err != nil {
 48 | 		log.Fatalf("failed to get addr of prev")
 49 | 	}
 50 | 
 51 | 	firstFour, secondFour := shared.Load8(pIn)
 52 | 	prev := XMM()
 53 | 	VPALIGNR(operand.Imm(12), firstFour, secondFour, prev)
 54 | 	VPSUBD(prev, secondFour, secondFour)
 55 | 
 56 | 	VBROADCASTSS(prevSingular.Addr, prev)
 57 | 	VPALIGNR(operand.Imm(12), prev, firstFour, prev)
 58 | 	VPSUBD(prev, firstFour, firstFour)
 59 | 
 60 | 	coreAlgorithm(firstFour, secondFour)
 61 | }
 62 | 
 63 | func regular() {
 64 | 	TEXT(name, NOSPLIT, signature)
 65 | 	coreAlgorithm(shared.Load8(pIn))
 66 | }
 67 | 
// coreAlgorithm emits the instructions shared by both generated functions.
// Given two vector registers of four 32-bit values each, the emitted code
// derives a 16-bit control word, stores it in the result pR, shuffles each
// vector with a table entry selected by one byte of that word, and stores
// both vectors to the output slice. It ends the function with RET.
func coreAlgorithm(firstFour, secondFour reg.VecVirtual) {
	// Broadcast the two 16-bit constants to every word lane.
	onesMask := XMM()
	sevenFzerozero := XMM()
	VPBROADCASTW(mask1111R, onesMask)
	VPBROADCASTW(mask7F00R, sevenFzerozero)

	// Clamp every input byte to at most 1.
	minFirstFour := XMM()
	minSecondFour := XMM()
	VPMINUB(onesMask, firstFour, minFirstFour)
	VPMINUB(onesMask, secondFour, minSecondFour)

	// Re-use minFirstFour register
	// Pack words to bytes (unsigned saturation), take the signed word
	// minimum against 0x0101, then add 0x7f00 with unsigned saturation.
	VPACKUSWB(minSecondFour, minFirstFour, minFirstFour)
	VPMINSW(onesMask, minFirstFour, minFirstFour)
	VPADDUSW(sevenFzerozero, minFirstFour, minFirstFour)

	// The top bit of each byte becomes one bit of the control word; its low
	// 16 bits are the function result.
	ctrl := GP32()
	VPMOVMSKB(minFirstFour, ctrl)
	Store(ctrl.As16(), Return(pR))

	// Select one shuffle-table entry per control byte (low byte for the
	// first vector, upper byte for the second) and shuffle in place.
	shuffleBase := Load(Param(pShuffle), GP64())
	firstShuffle := shared.CalculateShuffleAddrFromCtrl(shuffleBase, ctrl, false)
	secondShuffle := shared.CalculateShuffleAddrFromCtrl(shuffleBase, ctrl, true)

	VPSHUFB(firstShuffle, firstFour, firstFour)
	VPSHUFB(secondShuffle, secondFour, secondFour)

	// The second vector is stored at output base + lenTable[low control byte].
	firstAddr := Load(Param(pOut).Base(), GP64())
	secondAddr := GP64()
	MOVQ(firstAddr, secondAddr)

	lenAddr, lenValue := shared.LenValueAddr(ctrl, false, pLenTable)

	// lenValue is the register that also backs lenAddr, so this load
	// replaces the table address with the byte read from it.
	MOVBQZX(lenAddr, lenValue)
	ADDQ(lenValue, secondAddr)

	// Both stores write a full 16-byte vector.
	VMOVDQU(firstFour, operand.Mem{Base: firstAddr})
	VMOVDQU(secondFour, operand.Mem{Base: secondAddr})

	RET()
}
109 | 


--------------------------------------------------------------------------------
/pkg/pkg_test.go:
--------------------------------------------------------------------------------
 1 | package pkg
 2 | 
 3 | import (
 4 | 	"math/rand"
 5 | 	"reflect"
 6 | 	"testing"
 7 | 	"time"
 8 | 
 9 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/decode"
10 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/encode"
11 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/util"
12 | )
13 | 
// init seeds math/rand's default source with the current time in nanoseconds.
func init() {
	seed := time.Now().UnixNano()
	rand.Seed(seed)
}
17 | 
18 | func TestRoundTripScalar(t *testing.T) {
19 | 	in := []uint32{1024, 3, 2, 1, 1_073_741_824, 10, 12, 1024}
20 | 	expectedData := []byte{
21 | 		0x00, 0x04, 0x03, 0x02, 0x01, 0x00, 0x00, 0x00, 0x40,
22 | 		0x0a, 0x0c, 0x00, 0x04,
23 | 	}
24 | 
25 | 	expectedCtrl := uint16(0b01_00_00_11_00_00_00_01)
26 | 	out := make([]byte, 32)
27 | 	actualCtrl := encode.Put8uint32Scalar(in, out)
28 | 	if actualCtrl != expectedCtrl {
29 | 		t.Fatalf("expected: %#016b, got %#016b", expectedCtrl, actualCtrl)
30 | 	}
31 | 
32 | 	actualData := out[:13]
33 | 	if !reflect.DeepEqual(expectedData, actualData) {
34 | 		t.Fatalf("expected %+v, got %+v", expectedData, actualData)
35 | 	}
36 | 
37 | 	decoded := make([]uint32, 8)
38 | 	decode.Get8uint32Scalar(actualData, decoded, actualCtrl)
39 | 
40 | 	if !reflect.DeepEqual(in, decoded) {
41 | 		t.Fatalf("expected %+v, actual %+v", in, decoded)
42 | 	}
43 | }
44 | 
45 | func BenchmarkMemCopy8Uint32(b *testing.B) {
46 | 	count := 8
47 | 	nums := make([]uint32, count)
48 | 	for i := 0; i < count; i++ {
49 | 		nums[i] = util.RandUint32()
50 | 	}
51 | 
52 | 	out := make([]uint32, count)
53 | 	b.SetBytes(int64(count * encode.MaxBytesPerNum))
54 | 	b.ResetTimer()
55 | 	for i := 0; i < b.N; i++ {
56 | 		copy(out, nums)
57 | 	}
58 | }
59 | 


--------------------------------------------------------------------------------
/pkg/shared/asm.go:
--------------------------------------------------------------------------------
 1 | package shared
 2 | 
 3 | import (
 4 | 	. "github.com/mmcloughlin/avo/build"
 5 | 	"github.com/mmcloughlin/avo/operand"
 6 | 	"github.com/mmcloughlin/avo/reg"
 7 | )
 8 | 
// CalculateShuffleAddrFromCtrl emits instructions that compute the address
// shuffleBase + 16*b in a fresh 64-bit register, where b is the low byte of
// ctrl when upper is false, or bits 8..15 of ctrl when upper is true. It
// returns a memory operand whose base is that register.
func CalculateShuffleAddrFromCtrl(shuffleBase reg.Register, ctrl reg.GPVirtual, upper bool) operand.Mem {
	addr := GP64()
	if upper {
		// Zero-extend the low 16 bits, then drop the low byte.
		MOVWQZX(ctrl.As16(), addr)
		SHRQ(operand.Imm(8), addr)
	} else {
		MOVBQZX(ctrl.As8(), addr)
	}

	// Left shift by 4 to get the byte level offset for the shuffle table
	SHLQ(operand.Imm(4), addr)
	ADDQ(shuffleBase, addr)

	return operand.Mem{Base: addr}
}
24 | 
// LenValueAddr emits instructions that load the parameter named
// lenTableParam and add to it the low byte of ctrl (upper false) or bits
// 8..15 of ctrl (upper true), leaving the sum in a fresh 64-bit register.
//
// It returns a memory operand based at that register together with the
// register itself, so a caller can load through the operand into the same
// register.
func LenValueAddr(ctrl reg.GPVirtual, upper bool, lenTableParam string) (operand.Mem, reg.GPVirtual) {
	lenTableBase := Load(Param(lenTableParam), GP64())
	lenValueAddr := GP64()
	if upper {
		// Zero-extend the low 16 bits, then drop the low byte.
		MOVWQZX(ctrl.As16(), lenValueAddr)
		SHRQ(operand.Imm(8), lenValueAddr)
	} else {
		MOVBQZX(ctrl.As8L(), lenValueAddr)
	}
	// Entries are indexed by the raw byte value, so no scaling is applied.
	ADDQ(lenTableBase, lenValueAddr)

	return operand.Mem{Base: lenValueAddr}, lenValueAddr
}
38 | 
// Load8 emits instructions that load the base pointer of the slice parameter
// named paramName and read two 16-byte vectors from it, at offsets 0 and 16.
// It returns the two virtual XMM registers holding those vectors, in that
// order.
func Load8(paramName string) (reg.VecVirtual, reg.VecVirtual) {
	arrBase := operand.Mem{
		Base: Load(Param(paramName).Base(), GP64()),
	}
	firstFour := XMM()
	secondFour := XMM()
	VLDDQU(arrBase, firstFour)
	VLDDQU(arrBase.Offset(16), secondFour)

	return firstFour, secondFour
}
50 | 


--------------------------------------------------------------------------------
/pkg/shared/gen.go:
--------------------------------------------------------------------------------
 1 | package shared
 2 | 
 3 | //go:generate go run ./main/gentables.go -out ./tables.go -package shared
 4 | 
 5 | func ControlByteToSize(in uint8) int {
 6 | 	return int(PerControlLenTable[in])
 7 | }
 8 | 
 9 | func ControlByteToSizeTwo(in uint16) int {
10 | 	return int(PerControlLenTable[in&0xff] + PerControlLenTable[in>>8])
11 | }
12 | 


--------------------------------------------------------------------------------
/pkg/shared/main/gentables.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"flag"
  6 | 	"fmt"
  7 | 	"go/format"
  8 | 	"io"
  9 | 	"log"
 10 | 	"os"
 11 | 
 12 | 	"github.com/pkg/errors"
 13 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/util"
 14 | )
 15 | 
// Command-line flags: the output file path (required by main) and the package
// name written into the generated file's package clause.
var (
	fOut     = flag.String("out", "", "path to output")
	fPackage = flag.String("package", "shared", "package name")
)

// MaxControlByte is the number of distinct control byte values (256); every
// generated table has one entry per value.
const MaxControlByte = 1 << 8
 22 | 
 23 | func main() {
 24 | 	flag.Parse()
 25 | 
 26 | 	if *fOut == "" {
 27 | 		log.Fatalf("outfile cannot be empty")
 28 | 	}
 29 | 
 30 | 	out := &bytes.Buffer{}
 31 | 
 32 | 	_, _ = fmt.Fprintln(out, "// Code generated by gentables. DO NOT EDIT.")
 33 | 	_, _ = fmt.Fprintf(out, "\npackage %s\n", *fPackage)
 34 | 
 35 | 	if err := genPerNumLengthTable(out); err != nil {
 36 | 		log.Fatalf("failed to gen per num length table")
 37 | 	}
 38 | 
 39 | 	if err := genPerQuadLengthTable(out); err != nil {
 40 | 		log.Fatalf("failed to gen sum length table")
 41 | 	}
 42 | 
 43 | 	if err := genEncodeShuffleTable(out); err != nil {
 44 | 		log.Fatalf("failed to gen encode shuffle table")
 45 | 	}
 46 | 
 47 | 	if err := genDecodeShuffleTable(out); err != nil {
 48 | 		log.Fatalf("failed to gen decode shuffle table")
 49 | 	}
 50 | 
 51 | 	final, err := format.Source(out.Bytes())
 52 | 	if err != nil {
 53 | 		log.Fatalf("failed to go fmt output")
 54 | 	}
 55 | 
 56 | 	fileOut, err := os.Create(*fOut)
 57 | 	if err != nil {
 58 | 		log.Fatalf("failed to open: %s %s", *fOut, err)
 59 | 	}
 60 | 	defer util.SilentClose(fileOut)
 61 | 
 62 | 	_, err = fileOut.Write(final)
 63 | 	if err != nil {
 64 | 		log.Fatalf("failed to write generated tables to file")
 65 | 	}
 66 | }
 67 | 
 68 | func newLineAfter(countPerLine int) func(out io.Writer) {
 69 | 	count := 1
 70 | 	return func(out io.Writer) {
 71 | 		if count%countPerLine == 0 {
 72 | 			_, _ = fmt.Fprintln(out, "")
 73 | 		} else {
 74 | 			_, _ = fmt.Fprintf(out, " ")
 75 | 		}
 76 | 		count++
 77 | 	}
 78 | }
 79 | 
 80 | func genPerNumLengthTable(out io.Writer) error {
 81 | 	_, _ = fmt.Fprintf(out, "\nvar PerNumLenTable *[256][4]uint8 = &[256][4]uint8{\n")
 82 | 	tabber := newLineAfter(4)
 83 | 	for i := 0; i < MaxControlByte; i++ {
 84 | 		one, two, three, four := sizes(uint8(i))
 85 | 		_, err := fmt.Fprintf(out, "\t{%d, %d, %d, %d},", one, two, three, four)
 86 | 		if err != nil {
 87 | 			return errors.Wrapf(err, "failed to write per num len: %d", i)
 88 | 		}
 89 | 		tabber(out)
 90 | 	}
 91 | 	_, _ = fmt.Fprintln(out, "}")
 92 | 	return nil
 93 | }
 94 | 
 95 | func genPerQuadLengthTable(out io.Writer) error {
 96 | 	_, _ = fmt.Fprintf(out, "\nvar PerControlLenTable *[256]uint8 = &[256]uint8{\n")
 97 | 	tabber := newLineAfter(8)
 98 | 	for i := 0; i < MaxControlByte; i++ {
 99 | 		one, two, three, four := sizes(uint8(i))
100 | 		_, err := fmt.Fprintf(out, "\t%d,", one+two+three+four)
101 | 		if err != nil {
102 | 			return errors.Wrapf(err, "failed to write summed len: %d", i)
103 | 		}
104 | 		tabber(out)
105 | 	}
106 | 	_, _ = fmt.Fprintln(out, "}")
107 | 	return nil
108 | }
109 | 
// Format strings shared by the two shuffle-table generators.
//
// shuffleFmtStr renders the 16 byte values of one table entry followed by the
// closing brace and comma; the opening brace is written separately by the
// generators. commentStr renders the comment line placed above each entry:
// the control byte in decimal, hex and binary, then the four byte lengths.
const (
	shuffleFmtStr = "%#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x, %#02x},"
	commentStr    = "\t// %d\t%#02x\t%08b\tlen\t%d\t%d\t%d\t%d\n"
)
114 | 
115 | func genEncodeShuffleTable(out io.Writer) error {
116 | 	_, _ = fmt.Fprintf(out, "\nvar EncodeShuffleTable *[256][16]uint8 = &[256][16]uint8{\n")
117 | 	tabber := newLineAfter(1)
118 | 	for i := 0; i < MaxControlByte; i++ {
119 | 		one, two, three, four := sizes(uint8(i))
120 | 		_, _ = fmt.Fprintf(out, commentStr, i, i, i, one, two, three, four)
121 | 		_, err := fmt.Fprintf(out, "\t{")
122 | 		if err != nil {
123 | 			return errors.Wrapf(err, "failed to write encode shuffle table")
124 | 		}
125 | 
126 | 		var positions []interface{}
127 | 		var base uint8
128 | 		for _, size := range []uint8{one, two, three, four} {
129 | 			for j := uint8(0); j < size; j++ {
130 | 				positions = append(positions, base+j)
131 | 			}
132 | 			base += 4
133 | 		}
134 | 
135 | 		for len(positions) < 16 {
136 | 			positions = append(positions, 0xff)
137 | 		}
138 | 		_, err = fmt.Fprintf(out, shuffleFmtStr, positions...)
139 | 		if err != nil {
140 | 			return errors.Wrapf(err, "failed to write per num len: %d", i)
141 | 		}
142 | 		tabber(out)
143 | 	}
144 | 	_, _ = fmt.Fprintln(out, "}")
145 | 	return nil
146 | }
147 | 
148 | func genDecodeShuffleTable(out io.Writer) error {
149 | 	_, _ = fmt.Fprintf(out, "\nvar DecodeShuffleTable *[256][16]uint8 = &[256][16]uint8{\n")
150 | 	tabber := newLineAfter(1)
151 | 	for i := 0; i < MaxControlByte; i++ {
152 | 		one, two, three, four := sizes(uint8(i))
153 | 		_, _ = fmt.Fprintf(out, commentStr, i, i, i, one, two, three, four)
154 | 		_, err := fmt.Fprintf(out, "\t{")
155 | 		if err != nil {
156 | 			return errors.Wrapf(err, "failed to write encode shuffle table")
157 | 		}
158 | 
159 | 		var positions []interface{}
160 | 		var pos uint8
161 | 		for _, size := range []uint8{one, two, three, four} {
162 | 			for j := 0; j < 4; j++ {
163 | 				if size > 0 {
164 | 					positions = append(positions, pos)
165 | 					pos++
166 | 					size--
167 | 				} else {
168 | 					positions = append(positions, 0xff)
169 | 				}
170 | 			}
171 | 		}
172 | 
173 | 		_, err = fmt.Fprintf(out, shuffleFmtStr, positions...)
174 | 		if err != nil {
175 | 			return errors.Wrapf(err, "failed to write per num len: %d", i)
176 | 		}
177 | 		tabber(out)
178 | 	}
179 | 	_, _ = fmt.Fprintln(out, "}")
180 | 	return nil
181 | }
182 | 
// sizes decodes a control byte into the byte lengths of its four numbers.
// Each number owns two bits of control, lowest bits first, and its length is
// that two-bit value plus one (so 1 through 4).
func sizes(control uint8) (one uint8, two uint8, three uint8, four uint8) {
	// length extracts the two-bit field starting at bit `shift` and turns it
	// into a byte length.
	length := func(shift uint) uint8 {
		return (control>>shift)&3 + 1
	}
	return length(0), length(2), length(4), length(6)
}
192 | 


--------------------------------------------------------------------------------
/pkg/shared/mode.go:
--------------------------------------------------------------------------------
 1 | package shared
 2 | 
// PerformanceMode indicates which mode the code is operating under. If Normal,
// then the code is NOT using special hardware instructions and is instead
// relying on portable Go code. If Fast, then the code IS using special
// hardware instructions that are platform dependent. Each package exports a
// func that can be used to debug or inspect the configuration at runtime.
type PerformanceMode int

const (
	// Normal is the portable mode; it is also the zero value of
	// PerformanceMode.
	Normal PerformanceMode = iota
	// Fast is the mode that uses platform-dependent hardware instructions.
	Fast
)

// CheckMode is the type of a function that takes no arguments and reports a
// PerformanceMode.
type CheckMode func() PerformanceMode
16 | 


--------------------------------------------------------------------------------
/pkg/stream/reader/reader.go:
--------------------------------------------------------------------------------
  1 | package reader
  2 | 
  3 | import (
  4 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/decode"
  5 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
  6 | )
  7 | 
const (
	// jump is the number of integers decoded per iteration of the unrolled
	// loops in the scalar readers.
	jump     = 16
	// jumpCtrl is the number of control bytes consumed per such iteration
	// (one control byte per four integers).
	jumpCtrl = jump / 4
)
 12 | 
 13 | // ReadAll will read the entire input stream into out according to the
 14 | // Stream VByte format. It will select the best implementation depending
 15 | // on the presence of special hardware instructions.
 16 | //
 17 | // Note: It is your responsibility to ensure that the incoming slices are
 18 | // appropriately sized as well as tracking the count of integers in the
 19 | // stream.
 20 | func ReadAll(count int, stream []byte, out []uint32) {
 21 | 	if decode.GetMode() == shared.Fast {
 22 | 		ReadAllFast(count, stream, out)
 23 | 	} else {
 24 | 		ReadAllScalar(count, stream, out)
 25 | 	}
 26 | }
 27 | 
 28 | // ReadAllDelta will read the entire input stream into out according to the
 29 | // Stream VByte format. It will select the best implementation depending
 30 | // on the presence of special hardware instructions. It will reconstruct the
 31 | // original non differentially encoded values.
 32 | //
 33 | // Note: It is your responsibility to ensure that the incoming slices are
 34 | // appropriately sized as well as tracking the count of integers in the
 35 | // stream.
 36 | func ReadAllDelta(count int, stream []byte, out []uint32, prev uint32) {
 37 | 	if decode.GetMode() == shared.Fast {
 38 | 		ReadAllDeltaFast(count, stream, out, prev)
 39 | 	} else {
 40 | 		ReadAllDeltaScalar(count, stream, out, prev)
 41 | 	}
 42 | }
 43 | 
// ReadAllScalar will read the entire input stream into out according to the
// Stream VByte format.
//
// The stream is read as (count+3)/4 control bytes followed by the encoded
// data. Decoding proceeds in three phases: groups of 16 integers, then groups
// of 4, then a final partial group of fewer than 4 integers if count is not a
// multiple of 4.
//
// Note: It is your responsibility to ensure that the incoming slices are
// appropriately sized as well as tracking the count of integers in the
// stream.
func ReadAllScalar(count int, stream []byte, out []uint32) {
	var (
		// One control byte per four integers, rounded up.
		ctrlLen = (count + 3) / 4

		// Data is read starting right after the control bytes.
		dataPos    = ctrlLen
		ctrlPos    = 0
		decoded    = 0
		// count rounded down to a multiple of jump and of 4 respectively.
		lowestJump = count &^ (jump - 1)
		lowest4    = count &^ 3
	)

	// Unrolled phase: four control bytes (16 integers) per iteration.
	for ; decoded < lowestJump; decoded += jump {
		data := stream[dataPos:]
		ctrls := stream[ctrlPos : ctrlPos+jumpCtrl]
		nums := out[decoded : decoded+jump]

		// Each group's data starts where the previous group's data ended,
		// so the sizes accumulate into the next offset.
		ctrl := ctrls[0]
		decode.Get4uint32Scalar(data, nums, ctrl)
		sizeA := shared.ControlByteToSize(ctrl)

		ctrl = ctrls[1]
		decode.Get4uint32Scalar(data[sizeA:], nums[4:], ctrl)
		sizeB := shared.ControlByteToSize(ctrl)

		ctrl = ctrls[2]
		decode.Get4uint32Scalar(data[sizeA+sizeB:], nums[8:], ctrl)
		sizeC := shared.ControlByteToSize(ctrl)

		ctrl = ctrls[3]
		decode.Get4uint32Scalar(data[sizeA+sizeB+sizeC:], nums[12:], ctrl)
		sizeD := shared.ControlByteToSize(ctrl)

		dataPos += sizeA + sizeB + sizeC + sizeD
		ctrlPos += jumpCtrl
	}

	// Remaining full groups of four, one control byte at a time.
	for ; decoded < lowest4; decoded += 4 {
		ctrl := stream[ctrlPos]
		decode.Get4uint32Scalar(stream[dataPos:], out[decoded:], ctrl)
		size := shared.ControlByteToSize(ctrl)
		dataPos += size
		ctrlPos++
	}

	// Trailing partial group: decode only the count-lowest4 integers left.
	if lowest4 != count {
		decode.GetUint32Scalar(stream[dataPos:], out[decoded:], stream[ctrlPos], count-lowest4)
	}
}
 98 | 
// ReadAllDeltaScalar will read the entire input stream into out according to the
// Stream VByte format. It will reconstruct the original non differentially
// encoded values.
//
// The stream is read as (count+3)/4 control bytes followed by the encoded
// data. prev is passed as the previous value to the first group decoded;
// every later group receives the last integer decoded before it.
//
// Note: It is your responsibility to ensure that the incoming slices are
// appropriately sized as well as tracking the count of integers in the
// stream.
func ReadAllDeltaScalar(count int, stream []byte, out []uint32, prev uint32) {
	var (
		// One control byte per four integers, rounded up.
		ctrlLen = (count + 3) / 4

		// Data is read starting right after the control bytes.
		dataPos    = ctrlLen
		ctrlPos    = 0
		decoded    = 0
		// count rounded down to a multiple of jump and of 4 respectively.
		lowestJump = count &^ (jump - 1)
		lowest4    = count &^ 3
	)

	// Unrolled phase: four control bytes (16 integers) per iteration. The
	// order matters: each call reads the last value written by the one
	// before it (nums[3], nums[7], nums[11]).
	for ; decoded < lowestJump; decoded += jump {
		data := stream[dataPos:]
		ctrls := stream[ctrlPos : ctrlPos+jumpCtrl]
		nums := out[decoded : decoded+jump]

		ctrl := ctrls[0]
		decode.Get4uint32DeltaScalar(data, nums, ctrl, prev)
		sizeA := shared.ControlByteToSize(ctrl)

		ctrl = ctrls[1]
		decode.Get4uint32DeltaScalar(data[sizeA:], nums[4:], ctrl, nums[3])
		sizeB := shared.ControlByteToSize(ctrl)

		ctrl = ctrls[2]
		decode.Get4uint32DeltaScalar(data[sizeA+sizeB:], nums[8:], ctrl, nums[7])
		sizeC := shared.ControlByteToSize(ctrl)

		ctrl = ctrls[3]
		decode.Get4uint32DeltaScalar(data[sizeA+sizeB+sizeC:], nums[12:], ctrl, nums[11])
		sizeD := shared.ControlByteToSize(ctrl)

		dataPos += sizeA + sizeB + sizeC + sizeD
		ctrlPos += jumpCtrl
		// Carry the last decoded value into the next iteration.
		prev = nums[15]
	}

	// Remaining full groups of four, one control byte at a time.
	for ; decoded < lowest4; decoded += 4 {
		ctrl := stream[ctrlPos]
		decode.Get4uint32DeltaScalar(stream[dataPos:], out[decoded:], ctrl, prev)
		size := shared.ControlByteToSize(ctrl)
		dataPos += size
		ctrlPos++
		prev = out[decoded+3]
	}

	// Trailing partial group: decode only the count-lowest4 integers left.
	if lowest4 != count {
		decode.GetUint32DeltaScalar(stream[dataPos:], out[decoded:], stream[ctrlPos], count-lowest4, prev)
	}
}
156 | 


--------------------------------------------------------------------------------
/pkg/stream/reader/reader_amd64.go:
--------------------------------------------------------------------------------
  1 | // +build amd64
  2 | 
  3 | package reader
  4 | 
  5 | import (
  6 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/decode"
  7 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
  8 | )
  9 | 
 10 | // ReadAllFast decodes count integers from stream into out following the
 11 | // Stream VByte format, using special hardware instructions for the bulk of
 12 | // the input and the scalar decoder for the last few groups.
 13 | //
 14 | // Note: the caller is responsible for sizing stream and out appropriately
 15 | // and for knowing how many integers the stream holds.
 16 | func ReadAllFast(count int, stream []byte, out []uint32) {
 17 | 	var (
 18 | 		ctrlPos = 0
 19 | 		decoded = 0
 20 | 		ctrlLen = (count + 3) / 4
 21 | 		fullLen = count / 4
 22 | 		dataPos = ctrlLen
 23 | 		// lowest32 bounds the integers decoded 32 at a time. The assembly
 24 | 		// loads 16 bytes for every group of four integers, no matter how
 25 | 		// few of them the group really occupies, so a load issued too close
 26 | 		// to the end of the data would run past the end of the stream.
 27 | 		//
 28 | 		// [ _ _ _ _ | _ _ _ _ | _ _ _ _ | _ _ _ _ ]
 29 | 		//     safe --^ ^-- needs 4, then 8, then 12 more bytes
 30 | 		//
 31 | 		// When every integer fits in one byte a group occupies only 4 bytes
 32 | 		// and its load needs 12 more bytes behind it. Three complete groups
 33 | 		// always provide them, but a partial final group may not, so the
 34 | 		// bound is derived from fullLen (complete groups) rather than
 35 | 		// ctrlLen (all groups).
 36 | 		// NOTE(review): assumes a full 16-byte load per group; confirm.
 37 | 		lowest32 = ((fullLen - 3) * 4) &^ 31
 38 | 	)
 39 | 
 40 | 	for ; decoded < lowest32; decoded += 32 {
 41 | 		src := stream[dataPos:]
 42 | 		keys := stream[ctrlPos : ctrlPos+8]
 43 | 		dst := out[decoded : decoded+32]
 44 | 
 45 | 		key := uint16(keys[1])<<8 | uint16(keys[0])
 46 | 		decode.Get8uint32FastAsm(
 47 | 			src,
 48 | 			dst,
 49 | 			key,
 50 | 			shared.DecodeShuffleTable,
 51 | 			shared.PerControlLenTable,
 52 | 		)
 53 | 		used := shared.ControlByteToSize(keys[0]) + shared.ControlByteToSize(keys[1])
 54 | 
 55 | 		key = uint16(keys[3])<<8 | uint16(keys[2])
 56 | 		decode.Get8uint32FastAsm(
 57 | 			src[used:],
 58 | 			dst[8:],
 59 | 			key,
 60 | 			shared.DecodeShuffleTable,
 61 | 			shared.PerControlLenTable,
 62 | 		)
 63 | 		used += shared.ControlByteToSize(keys[2]) + shared.ControlByteToSize(keys[3])
 64 | 
 65 | 		key = uint16(keys[5])<<8 | uint16(keys[4])
 66 | 		decode.Get8uint32FastAsm(
 67 | 			src[used:],
 68 | 			dst[16:],
 69 | 			key,
 70 | 			shared.DecodeShuffleTable,
 71 | 			shared.PerControlLenTable,
 72 | 		)
 73 | 		used += shared.ControlByteToSize(keys[4]) + shared.ControlByteToSize(keys[5])
 74 | 
 75 | 		key = uint16(keys[7])<<8 | uint16(keys[6])
 76 | 		decode.Get8uint32FastAsm(
 77 | 			src[used:],
 78 | 			dst[24:],
 79 | 			key,
 80 | 			shared.DecodeShuffleTable,
 81 | 			shared.PerControlLenTable,
 82 | 		)
 83 | 		used += shared.ControlByteToSize(keys[6]) + shared.ControlByteToSize(keys[7])
 84 | 
 85 | 		dataPos += used
 86 | 		ctrlPos += 8
 87 | 	}
 88 | 
 89 | 	// Decode 8 at a time for as long as three complete groups remain behind
 90 | 	// the pair being decoded, for the reason given on lowest32.
 91 | 	for ; ctrlPos < fullLen-4; ctrlPos += 2 {
 92 | 		key := uint16(stream[ctrlPos+1])<<8 | uint16(stream[ctrlPos])
 93 | 		decode.Get8uint32FastAsm(
 94 | 			stream[dataPos:],
 95 | 			out[decoded:],
 96 | 			key,
 97 | 			shared.DecodeShuffleTable,
 98 | 			shared.PerControlLenTable,
 99 | 		)
100 | 		dataPos += shared.ControlByteToSize(stream[ctrlPos]) + shared.ControlByteToSize(stream[ctrlPos+1])
101 | 		decoded += 8
102 | 	}
103 | 
104 | 	for ; ctrlPos < ctrlLen; ctrlPos++ {
105 | 		left := count - decoded
106 | 		if left > 4 {
107 | 			left = 4
108 | 		}
109 | 		dataPos += decode.GetUint32Scalar(
110 | 			stream[dataPos:],
111 | 			out[decoded:],
112 | 			stream[ctrlPos],
113 | 			left,
114 | 		)
115 | 		decoded += left
116 | 	}
117 | }
118 | 
119 | // ReadAllDeltaFast decodes count differentially encoded integers from
120 | // stream into out following the Stream VByte format, using special hardware
121 | // instructions for the bulk of the input. The original values are rebuilt
122 | // from the stored differences, with prev seeding the first group.
123 | //
124 | // Note: the caller is responsible for sizing stream and out appropriately
125 | // and for knowing how many integers the stream holds.
126 | func ReadAllDeltaFast(count int, stream []byte, out []uint32, prev uint32) {
127 | 	var (
128 | 		ctrlPos = 0
129 | 		decoded = 0
130 | 		ctrlLen = (count + 3) / 4
131 | 		fullLen = count / 4
132 | 		dataPos = ctrlLen
133 | 		// lowest32 bounds the integers decoded 32 at a time. The assembly
134 | 		// loads 16 bytes for every group of four integers, no matter how
135 | 		// few of them the group really occupies, so a load issued too close
136 | 		// to the end of the data would run past the end of the stream.
137 | 		//
138 | 		// [ _ _ _ _ | _ _ _ _ | _ _ _ _ | _ _ _ _ ]
139 | 		//     safe --^ ^-- needs 4, then 8, then 12 more bytes
140 | 		//
141 | 		// When every integer fits in one byte a group occupies only 4 bytes
142 | 		// and its load needs 12 more bytes behind it. Three complete groups
143 | 		// always provide them, but a partial final group may not, so the
144 | 		// bound is derived from fullLen (complete groups) rather than
145 | 		// ctrlLen (all groups).
146 | 		// NOTE(review): assumes a full 16-byte load per group; confirm.
147 | 		lowest32 = ((fullLen - 3) * 4) &^ 31
148 | 	)
149 | 
150 | 	for ; decoded < lowest32; decoded += 32 {
151 | 		src := stream[dataPos:]
152 | 		keys := stream[ctrlPos : ctrlPos+8]
153 | 		dst := out[decoded : decoded+32]
154 | 
155 | 		key := uint16(keys[1])<<8 | uint16(keys[0])
156 | 		decode.Get8uint32DeltaFastAsm(
157 | 			src,
158 | 			dst,
159 | 			key,
160 | 			prev,
161 | 			shared.DecodeShuffleTable,
162 | 			shared.PerControlLenTable,
163 | 		)
164 | 		used := shared.ControlByteToSize(keys[0]) + shared.ControlByteToSize(keys[1])
165 | 
166 | 		key = uint16(keys[3])<<8 | uint16(keys[2])
167 | 		decode.Get8uint32DeltaFastAsm(
168 | 			src[used:],
169 | 			dst[8:],
170 | 			key,
171 | 			dst[7],
172 | 			shared.DecodeShuffleTable,
173 | 			shared.PerControlLenTable,
174 | 		)
175 | 		used += shared.ControlByteToSize(keys[2]) + shared.ControlByteToSize(keys[3])
176 | 
177 | 		key = uint16(keys[5])<<8 | uint16(keys[4])
178 | 		decode.Get8uint32DeltaFastAsm(
179 | 			src[used:],
180 | 			dst[16:],
181 | 			key,
182 | 			dst[15],
183 | 			shared.DecodeShuffleTable,
184 | 			shared.PerControlLenTable,
185 | 		)
186 | 		used += shared.ControlByteToSize(keys[4]) + shared.ControlByteToSize(keys[5])
187 | 
188 | 		key = uint16(keys[7])<<8 | uint16(keys[6])
189 | 		decode.Get8uint32DeltaFastAsm(
190 | 			src[used:],
191 | 			dst[24:],
192 | 			key,
193 | 			dst[23],
194 | 			shared.DecodeShuffleTable,
195 | 			shared.PerControlLenTable,
196 | 		)
197 | 		used += shared.ControlByteToSize(keys[6]) + shared.ControlByteToSize(keys[7])
198 | 
199 | 		dataPos += used
200 | 		ctrlPos += 8
201 | 		prev = dst[31]
202 | 	}
203 | 
204 | 	// Decode 8 at a time for as long as three complete groups remain behind
205 | 	// the pair being decoded, for the reason given on lowest32.
206 | 	for ; ctrlPos < fullLen-4; ctrlPos += 2 {
207 | 		key := uint16(stream[ctrlPos+1])<<8 | uint16(stream[ctrlPos])
208 | 		decode.Get8uint32DeltaFastAsm(
209 | 			stream[dataPos:],
210 | 			out[decoded:],
211 | 			key,
212 | 			prev,
213 | 			shared.DecodeShuffleTable,
214 | 			shared.PerControlLenTable,
215 | 		)
216 | 		dataPos += shared.ControlByteToSize(stream[ctrlPos]) + shared.ControlByteToSize(stream[ctrlPos+1])
217 | 		decoded += 8
218 | 		prev = out[decoded-1]
219 | 	}
220 | 
221 | 	for ; ctrlPos < ctrlLen; ctrlPos++ {
222 | 		left := count - decoded
223 | 		if left > 4 {
224 | 			left = 4
225 | 		}
226 | 		dataPos += decode.GetUint32DeltaScalar(
227 | 			stream[dataPos:],
228 | 			out[decoded:],
229 | 			stream[ctrlPos],
230 | 			left,
231 | 			prev,
232 | 		)
233 | 		decoded += left
234 | 		prev = out[decoded-1]
235 | 	}
236 | }
237 | 


--------------------------------------------------------------------------------
/pkg/stream/reader/reader_base.go:
--------------------------------------------------------------------------------
1 | // +build !amd64
2 | 
3 | package reader
4 | 
5 | func ReadAllFast(count int, stream []byte, out []uint32) {
6 | 	panic("unreachable")
7 | }
8 | 


--------------------------------------------------------------------------------
/pkg/stream/reader/reader_test.go:
--------------------------------------------------------------------------------
  1 | package reader
  2 | 
  3 | import (
  4 | 	"encoding/binary"
  5 | 	"fmt"
  6 | 	"math"
  7 | 	"math/rand"
  8 | 	"reflect"
  9 | 	"testing"
 10 | 	"time"
 11 | 
 12 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/decode"
 13 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/encode"
 14 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
 15 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/stream/writer"
 16 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/util"
 17 | )
 18 | 
 19 | func init() {
 20 | 	rand.Seed(time.Now().UnixNano()) // seed the global math/rand source from the current time
 21 | }
 22 | 
 23 | func TestReadAllScalar(t *testing.T) {
 24 | 	for round := 0; round < 6; round++ {
 25 | 		n := int(util.RandUint32() % 1e6) // input length, always below 1e6
 26 | 		want := util.GenUint32(n)
 27 | 		encoded := writer.WriteAllScalar(want)
 28 | 		t.Run(fmt.Sprintf("ReadAll: %d", n), func(t *testing.T) {
 29 | 			got := make([]uint32, n)
 30 | 			ReadAllScalar(n, encoded, got)
 31 | 			if equal := reflect.DeepEqual(want, got); !equal {
 32 | 				t.Fatalf("decoded wrong nums")
 33 | 			}
 34 | 		})
 35 | 	}
 36 | }
 37 | 
 38 | func TestReadAllDeltaScalar(t *testing.T) {
 39 | 	for round := 0; round < 6; round++ {
 40 | 		n := int(util.RandUint32() % 1e6)
 41 | 		want := util.GenUint32(n)
 42 | 		util.SortUint32(want) // the values are sorted before being delta-encoded
 43 | 		encoded := writer.WriteAllDeltaScalar(want, 0)
 44 | 		t.Run(fmt.Sprintf("ReadAll: %d", n), func(t *testing.T) {
 45 | 			got := make([]uint32, n)
 46 | 			ReadAllDeltaScalar(n, encoded, got, 0)
 47 | 			if equal := reflect.DeepEqual(want, got); !equal {
 48 | 				t.Fatalf("decoded wrong nums")
 49 | 			}
 50 | 		})
 51 | 	}
 52 | }
 53 | 
 54 | func TestReadAllFast(t *testing.T) {
 55 | 	if mode := decode.GetMode(); mode == shared.Normal {
 56 | 		t.Skipf("Testing environment doesn't support this test")
 57 | 	}
 58 | 
 59 | 	for round := 0; round < 6; round++ {
 60 | 		n := int(util.RandUint32() % 1e6)
 61 | 		want := util.GenUint32(n)
 62 | 		encoded := writer.WriteAllScalar(want) // the scalar writer produces the input
 63 | 		t.Run(fmt.Sprintf("ReadAll: %d", n), func(t *testing.T) {
 64 | 			got := make([]uint32, n)
 65 | 			ReadAllFast(n, encoded, got)
 66 | 			if equal := reflect.DeepEqual(want, got); !equal {
 67 | 				t.Fatalf("decoded wrong nums")
 68 | 			}
 69 | 		})
 70 | 	}
 71 | }
 72 | 
 73 | func TestReadAllDeltaFast(t *testing.T) {
 74 | 	if mode := decode.GetMode(); mode == shared.Normal {
 75 | 		t.Skipf("Testing environment doesn't support this test")
 76 | 	}
 77 | 
 78 | 	for round := 0; round < 6; round++ {
 79 | 		n := int(util.RandUint32() % 1e6)
 80 | 		want := util.GenUint32(n)
 81 | 		util.SortUint32(want)
 82 | 		deltas := make([]uint32, n)
 83 | 		util.Delta(want, deltas)
 84 | 
 85 | 		encoded := writer.WriteAllScalar(deltas) // plain encoding of util.Delta's output
 86 | 		t.Run(fmt.Sprintf("ReadAll: %d", n), func(t *testing.T) {
 87 | 			got := make([]uint32, n)
 88 | 			ReadAllDeltaFast(n, encoded, got, 0)
 89 | 			if equal := reflect.DeepEqual(want, got); !equal {
 90 | 				t.Fatalf("decoded wrong nums")
 91 | 			}
 92 | 		})
 93 | 	}
 94 | }
 95 | 
 96 | var readSinkA []uint32 // package-level sink that keeps the benchmark output reachable
 97 | 
 98 | func BenchmarkReadAllFast(b *testing.B) {
 99 | 	if mode := decode.GetMode(); mode == shared.Normal {
100 | 		b.Skipf("Testing environment doesn't support this test")
101 | 	}
102 | 
103 | 	for exp := 0; exp < 8; exp++ {
104 | 		n := int(math.Pow10(exp))
105 | 		vals := util.GenUint32(n)
106 | 		encoded := writer.WriteAllScalar(vals)
107 | 		dst := make([]uint32, n)
108 | 		b.Run(fmt.Sprintf("Count_1e%d", exp), func(b *testing.B) {
109 | 			b.SetBytes(int64(n * encode.MaxBytesPerNum))
110 | 			b.ResetTimer()
111 | 			for iter := 0; iter < b.N; iter++ {
112 | 				ReadAllFast(n, encoded, dst)
113 | 			}
114 | 			readSinkA = dst
115 | 		})
116 | 	}
117 | }
118 | 
119 | var readSinkB []uint32 // package-level sink that keeps the benchmark output reachable
120 | 
121 | func BenchmarkReadAllDeltaFast(b *testing.B) {
122 | 	if mode := decode.GetMode(); mode == shared.Normal {
123 | 		b.Skipf("Testing environment doesn't support this test")
124 | 	}
125 | 
126 | 	for exp := 0; exp < 8; exp++ {
127 | 		n := int(math.Pow10(exp))
128 | 		vals := util.GenUint32(n)
129 | 		util.SortUint32(vals)
130 | 		encoded := writer.WriteAllDeltaScalar(vals, 0)
131 | 		dst := make([]uint32, n)
132 | 		b.Run(fmt.Sprintf("Count_1e%d", exp), func(b *testing.B) {
133 | 			b.SetBytes(int64(n * encode.MaxBytesPerNum))
134 | 			b.ResetTimer()
135 | 			for iter := 0; iter < b.N; iter++ {
136 | 				ReadAllDeltaFast(n, encoded, dst, 0)
137 | 			}
138 | 			readSinkB = dst
139 | 		})
140 | 	}
141 | }
142 | 
143 | var readSinkC []uint32 // package-level sink that keeps the benchmark output reachable
144 | 
145 | func BenchmarkReadAllScalar(b *testing.B) {
146 | 	for exp := 0; exp < 8; exp++ {
147 | 		n := int(math.Pow10(exp))
148 | 		vals := util.GenUint32(n)
149 | 		util.SortUint32(vals)
150 | 		encoded := writer.WriteAllScalar(vals)
151 | 		dst := make([]uint32, n)
152 | 		b.Run(fmt.Sprintf("Count_1e%d", exp), func(b *testing.B) {
153 | 			b.SetBytes(int64(n * encode.MaxBytesPerNum))
154 | 			b.ResetTimer()
155 | 			for iter := 0; iter < b.N; iter++ {
156 | 				ReadAllScalar(n, encoded, dst)
157 | 			}
158 | 			readSinkC = dst
159 | 		})
160 | 	}
161 | }
162 | 
163 | var readSinkD []uint32 // package-level sink that keeps the benchmark output reachable
164 | 
165 | func BenchmarkReadAllDeltaScalar(b *testing.B) {
166 | 	for exp := 0; exp < 8; exp++ {
167 | 		n := int(math.Pow10(exp))
168 | 		vals := util.GenUint32(n)
169 | 		util.SortUint32(vals)
170 | 		encoded := writer.WriteAllDeltaScalar(vals, 0)
171 | 		dst := make([]uint32, n)
172 | 		b.Run(fmt.Sprintf("Count_1e%d", exp), func(b *testing.B) {
173 | 			b.SetBytes(int64(n * encode.MaxBytesPerNum))
174 | 			b.ResetTimer()
175 | 			for iter := 0; iter < b.N; iter++ {
176 | 				ReadAllDeltaScalar(n, encoded, dst, 0)
177 | 			}
178 | 			readSinkD = dst
179 | 		})
180 | 	}
181 | }
182 | 
183 | var readSinkE []uint32 // sink for BenchmarkReadAllVarint's decoded output
184 | 
185 | func BenchmarkReadAllVarint(b *testing.B) {
186 | 	for exp := 0; exp < 8; exp++ {
187 | 		n := int(math.Pow10(exp))
188 | 		dst := make([]uint32, n)
189 | 		buf := make([]byte, binary.MaxVarintLen32*n) // binary.MaxVarintLen32 bytes per value
190 | 		vals := util.GenUint32(n)
191 | 		util.SortUint32(vals)
192 | 		written := util.PutVarint(vals, buf)
193 | 		buf = buf[:written]
194 | 		b.Run(fmt.Sprintf("Count_1e%d", exp), func(b *testing.B) {
195 | 			b.SetBytes(int64(n * encode.MaxBytesPerNum))
196 | 			b.ResetTimer()
197 | 			for iter := 0; iter < b.N; iter++ {
198 | 				util.GetVarint(buf, dst)
199 | 			}
200 | 			readSinkE = dst // this benchmark's own sink, declared just above
201 | 		})
202 | 	}
203 | }
204 | 
205 | var readSinkF []uint32 // sink for BenchmarkReadAllDeltaVarint's decoded output
206 | 
207 | func BenchmarkReadAllDeltaVarint(b *testing.B) {
208 | 	for exp := 0; exp < 8; exp++ {
209 | 		n := int(math.Pow10(exp))
210 | 		dst := make([]uint32, n)
211 | 		buf := make([]byte, binary.MaxVarintLen32*n) // binary.MaxVarintLen32 bytes per value
212 | 		vals := util.GenUint32(n)
213 | 		util.SortUint32(vals)
214 | 		written := util.PutDeltaVarint(vals, buf, 0)
215 | 		buf = buf[:written]
216 | 		b.Run(fmt.Sprintf("Count_1e%d", exp), func(b *testing.B) {
217 | 			b.SetBytes(int64(n * encode.MaxBytesPerNum))
218 | 			b.ResetTimer()
219 | 			for iter := 0; iter < b.N; iter++ {
220 | 				util.GetDeltaVarint(buf, dst, 0)
221 | 			}
222 | 			readSinkF = dst // this benchmark's own sink, declared just above
223 | 		})
224 | 	}
225 | }
226 | 


--------------------------------------------------------------------------------
/pkg/stream/writer/writer.go:
--------------------------------------------------------------------------------
  1 | package writer
  2 | 
  3 | import (
  4 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/encode"
  5 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
  6 | )
  7 | 
  8 | const (
  9 | 	jump     = 16       // integers encoded per iteration of the unrolled scalar loops
 10 | 	jumpCtrl = jump / 4 // control bytes produced per iteration of those loops
 11 | )
 12 | 
 13 | // WriteAll encodes every integer of in in the Stream VByte format and
 14 | // returns the slice holding the encoded bytes. The hardware-accelerated
 15 | // encoder is used when encode.GetMode reports shared.Fast; in any other
 16 | // mode the scalar encoder runs.
 17 | func WriteAll(in []uint32) []byte {
 18 | 	// Guard clause: the scalar encoder is the default path.
 19 | 	if encode.GetMode() == shared.Fast {
 20 | 		return WriteAllFast(in)
 21 | 	}
 22 | 	return WriteAllScalar(in)
 23 | }
 24 | 
 25 | // WriteAllDelta differentially encodes every integer of in in the Stream
 26 | // VByte format and returns the slice holding the encoded bytes; prev is
 27 | // handed to the encoder as the value preceding in[0]. The
 28 | // hardware-accelerated encoder is used when encode.GetMode reports
 29 | // shared.Fast; in any other mode the scalar encoder runs.
 30 | func WriteAllDelta(in []uint32, prev uint32) []byte {
 31 | 	if encode.GetMode() == shared.Fast {
 32 | 		return WriteAllDeltaFast(in, prev)
 33 | 	}
 34 | 	return WriteAllDeltaScalar(in, prev)
 35 | }
 36 | 
 37 | // WriteAllScalar will encode all the integers from in using the Stream VByte
 38 | // format and will return the byte array holding the encoded data.
 39 | func WriteAllScalar(in []uint32) []byte {
 40 | 	var (
 41 | 		count   = len(in)
 42 | 		ctrlLen = (count + 3) / 4
 43 | 		stream  = make([]byte, ctrlLen+(encode.MaxBytesPerNum*count))
 44 | 
 45 | 		dataPos    = ctrlLen
 46 | 		ctrlPos    = 0
 47 | 		encoded    = 0
 48 | 		lowestJump = count &^ (jump - 1)
 49 | 		lowest4    = count &^ 3
 50 | 	)
 51 | 
 52 | 	for ; encoded < lowestJump; encoded += jump {
 53 | 		nums := in[encoded : encoded+jump]
 54 | 		data := stream[dataPos:]
 55 | 		ctrls := stream[ctrlPos : ctrlPos+jumpCtrl]
 56 | 
 57 | 		ctrl := encode.Put4uint32Scalar(nums, data)
 58 | 		ctrls[0] = ctrl
 59 | 		sizeA := shared.ControlByteToSize(ctrl)
 60 | 
 61 | 		ctrl = encode.Put4uint32Scalar(nums[4:], data[sizeA:])
 62 | 		ctrls[1] = ctrl
 63 | 		sizeB := shared.ControlByteToSize(ctrl)
 64 | 
 65 | 		ctrl = encode.Put4uint32Scalar(nums[8:], data[sizeA+sizeB:])
 66 | 		ctrls[2] = ctrl
 67 | 		sizeC := shared.ControlByteToSize(ctrl)
 68 | 
 69 | 		ctrl = encode.Put4uint32Scalar(nums[12:], data[sizeA+sizeB+sizeC:])
 70 | 		ctrls[3] = ctrl
 71 | 		sizeD := shared.ControlByteToSize(ctrl)
 72 | 
 73 | 		dataPos += sizeA + sizeB + sizeC + sizeD
 74 | 		ctrlPos += jumpCtrl
 75 | 	}
 76 | 
 77 | 	for ; encoded < lowest4; encoded += 4 {
 78 | 		ctrl := encode.Put4uint32Scalar(in[encoded:], stream[dataPos:])
 79 | 		stream[ctrlPos] = ctrl
 80 | 		size := shared.ControlByteToSize(ctrl)
 81 | 		dataPos += size
 82 | 		ctrlPos++
 83 | 	}
 84 | 
 85 | 	if lowest4 != count {
 86 | 		nums := count - lowest4
 87 | 		ctrl := encode.PutUint32Scalar(in[encoded:], stream[dataPos:], nums)
 88 | 		size := shared.ControlByteToSize(ctrl)
 89 | 		size -= 4 - nums
 90 | 		dataPos += size
 91 | 		stream[ctrlPos] = ctrl
 92 | 	}
 93 | 
 94 | 	return stream[:dataPos]
 95 | }
 96 | 
 97 | // WriteAllDeltaScalar will differentially encode all the integers from in using
 98 | // the Stream VByte format and will return the byte array holding the encoded data.
 99 | func WriteAllDeltaScalar(in []uint32, prev uint32) []byte {
100 | 	var (
101 | 		count   = len(in)
102 | 		ctrlLen = (count + 3) / 4
103 | 		stream  = make([]byte, ctrlLen+(encode.MaxBytesPerNum*count))
104 | 
105 | 		dataPos    = ctrlLen
106 | 		ctrlPos    = 0
107 | 		encoded    = 0
108 | 		lowestJump = count &^ (jump - 1)
109 | 		lowest4    = count &^ 3
110 | 	)
111 | 
112 | 	for ; encoded < lowestJump; encoded += jump {
113 | 		nums := in[encoded : encoded+jump]
114 | 		data := stream[dataPos:]
115 | 		ctrls := stream[ctrlPos : ctrlPos+jumpCtrl]
116 | 
117 | 		ctrl := encode.Put4uint32DeltaScalar(nums, data, prev)
118 | 		ctrls[0] = ctrl
119 | 		sizeA := shared.ControlByteToSize(ctrl)
120 | 
121 | 		ctrl = encode.Put4uint32DeltaScalar(nums[4:], data[sizeA:], nums[3])
122 | 		ctrls[1] = ctrl
123 | 		sizeB := shared.ControlByteToSize(ctrl)
124 | 
125 | 		ctrl = encode.Put4uint32DeltaScalar(nums[8:], data[sizeA+sizeB:], nums[7])
126 | 		ctrls[2] = ctrl
127 | 		sizeC := shared.ControlByteToSize(ctrl)
128 | 
129 | 		ctrl = encode.Put4uint32DeltaScalar(nums[12:], data[sizeA+sizeB+sizeC:], nums[11])
130 | 		ctrls[3] = ctrl
131 | 		sizeD := shared.ControlByteToSize(ctrl)
132 | 
133 | 		dataPos += sizeA + sizeB + sizeC + sizeD
134 | 		ctrlPos += jumpCtrl
135 | 		prev = nums[15]
136 | 	}
137 | 
138 | 	for ; encoded < lowest4; encoded += 4 {
139 | 		ctrl := encode.Put4uint32DeltaScalar(in[encoded:], stream[dataPos:], prev)
140 | 		stream[ctrlPos] = ctrl
141 | 		size := shared.ControlByteToSize(ctrl)
142 | 		dataPos += size
143 | 		ctrlPos++
144 | 		prev = in[encoded+3]
145 | 	}
146 | 
147 | 	if lowest4 != count {
148 | 		nums := count - lowest4
149 | 		ctrl := encode.PutUint32DeltaScalar(in[encoded:], stream[dataPos:], nums, prev)
150 | 		size := shared.ControlByteToSize(ctrl)
151 | 		size -= 4 - nums
152 | 		dataPos += size
153 | 		stream[ctrlPos] = ctrl
154 | 	}
155 | 
156 | 	return stream[:dataPos]
157 | }
158 | 


--------------------------------------------------------------------------------
/pkg/stream/writer/writer_amd64.go:
--------------------------------------------------------------------------------
  1 | // +build amd64
  2 | 
  3 | package writer
  4 | 
  5 | import (
  6 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/encode"
  7 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
  8 | )
  9 | 
 10 | // WriteAllFast will encode all the integers from in using the Stream VByte
 11 | // format using special hardware instructions and will return the byte array
 12 | // holding the encoded data.
 13 | func WriteAllFast(in []uint32) []byte {
 14 | 	var (
 15 | 		count   = len(in)
 16 | 		ctrlLen = (count + 3) / 4
 17 | 		stream  = make([]byte, ctrlLen+(encode.MaxBytesPerNum*count))
 18 | 
 19 | 		dataPos  = ctrlLen
 20 | 		ctrlPos  = 0
 21 | 		encoded  = 0
 22 | 		lowest32 = ((ctrlLen - 3) * 4) &^ 31
 23 | 	)
 24 | 
 25 | 	for ; encoded < lowest32; encoded += 32 {
 26 | 		ctrls := stream[ctrlPos : ctrlPos+8]
 27 | 		nums := in[encoded : encoded+32]
 28 | 		out := stream[dataPos:]
 29 | 
 30 | 		ctrl := encode.Put8uint32FastAsm(
 31 | 			nums[0:8],
 32 | 			out,
 33 | 			shared.EncodeShuffleTable,
 34 | 			shared.PerControlLenTable,
 35 | 		)
 36 | 
 37 | 		ctrls[0] = uint8(ctrl & 0xff)
 38 | 		ctrls[1] = uint8(ctrl >> 8)
 39 | 		sizeA := shared.ControlByteToSizeTwo(ctrl)
 40 | 
 41 | 		ctrl = encode.Put8uint32FastAsm(
 42 | 			nums[8:16],
 43 | 			out[sizeA:],
 44 | 			shared.EncodeShuffleTable,
 45 | 			shared.PerControlLenTable,
 46 | 		)
 47 | 
 48 | 		ctrls[2] = uint8(ctrl & 0xff)
 49 | 		ctrls[3] = uint8(ctrl >> 8)
 50 | 		sizeB := shared.ControlByteToSizeTwo(ctrl)
 51 | 
 52 | 		ctrl = encode.Put8uint32FastAsm(
 53 | 			nums[16:24],
 54 | 			out[sizeA+sizeB:],
 55 | 			shared.EncodeShuffleTable,
 56 | 			shared.PerControlLenTable,
 57 | 		)
 58 | 
 59 | 		ctrls[4] = uint8(ctrl & 0xff)
 60 | 		ctrls[5] = uint8(ctrl >> 8)
 61 | 		sizeC := shared.ControlByteToSizeTwo(ctrl)
 62 | 
 63 | 		ctrl = encode.Put8uint32FastAsm(
 64 | 			nums[24:],
 65 | 			out[sizeA+sizeB+sizeC:],
 66 | 			shared.EncodeShuffleTable,
 67 | 			shared.PerControlLenTable,
 68 | 		)
 69 | 
 70 | 		ctrls[6] = uint8(ctrl & 0xff)
 71 | 		ctrls[7] = uint8(ctrl >> 8)
 72 | 		sizeD := shared.ControlByteToSizeTwo(ctrl)
 73 | 
 74 | 		ctrlPos += 8
 75 | 		dataPos += sizeA + sizeB + sizeC + sizeD
 76 | 	}
 77 | 
 78 | 	for ; ctrlPos < ctrlLen-2; ctrlPos += 2 {
 79 | 		ctrl := encode.Put8uint32FastAsm(
 80 | 			in[encoded:],
 81 | 			stream[dataPos:],
 82 | 			shared.EncodeShuffleTable,
 83 | 			shared.PerControlLenTable,
 84 | 		)
 85 | 
 86 | 		stream[ctrlPos] = uint8(ctrl & 0xff)
 87 | 		stream[ctrlPos+1] = uint8(ctrl >> 8)
 88 | 		encoded += 8
 89 | 		dataPos += shared.ControlByteToSizeTwo(ctrl)
 90 | 	}
 91 | 
 92 | 	for ; ctrlPos < ctrlLen; ctrlPos += 1 {
 93 | 		nums := count - encoded
 94 | 		if nums > 4 {
 95 | 			nums = 4
 96 | 		}
 97 | 		ctrl := encode.PutUint32Scalar(in[encoded:], stream[dataPos:], nums)
 98 | 		size := shared.ControlByteToSize(ctrl)
 99 | 		stream[ctrlPos] = ctrl
100 | 		size -= 4 - nums
101 | 		dataPos += size
102 | 		encoded += nums
103 | 	}
104 | 
105 | 	return stream[:dataPos]
106 | }
107 | 
108 | // WriteAllDeltaFast will differentially encode all the integers from in using
109 | // the Stream VByte format using special hardware instructions and will return
110 | // the byte array holding the encoded data.
111 | func WriteAllDeltaFast(in []uint32, prev uint32) []byte {
112 | 	var (
113 | 		count   = len(in)
114 | 		ctrlLen = (count + 3) / 4
115 | 		stream  = make([]byte, ctrlLen+(encode.MaxBytesPerNum*count))
116 | 
117 | 		dataPos  = ctrlLen
118 | 		ctrlPos  = 0
119 | 		encoded  = 0
120 | 		lowest32 = ((ctrlLen - 3) * 4) &^ 31
121 | 	)
122 | 
123 | 	for ; encoded < lowest32; encoded += 32 {
124 | 		ctrls := stream[ctrlPos : ctrlPos+8]
125 | 		nums := in[encoded : encoded+32]
126 | 		out := stream[dataPos:]
127 | 
128 | 		ctrl := encode.Put8uint32DeltaFastAsm(
129 | 			nums[0:8],
130 | 			out,
131 | 			prev,
132 | 			shared.EncodeShuffleTable,
133 | 			shared.PerControlLenTable,
134 | 		)
135 | 
136 | 		ctrls[0] = uint8(ctrl & 0xff)
137 | 		ctrls[1] = uint8(ctrl >> 8)
138 | 		sizeA := shared.ControlByteToSizeTwo(ctrl)
139 | 
140 | 		ctrl = encode.Put8uint32DeltaFastAsm(
141 | 			nums[8:16],
142 | 			out[sizeA:],
143 | 			nums[7],
144 | 			shared.EncodeShuffleTable,
145 | 			shared.PerControlLenTable,
146 | 		)
147 | 
148 | 		ctrls[2] = uint8(ctrl & 0xff)
149 | 		ctrls[3] = uint8(ctrl >> 8)
150 | 		sizeB := shared.ControlByteToSizeTwo(ctrl)
151 | 
152 | 		ctrl = encode.Put8uint32DeltaFastAsm(
153 | 			nums[16:24],
154 | 			out[sizeA+sizeB:],
155 | 			nums[15],
156 | 			shared.EncodeShuffleTable,
157 | 			shared.PerControlLenTable,
158 | 		)
159 | 
160 | 		ctrls[4] = uint8(ctrl & 0xff)
161 | 		ctrls[5] = uint8(ctrl >> 8)
162 | 		sizeC := shared.ControlByteToSizeTwo(ctrl)
163 | 
164 | 		ctrl = encode.Put8uint32DeltaFastAsm(
165 | 			nums[24:],
166 | 			out[sizeA+sizeB+sizeC:],
167 | 			nums[23],
168 | 			shared.EncodeShuffleTable,
169 | 			shared.PerControlLenTable,
170 | 		)
171 | 
172 | 		ctrls[6] = uint8(ctrl & 0xff)
173 | 		ctrls[7] = uint8(ctrl >> 8)
174 | 		sizeD := shared.ControlByteToSizeTwo(ctrl)
175 | 
176 | 		ctrlPos += 8
177 | 		dataPos += sizeA + sizeB + sizeC + sizeD
178 | 		prev = nums[31]
179 | 	}
180 | 
181 | 	for ; ctrlPos < ctrlLen-2; ctrlPos += 2 {
182 | 		ctrl := encode.Put8uint32DeltaFastAsm(
183 | 			in[encoded:],
184 | 			stream[dataPos:],
185 | 			prev,
186 | 			shared.EncodeShuffleTable,
187 | 			shared.PerControlLenTable,
188 | 		)
189 | 
190 | 		stream[ctrlPos] = uint8(ctrl & 0xff)
191 | 		stream[ctrlPos+1] = uint8(ctrl >> 8)
192 | 		encoded += 8
193 | 		dataPos += shared.ControlByteToSizeTwo(ctrl)
194 | 		prev = in[encoded-1]
195 | 	}
196 | 
197 | 	for ; ctrlPos < ctrlLen; ctrlPos += 1 {
198 | 		nums := count - encoded
199 | 		if nums > 4 {
200 | 			nums = 4
201 | 		}
202 | 		ctrl := encode.PutUint32DeltaScalar(in[encoded:], stream[dataPos:], nums, prev)
203 | 		size := shared.ControlByteToSize(ctrl)
204 | 		stream[ctrlPos] = ctrl
205 | 		size -= 4 - nums
206 | 		dataPos += size
207 | 		encoded += nums
208 | 		prev = in[encoded-1]
209 | 	}
210 | 
211 | 	return stream[:dataPos]
212 | }
213 | 


--------------------------------------------------------------------------------
/pkg/stream/writer/writer_base.go:
--------------------------------------------------------------------------------
1 | // +build !amd64
2 | 
3 | package writer
4 | 
5 | func WriteAllFast(in []uint32) []byte {
6 | 	panic("unreachable")
7 | }
8 | 


--------------------------------------------------------------------------------
/pkg/stream/writer/writer_test.go:
--------------------------------------------------------------------------------
  1 | package writer
  2 | 
  3 | import (
  4 | 	"encoding/binary"
  5 | 	"fmt"
  6 | 	"math"
  7 | 	"math/rand"
  8 | 	"reflect"
  9 | 	"testing"
 10 | 	"time"
 11 | 
 12 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/encode"
 13 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/shared"
 14 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/stream/reader"
 15 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/util"
 16 | )
 17 | 
 18 | func init() {
 19 | 	rand.Seed(time.Now().UnixNano()) // seed the global math/rand source from the current time
 20 | }
 21 | 
 22 | func TestWriteAllScalar(t *testing.T) {
 23 | 	for round := 0; round < 6; round++ {
 24 | 		n := int(util.RandUint32() % 1e6)
 25 | 		want := util.GenUint32(n)
 26 | 		encoded := WriteAllScalar(want)
 27 | 		t.Run(fmt.Sprintf("WriteAll: %d", n), func(t *testing.T) {
 28 | 			got := make([]uint32, n)
 29 | 			reader.ReadAllScalar(n, encoded, got) // round trip through the scalar reader
 30 | 			if equal := reflect.DeepEqual(want, got); !equal {
 31 | 				t.Fatalf("decoded wrong nums")
 32 | 			}
 33 | 		})
 34 | 	}
 35 | }
 36 | 
 37 | func TestWriteAllDeltaScalar(t *testing.T) {
 38 | 	for round := 0; round < 6; round++ {
 39 | 		n := int(util.RandUint32() % 1e6)
 40 | 		sorted := util.GenUint32(n)
 41 | 		util.SortUint32(sorted)
 42 | 		deltas := make([]uint32, n)
 43 | 		util.Delta(sorted, deltas)
 44 | 
 45 | 		want := WriteAllScalar(deltas) // reference: plain encoding of util.Delta's output
 46 | 		t.Run(fmt.Sprintf("WriteAll: %d", n), func(t *testing.T) {
 47 | 			got := WriteAllDeltaScalar(sorted, 0)
 48 | 			if equal := reflect.DeepEqual(want, got); !equal {
 49 | 				t.Fatalf("bad encoding")
 50 | 			}
 51 | 		})
 52 | 	}
 53 | }
 54 | 
 55 | func TestWriteAllFast(t *testing.T) {
 56 | 	if mode := encode.GetMode(); mode == shared.Normal {
 57 | 		t.Skipf("Testing environment doesn't support this test")
 58 | 	}
 59 | 
 60 | 	for round := 0; round < 6; round++ {
 61 | 		n := int(util.RandUint32() % 1e6)
 62 | 		vals := util.GenUint32(n)
 63 | 		want := WriteAllScalar(vals) // the scalar encoder provides the expected bytes
 64 | 		t.Run(fmt.Sprintf("WriteAll: %d", n), func(t *testing.T) {
 65 | 			got := WriteAllFast(vals)
 66 | 			if equal := reflect.DeepEqual(want, got); !equal {
 67 | 				t.Fatalf("bad encoding")
 68 | 			}
 69 | 		})
 70 | 	}
 71 | }
 72 | 
 73 | func TestWriteAllDeltaFast(t *testing.T) {
 74 | 	if mode := encode.GetMode(); mode == shared.Normal {
 75 | 		t.Skipf("Testing environment doesn't support this test")
 76 | 	}
 77 | 
 78 | 	for round := 0; round < 6; round++ {
 79 | 		n := int(util.RandUint32() % 1e6)
 80 | 		sorted := util.GenUint32(n)
 81 | 		util.SortUint32(sorted)
 82 | 		deltas := make([]uint32, n)
 83 | 		util.Delta(sorted, deltas)
 84 | 
 85 | 		want := WriteAllScalar(deltas) // reference: plain encoding of util.Delta's output
 86 | 		t.Run(fmt.Sprintf("WriteAll: %d", n), func(t *testing.T) {
 87 | 			got := WriteAllDeltaFast(sorted, 0)
 88 | 			if equal := reflect.DeepEqual(want, got); !equal {
 89 | 				t.Fatalf("bad encoding")
 90 | 			}
 91 | 		})
 92 | 	}
 93 | }
 94 | 
 95 | var readSinkA []byte // package-level sink that keeps the benchmark output reachable
 96 | 
 97 | func BenchmarkWriteAllFast(b *testing.B) {
 98 | 	if mode := encode.GetMode(); mode == shared.Normal {
 99 | 		b.Skipf("Testing environment doesn't support this test")
100 | 	}
101 | 
102 | 	for exp := 0; exp < 8; exp++ {
103 | 		n := int(math.Pow10(exp))
104 | 		vals := util.GenUint32(n)
105 | 		b.Run(fmt.Sprintf("Count_1e%d", exp), func(b *testing.B) {
106 | 			var encoded []byte
107 | 			b.SetBytes(int64(n * encode.MaxBytesPerNum))
108 | 			b.ResetTimer()
109 | 			for iter := 0; iter < b.N; iter++ {
110 | 				encoded = WriteAllFast(vals)
111 | 			}
112 | 			readSinkA = encoded
113 | 		})
114 | 	}
115 | }
116 | 
117 | var readSinkB []byte // package-level sink that keeps the benchmark output reachable
118 | 
119 | func BenchmarkWriteAllDeltaFast(b *testing.B) {
120 | 	if mode := encode.GetMode(); mode == shared.Normal {
121 | 		b.Skipf("Testing environment doesn't support this test")
122 | 	}
123 | 
124 | 	for exp := 0; exp < 8; exp++ {
125 | 		n := int(math.Pow10(exp))
126 | 		vals := util.GenUint32(n)
127 | 		util.SortUint32(vals)
128 | 		b.Run(fmt.Sprintf("Count_1e%d", exp), func(b *testing.B) {
129 | 			var encoded []byte
130 | 			b.SetBytes(int64(n * encode.MaxBytesPerNum))
131 | 			b.ResetTimer()
132 | 			for iter := 0; iter < b.N; iter++ {
133 | 				encoded = WriteAllDeltaFast(vals, 0)
134 | 			}
135 | 			readSinkB = encoded
136 | 		})
137 | 	}
138 | }
139 | 
140 | var readSinkC []byte // package-level sink that keeps the benchmark output reachable
141 | 
142 | func BenchmarkWriteAllScalar(b *testing.B) {
143 | 	for exp := 0; exp < 8; exp++ {
144 | 		n := int(math.Pow10(exp))
145 | 		vals := util.GenUint32(n)
146 | 		util.SortUint32(vals)
147 | 		b.Run(fmt.Sprintf("Count_1e%d", exp), func(b *testing.B) {
148 | 			var encoded []byte
149 | 			b.SetBytes(int64(n * encode.MaxBytesPerNum))
150 | 			b.ResetTimer()
151 | 			for iter := 0; iter < b.N; iter++ {
152 | 				encoded = WriteAllScalar(vals)
153 | 			}
154 | 			readSinkC = encoded
155 | 		})
156 | 	}
157 | }
158 | 
159 | var readSinkD []byte
160 | 
161 | func BenchmarkWriteAllDeltaScalar(b *testing.B) {
162 | 	for i := 0; i < 8; i++ {
163 | 		count := int(math.Pow10(i))
164 | 		nums := util.GenUint32(count)
165 | 		util.SortUint32(nums)
166 | 		b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) {
167 | 			var stream []byte
168 | 			b.SetBytes(int64(count * encode.MaxBytesPerNum))
169 | 			b.ResetTimer()
170 | 			for i := 0; i < b.N; i++ {
171 | 				stream = WriteAllDeltaScalar(nums, 0)
172 | 			}
173 | 			readSinkD = stream
174 | 		})
175 | 	}
176 | }
177 | 
178 | var readSinkE int
179 | 
180 | func BenchmarkWriteAllVarint(b *testing.B) {
181 | 	for i := 0; i < 8; i++ {
182 | 		count := int(math.Pow10(i))
183 | 		nums := util.GenUint32(count)
184 | 		util.SortUint32(nums)
185 | 		out := make([]byte, count*binary.MaxVarintLen32)
186 | 		written := 0
187 | 		b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) {
188 | 			b.SetBytes(int64(count * encode.MaxBytesPerNum))
189 | 			b.ResetTimer()
190 | 			for i := 0; i < b.N; i++ {
191 | 				written = util.PutVarint(nums, out)
192 | 			}
193 | 			readSinkE = written
194 | 		})
195 | 	}
196 | }
197 | 
198 | var readSinkF int
199 | 
200 | func BenchmarkWriteAllDeltaVarint(b *testing.B) {
201 | 	for i := 0; i < 8; i++ {
202 | 		count := int(math.Pow10(i))
203 | 		nums := util.GenUint32(count)
204 | 		util.SortUint32(nums)
205 | 		out := make([]byte, count*binary.MaxVarintLen32)
206 | 		written := 0
207 | 		b.Run(fmt.Sprintf("Count_1e%d", i), func(b *testing.B) {
208 | 			b.SetBytes(int64(count * encode.MaxBytesPerNum))
209 | 			b.ResetTimer()
210 | 			for i := 0; i < b.N; i++ {
211 | 				written = util.PutDeltaVarint(nums, out, 0)
212 | 			}
213 | 			readSinkE = written
214 | 		})
215 | 	}
216 | }
217 | 


--------------------------------------------------------------------------------
/pkg/util/rand.go:
--------------------------------------------------------------------------------
 1 | package util
 2 | 
// This file provides a random number generator whose output is spread evenly
// across the number of bytes required to encode each value. This is needed
// because the larger encoded lengths, i.e. 3 and 4 bytes, cover far more
// numbers than the 1- and 2-byte lengths. A number drawn uniformly from the
// full uint32 range would therefore almost always require 3 or 4 bytes to
// encode.
 9 | 
10 | import (
11 | 	"math"
12 | 	"math/rand"
13 | )
14 | 
// generator produces one random uint32.
type generator func() uint32

// randUint32Range returns a random value in the inclusive range [low, high].
//
// When the range covers every uint32 (low == 0, high == math.MaxUint32) the
// span high-low+1 wraps around to zero; that case is served directly by
// rand.Uint32 instead of dividing by zero.
func randUint32Range(low, high uint32) uint32 {
	span := high - low + 1
	if span == 0 {
		return rand.Uint32()
	}
	return rand.Uint32()%span + low
}

var (
	// generators holds one generator per byte length, indexed by length-1.
	// Both bounds passed to randUint32Range are inclusive, so each upper bound
	// is the largest value that still fits in that many bytes.
	generators = []generator{
		// 1 byte,
		func() uint32 {
			return randUint32Range(0, 1<<8-1)
		},
		// 2 byte,
		func() uint32 {
			return randUint32Range(1<<8, 1<<16-1)
		},
		// 3 byte,
		func() uint32 {
			return randUint32Range(1<<16, 1<<24-1)
		},
		// 4 byte,
		func() uint32 {
			return randUint32Range(1<<24, math.MaxUint32)
		},
	}
)

// RandUint32 generates a random number that is also uniformly random
// on the axis for the number of bytes required to encode it. It first
// randomly chooses a byte length, i.e. 1, 2, 3 or 4 and then randomly
// generates a number whose encoded length would be that length.
func RandUint32() uint32 {
	return generators[rand.Int()%len(generators)]()
}
49 | 


--------------------------------------------------------------------------------
/pkg/util/util.go:
--------------------------------------------------------------------------------
 1 | package util
 2 | 
 3 | import (
 4 | 	"io"
 5 | 	"sort"
 6 | )
 7 | 
 8 | func SilentClose(closer io.Closer) {
 9 | 	_ = closer.Close()
10 | }
11 | 
12 | func GenUint32(n int) []uint32 {
13 | 	nums := make([]uint32, n)
14 | 	for i := 0; i < n; i++ {
15 | 		nums[i] = RandUint32()
16 | 	}
17 | 
18 | 	return nums
19 | }
20 | 
// SortUint32 sorts in into ascending order, in place.
func SortUint32(in []uint32) {
	ascending := func(a, b int) bool {
		return in[a] < in[b]
	}
	sort.Slice(in, ascending)
}
26 | 
// Delta writes the successive differences of in into out: out[0] is in[0] and
// every later out[i] is in[i] - in[i-1] (unsigned, wrapping arithmetic).
// out must be at least as long as in.
func Delta(in []uint32, out []uint32) {
	if len(in) == 0 {
		return
	}

	// The first element has no predecessor and is copied through unchanged.
	out[0] = in[0]
	for i := 1; i < len(in); i++ {
		out[i] = in[i] - in[i-1]
	}
}
36 | 


--------------------------------------------------------------------------------
/pkg/util/varint.go:
--------------------------------------------------------------------------------
 1 | package util
 2 | 
 3 | import "encoding/binary"
 4 | 
 5 | func PutVarint(nums []uint32, out []byte) int {
 6 | 	pos := 0
 7 | 	for i := range nums {
 8 | 		size := binary.PutUvarint(out[pos:], uint64(nums[i]))
 9 | 		pos += size
10 | 	}
11 | 
12 | 	return pos
13 | }
14 | 
// GetVarint decodes consecutive unsigned varints from data into out and
// returns the number of bytes consumed.
//
// Decoding stops when data is exhausted, when out has no free slot left, or
// when binary.Uvarint reports a truncated or overflowing value (it returns a
// non-positive length in those cases). Stopping there keeps the position from
// stalling or moving backwards, so malformed input or an undersized out no
// longer ends in an index panic. Decoded values are truncated to 32 bits.
func GetVarint(data []byte, out []uint32) int {
	pos := 0
	for i := 0; i < len(out) && pos < len(data); i++ {
		num, read := binary.Uvarint(data[pos:])
		if read <= 0 {
			// Truncated or overflowing varint: nothing more can be decoded.
			break
		}
		out[i] = uint32(num)
		pos += read
	}
	return pos
}
26 | 
// PutDeltaVarint writes nums into out as unsigned varints of successive
// differences: the first value is encoded relative to prev and each later
// value relative to its predecessor (unsigned, wrapping subtraction). It
// returns the total number of bytes written. out must be large enough to hold
// the whole encoding; binary.PutUvarint panics otherwise.
func PutDeltaVarint(nums []uint32, out []byte, prev uint32) int {
	written := 0
	for _, cur := range nums {
		delta := cur - prev
		written += binary.PutUvarint(out[written:], uint64(delta))
		prev = cur
	}
	return written
}
37 | 
// GetDeltaVarint decodes consecutive unsigned varints from in, treats each as
// a difference from the previously reconstructed value (starting at prev),
// stores the running sums in out and returns the number of bytes consumed.
//
// Decoding stops when in is exhausted, when out has no free slot left, or
// when binary.Uvarint reports a truncated or overflowing value (it returns a
// non-positive length in those cases). Stopping there keeps the position from
// stalling or moving backwards, so malformed input or an undersized out no
// longer ends in an index panic. Sums use wrapping 32-bit arithmetic.
func GetDeltaVarint(in []byte, out []uint32, prev uint32) int {
	pos := 0
	for i := 0; i < len(out) && pos < len(in); i++ {
		num, size := binary.Uvarint(in[pos:])
		if size <= 0 {
			// Truncated or overflowing varint: nothing more can be decoded.
			break
		}
		prev += uint32(num)
		out[i] = prev
		pos += size
	}

	return pos
}
52 | 


--------------------------------------------------------------------------------
/pkg/util/varint_test.go:
--------------------------------------------------------------------------------
 1 | package util
 2 | 
 3 | import (
 4 | 	"encoding/binary"
 5 | 	"reflect"
 6 | 	"testing"
 7 | )
 8 | 
 9 | func TestVarintRoundTrip(t *testing.T) {
10 | 	count := 8
11 | 	nums := GenUint32(count)
12 | 	out := make([]byte, count*binary.MaxVarintLen32)
13 | 	written := PutVarint(nums, out)
14 | 	out = out[:written]
15 | 
16 | 	actual := make([]uint32, count)
17 | 	read := GetVarint(out, actual)
18 | 
19 | 	if written != read {
20 | 		t.Fatalf("expected to read %d, got %d", written, read)
21 | 	}
22 | 
23 | 	if !reflect.DeepEqual(nums, actual) {
24 | 		t.Fatalf("expected %+v, got %+v", nums, actual)
25 | 	}
26 | }
27 | 
28 | func TestVarintDeltaRoundTrip(t *testing.T) {
29 | 	count := 8
30 | 	nums := GenUint32(count)
31 | 	SortUint32(nums)
32 | 	out := make([]byte, count*binary.MaxVarintLen32)
33 | 	written := PutDeltaVarint(nums, out, 0)
34 | 	out = out[:written]
35 | 
36 | 	actual := make([]uint32, count)
37 | 	read := GetDeltaVarint(out, actual, 0)
38 | 
39 | 	if written != read {
40 | 		t.Fatalf("expected to read %d, got %d", written, read)
41 | 	}
42 | 
43 | 	if !reflect.DeepEqual(nums, actual) {
44 | 		t.Fatalf("expected %+v, got %+v", nums, actual)
45 | 	}
46 | }
47 | 


--------------------------------------------------------------------------------
/tools/generate_and_check.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | BEFORE_DIFF=$(git diff | sha1sum )
 6 | BEFORE_STATUS=$(git status --porcelain | sha1sum)
 7 | 
 8 | make generate
 9 | 
10 | AFTER_DIFF=$(git diff | sha1sum )
11 | AFTER_STATUS=$(git status --porcelain | sha1sum)
12 | 
13 | if [[ $BEFORE_DIFF != $AFTER_DIFF || $BEFORE_STATUS != $AFTER_STATUS ]]; then
14 |   echo "Unstable generate. Make sure to generate and check in changed files."
15 |   exit 1
16 | fi


--------------------------------------------------------------------------------
/tools/parse_and_write_bench.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"bytes"
  6 | 	"flag"
  7 | 	"io"
  8 | 	"log"
  9 | 	"os"
 10 | 	"path/filepath"
 11 | 	"strings"
 12 | 
 13 | 	"github.com/theMPatel/streamvbyte-simdgo/pkg/util"
 14 | )
 15 | 
 16 | const (
 17 | 	goos = "goos"
 18 | 	goarch = "goarch"
 19 | 	pkg = "pkg"
 20 | 	cpu = "cpu"
 21 | 	benchmark = "Benchmark"
 22 | 	dashes = "--"
 23 | 	startSentinel = "## Benchmarks\n\n```text\n"
 24 | 	endSentinel = "```\n"
 25 | )
 26 | 
 27 | var (
 28 | 	validPrefixes = []string{
 29 | 		goos,
 30 | 		goarch,
 31 | 		pkg,
 32 | 		cpu,
 33 | 		benchmark,
 34 | 	}
 35 | 
 36 | 	readmeFile = filepath.Join(os.Getenv("SBYTE_HOME"), "README.md")
 37 | 	fWriteOut = flag.Bool("w", false, "write out to readme")
 38 | )
 39 | 
 40 | func anyPrefix(in string) bool {
 41 | 	for _, p := range validPrefixes {
 42 | 		if strings.HasPrefix(in, p) {
 43 | 			return true
 44 | 		}
 45 | 	}
 46 | 
 47 | 	return false
 48 | }
 49 | 
 50 | func main() {
 51 | 	flag.Parse()
 52 | 	var (
 53 | 		lines []string
 54 | 		hasBench = false
 55 | 	)
 56 | 
 57 | 	scanner := bufio.NewScanner(os.Stdin)
 58 | 	for scanner.Scan() {
 59 | 		line := scanner.Text()
 60 | 		if anyPrefix(line) {
 61 | 			emitDash := strings.HasPrefix(line, cpu)
 62 | 			hasBench = hasBench || strings.HasPrefix(line, benchmark)
 63 | 			emitNewline := hasBench && strings.HasPrefix(line, goos)
 64 | 
 65 | 			if emitNewline {
 66 | 				lines = append(lines, "")
 67 | 			}
 68 | 			if hasBench {
 69 | 				line = strings.TrimPrefix(line, benchmark)
 70 | 			}
 71 | 			lines = append(lines, line)
 72 | 			if emitDash {
 73 | 				lines = append(lines, dashes)
 74 | 			}
 75 | 		}
 76 | 	}
 77 | 
 78 | 	if err := scanner.Err(); err != nil {
 79 | 		log.Fatalf("failed to read input: %s", err)
 80 | 	}
 81 | 
 82 | 	outputFile, err := os.Open(readmeFile)
 83 | 	if err != nil {
 84 | 		log.Fatalf("failed to open file: %s, %s", readmeFile, err)
 85 | 	}
 86 | 
 87 | 	allData, err := io.ReadAll(outputFile)
 88 | 	if err != nil {
 89 | 		log.Fatalf("failed to read file: %s, %s", readmeFile, err)
 90 | 	}
 91 | 
 92 | 	util.SilentClose(outputFile)
 93 | 
 94 | 	bStart := []byte(startSentinel)
 95 | 	bEnd := []byte(endSentinel)
 96 | 
 97 | 	start := bytes.Index(allData, bStart)
 98 | 	if start < 0 {
 99 | 		log.Fatalf("couldn't find start sentinel")
100 | 	}
101 | 
102 | 	restStart := bytes.Index(allData, bEnd)
103 | 
104 | 	var final []byte
105 | 	final = append(final, allData[:start]...)
106 | 	final = append(final, bStart...)
107 | 	final = append(final, []byte(strings.Join(lines, "\n"))...)
108 | 	final = append(final, '\n')
109 | 	final = append(final, allData[restStart:]...)
110 | 
111 | 	var out io.Writer
112 | 	if *fWriteOut {
113 | 		outputFile, err = os.Create(readmeFile)
114 | 		if err != nil {
115 | 			log.Fatalf("failed to open file: %s, %s", readmeFile, err)
116 | 		}
117 | 		defer util.SilentClose(outputFile)
118 | 		out = outputFile
119 | 	} else {
120 | 		out = os.Stdout
121 | 	}
122 | 
123 | 	_, err = out.Write(final)
124 | 	if err != nil {
125 | 		log.Fatalf("failed to write to file: %s, %s", readmeFile, err)
126 | 	}
127 | }
128 | 


--------------------------------------------------------------------------------
/tools/update_bench.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Runs every benchmark under ./pkg, echoes the output to the terminal, and
# feeds it to tools/parse_and_write_bench.go with -w so the README is updated.

set -e
# Without pipefail a failing `go test` would be hidden behind tee's exit code.
set -o pipefail

TMP_FILE=$(mktemp)
# Remove the temporary file on every exit path, including failures.
trap 'rm -f "$TMP_FILE"' EXIT

go test -bench . ./pkg/... | tee "$TMP_FILE"
go run "${SBYTE_HOME:?SBYTE_HOME must be set}/tools/parse_and_write_bench.go" -w < "$TMP_FILE"


--------------------------------------------------------------------------------