├── .gitignore ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── data ├── .gitignore ├── Makefile ├── bing.base64 ├── bing.png ├── googlelogo.base64 ├── googlelogo.png ├── large.base64 ├── lena_color_512.base64 ├── lena_color_512.jpg ├── mandril_color.base64 ├── mandril_color.jpg ├── moby_dick.base64 ├── peppers_color.base64 └── peppers_color.jpg ├── docker.sh ├── dockeroutput ├── decodingperf.txt ├── encodingperf.txt └── realperf.txt ├── include ├── avx512memcpy.h ├── chromiumbase64.h ├── decode_base64_avx512vbmi.h ├── decode_base64_avx512vbmi__unrolled.h ├── decode_base64_avx512vbmi_despace.h ├── encode_base64_avx512vbmi.h ├── encode_base64_avx512vl.h ├── fastavxbase64.h ├── load_file.h └── memalloc.h ├── processdock.sh ├── results ├── Makefile ├── cnl_decoding_cyclesperinputbyte.pdf ├── cnl_decoding_cyclesperinputbyte.png ├── cnl_decoding_cyclesperinputbyte_png.gnuplot ├── cnl_encoding_cyclesperinputbyte.pdf ├── cnl_encoding_cyclesperinputbyte.png ├── cnl_encoding_cyclesperinputbyte_png.gnuplot ├── cnldecoding.txt ├── cnlencoding.txt ├── decoding_cyclesperinputbyte.gnuplot ├── decoding_gbps.gnuplot ├── encoding_cyclesperinputbyte.gnuplot ├── encoding_gbps.gnuplot └── linespointsstyle.gnuplot ├── scripts ├── avx512vbmi_decode_lookups.py ├── email-generator.py └── setupfortesting │ ├── README.md │ ├── disablehyperthreading.sh │ ├── powerpolicy.sh │ ├── setupfortesting.sh │ └── turboboost.sh └── src ├── base64 ├── chromiumbase64.c ├── decode_base64_avx512vbmi.c ├── decode_base64_avx512vbmi__unrolled.c ├── decode_base64_avx512vbmi_despace.c ├── decode_base64_tail_avx512vbmi.c ├── encode_base64_avx512vbmi.c ├── encode_base64_avx512vl.c └── fastavxbase64.c ├── benchmark.c ├── benchmark.h ├── benchmark_despace.c ├── benchmark_email.c ├── load_file.c ├── unit.c ├── unit_despace.c └── unit_tail.c /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.pyc 3 | *.swp 4 | *.dat 5 | .gdb_history 6 | perf.data* 7 | 8 
| # executables 9 | unit 10 | unit_tail 11 | unit_despace 12 | benchmark 13 | benchmark_email 14 | benchmark_despace 15 | 16 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcc:9.1 2 | COPY . /usr/src/ 3 | WORKDIR /usr/src/ 4 | RUN make benchmark 5 | CMD ["./benchmark", "/dockeroutput"] 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Daniel Lemire, Wojciech Muła 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, this 7 | list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | 3. Neither the name of the copyright holder nor the names of its contributors 14 | may be used to endorse or promote products derived from this software without 15 | specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # we target only AVX512VBMI Cannon Lake CPU 2 | 3 | ######## 4 | # I do not think one should use '-funroll-loops' as it may harm some operations. 5 | ####### 6 | ## Similarly, it seems that -ftree-vectorize is not obviously useful. 7 | ###### 8 | CFLAGS += -O3 -std=c99 -Wall -Wextra -pedantic -Iinclude 9 | #FLAGS+=-march=native 10 | CFLAGS += -march=cannonlake 11 | 12 | BASE64 = src/base64/chromiumbase64.o \ 13 | src/base64/fastavxbase64.o \ 14 | src/base64/encode_base64_avx512vbmi.o \ 15 | src/base64/decode_base64_avx512vbmi.o \ 16 | src/base64/encode_base64_avx512vl.o \ 17 | src/base64/decode_base64_avx512vbmi_despace.o \ 18 | src/base64/decode_base64_avx512vbmi__unrolled.o 19 | HELPERS = src/load_file.o 20 | BINOBJS = src/unit.o \ 21 | src/unit_tail.o \ 22 | src/unit_despace.o \ 23 | src/benchmark.o \ 24 | src/benchmark_despace.o \ 25 | src/benchmark_email.o 26 | 27 | BINS = unit \ 28 | unit_despace \ 29 | benchmark \ 30 | benchmark_email \ 31 | benchmark_despace 32 | 33 | all: $(BINS) 34 | 35 | src/base64/chromiumbase64.o: include/chromiumbase64.h 36 | src/base64/decode_base64_avx512vbmi_despace.o: include/decode_base64_avx512vbmi_despace.h 37 | src/base64/decode_base64_avx512vbmi.o: include/decode_base64_avx512vbmi.h 38 | 
src/base64/decode_base64_avx512vbmi__unrolled.o: include/decode_base64_avx512vbmi__unrolled.h 39 | src/base64/encode_base64_avx512vbmi.o: include/encode_base64_avx512vbmi.h 40 | src/base64/encode_base64_avx512vl.o: include/encode_base64_avx512vl.h 41 | src/base64/fastavxbase64.o: include/fastavxbase64.h 42 | src/load_file.o: include/load_file.h include/memalloc.h 43 | src/benchmark.o: src/benchmark.h include/memalloc.h include/avx512memcpy.h 44 | src/benchmark_email.o: include/memalloc.h 45 | src/unit_tail.o: src/base64/decode_base64_tail_avx512vbmi.c 46 | 47 | benchmark: src/benchmark.o $(BASE64) $(HELPERS) 48 | benchmark_despace: src/benchmark_despace.o $(BASE64) 49 | benchmark_email: src/benchmark_email.o $(BASE64) $(HELPERS) 50 | unit: src/unit.o $(BASE64) 51 | unit_despace: src/unit_despace.o $(BASE64) 52 | unit_tail: src/unit_tail.o \ 53 | src/base64/chromiumbase64.o 54 | 55 | $(BINS) unit_tail: 56 | $(CC) $^ -o $@ 57 | 58 | clean: # unit_tail is linked by the rule above but is not listed in BINS, so remove it explicitly 59 | $(RM) $(BINS) unit_tail 60 | $(RM) $(BASE64) $(HELPERS) $(BINOBJS) 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # base64-avx512 2 | 3 | Please ensure that you have a recent compiler. For example, 4 | you may use the GNU GCC 8 compiler. On some machines, this 5 | can be selected by the command `export CC=gcc-8`. 6 | 7 | ``` 8 | make 9 | ``` 10 | 11 | You should have a Cannon Lake processor or better. 12 | 13 | You can also run the software using a Docker container... 14 | 15 | ``` 16 | docker build -t avx512base64 . 17 | mkdir -p dockeroutput 18 | docker run -v $PWD/dockeroutput:/dockeroutput --privileged avx512base64 19 | ``` 20 | 21 | There is a bash script (`docker.sh`) to help. 
22 | 23 | ## Reference 24 | 25 | 26 | Wojciech Muła, Daniel Lemire, [Base64 encoding and decoding at almost the speed of a memory copy](https://arxiv.org/abs/1910.05109), Software: Practice and Experience (to appear) 27 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | *.base64 2 | -------------------------------------------------------------------------------- /data/Makefile: -------------------------------------------------------------------------------- 1 | %.base64: %.jpg 2 | base64 $< > $@ 3 | 4 | %.base64: %.png 5 | base64 $< > $@ 6 | 7 | JPG=$(wildcard *.jpg) 8 | PNG=$(wildcard *.png) 9 | ALL=$(JPG:jpg=base64) $(PNG:png=base64) 10 | 11 | all: $(ALL) 12 | -------------------------------------------------------------------------------- /data/bing.base64: -------------------------------------------------------------------------------- 1 | iVBORw0KGgoAAAANSUhEUgAAAFEAAAASCAYAAAAjQzL0AAAFEklEQVRYR+WYT4hVVRzH5402SSoz 2 | aoFJoOSAJswfMIgwcDAMQrBpUoQ2zkpoMeLSlai4dCFuatcYtlJEc9QEjUHaBCJKkRGOzsKNlTlo 3 | TjiDb/p8f+937tx73733vfsqCPrCl3N+/8657/fOOb9zb6UNzM3NLalUKn+oX4RqtbqBZj98G87A 4 | C/Boe3v7L4yxEf5J/0f8nqB/CbaCacZY6v06MMcgzT642eVbNKM8f5dkcFNEnqyJrYFxO+AndHfC 5 | ZfA6PMyzTdAmwQ9+EZ6Hb7oqE9g3wccMnAC6n+A1+B18zX2fu7k0FGsTZgDzaM2rMRjnLE2/h5YC 6 | cV3EX9SzOGd9zAew293mgW0lhqfwIXzf1Qngswjb9xooC9iEr+nukT/9ZpN4He7Efys8LIVibdIU 7 | MA3K3gKGfYimgP8ynuEqFO7CIdgDz2sw2hPuOg+Ur8Df3EFZPw5Xu9mA3Aer8skD5os0tqXoN0wi 8 | PqdtcAfyR67PTCJ6raw8jMODcBS/SdMk0VQi8esk/rKeAU4gr3eT5lciZ+FtVyWBQQmIYwrdl3A3 9 | XA3fdX0msOtPeNmHazaJy913C7wEb7o+byXWAV8lNpyFEdANhPFiKNza2CtQSVwFd8F1bjIg74D6 10 | XeOumgfKhRjfgw/o1wH9M6itnrsSMd2HUTGg30wSzZ/2kqsMirVBUnBzHFPQEkg7SNxZeIz+GtcN 11 | wAiyS58FzB3Y90Kd7XfgadjrZj3jOngPaoWOuHoeKF+Ft+HvtenKg9hbNO0+ZGESsWlLPIYhifrx 12 | kp+6PW87p1dWtCJcDlBytRL31cQELMFxoNMZGLawFREoaOEsx76edsI01eoF5A4PTQLjmGZoFcSf 13 | 
9KEMyEVJHHO3BNB/4Pa8JCaSgqyrjAExrMSs8zAOXY8iICuB38hAe0922m54BJ6EqgUhgVew1x0d 14 | ETCuwekHDdYKiN3uQxmQi5Kof/MFd9XcCySj/9jtedtZhSONuoKBTr9FCdeKTOOgu8lP15hQhZUo 15 | JUx1QAnsxq6EagsLxQkMwOkdqCpXFrqmLPRhDExalERB2yZs53AfszNXfRskBUxZSZS/isswTGxV 16 | 5H7ZU7Ak0uoM1LyCVWHaFdDOfqAtHba3zuzGCRRwtpVQForzISJocjfnAp8oia4yKNYGSQFT3T0R 17 | 38Q5iaztHF9tiUWBXW86mnNE80DdA6MqTL8XqqiouKjIjBCWfQbmgaBDsGECAvD9wkMTaGYMfMom 18 | savmUQN+dibSTVRhMGUBgH46iSFmXPPAIXN0IKsK63qzCp9OaK/FpUHg6wzyGW0h8NF2WOxhCaAP 19 | W6GIIYljKX1mEgVs0YWbfvRujBgKi+wDrlPS685F2fDTbURbtkeyQP8NqJWpZ7iMa6ebGoOAxVDv 20 | z6Ju5VqNqlS5wK7KZUnIAjY9yEwB9S66xH314+O23CQSkzjnFEuTddlWcUlfiQyyYzuhPq2+Geg3 21 | 69VuEgpXMemDQ/Mg6C34FfxVAxcBn5/hEN3CZY6Pkqh/Op6cOJVEvbMvhefiepibRIE4FZE0tD2P 22 | ierXVJmwuyV+usbYywVtvIhEr65lED6FrYRb6G6D+tylpayK+ww+gjfgmUql8i2cpl8IHuZf+xQm 23 | 8Ky62nxek0rhQ57f3lx4xrU0B6C+Xuk3nsL2KdQnvv8HSGQ/iYjOyCYw6qH/OFqrPv8hkBxtPxUT 24 | +8CArOo8zIrqc/kcfX20zX13/ntoa/sLfuPonSru/8QAAAAASUVORK5CYII= 25 | -------------------------------------------------------------------------------- /data/bing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WojciechMula/base64-avx512/1cb949021bf070faaed8e8c767819f35ffc7e78e/data/bing.png -------------------------------------------------------------------------------- /data/googlelogo.base64: -------------------------------------------------------------------------------- 1 | iVBORw0KGgoAAAANSUhEUgAAALwAAABACAQAAAAKENVCAAAI/ElEQVR4Ae3ae3BU5RnH8e/ZTbIh 2 | hIRbRIJyCZcEk4ZyE4RBAiRBxRahEZBLQYUZAjIgoLUWB6wjKIK2MtAqOLVUKSqWQW0ZaOQq0IFA 3 | IZVrgFQhXAOShITEbHY7407mnPfc8u6ya2f0fN6/9rzvc87Z39nbed/l/8OhIKMDQ+hHKp1JJB6F 4 | Kq5QQhH72MZ1IsDRhvkU4bds9WxlLNE4wqg9q6jBL9G+4knc/HB9qXmuG4goD89TjT+IVkimE/zt 5 | 6sYh/EG3WmaiOMGHbgQ38YfY3ibKCV6GMabHWY0bo+Ps5jjnuYlCczrSk8Hcgd5U1rONoDnG48Ov 6 | a2W8RGeMXAxiHfWakT4mOx81oRiG1/C5vYh47KSx5fZid4JvxxVd7MdIp3EK06kNNXYneIWtutgL 7 | 
aIasQUwkJE7wE3SxbycWR8SD93BOiL2YRBwRDN5FwOPchaqecZQTQQ4XAApz0FrFQSLPwQD8mlZN 8 | Et8L5841D62/cJVIi2cgPelEAlBOCYfYSxXymjKAXqSQAFRwloPspRp5dzOMHiTThEqK2c1OvGHI 9 | sg/30YUWKHzDKfZwEB+2xBn3gUSSwmA+MpluruYDySMPYD23TOrX0V/q+CPZYai+yHw8wKscbmhM 10 | D+IVfyevcMlkuvxXxGOphTD4Gi4iJ40C/DZtM12wk8Lfbes/oSN27mGPZW0RnVmvebxIMng3z1Bl 11 | uddz5Mh9wm8icqZIzPHfZDxW8qhotL6cUVh5zP74XOBg0MEnsgW/bfMxzyIOYdgSIuV5/JJtPmZm 12 | Slb7mI6ZGTLVQQafSKHUvp7BxFxhSD6N8UsH4An5aT+J3mNB1T+K3hj8YQ/ezRbpvY3CYKEwYFLY 13 | gvfTkQZ9qTN8nS3lIdJJZwTLDdNztfwUrTTDp+hllmnqrxo+sLqi1dWwuFPKYnK5h0we5c/UhhT8 14 | fF1FHWsZTis8dGAyB4S+67RF5wVhwC/DGHxvAqI4Imyv50Vi0YpjsW4l4AAuGii63yE+lhCHVlOW 15 | 6o79TxRN/ee64y/SHb8TO4MOvq3uYh6iO1oufiP0r0VnjtA9K4zBDzSdgKtjJGbyqBfG5dFguC62 16 | sZiZoLt0Qy3qvYzCKIZNQQYvXupdxGO0Rni5dLebl1wexuD7A4DuC+gprMwTxu2hwT+E7c9iZYEw 17 | 7lMaiBPeczAXT3EQwcdwTbP1Eq3RiyaPvcIe/4igj9C5NYzBpwOQKmzbh4IVF4dMviOShHfCEdxY 18 | ieKY8M5qCUCy8E4oxIWVnwcRfK4wdhqitiyk1JBHJc3UU4UT+HDRYADR1GEnB2s9WYrqssn41/Bj 19 | xcdrrEOVzRogS4hqOfVY8fI6qzWXYTAbgRwUVMvwYeUzzpKCnMGobvIeDRTuZyajiMLoMG2oRONf 20 | wnV5kNDNFH5ZKAD8SbPtFrHYaSr8+nkLgCXC53sCdloJz+RlAFYJv5bisPOG9Cv+U+F+O6AZM4Sx 21 | 2iz+QKZxWrgArSmEbiAIpwvQGdV/qMFOFUdRdTbUn6QCO9c4bajvJhy/GjuFyOqEqhhIZyUXWEk6 22 | esd4imTyKTIG/1e08kghNNEMR7WfgERUpTTmPKrmIdSXGupbiHu3dQFZCagy2MGXzCAekZcPySKD 23 | lVSYTwsf5QB9aeBiCWMJxcO0RPU5AW5UPuyJI9xhr/diz4ssF6ohGJXyFmu42Fj5MrTGMILgKTyH 24 | qpoCAipR3YE9cURFWOorUCVhrzWyKrFWwGg68hIXG79uGziG1rt0IFhPcC+qj6gioARVJm7sRPMT 25 | VCWG+u54sBNHqm19Ji7sZCDrv5gp53ekkcNGvHJvGB+zdVd+M60JRi/eREt9VIQqgfuxM5Q4VEcM 26 | 9R5ysfMAUaA78iFUzRmIfb2sw+j9m6m042lOEqS1hv+R3Y2svpSJCxJCn9hjR5ztywSgg7BtGwpW 27 | FHYLY+8CIB2/5Jppj5BvoE7Qz/a8bCVSrIv+quQrYCLVQl0NXVEpnBF6f4aVX+guvELAPmH7GMk/ 28 | ZX1BgKJb2szBnEJBEMFHUyY841SsjGcr7bGVabLC8z6dsJPC3ww1sxE9LfTeoAdmeumOPkNzYcUb 29 | 776Y6aebOh5Hg6m6l1MaZhYGOUn2sjD6MAmYyeIWfiqYhoKNLJNlaC/ryCUGvRhyWUedYfx7KIia 30 | ck4XfZ5ujMI4XewlxIpzMEL04w31k3STtEW4NWd6Uugr4yFEHt4Ielo4iRvC+P20R6QwTZPnFtpj 31 | 
I4dKi5veAlbwLPnM4NesZDs3Tcd9RgxGIw3jdjCeO1FQSGYiuw39D6A1CJ+u/wsm0pZA/STDEnY9 32 | A9DKMtRvZjStAIVOzOJMSAsh+YaMltGXGEChHVPYr+s/igsbPTmHP8T2IR7MvW46voZa0+2voLfA 33 | or7GdPtz6C0yHVfNt4S+9KewwXTJ8xtumWyv5T6w14pNIYTu40VcWHHzvvSe3sWFnsIq6foVKCb1 34 | qyOw2N2EnZJ7+5aRSFAYS2lQp3maLOy5WS61pyW4MKOwCJ/E5X8BBTMuXsW+tpITQQYPcXws8Zyu 35 | k420eOZyQSqqy8zDg4yH+cp2T2cYjp1sim3rTzEEO4/YPKNL9AvpD00K+ZTbnZXwc1KSh9FspNrm 36 | DbSZicQirwmzLMI7Qb7EnjxM57hp/TGmEUNjEljAZUNtHW/TGvhA+J6QCx4gicVcNT2r7TyIgoEi 37 | Gf+99CeVLiTSDKimjK85QSH7qCJ4Cr0YRi9SaI6fG5zlIAUcwS9d34Nsen9Xz3f1hRRQJF0fzVCy 38 | yaQdcZRzil18zCUAPtHc3s3mTYIRzWCGkEEH4vFSxmn2s5kSJDgOGP/l4Ii8aOHetzeOsIhiNAX0 39 | wVq28O3lwXHbklnIeQJ/PHJhQbh72YXjts3Eq4n0t5h7BL+mzcVx29Kpxy9E70IvV5h7qiEJRxis 40 | wC+0feTgJkAhg3d098S/J8IUfhziOUAaouscoYJmpNIO0WXSuYYjLLpxFb9U85KNI4wyKJWKfQKO 41 | MEtmm33sXCCbCHC4mMxZIWpx/aglEeNwM4J3KNb8jvmaDTxBIt8jhR8vD22IpYYr1PBD5HA4HP8D 42 | xVcxdwELEFUAAAAASUVORK5CYII= 43 | -------------------------------------------------------------------------------- /data/googlelogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WojciechMula/base64-avx512/1cb949021bf070faaed8e8c767819f35ffc7e78e/data/googlelogo.png -------------------------------------------------------------------------------- /data/lena_color_512.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WojciechMula/base64-avx512/1cb949021bf070faaed8e8c767819f35ffc7e78e/data/lena_color_512.jpg -------------------------------------------------------------------------------- /data/mandril_color.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WojciechMula/base64-avx512/1cb949021bf070faaed8e8c767819f35ffc7e78e/data/mandril_color.jpg -------------------------------------------------------------------------------- /data/moby_dick.base64: 
-------------------------------------------------------------------------------- 1 | Q2FsbCBtZSBJc2htYWVsLiBTb21lIHllYXJzIGFnby0tbmV2ZXIgbWluZCBob3cgbG9uZyBwcmVjaXNlbHktLWhhdmluZwpsaXR0bGUgb3Igbm8gbW9uZXkgaW4gbXkgcHVyc2UsIGFuZCBub3RoaW5nIHBhcnRpY3VsYXIgdG8gaW50ZXJlc3QgbWUgb24Kc2hvcmUsIEkgdGhvdWdodCBJIHdvdWxkIHNhaWwgYWJvdXQgYSBsaXR0bGUgYW5kIHNlZSB0aGUgd2F0ZXJ5IHBhcnQgb2YKdGhlIHdvcmxkLiBJdCBpcyBhIHdheSBJIGhhdmUgb2YgZHJpdmluZyBvZmYgdGhlIHNwbGVlbiBhbmQgcmVndWxhdGluZwp0aGUgY2lyY3VsYXRpb24uIFdoZW5ldmVyIEkgZmluZCBteXNlbGYgZ3Jvd2luZyBncmltIGFib3V0IHRoZSBtb3V0aDsKd2hlbmV2ZXIgaXQgaXMgYSBkYW1wLCBkcml6emx5IE5vdmVtYmVyIGluIG15IHNvdWw7IHdoZW5ldmVyIEkgZmluZApteXNlbGYgaW52b2x1bnRhcmlseSBwYXVzaW5nIGJlZm9yZSBjb2ZmaW4gd2FyZWhvdXNlcywgYW5kIGJyaW5naW5nIHVwCnRoZSByZWFyIG9mIGV2ZXJ5IGZ1bmVyYWwgSSBtZWV0OyBhbmQgZXNwZWNpYWxseSB3aGVuZXZlciBteSBoeXBvcyBnZXQKc3VjaCBhbiB1cHBlciBoYW5kIG9mIG1lLCB0aGF0IGl0IHJlcXVpcmVzIGEgc3Ryb25nIG1vcmFsIHByaW5jaXBsZSB0bwpwcmV2ZW50IG1lIGZyb20gZGVsaWJlcmF0ZWx5IHN0ZXBwaW5nIGludG8gdGhlIHN0cmVldCwgYW5kIG1ldGhvZGljYWxseQprbm9ja2luZyBwZW9wbGUncyBoYXRzIG9mZi0tdGhlbiwgSSBhY2NvdW50IGl0IGhpZ2ggdGltZSB0byBnZXQgdG8gc2VhCmFzIHNvb24gYXMgSSBjYW4uIFRoaXMgaXMgbXkgc3Vic3RpdHV0ZSBmb3IgcGlzdG9sIGFuZCBiYWxsLiBXaXRoIGEKcGhpbG9zb3BoaWNhbCBmbG91cmlzaCBDYXRvIHRocm93cyBoaW1zZWxmIHVwb24gaGlzIHN3b3JkOyBJIHF1aWV0bHkKdGFrZSB0byB0aGUgc2hpcC4gVGhlcmUgaXMgbm90aGluZyBzdXJwcmlzaW5nIGluIHRoaXMuIElmIHRoZXkgYnV0IGtuZXcKaXQsIGFsbW9zdCBhbGwgbWVuIGluIHRoZWlyIGRlZ3JlZSwgc29tZSB0aW1lIG9yIG90aGVyLCBjaGVyaXNoIHZlcnkKbmVhcmx5IHRoZSBzYW1lIGZlZWxpbmdzIHRvd2FyZHMgdGhlIG9jZWFuIHdpdGggbWUuCg== 2 | -------------------------------------------------------------------------------- /data/peppers_color.base64: -------------------------------------------------------------------------------- 1 | /9j/4AAQSkZJRgABAQEAZABkAAD/2wBDABALDA4MChAODQ4SERATGCgaGBYWGDEjJR0oOjM9PDkz 2 | ODdASFxOQERXRTc4UG1RV19iZ2hnPk1xeXBkeFxlZ2P/2wBDARESEhgVGC8aGi9jQjhCY2NjY2Nj 3 | 
Y2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2P/wAARCAEAAQADASIA 4 | AhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQA 5 | AAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3 6 | ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWm 7 | p6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEA 8 | AwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSEx 9 | BhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElK 10 | U1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3 11 | uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDOit4d 12 | ikJyVHc0/wAtOy/rWta6fCbaJipJKAn8qtR2UK/8swfrXnvGVFopP7z1PrKWxz/lpj7v60vlJ6fr 13 | W3dabFLGTGAjjoRXPvKyOUYEMDitYYmrL7T+9mtOs5rRjzEvYVbsEsmYJdRcno24j+tU2eSPAdCC 14 | elRtcEfeGPqKr21Z/af3sJNzVlL8TUvdClDF7NwyHop6iiCwhDxxzxZbo3Jqpa6vPbjCNuX+63Sr 15 | dpetcXoMq7W6kDpSdevb4n95yVJYiKs3oaiaLp7f8u//AI+3+NP/ALC0/tb/APj7f41n32utE/l2 16 | qqSOrNz+Qql/bOon/l4I9tq/4VUcRUtrJ/eOFDEzV+a3zZszaRp8S5MGP+Bt/jVVdMtpWHlwbV92 17 | PP61SXWJ2INyolA/4DUd3qc9z8qExR/3V7/U1DrVm/if3j9jir2cn63JL57CFzFb26yMOC5dsfz5 18 | qqssfe1i/N//AIqouScmlFP21Rfaf3s9CMGlZt/eWPMt8c2a59pGpAIJGCiMR5OMljgVFilo9tV/ 19 | mf3sbj5v7zfg0azaNSyb8/xBjz+tDeG9KLEm1yT1PmN/jWTZ309m+Y2yndD0NdJZajb3qfIdko6o 20 | ev4etXCvU2cn97PIxNOtHVttFL/hG9KA/wCPX/yI/wDjTLjQtHt7aSZrTOxc48x+T+dac04RTk9K 21 | wNV1AswRiPIH3ue9U8VVTspP72cseZ9Svo2i29ykhubbOMYO5gP0NWrrRLP5Ugt/JbP3i7EH261b 22 | 024ihiXedquuRVxzJcyIu0rDgnPHJrP6zWe03f1YnUkpXRzkmlPCp2KHx1Ck5qnhMbHXkdTyDXWh 23 | FaQBcgj171Tv9JiuJEmwVkTuON3san6xVe8n97OqnjZLSRiwaRby4Zos55+8a0Y9A00j5rfP/A2/ 24 | xqxAO3THUVbRlrOWKrbc7+9nJOo5SuU10HTNmz7N8vXHmN/jTZPDmmmMhLfa2ODvb/GtJTnkVJkY 25 | 5pRxNZfbf3sXPJ7s4+40y2gyjRYYHGdxqeHTrHaN0PP+83+NGuXAbUooU5LelX40CbCR+ddDxNdJ 26 | Pnf3s1q1E4prQS30uz/54cH/AGj/AI1O+l2OD+5/8fP+NSrKFFKZg3GKweIrv7b+9mar1ErKT+8n 27 | 
s1xaQ/7i/wAqsoV/ubqzrS7g+zQr5gyEUfpVxJdvKmuSMuWV2bNXH3LeWpYrgAZrkIV+3ahLMMhA 28 | /P1rU1nVCR9miOXbjim2UAhjWNR7n3NdCerl3Nm3Rp36snWPfjcMgdKe9rFKu10BFSqOQqjJNWrq 29 | KK1s98rYftRGMpXZ56k73Rx2tQHTWV413I54HoantHkNibpxtkkGAB2FQT3TarqiwRDMScs1XtQI 30 | VFiUYAAGK26JPc9bDylUlGM3qUO+afmmdKXOKR64EmjJz9KciFiAOSa3IbKGCxeSYAqB+ZosctfE 31 | KlZPVswwc07NITzwKVMHmg2ANThTHwTSrRYB1OVirBlYgjoRwRTRzTTmkJlqe8vLqWGNWGxmAkbo 32 | QPWpLvy1RlRRxznrVIEipEdN2ZFP1HWlscNTC8zvH7jWtZ/M8sAE4XawrRtncZRxlRyprKsXHOHD 33 | 85J7/jWrEflrNO0jyakHCVmWFiLyK5UfL09qdKQ3HekE2BgUzJJya3lOPLZGZRniImyOAev1p8aV 34 | NOo4NIg21ysfUdgCoZ5QqE56U+eQRxljXL63rSxoYomy5447VUIOT0BRcnZEUJ+2eIS+crEv61vl 35 | Q4xXN+FvnkncnJ4rpAea2npL0CrpKyGbSp2k81YjRStQupyDViPGPwrOfkQihaRD7OnH8Iq4keEK 36 | hiAR61Us2Pkx5/uirqHjJrORV2paGHDYvDqMryktj7h+tasQ2jABLGmTAPdJzg4qa91K00q2PlEP 37 | MRyxraK59y5zlVkrjzfx6ajSTKCSMjPauQ1fW7nVLjYhOGOFA/lVLUdTmvZCWY7ewq7oVqBFJdSK 38 | d3SPP6mt0uVanVQoe8l1JbWP7HF5ULZkf77j19B7VOy7Bt9KQMFcmkZ95JPWp3PchBQ0ihp6UmaU 39 | 1JaxCWcKc7RyfpSHOSinJ9DQ02De6/LkkZ+gqXW7nhLZTwoy3NWYU+y2sk7Z6ZAOKwpWaSRnbkk5 40 | NTe55eGTr1XVlsthAKUxsqh9pA7HFNWn4pnq2GYzSjil70oHtQIUHPWnU3FOFJiEIH40mKcRTcUg 41 | FR3jcMhKsOhFbtjdi4jz0dfvD+tYQ61Nbym3mVxzjr7iokrnPiaCrQ8+h0YPeng8VTguVmTKnkHB 42 | HoatIcjJIxWOx8/KLg+WQ9xkU1RxUE+oQRDg7vp0rNfVLlpyIgqx+uMmqUWwuReLLxrawCocM7Yr 43 | hmZnbcxJJruryxTVEUXhc7ehXg1lnw0IZAxbzY+/Y/l/9euulKMY2Nac4JasqeF5dl3Ih/iXP5V1 44 | LdMg1WtNPt4FBjhRT6gc/nVwRcd6ib5nc56k4yldCI29e1WogCtUljMTHJ4NXYCNuKxnogiZtqcx 45 | Rn/ZA/Sp5ZhEvzcfWprKzVIYH3E4UHB+lLe6dFesxmZ/m7KcUvdNHTfMcTqOpTNqDSwOVC/KMelU 46 | bi6numzKxNdTeeF7ZUJilkVgONxyK55rSWOcQkfMTge9dcJQex6EIwe3Qs6Bo39o3HmTcQR/e/2j 47 | 6VrX0gW5kRAAoOAB2ArWtbZLGxSFByByfU1jagmy6P8AtDcKhy5pF4KalWa8isfegcUGkz3NUeuL 48 | WnpkGACQcuc59qp2cSy3G1+ijJBOK3bNduGzgZ6egqZOyPKx+JXL7OPzF1gmOxCf3mA6fjWA45rq 49 | NXhEtgx7qNwrl2bJotYvL2vZ2QgHPNTRBc/N0qIHpT15pHok11CoiWWLleh74quDU8LhW2ScxtwR 50 | /WoHQxyMh6qcGmQrrRig0A80ClIpMYvtQRjmnwgeauehIzW7c6NE8RaL5X68dKIxctjKpWjTaUup 51 | 
z4UntS4wea6G30sbFZgDx0xWTqUey6KhcHuMU5QaVxU8RGpJxRWguGgl3jOD1HqKr32uTvMbdV2A 52 | H8x61oahaiJ8p0HFYWrRfIlwvDKcH6UlFc2plXpQqxVS2pet5yBuk+Y1oWw3HcefasGzuGl2oegr 53 | ftgRRNWPHxEeXYvoB6VKF496iiq3FGWGaUU5bHFYrrFh+KtLACuQcn0pn/LQcd6LhmhI/MVrBJJ3 54 | GRvEQSMVGFKdPyq/YnzgS/Wo7yNVkO3AHelOC5boaXULQf6HD/1zX+VPaorV1+yQjcP9Wvf2pLq5 55 | SGMsT09K43udm7KOqXAjQ5NYGmOt1rMY67ctVLVtWa7mYR5AzVvwjETeTSt/CmPzrpjDlhdnQ1yU 56 | 33OjuHxk5qjqFv5tiJlHzx8n6VecCRtvWpYwCNmAVIwRWd+XU5aM3TmpI5u1ge7mWOPvyT6CtCaw 57 | t4nxhnTGMDkg+tWLO2FiJVHJLHB9u1OVCzZNaOel0deKxLqStF6EBt4Yvn4yw5JGOKsW8RiuIooy 58 | /l4+Vc5xVlFyOKkSBRhl4K9PSslO71OJxuSHLkH+FTgj2rK1DTjMXlhI3KM7e7VsKZT91vyoYNxu 59 | 6iqc9LpF0pypS5os5BTkVImciptShEOoyqo4J3Y+ozUcIy44qz6OE+aKkWZ4f9HV8AGqc7FyrnOc 60 | bT+H+RXWTWaTadgAbgvH1rk5htJU8YNXKNjChVVVPyEWnVGpp+ahm5ZtF3Sr7EV1kUgEBJPSuRtp 61 | PLfNXZL6SRDHnC04y5TkxNB1WjcguQ0bNkAD/CuevZ/Nu5Jhg44FMa5cLsDnb6VXd/lx3JzSlNy0 62 | Ko4dU5OXcfJcPMPnbmqtxH51rKnqtamlad9u35YrtHX3qBrdohOjDlDg0mmkmbc8LuC6HO6eec11 63 | NqS0ak4yRXLaeAXx1wa6ezz5YzVVNzxcUvdRoRjpWrbp+6FZcQ5APetmLiMYrWgjz47mdcfLMahe 64 | SLIhlcKpOQc/d/8ArVLcnMzdjWLqDTS3Ihi3MTxtHSofxMEruxYuNRkicw2Thj08zqP/AK9EUTyM 65 | HuHaV/VjU1vopgtWmnf51GQM4H0pYTkA+tTLsOWmiM6CMi2RlLYCjJHbip4sHqc1RsNRNuFO/MRG 66 | CPWny6jbLKTGwCn9KzcTVxldjdXtLT7O8rxqrgfeAwai8KMDDcEDncM1laxqhuf3Mf3R1I71c8Kl 67 | lMhU8Hhq15WqepvFSVO0mdHuw7Ed6lifpk81UMmFOeBUMl2kWS8irj1OKwkuYySb2RbDbrlwTwTU 68 | iqVfpVWFxKiTKQQ3erQfLZo6WCS1LSDIAHepXUKpqCMkDNS5LA5qYtK9xsmtpAgJNI77jn1qNBTq 69 | Uqj5eUEjnNTJOpzE+2PpgVDA22VT71Z1vC6guP4owT+ZqmpwQa2j8KPfoO9Jeh2Vkwe2A65Fcrqs 70 | XlXUqjgZzirUF/IigI2B6VXvHMzl2OSa0c7pIxoUJU5t9GUFPFOBoS1ndswxsy55OOPzq2ulXbIX 71 | woAGeuSfpjNFr7GsqsIaSZXWpMntUR3Ru0cgKupwQe1LuzUNGqkmKeBk0w5PXvTsd60NLs1nl3SY 72 | 2D9aEruwSmoR5ma+iRfZ7He3BbmsLUbsA3Ug6Ek5+laOr3PlqIY2woHQVy2rzbLRh3c4rR6tRRx0 73 | o2Uq0upS08kNuFdHZyjyxiuXtJNnXFacN6EwOoqpxucdaPPHQ6i3lBrVspCVOTxXI2mpiWYRwozv 74 | 6KM1prqNxayiH7NIXYZHIxShLlep53s5J7GjfkRtuHJJwB60WsEVhA93cEbvvM39BUdtHPNcI9yq 75 | 
ggZCr0FWdTsBfW6wmRkVW3fKOtF+ZtjSsY0t3JqE3mSEhAflTPA/+vVqHpUD6VcWw3ROJ4x1wMMP 76 | w71JA+RjPapa1M5nFT219DFl4H2HncvIx+FUt7yEKMkk4AHU13iKotkY4GEHP4VS8uKJzd+WvndF 77 | O0AgUU6t+h6sVKbsjCi0cQxiXUZDFnpEv3z9fT9ackrQ/JZDyd390kk/nU15OZnJbmm2oCP5h/h6 78 | Zq277nZCjGKtYurNJbW2JHZnbksTk1mTN5hJbGakuJWlY88VCELuqDqxAoSSNbKKOrtI/LsoF9EX 79 | +VWetKsY2bccAcU3kHBArlZ4F7tk65xU8dVUPGKtx4xWezGStz0ppFL6UMflNTJ8zuM5nW2zqQH9 80 | 2MD+tVA3tT9WfOqze2B+gqFDmuyK91Hu0NKaLCsRUucgVXWpQQB1qWdIjyyxglTkY6VuaPcGWNon 81 | /A+uawyrSr8gJzWjp7PDJEzDBxhh6GqTseJj6cItSjuyPxBBsu47jHEq4b/eHFZ4IIrotag+0adO 82 | oGTGBMv9f61y0b8U5I6sFV5qdn0LSAEitW2uVt4Sny/WsdQ5HyqT+FPZZVXc6OF9SpxUbbHXJRmr 83 | Nj7mQySE5zmlgtoZbe4meNJXhRmUMMgHHpVO4u47eEseWPQdyaZpEWqN5ki4WOU5YN39qdmlc58Z 84 | NKnyJnPA+lO8xsYBrS1DRLmKQvDD8h52hs4rKKsrbWGCOoNdKalscClfY7/w3ZJZacjbR5so3Oe/ 85 | 0rWNuHywADn+LHNVLVgIIwP7oq/G421wxleTuY1NxYVZWYEcYwOamIYjjFMUjNTriuinroYNCKMD 86 | FYN8qwajIq4CsN2PTNdAa5bV5863Kg52KoPtxn+tXJaWQpK6ZB5nmRxg8RhRx68Vn393k7V6VYeT 87 | baoF6lR/Ksqblqzikj6CnBRVxhbdyadu+XGajx09adzVmqE+lTafGZNSt1HPzhvy5qHmtHQE3XzM 88 | cfKnH1JpSdkzGs+Wm2dOPuD6UwgEcjNSP0pmK52eEgjXmrScCooxVhR0rGTuyxaRuhpR1pk7iNCx 89 | PA5qFqwZxGqXKnVLk+khH5cVCt0oPHWlW3+2XTMq5Z2LE9utbNrpEcY5G49zivR0SsejPEKgknuZ 90 | gnlILeWwUfxEVPZRS3cgLkqnpjGa1jZRlTgbT7cVX897MiO5ZmQ8Bj1Wp9DKWNc42juXoIghAI+l 91 | LcfJKSACNob8f8iqOn6s0pka4CCNG2jHf3q9LMlxIrQspXZyRUtWPOmpJ+8acLLIkRxlWXaR7Vyc 92 | 9otvqEkBz8jYH07V0unkmLaSCV6VBqlgst7Fc4O3Z+898VS1Vy6VRxuY5cmRY0+6PSta3YhBsJA7 93 | n1rKCKrvNGxaHOM919j/AI1pWw6elZtXLqT00K+o6Lb3+JFBimXow6H6irFqhjQRMNrIMYqZr+3j 94 | O0vk+1BlV8NjHvSk9LGbdRr3thXj3DBGa5rxFpy+WZ0GGX0rsbMRyjghiP0qrrNkHt244bitKcXF 95 | c6JhLldyhpk/m2MDDugz+VacUnGK47RrloS9vk5RiBmuhS4GM5rOcLM6px1NdH96sRvkVhm9CDrS 96 | /wBrqmBnJPYClFyiyfZt7G8XA5zXBMt9f6jeX8SFbYuT5j8LgdMevSujkv08otMRjH3M9aw76/mu 97 | 12khYlHyovAFaxnKRvRw0nq9inuZ419MCoXT86cjERJ9BTiAelUeyldFanY4zT5FAOaDgRjFMViM 98 | mtPw8D9plb2H9ayzx1rY8OglpW7Egf5/Opn8Jy4p/umb5OaF5NB44xTk61zs8ZEijmpR1piCpBWL 99 | 
ZQo61i+KLz7Lp5VThpAQK2elcd4lc32swWychRzWlCN56lRtfUs6NamK2VmHzuMmtmPCrjgZqKGL 100 | bEo9qGX5uvNbttu5yylzScmWlVT0I/OqmoJBfR/YYXQzE5b5h8o7mniVE+9xVTUba1vQHhLRXicx 101 | yrx+ftVRlbQqMdbnP6rp7W94kTiRYSQu/BwBXTWkQWNRGBtC8ewqKM3j2wjvoYZuMNsbr9Qcfzqx 102 | AFFv5cQa3K4ADfMSB9CaJNtF1JOS1Ldn8rD+GrlwiSJ5bDIPaqMc0KfL56l0wWU4yPr6Ut3dsIgE 103 | 8sTbhgbgeP8AGknpYzRRMLQ+daRlV8vlC3UL7Ad6o6hMYLfyYyzM743ngjHXGMew/GtQzC2QmTfc 104 | OT0jO4/iT0+lY1y1/NqbJPat5CZ8sxIxU57k45NC3uawWt2MtISXG/8AWtuFflAyPyqjHE+7mIoP 105 | 9rrWjCoAxUSd2OtUlLclVSjBlJBHcVNdTia2AYANnn3pEjOOKhuEIjbHXGRVapHP5HCXL+RrM23p 106 | vPStJbokCsK5kMl5LJ6uT+tTJc4Wuhxuj1aaVtTTe7bdgdfSnpKVO7OWqpAjbQ78EirSDioaR304 107 | aXHDcxJY5JpxUYpKeoLcCpOhKyKAJVQB6U5W7GkAwq/QUyQMo3A4pmd7Ikc5phJpqyhx1571G8h3 108 | YRSx9AM00gcla4SGt/wyn+j5/vOT/KsCSGcRmUxMEA5Jrp9AULYwe67vzJNTUfunBippxsjUP3s0 109 | +OmY5qRa5WzzkiYdKUHim9qRmAHWs0tBkV3MIomOe1cjphN1rU1w3OOBWvrdyVt2xWRoAwrv3Y11 110 | 0o2g2Vb3GzpZXAAC1Gp9TUYO7k0vQU7HK0KzAnkEmlXA6DFN6dakTmnykscOeM04LjgGmDgk1PAh 111 | dgcUkruxJFe2MV7amK5j3KR8p7r7g1jWtsbQNbebIVRuNxxwa7VURowCB0rm9TiWHU3C5AKg1tKH 112 | KtzZNpWFt+AM5NWwgPbiqCScjBq/CwYYziue2pnrcesJPAX9KXyXXkjitG3ddmDjNJcTIFIxW/so 113 | 2uVYghPf9KinH3j2FLG/P1pkrZUj1qOZctgaPNb1PLvp0H8MjD9alsIN7+Y4yi9vU0uoJ5mq3IHe 114 | Vh+tW0AVQiDAXpWzeh61CF9WWFO41ItRxA55qbGKyZ6cR20dq1tI00zZkboOlZ1sm6ZFPTNdfaol 115 | rZnkAAU4K71ObGVXThyx3Z571UfSmSsdmOaNrBAc9qIyqMXlDHbyFAzmkjWcrK4024ttry/PI3SN 116 | T0+tW5Yp1CMJIoGOB5SnH54qDTR5tzLcZG6Nflz61eitorixXzQpkBLM/qc027PU82c3cy9UDiYI 117 | 8oIwOFORXYWEflxImPuqB+lcjLAy3aKFH7yQBTk8LnHQ/Wuzt+FNZ1XokY1dkS456VKtRj71SCuV 118 | mQrHA4qOZ8LTn5PFQTk04oRha437hqqaLxB0qbXwfs59+KpaLJwyn1rsivcOjlvTOjjcbae2CKz2 119 | faKfFOSMHP1pI5eR7lvODTt4Aqr5+TzS+bn6UMhxLUTZ5PNW45tgwB1rPibB9qsA/L6VKdjOxeS6 120 | IPJzWXrDiSVH74walDHr1qpcgsc1Tm2rFIijJ/rVyN8DrVNVITNToeOTWYmi9FM4wAalL55PJqnG 121 | xB61Mr59zT5hpEobB60jHIpgPz1HdS+VbSyHGEUt+Qqeo/I4ZmL380nq7H8zVyHPvVCD75PrWhEK 122 | 6me5QVkWk5PFS1FH0qVTk81kztiaWlxBpgx6CrWsXxEfko3Uc4qtp7hUbPHvVK+lV5mK9OlF9DBw 123 | 
5qt30MNJwRjPSkeYAgg1A9vIRkDNMhba21xz71rZboy9o37rLUMmGZIySWGQPete1uIkUBldQ3J5 124 | 6e1ZJkDjrj3HWpHPzLDauZGPDAf41LVzGpBF9ZoLzUI1PyPG42AH7wHPNdJB0auV0i2YavHvUDaC 125 | cYA7Y/rXVwdxXPV0dkcdXeyJVHzVJTFp9c73IImPzVFNzUp5c4qOUYq0Sc34jbESr6msjTpfLnx2 126 | NaPiV/3yJ6CsVWKsGHbmu6C907IL3UdUrCReKEQBjk1QtZjhXHQ9a0d4wGBrNqxi1yuxG3D4Bqfa 127 | duaimABDip1O5ODSJmCOA3WrakFRVAgo/J4qzCwKgHGaTMZRJW4XIqk0mXxircrYX0rNLDeetIcY 128 | XRaGCBin4x0qBJML3xmpVfIHWkS42LCkU9W56CoQc96lA5HSkSyQHJqprLbNJuWPHyY/PirijFUP 129 | EY/4k02Pb+Ypx3QR+JHH27YdeevFaaYwKyEBwSDyORWpayiVMg/WuqR7VB9C0vTrUinHFRLx3p4N 130 | ZM7UWEkKjA4FV5zjOOtLuxUErUJDb0Et9m1Q5C8DO6m3dpG/cexFaa2sjQxny1xtB6e1V5bP1UKf 131 | UVCmrnnJtmHIrwHDcjsaSJuSwYg+1actgZBgy4HoRmq40wL1lJ/DFbKSaKs7ml4bTzLuaUkkqgXk 132 | 56n/AOtXRxcAn3rI8O2wgt5Wzne3X6VrRHK59TmuWpqzz6zvUZYTpUmMjqKjWnHkVityOgip1JNR 133 | zVKg4qveSLDA8jdFUmqWrFY4rXpPM1JwOi8VnU+aQzTvIxyWbNNAzXopWVjuS0saFhJ8oB7HFbCY 134 | MeOQTWFYkB2ye1bMTjy8E1E0RURYUZQg0kEm1iuc0qONm4elV0J37scGsrEWui7Jgr1psJ55H0pe 135 | qZ6ZqJP9ZkYA9KRja6L+Mpjgg1nOMOcCrbP8vGazw/zk9qEOCdmW48FBUnlccHmoou1Wl578VJlJ 136 | iIGHU1OpOcnFNGM9aeKTM2yQfpWf4hONHnz7D9RWiSMVjeKZQul7R/G4H9f6VUfiQ4fEjmrSLzWK 137 | 0jeZZybl4+veo7eYwyBhV+6uLa605h92dGDA/wB4HqP5V1dT0ua2qFh1GJhiTKn86nE6MMq2ayra 138 | MMcmtFE+UYFS0jtpTk1qPMhJ44ppAx60pXFJnrika+p2UEQNpFkfwL/KoZrZTnj9KvW0bG1hwD9w 139 | fyoliIGSMV5/JJa2PG57MwJ7bAJA6VnyELnNb1ygx2rEuFDzhAB8xxW1N3OqnO61NWzHlWKgDnGf 140 | zq2nAAHpVZf4Vx3qyuM1EmcD1bZZQcUp9KE6U7vWYCgcVheKrjydOKKfmkYKK3u1cZ4sufNv0t1+ 141 | 7EuT9T/9bFa0VeZdNXkjAC09R7UoHFO7V3nco2HxEhwRWtA2Y+tZGfarltL8lJomaua8Jyp+lMj+ 142 | 8RgcdKS2fgE0btsp561ic8kXMZSo1XmposMlMI+apMPISc4jwP0qhvXkgEc+tXpWwhzWUzYZue9V 143 | FXN6UdC7HOoPU8VaimDYxWVHyR/Or0GQcL09aUkTUgkaCkcU9cl8CokB9amTg1kzjZK2AK5PxPdr 144 | NcRQI2fLBLfU/wD6v1ra1fUksYPWRuFFZVpbJP8AvJAszsOXbmtKfu+8zsw1B1Hc57NLmurfSI3j 145 | IMCfguP1rLutEwSYWKn+63IraNWLOx4ea+F3KlkcrjPetIY24HFZQins3/exsF9e1XY5Q68GqZtR 146 | lZWe5KxpFXjmnAZqQLgdO1TsbpXO8tJFFnCD/cX+VMupFK8VBbuBaQ/7i/ypzR7/AL2celckq7as 147 | 
eHyq9zE1K5WMEZ5rAW8/01T1wSa7gWcA5MKE+pUE1II1XoAKiNRRWx0RqJRtY5yC8QygZ7cGrsMw 148 | Y8GtjHrTDBExyY0J9SopOaZi0iJGBXrT1INL9njxwCPpSCHb91z+NQRYbdTpbW7yyHCopJrzi4ma 149 | 5uZJ3+9IxY+1dzq+lzajCIVuhEmcsNmc/rWK/g+YD93doT6MhH9a6qMoRWr1N6TjHVnPDFLWvN4V 150 | 1KNcp5Mp9FfB/UCs26sL2zz9otpEUfxYyPzHFdCknszpVSLIs1JBJsfHaq4b1pwYY5z7VQOzN2yc 151 | McZ/OpLkYfjFY9tdGNxk9KtXF8HVcfzqHHUhxNCG42ipEm3Z3f4VlxShz26VOJAgzmpcSPZos3Mg 152 | CfM2B9ayXkVn+Un86jvbx5P3YPy1TVyDyauMbIqNo6GkJSvU1agu9jA7qyRLkdaY0h7MRTcbjkk9 153 | zrIb+EL8zrk9jVpjcmFmgs5nIHGVxn864m3uPIuUlbnGffqPeuzTxlp+FjAmTAxvaIED8mzWTp2Z 154 | w1KVnoY76Vevc/atUQJnhY8g/TpVmwj2SBX+UA9BTLzxFFeSMjICucq65GfwPSktZklkJjbgcE4x 155 | zRK7Wp3YTmTaZ1Fokcvy44xTrrRllG6NsH0IqlZ3XlEAsD+NbEd5uUfKcfSsqTglaRFX2tOV4mHJ 156 | ok+SNny+ueKz7nw5IMOsJUn+5/hXXi4TPRseoqRnVkYA4JGBXRBQfwsX1uqviR520MkDEOMgd6Xc 157 | Cp+lbN4VUmOdFbbkblGDmsma3BBaM4NJPuepHWN0bmlXv2qeOJVO1IwSa2wKwNIiFvahYSWYlSxP 158 | 0rfU8VwzSUtDx6jTl7uwu2kI5pc8UhNQ0QhMUuKTNOFEY3BsTFJint97ilULtySM1sqethXGbaAM 159 | 08yxr94/hTDdD/lnFn6nFaqnDuTdi7KQx57UgnlJ6IPwqTzJD1CflVezgxczM650WwuQfNtIyT1I 160 | XafzHNYd74PBy1lOVP8Ack5H5iuuMhx8yZ+lN8xc/wCreqS5dmWqkkeZ3ul3di4W5hKg9GHIP41X 161 | 8uQjG0kdq9G1SzOo2/kkCPkENjJrDn8N3sYzbSwPjnDgjNWp3OqFWDXvOzOdgjkQ/dJqaRLiQYxg 162 | Vbltbu2Obu3ZD3OMr+Y4p8YDDgim2dMYRkrpmUdNuDztz/wIVG1hcL1iJ+hzW4FI65oORU87K9jE 163 | 55reZPvRuB7g1EQRXSFj2pyK7nOTT9pYTw6fU5lI3kbaikmr1vpgbBmc49F/xrrLK1ymSvHUkjNO 164 | utPtpoBLakBiMjCkA/h2qHWbM1ClCVpamEltbRfcgQe55/nUu4LwAAKbvxkFcEcGkwD60HaklsPL 165 | jGDQGAPysQfUVNZ2q3Up3D92AQXz90+tV9gBPOR607ExqKTaXQmFzcdrqX/vs1NHqF4gwty5+pz/ 166 | ADqsqjtT0Xmk7D5YvdE2ZrtwrNuLH0qWfSp4xhGySOhGK0NKtlYh2UnB5x6e1aKRsytudjG7blzx 167 | t9qSTaujkrYr2cuWOx//2Q== 168 | -------------------------------------------------------------------------------- /data/peppers_color.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/WojciechMula/base64-avx512/1cb949021bf070faaed8e8c767819f35ffc7e78e/data/peppers_color.jpg -------------------------------------------------------------------------------- /docker.sh: -------------------------------------------------------------------------------- 1 | docker build -t avx512base64 . 2 | mkdir -p dockeroutput 3 | docker run -v $PWD/dockeroutput:/dockeroutput --privileged avx512base64 4 | docker run -v $PWD/dockeroutput:/dockeroutput --privileged avx512base64 5 | -------------------------------------------------------------------------------- /dockeroutput/decodingperf.txt: -------------------------------------------------------------------------------- 1 | #displaying speed (GB/s) based on input bytes for memcpy and decoders: memcpy base64, chromium, AVX2, AVX512; first column is number of bytes 2 | #Each measure is given as a triple (median, min, max) 3 | 1024 73.901 67.303 74.289 2.323 1.467 2.343 11.238 5.228 11.263 30.077 7.977 30.284 4 | 3072 80.403 77.740 80.564 2.339 1.663 2.343 13.187 8.279 13.200 41.038 15.954 41.240 5 | 5120 125.438 124.346 125.723 2.357 2.181 2.598 13.403 8.745 13.867 39.908 22.218 39.992 6 | 7168 138.759 138.102 139.029 2.431 2.274 2.593 14.291 10.083 14.774 44.263 26.854 44.346 7 | 9216 150.878 37.425 151.277 2.578 2.347 2.587 14.773 12.419 14.785 46.614 25.873 46.711 8 | 11264 153.804 146.969 154.160 2.584 2.579 2.591 14.746 13.104 14.755 44.433 29.706 44.470 9 | 13312 77.756 77.300 78.325 2.587 2.351 2.593 14.483 11.820 14.492 45.095 33.068 45.141 10 | 15360 45.058 43.322 45.135 2.594 2.290 2.599 14.434 13.345 14.468 44.220 33.670 44.374 11 | 17408 40.546 39.357 40.941 2.594 2.458 2.598 14.449 14.295 14.463 41.608 40.467 41.872 12 | 19456 42.555 41.451 43.032 2.596 2.465 2.600 14.558 11.036 14.597 39.642 33.515 41.318 13 | 21504 42.627 41.597 42.746 2.597 2.482 2.598 14.572 13.598 14.586 42.264 34.304 43.122 14 | 23552 42.864 41.582 44.013 2.596 2.396 2.597 14.594 13.705 14.604 39.697 
34.402 41.014 15 | 25600 41.041 40.426 41.881 2.596 2.486 2.596 14.480 11.629 14.499 42.428 33.785 42.492 16 | 27648 42.131 36.504 44.154 2.596 2.508 2.597 14.520 13.711 14.531 42.223 34.660 42.347 17 | 29696 43.476 33.665 44.382 2.598 2.508 2.599 14.524 13.635 14.545 41.591 35.190 42.167 18 | 31744 41.123 40.541 41.874 2.596 2.517 2.599 14.594 12.033 14.615 42.955 35.806 43.308 19 | 33792 43.453 42.861 44.515 2.597 2.434 2.599 14.609 13.859 14.625 42.768 36.425 43.146 20 | 35840 41.717 40.820 42.063 2.597 2.375 2.600 14.608 13.913 14.617 40.863 35.593 41.487 21 | 37888 39.521 38.676 40.576 2.596 2.509 2.598 14.532 13.957 14.545 41.832 36.277 42.480 22 | 39936 40.907 39.826 42.089 2.597 2.455 2.599 14.518 12.225 14.536 41.630 36.960 42.539 23 | 41984 41.028 39.728 42.582 2.598 2.413 2.599 14.535 13.926 14.556 41.579 36.646 41.661 24 | 44032 40.630 40.099 41.076 2.598 2.431 2.599 14.603 14.056 14.616 41.824 36.388 42.338 25 | 46080 41.287 40.024 41.873 2.598 2.533 2.598 14.609 12.789 14.619 42.578 37.740 43.047 26 | 48128 38.864 37.684 39.610 2.598 2.437 2.600 14.589 12.914 14.614 40.929 36.544 41.995 27 | 50176 37.877 37.159 38.235 2.597 2.567 2.598 14.557 12.633 14.570 42.621 32.354 43.179 28 | 52224 38.314 29.261 38.715 2.595 2.550 2.596 14.525 14.053 14.565 40.855 37.505 41.351 29 | 54272 39.341 38.257 40.144 2.597 2.534 2.598 14.526 13.199 14.578 41.731 37.283 42.456 30 | 56320 35.372 34.868 35.668 2.598 2.552 2.600 14.517 13.084 14.564 38.887 35.752 39.296 31 | 58368 39.216 38.312 39.724 2.599 2.542 2.600 14.537 12.809 14.589 40.162 35.959 40.621 32 | 60416 38.659 38.099 39.293 2.599 2.552 2.601 14.565 12.380 14.638 42.059 35.751 42.779 33 | 62464 35.191 34.598 35.652 2.596 2.554 2.596 14.482 13.218 14.523 38.979 35.415 39.341 34 | 64512 37.182 36.467 37.960 2.597 2.555 2.597 14.491 14.002 14.530 40.014 33.054 40.798 35 | -------------------------------------------------------------------------------- /dockeroutput/encodingperf.txt: 
-------------------------------------------------------------------------------- 1 | #displaying speed (GB/s) based on input bytes for memcpy and encoders: memcpy base64, chromium, AVX2, AVX512 first column is number of bytes 2 | #Each measure is given as a triple (median, min, max) 3 | 1024 83.326 67.877 83.599 1.573 0.972 1.574 14.011 6.626 14.062 34.565 11.474 34.744 4 | 3072 88.048 80.957 88.673 1.492 1.265 1.581 16.095 10.454 16.124 39.127 17.222 39.242 5 | 5120 125.438 117.935 125.723 1.485 1.481 1.583 16.801 12.768 16.820 50.345 27.273 50.449 6 | 7168 138.737 132.197 139.051 1.399 1.117 1.558 16.978 8.917 17.011 56.481 26.247 56.594 7 | 9216 150.878 37.425 151.277 1.482 1.389 1.582 17.062 14.612 17.077 52.412 33.043 52.479 8 | 11264 153.804 146.969 154.160 1.483 1.410 1.581 17.484 15.244 17.505 56.713 36.980 56.764 9 | 13312 77.008 70.469 77.583 1.484 1.368 1.581 17.374 15.406 17.394 59.122 40.047 59.187 10 | 15360 42.332 40.035 42.517 1.484 1.426 1.546 17.159 15.430 17.170 42.808 33.972 43.349 11 | 17408 44.056 37.827 44.182 1.495 1.402 1.583 17.180 15.667 17.202 39.529 32.643 39.816 12 | 19456 43.216 40.675 44.485 1.509 1.484 1.569 17.197 15.748 17.223 38.014 31.690 38.170 13 | 21504 41.902 40.779 42.975 1.498 1.479 1.582 17.202 16.116 17.214 37.544 23.273 40.006 14 | 23552 42.432 38.497 43.094 1.499 1.484 1.551 17.368 16.359 17.392 38.511 33.671 39.361 15 | 25600 41.577 36.606 41.662 1.509 1.469 1.567 17.358 16.252 17.379 39.957 33.288 40.649 16 | 27648 42.469 39.661 42.578 1.516 1.477 1.545 17.267 16.219 17.279 38.348 33.791 38.515 17 | 29696 43.223 36.607 43.324 1.501 1.460 1.580 17.355 16.399 17.373 41.766 26.483 42.233 18 | 31744 41.164 35.803 41.285 1.506 1.485 1.583 17.368 16.418 17.396 40.573 33.765 41.604 19 | 33792 42.918 35.996 43.711 1.499 1.461 1.535 17.345 16.520 17.358 40.075 33.946 40.405 20 | 35840 42.902 40.745 43.659 1.515 1.484 1.566 17.434 16.530 17.453 41.110 37.928 41.748 21 | 37888 42.479 39.343 42.581 1.497 1.460 1.550 17.439 
14.537 17.461 40.616 34.905 42.144 22 | 39936 41.378 36.303 42.910 1.511 1.419 1.540 17.420 14.779 17.432 39.764 34.817 40.664 23 | 41984 37.161 32.680 37.809 1.508 1.459 1.546 17.388 16.581 17.406 41.605 36.934 42.019 24 | 44032 36.788 36.499 37.640 1.502 1.485 1.539 17.394 16.526 17.419 40.603 37.082 41.043 25 | 46080 36.802 29.164 37.288 1.505 1.473 1.554 17.374 16.772 17.397 39.730 35.533 40.760 26 | 48128 41.665 36.117 42.241 1.510 1.470 1.542 17.468 15.488 17.498 39.466 35.467 39.976 27 | 50176 40.456 35.104 41.215 1.505 1.471 1.545 17.438 15.546 17.465 40.343 35.337 41.725 28 | 52224 38.918 37.596 40.003 1.500 1.479 1.557 17.418 16.766 17.435 39.259 31.286 41.114 29 | 54272 38.938 37.883 39.574 1.507 1.471 1.530 17.395 15.450 17.441 40.772 32.854 41.457 30 | 56320 36.098 32.397 36.726 1.503 1.472 1.577 16.901 15.158 16.932 32.771 30.288 33.346 31 | 58368 36.311 32.657 37.164 1.509 1.479 1.548 17.375 16.825 17.418 39.663 35.807 40.511 32 | 60416 36.391 35.506 37.515 1.506 1.467 1.541 17.443 16.772 17.455 40.044 33.257 41.104 33 | 62464 35.995 28.667 37.338 1.503 1.472 1.532 17.430 16.910 17.470 40.306 35.866 40.905 34 | 64512 38.777 35.002 39.835 1.500 1.481 1.561 17.398 15.191 17.452 40.087 35.598 40.805 35 | -------------------------------------------------------------------------------- /dockeroutput/realperf.txt: -------------------------------------------------------------------------------- 1 | Testing with real data. 
2 | lena [jpg] 3 | loading file data/lena_color_512.base64 4 | removing spaces (as a preliminary step), init size = 142876, final size = 141020 5 | decoding a base64 input of 141020 bytes, original size = 105764 6 | memcpy (base64) : 30.013 GB/s 7 | Google chrome : 2.598 GB/s 8 | AVX2 : 14.360 GB/s 9 | AVX-512 : 33.665 GB/s 10 | 11 | encode a base64 input of 105764 bytes, encoded size = 141020 12 | memcpy (base64) : 29.565 GB/s 13 | Google chrome : 1.499 GB/s 14 | AVX2 : 17.188 GB/s 15 | AVX-512 : 32.844 GB/s 16 | 17 | mandril [jpg] 18 | loading file data/mandril_color.base64 19 | removing spaces (as a preliminary step), init size = 333970, final size = 329632 20 | decoding a base64 input of 329632 bytes, original size = 247222 21 | memcpy (base64) : 22.492 GB/s 22 | Google chrome : 2.589 GB/s 23 | AVX2 : 14.135 GB/s 24 | AVX-512 : 24.929 GB/s 25 | 26 | encode a base64 input of 247222 bytes, encoded size = 329632 27 | memcpy (base64) : 22.900 GB/s 28 | Google chrome : 1.501 GB/s 29 | AVX2 : 16.646 GB/s 30 | AVX-512 : 22.938 GB/s 31 | 32 | google logo [png] 33 | loading file data/googlelogo.base64 34 | removing spaces (as a preliminary step), init size = 3186, final size = 3144 35 | decoding a base64 input of 3144 bytes, original size = 2357 36 | memcpy (base64) : 83.863 GB/s 37 | Google chrome : 2.602 GB/s 38 | AVX2 : 13.606 GB/s 39 | AVX-512 : 43.488 GB/s 40 | 41 | encode a base64 input of 2357 bytes, encoded size = 3144 42 | memcpy (base64) : 81.927 GB/s 43 | Google chrome : 1.482 GB/s 44 | AVX2 : 14.385 GB/s 45 | AVX-512 : 33.255 GB/s 46 | 47 | -------------------------------------------------------------------------------- /include/avx512memcpy.h: -------------------------------------------------------------------------------- 1 | #ifndef AVX512MEMCPY_H 2 | #define AVX512MEMCPY_H 3 | 4 | #include 5 | #include 6 | 7 | static inline void* avx512_memcpy(void *dst, const void * src, size_t n) { 8 | if(n >= 64) { 9 | size_t i = 0; 10 | if(n >= 4*64) { 11 | for(; i 
<= n - 4*64; i+=4*64) { 12 | __m512i x0 = _mm512_loadu_si512((const char*)src + i); 13 | __m512i x1 = _mm512_loadu_si512((const char*)src + i + 64); 14 | __m512i x2 = _mm512_loadu_si512((const char*)src + i + 128); 15 | __m512i x3 = _mm512_loadu_si512((const char*)src + i + 192); 16 | _mm512_storeu_si512((char*)dst + i, x0); 17 | _mm512_storeu_si512((char*)dst + i + 64, x1); 18 | _mm512_storeu_si512((char*)dst + i + 128, x2); 19 | _mm512_storeu_si512((char*)dst + i + 192, x3); 20 | } 21 | } 22 | if(n>=64) { 23 | for(; i <= n - 64; i+=64) { 24 | __m512i x0 = _mm512_loadu_si512((const char*)src + i); 25 | _mm512_storeu_si512((char*)dst + i, x0); 26 | } 27 | } 28 | size_t leftover = n % 64; 29 | memcpy((char*)dst + n - leftover, (const char*)src + n - leftover, leftover); 30 | return dst; 31 | } else { 32 | return memcpy(dst,src,n); 33 | } 34 | } 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /include/chromiumbase64.h: -------------------------------------------------------------------------------- 1 | /*************** 2 | * Taken more or less as-is from the chromium project 3 | ****************/ 4 | 5 | 6 | 7 | /** 8 | * \file 9 | *
 10 |  * High performance base64 encoder / decoder
 11 |  * Version 1.3 -- 17-Mar-2006
 12 |  *
 13 |  * Copyright © 2005, 2006, Nick Galbreath -- nickg [at] modp [dot] com
 14 |  * All rights reserved.
 15 |  *
 16 |  * http://modp.com/release/base64
 17 |  *
 18 |  * Released under bsd license.  See modp_b64.c for details.
 19 |  * 
/*
 * The default implementation is the standard b64 encoding with padding.
 * It's easy to change this to use "URL safe" characters and to remove
 * padding.  See the modp_b64.c source code for details.
 */

#ifndef MODP_B64
#define MODP_B64

#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/** Sentinel returned by the encode/decode functions to signal an error. */
#define MODP_B64_ERROR ((size_t)-1)

/**
 * Encode a raw binary string into base 64.
 * str contains the bytes
 * len contains the number of bytes in str
 * dest should be allocated by the caller to contain
 *   at least chromium_base64_encode_len(len) bytes (see below)
 *   This will contain the null-terminated b64 encoded result
 * returns length of the destination string plus the ending null byte
 *    i.e.  the result will be equal to strlen(dest) + 1
 *
 * NOTE(review): the C++ wrapper at the end of this header truncates its
 * buffer at the returned offset, which suggests the value is strlen(dest)
 * rather than strlen(dest) + 1 -- confirm against the implementation.
 *
 * Example
 *
 * \code
 * char* src = ...;
 * size_t srclen = ...; // the number of bytes in src
 * char* dest = (char*) malloc(chromium_base64_encode_len(srclen));
 * size_t len = chromium_base64_encode(dest, src, srclen);
 * if (len == MODP_B64_ERROR) {
 *   printf("Error\n");
 * } else {
 *   printf("b64 = %s\n", dest);
 * }
 * \endcode
 *
 */
size_t chromium_base64_encode(char* dest, const char* str, size_t len);

/**
 * Decode a base64 encoded string
 *
 * src should contain exactly len bytes of b64 characters.
 * If src contains -any- non-base characters (such as white
 * space), MODP_B64_ERROR is returned.
 *
 * dest should be allocated by the caller to contain at least
 * len * 3 / 4 bytes; chromium_base64_decode_len(len) (see below)
 * gives a size with some slack.
 *
 * Returns the length (strlen) of the output, or MODP_B64_ERROR if unable to
 * decode
 *
 * \code
 * char* src = ...;
 * size_t srclen = ...; // or if you don't know use strlen(src)
 * char* dest = (char*) malloc(chromium_base64_decode_len(srclen));
 * size_t len = chromium_base64_decode(dest, src, srclen);
 * if (len == MODP_B64_ERROR) { error }
 * \endcode
 */
size_t chromium_base64_decode(char* dest, const char* src, size_t len);

/**
 * Given a source string of length len, this returns the amount of
 * memory the destination string should have.
 *
 * remember, this is integer math
 * 3 bytes turn into 4 chars
 * ceiling[len / 3] * 4 + 1
 *
 * +1 is for any extra null.
 *
 * The argument is parenthesized so that expressions such as `n + 1` or
 * `a ? b : c` expand correctly.
 */
#define chromium_base64_encode_len(A) (((A) + 2) / 3 * 4 + 1)

/**
 * Given a base64 string of length len,
 *   this returns the amount of memory required for output string
 *  It may be more than the actual number of bytes written.
 * NOTE: remember this is integer math
 * this allocates a bit more memory than traditional versions of b64
 * decode  4 chars turn into 3 bytes
 * floor[len * 3/4] + 2
 */
#define chromium_base64_decode_len(A) ((A) / 4 * 3 + 2)

/**
 * Will return the strlen of the output from encoding.
 * This may be less than the required number of bytes allocated.
 *
 * This allows you to 'deserialize' a struct
 * \code
 * char* b64encoded = "...";
 * size_t len = strlen(b64encoded);
 *
 * struct datastuff foo;
 * if (chromium_base64_encode_strlen(sizeof(struct datastuff)) != len) {
 *    // wrong size
 *    return false;
 * } else {
 *    // safe to do;
 *    if (chromium_base64_decode((char*) &foo, b64encoded, len) == MODP_B64_ERROR) {
 *      // bad characters
 *      return false;
 *    }
 * }
 * // foo is filled out now
 * \endcode
 */
#define chromium_base64_encode_strlen(A) (((A) + 2) / 3 * 4)



#ifdef __cplusplus
}

#include <string>


/**
 * base 64 encode a string (self-modifying)
 * On failure, the string is empty.
 *
 * This function is for C++ only (duh)
 *
 * \param[in,out] s the string to be encoded
 * \return a reference to the input string
 */
inline std::string& chromium_base64_encode(std::string& s)
{
    std::string x(chromium_base64_encode_len(s.size()), '\0');
    // Pass the size as size_t: narrowing it to int would truncate large inputs.
    size_t d = chromium_base64_encode(const_cast<char*>(x.data()), s.data(), s.size());
    if (d == MODP_B64_ERROR) {
        x.clear();
    } else {
        // Drop the unused tail of the over-allocated buffer.
        x.erase(d, std::string::npos);
    }
    s.swap(x);
    return s;
}

#endif /* __cplusplus */
#endif
#include 5 | 6 | size_t decode_base64_avx512vbmi__unrolled(uint8_t* output, const uint8_t* input, size_t size); 7 | -------------------------------------------------------------------------------- /include/decode_base64_avx512vbmi_despace.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | // characters skipped: space (' '), new line ('\n'), carriage return ('\r') 7 | size_t decode_base64_avx512vbmi_despace(uint8_t* output, const uint8_t* input, size_t size); 8 | size_t decode_base64_avx512vbmi_despace_email(uint8_t* output, uint8_t** input, size_t size); 9 | -------------------------------------------------------------------------------- /include/encode_base64_avx512vbmi.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | void encode_base64_avx512vbmi(uint8_t* output, const uint8_t* input, size_t size); 7 | -------------------------------------------------------------------------------- /include/encode_base64_avx512vl.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | // note: this scheme is named avx512vl but it is not really related to avx512vl per se. 6 | void encode_base64_avx512vl(uint8_t* output, const uint8_t* input, size_t size); 7 | -------------------------------------------------------------------------------- /include/fastavxbase64.h: -------------------------------------------------------------------------------- 1 | #ifndef EXPAVX_B64 2 | #define EXPAVX_B64 3 | 4 | /** 5 | * Assumes recent x64 hardware with AVX2 instructions. 
/*
 * Portable wrapper around the platform's aligned allocator.
 *
 * alignment  requested alignment in bytes
 * size       number of bytes to allocate
 *
 * On the POSIX path the block comes from posix_memalign and NULL is returned
 * when that call reports an error. On MSVC and MinGW the result of the
 * platform allocator is returned unchanged. Release the block with
 * aligned_free().
 */
static inline void *aligned_malloc(size_t alignment, size_t size) {
#if defined(_MSC_VER)
  return _aligned_malloc(size, alignment);
#elif defined(__MINGW32__) || defined(__MINGW64__)
  return __mingw_aligned_malloc(size, alignment);
#else
  // posix_memalign is only declared by stdlib.h when a POSIX feature-test
  // macro (e.g. _POSIX_C_SOURCE 200112L) is defined before the first include;
  // otherwise the compiler reports an implicit function declaration.
  void *block;
  if (posix_memalign(&block, alignment, size) == 0) {
    return block;
  }
  return NULL;
#endif
}

/*
 * Release a block obtained from aligned_malloc(). Passing NULL is a no-op.
 */
static inline void aligned_free(void *memblock) {
  if (memblock != NULL) {
#if defined(_MSC_VER)
    _aligned_free(memblock);
#elif defined(__MINGW32__) || defined(__MINGW64__)
    __mingw_aligned_free(memblock);
#else
    free(memblock);
#endif
  }
}
https://raw.githubusercontent.com/WojciechMula/base64-avx512/1cb949021bf070faaed8e8c767819f35ffc7e78e/results/cnl_decoding_cyclesperinputbyte.png -------------------------------------------------------------------------------- /results/cnl_decoding_cyclesperinputbyte_png.gnuplot: -------------------------------------------------------------------------------- 1 | set term pngcairo fontscale 1.5 size 1024,768 2 | set out "cnl_decoding_cyclesperinputbyte.png" 3 | load "decoding_cyclesperinputbyte.gnuplot" 4 | 5 | set term pdfcairo fontscale 0.8 6 | set out "cnl_decoding_cyclesperinputbyte.pdf" 7 | load "decoding_cyclesperinputbyte.gnuplot" 8 | -------------------------------------------------------------------------------- /results/cnl_encoding_cyclesperinputbyte.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WojciechMula/base64-avx512/1cb949021bf070faaed8e8c767819f35ffc7e78e/results/cnl_encoding_cyclesperinputbyte.pdf -------------------------------------------------------------------------------- /results/cnl_encoding_cyclesperinputbyte.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WojciechMula/base64-avx512/1cb949021bf070faaed8e8c767819f35ffc7e78e/results/cnl_encoding_cyclesperinputbyte.png -------------------------------------------------------------------------------- /results/cnl_encoding_cyclesperinputbyte_png.gnuplot: -------------------------------------------------------------------------------- 1 | set term pngcairo fontscale 1.5 size 1024,768 2 | set out "cnl_encoding_cyclesperinputbyte.png" 3 | load "encoding_cyclesperinputbyte.gnuplot" 4 | 5 | set term pdfcairo fontscale 0.8 6 | set out "cnl_encoding_cyclesperinputbyte.pdf" 7 | load "encoding_cyclesperinputbyte.gnuplot" 8 | 9 | #set term pngcairo fontscale 1.5 size 1024,768 10 | #set out "cnl_encoding_gbps.png" 11 | #load "encoding_gbps.gnuplot" 12 | 13 | 
#set term pdfcairo fontscale 1 14 | #set out "cnl_encoding_gbps.pdf" 15 | #load "encoding_gbps.gnuplot" 16 | -------------------------------------------------------------------------------- /results/cnldecoding.txt: -------------------------------------------------------------------------------- 1 | #displaying speed (GB/s) based on input bytes for memcpy and decoders: memcpy base64, chromium, AVX2, AVX512; first column is number of bytes 2 | #Each measure is given as a triple (median, min, max) 3 | 1024 73.901 67.303 74.289 2.323 1.467 2.343 11.238 5.228 11.263 30.077 7.977 30.284 4 | 3072 80.403 77.740 80.564 2.339 1.663 2.343 13.187 8.279 13.200 41.038 15.954 41.240 5 | 5120 125.438 124.346 125.723 2.357 2.181 2.598 13.403 8.745 13.867 39.908 22.218 39.992 6 | 7168 138.759 138.102 139.029 2.431 2.274 2.593 14.291 10.083 14.774 44.263 26.854 44.346 7 | 9216 150.878 37.425 151.277 2.578 2.347 2.587 14.773 12.419 14.785 46.614 25.873 46.711 8 | 11264 153.804 146.969 154.160 2.584 2.579 2.591 14.746 13.104 14.755 44.433 29.706 44.470 9 | 13312 77.756 77.300 78.325 2.587 2.351 2.593 14.483 11.820 14.492 45.095 33.068 45.141 10 | 15360 45.058 43.322 45.135 2.594 2.290 2.599 14.434 13.345 14.468 44.220 33.670 44.374 11 | 17408 40.546 39.357 40.941 2.594 2.458 2.598 14.449 14.295 14.463 41.608 40.467 41.872 12 | 19456 42.555 41.451 43.032 2.596 2.465 2.600 14.558 11.036 14.597 39.642 33.515 41.318 13 | 21504 42.627 41.597 42.746 2.597 2.482 2.598 14.572 13.598 14.586 42.264 34.304 43.122 14 | 23552 42.864 41.582 44.013 2.596 2.396 2.597 14.594 13.705 14.604 39.697 34.402 41.014 15 | 25600 41.041 40.426 41.881 2.596 2.486 2.596 14.480 11.629 14.499 42.428 33.785 42.492 16 | 27648 42.131 36.504 44.154 2.596 2.508 2.597 14.520 13.711 14.531 42.223 34.660 42.347 17 | 29696 43.476 33.665 44.382 2.598 2.508 2.599 14.524 13.635 14.545 41.591 35.190 42.167 18 | 31744 41.123 40.541 41.874 2.596 2.517 2.599 14.594 12.033 14.615 42.955 35.806 43.308 19 | 33792 43.453 42.861 44.515 
2.597 2.434 2.599 14.609 13.859 14.625 42.768 36.425 43.146 20 | 35840 41.717 40.820 42.063 2.597 2.375 2.600 14.608 13.913 14.617 40.863 35.593 41.487 21 | 37888 39.521 38.676 40.576 2.596 2.509 2.598 14.532 13.957 14.545 41.832 36.277 42.480 22 | 39936 40.907 39.826 42.089 2.597 2.455 2.599 14.518 12.225 14.536 41.630 36.960 42.539 23 | 41984 41.028 39.728 42.582 2.598 2.413 2.599 14.535 13.926 14.556 41.579 36.646 41.661 24 | 44032 40.630 40.099 41.076 2.598 2.431 2.599 14.603 14.056 14.616 41.824 36.388 42.338 25 | 46080 41.287 40.024 41.873 2.598 2.533 2.598 14.609 12.789 14.619 42.578 37.740 43.047 26 | 48128 38.864 37.684 39.610 2.598 2.437 2.600 14.589 12.914 14.614 40.929 36.544 41.995 27 | 50176 37.877 37.159 38.235 2.597 2.567 2.598 14.557 12.633 14.570 42.621 32.354 43.179 28 | 52224 38.314 29.261 38.715 2.595 2.550 2.596 14.525 14.053 14.565 40.855 37.505 41.351 29 | 54272 39.341 38.257 40.144 2.597 2.534 2.598 14.526 13.199 14.578 41.731 37.283 42.456 30 | 56320 35.372 34.868 35.668 2.598 2.552 2.600 14.517 13.084 14.564 38.887 35.752 39.296 31 | 58368 39.216 38.312 39.724 2.599 2.542 2.600 14.537 12.809 14.589 40.162 35.959 40.621 32 | 60416 38.659 38.099 39.293 2.599 2.552 2.601 14.565 12.380 14.638 42.059 35.751 42.779 33 | 62464 35.191 34.598 35.652 2.596 2.554 2.596 14.482 13.218 14.523 38.979 35.415 39.341 34 | 64512 37.182 36.467 37.960 2.597 2.555 2.597 14.491 14.002 14.530 40.014 33.054 40.798 35 | -------------------------------------------------------------------------------- /results/cnlencoding.txt: -------------------------------------------------------------------------------- 1 | #displaying speed (GB/s) based on input bytes for memcpy and encoders: memcpy base64, chromium, AVX2, AVX512 first column is number of bytes 2 | #Each measure is given as a triple (median, min, max) 3 | 1024 83.326 67.877 83.599 1.573 0.972 1.574 14.011 6.626 14.062 34.565 11.474 34.744 4 | 3072 88.048 80.957 88.673 1.492 1.265 1.581 16.095 10.454 16.124 
39.127 17.222 39.242 5 | 5120 125.438 117.935 125.723 1.485 1.481 1.583 16.801 12.768 16.820 50.345 27.273 50.449 6 | 7168 138.737 132.197 139.051 1.399 1.117 1.558 16.978 8.917 17.011 56.481 26.247 56.594 7 | 9216 150.878 37.425 151.277 1.482 1.389 1.582 17.062 14.612 17.077 52.412 33.043 52.479 8 | 11264 153.804 146.969 154.160 1.483 1.410 1.581 17.484 15.244 17.505 56.713 36.980 56.764 9 | 13312 77.008 70.469 77.583 1.484 1.368 1.581 17.374 15.406 17.394 59.122 40.047 59.187 10 | 15360 42.332 40.035 42.517 1.484 1.426 1.546 17.159 15.430 17.170 42.808 33.972 43.349 11 | 17408 44.056 37.827 44.182 1.495 1.402 1.583 17.180 15.667 17.202 39.529 32.643 39.816 12 | 19456 43.216 40.675 44.485 1.509 1.484 1.569 17.197 15.748 17.223 38.014 31.690 38.170 13 | 21504 41.902 40.779 42.975 1.498 1.479 1.582 17.202 16.116 17.214 37.544 23.273 40.006 14 | 23552 42.432 38.497 43.094 1.499 1.484 1.551 17.368 16.359 17.392 38.511 33.671 39.361 15 | 25600 41.577 36.606 41.662 1.509 1.469 1.567 17.358 16.252 17.379 39.957 33.288 40.649 16 | 27648 42.469 39.661 42.578 1.516 1.477 1.545 17.267 16.219 17.279 38.348 33.791 38.515 17 | 29696 43.223 36.607 43.324 1.501 1.460 1.580 17.355 16.399 17.373 41.766 26.483 42.233 18 | 31744 41.164 35.803 41.285 1.506 1.485 1.583 17.368 16.418 17.396 40.573 33.765 41.604 19 | 33792 42.918 35.996 43.711 1.499 1.461 1.535 17.345 16.520 17.358 40.075 33.946 40.405 20 | 35840 42.902 40.745 43.659 1.515 1.484 1.566 17.434 16.530 17.453 41.110 37.928 41.748 21 | 37888 42.479 39.343 42.581 1.497 1.460 1.550 17.439 14.537 17.461 40.616 34.905 42.144 22 | 39936 41.378 36.303 42.910 1.511 1.419 1.540 17.420 14.779 17.432 39.764 34.817 40.664 23 | 41984 37.161 32.680 37.809 1.508 1.459 1.546 17.388 16.581 17.406 41.605 36.934 42.019 24 | 44032 36.788 36.499 37.640 1.502 1.485 1.539 17.394 16.526 17.419 40.603 37.082 41.043 25 | 46080 36.802 29.164 37.288 1.505 1.473 1.554 17.374 16.772 17.397 39.730 35.533 40.760 26 | 48128 41.665 36.117 42.241 1.510 1.470 
1.542 17.468 15.488 17.498 39.466 35.467 39.976 27 | 50176 40.456 35.104 41.215 1.505 1.471 1.545 17.438 15.546 17.465 40.343 35.337 41.725 28 | 52224 38.918 37.596 40.003 1.500 1.479 1.557 17.418 16.766 17.435 39.259 31.286 41.114 29 | 54272 38.938 37.883 39.574 1.507 1.471 1.530 17.395 15.450 17.441 40.772 32.854 41.457 30 | 56320 36.098 32.397 36.726 1.503 1.472 1.577 16.901 15.158 16.932 32.771 30.288 33.346 31 | 58368 36.311 32.657 37.164 1.509 1.479 1.548 17.375 16.825 17.418 39.663 35.807 40.511 32 | 60416 36.391 35.506 37.515 1.506 1.467 1.541 17.443 16.772 17.455 40.044 33.257 41.104 33 | 62464 35.995 28.667 37.338 1.503 1.472 1.532 17.430 16.910 17.470 40.306 35.866 40.905 34 | 64512 38.777 35.002 39.835 1.500 1.481 1.561 17.398 15.191 17.452 40.087 35.598 40.805 35 | -------------------------------------------------------------------------------- /results/decoding_cyclesperinputbyte.gnuplot: -------------------------------------------------------------------------------- 1 | reset 2 | load "linespointsstyle.gnuplot" 3 | set style line 81 lt 0 # dashed 4 | set style line 81 lt rgb "#808080" # grey 5 | set grid back linestyle 81 6 | set xlabel "input kilobytes" 7 | set ylabel "base64 GB/s" 8 | 9 | stats 'cnldecoding.txt' using 1 10 | set xrange [STATS_min/1024:STATS_max/1024] 11 | #set ytics 0.05 12 | set yrange [0:155] 13 | set xrange [4:] 14 | set key top right invert 15 | set xtics 4 16 | plot "cnldecoding.txt" using ($1/1024):1+3*1+1 every 1 ti "Chrome" with linespoints ls 2, \ 17 | "" using ($1/1024):1+3*2+1 every 1 ti "AVX2" with linespoints ls 3, \ 18 | "" using ($1/1024):1+3*3+1 every 1 ti "AVX-512" with linespoints ls 4,\ 19 | "" using ($1/1024):1+3*0+1 every 1 ti "memcpy" with linespoints ls 1 20 | -------------------------------------------------------------------------------- /results/decoding_gbps.gnuplot: -------------------------------------------------------------------------------- 1 | reset 2 | load "linespointsstyle.gnuplot" 3 | set 
style line 81 lt 0 # dashed 4 | set style line 81 lt rgb "#808080" # grey 5 | set grid back linestyle 81 6 | set xlabel "input kilobytes" 7 | set ylabel "GB/s" 8 | 9 | stats 'cnldecoding.txt' using 1 10 | set xrange [STATS_min/1024:STATS_max/1024] 11 | #set ytics 0.5 12 | #set yrange [0:1.0] 13 | set key top right opaque 14 | set xtics 4 15 | plot "cnldecoding.txt" \ 16 | using ($1/1024):1+3*(1+7)+2 ti "Google Chrome" smooth acsplines ls 2, \ 17 | "" using ($1/1024):1+3*(2+7)+2 ti "AVX2" smooth acsplines ls 3, \ 18 | "" using ($1/1024):1+3*(4+7)+2 ti "AVX512VBMI" smooth acsplines ls 4, \ 19 | "" using ($1/1024):1+3*(0+7)+2 ti "memcpy" smooth acsplines ls 1 20 | 21 | -------------------------------------------------------------------------------- /results/encoding_cyclesperinputbyte.gnuplot: -------------------------------------------------------------------------------- 1 | reset 2 | load "linespointsstyle.gnuplot" 3 | set style line 81 lt 0 # dashed 4 | set style line 81 lt rgb "#808080" # grey 5 | set grid back linestyle 81 6 | set xlabel "input kilobytes" 7 | set ylabel "base64 GB/s" 8 | 9 | stats 'cnlencoding.txt' using 1 10 | set xrange [STATS_min/1024:STATS_max/1024] 11 | #set ytics 0.05 12 | set yrange [0:155] 13 | set xrange [4:] 14 | 15 | set key top right invert 16 | set xtics 4 17 | plot "cnlencoding.txt" using ($1/1024):1+3*1+1 every 1 ti "Chrome" with linespoints ls 2, \ 18 | "" using ($1/1024):1+3*2+1 every 1 ti "AVX2" with linespoints ls 3, \ 19 | "" using ($1/1024):1+3*3+1 every 1 ti "AVX-512" with linespoints ls 4,\ 20 | "" using ($1/1024):1+3*0+1 every 1 ti "memcpy" with linespoints ls 1 -------------------------------------------------------------------------------- /results/encoding_gbps.gnuplot: -------------------------------------------------------------------------------- 1 | reset 2 | load "linespointsstyle.gnuplot" 3 | set style line 81 lt 0 # dashed 4 | set style line 81 lt rgb "#808080" # grey 5 | set grid back linestyle 81 6 | set 
xlabel "input kilobytes" 7 | set ylabel "GB/s" 8 | 9 | stats 'cnlencoding.txt' using 1 10 | set xrange [STATS_min/1024:STATS_max/1024] 11 | #set ytics 0.5 12 | #set yrange [0:2] 13 | set key top right opaque 14 | set xtics 4 15 | plot "cnlencoding.txt" \ 16 | using ($1/1024):1+3*(1+6)+2 ti "Google Chrome" smooth acsplines ls 2, \ 17 | "" using ($1/1024):1+3*(2+6)+2 ti "AVX2" smooth acsplines ls 3, \ 18 | "" using ($1/1024):1+3*(4+6)+2 ti "AVX512VL" smooth acsplines ls 5, \ 19 | "" using ($1/1024):1+3*(0+6)+2 ti "memcpy" smooth acsplines ls 1 20 | 21 | -------------------------------------------------------------------------------- /results/linespointsstyle.gnuplot: -------------------------------------------------------------------------------- 1 | 2 | set style line 80 lt rgb "#000000" 3 | 4 | # Line style for grid 5 | #set style line 81 lt 0 # dashed 6 | #set style line 81 lt rgb "#808080" # grey 7 | 8 | #set grid back linestyle 81 9 | set border 3 back linestyle 80 # Remove border on top and right. These 10 | # borders are useless and make it harder 11 | # to see plotted lines near the border. 12 | # Also, put it in grey; no need for so much emphasis on a border. 
"""Print the constant vectors used by the AVX512VBMI base64 decoders.

Every table is emitted as a ``_mm512_setr_epi32(...)`` expression that can be
pasted into the C sources.  The script runs unchanged under Python 2 and
Python 3: it only uses single-argument ``print(...)`` calls and
``sys.stdout.write``.
"""

import sys

BASE64_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"

# Marker stored for every byte that is not a base64 character (MSB set).
INVALID = 0x80

# Marker stored for whitespace in the "despace" tables (bit 6 set).
WHITESPACE = 0x40


def main():
    decode_lookup()
    decode_lookup_despace()
    decode_pack()
    join_four_decoded()


def _build_lookups(spaces=''):
    """Return ``(lookup_lo, lookup_hi)``, two 64-entry translation tables.

    An ASCII code is split into bit 6, which selects the table (clear -> lo,
    set -> hi), and its low six bits, which give the position inside that
    table.  Base64 characters map to their 6-bit value, characters listed in
    ``spaces`` map to WHITESPACE and everything else stays INVALID.
    """
    lookup_lo = [INVALID] * 64
    lookup_hi = [INVALID] * 64

    def store(char, entry):
        code = ord(char)
        table = lookup_hi if code & 0x40 else lookup_lo
        table[code & 0x3f] = entry

    for encoded, char in enumerate(BASE64_ALPHABET):
        store(char, encoded)

    for char in spaces:
        store(char, WHITESPACE)

    return lookup_lo, lookup_hi


def _print_lookups(lookup_lo, lookup_hi):
    """Print both tables, each preceded by its name."""
    print("lookup_lo")
    format_lookup(lookup_lo)
    print("lookup_hi")
    format_lookup(lookup_hi)


def decode_lookup():
    """Print the translation tables for the plain decoder."""
    print("decode lookup")
    _print_lookups(*_build_lookups())


def decode_lookup_despace():
    """Print the translation tables that additionally flag ' ', CR and LF."""
    print("decode lookup (despace)")
    _print_lookups(*_build_lookups(' \r\n'))


def decode_pack():
    """Print the byte shuffle that compacts one decoded vector.

    Each of the 16 dwords contributes its three low bytes in reversed order;
    the remaining 16 entries of the 64-byte pattern are left at zero.
    """
    print("decode pack")

    pack = [0] * 64

    for i in range(16):
        first = 4 * i  # index of the dword's lowest byte
        pack[i*3 + 0] = first + 2
        pack[i*3 + 1] = first + 1
        pack[i*3 + 2] = first + 0

    format_lookup(pack)


def join_four_decoded():
    """Print the shuffles that join four decoded vectors into 192 bytes."""
    print("join four decoded vectors")
    # Decoded data has 24-bit values in 32-bit words; these bytes have to be
    # saved in a contiguous array.  Each vector holds 16 such words, i.e. 48
    # payload bytes:
    #
    # vec0 provides output bytes   0 ..  47
    # vec1 provides output bytes  48 ..  95
    # vec2 provides output bytes  96 .. 143
    # vec3 provides output bytes 144 .. 191

    join = []
    index = 0
    while len(join) < 4*48:
        join.append(index + 2)
        join.append(index + 1)
        join.append(index + 0)
        index += 4

    # Every 64-byte slice of the output is fetched from two neighbouring
    # vectors, so the indices are rebased to the first vector of the pair.
    join01 = join[0*64:1*64]
    join12 = [x - 64 for x in join[1*64:2*64]]
    join23 = [x - 128 for x in join[2*64:3*64]]

    for name, table in (("join01", join01), ("join12", join12), ("join23", join23)):
        # The declaration and its initializer share one output line.
        sys.stdout.write("const __m512i %s = " % name)
        format_lookup(table)


def format_lookup(array):
    """Print 64 byte values as a little-endian ``_mm512_setr_epi32`` call."""
    assert len(array) == 64
    assert all(0 <= x <= 255 for x in array)

    dwords = []
    for i in range(0, len(array), 4):
        b0, b1, b2, b3 = array[i:i + 4]
        dwords.append(b0 | (b1 << 8) | (b2 << 16) | (b3 << 24))

    dwords_fmt = ', '.join('0x%08x' % x for x in dwords)

    print('_mm512_setr_epi32(%s)' % dwords_fmt)


if __name__ == '__main__':
    main()
def get_options():
    """Parse the command line of the pseudo-email generator.

    Returns the ``argparse`` namespace with:
      * ``maxlength`` -- maximum length of an emitted base64 line, a positive
        ``int`` (76 unless ``-l`` is given);
      * ``output``    -- path of the file to write (``-o``, required);
      * ``files``     -- one or more input paths.

    Invalid arguments end the program through ``parser.error``.
    """
    parser = argparse.ArgumentParser(description="Prepare pseudo-email messages with base64-encoded attachments")
    # type=int is required: without it a user-supplied value stays a string
    # and cannot serve as the step of the range() that splits the lines.
    parser.add_argument("-l", dest='maxlength', metavar="LINELEN", type=int, default=76,
                        help="maximum length of line")
    parser.add_argument("-o", dest='output', metavar="FILE", type=str, required=True,
                        help="output file")
    parser.add_argument("files", nargs='+',
                        help="input file(s)")

    options = parser.parse_args()
    if options.maxlength <= 0:
        # A zero step makes range() raise and a negative one yields no lines.
        parser.error("maximum length of line must be a positive integer")

    return options
#!/usr/bin/env bash
# Select the cpufreq governor for every CPU, or list the current settings.
#
# taken from http://hbfs.wordpress.com/2013/06/18/fast-path-finding-part-ii/
# might require sudo apt-get install cpufrequtils
#
# usage: powerpolicy.sh ondemand | performance | list
# type cpufreq-info to check results, you can also verify with cat /proc/cpuinfo

case "$1" in
  ondemand)
    echo "setting up ondemand"
    policy="ondemand"
    ;;
  performance)
    echo "setting up for performance"
    policy="performance"
    ;;
  list)
    cpufreq-info
    exit 0
    ;;
  *)
    echo "usage: powerpolicy.sh ondemand | performance | list"
    # Exit statuses are limited to 0..255, so report failure with a plain 1.
    exit 1
    ;;
esac

# Enumerate the CPUs: the pattern is anchored so that only the
# "processor : N" lines match, not other fields that contain the word.
cpus=$( grep '^processor' /proc/cpuinfo | cut -d: -f 2 )

echo "chosen policy " "$policy"

# Set the governor for each CPU.  $cpus is left unquoted on purpose: the
# whitespace-separated list has to be split into one word per CPU.
for cpu in $cpus
do
  cpufreq-set -c "$cpu" -g "$policy"
done
all_cores read_tb 47 | elif [ "$1" = "list" ]; then 48 | all_cores read_tb 49 | else 50 | echo "usage: turboboost.sh on | off | list" 51 | fi 52 | -------------------------------------------------------------------------------- /src/base64/chromiumbase64.c: -------------------------------------------------------------------------------- 1 | #include "chromiumbase64.h" 2 | 3 | #define CHAR62 '+' 4 | #define CHAR63 '/' 5 | #define CHARPAD '=' 6 | static const char e0[256] = { 7 | 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 8 | 'C', 'C', 'D', 'D', 'D', 'D', 'E', 'E', 'E', 'E', 9 | 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H', 10 | 'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 11 | 'K', 'K', 'K', 'K', 'L', 'L', 'L', 'L', 'M', 'M', 12 | 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O', 13 | 'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 14 | 'R', 'R', 'S', 'S', 'S', 'S', 'T', 'T', 'T', 'T', 15 | 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W', 16 | 'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 17 | 'Z', 'Z', 'Z', 'Z', 'a', 'a', 'a', 'a', 'b', 'b', 18 | 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', 19 | 'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 20 | 'g', 'g', 'h', 'h', 'h', 'h', 'i', 'i', 'i', 'i', 21 | 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l', 22 | 'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 23 | 'o', 'o', 'o', 'o', 'p', 'p', 'p', 'p', 'q', 'q', 24 | 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's', 25 | 't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 26 | 'v', 'v', 'w', 'w', 'w', 'w', 'x', 'x', 'x', 'x', 27 | 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0', 28 | '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', 29 | '3', '3', '3', '3', '4', '4', '4', '4', '5', '5', 30 | '5', '5', '6', '6', '6', '6', '7', '7', '7', '7', 31 | '8', '8', '8', '8', '9', '9', '9', '9', '+', '+', 32 | '+', '+', '/', '/', '/', '/' 33 | }; 34 | 35 | static const char e1[256] = { 36 | 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 37 | 'K', 
'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 38 | 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 39 | 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 40 | 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 41 | 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', 42 | '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 43 | 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 44 | 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 45 | 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 46 | 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 47 | 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', 48 | '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 49 | 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 50 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 51 | 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 52 | 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 53 | 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 54 | '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 55 | '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 56 | 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 57 | 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 58 | 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 59 | 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 60 | 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', 61 | '6', '7', '8', '9', '+', '/' 62 | }; 63 | 64 | static const char e2[256] = { 65 | 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 66 | 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 67 | 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 68 | 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 69 | 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 70 | 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', 71 | '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 72 | 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 73 | 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 74 | 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 75 | 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 76 
| 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', 77 | '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 78 | 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 79 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 80 | 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 81 | 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 82 | 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 83 | '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 84 | '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 85 | 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 86 | 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 87 | 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 88 | 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 89 | 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', 90 | '6', '7', '8', '9', '+', '/' 91 | }; 92 | 93 | 94 | 95 | /* SPECIAL DECODE TABLES FOR LITTLE ENDIAN (INTEL) CPUS */ 96 | 97 | static const uint32_t d0[256] = { 98 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 99 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 100 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 101 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 102 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 103 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 104 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 105 | 0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc, 106 | 0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4, 107 | 0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff, 108 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, 109 | 0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018, 110 | 0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030, 111 | 0x00000034, 0x00000038, 0x0000003c, 0x00000040, 
0x00000044, 0x00000048, 112 | 0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060, 113 | 0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 114 | 0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078, 115 | 0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090, 116 | 0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8, 117 | 0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0, 118 | 0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff, 119 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 120 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 121 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 122 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 123 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 124 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 125 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 126 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 127 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 128 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 129 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 130 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 131 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 132 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 133 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 134 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 135 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 136 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 137 | 0x01ffffff, 
0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 138 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 139 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 140 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff 141 | }; 142 | 143 | 144 | static const uint32_t d1[256] = { 145 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 146 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 147 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 148 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 149 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 150 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 151 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 152 | 0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003, 153 | 0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003, 154 | 0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff, 155 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, 156 | 0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000, 157 | 0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000, 158 | 0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001, 159 | 0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001, 160 | 0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 161 | 0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001, 162 | 0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002, 163 | 0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002, 164 | 0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003, 165 | 0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff, 166 | 0x01ffffff, 0x01ffffff, 
0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 167 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 168 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 169 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 170 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 171 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 172 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 173 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 174 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 175 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 176 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 177 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 178 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 179 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 180 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 181 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 182 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 183 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 184 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 185 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 186 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 187 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff 188 | }; 189 | 190 | 191 | static const uint32_t d2[256] = { 192 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 193 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 194 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 195 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 
0x01ffffff, 0x01ffffff, 0x01ffffff, 196 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 197 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 198 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 199 | 0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00, 200 | 0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00, 201 | 0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff, 202 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, 203 | 0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100, 204 | 0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300, 205 | 0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400, 206 | 0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600, 207 | 0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 208 | 0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700, 209 | 0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900, 210 | 0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00, 211 | 0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00, 212 | 0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff, 213 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 214 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 215 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 216 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 217 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 218 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 219 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 220 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 221 | 
0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 222 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 223 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 224 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 225 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 226 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 227 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 228 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 229 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 230 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 231 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 232 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 233 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 234 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff 235 | }; 236 | 237 | 238 | static const uint32_t d3[256] = { 239 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 240 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 241 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 242 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 243 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 244 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 245 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 246 | 0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000, 247 | 0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000, 248 | 0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff, 249 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000, 250 | 0x00010000, 
0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000, 251 | 0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000, 252 | 0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000, 253 | 0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000, 254 | 0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 255 | 0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000, 256 | 0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000, 257 | 0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000, 258 | 0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000, 259 | 0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 260 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 261 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 262 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 263 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 264 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 265 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 266 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 267 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 268 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 269 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 270 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 271 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 272 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 273 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 274 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 275 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 
0x01ffffff, 276 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 277 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 278 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 279 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 280 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 281 | 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff 282 | }; 283 | 284 | 285 | 286 | #define BADCHAR 0x01FFFFFF 287 | 288 | /** 289 | * you can control if we use padding by commenting out this 290 | * next line. However, I highly recommend you use padding and not 291 | * using it should only be for compatability with a 3rd party. 292 | * Also, 'no padding' is not tested! 293 | */ 294 | #define DOPAD 1 295 | 296 | /* 297 | * if we aren't doing padding 298 | * set the pad character to NULL 299 | */ 300 | #ifndef DOPAD 301 | #undef CHARPAD 302 | #define CHARPAD '\0' 303 | #endif 304 | 305 | size_t chromium_base64_encode(char* dest, const char* str, size_t len) 306 | { 307 | size_t i = 0; 308 | uint8_t* p = (uint8_t*) dest; 309 | 310 | /* unsigned here is important! 
*/ 311 | uint8_t t1, t2, t3; 312 | 313 | if (len > 2) { 314 | for (; i < len - 2; i += 3) { 315 | t1 = str[i]; t2 = str[i+1]; t3 = str[i+2]; 316 | *p++ = e0[t1]; 317 | *p++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; 318 | *p++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; 319 | *p++ = e2[t3]; 320 | } 321 | } 322 | 323 | switch (len - i) { 324 | case 0: 325 | break; 326 | case 1: 327 | t1 = str[i]; 328 | *p++ = e0[t1]; 329 | *p++ = e1[(t1 & 0x03) << 4]; 330 | *p++ = CHARPAD; 331 | *p++ = CHARPAD; 332 | break; 333 | default: /* case 2 */ 334 | t1 = str[i]; t2 = str[i+1]; 335 | *p++ = e0[t1]; 336 | *p++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; 337 | *p++ = e2[(t2 & 0x0F) << 2]; 338 | *p++ = CHARPAD; 339 | } 340 | 341 | *p = '\0'; 342 | return p - (uint8_t*)dest; 343 | } 344 | 345 | 346 | size_t chromium_base64_decode(char* dest, const char* src, size_t len) 347 | { 348 | if (len == 0) return 0; 349 | 350 | #ifdef DOPAD 351 | /* 352 | * if padding is used, then the message must be at least 353 | * 4 chars and be a multiple of 4 354 | */ 355 | if (len < 4 || (len % 4 != 0)) { 356 | return MODP_B64_ERROR; /* error */ 357 | } 358 | /* there can be at most 2 pad chars at the end */ 359 | if (src[len-1] == CHARPAD) { 360 | len--; 361 | if (src[len -1] == CHARPAD) { 362 | len--; 363 | } 364 | } 365 | #endif 366 | 367 | size_t i; 368 | int leftover = len % 4; 369 | size_t chunks = (leftover == 0) ? 
len / 4 - 1 : len /4; 370 | 371 | uint8_t* p = (uint8_t*)dest; 372 | uint32_t x = 0; 373 | const uint8_t* y = (uint8_t*)src; 374 | for (i = 0; i < chunks; ++i, y += 4) { 375 | x = d0[y[0]] | d1[y[1]] | d2[y[2]] | d3[y[3]]; 376 | if (x >= BADCHAR) return MODP_B64_ERROR; 377 | *p++ = ((uint8_t*)(&x))[0]; 378 | *p++ = ((uint8_t*)(&x))[1]; 379 | *p++ = ((uint8_t*)(&x))[2]; 380 | } 381 | 382 | switch (leftover) { 383 | case 0: 384 | x = d0[y[0]] | d1[y[1]] | d2[y[2]] | d3[y[3]]; 385 | 386 | if (x >= BADCHAR) return MODP_B64_ERROR; 387 | *p++ = ((uint8_t*)(&x))[0]; 388 | *p++ = ((uint8_t*)(&x))[1]; 389 | *p = ((uint8_t*)(&x))[2]; 390 | return (chunks+1)*3; 391 | break; 392 | case 1: /* with padding this is an impossible case */ 393 | x = d0[y[0]]; 394 | *p = *((uint8_t*)(&x)); // i.e. first char/byte in int 395 | break; 396 | case 2: // * case 2, 1 output byte */ 397 | x = d0[y[0]] | d1[y[1]]; 398 | *p = *((uint8_t*)(&x)); // i.e. first char 399 | break; 400 | default: /* case 3, 2 output bytes */ 401 | x = d0[y[0]] | d1[y[1]] | d2[y[2]]; /* 0x3c */ 402 | *p++ = ((uint8_t*)(&x))[0]; 403 | *p = ((uint8_t*)(&x))[1]; 404 | break; 405 | } 406 | 407 | if (x >= BADCHAR) return MODP_B64_ERROR; 408 | 409 | return 3*chunks + (6*leftover)/8; 410 | } 411 | -------------------------------------------------------------------------------- /src/base64/decode_base64_avx512vbmi.c: -------------------------------------------------------------------------------- 1 | /* 2 | sources: 3 | - https://github.com/WojciechMula/base64simd/blob/master/decode/decode.avx512vbmi.cpp 4 | - https://github.com/WojciechMula/base64simd/blob/master/decode/lookup.avx512vbmi.cpp 5 | - https://github.com/WojciechMula/base64simd/blob/master/decode/pack.avx512bw.cpp 6 | */ 7 | #include "decode_base64_avx512vbmi.h" 8 | 9 | #include 10 | #include 11 | 12 | #include "decode_base64_tail_avx512vbmi.c" 13 | 14 | // Note: constants lookup_lo, lookup_hi, pack were 15 | // generated with 
// Decodes `size` base64 characters from `src` into `dst` using AVX-512 VBMI.
//
// Input is consumed in 64-character blocks (48 output bytes each); whatever
// is left is handed to decode_base64_tail_avx512vbmi(). Returns the number
// of bytes written, or (size_t)-1 when an invalid character was seen or the
// tail decoder reports a negative result.
//
// Notes:
// - Errors from the vector loop are accumulated and only checked after the
//   loop, so `dst` may already hold garbage when an error is returned.
// - Every block is written with a full 64-byte store while `dst` advances
//   by 48, so 16 bytes past the decoded data of the last block are written.
size_t decode_base64_avx512vbmi(uint8_t* dst, const uint8_t* src, size_t size) {

    // imm8 of the ternary logic function "a | b | c"
    enum { OR_ALL = 0xfe };

    // ASCII -> 6-bit value; 0x80 marks an invalid character
    const __m512i lookup_0 = _mm512_setr_epi32(
                                0x80808080, 0x80808080, 0x80808080, 0x80808080,
                                0x80808080, 0x80808080, 0x80808080, 0x80808080,
                                0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
                                0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080);
    const __m512i lookup_1 = _mm512_setr_epi32(
                                0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b,
                                0x1211100f, 0x16151413, 0x80191817, 0x80808080,
                                0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
                                0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080);

    // byte shuffle gathering the 24-bit words into 48 contiguous bytes
    const __m512i pack = _mm512_setr_epi32(
                                0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112,
                                0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425,
                                0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
                                0x00000000, 0x00000000, 0x00000000, 0x00000000);

    const __m512i mul_ab_bc = _mm512_set1_epi32(0x01400140);
    const __m512i mul_abcd  = _mm512_set1_epi32(0x00011000);

    uint8_t* const out_begin = dst;
    __m512i error_acc = _mm512_setzero_si512();

    for (; size >= 64; src += 64, dst += 48, size -= 64) {

        // 1. load 64 characters
        const __m512i ascii = _mm512_loadu_si512((const __m512i*)src);

        // 2. translate from ASCII into 6-bit values
        const __m512i sextets = _mm512_permutex2var_epi8(lookup_0, ascii, lookup_1);

        // 2a. remember MSBs of both input and translation; checked after the loop
        error_acc = _mm512_ternarylogic_epi32(error_acc, sextets, ascii, OR_ALL);

        // 3. pack four 6-bit values into 24-bit words (within 32-bit lanes)
        // input:  packed_dword([00dddddd|00cccccc|00bbbbbb|00aaaaaa] x 4)
        // merged: packed_dword([00000000|aaaaabbb|bbbbcccc|ccdddddd] x 4)
        const __m512i pairs = _mm512_maddubs_epi16(sextets, mul_ab_bc);
        const __m512i words = _mm512_madd_epi16(pairs, mul_abcd);

        // 4. pack 24-bit values into a contiguous array of 48 bytes
        _mm512_storeu_si512((__m512*)dst, _mm512_permutexvar_epi8(pack, words));
    }

    if (_mm512_movepi8_mask(error_acc) != 0) {
        return (size_t)-1;
    }

    const int tail = decode_base64_tail_avx512vbmi(dst, src, size);
    if (tail < 0) {
        return (size_t)-1;
    }

    return (dst - out_begin) + tail;
}
10 | size_t decode_base64_avx512vbmi__unrolled(uint8_t* dst, const uint8_t* src, size_t size) { 11 | 12 | const __m512i lookup_0 = _mm512_setr_epi32( 13 | 0x80808080, 0x80808080, 0x80808080, 0x80808080, 14 | 0x80808080, 0x80808080, 0x80808080, 0x80808080, 15 | 0x80808080, 0x80808080, 0x3e808080, 0x3f808080, 16 | 0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080); 17 | const __m512i lookup_1 = _mm512_setr_epi32( 18 | 0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b, 19 | 0x1211100f, 0x16151413, 0x80191817, 0x80808080, 20 | 0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625, 21 | 0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080); 22 | 23 | uint8_t* start = dst; 24 | const int OR_ALL = 0xfe; // function "a or b or c" 25 | __m512i errorvec = _mm512_setzero_si512(); 26 | while (size >= 64*4) { 27 | 28 | // 1. load input 29 | __m512i input0 = _mm512_loadu_si512((const __m512i*)(src + 0*64)); 30 | __m512i input1 = _mm512_loadu_si512((const __m512i*)(src + 1*64)); 31 | __m512i input2 = _mm512_loadu_si512((const __m512i*)(src + 2*64)); 32 | __m512i input3 = _mm512_loadu_si512((const __m512i*)(src + 3*64)); 33 | 34 | // 2. translate from ASCII into 6-bit values 35 | __m512i translated0 = _mm512_permutex2var_epi8(lookup_0, input0, lookup_1); 36 | __m512i translated1 = _mm512_permutex2var_epi8(lookup_0, input1, lookup_1); 37 | __m512i translated2 = _mm512_permutex2var_epi8(lookup_0, input2, lookup_1); 38 | __m512i translated3 = _mm512_permutex2var_epi8(lookup_0, input3, lookup_1); 39 | 40 | // 2a. check for errors --- convert MSBs to a mask 41 | const __m512i t0 = _mm512_ternarylogic_epi32(input0, input1, input2, OR_ALL); 42 | const __m512i t1 = _mm512_ternarylogic_epi32(input3, translated0, translated1, OR_ALL); 43 | const __m512i t2 = _mm512_ternarylogic_epi32(translated2, translated3, t0, OR_ALL); 44 | errorvec = _mm512_ternarylogic_epi32(t0, t1, t2, OR_ALL); 45 | 46 | // 3. 
pack four 6-bit values into 24-bit words (all within 32-bit lanes) 47 | // Note: exactly the same procedure as we have in AVX2 version 48 | // input: packed_dword([00dddddd|00cccccc|00bbbbbb|00aaaaaa] x 4) 49 | // merged: packed_dword([00000000|aaaaabbb|bbbbcccc|ccdddddd] x 4) 50 | const __m512i merge_ab_and_bc0 = _mm512_maddubs_epi16(translated0, _mm512_set1_epi32(0x01400140)); 51 | const __m512i merge_ab_and_bc1 = _mm512_maddubs_epi16(translated1, _mm512_set1_epi32(0x01400140)); 52 | const __m512i merge_ab_and_bc2 = _mm512_maddubs_epi16(translated2, _mm512_set1_epi32(0x01400140)); 53 | const __m512i merge_ab_and_bc3 = _mm512_maddubs_epi16(translated3, _mm512_set1_epi32(0x01400140)); 54 | 55 | __m512i merged0 = _mm512_madd_epi16(merge_ab_and_bc0, _mm512_set1_epi32(0x00011000)); 56 | __m512i merged1 = _mm512_madd_epi16(merge_ab_and_bc1, _mm512_set1_epi32(0x00011000)); 57 | __m512i merged2 = _mm512_madd_epi16(merge_ab_and_bc2, _mm512_set1_epi32(0x00011000)); 58 | __m512i merged3 = _mm512_madd_epi16(merge_ab_and_bc3, _mm512_set1_epi32(0x00011000)); 59 | 60 | // 4. 
pack 4 x 48 bytes into three AVX512 registers 24-bit values into continous array of 3*64 bytes 61 | const __m512i join01 = _mm512_setr_epi32(0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112, 62 | 0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425, 63 | 0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38, 64 | 0x46404142, 0x494a4445, 0x4c4d4e48, 0x56505152); 65 | const __m512i join12 = _mm512_setr_epi32(0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425, 66 | 0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38, 67 | 0x46404142, 0x494a4445, 0x4c4d4e48, 0x56505152, 68 | 0x595a5455, 0x5c5d5e58, 0x66606162, 0x696a6465); 69 | const __m512i join23 = _mm512_setr_epi32(0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38, 70 | 0x46404142, 0x494a4445, 0x4c4d4e48, 0x56505152, 71 | 0x595a5455, 0x5c5d5e58, 0x66606162, 0x696a6465, 72 | 0x6c6d6e68, 0x76707172, 0x797a7475, 0x7c7d7e78); 73 | const __m512i arr01 = _mm512_permutex2var_epi8(merged0, join01, merged1); 74 | const __m512i arr12 = _mm512_permutex2var_epi8(merged1, join12, merged2); 75 | const __m512i arr23 = _mm512_permutex2var_epi8(merged2, join23, merged3); 76 | 77 | _mm512_storeu_si512((__m512*)(dst + 0*64), arr01); 78 | _mm512_storeu_si512((__m512*)(dst + 1*64), arr12); 79 | _mm512_storeu_si512((__m512*)(dst + 2*64), arr23); 80 | 81 | src += 4*64; 82 | dst += 3*64; 83 | size -= 4*64; 84 | } 85 | 86 | while (size >= 64) { 87 | 88 | // 1. load input 89 | __m512i input = _mm512_loadu_si512((const __m512i*)src); 90 | 91 | // 2. translate from ASCII into 6-bit values 92 | __m512i translated = _mm512_permutex2var_epi8(lookup_0, input, lookup_1); 93 | 94 | // 2a. check for errors --- convert MSBs to a mask 95 | errorvec = _mm512_ternarylogic_epi32(errorvec, translated, input, OR_ALL); 96 | 97 | // 3. 
pack four 6-bit values into 24-bit words (all within 32-bit lanes) 98 | // Note: exactly the same procedure as we have in AVX2 version 99 | // input: packed_dword([00dddddd|00cccccc|00bbbbbb|00aaaaaa] x 4) 100 | // merged: packed_dword([00000000|aaaaabbb|bbbbcccc|ccdddddd] x 4) 101 | const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(translated, 102 | _mm512_set1_epi32(0x01400140)); 103 | 104 | __m512i merged = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); 105 | 106 | 107 | // 4. pack 24-bit values into continous array of 48 bytes 108 | const __m512i pack = _mm512_setr_epi32( 109 | 0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112, 110 | 0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425, 111 | 0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38, 112 | 0x00000000, 0x00000000, 0x00000000, 0x00000000); 113 | const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); 114 | 115 | _mm512_storeu_si512((__m512*)dst, shuffled); 116 | 117 | src += 64; 118 | dst += 48; 119 | size -= 64; 120 | } 121 | 122 | if (_mm512_movepi8_mask(errorvec) != 0) 123 | return (size_t)-1; 124 | 125 | size_t scalar = chromium_base64_decode((char*)dst, (const char*)src, size); 126 | if (scalar == MODP_B64_ERROR) 127 | return (size_t)-1; 128 | 129 | return (dst - start) + scalar; 130 | } 131 | -------------------------------------------------------------------------------- /src/base64/decode_base64_avx512vbmi_despace.c: -------------------------------------------------------------------------------- 1 | /* 2 | sources: 3 | - https://github.com/WojciechMula/base64simd/blob/master/decode/decode.avx512vbmi.cpp 4 | - https://github.com/WojciechMula/base64simd/blob/master/decode/lookup.avx512vbmi.cpp 5 | - https://github.com/WojciechMula/base64simd/blob/master/decode/pack.avx512bw.cpp 6 | 7 | despace code by Zach Wegner --- see https://news.ycombinator.com/item?id=18834741 8 | */ 9 | #include "decode_base64_avx512vbmi_despace.h" 10 | #include "chromiumbase64.h" 11 | 12 | #include 13 | 
#include 14 | #include 15 | #include 16 | 17 | // Note: constants lookup_lo, lookup_hi, pack were 18 | // generated with scripts/avx512vbmi_decode_lookups.py 19 | 20 | size_t despace(uint8_t* dst, const uint8_t* src, size_t size); 21 | 22 | #if defined(__GNUC__) 23 | # define CACHELINE_ALIGNED __attribute__((aligned(64))) 24 | # define ALIGN_LOADS 25 | #else 26 | # define CACHELINE_ALIGNED 27 | #endif 28 | 29 | uint8_t despace_single[64][64] CACHELINE_ALIGNED; 30 | 31 | size_t decode_base64_avx512vbmi_despace(uint8_t* dst, const uint8_t* src, size_t size) { 32 | 33 | // lookup: 34 | // - 'A'-'Z', 'a'-'z', '0'-'9', '+', '/' : base64 6-bit values 35 | // - ' ', '\n', '\r' : 0x40 [6th bit set] 36 | // - all other chars : 0x80 [7th bit set] 37 | 38 | const __m512i lookup_0 = _mm512_setr_epi32( 39 | 0x80808080, 0x80808080, 0x80408080, 0x80804080, 40 | 0x80808080, 0x80808080, 0x80808080, 0x80808080, 41 | 0x80808040, 0x80808080, 0x3e808080, 0x3f808080, 42 | 0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080); 43 | const __m512i lookup_1 = _mm512_setr_epi32( 44 | 0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b, 45 | 0x1211100f, 0x16151413, 0x80191817, 0x80808080, 46 | 0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625, 47 | 0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080); 48 | 49 | // despace constants 50 | const uint64_t index_masks[6] = { 51 | 0xaaaaaaaaaaaaaaaa, 52 | 0xcccccccccccccccc, 53 | 0xf0f0f0f0f0f0f0f0, 54 | 0xff00ff00ff00ff00, 55 | 0xffff0000ffff0000, 56 | 0xffffffff00000000, 57 | }; 58 | 59 | const __m512i index_bits[6] = { 60 | _mm512_set1_epi8(1), 61 | _mm512_set1_epi8(2), 62 | _mm512_set1_epi8(4), 63 | _mm512_set1_epi8(8), 64 | _mm512_set1_epi8(16), 65 | _mm512_set1_epi8(32), 66 | }; 67 | 68 | uint8_t* start = dst; 69 | size_t scalar = 0; 70 | 71 | while (size >= 64) { 72 | 73 | // 1. load input 74 | __m512i input = _mm512_loadu_si512((const __m512i*)src); 75 | 76 | // 2. 
translate from ASCII into 6-bit values 77 | __m512i translated = _mm512_permutex2var_epi8(lookup_0, input, lookup_1); 78 | 79 | // 2a. check for errors --- convert MSBs to a mask 80 | const uint64_t error_mask = _mm512_test_epi8_mask(translated | input, _mm512_set1_epi8((int8_t)0x80)); 81 | if (error_mask != 0) return MODP_B64_ERROR; 82 | 83 | // 3. check if we need there are spaces (bit 6th) 84 | uint64_t whitespace_mask = _mm512_test_epi8_mask(translated, _mm512_set1_epi8(0x40)); 85 | if (whitespace_mask == 0) { 86 | // no despacing 87 | const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(translated, _mm512_set1_epi32(0x01400140)); 88 | const __m512i merged = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); 89 | 90 | const __m512i pack = _mm512_setr_epi32( 91 | 0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112, 92 | 0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425, 93 | 0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38, 94 | 0x00000000, 0x00000000, 0x00000000, 0x00000000); 95 | const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); 96 | 97 | _mm512_storeu_si512((__m512*)dst, shuffled); 98 | 99 | src += 64; 100 | dst += 48; 101 | size -= 64; 102 | } else { 103 | // In emails it's a common case, as all lines have the same length 104 | // and are separated by **single** '\n' char. 
105 | if (_blsr_u64(whitespace_mask) == 0) { 106 | 107 | const int idx = __builtin_ctzll(whitespace_mask); 108 | #ifdef ALIGN_LOADS 109 | const __m512i indices = _mm512_load_si512((const __m512i*)(&despace_single[0][0] + 64 * idx)); 110 | #else 111 | const __m512i indices = _mm512_loadu_si512((const __m512i*)(&despace_single[0][0] + 64 * idx)); 112 | #endif // ALIGN_LOADS 113 | 114 | translated = _mm512_permutexvar_epi8(indices, translated); 115 | 116 | const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(translated, _mm512_set1_epi32(0x01400140)); 117 | const __m512i merged = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); 118 | 119 | const __m512i pack = _mm512_setr_epi32( 120 | 0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112, 121 | 0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425, 122 | 0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38, 123 | 0x00000000, 0x00000000, 0x00000000, 0x00000000); 124 | const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); 125 | 126 | _mm512_storeu_si512((__m512*)dst, shuffled); 127 | 128 | size_t input_skip = 64 - 3; 129 | if (idx > 60) // don't like this 'if' 130 | input_skip -= 1; 131 | 132 | src += input_skip; 133 | size -= input_skip; 134 | dst += 48 - 3; 135 | 136 | continue; 137 | } 138 | 139 | // despace --- Zach's algorithm starts here 140 | uint64_t characters_mask = ~whitespace_mask; 141 | 142 | __m512i indices = _mm512_set1_epi8(0); 143 | for (size_t index = 0; index < 6; index++) { 144 | uint64_t m = _pext_u64(index_masks[index], characters_mask); 145 | indices = _mm512_mask_add_epi8(indices, m, indices, index_bits[index]); 146 | } 147 | 148 | translated = _mm512_permutexvar_epi8(indices, translated); 149 | // end of despace 150 | 151 | // base64 algorithm 152 | const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(translated, _mm512_set1_epi32(0x01400140)); 153 | const __m512i merged = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); 154 | 155 | const __m512i pack = 
_mm512_setr_epi32( 156 | 0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112, 157 | 0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425, 158 | 0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38, 159 | 0x00000000, 0x00000000, 0x00000000, 0x00000000); 160 | const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); 161 | 162 | _mm512_storeu_si512((__m512*)dst, shuffled); 163 | 164 | /* 165 | Getting number of bytes to skip is a bit tricky. 166 | 167 | Despacing yields arbitrary number of bytes, but in base64 we 168 | process groups of 4 bytes. Thus the number of **despaced** bytes 169 | that contributed to output is 4 * popcount(characters_mask) / 4. 170 | However, the number of **input** bytes that have to be skipped 171 | depends on the spaces appeared before the last consumed byte. 172 | 173 | Solution [in examples we assume input = 32 bytes, 64 is just too wide] 174 | 175 | The input data we already have from despacing: 176 | 177 | whitespace_mask = 0010_0101_0010_0010_0100_0000_1000_0110 178 | characters_mask = 1101_1010_1101_1101_1011_1111_0111_1001 - popcnt = 23 179 | characters_compr = 1111_1111_1111_1111_1111_1110_0000_0000 180 | ^ ^ 181 | LSB MSB 182 | 183 | 1. We trim `characters_compr` to have exactly 4 * (23/4) + 1 = 21 bits 184 | This extra bit is needed to skip also trailing whitespaces. 185 | 186 | characters_converted = 1111_1111_1111_1111_1111_1000_0000_0000 187 | 188 | 2. The converted characters is expanded using PDEP with 189 | `character_mask` as a pattern: 190 | 191 | expanded = _pdep_u64(characters_converted, characters_mask) => 192 | 1101_1010_1101_1101_1011_1111_0111_1001 193 | 1111_1111_1111_1111_1111_1000_0000_0000 194 | = 1101_1010_1101_1101_1011_1111_0111_0000 195 | 196 | 3. 
The last step is to get position of last bit set 197 | 198 | input_skip = 64 - __builtin_clzll(expanded) - !!(count != rounded); 199 | = 1101_1010_1101_1101_1011_1111_0111_0000 200 | ^ ^ 201 | LSB 28 202 | 203 | The correction is needed for cases when count != rounded, otherwise 204 | for cases when all converted bytes are consumed we'd end up with 205 | wrong skip amounts. 206 | */ 207 | 208 | const size_t count = __builtin_popcountll(characters_mask); 209 | const size_t rounded = 4 * (count / 4); 210 | 211 | const uint64_t characters_converted = (uint64_t)(-1) >> (64 - rounded - 1); 212 | const uint64_t expanded = _pdep_u64(characters_converted, characters_mask); 213 | const size_t input_skip = 64 - __builtin_clzll(expanded) - !!(count != rounded); 214 | 215 | src += input_skip; 216 | size -= input_skip; 217 | dst += 3 * (count / 4); 218 | } 219 | } 220 | 221 | // this is a really slow part 222 | if (size > 0) { 223 | uint8_t tmp[128]; 224 | 225 | size = despace(tmp, src, size); 226 | scalar = chromium_base64_decode((char*)dst, (const char*)tmp, size); 227 | if (scalar == MODP_B64_ERROR) return MODP_B64_ERROR; 228 | } 229 | 230 | return (dst - start) + scalar; 231 | } 232 | 233 | 234 | size_t decode_base64_avx512vbmi_despace_email(uint8_t* dst, uint8_t** src_ptr, size_t size) { 235 | 236 | const __m512i lookup_0 = _mm512_setr_epi32( 237 | 0x80808080, 0x80808080, 0x80408080, 0x80804080, 238 | 0x80808080, 0x80808080, 0x80808080, 0x80808080, 239 | 0x80808040, 0x80808080, 0x3e808080, 0x3f808080, 240 | 0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080); 241 | const __m512i lookup_1 = _mm512_setr_epi32( 242 | 0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b, 243 | 0x1211100f, 0x16151413, 0x80191817, 0x80808080, 244 | 0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625, 245 | 0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080); 246 | 247 | // despace constants 248 | const uint64_t index_masks[6] = { 249 | 0xaaaaaaaaaaaaaaaa, 250 | 0xcccccccccccccccc, 251 | 0xf0f0f0f0f0f0f0f0, 252 | 
0xff00ff00ff00ff00, 253 | 0xffff0000ffff0000, 254 | 0xffffffff00000000, 255 | }; 256 | 257 | const __m512i index_bits[6] = { 258 | _mm512_set1_epi8(1), 259 | _mm512_set1_epi8(2), 260 | _mm512_set1_epi8(4), 261 | _mm512_set1_epi8(8), 262 | _mm512_set1_epi8(16), 263 | _mm512_set1_epi8(32), 264 | }; 265 | 266 | uint8_t* start = dst; 267 | size_t scalar = 0; 268 | uint8_t* src = *src_ptr; 269 | 270 | while (size >= 64) { 271 | 272 | // 1. load input 273 | __m512i input = _mm512_loadu_si512((const __m512i*)src); 274 | 275 | // 2. translate from ASCII into 6-bit values 276 | __m512i translated = _mm512_permutex2var_epi8(lookup_0, input, lookup_1); 277 | 278 | // 2a. check for errors --- convert MSBs to a mask 279 | const uint64_t error_mask = _mm512_test_epi8_mask(translated | input, _mm512_set1_epi8((int8_t)0x80)); 280 | if (error_mask != 0) { 281 | const size_t first_error = __builtin_ctzll(error_mask); 282 | // we have following cases for BASE64-message tails: 283 | // a) ^BASE64==\n\n--... 284 | if (memcmp(src + first_error, "==\n\n--", 6) == 0) { 285 | size = first_error + 2; 286 | goto tail; 287 | } 288 | 289 | // b) ^BASE64=\n\n--... 290 | if (memcmp(src + first_error, "=\n\n--", 5) == 0) { 291 | size = first_error + 1; 292 | goto tail; 293 | } 294 | 295 | // c) ^BASE64\n\n--... 296 | if (first_error >= 2 && memcmp(src + first_error - 2, "\n\n--", 4) == 0) { 297 | size = first_error - 2; 298 | goto tail; 299 | } 300 | 301 | // we have following cases for valid tails 302 | // a) ^\n\n--... 303 | // b) ^\n--... 304 | // c) ^--... 305 | if ((memcmp(src + first_error, "\n\n--", 4) == 0) || 306 | (memcmp(src + first_error, "\n--", 3) == 0) || 307 | (memcmp(src + first_error, "--", 2) == 0)) { 308 | goto end; 309 | } 310 | 311 | // otherwise return error 312 | return MODP_B64_ERROR; 313 | } 314 | 315 | // 3. 
check if we need there are spaces (bit 6th) 316 | uint64_t whitespace_mask = _mm512_test_epi8_mask(translated, _mm512_set1_epi8(0x40)); 317 | if (whitespace_mask == 0) { 318 | // no despacing 319 | const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(translated, _mm512_set1_epi32(0x01400140)); 320 | const __m512i merged = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); 321 | 322 | const __m512i pack = _mm512_setr_epi32( 323 | 0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112, 324 | 0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425, 325 | 0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38, 326 | 0x00000000, 0x00000000, 0x00000000, 0x00000000); 327 | const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); 328 | 329 | _mm512_storeu_si512((__m512*)dst, shuffled); 330 | 331 | src += 64; 332 | dst += 48; 333 | size -= 64; 334 | } else { 335 | // In emails it's a common case, as all lines have the same length 336 | // and are separated by **single** '\n' char. 337 | if (_blsr_u64(whitespace_mask) == 0) { 338 | 339 | const int idx = __builtin_ctzll(whitespace_mask); 340 | #ifdef ALIGN_LOADS 341 | const __m512i indices = _mm512_load_si512((const __m512i*)(&despace_single[0][0] + 64 * idx)); 342 | #else 343 | const __m512i indices = _mm512_loadu_si512((const __m512i*)(&despace_single[0][0] + 64 * idx)); 344 | #endif // ALIGN_LOADS 345 | 346 | translated = _mm512_permutexvar_epi8(indices, translated); 347 | 348 | const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(translated, _mm512_set1_epi32(0x01400140)); 349 | const __m512i merged = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); 350 | 351 | const __m512i pack = _mm512_setr_epi32( 352 | 0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112, 353 | 0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425, 354 | 0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38, 355 | 0x00000000, 0x00000000, 0x00000000, 0x00000000); 356 | const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); 357 | 358 | 
_mm512_storeu_si512((__m512*)dst, shuffled); 359 | 360 | size_t input_skip = 64 - 3; 361 | if (idx > 60) // don't like this 'if' 362 | input_skip -= 1; 363 | 364 | src += input_skip; 365 | size -= input_skip; 366 | dst += 48 - 3; 367 | 368 | continue; 369 | } 370 | 371 | // despace --- Zach's algorithm starts here 372 | uint64_t characters_mask = ~whitespace_mask; 373 | 374 | __m512i indices = _mm512_set1_epi8(0); 375 | for (size_t index = 0; index < 6; index++) { 376 | uint64_t m = _pext_u64(index_masks[index], characters_mask); 377 | indices = _mm512_mask_add_epi8(indices, m, indices, index_bits[index]); 378 | } 379 | 380 | translated = _mm512_permutexvar_epi8(indices, translated); 381 | // end of despace 382 | 383 | // base64 algorithm 384 | const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(translated, _mm512_set1_epi32(0x01400140)); 385 | const __m512i merged = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); 386 | 387 | const __m512i pack = _mm512_setr_epi32( 388 | 0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112, 389 | 0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425, 390 | 0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38, 391 | 0x00000000, 0x00000000, 0x00000000, 0x00000000); 392 | const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); 393 | 394 | _mm512_storeu_si512((__m512*)dst, shuffled); 395 | 396 | const size_t count = __builtin_popcountll(characters_mask); 397 | const size_t rounded = 4 * (count / 4); 398 | 399 | const uint64_t characters_converted = (uint64_t)(-1) >> (64 - rounded - 1); 400 | const uint64_t expanded = _pdep_u64(characters_converted, characters_mask); 401 | const size_t input_skip = 64 - __builtin_clzll(expanded) - !!(count != rounded); 402 | 403 | src += input_skip; 404 | size -= input_skip; 405 | dst += 3 * (count / 4); 406 | } 407 | } 408 | 409 | tail: 410 | // this is a really slow part 411 | if (size > 0) { 412 | uint8_t tmp[128]; 413 | 414 | size = despace(tmp, src, size); 415 | scalar = 
chromium_base64_decode((char*)dst, (const char*)tmp, size); 416 | if (scalar == MODP_B64_ERROR) return MODP_B64_ERROR; 417 | 418 | src += size; 419 | } 420 | 421 | end: 422 | *src_ptr = src; 423 | return (dst - start) + scalar; 424 | } 425 | 426 | 427 | size_t despace(uint8_t* dst, const uint8_t* src, size_t size) { 428 | uint8_t* orig = dst; 429 | for (size_t i=0; i < size; i++) { 430 | if (src[i] == ' ' || src[i] == '\n' || src[i] == '\r') 431 | continue; 432 | 433 | *dst++ = src[i]; 434 | } 435 | 436 | return dst - orig; 437 | } 438 | 439 | 440 | uint8_t despace_single[64][64] = { 441 | {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 442 | {0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 443 | {0,1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 444 | {0,1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 445 | {0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 446 | {0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 447 | {0,1,2,3,4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 448 | 
{0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 449 | {0,1,2,3,4,5,6,7,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 450 | {0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 451 | {0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 452 | {0,1,2,3,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 453 | {0,1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 454 | {0,1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 455 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 456 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 457 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 458 | 
{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 459 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 460 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 461 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 462 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 463 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 464 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 465 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 466 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 467 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 468 | 
{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 469 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 470 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 471 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 472 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 473 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 474 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 475 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 476 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 477 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 478 | 
{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 479 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 480 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 481 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 482 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 483 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 484 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 485 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 486 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 487 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 488 | 
{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 489 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 490 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,50,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 491 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,51,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 492 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,52,53,54,55,56,57,58,59,60,61,62,63,-1}, 493 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,57,58,59,60,61,62,63,-1}, 494 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,54,55,56,57,58,59,60,61,62,63,-1}, 495 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,55,56,57,58,59,60,61,62,63,-1}, 496 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,56,57,58,59,60,61,62,63,-1}, 497 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,57,58,59,60,61,62,63,-1}, 498 | 
{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,58,59,60,61,62,63,-1}, 499 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,59,60,61,62,63,-1}, 500 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,60,61,62,63,-1}, 501 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,61,62,63,-1}, 502 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,62,63,-1}, 503 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,63,-1}, 504 | {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,-1} 505 | }; 506 | -------------------------------------------------------------------------------- /src/base64/decode_base64_tail_avx512vbmi.c: -------------------------------------------------------------------------------- 1 | // This file is meant to be included into another .c file in 2 | // order to allow inlining. 3 | // 4 | // Procedure decodes tail of whitespace-free base64-encoded 5 | // message, and deals with trailing '='. 
// Decodes the final chunk (at most 64 characters) of a whitespace-free
// base64 message, accepting one or two trailing '=' padding characters.
//
// dst  - destination for the decoded bytes; the final store is masked so
//        only `output_size` bytes are written.
// src  - base64 text; `size` characters are considered.
// size - number of input characters. Values above 64 are rejected only by
//        the assert (i.e. not at all in NDEBUG builds).
//
// Returns the number of decoded bytes, 0 for an empty input, or -1 when
// `size` is not a multiple of 4 or a character outside the base64
// alphabet is found.
int decode_base64_tail_avx512vbmi(uint8_t* dst, const uint8_t* src, size_t size) {

    const char BASE64_PAD = '=';

    assert(size <= 64);
    if (size == 0) {
        return 0;
    }

    if (size % 4 != 0) {
        return -1;
    }

    // One mask bit per input character: the low `size` bits are set.
    uint64_t input_mask = ((uint64_t)-1) >> (64 - size);
    int output_size = (size / 4) * 3;
    // Each trailing '=' removes one output byte and is dropped from the
    // load mask, so the pad character itself never reaches the translation
    // step (it would be flagged as invalid there).
    if (src[size - 1] == BASE64_PAD) {
        output_size -= 1;
        input_mask >>= 1;
        if (src[size - 2] == BASE64_PAD) {
            output_size -= 1;
            input_mask >>= 1;
        }
    }

    // One mask bit per output byte: the low `output_size` bits are set.
    const uint64_t output_mask = (0x0000ffffffffffffllu) >> (48 - output_size);

    // 128-entry ASCII -> 6-bit value table split in two halves
    // (lookup_0: codes 0..63, lookup_1: codes 64..127).
    // 0x80 marks a character that is not part of the base64 alphabet.
    const __m512i lookup_0 = _mm512_setr_epi32(
                                0x80808080, 0x80808080, 0x80808080, 0x80808080,
                                0x80808080, 0x80808080, 0x80808080, 0x80808080,
                                0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
                                0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080);
    const __m512i lookup_1 = _mm512_setr_epi32(
                                0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b,
                                0x1211100f, 0x16151413, 0x80191817, 0x80808080,
                                0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
                                0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080);

    // 1. load input, fill past-end bytes with a valid base64 character
    //    (positions cleared in input_mask take the 'a' filler instead of
    //    being read from src)
    __m512i input = _mm512_mask_loadu_epi8(_mm512_set1_epi8('a'), input_mask, src);

    // 2. translate from ASCII into 6-bit values
    __m512i translated = _mm512_permutex2var_epi8(lookup_0, input, lookup_1);

    // 2a. check for errors --- convert MSBs to a mask
    //     OR-ing in `input` also catches bytes >= 0x80, whose top bit is
    //     not part of the 7-bit table index used by the permute.
    const uint64_t mask = _mm512_test_epi8_mask(translated | input, _mm512_set1_epi8((int8_t)0x80));
    if (mask != 0)
        return -1;

    // 3. pack four 6-bit values into 24-bit words (all within 32-bit lanes)
    //    Note: exactly the same procedure as we have in AVX2 version
    // input:  packed_dword([00dddddd|00cccccc|00bbbbbb|00aaaaaa] x 4)
    // merged: packed_dword([00000000|aaaaabbb|bbbbcccc|ccdddddd] x 4)
    const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(translated,
                                        _mm512_set1_epi32(0x01400140));

    __m512i merged = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));


    // 4. pack 24-bit values into a contiguous array of 48 bytes
    //    (the last four index dwords are zero; those lanes are never
    //    stored because output_mask has at most 48 bits set)
    const __m512i pack = _mm512_setr_epi32(
                                0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112,
                                0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425,
                                0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
                                0x00000000, 0x00000000, 0x00000000, 0x00000000);
    const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);

    // Masked store: only the first output_size bytes of dst are written.
    _mm512_mask_storeu_epi8((__m512*)dst, output_mask, shuffled);

    return output_size;
}
// Base64-encodes `size` bytes from `src` into `dst`.
//
// The vector loop consumes 48 input bytes and emits 64 output characters
// per iteration. It only runs while at least 64 input bytes remain because
// each iteration loads a full 64-byte vector, even though only the first
// 48 bytes are used. Whatever is left is handed to chromium_base64_encode.
//
// The function returns nothing, so the caller must derive the encoded
// length on its own.
// NOTE(review): despite the "avx512vl" name, the loop relies on
// _mm512_permutexvar_epi8 / _mm512_multishift_epi64_epi8, which look like
// AVX-512 VBMI instructions -- confirm the required CPU features.
void encode_base64_avx512vl(uint8_t* dst, const uint8_t* src, size_t size) {

    // The 64-character base64 alphabet; loaded below as one 64-byte vector.
    static const char* lookup_tbl = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    // 32-bit input: [ 0  0  0  0  0  0  0  0|c1 c0 d5 d4 d3 d2 d1 d0|b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
    //                        (unused)                  byte 2                  byte 1                  byte 0
    // output order  [1, 2, 0, 1], i.e.:
    //               [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c5 c4 c3 c2]
    // Every group of three input bytes is expanded into one 32-bit lane.
    // constants generated by script/permutexvar_parameters.py
    const __m512i shuffle_input = _mm512_setr_epi32(
                0x01020001, 0x04050304, 0x07080607, 0x0a0b090a,
                0x0d0e0c0d, 0x10110f10, 0x13141213, 0x16171516,
                0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
                0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
    const __m512i lookup = _mm512_loadu_si512((const __m512i*)(lookup_tbl));

    while (size >= 64) {
        const __m512i v = _mm512_loadu_si512((const __m512i*)src);

        // reorder bytes
        // [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c5 c4 c3 c2]
        const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);

        // after multishift a single 32-bit lane has following layout:
        // [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
        // i.e.: (a = [10:17], b = [4:11], c = [22:27], d = [16:21])

        const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu); // 48, 54, 36, 42, 16, 22, 4, 10
        const __m512i indices = _mm512_multishift_epi64_epi8(shifts, in);

        // Note: the two higher bits of each indices' byte have garbage,
        //       but the following permutexvar instruction masks them out.

        // translation 6-bit values -> ASCII
        const __m512i result = _mm512_permutexvar_epi8(indices, lookup);

        _mm512_storeu_si512((__m512i*)dst, result);

        // 48 bytes consumed, 64 characters produced
        dst += 64;
        src += 48;
        size -= 48;
    }

    // Scalar fallback for the remaining (fewer than 64) bytes.
    if (size > 0) {
        chromium_base64_encode((char*)dst, (const char*)src, size);
    }
}
/**
 * Spreads packed input bytes into one 6-bit value per output byte.
 *
 * The shuffle gives every 32-bit word the bytes of one 3-byte group. The
 * low 128-bit lane reads source bytes 4..15 and the high lane reads bytes
 * 0..11 of its own lane, which matches a vector loaded from `str - 4`
 * (as fast_avx2_base64_encode does), i.e. 24 useful bytes in the middle.
 *
 * The two mask/multiply pairs then act as per-16-bit-lane shifts:
 * mulhi by 0x0040 / 0x0400 shifts right by 10 / 6, and mullo by
 * 0x0010 / 0x0100 shifts left by 4 / 8.
 */
static inline __m256i enc_reshuffle(const __m256i input) {

    // translation from SSE into AVX2 of procedure
    // https://github.com/WojciechMula/base64simd/blob/master/encode/unpack_bigendian.cpp
    const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
        10, 11,  9, 10,
         7,  8,  6,  7,
         4,  5,  3,  4,
         1,  2,  0,  1,

        14, 15, 13, 14,
        11, 12, 10, 11,
         8,  9,  7,  8,
         5,  6,  4,  5
    ));

    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));

    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));

    return _mm256_or_si256(t1, t3);
}

/**
 * Converts 6-bit values (one per byte) into base64 ASCII characters by
 * adding a per-range offset taken from a small table.
 *
 * Table index derivation:
 *   - saturating subtract of 51 yields 0 for values 0..51 and 1..12 for
 *     values 52..63;
 *   - values above 25 get one more added (subtracting the -1 compare mask).
 * Resulting offsets: 0..25 -> +65 ('A'..'Z'), 26..51 -> +71 ('a'..'z'),
 * 52..61 -> -4 ('0'..'9'), 62 -> -19 ('+'), 63 -> -16 ('/').
 */
static inline __m256i enc_translate(const __m256i in) {
    const __m256i lut = _mm256_setr_epi8(
        65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0, 65, 71,
        -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
    __m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));
    __m256i mask = _mm256_cmpgt_epi8((in), _mm256_set1_epi8(25));
    indices = _mm256_sub_epi8(indices, mask);
    __m256i out = _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));
    return out;
}
62 | 63 | const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140)); //_mm256_maddubs_epi16 is likely expensive 64 | __m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000)); 65 | // end of inlined 66 | 67 | // Pack bytes together within 32-bit words, discarding words 3 and 7: 68 | out = _mm256_shuffle_epi8(out, _mm256_setr_epi8( 69 | 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, 70 | 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1 71 | )); 72 | // the call to _mm256_permutevar8x32_epi32 could be replaced by a call to _mm256_storeu2_m128i but it is doubtful that it would help 73 | return _mm256_permutevar8x32_epi32( 74 | out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1)); 75 | } 76 | 77 | 78 | size_t fast_avx2_base64_encode(char* dest, const char* str, size_t len) { 79 | const char* const dest_orig = dest; 80 | if(len >= 32 - 4) { 81 | // first load is masked 82 | __m256i inputvector = _mm256_maskload_epi32((int const*)(str - 4), _mm256_set_epi32( 83 | 0x80000000, 84 | 0x80000000, 85 | 0x80000000, 86 | 0x80000000, 87 | 88 | 0x80000000, 89 | 0x80000000, 90 | 0x80000000, 91 | 0x00000000 // we do not load the first 4 bytes 92 | )); 93 | ////////// 94 | // Intel docs: Faults occur only due to mask-bit required memory accesses that caused the faults. 95 | // Faults will not occur due to referencing any memory location if the corresponding mask bit for 96 | //that memory location is 0. For example, no faults will be detected if the mask bits are all zero. 
97 | //////////// 98 | while(true) { 99 | inputvector = enc_reshuffle(inputvector); 100 | inputvector = enc_translate(inputvector); 101 | _mm256_storeu_si256((__m256i *)dest, inputvector); 102 | str += 24; 103 | dest += 32; 104 | len -= 24; 105 | if(len >= 32) { 106 | inputvector = _mm256_loadu_si256((__m256i *)(str - 4)); // no need for a mask here 107 | // we could do a mask load as long as len >= 24 108 | } else { 109 | break; 110 | } 111 | } 112 | } 113 | size_t scalarret = chromium_base64_encode(dest, str, len); 114 | if(scalarret == MODP_B64_ERROR) return MODP_B64_ERROR; 115 | return (dest - dest_orig) + scalarret; 116 | } 117 | 118 | size_t fast_avx2_base64_decode(char *out, const char *src, size_t srclen) { 119 | char* out_orig = out; 120 | while (srclen >= 45) { 121 | 122 | // The input consists of six character sets in the Base64 alphabet, 123 | // which we need to map back to the 6-bit values they represent. 124 | // There are three ranges, two singles, and then there's the rest. 
125 | // 126 | // # From To Add Characters 127 | // 1 [43] [62] +19 + 128 | // 2 [47] [63] +16 / 129 | // 3 [48..57] [52..61] +4 0..9 130 | // 4 [65..90] [0..25] -65 A..Z 131 | // 5 [97..122] [26..51] -71 a..z 132 | // (6) Everything else => invalid input 133 | 134 | __m256i str = _mm256_loadu_si256((__m256i *)src); 135 | 136 | // code by @aqrit from 137 | // https://github.com/WojciechMula/base64simd/issues/3#issuecomment-271137490 138 | // transated into AVX2 139 | const __m256i lut_lo = _mm256_setr_epi8( 140 | 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 141 | 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A, 142 | 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 143 | 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A 144 | ); 145 | const __m256i lut_hi = _mm256_setr_epi8( 146 | 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, 147 | 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 148 | 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08, 149 | 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10 150 | ); 151 | const __m256i lut_roll = _mm256_setr_epi8( 152 | 0, 16, 19, 4, -65, -65, -71, -71, 153 | 0, 0, 0, 0, 0, 0, 0, 0, 154 | 0, 16, 19, 4, -65, -65, -71, -71, 155 | 0, 0, 0, 0, 0, 0, 0, 0 156 | ); 157 | 158 | const __m256i mask_2F = _mm256_set1_epi8(0x2f); 159 | 160 | // lookup 161 | __m256i hi_nibbles = _mm256_srli_epi32(str, 4); 162 | __m256i lo_nibbles = _mm256_and_si256(str, mask_2F); 163 | 164 | const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles); 165 | const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F); 166 | 167 | hi_nibbles = _mm256_and_si256(hi_nibbles, mask_2F); 168 | const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles); 169 | const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles)); 170 | 171 | if (!_mm256_testz_si256(lo, hi)) { 172 | break; 173 | } 174 | 175 | str = _mm256_add_epi8(str, roll); 176 | // end of copied function 177 | 178 | srclen -= 32; 179 | src += 32; 180 | 181 | // end of inlined function 182 | 183 | // 
Reshuffle the input to packed 12-byte output format: 184 | str = dec_reshuffle(str); 185 | _mm256_storeu_si256((__m256i *)out, str); 186 | out += 24; 187 | } 188 | size_t scalarret = chromium_base64_decode(out, src, srclen); 189 | if(scalarret == MODP_B64_ERROR) return MODP_B64_ERROR; 190 | return (out - out_orig) + scalarret; 191 | } 192 | -------------------------------------------------------------------------------- /src/benchmark.c: -------------------------------------------------------------------------------- 1 | #define _POSIX_C_SOURCE 200212L // enable posix_memalign 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "benchmark.h" 14 | #include "chromiumbase64.h" 15 | #include "fastavxbase64.h" 16 | #include "encode_base64_avx512vbmi.h" 17 | #include "encode_base64_avx512vl.h" 18 | #include "decode_base64_avx512vbmi.h" 19 | #include "decode_base64_avx512vbmi__unrolled.h" 20 | #include "decode_base64_avx512vbmi_despace.h" 21 | #include "avx512memcpy.h" 22 | #include "memalloc.h" 23 | #include "load_file.h" 24 | 25 | static const int repeat = 10; 26 | static const int alignment = 16384; 27 | static const int minimal_volume = 16384; 28 | 29 | // to void inlining! 30 | // just a wrapper around memcpy... 
31 | __attribute__((noinline)) void *copy(void *restrict dst, 32 | const void *restrict src, size_t n) { 33 | return memcpy(dst, src, n); 34 | } 35 | 36 | void testencode(const char *data, size_t datalength, bool verbose) { 37 | if (verbose) 38 | printf("encode a base64 input of %zu bytes, ", datalength); 39 | char *buffer = aligned_malloc(alignment, datalength * 2); 40 | char *tmp = aligned_malloc(alignment, datalength * 2); 41 | size_t expected = chromium_base64_encode(buffer, data, datalength); 42 | if (verbose) 43 | printf("encoded size = %zu \n", expected); 44 | int speedrepeat = repeat * 10; 45 | // ensure we have a 16K volume at all times 46 | while (speedrepeat * datalength < minimal_volume) { 47 | speedrepeat *= 2; 48 | } 49 | int statspeed = 30; 50 | MEASURE_SPEED_WARMUP("memcpy (base64)", copy(buffer, data, datalength), 51 | speedrepeat, statspeed * 10, datalength, false); 52 | MEASURE_SPEED("memcpy (base64)", copy(tmp, buffer, expected), speedrepeat, 53 | statspeed, expected, verbose); 54 | MEASURE_SPEED("Google chrome", 55 | chromium_base64_encode(buffer, data, datalength), speedrepeat, 56 | statspeed, expected, verbose); 57 | MEASURE_SPEED("AVX2", fast_avx2_base64_encode(buffer, data, datalength), 58 | speedrepeat, statspeed, expected, verbose); 59 | MEASURE_SPEED("AVX-512", 60 | encode_base64_avx512vl((uint8_t *)buffer, (const uint8_t *)data, 61 | datalength), 62 | speedrepeat, statspeed, expected, verbose); 63 | aligned_free(buffer); 64 | aligned_free(tmp); 65 | if (verbose) 66 | printf("\n"); 67 | } 68 | 69 | void testdecode(const char *data, size_t datalength, bool verbose) { 70 | if (verbose) 71 | printf("decoding a base64 input of %zu bytes, ", datalength); 72 | if (datalength < 4 || (datalength % 4 != 0)) { 73 | printf("size should be divisible by 4 bytes.\n"); 74 | return; 75 | } 76 | char *buffer = aligned_malloc(alignment, datalength * 2); 77 | size_t expected = chromium_base64_decode(buffer, data, datalength); 78 | if (verbose) 79 | 
/* One real-world benchmark input: a label for the report and the path of
 * the base64 file to load. */
typedef struct RealData {
  const char *description; /* printed before the measurements */
  const char *path;        /* passed to load_file() */
} RealData;

/* Benchmark inputs processed by test_real_data(). The paths are relative,
 * so they resolve against the current working directory. The list is
 * terminated by a { NULL, NULL } sentinel. */
RealData real_data[] = { { "lena [jpg]", "data/lena_color_512.base64" },
                         { "peppers [jpg]", "data/peppers_color.base64" },
                         { "mandril [jpg]", "data/mandril_color.base64" },
                         { "moby_dick [text]", "data/moby_dick.base64" },
                         { "google logo [png]", "data/googlelogo.base64" },
                         { "bing.com social icons [png]", "data/bing.base64" },
                         { "large [zip]", "data/large.base64" },
                         { NULL, NULL } };
1 : 0); 123 | } 124 | return pos; 125 | } 126 | 127 | void test_real_data(bool removespaces) { 128 | 129 | MemoryArray data; 130 | RealData *item; 131 | 132 | for (item = real_data; item->description != NULL; item++) { 133 | printf("%s\n", item->description); 134 | printf("loading file %s \n", item->path); 135 | load_file(item->path, &data); 136 | if (removespaces) { 137 | printf("removing spaces (as a preliminary step), init size = %zu, ", 138 | data.size); 139 | data.size = despace(data.bytes, data.size); 140 | printf(" final size = %zu \n", data.size); 141 | } 142 | testdecode(data.bytes, data.size, true); 143 | char *buffer = aligned_malloc(alignment, data.size * 2); 144 | size_t origsize = chromium_base64_decode(buffer, data.bytes, data.size); 145 | testencode(buffer, origsize, true); 146 | aligned_free(buffer); 147 | aligned_free(data.bytes); 148 | item++; 149 | } 150 | } 151 | 152 | int main(int argc, char *argv[]) { 153 | RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); 154 | char *outputdir = "."; 155 | if (argc > 1) { 156 | outputdir = argv[1]; 157 | printf("Outputting results to directory %s.\n", outputdir); 158 | DIR *dir = opendir(outputdir); 159 | if (dir) { 160 | closedir(dir); 161 | } else { 162 | printf("I can't open the directory.\n"); 163 | return EXIT_FAILURE; 164 | } 165 | } 166 | printf("Testing first with random data.\n"); 167 | const int N = 2048 * 32; 168 | const int step = 2048; 169 | char randombuffer[N]; 170 | for (int k = 0; k < N; ++k) 171 | randombuffer[k] = rand(); 172 | size_t dirlen = strlen(outputdir); 173 | if (dirlen > 128) { 174 | printf("Your directory path is too long.\n"); 175 | return EXIT_FAILURE; 176 | } 177 | char decodingfilename[256]; 178 | const char *decodingfilenamej = "decodingperf.txt"; 179 | strcpy(decodingfilename, outputdir); 180 | decodingfilename[dirlen] = '/'; 181 | strcpy(decodingfilename + dirlen + 1, decodingfilenamej); 182 | char encodingfilename[256]; 183 | const char *encodingfilenamej = 
"encodingperf.txt"; 184 | strcpy(encodingfilename, outputdir); 185 | encodingfilename[dirlen] = '/'; 186 | strcpy(encodingfilename + dirlen + 1, encodingfilenamej); 187 | char realfilename[256]; 188 | const char *realfilenamej = "realperf.txt"; 189 | strcpy(realfilename, outputdir); 190 | realfilename[dirlen] = '/'; 191 | strcpy(realfilename + dirlen + 1, realfilenamej); 192 | printf("All speepds are normalized based on the base64 data size.\n"); 193 | printf("See files %s %s %s... \n", encodingfilename, decodingfilename, 194 | realfilename); 195 | if (freopen(decodingfilename, "w", stdout) == NULL) { 196 | printf("error opening %s \n", decodingfilename); 197 | } 198 | 199 | printf("#displaying speed (GB/s) based on input bytes for memcpy and " 200 | "decoders: memcpy base64, chromium, AVX2, AVX512; first column is " 201 | "number of bytes\n"); 202 | printf("#Each measure is given as a triple (median, min, max)\n"); 203 | for (int l = 1024; l <= N; l += step) { 204 | printf("%d ", l); 205 | char *code = 206 | (char *)aligned_malloc(alignment, chromium_base64_encode_len(l)); 207 | int codedlen = chromium_base64_encode(code, randombuffer, l); 208 | testdecode(code, codedlen, false); 209 | aligned_free(code); 210 | printf("\n"); 211 | } 212 | 213 | if (freopen(encodingfilename, "w", stdout) == NULL) { 214 | printf("error opening %s \n", encodingfilename); 215 | } 216 | printf("#displaying speed (GB/s) based on input bytes for memcpy and " 217 | "encoders: memcpy base64, chromium, AVX2, AVX512 first column is " 218 | "number of bytes\n"); 219 | printf("#Each measure is given as a triple (median, min, max)\n"); 220 | for (int l = 1024; l <= N; l += step) { 221 | printf("%d ", l); 222 | testencode(randombuffer, l, false); 223 | printf("\n"); 224 | } 225 | if (freopen(realfilename, "w", stdout) == NULL) { 226 | printf("error opening %s \n", realfilename); 227 | } 228 | 229 | printf("Testing with real data.\n"); 230 | bool removespaces = true; 231 | 
test_real_data(removespaces); 232 | const char *ttystr = "/dev/tty"; 233 | if (freopen(ttystr, "w", stdout) == NULL) { 234 | printf("error opening %s \n", ttystr); 235 | } 236 | printf("Done.\n"); 237 | return EXIT_SUCCESS; 238 | } 239 | -------------------------------------------------------------------------------- /src/benchmark.h: -------------------------------------------------------------------------------- 1 | #ifndef _BENCHMARK_H_ 2 | #define _BENCHMARK_H_ 3 | #include 4 | #include 5 | #include 6 | #define RDTSC_START(cycles) \ 7 | do { \ 8 | uint32_t cyc_high, cyc_low; \ 9 | __asm volatile("cpuid\n" \ 10 | "rdtsc\n" \ 11 | "mov %%edx, %0\n" \ 12 | "mov %%eax, %1" \ 13 | : "=r"(cyc_high), "=r"(cyc_low) \ 14 | : \ 15 | : /* no read only */ \ 16 | "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \ 17 | ); \ 18 | (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ 19 | } while (0) 20 | 21 | #define RDTSC_STOP(cycles) \ 22 | do { \ 23 | uint32_t cyc_high, cyc_low; \ 24 | __asm volatile("rdtscp\n" \ 25 | "mov %%edx, %0\n" \ 26 | "mov %%eax, %1\n" \ 27 | "cpuid" \ 28 | : "=r"(cyc_high), "=r"(cyc_low) \ 29 | : \ 30 | /* no read only registers */ \ 31 | : "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \ 32 | ); \ 33 | (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ 34 | } while (0) 35 | 36 | static __attribute__((noinline)) uint64_t rdtsc_overhead_func(uint64_t dummy) { 37 | return dummy; 38 | } 39 | 40 | uint64_t global_rdtsc_overhead = (uint64_t)UINT64_MAX; 41 | 42 | #define RDTSC_SET_OVERHEAD(test, repeat) \ 43 | do { \ 44 | uint64_t cycles_start, cycles_final, cycles_diff; \ 45 | uint64_t min_diff = UINT64_MAX; \ 46 | for (int i = 0; i < repeat; i++) { \ 47 | __asm volatile("" :: : /* pretend to clobber */ "memory"); \ 48 | RDTSC_START(cycles_start); \ 49 | test; \ 50 | RDTSC_STOP(cycles_final); \ 51 | cycles_diff = (cycles_final - cycles_start); \ 52 | if (cycles_diff < min_diff) \ 53 | min_diff = cycles_diff; \ 54 | } \ 55 | global_rdtsc_overhead = min_diff; \ 
/*
 * Times `test` with the RDTSC macros and prints cycles per operation.
 *
 *   name     - label printed in verbose mode
 *   test     - expression to time; its value, cast to int, must equal
 *              `expected`, otherwise a message is printed and the
 *              measurement loop stops early
 *   expected - value `test` is expected to produce
 *   pre      - statement executed before every timed run (outside the
 *              timed region)
 *   repeat   - number of timed runs
 *   size     - number of operations one run of `test` represents
 *   verbose  - non-zero: print labelled best and average values;
 *              zero: print three bare numbers (average, min, max)
 *
 * The RDTSC overhead is calibrated on first use if it has not been set,
 * and is subtracted from every measurement.
 * Note: on a mismatch `test` is evaluated a second time to build the
 * diagnostic message.
 */
#define BEST_TIME(name, test, expected, pre, repeat, size, verbose)            \
  do {                                                                         \
    if (global_rdtsc_overhead == UINT64_MAX) {                                 \
      RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat);                      \
    }                                                                          \
    if (verbose)                                                               \
      printf("%-60.60s\t: ", name);                                            \
    fflush(NULL);                                                              \
    uint64_t cycles_start, cycles_final, cycles_diff;                          \
    uint64_t min_diff = (uint64_t) - 1;                                        \
    uint64_t max_diff = (uint64_t)0;                                           \
    uint64_t sum_diff = 0;                                                     \
    for (int i = 0; i < repeat; i++) {                                         \
      pre;                                                                     \
      __asm volatile("" :: : /* pretend to clobber */ "memory");               \
      RDTSC_START(cycles_start);                                               \
      if ((int)test != expected) {                                             \
        printf("not expected (%d , %d )", (int)test, expected);                \
        break;                                                                 \
      }                                                                        \
      RDTSC_STOP(cycles_final);                                                \
      cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead);     \
      if (cycles_diff < min_diff)                                              \
        min_diff = cycles_diff;                                                \
      if (cycles_diff > max_diff)                                              \
        max_diff = cycles_diff;                                                \
      sum_diff += cycles_diff;                                                 \
    }                                                                          \
    uint64_t S = size;                                                         \
    float cycle_per_op = (min_diff) / (double)S;                               \
    float avg_cycle_per_op = (sum_diff) / ((double)S * repeat);                \
    if (verbose)                                                               \
      printf(" %.3f cycles per operation (best) ", cycle_per_op);              \
    if (verbose)                                                               \
      printf("\t%.3f cycles per operation (avg) ", avg_cycle_per_op);          \
    if (verbose)                                                               \
      printf("\n");                                                            \
    if (!verbose)                                                              \
      printf(" %.3f %.3f %.3f ", (sum_diff) / ((double)S *repeat),             \
             (min_diff) / (double)S, (max_diff) / (double)S);                  \
    fflush(NULL);                                                              \
  } while (0)
/* qsort comparator ordering doubles ascending. */
static int comparedouble(const void *a, const void *b) {
  const double lhs = *(const double *)a;
  const double rhs = *(const double *)b;
  if (lhs > rhs)
    return 1;
  else if (lhs < rhs)
    return -1;
  else
    return 0;
}

/*
 * Returns the median of `values[0 .. howmany-1]`.
 *
 * The array is sorted in place as a side effect. For an even count the
 * result is the mean of the two middle elements (indices howmany/2 - 1 and
 * howmany/2); for an odd count it is the single middle element. An empty
 * array yields 0.0 instead of reading out of bounds.
 */
double median(double *values, size_t howmany) {
  if (howmany == 0) {
    return 0.0;
  }
  qsort(values, howmany, sizeof(double), comparedouble);
  const size_t mid = howmany / 2;
  if ((howmany & 1) == 0) { // even case: average the two central values
    return (values[mid - 1] + values[mid]) / 2;
  } else {
    return values[mid]; // given 3, this looks at 1
  }
}
/*
 * Measures the throughput of `test` using CLOCK_MONOTONIC.
 *
 *   name        - label printed in front of the result (verbose mode only)
 *   test        - statement to time
 *   repeat      - how many times `test` runs inside one timed sample
 *   statrepeat  - number of timed samples to collect
 *   sizeinbytes - bytes processed by one execution of `test`
 *   verbose     - non-zero: prints the median throughput followed by a
 *                 newline; zero: prints " median slowest fastest"
 *
 * Throughput is reported in units of 2^30 bytes per second. The sample
 * buffer is released before the macro finishes; if it cannot be allocated
 * a message is printed and no measurement is taken.
 */
#define MEASURE_SPEED(name, test, repeat, statrepeat, sizeinbytes, verbose)    \
  do {                                                                         \
    if (verbose)                                                               \
      printf("%-60.60s\t: ", name);                                            \
    fflush(NULL);                                                              \
    double min_time = DBL_MAX;                                                 \
    double max_time = 0;                                                       \
    double *alltimes = (double *)malloc(sizeof(double) * (statrepeat));        \
    if (alltimes == NULL) {                                                    \
      printf("allocation failed\n");                                           \
      break;                                                                   \
    }                                                                          \
    struct timespec start, end;                                                \
    for (int stati = 0; stati < (statrepeat); stati++) {                       \
      __asm volatile("" :: : /* pretend to clobber */ "memory");               \
      clock_gettime(CLOCK_MONOTONIC, &start);                                  \
      for (int i = 0; i < (repeat); i++) {                                     \
        test;                                                                  \
      }                                                                        \
      clock_gettime(CLOCK_MONOTONIC, &end);                                    \
      double t = time_in_seconds(&start, &end);                                \
      alltimes[stati] = t;                                                     \
      if (t < min_time)                                                        \
        min_time = t;                                                          \
      if (t > max_time)                                                        \
        max_time = t;                                                          \
    }                                                                          \
    /* computed in double so large size * repeat products cannot overflow */   \
    const double total_bytes = (double)(sizeinbytes) * (double)(repeat);       \
    const double bytes_per_unit = 1024 * 1024 * 1024.0;                        \
    double gb_per_s =                                                          \
        total_bytes / (median(alltimes, (statrepeat)) * bytes_per_unit);       \
    double max_gb_per_s = total_bytes / (min_time * bytes_per_unit);           \
    double min_gp_per_s = total_bytes / (max_time * bytes_per_unit);           \
    free(alltimes);                                                            \
    if (verbose)                                                               \
      printf(" %.3f GB/s ", gb_per_s);                                         \
    if (verbose)                                                               \
      printf("\n");                                                            \
    if (!verbose)                                                              \
      printf(" %.3f %.3f %.3f ", gb_per_s, min_gp_per_s, max_gb_per_s);        \
    fflush(NULL);                                                              \
  } while (0)
/*
 * Like BEST_TIME, except that the result is validated by evaluating the
 * `check` expression after each timed run.
 *
 *   name    - label printed in front of the result (verbose mode only)
 *   test    - statement to time
 *   check   - expression evaluated after RDTSC_STOP; a zero value prints
 *             "error" and stops the measurement loop
 *   pre     - statement executed before every timed run (may be empty)
 *   repeat  - number of timed runs
 *   size    - number of operations one execution of `test` represents
 *   verbose - non-zero: prints best and average cycles per operation;
 *             zero: prints " avg best worst" without a newline
 */
#define BEST_TIME_CHECK(name, test, check, pre, repeat, size, verbose)         \
  do {                                                                         \
    if (global_rdtsc_overhead == UINT64_MAX) {                                 \
      RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat);                      \
    }                                                                          \
    if (verbose)                                                               \
      printf("%-60s\t: ", name);                                               \
    fflush(NULL);                                                              \
    uint64_t cycles_start, cycles_final, cycles_diff;                          \
    uint64_t min_diff = (uint64_t) - 1;                                        \
    uint64_t max_diff = (uint64_t)0;                                           \
    uint64_t sum_diff = 0;                                                     \
    for (int i = 0; i < (repeat); i++) {                                       \
      pre;                                                                     \
      __asm volatile("" :: : /* pretend to clobber */ "memory");               \
      RDTSC_START(cycles_start);                                               \
      test;                                                                    \
      RDTSC_STOP(cycles_final);                                                \
      /* parenthesized so a compound check (a == b) is negated as a whole */   \
      if (!(check)) {                                                          \
        printf("error");                                                       \
        break;                                                                 \
      }                                                                        \
      cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead);     \
      if (cycles_diff < min_diff)                                              \
        min_diff = cycles_diff;                                                \
      if (cycles_diff > max_diff)                                              \
        max_diff = cycles_diff;                                                \
      sum_diff += cycles_diff;                                                 \
    }                                                                          \
    uint64_t S = (size);                                                       \
    float cycle_per_op = (min_diff) / (double)S;                               \
    float avg_cycle_per_op = (sum_diff) / ((double)S * (repeat));              \
    if (verbose)                                                               \
      printf(" %.3f cycles per operation (best) ", cycle_per_op);              \
    if (verbose)                                                               \
      printf("\t%.3f cycles per operation (avg) ", avg_cycle_per_op);          \
    if (verbose)                                                               \
      printf("\n");                                                            \
    if (!verbose)                                                              \
      printf(" %.3f %.3f %.3f ", (sum_diff) / ((double)S * (repeat)),          \
             (min_diff) / (double)S, (max_diff) / (double)S);                  \
    fflush(NULL);                                                              \
  } while (0)
/*
 * Builds a synthetic e-mail-like base64 payload: `lines` lines, each made of
 * `line_length` 'V' characters followed by "\r\n", then enough extra 'V'
 * characters to make the total number of base64 characters a multiple of 4.
 *
 *   data_size    - receives the number of bytes written to the buffer
 *                  (base64 characters plus line terminators)
 *   decoded_size - receives the number of base64 characters written
 *                  (line terminators excluded)
 *
 * Returns a malloc'ed buffer the caller must free(), or NULL when the
 * allocation fails (both output sizes are then 0).
 */
uint8_t* email_likedata(size_t lines, size_t line_length, size_t* data_size, size_t* decoded_size) {
    *data_size = 0;
    *decoded_size = 0;

    // one spare line leaves room for the (at most 3) padding characters
    const size_t size = (lines + 1) * (line_length + 2); // +2 = '\r' and '\n'
    uint8_t* buffer = (uint8_t*)malloc(size);
    if (buffer == NULL) {
        return NULL;
    }

    uint8_t* out = buffer;
    for (size_t i = 0; i < lines; i++) {
        memset(out, 'V', line_length); // in base64 'V' maps to 0x15 = 0b010101
        *decoded_size += line_length;
        out += line_length;
        *out++ = '\r';
        *out++ = '\n';
    }

    while (*decoded_size % 4 != 0) {
        *out++ = 'V';
        *decoded_size += 1;
    }

    *data_size = (size_t)(out - buffer);

    return buffer;
}
/*
 * Returns the current wall-clock time from gettimeofday() in microseconds.
 *
 * The seconds field is widened to 64 bits before scaling so the product
 * cannot overflow a narrower time_t, and the timeval is a plain local
 * rather than shared static storage.
 *
 * NOTE(review): the name shadows the standard library's clock() from
 * <time.h>; kept because callers in this file use it.
 */
static uint64_t clock(void) {
    struct timeval now;
    gettimeofday(&now, NULL);
    return ((uint64_t)now.tv_sec * 1000000u) + (uint64_t)now.tv_usec;
}
/*
 * Attachment consumer: writes `size` bytes from `input` to the file
 * "attachement<N>.dat" in the current directory.
 *
 *   extra - points to an int holding N; it is incremented on every call,
 *           whether or not the file could be written
 *
 * Failures to open, write or close the file are reported with perror()
 * and the function returns without aborting the program.
 */
void save_attachment(const uint8_t* input, size_t size, void* extra) {

    int* number = (int*)extra;
    char filename[256];
    snprintf(filename, sizeof(filename), "attachement%d.dat", *number);

    *number += 1;

    printf("Saving attachement as %s\n", filename);
    FILE* f = fopen(filename, "wb");
    if (f == NULL) {
        perror(filename);
        return;
    }

    if (fwrite(input, 1, size, f) != size) {
        perror(filename);
    }

    // fclose flushes buffered data, so a failure here means data was lost
    if (fclose(f) != 0) {
        perror(filename);
    }
}
", inputfile); fflush(stdout); 126 | MemoryArray email; 127 | load_file(inputfile, &email); 128 | puts("done"); 129 | 130 | int number = 0; 131 | decode_email(email.bytes, email.size, save_attachment, &number); 132 | 133 | return EXIT_SUCCESS; 134 | } 135 | -------------------------------------------------------------------------------- /src/load_file.c: -------------------------------------------------------------------------------- 1 | #include "load_file.h" 2 | #include "memalloc.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | static const int alignment = 2048; 11 | 12 | void load_file(const char* path, MemoryArray* data) { 13 | FILE* f = fopen(path, "rb"); 14 | if (f == NULL) { 15 | printf("Can't open '%s': %s\n", path, strerror(errno)); 16 | exit(1); 17 | } 18 | 19 | fseek(f, 0, SEEK_END); 20 | data->size = ftell(f); 21 | fseek(f, 0, SEEK_SET); 22 | 23 | data->bytes = aligned_malloc(alignment, data->size); 24 | if (data->bytes == NULL) { 25 | puts("allocation failed"); 26 | exit(1); 27 | } 28 | 29 | if (fread(data->bytes, 1, data->size, f) != data->size) { 30 | printf("Error reading '%s': %s\n", path, strerror(errno)); 31 | exit(1); 32 | } 33 | 34 | fclose(f); 35 | } 36 | -------------------------------------------------------------------------------- /src/unit.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "chromiumbase64.h" 11 | #include "fastavxbase64.h" 12 | #include "encode_base64_avx512vbmi.h" 13 | #include "encode_base64_avx512vl.h" 14 | #include "decode_base64_avx512vbmi.h" 15 | #include "decode_base64_avx512vbmi__unrolled.h" 16 | 17 | 18 | void print_example(const char * source) { 19 | char * dest1 = (char*) malloc(chromium_base64_encode_len(strlen(source))); 20 | unsigned int len = chromium_base64_decode(dest1, source,strlen(source)); 21 | unsigned int i = 0; 22 | for(; 
/*
 * Round-trip check of the chromium codec on one example:
 *   1. encodes `source` and compares the output with `coded`,
 *   2. decodes `coded` and compares the output with `source`,
 *   3. decodes the freshly encoded buffer and compares it with `source`.
 * All checks are assert()s.
 */
void chromium_checkExample(const char * source, const char * coded) {
  printf("chromium codec check.\n");

  const size_t source_len = strlen(source);

  // step 1: encode
  char * encoded = (char*) malloc(chromium_base64_encode_len(source_len));
  const unsigned int codedlen = chromium_base64_encode(encoded, source, source_len);
  assert(strncmp(encoded, coded, codedlen) == 0);

  // step 2: decode the reference encoding
  char * from_reference = (char*) malloc(chromium_base64_decode_len(codedlen));
  unsigned int len = chromium_base64_decode(from_reference, coded, codedlen);
  assert(len == source_len);
  assert(strncmp(from_reference, source, source_len) == 0);

  // step 3: decode our own encoding
  char * from_encoded = (char*) malloc(chromium_base64_decode_len(codedlen));
  len = chromium_base64_decode(from_encoded, encoded, codedlen);
  assert(len == source_len);
  assert(strncmp(from_encoded, source, source_len) == 0);

  free(encoded);
  free(from_reference);
  free(from_encoded);
}
(*decode_base64_function)(uint8_t* output, const uint8_t* input, size_t size); 70 | 71 | void test( 72 | const char* name, 73 | encode_base64_function encode, 74 | decode_base64_function decode, 75 | const char* source, 76 | const char* coded) 77 | { 78 | printf("%s\n", name); 79 | 80 | size_t size = strlen(source); 81 | size_t encoded_len; 82 | size_t decoded_len; 83 | size_t decoded_len_ret; 84 | 85 | encoded_len = chromium_base64_encode_len(size); 86 | decoded_len = chromium_base64_decode_len(encoded_len); 87 | 88 | uint8_t* dest1 = (uint8_t*)malloc(encoded_len); 89 | encode(dest1, (const uint8_t*)source, size); 90 | assert(memcmp(dest1, coded, encoded_len) == 0); 91 | 92 | uint8_t* dest2 = (uint8_t*)malloc(decoded_len); 93 | decoded_len_ret = decode(dest2, (const uint8_t*)coded, strlen(coded)); 94 | assert(decoded_len_ret == strlen(source)); 95 | assert(memcmp(dest2, source, size) == 0); 96 | 97 | free(dest1); 98 | free(dest2); 99 | } 100 | 101 | static const uint8_t base64_table_enc[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" 102 | "abcdefghijklmnopqrstuvwxyz" 103 | "0123456789+/"; 104 | void fast_avx2_checkError() { 105 | printf("fast_avx2 codec error check.\n"); 106 | char source[64]; 107 | char dest[48]; 108 | for(unsigned int z = 0; z < 64; ++z) { 109 | for(int i = 0; i < 64; ++i) source[i] = base64_table_enc[z]; 110 | int len = fast_avx2_base64_decode(dest, source, 64); 111 | assert(len == 48); 112 | } 113 | for(int z = 0; z < 256 ; ++z) { 114 | bool in_list = false; 115 | for(unsigned int zz = 0; zz < 64 ; ++zz) 116 | if(base64_table_enc[zz] == z) in_list = true; 117 | if(! 
in_list) { 118 | for(int pos = 0; pos < 32; ++pos) { 119 | for(int i = 0; i < 64; ++i) source[i] = 'A'; 120 | source[pos] = z; 121 | int len = fast_avx2_base64_decode(dest, source, 64); 122 | assert(len == -1); 123 | } 124 | } 125 | } 126 | } 127 | 128 | 129 | int main() { 130 | fast_avx2_checkError(); 131 | 132 | // from Wikipedia page 133 | const char * wikipediasource = "Man is distinguished, not only by his reason, but by this singular passion from \ 134 | other animals, which is a lust of the mind, that by a perseverance of delight \ 135 | in the continued and indefatigable generation of knowledge, exceeds the short \ 136 | vehemence of any carnal pleasure."; 137 | const char * wikipediacoded = "TWFuIGlzIGRpc3Rpbmd1aXNoZWQsIG5vdCBvbmx5IGJ5IGhpcyByZWFzb24sIGJ1dCBieSB0aGlz\ 138 | IHNpbmd1bGFyIHBhc3Npb24gZnJvbSBvdGhlciBhbmltYWxzLCB3aGljaCBpcyBhIGx1c3Qgb2Yg\ 139 | dGhlIG1pbmQsIHRoYXQgYnkgYSBwZXJzZXZlcmFuY2Ugb2YgZGVsaWdodCBpbiB0aGUgY29udGlu\ 140 | dWVkIGFuZCBpbmRlZmF0aWdhYmxlIGdlbmVyYXRpb24gb2Yga25vd2xlZGdlLCBleGNlZWRzIHRo\ 141 | ZSBzaG9ydCB2ZWhlbWVuY2Ugb2YgYW55IGNhcm5hbCBwbGVhc3VyZS4="; 142 | 143 | // from https://gobyexample.com/base64-encoding 144 | const char * gosource = "abc123!?$*&()'-=@~"; 145 | const char * gocoded = "YWJjMTIzIT8kKiYoKSctPUB+"; 146 | 147 | // from https://www.tutorialspoint.com/java8/java8_base64.htm 148 | const char * tutosource = "TutorialsPoint?java8"; 149 | const char * tutocoded = "VHV0b3JpYWxzUG9pbnQ/amF2YTg="; 150 | 151 | 152 | chromium_checkExample(wikipediasource,wikipediacoded); 153 | chromium_checkExample(gosource,gocoded); 154 | chromium_checkExample(tutosource,tutocoded); 155 | 156 | fast_avx2_checkExample(wikipediasource,wikipediacoded); 157 | fast_avx2_checkExample(gosource,gocoded); 158 | fast_avx2_checkExample(tutosource,tutocoded); 159 | 160 | test("AVX512VBMI", encode_base64_avx512vbmi, decode_base64_avx512vbmi, wikipediasource, wikipediacoded); 161 | test("AVX512VBMI (unrolled)", encode_base64_avx512vbmi, 
/*
 * Returns how many of the first `size` bytes of `data` are equal to `val`.
 * The buffer is only read, hence the const qualification.
 */
size_t count_bytes(const uint8_t* data, size_t size, uint8_t val) {
    size_t res = 0;
    for (size_t i = 0; i < size; i++) {
        if (data[i] == val) {
            res += 1;
        }
    }

    return res;
}
sythetic data (might take a while)"); 70 | 71 | size_t size = 64 * 3; 72 | uint8_t buffer[size + 1]; 73 | uint8_t dest[size + 1]; 74 | 75 | buffer[size] = '\0'; // make it puts-friendly 76 | 77 | uint8_t BASE64_CHAR = 'V'; 78 | 79 | int passed = 0; 80 | int failed = 0; 81 | 82 | for (size_t i=0; i < size; i++) { 83 | for (size_t j=i; j < size; j++) { 84 | for (size_t k=j; k < size; k++) { 85 | memset(buffer, BASE64_CHAR, size); 86 | buffer[i] = ' '; 87 | buffer[j] = ' '; 88 | buffer[k] = ' '; 89 | 90 | 91 | // 1. assure that input with spaces is valid base64 string 92 | size_t base64_len = count_bytes(buffer, size, BASE64_CHAR); 93 | uint8_t* input = buffer; 94 | size_t input_size = size; 95 | while (base64_len % 4 != 0) { 96 | if (*input == BASE64_CHAR) { 97 | input++; 98 | base64_len -= 1; 99 | input_size -= 1; 100 | } else if (input[input_size - 1] == BASE64_CHAR) { 101 | base64_len -= 1; 102 | input_size -= 1; 103 | } else { 104 | break; 105 | } 106 | } 107 | 108 | // 2. test 109 | if (base64_len % 4 == 0) { 110 | size_t decoded_len = decode_base64_avx512vbmi_despace(dest, input, input_size); 111 | if (decoded_len == (size_t)(-1)) { 112 | printf("failed for i = %lu j = %lu k = %lu\n", i, j, k); 113 | printf("'%*s'\n", (int)input_size, input); 114 | failed += 1; 115 | } 116 | else 117 | passed += 1; 118 | } 119 | } 120 | } 121 | } 122 | 123 | printf("Summary: %d passed, %d failed\n", passed, failed); 124 | } 125 | 126 | 127 | char* insert_spaces(const char* data, double prob); 128 | 129 | typedef size_t (*decode_base64_function)(uint8_t* output, const uint8_t* input, size_t size); 130 | 131 | void test( 132 | const char* name, 133 | decode_base64_function decode, 134 | const char* source, 135 | const char* coded) 136 | { 137 | printf("%s\n", name); 138 | 139 | size_t size = strlen(source); 140 | size_t encoded_len; 141 | size_t decoded_len_ret; 142 | 143 | encoded_len = chromium_base64_encode_len(size); 144 | 145 | uint8_t* dest = (uint8_t*)malloc(encoded_len); 
/* Returns a pseudo-random double in [0, 1] derived from rand(). */
double randd(void) {
    return (double)rand() / RAND_MAX;
}

/*
 * Returns a newly allocated, NUL-terminated copy of `data` in which a
 * single space may be inserted in front of each character.
 *
 * A space is inserted in front of a character whenever randd() > prob.
 * NOTE(review): that makes the insertion probability roughly 1 - prob,
 * not prob -- confirm which was intended before relying on the parameter.
 *
 * The result can never exceed 2 * strlen(data) + 1 bytes, so the buffer is
 * sized once up front; this also makes the empty string safe. The caller
 * owns the result and must free() it. If the allocation fails a message is
 * printed and the program exits with status 1.
 */
char* insert_spaces(const char* data, double prob) {
    const size_t size = strlen(data);

    // worst case: one space per input character, plus the terminator
    char* newdata = (char*)malloc(2 * size + 1);
    if (newdata == NULL) {
        puts("allocation failed");
        exit(1);
    }

    size_t j = 0;
    for (size_t i = 0; i < size; i++) {
        if (randd() > prob) {
            newdata[j] = ' ';
            j += 1;
        }

        newdata[j] = data[i];
        j += 1;
    }

    newdata[j] = '\0';

    return newdata;
}