├── .gitignore ├── LICENSE ├── README.md ├── bold-extra.sty ├── containers_manual.pdf ├── containers_manual.tex ├── freesoftwarelogo.jpg ├── vcl_bool.tex ├── vcl_contributing.tex ├── vcl_conversion.tex ├── vcl_errors_etc.tex ├── vcl_examples.tex ├── vcl_file_list.tex ├── vcl_float_behavior.tex ├── vcl_introduction.tex ├── vcl_manual.pdf ├── vcl_manual.tex ├── vcl_mathematical_functions.tex ├── vcl_operators_and_functions.tex ├── vcl_packages.tex ├── vcl_performance.tex ├── vcl_permute_functions.tex └── vcl_technical_details.tex /.gitignore: -------------------------------------------------------------------------------- 1 | ## Core latex/pdflatex auxiliary files: 2 | *.aux 3 | *.lof 4 | *.log 5 | *.lot 6 | *.fls 7 | *.out 8 | *.toc 9 | *.fmt 10 | *.fot 11 | *.cb 12 | *.cb2 13 | .*.lb 14 | 15 | ## Intermediate documents: 16 | *.dvi 17 | *.xdv 18 | *-converted-to.* 19 | # these rules might exclude image files for figures etc. 20 | # *.ps 21 | # *.eps 22 | # *.pdf 23 | 24 | ## Generated if empty string is given at "Please type another file name for output:" 25 | .pdf 26 | 27 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 28 | *.bbl 29 | *.bcf 30 | *.blg 31 | *-blx.aux 32 | *-blx.bib 33 | *.run.xml 34 | 35 | ## Build tool auxiliary files: 36 | *.fdb_latexmk 37 | *.synctex 38 | *.synctex(busy) 39 | *.synctex.gz 40 | *.synctex.gz(busy) 41 | *.pdfsync 42 | 43 | ## Auxiliary and intermediate files from other packages: 44 | # algorithms 45 | *.alg 46 | *.loa 47 | 48 | # achemso 49 | acs-*.bib 50 | 51 | # amsthm 52 | *.thm 53 | 54 | # beamer 55 | *.nav 56 | *.pre 57 | *.snm 58 | *.vrb 59 | 60 | # changes 61 | *.soc 62 | 63 | # cprotect 64 | *.cpt 65 | 66 | # elsarticle (documentclass of Elsevier journals) 67 | *.spl 68 | 69 | # endnotes 70 | *.ent 71 | 72 | # fixme 73 | *.lox 74 | 75 | # feynmf/feynmp 76 | *.mf 77 | *.mp 78 | *.t[1-9] 79 | *.t[1-9][0-9] 80 | *.tfm 81 | 82 | #(r)(e)ledmac/(r)(e)ledpar 83 | *.end 84 | *.?end 85 | *.[1-9] 86 | *.[1-9][0-9] 87 | 
*.[1-9][0-9][0-9] 88 | *.[1-9]R 89 | *.[1-9][0-9]R 90 | *.[1-9][0-9][0-9]R 91 | *.eledsec[1-9] 92 | *.eledsec[1-9]R 93 | *.eledsec[1-9][0-9] 94 | *.eledsec[1-9][0-9]R 95 | *.eledsec[1-9][0-9][0-9] 96 | *.eledsec[1-9][0-9][0-9]R 97 | 98 | # glossaries 99 | *.acn 100 | *.acr 101 | *.glg 102 | *.glo 103 | *.gls 104 | *.glsdefs 105 | 106 | # gnuplottex 107 | *-gnuplottex-* 108 | 109 | # gregoriotex 110 | *.gaux 111 | *.gtex 112 | 113 | # htlatex 114 | *.4ct 115 | *.4tc 116 | *.idv 117 | *.lg 118 | *.trc 119 | *.xref 120 | 121 | # hyperref 122 | *.brf 123 | 124 | # knitr 125 | *-concordance.tex 126 | # TODO Comment the next line if you want to keep your tikz graphics files 127 | *.tikz 128 | *-tikzDictionary 129 | 130 | # listings 131 | *.lol 132 | 133 | # makeidx 134 | *.idx 135 | *.ilg 136 | *.ind 137 | *.ist 138 | 139 | # minitoc 140 | *.maf 141 | *.mlf 142 | *.mlt 143 | *.mtc[0-9]* 144 | *.slf[0-9]* 145 | *.slt[0-9]* 146 | *.stc[0-9]* 147 | 148 | # minted 149 | _minted* 150 | *.pyg 151 | 152 | # morewrites 153 | *.mw 154 | 155 | # nomencl 156 | *.nlg 157 | *.nlo 158 | *.nls 159 | 160 | # pax 161 | *.pax 162 | 163 | # pdfpcnotes 164 | *.pdfpc 165 | 166 | # sagetex 167 | *.sagetex.sage 168 | *.sagetex.py 169 | *.sagetex.scmd 170 | 171 | # scrwfile 172 | *.wrt 173 | 174 | # sympy 175 | *.sout 176 | *.sympy 177 | sympy-plots-for-*.tex/ 178 | 179 | # pdfcomment 180 | *.upa 181 | *.upb 182 | 183 | # pythontex 184 | *.pytxcode 185 | pythontex-files-*/ 186 | 187 | # thmtools 188 | *.loe 189 | 190 | # TikZ & PGF 191 | *.dpth 192 | *.md5 193 | *.auxlock 194 | 195 | # todonotes 196 | *.tdo 197 | 198 | # easy-todo 199 | *.lod 200 | 201 | # xmpincl 202 | *.xmpi 203 | 204 | # xindy 205 | *.xdy 206 | 207 | # xypic precompiled matrices 208 | *.xyc 209 | 210 | # endfloat 211 | *.ttt 212 | *.fff 213 | 214 | # Latexian 215 | TSWLatexianTemp* 216 | 217 | ## Editors: 218 | # WinEdt 219 | *.bak 220 | *.sav 221 | 222 | # Texpad 223 | .texpadtmp 224 | 225 | # Kile 226 | *.backup 227 | 228 
| # KBibTeX 229 | *~[0-9]* 230 | 231 | # auto folder when using emacs and auctex 232 | ./auto/* 233 | *.el 234 | 235 | # expex forward references with \gathertags 236 | *-tags.tex 237 | 238 | # standalone packages 239 | *.sta 240 | 241 | # generated if using elsarticle.cls 242 | *.spl 243 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | 179 | Copyright 2012-2019 Agner Fog. 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 
183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # manual 2 | The manual for the C++ vector class library is here. 3 | [Download manual](https://github.com/vectorclass/manual/blob/master/vcl_manual.pdf) 4 | 5 | The latest release of the Vector Class Library is in 6 | [releases](https://github.com/vectorclass/version2/releases) 7 | 8 | The current version of the source files is in 9 | [version2](https://github.com/vectorclass/version2) 10 | 11 | Various add-on packages for specific applications are in 12 | [add-on](https://github.com/vectorclass/add-on) 13 | 14 | To re-build the manual: 15 | The pdf manual is built from the .tex files, using MiKTeX and Texmaker. 16 | Run LuaLaTeX twice to fix forward references. 17 | -------------------------------------------------------------------------------- /bold-extra.sty: -------------------------------------------------------------------------------- 1 | % bold-extra.sty - a jiffy to provide access (in latex) to (some of) 2 | % the fonts in ctan directory fonts/cm/mf-extra/bold 3 | % 4 | % by robin fairbairns, rf10@cam.ac.uk November 2001 5 | % 6 | % this package is provided under the provisions of the latex project 7 | % public licence, http://www.latex-project.org/lppl.txt 8 | % 9 | % this package provides font shapes to support bold small caps and tt 10 | % text. 
there is a choice of bold tt fonts, which are selected by 11 | % package options cmbtt and cmttb (this reflects the confusingly 12 | % similar font names). the default (based on the author's estimation 13 | % of the fonts' relative merits) is cmttb. 14 | % 15 | % to use these fonts you need their metafont sources available to your 16 | % tex system (as far as i know, there are no type 1 versions of the 17 | % fonts available yet). place them in an appropriate place under 18 | % fonts/source in your tds texmf tree; place this file somewhere like 19 | % tex/latex/misc in your tree. see 20 | % http://www.tex.ac.uk/cgi-bin/texfaq2html?label=instpackages+wherefiles 21 | % for more details. 22 | 23 | \ProvidesPackage{bold-extra}[2001/11/13 v0.1 Use fonts from cm/mf-extra/bold] 24 | \NeedsTeXFormat{LaTeX2e} 25 | 26 | \newif\if@cmttb 27 | \DeclareOption{cmttb}{\@cmttbtrue} 28 | \DeclareOption{cmbtt}{\@cmttbfalse} 29 | \ExecuteOptions{cmttb} 30 | \ProcessOptions 31 | 32 | % declare bold small caps font 33 | \DeclareFontShape{OT1}{cmr}{b}{sc} 34 | { 35 | <5><6><7><8><9><10><12><10.95><14.4><17.28><20.74><24.88>cmbcsc10 36 | }{} 37 | \DeclareFontShape{OT1}{cmr}{bx}{sc} 38 | {<->ssub*cmr/b/sc}{} 39 | 40 | % declare bold tt font: note, we use cmttb10 by default rather than 41 | % the cmbtt series (which seem over-bold to me) 42 | \if@cmttb 43 | \DeclareFontShape{OT1}{cmtt}{b}{n} 44 | { 45 | <5><6><7><8><9><10><12><10.95><14.4><17.28><20.74><24.88>cmttb10 46 | }{} 47 | \else 48 | \DeclareFontShape{OT1}{cmtt}{b}{n} 49 | { 50 | <5><6><7><8>cmbtt8% 51 | <9>cmbtt9% 52 | <10><12><10.95><14.4><17.28><20.74><24.88>cmbtt10 53 | }{} 54 | \fi 55 | \DeclareFontShape{OT1}{cmtt}{bx}{n} 56 | {<->ssub*cmtt/b/n}{} 57 | -------------------------------------------------------------------------------- /containers_manual.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vectorclass/manual/62cb40d710f8d6180511ba03ca6e09347e06f0b9/containers_manual.pdf -------------------------------------------------------------------------------- /containers_manual.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt,a4paper,oneside,openright]{report} 2 | 3 | \usepackage[bindingoffset=5mm,left=20mm,right=20mm,top=20mm,bottom=20mm,footskip=10mm]{geometry} 4 | \usepackage[utf8x]{inputenc} 5 | \usepackage{hyperref} 6 | \usepackage[english]{babel} 7 | \usepackage{listings} 8 | \usepackage{subfiles} 9 | \usepackage{longtable} 10 | \usepackage{multirow} 11 | \usepackage{ragged2e} 12 | \usepackage{cmap} % avoid fi ligatures in pdf file 13 | \usepackage{amsthm} % example numbering 14 | \usepackage{color} 15 | %\usepackage{bold-extra} % for bold tt font. Remember to include bold-extra.sty file 16 | \usepackage{graphicx} 17 | \usepackage[yyyymmdd]{datetime} 18 | \usepackage{float} 19 | 20 | % style for code listing 21 | \renewcommand{\familydefault}{\sfdefault} 22 | \renewcommand{\ttdefault}{pcr} % selects Courier font 23 | \newtheorem{example}{Example}[chapter] % example numbering 24 | \lstset{language=C} % formatting for code listing 25 | \lstset{basicstyle=\ttfamily,breaklines=true} 26 | \definecolor{darkGreen}{rgb}{0,0.4,0} 27 | \definecolor{mybrown}{rgb}{0.40,0.10,0.05} 28 | \lstset{commentstyle=\color{darkGreen}} % comments color 29 | \lstset{keywordstyle=\color{blue}} % keyword color 30 | \lstset{stringstyle=\color{mybrown}} % string color 31 | \lstset{showstringspaces=false} % don't mark spaces in strings 32 | 33 | \renewcommand{\dateseparator}{-} 34 | 35 | % command for turning indent back on after \flushleft 36 | \newcommand{\indenton}{\RaggedRight\parindent=15pt} 37 | 38 | % command for vertical space 39 | \newcommand{\vspacesmall}{\vspace{3mm}} 40 | \newcommand{\vspacebig}{\vspace{6mm}} 41 | 42 | % style for code inlined in text: 43 | 
\newcommand{\codei}[1]{\bfseries \ttfamily{#1}\normalfont} 44 | 45 | 46 | \begin{document} 47 | 48 | \begin{titlepage} 49 | \centering 50 | 51 | \null %empty box needed for vfill to work 52 | \vfill 53 | 54 | {\bfseries\Huge 55 | Container class templates 56 | \vspacesmall 57 | 58 | extension for C++ vector class library 59 | \vspacebig 60 | 61 | } 62 | \vspacebig 63 | 64 | {\Large 65 | Agner Fog 66 | \vspacebig 67 | 68 | \copyright\ \today. Apache license 2.0 69 | } 70 | 71 | \vfill 72 | 73 | \includegraphics[width=306pt]{freesoftwarelogo.jpg} 74 | \vfill 75 | 76 | \end{titlepage} 77 | 78 | \RaggedRight 79 | 80 | \chapter{Introduction}\label{chap:Introduction} 81 | 82 | A container class template is a piece of C++ code that is useful for allocating memory space for a list of objects. It is similar to an array but with additional functionality and security. 83 | \vspacesmall 84 | 85 | C++ programmers routinely use the C++ standard containers (previously known as the standard template library) for this purpose. Unfortunately, the standard 86 | C++ container templates can be quite inefficient. They are optimized for generality and flexibility, while efficiency has been sacrificed. Many of the standard C++ containers are implemented as linked lists that allocate memory in a lot of separate small pieces. This is inefficient because of a lot of heap management overhead, memory fragmentation, and poor caching. Many C++ programmers are routinely implementing a matrix as a nested container (vector of vectors) which is even more inefficient. 87 | \vspacesmall 88 | 89 | The container class templates provided here are intended to fill the need for more efficient containers with contiguous memory storage. Some of these containers are tailor-made to fit the classes defined in the vector class library (VCL). Containers for other types of objects are also included. 
90 | \vspacesmall 91 | 92 | Overview of container class templates: 93 | \begin {table}[H] 94 | \caption{Container class templates} 95 | \label{table:containerClassTemplates} 96 | \begin{tabular}{|p{44mm}|p{70mm}|p{35mm}|} 97 | \hline 98 | \bfseries Template & \bfseries Description & \bfseries Header file \\ \hline 99 | ContainerG & Linear array of any type of objects, \newline dynamic size & general\_containers.h \\ \hline 100 | ContainerV & Linear array of vectors, fixed size or \newline dynamic size & vector\_containers.h \\ \hline 101 | MatrixV & Matrix. Rows are stored as VCL vectors & matrixv.h \\ \hline 102 | \end{tabular} 103 | \end{table} 104 | \vspacebig 105 | 106 | 107 | \chapter{Description of container templates}\label{chap:DescriptionTemplates} 108 | 109 | \section{ContainerG} \label{ContainerG} 110 | 111 | {\bfseries Declaration}\\ 112 | \codei{template <typename T> class ContainerG}; 113 | \vspacebig 114 | 115 | This container class template makes a linear array with dynamic size. \codei{ContainerG} is independent of the vector class library and can be used for most kinds of objects. 116 | \vspacesmall 117 | 118 | The type \codei{T} can be a simple type such as \codei{int} or \codei{float}, or a composite type such as a \codei{struct}, \codei{class}, or \codei{union}. The container will likely not work if the type \codei{T} has a non-default constructor, destructor, copy constructor, or move constructor. The type \codei{T} cannot be another container, but it can be a \codei{struct} containing a fixed-size array. 
119 | \vspacesmall 120 | 121 | \begin{lstlisting}[frame=none] 122 | // Example: 123 | #include <stdio.h> 124 | #include "general_containers.h" 125 | 126 | // Function for error reporting 127 | void error_reporter() { 128 | fprintf(stderr, "\nError: index out of range\n"); 129 | } 130 | 131 | int main () { 132 | // Declare a container for float elements 133 | ContainerG<float> my_array; 134 | 135 | // Register error_reporter function to report any errors 136 | my_array.set_error_handler(error_reporter); 137 | 138 | // Set the size of the array 139 | my_array.set_size(10); 140 | 141 | // Put data into a C-style array 142 | const int listsize = 8; 143 | float list[listsize]; 144 | for (int i = 0; i < listsize; i++) list[i] = float(i); 145 | 146 | // Load 8 elements into my_array 147 | my_array.load(listsize, list); 148 | 149 | // Print contents of my_array 150 | for (int i = 0; i < my_array.size(); i++) { 151 | printf(" %.2f", my_array[i]); 152 | } 153 | 154 | // Increase the size of the array 155 | my_array.set_size(12); 156 | 157 | // Change last element (index goes from 0 to size()-1 ) 158 | my_array[my_array.size()-1] = 88; 159 | 160 | // Print contents again 161 | printf("\n\n"); 162 | for (int i = 0; i < my_array.size(); i++) { 163 | printf(" %.2f", my_array[i]); 164 | } 165 | } 166 | 167 | /* Output: 168 | 0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 0.00 0.00 169 | 170 | 0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 0.00 0.00 0.00 88.00 171 | */ 172 | \end{lstlisting} 173 | \vspacebig 174 | 175 | {\bfseries Member functions:}\\ 176 | \vspacebig 177 | 178 | {\bfseries void set\_size(int size)}\\ 179 | Sets the size of the container, i.e. the number of elements it can contain.\\ 180 | The size can be changed at any time. Increasing the size will make the code allocate a new internal memory buffer if the current memory buffer is not big enough. 
All contents will be copied to the new memory buffer and the old buffer will be deleted.\\ 181 | The code may allocate a bigger memory block than requested in order to avoid frequent re-allocations if the size is increased in small amounts. 182 | The code does not re-allocate memory if the size is decreased to a non-zero value.\\ 183 | Calling \codei{set\_size} with a size of zero will delete the allocated memory and set everything to the initial condition. This may free memory for other purposes, but it is not needed because the container has a destructor that will free the allocated memory anyway. 184 | \vspacebig 185 | 186 | {\bfseries int size()}\\ 187 | Returns the current size of the container, i.e. the number of elements it can contain. 188 | \vspacebig 189 | 190 | {\bfseries int allocated\_size()}\\ 191 | Gives the size of the internal buffer, which may be bigger than specified by the last call to \codei{set\_size}. \codei{allocated\_size()} is the maximum size that can be set without reallocation of the internal memory. 192 | \vspacebig 193 | 194 | {\bfseries T \& operator [] (int index)}\\ 195 | The operator \codei{[]} works like an array index. This makes it possible to read or write a single element in the array. The code checks that the index is within the range $0 \leq$ \codei{index} $<$ \codei{size()}. 196 | \vspacebig 197 | 198 | {\bfseries void load(int n, T const {*} p)}\\ 199 | Loads \codei{n} objects from an array \codei{p}.\\ 200 | \codei{n} is the size of the array \codei{p} or the maximum number of elements to load. If \codei{n} is bigger than the size of the \codei{ContainerG} then it is reduced to the size of the container. 201 | \vspacebig 202 | 203 | {\bfseries void store(int n, T {*} p)}\\ 204 | Stores \codei{n} objects to an array \codei{p}.\\ 205 | \codei{n} is the size of the array \codei{p} or the maximum number of elements to store. 
If \codei{n} is bigger than the size of the \codei{ContainerG} then it is reduced to the size of the container. 206 | \vspacebig 207 | 208 | {\bfseries set\_error\_handler(void ({*}err)(void))}\\ 209 | Saves a pointer to an error handling function. This function will be called in case an index is out of range. The error handling function should issue an error message in a way that is appropriate for the actual user interface. The program will crash in case of an index out of range if no error handler is set. 210 | \vspacebig 211 | 212 | {\bfseries T {*} get\_buf()}\\ 213 | Returns a pointer to the internal buffer. It is important to remember that any pointer or reference to elements in the container will be invalid after the size has been increased. 214 | \vspacebig 215 | 216 | \subsection{Recycling of memory} \label{RecyclingOfMemory} 217 | It may be more efficient to reuse a container for a new purpose during the course of the program than to delete each container when it is no longer needed and create a new one. This will improve memory caching. 218 | \vspacesmall 219 | 220 | The container may be resized for every new use. It can be useful to specify an estimated maximum size before first use of the container, and then reduce and increase the size as required during the course of the program. 221 | \vspacesmall 222 | 223 | The memory is zeroed at the first call to \codei{set\_size}, but it is not necessarily cleared when the container is later resized. 224 | \vspacebig 225 | 226 | 227 | \section{ContainerV} \label{ContainerV} 228 | 229 | {\bfseries Declaration}\\ 230 | \codei{template <typename V, int N> class ContainerV}; 231 | \vspacebig 232 | 233 | This container is designed for VCL vectors only. Vectors of all integer and floating point types are allowed, but not boolean types. Access is provided to each vector as well as to individual vector elements. 
234 | \vspacesmall 235 | 236 | \begin{lstlisting}[frame=none] 237 | // Example: 238 | #include <stdio.h> 239 | #include <vectorclass.h> 240 | #include <vector_containers.h> 241 | 242 | // Make container of four vectors of 8 float values each 243 | ContainerV<Vec8f, 4> c; 244 | // Array of floats 245 | float list[32] = {0,1,2,3,4,5,6}; 246 | // Load array into container 247 | c.load(32, list); 248 | // Change one vector in container 249 | c.set_vector(Vec8f(16,17,18,19,20,21,22,23), 2); 250 | // Change one vector element in container 251 | c.set_element(-99, 5); 252 | // Loop through vectors: 253 | for (int i = 0; i < c.n_vectors(); i++) { 254 | // Loop through elements of each vector: 255 | for (int j = 0; j < c.get_vector(0).size(); j++) { 256 | // Print value: 257 | printf(" %6.2f", c.get_vector(i)[j]); 258 | } 259 | // Next vector on new line: 260 | printf("\n"); 261 | } 262 | 263 | /* Output: 264 | 0.00 1.00 2.00 3.00 4.00 -99.00 6.00 0.00 265 | 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 266 | 16.00 17.00 18.00 19.00 20.00 21.00 22.00 23.00 267 | 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 268 | */ 269 | \end{lstlisting} 270 | \vspacebig 271 | 272 | {\bfseries Defined type \codei{etype} }\\ 273 | The template defines \codei{etype} as the type of the vector elements. For example, if the container is based on vectors of type \codei{Vec8f}, then \codei{etype} is the type \codei{float}. 274 | \vspacebig 275 | 276 | {\bfseries Member functions:}\\ 277 | \vspacebig 278 | 279 | {\bfseries int n\_vectors()}\\ 280 | Returns the number of vectors. 281 | \vspacebig 282 | 283 | {\bfseries int n\_elements()}\\ 284 | Returns the number of vector elements. 285 | \vspacebig 286 | 287 | {\bfseries int elementtype()}\\ 288 | Returns the \codei{elementtype()} of the underlying vector class + 0x1000. 289 | \vspacebig 290 | 291 | {\bfseries vector\_type get\_vector(int index)}\\ 292 | Returns one vector from the position indicated by index. (The first vector has index 0). 
293 | \vspacebig 294 | 295 | {\bfseries set\_vector(vector\_type x, int index)}\\ 296 | Replaces one vector at the position indicated by index. 297 | \vspacebig 298 | 299 | {\bfseries etype get\_element(int index)}\\ 300 | Returns a single vector element. The index runs consecutively through all vectors in the container, from 0 to (number of vectors) * (elements per vector) - 1. 301 | \vspacebig 302 | 303 | {\bfseries set\_element(etype x, int index)}\\ 304 | Replaces a single vector element. The index runs consecutively through all vectors in the container, from 0 to (number of vectors) * (elements per vector) - 1. 305 | \vspacebig 306 | 307 | {\bfseries load(int n, void const {*} p)}\\ 308 | Loads values from an array into the container. p points to an array of type 309 | \codei{etype}. n is the array size or the maximum number of vector elements to load. If n is not a multiple of the vector size then the last vector will be partially filled. If n is bigger than the container size, then it is limited to the container size. If n is smaller than the container size, then the remaining full vectors are unchanged. 310 | \vspacebig 311 | 312 | {\bfseries store(int n, void * p)}\\ 313 | Stores values from the container into an array. p points to an array of type 314 | \codei{etype}. 315 | n is the array size or the maximum number of vector elements to store. n does not have to be a multiple of the vector size. If n is bigger than the container size, then it is limited to the container size. 316 | \vspacebig 317 | 318 | {\bfseries vector\_type {*} get\_buf()}\\ 319 | Returns a pointer to the internal buffer. Note that this pointer will be invalid if the size of the container is later increased (see below). 320 | \vspacebig 321 | 322 | {\bfseries zero()}\\ 323 | Sets all vectors and all vector elements in the container to zero. Does not change the size of the container. 
324 | \vspacebig 325 | 326 | {\bfseries set\_error\_handler(void ({*}err)(void))}\\ 327 | Saves a pointer to an error handling function. This function will be called in case an index is out of range. The error handling function should issue an error message in a way that is appropriate for the actual user interface. The program will crash in case of an index out of range if no error handler is set. 328 | 329 | \begin{lstlisting}[frame=none] 330 | // Example: 331 | #include 332 | #include 333 | #include 334 | 335 | void error_reporter() { 336 | fprintf(stderr, "\nError: index out of range\n"); 337 | } 338 | 339 | // Make container of four vectors of 8 float values each 340 | ContainerV c; 341 | 342 | // Set the error handler 343 | c.set_error_handler(error_reporter); 344 | 345 | \end{lstlisting} 346 | \vspacebig 347 | 348 | 349 | \subsection{Dynamic size} \label{ContainerVDynamicSize} 350 | The \codei{ContainerV} template has a dynamic size if, and only if, the initial size is 0. 351 | \vspacesmall 352 | 353 | \begin{lstlisting}[frame=none] 354 | // Example: 355 | #include 356 | #include 357 | #include 358 | 359 | // Make dynamic container of a variable number of vectors 360 | ContainerV c; 361 | // Set the size 362 | c.set_nvectors(6); 363 | // Change one vector in container 364 | c.set_vector(Vec8f(16,17,18,19,20,21,22,23), 2); 365 | // Loop through vectors: 366 | for (int i = 0; i < c.n_vectors(); i++) { 367 | // Loop through elements of each vector: 368 | for (int j = 0; j < c.get_vector(0).size(); j++) { 369 | // Print value: 370 | printf(" %6.2f", c.get_vector(i)[j]); 371 | } 372 | // Next vector on new line: 373 | printf("\n"); 374 | } 375 | 376 | /* Output: 377 | 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 378 | 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 379 | 16.00 17.00 18.00 19.00 20.00 21.00 22.00 23.00 380 | 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 381 | 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 382 | 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 383 | */ 384 | 
\end{lstlisting} 385 | \vspacebig 386 | 387 | {\bfseries Additional member functions for dynamic size}\\ 388 | The following member functions are only available for ContainerV if the initial size is 0: 389 | \vspacebig 390 | 391 | {\bfseries set\_nvectors(int size)}\\ 392 | Changes the size of the container. On the first call to this function, the code allocates a memory block big enough to contain the specified number of vectors. If a later call to \codei{set\_nvectors} increases the size, then it will allocate a new bigger memory block if necessary, copy all data to the new memory block, and delete the old memory block. The code may allocate a bigger memory block than requested in order to avoid frequent re-allocations if the size is increased in small amounts. 393 | The code does not re-allocate memory if the size is decreased to a non-zero value. 394 | Calling \codei{set\_nvectors} with a size of zero will delete the allocated memory and set everything to the initial condition. This may free memory for other purposes, but it is not needed because the container has a destructor that frees the allocated memory anyway. 395 | \vspacebig 396 | 397 | {\bfseries set\_nelements(int n)}\\ 398 | This function is similar to \codei{set\_nvectors}, but makes it possible to set a size that is not a multiple of the vector size. The size is rounded up to a multiple of the vector size in order to determine the amount of memory to allocate, while the last vector will be only partially used. 399 | \codei{n\_vectors()} will report the number of vectors used, including any partially used vector, while \codei{n\_elements()} will return the value set by \codei{set\_nelements(n)}. 400 | \vspacebig 401 | 402 | {\bfseries int allocated\_size()}\\ 403 | This function returns the actual amount of memory allocated, including any unused memory. The unit is the vector size, similar to \codei{set\_nvectors}. 
404 | \vspacebig 405 | 406 | \subsection{Recycling of memory} \label{RecyclingOfMemory} 407 | It may be more efficient to reuse a container for a new purpose during the course of the program than to delete each container when it is no longer needed and create a new one. This will improve memory caching. 408 | \vspacesmall 409 | 410 | The container may be resized for every new use. It can be useful to specify an estimated maximum size before first use of the container, and then reduce and increase the size as required during the course of the program. 411 | \vspacesmall 412 | 413 | The memory is zeroed at the first call to \codei{set\_nvectors}, but it is not necessarily cleared when the container is later resized. A container with fixed size contains random data initially. 414 | \vspacebig 415 | 416 | 417 | \section{MatrixV} \label{MatrixV} 418 | MatrixV is a container for representing numerical data as a matrix. Each row in the matrix is stored as one or more VCL vectors. MatrixV can also be used for storing a list of vectors where each vector is represented as a row, and the number of columns corresponds to the number of elements in each vector. 419 | \vspacesmall 420 | 421 | MatrixV is optimized for accessing the matrix as rows rather than columns. 422 | \vspacesmall 423 | 424 | {\bfseries Declaration}\\ 425 | \codei{template 426 | class MatrixV;} 427 | \vspacebig 428 | 429 | The vector type \codei{V} can be any floating point or integer vector class. Boolean vectors are not supported. (It is more efficient to pack boolean vectors into integer bitfields). 430 | \vspacesmall 431 | 432 | If the number of columns is larger than the number of elements in the vector class \codei{V} then the template will use multiple vectors for each row.
If the number of columns is less than the number of elements in the vector class \codei{V} then the template will use the smallest possible vector class that fits the number of columns. Any extra elements in \codei{V} will be unused. The template will not store multiple rows in one vector, but start each row with a new vector. It is possible to specify \codei{V} as the largest possible vector with the desired element type and leave it to the template to find the smallest vector of the same element type that fits the number of columns. The MatrixV template will not use vectors larger than \codei{V}. 433 | \vspacesmall 434 | 435 | {\bfseries Defined type \codei{row\_vector\_type} }\\ 436 | The template defines \codei{row\_vector\_type} as the vector class used for storing rows. This may be the specified vector class \codei{V} or a smaller vector class with the same element type. 437 | \vspacesmall 438 | 439 | {\bfseries Defined type \codei{etype} }\\ 440 | The template defines \codei{etype} as the type of the vector elements. For example, if the container is based on vectors of type \codei{Vec8f}, then \codei{etype} is the type \codei{float}. 441 | \vspacebig 442 | 443 | {\bfseries Member functions:}\\ 444 | \vspacebig 445 | 446 | {\bfseries int nrows()}\\ 447 | Returns the number of rows in the matrix. 448 | \vspacebig 449 | 450 | {\bfseries int ncolumns()}\\ 451 | Returns the number of columns in the matrix. 452 | \vspacebig 453 | 454 | {\bfseries int vectors\_per\_row()}\\ 455 | Returns the number of vectors of class \codei{row\_vector\_type} that are used for each row. Any partially used vector is included. 456 | \vspacebig 457 | 458 | {\bfseries int full\_vectors\_per\_row()}\\ 459 | Returns the number of fully-used vectors of class \codei{row\_vector\_type} that are used for each row. Any partially used vector is not included. 
460 | \vspacebig 461 | 462 | {\bfseries int partial\_vector\_elements()}\\ 463 | If the number of columns is not divisible by the number of elements in vector class \codei{row\_vector\_type} then the last vector in each row will be partially used. This function returns the number of used elements in a partially-used vector. The function returns 0 if there are no partially used vectors. 464 | \vspacebig 465 | 466 | {\bfseries set\_error\_handler(void ({*}err)(void))}\\ 467 | This function is used for registering a function for reporting errors such as an index out of range. \codei{err} should be a function that reports the error in a way that is appropriate for the actual user interface. The program will crash in case a row or column index is out of range if no error-handling function is registered. 468 | \vspacebig 469 | 470 | {\bfseries void ({*}get\_error\_handler())(void)}\\ 471 | Returns a pointer to the function set by \codei{set\_error\_handler}. 472 | \vspacebig 473 | 474 | {\bfseries row\_vector\_type get\_row(int r, int i = 0)}\\ 475 | Returns row number \codei{r} as a vector of class \codei{row\_vector\_type}. \\ 476 | Row numbers go from 0 to \codei{nrows()-1}.\\ 477 | If each row contains more than one vector then call \codei{get\_row} multiple times with \codei{i} going from 0 to \codei{vectors\_per\_row() - 1}. 478 | \vspacebig 479 | 480 | {\bfseries set\_row(row\_vector\_type x, int r, int i = 0)}\\ 481 | Sets row number \codei{r} to a vector of class \codei{row\_vector\_type}. \\ 482 | Row numbers go from 0 to \codei{nrows()-1}.\\ 483 | If each row contains more than one vector then call \codei{set\_row} multiple times with \codei{i} going from 0 to \codei{vectors\_per\_row() - 1}. 484 | \vspacebig 485 | 486 | {\bfseries etype get\_element(int row, int column)}\\ 487 | Returns a single element from the matrix.\\ 488 | The row number goes from 0 to \codei{nrows()-1}.\\ 489 | The column number goes from 0 to \codei{ncolumns()-1}. 
490 | \vspacebig 491 | 492 | {\bfseries set\_element(etype x, int row, int column)}\\ 493 | Changes a single element in the matrix.\\ 494 | The row number goes from 0 to \codei{nrows()-1}.\\ 495 | The column number goes from 0 to \codei{ncolumns()-1}. 496 | \vspacebig 497 | 498 | {\bfseries load(void const {*} p)}\\ 499 | Fills the entire matrix with data from a C-style matrix or linear array pointed to by \codei{p}. The matrix or array must contain (rows * columns) elements.\\ 500 | The elements are retrieved in row-major order in accordance with the C standard. 501 | \vspacebig 502 | 503 | {\bfseries store(void {*} p)}\\ 504 | A C-style matrix or array pointed to by \codei{p} is filled with all data from the entire matrix. The number of elements stored at \codei{ {*}p} is (rows * columns).\\ 505 | The elements are stored in row-major order in accordance with the C standard. 506 | \vspacebig 507 | 508 | {\bfseries zero()}\\ 509 | Sets all elements in the matrix to zero. 510 | \vspacebig 511 | 512 | 513 | \subsection{Initializing a MatrixV} \label{InitializingMatrixV} 514 | A \codei{MatrixV} is not initialized when it is first constructed. The contents of uninitialized matrix elements are unpredictable. 515 | The matrix can be initialized by the \codei{load} member function or by multiple calls to \codei{set\_row}. It is less efficient to set all elements individually with \codei{set\_element}. 516 | \vspacesmall 517 | 518 | The internal vectors contain unused vector elements if the number of columns is not divisible by the vector size. 519 | It is recommended to call the \codei{zero} member function first if the matrix elements are initialized with \codei{set\_element} only, in order to clear any unused vector elements. Otherwise, the unused vector elements may contain random values when they are retrieved by \codei{get\_row} or by the pack functions described below.
520 | \vspacebig 521 | 522 | 523 | \subsection{Pack and unpack functions} \label{PackAndUnpack} 524 | Several functions are defined for packing multiple matrix rows into one big vector and for unpacking such a vector into multiple matrix rows. These functions cannot be used if the number of columns is too big for multiple rows to fit into a single large vector. 525 | \vspacesmall 526 | 527 | The pack and unpack functions are useful in cases where matrix elements are accessed in other patterns than rowwise and when permutations are needed, such as matrix transposition and matrix-by-matrix products. The pack and unpack functions support all floating point vector classes and integer vector classes with integer types of at least 16 bits. 8-bit integers are not supported. 528 | \vspacebig 529 | 530 | {\bfseries Pack functions}\\ 531 | \codei{template auto pack2rows(M \& matrix, int first\_row)}\\ 532 | \codei{template auto pack3rows(M \& matrix, int first\_row)}\\ 533 | \codei{template auto pack4rows(M \& matrix, int first\_row)}\\ 534 | \codei{template auto pack5rows(M \& matrix, int first\_row)} 535 | \vspacebig 536 | 537 | These functions will pack n consecutive rows of a \codei{MatrixV} matrix into a single vector, provided that a vector class with sufficient size exists. 538 | \vspacesmall 539 | 540 | For example, a matrix with 5 rows and 3 columns with elements of type \codei{float} can be packed in the following ways. 541 | \codei{pack2rows} will pack two consecutive rows into a vector of type \codei{Vec8f} with the elements of two rows in the first 6 vector positions, while the last two vector positions are unused. \codei{pack3rows} will pack three rows into a \codei{Vec16f} with the first 9 elements used and the last 7 elements unused. \codei{pack4rows} will use 12 elements, and \codei{pack5rows} can pack the entire matrix into 15 elements of a \codei{Vec16f} with the last element unused. \codei{first\_row} indicates the start row, where row numbers start at 0. 
542 | \vspacesmall 543 | 544 | The pack functions automatically find a vector size that fits the number of data elements packed. You will get a compilation error if no sufficiently big vector class exists. It is not possible to pack the rows into multiple vectors with a single call to a pack function. 545 | \vspacesmall 546 | 547 | The error handling function set by \codei{set\_error\_handler} for the matrix will be called in case any of the row indexes is out of range. For example, calling \codei{pack3rows} with \codei{first\_row} set to 3 on a matrix with 5 rows will give an error because the last row is out of range. The program will crash if there is no error handling function and a row index is out of range. 548 | \vspacebig 549 | 550 | {\bfseries Unpack functions}\\ 551 | \codei{template unpack2rows(V rr, M \& matrix, int first\_row)}\\ 552 | \codei{template unpack3rows(V rr, M \& matrix, int first\_row)}\\ 553 | \codei{template unpack4rows(V rr, M \& matrix, int first\_row)}\\ 554 | \codei{template unpack5rows(V rr, M \& matrix, int first\_row)}\\ 555 | \vspacebig 556 | 557 | These functions do the opposite of the pack functions. The large vector \codei{rr} is unpacked to fill n consecutive rows of \codei{matrix}, starting at row \codei{first\_row}. The unpack functions can be used for initializing or modifying a matrix. 558 | \vspacesmall 559 | 560 | A row index out of range will be indicated by a call to the error handling function in the same way as for the pack functions. 561 | \vspacebig 562 | 563 | 564 | \subsection{Examples} \label{MatrixExamples} 565 | The following examples illustrate how to use the \codei{MatrixV} container, its member functions, and the pack and unpack functions.
566 | \vspacebig 567 | 568 | \begin{lstlisting}[frame=none] 569 | #include 570 | #include 571 | #include 572 | 573 | // Function for reporting an error message 574 | void error_reporter() { 575 | fprintf(stderr, "\nError: index out of range\n"); 576 | } 577 | 578 | // Function for printing a whole matrix 579 | template 580 | void print_matrix(M & matrix) { 581 | // row loop 582 | for (int r = 0; r < matrix.nrows(); r++) { 583 | // column loop 584 | for (int c = 0; c < matrix.ncolumns(); c++) { 585 | // print one element 586 | printf(" %6.2f", matrix.get_element(r, c)); 587 | } 588 | printf("\n"); // new line for next row 589 | } 590 | } 591 | 592 | int main() { 593 | 594 | // C-style matrix with 3 rows and 4 columns 595 | float Amatrix[3][4]; 596 | 597 | // Put data into this matrix 598 | for (int r = 0; r < 3; r++) { 599 | for (int c = 0; c < 4; c++) { 600 | Amatrix[r][c] = float(r + 0.1 * c); 601 | } 602 | } 603 | 604 | // Vector-based matrix with 3 rows and 4 columns 605 | MatrixV A; 606 | 607 | // Set an error handler 608 | A.set_error_handler(error_reporter); 609 | 610 | // Load data from C-style matrix Amatrix into vector-based matrix A 611 | A.load(Amatrix); 612 | 613 | // Print matrix A 614 | printf("\n A:\n"); 615 | print_matrix(A); 616 | 617 | // Pack matrix A into one big vector (A has 3 rows) 618 | Vec16f Apack = pack3rows(A, 0); 619 | 620 | // Transpose matrix A 621 | const int d = V_DC; // d means don't care 622 | Vec16f Atransposed = permute16< 623 | 0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, d, d, d, d>(Apack); 624 | 625 | // Vector-based matrix for A transposed. 
Has 4 rows and 3 columns 626 | MatrixV B; 627 | 628 | // Set an error handler 629 | B.set_error_handler(error_reporter); 630 | 631 | // Unpack Atransposed vector into matrix B (B has 4 rows) 632 | unpack4rows(Atransposed, B, 0); 633 | 634 | // Row operations 635 | 636 | // Subtract 2 * row2 from row1 in matrix B 637 | auto row1 = B.get_row(1); // row 1 as Vec4f 638 | auto row2 = B.get_row(2); // row 2 as Vec4f 639 | auto new_row1 = row1 - 2.0f * row2; // calculate new row 1 640 | B.set_row(new_row1, 1); // insert new row 1 641 | 642 | // Print matrix B 643 | printf("\n B:\n"); 644 | print_matrix(B); 645 | 646 | // Matrix multiplication 647 | 648 | // Pack matrix B into one big vector 649 | Vec16f Bpack = pack4rows(B, 0); 650 | 651 | // Calculate matrix product A * B 652 | Vec16f AxBpack = 653 | 654 | permute16<0, 0, 0, 4, 4, 4, 8, 8, 8, d, d, d, d, d, d, d>(Apack) 655 | * permute16<0, 1, 2, 0, 1, 2, 0, 1, 2, d, d, d, d, d, d, d>(Bpack) 656 | 657 | + permute16<1, 1, 1, 5, 5, 5, 9, 9, 9, d, d, d, d, d, d, d>(Apack) 658 | * permute16<3, 4, 5, 3, 4, 5, 3, 4, 5, d, d, d, d, d, d, d>(Bpack) 659 | 660 | + permute16<2, 2, 2, 6, 6, 6,10,10,10, d, d, d, d, d, d, d>(Apack) 661 | * permute16<6, 7, 8, 6, 7, 8, 6, 7, 8, d, d, d, d, d, d, d>(Bpack) 662 | 663 | + permute16<3, 3, 3, 7, 7, 7,11,11,11, d, d, d, d, d, d, d>(Apack) 664 | * permute16<9,10,11, 9,10,11, 9,10,11, d, d, d, d, d, d, d>(Bpack); 665 | 666 | // Product matrix has 3 rows and 3 columns 667 | MatrixV AxB; 668 | 669 | // Set an error handler 670 | AxB.set_error_handler(error_reporter); 671 | 672 | // Unpack product vector into matrix AxB 673 | unpack3rows(AxBpack, AxB, 0); 674 | 675 | // Print product matrix AxB 676 | printf("\n AxB:\n"); 677 | print_matrix(AxB); 678 | } 679 | 680 | /* Output: 681 | A: 682 | 0.00 0.10 0.20 0.30 683 | 1.00 1.10 1.20 1.30 684 | 2.00 2.10 2.20 2.30 685 | 686 | B: 687 | 0.00 1.00 2.00 688 | -0.30 -1.30 -2.30 689 | 0.20 1.20 2.20 690 | 0.30 1.30 2.30 691 | 692 | AxB: 693 | 0.10 0.50 
0.90 694 | 0.30 2.70 5.10 695 | 0.50 4.90 9.30 696 | */ 697 | \end{lstlisting} 698 | \vspacebig 699 | 700 | \end{document} 701 | -------------------------------------------------------------------------------- /freesoftwarelogo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/manual/62cb40d710f8d6180511ba03ca6e09347e06f0b9/freesoftwarelogo.jpg -------------------------------------------------------------------------------- /vcl_bool.tex: -------------------------------------------------------------------------------- 1 | % chapter included in vclmanual.tex 2 | \documentclass[vcl_manual.tex]{subfiles} 3 | \begin{document} 4 | 5 | \flushleft 6 | 7 | \chapter{Boolean operations and per-element branches}\label{chap:BooleanOperations} 8 | Consider this piece of C++ code: 9 | 10 | \begin{lstlisting}[frame=none] 11 | int a[4], b[4], c[4], d[4]; 12 | ... 13 | for (int i = 0; i < 4; i++) { 14 | d[i] = (a[i] > 0 && a[i] < 10) ? b[i] : c[i]; 15 | } 16 | \end{lstlisting} 17 | \vspacesmall 18 | 19 | We can do this with vectors in the following way: 20 | 21 | \begin{lstlisting}[frame=none] 22 | Vec4i a, b, c, d; 23 | ... 24 | d = select(a > 0 & a < 10, b, c); 25 | \end{lstlisting} 26 | \vspacesmall 27 | 28 | The \codei{select} function is similar to the \codei{?:} operator. 29 | It has three vector parameters: The first parameter is a boolean vector that chooses between the elements of the second and the third vector parameter. 30 | \vspacesmall 31 | 32 | The relational operators \codei{\textgreater}, \codei{\textgreater=}, \codei{\textless}, \codei{\textless=}, \codei{==}, \codei{!=} produce boolean vectors, 33 | which accept the boolean operations \codei{\&}, 34 | \codei{|}, \codei{$\wedge$}, \codei{$\sim$} (and, or, exclusive or, not). 
35 | \vspacesmall 36 | 37 | In the above example, the expressions \codei{a \textgreater{} 0} and \codei{a \textless{} 10} are boolean vectors of type \codei{Vec4ib}. The boolean vectors must have a type that matches the data vectors they are used with. Table \ref{table:BooleanVectorClasses} on page \pageref{table:BooleanVectorClasses} shows which boolean vector class to use for each vector type. 38 | \vspacesmall 39 | 40 | The vector elements that are not selected are calculated anyway because normally all parts of a vector are calculated. For example: 41 | 42 | \begin{lstlisting}[frame=none] 43 | Vec4f a(-1.0f, 0.0f, 1.0f, 2.0f); 44 | Vec4f b = select(a >= 0.0f, sqrt(a), 0.0f); 45 | \end{lstlisting} 46 | \vspacesmall 47 | 48 | Here, we will be calculating the square root of -1 even though we are not using it. This will not cause problems if floating point exceptions are masked off, which they normally are. A safe solution that works even if floating point exceptions are enabled would be: 49 | 50 | \begin{lstlisting}[frame=none] 51 | Vec4f a(-1.0f, 0.0f, 1.0f, 2.0f); 52 | Vec4f b = sqrt(max(a, 0.0f)); 53 | \end{lstlisting} 54 | \vspacesmall 55 | 56 | 57 | Likewise, the \codei{\&} and \codei{|} operators are calculating both input operands, even if the second operand is not needed. 
The following example illustrates this: 58 | 59 | \begin{lstlisting}[frame=none] 60 | // array version: 61 | float a[4] = {0.0f, 1.0f, 2.0f, 3.0f}; 62 | float b[4]; 63 | for (int i = 0; i < 4; i++) { 64 | if (a[i] > 0.0f && 1.0f/a[i] != 4.0f) { 65 | b[i] = a[i]; 66 | } 67 | else { 68 | b[i] = 1.0f; 69 | } 70 | } 71 | \end{lstlisting} 72 | \vspacesmall 73 | 74 | and the vector version of the same: 75 | 76 | \begin{lstlisting}[frame=none] 77 | Vec4f a(0.0f, 1.0f, 2.0f, 3.0f); 78 | Vec4f b = select(a > 0.0f & 1.0f/a != 4.0f, a, 1.0f); 79 | \end{lstlisting} 80 | \vspacesmall 81 | 82 | In the array version, we will never divide by zero because the \codei{\&\&} operator does not evaluate the second operand when the first operand is false. But in the vector version, we are indeed dividing by zero because the \codei{\&} operator always evaluates both operands. The vector class library defines the operators \codei{\&\&} and \codei{||} as synonyms for \codei{\&} and \codei{|} for convenience, but they are still doing the bitwise AND or OR operation, so \codei{\&} and \codei{|} are actually more representative of what these operators really do. This example should be changed to: 83 | 84 | \begin{lstlisting}[frame=none] 85 | Vec4f a(0.0f, 1.0f, 2.0f, 3.0f); 86 | Vec4f b = select(a > 0.0f & a != 0.25f, a, 1.0f); 87 | \end{lstlisting} 88 | \vspacesmall 89 | 90 | 91 | \section{Internal representation of boolean vectors}\label{InternalRepresentationOfBoolean} 92 | 93 | The way boolean vectors are stored depends on the instruction set and the Vector Class Library (VCL) version. 94 | Older instruction sets have the boolean vectors stored with the same number of bits as the data vectors they are applied to (broad boolean vectors). The later instruction sets AVX512 and AVX512VL allow boolean vectors to be stored with only one bit for each element (compact boolean vectors).
95 | \vspacesmall 96 | 97 | Version 1.xx of the VCL is using the broad boolean vectors for the sake of backwards compatibility, while version 2.xx is prioritizing the more efficient compact boolean vectors when the appropriate instruction set is enabled. The boolean vector sizes are summarized in the following table. 98 | \vspacesmall 99 | 100 | \label{tableBooleanVectorSizes} 101 | \begin{tabular}{|p{50mm}|p{40mm}|p{40mm}|} 102 | \hline 103 | \bfseries Data vector size \newline and instruction set & \bfseries VCL version 1 \newline Boolean vectors & \bfseries VCL version 2 \newline Boolean vectors \\ \hline 104 | 128 bits & broad & broad \\ \hline 105 | 128 bits with AVX512VL & broad & compact \\ \hline 106 | 256 bits & broad & broad \\ \hline 107 | 256 bits with AVX512VL & broad & compact \\ \hline 108 | 512 bits & broad & broad \\ \hline 109 | 512 bits with AVX512F & compact & compact \\ \hline 110 | \end{tabular} 111 | \vspacebig 112 | 113 | The broad boolean vectors are stored as integer vectors with the same number of bits per element as the integer or floating point vectors they are used for. For example, the broad boolean vector class \codei{Vec4fb} is stored as a vector of four 32-bit integers because it is used with vectors \codei{Vec4f} of four single precision floating point numbers, using 32 bits each. The broad boolean vector class \codei{Vec4db} is stored as a vector of four 64-bit integers because it is used with vectors \codei{Vec4d} of four double precision floating point numbers, using 64 bits each. Note that the integer representation of true in a broad boolean vector element is not 1, but -1. The representation of false is 0. Any other values than 0 and -1 in broad boolean vectors will produce wrong and inconsistent results that depend on the instruction set. 114 | \vspacesmall 115 | 116 | The compact boolean vectors are stored with one bit per element (at least 8 bits). 
117 | You should make no assumption about how boolean vectors are stored if your code may be compiled for different instruction sets or different versions of VCL. For example, 118 | \codei{Vec16ib} uses 16 bits of storage when compiling for AVX512, but 512 bits of storage when compiling for AVX2. Do not store boolean vectors directly to binary files, and do not transmit boolean vectors between different functions that may be compiled for different instruction sets or different VCL versions. 119 | \vspacesmall 120 | 121 | Different compact boolean vectors are mutually compatible if they have the same number of elements. Different broad boolean vectors are mutually compatible if they have the same number of elements and the same number of bits. Broad and compact boolean vectors are not compatible with each other. See page \pageref{ConversionBetweenBooleanTypes} for conversion between different types of boolean vectors. 122 | \vspacesmall 123 | 124 | 125 | \section{Functions for use with booleans}\label{FunctionsForBooleans} 126 | 127 | \vspacesmall 128 | \begin{tabular}{|p{30mm}|p{120mm}|} 129 | \hline 130 | \bfseries Function & vector select(boolean vector s, vector a, vector b) \\ \hline 131 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 132 | \bfseries Description & branch per element.\newline 133 | result[i] = s[i] ? a[i] : b[i] \\ \hline 134 | \bfseries Efficiency & good \\ \hline 135 | \end{tabular} 136 | \begin{lstlisting}[frame=none] 137 | // Example: 138 | Vec4i a(-1, 0, 1, 2); 139 | Vec4i b = select(a>0, a+10, a-10); // b = (-11,-10,11,12) 140 | \end{lstlisting} 141 | \vspacesmall 142 | 143 | 144 | \begin{tabular}{|p{30mm}|p{120mm}|} 145 | \hline 146 | \bfseries Function & vector if\_add(boolean vector f, vector a, vector b) \\ \hline 147 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 148 | \bfseries Description & conditional addition \newline 149 | result[i] = f[i] ? 
(a[i] + b[i]) : a[i] \\ \hline 150 | \bfseries Efficiency & good \\ \hline 151 | \end{tabular} 152 | \begin{lstlisting}[frame=none] 153 | // Example: 154 | Vec4i a(-1, 0, 1, 2); 155 | Vec4i b = if_add(a < 0, a, 100); // b = (99,0,1,2) 156 | \end{lstlisting} 157 | \vspacesmall 158 | 159 | \begin{tabular}{|p{30mm}|p{120mm}|} 160 | \hline 161 | \bfseries Function & vector if\_sub(boolean vector f, vector a, vector b) \\ \hline 162 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 163 | \bfseries Description & conditional subtraction \newline 164 | result[i] = f[i] ? (a[i] - b[i]) : a[i] \\ \hline 165 | \bfseries Efficiency & good \\ \hline 166 | \end{tabular} 167 | \vspacebig 168 | 169 | \begin{tabular}{|p{30mm}|p{120mm}|} 170 | \hline 171 | \bfseries Function & vector if\_mul(boolean vector f, vector a, vector b) \\ \hline 172 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 173 | \bfseries Description & conditional multiplication\newline 174 | result[i] = f[i] ? (a[i] * b[i]) : a[i] \\ \hline 175 | \bfseries Efficiency & good \\ \hline 176 | \end{tabular} 177 | \vspacebig 178 | 179 | \begin{tabular}{|p{30mm}|p{120mm}|} 180 | \hline 181 | \bfseries Function & vector if\_div(boolean vector f, vector a, vector b) \\ \hline 182 | \bfseries Defined for & all floating point vector classes \\ \hline 183 | \bfseries Description & conditional division\newline 184 | result[i] = f[i] ? 
(a[i] / b[i]) : a[i] \\ \hline 185 | \bfseries Efficiency & medium \\ \hline 186 | \end{tabular} 187 | \vspacebig 188 | 189 | 190 | \begin{tabular}{|p{30mm}|p{120mm}|} 191 | \hline 192 | \bfseries Function & vector andnot(vector, vector) \\ \hline 193 | \bfseries Defined for & all boolean vector classes \\ \hline 194 | \bfseries Description & andnot(a,b) = a \& $\sim$ b \\ \hline 195 | \bfseries Efficiency & good \\ \hline 196 | \end{tabular} 197 | \vspacebig 198 | 199 | 200 | \begin{tabular}{|p{30mm}|p{120mm}|} 201 | \hline 202 | \bfseries Function & bool horizontal\_and(boolean vector) \\ \hline 203 | \bfseries Defined for & all boolean vector classes \\ \hline 204 | \bfseries Description & The output is the AND combination of all elements \\ \hline 205 | \bfseries Efficiency & Medium for broad boolean vectors. Better if SSE4.1 or later. Good for compact boolean vectors \\ \hline 206 | \end{tabular} 207 | \begin{lstlisting}[frame=none] 208 | // Example: 209 | Vec4i a(-1, 0, 1, 2); 210 | bool b = horizontal_and(a > 0); // b = false 211 | \end{lstlisting} 212 | \vspacesmall 213 | 214 | 215 | \begin{tabular}{|p{30mm}|p{120mm}|} 216 | \hline 217 | \bfseries Function & bool horizontal\_or(boolean vector) \\ \hline 218 | \bfseries Defined for & all boolean vector classes \\ \hline 219 | \bfseries Description & The output is the OR combination of all elements \\ \hline 220 | \bfseries Efficiency & Medium for broad boolean vectors. Better if SSE4.1 or later. 
Good for compact boolean vectors \\ \hline 221 | \end{tabular} 222 | \begin{lstlisting}[frame=none] 223 | // Example: 224 | Vec4i a(-1, 0, 1, 2); 225 | bool b = horizontal_or(a > 0); // b = true 226 | \end{lstlisting} 227 | \vspacesmall 228 | 229 | 230 | \begin{tabular}{|p{30mm}|p{120mm}|} 231 | \hline 232 | \bfseries Function & int horizontal\_find\_first(boolean vector) \\ \hline 233 | \bfseries Defined for & all boolean vector classes \\ \hline 234 | \bfseries Description & Returns an index to the first element that is true. 235 | Returns -1 if all elements are false \\ \hline 236 | \bfseries Efficiency & medium \\ \hline 237 | \end{tabular} 238 | \begin{lstlisting}[frame=none] 239 | // Example: 240 | Vec4i a(1, 2, 3, 4); 241 | Vec4i b(0, 2, 3, 5); 242 | int c = horizontal_find_first(a == b); // c = 1 243 | \end{lstlisting} 244 | \vspacesmall 245 | 246 | 247 | \begin{tabular}{|p{30mm}|p{120mm}|} 248 | \hline 249 | \bfseries Function & unsigned int horizontal\_count(boolean vector) \\ \hline 250 | \bfseries Defined for & all boolean vector classes \\ \hline 251 | \bfseries Description & counts the number of elements that are true \\ \hline 252 | \bfseries Efficiency & medium if SSE4.2 or later \\ \hline 253 | \end{tabular} 254 | \begin{lstlisting}[frame=none] 255 | // Example: 256 | Vec4i a(1, 2, 3, 4); 257 | Vec4i b(0, 2, 3, 5); 258 | int c = horizontal_count(a == b); // c = 2 259 | \end{lstlisting} 260 | \vspacesmall 261 | 262 | \end{document} 263 | -------------------------------------------------------------------------------- /vcl_contributing.tex: -------------------------------------------------------------------------------- 1 | % chapter included in vclmanual.tex 2 | \documentclass[vcl_manual.tex]{subfiles} 3 | \begin{document} 4 | 5 | 6 | \section{Making add-on packages}\label{MakingPackages} 7 | \flushleft 8 | 9 | Anybody can contribute add-on packages for VCL. 
Contributors must follow these guidelines: 10 | \vspacebig 11 | 12 | 13 | \textbf{Purpose}\\ 14 | The package must serve a general purpose that is useful for others. The code must rely on the VCL. 15 | \vspacebig 16 | 17 | \textbf{Open source}\\ 18 | The package must be published under an open source license. 19 | The preferred license is the same as for VCL, i.e. Apache 2.0 license or later. 20 | Other accepted licenses include GPL 3.0 or later, LGPL 3.0 or later, and revised BSD license. 21 | \vspacebig 22 | 23 | \textbf{Documentation}\\ 24 | The package must include an instruction manual in English. The manual may be supplied in one of these formats: 25 | \begin{itemize} 26 | \item Plain text as an ASCII .txt file 27 | \item Plain text as a comment at the beginning of the code file 28 | \item A .pdf file. The source needed for modifying and rebuilding the .pdf file must be included. 29 | The file format of the pdf source must be .tex, .odt, or .docx. Closed, proprietary file formats are not allowed. 30 | \end{itemize} 31 | The documentation must include the name and contact information of at least one person responsible for maintaining the code. 32 | \vspacesmall 33 | 34 | VCL does not use Doxygen or other kinds of metadata for generating documentation. You may use an advanced IDE such as Microsoft Visual Studio for navigating, tracing, browsing, and finding cross-references. 35 | \vspacebig 36 | 37 | 38 | \textbf{Coding style}\label{CodingStyle} \\ 39 | The code must be in C++ language, with file format .h and/or .cpp. 40 | Names and comments must use English language. 41 | Name, date, and version number must be written in a comment at the beginning of each code file. 42 | \vspacesmall 43 | 44 | The file format is plain ASCII. UTF-8 should be avoided if possible. 45 | Use Windows-style linefeeds, i.e. \textbackslash r\textbackslash n. 46 | Indent 4 spaces for every block level. Tabs are not allowed. 
Remember to set the option in your editor to use spaces instead of tabs. 47 | \vspacesmall 48 | 49 | The purposes of all classes, functions, and variables must be explained in comments unless they are self-explanatory. 50 | \vspacesmall 51 | 52 | Use curly brackets for branches and loops. A closing curly bracket must be placed on a separate line. An opening curly bracket does not need a separate line. 53 | \codei{else-if} may be contracted without an extra curly bracket. Example: 54 | \begin{lstlisting}[frame=none] 55 | if (a < 0) { 56 | // negative 57 | } 58 | else if (a == 0) { 59 | // zero 60 | } 61 | else { 62 | // positive 63 | } 64 | \end{lstlisting} 65 | \vspacebig 66 | 67 | 68 | \textbf{Optimization}\\ 69 | 70 | All functions and operators in .h files should be \codei{static} and \codei{inline}. 71 | \vspacesmall 72 | 73 | Do not optimize the code for a specific microprocessor, but focus on what is likely to be optimal on future microprocessor models. The most likely bottlenecks to consider are cache use, instruction decoding, and dependency chains. Small loops are usually more efficient than large unrolled loops. 74 | \vspacesmall 75 | 76 | Minimize the use of static constants because they take up memory space even when they are not used. 77 | Static constants may be stored in templates that are not instantiated if they are not used. 78 | \vspacesmall 79 | 80 | Preprocessing \codei{\#define}'s must have unique names that are unlikely to cause name clashes because they are in the global namespace. It is preferred to use \codei{const int} etc. instead for defining constants. 81 | \vspacesmall 82 | 83 | \textbf{Testing}\\ 84 | Any code must be thoroughly tested with the latest version of VCL before submission. 85 | It should preferably be tested with multiple different compilers and different operating systems. 86 | Add-on packages may have their own test bench. 
87 | \vspacesmall 88 | 89 | 90 | \section{Contributing to VCL}\label{Contributing} 91 | \textbf{Bug reports}\\ 92 | Bug reports should preferably be filed as issues on the git repository. 93 | Please check the list of known bugs at the git repository under 94 | \href{https://github.com/vectorclass/miscellaneous}{miscellaneous}. 95 | 96 | \vspacesmall 97 | 98 | \textbf{Avoid feature bloat}\\ 99 | Do not put new features into the main VCL files unless there is general agreement that they are needed. Special purpose features should instead be placed in add-on packages. 100 | \vspacesmall 101 | 102 | The coding style must follow the guidelines listed above on page \pageref{CodingStyle}. Do not insert metadata for Doxygen or similar tools. Follow the optimization guidelines mentioned above. 103 | \vspacesmall 104 | 105 | Any modification to the main VCL files should be tested with different compilers and different operating systems on the test bench described below in chapter \ref{chap:TestBench}. Avoid any files or features that are specific to a particular CPU, operating system, platform, or development tool. 106 | \vspacesmall 107 | 108 | Copyright is a problem. If different contributions are copyrighted by different contributors then it will be impossible to make any legal decisions regarding VCL if not all contributors can be contacted. There are plans to assign the copyright to a non-profit organization, but no particular organization has been chosen yet. 109 | \vspacesmall 110 | 111 | 112 | \section{Test bench}\label{chap:TestBench} 113 | A test bench has been developed for the purpose of automatic testing of VCL. 114 | The test bench includes C++ code and a bash script for automatic testing of operators and functions. The script will run through a list of test cases to test each operator and function with many different combinations of vector classes, instruction sets, compilers, and operating systems. 
Each test case will be implemented by compiling and running a small test program and comparing the resulting values with the expected values. 115 | \vspacesmall 116 | 117 | The test bench is used in the development of VCL. It is not intended for programmers that use the VCL. All code and documentation for the test bench is provided in the folder named testbench. 118 | \vspacesmall 119 | 120 | 121 | \end{document} -------------------------------------------------------------------------------- /vcl_conversion.tex: -------------------------------------------------------------------------------- 1 | % chapter included in vclmanual.tex 2 | \documentclass[vcl_manual.tex]{subfiles} 3 | \begin{document} 4 | 5 | 6 | \chapter{Conversion between vector types}\label{Conversion between vector types} 7 | \flushleft 8 | 9 | Below is a list of methods and functions for conversion between different vector types, vector sizes or precisions. 10 | \vspacebig 11 | 12 | \section{Conversion between data vector types} 13 | 14 | \begin{tabular}{|p{30mm}|p{120mm}|} 15 | \hline 16 | \bfseries Method & conversion between vector class and intrinsic vector type \\ \hline 17 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 18 | \bfseries Description & conversion between a vector class and the corresponding intrinsic vector type \_\_m128, \_\_m128d, \_\_m128i, \_\_m256, \_\_m256d, \_\_m256i, \_\_m512, \_\_m512d, \_\_m512i can be done implicitly or explicitly. \newline 19 | Boolean vectors can be converted to their internal representation, which is an integer vector for broad boolean vectors, or a single integer for compact boolean vectors. 
\\ \hline 20 | \bfseries Efficiency & good \\ \hline 21 | \end{tabular} 22 | \begin{lstlisting}[frame=none] 23 | // Example: 24 | Vec4i a(0,1,2,3); 25 | __m128i b = a; // b = 0x00000003000000020000000100000000 26 | Vec4i c = b; // c = (0,1,2,3) 27 | \end{lstlisting} 28 | \vspacesmall 29 | 30 | 31 | \begin{tabular}{|p{30mm}|p{120mm}|} 32 | \hline 33 | \bfseries Method & conversion from scalar to vector \\ \hline 34 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 35 | \bfseries Description & conversion from a scalar (single value) to a vector can be done explicitly by calling a constructor, or implicitly by putting a scalar where a vector is expected. All vector elements get the same value. \\ \hline 36 | \bfseries Efficiency & good for constant. Medium for variable as parameter \\ \hline 37 | \end{tabular} 38 | \begin{lstlisting}[frame=none] 39 | // Example: 40 | Vec4i a, b; 41 | a = Vec4i(5); // explicit conversion. a = (5,5,5,5) 42 | b = a + 3; // implicit conversion to Vec4i. b = (8,8,8,8) 43 | \end{lstlisting} 44 | \vspacesmall 45 | 46 | 47 | \begin{tabular}{|p{30mm}|p{120mm}|} 48 | \hline 49 | \bfseries Method & conversion between signed and unsigned integer vectors \\ \hline 50 | \bfseries Defined for & all integer vector classes \\ \hline 51 | \bfseries Description & Conversion between signed and unsigned integer vectors can be done implicitly or explicitly. Overflow and underflow wraps around. \\ \hline 52 | \bfseries Efficiency & good \\ \hline 53 | \end{tabular} 54 | \begin{lstlisting}[frame=none] 55 | // Example: 56 | Vec4i a(-1,0,1,2); // signed vector 57 | Vec4ui b = a; // implicit conversion to unsigned. 
58 | // b = (0xFFFFFFFF,0,1,2) 59 | Vec4ui c = Vec4ui(a); // same, with explicit conversion 60 | Vec4i d = c; // convert back to signed 61 | \end{lstlisting} 62 | \vspacesmall 63 | 64 | 65 | \begin{tabular}{|p{30mm}|p{120mm}|} 66 | \hline 67 | \bfseries Method & conversion between different integer vector types \\ \hline 68 | \bfseries Defined for & all integer vector classes \\ \hline 69 | \bfseries Description & Conversion can be done implicitly or explicitly between all integer vector classes with the same total number of bits. This conversion does not change any bits, just the grouping of bits into elements is changed. \\ \hline 70 | \bfseries Efficiency & good \\ \hline 71 | \end{tabular} 72 | \begin{lstlisting}[frame=none] 73 | // Example: 74 | Vec8s a(0,1,2,3,4,5,6,7); 75 | Vec4i b; 76 | b = a; // b = (0x00010000, 0x00030002, 0x00050004, 0x00070006) 77 | \end{lstlisting} 78 | \vspacesmall 79 | 80 | 81 | \begin{tabular}{|p{30mm}|p{120mm}|} 82 | \hline 83 | \bfseries Function & reinterpret\_d, reinterpret\_f, reinterpret\_i, reinterpret\_h \\ \hline 84 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 85 | \bfseries Description & Reinterprets a vector as a different type with the same total number of bits. No bits are changed, only interpreted differently (bit casting).\newline 86 | reinterpret\_d is used for converting to Vec2d, Vec4d, or Vec8d, \newline 87 | reinterpret\_f is used for converting to Vec4f, Vec8f, or Vec16f, \newline 88 | reinterpret\_i is used for converting to any integer vector type, \newline 89 | reinterpret\_h is used for converting to Vec8h, Vec16h, or Vec32h. 
\\ \hline 90 | \bfseries Efficiency & good \\ \hline 91 | \end{tabular} 92 | \begin{lstlisting}[frame=none] 93 | // Example: 94 | Vec4f a(1.0f, 1.5f, 2.0f, 2.5f); 95 | Vec4i b = reinterpret_i(a); 96 | // b = (0x3F800000, 0x3FC00000, 0x40000000, 0x40200000) 97 | \end{lstlisting} 98 | \vspacesmall 99 | 100 | \label{roundToInt} 101 | \begin{tabular}{|p{30mm}|p{120mm}|} 102 | \hline 103 | \bfseries Function & 104 | Vec8s roundi(Vec8h) \newline 105 | Vec16s roundi(Vec16h) \newline 106 | Vec32s roundi(Vec32h) \newline 107 | Vec4i roundi(Vec4f) \newline 108 | Vec8i roundi(Vec8f) \newline 109 | Vec16i roundi(Vec16f) \newline 110 | Vec2q roundi(Vec2d) \newline 111 | Vec4q roundi(Vec4d) \newline 112 | Vec8q roundi(Vec8d) \\ \hline 113 | \bfseries Defined for & all floating point vector classes \\ \hline 114 | \bfseries Description & Rounds floating point numbers to nearest integer and returns an integer vector of the same size. Where two integers are equally near, the even integer is returned. \newline 115 | INF input may give INT\_MAX or INT\_MIN depending on the implementation and the instruction set.\\ \hline 116 | \bfseries Efficiency & float types: good \newline 117 | double types: good if AVX512DQ instruction set, otherwise poor \\ \hline 118 | \end{tabular} 119 | \begin{lstlisting}[frame=none] 120 | // Example: 121 | Vec4f a(1.0f, 1.5f, 2.0f, 2.5f); 122 | Vec4i b = roundi(a); // b = (1,2,2,2) 123 | \end{lstlisting} 124 | \vspacesmall 125 | 126 | 127 | \begin{tabular}{|p{30mm}|p{120mm}|} 128 | \hline 129 | \bfseries Function & 130 | Vec4i round\_to\_int32(Vec2d) \newline 131 | Vec4i round\_to\_int32(Vec2d, Vec2d) \newline 132 | Vec4i round\_to\_int32(Vec4d) \newline 133 | Vec8i round\_to\_int32(Vec8d)\\ \hline 134 | \bfseries Defined for & Vec2d, Vec4d, Vec8d \\ \hline 135 | \bfseries Description & rounds double precision floating point numbers and returns vector of 32-bit integers. Where two integers are equally near, the even integer is returned. 
136 | \\ \hline 137 | \bfseries Efficiency & good \\ \hline 138 | \end{tabular} 139 | \begin{lstlisting}[frame=none] 140 | // Example: 141 | Vec4d a(1.0, 1.5, 2.0, 2.5); 142 | Vec4i b = round_to_int32(a); // b = (1,2,2,2) 143 | \end{lstlisting} 144 | \vspacesmall 145 | 146 | \label{truncateToInt} 147 | \begin{tabular}{|p{30mm}|p{120mm}|} 148 | \hline 149 | \bfseries Function & 150 | Vec8s truncatei(Vec8h) \newline 151 | Vec16s truncatei(Vec16h) \newline 152 | Vec32s truncatei(Vec32h) \newline 153 | Vec4i truncatei(Vec4f) \newline 154 | Vec8i truncatei(Vec8f)\newline 155 | Vec16i truncatei(Vec16f)\newline 156 | Vec2q truncatei(Vec2d) \newline 157 | Vec4q truncatei(Vec4d) \newline 158 | Vec8q truncatei(Vec8d) \\ \hline 159 | \bfseries Defined for & all floating point vector classes \\ \hline 160 | \bfseries Description & truncates floating point numbers towards zero and returns signed integer vector of the same size. \newline 161 | INF input may give INT\_MAX or INT\_MIN depending on the implementation and the instruction set.\\ \hline 162 | \bfseries Efficiency & 163 | float types: good \newline 164 | double types: good if AVX512DQ instruction set, otherwise poor \\ \hline 165 | \end{tabular} 166 | \begin{lstlisting}[frame=none] 167 | // Example: 168 | Vec4f a(-1.6f, 1.5f, 2.0f, 2.9f); 169 | Vec4i b = truncatei(a); // b = (-1,1,2,2) 170 | \end{lstlisting} 171 | \vspacesmall 172 | 173 | 174 | \begin{tabular}{|p{30mm}|p{120mm}|} 175 | \hline 176 | \bfseries Function & 177 | Vec4i truncate\_to\_int32(Vec2d, Vec2d)\newline 178 | Vec4i truncate\_to\_int32(Vec4d)\newline 179 | Vec8i truncate\_to\_int32(Vec8d) \\ \hline 180 | \bfseries Defined for & Vec2d, Vec4d, Vec8d \\ \hline 181 | \bfseries Description & truncates double precision floating point numbers towards zero and returns signed vector of 32-bit integers. 
\\ \hline 182 | \bfseries Efficiency & good \\ \hline 183 | \end{tabular} 184 | \begin{lstlisting}[frame=none] 185 | // Example: 186 | Vec4d a(-1.5, 1.5, 2.0, 2.9); 187 | Vec4i b = truncate_to_int32(a); // b = (-1,1,2,2) 188 | \end{lstlisting} 189 | \vspacesmall 190 | 191 | 192 | \begin{tabular}{|p{30mm}|p{120mm}|} 193 | \hline 194 | \bfseries Function & 195 | Vec4f to\_float(Vec4i) \newline 196 | Vec8f to\_float(Vec8i) \newline 197 | Vec16f to\_float(Vec16i) \\ \hline 198 | \bfseries Defined for & Vec4i, Vec8i, Vec16i \\ \hline 199 | \bfseries Description & converts signed 32-bit integers to single precision float \\ \hline 200 | \bfseries Efficiency & good \\ \hline 201 | \end{tabular} 202 | \begin{lstlisting}[frame=none] 203 | // Example: 204 | Vec4i a(0, 1, 2, 3); 205 | Vec4f b = to_float(a); // b = (0.0f, 1.0f, 2.0f, 3.0f) 206 | \end{lstlisting} 207 | \vspacesmall 208 | 209 | 210 | \begin{tabular}{|p{30mm}|p{120mm}|} 211 | \hline 212 | \bfseries Function & 213 | Vec4f to\_float(Vec4ui) \newline 214 | Vec8f to\_float(Vec8ui) \newline 215 | Vec16f to\_float(Vec16ui) \\ \hline 216 | \bfseries Defined for & Vec4ui, Vec8ui, Vec16ui \\ \hline 217 | \bfseries Description & converts unsigned integers to single precision float \\ \hline 218 | \bfseries Efficiency & good if AVX512VL instruction set. Poor otherwise \\ \hline 219 | \end{tabular} 220 | \begin{lstlisting}[frame=none] 221 | // Example: 222 | Vec4ui a(0, 1, 2, 3); 223 | Vec4f b = to_float(a); // b = (0.0f, 1.0f, 2.0f, 3.0f) 224 | \end{lstlisting} 225 | \vspacesmall 226 | 227 | \begin{tabular}{|p{30mm}|p{120mm}|} 228 | \hline 229 | \bfseries Function & 230 | Vec4f to\_float(Vec2d) \newline 231 | Vec4f to\_float(Vec4d) \newline 232 | Vec8f to\_float(Vec8d) \\ \hline 233 | \bfseries Defined for & Vec2d, Vec4d, Vec8d \\ \hline 234 | \bfseries Description & converts floating point vectors from double precision to single precision. 
\\ \hline 235 | \bfseries Efficiency & good \\ \hline 236 | \end{tabular} 237 | \vspacesmall 238 | 239 | \begin{tabular}{|p{30mm}|p{120mm}|} 240 | \hline 241 | \bfseries Function & 242 | Vec4f convert8h\_4f(Vec8h) \newline 243 | Vec8f to\_float(Vec8h) \newline 244 | Vec16f to\_float(Vec16h) \\ \hline 245 | \bfseries Defined for & Vec8h, Vec16h \\ \hline 246 | \bfseries Description & converts floating point vectors from half precision to single precision. \\ \hline 247 | \bfseries Efficiency & good if F16C or AVX512-FP16 \\ \hline 248 | \end{tabular} 249 | \vspacebig 250 | 251 | \begin{tabular}{|p{30mm}|p{120mm}|} 252 | \hline 253 | \bfseries Function & 254 | Vec8h convert4f\_8h(Vec4f) \newline 255 | Vec8h to\_float16(Vec8f) \newline 256 | Vec16h to\_float16(Vec16f) \\ \hline 257 | \bfseries Defined for & Vec4f, Vec8f, Vec16f \\ \hline 258 | \bfseries Description & converts floating point vectors from single precision to half precision. \\ \hline 259 | \bfseries Efficiency & good if F16C or AVX512-FP16 \\ \hline 260 | \end{tabular} 261 | \vspacebig 262 | 263 | \begin{tabular}{|p{30mm}|p{120mm}|} 264 | \hline 265 | \bfseries Function & 266 | Vec4d to\_double(Vec4i) \newline 267 | Vec8d to\_double(Vec8i) \\ \hline 268 | \bfseries Defined for & Vec4i, Vec8i \\ \hline 269 | \bfseries Description & converts signed 32-bit integers to double precision float. The output vector is larger than the input vector. 
\\ \hline 270 | \bfseries Efficiency & medium \\ \hline 271 | \end{tabular} 272 | \begin{lstlisting}[frame=none] 273 | // Example: 274 | Vec4i a(0, 1, 2, 3); 275 | Vec4d b = to_double(a); // b = (0.0, 1.0, 2.0, 3.0) 276 | \end{lstlisting} 277 | \vspacesmall 278 | 279 | 280 | \begin{tabular}{|p{30mm}|p{120mm}|} 281 | \hline 282 | \bfseries Function & 283 | Vec2d to\_double(Vec2q x) \newline 284 | Vec4d to\_double(Vec4q x) \newline 285 | Vec8d to\_double(Vec8q x) \newline 286 | Vec2d to\_double(Vec2uq x) \newline 287 | Vec4d to\_double(Vec4uq x) \newline 288 | Vec8d to\_double(Vec8uq x) \\ \hline 289 | \bfseries Defined for & Vec2q, Vec4q, Vec8q, Vec2uq, Vec4uq, Vec8uq \\ \hline 290 | \bfseries Description & converts signed or unsigned 64-bit integers to double precision float \\ \hline 291 | \bfseries Efficiency & good if AVX512DQ and AVX512VL instruction sets, otherwise poor. \\ \hline 292 | \end{tabular} 293 | \begin{lstlisting}[frame=none] 294 | // Example: 295 | Vec2q a(0, 1); 296 | Vec2d b = to_double(a); // b = (0.0, 1.0) 297 | \end{lstlisting} 298 | \vspacesmall 299 | 300 | 301 | \begin{tabular}{|p{30mm}|p{120mm}|} 302 | \hline 303 | \bfseries Function & 304 | Vec4d to\_double(Vec4f x) \newline 305 | Vec8d to\_double(Vec8f x) \\ \hline 306 | \bfseries Defined for & Vec4f, Vec8f \\ \hline 307 | \bfseries Description & converts floating point vectors from single precision to double precision. 
The total number of bits in the vector is doubled \\ \hline 308 | \bfseries Efficiency & good \\ \hline 309 | \end{tabular} 310 | \vspacebig 311 | 312 | 313 | \begin{tabular}{|p{30mm}|p{120mm}|} 314 | \hline 315 | \bfseries Function & 316 | Vec2d to\_double\_low(Vec4i) \newline 317 | Vec2d to\_double\_high(Vec4i) \\ \hline 318 | \bfseries Defined for & Vec4i \\ \hline 319 | \bfseries Description & converts signed 32-bit integers to double precision float \\ \hline 320 | \bfseries Efficiency & medium \\ \hline 321 | \end{tabular} 322 | \begin{lstlisting}[frame=none] 323 | // Example: 324 | Vec4i a(0, 1, 2, 3); 325 | Vec2d b = to_double_low(a); // b = (0.0, 1.0) 326 | Vec2d c = to_double_high(a); // c = (2.0, 3.0) 327 | \end{lstlisting} 328 | \vspacesmall 329 | 330 | 331 | \begin{tabular}{|p{30mm}|p{120mm}|} 332 | \hline 333 | \bfseries Method & concatenating vectors \\ \hline 334 | \bfseries Defined for & All 128-bit and 256-bit vector classes and corresponding boolean vector classes \\ \hline 335 | \bfseries Description & Two vectors can be concatenated into one vector of the double size by calling a constructor or the function concatenate2. 
\\ \hline 336 | \bfseries Efficiency & good \\ \hline 337 | \end{tabular} 338 | \begin{lstlisting}[frame=none] 339 | // Example: 340 | Vec4i a(10,11,12,13); 341 | Vec4i b(20,21,22,23); 342 | Vec8i c(a, b); // c = (10,11,12,13,20,21,22,23) 343 | Vec8i d = concatenate2(a, b); // same as c 344 | \end{lstlisting} 345 | \vspacesmall 346 | 347 | 348 | \begin{tabular}{|p{30mm}|p{120mm}|} 349 | \hline 350 | \bfseries Method & get\_low, get\_high \\ \hline 351 | \bfseries Defined for & all 256-bit and 512-bit vector classes \\ \hline 352 | \bfseries Description & One big vector can be split into two vectors of half the size by calling the methods get\_low and get\_high \\ \hline 353 | \bfseries Efficiency & good \\ \hline 354 | \end{tabular} 355 | \begin{lstlisting}[frame=none] 356 | // Example: 357 | Vec8i a(10,11,12,13,14,15,16,17); 358 | Vec4i b = a.get_low(); // b = (10,11,12,13) 359 | Vec4i c = a.get_high(); // c = (14,15,16,17) 360 | \end{lstlisting} 361 | \vspacesmall 362 | 363 | 364 | \begin{tabular}{|p{30mm}|p{120mm}|} 365 | \hline 366 | \bfseries Method & extend\_z \\ \hline 367 | \bfseries Defined for & All 128-bit and 256-bit vector classes and corresponding boolean vector classes \\ \hline 368 | \bfseries Description & The vector is extended to double size by adding zeroes. \\ \hline 369 | \bfseries Efficiency & good \\ \hline 370 | \end{tabular} 371 | \begin{lstlisting}[frame=none] 372 | // Example: 373 | Vec4i a(10,11,12,13); 374 | Vec8i b = extend_z(a); // b = (10,11,12,13,0,0,0,0) 375 | \end{lstlisting} 376 | \vspacesmall 377 | 378 | 379 | \begin{tabular}{|p{30mm}|p{120mm}|} 380 | \hline 381 | \bfseries Function & extend \\ \hline 382 | \bfseries Defined for & Vec16c, Vec16uc, Vec32c, Vec32uc, 383 | Vec8s, Vec8us, Vec16s, Vec16us, 384 | Vec4i, Vec4ui, Vec8i, Vec8ui, \\ \hline 385 | \bfseries Description & Extends integers to a larger number of bits per element. 386 | The total number of bits in the vector is doubled. 
387 | Unsigned integers are zero-extended, signed integers are sign-extended. \\ \hline 388 | \bfseries Efficiency & good for instruction sets that support the highest vector size, medium otherwise. \\ \hline 389 | \end{tabular} 390 | \begin{lstlisting}[frame=none] 391 | // Example: 392 | Vec8s a(-2, -1, 0, 1, 2, 3, 4, 5); 393 | Vec8i b = extend(a); // b = (-2, -1, 0, 1, 2, 3, 4, 5) 394 | \end{lstlisting} 395 | \vspacesmall 396 | 397 | 398 | \begin{tabular}{|p{30mm}|p{120mm}|} 399 | \hline 400 | \bfseries Function & extend\_low, extend\_high \\ \hline 401 | \bfseries Defined for & Vec16c, Vec16uc, Vec32c, Vec32uc, Vec64c, Vec64uc, 402 | Vec8s, Vec8us, Vec16s, Vec16us, Vec32s, Vec32us, 403 | Vec4i, Vec4ui, Vec8i, Vec8ui, Vec16i, Vec16ui \\ \hline 404 | \bfseries Description & Extends integers to a larger number of bits per element. 405 | Only the lower or upper half of the vector is converted. The total number of bits in the vector is unchanged. 406 | Unsigned integers are zero-extended, signed integers are sign-extended. \\ \hline 407 | \bfseries Efficiency & good \\ \hline 408 | \end{tabular} 409 | \begin{lstlisting}[frame=none] 410 | // Example: 411 | Vec8s a(-2, -1, 0, 1, 2, 3, 4, 5); 412 | Vec4i b = extend_low(a); // b = (-2, -1, 0, 1) 413 | Vec4i c = extend_high(a); // c = (2, 3, 4, 5) 414 | \end{lstlisting} 415 | \vspacesmall 416 | 417 | 418 | \begin{tabular}{|p{30mm}|p{120mm}|} 419 | \hline 420 | \bfseries Function & extend\_low, extend\_high \\ \hline 421 | \bfseries Defined for & Vec4f, Vec8f, Vec16f \\ \hline 422 | \bfseries Description & extends single precision floating point numbers to double precision. 423 | Only the lower or upper half of the vector is converted. The total number of bits in the vector is unchanged. 
\\ \hline 424 | \bfseries Efficiency & good \\ \hline 425 | \end{tabular} 426 | \begin{lstlisting}[frame=none] 427 | // Example: 428 | Vec4f a(1.0f, 1.1f, 1.2f, 1.3f); 429 | Vec2d b = extend_low(a); // b = (1.0, 1.1) 430 | Vec2d c = extend_high(a); // c = (1.2, 1.3) 431 | \end{lstlisting} 432 | \vspacesmall 433 | 434 | 435 | \begin{tabular}{|p{30mm}|p{120mm}|} 436 | \hline 437 | \bfseries Function & compress \\ \hline 438 | \bfseries Defined for & Vec16s, Vec16us, Vec32s, Vec32us, 439 | Vec8i, Vec8ui, Vec16i, Vec16ui, 440 | Vec4q, Vec4uq, Vec8q, Vec8uq \\ \hline 441 | \bfseries Description & Reduces integers to a lower number of bits per element. 442 | The total number of bits in the vector is halved. 443 | There is no overflow check. The upper bits are simply cut off (wrap around). \\ \hline 444 | \bfseries Efficiency & good for instruction sets that support the highest vector size, medium otherwise. \\ \hline 445 | \end{tabular} 446 | \begin{lstlisting}[frame=none] 447 | // Example: 448 | Vec8q a(10, 11, 12, 13, 14, 15, 16, 17); 449 | Vec8i b = compress(a); // b = (10, 11, 12, 13, 14, 15, 16, 17) 450 | \end{lstlisting} 451 | \vspacesmall 452 | 453 | 454 | \begin{tabular}{|p{30mm}|p{120mm}|} 455 | \hline 456 | \bfseries Function & compress (with two vector parameters) \\ \hline 457 | \bfseries Defined for & Vec8s, Vec8us, Vec16s, Vec16us, Vec32s, Vec32us, 458 | Vec4i, Vec4ui, Vec8i, Vec8ui, Vec16i, Vec16ui, 459 | Vec2q, Vec2uq, Vec4q, Vec4uq, Vec8q, Vec8uq \\ \hline 460 | \bfseries Description & Packs two integer vectors into a single vector with the same total number of bits, by reducing each integer to a lower number of bits per element. 461 | There is no overflow check. The upper bits are simply cut off (wrap around). 
\\ \hline 462 | \bfseries Efficiency & medium \\ \hline 463 | \end{tabular} 464 | \begin{lstlisting}[frame=none] 465 | // Example: 466 | Vec4i a(10, 11, 12, 13); 467 | Vec4i b(20, 21, 22, 23); 468 | Vec8s c = compress(a, b); // c = (10,11,12,13,20,21,22,23) 469 | \end{lstlisting} 470 | \vspacesmall 471 | 472 | 473 | \begin{tabular}{|p{30mm}|p{120mm}|} 474 | \hline 475 | \bfseries Function & compress (with two vector parameters)\\ \hline 476 | \bfseries Defined for & Vec2d, Vec4d, Vec8d \\ \hline 477 | \bfseries Description & reduces double precision floating point numbers to single precision. Two double precision vectors are packed into one single precision vector with the same total number of bits. \\ \hline 478 | \bfseries Efficiency & medium \\ \hline 479 | \end{tabular} 480 | \begin{lstlisting}[frame=none] 481 | // Example: 482 | Vec2d a(1.0, 1.1); 483 | Vec2d b(2.0, 2.1); 484 | Vec4f c = compress(a, b); // c = (1.0f, 1.1f, 2.0f, 2.1f) 485 | \end{lstlisting} 486 | \vspacesmall 487 | 488 | \begin{tabular}{|p{30mm}|p{120mm}|} 489 | \hline 490 | \bfseries Function & compress\_saturated (with one vector parameter) \\ \hline 491 | \bfseries Defined for & Vec16s, Vec16us, Vec32s, Vec32us, Vec8i, Vec8ui, Vec16i, Vec16ui, Vec4q, Vec4uq, Vec8q, Vec8uq \\ \hline 492 | \bfseries Description & Packs an integer vector into a vector with the same number of elements and half the number of bits per element. 
493 | Overflow and underflow saturates \\ \hline 494 | \bfseries Efficiency & medium (worse than compress in most cases) \\ \hline 495 | \end{tabular} 496 | \vspacebig 497 | 498 | \begin{tabular}{|p{30mm}|p{120mm}|} 499 | \hline 500 | \bfseries Function & compress\_saturated (with two vector parameters) \\ \hline 501 | \bfseries Defined for & Vec8s, Vec8us, Vec16s, Vec16us, Vec32s, Vec32us, 502 | Vec4i, Vec4ui, Vec8i, Vec8ui, Vec16i, Vec16ui, 503 | Vec2q, Vec2uq, Vec4q, Vec4uq, Vec8q, Vec8uq \\ \hline 504 | 505 | \bfseries Description & Packs two integer vectors into a single vector with the same total number of bits, by reducing each integer to a lower number of bits per element. 506 | Overflow and underflow saturates \\ \hline 507 | \bfseries Efficiency & medium (worse than compress in most cases) \\ \hline 508 | \end{tabular} 509 | \begin{lstlisting}[frame=none] 510 | // Example: 511 | Vec4i a(10, 11, 12, 13); 512 | Vec4i b(20, 21, 22, 23); 513 | Vec8s c = compress_saturated(a, b); 514 | // c = (10,11,12,13,20,21,22,23) 515 | \end{lstlisting} 516 | \vspacesmall 517 | 518 | 519 | 520 | 521 | \section{Conversion between boolean vector types}\label{ConversionBetweenBooleanTypes} 522 | 523 | \begin{tabular}{|p{30mm}|p{120mm}|} 524 | \hline 525 | \bfseries Function & to\_bits \\ \hline 526 | \bfseries Defined for & all boolean vectors \\ \hline 527 | \bfseries Description & converts a boolean vector to an integer with one bit per element \\ \hline 528 | \bfseries Efficiency & good for compact boolean vectors. Medium for broad boolean vectors \\ \hline 529 | \end{tabular} 530 | \begin{lstlisting}[frame=none] 531 | // Example: 532 | Vec4i a(10, 11, 12, 13); 533 | Vec4i b(12, 11, 10, 9); 534 | Vec4ib f = a > b; // (false, false, true, true) 535 | uint8_t g = to_bits(f); // = 0b1100 536 | // The order is not reversed, but in the comments above, 537 | // the vector elements are listed in little endian order, 538 | // while the binary number is written in big endian order. 
539 | \end{lstlisting} 540 | \vspacesmall 541 | 542 | 543 | \begin{tabular}{|p{30mm}|p{120mm}|} 544 | \hline 545 | \bfseries Method & load\_bits \\ \hline 546 | \bfseries Defined for & all boolean vectors \\ \hline 547 | \bfseries Description & converts an integer bit-field to a boolean vector \\ \hline 548 | \bfseries Efficiency & good for compact boolean vectors. Medium for broad boolean vectors \\ \hline 549 | \end{tabular} 550 | \begin{lstlisting}[frame=none] 551 | // Example: 552 | uint8_t a = 0b11000010; // binary number 553 | Vec8fb b; // boolean vector 554 | b.load_bits(a); 555 | // b = (false, true, false, false, false, false, true, true) 556 | // The order is not reversed, but in the comments above, 557 | // the vector elements are listed in little endian order, 558 | // while the binary number is written in big endian order. 559 | \end{lstlisting} 560 | \vspacesmall 561 | 562 | 563 | \begin{tabular}{|p{30mm}|p{120mm}|} 564 | \hline 565 | \bfseries Method & conversion between boolean vectors of same size and element size \\ \hline 566 | \bfseries Defined for & 567 | Vec4ib $\leftrightarrow$ Vec4fb \newline 568 | Vec8ib $\leftrightarrow$ Vec8fb \newline 569 | Vec16ib $\leftrightarrow$ Vec16fb \newline 570 | Vec2qb $\leftrightarrow$ Vec2db \newline 571 | Vec4qb $\leftrightarrow$ Vec4db \newline 572 | Vec8qb $\leftrightarrow$ Vec8db \\ \hline 573 | \bfseries Description & Boolean vectors for use with different types of vectors with the same bit size can be converted to each other. 
\\ \hline 574 | \bfseries Efficiency & good \\ \hline 575 | \end{tabular} 576 | \begin{lstlisting}[frame=none] 577 | // Example: 578 | Vec4i a(0,1,2,3); 579 | Vec4i b(4,3,2,1); 580 | Vec4ib f = a > b; // f = (false,false,false,true) 581 | Vec4fb g = Vec4fb(f); // g = (false,false,false,true) 582 | \end{lstlisting} 583 | \vspacesmall 584 | 585 | 586 | \begin{tabular}{|p{30mm}|p{120mm}|} 587 | \hline 588 | \bfseries Method & conversion from boolean vectors to integer vectors of the same size and element size \\ \hline 589 | \bfseries Defined for & broad boolean vectors only. \\ \hline 590 | \bfseries Description & broad boolean vectors can be converted to integer vectors of the same size and bit size. The result will be -1 for true and 0 for false.\newline 591 | Avoid this method if compact boolean vectors may be used.\newline 592 | Conversion the other way, e.g. from Vec4i to Vec4ib is possible for broad boolean vectors 593 | if the input vector contains -1 for true and 0 for false, but the result is implementation dependent and possibly wrong and inconsistent if the input vector contains any other values than 0 and -1. To prevent errors, it is recommended to use a comparison instead for converting an integer vector to a boolean vector. 
\\ \hline 594 | \bfseries Efficiency & good \\ \hline 595 | \end{tabular} 596 | \begin{lstlisting}[frame=none] 597 | // This example works only for broad boolean vectors 598 | Vec4i a(0,1,2,3); 599 | Vec4i b(4,3,2,1); 600 | Vec4ib f = a > b; // f = (false,false,false,true) 601 | Vec4i g = Vec4i(f); // g = (0, 0, 0, -1) 602 | \end{lstlisting} 603 | \vspacesmall 604 | 605 | 606 | \end{document} -------------------------------------------------------------------------------- /vcl_errors_etc.tex: -------------------------------------------------------------------------------- 1 | % chapter included in vclmanual.tex 2 | \documentclass[vcl_manual.tex]{subfiles} 3 | \begin{document} 4 | 5 | \chapter{Technical details}\label{chap:TechnicalDetails} 6 | 7 | \section{Error conditions}\label{chap:ErrorConditions} 8 | 9 | \subsection{Runtime errors}\label{RuntimeErrors} 10 | \flushleft 11 | 12 | The vector class library is generally not producing runtime error messages. An index out of range produces behavior that is implementation-dependent. This means that the output may be different for different instruction sets or for different versions of the vector class library. 13 | \vspacesmall 14 | 15 | For example, an attempt to read a vector element with an index that is out of range may result in various behaviors, such as producing zero, taking the index modulo the vector size, giving the last element, or producing an arbitrary value. Likewise, an attempt to write a vector element with an index that is out of range may variously take the index modulo the vector size, write the last element, or do nothing. This applies to functions such as \codei{insert}, \codei{extract}, \codei{load\_partial}, \codei{store\_partial}, \codei{cutoff}, \codei{permute}, \codei{blend}, \codei{lookup}, and \codei{gather}. The same applies to a bit-index that is out of range in rotate functions 16 | and shift operators (\textless \textless , \textgreater \textgreater). 
17 | \vspacesmall 18 | 19 | Boolean vectors in the broad form (see page \pageref{tableBooleanVectorSizes}) are stored as integer vectors. The only allowed values for boolean vector elements in this case are 0 (false) and -1 (true). The behavior for other values is implementation-dependent and possibly inconsistent. For example, the behavior of the select function when a boolean selector element is a mixture of 0 and 1 bits depends on the instruction set. For instruction sets prior to SSE4.1, it will select between the operands bit-by-bit. For SSE4.1 and higher it will select integer vectors byte-by-byte, using the leftmost bit of each byte in the selector input. For floating point vectors under SSE4.1 and higher, it will use only the leftmost bit (sign bit) of the selector. Boolean vectors in the compact form have only one bit for each element. 20 | \vspacesmall 21 | 22 | An integer division by a variable that is zero will usually produce a runtime exception. 23 | \vspacesmall 24 | 25 | A program crash may be caused by alignment errors with instruction sets prior to AVX. This can happen if a VCL vector is stored in a dynamic array or a container class template instance that does not have correct alignment. See page \pageref{Alignment}. 26 | \vspacesmall 27 | 28 | 29 | \subsection{Floating point errors}\label{FloatingPointErrors} 30 | The Vector Class Library produces infinity (INF) or "Not A Number" (NAN) to indicate floating point errors, as discussed on page \pageref{NoExceptionTrapping}. 31 | Floating point overflow will usually produce infinity, floating point underflow produces zero, and an invalid floating point operation produces NAN (Not A Number). The INF and NAN codes will usually propagate to the end result where they can be detected. 32 | \vspacesmall 33 | 34 | There are a few cases where INF and NAN codes do not propagate. For example, dividing a nonzero number by INF produces zero. Error codes cannot propagate through integer and boolean vectors. 
For example: 35 | \vspacesmall 36 | 37 | \begin{lstlisting}[frame=none] 38 | Vec4d a, b; 39 | ... 40 | Vec4db f = a > 1.0; 41 | b = select(f, a, 0.5); 42 | \end{lstlisting} 43 | \vspacesmall 44 | 45 | The boolean vector elements in \codei{f} will be either true or false, even if \codei{a} is NAN, because a boolean can have no other values. 46 | In the case that an element of \codei{a} is NAN, the corresponding element in \codei{f} will be false, and the element in \codei{b} will be 0.5. The NAN error is not propagated from \codei{a} to \codei{b}. Therefore, you have to check for errors before making a boolean expression. This can be done like this: 47 | 48 | \begin{lstlisting}[frame=none] 49 | Vec4d a, b; 50 | ... 51 | if ( ! horizontal_and(is_finite(a))) { 52 | // handle error 53 | ... 54 | } 55 | Vec4db f = a > 1.0; 56 | b = select(f, a, 0.5); 57 | \end{lstlisting} 58 | \vspacesmall 59 | 60 | 61 | \subsection{Compile-time errors}\label{CompileTimeErrors} 62 | The Vector Class Library makes heavy use of metaprogramming features that go to the limit of what modern compilers can do. Occasional problems have been observed with all compilers. 63 | Errors that are specific to a particular compiler are listed in separate files in the Git repository under 64 | \href{https://github.com/vectorclass/miscellaneous}{miscellaneous}. 65 | Please check these lists of known errors before reporting a problem. 66 | \vspacesmall 67 | 68 | Even small syntax errors may result in very long error messages due to the heavy use of templates and overloading. These error messages may be confusing, but they generally indicate the line number of the error. 69 | \vspacesmall 70 | 71 | Integer vector division by a \codei{const\_int} or \codei{const\_uint} can produce a compile-time error message when the divisor is zero or out of range. 72 | \vspacesmall 73 | 74 | \textbf{"Ambiguous call to overloaded function"}: \\ 75 | This can happen when parameters have wrong types. 
76 | Make sure all parameters have the correct type. 77 | \vspacesmall 78 | 79 | Version 1.xx of VCL may produce error messages that are not very informative, such as 80 | \textbf{"Static\_error\_check\textless false\textgreater"} due to limitations in template metaprogramming. 81 | \vspacesmall 82 | 83 | 84 | \subsection{Link errors}\label{LinkErrors} 85 | 86 | \textbf{"unresolved external symbol \_\_intel\_cpu\_indicator\_x"}: \\ 87 | This link error occurs when you are using Intel's SVML library without including a CPU dispatcher. Add the library libircmt.lib or libirc.a to use Intel's CPU dispatch function. Make sure to choose the 32-bit or 64-bit version of the library, as appropriate. See page \pageref{ExternalMathLibrary} for details. 88 | \vspacesmall 89 | 90 | \textbf{"unresolved external symbol \_\_svml\_sin2@@16"}, etc. \\ 91 | You need to link the library \textbf{svmlpatch.lib}, which you can find in the Git repository under miscellaneous. 92 | \vspacesmall 93 | 94 | 95 | \subsection{Implementation-dependent behavior}\label{ImplementationDependentBehavior} 96 | 97 | A big advantage of the VCL library is that you can compile the same source code for different instruction set extensions. A higher instruction set will generally give faster code, but produce the same results. There may, however, be cases where the same code generates different results with different instruction sets or different compilers. These cases include: 98 | 99 | \begin{itemize} 100 | \item An index out of range produces implementation-dependent results. Functions such as 101 | \codei{insert}, \codei{extract}, \codei{load\_partial}, \codei{store\_partial}, \codei{cutoff}, \codei{permute}, \codei{blend}, \codei{lookup}, \codei{gather}, and \codei{scatter} may produce different results for an index out of range depending on the instruction set. No exception or error message is generated, only a meaningless number. 
102 | 103 | \item permute and blend functions allow a "don't care" index (\codei{V\_DC}) to be specified. The result for a don't care element may depend on the instruction set. 104 | 105 | \item Negative zero. The floating point values of 0.0 and -0.0 are normally regarded as equal. Some functions may return 0.0 or -0.0 depending on the instruction set, e.g. when rounding a negative number. The sign of a zero can be detected by the functions \codei{sign\_bit} and \codei{sign\_combine}. 106 | You may {} \codei{\#define SIGNED\_ZERO} {} to get consistent and pedantic conformance to the specifications of signed zero in the IEEE 754-2019 standard. 107 | 108 | \item NANs. An error code can be propagated through NAN (not-a-number) values and retrieved by the function \codei{nan\_code}. When two NAN values with different codes are combined, for example by adding them together, the result may be either of the two values, depending on the compiler. The sign of a NAN has no meaning and may vary. \\ 109 | Use the \codei{minimum} and \codei{maximum} functions rather than \codei{min} and \codei{max} if you want to propagate NAN values through these functions. 110 | 111 | \end{itemize} 112 | \vspacesmall 113 | 114 | \end{document} -------------------------------------------------------------------------------- /vcl_examples.tex: -------------------------------------------------------------------------------- 1 | % chapter included in vclmanual.tex 2 | \documentclass[vcl_manual.tex]{subfiles} 3 | \begin{document} 4 | 5 | 6 | \chapter{Examples}\label{chap:Examples} 7 | \flushleft 8 | 9 | This example calculates the polynomial $x^3 + 2\cdot x^2 - 5\cdot x + 1$ on a floating point vector. The order of calculation is specified by parentheses in order to make shorter dependency chains. 
10 | 11 | \begin{example} 12 | \label{examplePolynomial} 13 | \end{example} 14 | \begin{lstlisting}[frame=single] 15 | Vec4f polynomial (Vec4f x) { 16 | return (x + 2.0f) * (x * x) + ((-5.0f) * x + 1.0f); 17 | } 18 | \end{lstlisting} 19 | \vspacebig 20 | 21 | In 64-bit Windows, you may add \codei{\_\_vectorcall} and use a Clang or Microsoft compiler. This makes sure that vector parameters are transferred in registers rather than in memory. This is not needed when the function is inlined or when compiling for other platforms than Windows: 22 | 23 | \begin{example} 24 | \label{examplePolynomialVectorcall} 25 | \end{example} 26 | \begin{lstlisting}[frame=single] 27 | Vec4f __vectorcall polynomial (Vec4f x) { 28 | return (x + 2.0f) * (x * x) + ((-5.0f) * x + 1.0f); 29 | } 30 | \end{lstlisting} 31 | \vspacebig 32 | 33 | 34 | The next example transposes a 4x4 matrix, using the AVX2 instruction set. 35 | 36 | \begin{example} 37 | \label{exampleTranspose4x4} 38 | \end{example} 39 | \begin{lstlisting}[frame=single] 40 | void transpose(float matrix[4][4]) { 41 | Vec8f row01, row23, col01, col23; 42 | // load first two rows 43 | row01.load(&matrix[0][0]); 44 | // load next two rows 45 | row23.load(&matrix[2][0]); 46 | // reorder into columns 47 | col01 = blend8f<0,4, 8,12,1,5, 9,13>(row01, row23); 48 | col23 = blend8f<2,6,10,14,3,7,11,15>(row01, row23); 49 | // store columns into rows 50 | col01.store(&matrix[0][0]); 51 | col23.store(&matrix[2][0]); 52 | } 53 | \end{lstlisting} 54 | \vspacesmall 55 | 56 | Same example with AVX512: 57 | 58 | \begin{example} 59 | \label{exampleTranspose4x4avx512} 60 | \end{example} 61 | \begin{lstlisting}[frame=single] 62 | void transpose(float matrix[4][4]) { 63 | Vec16f rows, columns; 64 | // load entire matrix as rows 65 | rows.load(&matrix[0][0]); 66 | // reorder into columns 67 | columns = permute16f<0,4,8,12,1,5,9,13, 68 | 2,6,10,14,3,7,11,15>(rows); 69 | // store columns into rows 70 | columns.store(&matrix[0][0]); 71 | } 72 | 
\end{lstlisting} 73 | \vspacebig 74 | 75 | The next example performs a matrix multiplication of two 4x4 matrices. 76 | 77 | \begin{example} 78 | \label{exampleMatrixMul4x4} 79 | \end{example} 80 | \begin{lstlisting}[frame=single] 81 | void matrixmul(float A[4][4], float B[4][4], float M[4][4]){ 82 | // calculates M = A*B 83 | Vec4f Brow[4], Mrow[4]; 84 | int i, j; 85 | // load B as rows 86 | for (i = 0; i < 4; i++) { 87 | Brow[i].load(&B[i][0]); 88 | } 89 | // loop for A and M rows 90 | for (i = 0; i < 4; i++) { 91 | Mrow[i] = Vec4f(0.0f); 92 | // loop for A columns, B rows 93 | for (j = 0; j < 4; j++) { 94 | Mrow[i] += Brow[j] * A[i][j]; 95 | } 96 | } 97 | // store M 98 | for (i = 0; i < 4; i++) { 99 | Mrow[i].store(&M[i][0]); 100 | } 101 | } 102 | \end{lstlisting} 103 | \vspacebig 104 | 105 | 106 | The next example makes a table of the sin function and gets sin(x) and cos(x) by table lookup. 107 | 108 | \begin{example} 109 | \label{exampleSinTable} 110 | \end{example} 111 | \begin{lstlisting}[frame=single] 112 | 113 | #include <cmath> 114 | 115 | const double pi = 3.14159265358979323846; 116 | 117 | // length of table. Must be a power of 2. 
118 | #define sin_tablelen 1024 119 | // the accuracy of table lookup is +/- pi/sin_tablelen 120 | 121 | class SinTable { 122 | protected: 123 | float table[sin_tablelen]; 124 | float resolution; 125 | float rres; // 1./resolution 126 | public: 127 | SinTable(); // constructor 128 | Vec4f sin(Vec4f x); 129 | Vec4f cos(Vec4f x); 130 | }; 131 | 132 | SinTable::SinTable() { // constructor 133 | // compute resolution 134 | resolution = float(2.0 * pi / sin_tablelen); 135 | rres = 1.0f / resolution; 136 | // Initialize table (No need to use vectors here because this 137 | // is calculated only once:) 138 | for (int i = 0; i < sin_tablelen; i++) { 139 | table[i] = sinf((float)i * resolution); 140 | } 141 | } 142 | 143 | Vec4f SinTable::sin(Vec4f x) { 144 | // calculate sin by table lookup 145 | Vec4i index = roundi(x * rres); 146 | // modulo tablelen equivalent to modulo 2*pi 147 | index &= sin_tablelen - 1; 148 | // look up in table 149 | return lookup(index, table); 150 | } 151 | 152 | Vec4f SinTable::cos(Vec4f x) { 153 | // calculate cos by table lookup 154 | Vec4i index = roundi(x * rres) + sin_tablelen/4; 155 | // modulo tablelen equivalent to modulo 2*pi 156 | index &= sin_tablelen - 1; 157 | // look up in table 158 | return lookup(index, table); 159 | } 160 | 161 | int main() { 162 | SinTable sintab; 163 | Vec4f a(0.0f, 0.5f, 1.0f, 1.5f); 164 | Vec4f b = sintab.sin(a); 165 | // b = (0.0000 0.4768 0.8416 0.9973) 166 | // accuracy +/- 0.003 167 | ... 
168 | return 0; 169 | } 170 | \end{lstlisting} 171 | \vspacesmall 172 | 173 | 174 | \end{document} -------------------------------------------------------------------------------- /vcl_file_list.tex: -------------------------------------------------------------------------------- 1 | % chapter included in vclmanual.tex 2 | \documentclass[vcl_manual.tex]{subfiles} 3 | \begin{document} 4 | 5 | 6 | \section{File list} 7 | %\label{FileList} 8 | \flushleft 9 | 10 | \begin{longtable}[l]{|p{40mm}|p{100mm}|} 11 | \endfirsthead 12 | \label{table:fileList} \\ 13 | \endhead 14 | \hline 15 | \bfseries File name & \bfseries Purpose \\ \hline 16 | manual/vcl\_manual.pdf & Instruction manual (this file) \\ \hline 17 | 18 | vectorclass.h & Top-level C++ header file. This will include several other header files, according to the indicated instruction set \\ \hline 19 | 20 | instrset.h & Detection of which instruction set the code is compiled for, 21 | and functions that depend on the instruction set. This file also contains various common definitions and templates. Included by vectorclass.h \\ \hline 22 | 23 | vectori128.h & Defines classes, operators and functions for integer vectors with a total size of 128 bits. Included by vectorclass.h \\ \hline 24 | 25 | vectori256.h & Defines classes, operators and functions for integer vectors with a total size of 256 bits for the AVX2 instruction set. Included by vectorclass.h if appropriate \\ \hline 26 | 27 | vectori256e.h & Defines classes, operators and functions for integer vectors with a total size of 256 bits for instruction sets lower than AVX2. Included by vectorclass.h if appropriate \\ \hline 28 | 29 | vectori512.h & Defines classes, operators and functions for vectors of 32-bit and 64-bit integers with a total size of 512 bits for the AVX512F instruction set. 
Included by vectorclass.h if appropriate \\ \hline 30 | 31 | vectori512e.h & Defines classes, operators and functions for vectors of 32-bit and 64-bit integers with a total size of 512 bits for instruction sets lower than AVX512F. Included by vectorclass.h if appropriate \\ \hline 32 | 33 | vectori512s.h & Defines classes, operators and functions for vectors of 8-bit and 16-bit integers with a total size of 512 bits for the AVX512BW instruction set. Included by vectorclass.h if appropriate \\ \hline 34 | 35 | vectori512se.h & Defines classes, operators and functions for vectors of 8-bit and 16-bit integers with a total size of 512 bits for instruction sets lower than 36 | AVX512BW. Included by vectorclass.h if appropriate \\ \hline 37 | 38 | vectorf128.h & Defines classes, operators and functions for floating point vectors with a total size of 128 bits. Included by vectorclass.h \\ \hline 39 | 40 | vectorf256.h & Defines classes, operators and functions for floating point vectors with a total size of 256 bits for the AVX and later instruction sets. Included by vectorclass.h if appropriate \\ \hline 41 | 42 | vectorf256e.h & Defines classes, operators and functions for floating point vectors with a total size of 256 bits for instruction sets lower than AVX. Included by vectorclass.h if appropriate \\ \hline 43 | 44 | vectorf512.h & Defines classes, operators and functions for floating point vectors with a total size of 512 bits for the AVX512F and later instruction sets. Included by vectorclass.h if appropriate \\ \hline 45 | 46 | vectorf512e.h & Defines classes, operators and functions for floating point vectors with a total size of 512 bits for instruction sets lower than AVX512F. 
Included by vectorclass.h if appropriate \\ \hline 47 | 48 | vectorfp16.h & Defines classes, operators and functions for half precision floating point vectors of all sizes, including mathematical functions, for AVX512-FP16 \\ \hline 49 | 50 | vectorfp16e.h & Defines emulating classes, operators and functions for half precision floating point vectors of all sizes, including mathematical functions, for processors without AVX512-FP16 \\ \hline 51 | 52 | vector\_convert.h & Defines functions for conversion between different vector sizes, as well as some generic function templates. \\ \hline 53 | 54 | vectormath\_exp.h & Optional inline mathematical functions: power, logarithms and exponential functions \\ \hline 55 | 56 | vectormath\_trig.h & Optional inline mathematical functions: trigonometric and inverse trigonometric functions \\ \hline 57 | 58 | vectormath\_hyp.h & Optional inline mathematical functions: hyperbolic and inverse hyperbolic functions \\ \hline 59 | 60 | vectormath\_common.h & Common definitions for vectormath\_exp.h, vectormath\_trig.h and vectormath\_hyp.h \\ \hline 61 | 62 | vectormath\_lib.h & Optional header file for external mathematical vector function library \\ \hline 63 | 64 | instrset\_detect.cpp & Optional functions for detecting which instruction set is supported at runtime \\ \hline 65 | 66 | dispatch\_example.cpp & Example of how to make automatic CPU dispatching \\ \hline 67 | 68 | LICENSE & Apache 2.0 license \\ \hline 69 | 70 | changelog.txt & VCL version history \\ \hline 71 | 72 | miscellaneous/svmlpatch & Folder containing the library svmlpatch.lib as well as the source code to build it. Used for fixing a compatibility issue with the Intel SVML library in 64-bit Windows \\ \hline 73 | 74 | testbench & Folder containing test bench files for testing the VCL library. This is used in the development of VCL, and is not needed by programmers using the VCL. Includes code and documentation. 
\\ \hline 75 | 76 | \end{longtable} 77 | %\end{tabular} 78 | \vspacesmall 79 | 80 | 81 | 82 | \end{document} 83 | -------------------------------------------------------------------------------- /vcl_float_behavior.tex: -------------------------------------------------------------------------------- 1 | % chapter included in vclmanual.tex 2 | \documentclass[vcl_manual.tex]{subfiles} 3 | \begin{document} 4 | 5 | 6 | \section{Floating point behavior details} 7 | \label{FloatingPointBehavior} 8 | 9 | The Vector Class Library is generally conforming to the new IEEE 754-2019 Standard for Floating-Point Arithmetic, but some compromises have been necessary for the purpose of vector processing and for better performance. The deviations from the standard are discussed below. 10 | \vspacesmall 11 | 12 | \begin{description} 13 | 14 | \item[Subnormal numbers.] 15 | Subnormal numbers (also called denormal numbers) are numerically extremely small floating point numbers where the exponent is below the normal range. Some microprocessors are handling subnormal numbers in a very inefficient way that is more than a hundred times slower than for normal floating point numbers. You may call the function \codei{no\_subnormals()} to prevent this and treat subnormal numbers as zero in single and double precision floating point calculations. Calculations in half precision are generally efficient even when values are subnormal. 16 | Some of the mathematical functions in VCL always treat subnormal numbers as zero for reasons of performance. This includes logarithm, exponential, and power functions. 17 | 18 | \item[Signed zero.] 19 | Signed zero is a controversial issue. The floating point standard defines two different zeroes: +0.0 and -0.0. 20 | The two zeroes are equal, but still distinguishable. 
Some of the functions may return +0.0 where the standard requires -0.0.\\ 21 | You may {} \codei{\#define SIGNED\_ZERO} {} if you want the sign of zero to conform to the 22 | IEEE 754-2019 standard, though this may slow down performance a little. 23 | \codei{SIGNED\_ZERO} may affect several functions, including 24 | \codei{round}, \codei{truncate}, \codei{floor}, \codei{ceil}, 25 | \codei{maximum}, \codei{minimum}, \codei{cbrt}, \codei{pow\_ratio}, \codei{expm1}, \codei{log1p}. 26 | 27 | \item[No exception trapping.] \label{NoExceptionTrapping} 28 | Floating point errors are traditionally detected by trapping errors or relying on an \codei{errno} variable. These methods are not well suited for vector processing and out-of-order processing. This is explained in the document \href{https://www.agner.org/optimize/nan_propagation.pdf}{"NAN propagation versus fault trapping in floating point code", Agner Fog, 2019}. 29 | \vspacesmall 30 | 31 | The Vector Class Library does not support fault trapping, and it does not indicate exceptions in a variable such as the traditional \codei{errno}. It is not recommended to turn on floating point exceptions because this can cause inconsistent behavior, such as traps for exceptions in not-taken branches. Do not attempt to trap numerical errors in \codei{try/catch} blocks. 32 | \vspacesmall 33 | 34 | Instead, the vector class library indicates floating point exceptions by producing INF or NAN codes in the individual vector element that produced the fault. 35 | The INF and NAN codes will propagate to the end result of a series of calculations when certain conditions are satisfied. The most efficient way of detecting floating point errors is to look for INF and NAN codes in the result. 
36 | \vspacesmall 37 | 38 | Conditions where INF and NAN codes are not propagated are discussed on page \pageref{FloatingPointErrors}. 39 | \vspacesmall 40 | 41 | Do not use the compiler options -ffast-math, -ffinite-math-only, or /fp:fast because this may disable the detection of INF and NAN. 42 | \vspacesmall 43 | 44 | \item[No signaling NANs.] 45 | Signaling NANs are special codes that will raise an exception when they are loaded from memory. Signaling NANs are rarely used in modern software. Signaling NANs should not be used in VCL because exception trapping is not supported. 46 | 47 | \item[NAN payload operations.] 48 | A NAN may contain additional information called a payload. This payload can propagate through a series of calculations to the end result. Some of the mathematical functions in VCL can put a payload into the NAN result in case of an error. This makes it possible to identify which function generated the NAN. 49 | \vspacesmall 50 | 51 | The \codei{nan..} and \codei{nan\_code} functions make it possible to set and get NAN payloads. The IEEE 754 standard does not specify what happens to the payload when converting between single and double precision, but experiments show that all microprocessors that use the binary floating point format will left-justify the payload. The \codei{nan..} and \codei{nan\_code} functions treat the NAN payload as a 22-bit left-justified unsigned integer in order to allow conversions between single and double precision. These functions deviate from the IEEE 754-2019 standard. 52 | 53 | \item[NAN propagation in maximum and minimum functions.] 54 | The \codei{max} and \codei{min} functions do not propagate NANs according to the 2008 version of the standard. This unfortunate situation is redressed in the 2019 revision of the standard. VCL offers two different versions of these functions: 55 | The \codei{max} and \codei{min} functions are equivalent to 56 | \codei{a > b ? a : b} and 57 | \codei{a < b ? a : b}, respectively. 
These functions return \codei{b} if \codei{a} is NAN. The slightly less efficient functions \codei{maximum} and \codei{minimum} are sure to propagate NANs, in accordance with the 2019 revision of the standard. 58 | 59 | \item[NAN propagation in pow function.] 60 | The standard specifies that pow(NAN,0) and pow(1,NAN) will give the result 1.0. The VCL implementation deviates from this and produces a NAN output in all cases where an input is NAN, in order to support reliable NAN propagation. 61 | 62 | \item[Function parameter range.] 63 | Some of the mathematical functions have internal overflow for extreme values of the input parameters. These functions have a limited input range because an extra branch to handle the extreme cases would reduce the overall performance. Limitations of the input range are mentioned in the listing of the individual functions. 64 | 65 | 66 | \end{description} 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | \vspacesmall 81 | 82 | 83 | 84 | \end{document} 85 | -------------------------------------------------------------------------------- /vcl_manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/manual/62cb40d710f8d6180511ba03ca6e09347e06f0b9/vcl_manual.pdf -------------------------------------------------------------------------------- /vcl_manual.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt,a4paper,oneside,openright]{report} 2 | 3 | \usepackage[bindingoffset=5mm,left=20mm,right=20mm,top=20mm,bottom=20mm,footskip=10mm]{geometry} 4 | \usepackage[utf8x]{inputenc} 5 | \usepackage{hyperref} 6 | \usepackage[english]{babel} 7 | \usepackage{listings} 8 | \usepackage{subfiles} 9 | \usepackage{longtable} 10 | \usepackage{multirow} 11 | \usepackage{ragged2e} 12 | \usepackage{cmap} % avoid fi ligatures in pdf file 13 | \usepackage{amsthm} % example numbering 14 | 
\usepackage{color} 15 | \usepackage{bold-extra} % for bold tt font. Remember to include bold-extra.sty file 16 | \usepackage{graphicx} 17 | \usepackage[yyyymmdd]{datetime} 18 | \usepackage{float} 19 | 20 | % style for code listing 21 | \renewcommand{\familydefault}{\sfdefault} 22 | \renewcommand{\ttdefault}{pcr} % selects Courier font 23 | \newtheorem{example}{Example}[chapter] % example numbering 24 | \lstset{language=C} % formatting for code listing 25 | \lstset{basicstyle=\ttfamily,breaklines=true} 26 | \definecolor{darkGreen}{rgb}{0,0.4,0} 27 | \definecolor{mybrown}{rgb}{0.40,0.10,0.05} 28 | \lstset{commentstyle=\color{darkGreen}} % comments color 29 | \lstset{keywordstyle=\color{blue}} % keyword color 30 | \lstset{stringstyle=\color{mybrown}} % string color 31 | \lstset{showstringspaces=false} % don't mark spaces in strings 32 | 33 | \renewcommand{\dateseparator}{-} 34 | 35 | % command for turning indent back on after \flushleft 36 | \newcommand{\indenton}{\RaggedRight\parindent=15pt} 37 | 38 | % command for vertical space 39 | \newcommand{\vspacesmall}{\vspace{3mm}} 40 | \newcommand{\vspacebig}{\vspace{6mm}} 41 | 42 | % style for code inlined in text: 43 | \newcommand{\codei}[1]{\bfseries \ttfamily{#1}\normalfont} 44 | 45 | 46 | 47 | \begin{document} 48 | 49 | \begin{titlepage} 50 | \centering 51 | 52 | \null %empty box needed for vfill to work 53 | \vfill 54 | 55 | {\bfseries\Huge 56 | VCL 57 | \vspacesmall 58 | 59 | C++ vector class library 60 | \vspacebig 61 | 62 | manual 63 | } 64 | \vspacebig 65 | 66 | {\Large 67 | Agner Fog 68 | \vspacebig 69 | 70 | \copyright\ \today. 
Apache license 2.0 71 | } 72 | 73 | \vfill 74 | 75 | \includegraphics[width=306pt]{freesoftwarelogo.jpg} 76 | \vfill 77 | 78 | \end{titlepage} 79 | 80 | \RaggedRight 81 | 82 | 83 | 84 | \tableofcontents 85 | \setcounter{secnumdepth}{1} 86 | %\indenton 87 | \flushleft 88 | 89 | % Introduction 90 | % The basics 91 | \subfile{vcl_introduction.tex} 92 | 93 | % Operators and functions 94 | \subfile{vcl_operators_and_functions.tex} 95 | 96 | % Boolean operations and per-element branches 97 | \subfile{vcl_bool.tex} 98 | 99 | % Conversion between vector types 100 | \subfile{vcl_conversion.tex} 101 | 102 | % Permute, blend, lookup, gather and scatter functions 103 | \subfile{vcl_permute_functions.tex} 104 | 105 | % Mathematical functions 106 | \subfile{vcl_mathematical_functions.tex} 107 | 108 | % Performance 109 | \subfile{vcl_performance.tex} 110 | 111 | % Examples 112 | \subfile{vcl_examples.tex} 113 | 114 | % Application specific packages: 115 | % Decimal-string conversion 116 | % 3-dimensional vectors 117 | % complex number vectors 118 | % quaternions 119 | % Decimal conversion 120 | \subfile{vcl_packages.tex} 121 | 122 | % Error conditions 123 | % Implementation dependent behavior 124 | \subfile{vcl_errors_etc.tex} 125 | 126 | % Floating point behavior 127 | \subfile{vcl_float_behavior.tex} 128 | 129 | % Contributing 130 | % Test bench 131 | \subfile{vcl_contributing.tex} 132 | 133 | % File list 134 | \subfile{vcl_file_list.tex} 135 | 136 | 137 | 138 | \end{document} 139 | -------------------------------------------------------------------------------- /vcl_operators_and_functions.tex: -------------------------------------------------------------------------------- 1 | % chapter included in vclmanual.tex 2 | \documentclass[vcl_manual.tex]{subfiles} 3 | \begin{document} 4 | 5 | 6 | \chapter{Operators}\label{chap:Operators} 7 | 8 | \section{Arithmetic operators} 9 | 10 | \flushleft 11 | 12 | \vspacesmall 13 | \begin{tabular}{|p{25mm}|p{100mm}|} 14 | \hline 15 | 
\bfseries Operator & \texttt{+, ++, +=} \\ \hline 16 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 17 | \bfseries Description & addition \\ \hline 18 | \bfseries Efficiency & good \\ \hline 19 | \end{tabular} 20 | \begin{lstlisting}[frame=none] 21 | // Example: 22 | Vec4i a(10, 11, 12, 13); 23 | Vec4i b(20, 21, 22, 23); 24 | Vec4i c = a + b; // c = (30, 32, 34, 36) 25 | \end{lstlisting} 26 | 27 | 28 | \vspacesmall 29 | \begin{tabular}{|p{25mm}|p{100mm}|} 30 | \hline 31 | \bfseries Operator & \texttt{-, --, -=,} unary \texttt{-} \\ \hline 32 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 33 | \bfseries Description & subtraction \\ \hline 34 | \bfseries Efficiency & good \\ \hline 35 | \end{tabular} 36 | \begin{lstlisting}[frame=none] 37 | // Example: 38 | Vec4i a(10, 11, 12, 13); 39 | Vec4i b(20, 21, 22, 23); 40 | Vec4i c = a - b; // c = (-10, -10, -10, -10) 41 | \end{lstlisting} 42 | 43 | 44 | \vspacesmall 45 | \begin{tabular}{|p{25mm}|p{100mm}|} 46 | \hline 47 | \bfseries Operator & \texttt{*, *=} \\ \hline 48 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 49 | \bfseries Description & multiplication \\ \hline 50 | \bfseries Efficiency & 8 bit integers: poor \newline 51 | 16 bit integers: good \newline 52 | 32 bit integers: good for SSE4.1 and later instruction set, poor otherwise \newline 53 | 64 bit integers: good for AVX512DQ instruction set, poor otherwise \newline 54 | float: good \newline 55 | double: good 56 | \\ \hline 57 | \end{tabular} 58 | \begin{lstlisting}[frame=none] 59 | // Example: 60 | Vec4i a(10, 11, 12, 13); 61 | Vec4i b(20, 21, 22, 23); 62 | Vec4i c = a * b; // c = (200, 231, 264, 299) 63 | \end{lstlisting} 64 | 65 | 66 | \vspacesmall 67 | \begin{tabular}{|p{25mm}|p{100mm}|} 68 | \hline 69 | \bfseries Operator & \texttt{/, /=} (floating point) \\ \hline 70 | \bfseries Defined for & all floating point vector classes \\ \hline 71 | \bfseries 
Description & division \\ \hline 72 | \bfseries Efficiency & medium \\ \hline 73 | \end{tabular} 74 | \begin{lstlisting}[frame=none] 75 | // Example: 76 | Vec4f a(1.0f, 1.1f, 1.2f, 1.3f); 77 | Vec4f b(2.0f, 2.1f, 2.2f, 2.3f); 78 | Vec4f c = a / b; // c = (0.500f, 0.524f, 0.545f, 0.565f) 79 | \end{lstlisting} 80 | 81 | 82 | \vspacesmall 83 | \begin{tabular}{|p{25mm}|p{100mm}|} 84 | \hline 85 | \bfseries Operator & \texttt{/, /=} (integer vector divided by scalar) \\ \hline 86 | \bfseries Defined for & all classes of 8-bit, 16-bit and 32-bit integers, signed and unsigned. Not available for 64-bit integers \\ \hline 87 | \bfseries Description & division by scalar. Results are truncated to integer. All elements are divided by the same divisor. See page \pageref{IntegerDivision} for explanation 88 | \\ \hline 89 | \bfseries Efficiency & poor \\ \hline 90 | \end{tabular} 91 | \begin{lstlisting}[frame=none] 92 | // Example: 93 | Vec4i a(10, 11, 12, 13); 94 | int b = 3; 95 | Vec4i c = a / b; // c = (3, 3, 4, 4) 96 | \end{lstlisting} 97 | 98 | 99 | \vspacesmall 100 | \begin{tabular}{|p{25mm}|p{100mm}|} 101 | \hline 102 | \bfseries Operator & \texttt{/, /=} (integer vector divided by constant) \\ \hline 103 | \bfseries Defined for & all classes of 8-bit, 16-bit and 32-bit integers, signed and unsigned. Not available for 64-bit integers \\ \hline 104 | \bfseries Description & division by compile-time constant. All elements are divided by the same divisor. See page \pageref{IntegerDivision} for explanation \\ \hline 105 | \bfseries Efficiency & medium (better than division by scalar variable). 
\newline Good if divisor is a power of 2 \\ \hline 106 | \end{tabular} 107 | \begin{lstlisting}[frame=none] 108 | // Example, signed: 109 | Vec4i a(10, 11, 12, 13); 110 | Vec4i b = a / const_int(3); // b = (3, 3, 4, 4) 111 | // Example, unsigned: 112 | Vec4ui c(10, 11, 12, 13); 113 | Vec4ui d = c / const_uint(3); // d = (3, 3, 4, 4) 114 | \end{lstlisting} 115 | 116 | 117 | \section{Logic operators} \label{LogicOperators} 118 | 119 | \vspacesmall 120 | \begin{tabular}{|p{25mm}|p{100mm}|} 121 | \hline 122 | \bfseries Operator & $<<$, $<<=$ \\ \hline 123 | \bfseries Defined for & all integer vector classes \\ \hline 124 | \bfseries Description & bit shift left. All vector elements are shifted by the same amount. \newline 125 | Shifting left by n is a fast way of multiplying by $2^n$ \\ \hline 126 | \bfseries Efficiency & good \\ \hline 127 | \end{tabular} 128 | \begin{lstlisting}[frame=none] 129 | // Example: 130 | Vec4i a(10, 11, 12, 13); 131 | Vec4i b = a << 2; // b = (40, 44, 48, 52) 132 | \end{lstlisting} 133 | 134 | 135 | \vspacesmall 136 | \begin{tabular}{|p{25mm}|p{100mm}|} 137 | \hline 138 | \bfseries Operator & $>>$, $>>=$ \\ \hline 139 | \bfseries Defined for & all integer vector classes \\ \hline 140 | \bfseries Description & bit shift right. All vector elements are shifted by the same amount.\newline 141 | Unsigned integers use logical shift. \newline 142 | Signed integers use arithmetic shift (i.e. the sign bit is copied). \newline 143 | Shifting unsigned right by n is a fast way of dividing by $2^n$ 144 | \\ \hline 145 | \bfseries Efficiency & good \\ \hline 146 | \end{tabular} 147 | \begin{lstlisting}[frame=none] 148 | // Example: 149 | Vec4i a(10, 11, 12, 13); 150 | Vec4i b = a >> 2; // b = (2, 2, 3, 3) 151 | \end{lstlisting} 152 | 153 | 154 | \vspacesmall 155 | \begin{tabular}{|p{25mm}|p{100mm}|} 156 | \hline 157 | \bfseries Operator & == \\ \hline 158 | \bfseries Defined for & all vector classes \\ \hline 159 | \bfseries Description & test if equal. 
Result is a boolean vector \\ \hline 160 | \bfseries Efficiency & good \\ \hline 161 | \end{tabular} 162 | \begin{lstlisting}[frame=none] 163 | // Example: 164 | Vec4i a(10, 11, 12, 13); 165 | Vec4i b(14, 13, 12, 11); 166 | Vec4ib c = a == b; // c = (false, false, true, false) 167 | \end{lstlisting} 168 | 169 | 170 | \vspacesmall 171 | \begin{tabular}{|p{25mm}|p{100mm}|} 172 | \hline 173 | \bfseries Operator & != \\ \hline 174 | \bfseries Defined for & all vector classes \\ \hline 175 | \bfseries Description & test if not equal. Result is a boolean vector \\ \hline 176 | \bfseries Efficiency & good \\ \hline 177 | \end{tabular} 178 | \begin{lstlisting}[frame=none] 179 | // Example: 180 | Vec4i a(10, 11, 12, 13); 181 | Vec4i b(14, 13, 12, 11); 182 | Vec4ib c = a != b; // c = (true, true, false, true) 183 | \end{lstlisting} 184 | 185 | 186 | \vspacesmall 187 | \begin{tabular}{|p{25mm}|p{100mm}|} 188 | \hline 189 | \bfseries Operator & \textgreater \\ \hline 190 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 191 | \bfseries Description & test if bigger. Result is a boolean vector \\ \hline 192 | \bfseries Efficiency & good \\ \hline 193 | \end{tabular} 194 | \begin{lstlisting}[frame=none] 195 | // Example: 196 | Vec4i a(10, 11, 12, 13); 197 | Vec4i b(14, 13, 12, 11); 198 | Vec4ib c = a > b; // c = (false, false, false, true) 199 | \end{lstlisting} 200 | 201 | 202 | \vspacesmall 203 | \begin{tabular}{|p{25mm}|p{100mm}|} 204 | \hline 205 | \bfseries Operator & \textgreater= \\ \hline 206 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 207 | \bfseries Description & test if bigger or equal. 
Result is a boolean vector \\ \hline 208 | \bfseries Efficiency & good \\ \hline 209 | \end{tabular} 210 | \begin{lstlisting}[frame=none] 211 | // Example: 212 | Vec4i a(10, 11, 12, 13); 213 | Vec4i b(14, 13, 12, 11); 214 | Vec4ib c = a >= b; // c = (false, false, true, true) 215 | \end{lstlisting} 216 | 217 | 218 | \vspacesmall 219 | \begin{tabular}{|p{25mm}|p{100mm}|} 220 | \hline 221 | \bfseries Operator & \textless \\ \hline 222 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 223 | \bfseries Description & test if smaller. Result is a boolean vector \\ \hline 224 | \bfseries Efficiency & good \\ \hline 225 | \end{tabular} 226 | \begin{lstlisting}[frame=none] 227 | // Example: 228 | Vec4i a(10, 11, 12, 13); 229 | Vec4i b(14, 13, 12, 11); 230 | Vec4ib c = a < b; // c = (true, true, false, false) 231 | \end{lstlisting} 232 | 233 | \vspacesmall 234 | \begin{tabular}{|p{25mm}|p{100mm}|} 235 | \hline 236 | \bfseries Operator & \textless= \\ \hline 237 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 238 | \bfseries Description & test if smaller or equal. 
Result is a boolean vector \\ \hline 239 | \bfseries Efficiency & good \\ \hline 240 | \end{tabular} 241 | \begin{lstlisting}[frame=none] 242 | // Example: 243 | Vec4i a(10, 11, 12, 13); 244 | Vec4i b(14, 13, 12, 11); 245 | Vec4ib c = a <= b; // c = (true, true, true, false) 246 | \end{lstlisting} 247 | 248 | 249 | 250 | \vspacesmall 251 | \begin{tabular}{|p{25mm}|p{100mm}|} 252 | \hline 253 | \bfseries Operator & \&, \&= \\ \hline 254 | \bfseries Defined for & all vector classes \\ \hline 255 | \bfseries Description & bitwise and \\ \hline 256 | \bfseries Efficiency & good \\ \hline 257 | \end{tabular} 258 | \begin{lstlisting}[frame=none] 259 | // Example: 260 | Vec4i a(10, 11, 12, 13); 261 | Vec4i b(20, 21, 22, 23); 262 | Vec4i c = a & b; // c = (0, 1, 4, 5) 263 | \end{lstlisting} 264 | 265 | 266 | \vspacesmall 267 | \begin{tabular}{|p{25mm}|p{100mm}|} 268 | \hline 269 | \bfseries Operator & \texttt{$\vert$, $\vert=$} \\ \hline 270 | \bfseries Defined for & all vector classes \\ \hline 271 | \bfseries Description & bitwise or \\ \hline 272 | \bfseries Efficiency & good \\ \hline 273 | \end{tabular} 274 | \begin{lstlisting}[frame=none] 275 | // Example: 276 | Vec4i a(10, 11, 12, 13); 277 | Vec4i b(20, 21, 22, 23); 278 | Vec4i c = a | b; // c = (30, 31, 30, 31) 279 | \end{lstlisting} 280 | 281 | 282 | \vspacesmall 283 | \begin{tabular}{|p{25mm}|p{100mm}|} 284 | \hline 285 | \bfseries Operator & \textasciicircum \\ \hline 286 | \bfseries Defined for & all vector classes \\ \hline 287 | \bfseries Description & bitwise exclusive or \\ \hline 288 | \bfseries Efficiency & good \\ \hline 289 | \end{tabular} 290 | \begin{lstlisting}[frame=none] 291 | // Example: 292 | Vec4i a(10, 11, 12, 13); 293 | Vec4i b(20, 21, 22, 23); 294 | Vec4i c = a ^ b; // c = (30, 30, 26, 26) 295 | \end{lstlisting} 296 | 297 | 298 | \vspacesmall 299 | \begin{tabular}{|p{25mm}|p{100mm}|} 300 | \hline 301 | \bfseries Operator & $\sim$ \\ \hline 302 | \bfseries Defined for & all boolean and integer 
vector classes \\ \hline 303 | \bfseries Description & bitwise not \\ \hline 304 | \bfseries Efficiency & good \\ \hline 305 | \end{tabular} 306 | \begin{lstlisting}[frame=none] 307 | // Example: 308 | Vec4i a(10, 11, 12, 13); 309 | Vec4i b = ~a; // b = (-11, -12, -13, -14) 310 | \end{lstlisting} 311 | 312 | 313 | \vspacesmall 314 | \begin{tabular}{|p{25mm}|p{100mm}|} 315 | \hline 316 | \bfseries Operator & ! \\ \hline 317 | \bfseries Defined for & all vector classes \\ \hline 318 | \bfseries Description & logical not. Result is a boolean vector \\ \hline 319 | \bfseries Efficiency & good \\ \hline 320 | \end{tabular} 321 | \begin{lstlisting}[frame=none] 322 | // Example: 323 | Vec4i a(-1, 0, 1, 2); 324 | Vec4ib b = !a; // b = (false,true,false,false) 325 | \end{lstlisting} 326 | 327 | %\indenton % undo flushleft 328 | 329 | \section{Integer division} \label{IntegerDivision} 330 | 331 | There are no instructions in the x86 instruction set extensions that are useful for integer vector division, and such instructions might be quite slow if they existed. Therefore, the vector class library is using an algorithm for fast integer division. The basic principle of this algorithm can be expressed in this formula: 332 | \vspacesmall \newline 333 | $a / b \approx a * (2^n / b) >> n$ \newline 334 | \vspacesmall 335 | This calculation goes through the following steps: 336 | 337 | \begin{enumerate} 338 | \item find a suitable value for n 339 | \item calculate $2^n / b$ 340 | \item calculate necessary corrections for rounding errors 341 | \item do the multiplication and shift-right, and apply corrections for rounding errors 342 | \end{enumerate} 343 | 344 | This formula is advantageous if multiple numbers are divided by the same divisor b. Steps 1, 2 and 3 need only be done once while step 4 is repeated for each value of the dividend a. The mathematical details are described in the file vectori128.h. (See also T. Granlund and P. L. 
Montgomery: Division by Invariant Integers Using Multiplication, Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation) 345 | \vspacesmall 346 | 347 | The implementation in the vector class library uses various variants of this method with appropriate corrections for rounding errors to get the exact result truncated towards zero. 348 | 349 | The way to use this in your code depends on whether the divisor b is a variable or constant, and whether the same divisor is applied to multiple vectors. This is illustrated in the following examples: 350 | 351 | \begin{lstlisting}[frame=none] 352 | // Division example A: 353 | // A variable divisor is applied to one vector 354 | Vec4i a(10, 11, 12, 13);// dividend is an integer vector 355 | int b = 3; // divisor is an integer variable 356 | Vec4i c = a / b; // result c = (3, 3, 4, 4) 357 | \end{lstlisting} 358 | 359 | \begin{lstlisting}[frame=none] 360 | // Division example B: 361 | // The same divisor is applied to multiple vectors 362 | int b = 3; // divisor 363 | Divisor_i divb(b); // this object contains the results 364 | // of calculation steps 1, 2, and 3 365 | for (...) { // loop through multiple vectors 366 | Vec4i a = ... // get dividend 367 | a = a / divb; // do step 4 of the division 368 | ... // store results 369 | } 370 | \end{lstlisting} 371 | 372 | \begin{lstlisting}[frame=none] 373 | // Division example C: 374 | // The divisor is a constant, known at compile time 375 | Vec4i a(10, 11, 12, 13); // dividend is integer vector 376 | Vec4i c = a / const_int(3); // result c = (3, 3, 4, 4) 377 | \end{lstlisting} 378 | 379 | 380 | Explanation: 381 | 382 | The class \codei{Divisor\_i} in example B takes care of the calculation steps 1, 2 and 3 in the algorithm described above. The overloaded \codei{/} operator takes a vector on the left hand side and an object of class \codei{Divisor\_i} on the right hand side. 
This object is created before the loop with the divisor as parameter to the constructor. We are saving time by doing this time-consuming calculation only once while step 4 in the calculation is done multiple times inside the loop by \codei{a = a / divb;} 383 | \vspacesmall 384 | 385 | In example A, we are also creating an object of class \codei{Divisor\_i}, but this is done implicitly. The compiler sees an integer on the right hand side of the \codei{/} operator where it needs an object of class \codei{Divisor\_i}, and therefore converts the integer \codei{b} to such an object by calling the constructor \codei{Divisor\_i}(int). 386 | 387 | \vspacesmall 388 | The following divisor classes are available: 389 | 390 | \vspacesmall 391 | \begin{tabular}{|p{50mm}|p{50mm}|} 392 | \hline 393 | \bfseries Dividend vector type & \bfseries Divisor class required \\ \hline 394 | Vec16c, Vec32c , Vec64c & Divisor\_s \\ \hline 395 | Vec16uc, Vec32uc, Vec64uc & Divisor\_us \\ \hline 396 | Vec8s, Vec16s , Vec32s & Divisor\_s \\ \hline 397 | Vec8us, Vec16us , Vec32us & Divisor\_us \\ \hline 398 | Vec4i, Vec8i, Vec16i & Divisor\_i \\ \hline 399 | Vec4ui, Vec8ui, Vec16ui & Divisor\_ui \\ \hline 400 | \end{tabular} 401 | \vspacesmall 402 | 403 | If the divisor is a constant and the value is known at compile time, then we can use the method in example C. The implementation here uses macros and templates to do the calculation steps 1, 2 and 3 at compile time rather than at execution time. This makes the code even faster. 
The expression to put on the right-hand side of the \codei{/} operator looks as follows: 404 | 405 | \vspacesmall 406 | \begin{tabular}{|p{50mm}|p{50mm}|} 407 | \hline 408 | \bfseries Dividend vector type & \bfseries Divisor expression \\ \hline 409 | Vec16c, Vec32c, Vec64c & const\_int \\ \hline 410 | Vec16uc, Vec32uc, Vec64uc & const\_uint \\ \hline 411 | Vec8s, Vec16s, Vec32s & const\_int \\ \hline 412 | Vec8us, Vec16us, Vec32us & const\_uint \\ \hline 413 | Vec4i, Vec8i, Vec16i & const\_int \\ \hline 414 | Vec4ui, Vec8ui, Vec16ui & const\_uint \\ \hline 415 | \end{tabular} 416 | \vspacesmall 417 | 418 | The compiler will generate an error message if the parameter to \codei{const\_int} or \codei{const\_uint} is not a valid compile-time constant. (A valid compile time constant can contain integer literals and operators, as well as macros that are expanded to compile time constants, but not ordinary function calls). 419 | \vspacesmall 420 | 421 | A further advantage of the method in example C is that the code is able to use different methods for different values of the divisor. The division is particularly fast if the divisor is a power of 2. Make sure to use \codei{const\_int} or \codei{const\_uint} on the right hand side of the \codei{/} operator if you are dividing by 2, 4, 8, 16, etc. 422 | \vspacesmall 423 | 424 | Division is faster for vectors of 16-bit integers than for vectors of 8-bit or 32-bit integers. There is no support for division of vectors of 64-bit integers. Unsigned division is faster than signed division. 425 | 426 | 427 | \chapter{Functions}\label{chap:Functions} 428 | 429 | \section{Integer functions} 430 | \flushleft 431 | 432 | \vspacesmall 433 | \begin{tabular}{|p{25mm}|p{100mm}|} 434 | \hline 435 | \bfseries Function & horizontal\_add \\ \hline 436 | \bfseries Defined for & all integer vector classes \\ \hline 437 | \bfseries Description & calculates the sum of all vector elements \\ \hline 438 | \bfseries Efficiency & medium. 
For best performance, use normal (vertical) addition where possible. \\ \hline 439 | \end{tabular} 440 | \begin{lstlisting}[frame=none] 441 | // Example: 442 | Vec4i a(10, 11, 12, 13); 443 | int b = horizontal_add(a); // b = 46 444 | \end{lstlisting} 445 | 446 | 447 | \vspacesmall 448 | \begin{tabular}{|p{25mm}|p{100mm}|} 449 | \hline 450 | \bfseries Function & horizontal\_add\_x \\ \hline 451 | \bfseries Defined for & all 8-bit, 16-bit and 32-bit integer vector classes \\ \hline 452 | \bfseries Description & calculates the sum of all vector elements. The sum is calculated with a higher number of bits to avoid overflow 453 | \\ \hline 454 | \bfseries Efficiency & medium (slower than horizontal\_add) \\ \hline 455 | \end{tabular} 456 | \begin{lstlisting}[frame=none] 457 | // Example: 458 | Vec4i a(10, 11, 12, 13); 459 | int64_t b = horizontal_add_x(a); // b = 46 460 | \end{lstlisting} 461 | 462 | \vspacesmall 463 | \begin{tabular}{|p{25mm}|p{100mm}|} 464 | \hline 465 | \bfseries Function & horizontal\_min, horizontal\_max \\ \hline 466 | \bfseries Defined for & all integer vector classes \\ \hline 467 | \bfseries Description & Returns the lowest or highest element in a vector. \\ \hline 468 | \bfseries Efficiency & medium \\ \hline 469 | \end{tabular} 470 | \begin{lstlisting}[frame=none] 471 | // Example: 472 | Vec4i a(1, 8, -5, 3); 473 | int b = horizontal_min(a); // b = -5 474 | int c = horizontal_max(a); // c = 8 475 | \end{lstlisting} 476 | 477 | \vspacesmall 478 | \begin{tabular}{|p{25mm}|p{100mm}|} 479 | \hline 480 | \bfseries Function & add\_saturated\\ \hline 481 | \bfseries Defined for & all 8-bit, 16-bit and 32-bit integer vector classes \\ \hline 482 | \bfseries Description & same as operator +. Overflow is handled by saturation rather than wrap-around \\ \hline 483 | \bfseries Efficiency & fast for 8-bit and 16-bit integers. 
Medium for 32-bit integers \\ \hline 484 | \end{tabular} 485 | \begin{lstlisting}[frame=none] 486 | // Example: 487 | Vec4i a(0x10000000, 0x20000000, 0x30000000, 0x40000000); 488 | Vec4i b(0x30000000, 0x40000000, 0x50000000, 0x60000000); 489 | Vec4i c = add_saturated(a, b); 490 | // c = (0x40000000, 0x60000000, 0x7FFFFFFF, 0x7FFFFFFF) 491 | Vec4i d = a + b; 492 | // d = (0x40000000, 0x60000000, -0x80000000, -0x60000000) 493 | \end{lstlisting} 494 | 495 | 496 | \vspacesmall 497 | \begin{tabular}{|p{25mm}|p{100mm}|} 498 | \hline 499 | \bfseries Function & sub\_saturated\\ \hline 500 | \bfseries Defined for & all 8-bit, 16-bit and 32-bit integer vector classes \\ \hline 501 | \bfseries Description & same as operator -. Overflow is handled by saturation rather than wrap-around \\ \hline 502 | \bfseries Efficiency & fast for 8-bit and 16-bit integers. Medium for 32-bit integers \\ \hline 503 | \end{tabular} 504 | \begin{lstlisting}[frame=none] 505 | // Example: 506 | Vec4i a(-0x10000000,-0x20000000,-0x30000000,-0x40000000); 507 | Vec4i b( 0x30000000, 0x40000000, 0x50000000, 0x60000000); 508 | Vec4i c = sub_saturated(a, b); 509 | // c = (-0x40000000,-0x60000000,-0x80000000,-0x80000000) 510 | Vec4i d = a - b; 511 | // d = (-0x40000000,-0x60000000,-0x80000000, 0x60000000) 512 | \end{lstlisting} 513 | 514 | 515 | \vspacesmall 516 | \begin{tabular}{|p{25mm}|p{100mm}|} 517 | \hline 518 | \bfseries Function & max \\ \hline 519 | \bfseries Defined for & all integer vector classes \\ \hline 520 | \bfseries Description & returns the biggest of two values \\ \hline 521 | \bfseries Efficiency & medium for 64-bit integers with instruction sets lower than SSE4.2. 
Fast otherwise \\ \hline 522 | \end{tabular} 523 | \begin{lstlisting}[frame=none] 524 | Vec4i a(10, 11, 12, 13); 525 | Vec4i b(14, 13, 12, 11); 526 | Vec4i c = max(a, b); // c = (14, 13, 12, 13) 527 | \end{lstlisting} 528 | 529 | 530 | \vspacesmall 531 | \begin{tabular}{|p{25mm}|p{100mm}|} 532 | \hline 533 | \bfseries Function & min \\ \hline 534 | \bfseries Defined for & all integer vector classes \\ \hline 535 | \bfseries Description & returns the smallest of two values \\ \hline 536 | \bfseries Efficiency & medium for 64-bit integers with instruction sets lower than SSE4.2. Fast otherwise \\ \hline 537 | \end{tabular} 538 | \begin{lstlisting}[frame=none] 539 | // Example: 540 | Vec4i a(10, 11, 12, 13); 541 | Vec4i b(14, 13, 12, 11); 542 | Vec4i c = min(a, b); // c = (10, 11, 12, 11) 543 | \end{lstlisting} 544 | 545 | 546 | \vspacesmall 547 | \begin{tabular}{|p{25mm}|p{100mm}|} 548 | \hline 549 | \bfseries Function & abs \\ \hline 550 | \bfseries Defined for & all signed integer vector classes \\ \hline 551 | \bfseries Description & calculates the absolute value \\ \hline 552 | \bfseries Efficiency & medium \\ \hline 553 | \end{tabular} 554 | \begin{lstlisting}[frame=none] 555 | // Example: 556 | Vec4i a(-1, 0, 1, 2); 557 | Vec4i b = abs(a); // b = (1, 0, 1, 2) 558 | \end{lstlisting} 559 | 560 | 561 | \vspacesmall 562 | \begin{tabular}{|p{25mm}|p{100mm}|} 563 | \hline 564 | \bfseries Function & abs\_saturated \\ \hline 565 | \bfseries Defined for & all signed integer vector classes \\ \hline 566 | \bfseries Description & calculates the absolute value. 
Overflow saturates to make sure the result is never negative when the input is INT\_MIN 567 | \\ \hline 568 | \bfseries Efficiency & medium (slower than abs) \\ \hline 569 | \end{tabular} 570 | \begin{lstlisting}[frame=none] 571 | // Example: 572 | Vec4i a(-0x80000000, -1, 0, 1); 573 | Vec4i b = abs_saturated(a); // b=( 0x7FFFFFFF,1,0,1) 574 | Vec4i c = abs(a); // c=(-0x80000000,1,0,1) 575 | \end{lstlisting} 576 | 577 | 578 | \vspacesmall 579 | \begin{tabular}{|p{25mm}|p{100mm}|} 580 | \hline 581 | \bfseries Function & rotate\_left(vector, int) \\ \hline 582 | \bfseries Defined for & all signed integer vector classes \\ \hline 583 | \bfseries Description & rotates the bits of each element. Use a negative count to rotate right \\ \hline 584 | \bfseries Efficiency & 8 bit: poor \newline 585 | 16 bit: medium \newline 586 | 32 and 64 bit: good for AVX512DQ instruction set, medium otherwise. 587 | \\ \hline 588 | \end{tabular} 589 | \begin{lstlisting}[frame=none] 590 | // Example: 591 | Vec4i a(0x12345678, 0x0000FFFF, 0xA000B000, 0x00000001); 592 | Vec4i b = rotate_left(a, 8); 593 | // b = (0x34567812, 0x00FFFF00, 0x00B000A0, 0x00000100) 594 | \end{lstlisting} 595 | 596 | 597 | \vspacesmall 598 | \begin{tabular}{|p{25mm}|p{100mm}|} 599 | \hline 600 | \bfseries Function & 601 | vector shift\_bytes\_up\textless n\textgreater(vector)\newline 602 | vector shift\_bytes\_down\textless n\textgreater(vector) 603 | \\ \hline 604 | \bfseries Defined for & Vec16c, Vec32c, Vec64c \\ \hline 605 | \bfseries Description & shifts the bytes of a vector up or down and inserts zeroes at the vacant places \\ \hline 606 | \bfseries Efficiency & 607 | Vec16c: Good for SSSE3, medium otherwise \newline 608 | Vec32c: Good for AVX2, medium otherwise \newline 609 | Vec64c: Good for AVX512BW, medium otherwise \\ \hline 610 | \end{tabular} 611 | \begin{lstlisting}[frame=none] 612 | // Example: 613 | Vec16c a(10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25); 614 | Vec16c b = shift_bytes_up<5>(a); 
615 | // b = (0,0,0,0,0,10,11,12,13,14,15,16,17,18,19,20) 616 | \end{lstlisting} 617 | 618 | 619 | 620 | \section{Floating point simple functions} 621 | 622 | \vspacesmall 623 | \begin{tabular}{|p{25mm}|p{100mm}|} 624 | \hline 625 | \bfseries Function & horizontal\_add \\ \hline 626 | \bfseries Defined for & all floating point vector classes \\ \hline 627 | \bfseries Description & calculates the sum of all vector elements \\ \hline 628 | \bfseries Efficiency & medium. For best performance, use normal (vertical) addition where possible. \\ \hline 629 | \end{tabular} 630 | \begin{lstlisting}[frame=none] 631 | // Example: 632 | Vec4f a(1.0f, 1.1f, 1.2f, 1.3f); 633 | float b = horizontal_add(a); // b = 4.6 634 | \end{lstlisting} 635 | 636 | 637 | \vspacesmall 638 | \begin{tabular}{|p{25mm}|p{100mm}|} 639 | \hline 640 | \bfseries Function & max \newline min \\ \hline 641 | \bfseries Defined for & all floating point vector classes \\ \hline 642 | \bfseries Description & returns the biggest/smallest of two values \\ \hline 643 | \bfseries Efficiency & good \\ \hline 644 | \end{tabular} 645 | \vspacesmall 646 | 647 | \codei{max(a,b)} is equivalent to \codei{a > b ? a : b }\\ 648 | \codei{min(a,b)} is equivalent to \codei{a < b ? a : b }\\ 649 | \vspacesmall 650 | 651 | These functions will not return a NAN if the first parameter is NAN.\\ 652 | These functions make no distinction between 0 and -0. 
653 | \begin{lstlisting}[frame=none] 654 | // Example: 655 | Vec4f a(1.0f, 1.1f, 1.2f, 1.3f); 656 | Vec4f b(1.4f, 1.3f, 1.2f, 1.1f); 657 | Vec4f c = max(a, b); // c = (1.4f, 1.3f, 1.2f, 1.3f) 658 | \end{lstlisting} 659 | 660 | \vspacesmall 661 | \begin{tabular}{|p{25mm}|p{100mm}|} 662 | \hline 663 | \bfseries Function & maximum \newline minimum \\ \hline 664 | \bfseries Defined for & all floating point vector classes \\ \hline 665 | \bfseries Description & returns the biggest/smallest of two values \\ \hline 666 | \bfseries Efficiency & good, but slower than max / min \\ \hline 667 | \end{tabular} 668 | \vspacesmall 669 | 670 | These functions are similar to max and min, but they are sure to propagate NAN values.\\ 671 | The sign of zero is ignored unless SIGNED\_ZERO is defined. 672 | \vspacesmall 673 | 674 | \vspacesmall 675 | \begin{tabular}{|p{25mm}|p{100mm}|} 676 | \hline 677 | \bfseries Function & horizontal\_min, horizontal\_max \\ \hline 678 | \bfseries Defined for & all floating point vector classes \\ \hline 679 | \bfseries Description & Returns the lowest or highest element in a vector.\newline 680 | NANs are propagated. The sign of zero is ignored. 
\\ \hline 681 | \bfseries Efficiency & medium \\ \hline 682 | \end{tabular} 683 | \begin{lstlisting}[frame=none] 684 | // Example: 685 | Vec4f a(1.0f, 8.0f, -5.0f, 3.0f); 686 | float b = horizontal_min(a); // b = -5.0f 687 | float c = horizontal_max(a); // c = 8.0f 688 | \end{lstlisting} 689 | 690 | 691 | \vspacebig 692 | \begin{tabular}{|p{25mm}|p{100mm}|} 693 | \hline 694 | \bfseries Function & abs \\ \hline 695 | \bfseries Defined for & all floating point vector classes \\ \hline 696 | \bfseries Description & gets the absolute value \\ \hline 697 | \bfseries Efficiency & good \\ \hline 698 | \end{tabular} 699 | \begin{lstlisting}[frame=none] 700 | // Example: 701 | Vec4f a(-1.0f, 0.0f, 1.0f, 2.0f); 702 | Vec4f b = abs(a); // b = (1.0f, 0.0f, 1.0f, 2.0f) 703 | \end{lstlisting} 704 | \vspacesmall 705 | 706 | 707 | \begin{tabular}{|p{25mm}|p{100mm}|} 708 | \hline 709 | \bfseries Function & change\_sign\textless i0, i1, ...\textgreater(vector) \\ \hline 710 | \bfseries Defined for & all floating point vector classes \\ \hline 711 | \bfseries Description & changes sign of selected vector elements.\newline 712 | Each template parameter is 1 for changing sign of the corresponding element, and 0 for no change. 
\\ \hline 713 | \bfseries Efficiency & good \\ \hline 714 | \end{tabular} 715 | \begin{lstlisting}[frame=none] 716 | // Example: 717 | Vec4f a(10.0f, 11.0f, -12.0f, 13.0f); 718 | Vec4f b = change_sign<0,1,1,0>(a); // b = (10.f, -11.f, 12.f, 13.f) 719 | \end{lstlisting} 720 | \vspacesmall 721 | 722 | \begin{tabular}{|p{25mm}|p{100mm}|} 723 | \hline 724 | \bfseries Function & sign\_combine(vector a, vector b) \\ \hline 725 | \bfseries Defined for & all floating point vector classes \\ \hline 726 | \bfseries Description & Returns the value of a, with the sign inverted if b has its sign bit set.\newline 727 | Corresponds to select(sign\_bit(b), -a, a) \\ \hline 728 | \bfseries Efficiency & good \\ \hline 729 | \end{tabular} 730 | \begin{lstlisting}[frame=none] 731 | // Example: 732 | Vec4f a(-2.0f, -1.0f, 0.0f, 1.0f); 733 | Vec4f b(-10.f, 0.0f, -20.f, 30.f); 734 | Vec4f c = sign_combine(a, b); // c = (2.0f, -1.0f, -0.0f, 1.0f) 735 | \end{lstlisting} 736 | \vspacesmall 737 | 738 | 739 | \begin{tabular}{|p{25mm}|p{100mm}|} 740 | \hline 741 | \bfseries Function & sign\_bit \\ \hline 742 | \bfseries Defined for & all floating point vector classes \\ \hline 743 | \bfseries Description & returns a boolean vector with true for elements that have the sign bit set, including -0.0, -INF, and -NAN \\ \hline 744 | \bfseries Efficiency & medium \\ \hline 745 | \end{tabular} 746 | \begin{lstlisting}[frame=none] 747 | // Example: 748 | Vec4f a(-1.0f, 0.0f, 1.0f, 2.0f); 749 | Vec4fb b = sign_bit(a); // b = (true, false, false, false) 750 | \end{lstlisting} 751 | \vspacesmall 752 | 753 | 754 | \begin{tabular}{|p{25mm}|p{100mm}|} 755 | \hline 756 | \bfseries Function & sqrt \\ \hline 757 | \bfseries Defined for & all floating point vector classes \\ \hline 758 | \bfseries Description & calculates the square root \\ \hline 759 | \bfseries Efficiency & poor \\ \hline 760 | \end{tabular} 761 | \begin{lstlisting}[frame=none] 762 | // Example: 763 | Vec4f a(0.0f, 1.0f, 2.0f, 3.0f); 764 | 
Vec4f b = sqrt(a); // b = (0.000f, 1.000f, 1.414f, 1.732f) 765 | \end{lstlisting} 766 | 767 | 768 | \vspacesmall 769 | \begin{tabular}{|p{25mm}|p{100mm}|} 770 | \hline 771 | \bfseries Function & square \\ \hline 772 | \bfseries Defined for & all floating point vector classes \\ \hline 773 | \bfseries Description & calculates the square \\ \hline 774 | \bfseries Efficiency & good \\ \hline 775 | \end{tabular} 776 | \begin{lstlisting}[frame=none] 777 | // Example: 778 | Vec4f a(0.0f, 1.0f, 2.0f, 3.0f); 779 | Vec4f b = square(a); // b = (0.0f, 1.0f, 4.0f, 9.0f) 780 | \end{lstlisting} 781 | 782 | 783 | \label{powVectorInt} 784 | \vspacesmall 785 | \begin{tabular}{|p{25mm}|p{100mm}|} 786 | \hline 787 | \bfseries Function & pow(vector x, int n) \\ \hline 788 | \bfseries Defined for & all floating point vector classes \\ \hline 789 | \bfseries Description & raises all vector elements to the same integer power. 790 | Will generate a compiler error if n is floating point and vectormath\_exp.h is not included, or in general if n is not of type int. 791 | See page \pageref{ExpLogFunctions} for pow with floating point exponent. 
792 | \\ \hline 793 | \bfseries Precision & slightly imprecise for high values of n due to accumulation of rounding errors \\ \hline 794 | \bfseries Efficiency & medium \\ \hline 795 | \end{tabular} 796 | \begin{lstlisting}[frame=none] 797 | // Example: 798 | Vec4f a(0.0f, 1.0f, 2.0f, 3.0f); 799 | int b = 3; 800 | Vec4f c = pow(a, b); // c = (0.0f, 1.0f, 8.0f, 27.0f) 801 | \end{lstlisting} 802 | 803 | 804 | \label{powConstVectorInt} 805 | \vspacesmall 806 | \begin{tabular}{|p{25mm}|p{100mm}|} 807 | \hline 808 | \bfseries Function & pow\_const(vector x, const int n) \\ \hline 809 | \bfseries Defined for & all floating point vector classes \\ \hline 810 | \bfseries Description & raises all vector elements to the same integer power n, where n is a compile-time constant \\ \hline 811 | \bfseries Precision & slightly imprecise for high values of n due to accumulation of rounding errors \\ \hline 812 | \bfseries Efficiency & medium, often better than pow(vector, int) \\ \hline 813 | \end{tabular} 814 | \begin{lstlisting}[frame=none] 815 | // Example: 816 | Vec4f a(0.0f, 1.0f, 2.0f, 3.0f); 817 | Vec4f c = pow_const(a, 3); // c = (0.0f, 1.0f, 8.0f, 27.0f) 818 | \end{lstlisting} 819 | 820 | 821 | \vspacesmall 822 | \begin{tabular}{|p{25mm}|p{100mm}|} 823 | \hline 824 | \bfseries Function & round \\ \hline 825 | \bfseries Defined for & all floating point vector classes \\ \hline 826 | \bfseries Description & round to nearest integer (even value if two values are equally near). The value is returned as a floating point vector.\newline 827 | See also roundi and round\_to\_int32 on page \pageref{roundToInt}. 
\\ \hline 828 | \bfseries Efficiency & good if SSE4.1 or higher instruction set \\ \hline 829 | \end{tabular} 830 | \begin{lstlisting}[frame=none] 831 | // Example: 832 | Vec4f a(1.0f, 1.4f, 1.5f, 1.6f); 833 | Vec4f b = round(a); // b = (1.0f, 1.0f, 2.0f, 2.0f) 834 | \end{lstlisting} 835 | 836 | 837 | \vspacesmall 838 | \begin{tabular}{|p{25mm}|p{100mm}|} 839 | \hline 840 | \bfseries Function & truncate \\ \hline 841 | \bfseries Defined for & all floating point vector classes \\ \hline 842 | \bfseries Description & truncates number towards zero. The value is returned as a floating point vector. \newline 843 | See also truncatei and truncate\_to\_int32 on page \pageref{truncateToInt}. \\ \hline 844 | \bfseries Efficiency & good if SSE4.1 or higher instruction set \\ \hline 845 | \bfseries Note & may be slightly inaccurate for x \textgreater{} $10^7$ if instruction set is less than SSE4.1 \\ \hline 846 | \end{tabular} 847 | \begin{lstlisting}[frame=none] 848 | // Example: 849 | Vec4f a(1.0f, 1.5f, 1.9f, 2.0f); 850 | Vec4f b = truncate(a); // b = (1.0f, 1.0f, 1.0f, 2.0f) 851 | \end{lstlisting} 852 | 853 | 854 | \vspacesmall 855 | \begin{tabular}{|p{25mm}|p{100mm}|} 856 | \hline 857 | \bfseries Function & floor \\ \hline 858 | \bfseries Defined for & all floating point vector classes \\ \hline 859 | \bfseries Description & rounds number towards $-\infty$. 
The value is returned as a floating point vector \\ \hline 860 | \bfseries Efficiency & good if SSE4.1 or higher instruction set \\ \hline 861 | \bfseries Note & may be slightly inaccurate for x \textgreater{} $10^7$ if instruction set is less than SSE4.1 \\ \hline 862 | \end{tabular} 863 | \begin{lstlisting}[frame=none] 864 | // Example: 865 | Vec4f a(-0.5f, 1.5f, 1.9f, 2.0f); 866 | Vec4f b = floor(a); // b = (-1.0f, 1.0f, 1.0f, 2.0f) 867 | \end{lstlisting} 868 | 869 | 870 | \vspacesmall 871 | \begin{tabular}{|p{25mm}|p{100mm}|} 872 | \hline 873 | \bfseries Function & ceil \\ \hline 874 | \bfseries Defined for & all floating point vector classes \\ \hline 875 | \bfseries Description & rounds number towards $+\infty$. The value is returned as a floating point vector \\ \hline 876 | \bfseries Efficiency & good if SSE4.1 or higher instruction set \\ \hline 877 | \bfseries Note & may be slightly inaccurate for x \textgreater{} $10^7$ if instruction set is less than SSE4.1 \\ \hline 878 | \end{tabular} 879 | \begin{lstlisting}[frame=none] 880 | // Example: 881 | Vec4f a(-0.5f, 1.1f, 1.9f, 2.0f); 882 | Vec4f b = ceil(a); // b = (0.0f, 2.0f, 2.0f, 2.0f) 883 | \end{lstlisting} 884 | 885 | 886 | \vspacesmall 887 | \begin{tabular}{|p{25mm}|p{100mm}|} 888 | \hline 889 | \bfseries Function & approx\_recipr \\ \hline 890 | \bfseries Defined for & single and half precision floating point vectors \\ \hline 891 | \bfseries Description & fast approximate calculation of reciprocal \\ \hline 892 | \bfseries Precision & the relative accuracy depends on the instruction set:\newline 893 | Default: $2^{-11}$\newline 894 | AVX512F: $2^{-14}$\newline 895 | AVX512ER: full precision \\ \hline 896 | \bfseries Efficiency & good \\ \hline 897 | \end{tabular} 898 | \begin{lstlisting}[frame=none] 899 | // Example: 900 | Vec4f a(1.5f, 2.0f, 3.0f, 4.0f); 901 | Vec4f b(0.5f, 1.0f, 0.5f, 1.0f); 902 | Vec4f c = a * approx_recipr(b); // c approximates a/b 903 | \end{lstlisting} 904 | 905 | 906 |
\vspacesmall 907 | \begin{tabular}{|p{25mm}|p{100mm}|} 908 | \hline 909 | \bfseries Function & approx\_rsqrt \\ \hline 910 | \bfseries Defined for & single and half precision floating point vectors \\ \hline 911 | \bfseries Description & reciprocal square root. Fast approximate calculation of value to the power of -0.5 \\ \hline 912 | \bfseries Precision & the relative accuracy depends on the instruction set:\newline 913 | Default: $2^{-11}$\newline 914 | AVX512F: $2^{-14}$\newline 915 | AVX512ER: full precision \\ \hline 916 | \bfseries Efficiency & good \\ \hline 917 | \end{tabular} 918 | \begin{lstlisting}[frame=none] 919 | // Example: 920 | Vec4f a(1.0f, 2.0f, 3.0f, 4.0f); 921 | Vec4f b = approx_rsqrt(a) * a; // b approximates sqrt(a) 922 | \end{lstlisting} 923 | \vspacesmall 924 | 925 | 926 | \end{document} -------------------------------------------------------------------------------- /vcl_packages.tex: -------------------------------------------------------------------------------- 1 | % chapter included in vclmanual.tex 2 | \documentclass[vcl_manual.tex]{subfiles} 3 | \begin{document} 4 | 5 | \chapter{Add-on packages}\label{chap:AddOnPackages} 6 | \flushleft 7 | 8 | Various extra packages are available with code for special applications. 9 | These packages are stored at 10 | \url{https://github.com/vectorclass/add-on}. 11 | Manuals are included with each package. The add-on packages for VCL include: 12 | 13 | \begin{description} 14 | 15 | \item[Container classes.] 16 | Container class templates for storing arrays of vectors. More efficient than the standard C++ container class templates. \newline 17 | This package also contains a class template for matrices where matrix rows are stored as VCL vectors. Various functions are included for accessing matrix elements and rows and for packing and unpacking matrix data. 18 | 19 | \item[Random number generator.] 20 | A high-quality pseudo random number generator.
Capable of generating random integer and floating point vectors. Suitable for large multi-threaded applications. 21 | 22 | \item[Decimal string conversion.] 23 | Converts integer vectors to and from comma-separated lists in human-readable decimal ASCII form. Useful for reading and writing comma-separated files. 24 | 25 | \item[3-dimensional vectors.] 26 | Defines 3-dimensional vectors for use in geometry and physics. 27 | Includes operators and functions for addition, multiplication, dot product, cross product, and rotation. 28 | 29 | \item[Complex number vectors.] 30 | Defines complex number vectors for use in mathematics and electronics. 31 | Includes operators for add, subtract, multiply, divide, and conjugate, as well as functions such as complex square root, exponential function, and logarithm. 32 | 33 | \item[Quaternions.] 34 | Defines quaternions (hypercomplex numbers) for use in mathematics. 35 | Includes operators for add, subtract, multiply, divide, conjugate, etc. 36 | 37 | 38 | % add more packages here 39 | 40 | 41 | \end{description} 42 | \vspacesmall 43 | 44 | 45 | \end{document} -------------------------------------------------------------------------------- /vcl_permute_functions.tex: -------------------------------------------------------------------------------- 1 | % chapter included in vclmanual.tex 2 | \documentclass[vcl_manual.tex]{subfiles} 3 | \begin{document} 4 | 5 | 6 | \chapter{Permute, blend, lookup, gather and scatter functions}\label{chap:PermuteBlendEtc} 7 | 8 | \section{Permute functions}\label{PermuteFunctions} 9 | \flushleft 10 | 11 | \vspacesmall 12 | \begin{tabular}{|p{30mm}|p{120mm}|} 13 | \hline 14 | \bfseries Function & permute..\textless i0, i1, ...\textgreater(vector) \\ \hline 15 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 16 | \bfseries Description & permutes vector elements \\ \hline 17 | \bfseries Efficiency & depends on parameters and instruction set \\ \hline 18 | \end{tabular} 19 
| \vspacesmall 20 | 21 | The permute functions can move any element of a vector into any position, copy the same element to multiple positions, and set any element to zero. 22 | \vspacesmall 23 | 24 | The name of the permute function is "permute" followed by the number of vector elements, for example permute4 for Vec4i. The permute function for a vector of $n$ elements has $n$ indexes, which are entered as template parameters in angle brackets. Each index indicates the desired contents of the corresponding element in the result vector. An index $i$ in the interval 25 | $0 \leq i \leq n-1$ indicates that element number $i$ from the input vector should be placed in the corresponding position in the result vector. An index $i = -1$ gives a zero in the corresponding position. An index $i$ = V\_DC means don't care. This will give whatever implementation is fastest, regardless of what value it puts in this position. The value you get with "don't care" may be different for different implementations or different instruction sets. 26 | \vspacesmall 27 | 28 | \begin{lstlisting}[frame=none] 29 | // Example: 30 | Vec4i a(10, 11, 12, 13); 31 | Vec4i b = permute4<2,2,3,0>(a); // b = (12, 12, 13, 10) 32 | Vec4i c = permute4<-1,-1,1,1>(a); // c = ( 0, 0, 11, 11) 33 | \end{lstlisting} 34 | \vspacesmall 35 | 36 | The indexes in angle brackets must be compile-time constants, they cannot contain variables or function calls. If you need variable indexes then use the lookup functions instead (see page \pageref{LookupFunctions}). 37 | \vspacesmall 38 | 39 | The permute functions are using advanced metaprogramming techniques in order to find the optimal combination of instructions that fit the given set of indexes and the specified instruction set. The optimization criteria include number of instructions, instruction latency, and data cache use. 
The metaprogramming may produce extra code when compiling in debug mode, but this extra code is eliminated when compiling for release mode with optimization on. The call to a permute function is reduced to just one or a few machine instructions in favorable cases. 40 | \vspacesmall 41 | 42 | The performance is generally good when the instruction set SSSE3 or higher is enabled. The performance for permuting vectors of 16-bit integers is medium, and the performance for permuting vectors of 8-bit integers is poor for instruction sets lower than SSSE3. You may get the best performance with instruction set AVX2 or AVX512VL. 43 | \vspacesmall 44 | 45 | 46 | \section{Blend functions}\label{BlendFunctions} 47 | 48 | \vspacesmall 49 | \begin{tabular}{|p{30mm}|p{120mm}|} 50 | \hline 51 | \bfseries Function & blend..\textless i0, i1, ...\textgreater(vector, vector) \\ \hline 52 | \bfseries Defined for & all integer and floating point vector classes \\ \hline 53 | \bfseries Description & permutes and blends elements from two vectors \\ \hline 54 | \bfseries Efficiency & depends on parameters and instruction set \\ \hline 55 | \end{tabular} 56 | \vspacesmall 57 | 58 | The blend functions are similar to the permute functions, but with two input vectors. 59 | The name of the function is "blend" followed by the number of vector elements, for example blend4 for Vec4i. The blend function for a vector of $n$ elements has $n$ indexes, which are entered as template parameters in angle brackets. Each index indicates the desired contents of the corresponding element in the result vector. The indexes must be compile-time constants. 60 | An index $i$ in the interval $0 \leq i \leq n-1$ indicates that element number $i$ from the first input vector should be placed in the corresponding position in the result vector. 
An index $i$ in the interval $n \leq i \leq 2 \cdot n-1$ indicates that element number $i-n$ from the second input vector should be placed in the corresponding position in the result vector. An index $i = -1$ gives a zero in the corresponding position. An index $i$ = V\_DC means don't care. 61 | \vspacesmall 62 | 63 | The blend functions are using metaprogramming in the same way as the permute functions. The performance is similar to the permute functions, or slightly lower. 64 | \vspacesmall 65 | 66 | \begin{lstlisting}[frame=none] 67 | // Example: 68 | Vec4i a(10, 11, 12, 13); 69 | Vec4i b(20, 21, 22, 23); 70 | Vec4i c = blend4<4,0,6,3>(a, b); // c = (20, 10, 22, 13) 71 | \end{lstlisting} 72 | \vspacesmall 73 | 74 | There are different methods you can use if you want to blend inputs from more than two vectors: 75 | \vspacesmall 76 | 77 | 1. A binary tree of blend calls, where unused values are set to V\_DC meaning don't care. 78 | \begin{lstlisting}[frame=none] 79 | // Example: 80 | Vec4i a(10, 11, 12, 13); 81 | Vec4i b(20, 21, 22, 23); 82 | Vec4i c(30, 31, 32, 33); 83 | Vec4i d(40, 41, 42, 43); 84 | Vec4i r = blend4<0,5,V_DC,V_DC>(a, b);// r = (10,21,?,?) 85 | Vec4i s = blend4<V_DC,V_DC,2,7>(c, d);// s = (?,?,32,43) 86 | Vec4i t = blend4<0,1,6,7>(r, s); // t = (10,21,32,43) 87 | \end{lstlisting} 88 | \vspacesmall 89 | 90 | 2. Set unused values to zero, then OR the results. 91 | \begin{lstlisting}[frame=none] 92 | // Example: 93 | Vec4i a(10, 11, 12, 13); 94 | Vec4i b(20, 21, 22, 23); 95 | Vec4i c(30, 31, 32, 33); 96 | Vec4i d(40, 41, 42, 43); 97 | Vec4i r = blend4<0,5,-1,-1>(a, b);// r = (10,21,0,0) 98 | Vec4i s = blend4<-1,-1,2,7>(c, d);// s = (0,0,32,43) 99 | Vec4i t = r | s; // t = (10,21,32,43) 100 | \end{lstlisting} 101 | \vspacesmall 102 | 103 | 3. If the input vectors are stored sequentially in memory then use the lookup functions shown below.
104 | \vspacesmall 105 | 106 | 107 | \section{Lookup functions}\label{LookupFunctions} 108 | \vspacesmall 109 | 110 | \begin{tabular}{|p{30mm}|p{120mm}|} 111 | \hline 112 | \bfseries Function & Vec16c lookup16(Vec16c, Vec16c) \newline 113 | Vec32c lookup32(Vec32c, Vec32c) \newline 114 | Vec64c lookup64(Vec64c, Vec64c) \newline 115 | Vec8s lookup8(Vec8s, Vec8s) \newline 116 | Vec16s lookup16(Vec16s, Vec16s) \newline 117 | Vec32s lookup32(Vec32s, Vec32s) \newline 118 | Vec4i lookup4(Vec4i, Vec4i) \newline 119 | Vec8i lookup8(Vec8i, Vec8i) \newline 120 | Vec16i lookup16(Vec16i, Vec16i) \newline 121 | Vec4q lookup4(Vec4q, Vec4q) \newline 122 | Vec8q lookup8(Vec8q, Vec8q) \\ \hline 123 | \bfseries Defined for & Vec16c, Vec32c, Vec64c, Vec8s, Vec16s, Vec32s, Vec4i, Vec8i, Vec16i, Vec4q, Vec8q \\ \hline 124 | \bfseries Description & Permutation with variable indexes. The first input vector contains the indexes, the second input vector is the data source. Each index must be in the range $0 \leq i \leq n-1$ where n is the number of elements in a vector. \\ \hline 125 | \bfseries Efficiency & 126 | Vec16i, Vec8q: Good for AVX512F, medium otherwise. \newline 127 | Vec64c, Vec32s: Good for AVX512VBMI, medium for AVX512BW, poor otherwise. \newline 128 | Vec32c, Vec16s, Vec8i, Vec4i, Vec4q: Good for AVX2, medium otherwise. \newline 129 | Vec16c, Vec8s: Good for SSSE3, poor otherwise. 
\\ \hline 130 | \end{tabular} 131 | \vspacebig 132 | 133 | 134 | \begin{tabular}{|p{30mm}|p{120mm}|} 135 | \hline 136 | \bfseries Function & 137 | Vec16c lookup32(Vec16c, Vec16c, Vec16c) \newline 138 | Vec64c lookup128(Vec64c, Vec64c, Vec64c) \newline 139 | Vec8s lookup16(Vec8s, Vec8s, Vec8s) \newline 140 | Vec32s lookup64(Vec32s, Vec32s, Vec32s) \newline 141 | Vec4i lookup8(Vec4i, Vec4i, Vec4i) \newline 142 | Vec16i lookup32(Vec16i, Vec16i, Vec16i) \\ \hline 143 | \bfseries Defined for & Vec16c, Vec64c, Vec8s, Vec32s, Vec4i, Vec16i \\ \hline 144 | \bfseries Description & Blend with variable indexes. The first input vector contains the indexes, the following two input vectors contain the data source. Each index must be in the range $0 \leq i \leq 2\cdot n - 1$ where n is the number of elements in each vector. \\ \hline 145 | \bfseries Efficiency & 146 | Vec4i, Vec8s: Good for AVX2, medium or poor otherwise. \newline 147 | Vec16i: Good for AVX512, medium or poor otherwise. \newline 148 | Vec64c, Vec32s: Good for AVX512VBMI, medium for AVX512BW, poor otherwise. \newline 149 | Vec16c, Vec8s: Good for SSSE3, poor otherwise. \\ \hline 150 | \end{tabular} 151 | \vspacebig 152 | 153 | 154 | \begin{tabular}{|p{30mm}|p{120mm}|} 155 | \hline 156 | \bfseries Function & 157 | Vec4i lookup16(Vec4i, Vec4i, Vec4i, Vec4i, Vec4i) \newline 158 | Vec16i lookup64(Vec16i, Vec16i, Vec16i, Vec16i, Vec16i) \newline 159 | Vec64c lookup256(Vec64c, Vec64c, Vec64c, Vec64c, Vec64c) \newline 160 | Vec32s lookup128(Vec32s, Vec32s, Vec32s, Vec32s, Vec32s) \\ \hline 161 | \bfseries Defined for & Vec4i, Vec16i, Vec32s, Vec64c \\ \hline 162 | \bfseries Description & Blend with variable indexes. The first input vector contains the indexes, the following four input vectors contain the data source. Each index must be in the range $0 \leq i \leq 4\cdot n - 1$ where n is the number of elements in each vector. \\ \hline 163 | \bfseries Efficiency & 164 | Vec4i: Good for AVX2, medium otherwise.
\newline 165 | Vec16i: Good for AVX512, medium or poor otherwise. \newline 166 | Vec64c, Vec32s: Good for AVX512VBMI, medium for AVX512BW, poor otherwise. 167 | \\ \hline 168 | \end{tabular} 169 | \vspacebig 170 | 171 | 172 | \begin{tabular}{|p{30mm}|p{120mm}|} 173 | \hline 174 | \bfseries Function & 175 | Vec8h lookup8(Vec8s, Vec8h) \newline 176 | Vec16h lookup16(Vec16s, Vec16h) \newline 177 | Vec32h lookup32(Vec32s, Vec32h) \newline 178 | Vec4f lookup4(Vec4i, Vec4f) \newline 179 | Vec8f lookup8(Vec8i, Vec8f) \newline 180 | Vec16f lookup16(Vec16i, Vec16f) \newline 181 | Vec2d lookup2(Vec2q, Vec2d) \newline 182 | Vec4d lookup4(Vec4q, Vec4d) \newline 183 | Vec8d lookup8(Vec8q, Vec8d) \\ \hline 184 | \bfseries Defined for & all floating point vector classes \\ \hline 185 | \bfseries Description & Permutation of floating point vectors with integer indexes. Each index must be in the range $0 \leq i \leq n-1$ where n is the number of elements in a vector. \\ \hline 186 | \bfseries Efficiency & good for AVX2 and later, medium for lower instruction sets \\ \hline 187 | \end{tabular} 188 | \vspacebig 189 | 190 | 191 | \begin{tabular}{|p{30mm}|p{120mm}|} 192 | \hline 193 | \bfseries Function & 194 | Vec8h lookup16(Vec8s, Vec8h, Vec8h) \newline 195 | Vec4f lookup8(Vec4i, Vec4f, Vec4f) \newline 196 | Vec2d lookup4(Vec2q, Vec2d, Vec2d) \\ \hline 197 | \bfseries Defined for & Vec8h, Vec4f, Vec2d \\ \hline 198 | \bfseries Description & Blend of floating point vectors with integer indexes. Each index must be in the range $0 \leq i \leq 2\cdot n-1$ where n is the number of elements in a vector.
\\ \hline 199 | \bfseries Efficiency & medium \\ \hline 200 | \end{tabular} 201 | \vspacebig 202 | 203 | 204 | \begin{tabular}{|p{30mm}|p{120mm}|} 205 | \hline 206 | \bfseries Function & 207 | Vec16c lookup\textless n\textgreater(Vec16c index, void const * table) \newline 208 | Vec32c lookup\textless n\textgreater(Vec32c index, void const * table) \newline 209 | Vec8s lookup\textless n\textgreater(Vec8s index, void const * table) \newline 210 | Vec16s lookup\textless n\textgreater(Vec16s index, void const * table) \newline 211 | Vec4i lookup\textless n\textgreater(Vec4i index, void const * table) \newline 212 | Vec8i lookup\textless n\textgreater(Vec8i index, void const * table) \newline 213 | Vec16i lookup\textless n\textgreater(Vec16i index, void const * table) \newline 214 | Vec4q lookup\textless n\textgreater(Vec4q index, void const * table) \newline 215 | Vec8q lookup\textless n\textgreater(Vec8q index, void const * table) \newline 216 | Vec8h lookup\textless n\textgreater(Vec8s index, void const * table) \newline 217 | Vec16h lookup\textless n\textgreater(Vec16s index, void const * table) \newline 218 | Vec32h lookup\textless n\textgreater(Vec32s index, void const * table) \newline 219 | Vec4f lookup\textless n\textgreater(Vec4i index, float const * table) \newline 220 | Vec8f lookup\textless n\textgreater(Vec8i const \& index, float const * table) \newline 221 | Vec16f lookup\textless n\textgreater(Vec16i const \& index, float const * table) \newline 222 | Vec2d lookup\textless n\textgreater(Vec2q index, double const * table) \newline 223 | Vec4d lookup\textless n\textgreater(Vec4q const \& i, double const * table) \newline 224 | Vec8d lookup\textless n\textgreater(Vec8q const \& i, double const * table) \\ \hline 225 | \bfseries Defined for & all floating point and signed integer vector classes \\ \hline 226 | \bfseries Description & Permute, blend, table lookup, or gather data from array with an integer vector of indexes.\newline 227 | Each index must be 
in the range $0 \leq i \leq n-1$, where $n$ is indicated as a template parameter. $n$ must be a positive compile-time constant. 228 | The range check can be omitted by setting n = INT\_MAX. \\ \hline 229 | \bfseries Efficiency & good for AVX2 and later, medium for lower instruction sets. 230 | Best if n is no bigger than twice the vector length. \\ \hline 231 | \end{tabular} 232 | \vspacebig 233 | 234 | 235 | The lookup functions are similar to the permute and blend functions, but with variable indexes. They cannot be used for setting an element to zero, and there is no "don't care" option. The lookup functions can be used for several purposes: 236 | 237 | \begin{enumerate} 238 | \item permute with variable indexes 239 | \item blend with variable indexes 240 | \item blend from more than two sources 241 | \item table lookup 242 | \item gather non-contiguous data from an array 243 | \end{enumerate} 244 | \vspacesmall 245 | 246 | The index is always an integer vector. The input can be one or more vectors or an array. The result is a vector of the same type as the input. All elements in the index vector must be in the specified range. The behavior for an index out of range is implementation-dependent and may give any value for the corresponding element. The function may in some cases read up to one vector size past the end of the table for the sake of efficient permutation. 247 | \vspacesmall 248 | 249 | The lookup functions are not defined for unsigned integer vector types, but the corresponding signed versions can be used. You don't have to worry about overflow when converting unsigned integers to signed here, as long as the result vector is converted back to unsigned. 
250 | \vspacebig 251 | 252 | 253 | \begin{lstlisting}[frame=none] 254 | // Example of permutation with variable indexes: 255 | Vec4f a(1.0, 1.1, 1.2, 1.3); 256 | Vec4i b(2, 3, 3, 0); 257 | Vec4f c = lookup4(b, a); // c = (1.2, 1.3, 1.3, 1.0) 258 | 259 | // Example of blending with variable indexes: 260 | Vec4f a(1.0, 1.1, 1.2, 1.3); 261 | Vec4f b(2.0, 2.1, 2.2, 2.3); 262 | Vec4i c(4, 3, 2, 7); 263 | Vec4f d = lookup8(c,a,b); // d = (2.0, 1.3, 1.2, 2.3) 264 | 265 | // Example of blending from more than two sources: 266 | float sources[12] = { 267 | 1.0,1.1,1.2,1.3,2.0,2.1,2.2,2.3,3.0,3.1,3.2,3.3}; 268 | Vec4i i(11, 0, 5, 5); 269 | Vec4f c = lookup<12>(i, sources); // c = (3.3,1.0,2.1,2.1) 270 | \end{lstlisting} 271 | \vspacebig 272 | 273 | 274 | A function with a limited number of possible input values can be replaced by a lookup table. This is useful if table lookup is faster than calculating the function. The following example has a table of the function $y = x^2 - 1$: 275 | 276 | \begin{lstlisting}[frame=none] 277 | // Table of the function y = x*x-1 278 | int table[6] = {-1,0,3,8,15,24}; 279 | Vec4i x(4,2,0,5); 280 | Vec4i y = lookup<6>(x, table); // y = (15, 3, -1, 24) 281 | 282 | // Example of gathering non-contiguous data from an array: 283 | float x[16] = { ...
}; 284 | Vec4i i(0,4,8,12); 285 | Vec4f y = lookup<16>(i, x); // y = (x[0],x[4],x[8],x[12]) 286 | \end{lstlisting} 287 | \vspacesmall 288 | 289 | 290 | \section{Gather functions}\label{GatherFunctions} 291 | 292 | \vspacesmall 293 | \begin{tabular}{|p{30mm}|p{120mm}|} \hline 294 | \bfseries Function & 295 | Vec4i gather4i\textless indexes\textgreater(void const * table) \newline 296 | Vec8i gather8i\textless indexes\textgreater(void const * table) \newline 297 | Vec16i gather16i\textless indexes\textgreater(void const * table) \newline 298 | Vec2q gather2q\textless indexes\textgreater(void const * table) \newline 299 | Vec4q gather4q\textless indexes\textgreater(void const * table) \newline 300 | Vec8q gather8q\textless indexes\textgreater(void const * table) \newline 301 | Vec4f gather4f\textless indexes\textgreater(void const * table) \newline 302 | Vec8f gather8f\textless indexes\textgreater(void const * table) \newline 303 | Vec16f gather16f\textless indexes\textgreater(void const * table) \newline 304 | Vec2d gather2d\textless indexes\textgreater(void const * table) \newline 305 | Vec4d gather4d\textless indexes\textgreater(void const * table) \newline 306 | Vec8d gather8d\textless indexes\textgreater(void const * table) \\ \hline 307 | \bfseries Defined for & Vec4i, Vec8i, Vec16i, Vec2q, Vec4q, Vec8q, \newline 308 | Vec4f, Vec8f, Vec16f, Vec2d, Vec4d, Vec8d \\ \hline 309 | \bfseries Description & Load non-contiguous data from a table. Indexes cannot be negative. There is no option for zeroing or don't care. \newline 310 | The function may read a full vector and permute it if all indexes are smaller than the vector size. 
\\ \hline 311 | \bfseries Efficiency & medium \\ \hline 312 | \end{tabular} 313 | \vspacesmall 314 | 315 | \begin{lstlisting}[frame=none] 316 | // Example: 317 | int tab[8] = {10,11,12,13,14,15,16,17}; 318 | Vec4i a = gather4i<6,4,4,0>(tab); 319 | // a = (16, 14, 14, 10); 320 | \end{lstlisting} 321 | \vspacesmall 322 | 323 | Use the lookup\textless n\textgreater {} functions instead if you need variable indexes. \newline 324 | \vspacesmall 325 | 326 | 327 | \section{Scatter functions}\label{Scatter functions} 328 | 329 | \begin{tabular}{|p{30mm}|p{120mm}|} \hline 330 | \bfseries Function & scatter\textless indexes\textgreater(Vec4i data, void * array) \newline 331 | scatter\textless indexes\textgreater(Vec8i data, void * array) \newline 332 | scatter\textless indexes\textgreater(Vec16i data, void * array) \newline 333 | scatter\textless indexes\textgreater(Vec2q data, void * array) \newline 334 | scatter\textless indexes\textgreater(Vec4q data, void * array) \newline 335 | scatter\textless indexes\textgreater(Vec8q data, void * array) \newline 336 | scatter\textless indexes\textgreater(Vec4f data, float * array) \newline 337 | scatter\textless indexes\textgreater(Vec8f data, float * array) \newline 338 | scatter\textless indexes\textgreater(Vec16f data, float * array) \newline 339 | scatter\textless indexes\textgreater(Vec2d data, double * array) \newline 340 | scatter\textless indexes\textgreater(Vec4d data, double * array) \newline 341 | scatter\textless indexes\textgreater(Vec8d data, double * array) \\ \hline 342 | \bfseries Defined for & 343 | Vec4i, Vec8i, Vec16i, Vec2q, Vec4q, Vec8q, \newline 344 | Vec4f, Vec8f, Vec16f, Vec2d, Vec4d, Vec8d \\ \hline 345 | \bfseries Description & Store vector elements into non-contiguous positions in an array. Each vector element is stored in the array position indicated by the corresponding index. An element is not stored if the corresponding index is negative. 
\\ \hline 346 | \bfseries Efficiency & 347 | Medium for 512 bit vectors if AVX512F instruction set supported. \newline 348 | Medium for 256 bit vectors if AVX512F, or better AVX512VL, supported. \newline 349 | Medium for 128 bit vectors if AVX512VL supported. \newline 350 | Poor otherwise. \\ \hline 351 | \end{tabular} 352 | \vspacesmall 353 | 354 | \begin{lstlisting}[frame=none] 355 | // Example: 356 | Vec8i a(10,11,12,13,14,15,16,17); 357 | int array[10] = {0}; 358 | scatter<5,4,3,2,-1,-1,7,0>(a, array); 359 | // array = (17,0,13,12,11,10,0,16,0,0) 360 | \end{lstlisting} 361 | \vspacebig 362 | 363 | 364 | \begin{tabular}{|p{30mm}|p{120mm}|} \hline 365 | \bfseries Function & 366 | scatter(Vec4i index, uint32\_t limit, Vec4i data, void * array) \newline 367 | scatter(Vec8i index, uint32\_t limit, Vec8i data, void * array) \newline 368 | scatter(Vec16i index, uint32\_t limit, Vec16i data, void * array) \newline 369 | scatter(Vec2q index, uint32\_t limit, Vec2q data, void * array) \newline 370 | scatter(Vec4i index, uint32\_t limit, Vec4q data, void * array) \newline 371 | scatter(Vec4q index, uint32\_t limit, Vec4q data, void * array) \newline 372 | scatter(Vec8i index, uint32\_t limit, Vec8q data, void * array) \newline 373 | scatter(Vec8q index, uint32\_t limit, Vec8q data, void * array) \newline 374 | scatter(Vec4i index, uint32\_t limit, Vec4f data, float * array) \newline 375 | scatter(Vec8i index, uint32\_t limit, Vec8f data, float * array) \newline 376 | scatter(Vec16i index, uint32\_t limit, Vec16f data, float * array) \newline 377 | scatter(Vec2q index, uint32\_t limit, Vec2d data, double * array) \newline 378 | scatter(Vec4i index, uint32\_t limit, Vec4d data, double * array) \newline 379 | scatter(Vec4q index, uint32\_t limit, Vec4d data, double * array) \newline 380 | scatter(Vec8i index, uint32\_t limit, Vec8d data, double * array) \newline 381 | scatter(Vec8q index, uint32\_t limit, Vec8d data, double * array) \\ \hline 382 | \bfseries Defined for & 
383 | Vec4i, Vec8i, Vec16i, Vec2q, Vec4q, Vec8q, \newline 384 | Vec4f, Vec8f, Vec16f, Vec2d, Vec4d, Vec8d \\ \hline 385 | \bfseries Description & Store vector elements into non-contiguous positions in an array. Each vector element is stored in the array position indicated by the corresponding element of the index vector. An element is not stored if the corresponding index is negative or bigger than or equal to the limit. The limit will typically be the size of the array. \\ \hline 386 | \bfseries Efficiency & 387 | Medium for 512 bit vectors if AVX512F instruction set supported. \newline 388 | Medium for 256 bit vectors if AVX512F, or better AVX512VL, supported. \newline 389 | Medium for 128 bit vectors if AVX512VL supported. \newline 390 | Poor otherwise. \\ \hline 391 | \end{tabular} 392 | \vspacesmall 393 | 394 | \begin{lstlisting}[frame=none] 395 | // Example: 396 | Vec8i a(10,11,12,13,14,15,16,17); 397 | Vec8i x(5,4,3,2,-1,99,7,0); 398 | int array[10] = {0}; 399 | scatter(x, 5, a, array); 400 | // array = (17,0,13,12,11,0,0,0,0,0) 401 | \end{lstlisting} 402 | \vspacebig 403 | 404 | The scatter functions are useful for writing sparse arrays. If you have more dense arrays, then it may be more efficient to permute the vector and then store the whole vector into the array. 405 | \vspacesmall 406 | 407 | If you want to permute a dataset that is too big for the permute and blend functions, then it is better to use lookup or gather functions than to use scatter functions. 
408 | \vspacesmall 409 | 410 | \end{document} -------------------------------------------------------------------------------- /vcl_technical_details.tex: -------------------------------------------------------------------------------- 1 | % chapter included in vclmanual.tex 2 | \documentclass[vcl_manual.tex]{subfiles} 3 | \begin{document} 4 | 5 | 6 | \chapter{Technical details}\label{chap:TechnicalDetails} 7 | \flushleft 8 | 9 | \begin{longtable}[l]{|p{40mm}|p{100mm}|} 10 | \endfirsthead 11 | \label{table:fileList} \\ 12 | \endhead 13 | \hline 14 | \bfseries File name & \bfseries Purpose \\ \hline 15 | vcl\_manual.pdf & Instruction manual (this file) \\ \hline 16 | 17 | vectorclass.h & Top-level C++ header file. This will include several other header files, according to the indicated instruction set \\ \hline 18 | 19 | instrset.h & Detection of which instruction set the code is compiled for, various common definitions, and functions that depend on the instruction set. Included by vectorclass.h \\ \hline 20 | 21 | vectori128.h & Defines classes, operators and functions for integer vectors with a total size of 128 bits. Included by vectorclass.h \\ \hline 22 | 23 | vectori256.h & Defines classes, operators and functions for integer vectors with a total size of 256 bits for the AVX2 instruction set. Included by vectorclass.h if appropriate \\ \hline 24 | 25 | vectori256e.h & Defines classes, operators and functions for integer vectors with a total size of 256 bits for instruction sets lower than AVX2. Included by vectorclass.h if appropriate \\ \hline 26 | 27 | vectori512.h & Defines classes, operators and functions for vectors of 32-bit and 64-bit integers with a total size of 512 bits for the AVX512F instruction set. Included by vectorclass.h if appropriate \\ \hline 28 | 29 | vectori512e.h & Defines classes, operators and functions for vectors of 32-bit and 64-bit integers with a total size of 512 bits for instruction sets lower than AVX512F. 
Included by vectorclass.h if appropriate \\ \hline 30 | 31 | vectori512s.h & Defines classes, operators and functions for vectors of 8-bit and 16-bit integers with a total size of 512 bits for the AVX512BW instruction set. Included by vectorclass.h if appropriate \\ \hline 32 | 33 | vectori512se.h & Defines classes, operators and functions for vectors of 8-bit and 16-bit integers with a total size of 512 bits for instruction sets lower than 34 | AVX512BW. Included by vectorclass.h if appropriate \\ \hline 35 | 36 | vectorf128.h & Defines classes, operators and functions for floating point vectors with a total size of 128 bits. Included by vectorclass.h \\ \hline 37 | 38 | vectorf256.h & Defines classes, operators and functions for floating point vectors with a total size of 256 bits for the AVX and later instruction sets. Included by vectorclass.h if appropriate \\ \hline 39 | 40 | vectorf256e.h & Defines classes, operators and functions for floating point vectors with a total size of 256 bits for instruction sets lower than AVX. Included by vectorclass.h if appropriate \\ \hline 41 | 42 | vectorf512.h & Defines classes, operators and functions for floating point vectors with a total size of 512 bits for the AVX512F and later instruction sets. Included by vectorclass.h if appropriate \\ \hline 43 | 44 | vectorf512e.h & Defines classes, operators and functions for floating point vectors with a total size of 512 bits for instruction sets lower than AVX512F. 
Included by vectorclass.h if appropriate \\ \hline 45 | 46 | vectormath\_exp.h & Optional inline mathematical functions: power, logarithms and exponential functions \\ \hline 47 | 48 | vectormath\_trig.h & Optional inline mathematical functions: trigonometric and inverse trigonometric functions \\ \hline 49 | 50 | vectormath\_hyp.h & Optional inline mathematical functions: hyperbolic and inverse hyperbolic functions \\ \hline 51 | 52 | vectormath\_common.h & Common definitions for vectormath\_exp.h, vectormath\_trig.h and vectormath\_hyp.h \\ \hline 53 | 54 | vectormath\_lib.h & Optional header file for external mathematical vector function libraries \\ \hline 55 | 56 | instrset\_detect.cpp & Optional functions for detecting which instruction set is supported at runtime \\ \hline 57 | 58 | dispatch\_example.cpp & Example of how to make automatic CPU dispatching \\ \hline 59 | 60 | changelog.txt & VCL version history \\ \hline 61 | 62 | license.txt & Apache 2.0 license \\ \hline 63 | 64 | svml\_patch & Folder containing the library win64patch.lib as well as the source code to build it. Used for fixing a compatibility issue with Intel SVML library in 64-bit Windows \\ \hline 65 | 66 | testbench & Folder containing a test bench for testing the VCL library. This is used in the development of VCL, and is not needed by programmers using the VCL. Includes code and documentation. \\ \hline 67 | 68 | \end{longtable} 69 | %\end{tabular} 70 | \vspacesmall 71 | 72 | 73 | 74 | \end{document} --------------------------------------------------------------------------------