├── .gitignore
├── LICENSE
├── README.md
├── bold-extra.sty
├── containers_manual.pdf
├── containers_manual.tex
├── freesoftwarelogo.jpg
├── vcl_bool.tex
├── vcl_contributing.tex
├── vcl_conversion.tex
├── vcl_errors_etc.tex
├── vcl_examples.tex
├── vcl_file_list.tex
├── vcl_float_behavior.tex
├── vcl_introduction.tex
├── vcl_manual.pdf
├── vcl_manual.tex
├── vcl_mathematical_functions.tex
├── vcl_operators_and_functions.tex
├── vcl_packages.tex
├── vcl_performance.tex
├── vcl_permute_functions.tex
└── vcl_technical_details.tex


/.gitignore:
--------------------------------------------------------------------------------
  1 | ## Core latex/pdflatex auxiliary files:
  2 | *.aux
  3 | *.lof
  4 | *.log
  5 | *.lot
  6 | *.fls
  7 | *.out
  8 | *.toc
  9 | *.fmt
 10 | *.fot
 11 | *.cb
 12 | *.cb2
 13 | .*.lb
 14 | 
 15 | ## Intermediate documents:
 16 | *.dvi
 17 | *.xdv
 18 | *-converted-to.*
 19 | # these rules might exclude image files for figures etc.
 20 | # *.ps
 21 | # *.eps
 22 | # *.pdf
 23 | 
 24 | ## Generated if empty string is given at "Please type another file name for output:"
 25 | .pdf
 26 | 
 27 | ## Bibliography auxiliary files (bibtex/biblatex/biber):
 28 | *.bbl
 29 | *.bcf
 30 | *.blg
 31 | *-blx.aux
 32 | *-blx.bib
 33 | *.run.xml
 34 | 
 35 | ## Build tool auxiliary files:
 36 | *.fdb_latexmk
 37 | *.synctex
 38 | *.synctex(busy)
 39 | *.synctex.gz
 40 | *.synctex.gz(busy)
 41 | *.pdfsync
 42 | 
 43 | ## Auxiliary and intermediate files from other packages:
 44 | # algorithms
 45 | *.alg
 46 | *.loa
 47 | 
 48 | # achemso
 49 | acs-*.bib
 50 | 
 51 | # amsthm
 52 | *.thm
 53 | 
 54 | # beamer
 55 | *.nav
 56 | *.pre
 57 | *.snm
 58 | *.vrb
 59 | 
 60 | # changes
 61 | *.soc
 62 | 
 63 | # cprotect
 64 | *.cpt
 65 | 
 66 | # elsarticle (documentclass of Elsevier journals)
 67 | *.spl
 68 | 
 69 | # endnotes
 70 | *.ent
 71 | 
 72 | # fixme
 73 | *.lox
 74 | 
 75 | # feynmf/feynmp
 76 | *.mf
 77 | *.mp
 78 | *.t[1-9]
 79 | *.t[1-9][0-9]
 80 | *.tfm
 81 | 
 82 | #(r)(e)ledmac/(r)(e)ledpar
 83 | *.end
 84 | *.?end
 85 | *.[1-9]
 86 | *.[1-9][0-9]
 87 | *.[1-9][0-9][0-9]
 88 | *.[1-9]R
 89 | *.[1-9][0-9]R
 90 | *.[1-9][0-9][0-9]R
 91 | *.eledsec[1-9]
 92 | *.eledsec[1-9]R
 93 | *.eledsec[1-9][0-9]
 94 | *.eledsec[1-9][0-9]R
 95 | *.eledsec[1-9][0-9][0-9]
 96 | *.eledsec[1-9][0-9][0-9]R
 97 | 
 98 | # glossaries
 99 | *.acn
100 | *.acr
101 | *.glg
102 | *.glo
103 | *.gls
104 | *.glsdefs
105 | 
106 | # gnuplottex
107 | *-gnuplottex-*
108 | 
109 | # gregoriotex
110 | *.gaux
111 | *.gtex
112 | 
113 | # htlatex
114 | *.4ct
115 | *.4tc
116 | *.idv
117 | *.lg
118 | *.trc
119 | *.xref
120 | 
121 | # hyperref
122 | *.brf
123 | 
124 | # knitr
125 | *-concordance.tex
126 | # TODO Comment the next line if you want to keep your tikz graphics files
127 | *.tikz
128 | *-tikzDictionary
129 | 
130 | # listings
131 | *.lol
132 | 
133 | # makeidx
134 | *.idx
135 | *.ilg
136 | *.ind
137 | *.ist
138 | 
139 | # minitoc
140 | *.maf
141 | *.mlf
142 | *.mlt
143 | *.mtc[0-9]*
144 | *.slf[0-9]*
145 | *.slt[0-9]*
146 | *.stc[0-9]*
147 | 
148 | # minted
149 | _minted*
150 | *.pyg
151 | 
152 | # morewrites
153 | *.mw
154 | 
155 | # nomencl
156 | *.nlg
157 | *.nlo
158 | *.nls
159 | 
160 | # pax
161 | *.pax
162 | 
163 | # pdfpcnotes
164 | *.pdfpc
165 | 
166 | # sagetex
167 | *.sagetex.sage
168 | *.sagetex.py
169 | *.sagetex.scmd
170 | 
171 | # scrwfile
172 | *.wrt
173 | 
174 | # sympy
175 | *.sout
176 | *.sympy
177 | sympy-plots-for-*.tex/
178 | 
179 | # pdfcomment
180 | *.upa
181 | *.upb
182 | 
183 | # pythontex
184 | *.pytxcode
185 | pythontex-files-*/
186 | 
187 | # thmtools
188 | *.loe
189 | 
190 | # TikZ & PGF
191 | *.dpth
192 | *.md5
193 | *.auxlock
194 | 
195 | # todonotes
196 | *.tdo
197 | 
198 | # easy-todo
199 | *.lod
200 | 
201 | # xmpincl
202 | *.xmpi
203 | 
204 | # xindy
205 | *.xdy
206 | 
207 | # xypic precompiled matrices
208 | *.xyc
209 | 
210 | # endfloat
211 | *.ttt
212 | *.fff
213 | 
214 | # Latexian
215 | TSWLatexianTemp*
216 | 
217 | ## Editors:
218 | # WinEdt
219 | *.bak
220 | *.sav
221 | 
222 | # Texpad
223 | .texpadtmp
224 | 
225 | # Kile
226 | *.backup
227 | 
228 | # KBibTeX
229 | *~[0-9]*
230 | 
231 | # auto folder when using emacs and auctex (no leading "./": gitignore does not normalise it, so "./auto/*" never matches)
232 | auto/*
233 | *.el
234 | 
235 | # expex forward references with \gathertags
236 | *-tags.tex
237 | 
238 | # standalone packages
239 | *.sta
240 | 
241 | # generated if using elsarticle.cls
242 | *.spl
243 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 | 
179 |    Copyright 2012-2019 Agner Fog.
180 | 
181 |    Licensed under the Apache License, Version 2.0 (the "License");
182 |    you may not use this file except in compliance with the License.
183 |    You may obtain a copy of the License at
184 | 
185 |        http://www.apache.org/licenses/LICENSE-2.0
186 | 
187 |    Unless required by applicable law or agreed to in writing, software
188 |    distributed under the License is distributed on an "AS IS" BASIS,
189 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190 |    See the License for the specific language governing permissions and
191 |    limitations under the License.
192 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # manual
 2 | The manual for the C++ vector class library is here.
 3 | [Download manual](https://github.com/vectorclass/manual/blob/master/vcl_manual.pdf)
 4 | 
 5 | The latest release of the Vector Class Library is in
 6 | [releases](https://github.com/vectorclass/version2/releases)
 7 | 
 8 | The current version of the source files is in 
 9 | [version2](https://github.com/vectorclass/version2) 
10 | 
11 | Various add-on packages for specific applications are in 
12 | [add-on](https://github.com/vectorclass/add-on)
13 | 
14 | To re-build the manual:  
15 | The pdf manual is built from the .tex files, using MiKTeX and Texmaker. 
16 | Run LuaLaTeX twice to fix forward references.
17 | 


--------------------------------------------------------------------------------
/bold-extra.sty:
--------------------------------------------------------------------------------
 1 | % bold-extra.sty - a jiffy to provide access (in latex) to (some of)
 2 | % the fonts in ctan directory fonts/cm/mf-extra/bold
 3 | %
 4 | % by robin fairbairns, rf10@cam.ac.uk November 2001
 5 | %
 6 | % this package is provided under the provisions of the latex project
 7 | % public licence, http://www.latex-project.org/lppl.txt
 8 | %
 9 | % this package provides font shapes to support bold small caps and tt
10 | % text.  there is a choice of bold tt fonts, which are selected by
11 | % package options cmbtt and cmttb (this reflects the confusingly
12 | % similar font names).  the default (based on the author's estimation
13 | % of the fonts' relative merits) is cmttb.
14 | %
15 | % to use these fonts you need their metafont sources available to your
16 | % tex system (as far as i know, there are no type 1 versions of the
17 | % fonts available yet).  place them in an appropriate place under
18 | % fonts/source in your tds texmf tree; place this file somewhere like
19 | % tex/latex/misc in your tree.  see
20 | % http://www.tex.ac.uk/cgi-bin/texfaq2html?label=instpackages+wherefiles
21 | % for more details.
22 | 
23 | \ProvidesPackage{bold-extra}[2001/11/13 v0.1 Use fonts from cm/mf-extra/bold]
24 | \NeedsTeXFormat{LaTeX2e}
25 | 
26 | \newif\if@cmttb
27 | \DeclareOption{cmttb}{\@cmttbtrue}
28 | \DeclareOption{cmbtt}{\@cmttbfalse}
29 | \ExecuteOptions{cmttb}
30 | \ProcessOptions
31 | 
32 | % declare bold small caps font
33 | \DeclareFontShape{OT1}{cmr}{b}{sc}
34 |    {
35 |     <5><6><7><8><9><10><12><10.95><14.4><17.28><20.74><24.88>cmbcsc10
36 |     }{}
37 | \DeclareFontShape{OT1}{cmr}{bx}{sc}
38 |    {<->ssub*cmr/b/sc}{}
39 | 
40 | % declare bold tt font: note, we use cmttb10 by default rather than
41 | % the cmbtt series (which seem over-bold to me)
42 | \if@cmttb
43 | \DeclareFontShape{OT1}{cmtt}{b}{n}
44 |    {
45 |     <5><6><7><8><9><10><12><10.95><14.4><17.28><20.74><24.88>cmttb10
46 |     }{}
47 | \else
48 | \DeclareFontShape{OT1}{cmtt}{b}{n}
49 |    {
50 |     <5><6><7><8>cmbtt8%
51 |     <9>cmbtt9%
52 |     <10><12><10.95><14.4><17.28><20.74><24.88>cmbtt10
53 |     }{}
54 | \fi
55 | \DeclareFontShape{OT1}{cmtt}{bx}{n}
56 |    {<->ssub*cmtt/b/n}{}
57 | 


--------------------------------------------------------------------------------
/containers_manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/manual/62cb40d710f8d6180511ba03ca6e09347e06f0b9/containers_manual.pdf


--------------------------------------------------------------------------------
/containers_manual.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[11pt,a4paper,oneside,openright]{report}
  2 | 
  3 | \usepackage[bindingoffset=5mm,left=20mm,right=20mm,top=20mm,bottom=20mm,footskip=10mm]{geometry}
  4 | \usepackage[utf8x]{inputenc}
  5 | \usepackage{hyperref}
  6 | \usepackage[english]{babel}
  7 | \usepackage{listings}
  8 | \usepackage{subfiles}
  9 | \usepackage{longtable}
 10 | \usepackage{multirow}
 11 | \usepackage{ragged2e} 
 12 | \usepackage{cmap} % avoid fi ligatures in pdf file
 13 | \usepackage{amsthm} % example numbering
 14 | \usepackage{color}
 15 | %\usepackage{bold-extra} % for bold tt font. Remember to include bold-extra.sty file
 16 | \usepackage{graphicx}
 17 | \usepackage[yyyymmdd]{datetime}
 18 | \usepackage{float}
 19 | 
 20 | % style for code listing
 21 | \renewcommand{\familydefault}{\sfdefault}
 22 | \renewcommand{\ttdefault}{pcr}          % selects Courier font
 23 | \newtheorem{example}{Example}[chapter]  % example numbering
 24 | \lstset{language=C}                     % formatting for code listing
 25 | \lstset{basicstyle=\ttfamily,breaklines=true}
 26 | \definecolor{darkGreen}{rgb}{0,0.4,0}
 27 | \definecolor{mybrown}{rgb}{0.40,0.10,0.05}
 28 | \lstset{commentstyle=\color{darkGreen}}  % comments color
 29 | \lstset{keywordstyle=\color{blue}}       % keyword color
 30 | \lstset{stringstyle=\color{mybrown}}     % string color
 31 | \lstset{showstringspaces=false}          % don't mark spaces in strings
 32 | 
 33 | \renewcommand{\dateseparator}{-}
 34 | 
 35 | % command for turning indent back on after \flushleft
 36 | \newcommand{\indenton}{\RaggedRight\parindent=15pt}
 37 | 
 38 | % command for vertical space
 39 | \newcommand{\vspacesmall}{\vspace{3mm}}
 40 | \newcommand{\vspacebig}{\vspace{6mm}}
 41 | 
 42 | % style for code inlined in text (bold typewriter; the font change is grouped so it neither leaks nor resets the surrounding font):
 43 | \newcommand{\codei}[1]{{\bfseries\ttfamily #1}}
 44 | 
 45 | 
 46 | \begin{document}
 47 | 
 48 | \begin{titlepage}
 49 |     \centering
 50 |    
 51 |     \null %empty box needed for vfill to work
 52 |     \vfill
 53 | 
 54 |    {\bfseries\Huge
 55 |     Container class templates
 56 |     \vspacesmall
 57 |         
 58 |     extension for C++ vector class library 
 59 |     \vspacebig
 60 |         
 61 |    }        
 62 |     \vspacebig
 63 |     
 64 |    {\Large    
 65 |     Agner Fog
 66 |     \vspacebig
 67 |     
 68 |     \copyright\ \today. Apache license 2.0
 69 |    }
 70 |     
 71 |     \vfill
 72 |     
 73 |     \includegraphics[width=306pt]{freesoftwarelogo.jpg}
 74 |     \vfill
 75 |     
 76 | \end{titlepage}
 77 | 
 78 | \RaggedRight
 79 | 
 80 | \chapter{Introduction}\label{chap:Introduction}
 81 | 
 82 | A container class template is a piece of C++ code that is useful for allocating memory space for a list of objects. It is similar to an array but with additional functionality and security.
 83 | \vspacesmall
 84 | 
 85 | C++ programmers routinely use the C++ standard containers (previously known as the standard template library) for this purpose. Unfortunately, the standard 
 86 | C++ container templates can be quite inefficient. They are optimized for generality and flexibility, while efficiency has been sacrificed. Many of the standard C++ containers are implemented as linked lists that allocate memory in a lot of separate small pieces. This is inefficient because of a lot of heap management overhead, memory fragmentation, and poor caching. Many C++ programmers are routinely implementing a matrix as a nested container (vector of vectors) which is even more inefficient.
 87 | \vspacesmall
 88 | 
 89 | The container class templates provided here are intended to fill the need for more efficient containers with contiguous memory storage. Some of these containers are tailor-made to fit the classes defined in the vector class library (VCL). Containers for other types of objects are also included.
 90 | \vspacesmall
 91 | 
 92 | Overview of container class templates:
 93 | \begin {table}[H]
 94 | \caption{Container class templates}
 95 | \label{table:containerClassTemplates}
 96 | \begin{tabular}{|p{44mm}|p{70mm}|p{35mm}|}
 97 | \hline
 98 | \bfseries Template & \bfseries Description & \bfseries Header file \\ \hline
 99 | ContainerG<type>  & Linear array of any type of objects, \newline dynamic size & general\_containers.h \\ \hline
100 | ContainerV<vector\_type, size> & Linear array of vectors, fixed size or \newline dynamic size & vector\_containers.h \\ \hline
101 | MatrixV<vector\_type, rows, columns> & Matrix. Rows are stored as VCL vectors & matrixv.h \\ \hline
102 | \end{tabular}
103 | \end{table}
104 | \vspacebig
105 | 
106 | 
107 | \chapter{Description of container templates}\label{chap:DescriptionTemplates}
108 | 
109 | \section{ContainerG} \label{ContainerG}
110 | 
111 | {\bfseries Declaration}\\
112 | \codei{template <typename T> class ContainerG};
113 | \vspacebig
114 | 
115 | This container class template makes a linear array with dynamic size. \codei{ContainerG} is independent of the vector class library and can be used for most kinds of objects.
116 | \vspacesmall
117 | 
118 | The type \codei{T} can be a simple type such as \codei{int} or \codei{float}, or a composite type such as a \codei{struct}, \codei{class}, or \codei{union}. The container will likely not work if the type \codei{T} has a non-default constructor, destructor, copy constructor, or move constructor. The type \codei{T} cannot be another container, but it can be a \codei{struct} containing a fixed-size array.
119 | \vspacesmall
120 | 
121 | \begin{lstlisting}[frame=none]
122 | // Example:
123 | #include <stdio.h>
124 | #include "general_containers.h"
125 | 
126 | // Function for error reporting
127 | void error_reporter() {
128 |     fprintf(stderr, "\nError: index out of range\n");
129 | }
130 | 
131 | int main () {
132 |     // Declare a container for float elements
133 |     ContainerG<float> my_array;
134 | 
135 |     // Register error_reporter function to report any errors
136 |     my_array.set_error_handler(error_reporter);
137 | 
138 |     // Set the size of the array    
139 |     my_array.set_size(10);
140 | 
141 |     // Put data into a C-style array
142 |     const int listsize = 8;
143 |     float list[listsize];
144 |     for (int i = 0; i < listsize; i++) list[i] = float(i);
145 | 
146 |     // Load 8 elements into my_array
147 |     my_array.load(listsize, list);
148 | 
149 |     // Print contents of my_array
150 |     for (int i = 0; i < my_array.size(); i++) {
151 |         printf(" %.2f", my_array[i]);
152 |     }
153 | 
154 |     // Increase the size of the array    
155 |     my_array.set_size(12);
156 | 
157 |     // Change last element (index goes from 0 to size()-1 )
158 |     my_array[my_array.size()-1] = 88;
159 | 
160 |     // Print contents again
161 |     printf("\n\n");
162 |     for (int i = 0; i < my_array.size(); i++) {
163 |         printf(" %.2f", my_array[i]);
164 |     }
165 | }
166 | 
167 | /*  Output:
168 |  0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 0.00 0.00
169 |  
170 |  0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 0.00 0.00 0.00 88.00
171 | */
172 | \end{lstlisting}
173 | \vspacebig
174 | 
175 | {\bfseries Member functions:}\\
176 | \vspacebig
177 | 
178 | {\bfseries void set\_size(int size)}\\
179 | Sets the size of the container, i.e. the number of elements it can contain.\\
180 | The size can be changed at any time. Increasing the size will make the code allocate a new internal memory buffer if the current memory buffer is not big enough. All contents will be copied to the new memory buffer and the old buffer will be deleted.\\
181 | The code may allocate a bigger memory block than requested in order to avoid frequent re-allocations if the size is increased in small amounts. 
182 | The code does not re-allocate memory if the size is decreased to a non-zero value.\\
183 | Calling \codei{set\_size} with a size of zero will delete the allocated memory and set everything to the initial condition. This may free memory for other purposes, but it is not needed because the container has a destructor that will free the allocated memory anyway.
184 | \vspacebig
185 | 
186 | {\bfseries int size()}\\
187 | Returns the current size of the container, i.e. the number of elements it can contain.
188 | \vspacebig
189 | 
190 | {\bfseries int allocated\_size()}\\
191 | Gives the size of the internal buffer, which may be bigger than specified by the last call to \codei{set\_size}. \codei{allocated\_size()} is the maximum size that can be set without reallocation of the internal memory.
192 | \vspacebig
193 | 
194 | {\bfseries T \& operator [] (int index)}\\
195 | The operator \codei{[]} works like an array index. This makes it possible to read or write a single element in the array. The code checks that the index is within the range $0 \leq$ \codei{index} $<$ \codei{size()}.
196 | \vspacebig
197 | 
198 | {\bfseries void load(int n, T const {*} p)}\\
199 | Loads \codei{n} objects from an array \codei{p}.\\
200 | \codei{n} is the size of the array \codei{p} or the maximum number of elements to load. If \codei{n} is bigger than the size of the \codei{ContainerG} then it is reduced to the size of the container.
201 | \vspacebig
202 | 
203 | {\bfseries void store(int n, T {*} p)}\\
204 | Stores \codei{n} objects to an array \codei{p}.\\
205 | \codei{n} is the size of the array \codei{p} or the maximum number of elements to store. If \codei{n} is bigger than the size of the \codei{ContainerG} then it is reduced to the size of the container.
206 | \vspacebig
207 | 
208 | {\bfseries set\_error\_handler(void ({*}err)(void))}\\
209 | Saves a pointer to an error handling function. This function will be called in case an index is out of range. The error handling function should issue an error message in a way that is appropriate for the actual user interface. The program will crash in case of an index out of range if no error handler is set.
210 | \vspacebig
211 | 
212 | {\bfseries T {*} get\_buf()}\\
213 | Returns a pointer to the internal buffer. It is important to remember that any pointer or reference to elements in the container will be invalid after the size has been increased.
214 | \vspacebig
215 | 
216 | \subsection{Recycling of memory} \label{RecyclingOfMemory}
217 | It may be more efficient to reuse a container for a new purpose during the course of the program than to delete each container when it is no longer needed and create a new one. This will improve memory caching.
218 | \vspacesmall
219 | 
220 | The container may be resized for every new use. It can be useful to specify an estimated maximum size before first use of the container, and then reduce and increase the size as required during the course of the program.
221 | \vspacesmall
222 | 
223 | The memory is zeroed at the first call to \codei{set\_size}, but it is not necessarily cleared when the container is later resized.
224 | \vspacebig
225 | 
226 | 
227 | \section{ContainerV} \label{ContainerV}
228 | 
229 | {\bfseries Declaration}\\
230 | \codei{template <typename V, int n> class ContainerV};
231 | \vspacebig
232 | 
233 | This container is designed for VCL vectors only. Vectors of all integer and floating point types are allowed, but not boolean types. Access is provided to each vector as well as to individual vector elements.
234 | \vspacesmall
235 | 
236 | \begin{lstlisting}[frame=none]
237 | // Example:
238 | #include <stdio.h>
239 | #include <vectorclass.h>
240 | #include <vector_containers.h>
241 | 
242 | // Make container of four vectors of 8 float values each
243 | ContainerV<Vec8f, 4> c;
244 | // Array of floats
245 | float list[32] = {0,1,2,3,4,5,6};
246 | // Load array into container
247 | c.load(32, list);
248 | // Change one vector in container
249 | c.set_vector(Vec8f(16,17,18,19,20,21,22,23), 2);
250 | // Change one vector element in container
251 | c.set_element(-99, 5);
252 | // Loop through vectors:
253 | for (int i = 0; i < c.n_vectors(); i++) { 
254 |     // Loop through elements of each vector:
255 |     for (int j = 0; j < c.get_vector(0).size(); j++) {
256 |         // Print value:
257 |         printf(" %6.2f", c.get_vector(i)[j]);
258 |     }
259 |     // Next vector on new line:
260 |     printf("\n");    
261 | }
262 | 
263 | /* Output:
264 |    0.00   1.00   2.00   3.00   4.00 -99.00   6.00   0.00
265 |    0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
266 |   16.00  17.00  18.00  19.00  20.00  21.00  22.00  23.00
267 |    0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
268 | */
269 | \end{lstlisting}
270 | \vspacebig
271 | 
272 | {\bfseries Defined type \codei{etype} }\\
273 | The template defines \codei{etype} as the type of the vector elements. For example, if the container is based on vectors of type \codei{Vec8f}, then \codei{etype} is the type \codei{float}.
274 | \vspacebig
275 | 
276 | {\bfseries Member functions:}\\
277 | \vspacebig
278 | 
279 | {\bfseries int n\_vectors()}\\
280 | Returns the number of vectors.
281 | \vspacebig
282 | 
283 | {\bfseries int n\_elements()}\\
284 | Returns the number of vector elements.
285 | \vspacebig
286 | 
287 | {\bfseries int elementtype()}\\
288 | Returns the \codei{elementtype()} of the underlying vector class + 0x1000.
289 | \vspacebig
290 | 
291 | {\bfseries vector\_type get\_vector(int index)}\\
292 | Returns one vector from the position indicated by index. (The first vector has index 0).
293 | \vspacebig
294 | 
295 | {\bfseries set\_vector(vector\_type x, int index)}\\
296 | Replaces one vector at the position indicated by index.
297 | \vspacebig
298 | 
299 | {\bfseries etype get\_element(int index)}\\
300 | Returns a single vector element. The index runs consecutively through all vectors in the container, from 0 to (number of vectors) * (elements per vector) - 1.
301 | \vspacebig
302 | 
303 | {\bfseries set\_element(etype x, int index)}\\
304 | Replaces a single vector element. The index runs consecutively through all vectors in the container, from 0 to (number of vectors) * (elements per vector) - 1.
305 | \vspacebig
306 | 
307 | {\bfseries load(int n, void const {*} p)}\\
308 | Loads values from an array into the container. p points to an array of type 
309 | \codei{etype}. n is the array size or the maximum number of vector elements to load. If n is not a multiple of the vector size then the last vector will be partially filled. If n is bigger than the container size, then it is limited to the container size. If n is smaller than the container size, then the remaining full vectors are unchanged.
310 | \vspacebig
311 | 
312 | {\bfseries store(int n, void * p)}\\
313 | Stores values from the container into an array. p points to an array of type 
314 | \codei{etype}.
315 | n is the array size or the maximum number of vector elements to store. n does not have to be a multiple of the vector size. If n is bigger than the container size, then it is limited to the container size.
316 | \vspacebig
317 | 
318 | {\bfseries vector\_type {*} get\_buf()}\\
319 | Returns a pointer to the internal buffer. Note that this pointer will be invalid if the size of the container is later increased (see below).
320 | \vspacebig
321 | 
322 | {\bfseries zero()}\\
323 | Sets all vectors and all vector elements in the container to zero. Does not change the size of the container.
324 | \vspacebig
325 | 
326 | {\bfseries set\_error\_handler(void ({*}err)(void))}\\
327 | Saves a pointer to an error handling function. This function will be called in case an index is out of range. The error handling function should issue an error message in a way that is appropriate for the actual user interface. The program will crash in case of an index out of range if no error handler is set.
328 | 
329 | \begin{lstlisting}[frame=none]
330 | // Example:
331 | #include <stdio.h>
332 | #include <vectorclass.h>
333 | #include <vector_containers.h>
334 | 
335 | void error_reporter() {
336 |     fprintf(stderr, "\nError: index out of range\n");
337 | }
338 | 
339 | // Make container of four vectors of 8 float values each
340 | ContainerV<Vec8f, 4> c;
341 | 
342 | // Set the error handler
343 | c.set_error_handler(error_reporter);
344 | 
345 | \end{lstlisting}
346 | \vspacebig
347 | 
348 | 
349 | \subsection{Dynamic size} \label{ContainerVDynamicSize}
350 | The \codei{ContainerV} template has a dynamic size if, and only if, the initial size is 0.
351 | \vspacesmall
352 | 
353 | \begin{lstlisting}[frame=none]
354 | // Example:
355 | #include <stdio.h>
356 | #include <vectorclass.h>
357 | #include <vector_containers.h>
358 | 
359 | // Make dynamic container of a variable number of vectors
360 | ContainerV<Vec8f, 0> c;
361 | // Set the size
362 | c.set_nvectors(6);
363 | // Change one vector in container
364 | c.set_vector(Vec8f(16,17,18,19,20,21,22,23), 2);
365 | // Loop through vectors:
366 | for (int i = 0; i < c.n_vectors(); i++) { 
367 |     // Loop through elements of each vector:
368 |     for (int j = 0; j < c.get_vector(0).size(); j++) {
369 |         // Print value:
370 |         printf(" %6.2f", c.get_vector(i)[j]);
371 |     }
372 |     // Next vector on new line:
373 |     printf("\n");    
374 | }
375 | 
376 | /* Output:
377 |    0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
378 |    0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
379 |   16.00  17.00  18.00  19.00  20.00  21.00  22.00  23.00
380 |    0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
381 |    0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
382 |    0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
383 | */
384 | \end{lstlisting}
385 | \vspacebig
386 | 
387 | {\bfseries Additional member functions for dynamic size}\\
388 | The following member functions are only available for ContainerV if the initial size is 0:
389 | \vspacebig
390 | 
391 | {\bfseries set\_nvectors(int size)}\\
392 | Changes the size of the container. On the first call to this function, the code allocates a memory block big enough to contain the specified number of vectors. If a later call to \codei{set\_nvectors} increases the size, then it will allocate a new bigger memory block if necessary, copy all data to the new memory block, and delete the old memory block. The code may allocate a bigger memory block than requested in order to avoid frequent re-allocations if the size is increased in small amounts. 
393 | The code does not re-allocate memory if the size is decreased to a non-zero value.
394 | Calling \codei{set\_nvectors} with a size of zero will delete the allocated memory and set everything to the initial condition. This may free memory for other purposes, but it is not needed because the container has a destructor that frees the allocated memory anyway.
395 | \vspacebig
396 | 
397 | {\bfseries set\_nelements(int n)}\\
398 | This function is similar to \codei{set\_nvectors}, but makes it possible to set a size that is not a multiple of the vector size. The size is rounded up to a multiple of the vector size in order to determine the amount of memory to allocate, while the last vector will be only partially used. 
399 | \codei{n\_vectors()} will report the number of vectors used, including any partially used vector, while \codei{n\_elements()} will return the value set by \codei{set\_nelements(n)}.
400 | \vspacebig
401 | 
402 | {\bfseries int allocated\_size()}\\
403 | This function returns the actual amount of memory allocated, including any unused memory. The unit is the vector size, similar to \codei{set\_nvectors}.
404 | \vspacebig
405 | 
406 | \subsection{Recycling of memory} \label{ContainerVRecyclingOfMemory}
407 | It may be more efficient to reuse a container for a new purpose during the course of the program than to delete each container when it is no longer needed and create a new one. This will improve memory caching.
408 | \vspacesmall
409 | 
410 | The container may be resized for every new use. It can be useful to specify an estimated maximum size before first use of the container, and then reduce and increase the size as required during the course of the program.
411 | \vspacesmall
412 | 
413 | The memory is zeroed at the first call to \codei{set\_nvectors}, but it is not necessarily cleared when the container is later resized. A container with fixed size contains random data initially.
414 | \vspacebig
415 | 
416 | 
417 | \section{MatrixV} \label{MatrixV}
418 | MatrixV is a container for representing numerical data as a matrix. Each row in the matrix is stored as one or more VCL vectors. MatrixV can also be used for storing a list of vectors where each vector is represented as a row, and the number of columns corresponds to the number of elements in each vector.
419 | \vspacesmall
420 | 
421 | MatrixV is optimized for accessing the matrix as rows rather than columns.
422 | \vspacesmall
423 | 
424 | {\bfseries Declaration}\\
425 | \codei{template <typename V, int rows, int columns>
426 | class MatrixV;}
427 | \vspacebig
428 | 
429 | The vector type \codei{V} can be any floating point or integer vector class. Boolean vectors are not supported. (It is more efficient to pack boolean vectors into integer bitfields).
430 | \vspacesmall
431 | 
432 | If the number of columns is larger than the number of elements in the vector class \codei{V} then the template will use multiple vectors for each row. If the number of columns is less than the number of elements in the vector class \codei{V} then the template will use the smallest possible vector class that fits the number of columns. Any extra elements in \codei{V} will be unused. The template will not store multiple rows in one vector, but start each row with a new vector. It is possible to specify \codei{V} as the largest possible vector with the desired element type and leave it to the template to find the smallest vector of the same element type that fits the number of columns. The MatrixV template will not use vectors larger than \codei{V}.
433 | \vspacesmall
434 | 
435 | {\bfseries Defined type \codei{row\_vector\_type} }\\
436 | The template defines \codei{row\_vector\_type} as the vector class used for storing rows. This may be the specified vector class \codei{V} or a smaller vector class with the same element type.
437 | \vspacesmall 
438 | 
439 | {\bfseries Defined type \codei{etype} }\\
440 | The template defines \codei{etype} as the type of the vector elements. For example, if the container is based on vectors of type \codei{Vec8f}, then \codei{etype} is the type \codei{float}.
441 | \vspacebig
442 | 
443 | {\bfseries Member functions:}\\
444 | \vspacebig
445 | 
446 | {\bfseries int nrows()}\\
447 | Returns the number of rows in the matrix.
448 | \vspacebig
449 | 
450 | {\bfseries int ncolumns()}\\
451 | Returns the number of columns in the matrix.
452 | \vspacebig
453 | 
454 | {\bfseries int vectors\_per\_row()}\\
455 | Returns the number of vectors of class \codei{row\_vector\_type} that are used for each row. Any partially used vector is included.
456 | \vspacebig
457 | 
458 | {\bfseries int full\_vectors\_per\_row()}\\
459 | Returns the number of fully-used vectors of class \codei{row\_vector\_type} that are used for each row. Any partially used vector is not included.
460 | \vspacebig
461 | 
462 | {\bfseries int partial\_vector\_elements()}\\
463 | If the number of columns is not divisible by the number of elements in vector class \codei{row\_vector\_type} then the last vector in each row will be partially used. This function returns the number of used elements in a partially-used vector. The function returns 0 if there are no partially used vectors.
464 | \vspacebig
465 | 
466 | {\bfseries set\_error\_handler(void ({*}err)(void))}\\
467 | This function is used for registering a function for reporting errors such as an index out of range. \codei{err} should be a function that reports the error in a way that is appropriate for the actual user interface. The program will crash in case a row or column index is out of range if no error-handling function is registered.
468 | \vspacebig
469 | 
470 | {\bfseries void ({*}get\_error\_handler())(void)}\\
471 | Returns a pointer to the function set by \codei{set\_error\_handler}.
472 | \vspacebig
473 | 
474 | {\bfseries row\_vector\_type get\_row(int r, int i = 0)}\\
475 | Returns row number \codei{r} as a vector of class \codei{row\_vector\_type}. \\
476 | Row numbers go from 0 to \codei{nrows()-1}.\\ 
477 | If each row contains more than one vector then call \codei{get\_row} multiple times with \codei{i} going from 0 to \codei{vectors\_per\_row() - 1}.
478 | \vspacebig
479 | 
480 | {\bfseries set\_row(row\_vector\_type x, int r, int i = 0)}\\
481 | Sets row number \codei{r} to a vector of class \codei{row\_vector\_type}. \\
482 | Row numbers go from 0 to \codei{nrows()-1}.\\ 
483 | If each row contains more than one vector then call \codei{set\_row} multiple times with \codei{i} going from 0 to \codei{vectors\_per\_row() - 1}.
484 | \vspacebig
485 | 
486 | {\bfseries etype get\_element(int row, int column)}\\
487 | Returns a single element from the matrix.\\
488 | The row number goes from 0 to \codei{nrows()-1}.\\ 
489 | The column number goes from 0 to \codei{ncolumns()-1}.
490 | \vspacebig
491 | 
492 | {\bfseries set\_element(etype x, int row, int column)}\\
493 | Changes a single element in the matrix.\\
494 | The row number goes from 0 to \codei{nrows()-1}.\\ 
495 | The column number goes from 0 to \codei{ncolumns()-1}.
496 | \vspacebig
497 | 
498 | {\bfseries load(void const {*} p)}\\
499 | Fills the entire matrix with data from a C-style matrix or linear array pointed to by \codei{p}. The matrix or array must contain (rows * columns) elements.\\
500 | The elements are retrieved in row-major order in accordance with the C standard.
501 | \vspacebig
502 | 
503 | {\bfseries store(void {*} p)}\\
504 | A C-style matrix or array pointed to by \codei{p} is filled with all data from the entire matrix. The number of elements stored at \codei{ {*}p} is (rows * columns).\\
505 | The elements are stored in row-major order in accordance with the C standard.
506 | \vspacebig
507 | 
508 | {\bfseries zero()}\\
509 | Sets all elements in the matrix to zero.
510 | \vspacebig
511 | 
512 | 
513 | \subsection{Initializing a MatrixV} \label{InitializingMatrixV}
514 | A \codei{MatrixV} is not initialized when it is first constructed. The contents of uninitialized matrix elements are unpredictable. 
515 | The matrix can be initialized by the \codei{load} member function or by multiple calls to \codei{set\_row}. It is less efficient to set all elements individually with \codei{set\_element}.
516 | \vspacesmall 
517 | 
518 | The internal vectors contain unused vector elements if the number of columns is not divisible by the vector size. 
519 | It is recommended to call the \codei{zero} member function first if the matrix elements are initialized with \codei{set\_element} only, in order to clear any unused vector elements. Otherwise, the unused vector elements may show up as random values in vectors retrieved by \codei{get\_row} or by the pack functions described below.
520 | \vspacebig
521 | 
522 | 
523 | \subsection{Pack and unpack functions} \label{PackAndUnpack}
524 | Several functions are defined for packing multiple matrix rows into one big vector and for unpacking such a vector into multiple matrix rows. These functions cannot be used if the number of columns is too big for multiple rows to fit into a single large vector.
525 | \vspacesmall
526 | 
527 | The pack and unpack functions are useful in cases where matrix elements are accessed in other patterns than rowwise and when permutations are needed, such as matrix transposition and matrix-by-matrix products. The pack and unpack functions support all floating point vector classes and integer vector classes with integer types of at least 16 bits. 8-bit integers are not supported.
528 | \vspacebig
529 | 
530 | {\bfseries Pack functions}\\
531 | \codei{template <typename M> auto pack2rows(M \& matrix, int first\_row)}\\
532 | \codei{template <typename M> auto pack3rows(M \& matrix, int first\_row)}\\
533 | \codei{template <typename M> auto pack4rows(M \& matrix, int first\_row)}\\
534 | \codei{template <typename M> auto pack5rows(M \& matrix, int first\_row)}
535 | \vspacebig
536 | 
537 | These functions will pack n consecutive rows of a \codei{MatrixV} matrix into a single vector, provided that a vector class with sufficient size exists.
538 | \vspacesmall
539 | 
540 | For example, a matrix with 5 rows and 3 columns with elements of type \codei{float} can be packed in the following ways. 
541 | \codei{pack2rows} will pack two consecutive rows into a vector of type \codei{Vec8f} with the elements of two rows in the first 6 vector positions, while the last two vector positions are unused. \codei{pack3rows} will pack three rows into a \codei{Vec16f} with the first 9 elements used and the last 7 elements unused. \codei{pack4rows} will use 12 elements, and \codei{pack5rows} can pack the entire matrix into 15 elements of a \codei{Vec16f} with the last element unused. \codei{first\_row} indicates the start row, where row numbers start at 0.
542 | \vspacesmall
543 | 
544 | The pack functions automatically find a vector size that fits the number of data elements packed. You will get a compilation error if no sufficiently big vector class exists. It is not possible to pack the rows into multiple vectors with a single call to a pack function.
545 | \vspacesmall
546 | 
547 | The error handling function set by \codei{set\_error\_handler} for the matrix will be called in case any of the row indexes is out of range. For example, calling \codei{pack3rows} with \codei{first\_row} set to 3 on a matrix with 5 rows will give an error because the last row is out of range. The program will crash if there is no error handling function and a row index is out of range.
548 | \vspacebig
549 | 
550 | {\bfseries Unpack functions}\\
551 | \codei{template <typename V, typename M> unpack2rows(V rr, M \& matrix, int first\_row)}\\
552 | \codei{template <typename V, typename M> unpack3rows(V rr, M \& matrix, int first\_row)}\\
553 | \codei{template <typename V, typename M> unpack4rows(V rr, M \& matrix, int first\_row)}\\
554 | \codei{template <typename V, typename M> unpack5rows(V rr, M \& matrix, int first\_row)}\\
555 | \vspacebig
556 | 
557 | These functions do the opposite of the pack functions. The large vector \codei{rr} is unpacked to fill n consecutive rows of the matrix \codei{matrix}. The unpack functions can be used for initializing or modifying a matrix.
558 | \vspacesmall
559 | 
560 | A row index out of range will be indicated by a call to the error handling function in the same way as for the pack functions.
561 | \vspacebig
562 | 
563 | 
564 | \subsection{Examples} \label{MatrixExamples}
565 | The following examples illustrate how to use the \codei{MatrixV} container, its member functions, and the pack and unpack functions.
566 | \vspacebig
567 | 
568 | \begin{lstlisting}[frame=none]
569 | #include <stdio.h>
570 | #include <vectorclass.h>
571 | #include <matrixv.h>
572 | 
573 | // Function for reporting an error message
574 | void error_reporter() {
575 |     fprintf(stderr, "\nError: index out of range\n");
576 | }
577 | 
578 | // Function for printing a whole matrix
579 | template <typename M>
580 | void print_matrix(M & matrix) {
581 |     // row loop
582 |     for (int r = 0; r < matrix.nrows(); r++) {
583 |         // column loop
584 |         for (int c = 0; c < matrix.ncolumns(); c++) {
585 |             // print one element
586 |             printf(" %6.2f", matrix.get_element(r, c));
587 |         }
588 |         printf("\n"); // new line for next row
589 |     }
590 | }
591 | 
592 | int main() {
593 | 
594 |     // C-style matrix with 3 rows and 4 columns
595 |     float Amatrix[3][4];
596 | 
597 |     // Put data into this matrix
598 |     for (int r = 0; r < 3; r++) {
599 |         for (int c = 0; c < 4; c++) {
600 |             Amatrix[r][c] = float(r + 0.1 * c);
601 |         }
602 |     }
603 | 
604 |     // Vector-based matrix with 3 rows and 4 columns
605 |     MatrixV<Vec8f, 3, 4> A;
606 | 
607 |     // Set an error handler
608 |     A.set_error_handler(error_reporter);
609 | 
610 |     // Load data from C-style matrix Amatrix into vector-based matrix A
611 |     A.load(Amatrix);
612 | 
613 |     // Print matrix A
614 |     printf("\n   A:\n");
615 |     print_matrix(A);
616 | 
617 |     // Pack matrix A into one big vector (A has 3 rows)
618 |     Vec16f Apack = pack3rows(A, 0);
619 | 
620 |     // Transpose matrix A
621 |     const int d = V_DC;   // d means don't care
622 |     Vec16f Atransposed = permute16<
623 |         0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, d, d, d, d>(Apack);
624 | 
625 |     // Vector-based matrix for A transposed. Has 4 rows and 3 columns
626 |     MatrixV<Vec8f, 4, 3> B;
627 | 
628 |     // Set an error handler
629 |     B.set_error_handler(error_reporter);
630 | 
631 |     // Unpack Atransposed vector into matrix B (B has 4 rows)
632 |     unpack4rows(Atransposed, B, 0);
633 |     
634 |     // Row operations
635 | 
636 |     // Subtract 2 * row2 from row1 in matrix B
637 |     auto row1 = B.get_row(1);  // row 1 as Vec4f
638 |     auto row2 = B.get_row(2);  // row 2 as Vec4f
639 |     auto new_row1 = row1 - 2.0f * row2; // calculate new row 1
640 |     B.set_row(new_row1, 1);    // insert new row 1
641 | 
642 |     // Print matrix B
643 |     printf("\n   B:\n");
644 |     print_matrix(B);
645 | 
646 |     // Matrix multiplication
647 | 
648 |     // Pack matrix B into one big vector
649 |     Vec16f Bpack = pack4rows(B, 0);
650 | 
651 |     // Calculate matrix product A * B
652 |     Vec16f AxBpack =
653 |     
654 |       permute16<0, 0, 0, 4, 4, 4, 8, 8, 8, d, d, d, d, d, d, d>(Apack)
655 |     * permute16<0, 1, 2, 0, 1, 2, 0, 1, 2, d, d, d, d, d, d, d>(Bpack)
656 | 
657 |     + permute16<1, 1, 1, 5, 5, 5, 9, 9, 9, d, d, d, d, d, d, d>(Apack)
658 |     * permute16<3, 4, 5, 3, 4, 5, 3, 4, 5, d, d, d, d, d, d, d>(Bpack)
659 | 
660 |     + permute16<2, 2, 2, 6, 6, 6,10,10,10, d, d, d, d, d, d, d>(Apack)
661 |     * permute16<6, 7, 8, 6, 7, 8, 6, 7, 8, d, d, d, d, d, d, d>(Bpack)
662 | 
663 |     + permute16<3, 3, 3, 7, 7, 7,11,11,11, d, d, d, d, d, d, d>(Apack)
664 |     * permute16<9,10,11, 9,10,11, 9,10,11, d, d, d, d, d, d, d>(Bpack);
665 | 
666 |     // Product matrix has 3 rows and 3 columns
667 |     MatrixV<Vec8f, 3, 3> AxB;
668 | 
669 |     // Set an error handler
670 |     AxB.set_error_handler(error_reporter);
671 | 
672 |     // Unpack product vector into matrix AxB
673 |     unpack3rows(AxBpack, AxB, 0);
674 | 
675 |     // Print product matrix AxB
676 |     printf("\n   AxB:\n");
677 |     print_matrix(AxB);
678 | }
679 | 
680 | /* Output:
681 |    A:
682 |    0.00   0.10   0.20   0.30
683 |    1.00   1.10   1.20   1.30
684 |    2.00   2.10   2.20   2.30
685 | 
686 |    B:
687 |    0.00   1.00   2.00
688 |   -0.30  -1.30  -2.30
689 |    0.20   1.20   2.20
690 |    0.30   1.30   2.30
691 | 
692 |    AxB:
693 |    0.10   0.50   0.90
694 |    0.30   2.70   5.10
695 |    0.50   4.90   9.30
696 | */
697 | \end{lstlisting}
698 | \vspacebig
699 | 
700 | \end{document}
701 | 


--------------------------------------------------------------------------------
/freesoftwarelogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/manual/62cb40d710f8d6180511ba03ca6e09347e06f0b9/freesoftwarelogo.jpg


--------------------------------------------------------------------------------
/vcl_bool.tex:
--------------------------------------------------------------------------------
  1 | % chapter included in vcl_manual.tex
  2 | \documentclass[vcl_manual.tex]{subfiles}
  3 | \begin{document}
  4 | 
  5 | \flushleft
  6 | 
  7 | \chapter{Boolean operations and per-element branches}\label{chap:BooleanOperations}
  8 | Consider this piece of C++ code:
  9 | 
 10 | \begin{lstlisting}[frame=none]
 11 | int a[4], b[4], c[4], d[4];
 12 |   ...
 13 | for (int i = 0; i < 4; i++) {
 14 |     d[i] = (a[i] > 0 && a[i] < 10) ? b[i] : c[i];
 15 | }
 16 | \end{lstlisting}
 17 | \vspacesmall
 18 | 
 19 | We can do this with vectors in the following way:
 20 | 
 21 | \begin{lstlisting}[frame=none]
 22 | Vec4i a, b, c, d;
 23 |   ...
 24 | d = select(a > 0 & a < 10, b, c);
 25 | \end{lstlisting}
 26 | \vspacesmall
 27 | 
 28 | The \codei{select} function is similar to the \codei{?:}  operator. 
 29 | It has three vector parameters: The first parameter is a boolean vector that chooses between the elements of the second and the third vector parameter. 
 30 | \vspacesmall
 31 | 
 32 | The relational operators \codei{\textgreater}, \codei{\textgreater=}, \codei{\textless}, \codei{\textless=}, \codei{==}, \codei{!=} produce boolean vectors, 
 33 | which accept the boolean operations \codei{\&}, 
 34 | \codei{|}, \codei{$\wedge$}, \codei{$\sim$} (and, or, exclusive or, not). 
 35 | \vspacesmall
 36 | 
 37 | In the above example, the expressions \codei{a \textgreater{} 0} and \codei{a \textless{} 10} are boolean vectors of type \codei{Vec4ib}. The boolean vectors must have a type that matches the data vectors they are used with. Table \ref{table:BooleanVectorClasses} on page \pageref{table:BooleanVectorClasses} shows which boolean vector class to use for each vector type.
 38 | \vspacesmall
 39 | 
 40 | The vector elements that are not selected are calculated anyway because normally all parts of a vector are calculated. For example:
 41 | 
 42 | \begin{lstlisting}[frame=none]
 43 | Vec4f a(-1.0f, 0.0f, 1.0f, 2.0f);
 44 | Vec4f b = select(a >= 0.0f, sqrt(a), 0.0f);
 45 | \end{lstlisting}
 46 | \vspacesmall
 47 | 
 48 | Here, we will be calculating the square root of -1 even though we are not using it. This will not cause problems if floating point exceptions are masked off, which they normally are. A safe solution that works even if floating point exceptions are enabled would be:
 49 | 
 50 | \begin{lstlisting}[frame=none]
 51 | Vec4f a(-1.0f, 0.0f, 1.0f, 2.0f);
 52 | Vec4f b = sqrt(max(a, 0.0f));
 53 | \end{lstlisting}
 54 | \vspacesmall
 55 | 
 56 | 
 57 | Likewise, the \codei{\&} and \codei{|} operators are calculating both input operands, even if the second operand is not needed. The following example illustrates this:
 58 | 
 59 | \begin{lstlisting}[frame=none]
 60 | // array version:
 61 | float a[4] = {0.0f, 1.0f, 2.0f, 3.0f};
 62 | float b[4];
 63 | for (int i = 0; i < 4; i++) {
 64 |    if (a[i] > 0.0f && 1.0f/a[i] != 4.0f) {
 65 |       b[i] = a[i]; 
 66 |    }
 67 |    else {
 68 |       b[i] = 1.0f;   
 69 |    }
 70 | }
 71 | \end{lstlisting}
 72 | \vspacesmall
 73 | 
 74 | and the vector version of the same:
 75 | 
 76 | \begin{lstlisting}[frame=none]
 77 | Vec4f a(0.0f, 1.0f, 2.0f, 3.0f);
 78 | Vec4f b = select(a > 0.0f & 1.0f/a != 4.0f, a, 1.0f);
 79 | \end{lstlisting}
 80 | \vspacesmall
 81 | 
 82 | In the array version, we will never divide by zero because the \codei{\&\&} operator does not evaluate the second operand when the first operand is false. But in the vector version, we are indeed dividing by zero because the \codei{\&} operator always evaluates both operands. The vector class library defines the operators \codei{\&\&} and \codei{||} as synonyms for \codei{\&} and \codei{|} for convenience, but they are still doing the bitwise AND or OR operation, so \codei{\&} and \codei{|} are actually more representative of what these operators really do. This example should be changed to:
 83 | 
 84 | \begin{lstlisting}[frame=none]
 85 | Vec4f a(0.0f, 1.0f, 2.0f, 3.0f);
 86 | Vec4f b = select(a > 0.0f & a != 0.25f, a, 1.0f);
 87 | \end{lstlisting}
 88 | \vspacesmall
 89 | 
 90 | 
 91 | \section{Internal representation of boolean vectors}\label{InternalRepresentationOfBoolean}
 92 | 
 93 | The way boolean vectors are stored depends on the instruction set and the Vector Class Library (VCL) version.
 94 | Older instruction sets have the boolean vectors stored with the same number of bits as the data vectors they are applied to (broad boolean vectors). The later instruction sets AVX512 and AVX512VL allow boolean vectors to be stored with only one bit for each element (compact boolean vectors). 
 95 | \vspacesmall
 96 | 
 97 | Version 1.xx of the VCL is using the broad boolean vectors for the sake of backwards compatibility, while version 2.xx is prioritizing the more efficient compact boolean vectors when the appropriate instruction set is enabled. The boolean vector sizes are summarized in the following table.
 98 | \vspacesmall
 99 | 
100 | \label{tableBooleanVectorSizes}
101 | \begin{tabular}{|p{50mm}|p{40mm}|p{40mm}|}
102 | \hline
103 | \bfseries Data vector size \newline and instruction set & \bfseries VCL version 1 \newline Boolean vectors & \bfseries VCL version 2 \newline Boolean vectors \\ \hline
104 | 128 bits & broad & broad  \\ \hline
105 | 128 bits with AVX512VL & broad & compact \\ \hline
106 | 256 bits & broad & broad  \\ \hline
107 | 256 bits with AVX512VL & broad & compact \\ \hline
108 | 512 bits & broad & broad  \\ \hline
109 | 512 bits with AVX512F & compact & compact \\ \hline
110 | \end{tabular}
111 | \vspacebig
112 | 
113 | The broad boolean vectors are stored as integer vectors with the same number of bits per element as the integer or floating point vectors they are used for. For example, the broad boolean vector class \codei{Vec4fb} is stored as a vector of four 32-bit integers because it is used with vectors \codei{Vec4f} of four single precision floating point numbers, using 32 bits each. The broad boolean vector class \codei{Vec4db} is stored as a vector of four 64-bit integers because it is used with vectors \codei{Vec4d} of four double precision floating point numbers, using 64 bits each. Note that the integer representation of true in a broad boolean vector element is not 1, but  -1. The representation of false is 0. Any other values than 0 and -1 in broad boolean vectors will produce wrong and inconsistent results that depend on the instruction set.
114 | \vspacesmall
115 | 
116 | The compact boolean vectors are stored with one bit per element (at least 8 bits). 
117 | You should make no assumption about how boolean vectors are stored if your code may be compiled for different instruction sets or different versions of VCL. For example,
118 | \codei{Vec16ib} uses 16 bits of storage when compiling for AVX512, but 512 bits of storage when compiling for AVX2. Do not store boolean vectors directly to binary files, and do not transmit boolean vectors between different functions that may be compiled for different instruction sets or different VCL versions.
119 | \vspacesmall
120 | 
121 | Different compact boolean vectors are mutually compatible if they have the same number of elements. Different broad boolean vectors are mutually compatible if they have the same number of elements and the same number of bits. Broad and compact boolean vectors are not compatible with each other. See page \pageref{ConversionBetweenBooleanTypes} for conversion between different types of boolean vectors.
122 | \vspacesmall
123 | 
124 | 
125 | \section{Functions for use with booleans}\label{FunctionsForBooleans}
126 | 
127 | \vspacesmall
128 | \begin{tabular}{|p{30mm}|p{120mm}|}
129 | \hline
130 | \bfseries Function & vector select(boolean vector s, vector a, vector b) \\ \hline
131 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
132 | \bfseries Description & branch per element.\newline
133 | result[i] = s[i] ? a[i] : b[i] \\ \hline
134 | \bfseries Efficiency & good \\ \hline
135 | \end{tabular}
136 | \begin{lstlisting}[frame=none]
137 | // Example:
138 | Vec4i a(-1, 0, 1, 2);
139 | Vec4i b = select(a>0, a+10, a-10); // b = (-11,-10,11,12)
140 | \end{lstlisting}
141 | \vspacesmall
142 | 
143 | 
144 | \begin{tabular}{|p{30mm}|p{120mm}|}
145 | \hline
146 | \bfseries Function & vector if\_add(boolean vector f, vector a, vector b) \\ \hline
147 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
148 | \bfseries Description & conditional addition \newline
149 | result[i] = f[i] ? (a[i] + b[i]) : a[i] \\ \hline
150 | \bfseries Efficiency & good \\ \hline
151 | \end{tabular}
152 | \begin{lstlisting}[frame=none]
153 | // Example:
154 | Vec4i a(-1, 0, 1, 2);
155 | Vec4i b = if_add(a < 0, a, 100);  // b = (99,0,1,2)
156 | \end{lstlisting}
157 | \vspacesmall
158 | 
159 | \begin{tabular}{|p{30mm}|p{120mm}|}
160 | \hline
161 | \bfseries Function & vector if\_sub(boolean vector f, vector a, vector b) \\ \hline
162 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
163 | \bfseries Description & conditional subtraction \newline
164 | result[i] = f[i] ? (a[i] - b[i]) : a[i] \\ \hline
165 | \bfseries Efficiency & good \\ \hline
166 | \end{tabular}
167 | \vspacebig
168 | 
169 | \begin{tabular}{|p{30mm}|p{120mm}|}
170 | \hline
171 | \bfseries Function & vector if\_mul(boolean vector f, vector a, vector b) \\ \hline
172 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
173 | \bfseries Description & conditional multiplication\newline
174 | result[i] = f[i] ? (a[i] * b[i]) : a[i] \\ \hline
175 | \bfseries Efficiency & good \\ \hline
176 | \end{tabular}
177 | \vspacebig
178 | 
179 | \begin{tabular}{|p{30mm}|p{120mm}|}
180 | \hline
181 | \bfseries Function & vector if\_div(boolean vector f, vector a, vector b) \\ \hline
182 | \bfseries Defined for & all floating point vector classes \\ \hline
183 | \bfseries Description & conditional division\newline
184 | result[i] = f[i] ? (a[i] / b[i]) : a[i] \\ \hline
185 | \bfseries Efficiency & medium \\ \hline
186 | \end{tabular}
187 | \vspacebig
188 | 
189 | 
190 | \begin{tabular}{|p{30mm}|p{120mm}|}
191 | \hline
192 | \bfseries Function & vector andnot(vector, vector) \\ \hline
193 | \bfseries Defined for & all boolean vector classes \\ \hline
194 | \bfseries Description & andnot(a,b) = a \& $\sim$ b \\ \hline
195 | \bfseries Efficiency & good \\ \hline
196 | \end{tabular}
197 | \vspacebig
198 | 
199 |   
200 | \begin{tabular}{|p{30mm}|p{120mm}|}
201 | \hline
202 | \bfseries Function & bool horizontal\_and(boolean vector) \\ \hline
203 | \bfseries Defined for & all boolean vector classes \\ \hline
204 | \bfseries Description & The output is the AND combination of all elements \\ \hline
205 | \bfseries Efficiency & Medium for broad boolean vectors. Better if SSE4.1 or later. Good for compact boolean vectors \\ \hline
206 | \end{tabular}
207 | \begin{lstlisting}[frame=none]
208 | // Example:
209 | Vec4i a(-1, 0, 1, 2);
210 | bool  b = horizontal_and(a > 0);  // b = false
211 | \end{lstlisting}
212 | \vspacesmall
213 | 
214 | 
215 | \begin{tabular}{|p{30mm}|p{120mm}|}
216 | \hline
217 | \bfseries Function & bool horizontal\_or(boolean vector) \\ \hline
218 | \bfseries Defined for & all boolean vector classes \\ \hline
219 | \bfseries Description & The output is the OR combination of all elements \\ \hline
220 | \bfseries Efficiency & Medium for broad boolean vectors. Better if SSE4.1 or later. Good for compact boolean vectors \\ \hline
221 | \end{tabular}
222 | \begin{lstlisting}[frame=none]
223 | // Example:
224 | Vec4i a(-1, 0, 1, 2);
225 | bool  b = horizontal_or(a > 0);  // b = true
226 | \end{lstlisting}
227 | \vspacesmall
228 | 
229 | 
230 | \begin{tabular}{|p{30mm}|p{120mm}|}
231 | \hline
232 | \bfseries Function & int horizontal\_find\_first(boolean vector) \\ \hline
233 | \bfseries Defined for & all boolean vector classes \\ \hline
234 | \bfseries Description & Returns an index to the first element that is true.
235 | Returns -1 if all elements are false \\ \hline
236 | \bfseries Efficiency & medium \\ \hline
237 | \end{tabular}
238 | \begin{lstlisting}[frame=none]
239 | // Example:
240 | Vec4i  a(1, 2, 3, 4);
241 | Vec4i  b(0, 2, 3, 5);
242 | int c = horizontal_find_first(a == b);  // c = 1
243 | \end{lstlisting}
244 | \vspacesmall
245 | 
246 | 
247 | \begin{tabular}{|p{30mm}|p{120mm}|}
248 | \hline
249 | \bfseries Function & unsigned int horizontal\_count(boolean vector) \\ \hline
250 | \bfseries Defined for & all boolean vector classes \\ \hline
251 | \bfseries Description & counts the number of elements that are true \\ \hline
252 | \bfseries Efficiency & medium if SSE4.2 or later \\ \hline
253 | \end{tabular}
254 | \begin{lstlisting}[frame=none]
255 | // Example:
256 | Vec4i  a(1, 2, 3, 4);
257 | Vec4i  b(0, 2, 3, 5);
258 | int c = horizontal_count(a == b);  // c = 2
259 | \end{lstlisting}
260 | \vspacesmall
261 | 
262 | \end{document}
263 | 


--------------------------------------------------------------------------------
/vcl_contributing.tex:
--------------------------------------------------------------------------------
  1 | % chapter included in vcl_manual.tex
  2 | \documentclass[vcl_manual.tex]{subfiles}
  3 | \begin{document}
  4 | 
  5 | 
  6 | \section{Making add-on packages}\label{MakingPackages}
  7 | \flushleft
  8 | 
  9 | Anybody can contribute add-on packages for VCL. Contributors must follow the following guidelines:
 10 | \vspacebig
 11 | 
 12 | 
 13 | \textbf{Purpose}\\
 14 | The package must serve a general purpose that is useful for others. The code must rely on the VCL.
 15 | \vspacebig
 16 | 
 17 | \textbf{Open source}\\
 18 | The package must be published under an open source license. 
 19 | The preferred license is the same as for VCL, i.e. Apache 2.0 license or later.
 20 | Other accepted licenses include GPL 3.0 or later, LGPL 3.0 or later, and revised BSD license.
 21 | \vspacebig
 22 | 
 23 | \textbf{Documentation}\\
 24 | The package must include an instruction manual in English. The manual may be supplied in one of these formats:
 25 | \begin{itemize}
 26 |   \item Plain text as an ASCII .txt file
 27 |   \item Plain text as a comment in the beginning of the code file
 28 |   \item A .pdf file. The source needed for modifying and rebuilding the .pdf file must be included.
 29 |         The file format of the pdf source must be .tex, .odt, or .docx. Closed, proprietary file formats are not allowed.
 30 | \end{itemize}
 31 | The documentation must include the name and contact information of at least one person responsible for maintaining the code.
 32 | \vspacesmall
 33 | 
 34 | VCL does not use Doxygen or other kinds of metadata for generating documentation. You may use an advanced IDE such as Microsoft Visual Studio for navigating, tracing, browsing, and finding cross-references.
 35 | \vspacebig
 36 | 
 37 | 
 38 | \textbf{Coding style}\label{CodingStyle} \\
 39 | The code must be in C++ language, with file format .h and/or .cpp.
 40 | Names and comments must use English language.
 41 | Name, date, and version number must be written in a comment at the beginning of each code file.
 42 | \vspacesmall
 43 | 
 44 | The file format is plain ASCII. UTF-8 should be avoided if possible. 
 45 | Use Windows-style linefeeds, i.e. \textbackslash r\textbackslash n.
 46 | Indent 4 spaces for every block level. Tabs are not allowed. Remember to set the option in your editor to use spaces instead of tabs.
 47 | \vspacesmall
 48 | 
 49 | The purposes of all classes, functions, and variables must be explained in comments unless they are self-explaining.
 50 | \vspacesmall
 51 | 
 52 | Use curly brackets for branches and loops. A closing curly bracket must be placed on a separate line. An opening curly bracket does not need a separate line.
 53 | \codei{else-if} may be contracted without an extra curly bracket. Example:
 54 | \begin{lstlisting}[frame=none]
 55 | if (a < 0) {
 56 |     // negative
 57 | }
 58 | else if (a == 0) {
 59 |     // zero
 60 | }
 61 | else {
 62 |     // positive
 63 | }
 64 | \end{lstlisting}
 65 | \vspacebig
 66 | 
 67 | 
 68 | \textbf{Optimization}\\
 69 | 
 70 | All functions and operators in .h files should be \codei{static} and \codei{inline}.
 71 | \vspacesmall
 72 | 
 73 | Do not optimize the code for a specific microprocessor, but focus on what is likely to be optimal on future microprocessor models. The most likely bottlenecks to consider are cache use, instruction decoding, and dependency chains. Small loops are usually more efficient than large unrolled loops.
 74 | \vspacesmall
 75 | 
 76 | Minimize the use of static constants because they take up memory space even when they are not used.
 77 | Static constants may be stored in templates that are not instantiated if they are not used.
 78 | \vspacesmall
 79 | 
 80 | Preprocessing \codei{\#define}'s must have unique names that are unlikely to cause name clashes because they are in the global namespace. It is preferred to use \codei{const int} etc. instead for defining constants.
 81 | \vspacesmall
 82 | 
 83 | \textbf{Testing}\\
 84 | Any code must be thoroughly tested with the latest version of VCL before submission.
 85 | It should preferably be tested with multiple different compilers and different operating systems.
 86 | Add-on packages may have their own test bench.
 87 | \vspacesmall
 88 | 
 89 | 
 90 | \section{Contributing to VCL}\label{Contributing}
 91 | \textbf{Bug reports}\\
 92 | Bug reports should preferably be filed as issues on the Git repository. 
 93 | Please check the list of known bugs at the Git repository under 
 94 | \href{https://github.com/vectorclass/miscellaneous}{miscellaneous}. 
 95 | 
 96 | \vspacesmall
 97 | 
 98 | \textbf{Avoid feature bloat}\\
 99 | Do not put new features into the main VCL files unless there is general agreement that they are needed. Special purpose features should instead be placed in add-on packages.
100 | \vspacesmall
101 | 
102 | The coding style must follow the guidelines listed above on page \pageref{CodingStyle}. Do not insert metadata for Doxygen or similar tools. Follow the optimization guidelines mentioned above.
103 | \vspacesmall
104 | 
105 | Any modification to the main VCL files should be tested with different compilers and different operating systems on the test bench described below in chapter \ref{chap:TestBench}. Avoid any files or features that are specific to a particular CPU, operating system, platform, or development tool.
106 | \vspacesmall
107 | 
108 | Copyright is a problem. If different contributions are copyrighted by different contributors then it will be impossible to make any legal decisions regarding VCL if not all contributors can be contacted. There are plans to assign the copyright to a non-profit organization, but no particular organization has been chosen yet.
109 | \vspacesmall
110 | 
111 | 
112 | \section{Test bench}\label{chap:TestBench}
113 | A test bench has been developed for the purpose of automatic testing of VCL.
114 | The test bench includes C++ code and a bash script for automatic testing of operators and functions. The script will run through a list of test cases to test each operator and function with many different combinations of vector classes, instruction sets, compilers, and operating systems. Each test case will be implemented by compiling and running a small test program and comparing the resulting values with the expected values.
115 | \vspacesmall
116 | 
117 | The test bench is used in the development of VCL. It is not intended for programmers that use the VCL. All code and documentation for the test bench is provided in the folder named testbench.
118 | \vspacesmall
119 | 
120 | 
121 | \end{document}


--------------------------------------------------------------------------------
/vcl_conversion.tex:
--------------------------------------------------------------------------------
  1 | % chapter included in vcl_manual.tex
  2 | \documentclass[vcl_manual.tex]{subfiles}
  3 | \begin{document}
  4 | 
  5 | 
  6 | \chapter{Conversion between vector types}\label{Conversion between vector types}
  7 | \flushleft
  8 | 
  9 | Below is a list of methods and functions for conversion between different vector types, vector sizes or precisions.
 10 | \vspacebig
 11 | 
 12 | \section{Conversion between data vector types}
 13 | 
 14 | \begin{tabular}{|p{30mm}|p{120mm}|}
 15 | \hline
 16 | \bfseries Method & conversion between vector class and intrinsic vector type \\ \hline
 17 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
 18 | \bfseries Description & conversion between a vector class and the corresponding intrinsic vector type \_\_m128, \_\_m128d, \_\_m128i, \_\_m256, \_\_m256d, \_\_m256i, \_\_m512, \_\_m512d, \_\_m512i can be done implicitly or explicitly. \newline
 19 | Boolean vectors can be converted to their internal representation, which is an integer vector for broad boolean vectors, or a single integer for compact boolean vectors. \\ \hline
 20 | \bfseries Efficiency & good \\ \hline
 21 | \end{tabular}
 22 | \begin{lstlisting}[frame=none]
 23 | // Example:
 24 | Vec4i   a(0,1,2,3);
 25 | __m128i b = a;    // b = 0x00000003000000020000000100000000
 26 | Vec4i   c = b;    // c = (0,1,2,3)
 27 | \end{lstlisting}
 28 | \vspacesmall
 29 | 
 30 | 
 31 | \begin{tabular}{|p{30mm}|p{120mm}|}
 32 | \hline
 33 | \bfseries Method & conversion from scalar to vector \\ \hline
 34 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
 35 | \bfseries Description & conversion from a scalar (single value) to a vector can be done explicitly by calling a constructor, or implicitly by putting a scalar where a vector is expected. All vector elements get the same value. \\ \hline
 36 | \bfseries Efficiency & good for constant. Medium for variable as parameter \\ \hline
 37 | \end{tabular}
 38 | \begin{lstlisting}[frame=none]
 39 | // Example:
 40 | Vec4i a, b;
 41 | a = Vec4i(5);  // explicit conversion. a = (5,5,5,5)
 42 | b = a + 3;     // implicit conversion to Vec4i. b = (8,8,8,8)
 43 | \end{lstlisting}
 44 | \vspacesmall
 45 | 
 46 | 
 47 | \begin{tabular}{|p{30mm}|p{120mm}|}
 48 | \hline
 49 | \bfseries Method & conversion between signed and unsigned integer vectors \\ \hline
 50 | \bfseries Defined for & all integer vector classes \\ \hline
 51 | \bfseries Description & Conversion between signed and unsigned integer vectors can be done implicitly or explicitly. Overflow and underflow wrap around. \\ \hline
 52 | \bfseries Efficiency & good \\ \hline
 53 | \end{tabular}
 54 | \begin{lstlisting}[frame=none]
 55 | // Example:
 56 | Vec4i  a(-1,0,1,2);   // signed vector
 57 | Vec4ui b = a;         // implicit conversion to unsigned.
 58 |                       // b = (0xFFFFFFFF,0,1,2)
 59 | Vec4ui c = Vec4ui(a); // same, with explicit conversion
 60 | Vec4i  d = c;         // convert back to signed
 61 | \end{lstlisting}
 62 | \vspacesmall
 63 | 
 64 | 
 65 | \begin{tabular}{|p{30mm}|p{120mm}|}
 66 | \hline
 67 | \bfseries Method & conversion between different integer vector types \\ \hline
 68 | \bfseries Defined for & all integer vector classes \\ \hline
 69 | \bfseries Description & Conversion can be done implicitly or explicitly between all integer vector classes with the same total number of bits. This conversion does not change any bits, just the grouping of bits into elements is changed. \\ \hline
 70 | \bfseries Efficiency & good \\ \hline
 71 | \end{tabular}
 72 | \begin{lstlisting}[frame=none]
 73 | // Example:
 74 | Vec8s a(0,1,2,3,4,5,6,7);
 75 | Vec4i b;
 76 | b = a;           // b = (0x10000, 0x30002, 0x50004, 0x70006)
 77 | \end{lstlisting}
 78 | \vspacesmall
 79 | 
 80 | 
 81 | \begin{tabular}{|p{30mm}|p{120mm}|}
 82 | \hline
 83 | \bfseries Function & reinterpret\_d, reinterpret\_f, reinterpret\_i, reinterpret\_h \\ \hline
 84 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
 85 | \bfseries Description & Reinterprets a vector as a different type with the same total number of bits. No bits are changed, only interpreted differently (bit casting).\newline
 86 | reinterpret\_d is used for converting to Vec2d, Vec4d, or Vec8d, \newline
 87 | reinterpret\_f is used for converting to Vec4f, Vec8f, or Vec16f, \newline
 88 | reinterpret\_i is used for converting to any integer vector type, \newline
 89 | reinterpret\_h is used for converting to Vec8h, Vec16h, or Vec32h. \\ \hline
 90 | \bfseries Efficiency & good \\ \hline
 91 | \end{tabular}
 92 | \begin{lstlisting}[frame=none]
 93 | // Example:
 94 | Vec4f a(1.0f, 1.5f, 2.0f, 2.5f);
 95 | Vec4i b = reinterpret_i(a); 
 96 | // b = (0x3F800000, 0x3FC00000, 0x40000000, 0x40200000)
 97 | \end{lstlisting}
 98 | \vspacesmall
 99 | 
100 | \label{roundToInt}
101 | \begin{tabular}{|p{30mm}|p{120mm}|}
102 | \hline
103 | \bfseries Function & 
104 | Vec8s roundi(Vec8h) \newline
105 | Vec16s roundi(Vec16h) \newline
106 | Vec32s roundi(Vec32h) \newline
107 | Vec4i roundi(Vec4f) \newline
108 | Vec8i roundi(Vec8f) \newline
109 | Vec16i roundi(Vec16f) \newline
110 | Vec2q roundi(Vec2d) \newline
111 | Vec4q roundi(Vec4d) \newline
112 | Vec8q roundi(Vec8d) \\ \hline
113 | \bfseries Defined for & all floating point vector classes \\ \hline
114 | \bfseries Description & Rounds floating point numbers to nearest integer and returns an integer vector of the same size. Where two integers are equally near, the even integer is returned. \newline
115 | INF input may give INT\_MAX or INT\_MIN depending on the implementation and the instruction set.\\ \hline
116 | \bfseries Efficiency & float types: good \newline
117 | double types: good if AVX512DQ instruction set, otherwise poor \\ \hline
118 | \end{tabular}
119 | \begin{lstlisting}[frame=none]
120 | // Example:
121 | Vec4f a(1.0f, 1.5f, 2.0f, 2.5f);
122 | Vec4i b = roundi(a);  // b = (1,2,2,2)
123 | \end{lstlisting}
124 | \vspacesmall
125 | 
126 | 
127 | \begin{tabular}{|p{30mm}|p{120mm}|}
128 | \hline
129 | \bfseries Function & 
130 | Vec4i round\_to\_int32(Vec2d) \newline
131 | Vec4i round\_to\_int32(Vec2d, Vec2d) \newline
132 | Vec4i round\_to\_int32(Vec4d) \newline
133 | Vec8i round\_to\_int32(Vec8d)\\ \hline
134 | \bfseries Defined for & Vec2d, Vec4d, Vec8d \\ \hline
135 | \bfseries Description & rounds double precision floating point numbers and returns vector of 32-bit integers. Where two integers are equally near, the even integer is returned. 
136 | \\ \hline
137 | \bfseries Efficiency & good \\ \hline
138 | \end{tabular}
139 | \begin{lstlisting}[frame=none]
140 | // Example:
141 | Vec4d a(1.0, 1.5, 2.0, 2.5);
142 | Vec4i b = round_to_int32(a);  // b = (1,2,2,2)
143 | \end{lstlisting}
144 | \vspacesmall
145 | 
146 | \label{truncateToInt}
147 | \begin{tabular}{|p{30mm}|p{120mm}|}
148 | \hline
149 | \bfseries Function & 
150 | Vec8s truncatei(Vec8h) \newline
151 | Vec16s truncatei(Vec16h) \newline
152 | Vec32s truncatei(Vec32h) \newline
153 | Vec4i truncatei(Vec4f) \newline
154 | Vec8i truncatei(Vec8f)\newline
155 | Vec16i truncatei(Vec16f)\newline
156 | Vec2q truncatei(Vec2d) \newline
157 | Vec4q truncatei(Vec4d) \newline
158 | Vec8q truncatei(Vec8d) \\ \hline
159 | \bfseries Defined for & all floating point vector classes \\ \hline
160 | \bfseries Description & truncates floating point numbers towards zero and returns signed integer vector of the same size.  \newline
161 | INF input may give INT\_MAX or INT\_MIN depending on the implementation and the instruction set.\\ \hline
162 | \bfseries Efficiency & 
163 | float types: good \newline
164 | double types: good if AVX512DQ instruction set, otherwise poor  \\ \hline
165 | \end{tabular}
166 | \begin{lstlisting}[frame=none]
167 | // Example:
168 | Vec4f a(-1.6f, 1.5f, 2.0f, 2.9f);
169 | Vec4i b = truncatei(a);  // b = (-1,1,2,2)
170 | \end{lstlisting}
171 | \vspacesmall
172 | 
173 | 
174 | \begin{tabular}{|p{30mm}|p{120mm}|}
175 | \hline
176 | \bfseries Function & 
177 | Vec4i truncate\_to\_int32(Vec2d, Vec2d)\newline
178 | Vec4i truncate\_to\_int32(Vec4d)\newline
179 | Vec8i truncate\_to\_int32(Vec8d) \\ \hline
180 | \bfseries Defined for & Vec2d, Vec4d, Vec8d \\ \hline
181 | \bfseries Description & truncates double precision floating point numbers towards zero and returns signed vector of 32-bit integers. \\ \hline
182 | \bfseries Efficiency & good \\ \hline
183 | \end{tabular}
184 | \begin{lstlisting}[frame=none]
185 | // Example:
186 | Vec4d a(-1.5, 1.5, 2.0, 2.9);
187 | Vec4i b = truncate_to_int32(a);  // b = (-1,1,2,2)
188 | \end{lstlisting}
189 | \vspacesmall
190 | 
191 | 
192 | \begin{tabular}{|p{30mm}|p{120mm}|}
193 | \hline
194 | \bfseries Function & 
195 | Vec4f to\_float(Vec4i) \newline
196 | Vec8f to\_float(Vec8i) \newline
197 | Vec16f to\_float(Vec16i) \\ \hline
198 | \bfseries Defined for & Vec4i, Vec8i, Vec16i \\ \hline
199 | \bfseries Description & converts signed 32-bit integers to single precision float \\ \hline
200 | \bfseries Efficiency & good \\ \hline
201 | \end{tabular}
202 | \begin{lstlisting}[frame=none]
203 | // Example:
204 | Vec4i a(0, 1, 2, 3);
205 | Vec4f b = to_float(a);  // b = (0.0f, 1.0f, 2.0f, 3.0f)
206 | \end{lstlisting}
207 | \vspacesmall
208 | 
209 | 
210 | \begin{tabular}{|p{30mm}|p{120mm}|}
211 | \hline
212 | \bfseries Function & 
213 | Vec4f to\_float(Vec4ui) \newline
214 | Vec8f to\_float(Vec8ui) \newline
215 | Vec16f to\_float(Vec16ui) \\ \hline
216 | \bfseries Defined for & Vec4ui, Vec8ui, Vec16ui \\ \hline
217 | \bfseries Description & converts unsigned integers to single precision float \\ \hline
218 | \bfseries Efficiency & good if AVX512VL instruction set. Poor otherwise \\ \hline
219 | \end{tabular}
220 | \begin{lstlisting}[frame=none]
221 | // Example:
222 | Vec4ui a(0, 1, 2, 3);
223 | Vec4f b = to_float(a);  // b = (0.0f, 1.0f, 2.0f, 3.0f)
224 | \end{lstlisting}
225 | \vspacesmall
226 | 
227 | \begin{tabular}{|p{30mm}|p{120mm}|}
228 | \hline
229 | \bfseries Function & 
230 | Vec4f to\_float(Vec2d) \newline
231 | Vec4f to\_float(Vec4d) \newline
232 | Vec8f to\_float(Vec8d) \\ \hline
233 | \bfseries Defined for & Vec2d, Vec4d, Vec8d \\ \hline
234 | \bfseries Description & converts floating point vectors from double precision to single precision. \\ \hline
235 | \bfseries Efficiency & good \\ \hline
236 | \end{tabular}
237 | \vspacesmall
238 | 
239 | \begin{tabular}{|p{30mm}|p{120mm}|}
240 | \hline
241 | \bfseries Function & 
242 | Vec4f convert8h\_4f(Vec8h) \newline
243 | Vec8f to\_float(Vec8h) \newline
244 | Vec16f to\_float(Vec16h) \\ \hline
245 | \bfseries Defined for & Vec8h, Vec16h \\ \hline
246 | \bfseries Description & converts floating point vectors from half precision to single precision. \\ \hline
247 | \bfseries Efficiency & good if F16C or AVX512-FP16 \\ \hline
248 | \end{tabular}
249 | \vspacebig
250 | 
251 | \begin{tabular}{|p{30mm}|p{120mm}|}
252 | \hline
253 | \bfseries Function & 
254 | Vec8h convert4f\_8h(Vec4f) \newline
255 | Vec8h to\_float16(Vec8f) \newline
256 | Vec16h to\_float16(Vec16f) \\ \hline
257 | \bfseries Defined for & Vec4f, Vec8f, Vec16f \\ \hline
258 | \bfseries Description & converts floating point vectors from single precision to half precision. \\ \hline
259 | \bfseries Efficiency & good if F16C or AVX512-FP16 \\ \hline
260 | \end{tabular}
261 | \vspacebig
262 | 
263 | \begin{tabular}{|p{30mm}|p{120mm}|}
264 | \hline
265 | \bfseries Function & 
266 | Vec4d to\_double(Vec4i) \newline
267 | Vec8d to\_double(Vec8i) \\ \hline
268 | \bfseries Defined for & Vec4i, Vec8i \\ \hline
269 | \bfseries Description & converts signed 32-bit integers to double precision float. The output vector is larger than the input vector. \\ \hline
270 | \bfseries Efficiency & medium \\ \hline
271 | \end{tabular}
272 | \begin{lstlisting}[frame=none]
273 | // Example:
274 | Vec4i a(0, 1, 2, 3);
275 | Vec4d b = to_double(a);  // b = (0.0, 1.0, 2.0, 3.0)
276 | \end{lstlisting}
277 | \vspacesmall
278 | 
279 | 
280 | \begin{tabular}{|p{30mm}|p{120mm}|}
281 | \hline
282 | \bfseries Function & 
283 | Vec2d to\_double(Vec2q x) \newline
284 | Vec4d to\_double(Vec4q x) \newline
285 | Vec8d to\_double(Vec8q x) \newline 
286 | Vec2d to\_double(Vec2uq x) \newline
287 | Vec4d to\_double(Vec4uq x) \newline
288 | Vec8d to\_double(Vec8uq x) \\ \hline
289 | \bfseries Defined for & Vec2q, Vec4q, Vec8q, Vec2uq, Vec4uq, Vec8uq \\ \hline
290 | \bfseries Description & converts signed or unsigned 64-bit integers to double precision float \\ \hline
291 | \bfseries Efficiency & good if AVX512DQ and AVX512VL instruction sets, otherwise poor. \\ \hline
292 | \end{tabular}
293 | \begin{lstlisting}[frame=none]
294 | // Example:
295 | Vec2q a(0, 1);
296 | Vec2d b = to_double(a);  // b = (0.0, 1.0)
297 | \end{lstlisting}
298 | \vspacesmall
299 | 
300 | 
301 | \begin{tabular}{|p{30mm}|p{120mm}|}
302 | \hline
303 | \bfseries Function & 
304 | Vec4d to\_double(Vec4f x) \newline
305 | Vec8d to\_double(Vec8f x) \\ \hline
306 | \bfseries Defined for & Vec4f, Vec8f \\ \hline
307 | \bfseries Description & converts floating point vectors from single precision to double precision. The total number of bits in the vector is doubled \\ \hline
308 | \bfseries Efficiency & good  \\ \hline
309 | \end{tabular}
310 | \vspacebig
311 | 
312 | 
313 | \begin{tabular}{|p{30mm}|p{120mm}|}
314 | \hline
315 | \bfseries Function & 
316 | Vec2d to\_double\_low(Vec4i) \newline
317 | Vec2d to\_double\_high(Vec4i) \\ \hline
318 | \bfseries Defined for & Vec4i \\ \hline
319 | \bfseries Description & converts signed 32-bit integers to double precision float \\ \hline
320 | \bfseries Efficiency & medium \\ \hline
321 | \end{tabular}
322 | \begin{lstlisting}[frame=none]
323 | // Example:
324 | Vec4i a(0, 1, 2, 3);
325 | Vec2d b = to_double_low(a);  // b = (0.0, 1.0)
326 | Vec2d c = to_double_high(a); // c = (2.0, 3.0)
327 | \end{lstlisting}
328 | \vspacesmall
329 | 
330 | 
331 | \begin{tabular}{|p{30mm}|p{120mm}|}
332 | \hline
333 | \bfseries Method & concatenating vectors \\ \hline
334 | \bfseries Defined for & All 128-bit and 256-bit vector classes and corresponding boolean vector classes \\ \hline
335 | \bfseries Description & Two vectors can be concatenated into one vector of the double size by calling a constructor or the function concatenate2. \\ \hline
336 | \bfseries Efficiency & good \\ \hline
337 | \end{tabular}
338 | \begin{lstlisting}[frame=none]
339 | // Example:
340 | Vec4i a(10,11,12,13);
341 | Vec4i b(20,21,22,23);
342 | Vec8i c(a, b);    // c = (10,11,12,13,20,21,22,23)
343 | Vec8i d = concatenate2(a, b); // same as c
344 | \end{lstlisting}
345 | \vspacesmall
346 | 
347 | 
348 | \begin{tabular}{|p{30mm}|p{120mm}|}
349 | \hline
350 | \bfseries Method & get\_low, get\_high \\ \hline
351 | \bfseries Defined for & all 256-bit and 512-bit vector classes \\ \hline
352 | \bfseries Description & One big vector can be split into two vectors of half the size by calling the methods get\_low and get\_high \\ \hline
353 | \bfseries Efficiency & good \\ \hline
354 | \end{tabular}
355 | \begin{lstlisting}[frame=none]
356 | // Example:
357 | Vec8i a(10,11,12,13,14,15,16,17);
358 | Vec4i b = a.get_low();  // b = (10,11,12,13)
359 | Vec4i c = a.get_high(); // c = (14,15,16,17)
360 | \end{lstlisting}
361 | \vspacesmall
362 | 
363 | 
364 | \begin{tabular}{|p{30mm}|p{120mm}|}
365 | \hline
366 | \bfseries Method & extend\_z \\ \hline
367 | \bfseries Defined for & All 128-bit and 256-bit vector classes and corresponding boolean vector classes \\ \hline
368 | \bfseries Description & The vector is extended to double size by adding zeroes. \\ \hline
369 | \bfseries Efficiency & good \\ \hline
370 | \end{tabular}
371 | \begin{lstlisting}[frame=none]
372 | // Example:
373 | Vec4i a(10,11,12,13);
374 | Vec8i b = extend_z(a);  // b = (10,11,12,13,0,0,0,0)
375 | \end{lstlisting}
376 | \vspacesmall
377 | 
378 | 
379 | \begin{tabular}{|p{30mm}|p{120mm}|}
380 | \hline
381 | \bfseries Function & extend \\ \hline
382 | \bfseries Defined for & Vec16c, Vec16uc, Vec32c, Vec32uc, 
383 | Vec8s, Vec8us, Vec16s, Vec16us,
384 | Vec4i, Vec4ui, Vec8i, Vec8ui \\ \hline
385 | \bfseries Description & Extends integers to a larger number of bits per element.
386 | The total number of bits in the vector is doubled. 
387 | Unsigned integers are zero-extended, signed integers are sign-extended. \\ \hline
388 | \bfseries Efficiency & good for instruction sets that support the highest vector size, medium otherwise. \\ \hline
389 | \end{tabular}
390 | \begin{lstlisting}[frame=none]
391 | // Example:
392 | Vec8s a(-2, -1, 0, 1, 2, 3, 4, 5);
393 | Vec8i b = extend(a);   // b = (-2, -1, 0, 1, 2, 3, 4, 5)
394 | \end{lstlisting}
395 | \vspacesmall
396 | 
397 | 
398 | \begin{tabular}{|p{30mm}|p{120mm}|}
399 | \hline
400 | \bfseries Function & extend\_low, extend\_high \\ \hline
401 | \bfseries Defined for & Vec16c, Vec16uc, Vec32c, Vec32uc, Vec64c, Vec64uc,  
402 | Vec8s, Vec8us, Vec16s, Vec16us, Vec32s, Vec32us, 
403 | Vec4i, Vec4ui, Vec8i, Vec8ui, Vec16i, Vec16ui \\ \hline
404 | \bfseries Description & Extends integers to a larger number of bits per element.
405 | Only the lower or upper half of the vector is converted. The total number of bits in the vector is unchanged. 
406 | Unsigned integers are zero-extended, signed integers are sign-extended. \\ \hline
407 | \bfseries Efficiency & good \\ \hline
408 | \end{tabular}
409 | \begin{lstlisting}[frame=none]
410 | // Example:
411 | Vec8s a(-2, -1, 0, 1, 2, 3, 4, 5);
412 | Vec4i b = extend_low(a);   // b = (-2, -1, 0, 1)
413 | Vec4i c = extend_high(a);  // c = (2, 3, 4, 5)
414 | \end{lstlisting}
415 | \vspacesmall
416 | 
417 | 
418 | \begin{tabular}{|p{30mm}|p{120mm}|}
419 | \hline
420 | \bfseries Function & extend\_low, extend\_high \\ \hline
421 | \bfseries Defined for & Vec4f, Vec8f, Vec16f \\ \hline
422 | \bfseries Description & extends single precision floating point numbers to double precision. 
423 | Only the lower or upper half of the vector is converted. The total number of bits in the vector is unchanged.  \\ \hline
424 | \bfseries Efficiency & good \\ \hline
425 | \end{tabular}
426 | \begin{lstlisting}[frame=none]
427 | // Example:
428 | Vec4f a(1.0f, 1.1f, 1.2f, 1.3f);
429 | Vec2d b = extend_low(a);   // b = (1.0, 1.1)
430 | Vec2d c = extend_high(a);  // c = (1.2, 1.3)
431 | \end{lstlisting}
432 | \vspacesmall
433 | 
434 | 
435 | \begin{tabular}{|p{30mm}|p{120mm}|}
436 | \hline
437 | \bfseries Function & compress \\ \hline
438 | \bfseries Defined for & Vec16s, Vec16us, Vec32s, Vec32us, 
439 | Vec8i, Vec8ui, Vec16i, Vec16ui,  
440 | Vec4q, Vec4uq, Vec8q, Vec8uq \\ \hline
441 | \bfseries Description & Reduces integers to a lower number of bits per element. 
442 | The total number of bits in the vector is halved. 
443 | There is no overflow check. The upper bits are simply cut off (wrap around). \\ \hline
444 | \bfseries Efficiency & good for instruction sets that support the highest vector size, medium otherwise. \\ \hline
445 | \end{tabular}
446 | \begin{lstlisting}[frame=none]
447 | // Example:
448 | Vec8q a(10, 11, 12, 13, 14, 15, 16, 17);
449 | Vec8i b = compress(a); // b = (10, 11, 12, 13, 14, 15, 16, 17)
450 | \end{lstlisting}
451 | \vspacesmall
452 | 
453 | 
454 | \begin{tabular}{|p{30mm}|p{120mm}|}
455 | \hline
456 | \bfseries Function & compress (with two vector parameters) \\ \hline
457 | \bfseries Defined for & Vec8s, Vec8us, Vec16s, Vec16us, Vec32s, Vec32us, 
458 | Vec4i, Vec4ui, Vec8i, Vec8ui, Vec16i, Vec16ui,  
459 | Vec2q, Vec2uq, Vec4q, Vec4uq, Vec8q, Vec8uq \\ \hline
460 | \bfseries Description & Packs two integer vectors into a single vector with the same total number of bits, by reducing each integer to a lower number of bits per element. 
461 | There is no overflow check. The upper bits are simply cut off (wrap around). \\ \hline
462 | \bfseries Efficiency & medium \\ \hline
463 | \end{tabular}
464 | \begin{lstlisting}[frame=none]
465 | // Example:
466 | Vec4i a(10, 11, 12, 13);
467 | Vec4i b(20, 21, 22, 23);
468 | Vec8s c = compress(a, b); // c = (10,11,12,13,20,21,22,23)
469 | \end{lstlisting}
470 | \vspacesmall
471 | 
472 | 
473 | \begin{tabular}{|p{30mm}|p{120mm}|}
474 | \hline
475 | \bfseries Function & compress (with two vector parameters)\\ \hline
476 | \bfseries Defined for & Vec2d, Vec4d, Vec8d \\ \hline
477 | \bfseries Description & reduces double precision floating point numbers to single precision. Two double precision vectors are packed into one single precision vector with the same total number of bits. \\ \hline
478 | \bfseries Efficiency & medium \\ \hline
479 | \end{tabular}
480 | \begin{lstlisting}[frame=none]
481 | // Example:
482 | Vec2d a(1.0, 1.1);
483 | Vec2d b(2.0, 2.1);
484 | Vec4f c = compress(a, b); // c = (1.0f, 1.1f, 2.0f, 2.1f)
485 | \end{lstlisting}
486 | \vspacesmall
487 | 
488 | \begin{tabular}{|p{30mm}|p{120mm}|}
489 | \hline
490 | \bfseries Function & compress\_saturated (with one vector parameter) \\ \hline
491 | \bfseries Defined for & Vec16s, Vec16us, Vec32s, Vec32us, Vec8i, Vec8ui, Vec16i, Vec16ui, Vec4q, Vec4uq, Vec8q, Vec8uq \\ \hline
492 | \bfseries Description & Packs an integer vector into a vector with the same number of elements and half the number of bits per element. 
493 | Values that overflow or underflow are saturated. \\ \hline
494 | \bfseries Efficiency & medium (worse than compress in most cases) \\ \hline
495 | \end{tabular}
496 | \vspacebig
497 | 
498 | \begin{tabular}{|p{30mm}|p{120mm}|}
499 | \hline
500 | \bfseries Function & compress\_saturated (with two vector parameters) \\ \hline
501 | \bfseries Defined for & Vec8s, Vec8us, Vec16s, Vec16us, Vec32s, Vec32us, 
502 | Vec4i, Vec4ui, Vec8i, Vec8ui, Vec16i, Vec16ui,  
503 | Vec2q, Vec2uq, Vec4q, Vec4uq, Vec8q, Vec8uq \\ \hline
504 | 
505 | \bfseries Description & Packs two integer vectors into a single vector with the same total number of bits, by reducing each integer to a lower number of bits per element. 
506 | Values that overflow or underflow are saturated. \\ \hline
507 | \bfseries Efficiency & medium (worse than compress in most cases) \\ \hline
508 | \end{tabular}
509 | \begin{lstlisting}[frame=none]
510 | // Example:
511 | Vec4i a(10, 11, 12, 13);
512 | Vec4i b(20, 21, 22, 23);
513 | Vec8s c = compress_saturated(a, b);
514 | // c = (10,11,12,13,20,21,22,23)
515 | \end{lstlisting}
516 | \vspacesmall
517 | 
518 | 
519 | 
520 | 
521 | \section{Conversion between boolean vector types}\label{ConversionBetweenBooleanTypes}
522 | 
523 | \begin{tabular}{|p{30mm}|p{120mm}|}
524 | \hline
525 | \bfseries Function & to\_bits \\ \hline
526 | \bfseries Defined for & all boolean vectors \\ \hline
527 | \bfseries Description & converts a boolean vector to an integer with one bit per element \\ \hline
528 | \bfseries Efficiency & good for compact boolean vectors. Medium for broad boolean vectors \\ \hline
529 | \end{tabular}
530 | \begin{lstlisting}[frame=none]
531 | // Example:
532 | Vec4i   a(10, 11, 12, 13);
533 | Vec4i   b(12, 11, 10,  9);
534 | Vec4ib  f = a > b;       // (false, false, true, true)
535 | uint8_t g = to_bits(f);  // = 0b1100
536 | // The order is not reversed, but in the comments above, 
537 | // the vector elements are listed in little endian order, 
538 | // while the binary number is written in big endian order.
539 | \end{lstlisting}
540 | \vspacesmall
541 | 
542 | 
543 | \begin{tabular}{|p{30mm}|p{120mm}|}
544 | \hline
545 | \bfseries Method & load\_bits \\ \hline
546 | \bfseries Defined for & all boolean vectors \\ \hline
547 | \bfseries Description & converts an integer bit-field to a boolean vector \\ \hline
548 | \bfseries Efficiency & good for compact boolean vectors. Medium for broad boolean vectors \\ \hline
549 | \end{tabular}
550 | \begin{lstlisting}[frame=none]
551 | // Example:
552 | uint8_t a = 0b11000010;   // binary number
553 | Vec8fb  b;                // boolean vector
554 | b.load_bits(a);
555 | // b = (false, true, false, false, false, false, true, true)
556 | // The order is not reversed, but in the comments above, 
557 | // the vector elements are listed in little endian order, 
558 | // while the binary number is written in big endian order.
559 | \end{lstlisting}
560 | \vspacesmall
561 | 
562 | 
563 | \begin{tabular}{|p{30mm}|p{120mm}|}
564 | \hline
565 | \bfseries Method & conversion between boolean vectors of same size and element size \\ \hline
566 | \bfseries Defined for & 
567 | Vec4ib $\leftrightarrow$ Vec4fb \newline
568 | Vec8ib $\leftrightarrow$ Vec8fb \newline
569 | Vec16ib $\leftrightarrow$ Vec16fb \newline
570 | Vec2qb $\leftrightarrow$ Vec2db \newline
571 | Vec4qb $\leftrightarrow$ Vec4db \newline
572 | Vec8qb $\leftrightarrow$ Vec8db \\ \hline
573 | \bfseries Description & Boolean vectors for use with different types of vectors with the same bit size can be converted to each other. \\ \hline
574 | \bfseries Efficiency & good \\ \hline
575 | \end{tabular}
576 | \begin{lstlisting}[frame=none]
577 | // Example:
578 | Vec4i  a(0,1,2,3);
579 | Vec4i  b(4,3,2,1);
580 | Vec4ib f = a > b;     // f = (false,false,false,true)
581 | Vec4fb g = Vec4fb(f); // g = (false,false,false,true)
582 | \end{lstlisting}
583 | \vspacesmall
584 | 
585 | 
586 | \begin{tabular}{|p{30mm}|p{120mm}|}
587 | \hline
588 | \bfseries Method & conversion from boolean vectors to integer vectors of the same size and element size \\ \hline
589 | \bfseries Defined for & broad boolean vectors only.  \\ \hline
590 | \bfseries Description & broad boolean vectors can be converted to integer vectors of the same size and bit size. The result will be -1 for true and 0 for false.\newline
591 | Avoid this method if compact boolean vectors may be used.\newline
592 | Conversion the other way, e.g. from Vec4i to Vec4ib is possible for broad boolean vectors
593 |  if the input vector contains -1 for true and 0 for false, but the result is implementation dependent and possibly wrong and inconsistent if the input vector contains any other values than 0 and -1. To prevent errors, it is recommended to use a comparison instead for converting an integer vector to a boolean vector.  \\ \hline
594 | \bfseries Efficiency & good \\ \hline
595 | \end{tabular}
596 | \begin{lstlisting}[frame=none]
597 | // This example works only for broad boolean vectors
598 | Vec4i  a(0,1,2,3);
599 | Vec4i  b(4,3,2,1);
600 | Vec4ib f = a > b;     // f = (false,false,false,true)
601 | Vec4i  g = Vec4i(f);  // g = (0, 0, 0, -1)
602 | \end{lstlisting}
603 | \vspacesmall
604 | 
605 | 
606 | \end{document}


--------------------------------------------------------------------------------
/vcl_errors_etc.tex:
--------------------------------------------------------------------------------
  1 | % chapter included in vclmanual.tex
  2 | \documentclass[vcl_manual.tex]{subfiles}
  3 | \begin{document}
  4 | 
  5 | \chapter{Technical details}\label{chap:TechnicalDetails}
  6 | 
  7 | \section{Error conditions}\label{chap:ErrorConditions}
  8 | 
  9 | \subsection{Runtime errors}\label{RuntimeErrors}
 10 | \flushleft
 11 | 
 12 | The vector class library generally does not produce runtime error messages. An index out of range produces behavior that is implementation-dependent. This means that the output may be different for different instruction sets or for different versions of the vector class library.
 13 | \vspacesmall
 14 | 
 15 | For example, an attempt to read a vector element with an index that is out of range may result in various behaviors, such as producing zero, taking the index modulo the vector size, giving the last element, or producing an arbitrary value. Likewise, an attempt to write a vector element with an index that is out of range may variously take the index modulo the vector size, write the last element, or do nothing. This applies to functions such as \codei{insert}, \codei{extract}, \codei{load\_partial}, \codei{store\_partial}, \codei{cutoff}, \codei{permute}, \codei{blend}, \codei{lookup}, and \codei{gather}. The same applies to a bit-index that is out of range in rotate functions
 16 | and shift operators (\textless \textless , \textgreater \textgreater).
 17 | \vspacesmall
 18 | 
 19 | Boolean vectors in the broad form (see page \pageref{tableBooleanVectorSizes}) are stored as integer vectors. The only allowed values for boolean vector elements in this case are 0 (false) and -1 (true). The behavior for other values is implementation-dependent and possibly inconsistent. For example, the behavior of the select function when a boolean selector element is a mixture of 0 and 1 bits depends on the instruction set. For instruction sets prior to SSE4.1, it will select between the operands bit-by-bit. For SSE4.1 and higher it will select integer vectors byte-by-byte, using the leftmost bit of each byte in the selector input. For floating point vectors under SSE4.1 and higher, it will use only the leftmost bit (sign bit) of the selector. Boolean vectors in the compact form have only one bit for each element.
 20 | \vspacesmall
 21 | 
 22 | An integer division by a variable that is zero will usually produce a runtime exception.
 23 | \vspacesmall
 24 | 
 25 | A program crash may be caused by alignment errors with instruction sets prior to AVX. This can happen if a VCL vector is stored in a dynamic array or a container class template instance that does not have correct alignment. See page \pageref{Alignment}.
 26 | \vspacesmall
 27 | 
 28 | 
 29 | \subsection{Floating point errors}\label{FloatingPointErrors}
 30 | The Vector Class Library produces infinity (INF) or "Not A Number" (NAN) to indicate floating point errors, as discussed on page \pageref{NoExceptionTrapping}.
 31 | Floating point overflow will usually produce infinity, floating point underflow produces zero, and an invalid floating point operation produces NAN (Not A Number). The INF and NAN codes will usually propagate to the end result where they can be detected.
 32 | \vspacesmall
 33 | 
 34 | There are a few cases where INF and NAN codes do not propagate. For example, dividing a nonzero number by INF produces zero. Error codes cannot propagate through integer and boolean vectors. For example:
 35 | \vspacesmall
 36 | 
 37 | \begin{lstlisting}[frame=none]
 38 | Vec4d a, b;
 39 | ...
 40 | Vec4db f = a > 1.0;
 41 | b = select(f, a, 0.5);
 42 | \end{lstlisting}
 43 | \vspacesmall
 44 | 
 45 | The boolean vector elements in \codei{f} will be either true or false, even if \codei{a} is NAN, because a boolean can have no other values. 
 46 | In the case that an element of \codei{a} is NAN, the corresponding element in \codei{f} will be false, and the element in \codei{b} will be 0.5. The NAN error is not propagated from \codei{a} to \codei{b}. Therefore, you have to check for errors before making a boolean expression. This can be done like this:
 47 | 
 48 | \begin{lstlisting}[frame=none]
 49 | Vec4d a, b;
 50 | ...
 51 | if ( ! horizontal_and(is_finite(a))) {
 52 |     // handle error
 53 |     ...
 54 | }
 55 | Vec4db f = a > 1.0;
 56 | b = select(f, a, 0.5);
 57 | \end{lstlisting}
 58 | \vspacesmall
 59 | 
 60 | 
 61 | \subsection{Compile-time errors}\label{CompileTimeErrors}
 62 | The Vector Class Library makes heavy use of metaprogramming features that go to the limit of what modern compilers can do. Occasional problems have been observed with all compilers.
 63 | Errors that are specific to a particular compiler are listed in separate files at the GIT repository under 
 64 | \href{https://github.com/vectorclass/miscellaneous}{miscellaneous}. 
 65 | Please check these lists of known errors before reporting a problem.
 66 | \vspacesmall
 67 | 
 68 | Even small syntax errors may result in very long error messages due to the heavy use of templates and overloading. These error messages may be confusing, but they generally indicate the line number of the error.
 69 | \vspacesmall
 70 | 
 71 | Integer vector division by a \codei{const\_int} or \codei{const\_uint} can produce a compile-time error message when the divisor is zero or out of range.
 72 | \vspacesmall
 73 |  
 74 | \textbf{"Ambiguous call to overloaded function"}: \\
 75 | This can happen when parameters have wrong types.
 76 | Make sure all parameters have the correct type.
 77 | \vspacesmall
 78 | 
 79 | Version 1.xx of VCL may produce error messages that are not very informative, such as 
 80 | \textbf{"Static\_error\_check\textless false\textgreater"} due to limitations in template metaprogramming.
 81 | \vspacesmall
 82 | 
 83 | 
 84 | \subsection{Link errors}\label{LinkErrors}
 85 | 
 86 | \textbf{"unresolved external symbol \_\_intel\_cpu\_indicator\_x"}: \\
 87 | This link error occurs when you are using Intel's SVML library without including a CPU dispatcher. Add the library libircmt.lib or libirc.a to use Intel's CPU dispatch function. Make sure to choose the 32-bit or 64-bit version of the library, as appropriate. See page \pageref{ExternalMathLibrary} for details.
 88 | \vspacesmall
 89 | 
 90 | \textbf{"unresolved external symbol \_\_svml\_sin2@@16"}, etc. \\
 91 | You need to link the library \textbf{svmlpatch.lib}, which you can find at the git repository under miscellaneous.
 92 | \vspacesmall
 93 | 
 94 | 
 95 | \subsection{Implementation-dependent behavior}\label{ImplementationDependentBehavior}
 96 | 
 97 | A big advantage of the VCL library is that you can compile the same source code for different instruction set extensions. A higher instruction set will generally give faster code, but produce the same results. There may, however, be cases where the same code generates different results with different instruction sets or different compilers. These cases include:
 98 | 
 99 | \begin{itemize}
100 |   \item An index out of range produces implementation-dependent results. Functions such as 
101 |   \codei{insert}, \codei{extract}, \codei{load\_partial}, \codei{store\_partial}, \codei{cutoff}, \codei{permute}, \codei{blend}, \codei{lookup}, \codei{gather}, and \codei{scatter} may produce different results for an index out of range depending on the instruction set. No exception or error message is generated, only a meaningless number.
102 |   
103 |   \item permute and blend functions allow a "don't care" index (\codei{V\_DC}) to be specified. The result for a don't care element may depend on the instruction set.
104 |   
105 |   \item Negative zero. The floating point values of 0.0 and -0.0 are normally regarded as equal. Some functions may return 0.0 or -0.0 depending on the instruction set, e.g. when rounding a negative number. The sign of a zero can be detected by the functions \codei{sign\_bit} and \codei{sign\_combine}.
106 | You may {} \codei{\#define SIGNED\_ZERO} {} to get consistent and pedantic conformance to the specifications of signed zero in the IEEE 754-2019 standard.
107 |   
108 |   \item NANs. An error code can be propagated through NAN (not-a-number) values and retrieved by the function \codei{nan\_code}. When two NAN values with different codes are combined, for example by adding them together, the result may be either of the two values, depending on the compiler. The sign of a NAN has no meaning and may vary. \\
109 |   Use the \codei{minimum} and \codei{maximum} functions rather than \codei{min} and \codei{max} if you want to propagate NAN values through these functions.
110 |   
111 | \end{itemize}
112 | \vspacesmall
113 | 
114 | \end{document}


--------------------------------------------------------------------------------
/vcl_examples.tex:
--------------------------------------------------------------------------------
  1 | % chapter included in vclmanual.tex
  2 | \documentclass[vcl_manual.tex]{subfiles}
  3 | \begin{document}
  4 | 
  5 | 
  6 | \chapter{Examples}\label{chap:Examples}
  7 | \flushleft
  8 | 
  9 | This example calculates the polynomial $x^3 + 2\cdot x^2 - 5\cdot x + 1$ on a floating point vector. The order of calculation is specified by parentheses in order to make shorter dependency chains.
 10 | 
 11 | \begin{example}
 12 | \label{examplePolynomial}
 13 | \end{example}
 14 | \begin{lstlisting}[frame=single]
 15 | Vec4f polynomial (Vec4f x) {
 16 |     return (x + 2.0f) * (x * x) + ((-5.0f) * x + 1.0f);
 17 | }
 18 | \end{lstlisting}
 19 | \vspacebig
 20 | 
 21 | In 64-bit Windows, you may add \codei{\_\_vectorcall} and use a Clang or Microsoft compiler. This makes sure that vector parameters are transferred in registers rather than in memory. This is not needed when the function is inlined or when compiling for other platforms than Windows:
 22 | 
 23 | \begin{example}
 24 | \label{examplePolynomialVectorcall}
 25 | \end{example}
 26 | \begin{lstlisting}[frame=single]
 27 | Vec4f __vectorcall polynomial (Vec4f x) {
 28 |     return (x + 2.0f) * (x * x) + ((-5.0f) * x + 1.0f);
 29 | }
 30 | \end{lstlisting}
 31 | \vspacebig
 32 | 
 33 | 
 34 | The next example transposes a 4x4 matrix, using the AVX2 instruction set.
 35 | 
 36 | \begin{example}
 37 | \label{exampleTranspose4x4}
 38 | \end{example}
 39 | \begin{lstlisting}[frame=single]
 40 | void transpose(float matrix[4][4]) {
 41 |     Vec8f row01, row23, col01, col23;
 42 |     // load first two rows
 43 |     row01.load(&matrix[0][0]);
 44 |     // load next two rows
 45 |     row23.load(&matrix[2][0]);
 46 |     // reorder into columns
 47 |     col01 = blend8f<0,4, 8,12,1,5, 9,13>(row01, row23);
 48 |     col23 = blend8f<2,6,10,14,3,7,11,15>(row01, row23);
 49 |     // store columns into rows
 50 |     col01.store(&matrix[0][0]);
 51 |     col23.store(&matrix[2][0]);
 52 | }
 53 | \end{lstlisting}
 54 | \vspacesmall
 55 | 
 56 | Same example with AVX512:
 57 | 
 58 | \begin{example}
 59 | \label{exampleTranspose4x4avx512}
 60 | \end{example}
 61 | \begin{lstlisting}[frame=single]
 62 | void transpose(float matrix[4][4]) {
 63 |     Vec16f rows, columns;
 64 |     // load entire matrix as rows
 65 |     rows.load(&matrix[0][0]);
 66 |     // reorder into columns
 67 |     columns = permute16f<0,4,8,12,1,5,9,13,
 68 |         2,6,10,14,3,7,11,15>(rows);
 69 |     // store columns into rows
 70 |     columns.store(&matrix[0][0]);
 71 | }
 72 | \end{lstlisting}
 73 | \vspacebig
 74 | 
 75 | The next example performs a matrix multiplication of two 4x4 matrices.
 76 | 
 77 | \begin{example}
 78 | \label{exampleMatrixMul4x4}
 79 | \end{example}
 80 | \begin{lstlisting}[frame=single]
 81 | void matrixmul(float A[4][4], float B[4][4], float M[4][4]){
 82 |     // calculates M = A*B
 83 |     Vec4f Brow[4], Mrow[4];
 84 |     int i, j;
 85 |     // load B as rows
 86 |     for (i = 0; i < 4; i++) {
 87 |         Brow[i].load(&B[i][0]);
 88 |     }
 89 |     // loop for A and M rows
 90 |     for (i = 0; i < 4; i++) {
 91 |         Mrow[i] = Vec4f(0.0f);
 92 |         // loop for A columns, B rows
 93 |         for (j = 0; j < 4; j++) {
 94 |             Mrow[i] += Brow[j] * A[i][j];
 95 |         }
 96 |     }
 97 |     // store M
 98 |     for (i = 0; i < 4; i++) {
 99 |         Mrow[i].store(&M[i][0]);
100 |     }
101 | }
102 | \end{lstlisting}
103 | \vspacebig
104 | 
105 | 
106 | The next example makes a table of the sin function and gets sin(x) and cos(x) by table lookup.
107 | 
108 | \begin{example}
109 | \label{exampleSinTable}
110 | \end{example}
111 | \begin{lstlisting}[frame=single]
112 | 
113 | #include <cmath>
114 | 
115 | const double pi = 3.14159265358979323846;
116 | 
117 | // length of table. Must be a power of 2.
118 | #define sin_tablelen 1024
119 | // the accuracy of table lookup is +/- pi/sin_tablelen
120 | 
121 | class SinTable {
122 | protected:
123 |     float table[sin_tablelen];
124 |     float resolution;
125 |     float rres;  // 1./resolution
126 | public:
127 |     SinTable();  // constructor
128 |     Vec4f sin(Vec4f x);
129 |     Vec4f cos(Vec4f x);
130 | };
131 | 
132 | SinTable::SinTable() {  // constructor
133 |     // compute resolution
134 |     resolution = float(2.0 * pi / sin_tablelen);
135 |     rres = 1.0f / resolution;
136 |     // Initialize table (No need to use vectors here because this 
137 |     // is calculated only once:)
138 |     for (int i = 0; i < sin_tablelen; i++) {
139 |         table[i] = sinf((float)i * resolution);
140 |     }
141 | }
142 | 
143 | Vec4f SinTable::sin(Vec4f x) {
144 |     // calculate sin by table lookup
145 |     Vec4i index = roundi(x * rres);
146 |     // modulo tablelen equivalent to modulo 2*pi
147 |     index &= sin_tablelen - 1;
148 |     // look up in table
149 |     return lookup<sin_tablelen>(index, table);
150 | }
151 | 
152 | Vec4f SinTable::cos(Vec4f x) {
153 |     // calculate cos by table lookup
154 |     Vec4i index = roundi(x * rres) + sin_tablelen/4;
155 |     // modulo tablelen equivalent to modulo 2*pi
156 |     index &= sin_tablelen - 1;
157 |     // look up in table
158 |     return lookup<sin_tablelen>(index, table);
159 | }
160 | 
161 | int main() {
162 |     SinTable sintab;
163 |     Vec4f a(0.0f, 0.5f, 1.0f, 1.5f);
164 |     Vec4f b = sintab.sin(a);
165 |     // b = (0.0000 0.4768 0.8416 0.9973)
166 |     // accuracy +/- 0.003
167 |     ...
168 |     return 0;
169 | }
170 | \end{lstlisting}
171 | \vspacesmall
172 | 
173 | 
174 | \end{document}


--------------------------------------------------------------------------------
/vcl_file_list.tex:
--------------------------------------------------------------------------------
 1 | % chapter included in vclmanual.tex
 2 | \documentclass[vcl_manual.tex]{subfiles}
 3 | \begin{document}
 4 | 
 5 | 
 6 | \section{File list}
 7 | %\label{FileList}
 8 | \flushleft
 9 | 
10 | \begin{longtable}[l]{|p{40mm}|p{100mm}|}
11 | \endfirsthead
12 | \label{table:fileList} \\
13 | \endhead
14 | \hline
15 | \bfseries File name & \bfseries Purpose \\ \hline
16 | manual/vcl\_manual.pdf & Instruction manual (this file) \\ \hline
17 | 
18 | vectorclass.h & Top-level C++ header file. This will include several other header files, according to the indicated instruction set \\ \hline
19 | 
20 | instrset.h & Detection of which instruction set the code is compiled for, 
21 | and functions that depend on the instruction set. This file also contains various common definitions and templates. Included by vectorclass.h \\ \hline
22 | 
23 | vectori128.h & Defines classes, operators and functions for integer vectors with a total size of 128 bits. Included by vectorclass.h \\ \hline
24 | 
25 | vectori256.h & Defines classes, operators and functions for integer vectors with a total size of 256 bits for the AVX2 instruction set. Included by vectorclass.h if appropriate \\ \hline
26 | 
27 | vectori256e.h & Defines classes, operators and functions for integer vectors with a total size of 256 bits for instruction sets lower than AVX2. Included by vectorclass.h if appropriate \\ \hline
28 | 
29 | vectori512.h & Defines classes, operators and functions for vectors of 32-bit and 64-bit integers with a total size of 512 bits for the AVX512F instruction set. Included by vectorclass.h if appropriate \\ \hline
30 | 
31 | vectori512e.h & Defines classes, operators and functions for vectors of 32-bit and 64-bit integers with a total size of 512 bits for instruction sets lower than AVX512F. Included by vectorclass.h if appropriate \\ \hline
32 | 
33 | vectori512s.h & Defines classes, operators and functions for vectors of 8-bit and 16-bit integers with a total size of 512 bits for the AVX512BW instruction set. Included by vectorclass.h if appropriate \\ \hline
34 | 
35 | vectori512se.h & Defines classes, operators and functions for vectors of 8-bit and 16-bit integers  with a total size of 512 bits for instruction sets lower than
36 | AVX512BW. Included by vectorclass.h if appropriate \\ \hline
37 | 
38 | vectorf128.h & Defines classes, operators and functions for floating point vectors with a total size of 128 bits. Included by vectorclass.h \\ \hline
39 | 
40 | vectorf256.h & Defines classes, operators and functions for floating point vectors with a total size of 256 bits for the AVX and later instruction sets. Included by vectorclass.h if appropriate \\ \hline
41 | 
42 | vectorf256e.h & Defines classes, operators and functions for floating point vectors with a total size of 256 bits for instruction sets lower than AVX. Included by vectorclass.h if appropriate \\ \hline
43 | 
44 | vectorf512.h & Defines classes, operators and functions for floating point vectors with a total size of 512 bits for the AVX512F and later instruction sets. Included by vectorclass.h if appropriate \\ \hline
45 | 
46 | vectorf512e.h & Defines classes, operators and functions for floating point vectors with a total size of 512 bits for instruction sets lower than AVX512F. Included by vectorclass.h if appropriate \\ \hline
47 | 
48 | vectorfp16.h & Defines classes, operators and functions for half precision floating point vectors of all sizes, including mathematical functions, for AVX512-FP16 \\ \hline
49 | 
50 | vectorfp16e.h & Defines emulating classes, operators and functions for half precision floating point vectors of all sizes, including mathematical functions, for processors without AVX512-FP16 \\ \hline
51 | 
52 | vector\_convert.h & Defines functions for conversion between different vector sizes, as well as some generic function templates. \\ \hline
53 | 
54 | vectormath\_exp.h & Optional inline mathematical functions: power, logarithms and exponential functions \\ \hline
55 | 
56 | vectormath\_trig.h & Optional inline mathematical functions: trigonometric and inverse trigonometric functions \\ \hline
57 | 
58 | vectormath\_hyp.h & Optional inline mathematical functions: hyperbolic and inverse hyperbolic functions \\ \hline
59 | 
60 | vectormath\_common.h & Common definitions for vectormath\_exp.h, vectormath\_trig.h and vectormath\_hyp.h \\ \hline
61 | 
62 | vectormath\_lib.h & Optional header file for external mathematical vector function library \\ \hline
63 | 
64 | instrset\_detect.cpp & Optional functions for detecting which instruction set is supported at runtime \\ \hline
65 | 
66 | dispatch\_example.cpp & Example of how to make automatic CPU dispatching \\ \hline
67 | 
68 | LICENSE & Apache 2.0 license \\ \hline
69 | 
70 | changelog.txt & VCL version history \\ \hline
71 | 
72 | miscellaneous/svmlpatch & Folder containing the library svmlpatch.lib as well as the source code to build it. Used for fixing a compatibility issue with the Intel SVML library in 64-bit Windows \\ \hline
73 | 
74 | testbench & Folder containing test bench files for testing the VCL library. This is used in the development of VCL, and is not needed by programmers using the VCL. Includes code and documentation. \\ \hline
75 | 
76 | \end{longtable}
77 | %\end{tabular}
78 | \vspacesmall
79 | 
80 | 
81 | 
82 | \end{document}
83 | 


--------------------------------------------------------------------------------
/vcl_float_behavior.tex:
--------------------------------------------------------------------------------
 1 | % chapter included in vclmanual.tex
 2 | \documentclass[vcl_manual.tex]{subfiles}
 3 | \begin{document}
 4 | 
 5 | 
 6 | \section{Floating point behavior details}
 7 | \label{FloatingPointBehavior}
 8 | 
 9 | The Vector Class Library generally conforms to the new IEEE 754-2019 Standard for Floating-Point Arithmetic, but some compromises have been necessary for the purpose of vector processing and for better performance. The deviations from the standard are discussed below.
10 | \vspacesmall
11 | 
12 | \begin{description}
13 | 
14 | \item[Subnormal numbers.]
15 | Subnormal numbers (also called denormal numbers) are numerically extremely small floating point numbers where the exponent is below the normal range. Some microprocessors handle subnormal numbers in a very inefficient way that is more than a hundred times slower than for normal floating point numbers. You may call the function \codei{no\_subnormals()} to prevent this and treat subnormal numbers as zero in single and double precision floating point calculations. Calculations in half precision are generally efficient even when values are subnormal. 
16 | Some of the mathematical functions in VCL always treat subnormal numbers as zero for reasons of performance. This includes logarithm, exponential, and power functions.
17 | 
18 | \item[Signed zero.]
19 | Signed zero is a controversial issue. The floating point standard defines two different zeroes: +0.0 and -0.0.
20 | The two zeroes are equal, but still distinguishable. Some of the functions may return +0.0 where the standard requires -0.0.\\
21 | You may {} \codei{\#define SIGNED\_ZERO} {} if you want the sign of zero to conform to the
22 | IEEE 754-2019 standard, though this may slow down performance a little.
23 | \codei{SIGNED\_ZERO} may affect several functions, including
24 | \codei{round}, \codei{truncate}, \codei{floor}, \codei{ceil}, 
25 | \codei{maximum}, \codei{minimum}, \codei{cbrt}, \codei{pow\_ratio}, \codei{expm1}, \codei{log1p}.
26 | 
27 | \item[No exception trapping.] \label{NoExceptionTrapping}
28 | Floating point errors are traditionally detected by trapping errors or relying on an \codei{errno} variable. These methods are not well suited for vector processing and out-of-order processing. This is explained in the document \href{https://www.agner.org/optimize/nan_propagation.pdf}{"NAN propagation versus fault trapping in floating point code", Agner Fog, 2019}.
29 | \vspacesmall
30 | 
31 | The Vector Class Library does not support fault trapping, and it does not indicate exceptions in a variable such as the traditional \codei{errno}. It is not recommended to turn on floating point exceptions because this can cause inconsistent behavior, such as traps for exceptions in not-taken branches. Do not attempt to trap numerical errors in \codei{try/catch} blocks.
32 | \vspacesmall
33 | 
34 | Instead, the vector class library indicates floating point exceptions by producing INF or NAN codes in the individual vector element that produced the fault.
35 | The INF and NAN codes will propagate to the end result of a series of calculations when certain conditions are satisfied. The most efficient way of detecting floating point errors is to look for INF and NAN codes in the result.
36 | \vspacesmall
37 | 
38 | Conditions where INF and NAN codes are not propagated are discussed on page \pageref{FloatingPointErrors}.
39 | \vspacesmall
40 | 
41 | Do not use the compiler options -ffast-math, -ffinite-math-only, or /fp:fast because this may disable the detection of INF and NAN.
42 | \vspacesmall
43 | 
44 | \item[No signaling NANs.]
45 | Signaling NANs are special codes that will raise an exception when they are loaded from memory. Signaling NANs are rarely used in modern software. Signaling NANs should not be used in VCL because exception trapping is not supported.
46 | 
47 | \item[NAN payload operations.]
48 | A NAN may contain additional information called a payload. This payload can propagate through a series of calculations to the end result. Some of the mathematical functions in VCL can put a payload into the NAN result in case of an error. This makes it possible to identify which function generated the NAN.
49 | \vspacesmall
50 | 
51 | The \codei{nan..} and \codei{nan\_code} functions make it possible to set and get NAN payloads. The IEEE 754 standard does not specify what happens to the payload when converting between single and double precision, but experiments show that all microprocessors that use the binary floating point format will left-justify the payload. The \codei{nan..} and \codei{nan\_code} functions treat the NAN payload as a 22-bit left-justified unsigned integer in order to allow conversions between single and double precision. These functions deviate from the IEEE 754-2019 standard.
52 | 
53 | \item[NAN propagation in maximum and minimum functions.]
54 | The \codei{max} and \codei{min} functions do not propagate NANs according to the 2008 version of the standard. This unfortunate situation is redressed in the  2019 revision of the standard. VCL offers two different versions of these functions:
55 | The \codei{max} and \codei{min} functions are equivalent to 
56 | \codei{a > b ? a : b} and 
57 | \codei{a < b ? a : b}, respectively. These functions return \codei{b} if \codei{a} is NAN. The slightly less efficient functions \codei{maximum} and \codei{minimum} are sure to propagate NANs, in accordance with the 2019 revision of the standard.
58 | 
59 | \item[NAN propagation in pow function.]
60 | The standard specifies that pow(NAN,0) and pow(1,NAN) will give the result 1.0. The VCL implementation deviates from this and produces a NAN output in all cases where an input is NAN, in order to support reliable NAN propagation.
61 | 
62 | \item[Function parameter range.]
63 | Some of the mathematical functions have internal overflow for extreme values of the input parameters. These functions have a limited input range because an extra branch to handle the extreme cases would reduce the overall performance. Limitations of the input range are mentioned in the listing of the individual functions.
64 | 
65 | 
66 | \end{description}
67 | 
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 
80 | \vspacesmall
81 | 
82 | 
83 | 
84 | \end{document}
85 | 


--------------------------------------------------------------------------------
/vcl_manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/manual/62cb40d710f8d6180511ba03ca6e09347e06f0b9/vcl_manual.pdf


--------------------------------------------------------------------------------
/vcl_manual.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[11pt,a4paper,oneside,openright]{report}
  2 | 
  3 | \usepackage[bindingoffset=5mm,left=20mm,right=20mm,top=20mm,bottom=20mm,footskip=10mm]{geometry}
  4 | \usepackage[utf8x]{inputenc}
  5 | \usepackage{hyperref}
  6 | \usepackage[english]{babel}
  7 | \usepackage{listings}
  8 | \usepackage{subfiles}
  9 | \usepackage{longtable}
 10 | \usepackage{multirow}
 11 | \usepackage{ragged2e} 
 12 | \usepackage{cmap} % avoid fi ligatures in pdf file
 13 | \usepackage{amsthm} % example numbering
 14 | \usepackage{color}
 15 | \usepackage{bold-extra} % for bold tt font. Remember to include bold-extra.sty file
 16 | \usepackage{graphicx}
 17 | \usepackage[yyyymmdd]{datetime}
 18 | \usepackage{float}
 19 | 
 20 | % style for code listing
 21 | \renewcommand{\familydefault}{\sfdefault}
 22 | \renewcommand{\ttdefault}{pcr} % selects Courier font
 23 | \newtheorem{example}{Example}[chapter]  % example numbering
 24 | \lstset{language=C}                     % formatting for code listing
 25 | \lstset{basicstyle=\ttfamily,breaklines=true}
 26 | \definecolor{darkGreen}{rgb}{0,0.4,0}
 27 | \definecolor{mybrown}{rgb}{0.40,0.10,0.05}
 28 | \lstset{commentstyle=\color{darkGreen}}  % comments color
 29 | \lstset{keywordstyle=\color{blue}}       % keyword color
 30 | \lstset{stringstyle=\color{mybrown}}     % string color
 31 | \lstset{showstringspaces=false}          % don't mark spaces in strings
 32 | 
 33 | \renewcommand{\dateseparator}{-}
 34 | 
 35 | % command for turning indent back on after \flushleft
 36 | \newcommand{\indenton}{\RaggedRight\parindent=15pt}
 37 | 
 38 | % command for vertical space
 39 | \newcommand{\vspacesmall}{\vspace{3mm}}
 40 | \newcommand{\vspacebig}{\vspace{6mm}}
 41 | 
 42 | % style for code inlined in text:
 43 | \newcommand{\codei}[1]{{\bfseries\ttfamily #1}}
 44 | 
 45 | 
 46 | 
 47 | \begin{document}
 48 | 
 49 | \begin{titlepage}
 50 |     \centering
 51 |    
 52 |     \null %empty box needed for vfill to work
 53 |     \vfill
 54 | 
 55 |    {\bfseries\Huge
 56 |     VCL 
 57 |     \vspacesmall
 58 |         
 59 |     C++ vector class library 
 60 |     \vspacebig
 61 |         
 62 |     manual
 63 |    }        
 64 |     \vspacebig
 65 |     
 66 |    {\Large    
 67 |     Agner Fog
 68 |     \vspacebig
 69 |     
 70 |     \copyright\ \today. Apache license 2.0
 71 |    }
 72 |     
 73 |     \vfill
 74 |     
 75 |     \includegraphics[width=306pt]{freesoftwarelogo.jpg}
 76 |     \vfill
 77 |     
 78 | \end{titlepage}
 79 | 
 80 | \RaggedRight
 81 | 
 82 | 
 83 | 
 84 | \tableofcontents
 85 | \setcounter{secnumdepth}{1}
 86 | %\indenton
 87 | \flushleft
 88 | 
 89 | % Introduction
 90 | % The basics
 91 | \subfile{vcl_introduction.tex}
 92 | 
 93 | % Operators and functions
 94 | \subfile{vcl_operators_and_functions.tex}
 95 | 
 96 | % Boolean operations and per-element branches
 97 | \subfile{vcl_bool.tex}
 98 | 
 99 | % Conversion between vector types
100 | \subfile{vcl_conversion.tex}
101 | 
102 | % Permute, blend, lookup, gather and scatter functions 
103 | \subfile{vcl_permute_functions.tex}
104 | 
105 | % Mathematical functions
106 | \subfile{vcl_mathematical_functions.tex}
107 | 
108 | % Performance
109 | \subfile{vcl_performance.tex}
110 | 
111 | % Examples
112 | \subfile{vcl_examples.tex}
113 | 
114 | % Application specific packages:
115 | %  Decimal-string conversion
116 | %  3-dimensional vectors
117 | %  complex number vectors
118 | %  quaternions
119 | %  Decimal conversion
120 | \subfile{vcl_packages.tex}
121 | 
122 | % Error conditions
123 | % Implementation dependent behavior
124 | \subfile{vcl_errors_etc.tex}
125 | 
126 | % Floating point behavior
127 | \subfile{vcl_float_behavior.tex}
128 | 
129 | % Contributing
130 | % Test bench
131 | \subfile{vcl_contributing.tex}
132 | 
133 | % File list
134 | \subfile{vcl_file_list.tex}
135 | 
136 | 
137 | 
138 | \end{document}
139 | 


--------------------------------------------------------------------------------
/vcl_operators_and_functions.tex:
--------------------------------------------------------------------------------
  1 | % chapter included in vcl_manual.tex
  2 | \documentclass[vcl_manual.tex]{subfiles}
  3 | \begin{document}
  4 | 
  5 | 
  6 | \chapter{Operators}\label{chap:Operators}
  7 | 
  8 | \section{Arithmetic operators}
  9 | 
 10 | \flushleft
 11 | 
 12 | \vspacesmall
 13 | \begin{tabular}{|p{25mm}|p{100mm}|}
 14 | \hline
 15 | \bfseries Operator & \texttt{+, ++, +=} \\ \hline
 16 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
 17 | \bfseries Description & addition \\ \hline
 18 | \bfseries Efficiency & good \\ \hline
 19 | \end{tabular}
 20 | \begin{lstlisting}[frame=none]
 21 | // Example:
 22 | Vec4i a(10, 11, 12, 13);
 23 | Vec4i b(20, 21, 22, 23);
 24 | Vec4i c = a + b;           // c = (30, 32, 34, 36)
 25 | \end{lstlisting}
 26 | 
 27 | 
 28 | \vspacesmall
 29 | \begin{tabular}{|p{25mm}|p{100mm}|}
 30 | \hline
 31 | \bfseries Operator & \texttt{-, --, -=,} unary \texttt{-} \\ \hline
 32 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
 33 | \bfseries Description & subtraction \\ \hline
 34 | \bfseries Efficiency & good \\ \hline
 35 | \end{tabular}
 36 | \begin{lstlisting}[frame=none]
 37 | // Example:
 38 | Vec4i a(10, 11, 12, 13);
 39 | Vec4i b(20, 21, 22, 23);
 40 | Vec4i c = a - b;           // c = (-10, -10, -10, -10)
 41 | \end{lstlisting}
 42 | 
 43 | 
 44 | \vspacesmall
 45 | \begin{tabular}{|p{25mm}|p{100mm}|}
 46 | \hline
 47 | \bfseries Operator & \texttt{*, *=} \\ \hline
 48 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
 49 | \bfseries Description & multiplication \\ \hline
 50 | \bfseries Efficiency & 8 bit integers: poor \newline
 51 | 16 bit integers: good \newline
 52 | 32 bit integers: good for SSE4.1 and later instruction set, poor otherwise \newline
 53 | 64 bit integers: good for AVX512DQ instruction set, poor otherwise \newline
 54 | float: good \newline
 55 | double: good
 56 |  \\ \hline
 57 | \end{tabular}
 58 | \begin{lstlisting}[frame=none]
 59 | // Example:
 60 | Vec4i a(10, 11, 12, 13);
 61 | Vec4i b(20, 21, 22, 23);
 62 | Vec4i c = a * b;           // c = (200, 231, 264, 299)
 63 | \end{lstlisting}
 64 | 
 65 | 
 66 | \vspacesmall
 67 | \begin{tabular}{|p{25mm}|p{100mm}|}
 68 | \hline
 69 | \bfseries Operator & \texttt{/, /=}  (floating point) \\ \hline
 70 | \bfseries Defined for & all floating point vector classes \\ \hline
 71 | \bfseries Description & division \\ \hline
 72 | \bfseries Efficiency & medium \\ \hline
 73 | \end{tabular}
 74 | \begin{lstlisting}[frame=none]
 75 | // Example:
 76 | Vec4f a(1.0f, 1.1f, 1.2f, 1.3f);
 77 | Vec4f b(2.0f, 2.1f, 2.2f, 2.3f);
 78 | Vec4f c = a / b;  // c = (0.500f, 0.524f, 0.545f, 0.565f)
 79 | \end{lstlisting}
 80 | 
 81 | 
 82 | \vspacesmall
 83 | \begin{tabular}{|p{25mm}|p{100mm}|}
 84 | \hline
 85 | \bfseries Operator & \texttt{/, /=}  (integer vector divided by scalar) \\ \hline
 86 | \bfseries Defined for & all classes of 8-bit, 16-bit and 32-bit integers, signed and unsigned. Not available for 64-bit integers \\ \hline
 87 | \bfseries Description & division by scalar. Results are truncated to integer. All elements are divided by the same divisor. See page \pageref{IntegerDivision} for explanation
 88 |  \\ \hline
 89 | \bfseries Efficiency & poor \\ \hline
 90 | \end{tabular}
 91 | \begin{lstlisting}[frame=none]
 92 | // Example:
 93 | Vec4i a(10, 11, 12, 13);
 94 | int   b = 3;
 95 | Vec4i c = a / b;  // c = (3, 3, 4, 4)
 96 | \end{lstlisting}
 97 | 
 98 | 
 99 | \vspacesmall
100 | \begin{tabular}{|p{25mm}|p{100mm}|}
101 | \hline
102 | \bfseries Operator & \texttt{/, /=}  (integer vector divided by constant) \\ \hline
103 | \bfseries Defined for & all classes of 8-bit, 16-bit and 32-bit integers, signed and unsigned. Not available for 64-bit integers \\ \hline
104 | \bfseries Description & division by compile-time constant. All elements are divided by the same divisor. See page \pageref{IntegerDivision} for explanation \\ \hline
105 | \bfseries Efficiency & medium (better than division by scalar variable). \newline Good if divisor is a power of 2 \\ \hline
106 | \end{tabular}
107 | \begin{lstlisting}[frame=none]
108 | // Example, signed:
109 | Vec4i  a(10, 11, 12, 13);
110 | Vec4i  b = a / const_int(3);  // b = (3, 3, 4, 4)
111 | // Example, unsigned:
112 | Vec4ui c(10, 11, 12, 13);
113 | Vec4ui d = c / const_uint(3); // d = (3, 3, 4, 4)
114 | \end{lstlisting}
115 | 
116 | 
117 | \section{Logic operators} \label{LogicOperators}
118 | 
119 | \vspacesmall
120 | \begin{tabular}{|p{25mm}|p{100mm}|}
121 | \hline
122 | \bfseries Operator & $<<$, $<<=$ \\ \hline
123 | \bfseries Defined for & all integer vector classes \\ \hline
124 | \bfseries Description & bit shift left. All vector elements are shifted by the same amount. \newline
125 | Shifting left by n is a fast way of multiplying by $2^n$ \\ \hline
126 | \bfseries Efficiency & good \\ \hline
127 | \end{tabular}
128 | \begin{lstlisting}[frame=none]
129 | // Example:
130 | Vec4i a(10, 11, 12, 13);
131 | Vec4i b = a << 2;         // b = (40, 44, 48, 52)
132 | \end{lstlisting}
133 | 
134 | 
135 | \vspacesmall
136 | \begin{tabular}{|p{25mm}|p{100mm}|}
137 | \hline
138 | \bfseries Operator & $>>$, $>>=$ \\ \hline
139 | \bfseries Defined for & all integer vector classes \\ \hline
140 | \bfseries Description & bit shift right. All vector elements are shifted by the same amount.\newline
141 | Unsigned integers use logical shift. \newline
142 | Signed integers use arithmetic shift (i.e. the sign bit is copied). \newline
143 | Shifting unsigned right by n is a fast way of dividing by $2^n$ 
144 |  \\ \hline
145 | \bfseries Efficiency & good \\ \hline
146 | \end{tabular}
147 | \begin{lstlisting}[frame=none]
148 | // Example:
149 | Vec4i a(10, 11, 12, 13);
150 | Vec4i b = a >> 2;         // b = (2, 2, 3, 3)
151 | \end{lstlisting}
152 | 
153 | 
154 | \vspacesmall
155 | \begin{tabular}{|p{25mm}|p{100mm}|}
156 | \hline
157 | \bfseries Operator & == \\ \hline
158 | \bfseries Defined for & all vector classes \\ \hline
159 | \bfseries Description & test if equal. Result is a boolean vector \\ \hline
160 | \bfseries Efficiency & good \\ \hline
161 | \end{tabular}
162 | \begin{lstlisting}[frame=none]
163 | // Example:
164 | Vec4i  a(10, 11, 12, 13);
165 | Vec4i  b(14, 13, 12, 11);
166 | Vec4ib c = a == b;    // c = (false, false, true, false)
167 | \end{lstlisting}
168 | 
169 | 
170 | \vspacesmall
171 | \begin{tabular}{|p{25mm}|p{100mm}|}
172 | \hline
173 | \bfseries Operator & != \\ \hline
174 | \bfseries Defined for & all vector classes \\ \hline
175 | \bfseries Description & test if not equal. Result is a boolean vector \\ \hline
176 | \bfseries Efficiency & good \\ \hline
177 | \end{tabular}
178 | \begin{lstlisting}[frame=none]
179 | // Example:
180 | Vec4i  a(10, 11, 12, 13);
181 | Vec4i  b(14, 13, 12, 11);
182 | Vec4ib c = a != b;    // c = (true, true, false, true)
183 | \end{lstlisting}
184 | 
185 | 
186 | \vspacesmall
187 | \begin{tabular}{|p{25mm}|p{100mm}|}
188 | \hline
189 | \bfseries Operator & \textgreater \\ \hline
190 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
191 | \bfseries Description & test if bigger. Result is a boolean vector \\ \hline
192 | \bfseries Efficiency & good \\ \hline
193 | \end{tabular}
194 | \begin{lstlisting}[frame=none]
195 | // Example:
196 | Vec4i  a(10, 11, 12, 13);
197 | Vec4i  b(14, 13, 12, 11);
198 | Vec4ib c = a > b;     // c = (false, false, false, true)
199 | \end{lstlisting}
200 | 
201 | 
202 | \vspacesmall
203 | \begin{tabular}{|p{25mm}|p{100mm}|}
204 | \hline
205 | \bfseries Operator & \textgreater= \\ \hline
206 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
207 | \bfseries Description & test if bigger or equal. Result is a boolean vector \\ \hline
208 | \bfseries Efficiency & good \\ \hline
209 | \end{tabular}
210 | \begin{lstlisting}[frame=none]
211 | // Example:
212 | Vec4i  a(10, 11, 12, 13);
213 | Vec4i  b(14, 13, 12, 11);
214 | Vec4ib c = a >= b;     // c = (false, false, true, true)
215 | \end{lstlisting}
216 | 
217 | 
218 | \vspacesmall
219 | \begin{tabular}{|p{25mm}|p{100mm}|}
220 | \hline
221 | \bfseries Operator & \textless \\ \hline
222 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
223 | \bfseries Description & test if smaller. Result is a boolean vector \\ \hline
224 | \bfseries Efficiency & good \\ \hline
225 | \end{tabular}
226 | \begin{lstlisting}[frame=none]
227 | // Example:
228 | Vec4i  a(10, 11, 12, 13);
229 | Vec4i  b(14, 13, 12, 11);
230 | Vec4ib c = a < b;       // c = (true, true, false, false)
231 | \end{lstlisting}
232 | 
233 | \vspacesmall
234 | \begin{tabular}{|p{25mm}|p{100mm}|}
235 | \hline
236 | \bfseries Operator & \textless= \\ \hline
237 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
238 | \bfseries Description & test if smaller or equal. Result is a boolean vector \\ \hline
239 | \bfseries Efficiency & good \\ \hline
240 | \end{tabular}
241 | \begin{lstlisting}[frame=none]
242 | // Example:
243 | Vec4i  a(10, 11, 12, 13);
244 | Vec4i  b(14, 13, 12, 11);
245 | Vec4ib c = a <= b;      // c = (true, true, true, false)
246 | \end{lstlisting}
247 | 
248 | 
249 | 
250 | \vspacesmall
251 | \begin{tabular}{|p{25mm}|p{100mm}|}
252 | \hline
253 | \bfseries Operator & \&, \&= \\ \hline
254 | \bfseries Defined for & all vector classes \\ \hline
255 | \bfseries Description & bitwise and \\ \hline
256 | \bfseries Efficiency & good \\ \hline
257 | \end{tabular}
258 | \begin{lstlisting}[frame=none]
259 | // Example:
260 | Vec4i a(10, 11, 12, 13);
261 | Vec4i b(20, 21, 22, 23);
262 | Vec4i c = a & b;         // c = (0, 1, 4, 5)
263 | \end{lstlisting}
264 | 
265 | 
266 | \vspacesmall
267 | \begin{tabular}{|p{25mm}|p{100mm}|}
268 | \hline
269 | \bfseries Operator & \texttt{$\vert$, $\vert=$} \\ \hline
270 | \bfseries Defined for & all vector classes \\ \hline
271 | \bfseries Description & bitwise or \\ \hline
272 | \bfseries Efficiency & good \\ \hline
273 | \end{tabular}
274 | \begin{lstlisting}[frame=none]
275 | // Example:
276 | Vec4i a(10, 11, 12, 13);
277 | Vec4i b(20, 21, 22, 23);
278 | Vec4i c = a | b;         // c = (30, 31, 30, 31)
279 | \end{lstlisting}
280 | 
281 | 
282 | \vspacesmall
283 | \begin{tabular}{|p{25mm}|p{100mm}|}
284 | \hline
285 | \bfseries Operator & \textasciicircum \\ \hline
286 | \bfseries Defined for & all vector classes \\ \hline
287 | \bfseries Description & bitwise exclusive or \\ \hline
288 | \bfseries Efficiency & good \\ \hline
289 | \end{tabular}
290 | \begin{lstlisting}[frame=none]
291 | // Example:
292 | Vec4i a(10, 11, 12, 13);
293 | Vec4i b(20, 21, 22, 23);
294 | Vec4i c = a ^ b;         // c = (30, 30, 26, 26)
295 | \end{lstlisting}
296 | 
297 | 
298 | \vspacesmall
299 | \begin{tabular}{|p{25mm}|p{100mm}|}
300 | \hline
301 | \bfseries Operator & $\sim$ \\ \hline
302 | \bfseries Defined for & all boolean and integer vector classes \\ \hline
303 | \bfseries Description & bitwise not \\ \hline
304 | \bfseries Efficiency & good \\ \hline
305 | \end{tabular}
306 | \begin{lstlisting}[frame=none]
307 | // Example:
308 | Vec4i a(10, 11, 12, 13);
309 | Vec4i b = ~a;            // b = (-11, -12, -13, -14)
310 | \end{lstlisting}
311 | 
312 | 
313 | \vspacesmall
314 | \begin{tabular}{|p{25mm}|p{100mm}|}
315 | \hline
316 | \bfseries Operator & ! \\ \hline
317 | \bfseries Defined for & all vector classes \\ \hline
318 | \bfseries Description & logical not. Result is a boolean vector \\ \hline
319 | \bfseries Efficiency & good \\ \hline
320 | \end{tabular}
321 | \begin{lstlisting}[frame=none]
322 | // Example:
323 | Vec4i  a(-1, 0, 1, 2);
324 | Vec4ib b = !a;          // b = (false,true,false,false)
325 | \end{lstlisting}
326 | 
327 | %\indenton  % undo flushleft
328 | 
329 | \section{Integer division} \label{IntegerDivision}
330 | 
331 | There are no instructions in the x86 instruction set extensions that are useful for integer vector division, and such instructions might be quite slow if they existed. Therefore, the vector class library is using an algorithm for fast integer division. The basic principle of this algorithm can be expressed in this formula:
332 | \vspacesmall \newline
333 | 	$a / b \approx a * (2^n / b) >> n$ \newline
334 | \vspacesmall
335 | This calculation goes through the following steps:
336 | 
337 | \begin{enumerate}
338 |   \item find a suitable value for n
339 |   \item calculate $2^n / b$
340 |   \item calculate necessary corrections for rounding errors
341 |   \item do the multiplication and shift-right, and apply corrections for rounding errors  
342 | \end{enumerate}
343 | 
344 | This formula is advantageous if multiple numbers are divided by the same divisor b. Steps 1, 2 and 3 need only be done once while step 4 is repeated for each value of the dividend a. The mathematical details are described in the file vectori128.h. (See also T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication, Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation.)
345 | \vspacesmall
346 | 
347 | The implementation in the vector class library uses various variants of this method with appropriate corrections for rounding errors to get the exact result truncated towards zero.
348 | 
349 | The way to use this in your code depends on whether the divisor b is a variable or constant, and whether the same divisor is applied to multiple vectors. This is illustrated in the following examples:
350 | 
351 | \begin{lstlisting}[frame=none]
352 | // Division example A:
353 | // A variable divisor is applied to one vector
354 | Vec4i a(10, 11, 12, 13);// dividend is an integer vector
355 | int   b = 3;            // divisor is an integer variable
356 | Vec4i c = a / b;        // result c = (3, 3, 4, 4)
357 | \end{lstlisting}
358 | 
359 | \begin{lstlisting}[frame=none]
360 | // Division example B:
361 | // The same divisor is applied to multiple vectors
362 | int b = 3;              // divisor
363 | Divisor_i divb(b);      // this object contains the results
364 |                         // of calculation steps 1, 2, and 3
365 | for (...) {             // loop through multiple vectors
366 |     Vec4i a = ...       // get dividend
367 |     a = a / divb;       // do step 4 of the division
368 |     ...                 // store results
369 | }
370 | \end{lstlisting}
371 | 
372 | \begin{lstlisting}[frame=none]
373 | // Division example C:
374 | // The divisor is a constant, known at compile time
375 | Vec4i a(10, 11, 12, 13);     // dividend is integer vector
376 | Vec4i c = a / const_int(3);  // result c = (3, 3, 4, 4)
377 | \end{lstlisting}
378 | 
379 | 
380 | Explanation:
381 | 
382 | The class \codei{Divisor\_i} in example B takes care of the calculation steps 1, 2 and 3 in the algorithm described above. The overloaded \codei{/} operator takes a vector on the left hand side and an object of class \codei{Divisor\_i} on the right hand side. This object is created before the loop with the divisor as parameter to the constructor. We are saving time by doing this time-consuming calculation only once while step 4 in the calculation is done multiple times inside the loop by \codei{a = a / divb;}
383 | \vspacesmall
384 | 
385 | In example A, we are also creating an object of class \codei{Divisor\_i}, but this is done implicitly. The compiler sees an integer on the right hand side of the \codei{/} operator where it needs an object of class \codei{Divisor\_i}, and therefore converts the integer \codei{b} to such an object by calling the constructor \codei{Divisor\_i}(int).
386 | 
387 | \vspacesmall
388 | The following divisor classes are available:
389 | 
390 | \vspacesmall
391 | \begin{tabular}{|p{50mm}|p{50mm}|}
392 | \hline
393 | \bfseries Dividend vector type & \bfseries Divisor class required \\ \hline
394 | Vec16c, Vec32c, Vec64c & Divisor\_s \\ \hline
395 | Vec16uc, Vec32uc, Vec64uc & Divisor\_us \\ \hline
396 | Vec8s, Vec16s, Vec32s & Divisor\_s \\ \hline
397 | Vec8us, Vec16us, Vec32us & Divisor\_us \\ \hline
398 | Vec4i, Vec8i, Vec16i & Divisor\_i \\ \hline
399 | Vec4ui, Vec8ui, Vec16ui & Divisor\_ui \\ \hline
400 | \end{tabular}
401 | \vspacesmall
402 | 
403 | If the divisor is a constant and the value is known at compile time, then we can use the method in example C. The implementation here uses macros and templates to do the calculation steps 1, 2 and 3 at compile time rather than at execution time. This makes the code even faster. The expression to put on the right-hand side of the \codei{/} operator looks as follows:
404 | 
405 | \vspacesmall
406 | \begin{tabular}{|p{50mm}|p{50mm}|}
407 | \hline
408 | \bfseries Dividend vector type & \bfseries Divisor expression \\ \hline
409 | Vec16c, Vec32c, Vec64c & const\_int \\ \hline
410 | Vec16uc, Vec32uc, Vec64uc & const\_uint \\ \hline
411 | Vec8s, Vec16s, Vec32s & const\_int \\ \hline
412 | Vec8us, Vec16us, Vec32us & const\_uint \\ \hline
413 | Vec4i, Vec8i, Vec16i & const\_int \\ \hline
414 | Vec4ui, Vec8ui, Vec16ui & const\_uint \\ \hline
415 | \end{tabular}
416 | \vspacesmall
417 | 
418 | The compiler will generate an error message if the parameter to \codei{const\_int} or \codei{const\_uint} is not a valid compile-time constant. (A valid compile-time constant can contain integer literals and operators, as well as macros that are expanded to compile-time constants, but not ordinary function calls.)
419 | \vspacesmall
420 | 
421 | A further advantage of the method in example C is that the code is able to use different methods for different values of the divisor. The division is particularly fast if the divisor is a power of 2. Make sure to use \codei{const\_int} or \codei{const\_uint} on the right hand side of the \codei{/} operator if you are dividing by 2, 4, 8, 16, etc.
422 | \vspacesmall
423 | 
424 | Division is faster for vectors of 16-bit integers than for vectors of 8-bit or 32-bit integers. There is no support for division of vectors of 64-bit integers. Unsigned division is faster than signed division.
425 | 
426 | 
427 | \chapter{Functions}\label{chap:Functions}
428 | 
429 | \section{Integer functions}
430 | \flushleft
431 | 
432 | \vspacesmall
433 | \begin{tabular}{|p{25mm}|p{100mm}|}
434 | \hline
435 | \bfseries Function & horizontal\_add \\ \hline
436 | \bfseries Defined for & all integer vector classes \\ \hline
437 | \bfseries Description & calculates the sum of all vector elements \\ \hline
438 | \bfseries Efficiency & medium. For best performance, use normal (vertical) addition where possible. \\ \hline
439 | \end{tabular}
440 | \begin{lstlisting}[frame=none]
441 | // Example:
442 | Vec4i a(10, 11, 12, 13);
443 | int   b = horizontal_add(a);  // b = 46
444 | \end{lstlisting}
445 | 
446 | 
447 | \vspacesmall
448 | \begin{tabular}{|p{25mm}|p{100mm}|}
449 | \hline
450 | \bfseries Function & horizontal\_add\_x \\ \hline
451 | \bfseries Defined for & all 8-bit, 16-bit and 32-bit integer vector classes \\ \hline
452 | \bfseries Description & calculates the sum of all vector elements. The sum is calculated with a higher number of bits to avoid overflow
453 |  \\ \hline
454 | \bfseries Efficiency & medium (slower than horizontal\_add) \\ \hline
455 | \end{tabular}
456 | \begin{lstlisting}[frame=none]
457 | // Example:
458 | Vec4i   a(10, 11, 12, 13);
459 | int64_t b = horizontal_add_x(a);  // b = 46
460 | \end{lstlisting}
461 | 
462 | \vspacesmall
463 | \begin{tabular}{|p{25mm}|p{100mm}|}
464 | \hline
465 | \bfseries Function & horizontal\_min, horizontal\_max \\ \hline
466 | \bfseries Defined for & all integer vector classes \\ \hline
467 | \bfseries Description & Returns the lowest or highest element in a vector. \\ \hline
468 | \bfseries Efficiency & medium \\ \hline
469 | \end{tabular}
470 | \begin{lstlisting}[frame=none]
471 | // Example:
472 | Vec4i a(1, 8, -5, 3);
473 | int   b = horizontal_min(a);  // b = -5
474 | int   c = horizontal_max(a);  // c =  8
475 | \end{lstlisting}
476 | 
477 | \vspacesmall
478 | \begin{tabular}{|p{25mm}|p{100mm}|}
479 | \hline
480 | \bfseries Function & add\_saturated\\ \hline
481 | \bfseries Defined for & all 8-bit, 16-bit and 32-bit integer vector classes \\ \hline
482 | \bfseries Description & same as operator +. Overflow is handled by saturation rather than wrap-around \\ \hline
483 | \bfseries Efficiency & fast for 8-bit and 16-bit integers. Medium for 32-bit integers \\ \hline
484 | \end{tabular}
485 | \begin{lstlisting}[frame=none]
486 | // Example:
487 | Vec4i a(0x10000000,  0x20000000,  0x30000000,  0x40000000);
488 | Vec4i b(0x30000000,  0x40000000,  0x50000000,  0x60000000);
489 | Vec4i c = add_saturated(a, b);  
490 | //  c = (0x40000000,  0x60000000,  0x7FFFFFFF,  0x7FFFFFFF)
491 | Vec4i d = a + b;
492 | //  d = (0x40000000,  0x60000000, -0x80000000, -0x60000000)
493 | \end{lstlisting}
494 | 
495 | 
496 | \vspacesmall
497 | \begin{tabular}{|p{25mm}|p{100mm}|}
498 | \hline
499 | \bfseries Function & sub\_saturated\\ \hline
500 | \bfseries Defined for & all 8-bit, 16-bit and 32-bit integer vector classes \\ \hline
501 | \bfseries Description & same as operator -. Overflow is handled by saturation rather than wrap-around \\ \hline
502 | \bfseries Efficiency & fast for 8-bit and 16-bit integers. Medium for 32-bit integers \\ \hline
503 | \end{tabular}
504 | \begin{lstlisting}[frame=none]
505 | // Example:
506 | Vec4i a(-0x10000000,-0x20000000,-0x30000000,-0x40000000);
507 | Vec4i b( 0x30000000, 0x40000000, 0x50000000, 0x60000000);
508 | Vec4i c = sub_saturated(a, b);  
509 | //  c = (-0x40000000,-0x60000000,-0x80000000,-0x80000000)
510 | Vec4i d = a - b;
511 | //  d = (-0x40000000,-0x60000000,-0x80000000, 0x60000000)
512 | \end{lstlisting}
513 | 
514 | 
515 | \vspacesmall
516 | \begin{tabular}{|p{25mm}|p{100mm}|}
517 | \hline
518 | \bfseries Function & max \\ \hline
519 | \bfseries Defined for & all integer vector classes \\ \hline
520 | \bfseries Description & returns the biggest of two values \\ \hline
521 | \bfseries Efficiency & medium for 64-bit integers with instruction sets lower than SSE4.2. Fast otherwise \\ \hline
522 | \end{tabular}
523 | \begin{lstlisting}[frame=none]
524 | Vec4i a(10, 11, 12, 13);
525 | Vec4i b(14, 13, 12, 11);
526 | Vec4i c = max(a, b);  // c = (14, 13, 12, 13)
527 | \end{lstlisting}
528 | 
529 | 
530 | \vspacesmall
531 | \begin{tabular}{|p{25mm}|p{100mm}|}
532 | \hline
533 | \bfseries Function & min \\ \hline
534 | \bfseries Defined for & all integer vector classes \\ \hline
535 | \bfseries Description & returns the smallest of two values \\ \hline
536 | \bfseries Efficiency & medium for 64-bit integers with instruction sets lower than SSE4.2. Fast otherwise \\ \hline
537 | \end{tabular}
538 | \begin{lstlisting}[frame=none]
539 | // Example:
540 | Vec4i a(10, 11, 12, 13);
541 | Vec4i b(14, 13, 12, 11);
542 | Vec4i c = min(a, b);  // c = (10, 11, 12, 11)
543 | \end{lstlisting}
544 | 
545 | 
546 | \vspacesmall
547 | \begin{tabular}{|p{25mm}|p{100mm}|}
548 | \hline
549 | \bfseries Function & abs \\ \hline
550 | \bfseries Defined for & all signed integer vector classes \\ \hline
551 | \bfseries Description & calculates the absolute value \\ \hline
552 | \bfseries Efficiency & medium \\ \hline
553 | \end{tabular}
554 | \begin{lstlisting}[frame=none]
555 | // Example:
556 | Vec4i a(-1, 0, 1, 2);
557 | Vec4i b = abs(a);     // b = (1, 0, 1, 2)
558 | \end{lstlisting}
559 | 
560 | 
561 | \vspacesmall
562 | \begin{tabular}{|p{25mm}|p{100mm}|}
563 | \hline
564 | \bfseries Function & abs\_saturated \\ \hline
565 | \bfseries Defined for & all signed integer vector classes \\ \hline
566 | \bfseries Description & calculates the absolute value. Overflow saturates to make sure the result is never negative when the input is INT\_MIN
567 |  \\ \hline
568 | \bfseries Efficiency & medium (slower than abs) \\ \hline
569 | \end{tabular}
570 | \begin{lstlisting}[frame=none]
571 | // Example:
572 | Vec4i a(-0x80000000, -1, 0, 1);
573 | Vec4i b = abs_saturated(a);  // b=( 0x7FFFFFFF,1,0,1)
574 | Vec4i c = abs(a);            // c=(-0x80000000,1,0,1)
575 | \end{lstlisting}
576 | 
577 | 
578 | \vspacesmall
579 | \begin{tabular}{|p{25mm}|p{100mm}|}
580 | \hline
581 | \bfseries Function & rotate\_left(vector, int) \\ \hline
582 | \bfseries Defined for & all signed integer vector classes \\ \hline
583 | \bfseries Description & rotates the bits of each element. Use a negative count to rotate right \\ \hline
584 | \bfseries Efficiency & 8 bit: poor \newline
585 | 16 bit: medium \newline
586 | 32 and 64 bit: good for AVX512DQ instruction set, medium otherwise.
587 |  \\ \hline
588 | \end{tabular}
589 | \begin{lstlisting}[frame=none]
590 | // Example:
591 | Vec4i a(0x12345678, 0x0000FFFF, 0xA000B000, 0x00000001);
592 | Vec4i b = rotate_left(a, 8);  
593 | // b = (0x34567812, 0x00FFFF00, 0x00B000A0, 0x00000100)
594 | \end{lstlisting}
595 | 
596 | 
597 | \vspacesmall
598 | \begin{tabular}{|p{25mm}|p{100mm}|}
599 | \hline
600 | \bfseries Function & 
601 | vector shift\_bytes\_up\textless n\textgreater(vector)\newline
602 | vector shift\_bytes\_down\textless n\textgreater(vector)
603 |  \\ \hline
604 | \bfseries Defined for & Vec16c, Vec32c, Vec64c \\ \hline
605 | \bfseries Description & shifts the bytes of a vector up or down and inserts zeroes at the vacant places \\ \hline
606 | \bfseries Efficiency &
607 | Vec16c: Good for SSSE3, medium otherwise \newline
608 | Vec32c: Good for AVX2, medium otherwise \newline
609 | Vec64c: Good for AVX512BW, medium otherwise \\ \hline
610 | \end{tabular}
611 | \begin{lstlisting}[frame=none]
612 | // Example:
613 | Vec16c a(10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25);
614 | Vec16c b = shift_bytes_up<5>(a);
615 | // b = (0,0,0,0,0,10,11,12,13,14,15,16,17,18,19,20)
616 | \end{lstlisting}
617 | 
618 | 
619 | 
620 | \section{Floating point simple functions}
621 | 
622 | \vspacesmall
623 | \begin{tabular}{|p{25mm}|p{100mm}|}
624 | \hline
625 | \bfseries Function & horizontal\_add \\ \hline
626 | \bfseries Defined for & all floating point vector classes \\ \hline
627 | \bfseries Description & calculates the sum of all vector elements \\ \hline
628 | \bfseries Efficiency & medium. For best performance, use normal (vertical) addition where possible. \\ \hline
629 | \end{tabular}
630 | \begin{lstlisting}[frame=none]
631 | // Example:
632 | Vec4f a(1.0f, 1.1f, 1.2f, 1.3f);
633 | float b = horizontal_add(a);  // b = 4.6
634 | \end{lstlisting}
635 | 
636 | 
637 | \vspacesmall
638 | \begin{tabular}{|p{25mm}|p{100mm}|}
639 | \hline
640 | \bfseries Function & max \newline min \\ \hline
641 | \bfseries Defined for & all floating point vector classes \\ \hline
642 | \bfseries Description & returns the biggest/smallest of two values \\ \hline
643 | \bfseries Efficiency & good \\ \hline
644 | \end{tabular}
645 | \vspacesmall
646 | 
647 | \codei{max(a,b)} is equivalent to \codei{a > b ? a : b }\\
648 | \codei{min(a,b)} is equivalent to \codei{a < b ? a : b }\\
649 | \vspacesmall
650 | 
651 | These functions will not return a NAN if the first parameter is NAN.\\
652 | These functions make no distinction between 0 and -0.
653 | \begin{lstlisting}[frame=none]
654 | // Example:
655 | Vec4f a(1.0f, 1.1f, 1.2f, 1.3f);
656 | Vec4f b(1.4f, 1.3f, 1.2f, 1.1f);
657 | Vec4f c = max(a, b);         // c = (1.4f, 1.3f, 1.2f, 1.3f)
658 | \end{lstlisting}
659 | 
660 | \vspacesmall
661 | \begin{tabular}{|p{25mm}|p{100mm}|}
662 | \hline
663 | \bfseries Function & maximum \newline minimum \\ \hline
664 | \bfseries Defined for & all floating point vector classes \\ \hline
665 | \bfseries Description & returns the biggest/smallest of two values \\ \hline
666 | \bfseries Efficiency & good, but slower than max / min \\ \hline
667 | \end{tabular}
668 | \vspacesmall
669 | 
670 | These functions are similar to max and min, but they are guaranteed to propagate NAN values.\\
671 | The sign of zero is ignored unless SIGNED\_ZERO is defined.
672 | \vspacesmall
673 | 
674 | \vspacesmall
675 | \begin{tabular}{|p{25mm}|p{100mm}|}
676 | \hline
677 | \bfseries Function & horizontal\_min, horizontal\_max \\ \hline
678 | \bfseries Defined for & all floating point vector classes \\ \hline
679 | \bfseries Description & Returns the lowest or highest element in a vector.\newline
680 | NANs are propagated. The sign of zero is ignored. \\ \hline
681 | \bfseries Efficiency & medium \\ \hline
682 | \end{tabular}
683 | \begin{lstlisting}[frame=none]
684 | // Example:
685 | Vec4f a(1.0f, 8.0f, -5.0f, 3.0f);
686 | float b = horizontal_min(a);  // b = -5.0f
687 | float c = horizontal_max(a);  // c =  8.0f
688 | \end{lstlisting}
689 | 
690 | 
691 | \vspacebig
692 | \begin{tabular}{|p{25mm}|p{100mm}|}
693 | \hline
694 | \bfseries Function & abs \\ \hline
695 | \bfseries Defined for & all floating point vector classes \\ \hline
696 | \bfseries Description & gets the absolute value \\ \hline
697 | \bfseries Efficiency & good \\ \hline
698 | \end{tabular}
699 | \begin{lstlisting}[frame=none]
700 | // Example:
701 | Vec4f a(-1.0f, 0.0f, 1.0f, 2.0f);
702 | Vec4f b = abs(a);  // b = (1.0f, 0.0f, 1.0f, 2.0f)
703 | \end{lstlisting}
704 | \vspacesmall
705 | 
706 | 
707 | \begin{tabular}{|p{25mm}|p{100mm}|}
708 | \hline
709 | \bfseries Function & change\_sign\textless i0, i1, ...\textgreater(vector) \\ \hline
710 | \bfseries Defined for & all floating point vector classes \\ \hline
711 | \bfseries Description & changes sign of selected vector elements.\newline
712 | Each template parameter is 1 for changing sign of the corresponding element, and 0 for no change. \\ \hline
713 | \bfseries Efficiency & good \\ \hline
714 | \end{tabular}
715 | \begin{lstlisting}[frame=none]
716 | // Example:
717 | Vec4f a(10.0f, 11.0f, -12.0f, 13.0f);
718 | Vec4f b = change_sign<0,1,1,0>(a); // b = (10.f, -11.f, 12.f, 13.f)
719 | \end{lstlisting}
720 | \vspacesmall
721 | 
722 | \begin{tabular}{|p{25mm}|p{100mm}|}
723 | \hline
724 | \bfseries Function & sign\_combine(vector a, vector b) \\ \hline
725 | \bfseries Defined for & all floating point vector classes \\ \hline
726 | \bfseries Description & Returns the value of a, with the sign inverted if b has its sign bit set.\newline
727 | Corresponds to select(sign\_bit(b), -a, a) \\ \hline
728 | \bfseries Efficiency & good \\ \hline
729 | \end{tabular}
730 | \begin{lstlisting}[frame=none]
731 | // Example:
732 | Vec4f  a(-2.0f, -1.0f,  0.0f,  1.0f);
733 | Vec4f  b(-10.f,  0.0f, -20.f,  30.f);
734 | Vec4f  c = sign_combine(a, b);  // c = (2.0f, -1.0f, -0.0f, 1.0f)
735 | \end{lstlisting}
736 | \vspacesmall
737 | 
738 | 
739 | \begin{tabular}{|p{25mm}|p{100mm}|}
740 | \hline
741 | \bfseries Function & sign\_bit \\ \hline
742 | \bfseries Defined for & all floating point vector classes \\ \hline
743 | \bfseries Description & returns a boolean vector with true for elements that have the sign bit set, including -0.0, -INF, and -NAN \\ \hline
744 | \bfseries Efficiency & medium \\ \hline
745 | \end{tabular}
746 | \begin{lstlisting}[frame=none]
747 | // Example:
748 | Vec4f  a(-1.0f, 0.0f, 1.0f, 2.0f);
749 | Vec4fb b = sign_bit(a);  // b = (true, false, false, false)
750 | \end{lstlisting}
751 | \vspacesmall
752 | 
753 | 
754 | \begin{tabular}{|p{25mm}|p{100mm}|}
755 | \hline
756 | \bfseries Function & sqrt \\ \hline
757 | \bfseries Defined for & all floating point vector classes \\ \hline
758 | \bfseries Description & calculates the square root \\ \hline
759 | \bfseries Efficiency & poor \\ \hline
760 | \end{tabular}
761 | \begin{lstlisting}[frame=none]
762 | // Example:
763 | Vec4f a(0.0f, 1.0f, 2.0f, 3.0f);
764 | Vec4f b = sqrt(a);  // b = (0.000f, 1.000f, 1.414f, 1.732f)
765 | \end{lstlisting}
766 | 
767 | 
768 | \vspacesmall
769 | \begin{tabular}{|p{25mm}|p{100mm}|}
770 | \hline
771 | \bfseries Function & square \\ \hline
772 | \bfseries Defined for & all floating point vector classes \\ \hline
773 | \bfseries Description & calculates the square \\ \hline
774 | \bfseries Efficiency & good \\ \hline
775 | \end{tabular}
776 | \begin{lstlisting}[frame=none]
777 | // Example:
778 | Vec4f a(0.0f, 1.0f, 2.0f, 3.0f);
779 | Vec4f b = square(a);  // b = (0.0f, 1.0f, 4.0f, 9.0f)
780 | \end{lstlisting}
781 | 
782 | 
783 | \label{powVectorInt}
784 | \vspacesmall
785 | \begin{tabular}{|p{25mm}|p{100mm}|}
786 | \hline
787 | \bfseries Function & pow(vector x, int n) \\ \hline
788 | \bfseries Defined for & all floating point vector classes \\ \hline
789 | \bfseries Description & raises all vector elements to the same integer power. 
790 | Will generate a compiler error if n is floating point and vectormath\_exp.h is not included, or in general if n is not of type int.
791 | See page \pageref{ExpLogFunctions} for pow with floating point exponent.
792 |  \\ \hline
793 | \bfseries Precision & slightly imprecise for high values of n due to accumulation of rounding errors \\ \hline
794 | \bfseries Efficiency & medium \\ \hline
795 | \end{tabular}
796 | \begin{lstlisting}[frame=none]
797 | // Example:
798 | Vec4f a(0.0f, 1.0f, 2.0f, 3.0f);
799 | int   b = 3;
800 | Vec4f c = pow(a, b);  // c = (0.0f, 1.0f, 8.0f, 27.0f)
801 | \end{lstlisting}
802 | 
803 | 
804 | \label{powConstVectorInt}
805 | \vspacesmall
806 | \begin{tabular}{|p{25mm}|p{100mm}|}
807 | \hline
808 | \bfseries Function & pow\_const(vector x, const int n) \\ \hline
809 | \bfseries Defined for & all floating point vector classes \\ \hline
810 | \bfseries Description & raises all vector elements to the same integer power n, where n is a compile-time constant \\ \hline
811 | \bfseries Precision & slightly imprecise for high values of n due to accumulation of rounding errors \\ \hline
812 | \bfseries Efficiency & medium, often better than pow(vector, int) \\ \hline
813 | \end{tabular}
814 | \begin{lstlisting}[frame=none]
815 | // Example:
816 | Vec4f a(0.0f, 1.0f, 2.0f, 3.0f);
817 | Vec4f c = pow_const(a, 3);  // c = (0.0f, 1.0f, 8.0f, 27.0f)
818 | \end{lstlisting}
819 | 
820 | 
821 | \vspacesmall
822 | \begin{tabular}{|p{25mm}|p{100mm}|}
823 | \hline
824 | \bfseries Function & round \\ \hline
825 | \bfseries Defined for & all floating point vector classes \\ \hline
826 | \bfseries Description & round to nearest integer (even value if two values are equally near). The value is returned as a floating point vector.\newline
827 | See also roundi and round\_to\_int32 on page \pageref{roundToInt}. \\ \hline
828 | \bfseries Efficiency & good if SSE4.1 or higher instruction set \\ \hline
829 | \end{tabular}
830 | \begin{lstlisting}[frame=none]
831 | // Example:
832 | Vec4f a(1.0f, 1.4f, 1.5f, 1.6f);
833 | Vec4f b = round(a);   // b = (1.0f, 1.0f, 2.0f, 2.0f)
834 | \end{lstlisting}
835 | 
836 | 
837 | \vspacesmall
838 | \begin{tabular}{|p{25mm}|p{100mm}|}
839 | \hline
840 | \bfseries Function & truncate \\ \hline
841 | \bfseries Defined for & all floating point vector classes \\ \hline
842 | \bfseries Description & truncates number towards zero. The value is returned as a floating point vector. \newline
843 | See also truncatei and truncate\_to\_int32 on page \pageref{truncateToInt}. \\ \hline
844 | \bfseries Efficiency & good if SSE4.1 or higher instruction set \\ \hline
845 | \bfseries Note & may be slightly inaccurate for x \textgreater{} $10^7$ if instruction set is less than SSE4.1 \\ \hline
846 | \end{tabular}
847 | \begin{lstlisting}[frame=none]
848 | // Example:
849 | Vec4f a(1.0f, 1.5f, 1.9f, 2.0f);
850 | Vec4f b = truncate(a);   // b = (1.0f, 1.0f, 1.0f, 2.0f)
851 | \end{lstlisting}
852 | 
853 | 
854 | \vspacesmall
855 | \begin{tabular}{|p{25mm}|p{100mm}|}
856 | \hline
857 | \bfseries Function & floor \\ \hline
858 | \bfseries Defined for & all floating point vector classes \\ \hline
859 | \bfseries Description & rounds number towards $-\infty$. The value is returned as a floating point vector \\ \hline
860 | \bfseries Efficiency & good if SSE4.1 or higher instruction set \\ \hline
861 | \bfseries Note & may be slightly inaccurate for x \textgreater{} $10^7$ if instruction set is less than SSE4.1 \\ \hline
862 | \end{tabular}
863 | \begin{lstlisting}[frame=none]
864 | // Example:
865 | Vec4f a(-0.5f, 1.5f, 1.9f, 2.0f);
866 | Vec4f b = floor(a);   // b = (-1.0f, 1.0f, 1.0f, 2.0f)
867 | \end{lstlisting}
868 | 
869 | 
870 | \vspacesmall
871 | \begin{tabular}{|p{25mm}|p{100mm}|}
872 | \hline
873 | \bfseries Function & ceil \\ \hline
874 | \bfseries Defined for & all floating point vector classes \\ \hline
875 | \bfseries Description & rounds number towards $+\infty$. The value is returned as a floating point vector \\ \hline
876 | \bfseries Efficiency & good if SSE4.1 or higher instruction set \\ \hline
877 | \bfseries Note & may be slightly inaccurate for x \textgreater{} $10^7$ if instruction set is less than SSE4.1 \\ \hline
878 | \end{tabular}
879 | \begin{lstlisting}[frame=none]
880 | // Example:
881 | Vec4f a(-0.5f, 1.1f, 1.9f, 2.0f);
882 | Vec4f b = ceil(a);   // b = (0.0f, 2.0f, 2.0f, 2.0f)
883 | \end{lstlisting}
884 | 
885 | 
886 | \vspacesmall
887 | \begin{tabular}{|p{25mm}|p{100mm}|}
888 | \hline
889 | \bfseries Function & approx\_recipr \\ \hline
890 | \bfseries Defined for & single and half precision floating point vectors \\ \hline
891 | \bfseries Description & fast approximate calculation of reciprocal  \\ \hline
892 | \bfseries Precision & the relative accuracy depends on the instruction set:\newline
893 | Default: $2^{-11}$\newline
894 | AVX512F: $2^{-14}$\newline
895 | AVX512ER: full precision \\ \hline
896 | \bfseries Efficiency & good \\ \hline
897 | \end{tabular}
898 | \begin{lstlisting}[frame=none]
899 | // Example:
900 | Vec4f a(1.5f, 2.0f, 3.0f, 4.0f);
901 | Vec4f b(0.5f, 1.0f, 0.5f, 1.0f);
902 | Vec4f c = a * approx_recipr(b);  // c approximates a/b
903 | \end{lstlisting}
904 | 
905 | 
906 | \vspacesmall
907 | \begin{tabular}{|p{25mm}|p{100mm}|}
908 | \hline
909 | \bfseries Function & approx\_rsqrt \\ \hline
910 | \bfseries Defined for & single and half precision floating point vectors \\ \hline
911 | \bfseries Description & reciprocal square root. Fast approximate calculation of value to the power of -0.5 \\ \hline
912 | \bfseries Precision & the relative accuracy depends on the instruction set:\newline
913 | Default: $2^{-11}$\newline
914 | AVX512F: $2^{-14}$\newline
915 | AVX512ER: full precision \\ \hline
916 | \bfseries Efficiency & good \\ \hline
917 | \end{tabular}
918 | \begin{lstlisting}[frame=none]
919 | // Example:
920 | Vec4f a(1.0f, 2.0f, 3.0f, 4.0f);
921 | Vec4f b = approx_rsqrt(a) * a;  // b approximates sqrt(a)
922 | \end{lstlisting}
923 | \vspacesmall
924 | 
925 | 
926 | \end{document}


--------------------------------------------------------------------------------
/vcl_packages.tex:
--------------------------------------------------------------------------------
 1 | % chapter included in vcl_manual.tex
 2 | \documentclass[vcl_manual.tex]{subfiles}
 3 | \begin{document}
 4 | 
 5 | \chapter{Add-on packages}\label{chap:AddOnPackages}
 6 | \flushleft
 7 | 
 8 | Various extra packages are available with code for special applications. 
 9 | These packages are stored at 
10 | \url{https://github.com/vectorclass/add-on}.
11 | Manuals are included with each package. The add-on packages for VCL include:
12 | 
13 | \begin{description}
14 | 
15 | \item[Container classes.] 
16 |    Container class templates for storing arrays of vectors. More efficient than the standard C++ container class templates. \newline
17 |    This package also contains a class template for matrices where matrix rows are stored as VCL vectors. Various functions are included for accessing matrix elements and rows and for packing and unpacking matrix data.
18 | 
19 | \item[Random number generator.] 
20 |    A high-quality pseudo random number generator. Capable of generating random integer and floating point vectors. Suitable for large multi-threaded applications.
21 | 
22 | \item[Decimal string conversion.] 
23 |    Converts integer vectors to and from comma-separated lists in human-readable decimal ASCII form. Useful for reading and writing comma-separated files.
24 | 
25 | \item[3-dimensional vectors.] 
26 |    Defines 3-dimensional vectors for use in geometry and physics. 
27 |    Includes operators and functions for addition, multiplication, dot product, cross product, and rotation.
28 |    
29 | \item[Complex number vectors.]    
30 |    Defines complex number vectors for use in mathematics and electronics. 
31 |    Includes operators for add, subtract, multiply, divide, and conjugate, as well as functions such as complex square root, exponential function, and logarithm.
32 |    
33 | \item[Quaternions.]    
34 |    Defines quaternions (hypercomplex numbers) for use in mathematics.
35 |    Includes operators for add, subtract, multiply, divide, conjugate, etc.
36 | 
37 | 
38 | % add more packages here
39 | 
40 | 
41 | \end{description}
42 | \vspacesmall
43 | 
44 | 
45 | \end{document}


--------------------------------------------------------------------------------
/vcl_permute_functions.tex:
--------------------------------------------------------------------------------
  1 | % chapter included in vcl_manual.tex
  2 | \documentclass[vcl_manual.tex]{subfiles}
  3 | \begin{document}
  4 | 
  5 | 
  6 | \chapter{Permute, blend, lookup, gather and scatter functions}\label{chap:PermuteBlendEtc}
  7 | 
  8 | \section{Permute functions}\label{PermuteFunctions}
  9 | \flushleft
 10 | 
 11 | \vspacesmall
 12 | \begin{tabular}{|p{30mm}|p{120mm}|}
 13 | \hline
 14 | \bfseries Function & permute..\textless i0, i1, ...\textgreater(vector) \\ \hline
 15 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
 16 | \bfseries Description & permutes vector elements \\ \hline
 17 | \bfseries Efficiency & depends on parameters and instruction set \\ \hline
 18 | \end{tabular}
 19 | \vspacesmall
 20 | 
 21 | The permute functions can move any element of a vector into any position, copy the same element to multiple positions, and set any element to zero.
 22 | \vspacesmall
 23 | 
 24 | The name of the permute function is "permute" followed by the number of vector elements, for example permute4 for Vec4i. The permute function for a vector of $n$ elements has $n$ indexes, which are entered as template parameters in angle brackets. Each index indicates the desired contents of the corresponding element in the result vector. An index $i$ in the interval
 25 | $0 \leq i \leq n-1$ indicates that element number $i$ from the input vector should be placed in the corresponding position in the result vector. An index $i = -1$ gives a zero in the corresponding position. An index $i$ = V\_DC means don't care. This will give whatever implementation is fastest, regardless of what value it puts in this position. The value you get with "don't care" may be different for different implementations or different instruction sets.
 26 | \vspacesmall
 27 | 
 28 | \begin{lstlisting}[frame=none]
 29 | // Example:
 30 | Vec4i a(10, 11, 12, 13);
 31 | Vec4i b = permute4<2,2,3,0>(a);   // b = (12, 12, 13, 10)
 32 | Vec4i c = permute4<-1,-1,1,1>(a); // c = ( 0,  0, 11, 11)
 33 | \end{lstlisting}
 34 | \vspacesmall
 35 | 
 36 | The indexes in angle brackets must be compile-time constants; they cannot contain variables or function calls. If you need variable indexes then use the lookup functions instead (see page \pageref{LookupFunctions}).
 37 | \vspacesmall
 38 | 
 39 | The permute functions use advanced metaprogramming techniques to find the optimal combination of instructions that fits the given set of indexes and the specified instruction set. The optimization criteria include number of instructions, instruction latency, and data cache use. The metaprogramming may produce extra code when compiling in debug mode, but this extra code is eliminated when compiling for release mode with optimization on. The call to a permute function is reduced to just one or a few machine instructions in favorable cases. 
 40 | \vspacesmall
 41 | 
 42 | The performance is generally good when the instruction set SSSE3 or higher is enabled. The performance for permuting vectors of 16-bit integers is medium, and the performance for permuting vectors of 8-bit integers is poor for instruction sets lower than SSSE3. You may get the best performance with instruction set AVX2 or AVX512VL.
 43 | \vspacesmall
 44 | 
 45 | 
 46 | \section{Blend functions}\label{BlendFunctions}
 47 | 
 48 | \vspacesmall
 49 | \begin{tabular}{|p{30mm}|p{120mm}|}
 50 | \hline
 51 | \bfseries Function & blend..\textless i0, i1, ...\textgreater(vector, vector) \\ \hline
 52 | \bfseries Defined for & all integer and floating point vector classes \\ \hline
 53 | \bfseries Description & permutes and blends elements from two vectors \\ \hline
 54 | \bfseries Efficiency & depends on parameters and instruction set \\ \hline
 55 | \end{tabular}
 56 | \vspacesmall
 57 | 
 58 | The blend functions are similar to the permute functions, but with two input vectors. 
 59 | The name of the function is "blend" followed by the number of vector elements, for example blend4 for Vec4i. The blend function for a vector of $n$ elements has $n$ indexes, which are entered as template parameters in angle brackets. Each index indicates the desired contents of the corresponding element in the result vector. The indexes must be compile-time constants.
 60 | An index $i$ in the interval $0 \leq i \leq n-1$ indicates that element number $i$ from the first input vector should be placed in the corresponding position in the result vector. An index $i$ in the interval $n \leq i \leq 2 \cdot n-1$ indicates that element number $i-n$ from the second input vector should be placed in the corresponding position in the result vector. An index $i = -1$ gives a zero in the corresponding position. An index $i$ = V\_DC means don't care.
 61 | \vspacesmall
 62 | 
 63 | The blend functions use metaprogramming in the same way as the permute functions. The performance is similar to that of the permute functions, or slightly lower.
 64 | \vspacesmall
 65 | 
 66 | \begin{lstlisting}[frame=none]
 67 | // Example:
 68 | Vec4i a(10, 11, 12, 13);
 69 | Vec4i b(20, 21, 22, 23);
 70 | Vec4i c = blend4<4,0,6,3>(a, b); // c = (20, 10, 22, 13)
 71 | \end{lstlisting}
 72 | \vspacesmall
 73 | 
 74 | There are different methods you can use if you want to blend inputs from more than two vectors: 
 75 | \vspacesmall
 76 | 
 77 | 1. A binary tree of blend calls, where unused values are set to V\_DC meaning don't care.
 78 | \begin{lstlisting}[frame=none]
 79 | // Example:
 80 | Vec4i a(10, 11, 12, 13);
 81 | Vec4i b(20, 21, 22, 23);
 82 | Vec4i c(30, 31, 32, 33);
 83 | Vec4i d(40, 41, 42, 43);
 84 | Vec4i r = blend4<0,5,V_DC,V_DC>(a, b);// r = (10,21,?,?)
 85 | Vec4i s = blend4<V_DC,V_DC,2,7>(c, d);// s = (?,?,32,43)
 86 | Vec4i t = blend4<0,1,6,7>(r, s);      // t = (10,21,32,43)
 87 | \end{lstlisting}
 88 | \vspacesmall
 89 | 
 90 | 2. Set unused values to zero, then OR the results.
 91 | \begin{lstlisting}[frame=none]
 92 | // Example:
 93 | Vec4i a(10, 11, 12, 13);
 94 | Vec4i b(20, 21, 22, 23);
 95 | Vec4i c(30, 31, 32, 33);
 96 | Vec4i d(40, 41, 42, 43);
 97 | Vec4i r = blend4<0,5,-1,-1>(a, b);// r = (10,21,0,0)
 98 | Vec4i s = blend4<-1,-1,2,7>(c, d);// s = (0,0,32,43)
 99 | Vec4i t = r | s;                  // t = (10,21,32,43)
100 | \end{lstlisting}
101 | \vspacesmall
102 | 
103 | 3. If the input vectors are stored sequentially in memory then use the lookup functions shown below.
104 | \vspacesmall
105 | 
106 | 
107 | \section{Lookup functions}\label{LookupFunctions}
108 | \vspacesmall
109 | 
110 | \begin{tabular}{|p{30mm}|p{120mm}|}
111 | \hline
112 | \bfseries Function & Vec16c lookup16(Vec16c, Vec16c) \newline
113 | Vec32c lookup32(Vec32c, Vec32c) \newline
114 | Vec64c lookup64(Vec64c, Vec64c) \newline
115 | Vec8s lookup8(Vec8s, Vec8s) \newline
116 | Vec16s lookup16(Vec16s, Vec16s) \newline
117 | Vec32s lookup32(Vec32s, Vec32s) \newline
118 | Vec4i lookup4(Vec4i, Vec4i) \newline
119 | Vec8i lookup8(Vec8i, Vec8i) \newline
120 | Vec16i lookup16(Vec16i, Vec16i) \newline
121 | Vec4q lookup4(Vec4q, Vec4q) \newline
122 | Vec8q lookup8(Vec8q, Vec8q) \\ \hline
123 | \bfseries Defined for & Vec16c, Vec32c, Vec64c, Vec8s, Vec16s, Vec32s, Vec4i, Vec8i, Vec16i, Vec4q, Vec8q \\ \hline
124 | \bfseries Description & Permutation with variable indexes. The first input vector contains the indexes, the second input vector is the data source. Each index must be in the range  $0 \leq i \leq n-1$ where n is the number of elements in a vector. \\ \hline
125 | \bfseries Efficiency & 
126 | Vec16i, Vec8q: Good for AVX512F, medium otherwise.  \newline
127 | Vec64c, Vec32s: Good for AVX512VBMI, medium for AVX512BW, poor otherwise. \newline
128 | Vec32c, Vec16s, Vec8i, Vec4i, Vec4q: Good for AVX2, medium otherwise. \newline
129 | Vec16c, Vec8s: Good for SSSE3, poor otherwise. \\ \hline
130 | \end{tabular}
131 | \vspacebig
132 | 
133 | 
134 | \begin{tabular}{|p{30mm}|p{120mm}|}
135 | \hline
136 | \bfseries Function & 
137 | Vec16c lookup32(Vec16c, Vec16c, Vec16c) \newline
138 | Vec64c lookup128(Vec64c, Vec64c, Vec64c) \newline
139 | Vec8s lookup16(Vec8s, Vec8s, Vec8s) \newline
140 | Vec32s lookup64(Vec32s, Vec32s, Vec32s) \newline
141 | Vec4i lookup8(Vec4i, Vec4i, Vec4i) \newline
142 | Vec16i lookup32(Vec16i, Vec16i, Vec16i) \\ \hline
143 | \bfseries Defined for & Vec16c, Vec64c, Vec8s, Vec32s, Vec4i, Vec16i \\ \hline
144 | \bfseries Description & Blend with variable indexes. The first input vector contains the indexes, the following two input vectors contain the data source. Each index must be in the range  $0 \leq i \leq 2\cdot n - 1$ where n is the number of elements in each vector. \\ \hline
145 | \bfseries Efficiency & 
146 | Vec4i, Vec8s: Good for AVX2, medium or poor otherwise. \newline
147 | Vec16i: Good for AVX512, medium or poor otherwise. \newline
148 | Vec64c, Vec32s: Good for AVX512VBMI, medium for AVX512BW, poor otherwise. \newline
149 | Vec16c, Vec8s: Good for SSSE3, poor otherwise. \\ \hline
150 | \end{tabular}
151 | \vspacebig
152 | 
153 | 
154 | \begin{tabular}{|p{30mm}|p{120mm}|}
155 | \hline
156 | \bfseries Function & 
157 | Vec4i lookup16(Vec4i, Vec4i, Vec4i, Vec4i, Vec4i) \newline
158 | Vec16i lookup64(Vec16i, Vec16i, Vec16i, Vec16i, Vec16i) \newline
159 | Vec64c lookup256(Vec64c, Vec64c, Vec64c, Vec64c, Vec64c) \newline
160 | Vec32s lookup128(Vec32s, Vec32s, Vec32s, Vec32s, Vec32s) \\ \hline
161 | \bfseries Defined for & Vec4i, Vec16i, Vec32s, Vec64c \\ \hline
162 | \bfseries Description & Blend with variable indexes. The first input vector contains the indexes, the following four input vectors contain the data source. Each index must be in the range  $0 \leq i \leq 4\cdot n - 1$ where n is the number of elements in each vector. \\ \hline
163 | \bfseries Efficiency & 
164 | Vec4i: Good for AVX2, medium otherwise. \newline
165 | Vec16i: Good for AVX512, medium or poor otherwise. \newline
166 | Vec64c, Vec32s: Good for AVX512VBMI, medium for AVX512BW, poor otherwise. 
167 | \\ \hline
168 | \end{tabular}
169 | \vspacebig
170 | 
171 | 
172 | \begin{tabular}{|p{30mm}|p{120mm}|}
173 | \hline
174 | \bfseries Function & 
175 | Vec8h lookup8(Vec8s, Vec8h) \newline
176 | Vec16h lookup16(Vec16s, Vec16h) \newline
177 | Vec32h lookup32(Vec32s, Vec32h) \newline
178 | Vec4f lookup4(Vec4i, Vec4f) \newline
179 | Vec8f lookup8(Vec8i, Vec8f) \newline
180 | Vec16f lookup16(Vec16i, Vec16f) \newline
181 | Vec2d lookup2(Vec2q, Vec2d) \newline
182 | Vec4d lookup4(Vec4q, Vec4d) \newline
183 | Vec8d lookup8(Vec8q, Vec8d) \\ \hline
184 | \bfseries Defined for & all floating point vector classes \\ \hline
185 | \bfseries Description & Permutation of floating point vectors with integer indexes. Each index must be in the range  $0 \leq i \leq n-1$ where n is the number of elements in a vector. \\ \hline
186 | \bfseries Efficiency & good for AVX2 and later, medium for lower instruction sets \\ \hline
187 | \end{tabular}
188 | \vspacebig
189 | 
190 | 
191 | \begin{tabular}{|p{30mm}|p{120mm}|}
192 | \hline
193 | \bfseries Function & 
194 | Vec8h lookup16(Vec8s, Vec8h, Vec8h) \newline
195 | Vec4f lookup8(Vec4i, Vec4f, Vec4f) \newline
196 | Vec2d lookup4(Vec2q, Vec2d, Vec2d) \\ \hline
197 | \bfseries Defined for & Vec8h, Vec4f, Vec2d \\ \hline
198 | \bfseries Description & Blend of floating point vectors with integer indexes. Each index must be in the range  $0 \leq i \leq 2\cdot n - 1$ where n is the number of elements in a vector. \\ \hline
199 | \bfseries Efficiency & medium \\ \hline
200 | \end{tabular}
201 | \vspacebig
202 | 
203 | 
204 | \begin{tabular}{|p{30mm}|p{120mm}|}
205 | \hline
206 | \bfseries Function &
207 | Vec16c lookup\textless n\textgreater(Vec16c index, void const * table) \newline
208 | Vec32c lookup\textless n\textgreater(Vec32c index, void const * table) \newline
209 | Vec8s lookup\textless n\textgreater(Vec8s index, void const * table) \newline
210 | Vec16s lookup\textless n\textgreater(Vec16s index, void const * table) \newline
211 | Vec4i lookup\textless n\textgreater(Vec4i index, void const * table) \newline
212 | Vec8i lookup\textless n\textgreater(Vec8i index, void const * table) \newline
213 | Vec16i lookup\textless n\textgreater(Vec16i index, void const * table) \newline
214 | Vec4q lookup\textless n\textgreater(Vec4q index, void const * table) \newline
215 | Vec8q lookup\textless n\textgreater(Vec8q index, void const * table) \newline
216 | Vec8h lookup\textless n\textgreater(Vec8s index, void const * table) \newline
217 | Vec16h lookup\textless n\textgreater(Vec16s index, void const * table) \newline
218 | Vec32h lookup\textless n\textgreater(Vec32s index, void const * table) \newline
219 | Vec4f lookup\textless n\textgreater(Vec4i index, float const * table) \newline
220 | Vec8f lookup\textless n\textgreater(Vec8i const \& index, float const * table) \newline
221 | Vec16f lookup\textless n\textgreater(Vec16i const \& index, float const * table) \newline
222 | Vec2d lookup\textless n\textgreater(Vec2q index, double const * table) \newline
223 | Vec4d lookup\textless n\textgreater(Vec4q const \& i, double const * table) \newline
224 | Vec8d lookup\textless n\textgreater(Vec8q const \& i, double const * table) \\ \hline
225 | \bfseries Defined for & all floating point and signed integer vector classes \\ \hline
226 | \bfseries Description & Permute, blend, table lookup, or gather data from array with an integer vector of indexes.\newline
227 | Each index must be in the range  $0 \leq i \leq n-1$, where $n$ is indicated as a template parameter. $n$ must be a positive compile-time constant. 
228 | The range check can be omitted by setting n = INT\_MAX. \\ \hline
229 | \bfseries Efficiency & good for AVX2 and later, medium for lower instruction sets.
230 | Best if n is no bigger than twice the vector length. \\ \hline
231 | \end{tabular}
232 | \vspacebig
233 | 
234 | 
235 | The lookup functions are similar to the permute and blend functions, but with variable indexes. They cannot be used for setting an element to zero, and there is no "don't care" option. The lookup functions can be used for several purposes:
236 | 
237 | \begin{enumerate}
238 | \item permute with variable indexes
239 | \item blend with variable indexes
240 | \item blend from more than two sources
241 | \item table lookup
242 | \item gather non-contiguous data from an array
243 | \end{enumerate}
244 | \vspacesmall
245 | 
246 | The index is always an integer vector. The input can be one or more vectors or an array. The result is a vector of the same type as the input. All elements in the index vector must be in the specified range. The behavior for an index out of range is implementation-dependent and may give any value for the corresponding element. The function may in some cases read up to one vector size past the end of the table for the sake of efficient permutation.
247 | \vspacesmall
248 | 
249 | The lookup functions are not defined for unsigned integer vector types, but the corresponding signed versions can be used. You don't have to worry about overflow when converting unsigned integers to signed here, as long as the result vector is converted back to unsigned.
250 | \vspacebig
251 | 
252 | 
253 | \begin{lstlisting}[frame=none]
254 | // Example of permutation with variable indexes:
255 | Vec4f a(1.0, 1.1, 1.2, 1.3);
256 | Vec4i b(2, 3, 3, 0);
257 | Vec4f c = lookup4(b, a);  // c = (1.2, 1.3, 1.3, 1.0)
258 | 
259 | // Example of blending with variable indexes:
260 | Vec4f a(1.0, 1.1, 1.2, 1.3);
261 | Vec4f b(2.0, 2.1, 2.2, 2.3);
262 | Vec4i c(4, 3, 2, 7);
263 | Vec4f d = lookup8(c,a,b); // d = (2.0, 1.3, 1.2, 2.3)
264 | 
265 | // Example of blending from more than two sources:
266 | float sources[12] = {
267 | 1.0,1.1,1.2,1.3,2.0,2.1,2.2,2.3,3.0,3.1,3.2,3.3};
268 | Vec4i i(11, 0, 5, 5);
269 | Vec4f c = lookup<12>(i, sources); // c = (3.3,1.0,2.1,2.1)
270 | \end{lstlisting}
271 | \vspacebig
272 | 
273 | 
274 | A function with a limited number of possible input values can be replaced by a lookup table. This is useful if table lookup is faster than calculating the function. The following example has a table of the function $y = x^2 - 1$:
275 | 
276 | \begin{lstlisting}[frame=none]
277 | // Table of the function y = x*x-1
278 | int table[6] = {-1,0,3,8,15,24};
279 | Vec4i x(4,2,0,5);
280 | Vec4i y = lookup<6>(x, table);  // y = (15, 3, -1, 24)
281 | 
282 | // Example of gathering non-contiguous data from an array:
283 | float x[16] = { ... };
284 | Vec4i i(0,4,8,12);
285 | Vec4f y = lookup<16>(i, x); // y = (x[0],x[4],x[8],x[12])
286 | \end{lstlisting}
287 | \vspacesmall
288 | 
289 | 
290 | \section{Gather functions}\label{GatherFunctions}
291 | 
292 | \vspacesmall
293 | \begin{tabular}{|p{30mm}|p{120mm}|} \hline
294 | \bfseries Function & 
295 | Vec4i gather4i\textless indexes\textgreater(void const * table) \newline
296 | Vec8i gather8i\textless indexes\textgreater(void const * table) \newline
297 | Vec16i gather16i\textless indexes\textgreater(void const * table) \newline
298 | Vec2q gather2q\textless indexes\textgreater(void const * table) \newline
299 | Vec4q gather4q\textless indexes\textgreater(void const * table) \newline
300 | Vec8q gather8q\textless indexes\textgreater(void const * table) \newline
301 | Vec4f gather4f\textless indexes\textgreater(void const * table) \newline
302 | Vec8f gather8f\textless indexes\textgreater(void const * table) \newline
303 | Vec16f gather16f\textless indexes\textgreater(void const * table) \newline
304 | Vec2d gather2d\textless indexes\textgreater(void const * table) \newline
305 | Vec4d gather4d\textless indexes\textgreater(void const * table) \newline
306 | Vec8d gather8d\textless indexes\textgreater(void const * table) \\ \hline
307 | \bfseries Defined for & Vec4i, Vec8i, Vec16i, Vec2q, Vec4q, Vec8q, \newline
308 | Vec4f, Vec8f, Vec16f, Vec2d, Vec4d, Vec8d \\ \hline
309 | \bfseries Description & Load non-contiguous data from a table. Indexes cannot be negative. There is no option for zeroing or don't care.  \newline
310 | The function may read a full vector and permute it if all indexes are smaller than the vector size.  \\ \hline
311 | \bfseries Efficiency & medium \\ \hline
312 | \end{tabular}
313 | \vspacesmall
314 | 
315 | \begin{lstlisting}[frame=none]
316 | // Example:
317 | int tab[8] = {10,11,12,13,14,15,16,17};
318 | Vec4i a = gather4i<6,4,4,0>(tab);
319 | // a = (16, 14, 14, 10);
320 | \end{lstlisting}
321 | \vspacesmall
322 | 
323 | Use the lookup\textless n\textgreater {} functions instead if you need variable indexes. \newline
324 | \vspacesmall
325 | 
326 | 
327 | \section{Scatter functions}\label{Scatter functions}
328 | 
329 | \begin{tabular}{|p{30mm}|p{120mm}|} \hline
330 | \bfseries Function & scatter\textless indexes\textgreater(Vec4i data, void * array) \newline
331 | scatter\textless indexes\textgreater(Vec8i data, void * array) \newline
332 | scatter\textless indexes\textgreater(Vec16i data, void * array) \newline
333 | scatter\textless indexes\textgreater(Vec2q data, void * array) \newline
334 | scatter\textless indexes\textgreater(Vec4q data, void * array) \newline
335 | scatter\textless indexes\textgreater(Vec8q data, void * array) \newline
336 | scatter\textless indexes\textgreater(Vec4f data, float * array) \newline
337 | scatter\textless indexes\textgreater(Vec8f data, float * array) \newline
338 | scatter\textless indexes\textgreater(Vec16f data, float * array) \newline
339 | scatter\textless indexes\textgreater(Vec2d data, double * array) \newline
340 | scatter\textless indexes\textgreater(Vec4d data, double * array) \newline
341 | scatter\textless indexes\textgreater(Vec8d data, double * array) \\ \hline
342 | \bfseries Defined for & 
343 | Vec4i, Vec8i, Vec16i, Vec2q, Vec4q, Vec8q, \newline
344 | Vec4f, Vec8f, Vec16f, Vec2d, Vec4d, Vec8d \\ \hline
345 | \bfseries Description & Store vector elements into non-contiguous positions in an array. Each vector element is stored in the array position indicated by the corresponding index. An element is not stored if the corresponding index is negative. \\ \hline
346 | \bfseries Efficiency & 
347 | Medium for 512 bit vectors if the AVX512F instruction set is supported. \newline
348 | Medium for 256 bit vectors if AVX512F or, preferably, AVX512VL is supported. \newline
349 | Medium for 128 bit vectors if AVX512VL is supported. \newline
350 | Poor otherwise. \\ \hline
351 | \end{tabular}
352 | \vspacesmall
353 | 
354 | \begin{lstlisting}[frame=none]
355 | // Example:
356 | Vec8i a(10,11,12,13,14,15,16,17);
357 | int array[10] = {0};
358 | scatter<5,4,3,2,-1,-1,7,0>(a, array);
359 | // array = (17,0,13,12,11,10,0,16,0,0)
360 | \end{lstlisting}
361 | \vspacebig
362 | 
363 | 
364 | \begin{tabular}{|p{30mm}|p{120mm}|} \hline
365 | \bfseries Function & 
366 | scatter(Vec4i index, uint32\_t limit, Vec4i data, void * array) \newline
367 | scatter(Vec8i index, uint32\_t limit, Vec8i data, void * array) \newline
368 | scatter(Vec16i index, uint32\_t limit, Vec16i data, void * array) \newline
369 | scatter(Vec2q index, uint32\_t limit, Vec2q data, void * array) \newline
370 | scatter(Vec4i index, uint32\_t limit, Vec4q data, void * array) \newline
371 | scatter(Vec4q index, uint32\_t limit, Vec4q data, void * array) \newline
372 | scatter(Vec8i index, uint32\_t limit, Vec8q data, void * array) \newline
373 | scatter(Vec8q index, uint32\_t limit, Vec8q data, void * array) \newline
374 | scatter(Vec4i index, uint32\_t limit, Vec4f data, float * array) \newline
375 | scatter(Vec8i index, uint32\_t limit, Vec8f data, float * array) \newline
376 | scatter(Vec16i index, uint32\_t limit, Vec16f data, float * array) \newline
377 | scatter(Vec2q index, uint32\_t limit, Vec2d data, double * array) \newline
378 | scatter(Vec4i index, uint32\_t limit, Vec4d data, double * array) \newline
379 | scatter(Vec4q index, uint32\_t limit, Vec4d data, double * array) \newline
380 | scatter(Vec8i index, uint32\_t limit, Vec8d data, double * array) \newline
381 | scatter(Vec8q index, uint32\_t limit, Vec8d data, double * array) \\ \hline
382 | \bfseries Defined for & 
383 | Vec4i, Vec8i, Vec16i, Vec2q, Vec4q, Vec8q, \newline
384 | Vec4f, Vec8f, Vec16f, Vec2d, Vec4d, Vec8d \\ \hline
385 | \bfseries Description & Store vector elements into non-contiguous positions in an array. Each vector element is stored in the array position indicated by the corresponding element of the index vector. An element is not stored if the corresponding index is negative or bigger than or equal to the limit. The limit will typically be the size of the array. \\ \hline
386 | \bfseries Efficiency & 
387 | Medium for 512 bit vectors if the AVX512F instruction set is supported. \newline
388 | Medium for 256 bit vectors if AVX512F or, preferably, AVX512VL is supported. \newline
389 | Medium for 128 bit vectors if AVX512VL is supported. \newline
390 | Poor otherwise. \\ \hline
391 | \end{tabular}
392 | \vspacesmall
393 | 
394 | \begin{lstlisting}[frame=none]
395 | // Example:
396 | Vec8i a(10,11,12,13,14,15,16,17);
397 | Vec8i x(5,4,3,2,-1,99,7,0);
398 | int array[10] = {0};
399 | scatter(x, 5, a, array);
400 | // array = (17,0,13,12,11,0,0,0,0,0)
401 | \end{lstlisting}
402 | \vspacebig
403 | 
404 | The scatter functions are useful for writing sparse arrays. If your arrays are denser, it may be more efficient to permute the vector and then store the whole vector into the array.
405 | \vspacesmall
406 | 
407 | If you want to permute a dataset that is too big for the permute and blend functions, then it is better to use lookup or gather functions than to use scatter functions.
408 | \vspacesmall
409 | 
410 | \end{document}


--------------------------------------------------------------------------------
/vcl_technical_details.tex:
--------------------------------------------------------------------------------
 1 | % chapter included in vcl_manual.tex
 2 | \documentclass[vcl_manual.tex]{subfiles}
 3 | \begin{document}
 4 | 
 5 | 
 6 | \chapter{Technical details}\label{chap:TechnicalDetails}
 7 | \flushleft
 8 | 
 9 | \begin{longtable}[l]{|p{40mm}|p{100mm}|}
10 | \endfirsthead
11 | \label{table:fileList} \\
12 | \endhead
13 | \hline
14 | \bfseries File name & \bfseries Purpose \\ \hline
15 | vcl\_manual.pdf & Instruction manual (this file) \\ \hline
16 | 
17 | vectorclass.h & Top-level C++ header file. This will include several other header files, according to the indicated instruction set \\ \hline
18 | 
19 | instrset.h & Detection of which instruction set the code is compiled for,  various common definitions, and functions that depend on the instruction set. Included by vectorclass.h \\ \hline
20 | 
21 | vectori128.h & Defines classes, operators and functions for integer vectors with a total size of 128 bits. Included by vectorclass.h \\ \hline
22 | 
23 | vectori256.h & Defines classes, operators and functions for integer vectors with a total size of 256 bits for the AVX2 instruction set. Included by vectorclass.h if appropriate \\ \hline
24 | 
25 | vectori256e.h & Defines classes, operators and functions for integer vectors with a total size of 256 bits for instruction sets lower than AVX2. Included by vectorclass.h if appropriate \\ \hline
26 | 
27 | vectori512.h & Defines classes, operators and functions for vectors of 32-bit and 64-bit integers with a total size of 512 bits for the AVX512F instruction set. Included by vectorclass.h if appropriate \\ \hline
28 | 
29 | vectori512e.h & Defines classes, operators and functions for vectors of 32-bit and 64-bit integers with a total size of 512 bits for instruction sets lower than AVX512F. Included by vectorclass.h if appropriate \\ \hline
30 | 
31 | vectori512s.h & Defines classes, operators and functions for vectors of 8-bit and 16-bit integers with a total size of 512 bits for the AVX512BW instruction set. Included by vectorclass.h if appropriate \\ \hline
32 | 
33 | vectori512se.h & Defines classes, operators and functions for vectors of 8-bit and 16-bit integers  with a total size of 512 bits for instruction sets lower than
34 | AVX512BW. Included by vectorclass.h if appropriate \\ \hline
35 | 
36 | vectorf128.h & Defines classes, operators and functions for floating point vectors with a total size of 128 bits. Included by vectorclass.h \\ \hline
37 | 
38 | vectorf256.h & Defines classes, operators and functions for floating point vectors with a total size of 256 bits for the AVX and later instruction sets. Included by vectorclass.h if appropriate \\ \hline
39 | 
40 | vectorf256e.h & Defines classes, operators and functions for floating point vectors with a total size of 256 bits for instruction sets lower than AVX. Included by vectorclass.h if appropriate \\ \hline
41 | 
42 | vectorf512.h & Defines classes, operators and functions for floating point vectors with a total size of 512 bits for the AVX512F and later instruction sets. Included by vectorclass.h if appropriate \\ \hline
43 | 
44 | vectorf512e.h & Defines classes, operators and functions for floating point vectors with a total size of 512 bits for instruction sets lower than AVX512F. Included by vectorclass.h if appropriate \\ \hline
45 | 
46 | vectormath\_exp.h & Optional inline mathematical functions: power, logarithms and exponential functions \\ \hline
47 | 
48 | vectormath\_trig.h & Optional inline mathematical functions: trigonometric and inverse trigonometric functions \\ \hline
49 | 
50 | vectormath\_hyp.h & Optional inline mathematical functions: hyperbolic and inverse hyperbolic functions \\ \hline
51 | 
52 | vectormath\_common.h & Common definitions for vectormath\_exp.h, vectormath\_trig.h and vectormath\_hyp.h \\ \hline
53 | 
54 | vectormath\_lib.h & Optional header file for external mathematical vector function libraries \\ \hline
55 | 
56 | instrset\_detect.cpp & Optional functions for detecting which instruction set is supported at runtime \\ \hline
57 | 
58 | dispatch\_example.cpp & Example of how to make automatic CPU dispatching \\ \hline
59 | 
60 | changelog.txt & VCL version history \\ \hline
61 | 
62 | license.txt & Apache 2.0 license \\ \hline
63 | 
 64 | svml\_patch & Folder containing the library win64patch.lib as well as the source code to build it. Used for fixing a compatibility issue with the Intel SVML library in 64-bit Windows \\ \hline
65 | 
66 | testbench & Folder containing a test bench for testing the VCL library. This is used in the development of VCL, and is not needed by programmers using the VCL. Includes code and documentation. \\ \hline
67 | 
68 | \end{longtable}
69 | %\end{tabular}
70 | \vspacesmall
71 | 
72 | 
73 | 
74 | \end{document}


--------------------------------------------------------------------------------