├── random ├── ranvec1.h ├── ranvec1.cpp ├── freesoftwarelogo.jpg ├── ranvec1_manual.pdf ├── readme.md ├── test_ranvec.cpp ├── physseed.cpp └── ranvec1_manual.tex ├── decimal ├── decimal_manual.pdf ├── freesoftwarelogo.jpg ├── readme.md ├── decimal.h ├── decimal_manual.tex └── testbench_decimal.cpp ├── complex ├── complexvec_manual.pdf ├── freesoftwarelogo.jpg ├── readme.md └── test_complex.lst ├── vector3d ├── freesoftwarelogo.jpg ├── vector3d_manual.pdf ├── test_vector3d.lst ├── readme.md ├── vector3d.h ├── vector3d_manual.tex └── testbench_vector3d.cpp ├── containers ├── containers_manual.pdf ├── README.md ├── general_containers.h └── vector_containers.h ├── quaternion ├── freesoftwarelogo.jpg ├── quaternion_manual.pdf ├── test_quaternion.lst ├── readme.md ├── quaternion_manual.tex └── quaternion.h ├── physical_processors ├── readme.md └── physical_processors.cpp ├── README.md └── license.txt /random/ranvec1.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/add-on/HEAD/random/ranvec1.h -------------------------------------------------------------------------------- /random/ranvec1.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/add-on/HEAD/random/ranvec1.cpp -------------------------------------------------------------------------------- /decimal/decimal_manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/add-on/HEAD/decimal/decimal_manual.pdf -------------------------------------------------------------------------------- /random/freesoftwarelogo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/add-on/HEAD/random/freesoftwarelogo.jpg -------------------------------------------------------------------------------- 
/random/ranvec1_manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/add-on/HEAD/random/ranvec1_manual.pdf -------------------------------------------------------------------------------- /complex/complexvec_manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/add-on/HEAD/complex/complexvec_manual.pdf -------------------------------------------------------------------------------- /complex/freesoftwarelogo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/add-on/HEAD/complex/freesoftwarelogo.jpg -------------------------------------------------------------------------------- /decimal/freesoftwarelogo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/add-on/HEAD/decimal/freesoftwarelogo.jpg -------------------------------------------------------------------------------- /vector3d/freesoftwarelogo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/add-on/HEAD/vector3d/freesoftwarelogo.jpg -------------------------------------------------------------------------------- /vector3d/vector3d_manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/add-on/HEAD/vector3d/vector3d_manual.pdf -------------------------------------------------------------------------------- /containers/containers_manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/add-on/HEAD/containers/containers_manual.pdf -------------------------------------------------------------------------------- /quaternion/freesoftwarelogo.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/add-on/HEAD/quaternion/freesoftwarelogo.jpg -------------------------------------------------------------------------------- /quaternion/quaternion_manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vectorclass/add-on/HEAD/quaternion/quaternion_manual.pdf -------------------------------------------------------------------------------- /physical_processors/readme.md: -------------------------------------------------------------------------------- 1 | physical_processors.cpp 2 | 3 | Detect the number of physical and logical processors on an x86 computer 4 | -------------------------------------------------------------------------------- /containers/README.md: -------------------------------------------------------------------------------- 1 | # C++ container class templates 2 | 3 | * Containers for arrays with fixed and variable size for use with VCL vector classes 4 | * Containers for matrixes with fixed size for use with VCL vector classes 5 | * Containers for arrays with fixed and variable size for use with general types independent of VCL 6 | 7 | See containers_manual.pdf for instructions 8 | -------------------------------------------------------------------------------- /vector3d/test_vector3d.lst: -------------------------------------------------------------------------------- 1 | # Test data for vector3d.h under Vector class library 2 | # Use with runtest.sh from testbench repository 3 | 4 | $compiler=1 5 | $mode=64 6 | $testbench=testbench_vector3d.cpp 7 | $include=../src2 8 | $outfile=t.txt 9 | $seed=1 10 | 11 | 12 | # test case, vector type, return type, instruction set 13 | 14 | # operators and functions 15 | 1 2 3 4 5 8 9 11 12 13 14 15 , Vec3Df Vec3Dd , , 2 6 7 8 9 10 16 | 17 | # conversion 18 | 16 , Vec3Dd , , 4 7 8 9 10 19 | 17 , Vec3Df , , 3 7 8 9 10 
20 | 21 | # constructors etc 22 | 20 21 22 23 24 , Vec3Df Vec3Dd , , 3 5 7 8 9 10 23 | -------------------------------------------------------------------------------- /quaternion/test_quaternion.lst: -------------------------------------------------------------------------------- 1 | # Test data for quaternion.h under VCL 2 | # Use with runtest.sh from testbench repository 3 | 4 | $compiler=1 5 | $mode=64 6 | $testbench=testbench_quaternion.cpp 7 | $include=./ 8 | $outfile=q.txt 9 | $seed=1 10 | 11 | 12 | # test case, vector type, return type, instruction set 13 | 14 | # operators and functions 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 , Quaternion1f Quaternion1d , , 2 6 7 8 9 10 16 | 17 | # constructors etc 18 | 20 21 25 26 27 28 , Quaternion1f Quaternion1d , , 3 5 7 8 9 10 19 | 22 , Quaternion1f , , 4 7 8 9 10 20 | 23 , Quaternion1d , , 3 7 8 9 10 21 | -------------------------------------------------------------------------------- /decimal/readme.md: -------------------------------------------------------------------------------- 1 | # Decimal.cpp 2 | 3 | The decimal ASCII extension to the Vector Class Library contains functions for conversion of integer 4 | vectors to and from comma-separated lists of numbers as human-readable decimal ASCII strings. 5 | This is useful for efficient reading and writing of comma-separated files. 
6 | 7 | ## File list: 8 | 9 | * decimal_manual.pdf: Instructions 10 | * decimal.cpp: Contains functions bin2ascii and ascii2bin 11 | * decimal.h: C++ Header file 12 | * testbench_decimal.cpp: Test program 13 | * decimal_manual.tex: LaTeX source for decimal_manual.pdf 14 | * freesoftwarelogo.jpg: Used by decimal_manual.tex 15 | * readme.md: This file 16 | 17 | -------------------------------------------------------------------------------- /vector3d/readme.md: -------------------------------------------------------------------------------- 1 | # 3-dimensional vectors 2 | 3 | # Add-on package for Vector Class Library 4 | 5 | 3-dimensional vectors are useful in geometry and physics. 6 | The file vector3d.h provides vector classes, operators, and functions for 7 | calculations with 3-D vectors 8 | 9 | ## File list: 10 | * vector3d.h: C++ header file, defining 3-D classes, operators, and functions 11 | * vector3d_manual.pdf: Instruction manual 12 | * testbench_vector3d.cpp: Test program for testing vector3d.h during development. Not required for applications 13 | * test_vector3d.lst: Test cases for testbench_vector3d.cpp 14 | * vector3d_manual.tex: Source for building vector3d_manual.pdf 15 | * freesoftwarelogo.jpg: Used by vector3d_manual.tex 16 | -------------------------------------------------------------------------------- /quaternion/readme.md: -------------------------------------------------------------------------------- 1 | # Quaternion.h 2 | 3 | # Add-on package for Vector Class Library 4 | 5 | Quaternions, or hypercomplex numbers, are a topic in theoretical algebra and quantum physics. 
6 | 7 | The file quaternion.h defines quaternions with single and double precision, including operators + - * / and various functions 8 | 9 | **File list:** 10 | * quaternion.h: C++ header file defining quaternion classes, operators, and functions 11 | * quaternion_manual.pdf: Instruction manual 12 | * testbench_quaternion.cpp: Test program for testing quaternion.h 13 | * test_quaternion.lst: List of test cases for testbench_quaternion.cpp 14 | * quaternion_manual.tex: Source for building quaternion_manual.pdf 15 | * freesoftwarelogo.jpg: Used by quaternion_manual.tex 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Add-on packages for vector class library 2 | 3 | These packages are extensions to the C++ vector class library for specific applications: 4 | 5 | * containers. Container class templates for arrays with fixed and dynamic size and matrixes. 6 | * random. Generates random number vectors of integers or floating point values. Excellent randomness. Suitable for large simulations and multi-threaded applications. 7 | * decimal. Conversion between integer vectors and comma-separated lists in human-readable form (decimal ASCII). Useful for reading and writing comma-separated files. 8 | * vector3d. Three-dimensional vector arithmetics, operators, cross product and dot product. 9 | * complex. Defines complex number vectors. Operators + - * / etc., and functions. Complex exponential function and logarithm. 10 | * quaternion. Hyper-complex numbers 11 | * physical_processors. 
Detect the number of physical and logical processors on an x86 computer 12 | -------------------------------------------------------------------------------- /random/readme.md: -------------------------------------------------------------------------------- 1 | # Ranvec1 2 | 3 | # Random number generator 4 | 5 | Ranvec1 is an efficient high quality pseudo random number generator designed for large vector applications and multi-threaded applications. 6 | 7 | **Features:** 8 | * Vector and scalar output 9 | * Random integers with uniform distribution in an arbitrary interval 10 | * Random floating point numbers with single and double precision 11 | * Suitable for large Monte Carlo simulations 12 | * Suitable for multi-threaded applications 13 | * High security 14 | * High resolution and very long cycle length 15 | * Includes seed generator based on truly random physical process 16 | * Detailed theoretical description available 17 | 18 | 19 | **File list:** 20 | * ranvec1_manual.pdf: Instructions manual 21 | * ranvec1.h: C++ header file 22 | * ranvec1.cpp: C++ code for random number generator 23 | * physseed.cpp: Generator of nondeterministic seed through physical process 24 | * test_ranvec.cpp: test program 25 | * ranvec1_manual.tex: Source for ranvec1_manual.pdf 26 | * freesoftwarelogo.jpg: Used by ranvec1_manual.tex 27 | * readme.md: This file 28 | -------------------------------------------------------------------------------- /complex/readme.md: -------------------------------------------------------------------------------- 1 | # complexvec1.h 2 | 3 | # Defines C++ classes for complex numbers and complex number vectors 4 | 5 | **Features:** 6 | * Defines complex number scalars and vectors 7 | * Vectors of up to 4 double precision, 8 single precision, or 16 half precision complex numbers 8 | * Operators + - * / == != 9 | * Functions abs, sqrt, etc. 
10 | * Complex exponential function and logarithm 11 | 12 | **File list:** 13 | * complexvec_manual.pdf: Instruction manual 14 | * complexvec1.h: C++ header file defining complex number classes, operators, and functions with single and double precision 15 | * complexvecfp16.h: Additional header file defining half precision complex number vectors 16 | * complexvecfp16e.h: Additional header file emulating half precision complex number vectors 17 | * testbench_complex.cpp: Program used for testing complex number vector classes during development. Not needed for application 18 | * test_complex.lst: List of test cases for testbench_complex.cpp 19 | * complexvec_manual.tex: Source for building complexvec_manual.pdf 20 | * freesoftwarelogo.jpg: Used by complexvec_manual.tex 21 | * readme.md: This file 22 | -------------------------------------------------------------------------------- /random/test_ranvec.cpp: -------------------------------------------------------------------------------- 1 | /************************* test_ranvec.cpp ********************************* 2 | * Author: Agner Fog 3 | * Date created: 2019-07-08 4 | * Last modified: 2022-07-16 5 | * Version: 2.02 6 | * Project: add-on package for vector class library 7 | * Description: 8 | * Test program for ranvec1.cpp 9 | * 10 | ******************************************************************************/ 11 | 12 | 13 | #include 14 | 15 | 16 | #ifndef INSTRSET 17 | #define INSTRSET 10 // instruction set 18 | #endif 19 | 20 | #define MAX_VECTOR_SIZE 512 21 | 22 | #include "vectorclass.h" // vector class library 23 | #include "ranvec1.cpp" // random number generator 24 | #include "physseed.cpp" 25 | 26 | 27 | int main() { 28 | // Make instance of random number generator class, type 3. 
29 | Ranvec1 ran(3); 30 | //Ranvec1 ran(3, 0); // constructor with seed 31 | 32 | #if true // initialize with single seed 33 | ran.init(0); 34 | #else // initialize with array of seeds 35 | const int numseeds = 5; 36 | const int seeds[numseeds] = {5,4,3,2,1}; 37 | ran.initByArray(seeds, numseeds); 38 | #endif 39 | 40 | Vec16i ri = ran.random16i(0, 99); // random integers in interval 0 - 99 41 | Vec16f rf = ran.random16f(); // random floats in interval 0 - 1 42 | 43 | for (int i=0; i 1 63 | printf("\nSeed = %08X %08X\n", physicalSeed(), physicalSeed()); 64 | 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /random/physseed.cpp: -------------------------------------------------------------------------------- 1 | /*************************** physseed.cpp ********************************* 2 | * Author: Agner Fog 3 | * Date created: 2014-09-09 4 | * Last modified: 2019-08-08 5 | * Version: 2.01 6 | * Project: add-on package for vector class library 7 | * Description: 8 | * Physical seed generator for random number generator 9 | * 10 | * (c) Copyright 2019 Agner Fog. Apache License version 2.0 or later. 
11 | ******************************************************************************/ 12 | 13 | #include "ranvec1.h" 14 | 15 | #ifdef VCL_NAMESPACE 16 | namespace VCL_NAMESPACE { 17 | #endif 18 | 19 | 20 | /****************************************************************************** 21 | Physical seed generation 22 | ******************************************************************************/ 23 | 24 | // Determine the type of physical seed that can be generated by current CPU: 25 | // Return value: 26 | // 0: No physical seed 27 | // 1: CPU clock (consecutive calls are not independent) 28 | // 2: RDRAND instruction 29 | // 3: RDSEED instruction 30 | int physicalSeedType() { 31 | int abcd[4]; // return values from cpuid instruction 32 | cpuid (abcd, 7); // call cpuid function 7 33 | if (abcd[1] & (1 << 18)) return 3; // ebx bit 18: RDSEED available 34 | cpuid (abcd, 1); // call cpuid function 1 35 | if (abcd[2] & (1 << 30)) return 2; // ecx bit 30: RDRAND available 36 | if (abcd[3] & (1 << 4)) return 1; // edx bit 4: RDTSC available 37 | return 0; 38 | } 39 | 40 | // Get a truly random number based on a physical process. 
41 | // The source of randomness is indicated by physicalSeedType(); 42 | static int physicalSeedTypei = -1; 43 | int physicalSeed() { 44 | if (physicalSeedTypei < 0) { // get the seed type on first call 45 | physicalSeedTypei = physicalSeedType(); 46 | } 47 | uint32_t ran = 0; // random number 48 | switch (physicalSeedTypei) { 49 | case 1: // use RDTSC instruction 50 | ran = (uint32_t)__rdtsc(); 51 | break; 52 | case 2: // use RDRAND instruction 53 | while (_rdrand32_step(&ran) == 0) {} 54 | break; 55 | case 3: // use RDSEED instruction 56 | while (_rdseed32_step(&ran) == 0) {} 57 | break; 58 | } 59 | return (int)ran; // return random number 60 | } 61 | 62 | #ifdef VCL_NAMESPACE 63 | } 64 | #endif 65 | -------------------------------------------------------------------------------- /complex/test_complex.lst: -------------------------------------------------------------------------------- 1 | # Test data for complex1.h under VCL 2 | # To use with runtest.sh from testbench repository 3 | 4 | $compiler=1 5 | 6 | # Maximum instruction set supported by this compiler 7 | # Set to 12 if compiler supports AVX512-FP16 8 | $compilermax=12 9 | 10 | $mode=64 11 | 12 | # Testbench file 13 | #$testbench=testbench_complex.cpp 14 | $testbench=/mnt/c/_Public/VectorClass/special/complex/testbench_complex.cpp 15 | 16 | # Path to include files 17 | #$include=./ 18 | $include=/mnt/c/_Public/VectorClass/src2 19 | 20 | # Intel emulator 21 | $emulator=/home/agner/emulator/sde/sde 22 | 23 | # Output file name 24 | $outfile=test_complex.txt 25 | 26 | # Random number seed 27 | $seed=1 28 | 29 | 30 | # test case, vector type, return type, instruction set 31 | 32 | # half precision: 33 | ################# 34 | 35 | 1 2 3 4 5 6 7 8 9 10 11 , Complex1h Complex2h Complex4h Complex8h Complex16h , , 2 4 6 7 8 9 10 12 36 | 37 | # constructors 38 | 20 21 22 , Complex1h Complex2h Complex4h Complex8h Complex16h , , 6 8 9 10 12 39 | 23 , Complex2h Complex4h Complex8h Complex16h , , 6 8 9 10 12 40 | 41 
| # constructor from two halves, split into two halves 42 | 23 24 , Complex2h Complex4h Complex8h Complex16h , , 5 8 9 10 12 43 | # constructor from four complex scalars 44 | 25 , Complex4h , , 5 8 9 10 12 45 | # constructor from eight complex scalars 46 | 26 , Complex8h , , 5 8 9 10 12 47 | # constructor from 16 complex scalars 48 | 27 , Complex16h , , 5 8 9 10 12 49 | 50 | # Get real/imag part of complex scalar 51 | 30 31 , Complex1h , , 4 8 9 10 12 52 | 53 | # Get real/imag parts of complex vector 54 | 32 33 34 , Complex8h Complex16h , , 4 8 9 10 12 55 | 35 , Complex2h , , 4 8 9 10 12 56 | 36 , Complex4h , , 4 8 9 10 12 57 | 58 | # extract and insert 59 | 39 49 , Complex1h Complex2h Complex4h Complex8h Complex16h , , 6 7 8 9 10 12 60 | 61 | # various functions: extract, ==, !=, select, abs, sqrt, cexp 62 | 40 41 42 43 50 55 56 500 , Complex1h Complex2h Complex4h Complex8h Complex16h , , 3 7 8 9 10 12 63 | 103 104 , Complex1h Complex2h Complex4h Complex8h Complex16h , , 8 9 10 12 64 | 65 | # to float 66 | 51 , Complex1h Complex2h Complex4h Complex8h , , 3 7 8 9 10 12 67 | 68 | 69 | # single and double precision 70 | ############################# 71 | 72 | # operators 73 | 1 2 3 4 5 6 7 8 9 10 11 , Complex1f Complex2f Complex4f Complex8f Complex1d Complex2d Complex4d , , 2 5 7 8 9 10 74 | 75 | # constructors 76 | 20 21 22 23 , Complex1f Complex2f Complex4f Complex8f Complex1d Complex2d Complex4d , , 3 6 8 9 10 77 | 78 | # constructor from two halves 79 | 24 , Complex2f Complex4f Complex8f Complex2d Complex4d , , 4 5 8 9 10 80 | 81 | # constructor from four complex scalars 82 | 25 , Complex4f Complex4d , , 4 5 8 9 10 83 | 84 | # constructor from 8 complex scalars 85 | 26 , Complex8f , , 4 5 8 9 10 86 | 87 | # real, imag scalars 88 | 30 31 , Complex1f Complex1d , , 7 8 10 89 | 90 | # real, imag vectors 91 | 32 33 , Complex2f Complex4f Complex8f Complex2d Complex4d , , 7 8 10 92 | 93 | # extract and insert 94 | 39 49 , Complex1f Complex2f Complex4f Complex8f 
Complex1d Complex2d Complex4d , , 6 7 8 9 10 12 95 | 96 | # interleave real and imag parts 97 | 34 , Complex4f Complex8f Complex2d Complex4d , , 7 8 10 98 | 35 , Complex2f , , 7 8 10 99 | 100 | # to_float 101 | 51 , Complex1d Complex2d Complex4d , , 3 7 8 9 10 102 | 103 | # to_double 104 | 52 , Complex1f Complex2f Complex4f , , 3 7 8 9 10 105 | 106 | # various functions: extract, ==, !=, select, abs, sqrt 107 | 40 41 42 43 50 55 56 60 , Complex1f Complex2f Complex4f Complex8f Complex1d Complex2d Complex4d , , 3 7 8 9 10 108 | 109 | 103 104 , Complex1f Complex2f Complex4f Complex8f , , 8 9 10 110 | # double: no sufficiently accurate reference 111 | # 103 104 , Complex1d Complex2d Complex4d , , 8 9 10 112 | 113 | # cexp, clog 114 | 500 501 , Complex1f Complex2f Complex4f Complex8f Complex1d Complex2d Complex4d , , 3 7 8 9 10 115 | -------------------------------------------------------------------------------- /decimal/decimal.h: -------------------------------------------------------------------------------- 1 | /*************************** decimal.h ************************************* 2 | * Author: Agner Fog 3 | * Date created: 2012-07-08 4 | * Last modified: 2019-07-20 5 | * Version: 2.00 6 | * Project: Extension to vector class library 7 | * Description: 8 | * Functions for conversion between binary number vectors and comma-separated 9 | * decimal ASCII lists. 10 | * 11 | * Please see decimal_manual.pdf for instructions 12 | * 13 | * (c) Copyright 2012-2019 Agner Fog. Apache License version 2.0 or later. 
14 | ******************************************************************************/ 15 | 16 | #pragma once 17 | #include "vectorclass.h" 18 | 19 | #ifdef VCL_NAMESPACE 20 | namespace VCL_NAMESPACE { 21 | #endif 22 | 23 | 24 | /***************************************************************************** 25 | * 26 | * Conversion from binary to decimal ASCII string 27 | * 28 | *****************************************************************************/ 29 | 30 | // Convert binary numbers to decimal ASCII string. 31 | // The numbers will be written to the string as decimal numbers in human-readable format. 32 | // Each number will be right-justified with leading spaces in a field of the specified length. 33 | int bin2ascii ( 34 | Vec4i const a, // vector of integers to convert 35 | char * string, // string to receive the decimal ascii numbers 36 | int fieldlen = 8, // length of each field 37 | int numdat = 4, // number of data 38 | char ovfl = '*', // overflow indicated by this character. 39 | // ovfl = 0 will make the field wide enough to contain the number 40 | char separator = ',', // character to separate fields. 0 for no separator 41 | bool signd = true, // data are interpreted as signed integers 42 | bool term = true); // write a zero-terminated string 43 | 44 | int bin2ascii ( 45 | Vec8i const a, // vector of integers to convert 46 | char * string, // string to receive the decimal ascii numbers 47 | int fieldlen = 8, // length of each field 48 | int numdat = 4, // number of data 49 | char ovfl = '*', // overflow indicated by this character. 50 | // ovfl = 0 will make the field wide enough to contain the number 51 | char separator = ',', // character to separate fields. 
0 for no separator 52 | bool signd = true, // data are interpreted as signed integers 53 | bool term = true); // write a zero-terminated string 54 | 55 | 56 | 57 | /***************************************************************************** 58 | * 59 | * Conversion from comma-separated decimal ASCII string to binary number vector 60 | * 61 | *****************************************************************************/ 62 | 63 | /* 64 | The function ascii2bin shows how it is possible to parse a string of 65 | variable-length fields without looping through the characters of the sting. 66 | It is quite a challenge, though. There are many special cases to take care 67 | of and to test. Whether it is worth the effort depends on whether string 68 | parsing is a bottleneck. In many cases, data transfer is the bottleneck 69 | that limits the speed, not data parsing. 70 | This code may serve as a source of inspiration anyway. 71 | */ 72 | 73 | Vec8i ascii2bin( 74 | const char * string, // ASCII string containing numdat comma-separated integers 75 | int * chars_read, // Number of characters read 76 | int * error, // Errors will be indicated here 77 | int max_stringlen = 64, // Maximum length of string 78 | int numdat = 8, // Expected number of data in string. Max 8 79 | char separator = ','); 80 | 81 | // Error codes returned in *error: 82 | // 1: parameters out of range 83 | // 2: illegal character. value will be interpreted as if this was a space 84 | // 4: misplaced character. value will be zero 85 | // 8: too few separators. value will be zero 86 | // 16: overflow. 
//     value will be INT_MAX or INT_MIN


#ifdef VCL_NAMESPACE
}
#endif

// --------------------------------------------------------------------------------
// /containers/general_containers.h:
// --------------------------------------------------------------------------------
/************************  general_containers.h  *****************************
* Author:        Agner Fog
* Date created:  2022-07-05
* Last modified: 2023-09-13
* Version:       2.02.00
* Description:
* Header file for general container classes
* These containers are independent of the vector class library and intended
* for objects that are not VCL vectors.
* It may not be suitable for objects that have non-standard constructors,
* copy constructors, move constructors, or destructors.
*
* Example:

ContainerG<double> c;                  // make container for type double
c.set_size(10);                        // allocate space for 10 objects
c[2] = 88.8;                           // change value of one object
// print out all objects
for (int i = 0; i < c.size(); i++) printf(" %.1f", c[i]);

* For further instructions, see containers_manual.pdf
*
* (c) Copyright 2022 - 2023 Agner Fog.
* Apache License version 2.0 or later.
******************************************************************************/

#ifndef GENERAL_CONTAINERS_H
#define GENERAL_CONTAINERS_H  20200

// Container class to store a variable number of objects of any type.
// This container does not rely on the vector class library.
//
// Requirements on T: it must be default-constructible (buffers are created
// with new T[n]()) and copy-assignable (objects are moved between buffers and
// user arrays by plain assignment).
template <typename T>
class ContainerG {
protected:
    T * buf;                           // allocated memory buffer containing array
    unsigned int allocatedSize;        // size of allocated buffer, in objects
    unsigned int nobjects;             // number of objects currently used
    void (*errorfunction)(void);       // pointer to error handling function, may be null
public:
    // Constructor: empty container, no buffer, no error handler
    ContainerG() : buf(nullptr), allocatedSize(0), nobjects(0), errorfunction(nullptr) {}

    // Destructor: free allocated memory (delete[] on a null pointer is a no-op)
    ~ContainerG() {
        delete[] buf;
    }

    // Prevent copying the entire container
    // (a copy constructor would have to allocate a new buffer)
    ContainerG(const ContainerG &) = delete;
    ContainerG & operator = (const ContainerG &) = delete;

    // Get size as number of objects
    int size() const {
        return nobjects;
    }

    // Maximum size that can be set without reallocation
    int allocated_size() const {
        return allocatedSize;
    }

    // Set function pointer to error handler.
    // The handler is called when operator [] receives an index out of range.
    void set_error_handler(void (*e)(void)) {
        errorfunction = e;
    }

    // Allocate, reallocate or deallocate buffer of specified size.
    // size is the number of objects.
    // Setting size > allocated_size will allocate more buffer and fill it with zeroes
    // Setting size < allocated_size will decrease size so that some of the data are inaccessible
    // Setting size = 0 will discard all data and de-allocate the buffer.
    void set_size(int size) {
        if (size <= 0) {               // discard everything
            delete[] buf;
            buf = nullptr;  allocatedSize = 0;  nobjects = 0;
            return;
        }
        const unsigned int wanted = static_cast<unsigned int>(size);
        if (wanted <= allocatedSize) { // grow or shrink within allocated size
            nobjects = wanted;
            return;
        }
        // The allocated size must be increased.
        // First time or big increase: allocate only the specified size.
        // Small increase: allocate twice the request to avoid frequent
        // reallocations. The doubling is done in unsigned arithmetic so that
        // it cannot overflow a signed int.
        unsigned int newallocsize = wanted;
        if (wanted < allocatedSize + allocatedSize / 2) {
            newallocsize = wanted * 2u;
        }
        // Allocate before touching any member so that the container is left
        // unchanged if the allocation throws. () means initialize to zero
        T * buf2 = new T[newallocsize]();
        if (buf) {                     // previously allocated buffer exists
            for (unsigned int i = 0; i < allocatedSize; i++) {
                buf2[i] = buf[i];      // copy from old to new buffer
            }
            delete[] buf;              // deallocate old buffer
        }
        buf = buf2;                    // store pointer to new buffer
        allocatedSize = newallocsize;
        nobjects = wanted;             // new used size
    }

    // Access one object.
    // An index out of range (including a negative index) is reported through
    // the error handler if one has been set. A reference to a valid object is
    // returned in any case: the first object, or a zero-initialized placeholder
    // if the container is empty.
    T & operator [] (int index) {
        if (static_cast<unsigned int>(index) < nobjects) {
            return buf[index];         // get reference to object
        }
        if (errorfunction) {           // a handler is optional: never call a null pointer
            (*errorfunction)();        // report error
        }
        if (buf) {
            return buf[0];
        }
        static T placeholder;          // container is empty: there is no buf[0] to refer to
        placeholder = T();             // forget anything written through an earlier bad access
        return placeholder;
    }

    // Load n objects from array p into the container.
    // n is limited to the current size of the container.
    void load(int n, T const * p) {
        if (n <= 0 || !p) return;      // nothing to do
        unsigned int count = static_cast<unsigned int>(n);
        if (count > nobjects) count = nobjects;  // max size
        for (unsigned int i = 0; i < count; i++) {
            buf[i] = p[i];
        }
    }

    // Store n objects from the container to array p.
    // n is limited to the current size of the container.
    void store(int n, T * p) {
        if (n <= 0 || !p) return;      // nothing to do
        unsigned int count = static_cast<unsigned int>(n);
        if (count > nobjects) count = nobjects;  // max size
        for (unsigned int i = 0; i < count; i++) {
            p[i] = buf[i];
        }
    }

    // Get address of internal buffer.
    // warning: address may change (set_size can reallocate or free the buffer)
    T * get_buf() {
        return buf;
    }
};


#endif // GENERAL_CONTAINERS_H

/* --------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

(license text continues below)
*/
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | 179 | Copyright 2012-2019 Agner Fog. 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 
183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /physical_processors/physical_processors.cpp: -------------------------------------------------------------------------------- 1 | /********************* physical_processors.cpp ***************************** 2 | * Author: Agner Fog 3 | * Date created: 2019-10-29 4 | * Last modified: 2021-05-04 5 | * Version: 2.01 6 | * Project: vector class library 7 | * Description: Detect number of physical and logical processors on CPU chip. 8 | * Compile for C++11 or later 9 | * 10 | * (c) Copyright 2019-2021 Agner Fog. 11 | * Apache License version 2.0 or later. 12 | ******************************************************************************* 13 | Some modern CPUs can run two threads in each CPU core when simultaneous 14 | multithreading (SMT, called hyperthreading by Intel) is enabled. 15 | 16 | The number of physical processors is the number of CPU cores. 17 | The number of logical processors is the same number multiplied by the number of 18 | threads that can run simultaneously in each CPU core. 19 | 20 | Simultaneous multithreading will slow down performance when two CPU-intensive 21 | threads running in the same physical processor (CPU core) are competing for the 22 | same resources. Therefore, the optimal number of threads for CPU-intensive 23 | tasks is most likely to be the number of physical processors. 24 | 25 | Tasks that are less CPU-intensive but limited by RAM access, disk access, 26 | network, etc. 
may get an advantage by running as many threads as the number of 27 | logical processors. This will be double the number of physical processors when 28 | simultaneous multithreading is enabled. 29 | 30 | The physicalProcessors function detects the number of physical processors and 31 | logical processors on an x86 computer. This is useful for determining the 32 | optimal number of threads. 33 | 34 | 35 | Note: There are several problems in detecting the number of physical processors: 36 | 37 | 1. The CPUID instruction on Intel CPUs will return a wrong number of logical 38 | processors when SMT (hyperthreading) is disabled. It may be necessary to 39 | compare the number of processors returned by the CPUID instruction with the 40 | number of processors reported by the operating system to detect if SMT is 41 | enabled (AMD processors do not have this problem). 42 | 43 | 2. It is necessary to rely on system functions to detect if there is more than 44 | one CPU chip installed. It is assumed that the status of SMT is the same on 45 | all CPU chips in a system. 46 | 47 | 3. The behavior of VIA processors is undocumented. 48 | 49 | 4. This function is not guaranteed to work on future CPUs. It may need updating 50 | when new CPUs with different configurations or different CPUID functionality 51 | appear. 52 | ******************************************************************************/ 53 | 54 | #include <thread> // std::thread functions 55 | 56 | #ifdef _MSC_VER 57 | #include <intrin.h> // __cpuidex intrinsic function available on microsoft compilers 58 | #endif 59 | 60 | #ifdef VCL_NAMESPACE 61 | namespace VCL_NAMESPACE { 62 | #endif 63 | 64 | // Define interface to CPUID instruction. 
// input:  leaf = eax, subleaf = ecx
// output: output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx
//
// Executes the x86 CPUID instruction once and copies the four result registers
// into the caller-supplied array. The implementation is selected at compile time:
//   - Gnu/Clang: extended inline assembly; the "a", "b", "c", "d" constraints bind
//     the operands to eax, ebx, ecx and edx.
//   - Microsoft: the __cpuidex intrinsic, which fills output[0..3] directly.
//   - anything else: MASM/Intel-syntax inline assembly.
//     NOTE(review): the fallback uses 32-bit register names (esi, eax, ...), so it
//     presumably only works in 32-bit builds - confirm before relying on it.
// subleaf defaults to 0 for leaves that do not take a sub-leaf index.
static inline void cpuid(int output[4], int leaf, int subleaf = 0) {
#if defined(__GNUC__) || defined(__clang__)  // use inline assembly, Gnu/AT&T syntax
    int a, b, c, d;                          // temporaries receiving eax, ebx, ecx, edx
    __asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(leaf), "c"(subleaf) : );
    output[0] = a;
    output[1] = b;
    output[2] = c;
    output[3] = d;

#elif defined (_MSC_VER)                     // Microsoft compiler, intrin.h included
    __cpuidex(output, leaf, subleaf);        // intrinsic function for CPUID

#else                                        // unknown platform. try inline assembly with masm/intel syntax
    // load the inputs, execute CPUID, then store the four registers through esi
    __asm {
        mov eax, leaf
        mov ecx, subleaf
        cpuid;
        mov esi, output
        mov[esi], eax
        mov[esi + 4], ebx
        mov[esi + 8], ecx
        mov[esi + 12], edx
    }
#endif
}

// Function prototype:
// (the default argument lets callers omit the pointer when only the return value is needed)
int physicalProcessors(int * logical_processors = 0);


// Find the number of physical and logical processors supported by CPU
// Parameter:
// logical_processors: an optional pointer to an integer that will receive the number of logical processors.
100 | // Return value: number of physical processors 101 | int physicalProcessors(int * logical_processors) { 102 | int vendor = 0; // CPU vendor: 1 = Intel, 2 = AMD, 3 = VIA, 0 = other 103 | int logicalProc = 1; // number of logical processor cores 104 | int physicalProc = 1; // number of physical processor cores 105 | int procPerCore = 1; // logical cores per physical core 106 | bool hyperthreadingSupported = false; // CPU supports hyperthreading / simultaneous multithreading 107 | int systemProcessors = std::thread::hardware_concurrency(); // number of processors reported by operating system 108 | 109 | int abcd[4] = { 0,0,0,0 }; // CPUID output 110 | cpuid(abcd, 0); // CPUID function 0 111 | 112 | int maxLeaf = abcd[0]; // maximum eax input for CPUID 113 | if (abcd[2] == 0x6C65746E) { // last 4 chars of "GenuineIntel" 114 | vendor = 1; 115 | } 116 | else if (abcd[2] == 0x444D4163) { // last 4 chars of "AuthenticAMD" 117 | vendor = 2; 118 | } 119 | else if (abcd[2] == 0x736C7561) { // last 4 chars of "CentaurHauls" 120 | vendor = 3; 121 | } 122 | 123 | if (maxLeaf >= 1) { 124 | cpuid(abcd, 1); 125 | if (abcd[3] & (1 << 28)) { // hyperthreading supported 126 | hyperthreadingSupported = true; 127 | } 128 | } 129 | 130 | if (vendor == 1) { 131 | ////////////////// 132 | // Intel // 133 | ////////////////// 134 | 135 | int hyper = 0; // hyperthreading status: 0 = unknown, 1 = disabled, 2 = enabled 136 | if (maxLeaf >= 0xB) { // leaf 0xB or 0x1F: Extended Topology Enumeration 137 | int num = 0xB; 138 | // if (maxLeaf >= 0x1F) num = 0x1F; 139 | 140 | for (int c = 0; c < 5; c++) { 141 | cpuid(abcd, num, c); // enumeration level c 142 | int type = (abcd[2] >> 8) & 0xFF;// enumeration type at level c 143 | if (type == 1) { // SMT level 144 | procPerCore = abcd[1] & 0xFFFF; 145 | } 146 | else if (type >= 2) { // core level 147 | logicalProc = abcd[1] & 0xFFFF; 148 | } 149 | else if (type == 0) break; 150 | // There are more types/levels to consider if we use num = 0x1F. 
We may need 151 | // to fix this in the future if CPUs with more complex configurations appear 152 | } 153 | physicalProc = logicalProc / procPerCore; 154 | 155 | // The number of performance monitor registers depends on hyperthreading status 156 | // on Intel CPUs with performance monitoring version 3 or 4 157 | cpuid(abcd, 0xA, 0); // performance monitor counters information 158 | int perfVersion = abcd[0] & 0xFF; // performance monitoring version 159 | int perfNum = (abcd[0] >> 8) & 0xFF; // number of performance monitoring registers 160 | if (perfVersion == 3 || perfVersion == 4) { 161 | if (perfNum == 4) { 162 | hyper = 2; // 4 performance registers when hyperthreading enabled 163 | } 164 | else if (perfNum == 8) { // 8 performance registers when hyperthreading disabled 165 | hyper = 1; 166 | procPerCore = 1; 167 | logicalProc = physicalProc; // reduce the number of logical processors when hyperthreading is disabled 168 | } 169 | // hyper remains 0 in all other cases, indicating unknown status 170 | } 171 | } 172 | else if (maxLeaf >= 4) { // CPUID function 4: cache parameters and cores 173 | cpuid(abcd, 4); 174 | logicalProc = (abcd[0] >> 26) + 1; 175 | if (hyperthreadingSupported) { 176 | // number of logical processors per core is not known. Assume 2 if hyperthreading supported 177 | procPerCore = 2; 178 | } 179 | physicalProc = logicalProc / procPerCore; 180 | } 181 | else { 182 | // no information. Assume 1 processor 183 | } 184 | if (systemProcessors > logicalProc) { 185 | // Multiple CPU chips. 
Assume that chips are identical with respect to hypethreading 186 | physicalProc = systemProcessors * physicalProc / logicalProc; 187 | logicalProc = systemProcessors; 188 | } 189 | else if (logicalProc > systemProcessors && systemProcessors > 0 && hyper == 0) { 190 | // Hyperthreading is disabled 191 | logicalProc = systemProcessors; 192 | physicalProc = systemProcessors; 193 | } 194 | } 195 | else if (vendor == 2) { 196 | 197 | ////////////////// 198 | // AMD // 199 | ////////////////// 200 | 201 | cpuid(abcd, 0x80000000); // AMD specific CPUID functions 202 | int maxLeaf8 = abcd[0] & 0xFFFF; // maximum eax 0x8000.... input for CPUID 203 | 204 | if (maxLeaf8 >= 8) { 205 | cpuid(abcd, 0x80000008); 206 | logicalProc = (abcd[2] & 0xFF) + 1; 207 | 208 | if (maxLeaf8 >= 0x1E) { 209 | cpuid(abcd, 0x8000001E); 210 | procPerCore = ((abcd[1] >> 8) & 0x03) + 1; 211 | // procPerCore = 2 if simultaneous multithreading is enabled, 1 if disabled 212 | } 213 | else { 214 | if (hyperthreadingSupported) { 215 | procPerCore = 2; 216 | } 217 | else { 218 | procPerCore = 1; 219 | } 220 | } 221 | physicalProc = logicalProc / procPerCore; 222 | } 223 | else if (hyperthreadingSupported) { 224 | // number of logical processors per core is not known. Assume 2 if SMT supported 225 | logicalProc = 2; 226 | physicalProc = 1; 227 | } 228 | if (systemProcessors > logicalProc) { 229 | // Multiple CPU chips. Assume that chips are identical with respect to SMT 230 | physicalProc = systemProcessors * physicalProc / logicalProc; 231 | logicalProc = systemProcessors; 232 | } 233 | } 234 | else { 235 | 236 | ////////////////////////////// 237 | // VIA or unknown CPU // 238 | ////////////////////////////// 239 | 240 | // The behavior of VIA processors is undocumented! 
It is not known how to detect threads on a VIA processor 241 | physicalProc = logicalProc = systemProcessors; 242 | if (hyperthreadingSupported && physicalProc > 1) { 243 | physicalProc /= 2; 244 | } 245 | } 246 | if (logical_processors) { 247 | // return logical_processors if pointer is not null 248 | *logical_processors = logicalProc; 249 | } 250 | return physicalProc; 251 | } 252 | 253 | #ifdef VCL_NAMESPACE 254 | } 255 | #endif 256 | 257 | /* Uncomment this for testing: 258 | 259 | #include 260 | 261 | int main() { 262 | 263 | int logicalProc = 0; 264 | int physicalProc = physicalProcessors(&logicalProc); 265 | 266 | printf("\nlogical processors: %i", logicalProc); 267 | printf("\nphysical processors: %i", physicalProc); 268 | printf("\nlogical processors per core: %i", logicalProc / physicalProc); 269 | int sysproc = std::thread::hardware_concurrency(); 270 | printf("\nsystem processors: %i", sysproc); 271 | 272 | return 0; 273 | } 274 | */ 275 | -------------------------------------------------------------------------------- /decimal/decimal_manual.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt,a4paper,oneside,openright]{report} 2 | 3 | \usepackage[bindingoffset=5mm,left=20mm,right=20mm,top=20mm,bottom=20mm,footskip=10mm]{geometry} 4 | \usepackage[utf8x]{inputenc} 5 | \usepackage{hyperref} 6 | \usepackage[english]{babel} 7 | \usepackage{listings} 8 | \usepackage{subfiles} 9 | \usepackage{longtable} 10 | \usepackage{multirow} 11 | \usepackage{ragged2e} 12 | \usepackage{cmap} % avoid fi ligatures in pdf file 13 | \usepackage{amsthm} % example numbering 14 | \usepackage{color} 15 | %\usepackage{bold-extra} % for bold tt font. 
Remember to include bold-extra.sty file 16 | \usepackage{graphicx} 17 | \usepackage[yyyymmdd]{datetime} 18 | \usepackage{float} 19 | 20 | % style for code listing 21 | \renewcommand{\familydefault}{\sfdefault} 22 | \newtheorem{example}{Example}[chapter] % example numbering 23 | \lstset{language=C} % formatting for code listing 24 | \lstset{basicstyle=\ttfamily,breaklines=true} 25 | \definecolor{darkGreen}{rgb}{0,0.4,0} 26 | \definecolor{mybrown}{rgb}{0.40,0.10,0.05} 27 | \lstset{commentstyle=\color{darkGreen}} % comments color 28 | \lstset{keywordstyle=\color{blue}} % keyword color 29 | \lstset{stringstyle=\color{mybrown}} % string color 30 | \lstset{showstringspaces=false} % don't mark spaces in strings 31 | 32 | \renewcommand{\dateseparator}{-} 33 | 34 | % command for turning indent back on after \flushleft 35 | \newcommand{\indenton}{\RaggedRight\parindent=15pt} 36 | 37 | % command for vertical space 38 | \newcommand{\vspacesmall}{\vspace{3mm}} 39 | \newcommand{\vspacebig}{\vspace{6mm}} 40 | 41 | % style for code inlined in text: 42 | \newcommand{\codei}[1]{\bfseries \ttfamily{#1}\normalfont} 43 | 44 | 45 | \begin{document} 46 | 47 | \begin{titlepage} 48 | \centering 49 | 50 | \null %empty box needed for vfill to work 51 | \vfill 52 | 53 | {\bfseries\Huge 54 | decimal.cpp 55 | \vspacesmall 56 | 57 | Functions for conversion of integer vectors to and from 58 | comma-separated lists of 59 | decimal ASCII numbers 60 | \vspacesmall 61 | 62 | Extension to C++ vector class library 63 | \vspacebig 64 | 65 | } 66 | \vspacebig 67 | 68 | {\Large 69 | Agner Fog 70 | \vspacebig 71 | 72 | \copyright\ \today. 
Apache license 2.0 73 | } 74 | 75 | \vfill 76 | 77 | \includegraphics[width=306pt]{freesoftwarelogo.jpg} 78 | \vfill 79 | 80 | \end{titlepage} 81 | 82 | \RaggedRight 83 | 84 | \chapter{Introduction}\label{chap:Introduction} 85 | The decimal ASCII extension to the Vector Class Library contains functions for conversion of integer vectors to and from comma-separated lists of numbers as human-readable decimal ASCII strings. This is useful for efficient reading and writing of comma-separated files. 86 | \vspacesmall 87 | 88 | These functions cannot read or write floating point numbers. If you have fractional numbers, then you may consider if the numbers can be converted to integers by appropriate scaling. For example, if you have dollars with two decimals, then you can multiply the numbers by 100 to get cents as integer numbers. This will make data handling faster. 89 | \vspacesmall 90 | 91 | These functions are highly efficient. Whether this efficiency actually shows in the overall program performance depends on whether string processing is a bottleneck. In many applications, the transfer of data files is the limiting bottleneck, rather than string processing. 92 | \vspacesmall 93 | 94 | Anyway, the code presented here can serve as an interesting showcase. The code illustrates how strings can be processed or parsed in parallel using vector instructions in a highly efficient way. The ascii2bin function shows that it is possible to parse a string with variable-length fields without looping through the characters of the string. 95 | \vspacesmall 96 | 97 | 98 | \section{Compiling} \label{Compiling} 99 | The decimal extension to the Vector Class Library consists of the files decimal.cpp and decimal.h. The decimal.cpp file is added to the project that needs it, and the decimal.h file is \#included in C++ files that call these functions. 
100 | \vspacesmall 101 | 102 | The decimal extension to the Vector Class Library is compiled in the same way as the Vector Class Library itself. All x86 and x86-64 platforms are supported, including Windows, Linux, and Mac OS. 103 | The following C++ compilers can be used: Gnu, Clang, Microsoft, and Intel. 104 | See the manual for the vector class library for further details. 105 | 106 | 107 | \chapter{Binary to decimal ASCII conversion}\label{chap:b2aConversion} 108 | 109 | The bin2ascii function has the following parameters: 110 | 111 | \begin {table}[H] 112 | \caption{bin2ascii function} 113 | \label{table:bin2asciiFunction} 114 | \begin{tabular}{|p{24mm}|p{130mm}|} 115 | \hline 116 | \bfseries Parameter & \bfseries Description \\ \hline 117 | Vec4i a \newline Vec8i a & A vector of four or eight signed or unsigned integers. 118 | \\ \hline 119 | char * string & This char array will receive the ASCII string of decimal numbers. It is the responsibility of the programmer that the array is big enough to contain the resulting string, even in case of overflow. \\ \hline 120 | int fieldlen & Desired length of each field in the output list. \\ \hline 121 | int numdat & Number of data fields to write. The maximum value is 4 or 8 for Vec4i and Vec8i, respectively. \\ \hline 122 | char ovfl & This ASCII character will indicate overflow if a number is too big to fit into a field of size \codei{fieldlen}. The default value is '*'. The field will be extended to fit the number if \codei{ovfl} is set to 0 (without quotes). \\ \hline 123 | char separator & This ASCII character will be used as the separator between the number fields. The default value is ','. No separator will be used if \codei{separator} is set to 0 (without quotes). \\ \hline 124 | bool signd & Set this to \codei{true} (default) to write signed numbers. Set to \codei{false} if the input vector should be interpreted as unsigned numbers. 
\\ \hline 125 | bool term & Indicates whether the written ASCII string should be zero-terminated. The default is true. A terminating zero will not be written if \codei{term} is false. \\ \hline 126 | Return value & The bin2ascii function returns the length of the written string. \\ \hline 127 | \end{tabular} 128 | \end{table} 129 | \vspacebig 130 | 131 | 132 | This example shows how to use the bin2ascii function: 133 | 134 | \begin{example} 135 | \label{example1} 136 | \end{example} % frame disappears if I put this after end lstlisting 137 | \begin{lstlisting}[frame=single] 138 | // Example for binary to decimal ASCII conversion 139 | #include <stdio.h> 140 | #include "vectorclass.h" 141 | #include "decimal.h" 142 | #include "decimal.cpp" 143 | 144 | int main() { 145 | // make a vector of eight integers 146 | Vec8i a(1, 20, 300, 4000, -12345, 67890, 1234567890, 8); 147 | // make a char array big enough to hold the string 148 | char text[100]; 149 | // convert to human-readable decimal ASCII numbers 150 | bin2ascii(a, text, 6, 8, '*', ',', true, true); 151 | // print text 152 | printf("List of numbers:\n%s\n", text); 153 | return 0; 154 | } 155 | /* The output will be: 156 | List of numbers: 157 | 1, 20, 300, 4000,-12345, 67890,******, 8 158 | */ 159 | 160 | \end{lstlisting} 161 | \vspacesmall 162 | 163 | 164 | 165 | \chapter{Decimal ASCII to binary conversion}\label{chap:a2bConversion} 166 | 167 | The ascii2bin function has the following parameters: 168 | 169 | \begin {table}[H] 170 | \caption{ascii2bin function} 171 | \label{table:ascii2binFunction} 172 | \begin{tabular}{|p{32mm}|p{120mm}|} 173 | \hline 174 | \bfseries Parameter & \bfseries Description \\ \hline 175 | const char * string & An ASCII string containing integer numbers separated by comma or by some other separator character. \\ \hline 176 | int * chars\_read & This parameter will receive the number of characters that the function has read. 
In other words, the part of the string that has been used by the function. \\ \hline 177 | int * error & This parameter will receive an indication of any errors. The error codes are listed below. \\ \hline 178 | int max\_stringlen & The maximum length of the string that the function is allowed to read. Any contents after max\_stringlen will be ignored. 179 | The string may be shorter than max\_stringlen if terminated by a zero or newline. \\ \hline 180 | int numdat & The number of data fields to read. The maximum value is 8. \\ \hline 181 | char separator & The character used as separator between numbers. The default is ',' \\ \hline 182 | Return value & The function returns a vector of type Vec8i, containing up to eight signed integers. \\ \hline 183 | \end{tabular} 184 | \end{table} 185 | \vspacesmall 186 | 187 | The input string must be an ASCII string using the following syntax. 188 | The string can contain up to eight fields, each containing a signed or unsigned integer. The fields are separated by the character indicated as \codei{separator}. The default separator is a comma. Each field can contain an optional sign (\codei{+} or \codei{-}) followed by any number of digits 0 - 9. Spaces are allowed before and after the number, and between the sign and the number. No other characters are allowed. Nothing is allowed between the digits. 189 | \vspacesmall 190 | 191 | A separator (comma) after the last field is not required, but it can be useful to prevent the function from reading any irrelevant text that comes after the relevant fields, which would cause syntax errors. A terminating separator will be included in \codei{chars\_read}. 
192 | \vspacesmall 193 | 194 | The following error codes are returned in \codei{*error} in case of syntax errors in the string: 195 | 196 | \begin {table}[H] 197 | \caption{ascii2bin error codes} 198 | \label{table:ascii2binErrorCodes} 199 | \begin{tabular}{|p{24mm}|p{130mm}|} 200 | \hline 201 | \bfseries Error code & \bfseries Description \\ \hline 202 | 1 & Parameters out of range. This happens if numdat \textgreater{} 8 or max\_stringlen \textgreater{} 10000. \\ \hline 203 | 2 & Illegal character. This happens if a numeric field contains any other characters than +, -, 0-9, or space. The value will be interpreted as if the illegal character was a space, if possible. \\ \hline 204 | 4 & Misplaced character. This happens if a + or - sign is placed after the number rather than before the number, or if there is anything between the digits. \newline The resulting value will be zero. \\ \hline 205 | 8 & Too few separators. The string has less than \codei{numdat-1} separators. The remaining values will be zero. \\ \hline 206 | 16 & Overflow. A value is too big to fit into a 32-bit signed integer. The resulting value will be INT\_MAX or INT\_MIN. \\ \hline 207 | 0 & Missing value. An empty field will be interpreted as a zero. This is not indicated as an error. \\ \hline 208 | \end{tabular} 209 | \end{table} 210 | 211 | The error codes can be combined if the string has multiple syntax errors. 212 | 213 | \vspacesmall 214 | Any control characters in the string, such as newline or tab, will be interpreted as end of string, unless the same character is indicated as separator. A Windows newline cannot be used as separator because it consists of two control characters (\textbackslash r\textbackslash n). 
215 | \vspacebig 216 | 217 | This example shows how to use the ascii2bin function: 218 | 219 | \begin{example} 220 | \label{example2} 221 | \end{example} % frame disappears if I put this after end lstlisting 222 | \begin{lstlisting}[frame=single] 223 | // Example for conversion from comma-separated decimal ASCII 224 | // string to binary vector 225 | #include 226 | #include "vectorclass.h" 227 | #include "decimal.h" 228 | #include "decimal.cpp" 229 | 230 | int main() { 231 | // text string to interpret 232 | char text[] = " 1, -20, 30, , 555, -6, 7000, 88888 "; 233 | 234 | // length and error will be returned in these variables 235 | int length, error; 236 | 237 | // interpret the comma-separated string 238 | Vec8i dat = ascii2bin(text, &length, &error, 64, 8, ','); 239 | 240 | // check if syntax error 241 | if (error) { 242 | printf ("\nerror = 0x%X",error); 243 | } 244 | else { 245 | // write results 246 | for (int i = 0; i < 8; i++) { 247 | printf("%i ", dat[i]); 248 | } 249 | } 250 | return 0; 251 | } 252 | // Program output: 253 | // 1 -20 30 0 555 -6 7000 88888 254 | \end{lstlisting} 255 | \vspacesmall 256 | 257 | 258 | The \codei{chars\_read} variable tells where to begin the next read if the string or line contains more than eight numbers. 
This is illustrated in the next example: 259 | 260 | \begin{example} 261 | \label{example3} 262 | \end{example} % frame disappears if I put this after end lstlisting 263 | \begin{lstlisting}[frame=single] 264 | // Example for converting a comma-separated decimal ASCII 265 | // string containing more than eight numbers 266 | #include <stdio.h> 267 | #include "vectorclass.h" 268 | #include "decimal.h" 269 | #include "decimal.cpp" 270 | 271 | int main() { 272 | // text string containing twelve numbers 273 | char text[] = " 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37"; 274 | 275 | // length and error will be returned in these variables 276 | int length1, length2, error; 277 | 278 | // data vectors of eight integers each 279 | Vec8i dat1, dat2; 280 | 281 | // read first eight numbers 282 | dat1 = ascii2bin(text, &length1, &error, 64, 8, ','); 283 | 284 | // check if syntax error 285 | if (!error) { 286 | // read the next four numbers 287 | dat2 = 288 | ascii2bin(text + length1, &length2, &error, 64, 4, ','); 289 | } 290 | 291 | // check if syntax error 292 | if (error) { 293 | printf ("\nerror 0x%X",error); 294 | } 295 | else { 296 | // join the two data vectors 297 | Vec16i dat12(dat1, dat2); 298 | 299 | // write results 300 | for (int i = 0; i < 12; i++) { 301 | printf("%i ", dat12[i]); 302 | } 303 | } 304 | return 0; 305 | } 306 | \end{lstlisting} 307 | \vspacesmall 308 | 309 | 310 | \section{Efficiency} \label{Efficiency} 311 | 312 | The ascii2bin function can be highly efficient. The performance is highest if the following conditions are satisfied: 313 | 314 | \begin{itemize} 315 | \item The input string is no more than 64 characters long 316 | \item No number is more than 8 characters long, including sign 317 | \item The code is compiled for the highest instruction set supported by the CPU it is running on. 
The following instruction set extensions give particular advantage: AVX2, AVX512BW, and the future AVX512VBMI2\end{itemize} 318 | \vspacesmall 319 | 320 | 321 | \end{document} 322 | -------------------------------------------------------------------------------- /containers/vector_containers.h: --------------------------------------------------------------------------------
/************************  vector_containers.h   ******************************
* Author:        Agner Fog
* Date created:  2022-07-04
* Last modified: 2023-11-25
* Version:       2.02.00
* Project:       vector class library
* Description:
* Header file for container classes
* These containers can contain vector class objects and matrixes
*
* For instructions, see containers_manual.pdf
*
* (c) Copyright 2022 - 2023 Agner Fog.
* Apache License version 2.0 or later.
******************************************************************************/

#ifndef VECTOR_CONTAINERS_H
#define VECTOR_CONTAINERS_H  20200

#ifdef VCL_NAMESPACE
namespace VCL_NAMESPACE {
#endif


// Container class to store n vector class objects of type V.
//
// All indexes are range checked. An out-of-range index calls the error
// handler installed with set_error_handler(), if one is installed.
// Afterwards an out-of-range read returns NAN (floating point element types)
// or 0 (integer element types), and an out-of-range write is ignored.
template <typename V, int n>
class ContainerV {
protected:
    V buf[n];                                    // array of vectors
    void (*errorfunction)(void) = nullptr;       // pointer to error handling function, null if none
    int s_count() const {                        // used internally
        constexpr int s = V::size();             // vector size
        static_assert((s & (s - 1)) == 0, "vector size must be power of 2"); // check that vector size is a power of 2
        return bit_scan_reverse_const(s);        // shift count for fast division by vector size
    }
    void report_error() const {                  // used internally: report index out of range
        // the handler is optional. Calling through a null pointer would be undefined behavior
        if (errorfunction) (*errorfunction)();
    }
public:
    ContainerV() = default;                      // default constructor. The contents are not initialized
    void set_error_handler(void (*e)(void)) {    // set function pointer to error handler. null removes it
        errorfunction = e;
    }
    typedef decltype (buf[0][0]) etype;          // type of vector elements
    static constexpr int n_vectors() {           // get number of vectors
        return n;
    }
    static constexpr int n_elements() {          // get number of vector elements
        return n * V::size();
    }
    static constexpr int elementtype() {         // info about vector element type and container type
        return V::elementtype() | 0x1000;
    }
    static constexpr bool is_fp() {              // true if elements are a floating point type
        return (V::elementtype() & 0x3F) >= 15;
    }
    V get_vector(int index) const {              // extract one vector
        if (uint32_t(index) < uint32_t(n)) {     // unsigned compare also rejects negative index
            return buf[index];                   // get vector
        }
        else {                                   // index out of range
            report_error();                      // call error handler, if any
            if constexpr (is_fp()) {
                return nan_vec<V>(2);            // floating point type. return NAN
            }
            else {
                return V(etype(0));              // integer type. return 0
            }
        }
    }
    void set_vector(V x, int index) {            // insert one vector
        if (uint32_t(index) < uint32_t(n)) {
            buf[index] = x;                      // set vector
        }
        else {                                   // index out of range. nothing is written
            report_error();                      // call error handler, if any
        }
    }
    etype get_element(uint32_t index) const {    // extract one vector element
        if (index < (uint32_t)n_elements()) {
            // high bits of index select the vector, low bits select the element within it
            return buf[index >> s_count()][index & (V::size() - 1)];
        }
        else {                                   // index out of range
            report_error();                      // call error handler, if any
            if constexpr (is_fp()) {
                return nan_vec<V>(2)[0];         // floating point type. return NAN
            }
            else {
                return 0;                        // integer type. return 0
            }
        }
    }
    void set_element(etype x, uint32_t index) {  // insert one vector element
        if (index < (uint32_t)n_elements()) {
            buf[index >> s_count()].insert(index & (V::size() - 1), x);
        }
        else {                                   // index out of range. nothing is written
            report_error();                      // call error handler, if any
        }
    }
    // load nn elements from array p into the container.
    // nn is limited to n_elements(). p must point to elements of type etype
    void load(int nn, void const * p) {
        if (nn <= 0) return;                     // nothing to do
        if (nn > n_elements()) nn = n_elements();// max size
        int m = (uint32_t)nn >> s_count();       // number of full vectors to load
        int i;                                   // loop counter
        for (i = 0; i < m; i++) {
            buf[i].load((etype const*)p + i * V::size());  // load one vector
        }
        int partial = nn & (V::size() - 1);      // any partial load needed
        if (partial) {                           // nn is not divisible by vector size
            // load partial vector in the end
            buf[i].load_partial(partial, (etype const*)p + i * V::size());
        }
    }
    // store nn elements from the container into array p.
    // nn is limited to n_elements(). p must have space for nn elements of type etype
    void store(int nn, void * p) {
        if (nn <= 0) return;                     // nothing to do
        if (nn > n_elements()) nn = n_elements();// max size
        int m = (uint32_t)nn >> s_count();       // number of full vectors to store
        int i;                                   // loop counter
        for (i = 0; i < m; i++) {
            buf[i].store((etype*)p + i * V::size());  // store one vector
        }
        int partial = nn & (V::size() - 1);      // any partial store needed
        if (partial) {                           // nn is not divisible by vector size
            // store partial vector in the end
            buf[i].store_partial(partial, (etype*)p + i * V::size());
        }
    }
    V * get_buf() {                              // get address of internal buffer
        return buf;
    }
    void zero() {                                // set all contents to zero
        for (int i = 0; i < n; i++) {
            buf[i] = V(etype(0));
        }
    }
};



// Container class to store a variable number of vector class objects of type V.
// This is the specialization for n = 0. The size is set at runtime with
// set_nvectors() or set_nelements().
//
// All indexes are range checked. An out-of-range index calls the error
// handler installed with set_error_handler(), if one is installed.
// Afterwards an out-of-range read returns NAN (floating point element types)
// or 0 (integer element types), and an out-of-range write is ignored.
template <typename V>
class ContainerV<V, 0> {
protected:
    V * buf;                                     // allocated memory buffer containing array of vectors
    uint32_t allocatedSize;                      // size of allocated buffer, counted in vectors
    uint32_t nvectors;                           // number of vectors currently used (includes partially used)
    uint32_t nelements;                          // number of vector elements currently used
    void (*errorfunction)(void);                 // pointer to error handling function, null if none
    int s_count() const {                        // used internally
        constexpr int s = V::size();             // vector size
        static_assert((s & (s - 1)) == 0, "vector size must be a power of 2"); // check that vector size is a power of 2
        return bit_scan_reverse_const(s);        // shift count for fast division by vector size
    }
    void report_error() const {                  // used internally: report index out of range
        // the handler is optional. Calling through a null pointer would be undefined behavior
        if (errorfunction) (*errorfunction)();
    }
public:
    ContainerV() {                               // constructor. The container is empty
        buf = nullptr; allocatedSize = 0; nvectors = 0; nelements = 0; errorfunction = nullptr;
    }
    ~ContainerV() {                              // destructor
        delete[] buf;                            // free allocated memory. deleting a null pointer is allowed
    }
    ContainerV(ContainerV const &) = delete;     // prevent copying entire container (a copy constructor would have to allocate a new buffer)
    ContainerV & operator = (ContainerV const &) = delete; // prevent copying entire container
    void set_error_handler(void (*e)(void)) {    // set function pointer to error handler. null removes it
        errorfunction = e;
    }
    typedef decltype (buf[0][0]) etype;          // type of vector elements
    static constexpr int elementtype() {         // info about vector element type and container type
        return V::elementtype() | 0x1000;
    }
    static constexpr bool is_fp() {              // true if elements are a floating point type
        return (V::elementtype() & 0x3F) >= 15;
    }
    int n_vectors() const {                      // get number of vectors
        return nvectors;
    }
    int n_elements() const {                     // get number of vector elements
        return nelements;
    }
    int allocated_size() const {                 // maximum size that can be set without reallocation
        return allocatedSize;
    }
    void set_nvectors(int size) {
        // Allocate, reallocate or deallocate buffer of specified size. size is the number of vectors.
        // Setting size > allocated_size will allocate more buffer and fill it with zeroes
        // Setting size < allocated_size will decrease size so that some of the data are inaccessible
        // Setting size <= 0 will discard all data and de-allocate the buffer.
        if (size <= 0) {                         // discard everything
            delete[] buf;                        // de-allocate buffer
            buf = nullptr; allocatedSize = 0; nvectors = 0; nelements = 0;
        }
        else if (uint32_t(size) <= allocatedSize) { // grow or shrink within allocated size
            nvectors = size; nelements = size * V::size();
        }
        else {                                   // increase allocated size
            uint32_t newallocsize;               // new size to allocate
            if (uint32_t(size) >= allocatedSize + allocatedSize/2) {
                newallocsize = size;             // first time or big increase. allocate only the specified size
            }
            else {
                newallocsize = uint32_t(size) * 2; // small increase. allocate more than requested to avoid frequent reallocations
            }
            V * buf2 = new V[newallocsize]();    // allocate new buffer. () means initialize to zero
            if (buf) {                           // previously allocated buffer exists
                for (uint32_t i = 0; i < allocatedSize; i++) {
                    buf2[i] = buf[i];            // copy from old to new buffer
                }
                delete[] buf;                    // deallocate old buffer
            }
            // store pointer to new buffer
            buf = buf2; allocatedSize = newallocsize;
            nvectors = size; nelements = size * V::size(); // new used size
        }
    }
    void set_nelements(int n) {
        // Allocate, reallocate or deallocate buffer of specified size, not necessarily a multiple of the vector size.
        // n <= 0 discards all data, like set_nvectors(0)
        if (n <= 0) {
            // a negative n must not end up in the unsigned element count,
            // and must not be rounded up to a huge number of vectors
            set_nvectors(0);                     // this also sets nelements = 0
            return;
        }
        // round up to nearest multiple of the vector size. unsigned arithmetic cannot overflow here
        int nv = int((uint32_t(n) + uint32_t(V::size()) - 1u) >> s_count());
        set_nvectors(nv);
        nelements = n;                           // the last vector may be only partially used
    }
    V get_vector(int index) const {              // extract one vector
        if (uint32_t(index) < nvectors) {        // unsigned compare also rejects negative index
            return buf[index];                   // get vector
        }
        else {                                   // index out of range
            report_error();                      // call error handler, if any
            if constexpr (is_fp()) {
                return nan_vec<V>(2);            // floating point type. return NAN
            }
            else {
                return V(etype(0));              // integer type. return 0
            }
        }
    }
    void set_vector(V x, int index) {            // insert one vector
        if (uint32_t(index) < nvectors) {
            buf[index] = x;                      // set vector
        }
        else {                                   // index out of range. nothing is written
            report_error();                      // call error handler, if any
        }
    }
    etype get_element(uint32_t index) const {    // extract one vector element
        if (index < nelements) {
            // high bits of index select the vector, low bits select the element within it
            return buf[index >> s_count()][index & (V::size() - 1)];
        }
        else {                                   // index out of range
            report_error();                      // call error handler, if any
            if constexpr (is_fp()) {
                return nan_vec<V>(2)[0];         // floating point type. return NAN
            }
            else {
                return 0;                        // integer type. return 0
            }
        }
    }
    void set_element(etype x, uint32_t index) {  // insert one vector element
        if (index < nelements) {
            buf[index >> s_count()].insert(index & (V::size() - 1), x);
        }
        else {                                   // index out of range. nothing is written
            report_error();                      // call error handler, if any
        }
    }
    // load n elements from array p into the container.
    // n is limited to n_elements(). p must point to elements of type etype
    void load(int n, void const * p) {
        if (n <= 0) return;                      // nothing to do
        if (uint32_t(n) > nelements) n = nelements;// max size
        int m = (uint32_t)n >> s_count();        // number of full vectors to load
        int i;                                   // loop counter
        for (i = 0; i < m; i++) {
            buf[i].load((etype const*)p + i * V::size());  // load one vector
        }
        int partial = n & (V::size() - 1);       // any partial load needed
        if (partial) {                           // n is not divisible by vector size
            // load partial vector in the end
            buf[i].load_partial(partial, (etype const*)p + i * V::size());
        }
    }
    // store n elements from the container into array p.
    // n is limited to n_elements(). p must have space for n elements of type etype
    void store(int n, void * p) {
        if (n <= 0) return;                      // nothing to do
        if (uint32_t(n) > nelements) n = nelements;// max size
        int m = (uint32_t)n >> s_count();        // number of full vectors to store
        int i;                                   // loop counter
        for (i = 0; i < m; i++) {
            buf[i].store((etype*)p + i * V::size());  // store one vector
        }
        int partial = n & (V::size() - 1);       // any partial store needed
        if (partial) {                           // n is not divisible by vector size
            // store partial vector in the end
            buf[i].store_partial(partial, (etype*)p + i * V::size());
        }
    }
    V * get_buf() {                              // get address of internal buffer. warning: address may change
        return buf;                              // set_nvectors and set_nelements may reallocate the buffer
    }
    void zero() {                                // set all contents to zero
        for (uint32_t i = 0; i < nvectors; i++) {
            buf[i] = V(etype(0));
        }
    }
};

#ifdef VCL_NAMESPACE
}
#endif

#endif // VECTOR_CONTAINERS_H
-------------------------------------------------------------------------------- /quaternion/quaternion_manual.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt,a4paper,oneside,openright]{report} 2 | 3 | \usepackage[bindingoffset=5mm,left=20mm,right=20mm,top=20mm,bottom=20mm,footskip=10mm]{geometry} 4 | \usepackage[utf8x]{inputenc} 5 | \usepackage{hyperref} 6 | \usepackage[english]{babel} 7 | \usepackage{listings} 8 | \usepackage{subfiles} 9 | \usepackage{longtable} 10 | \usepackage{multirow} 11 | \usepackage{ragged2e} 12 | \usepackage{cmap} % avoid fi ligatures in pdf file 13 | \usepackage{amsthm} % example numbering 14 | \usepackage{color} 15 | %\usepackage{bold-extra} % for bold tt font. 
Remember to include bold-extra.sty file 16 | \usepackage{graphicx} 17 | \usepackage[yyyymmdd]{datetime} 18 | \usepackage{float} 19 | 20 | % style for code listing 21 | \renewcommand{\familydefault}{\sfdefault} 22 | \newtheorem{example}{Example}[chapter] % example numbering 23 | \lstset{language=C} % formatting for code listing 24 | \lstset{basicstyle=\ttfamily,breaklines=true} 25 | \definecolor{darkGreen}{rgb}{0,0.4,0} 26 | \definecolor{mybrown}{rgb}{0.40,0.10,0.05} 27 | \lstset{commentstyle=\color{darkGreen}} % comments color 28 | \lstset{keywordstyle=\color{blue}} % keyword color 29 | \lstset{stringstyle=\color{mybrown}} % string color 30 | \lstset{showstringspaces=false} % don't mark spaces in strings 31 | 32 | \renewcommand{\dateseparator}{-} 33 | 34 | % command for turning indent back on after \flushleft 35 | \newcommand{\indenton}{\RaggedRight\parindent=15pt} 36 | 37 | % command for vertical space 38 | \newcommand{\vspacesmall}{\vspace{3mm}} 39 | \newcommand{\vspacebig}{\vspace{6mm}} 40 | 41 | % style for code inlined in text: 42 | \newcommand{\codei}[1]{\bfseries \ttfamily{#1}\normalfont} 43 | 44 | 45 | \begin{document} 46 | 47 | \begin{titlepage} 48 | \centering 49 | 50 | \null %empty box needed for vfill to work 51 | \vfill 52 | 53 | {\bfseries\Huge 54 | Quaternion.h 55 | \vspacesmall 56 | 57 | Quaternion extension for 58 | \vspacesmall 59 | 60 | C++ vector class library 61 | \vspacebig 62 | 63 | } 64 | \vspacebig 65 | 66 | {\Large 67 | Agner Fog 68 | \vspacebig 69 | 70 | \copyright\ \today. Apache license 2.0 71 | } 72 | 73 | \vfill 74 | 75 | \includegraphics[width=306pt]{freesoftwarelogo.jpg} 76 | \vfill 77 | 78 | \end{titlepage} 79 | 80 | \RaggedRight 81 | 82 | \chapter{Introduction}\label{chap:Introduction} 83 | Quaternions, or hypercomplex numbers, are a topic in theoretical algebra and quantum physics. Applications relating to 3-D geometry and electromagnetism are better served by the vector3d package for VCL. 
84 | \vspacesmall 85 | 86 | The file quaternion.h provides classes, operators, and functions for 87 | calculations with quaternions. This is an extension to the Vector Class Library. 88 | \vspacesmall 89 | 90 | The classes listed below are defined. Common operators and functions are defined for these classes: 91 | 92 | \begin {table}[H] 93 | \caption{Quaternion classes} 94 | \label{table:QuaternionClasses} 95 | \begin{tabular}{|p{24mm}|p{20mm}|p{20mm}|p{22mm}|p{20mm}|p{28mm}|} 96 | \hline 97 | \bfseries Quaternion class & \bfseries Precision & \bfseries Quaternion elements per vector & \bfseries Correspon-ding real vector class & \bfseries Total bits & \bfseries Recommended minimum \newline instruction set \\ \hline 98 | Quaternion1f & \centering single & \centering 1 & \centering Vec4f & \centering 128 & SSE2 \\ \hline 99 | Quaternion1d & \centering double & \centering 1 & \centering Vec4d & \centering 256 & AVX \\ \hline 100 | \end{tabular} 101 | \end{table} 102 | \vspacebig 103 | 104 | 105 | 106 | \section{Compiling} \label{Compiling} 107 | The quaternion class extension to the Vector Class Library is compiled in the same way as the Vector Class Library itself. All x86 and x86-64 platforms are supported, including Windows, Linux, and Mac OS. 108 | The following C++ compilers can be used: Gnu, Clang, Microsoft, and Intel. 109 | See the Vector class library manual for further details. 
110 | \vspacesmall 111 | 112 | This example shows how to use the quaternion classes: 113 | 114 | \begin{example} 115 | \label{example1} 116 | \end{example} % frame disappears if I put this after end lstlisting 117 | \begin{lstlisting}[frame=single] 118 | // Example for quaternions 119 | #include <stdio.h> 120 | #include "vectorclass.h" // vector class library 121 | #include "quaternion.h" // quaternion extension 122 | 123 | // function to print quaternion 124 | template <typename Q> 125 | void printqx (const char * text, Q a) { 126 | auto aa = a.to_vector(); // get elements as real vector 127 | printf("\n%s ", text); // print text 128 | printf("(%.3G,%.3G,%.3G,%.3G)", aa[0], aa[1], aa[2], aa[3]); 129 | } 130 | 131 | int main() { 132 | // define quaternions 133 | Quaternion1d a(1,2,3,4); // 1 + 2*i + 3*j + 4*k 134 | Quaternion1d b(2,-3,-1,0); // 2 - 3*i - 1*j + 0*k 135 | Quaternion1d c = a + b; // add quaternions 136 | Quaternion1d d = a * b; // multiply quaternions 137 | 138 | // print results 139 | printqx("a = ", a); // a = (1,2,3,4) 140 | printqx("b = ", b); // b = (2,-3,-1,0) 141 | printqx("c = ", c); // c = (3,-1,2,4) 142 | printqx("d = ", d); // d = (11,5,-7,15) 143 | } 144 | 145 | \end{lstlisting} 146 | \vspacesmall 147 | 148 | 149 | \chapter{Constructing quaternions and loading data} 150 | \label{ConstructingQuaternions} 151 | 152 | There are several ways to create quaternions and put data into them. These methods are listed here. 
153 | \vspacebig 154 | 155 | \begin{tabular}{|p{25mm}|p{100mm}|} 156 | \hline 157 | \bfseries Method & default constructor \\ \hline 158 | \bfseries Defined for & all quaternion classes \\ \hline 159 | \bfseries Description & the quaternion is created but not initialized.\newline 160 | The value is unpredictable \\ \hline 161 | \bfseries Efficiency & good \\ \hline 162 | \end{tabular} 163 | \vspacesmall 164 | 165 | \begin{lstlisting}[frame=none] 166 | // Example: 167 | Quaternion1f a; // creates a quaternion of four floats 168 | \end{lstlisting} 169 | \vspacebig 170 | 171 | 172 | \begin{tabular}{|p{25mm}|p{100mm}|} 173 | \hline 174 | \bfseries Method & Construct from single real \\ \hline 175 | \bfseries Defined for & all quaternion classes \\ \hline 176 | \bfseries Description & The parameter defines the real part. The imaginary parts are zero. \\ \hline 177 | \bfseries Efficiency & good \\ \hline 178 | \end{tabular} 179 | \vspacesmall 180 | 181 | \begin{lstlisting}[frame=none] 182 | // Example: 183 | Quaternion1d a(3); // a = (3,0,0,0) 184 | \end{lstlisting} 185 | \vspacebig 186 | 187 | 188 | \begin{tabular}{|p{25mm}|p{100mm}|} 189 | \hline 190 | \bfseries Method & Construct from one real and three imaginary parts \\ \hline 191 | \bfseries Defined for & all quaternion classes \\ \hline 192 | \bfseries Description & The parameters define the real and imaginary parts \\ \hline 193 | \bfseries Efficiency & good \\ \hline 194 | \end{tabular} 195 | \vspacesmall 196 | 197 | \begin{lstlisting}[frame=none] 198 | // Example: 199 | Quaternion1d a(1,2,3,4); // a = (1,2,3,4) 200 | \end{lstlisting} 201 | \vspacebig 202 | 203 | 204 | \begin{tabular}{|p{25mm}|p{100mm}|} 205 | \hline 206 | \bfseries Method & Construct from two complex numbers \\ \hline 207 | \bfseries Defined for & all quaternion classes \\ \hline 208 | \bfseries Description & The second parameter is post-multiplied by j \\ \hline 209 | \bfseries Efficiency & good \\ \hline 210 | \bfseries Implementation & 
complexvec1.h must be included before quaternion.h \\ \hline 211 | \end{tabular} 212 | \vspacesmall 213 | 214 | \begin{lstlisting}[frame=none] 215 | // Example: 216 | Complex1d a(1,2); // a = 1 + i*2 217 | Complex1d b(3,4); // b = 3 + i*4 218 | Quaternion1d c(a,b); // c = a + b*j = 1 + i*2 + j*3 + k*4 219 | \end{lstlisting} 220 | \vspacebig 221 | 222 | \begin{tabular}{|p{25mm}|p{100mm}|} 223 | \hline 224 | \bfseries Method & member function load(p) \\ \hline 225 | \bfseries Defined for & all quaternion classes \\ \hline 226 | \bfseries Description & Load data from array of same precision. 227 | Each real part must be followed by the corresponding three imaginary parts. \\ \hline 228 | \bfseries Efficiency & good \\ \hline 229 | \end{tabular} 230 | \vspacesmall 231 | 232 | \begin{lstlisting}[frame=none] 233 | // Example: 234 | double a[4] = {1,2,3,4}; 235 | Quaternion1d b; 236 | b.load(a); // b = (1,2,3,4) 237 | \end{lstlisting} 238 | \vspacebig 239 | 240 | 241 | \begin{tabular}{|p{25mm}|p{100mm}|} 242 | \hline 243 | \bfseries Method & member function store(p) \\ \hline 244 | \bfseries Defined for & all quaternion classes \\ \hline 245 | \bfseries Description & Save data into array of same precision. 246 | Each real part is followed by the corresponding three imaginary parts. 
\\ \hline 247 | \bfseries Efficiency & good \\ \hline 248 | \end{tabular} 249 | \vspacesmall 250 | 251 | \begin{lstlisting}[frame=none] 252 | // Example: 253 | float a[4]; 254 | Quaternion1f b(1,2,3,4); 255 | b.store(a); // a = {1,2,3,4} 256 | \end{lstlisting} 257 | \vspacebig 258 | 259 | 260 | \begin{tabular}{|p{25mm}|p{100mm}|} 261 | \hline 262 | \bfseries Method & member function real() \\ \hline 263 | \bfseries Defined for & all quaternion classes \\ \hline 264 | \bfseries Description & Get real part of quaternion \\ \hline 265 | \bfseries Efficiency & good \\ \hline 266 | \end{tabular} 267 | \vspacesmall 268 | 269 | \begin{lstlisting}[frame=none] 270 | // Example: 271 | Quaternion1d a(1,2,3,4); 272 | double r = a.real(); // r = 1 273 | \end{lstlisting} 274 | \vspacebig 275 | 276 | 277 | \begin{tabular}{|p{25mm}|p{100mm}|} 278 | \hline 279 | \bfseries Method & member function imag() \\ \hline 280 | \bfseries Defined for & all quaternion classes \\ \hline 281 | \bfseries Description & Get imaginary parts of quaternion. 
The real part is set to zero \\ \hline 282 | \bfseries Efficiency & good \\ \hline 283 | \end{tabular} 284 | \vspacesmall 285 | 286 | \begin{lstlisting}[frame=none] 287 | // Example: 288 | Quaternion1d a(1,2,3,4); 289 | Quaternion1d im = a.imag(); // im = (0,2,3,4) 290 | \end{lstlisting} 291 | \vspacebig 292 | 293 | 294 | \begin{tabular}{|p{25mm}|p{100mm}|} 295 | \hline 296 | \bfseries Method & member function get\_low() \\ \hline 297 | \bfseries Defined for & all quaternion classes \\ \hline 298 | \bfseries Description & Get the real and the first imaginary part (i) as a complex vector \\ \hline 299 | \bfseries Efficiency & good \\ \hline 300 | \bfseries Implementation & complexvec1.h must be included before quaternion.h \\ \hline 301 | \end{tabular} 302 | \vspacesmall 303 | 304 | \begin{lstlisting}[frame=none] 305 | // Example: 306 | Quaternion1d a(1,2,3,4); 307 | Complex1d b = a.get_low(); // b = (1,2) 308 | \end{lstlisting} 309 | \vspacebig 310 | 311 | 312 | \begin{tabular}{|p{25mm}|p{100mm}|} 313 | \hline 314 | \bfseries Method & member function get\_high() \\ \hline 315 | \bfseries Defined for & all quaternion classes \\ \hline 316 | \bfseries Description & Get the last two imaginary parts (j and k) as a complex vector \\ \hline 317 | \bfseries Efficiency & good \\ \hline 318 | \bfseries Implementation & complexvec1.h must be included before quaternion.h \\ \hline 319 | \end{tabular} 320 | \vspacesmall 321 | 322 | \begin{lstlisting}[frame=none] 323 | // Example: 324 | Quaternion1d a(1,2,3,4); 325 | Complex1d b = a.get_low(); // b = (1,2) 326 | Complex1d c = a.get_high(); // c = (3,4) 327 | Quaternion1d d(b,c); // d = (1,2,3,4) 328 | \end{lstlisting} 329 | \vspacebig 330 | 331 | 332 | 333 | \chapter{Operators}\label{chap:Operators} 334 | 335 | \begin{tabular}{|p{25mm}|p{100mm}|} 336 | \hline 337 | \bfseries Operator & + \\ \hline 338 | \bfseries Defined for & all quaternion classes \\ \hline 339 | \bfseries Description & Add two quaternions, or one quaternion 
and one real scalar of the same precision \\ \hline 340 | \bfseries Efficiency & good \\ \hline 341 | \end{tabular} 342 | \vspacesmall 343 | 344 | \begin{lstlisting}[frame=none] 345 | // Example: 346 | Quaternion1d a(1,2,3,4); 347 | Quaternion1d b(5,6,7,8); 348 | Quaternion1d c = a + b; // c = (6,8,10,12) 349 | Quaternion1d d = a + 10.0; // d = (11,2,3,4) 350 | \end{lstlisting} 351 | \vspacebig 352 | 353 | 354 | \begin{tabular}{|p{25mm}|p{100mm}|} 355 | \hline 356 | \bfseries Operator & - \\ \hline 357 | \bfseries Defined for & all quaternion classes \\ \hline 358 | \bfseries Description & Subtract two quaternions, or one quaternion and one real scalar of the same precision \\ \hline 359 | \bfseries Efficiency & good \\ \hline 360 | \end{tabular} 361 | \vspacesmall 362 | 363 | \begin{lstlisting}[frame=none] 364 | // Example: 365 | Quaternion1d a(12,11,10,9); 366 | Quaternion1d b(5,6,7,8); 367 | Quaternion1d c = a - b; // c = (7,5,3,1) 368 | Quaternion1d d = a - 10.0; // d = (2,11,10,9) 369 | \end{lstlisting} 370 | \vspacebig 371 | 372 | 373 | \begin{tabular}{|p{25mm}|p{100mm}|} 374 | \hline 375 | \bfseries Operator & * \\ \hline 376 | \bfseries Defined for & all quaternion classes \\ \hline 377 | \bfseries Description & Multiply two quaternions, or one quaternion and one real scalar of the same precision. \newline 378 | Multiplication of quaternions is not commutative, i.e. a*b and b*a are not the same. 379 | \\ \hline 380 | \bfseries Efficiency & medium \\ \hline 381 | \bfseries Accuracy & Quaternion multiplication involves the calculation of sums of products. Loss of precision may occur if the result is close to zero. 
\\ \hline 382 | \end{tabular} 383 | \vspacesmall 384 | 385 | \begin{lstlisting}[frame=none] 386 | // Example: 387 | Quaternion1d a(1,2,3,4); 388 | Quaternion1d b(5,6,7,8); 389 | Quaternion1d c = a * b; // c = (-60,12,30,24) 390 | Quaternion1d d = b * a; // d = (-60,20,14,32) 391 | Quaternion1d e = a * 10.; // e = (10,20,30,40) 392 | 393 | \end{lstlisting} 394 | \vspacebig 395 | 396 | 397 | \begin{tabular}{|p{25mm}|p{100mm}|} 398 | \hline 399 | \bfseries Operator & / \\ \hline 400 | \bfseries Defined for & all quaternion classes \\ \hline 401 | \bfseries Description & Divide two quaternions, or one quaternion and one real scalar of the same precision. \newline 402 | Division is defined as a / b = a * reciprocal(b) \\ \hline 403 | \bfseries Efficiency & medium \\ \hline 404 | \bfseries Accuracy & Quaternion division involves the calculation of sums of products. Loss of precision may occur if the result is close to zero. \\ \hline 405 | \end{tabular} 406 | \vspacesmall 407 | 408 | \begin{lstlisting}[frame=none] 409 | // Example: 410 | Quaternion1f a(7,9,-1,7); 411 | Quaternion1f b(1,2,3,2); 412 | Quaternion1f c = a / b; // c = (2,1,-1,-2) 413 | Quaternion1f d = c * b; // d = (7,9,-1,7) 414 | Quaternion1f e = b / 2.0f; // e = (0.5,1,1.5,1) 415 | Quaternion1f f = 18.f / b; // f = (1,-2,-3,-2) 416 | Quaternion1f g = f * b; // g = (18,0,0,0) 417 | \end{lstlisting} 418 | \vspacebig 419 | 420 | 421 | \begin{tabular}{|p{25mm}|p{100mm}|} 422 | \hline 423 | \bfseries Operator & $\sim$ \\ \hline 424 | \bfseries Defined for & all quaternion classes \\ \hline 425 | \bfseries Description & Complex conjugate. 
The signs of the imaginary parts are inverted \\ \hline 426 | \bfseries Efficiency & good \\ \hline 427 | \end{tabular} 428 | \vspacesmall 429 | 430 | \begin{lstlisting}[frame=none] 431 | // Example: 432 | Quaternion1f a(1,2,3,4); 433 | Quaternion1f b = ~ a; // b = (1,-2,-3,-4) 434 | \end{lstlisting} 435 | \vspacebig 436 | 437 | 438 | \begin{tabular}{|p{25mm}|p{100mm}|} 439 | \hline 440 | \bfseries Operator & == \\ \hline 441 | \bfseries Defined for & all quaternion classes \\ \hline 442 | \bfseries Description & Compare for equality.\newline 443 | The result is a boolean scalar. \\ \hline 444 | \bfseries Efficiency & good \\ \hline 445 | \end{tabular} 446 | \vspacesmall 447 | 448 | \begin{lstlisting}[frame=none] 449 | // Example: 450 | Quaternion1f a(1, 2,3,4); 451 | Quaternion1f b(1,-2,3,4); 452 | bool c = (a == b); // c = false 453 | \end{lstlisting} 454 | \vspacebig 455 | 456 | 457 | \begin{tabular}{|p{25mm}|p{100mm}|} 458 | \hline 459 | \bfseries Operator & != \\ \hline 460 | \bfseries Defined for & all quaternion classes \\ \hline 461 | \bfseries Description & Compare for not equal.\newline 462 | The result is a boolean scalar. 
\\ \hline 463 | \bfseries Efficiency & good \\ \hline 464 | \end{tabular} 465 | \vspacesmall 466 | 467 | \begin{lstlisting}[frame=none] 468 | // Example: 469 | Quaternion1f a(1, 2,3,4); 470 | Quaternion1f b(1,-2,3,4); 471 | bool c = (a != b); // c = true 472 | \end{lstlisting} 473 | \vspacebig 474 | 475 | 476 | \chapter{Mathematical functions}\label{chap:MathematicalFunctions} 477 | 478 | 479 | \begin{tabular}{|p{25mm}|p{100mm}|} 480 | \hline 481 | \bfseries Function & abs \\ \hline 482 | \bfseries Defined for & all quaternion classes \\ \hline 483 | \bfseries Description & Gives the norm as a scalar \\ \hline 484 | \bfseries Efficiency & medium \\ \hline 485 | \end{tabular} 486 | \vspacesmall 487 | 488 | \begin{lstlisting}[frame=none] 489 | // Example: 490 | Quaternion1f a(2,1,0,2); 491 | double b = abs(a); // b = 3 492 | \end{lstlisting} 493 | \vspacebig 494 | 495 | 496 | 497 | \chapter{Other functions}\label{chap:OtherFunctions} 498 | 499 | 500 | \begin{tabular}{|p{25mm}|p{100mm}|} 501 | \hline 502 | \bfseries Function & to\_vector \\ \hline 503 | \bfseries Defined for & all quaternion classes \\ \hline 504 | \bfseries Description & Convert to a vector of the real part and the three imaginary parts. \\ \hline 505 | \bfseries Efficiency & good \\ \hline 506 | \end{tabular} 507 | \vspacesmall 508 | 509 | \begin{lstlisting}[frame=none] 510 | // Example: 511 | Quaternion1d a(1,2,3,4); 512 | Vec4d b = a.to_vector(); // b = (1,2,3,4) 513 | \end{lstlisting} 514 | \vspacebig 515 | 516 | 517 | \begin{tabular}{|p{25mm}|p{100mm}|} 518 | \hline 519 | \bfseries Function & select \\ \hline 520 | \bfseries Defined for & all quaternion classes \\ \hline 521 | \bfseries Description & Choose between two quaternions. 
\\ \hline 522 | \bfseries Efficiency & good \\ \hline 523 | \end{tabular} 524 | \vspacesmall 525 | 526 | \begin{lstlisting}[frame=none] 527 | // Example: 528 | Quaternion1d a(1,2,3,4); 529 | Quaternion1d b(5,6,7,8); 530 | Quaternion1d c = select(true,a,b); // c = (1,2,3,4) 531 | Quaternion1d d = select(false,a,b); // d = (5,6,7,8) 532 | \end{lstlisting} 533 | \vspacebig 534 | 535 | 536 | \end{document} 537 | -------------------------------------------------------------------------------- /vector3d/vector3d.h: -------------------------------------------------------------------------------- 1 | /**************************** vector3d.h *********************************** 2 | * Author: Agner Fog 3 | * Date created: 2012-08-01 4 | * Last modified: 2023-05-14 5 | * Version: 2.02.00 6 | * Project: Extension to vector class library 7 | * Description: Classes for 3-dimensional vectors, including operators and functions 8 | * The following classes are defined: 9 | * Vec3Df: A vector of 3 single precision floats 10 | * Vec3Dd: A vector of 3 double precision floats 11 | * 12 | * (c) Copyright 2012-2023 Apache License version 2.0 or later 13 | \*****************************************************************************/ 14 | 15 | #ifndef VECTOR3D_H 16 | #define VECTOR3D_H 20200 17 | 18 | #include "vectorclass.h" 19 | #include // define math library functions 20 | 21 | #if VECTORCLASS_H < 20000 22 | #error Incompatible version of vector class library. 
Must use version 2 or later 23 | #endif 24 | 25 | #ifdef VCL_NAMESPACE 26 | namespace VCL_NAMESPACE { 27 | #endif 28 | 29 | /***************************************************************************** 30 | * 31 | * Class Vec3Df: vector of 3 single precision floats 32 | * 33 | *****************************************************************************/ 34 | 35 | class Vec3Df { 36 | protected: 37 | __m128 xmm; // Float vector 38 | public: 39 | // default constructor 40 | Vec3Df() = default; 41 | // construct from three coordinates 42 | Vec3Df(float x, float y, float z) { 43 | xmm = Vec4f(x, y, z, 0.f); 44 | } 45 | // Constructor to convert from Vec4f 46 | Vec3Df(Vec4f const x) { 47 | xmm = x; 48 | // cutoff(3); 49 | } 50 | // Constructor to convert from type __m128 used in intrinsics: 51 | Vec3Df(__m128 const x) { 52 | xmm = x; 53 | } 54 | // Assignment operator to convert from type __m128 used in intrinsics: 55 | Vec3Df & operator = (__m128 const x) { 56 | xmm = x; 57 | return *this; 58 | } 59 | // Type cast operator to convert to __m128 used in intrinsics 60 | operator __m128() const { 61 | return xmm; 62 | } 63 | // Member function to convert to vector 64 | Vec4f to_vector() const { 65 | return xmm; 66 | } 67 | // Member function to load from array 68 | Vec3Df & load(float const * p) { 69 | xmm = Vec4f().load_partial(3, p); 70 | return *this; 71 | } 72 | // Member function to store into array 73 | void store(float * p) const { 74 | Vec4f(xmm).store_partial(3, p); 75 | } 76 | // get x part 77 | float get_x() const { 78 | return _mm_cvtss_f32(xmm); 79 | } 80 | // get y part 81 | float get_y() const { 82 | return Vec4f(xmm).extract(1); 83 | } 84 | // get z part 85 | float get_z() const { 86 | return Vec4f(xmm).extract(2); 87 | } 88 | // Member function to extract one coordinate 89 | float extract(int index) const { 90 | return Vec4f(xmm).extract(index); 91 | } 92 | // Operator [] to extract one coordinate 93 | // Operator [] can only read an element, not write. 
94 | float operator [] (uint32_t index) const { 95 | return extract(index); 96 | } 97 | // Insert one coordinate 98 | Vec3Df & insert (uint32_t index, float x) { 99 | xmm = Vec4f(xmm).insert(index, x); 100 | return *this; 101 | } 102 | static constexpr int size() { 103 | return 1; 104 | } 105 | static constexpr int elementtype() { 106 | return 0x210; 107 | } 108 | }; 109 | 110 | /***************************************************************************** 111 | * 112 | * Operators for Vec3Df 113 | * 114 | *****************************************************************************/ 115 | 116 | // operator + : add 117 | static inline Vec3Df operator + (Vec3Df const a, Vec3Df const b) { 118 | return Vec3Df(Vec4f(a) + Vec4f(b)); 119 | } 120 | 121 | // operator += : add 122 | static inline Vec3Df & operator += (Vec3Df & a, Vec3Df const b) { 123 | a = a + b; 124 | return a; 125 | } 126 | 127 | // operator - : subtract 128 | static inline Vec3Df operator - (Vec3Df const a, Vec3Df const b) { 129 | return Vec3Df(Vec4f(a) - Vec4f(b)); 130 | } 131 | 132 | // operator - : unary minus 133 | static inline Vec3Df operator - (Vec3Df const a) { 134 | return Vec3Df(- Vec4f(a)); 135 | } 136 | 137 | // operator -= : subtract 138 | static inline Vec3Df & operator -= (Vec3Df & a, Vec3Df const b) { 139 | a = a - b; 140 | return a; 141 | } 142 | 143 | // operator * : multiply element-by-element 144 | // (see also cross_product and dot_product) 145 | static inline Vec3Df operator * (Vec3Df const a, Vec3Df const b) { 146 | return Vec3Df(Vec4f(a) * Vec4f(b)); 147 | } 148 | 149 | // operator *= : multiply element-by-element 150 | static inline Vec3Df & operator *= (Vec3Df & a, Vec3Df const b) { 151 | a = a * b; 152 | return a; 153 | } 154 | 155 | // operator / : divide element-by-element 156 | static inline Vec3Df operator / (Vec3Df const a, Vec3Df const b) { 157 | return Vec3Df(Vec4f(a) / Vec4f(b)); 158 | } 159 | 160 | // operator /= : divide element-by-element 161 | static inline Vec3Df 
& operator /= (Vec3Df & a, Vec3Df const b) { 162 | a = a / b; 163 | return a; 164 | } 165 | 166 | // operator == : returns true if a == b 167 | static inline bool operator == (Vec3Df const a, Vec3Df const b) { 168 | Vec4fb t1 = Vec4f(a) == Vec4f(b); 169 | #if INSTRSET >= 10 170 | return (uint8_t(t1) & 7) == 7; 171 | #else 172 | Vec4fb t2 = _mm_shuffle_ps(t1, t1, 0x24); // ignore unused top element 173 | return horizontal_and(t2); 174 | #endif 175 | } 176 | 177 | // operator != : returns true if a != b 178 | static inline bool operator != (Vec3Df const a, Vec3Df const b) { 179 | Vec4fb t1 = Vec4f(a) != Vec4f(b); 180 | #if INSTRSET >= 10 181 | return (uint8_t(t1) & 7) != 0; 182 | #else 183 | Vec4fb t2 = _mm_shuffle_ps(t1, t1, 0x24); // ignore unused top element 184 | return horizontal_or(t2); 185 | #endif 186 | } 187 | 188 | /***************************************************************************** 189 | * 190 | * Operators mixing Vec3Df and float 191 | * 192 | *****************************************************************************/ 193 | 194 | // operator * : multiply 195 | static inline Vec3Df operator * (Vec3Df const a, float b) { 196 | return _mm_mul_ps(a, _mm_set1_ps(b)); 197 | } 198 | static inline Vec3Df operator * (float a, Vec3Df const b) { 199 | return b * a; 200 | } 201 | static inline Vec3Df & operator *= (Vec3Df & a, float & b) { 202 | a = a * b; 203 | return a; 204 | } 205 | 206 | // operator / : divide 207 | static inline Vec3Df operator / (Vec3Df const a, float b) { 208 | return _mm_div_ps(a, _mm_set1_ps(b)); 209 | } 210 | 211 | static inline Vec3Df & operator /= (Vec3Df & a, float b) { 212 | a = a / b; 213 | return a; 214 | } 215 | 216 | 217 | /***************************************************************************** 218 | * 219 | * Functions for Vec3Df 220 | * 221 | *****************************************************************************/ 222 | 223 | // function cross_product 224 | static inline Vec3Df cross_product (Vec3Df const 
a, Vec3Df const b) { 225 | Vec4f a1 = permute4<1,2,0,V_DC>(Vec4f(a)); 226 | Vec4f b1 = permute4<1,2,0,V_DC>(Vec4f(b)); 227 | Vec4f a2 = permute4<2,0,1,V_DC>(Vec4f(a)); 228 | Vec4f b2 = permute4<2,0,1,V_DC>(Vec4f(b)); 229 | Vec4f c = a1 * b2 - a2 * b1; 230 | return c.cutoff(3); 231 | } 232 | 233 | // function dot_product 234 | static inline float dot_product (Vec3Df const a, Vec3Df const b) { 235 | Vec4f c = (Vec4f(a) * Vec4f(b)).cutoff(3); 236 | return horizontal_add(c); 237 | } 238 | 239 | // function vector_length 240 | static inline float vector_length (Vec3Df const a) { 241 | return std::sqrt(dot_product(a,a)); 242 | } 243 | 244 | // function normalize_vector 245 | static inline Vec3Df normalize_vector (Vec3Df const a) { 246 | return a / vector_length(a); 247 | } 248 | 249 | // function select 250 | static inline Vec3Df select (bool s, Vec3Df const a, Vec3Df const b) { 251 | return s ? a : b; 252 | } 253 | 254 | // function rotate 255 | // The vector a is rotated by multiplying by the matrix defined by the three columns col0, col1, col2 256 | static inline Vec3Df rotate (Vec3Df const col0, Vec3Df const col1, Vec3Df const col2, Vec3Df const a) { 257 | Vec4f xbroad = permute4<0,0,0,V_DC>(Vec4f(a)); // broadcast x 258 | Vec4f ybroad = permute4<1,1,1,V_DC>(Vec4f(a)); // broadcast y 259 | Vec4f zbroad = permute4<2,2,2,V_DC>(Vec4f(a)); // broadcast z 260 | Vec4f r = col0.to_vector() * xbroad + col1.to_vector() * ybroad + col2.to_vector() * zbroad; 261 | return r.cutoff(3); 262 | } 263 | 264 | 265 | /***************************************************************************** 266 | * 267 | * Class Vec3Dd: vector of 3 double precision floats 268 | * 269 | *****************************************************************************/ 270 | 271 | class Vec3Dd { 272 | protected: 273 | Vec4d yy; // vector of 4 doubles 274 | public: 275 | // default constructor 276 | Vec3Dd() = default; 277 | // construct from three coordinates 278 | Vec3Dd(double x, double y, double z) { 
279 | yy = Vec4d(x, y, z, 0.); 280 | } 281 | // Constructor to convert from Vec4d 282 | Vec3Dd(Vec4d const x) { 283 | yy = x; 284 | // cutoff(3); 285 | } 286 | // Constructor to convert from type __m256d used in intrinsics or Vec256de used in emulation 287 | #if INSTRSET >= 7 // AVX 288 | Vec3Dd(__m256d const x) { 289 | yy = x; 290 | } 291 | #else 292 | Vec3Dd(Vec256de const x) { 293 | yy = x; 294 | } 295 | #endif 296 | // Assignment operator to convert from type __m256d used in intrinsics or Vec256de used in emulation 297 | #if INSTRSET >= 7 // AVX 298 | Vec3Dd & operator = (__m256d const x) { 299 | #else 300 | Vec3Dd & operator = (Vec256de const x) { 301 | #endif 302 | yy = x; 303 | return *this; 304 | } 305 | // Type cast operator to convert to __m256d used in intrinsics or Vec256de used in emulation 306 | #if INSTRSET >= 7 // AVX 307 | operator __m256d() const { 308 | return yy; 309 | } 310 | #endif 311 | // Member function to load from array 312 | Vec3Dd & load(double const * p) { 313 | yy.load_partial(3, p); 314 | return *this; 315 | } 316 | // Member function to store into array 317 | void store(double * p) const { 318 | yy.store_partial(3, p); 319 | } 320 | // Member function to convert to vector 321 | Vec4d to_vector() const { 322 | return yy; 323 | } 324 | // get x part 325 | double get_x() const { 326 | return _mm_cvtsd_f64(yy.get_low()); 327 | } 328 | // get y part 329 | double get_y() const { 330 | return yy.extract(1); 331 | } 332 | // get z part 333 | double get_z() const { 334 | return yy.extract(2); 335 | } 336 | // Member function to extract one coordinate 337 | double extract(uint32_t index) const { 338 | return yy.extract(index); 339 | } 340 | // Operator [] to extract one coordinate 341 | // Operator [] can only read an element, not write. 
342 | double operator [] (uint32_t index) const { 343 | return extract(index); 344 | } 345 | // Insert one coordinate 346 | Vec3Dd & insert (uint32_t index, double x) { 347 | yy.insert(index, x); 348 | return *this; 349 | } 350 | static constexpr int size() { 351 | return 1; 352 | } 353 | static constexpr int elementtype() { 354 | return 0x211; 355 | } 356 | }; 357 | 358 | /***************************************************************************** 359 | * 360 | * Operators for Vec3Dd 361 | * 362 | *****************************************************************************/ 363 | 364 | // operator + : add 365 | static inline Vec3Dd operator + (Vec3Dd const a, Vec3Dd const b) { 366 | return Vec3Dd(a.to_vector() + b.to_vector()); 367 | } 368 | 369 | // operator += : add 370 | static inline Vec3Dd & operator += (Vec3Dd & a, Vec3Dd const b) { 371 | a = a + b; 372 | return a; 373 | } 374 | 375 | // operator - : subtract 376 | static inline Vec3Dd operator - (Vec3Dd const a, Vec3Dd const b) { 377 | return Vec3Dd(a.to_vector() - b.to_vector()); 378 | } 379 | 380 | // operator - : unary minus 381 | static inline Vec3Dd operator - (Vec3Dd const a) { 382 | return Vec3Dd(- a.to_vector()); 383 | } 384 | 385 | // operator -= : subtract 386 | static inline Vec3Dd & operator -= (Vec3Dd & a, Vec3Dd const b) { 387 | a = a - b; 388 | return a; 389 | } 390 | 391 | // operator * : multiply element-by-element 392 | // (see also cross_product and dot_product) 393 | static inline Vec3Dd operator * (Vec3Dd const a, Vec3Dd const b) { 394 | return Vec3Dd(a.to_vector() * b.to_vector()); 395 | } 396 | 397 | // operator *= : multiply element-by-element 398 | static inline Vec3Dd & operator *= (Vec3Dd & a, Vec3Dd const b) { 399 | a = a * b; 400 | return a; 401 | } 402 | 403 | // operator / : divide element-by-element 404 | static inline Vec3Dd operator / (Vec3Dd const a, Vec3Dd const b) { 405 | return Vec3Dd(a.to_vector() / b.to_vector()); 406 | } 407 | 408 | // operator /= : divide 
element-by-element 409 | static inline Vec3Dd & operator /= (Vec3Dd & a, Vec3Dd const b) { 410 | a = a / b; 411 | return a; 412 | } 413 | 414 | // operator == : returns true if a == b 415 | static inline bool operator == (Vec3Dd const a, Vec3Dd const b) { 416 | Vec4db t1 = a.to_vector() == b.to_vector(); 417 | #if INSTRSET >= 10 418 | return (uint8_t(t1) & 7) == 7; 419 | #elif INSTRSET >= 7 // AVX 420 | Vec4db t2 = Vec4db(permute4<0,1,2,2>(Vec4d(t1))); // ignore unused top element 421 | return horizontal_and(t2); 422 | #else 423 | Vec2db u0 = t1.get_low(); 424 | Vec2db u1 = t1.get_high(); 425 | u1 = permute2<0,0>(Vec2d(u1)); // ignore unused top element 426 | return horizontal_and(u0 & u1); 427 | #endif 428 | } 429 | 430 | // operator != : returns true if a != b 431 | static inline bool operator != (Vec3Dd const a, Vec3Dd const b) { 432 | Vec4db t1 = a.to_vector() != b.to_vector(); 433 | #if INSTRSET >= 10 434 | return (uint8_t(t1) & 7) != 0; 435 | #elif INSTRSET >= 7 // AVX 436 | Vec4db t2 = Vec4db(permute4<0,1,2,2>(Vec4d(t1))); // ignore unused top element 437 | return horizontal_and(t2); 438 | #else 439 | Vec2db u0 = t1.get_low(); 440 | Vec2db u1 = t1.get_high(); 441 | u1 = permute2<0,0>(Vec2d(u1)); // ignore unused top element 442 | return horizontal_or(u0 | u1); 443 | #endif 444 | } 445 | 446 | /***************************************************************************** 447 | * 448 | * Operators mixing Vec3Dd and double 449 | * 450 | *****************************************************************************/ 451 | 452 | // operator * : multiply 453 | static inline Vec3Dd operator * (Vec3Dd const a, double b) { 454 | return a.to_vector() * Vec4d(b); 455 | } 456 | static inline Vec3Dd operator * (double a, Vec3Dd const b) { 457 | return b * a; 458 | } 459 | static inline Vec3Dd & operator *= (Vec3Dd & a, double & b) { 460 | a = a * b; 461 | return a; 462 | } 463 | 464 | // operator / : divide 465 | static inline Vec3Dd operator / (Vec3Dd const a, double b) 
{ 466 | return a.to_vector() / Vec4d(b); 467 | } 468 | 469 | static inline Vec3Dd & operator /= (Vec3Dd & a, double b) { 470 | a = a / b; 471 | return a; 472 | } 473 | 474 | 475 | /***************************************************************************** 476 | * 477 | * Functions for Vec3Dd 478 | * 479 | *****************************************************************************/ 480 | 481 | // function cross_product 482 | static inline Vec3Dd cross_product (Vec3Dd const a, Vec3Dd const b) { 483 | Vec4d a1 = permute4<1,2,0,V_DC>(a.to_vector()); 484 | Vec4d b1 = permute4<1,2,0,V_DC>(b.to_vector()); 485 | Vec4d a2 = permute4<2,0,1,V_DC>(a.to_vector()); 486 | Vec4d b2 = permute4<2,0,1,V_DC>(b.to_vector()); 487 | Vec4d c = a1 * b2 - a2 * b1; 488 | return c.cutoff(3); 489 | } 490 | 491 | // function dot_product 492 | static inline double dot_product (Vec3Dd const a, Vec3Dd const b) { 493 | Vec4d c = (a.to_vector() * b.to_vector()).cutoff(3); 494 | return horizontal_add(c); 495 | } 496 | 497 | // function vector_length 498 | static inline double vector_length (Vec3Dd const a) { 499 | return std::sqrt(dot_product(a,a)); 500 | } 501 | 502 | // function normalize_vector 503 | static inline Vec3Dd normalize_vector (Vec3Dd const a) { 504 | return a / vector_length(a); 505 | } 506 | 507 | // function select 508 | static inline Vec3Dd select (bool s, Vec3Dd const a, Vec3Dd const b) { 509 | return s ? 
a : b; 510 | } 511 | 512 | // function rotate 513 | // The vector a is rotated by multiplying by the matrix defined by the three columns col0, col1, col2 514 | static inline Vec3Dd rotate (Vec3Dd const col0, Vec3Dd const col1, Vec3Dd const col2, Vec3Dd const a) { 515 | Vec3Dd xbroad = permute4<0,0,0,V_DC>(a.to_vector()); // broadcast x 516 | Vec3Dd ybroad = permute4<1,1,1,V_DC>(a.to_vector()); // broadcast y 517 | Vec3Dd zbroad = permute4<2,2,2,V_DC>(a.to_vector()); // broadcast z 518 | Vec3Dd r = col0 * xbroad + col1 * ybroad + col2 * zbroad; 519 | return r.to_vector().cutoff(3); 520 | } 521 | 522 | 523 | /***************************************************************************** 524 | * 525 | * Conversion functions 526 | * 527 | *****************************************************************************/ 528 | 529 | // function to_single: convert Vec3Dd to Vec3Df 530 | static inline Vec3Df to_float(Vec3Dd const a) { 531 | #if INSTRSET >= 7 // AVX 532 | return _mm256_cvtpd_ps(a); 533 | #else 534 | //return Vec3Df(Vec4f(compress(a.to_vector().get_low(), a.to_vector().get_high()))); 535 | return to_float(a.to_vector()); 536 | #endif 537 | } 538 | 539 | // function to_double: convert Vec3Df to Vec3Dd 540 | static inline Vec3Dd to_double(Vec3Df const a) { 541 | #if INSTRSET >= 7 // AVX 542 | return _mm256_cvtps_pd(a); 543 | #else 544 | return to_double(a.to_vector()); 545 | #endif 546 | } 547 | 548 | #ifdef VCL_NAMESPACE 549 | } 550 | #endif 551 | 552 | #endif // VECTOR3D_H 553 | -------------------------------------------------------------------------------- /vector3d/vector3d_manual.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt,a4paper,oneside,openright]{report} 2 | 3 | \usepackage[bindingoffset=5mm,left=20mm,right=20mm,top=20mm,bottom=20mm,footskip=10mm]{geometry} 4 | \usepackage[utf8x]{inputenc} 5 | \usepackage{hyperref} 6 | \usepackage[english]{babel} 7 | \usepackage{listings} 8 | 
\usepackage{subfiles} 9 | \usepackage{longtable} 10 | \usepackage{multirow} 11 | \usepackage{ragged2e} 12 | \usepackage{cmap} % avoid fi ligatures in pdf file 13 | \usepackage{amsthm} % example numbering 14 | \usepackage{color} 15 | %\usepackage{bold-extra} % for bold tt font. Remember to include bold-extra.sty file 16 | \usepackage{graphicx} 17 | \usepackage[yyyymmdd]{datetime} 18 | \usepackage{float} 19 | 20 | % style for code listing 21 | \renewcommand{\familydefault}{\sfdefault} 22 | \renewcommand{\ttdefault}{pcr} % selects Courier font 23 | \newtheorem{example}{Example}[chapter] % example numbering 24 | \lstset{language=C} % formatting for code listing 25 | \lstset{basicstyle=\ttfamily,breaklines=true} 26 | \definecolor{darkGreen}{rgb}{0,0.4,0} 27 | \definecolor{mybrown}{rgb}{0.40,0.10,0.05} 28 | \lstset{commentstyle=\color{darkGreen}} % comments color 29 | \lstset{keywordstyle=\color{blue}} % keyword color 30 | \lstset{stringstyle=\color{mybrown}} % string color 31 | \lstset{showstringspaces=false} % don't mark spaces in strings 32 | 33 | \renewcommand{\dateseparator}{-} 34 | 35 | % command for turning indent back on after \flushleft 36 | \newcommand{\indenton}{\RaggedRight\parindent=15pt} 37 | 38 | % command for vertical space 39 | \newcommand{\vspacesmall}{\vspace{3mm}} 40 | \newcommand{\vspacebig}{\vspace{6mm}} 41 | 42 | % style for code inlined in text: 43 | \newcommand{\codei}[1]{\bfseries \ttfamily{#1}\normalfont} 44 | 45 | 46 | \begin{document} 47 | 48 | \begin{titlepage} 49 | \centering 50 | 51 | \null %empty box needed for vfill to work 52 | \vfill 53 | 54 | {\bfseries\Huge 55 | Vector3d.h 56 | \vspacesmall 57 | 58 | 3-dimensional vector extension for 59 | \vspacesmall 60 | 61 | C++ vector class library 62 | \vspacebig 63 | 64 | } 65 | \vspacebig 66 | 67 | {\Large 68 | Agner Fog 69 | \vspacebig 70 | 71 | \copyright\ \today. 
Apache license 2.0 72 | } 73 | 74 | \vfill 75 | 76 | \includegraphics[width=306pt]{freesoftwarelogo.jpg} 77 | \vfill 78 | 79 | \end{titlepage} 80 | 81 | \RaggedRight 82 | 83 | \chapter{Introduction}\label{chap:Introduction} 84 | 3-dimensional vectors are useful in geometry and physics. 85 | The file vector3d.h provides vector classes, operators, and functions for 86 | calculations with 3-D vectors. This is an extension to the Vector Class Library. 87 | \vspacesmall 88 | 89 | The classes listed below are defined. Common operators and functions are defined for these classes: 90 | 91 | \begin {table}[H] 92 | \caption{3-D vector classes} 93 | \label{table:Vector3DClasses} 94 | \begin{tabular}{|p{24mm}|p{20mm}|p{20mm}|p{22mm}|p{20mm}|p{28mm}|} 95 | \hline 96 | \bfseries vector class & \bfseries Precision & \bfseries 3-D vectors per instance & \bfseries Correspon-ding real vector class & \bfseries Total bits & \bfseries Recommended minimum \newline instruction set \\ \hline 97 | Vec3Df & \centering single & \centering 1 & \centering Vec4f & \centering 128 & SSE2 \\ \hline 98 | Vec3Dd & \centering double & \centering 1 & \centering Vec4d & \centering 256 & AVX \\ \hline 99 | \end{tabular} 100 | \end{table} 101 | \vspacebig 102 | 103 | 104 | 105 | \section{Compiling} \label{Compiling} 106 | The 3-D vector class extension to the Vector Class Library is compiled in the same way as the Vector Class Library itself. All x86 and x86-64 platforms are supported, including Windows, Linux, and Mac OS. 107 | The following C++ compilers can be used: Gnu, Clang, Microsoft, and Intel. 108 | See the Vector Class Library manual for further details. 
109 | \vspacesmall 110 | 111 | This example shows how to use the 3-D vector classes: 112 | 113 | \begin{example} 114 | \label{example1} 115 | \end{example} % frame disappears if I put this after end lstlisting 116 | \begin{lstlisting}[frame=single] 117 | // Example for 3-D vectors 118 | #include <stdio.h> 119 | #include "vectorclass.h" // vector class library 120 | #include "vector3d.h" // extension for 3-D vectors 121 | 122 | // function to print 3-D vector: 123 | template <typename V> 124 | void printv3 (const char * text, V a) { 125 | auto aa = a.to_vector(); // get elements as real vector 126 | printf("\n%s ", text); // print text 127 | printf("(%.3G,%.3G,%.3G)", aa[0], aa[1], aa[2]); 128 | } 129 | 130 | int main() { 131 | // define 3-D vectors 132 | Vec3Dd a(1,2,3); // x = 1, y = 2, z = 3 133 | Vec3Dd b(4,5,6); // x = 4, y = 5, z = 6 134 | Vec3Dd c = a + b; // add vectors 135 | Vec3Dd d = cross_product(a, b); // x-product 136 | double e = dot_product(a, b); // dot-product 137 | // print results 138 | printv3("a = ", a); // a = (1,2,3) 139 | printv3("b = ", b); // b = (4,5,6) 140 | printv3("c = ", c); // c = (5,7,9) 141 | printv3("d = ", d); // d = (-3,6,-3) 142 | printf ("\ne = %f", e); // e = 32 143 | } 144 | \end{lstlisting} 145 | \vspacesmall 146 | 147 | 148 | \chapter{Constructing 3-D vectors and loading data} 149 | \label{Constructing3Dvectors} 150 | 151 | There are several ways to create 3-D vectors and put data into them. These methods are listed here.
152 | \vspacebig 153 | 154 | \begin{tabular}{|p{25mm}|p{100mm}|} 155 | \hline 156 | \bfseries Method & default constructor \\ \hline 157 | \bfseries Defined for & all 3-D vectors classes \\ \hline 158 | \bfseries Description & the 3-D vector is created but not initialized.\newline 159 | The value is unpredictable \\ \hline 160 | \bfseries Efficiency & good \\ \hline 161 | \end{tabular} 162 | \vspacesmall 163 | 164 | \begin{lstlisting}[frame=none] 165 | // Example: 166 | Vec3Dd a; // creates a 3-D vector 167 | \end{lstlisting} 168 | \vspacebig 169 | 170 | 171 | \begin{tabular}{|p{25mm}|p{100mm}|} 172 | \hline 173 | \bfseries Method & Construct from x,y,z coordinates \\ \hline 174 | \bfseries Defined for & all 3-D vectors classes \\ \hline 175 | \bfseries Description & The parameters define the x, y, and z coordinates \\ \hline 176 | \bfseries Efficiency & good \\ \hline 177 | \end{tabular} 178 | \vspacesmall 179 | 180 | \begin{lstlisting}[frame=none] 181 | // Example: 182 | Vec3Dd a(1,2,3); // a = (1,2,3) (x = 1, y = 2, z = 3) 183 | \end{lstlisting} 184 | \vspacebig 185 | 186 | \begin{tabular}{|p{25mm}|p{100mm}|} 187 | \hline 188 | \bfseries Method & member function load(p) \\ \hline 189 | \bfseries Defined for & all 3-D vectors classes \\ \hline 190 | \bfseries Description & Load data from array of same precision. 
\\ \hline 191 | \bfseries Efficiency & good \\ \hline 192 | \end{tabular} 193 | \vspacesmall 194 | 195 | \begin{lstlisting}[frame=none] 196 | // Example: 197 | float a[3] = {2,5,-1}; 198 | Vec3Df b; 199 | b.load(a); // b = (2,5,-1) 200 | \end{lstlisting} 201 | \vspacebig 202 | 203 | 204 | \begin{tabular}{|p{25mm}|p{100mm}|} 205 | \hline 206 | \bfseries Method & member function store(p) \\ \hline 207 | \bfseries Defined for & all 3-D vectors classes \\ \hline 208 | \bfseries Description & Save data into array of same precision \\ \hline 209 | \bfseries Efficiency & good \\ \hline 210 | \end{tabular} 211 | \vspacesmall 212 | 213 | \begin{lstlisting}[frame=none] 214 | // Example: 215 | double a[3]; 216 | Vec3Dd b(4,0,3); 217 | b.store(a); // a = {4,0,3} 218 | \end{lstlisting} 219 | \vspacebig 220 | 221 | 222 | \begin{tabular}{|p{25mm}|p{100mm}|} 223 | \hline 224 | \bfseries Method & member function get\_x() \\ \hline 225 | \bfseries Defined for & all 3-D vectors classes \\ \hline 226 | \bfseries Description & Get the x-coordinate \\ \hline 227 | \bfseries Efficiency & good \\ \hline 228 | \end{tabular} 229 | \vspacesmall 230 | 231 | \begin{lstlisting}[frame=none] 232 | // Example: 233 | Vec3Dd a(1,2,3); 234 | double b = a.get_x(); // b = 1 235 | \end{lstlisting} 236 | \vspacebig 237 | 238 | \begin{tabular}{|p{25mm}|p{100mm}|} 239 | \hline 240 | \bfseries Method & member function get\_y() \\ \hline 241 | \bfseries Defined for & all 3-D vectors classes \\ \hline 242 | \bfseries Description & Get the y-coordinate \\ \hline 243 | \bfseries Efficiency & good \\ \hline 244 | \end{tabular} 245 | \vspacesmall 246 | 247 | \begin{lstlisting}[frame=none] 248 | // Example: 249 | Vec3Dd a(1,2,3); 250 | double b = a.get_y(); // b = 2 251 | \end{lstlisting} 252 | \vspacebig 253 | 254 | \begin{tabular}{|p{25mm}|p{100mm}|} 255 | \hline 256 | \bfseries Method & member function get\_z() \\ \hline 257 | \bfseries Defined for & all 3-D vectors classes \\ \hline 258 | \bfseries Description 
& Get the z-coordinate \\ \hline 259 | \bfseries Efficiency & good \\ \hline 260 | \end{tabular} 261 | \vspacesmall 262 | 263 | \begin{lstlisting}[frame=none] 264 | // Example: 265 | Vec3Dd a(1,2,3); 266 | double b = a.get_z(); // b = 3 267 | \end{lstlisting} 268 | \vspacebig 269 | 270 | \begin{tabular}{|p{25mm}|p{100mm}|} 271 | \hline 272 | \bfseries Method & member function extract(index) \\ \hline 273 | \bfseries Defined for & all 3-D vectors classes \\ \hline 274 | \bfseries Description & index = 0, 1, 2 give the x, y, or z-coordinate, respectively \\ \hline 275 | \bfseries Efficiency & good \\ \hline 276 | \end{tabular} 277 | \vspacesmall 278 | 279 | \begin{lstlisting}[frame=none] 280 | // Example: 281 | Vec3Dd a(1,2,3); 282 | double b = a.extract(2); // b = 3 283 | double c = a[2]; // c = 3 (the same) 284 | \end{lstlisting} 285 | \vspacebig 286 | 287 | \begin{tabular}{|p{25mm}|p{100mm}|} 288 | \hline 289 | \bfseries Method & member function insert(index, value) \\ \hline 290 | \bfseries Defined for & all 3-D vectors classes \\ \hline 291 | \bfseries Description & index = 0, 1, 2 changes the x, y, or z-coordinate, respectively \\ \hline 292 | \bfseries Efficiency & good \\ \hline 293 | \end{tabular} 294 | \vspacesmall 295 | 296 | \begin{lstlisting}[frame=none] 297 | // Example: 298 | Vec3Dd a(1,2,3); 299 | a.insert(0, 8); // a = (8, 2, 3) 300 | \end{lstlisting} 301 | \vspacebig 302 | 303 | 304 | \chapter{Operators}\label{chap:Operators} 305 | 306 | \begin{tabular}{|p{25mm}|p{100mm}|} 307 | \hline 308 | \bfseries Operator & + \\ \hline 309 | \bfseries Defined for & all 3-D vectors classes \\ \hline 310 | \bfseries Description & Add two vectors \\ \hline 311 | \bfseries Efficiency & good \\ \hline 312 | \end{tabular} 313 | \vspacesmall 314 | 315 | \begin{lstlisting}[frame=none] 316 | // Example: 317 | Vec3Dd a(1,2,3); 318 | Vec3Dd b(5,6,7); 319 | Vec3Dd c = a + b; // c = (6,8,10) 320 | \end{lstlisting} 321 | \vspacebig 322 | 323 | 324 |
\begin{tabular}{|p{25mm}|p{100mm}|} 325 | \hline 326 | \bfseries Operator & - \\ \hline 327 | \bfseries Defined for & all 3-D vectors classes \\ \hline 328 | \bfseries Description & Subtract two vectors \\ \hline 329 | \bfseries Efficiency & good \\ \hline 330 | \end{tabular} 331 | \vspacesmall 332 | 333 | \begin{lstlisting}[frame=none] 334 | // Example: 335 | Vec3Dd a(11,10,9); 336 | Vec3Dd b(5,6,7); 337 | Vec3Dd c = a - b; // c = (6,4,2) 338 | Vec3Dd d = - b; // d = (-5,-6,-7) 339 | \end{lstlisting} 340 | \vspacebig 341 | 342 | 343 | \begin{tabular}{|p{25mm}|p{100mm}|} 344 | \hline 345 | \bfseries Operator & * \\ \hline 346 | \bfseries Defined for & all 3-D vectors classes \\ \hline 347 | \bfseries Description & Multiply two vectors element by element, or one vector and one scalar of the same precision \\ \hline 348 | \bfseries Efficiency & good \\ \hline 349 | \end{tabular} 350 | \vspacesmall 351 | 352 | \begin{lstlisting}[frame=none] 353 | // Example: 354 | Vec3Dd a(1,2,3); 355 | Vec3Dd b(4,5,6); 356 | Vec3Dd c = a * b; // c = (4,10,18) 357 | Vec3Dd d = a * 10.0; // d = (10,20,30) 358 | \end{lstlisting} 359 | \vspacebig 360 | 361 | 362 | \begin{tabular}{|p{25mm}|p{100mm}|} 363 | \hline 364 | \bfseries Operator & / \\ \hline 365 | \bfseries Defined for & all 3-D vectors classes \\ \hline 366 | \bfseries Description & Divide a vector by a scalar of the same precision \\ \hline 367 | \bfseries Efficiency & good \\ \hline 368 | \end{tabular} 369 | \vspacesmall 370 | 371 | \begin{lstlisting}[frame=none] 372 | // Example: 373 | Vec3Dd a(10,20,30); 374 | Vec3Dd b = a / 5.0; // b = (2,4,6) 375 | \end{lstlisting} 376 | \vspacebig 377 | 378 | 379 | \begin{tabular}{|p{25mm}|p{100mm}|} 380 | \hline 381 | \bfseries Operator & == \\ \hline 382 | \bfseries Defined for & all 3-D vectors classes \\ \hline 383 | \bfseries Description & Compare for equality.\newline 384 | The result is a boolean scalar. 
\\ \hline 385 | \bfseries Efficiency & good \\ \hline 386 | \end{tabular} 387 | \vspacesmall 388 | 389 | \begin{lstlisting}[frame=none] 390 | // Example: 391 | Vec3Dd a(1, 2,3); 392 | Vec3Dd b(1,-2,3); 393 | bool c = (a == b); // c = false 394 | \end{lstlisting} 395 | \vspacebig 396 | 397 | 398 | \begin{tabular}{|p{25mm}|p{100mm}|} 399 | \hline 400 | \bfseries Operator & != \\ \hline 401 | \bfseries Defined for & all 3-D vectors classes \\ \hline 402 | \bfseries Description & Compare for not equal.\newline 403 | The result is a boolean scalar. \\ \hline 404 | \bfseries Efficiency & good \\ \hline 405 | \end{tabular} 406 | \vspacesmall 407 | 408 | \begin{lstlisting}[frame=none] 409 | // Example: 410 | Vec3Dd a(1, 2,3); 411 | Vec3Dd b(1,-2,3); 412 | bool c = (a != b); // c = true 413 | \end{lstlisting} 414 | \vspacebig 415 | 416 | 417 | \chapter{Mathematical functions}\label{chap:MathematicalFunctions} 418 | 419 | 420 | \begin{tabular}{|p{25mm}|p{100mm}|} 421 | \hline 422 | \bfseries Function & cross\_product \\ \hline 423 | \bfseries Defined for & all 3-D vectors classes \\ \hline 424 | \bfseries Description & Gives the X-product of two vectors \\ \hline 425 | \bfseries Efficiency & medium \\ \hline 426 | \bfseries Accuracy & Calculation of the X-product involves the calculation of sums of products. Loss of precision may occur if the result is close to zero. \\ \hline 427 | \end{tabular} 428 | \vspacesmall 429 | 430 | \begin{lstlisting}[frame=none] 431 | // Example: 432 | Vec3Dd a(1,2,3); 433 | Vec3Dd b(4,5,6); 434 | Vec3Dd c = cross_product(a,b); // c = (-3,6,-3) 435 | Vec3Dd d = cross_product(b,a); // d = (3,-6,3) 436 | \end{lstlisting} 437 | \vspacebig 438 | 439 | 440 | \begin{tabular}{|p{25mm}|p{100mm}|} 441 | \hline 442 | \bfseries Function & dot\_product \\ \hline 443 | \bfseries Defined for & all 3-D vectors classes \\ \hline 444 | \bfseries Description & Gives the dot-product of two vectors. 
The result is a scalar. \\ \hline 445 | \bfseries Efficiency & medium \\ \hline 446 | \end{tabular} 447 | \vspacesmall 448 | 449 | \begin{lstlisting}[frame=none] 450 | // Example: 451 | Vec3Dd a(1,2,3); 452 | Vec3Dd b(4,5,6); 453 | double c = dot_product(a,b); // c = 32 454 | \end{lstlisting} 455 | \vspacebig 456 | 457 | 458 | \begin{tabular}{|p{25mm}|p{100mm}|} 459 | \hline 460 | \bfseries Function & vector\_length \\ \hline 461 | \bfseries Defined for & all 3-D vector classes \\ \hline 462 | \bfseries Description & Gives the length of the vector (Euclidean norm) \\ \hline 463 | \bfseries Efficiency & medium \\ \hline 464 | \end{tabular} 465 | \vspacesmall 466 | 467 | \begin{lstlisting}[frame=none] 468 | // Example: 469 | Vec3Dd a(3,0,4); 470 | double b = vector_length(a); // b = 5 471 | \end{lstlisting} 472 | \vspacebig 473 | 474 | 475 | \begin{tabular}{|p{25mm}|p{100mm}|} 476 | \hline 477 | \bfseries Function & normalize\_vector \\ \hline 478 | \bfseries Defined for & all 3-D vector classes \\ \hline 479 | \bfseries Description & Divides the vector by its length to give a vector with the same direction and length one. \\ \hline 480 | \bfseries Efficiency & medium \\ \hline 481 | \end{tabular} 482 | \vspacesmall 483 | 484 | \begin{lstlisting}[frame=none] 485 | // Example: 486 | Vec3Dd a(3,0,4); 487 | Vec3Dd b = normalize_vector(a); // b = (0.6, 0.0, 0.8) 488 | \end{lstlisting} 489 | \vspacebig 490 | 491 | 492 | \begin{tabular}{|p{25mm}|p{100mm}|} 493 | \hline 494 | \bfseries Function & rotate \\ \hline 495 | \bfseries Defined for & all 3-D vector classes \\ \hline 496 | \bfseries Description & Rotates a vector by multiplying a 3x3 rotation matrix by the column vector. The first three parameters define the columns of the rotation matrix. The last parameter is the vector to rotate. \\ \hline 497 | \bfseries Efficiency & medium \\ \hline 498 | \bfseries Accuracy & Calculation of the rotated vector involves the calculation of sums of products. 
Loss of precision may occur if the result is close to zero. \\ \hline 499 | \end{tabular} 500 | \vspacesmall 501 | 502 | \begin{lstlisting}[frame=none] 503 | // Example: 504 | Vec3Dd a(1,2,3); // vector to rotate 505 | Vec3Dd c0(1,0,0); // first column of matrix 506 | Vec3Dd c1(0,0,-1); // second column of matrix 507 | Vec3Dd c2(0,1,0); // third column of matrix 508 | Vec3Dd d = rotate(c0,c1,c2,a); // d = (1,3,-2) 509 | \end{lstlisting} 510 | \vspacebig 511 | 512 | 513 | \chapter{Other functions}\label{chap:OtherFunctions} 514 | 515 | \begin{tabular}{|p{25mm}|p{100mm}|} 516 | \hline 517 | \bfseries Function & to\_vector \\ \hline 518 | \bfseries Defined for & all 3-D vector classes \\ \hline 519 | \bfseries Description & Convert to a vector of class Vec4f or Vec4d. \\ \hline 520 | \bfseries Efficiency & good \\ \hline 521 | \end{tabular} 522 | \vspacesmall 523 | 524 | \begin{lstlisting}[frame=none] 525 | // Example: 526 | Vec3Df a(1,2,3); 527 | Vec4f b = a.to_vector(); // b = (1,2,3,0) 528 | \end{lstlisting} 529 | \vspacebig 530 | 531 | 532 | \begin{tabular}{|p{25mm}|p{100mm}|} 533 | \hline 534 | \bfseries Function & select \\ \hline 535 | \bfseries Defined for & all 3-D vector classes \\ \hline 536 | \bfseries Description & Choose between two vectors. The result is the first vector if the boolean selector is true, otherwise the second vector. \\ \hline 537 | \bfseries Efficiency & good \\ \hline 538 | \end{tabular} 539 | \vspacesmall 540 | 541 | \begin{lstlisting}[frame=none] 542 | // Example: 543 | Vec3Df a(1,2,3); 544 | Vec3Df b(4,5,6); 545 | Vec3Df c = select(true,a,b); // c = (1,2,3) 546 | Vec3Df d = select(false,a,b); // d = (4,5,6) 547 | \end{lstlisting} 548 | \vspacebig 549 | 550 | 551 | \begin{tabular}{|p{25mm}|p{100mm}|} 552 | \hline 553 | \bfseries Function & to\_float \\ \hline 554 | \bfseries Defined for & Vec3Dd \\ \hline 555 | \bfseries Description & Convert to lower precision. 
The result is a Vec3Df \\ \hline 556 | \bfseries Efficiency & good \\ \hline 557 | \end{tabular} 558 | \vspacesmall 559 | 560 | \begin{lstlisting}[frame=none] 561 | // Example: 562 | Vec3Dd a(1,2,3); 563 | Vec3Df b = to_float(a);// b = (1,2,3) 564 | \end{lstlisting} 565 | \vspacebig 566 | 567 | 568 | \begin{tabular}{|p{25mm}|p{100mm}|} 569 | \hline 570 | \bfseries Function & to\_double \\ \hline 571 | \bfseries Defined for & Vec3Df \\ \hline 572 | \bfseries Description & Convert to higher precision. The result is a Vec3Dd. \\ \hline 573 | \bfseries Efficiency & good \\ \hline 574 | \end{tabular} 575 | \vspacesmall 576 | 577 | \begin{lstlisting}[frame=none] 578 | // Example: 579 | Vec3Df a(1,2,3); 580 | Vec3Dd b = to_double(a);// b = (1,2,3) 581 | \end{lstlisting} 582 | \vspacebig 583 | 584 | 585 | \end{document} 586 | -------------------------------------------------------------------------------- /quaternion/quaternion.h: -------------------------------------------------------------------------------- 1 | /*************************** quaternion.h ********************************* 2 | * Author: Agner Fog 3 | * Date created: 2012-08-01 4 | * Last modified: 2019-07-13 5 | * Version: 2.00 6 | * Project: Extension to vector class library 7 | * Description: 8 | * Quaternions are used in theoretical algebra 9 | * Classes for quaternions: 10 | * Quaternion1f: One quaternion consisting of four single precision floats 11 | * Quaternion1d: One quaternion consisting of four double precision floats 12 | * 13 | * (c) Copyright 2012-2019 Apache License version 2.0 or later 14 | ******************************************************************************/ 15 | 16 | 17 | #ifndef QUATERNION_H 18 | #define QUATERNION_H 200 19 | 20 | #include "vectorclass.h" 21 | #include <cmath> 22 | 23 | #ifdef VCL_NAMESPACE 24 | namespace VCL_NAMESPACE { 25 | #endif 26 | 27 | /***************************************************************************** 28 | * 29 | * Class Quaternion1f 30 | * One 
quaternion consisting of four single precision floats 31 | * 32 | *****************************************************************************/ 33 | 34 | class Quaternion1f { 35 | protected: 36 | __m128 xmm; // vector of 4 single precision floats 37 | public: 38 | // default constructor 39 | Quaternion1f() { 40 | } 41 | // construct from real, no imaginary part 42 | Quaternion1f(float re) { 43 | xmm = _mm_load_ss(&re); 44 | } 45 | // construct from real and imaginary parts = re + im0*i + im1*j + im2*k 46 | Quaternion1f(float re, float im0, float im1, float im2) { 47 | xmm = Vec4f(re, im0, im1, im2); 48 | } 49 | // Constructor to convert from type __m128 used in intrinsics: 50 | Quaternion1f(__m128 const x) { 51 | xmm = x; 52 | } 53 | // Assignment operator to convert from type __m128 used in intrinsics: 54 | Quaternion1f & operator = (__m128 const x) { 55 | xmm = x; 56 | return *this; 57 | } 58 | // Constructor to convert from Vec4f 59 | Quaternion1f(Vec4f const x) { 60 | xmm = x; 61 | } 62 | // Type cast operator to convert to __m128 used in intrinsics 63 | operator __m128() const { 64 | return xmm; 65 | } 66 | // Member function to convert to vector 67 | Vec4f to_vector() const { 68 | return xmm; 69 | } 70 | // Member function to load from array 71 | Quaternion1f & load(float const * p) { 72 | xmm = Vec4f().load(p); 73 | return *this; 74 | } 75 | // Member function to store into array 76 | void store(float * p) const { 77 | Vec4f(xmm).store(p); 78 | } 79 | // Member function to extract real part 80 | float real() const { 81 | return _mm_cvtss_f32(xmm); 82 | } 83 | // Member function to extract imaginary parts, sets real part to 0 84 | Quaternion1f imag() const { 85 | return Quaternion1f(permute4<-1,1,2,3>(Vec4f(xmm))); 86 | } 87 | #ifdef COMPLEXVEC_H // relations to complexvec1.h 88 | // construct from two Complex1f = a0 + a1 * j 89 | Quaternion1f(Complex1f const a0, Complex1f const a1) { 90 | xmm = _mm_movelh_ps(a0, a1); 91 | } 92 | // Member functions to split 
into two Complex1f: 93 | // q = q.get_low() + q.get_high()*j 94 | Complex1f get_low() const { 95 | return Complex1f(Vec4f(xmm).cutoff(2)); 96 | } 97 | Complex1f get_high() const { 98 | __m128 t = _mm_movehl_ps(_mm_setzero_ps(), xmm); 99 | return Complex1f(t); 100 | } 101 | #endif 102 | #ifdef VECTOR3D_H // relations to vector3d.h 103 | // Constructor to convert from Vec3f used in geometrics: 104 | Quaternion1f(Vec3f const x) { 105 | xmm = permute4<3,0,1,2>(Vec4f(x)); // rotate elements 106 | } 107 | 108 | // Type cast operator to convert to Vec3f used in geometrics: 109 | operator Vec3f() const { 110 | return Vec3f(permute4<1,2,3,0>(Vec4f(xmm))); // rotate elements 111 | } 112 | #endif // VECTOR3D_H 113 | }; 114 | 115 | 116 | /***************************************************************************** 117 | * 118 | * Operators for Quaternion1f 119 | * 120 | *****************************************************************************/ 121 | 122 | // operator + : add 123 | static inline Quaternion1f operator + (Quaternion1f const a, Quaternion1f const b) { 124 | return Quaternion1f(a.to_vector() + b.to_vector()); 125 | } 126 | 127 | // operator += : add 128 | static inline Quaternion1f & operator += (Quaternion1f & a, Quaternion1f const b) { 129 | a = a + b; 130 | return a; 131 | } 132 | 133 | // operator - : subtract 134 | static inline Quaternion1f operator - (Quaternion1f const a, Quaternion1f const b) { 135 | return Quaternion1f(a.to_vector() - b.to_vector()); 136 | } 137 | 138 | // operator - : unary minus 139 | static inline Quaternion1f operator - (Quaternion1f const a) { 140 | return Quaternion1f(- a.to_vector()); 141 | } 142 | 143 | // operator -= : subtract 144 | static inline Quaternion1f & operator -= (Quaternion1f & a, Quaternion1f const b) { 145 | a = a - b; 146 | return a; 147 | } 148 | 149 | // operator * : quaternion multiply 150 | static inline Quaternion1f operator * (Quaternion1f const a, Quaternion1f const b) { 151 | __m128 a1123 = 
_mm_shuffle_ps(a,a,0xE5); 152 | __m128 a2231 = _mm_shuffle_ps(a,a,0x7A); 153 | __m128 b1000 = _mm_shuffle_ps(b,b,0x01); 154 | __m128 b2312 = _mm_shuffle_ps(b,b,0x9E); 155 | __m128 t1 = _mm_mul_ps(a1123, b1000); 156 | __m128 t2 = _mm_mul_ps(a2231, b2312); 157 | __m128 t12 = _mm_add_ps(t1, t2); 158 | __m128 t12m = change_sign<1,0,0,0>(Vec4f(t12)); 159 | __m128 a3312 = _mm_shuffle_ps(a,a,0x9F); 160 | __m128 b3231 = _mm_shuffle_ps(b,b,0x7B); 161 | __m128 a0000 = _mm_shuffle_ps(a,a,0x00); 162 | __m128 t3 = _mm_mul_ps(a3312, b3231); 163 | __m128 t0 = _mm_mul_ps(a0000, b); 164 | __m128 t03 = _mm_sub_ps(t0, t3); 165 | return _mm_add_ps(t03, t12m); 166 | } 167 | 168 | // operator *= : multiply 169 | static inline Quaternion1f & operator *= (Quaternion1f & a, Quaternion1f const b) { 170 | a = a * b; 171 | return a; 172 | } 173 | 174 | // operator ~ : complex conjugate 175 | // ~(a + b*i + c*j + d*k) = (a - b*i - c*j - d*k) 176 | static inline Quaternion1f operator ~ (Quaternion1f const a) { 177 | return Quaternion1f(change_sign<0,1,1,1>(a.to_vector())); 178 | } 179 | 180 | // function reciprocal: multiplicative inverse 181 | static inline Quaternion1f reciprocal (Quaternion1f const a) { 182 | Vec4f sq = _mm_mul_ps(a,a); 183 | float nsq = horizontal_add(sq); 184 | return Quaternion1f((~a).to_vector() / Vec4f(nsq)); 185 | } 186 | 187 | // operator / : quaternion divide is defined as 188 | // a / b = a * reciprocal(b) 189 | static inline Quaternion1f operator / (Quaternion1f const a, Quaternion1f const b) { 190 | return a * reciprocal(b); 191 | } 192 | 193 | // operator /= : divide 194 | static inline Quaternion1f & operator /= (Quaternion1f & a, Quaternion1f const b) { 195 | a = a / b; 196 | return a; 197 | } 198 | 199 | // operator == : returns true if a == b 200 | static inline bool operator == (Quaternion1f const a, Quaternion1f const b) { 201 | Vec4fb t1 = a.to_vector() == b.to_vector(); 202 | return horizontal_and(t1); 203 | } 204 | 205 | // operator != : returns true if 
a != b 206 | static inline bool operator != (Quaternion1f const a, Quaternion1f const b) { 207 | Vec4fb t1 = a.to_vector() != b.to_vector(); 208 | return horizontal_or(t1); 209 | } 210 | 211 | 212 | /***************************************************************************** 213 | * 214 | * Operators mixing Quaternion1f and float 215 | * 216 | *****************************************************************************/ 217 | 218 | // operator + : add 219 | static inline Quaternion1f operator + (Quaternion1f const a, float b) { 220 | return _mm_add_ss(a, _mm_set_ss(b)); 221 | } 222 | 223 | static inline Quaternion1f operator + (float a, Quaternion1f const b) { 224 | return b + a; 225 | } 226 | 227 | static inline Quaternion1f & operator += (Quaternion1f & a, float & b) { 228 | a = a + b; 229 | return a; 230 | } 231 | 232 | // operator - : subtract 233 | static inline Quaternion1f operator - (Quaternion1f const a, float b) { 234 | return _mm_sub_ss(a, _mm_set_ss(b)); 235 | } 236 | 237 | static inline Quaternion1f operator - (float a, Quaternion1f const b) { 238 | return _mm_sub_ps(_mm_set_ss(a), b); 239 | } 240 | 241 | static inline Quaternion1f & operator -= (Quaternion1f & a, float & b) { 242 | a = a - b; 243 | return a; 244 | } 245 | 246 | // operator * : multiply 247 | static inline Quaternion1f operator * (Quaternion1f const a, float b) { 248 | return _mm_mul_ps(a, _mm_set1_ps(b)); 249 | } 250 | 251 | static inline Quaternion1f operator * (float a, Quaternion1f const b) { 252 | return b * a; 253 | } 254 | 255 | static inline Quaternion1f & operator *= (Quaternion1f & a, float & b) { 256 | a = a * b; 257 | return a; 258 | } 259 | 260 | // operator / : divide 261 | static inline Quaternion1f operator / (Quaternion1f const a, float b) { 262 | return _mm_div_ps(a, _mm_set1_ps(b)); 263 | } 264 | 265 | static inline Quaternion1f operator / (float a, Quaternion1f const b) { 266 | return reciprocal(b) * a; 267 | } 268 | 269 | static inline Quaternion1f & operator /= 
(Quaternion1f & a, float b) { 270 | a = a / b; 271 | return a; 272 | } 273 | 274 | 275 | /***************************************************************************** 276 | * 277 | * Functions for Quaternion1f 278 | * 279 | *****************************************************************************/ 280 | 281 | // function abs: calculate the norm 282 | // abs(a + b*i + c*j + d*k) = sqrt(a*a + b*B + c*c + d*d) 283 | static inline float abs(Quaternion1f const a) { 284 | Vec4f sq = _mm_mul_ps(a,a); 285 | float nsq = horizontal_add(sq); 286 | return std::sqrt(nsq); 287 | } 288 | 289 | // function select 290 | static inline Quaternion1f select (bool s, Quaternion1f const a, Quaternion1f const b) { 291 | return Quaternion1f(s ? a : b); 292 | } 293 | 294 | 295 | 296 | /***************************************************************************** 297 | * 298 | * Class Quaternion1d 299 | * One quaternion consisting of four double precision floats 300 | * 301 | *****************************************************************************/ 302 | 303 | class Quaternion1d { 304 | protected: 305 | Vec4d y; // vector of 4 doubles 306 | public: 307 | // default constructor 308 | Quaternion1d() { 309 | } 310 | // construct from real and imaginary parts = re + im0*i + im1*j + im2*k 311 | Quaternion1d(double re, double im0, double im1, double im2) { 312 | y = Vec4d(re, im0, im1, im2); 313 | } 314 | // construct from real, no imaginary part 315 | Quaternion1d(double re) { 316 | y = Vec4d(re, 0., 0., 0.); 317 | } 318 | // Constructor to convert from type __m256d used in intrinsics or Vec256de used in emulation 319 | #if INSTRSET >= 7 // AVX 320 | Quaternion1d(__m256d const x) { 321 | #else 322 | Quaternion1d(Vec256de const x) { 323 | #endif 324 | y = x; 325 | } 326 | // Assignment operator to convert from type __m256d used in intrinsics or Vec256de used in emulation 327 | #if INSTRSET >= 7 // AVX 328 | Quaternion1d & operator = (__m256d const x) { 329 | #else 330 | Quaternion1d & 
operator = (Vec256de const x) { 331 | #endif 332 | y = x; 333 | return *this; 334 | } 335 | // Constructor to convert from Vec4d 336 | Quaternion1d(Vec4d const x) { 337 | y = x; 338 | } 339 | // Type cast operator to convert to __m256d used in intrinsics or Vec256de used in emulation 340 | #if INSTRSET >= 7 // AVX 341 | operator __m256d() const { 342 | #else 343 | operator Vec256de() const { 344 | #endif 345 | return y; 346 | } 347 | // Member function to convert to vector 348 | Vec4d to_vector() const { 349 | return y; 350 | } 351 | // Member function to load from array 352 | Quaternion1d & load(double const * p) { 353 | y.load(p); 354 | return *this; 355 | } 356 | // Member function to store into array 357 | void store(double * p) const { 358 | y.store(p); 359 | } 360 | #ifdef COMPLEXVEC_H // relations to complexvec1.h 361 | // construct from two Complex1d = a0 + a1 * j 362 | Quaternion1d(Complex1d const a0, Complex1d const a1) { 363 | y = Vec4d(Vec2d(a0), Vec2d(a1)); 364 | } 365 | // Member functions to split into two Complex1d: 366 | // q = q.get_low() + q.get_high()*j 367 | Complex1d get_low() const { 368 | return Complex1d(y.get_low()); 369 | } 370 | Complex1d get_high() const { 371 | return Complex1d(y.get_high()); 372 | } 373 | #endif 374 | // Member function to extract real part 375 | double real() const { 376 | return y.extract(0); 377 | } 378 | // Member function to extract imaginary parts, sets real part to 0 379 | Quaternion1d imag() const { 380 | return Quaternion1d(permute4<-1,1,2,3>(Vec4d(y))); 381 | } 382 | #ifdef VECTOR3D_H 383 | // Constructor to convert from Vec3d used in geometrics: 384 | Quaternion1d(Vec3d const x) { 385 | y = permute4<3,0,1,2>(Vec4d(x)); // rotate elements 386 | } 387 | // Type cast operator to convert to Vec3d used in geometrics: 388 | operator Vec3d() const { 389 | return Vec3d(permute4<1,2,3,0>(y)); // rotate elements 390 | } 391 | #endif // VECTOR3D_H 392 | }; 393 | 394 | 395 | 
/***************************************************************************** 396 | * 397 | * Operators for Quaternion1d 398 | * 399 | *****************************************************************************/ 400 | 401 | // operator + : add 402 | static inline Quaternion1d operator + (Quaternion1d const a, Quaternion1d const b) { 403 | return Quaternion1d(a.to_vector() + b.to_vector()); 404 | } 405 | 406 | // operator += : add 407 | static inline Quaternion1d & operator += (Quaternion1d & a, Quaternion1d const b) { 408 | a = a + b; 409 | return a; 410 | } 411 | 412 | // operator - : subtract 413 | static inline Quaternion1d operator - (Quaternion1d const a, Quaternion1d const b) { 414 | return Quaternion1d(a.to_vector() - b.to_vector()); 415 | } 416 | 417 | // operator - : unary minus 418 | static inline Quaternion1d operator - (Quaternion1d const a) { 419 | return Quaternion1d(- a.to_vector()); 420 | } 421 | 422 | // operator -= : subtract 423 | static inline Quaternion1d & operator -= (Quaternion1d & a, Quaternion1d const b) { 424 | a = a - b; 425 | return a; 426 | } 427 | 428 | // operator * : quaternion multiply 429 | static inline Quaternion1d operator * (Quaternion1d const a, Quaternion1d const b) { 430 | Vec4d a1123 = permute4<1,1,2,3>(a.to_vector()); 431 | Vec4d a2231 = permute4<2,2,3,1>(a.to_vector()); 432 | Vec4d b1000 = permute4<1,0,0,0>(b.to_vector()); 433 | Vec4d b2312 = permute4<2,3,1,2>(b.to_vector()); 434 | Vec4d t1 = a1123 * b1000; 435 | Vec4d t2 = a2231 * b2312; 436 | Vec4d t12 = t1 + t2; 437 | Vec4d t12m = change_sign<1,0,0,0>(t12); 438 | Vec4d a3312 = permute4<3,3,1,2>(a.to_vector()); 439 | Vec4d b3231 = permute4<3,2,3,1>(b.to_vector()); 440 | Vec4d a0000 = permute4<0,0,0,0>(a.to_vector()); 441 | Vec4d t3 = a3312 * b3231; 442 | Vec4d t0 = a0000 * b.to_vector(); 443 | Vec4d t03 = t0 - t3; 444 | return t03 + t12m; 445 | } 446 | 447 | // operator *= : multiply 448 | static inline Quaternion1d & operator *= (Quaternion1d & a, Quaternion1d 
const b) { 449 | a = a * b; 450 | return a; 451 | } 452 | 453 | // operator ~ : complex conjugate 454 | // ~(a + b*i + c*j + d*k) = (a - b*i - c*j - d*k) 455 | static inline Quaternion1d operator ~ (Quaternion1d const a) { 456 | return Quaternion1d(change_sign<0,1,1,1>(a.to_vector())); 457 | } 458 | 459 | // function reciprocal: multiplicative inverse 460 | static inline Quaternion1d reciprocal (Quaternion1d const a) { 461 | Vec4d sq = a.to_vector() * a.to_vector(); 462 | double nsq = horizontal_add(sq); 463 | return Quaternion1d((~a).to_vector() / Vec4d(nsq)); 464 | } 465 | 466 | // operator / : quaternion divide is defined as 467 | // a / b = a * reciprocal(b) 468 | static inline Quaternion1d operator / (Quaternion1d const a, Quaternion1d const b) { 469 | return a * reciprocal(b); 470 | } 471 | 472 | // operator /= : divide 473 | static inline Quaternion1d & operator /= (Quaternion1d & a, Quaternion1d const b) { 474 | a = a / b; 475 | return a; 476 | } 477 | 478 | // operator == : returns true if a == b 479 | static inline bool operator == (Quaternion1d const a, Quaternion1d const b) { 480 | Vec4db t1 = a.to_vector() == b.to_vector(); 481 | return horizontal_and(t1); 482 | } 483 | 484 | // operator != : returns true if a != b 485 | static inline bool operator != (Quaternion1d const a, Quaternion1d const b) { 486 | Vec4db t1 = a.to_vector() != b.to_vector(); 487 | return horizontal_or(t1); 488 | } 489 | 490 | 491 | /***************************************************************************** 492 | * 493 | * Operators mixing Quaternion1d and double 494 | * 495 | *****************************************************************************/ 496 | 497 | // operator + : add 498 | static inline Quaternion1d operator + (Quaternion1d const a, double b) { 499 | return a + Quaternion1d(b); 500 | } 501 | 502 | static inline Quaternion1d operator + (double a, Quaternion1d const b) { 503 | return b + a; 504 | } 505 | 506 | static inline Quaternion1d & operator += 
(Quaternion1d & a, double & b) { 507 | a = a + b; 508 | return a; 509 | } 510 | 511 | // operator - : subtract 512 | static inline Quaternion1d operator - (Quaternion1d const a, double b) { 513 | return a - Quaternion1d(b); 514 | } 515 | 516 | static inline Quaternion1d operator - (double a, Quaternion1d const b) { 517 | return Quaternion1d(a) - b; 518 | } 519 | 520 | static inline Quaternion1d & operator -= (Quaternion1d & a, double & b) { 521 | a = a - b; 522 | return a; 523 | } 524 | 525 | // operator * : multiply 526 | static inline Quaternion1d operator * (Quaternion1d const a, double b) { 527 | return Quaternion1d(a.to_vector() * b); 528 | } 529 | 530 | static inline Quaternion1d operator * (double a, Quaternion1d const b) { 531 | return b * a; 532 | } 533 | 534 | static inline Quaternion1d & operator *= (Quaternion1d & a, double & b) { 535 | a = a * b; 536 | return a; 537 | } 538 | 539 | // operator / : divide 540 | static inline Quaternion1d operator / (Quaternion1d const a, double b) { 541 | return Quaternion1d(a.to_vector() / Vec4d(b)); 542 | } 543 | 544 | static inline Quaternion1d operator / (double a, Quaternion1d const b) { 545 | return reciprocal(b) * a; 546 | } 547 | 548 | static inline Quaternion1d & operator /= (Quaternion1d & a, double b) { 549 | a = a / b; 550 | return a; 551 | } 552 | 553 | 554 | /***************************************************************************** 555 | * 556 | * Functions for Quaternion1d 557 | * 558 | *****************************************************************************/ 559 | 560 | // function abs: calculate the norm 561 | // abs(a + b*i + c*j + d*k) = sqrt(a*a + b*B + c*c + d*d) 562 | static inline double abs(Quaternion1d const a) { 563 | Vec4d sq = a.to_vector() * a.to_vector(); 564 | double nsq = horizontal_add(sq); 565 | return std::sqrt(nsq); 566 | } 567 | 568 | // function select 569 | static inline Quaternion1d select (bool s, Quaternion1d const a, Quaternion1d const b) { 570 | return Quaternion1d(s 
? a : b); 571 | } 572 | 573 | 574 | #ifdef VCL_NAMESPACE 575 | } 576 | #endif 577 | 578 | #endif // QUATERNION_H 579 | -------------------------------------------------------------------------------- /decimal/testbench_decimal.cpp: -------------------------------------------------------------------------------- 1 | /************************* testbench_decimal.cpp ************************** 2 | * Author: Agner Fog 3 | * Date created: 2019-07-14 4 | * Last modified: 2019-07-14 5 | * Version: 2.00 6 | * Project: Vector class library add-on package 'decimal' 7 | * Description: Testbench for decimal.cpp using vector class library 8 | * Compile and run this program to test functions in decimal.h package 9 | * 10 | * Instructions: 11 | * The following parameters must be defined on the command line or added in the 12 | * top of this file: 13 | * 14 | * testcase: A number defining a function or operator to test. See the cases in this file. 15 | * 16 | * Compile with any compiler supported by VCL. 17 | * Specify the desired instruction set and optimization options as parameters 18 | * to the compiler. 19 | * 20 | * (c) Copyright 2019 Agner Fog. 
21 | * Apache license 2.0 22 | ****************************************************************************** 23 | 24 | Test cases: 25 | 1: bin2ascii 26 | 2: ascii2bin 27 | 28 | *****************************************************************************/ 29 | 30 | #include 31 | #include 32 | 33 | #include 34 | 35 | #define MAX_VECTOR_SIZE 512 36 | #ifndef INSTRSET 37 | #define INSTRSET 8 38 | #endif 39 | 40 | //#define __AVX512VBMI2__ 41 | 42 | #include 43 | #include "decimal.cpp" 44 | 45 | 46 | // --------------------------------------------------------------------------- 47 | // Specify input parameters here if running from an IDE 48 | // ---------------------------------------------------------------------------- 49 | 50 | #ifndef testcase 51 | 52 | #define testcase 1 53 | 54 | #endif // testcase 55 | 56 | // ---------------------------------------------------------------------------- 57 | // Declarations 58 | // ---------------------------------------------------------------------------- 59 | int globalError = 0; // any error indicated in program return 60 | 61 | 62 | /************************************************************************ 63 | * 64 | * Test cases 65 | * 66 | ************************************************************************/ 67 | 68 | #if testcase == 1 // test bin2ascii 69 | 70 | // check results of bin2ascii 71 | void checkb2a (int len, const char * res, const char * expected) { 72 | int slen = (int)strlen(res); 73 | if (strcmp(res, expected) != 0) { 74 | printf("\nbin2ascii error. Result:\n >%s<\nExpected:\n >%s<", 75 | res, expected); 76 | globalError++; 77 | } 78 | else if (len != slen) { 79 | printf("\nbin2ascii length error. 
Actual length: %i, Reported length: %i\n (%s)", 80 | slen, len, res); 81 | globalError++; 82 | } 83 | } 84 | 85 | 86 | int main() { 87 | char text[1024]; 88 | int r = 0; 89 | 90 | #if 0 // debugging 91 | Vec4i a0 (-87654321,-200000,3000000,40000000); 92 | r = bin2ascii(a0, text, 10, 4, '*', ',', true, true); 93 | printf("\nr=%i, \ntext=%s", r, text); 94 | 95 | #else 96 | 97 | //static int bin2ascii ( 98 | // Vec4i const & a, char * string, int fieldlen = 8, int numdat = 4, bool signd = true, char ovfl = '*', char separator = ',', bool term = true) { 99 | Vec4i a1 (101,- 202,30303,-4040404); 100 | r = bin2ascii(a1, text, 10, 4, '*', ',', true, true); 101 | checkb2a(43, text, " 101, -202, 30303, -4040404"); 102 | r = bin2ascii(a1, text, 5, 4, 0, ';', true, true); 103 | checkb2a(26, text, " 101; -202;30303;-4040404"); 104 | r = bin2ascii(a1, text, 5, 4, '*', ',', true, true); 105 | checkb2a(23, text, " 101, -202,30303,*****"); 106 | 107 | Vec4i a2 (101,-20202,-30303030,404040404); 108 | r = bin2ascii(a2, text, 10, 4, '*', ',', true, true); 109 | checkb2a(43, text, " 101, -20202, -30303030, 404040404"); 110 | r = bin2ascii(a2, text, 9, 4, '*', ',', true, true); 111 | checkb2a(39, text, " 101, -20202,-30303030,404040404"); 112 | r = bin2ascii(a2, text, 8, 4, '*', ',', true, true); 113 | checkb2a(35, text, " 101, -20202,********,********"); 114 | r = bin2ascii(a2, text, 6, 4, '*', ',', true, true); 115 | checkb2a(27, text, " 101,-20202,******,******"); 116 | 117 | Vec4i a3 (-1,-100,10000,-10000); 118 | r = bin2ascii(a3, text, 6, 4, '*', ',', true, true); 119 | checkb2a(27, text, " -1, -100, 10000,-10000"); 120 | r = bin2ascii(a3, text, 2, 4, '*', ',', true, true); 121 | checkb2a(11, text, "-1,**,**,**"); 122 | r = bin2ascii(a3, text, 1, 4, '*', ',', true, true); 123 | checkb2a(7, text, "*,*,*,*"); 124 | r = bin2ascii(a3, text, 0, 4, '*', ',', true, true); 125 | checkb2a(0, text, ""); 126 | r = bin2ascii(a3, text, 5, 3, '*', ',', true, true); 127 | checkb2a(17, text, " 
-1, -100,10000"); 128 | r = bin2ascii(a3, text, 5, 3, '*', 0, true, true); 129 | checkb2a(15, text, " -1 -10010000"); 130 | r = bin2ascii(a3, text, 1, 3, 0, ',', true, true); 131 | checkb2a(13, text, "-1,-100,10000"); 132 | r = bin2ascii(a3, text, 1, 2, 0, ',', true, true); 133 | checkb2a(7, text, "-1,-100"); 134 | 135 | Vec4i a4 (-100000,-1000000,10000000,-100000000); 136 | r = bin2ascii(a4, text, 6, 4, 0, ',', true, true); 137 | checkb2a(36, text, "-100000,-1000000,10000000,-100000000"); 138 | r = bin2ascii(a4, text, 7, 4, '*', ',', true, true); 139 | checkb2a(31, text, "-100000,*******,*******,*******"); 140 | r = bin2ascii(a4, text, 7, 4, '*', ',', false, true); 141 | checkb2a(31, text, "*******,*******,*******,*******"); 142 | r = bin2ascii(a4, text, 7, 4, 0, ',', false, true); 143 | checkb2a(41, text, "4294867296,4293967296,10000000,4194967296"); 144 | 145 | Vec4i a5 (10000000,1000000000,2000000000,3000000000u); 146 | r = bin2ascii(a5, text, 8, 4, 0, ',', true, true); 147 | checkb2a(42, text, "10000000,1000000000,2000000000,-1294967296"); 148 | r = bin2ascii(a5, text, 8, 4, 0, ',', false, true); 149 | checkb2a(41, text, "10000000,1000000000,2000000000,3000000000"); 150 | 151 | Vec4i a6 (1,2,3,4); 152 | r = bin2ascii(a6, text, 2, 4, '*', ',', true, false); // no terminator. 
the rest of the previous result remains 153 | checkb2a(41, text, " 1, 2, 3, 400000000,2000000000,3000000000"); 154 | r = bin2ascii(a6, text, 2, 4, 0, ',', true, true); 155 | checkb2a(11, text, " 1, 2, 3, 4"); 156 | r = bin2ascii(a6, text, 2, 4, 0, 0, true, true); 157 | checkb2a(8, text, " 1 2 3 4"); 158 | r = bin2ascii(a6, text, 1, 4, 0, 0, true, true); 159 | checkb2a(4, text, "1234"); 160 | 161 | 162 | //static int bin2ascii ( 163 | // Vec8i const & a, char * string, int fieldlen = 8, int numdat = 8, bool signd = true, char ovfl = '*', char separator = ',', bool term = true) { 164 | Vec8i b1 (1,-22,333,-4321,55555,-666,7,8000); 165 | r = bin2ascii(b1, text, 10, 8, '*', ',', true, true); 166 | checkb2a(87, text, " 1, -22, 333, -4321, 55555, -666, 7, 8000"); 167 | r = bin2ascii(b1, text, 5, 8, 0, ';', true, true); 168 | checkb2a(47, text, " 1; -22; 333;-4321;55555; -666; 7; 8000"); 169 | r = bin2ascii(b1, text, 4, 8, '*', '|', true, true); 170 | checkb2a(39, text, " 1| -22| 333|****|****|-666| 7|8000"); 171 | r = bin2ascii(b1, text, 6, 7, '*', ',', true, true); 172 | checkb2a(48, text, " 1, -22, 333, -4321, 55555, -666, 7"); 173 | r = bin2ascii(b1, text, 6, 6, '*', ',', true, true); 174 | checkb2a(41, text, " 1, -22, 333, -4321, 55555, -666"); 175 | r = bin2ascii(b1, text, 6, 5, '*', ',', true, true); 176 | checkb2a(34, text, " 1, -22, 333, -4321, 55555"); 177 | r = bin2ascii(b1, text, 6, 4, '*', ',', true, true); 178 | checkb2a(27, text, " 1, -22, 333, -4321"); 179 | r = bin2ascii(b1, text, 6, 3, '*', ',', true, true); 180 | checkb2a(20, text, " 1, -22, 333"); 181 | r = bin2ascii(b1, text, 6, 2, '*', ',', true, true); 182 | checkb2a(13, text, " 1, -22"); 183 | r = bin2ascii(b1, text, 6, 1, '*', ',', true, true); 184 | checkb2a(6, text, " 1"); 185 | r = bin2ascii(b1, text, 6, 0, '*', ',', true, true); 186 | checkb2a(0, text, ""); 187 | 188 | Vec8i b2 (1,-20,300,4000,50000,654321,7000000,87654321); 189 | r = bin2ascii(b2, text, 10, 8, '*', ',', true, true); 190 | 
checkb2a(87, text, " 1, -20, 300, 4000, 50000, 654321, 7000000, 87654321"); 191 | r = bin2ascii(b2, text, 9, 8, '*', ',', true, true); 192 | checkb2a(79, text, " 1, -20, 300, 4000, 50000, 654321, 7000000, 87654321"); 193 | r = bin2ascii(b2, text, 8, 8, '*', ',', true, true); 194 | checkb2a(71, text, " 1, -20, 300, 4000, 50000, 654321, 7000000,87654321"); 195 | r = bin2ascii(b2, text, 7, 8, '*', ',', true, true); 196 | checkb2a(63, text, " 1, -20, 300, 4000, 50000, 654321,7000000,*******"); 197 | r = bin2ascii(b2, text, 6, 8, '*', ',', true, true); 198 | checkb2a(55, text, " 1, -20, 300, 4000, 50000,654321,******,******"); 199 | r = bin2ascii(b2, text, 5, 8, '*', ',', true, true); 200 | checkb2a(47, text, " 1, -20, 300, 4000,50000,*****,*****,*****"); 201 | r = bin2ascii(b2, text, 4, 8, '*', ',', true, true); 202 | checkb2a(39, text, " 1, -20, 300,4000,****,****,****,****"); 203 | r = bin2ascii(b2, text, 3, 8, '*', ',', true, true); 204 | checkb2a(31, text, " 1,-20,300,***,***,***,***,***"); 205 | r = bin2ascii(b2, text, 2, 8, '*', ',', true, true); 206 | checkb2a(23, text, " 1,**,**,**,**,**,**,**"); 207 | r = bin2ascii(b2, text, 1, 8, '*', ',', true, true); 208 | checkb2a(15, text, "1,*,*,*,*,*,*,*"); 209 | r = bin2ascii(b2, text, 0, 8, '*', ',', true, true); 210 | checkb2a(0, text, ""); 211 | 212 | r = bin2ascii(b2, text, 9, 8, '*', 0, true, true); 213 | checkb2a(72, text, " 1 -20 300 4000 50000 654321 7000000 87654321"); 214 | r = bin2ascii(b2, text, 8, 8, '*', 0, true, true); 215 | checkb2a(64, text, " 1 -20 300 4000 50000 654321 700000087654321"); 216 | r = bin2ascii(b2, text, 7, 8, '*', 0, true, true); 217 | checkb2a(56, text, " 1 -20 300 4000 50000 6543217000000*******"); 218 | r = bin2ascii(b2, text, 6, 8, '*', 0, true, true); 219 | checkb2a(48, text, " 1 -20 300 4000 50000654321************"); 220 | r = bin2ascii(b2, text, 5, 8, '*', 0, true, true); 221 | checkb2a(40, text, " 1 -20 300 400050000***************"); 222 | r = bin2ascii(b2, text, 4, 8, '*', 0, 
true, true); 223 | checkb2a(32, text, " 1 -20 3004000****************"); 224 | r = bin2ascii(b2, text, 3, 8, '*', 0, true, true); 225 | checkb2a(24, text, " 1-20300***************"); 226 | r = bin2ascii(b2, text, 2, 8, '*', 0, true, true); 227 | checkb2a(16, text, " 1**************"); 228 | r = bin2ascii(b2, text, 1, 8, '*', 0, true, true); 229 | checkb2a(8, text, "1*******"); 230 | r = bin2ascii(b2, text, 0, 8, '*', 0, true, true); 231 | checkb2a(0, text, ""); 232 | 233 | // fields too long 234 | Vec8i b3 (1000,-200000,3000000,40000000,205050505,3060606060u,-432100000,-87654321); 235 | r = bin2ascii(b3, text, 8, 8, '*', ',', true, true); 236 | checkb2a(71, text, " 1000, -200000, 3000000,40000000,********,********,********,********"); 237 | r = bin2ascii(b3, text, 8, 8, 0, ',', true, true); 238 | checkb2a(78, text, " 1000, -200000, 3000000,40000000,205050505,-1234361236,-432100000,-87654321"); 239 | r = bin2ascii(b3, text, 9, 8, '*', ',', true, true); 240 | checkb2a(79, text, " 1000, -200000, 3000000, 40000000,205050505,*********,*********,-87654321"); 241 | r = bin2ascii(b3, text, 10, 8, '*', ',', true, true); 242 | checkb2a(87, text, " 1000, -200000, 3000000, 40000000, 205050505,**********,-432100000, -87654321"); 243 | r = bin2ascii(b3, text, 11, 8, '*', ',', true, true); 244 | checkb2a(95, text, " 1000, -200000, 3000000, 40000000, 205050505,-1234361236, -432100000, -87654321"); 245 | r = bin2ascii(b3, text, 12, 8, '*', ',', true, true); 246 | checkb2a(103, text, " 1000, -200000, 3000000, 40000000, 205050505, -1234361236, -432100000, -87654321"); 247 | r = bin2ascii(b3, text, 10, 8, '*', ',', false, true); 248 | checkb2a(87, text, " 1000,4294767296, 3000000, 40000000, 205050505,3060606060,3862867296,4207312975"); 249 | r = bin2ascii(b3, text, 5, 1, '*', ',', true, false); // no terminator. 
overwrite previous string 250 | checkb2a(87, text, " 1000 1000,4294767296, 3000000, 40000000, 205050505,3060606060,3862867296,4207312975"); 251 | 252 | if (!globalError) printf("\nsuccess\n"); 253 | #endif 254 | return globalError; 255 | } 256 | 257 | 258 | #else // test ascii2bin 259 | 260 | // check results of ascii2bin: compares the result vector, the length and the error code with their expected values, prints a message for every mismatch and records it in globalError (1 = data, 2 = length, 4 = error code) 261 | void checka2b (Vec8i res, Vec8i expected, int length, int lengthExp, int err, int errExp) { 262 | bool dataerr = horizontal_or(res != expected); 263 | bool lengtherr = length != lengthExp; 264 | bool errorerr = err != errExp; 265 | 266 | if (dataerr || lengtherr || errorerr) { 267 | printf("\nascii2bin error:"); 268 | } 269 | if (dataerr) { 270 | globalError |= 1; 271 | printf("\n data error:\n found: expected:"); 272 | for (int i = 0; i < res.size(); i++) { 273 | printf("\n%10i %10i", res[i], expected[i]); 274 | } 275 | } 276 | if (lengtherr) { 277 | globalError |= 2; 278 | printf("\n length error: found: %i, expected: %i", length, lengthExp); 279 | } 280 | if (errorerr) { 281 | globalError |= 4; 282 | printf("\n error code: found: 0x%X, expected: 0x%X", err, errExp); 283 | } 284 | if ((lengtherr || errorerr) && !dataerr) { // print numbers to help identify the case 285 | printf("\n("); 286 | for (int i=0; i < res.size(); i++) { 287 | printf("%i ", res[i]); 288 | } 289 | printf(")\n"); 290 | } 291 | } 292 | // test program for ascii2bin: passes the result of each ascii2bin call to checka2b, prints success if globalError is still zero at the end, and returns globalError 293 | int main() { 294 | 295 | int error = 0; 296 | int n_read = 0; 297 | Vec8i dat; 298 | 299 | #if 0 // for debugging only: 300 | // 10 20 30 40 50 60 301 | // v v v v v v 302 | // 01234567890123456789012345678901234567890123456789012345678901234567890 303 | char num0[] = " 1, +2 ,-3, -4321,, 007777, 88888,98765432"; 304 | 305 | dat = ascii2bin(num0, &n_read, &error, 1000, 7, ','); 306 | checka2b (dat, Vec8i(1,2,3,-4321,0,7777,88888,98765432), n_read, 41, error, 0); 307 | 308 | printf ("\nnread %i, error 0x%X\n", n_read, error); 309 | for (int i=0; i<8; i++) printf("%i ", dat[i]); 310 | return 1; 311 | 312 | #endif 313 | char num1[] = 
" 1, +21 , 321, -4321, 55, 7777, 88888,98765432"; 314 | dat = ascii2bin(num1, &n_read, &error, 64, 8, ','); 315 | checka2b (dat, Vec8i(1,21,321,-4321,55,7777,88888,98765432),n_read, 47, error, 0); 316 | 317 | // test no numbers 318 | dat = ascii2bin(num1, &n_read, &error, 64, 0, ','); 319 | checka2b (dat, Vec8i(0), n_read, 0, error, 0); 320 | // test fewer numbers 321 | dat = ascii2bin(num1, &n_read, &error, 64, 3, ','); 322 | checka2b (dat, Vec8i(1,21,321,0,0,0,0,0), n_read, 15, error, 0); 323 | // test fewer numbers 324 | dat = ascii2bin(num1, &n_read, &error, 64, 7, ','); 325 | checka2b (dat, Vec8i(1,21,321,-4321,55,7777,88888,0), n_read, 39, error, 0); 326 | 327 | // test short string 328 | dat = ascii2bin(num1, &n_read, &error, 40, 7, ','); 329 | checka2b (dat, Vec8i(1,21,321,-4321,55,7777,88888,0), n_read, 39, error, 0); 330 | // test short string 331 | dat = ascii2bin(num1, &n_read, &error, 26, 7, ','); 332 | checka2b (dat, Vec8i(1,21,321,-4321,55,0,0,0), n_read, 26, error, 8); 333 | 334 | // test string 64 bytes long 335 | char num2[] = "1 , +22 , 300, - 4444, 55555, 666666, 7777777,88888888"; 336 | dat = ascii2bin(num2, &n_read, &error, 64, 8, ','); 337 | checka2b (dat, Vec8i(1,22,300,-4444,55555,666666,7777777,88888888),n_read, 64, error, 0); 338 | 339 | // test string > 64 bytes long 340 | char num3[] = "1 , +22 , 300, - 4444, 55555, 666666, 7777777,88888888,999,101010,111111"; 341 | dat = ascii2bin(num3, &n_read, &error, 100, 8, ','); 342 | checka2b (dat, Vec8i(1,22,300,-4444,55555,666666,7777777,88888888), n_read, 63, error, 0); 343 | 344 | // test missing numbers 345 | char num4[] = ",- 321,+,-9876543"; 346 | dat = ascii2bin(num4, &n_read, &error, 17, 8, ','); 347 | checka2b (dat, Vec8i(0,-321,0,-9876543,0,0,0,0), n_read, 17, error, 8); 348 | // test unfinished numbers 349 | dat = ascii2bin(num4, &n_read, &error, 14, 8, ','); 350 | checka2b (dat, Vec8i(0,-321,0,-9876,0,0,0,0), n_read, 14, error, 8); 351 | dat = ascii2bin(num4, &n_read, &error, 10, 8, 
','); 352 | checka2b (dat, Vec8i(0,-321,0,0,0,0,0,0), n_read, 10, error, 8); 353 | 354 | // test misplaced character and illegal character 355 | char num5[] = "111 , -222 , 333-, 444., 555E6, 6666"; 356 | dat = ascii2bin(num5, &n_read, &error, 37, 6, ','); 357 | checka2b (dat, Vec8i(111,-222,0,444,0,6666,0,0), n_read, 37, error, 2+4); 358 | 359 | dat = ascii2bin(num5, &n_read, &error, 80, 4, ','); 360 | checka2b (dat, Vec8i(111,-222,0,444,0,0,0,0), n_read, 25, error, 2+4); 361 | 362 | // test field too long 363 | char num6[] = "111 ,1234567890, -1234567890 , 4444, 55555, 666666"; 364 | dat = ascii2bin(num6, &n_read, &error, 51, 6, ','); 365 | checka2b (dat, Vec8i(111,1234567890,-1234567890,4444,55555,666666,0,0), n_read, 51, error, 0); 366 | 367 | // test overflow 368 | char num7[] = "111 ,12345678901, -1234567890 , 4444, 55555, 666666"; 369 | dat = ascii2bin(num7, &n_read, &error, 64, 6, ','); 370 | checka2b (dat, Vec8i(111,2147483647,-1234567890,4444,55555,666666,0,0), n_read, 52, error, 16); 371 | 372 | // test chain: the second call starts at the offset n_read reported by the first call 373 | char num8[] = "-111, 222 , -333 , +4444, -55555,+ 666666, -777, 888, -999, 1010, -1111"; 374 | dat = ascii2bin(num8, &n_read, &error, 53, 8, ','); 375 | checka2b (dat, Vec8i(-111,222,-333,4444,-55555,666666,-777,888), n_read, 53, error, 0); 376 | dat = ascii2bin(num8 + n_read, &n_read, &error, 64, 3, ','); 377 | checka2b (dat, Vec8i(-999, 1010, -1111,0,0,0,0,0), n_read, 18, error, 0); 378 | 379 | // test garbage after string. 
multiple signs, tab as separator 380 | char num9[] = "111\t+-2\t---3\t4444\t55555\t666666\t-7\t8\t 1.2E3\ttext\t'''\t\t%&/()"; 381 | dat = ascii2bin(num9, &n_read, &error, 64, 8, '\t'); 382 | checka2b (dat, Vec8i(111,0,0,4444,55555,666666,-7,8), n_read, 36, error, 4); 383 | 384 | // test newline as end of string 385 | char num9a[] = "111,+2,-3,4444,55555,666666,-7\n8, 1.2E3"; 386 | dat = ascii2bin(num9a, &n_read, &error, 64, 8, ','); 387 | checka2b (dat, Vec8i(111,2,-3,4444,55555,666666,-7,0), n_read, 30, error, 8); 388 | 389 | // test error in first field 390 | char num10[]= "1 1 1,22,333,4444,55555,666666,7777777,"; 391 | dat = ascii2bin(num10, &n_read, &error, 64, 8, ','); 392 | checka2b (dat, Vec8i(0,22,333,4444,55555,666666,7777777,0), n_read, 39, error, 4+8); 393 | 394 | char num11[]= "+-+-0,22,333,4444,55555,666666,7777777,"; 395 | dat = ascii2bin(num11, &n_read, &error, 64, 8, ','); 396 | checka2b (dat, Vec8i(0,22,333,4444,55555,666666,7777777,0), n_read, 39, error, 4+8); 397 | 398 | // test error in last field 399 | char num12[]= "1 ,22,333,4444,55555,666666,7777777,+-8"; 400 | dat = ascii2bin(num12, &n_read, &error, 64, 8, ','); 401 | checka2b (dat, Vec8i(1,22,333,4444,55555,666666,7777777,0), n_read, 39, error, 4); 402 | 403 | char num13[]= "1 ,22,333,4444,55555,666666,7777777,8.8"; 404 | dat = ascii2bin(num13, &n_read, &error, 64, 8, ','); 405 | checka2b (dat, Vec8i(1,22,333,4444,55555,666666,7777777,0), n_read, 39, error, 2+4); 406 | 407 | char num14[]= "1 ,22,333,4444,55555,666666,7777777,...garbage 1 more garbage "; 408 | dat = ascii2bin(num14, &n_read, &error, 64, 8, ','); 409 | checka2b (dat, Vec8i(1,22,333,4444,55555,666666,7777777,1), n_read, 48, error, 2); 410 | 411 | if (!globalError) printf("\nsuccess\n"); 412 | return globalError; 413 | } 414 | 415 | #endif 416 | 417 | -------------------------------------------------------------------------------- /random/ranvec1_manual.tex: 
-------------------------------------------------------------------------------- 1 | \documentclass[11pt,a4paper,oneside,openright]{report} 2 | 3 | \usepackage[bindingoffset=5mm,left=20mm,right=20mm,top=20mm,bottom=20mm,footskip=10mm]{geometry} 4 | \usepackage[utf8x]{inputenc} 5 | \usepackage{hyperref} 6 | \usepackage[english]{babel} 7 | \usepackage{listings} 8 | \usepackage{subfiles} 9 | \usepackage{longtable} 10 | \usepackage{multirow} 11 | \usepackage{ragged2e} 12 | \usepackage{cmap} % avoid fi ligatures in pdf file 13 | \usepackage{amsthm} % example numbering 14 | \usepackage{color} 15 | %\usepackage{bold-extra} % for bold tt font. Remember to include bold-extra.sty file 16 | \usepackage{graphicx} 17 | \usepackage[yyyymmdd]{datetime} 18 | \usepackage{float} 19 | 20 | % style for code listing 21 | \renewcommand{\familydefault}{\sfdefault} 22 | \renewcommand{\ttdefault}{pcr} % selects Courier font 23 | \newtheorem{example}{Example}[chapter] % example numbering 24 | \lstset{language=C} % formatting for code listing 25 | \lstset{basicstyle=\ttfamily,breaklines=true} 26 | \definecolor{darkGreen}{rgb}{0,0.4,0} 27 | \definecolor{mybrown}{rgb}{0.40,0.10,0.05} 28 | \lstset{commentstyle=\color{darkGreen}} % comments color 29 | \lstset{keywordstyle=\color{blue}} % keyword color 30 | \lstset{stringstyle=\color{mybrown}} % string color 31 | \lstset{showstringspaces=false} % don't mark spaces in strings 32 | 33 | \renewcommand{\dateseparator}{-} 34 | 35 | % command for turning indent back on after \flushleft 36 | \newcommand{\indenton}{\RaggedRight\parindent=15pt} 37 | 38 | % command for vertical space 39 | \newcommand{\vspacesmall}{\vspace{3mm}} 40 | \newcommand{\vspacebig}{\vspace{6mm}} 41 | 42 | % style for code inlined in text: 43 | \newcommand{\codei}[1]{\bfseries \ttfamily{#1}\normalfont} 44 | 45 | 46 | \begin{document} 47 | 48 | \begin{titlepage} 49 | \centering 50 | 51 | \null %empty box needed for vfill to work 52 | \vfill 53 | 54 | {\bfseries\Huge 55 | Ranvec1 56 | 
\vspacesmall 57 | 58 | Random number generator for 59 | \vspacesmall 60 | 61 | C++ vector class library 62 | \vspacebig 63 | 64 | } 65 | \vspacebig 66 | 67 | {\Large 68 | Agner Fog 69 | \vspacebig 70 | 71 | \copyright\ \today. Apache license 2.0 72 | } 73 | 74 | \vfill 75 | 76 | \includegraphics[width=306pt]{freesoftwarelogo.jpg} 77 | \vfill 78 | 79 | \end{titlepage} 80 | 81 | \RaggedRight 82 | 83 | \chapter{Introduction}\label{chap:Introduction} 84 | 85 | Ranvec1 is an efficient high quality pseudo random number generator designed for large vector applications and multi-threaded applications in C++ language. 86 | \vspacesmall 87 | 88 | This generator has been developed based on the following design goals: 89 | 90 | \begin{itemize} 91 | 92 | \item Good randomness, as determined by both theoretical and experimental criteria. 93 | 94 | \item Suitable for vector processors and vector instructions (SIMD). 95 | 96 | \item Suitable for large multi-threaded applications without risk of overlapping subsequences. 97 | 98 | \item Fast generation of large amounts of random numbers. 99 | 100 | \end{itemize} 101 | 102 | This random number generator is designed for large Monte Carlo simulations and Monte Carlo integration. 103 | It may be useful for cryptographic applications as well, but cryptographic safety has not been a decisive design goal. It will be useful for game applications as well. 104 | \vspacesmall 105 | 106 | A physical random number generator function is included for the purpose of generating a truly random seed for initializing the pseudo random number generator. 107 | \vspacesmall 108 | 109 | The code is based on the Vector Class Library, using the x86 or x86-64 instruction set with extensions from SSE2 to AVX512. See the Vector Class Library manual for choice of compiler and compilation options. On Gnu and Clang compilers you need to specify the additional options \textbf{-mrdrnd -mrdseed} in order to enable the physical random number generator instructions. 
110 | 111 | 112 | 113 | \chapter{Instructions}\label{chap:Instructions} 114 | \flushleft 115 | The files ranvec1.h and ranvec1.cpp define a high quality pseudo-random number generator with vector output. This generator is useful for producing random numbers for simulation and other Monte Carlo applications. Add the file ranvec1.cpp to your project and compile for the appropriate instruction set. This example shows a simple use of the random number generator: 116 | 117 | \begin{example} 118 | \label{exampleRandomGeneration} 119 | \end{example} % frame disappears if I put this after end lstlisting 120 | \begin{lstlisting}[frame=single] 121 | // Example for random number generator 122 | // Remember to link ranvec1.cpp into the project 123 | 124 | #include <stdio.h> 125 | #include "vectorclass.h" 126 | #include "ranvec1.h" 127 | 128 | int main() { 129 | // Arbitrary seed 130 | int seed = 1; 131 | // Create an instance of Ranvec1 and set the type to 3 132 | Ranvec1 ran(3); 133 | // Initialize with the seed 134 | ran.init(seed); 135 | // Generate a vector of 8 random integers below 100 136 | Vec8i ri = ran.random8i(0,99); 137 | // Generate a vector of 8 random floats 138 | Vec8f rf = ran.random8f(); 139 | int i; 140 | // Output the 8 random integers 141 | printf("\nRandom integers in interval 0 - 99\n"); 142 | for (i=0; i < ri.size(); i++) printf("%3i ", ri[i]); 143 | 144 | // Output the 8 random floats 145 | printf("\nRandom floats in interval 0 - 1\n"); 146 | for (i=0; i < rf.size(); i++) printf("%7.4f ", rf[i]); 147 | printf("\n"); 148 | return 0; 149 | } 150 | \end{lstlisting} 151 | \vspacesmall 152 | 153 | The optional parameter for the constructor of the class Ranvec1 defines the type of random number generator to use: 154 | \vspacesmall 155 | 156 | \begin{tabular}{|p{30mm}|p{120mm}|} 157 | \hline 158 | \bfseries Parameter for\newline constructor & \bfseries Generator type \\ \hline 159 | 1 & MWC. Multiply-With-Carry Generator. 
Use this for small applications where speed is important. \newline 160 | (cycle length \textgreater{} $4 \cdot 10^{19}$) \\ \hline 161 | 162 | 2 & MTGP. A variant of Mersenne Twister. Use this for applications with multiple threads. \newline 163 | (cycle length \textgreater{} $10^{3375}$) \\ \hline 164 | 165 | 3 & MWC + MTGP combined. Use this for the best possible randomness and for large applications with many threads. \newline 166 | (cycle length \textgreater{} $10^{3395}$) \\ \hline 167 | \end{tabular} 168 | \vspacesmall 169 | 170 | It is necessary to initialize the random number generator with a seed, using either the function \codei{init} or \codei{initByArray}. The generator will produce only zeroes if it has not been initialized with any of the init functions. 171 | \vspacesmall 172 | 173 | The random number sequence depends on the seed. A different seed will produce a different sequence of random numbers. You can reproduce a random number sequence exactly after initializing again with the same seed. You may use simple values like 1, 2, 3, ... for seeds in a series of simulations if you want to be able to reproduce the results later. If you want a non-reproducible sequence then you need a seed from a source of genuine randomness. The function \codei{physicalSeed} is useful for this purpose. 174 | \vspacesmall 175 | 176 | The generator can produce vector outputs with different vector sizes. The best performance is obtained when the vector size fits the instruction set: SSE2 or higher for 128 bit vectors. AVX2 or higher for 256 bit vectors. AVX512 or higher for 512 bit vectors. Depending on details of the application, it may or may not be possible to reproduce a simulation result exactly when the vector size is changed. 
177 | \vspacesmall 178 | 179 | The theory of the Ranvec1 package including the different generators, multiprocessing and vector processing is described in the article: 180 | \label{Fog2015TheoryArticle} 181 | 182 | Fog, Agner: “Pseudo-Random Number Generators for Vector Processors and Multicore Processors.” Journal of Modern Applied Statistical Methods, vol. 14, no. 1, 2015, article 23. \url{https://digitalcommons.wayne.edu/jmasm/vol14/iss1/23/} 183 | \vspacebig 184 | 185 | \section{Member functions for class Ranvec1}\label{MemberFunctions} 186 | \vspacesmall 187 | 188 | \begin{tabular}{|p{30mm}|p{100mm}|} 189 | \hline 190 | \bfseries Constructor & Ranvec1(int gtype) \\ \hline 191 | \bfseries Description & Constructor for Ranvec1 class. See the table above for values of the generator type gtype. \\ \hline 192 | \bfseries Efficiency & medium \\ \hline 193 | \end{tabular} 194 | \begin{lstlisting}[frame=none] 195 | // Example: 196 | Ranvec1 ran(3); // Create object ran 197 | \end{lstlisting} 198 | \vspacesmall 199 | 200 | \begin{tabular}{|p{30mm}|p{100mm}|} 201 | \hline 202 | \bfseries Constructor & Ranvec1(int gtype, int seed) \\ \hline 203 | \bfseries Description & Constructor for Ranvec1 class. Initializing with seed. \\ \hline 204 | \bfseries Efficiency & medium \\ \hline 205 | \end{tabular} 206 | \vspacesmall 207 | 208 | \begin{tabular}{|p{30mm}|p{100mm}|} 209 | \hline 210 | \bfseries Member function & void init(int seed) \\ \hline 211 | \bfseries Description & Initialization with one seed. Any value is allowed for seed. Use a different value of seed each time to get a different random number sequence. 
\\ \hline 212 | \bfseries Efficiency & poor \\ \hline 213 | \end{tabular} 214 | \begin{lstlisting}[frame=none] 215 | // Example: 216 | ran.init(0); // Initialize random generator with seed 0 217 | \end{lstlisting} 218 | \vspacesmall 219 | 220 | \begin{tabular}{|p{30mm}|p{100mm}|} 221 | \hline 222 | \bfseries Member function & void init(int seed1, int seed2) \\ \hline 223 | \bfseries Description & Initialization with two seeds. The random number sequence depends on both seeds. If the generator type is 3, then seed1 is used for the MWC generator and seed2 is used for the MTGP generator. The value of seed2 should be different for each thread in multithreaded applications. \\ \hline 224 | \bfseries Efficiency & poor \\ \hline 225 | \end{tabular} 226 | \begin{lstlisting}[frame=none] 227 | // Example: 228 | ran.init(0,1); // Initialize random generator with seeds 0 and 1 229 | \end{lstlisting} 230 | \vspacesmall 231 | 232 | \begin{tabular}{|p{30mm}|p{100mm}|} 233 | \hline 234 | \bfseries Member function & void initByArray(int const seeds[], int numSeeds) \\ \hline 235 | \bfseries Description & Initialization with multiple seeds. The seeds array must contain numSeeds integers. The random number sequence depends on all these integer seeds. This can be useful for security applications in order to make it difficult to guess the seeds. The best security is obtained with generator type 3. 
\\ \hline 236 | \bfseries Efficiency & poor \\ \hline 237 | \end{tabular} 238 | \begin{lstlisting}[frame=none] 239 | // Example: 240 | // Initialize random generator with four seeds 241 | int seeds[4] = {5,8,12,2}; 242 | ran.initByArray(seeds, 4); 243 | \end{lstlisting} 244 | \vspacesmall 245 | 246 | \begin{tabular}{|p{30mm}|p{100mm}|} 247 | \hline 248 | \bfseries Member function & uint32\_t random32b()\newline 249 | uint64\_t random64b() \\ \hline 250 | \bfseries Description & returns an integer of 32 or 64 random bits \\ \hline 251 | \bfseries Efficiency & medium \\ \hline 252 | \end{tabular} 253 | \begin{lstlisting}[frame=none] 254 | // Example: 255 | unsigned int r = ran.random32b(); // generate 32 random bits 256 | \end{lstlisting} 257 | \vspacesmall 258 | 259 | \begin{tabular}{|p{30mm}|p{100mm}|} 260 | \hline 261 | \bfseries Member function & Vec4ui random128b() \newline 262 | Vec8ui random256b() \newline 263 | Vec16ui random512b() \\ \hline 264 | \bfseries Description & Returns an integer vector of 128, 256 or 512 random bits. \\ \hline 265 | \bfseries Efficiency & medium \\ \hline 266 | \end{tabular} 267 | \begin{lstlisting}[frame=none] 268 | // Example: 269 | Vec8ui v = ran.random256b(); // generate 256 random bits 270 | \end{lstlisting} 271 | \vspacesmall 272 | 273 | \begin{tabular}{|p{30mm}|p{100mm}|} 274 | \hline 275 | \bfseries Member function & int random1i(int min, int max) \newline 276 | Vec4i random4i(int min, int max) \newline 277 | Vec8i random8i(int min, int max) \newline 278 | Vec16i random16i(int min, int max) \\ \hline 279 | \bfseries Description & Returns a random integer or a vector of random integers 280 | with uniform distribution in the interval min $\leq$ x $\leq$ max. \newline 281 | (The distribution may be slightly inaccurate when the interval size is large and not a power of 2. See below for a more accurate version.) 
\\ \hline 282 | \bfseries Efficiency & medium \\ \hline 283 | \end{tabular} 284 | \begin{lstlisting}[frame=none] 285 | // Example: 286 | // Generate a random integer in the interval [1,10] 287 | int r = ran.random1i(1, 10); 288 | // Generate eight random integers in the interval [1,10] 289 | Vec8i v = ran.random8i(1, 10); 290 | \end{lstlisting} 291 | \vspacesmall 292 | 293 | 294 | \begin{tabular}{|p{30mm}|p{100mm}|} 295 | \hline 296 | \bfseries Member function & int random1ix(int min, int max) \newline 297 | Vec4i random4ix(int min, int max) \newline 298 | Vec8i random8ix(int min, int max) \newline 299 | Vec16i random16ix(int min, int max) \\ \hline 300 | \bfseries Description & Returns a random integer or a vector of random integers with uniform distribution in the interval min $\leq$ x $\leq$ max. \newline 301 | This is the same as random1i, random4i, random8i, random16i, but exact. \newline 302 | The exact versions of these functions use a rejection method as described in the theory article mentioned above. To reproduce a sequence, the same function with the same vector size must be called. \\ \hline 303 | \bfseries Efficiency & medium \\ \hline 304 | \end{tabular} 305 | \begin{lstlisting}[frame=none] 306 | // Example: 307 | // Generate eight random integers in the interval [1,10] 308 | Vec8i v = ran.random8ix(1, 10); 309 | \end{lstlisting} 310 | \vspacesmall 311 | 312 | \begin{tabular}{|p{30mm}|p{100mm}|} 313 | \hline 314 | \bfseries Member function & float random1f() \\ \hline 315 | \bfseries Description & Returns a random floating point number with uniform distribution in the interval 0 $\leq$ x $<$ 1. The resolution is $2^{-24}$. \newline 316 | (A value in the interval 0 $<$ x $\leq$ 1 can be obtained as 1 - x.) 
\\ \hline 317 | \bfseries Efficiency & medium \\ \hline 318 | \end{tabular} 319 | \begin{lstlisting}[frame=none] 320 | // Example: 321 | // Generate a random float below 100: 322 | float x = ran.random1f() * 100.f; 323 | \end{lstlisting} 324 | \vspacesmall 325 | 326 | \begin{tabular}{|p{30mm}|p{100mm}|} 327 | \hline 328 | \bfseries Member function & Vec4f random4f() \newline 329 | Vec8f random8f() \newline 330 | Vec16f random16f() \\ \hline 331 | \bfseries Description & Returns a vector of random floating point numbers with uniform distribution in the interval 0 $\leq$ x $<$ 1. The resolution is $2^{-24}$. \\ \hline 332 | \bfseries Efficiency & medium \\ \hline 333 | \end{tabular} 334 | \begin{lstlisting}[frame=none] 335 | // Example: 336 | // Generate four random float numbers below 100: 337 | Vec4f v = ran.random4f() * 100.f; 338 | \end{lstlisting} 339 | \vspacesmall 340 | 341 | \begin{tabular}{|p{30mm}|p{100mm}|} 342 | \hline 343 | \bfseries Member function & double random1d() \\ \hline 344 | \bfseries Description & Returns a random double precision number with uniform distribution in the interval 0 $\leq$ x $<$ 1. The resolution is $2^{-52}$. \\ \hline 345 | \bfseries Efficiency & medium \\ \hline 346 | \end{tabular} 347 | \begin{lstlisting}[frame=none] 348 | // Example: 349 | // Generate random double precision number below 100: 350 | double x = ran.random1d() * 100.; 351 | \end{lstlisting} 352 | \vspacesmall 353 | 354 | \begin{tabular}{|p{30mm}|p{100mm}|} 355 | \hline 356 | \bfseries Member function & Vec2d random2d() \newline 357 | Vec4d random4d() \newline 358 | Vec8d random8d() \\ \hline 359 | \bfseries Description & Returns a vector of random double precision numbers with uniform distribution in the interval 0 $\leq$ x $<$ 1. The resolution is $2^{-52}$. 
\\ \hline 360 | \bfseries Efficiency & medium \\ \hline 361 | \end{tabular} 362 | \begin{lstlisting}[frame=none] 363 | // Example: 364 | // Generate four random double precision numbers below 100: 365 | Vec4d v = ran.random4d() * 100.; 366 | \end{lstlisting} 367 | \vspacebig 368 | 369 | 370 | \section{Other functions}\label{OtherFunctions} 371 | 372 | \begin{tabular}{|p{30mm}|p{100mm}|} 373 | \hline 374 | \bfseries Function & int physicalSeedType() \\ \hline 375 | \bfseries Description & Finds the best source of non-reproducible randomness on the CPU that the program is running on. Return value: \newline 376 | 0: No physical seed available \newline 377 | 1: CPU clock (consecutive calls are not independent)\newline 378 | 2: RDRAND instruction \newline 379 | 3: RDSEED instruction \\ \hline 380 | \bfseries Source file & physseed.cpp \\ \hline 381 | \bfseries Efficiency & medium \\ \hline 382 | \end{tabular} 383 | \vspacebig 384 | 385 | \begin{tabular}{|p{30mm}|p{100mm}|} 386 | \hline 387 | \bfseries Function & int physicalSeed() \\ \hline 388 | \bfseries Description & Get a non-reproducible random number based on a physical process. This is intended as a seed for the pseudo random number generator. The source of randomness is indicated by physicalSeedType(); \\ \hline 389 | \bfseries Source file & physseed.cpp \\ \hline 390 | \bfseries Efficiency & medium \\ \hline 391 | \end{tabular} 392 | \begin{lstlisting}[frame=none] 393 | // Example: Generate a random seed 394 | int seed = physicalSeed(); 395 | // Make an instance of the pseudo random number generator 396 | Ranvec1 ran(2); 397 | // Initialize it with the random seed 398 | ran.init(seed); 399 | // Generate a vector of 16 random float numbers 400 | Vec16f rf = ran.random16f(); 401 | // This code will generate a different random sequence each 402 | // time it runs. 
403 | \end{lstlisting} 404 | \vspacebig 405 | 406 | 407 | \section{Generating seeds}\label{GeneratingSeeds} 408 | 409 | Ranvec1 is called a pseudo random number generator because it is deterministic. You can repeat the same sequence of random numbers if you run it again with the same seed. 410 | You need to initialize Ranvec1 with a random seed if you want a sequence of random numbers that is not predictable or deterministic. 411 | \vspacesmall 412 | 413 | The \codei{physicalSeed()} function will produce such a random seed. 414 | Newer CPUs have a built-in physical source of randomness based on thermal noise. This is implemented in the RDRAND or RDSEED instruction. The RDSEED instruction is stronger than RDRAND if you want to call it multiple times to get a longer seed. The \codei{physicalSeed()} function will use the best source of randomness available on the CPU it is running on. 415 | \vspacesmall 416 | 417 | If the program is running on an older CPU without the RDRAND or RDSEED instruction, then you can use the internal CPU clock as a source of randomness. The frequency of this internal clock is typically higher than 1 GHz. The source of randomness here is the exact time at which the function is called. 418 | \vspacesmall 419 | 420 | Note that if you are calling \codei{physicalSeed()} twice on an older computer where the CPU clock is the only source of randomness, then the second call will not be independent of the first one. It will give a value that is perhaps a few hundred clock counts higher than the first one. To get an independent second value you need to wait for some external event before the second call. This external event can be a keystroke, a mouse move, or a network event. If the function \codei{physicalSeedType()} returns 1 then you need to wait for an external event before every call to \codei{physicalSeed()} except the first one. For example, you may ask the user to press a key. 
421 | \vspacesmall 422 | 423 | 424 | \section{Cryptographic applications}\label{CryptographicApplications} 425 | It is theoretically possible to predict and reproduce the sequence generated by a single pseudo random number generator if you have access to a subsequence longer than the internal state buffer. This is not possible if two random number generators with long cycle lengths are combined. Therefore, you should always use the combined generator (type 3) for cryptographic applications. 426 | \vspacesmall 427 | 428 | You should use a seed longer than 32 bits to get a good unpredictable result. Use the \codei{initByArray} function with an array of multiple seeds. Use two or more array elements generated by the \codei{physicalSeed()} function and supply with other elements from other sources. These other elements do not need to be truly random; they may include date and time, a hash of the user name or password, or any other data. The resulting random number sequence depends on all the elements in the seeds array. The resulting sequence will be unpredictable as long as at least one element of the seeds array is truly unpredictable. Combining seeds from multiple sources makes it more difficult for an attacker to break the security. 429 | \vspacesmall 430 | 431 | 432 | \section{Game applications}\label{GameApplications} 433 | The source of randomness does not need to be highly secure for entertainment games. A single seed from the \codei{physicalSeed()} function will provide sufficient randomness. 434 | \vspacesmall 435 | 436 | 437 | \section{Gambling applications}\label{GamblingApplications} 438 | Gambling is a morally dubious exploitation of well-known weaknesses in the human psyche for financial gain, in my opinion. I do not endorse the use of this software in gambling applications. 
439 | \vspacesmall 440 | 441 | 442 | \section{Monte Carlo simulation}\label{MonteCarloSimulation} 443 | Monte Carlo simulation and Monte Carlo integration are computational techniques that require a very long sequence of random numbers. The Ranvec1 generator was designed to be well suited for this purpose. 444 | \vspacesmall 445 | 446 | You do not need truly unpredictable randomness for Monte Carlo applications. On the contrary, it is an advantage to have a deterministic sequence so that it is possible to re-play a particular simulation in case of an interesting event that you want to analyze further. It is quite convenient to use consecutive seeds such as 1, 2, 3, ... for a series of simulation runs. 447 | \vspacesmall 448 | 449 | 450 | \section{Multi-threaded applications}\label{MultiThreadedApplications} 451 | The Ranvec1 generator is designed to be suitable for large multi-threaded applications. 452 | You can take advantage of the multiple CPU cores in modern computers by running multiple threads simultaneously in time-consuming applications. The number of threads should not be more than the number of CPU cores. Some microprocessors are able to run two or more threads in each core. In this case, the number of logical processors is higher than the number of physical processors. 453 | Two threads running in the same core are likely to be competing for the same resources, so it may not be efficient to run more threads than CPU cores in this case. 454 | \vspacesmall 455 | 456 | It is not safe to access a pseudo random number generator from multiple threads simultaneously. Instead, you need to make one instance of Ranvec1 for each thread. Each instance should have a different seed. It is recommended to use the combined generator (type 3) with two seeds. The second seed, or both seeds, should be different for each thread. The theoretical reasons for this are explained in the theory article cited on page \pageref{Fog2015TheoryArticle}. 
457 | \vspacesmall 458 | 459 | Example \ref{exampleMultipleThreads} shows how to generate random numbers in multiple threads. Note that there will be one instance of the random number generator object \codei{Ranvec1} in each thread because it is declared inside the thread function. 460 | 461 | \vspacesmall 462 | 463 | 464 | \begin{example} 465 | \label{exampleMultipleThreads} 466 | \end{example} % frame disappears if I put this after end lstlisting 467 | \begin{lstlisting}[frame=single] 468 | // Example of random number generation with multiple threads 469 | // random_threads.cpp 470 | 471 | // Example of command line options for g++ and clang: 472 | // g++ -O2 -std=c++17 -mavx2 -mfma -pthread random_threads.cpp 473 | // clang++ -O2 -std=c++17 -mavx2 -mfma -pthread random_threads.cpp 474 | 475 | // for Visual Studio only: define desired instruction set: 476 | // #define INSTRSET 8 477 | 478 | #include <stdio.h> 479 | #include <thread> 480 | 481 | #include "ranvec1.h" // random number generator 482 | #include "ranvec1.cpp" // put code in separate module or include 483 | 484 | // Thread function. Will run one instance for each thread 485 | // This function calculates the mean of 1000 random numbers 486 | void thread_function(int threadnum, int seed, double * result) { 487 | 488 | // Make an instance of the random number generator 489 | // (this instance is local to each thread) 490 | Ranvec1 ran(3); 491 | 492 | // Initialize. 
Use the thread number as a second seed to get 493 | // different results in each thread 494 | ran.init(seed, threadnum); 495 | 496 | // Accumulator for eight sums 497 | Vec8d accum = 0.; 498 | 499 | // Generate 1000 random double precision numbers 500 | for (int i = 0; i < 125; i++) { 501 | // Vector of eight double precision random numbers 502 | accum += ran.random8d(); 503 | } 504 | // Calculate sum and mean 505 | double sum = horizontal_add(accum); 506 | double mean = sum * 0.001; 507 | 508 | // Return result 509 | *result = mean; 510 | } 511 | 512 | int main() { 513 | 514 | // Number of threads 515 | const int number_of_threads = 4; 516 | 517 | // Array of thread objects 518 | std::thread threads[number_of_threads]; 519 | 520 | // Array of results 521 | double results[number_of_threads]; 522 | 523 | // Arbitrary seed 524 | int seed = 25; 525 | 526 | // Start threads 527 | for (int t = 0; t < number_of_threads; t++) { 528 | threads[t] = 529 | std::thread(thread_function, t, seed, &results[t]); 530 | } 531 | 532 | // Wait for threads to finish 533 | for (int t = 0; t < number_of_threads; t++) { 534 | threads[t].join(); 535 | } 536 | 537 | // write results 538 | for (int i = 0; i < number_of_threads; i++) { 539 | printf("%.6f ", results[i]); 540 | } 541 | 542 | return 0; 543 | } 544 | \end{lstlisting} 545 | \vspacesmall 546 | 547 | 548 | 549 | \end{document} -------------------------------------------------------------------------------- /vector3d/testbench_vector3d.cpp: -------------------------------------------------------------------------------- 1 | /************************* testbench_vector3d.cpp ************************** 2 | * Author: Agner Fog 3 | * Date created: 2019-07-14 4 | * Last modified: 2022-07-20 5 | * Version: 2.02.00 6 | * Project: Testbench for vector3d.h using vector class library 7 | * Description: 8 | * Compile and run this program to test operators and functions in vector3d.h package 9 | * This file contains test cases for general 
operators and functions. 10 | * Each function or operator is tested with many different combinations of input data. 11 | * 12 | * Instructions: 13 | * The following parameters must be defined on the command line or added in the 14 | * top of this file: 15 | * 16 | * vtype: Vector type to test 17 | * testcase: A number defining a function or operator to test. See the cases in this file. 18 | * seed: Seed for random number generator. May be any integer 19 | * 20 | * Compile with any compiler supported by VCL. 21 | * Specify the desired instruction set and optimization options as parameters 22 | * to the compiler. 23 | * 24 | * (c) Copyright 2019-2022 Agner Fog. 25 | * Apache license 2.0 26 | ****************************************************************************** 27 | 28 | Test cases: 29 | 1: operator + 30 | 2: operator - 31 | 3: operator * 32 | 4: operator / 33 | 5: unary - 34 | 8: vector * real 35 | 9: vector / real 36 | 11: cross_product 37 | 12: dot_product 38 | 13: vector_length 39 | 14: normalize_vector 40 | 15: rotate 41 | 16: to_float 42 | 17: to_double 43 | 20: constructor from three coordinates 44 | 21: get_x 45 | 22: get_y 46 | 23: get_z 47 | 24: extract 48 | 49 | *****************************************************************************/ 50 | 51 | #include 52 | #include 53 | #if defined (__linux__) && !defined(__LP64__) 54 | #include // set floating point control word 55 | #endif 56 | 57 | #define MAX_VECTOR_SIZE 512 58 | 59 | #ifndef INSTRSET 60 | #define INSTRSET 10 61 | #endif 62 | #include 63 | #include "../special/vector3d/vector3d.h" // 3-D vectors 64 | 65 | 66 | #ifndef testcase 67 | // --------------------------------------------------------------------------- 68 | // Specify input parameters here if running from an IDE 69 | // ---------------------------------------------------------------------------- 70 | 71 | #define testcase 1 72 | 73 | #define vtype Vec3Dd 74 | 75 | #define seed 1 76 | 77 | 78 | #endif // testcase 79 | 80 | 81 | // 
---------------------------------------------------------------------------- 82 | // Declarations 83 | // ---------------------------------------------------------------------------- 84 | 85 | // dummy vectors used for getting vector types and element type 86 | vtype dummyc; // complex vector type 87 | typedef decltype(dummyc.to_vector()) wtype; // corresponding normal vector type 88 | wtype dummyv; 89 | typedef decltype(dummyv[0]) ST; // scalar type 90 | ST a0, a1; // scalar operands 91 | const int maxvectorsize = 16; // max number of elements in a vector 92 | //ST oplist[maxvectorsize]; // operand vector 93 | int jj0; // copy of vector index 94 | 95 | 96 | /************************************************************************ 97 | * 98 | * Test cases 99 | * 100 | ************************************************************************/ 101 | 102 | #if testcase == 1 // + 103 | inline vtype testFunction(vtype const& a, vtype const& b) { 104 | return a + b; 105 | } 106 | vtype referenceFunction(vtype a, vtype b) { 107 | ST aa[4], bb[4], cc[4]; 108 | a.store(aa); b.store(bb); 109 | for (int i=0; i<4; i++) cc[i] = aa[i] + bb[i]; 110 | return vtype().load(cc); 111 | } 112 | 113 | #elif testcase == 2 // - 114 | inline vtype testFunction(vtype const& a, vtype const& b) { 115 | return a - b; 116 | } 117 | vtype referenceFunction(vtype a, vtype b) { 118 | ST aa[4], bb[4], cc[4]; 119 | a.store(aa); b.store(bb); 120 | for (int i=0; i<4; i++) cc[i] = aa[i] - bb[i]; 121 | return vtype().load(cc); 122 | } 123 | 124 | #elif testcase == 3 // * 125 | inline vtype testFunction(vtype const& a, vtype const& b) { 126 | return a * b; 127 | } 128 | vtype referenceFunction(vtype a, vtype b) { 129 | ST aa[4], bb[4], cc[4]; 130 | a.store(aa); b.store(bb); 131 | for (int i=0; i<4; i++) cc[i] = aa[i] * bb[i]; 132 | return vtype().load(cc); 133 | } 134 | 135 | #elif testcase == 4 // / 136 | inline vtype testFunction(vtype const& a, vtype const& b) { 137 | return a / b; 138 | } 139 | vtype 
referenceFunction(vtype a, vtype b) { 140 | ST aa[4], bb[4], cc[4]; 141 | a.store(aa); b.store(bb); 142 | for (int i=0; i<4; i++) cc[i] = aa[i] / bb[i]; 143 | return vtype().load(cc); 144 | } 145 | 146 | #elif testcase == 5 // unary - 147 | 148 | inline vtype testFunction(vtype const& a, vtype const& b) { 149 | return -a; 150 | } 151 | vtype referenceFunction(vtype a, vtype b) { 152 | ST aa[4], bb[4], cc[4] = {0}; 153 | a.store(aa); b.store(bb); 154 | for (int i=0; i<3; i++) cc[i] = -aa[i]; 155 | return vtype().load(cc); 156 | } 157 | 158 | #elif testcase == 6 // vector - real 159 | inline vtype testFunction(vtype const& a, vtype const& b) { 160 | ST b0 = b.to_vector()[0]; 161 | return a - b0; 162 | } 163 | vtype referenceFunction(vtype a, vtype b) { 164 | ST aa[4], bb[4], cc[4] = {0}; 165 | a.store(aa); b.store(bb); 166 | for (int i=0; i<3; i++) cc[i] = aa[i] - bb[0]; 167 | return vtype().load(cc); 168 | } 169 | 170 | #elif testcase == 7 // real - vector 171 | inline vtype testFunction(vtype const& a, vtype const& b) { 172 | a0 = a.to_vector()[0]; 173 | return a0 - b; 174 | } 175 | vtype referenceFunction(vtype a, vtype b) { 176 | ST aa[4], bb[4], cc[4] = {0}; 177 | a.store(aa); b.store(bb); 178 | for (int i=0; i<4; i++) cc[i] = aa[0] - bb[i]; 179 | return vtype().load(cc); 180 | } 181 | 182 | #elif testcase == 8 // vector * real 183 | inline vtype testFunction(vtype const& a, vtype const& b) { 184 | ST b0 = b.to_vector()[0]; 185 | return a * b0; 186 | } 187 | vtype referenceFunction(vtype a, vtype b) { 188 | ST aa[4], bb[4], cc[4] = {0}; 189 | a.store(aa); b.store(bb); 190 | for (int i=0; i<4; i++) cc[i] = aa[i] * bb[0]; 191 | return vtype().load(cc); 192 | } 193 | 194 | #elif testcase == 9 // vector / real 195 | inline vtype testFunction(vtype const& a, vtype const& b) { 196 | ST b0 = b.to_vector()[0]; 197 | return a / b0; 198 | } 199 | vtype referenceFunction(vtype a, vtype b) { 200 | ST aa[4], bb[4], cc[4] = {0}; 201 | a.store(aa); b.store(bb); 202 | for (int 
i=0; i<3; i++) cc[i] = aa[i] / bb[0]; 203 | return vtype().load(cc); 204 | } 205 | 206 | #elif testcase == 10 // real / vector 207 | inline vtype testFunction(vtype const& a, vtype const& b) { 208 | a0 = a.to_vector()[0]; 209 | return a0 / b; 210 | } 211 | vtype referenceFunction(vtype a, vtype b) { 212 | ST aa[4], bb[4], cc[4] = {0}; 213 | a.store(aa); b.store(bb); 214 | for (int i=0; i<3; i++) cc[i] = aa[0] / bb[i]; 215 | return vtype().load(cc); 216 | } 217 | 218 | #elif testcase == 11 // cross_product 219 | inline vtype testFunction(vtype const& a, vtype const& b) { 220 | return cross_product(a, b); 221 | } 222 | vtype referenceFunction(vtype a, vtype b) { 223 | ST aa[4], bb[4], cc[4]; 224 | a.store(aa); b.store(bb); 225 | cc[0] = aa[1]*bb[2] - aa[2]*bb[1]; 226 | cc[1] = aa[2]*bb[0] - aa[0]*bb[2]; 227 | cc[2] = aa[0]*bb[1] - aa[1]*bb[0]; 228 | cc[3] = 0; 229 | return vtype().load(cc); 230 | } 231 | #define FACCURACY 100000 // possible loss of precision 232 | 233 | 234 | #elif testcase == 12 // dot_product 235 | inline vtype testFunction(vtype const& a, vtype const& b) { 236 | return vtype(dot_product(a, b), 0, 0); 237 | } 238 | vtype referenceFunction(vtype a, vtype b) { 239 | ST aa[4], bb[4], cc[4] = {0}; 240 | a.store(aa); b.store(bb); 241 | ST sum = 0; 242 | for (int i=0; i<3; i++) sum += aa[i] * bb[i]; 243 | cc[0] = sum; 244 | return vtype().load(cc); 245 | } 246 | #define FACCURACY 4 // possible loss of precision 247 | 248 | #elif testcase == 13 // vector_length 249 | inline vtype testFunction(vtype const& a, vtype const& b) { 250 | return vtype(vector_length(a), 0, 0); 251 | } 252 | vtype referenceFunction(vtype a, vtype b) { 253 | ST aa[4], cc[4] = {0}; 254 | a.store(aa); 255 | ST ssum = 0; 256 | for (int i=0; i<3; i++) ssum += aa[i] * aa[i]; 257 | cc[0] = std::sqrt(ssum); 258 | return vtype().load(cc); 259 | } 260 | #define FACCURACY 4 // possible loss of precision 261 | 262 | #elif testcase == 14 // normalize_vector 263 | inline vtype 
testFunction(vtype const& a, vtype const& b) { 264 | return normalize_vector(a); 265 | } 266 | vtype referenceFunction(vtype a, vtype b) { 267 | ST aa[4], cc[4] = {0}; 268 | a.store(aa); 269 | ST ssum = 0; 270 | for (int i=0; i<3; i++) ssum += aa[i] * aa[i]; 271 | ssum = std::sqrt(ssum); 272 | for (int i=0; i<3; i++) cc[i] = aa[i] / ssum; 273 | return vtype().load(cc); 274 | } 275 | #define FACCURACY 4 // possible loss of precision 276 | 277 | #elif testcase == 15 // rotate 278 | inline vtype testFunction(vtype const& a, vtype const& b) { 279 | return rotate(b, a-vtype(1,1,1), b+vtype(2,2,2), a); 280 | } 281 | 282 | vtype referenceFunction(vtype a, vtype b) { 283 | ST aa[4], cc[4]; 284 | a.store(aa); 285 | ST R[4][4] = {{0}}; // rotation matrix 286 | vtype c0 = b, c1 = a-vtype(1,1,1), c2 = b+vtype(2,2,2); // columns 287 | R[0][0] = c0[0]; R[1][0] = c0[1]; R[2][0] = c0[2]; 288 | R[0][1] = c1[0]; R[1][1] = c1[1]; R[2][1] = c1[2]; 289 | R[0][2] = c2[0]; R[1][2] = c2[1]; R[2][2] = c2[2]; 290 | for (int i=0; i<3; i++) { // multiply matrix by column vector a 291 | cc[i] = 0; 292 | for (int j=0; j<3; j++) { 293 | cc[i] += R[i][j] * aa[j]; 294 | } 295 | } 296 | cc[3] = 0; 297 | return vtype().load(cc); 298 | } 299 | #define FACCURACY 1000 // possible loss of precision 300 | 301 | #elif testcase == 16 // to_float 302 | inline vtype testFunction(vtype const& a, vtype const& b) { 303 | auto c = to_float(a); 304 | return vtype(c[0], c[1], c[2]); 305 | } 306 | 307 | vtype referenceFunction(vtype a, vtype b) { 308 | return a; 309 | } 310 | #define FACCURACY 1.E9 // loss of precision when converting to single precision 311 | 312 | 313 | #elif testcase == 17 // to_double 314 | inline vtype testFunction(vtype const& a, vtype const& b) { 315 | auto c = to_double(a); 316 | return vtype((float)c[0], (float)c[1], (float)c[2]); 317 | } 318 | 319 | vtype referenceFunction(vtype a, vtype b) { 320 | return a; 321 | } 322 | 323 | #elif testcase == 20 // constructor from three coordinates 
324 | inline vtype testFunction(vtype const& a, vtype const& b) { 325 | ST aa[4]; 326 | a.store(aa); 327 | return vtype(aa[0], aa[1], aa[2]); 328 | } 329 | vtype referenceFunction(vtype a, vtype b) { 330 | return a; 331 | } 332 | 333 | #elif testcase == 21 // get_x 334 | inline vtype testFunction(vtype const& a, vtype const& b) { 335 | ST c = a.get_x(); 336 | return vtype(c, 0, 0); 337 | } 338 | vtype referenceFunction(vtype a, vtype b) { 339 | ST aa[4]; 340 | a.store(aa); 341 | return vtype(aa[0], 0, 0); 342 | } 343 | 344 | #elif testcase == 22 // get_y 345 | inline vtype testFunction(vtype const& a, vtype const& b) { 346 | ST c = a.get_y(); 347 | return vtype(c, 0, 0); 348 | } 349 | vtype referenceFunction(vtype a, vtype b) { 350 | ST aa[4]; 351 | a.store(aa); 352 | return vtype(aa[1], 0, 0); 353 | } 354 | 355 | #elif testcase == 23 // get_z 356 | inline vtype testFunction(vtype const& a, vtype const& b) { 357 | ST c = a.get_z(); 358 | return vtype(c, 0, 0); 359 | } 360 | vtype referenceFunction(vtype a, vtype b) { 361 | ST aa[4]; 362 | a.store(aa); 363 | return vtype(aa[2], 0, 0); 364 | } 365 | 366 | #elif testcase == 24 // extract 367 | inline vtype testFunction(vtype const& a, vtype const& b) { 368 | uint32_t bb = uint32_t(b.get_x()) % 3; 369 | //ST c = a.extract(bb % 3); 370 | ST c = a[bb]; 371 | return vtype(c, 0, 0); 372 | } 373 | vtype referenceFunction(vtype a, vtype b) { 374 | uint32_t bb = uint32_t(b.get_x()); 375 | ST aa[4]; 376 | a.store(aa); 377 | ST c = aa[bb % 3]; 378 | return vtype(c, 0, 0); 379 | } 380 | 381 | #elif testcase == 25 // insert 382 | inline vtype testFunction(vtype const& a, vtype const& b) { 383 | uint32_t bb = uint32_t(b.get_x()) % 3; 384 | vtype aa = a; 385 | return aa.insert(bb, 9.5f); 386 | } 387 | vtype referenceFunction(vtype a, vtype b) { 388 | uint32_t bb = uint32_t(b.get_x()) % 3; 389 | ST aa[4]; 390 | a.store(aa); 391 | aa[bb] = 9.5f; 392 | return vtype().load(aa); 393 | } 394 | 395 | 396 | 397 | #else 398 | // End of test 
cases 399 | #error unknown test case 400 | #endif 401 | 402 | 403 | // ---------------------------------------------------------------------------- 404 | // Overhead functions 405 | // ---------------------------------------------------------------------------- 406 | 407 | const int maxerrors = 10; // maximum errors to report 408 | int numerr = 0; // count errors 409 | 410 | // type-specific load function 411 | template 412 | inline void loadData(T & x, E const* p) { 413 | x.load(p); 414 | } 415 | 416 | template 417 | inline void loadData(T & x, bool const* p) { 418 | for (int i = 0; i < x.size(); i++) { 419 | x.insert(i, p[i]); // bool vectors have no load function 420 | } 421 | } 422 | 423 | 424 | // type-specific printing functions 425 | 426 | void printVal(float x) { 427 | printf("%10.7G", x); 428 | } 429 | 430 | void printVal(double x) { 431 | printf("%10.7G", x); 432 | } 433 | 434 | void printVal(bool x) { 435 | printf("%i", (int)x); 436 | } 437 | 438 | // Random number generator 439 | class ranGen { 440 | // parameters for multiply-with-carry generator 441 | uint64_t x, carry; 442 | public: 443 | ranGen(int Seed) { // constructor 444 | x = Seed; carry = 1765; //initialize with seed 445 | next(); next(); 446 | } 447 | uint32_t next() { // get next random number, using multiply-with-carry method 448 | const uint32_t fac = 3947008974u; 449 | x = x * fac + carry; 450 | carry = x >> 32; 451 | x = uint32_t(x); 452 | return uint32_t(x); 453 | } 454 | }; 455 | 456 | template // get random number of type T 457 | T get_random(ranGen & rangen) { 458 | return (T)rangen.next(); 459 | } 460 | 461 | template <> // special case uint64_t 462 | uint64_t get_random(ranGen & rangen) { 463 | uint64_t xx; 464 | xx = (uint64_t)rangen.next() << 32; 465 | xx |= rangen.next(); 466 | return xx; 467 | } 468 | 469 | template <> // special case int64_t 470 | int64_t get_random(ranGen & rangen) { 471 | return (int64_t)get_random(rangen); 472 | } 473 | 474 | template <> // special case 
float 475 | float get_random(ranGen & rangen) { 476 | union Uif { 477 | uint32_t i; 478 | float f; 479 | }; 480 | Uif u1, u2; 481 | uint32_t r = rangen.next(); // get 32 random bits 482 | // Insert exponent and random mantissa to get random number in the interval 1 <= x < 2 483 | // Subtract 1.0 if next bit is 0, or 1.0 - 2^-24 = 0.99999994f if next bit is 1 484 | u1.i = 0x3F800000 - ((r >> 8) & 1); // bit 8 485 | u2.i = (r >> 9) | 0x3F800000; // bit 9 - 31 486 | return u2.f - u1.f; 487 | } 488 | 489 | template <> // special case float 490 | double get_random(ranGen & rangen) { 491 | union Uqd { 492 | uint64_t q; 493 | double d; 494 | }; 495 | Uqd u1; 496 | uint64_t r = get_random(rangen); // get 64 random bits 497 | // Insert exponent and random mantissa to get random number in the interval 1 <= x < 2, 498 | // then subtract 1.0 to get the interval 0 <= x < 1. 499 | u1.q = (r >> 12) | 0x3FF0000000000000; // bit 12 - 63 500 | return u1.d - 1.0; 501 | } 502 | template <> // special case bool 503 | bool get_random(ranGen & rangen) { 504 | return (rangen.next() & 1) != 0; 505 | } 506 | 507 | 508 | // make random number generator instance 509 | ranGen ran(seed); 510 | 511 | // bit_cast function to make special values 512 | float bit_castf(uint32_t x) { // uint64_t -> double 513 | union { 514 | uint32_t i; 515 | float f; 516 | } u; 517 | u.i = x; 518 | return u.f; 519 | } 520 | 521 | double bit_castd(uint64_t x) { // uint32_t -> float 522 | union { 523 | uint64_t i; 524 | double f; 525 | } u; 526 | u.i = x; 527 | return u.f; 528 | } 529 | 530 | 531 | // template to generate list of testdata 532 | template 533 | class TestData { 534 | public: 535 | enum LS { 536 | // define array size. 
Must be a multiple of vector size: 537 | listsize = 1024 538 | }; 539 | TestData() { // constructor 540 | int i; // loop counter 541 | if (T(1.1f) != 1) { 542 | // floating point type 543 | // fill boundary data into array 544 | for (i = 0; i < 20; i++) { 545 | list[i] = T((i - 4) * T(0.25)); 546 | } 547 | #ifdef TESTNAN // test also with NAN, INF, and other special data 548 | // additional special values, float: 549 | if constexpr (sizeof(ST) == 4) { 550 | list[i++] = (T)bit_castf(0x80000000); // -0 551 | list[i++] = (T)bit_castf(0x00800000); // smallest positive normal number 552 | list[i++] = (T)bit_castf(0x80800000); // largest negative normal number 553 | list[i++] = (T)bit_castf(0x3F7FFFFF); // nextafter 1.0, 0 554 | list[i++] = (T)bit_castf(0x3F800001); // nextafter 1.0, 2 555 | list[i++] = (T)bit_castf(0x7F800000); // inf 556 | list[i++] = (T)bit_castf(0xFF800000); // -inf 557 | list[i++] = (T)bit_castf(0x7FF00000); // nan 558 | } 559 | else { // double 560 | list[i++] = (T)bit_castd(0x8000000000000000); // -0 561 | list[i++] = (T)bit_castd(0x0010000000000000); // smallest positive normal number 562 | list[i++] = (T)bit_castd(0x8010000000000000); // largest negative normal number 563 | list[i++] = (T)bit_castd(0x3FEFFFFFFFFFFFFF); // nextafter 1.0, 0 564 | list[i++] = (T)bit_castd(0x3FF0000000000001); // nextafter 1.0, 2 565 | list[i++] = (T)bit_castd(0x7FF0000000000000); // inf 566 | list[i++] = (T)bit_castd(0xFFF0000000000000); // -inf 567 | list[i++] = (T)bit_castd(0x7FFC000000000000); // nan 568 | } 569 | #endif 570 | // fill random data into rest of array 571 | for (; i < listsize; i++) { 572 | list[i] = get_random(ran) * (T)100; 573 | } 574 | } 575 | else { 576 | // integer type 577 | // fill boundary data into array 578 | for (i = 0; i < 6; i++) { 579 | list[i] = T(i - 2); 580 | } 581 | // data near mid-point of unsigned integers, or overflow point of signed integers: 582 | uint64_t m = (uint64_t(1) << (sizeof(T) * 8 - 1)) - 2; 583 | for (; i < 11; 
i++) { 584 | list[i] = T(m++); 585 | } 586 | // fill random data into rest of array 587 | for (; i < listsize; i++) { 588 | list[i] = get_random(ran); 589 | } 590 | } 591 | } 592 | T list[listsize]; // array of test data 593 | int size() { // get list size 594 | return listsize; 595 | } 596 | }; 597 | 598 | 599 | // get value of least significant bit 600 | float delta_unit(float x) { 601 | union { 602 | float f; 603 | uint32_t i; 604 | } u; 605 | x = fabsf(x); 606 | Vec4f xv = Vec4f(x); 607 | if (!(is_finite(xv)[0])) return 1.f; 608 | if (x == 0.f || is_subnormal(xv)[0]) { 609 | u.i = 0x00800000; // smallest positive normal number 610 | return u.f; 611 | } 612 | float x1 = x; 613 | u.f = x; 614 | u.i++; 615 | return u.f - x1; 616 | } 617 | 618 | double delta_unit(double x) { 619 | union { 620 | double f; 621 | uint64_t i; 622 | } u; 623 | x = fabs(x); 624 | Vec2d xv = Vec2d(x); 625 | if (!(is_finite(xv)[0])) return 1.; 626 | if (x == 0. || is_subnormal(xv)[0]) { 627 | u.i = 0x0010000000000000; // smallest positive normal number 628 | return u.f; 629 | } 630 | double x1 = x; 631 | u.f = x; 632 | u.i++; 633 | return u.f - x1; 634 | } 635 | 636 | 637 | // compare two scalars. 
// Compare two scalars.
// Returns true if a and b match, false if they are different.
// General case: exact comparison.
template <typename T>
inline bool compare_scalars(T const a, T const b) {
    return a == b;
}

// Special cases for float and double:
// Two values match if they are equal or if both are NAN. If FACCURACY is
// defined at this point then a difference of up to FACCURACY units in the
// last place of a is also accepted; a bigger difference is printed in ULP.
template <>
inline bool compare_scalars<float>(float const a, float const b) {
    if (a == b || (a != a && b != b)) return true;   // equal, or both are NAN
#ifdef FACCURACY     // accept minor difference
    float dif = fabsf(a - b) / delta_unit(a);
    if (dif <= FACCURACY) return true;
    printf("\n%.0f ULP ", dif);
#endif
    return false;
}

template <>
inline bool compare_scalars<double>(double const a, double const b) {
    if (a == b || (a != a && b != b)) return true;   // equal, or both are NAN
#ifdef FACCURACY     // accept minor difference
    double dif = fabs(a - b) / delta_unit(a);
    if (dif <= FACCURACY) return true;
    printf("\n%.0f ULP ", dif);
#endif
    return false;
}

// Compare two vectors element by element.
// Returns true if all elements match, false at the first mismatch.
template <typename T>
inline bool compare_vectors(T const& a, T const& b) {
    for (int i = 0; i < a.size(); i++) {
        if (!compare_scalars(a[i], b[i])) return false;
    }
    return true;
}

// Default tolerance in ULP for test cases that did not define their own
#ifndef FACCURACY
#define FACCURACY 1
#endif

// compare two vectors. 
return true if different 682 | inline ST compare_vect3(vtype const& a, vtype const& b) { 683 | ST alist[4], blist[4]; 684 | a.store(alist); b.store(blist); 685 | ST dif, dif0 = 0; 686 | for (int i = 0; i < 3; i++) { 687 | ST r = fabs(blist[i]); 688 | if (r < 1.E-2) r = 1; // use relative error for results near zero 689 | dif = ST(fabs(alist[i] - blist[i]) / delta_unit(r)); 690 | if (dif > dif0) dif0 = dif; 691 | } 692 | if (dif0 > FACCURACY) return dif0; 693 | return 0; 694 | } 695 | 696 | // program entry 697 | int main() { 698 | //const int vectorsize = vtype::size(); 699 | 700 | #if defined (__linux__) && !defined(__LP64__) 701 | // Some 32-bit compilers use x87 calculations with long double precision for 702 | // the reference function. This may give slightly different results because 703 | // the value is rounded twice. To get exactly the same value in the test function 704 | // and the reference function, we change the precision of x87 calculations. 705 | // (the fpu control function is different in Windows, but the precision is already 706 | // reduced in Windows anyway) 707 | fpu_control_t fpcw = 0x27f; 708 | _FPU_SETCW(fpcw); 709 | #endif 710 | 711 | vtype a, b, result, ref; // complex vectors for operands and result 712 | 713 | // make lists of test data 714 | TestData adata, bdata; 715 | 716 | int i, j, k = 0; // loop counters 717 | 718 | for (i = 0; i < adata.size(); i += wtype::size()) { 719 | //a.load(adata.list + i); 720 | loadData(a, adata.list + i); 721 | 722 | for (j = 0; j < bdata.size(); j += wtype::size()) { 723 | loadData(b, bdata.list + j); 724 | jj0 = j; 725 | 726 | // function under test: 727 | result = testFunction(a, b); 728 | ref = referenceFunction(a, b); 729 | ST dif = compare_vect3(result, ref); 730 | if (dif != 0) { 731 | // values are different. 
report error 732 | if (++numerr == 1) { 733 | printf("\ntest case %i:", testcase); // print test case first time 734 | } 735 | ST alist[4], blist[4], tlist[4], rlist[4]; 736 | a.store(alist); b.store(blist); result.store(tlist); ref.store(rlist); 737 | printf("\nError at %i, %i, dif = %.2G:", i, j, dif); 738 | for (k = 0; k < 4; k++) { 739 | printf("\n%7.4G op %7.4G -> %7.4G, expected %7.4G)", 740 | alist[k], blist[k], tlist[k], rlist[k]); 741 | } 742 | } 743 | if (numerr > maxerrors) { 744 | exit(1); // stop after maxerrors 745 | } 746 | } 747 | } 748 | 749 | if (numerr == 0) { 750 | printf("\nsuccess\n"); 751 | } 752 | printf("\n"); 753 | 754 | return numerr; 755 | } 756 | --------------------------------------------------------------------------------