├── random
    ├── ranvec1.h
    ├── ranvec1.cpp
    ├── freesoftwarelogo.jpg
    ├── ranvec1_manual.pdf
    ├── readme.md
    ├── test_ranvec.cpp
    ├── physseed.cpp
    └── ranvec1_manual.tex
├── decimal
    ├── decimal_manual.pdf
    ├── freesoftwarelogo.jpg
    ├── readme.md
    ├── decimal.h
    ├── decimal_manual.tex
    └── testbench_decimal.cpp
├── complex
    ├── complexvec_manual.pdf
    ├── freesoftwarelogo.jpg
    ├── readme.md
    └── test_complex.lst
├── vector3d
    ├── freesoftwarelogo.jpg
    ├── vector3d_manual.pdf
    ├── test_vector3d.lst
    ├── readme.md
    ├── vector3d.h
    ├── vector3d_manual.tex
    └── testbench_vector3d.cpp
├── containers
    ├── containers_manual.pdf
    ├── README.md
    ├── general_containers.h
    └── vector_containers.h
├── quaternion
    ├── freesoftwarelogo.jpg
    ├── quaternion_manual.pdf
    ├── test_quaternion.lst
    ├── readme.md
    ├── quaternion_manual.tex
    └── quaternion.h
├── physical_processors
    ├── readme.md
    └── physical_processors.cpp
├── README.md
└── license.txt


/random/ranvec1.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/add-on/HEAD/random/ranvec1.h


--------------------------------------------------------------------------------
/random/ranvec1.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/add-on/HEAD/random/ranvec1.cpp


--------------------------------------------------------------------------------
/decimal/decimal_manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/add-on/HEAD/decimal/decimal_manual.pdf


--------------------------------------------------------------------------------
/random/freesoftwarelogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/add-on/HEAD/random/freesoftwarelogo.jpg


--------------------------------------------------------------------------------
/random/ranvec1_manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/add-on/HEAD/random/ranvec1_manual.pdf


--------------------------------------------------------------------------------
/complex/complexvec_manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/add-on/HEAD/complex/complexvec_manual.pdf


--------------------------------------------------------------------------------
/complex/freesoftwarelogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/add-on/HEAD/complex/freesoftwarelogo.jpg


--------------------------------------------------------------------------------
/decimal/freesoftwarelogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/add-on/HEAD/decimal/freesoftwarelogo.jpg


--------------------------------------------------------------------------------
/vector3d/freesoftwarelogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/add-on/HEAD/vector3d/freesoftwarelogo.jpg


--------------------------------------------------------------------------------
/vector3d/vector3d_manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/add-on/HEAD/vector3d/vector3d_manual.pdf


--------------------------------------------------------------------------------
/containers/containers_manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/add-on/HEAD/containers/containers_manual.pdf


--------------------------------------------------------------------------------
/quaternion/freesoftwarelogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/add-on/HEAD/quaternion/freesoftwarelogo.jpg


--------------------------------------------------------------------------------
/quaternion/quaternion_manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vectorclass/add-on/HEAD/quaternion/quaternion_manual.pdf


--------------------------------------------------------------------------------
/physical_processors/readme.md:
--------------------------------------------------------------------------------
1 | physical_processors.cpp
2 | 
3 | Detect the number of physical and logical processors on an x86 computer
4 | 


--------------------------------------------------------------------------------
/containers/README.md:
--------------------------------------------------------------------------------
1 | # C++ container class templates
2 | 
3 | * Containers for arrays with fixed and variable size for use with VCL vector classes
4 | * Containers for matrixes with fixed size for use with VCL vector classes
5 | * Containers for arrays with fixed and variable size for use with general types independent of VCL
6 | 
7 | See containers_manual.pdf for instructions
8 | 


--------------------------------------------------------------------------------
/vector3d/test_vector3d.lst:
--------------------------------------------------------------------------------
 1 | # Test data for vector3d.h under Vector class library
 2 | # Use with runtest.sh from testbench repository
 3 | 
 4 | $compiler=1
 5 | $mode=64
 6 | $testbench=testbench_vector3d.cpp
 7 | $include=../src2
 8 | $outfile=t.txt
 9 | $seed=1
10 | 
11 | 
12 | # test case, vector type, return type, instruction set
13 | 
14 | # operators and functions
15 | 1 2 3 4 5 8 9 11 12 13 14 15 , Vec3Df Vec3Dd , , 2 6 7 8 9 10
16 | 
17 | # conversion
18 | 16  ,  Vec3Dd , , 4 7 8 9 10
19 | 17  ,  Vec3Df , , 3 7 8 9 10
20 | 
21 | # constructors etc
22 | 20 21 22 23 24  ,  Vec3Df Vec3Dd , , 3 5 7 8 9 10
23 | 


--------------------------------------------------------------------------------
/quaternion/test_quaternion.lst:
--------------------------------------------------------------------------------
 1 | # Test data for quaternion.h under VCL
 2 | # Use with runtest.sh from testbench repository
 3 | 
 4 | $compiler=1
 5 | $mode=64
 6 | $testbench=testbench_quaternion.cpp
 7 | $include=./
 8 | $outfile=q.txt
 9 | $seed=1
10 | 
11 | 
12 | # test case, vector type, return type, instruction set
13 | 
14 | # operators and functions
15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 , Quaternion1f Quaternion1d , , 2 6 7 8 9 10
16 | 
17 | # constructors etc
18 | 20 21 25 26 27 28  , Quaternion1f Quaternion1d , , 3 5 7 8 9 10
19 | 22  , Quaternion1f , , 4 7 8 9 10
20 | 23  , Quaternion1d , , 3 7 8 9 10
21 | 


--------------------------------------------------------------------------------
/decimal/readme.md:
--------------------------------------------------------------------------------
 1 | # Decimal.cpp
 2 | 
 3 | The decimal ASCII extension to the Vector Class Library contains functions for conversion of integer 
 4 | vectors to and from comma-separated lists of numbers as human-readable decimal ASCII strings. 
 5 | This is useful for efficient reading and writing of comma-separated files.
 6 | 
 7 | ## File list:
 8 | 
 9 | * decimal_manual.pdf: Instructions
10 | * decimal.cpp: Contains functions bin2ascii and ascii2bin
11 | * decimal.h: C++ Header file
12 | * testbench_decimal.cpp: Test program
13 | * decimal_manual.tex: Latex source for decimal_manual.pdf
14 | * freesoftwarelogo.jpg: Used by decimal_manual.tex
15 | * readme.md: This file
16 | 
17 | 


--------------------------------------------------------------------------------
/vector3d/readme.md:
--------------------------------------------------------------------------------
 1 | # 3-dimensional vectors
 2 | 
 3 | # Add-on package for Vector Class Library
 4 | 
 5 | 3-dimensional vectors are useful in geometry and physics.
 6 | The file vector3d.h provides vector classes, operators, and functions for 
 7 | calculations with 3-D vectors
 8 | 
 9 | ## File list:
10 | * vector3d.h: C++ header file, defining 3-D classes, operators, and functions
11 | * vector3d_manual.pdf: Instruction manual
12 | * testbench_vector3d.cpp: Test program for testing vector3d.h during development. Not required for applications
13 | * test_vector3d.lst: Test cases for testbench_vector3d.cpp
14 | * vector3d_manual.tex: Source for building vector3d_manual.pdf
15 | * freesoftwarelogo.jpg: Used by vector3d_manual.tex
16 | 


--------------------------------------------------------------------------------
/quaternion/readme.md:
--------------------------------------------------------------------------------
 1 | # Quaternion.h
 2 | 
 3 | # Add-on package for Vector Class Library
 4 | 
 5 | Quaternions, or hypercomplex numbers, are a topic in theoretical algebra and quantum physics.
 6 | 
 7 | The file quaternion.h defines quaternions with single and double precision, including operators + - * / and various functions
 8 | 
 9 | **File list:**
10 | * quaternion.h: C++ header file defining quaternion classes, operators, and functions
11 | * quaternion_manual.pdf: Instruction manual
12 | * testbench_quaternion.cpp: Test program for testing quaternion.h
13 | * test_quaternion.lst: List of test cases for testbench_quaternion.cpp
14 | * quaternion_manual.tex: Source for building quaternion_manual.pdf
15 | * freesoftwarelogo.jpg: Used by quaternion_manual.tex
16 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Add-on packages for vector class library
 2 | 
 3 | These packages are extensions to the C++ vector class library for specific applications:
 4 | 
 5 | * containers. Container class templates for arrays with fixed and dynamic size and matrixes.
 6 | * random. Generates random number vectors of integers or floating point values. Excellent randomness. Suitable for large simulations and multi-threaded applications.
 7 | * decimal. Conversion between integer vectors and comma-separated lists in human-readable form (decimal ASCII). Useful for reading and writing comma-separated files.
 8 | * vector3d. Three-dimensional vector arithmetic, operators, cross product and dot product.
 9 | * complex. Defines complex number vectors. Operators + - * / etc., and functions. Complex exponential function and logarithm.
10 | * quaternion. Hyper-complex numbers
11 | * physical_processors. Detect the number of physical and logical processors on an x86 computer
12 | 


--------------------------------------------------------------------------------
/random/readme.md:
--------------------------------------------------------------------------------
 1 | # Ranvec1
 2 | 
 3 | # Random number generator
 4 | 
 5 | Ranvec1 is an efficient high quality pseudo random number generator designed for large vector applications and multi-threaded applications.
 6 | 
 7 | **Features:**
 8 | * Vector and scalar output
 9 | * Random integers with uniform distribution in an arbitrary interval
10 | * Random floating point numbers with single and double precision
11 | * Suitable for large Monte Carlo simulations
12 | * Suitable for multi-threaded applications
13 | * High security
14 | * High resolution and very long cycle length
15 | * Includes seed generator based on truly random physical process
16 | * Detailed theoretical description available
17 | 
18 | 
19 | **File list:**
20 | * ranvec1_manual.pdf: Instruction manual
21 | * ranvec1.h: C++ header file
22 | * ranvec1.cpp: C++ code for random number generator
23 | * physseed.cpp: Generator of nondeterministic seed through physical process
24 | * test_ranvec.cpp: test program
25 | * ranvec1_manual.tex: Source for ranvec1_manual.pdf
26 | * freesoftwarelogo.jpg: Used by ranvec1_manual.tex
27 | * readme.md: This file
28 | 


--------------------------------------------------------------------------------
/complex/readme.md:
--------------------------------------------------------------------------------
 1 | # complexvec1.h
 2 | 
 3 | # Defines C++ classes for complex numbers and complex number vectors
 4 | 
 5 | **Features:**
 6 | * Defines complex number scalars and vectors
 7 | * Vectors of up to 4 double precision, 8 single precision, or 16 half precision complex numbers
 8 | * Operators + - * / == !=
 9 | * Functions abs, sqrt, etc.
10 | * Complex exponential function and logarithm
11 | 
12 | **File list:**
13 | * complexvec_manual.pdf: Instruction manual
14 | * complexvec1.h: C++ header file defining complex number classes, operators, and functions with single and double precision
15 | * complexvecfp16.h: Additional header file defining half precision complex number vectors
16 | * complexvecfp16e.h: Additional header file emulating half precision complex number vectors
17 | * testbench_complex.cpp: Program used for testing complex number vector classes during development. Not needed for application
18 | * test_complex.lst: List of test cases for testbench_complex.cpp
19 | * complexvec_manual.tex: Source for building complexvec_manual.pdf
20 | * freesoftwarelogo.jpg: Used by complexvec_manual.tex
21 | * readme.md: This file
22 | 


--------------------------------------------------------------------------------
/random/test_ranvec.cpp:
--------------------------------------------------------------------------------
 1 | /*************************  test_ranvec.cpp   *********************************
 2 | * Author:        Agner Fog
 3 | * Date created:  2019-07-08
 4 | * Last modified: 2022-07-16
 5 | * Version:       2.02
 6 | * Project:       add-on package for vector class library
 7 | * Description:
 8 | * Test program for ranvec1.cpp
 9 | *
10 | ******************************************************************************/
11 | 
12 | 
13 | #include <stdio.h>
14 | 
15 | 
16 | #ifndef INSTRSET
17 | #define INSTRSET 10                    // instruction set
18 | #endif
19 | 
20 | #define MAX_VECTOR_SIZE 512
21 | 
22 | #include "vectorclass.h"               // vector class library
23 | #include "ranvec1.cpp"                 // random number generator
24 | #include "physseed.cpp"
25 | 
26 | 
27 | int main() {
28 |     // Random number generator object. The constructor argument 3 selects generator type 3.
29 |     Ranvec1 ran(3);
30 |     //Ranvec1 ran(3, 0);   // constructor with seed
31 | 
32 | #if true           // initialize with single seed
33 |     ran.init(0);
34 | #else              // initialize with array of seeds
35 |     const int numseeds = 5;
36 |     const int seeds[numseeds] = {5,4,3,2,1};
37 |     ran.initByArray(seeds, numseeds);
38 | #endif
39 | 
40 |     // Draw one vector of each kind
41 |     Vec16i intVec   = ran.random16i(0, 99);      // random integers in interval 0 - 99
42 |     Vec16f floatVec = ran.random16f();           // random floats in interval 0 - 1
43 | 
44 |     // Print the random integers, one element at a time
45 |     for (int k = 0; k < intVec.size(); ++k) printf("%3i  ", intVec[k]);
46 |     printf("\n\n");
47 | 
48 |     // Print the random floats the same way
49 |     for (int k = 0; k < floatVec.size(); ++k) printf("%8.4f  ", floatVec[k]);
50 |     printf("\n\n");
51 | 
52 |     // Accumulate the results of 1000 calls to random512b() in the integer vector
53 |     int remaining = 1000;
54 |     while (remaining-- > 0) intVec += ran.random512b();
55 | 
56 |     // Print element 7 of the sum in hexadecimal
57 |     printf("%X\n", intVec[7]);
58 | 
59 |     // Test physical seed generator
60 |     printf("\nSeed type %i\n", physicalSeedType());
61 | 
62 |     // Print two physical seeds. Must be independent if seed type > 1
63 |     printf("\nSeed = %08X %08X\n", physicalSeed(), physicalSeed());
64 | 
65 |     return 0;
66 | }
67 | 


--------------------------------------------------------------------------------
/random/physseed.cpp:
--------------------------------------------------------------------------------
 1 | /***************************  physseed.cpp   *********************************
 2 | * Author:        Agner Fog
 3 | * Date created:  2014-09-09
 4 | * Last modified: 2019-08-08
 5 | * Version:       2.01
 6 | * Project:       add-on package for vector class library
 7 | * Description:
 8 | * Physical seed generator for random number generator
 9 | *
10 | * (c) Copyright 2019 Agner Fog. Apache License version 2.0 or later.
11 | ******************************************************************************/
12 | 
13 | #include "ranvec1.h"
14 | 
15 | #ifdef VCL_NAMESPACE
16 | namespace VCL_NAMESPACE {
17 | #endif
18 | 
19 | 
20 | /******************************************************************************
21 |     Physical seed generation
22 | ******************************************************************************/
23 | 
24 | // Determine the type of physical seed that can be generated by current CPU.
25 | // Return value: 0: No physical seed. 1: CPU clock (consecutive calls are not
26 | // independent). 2: RDRAND instruction. 3: RDSEED instruction.
27 | int physicalSeedType() {
28 |     int abcd[4];                       // return values from cpuid instruction
29 |     cpuid (abcd, 0);                   // call cpuid function 0: eax = highest supported function
30 |     if (abcd[0] >= 7) {                // function 7 exists. An unsupported function may return unrelated data
31 |         cpuid (abcd, 7);               // call cpuid function 7
32 |         if (abcd[1] & (1 << 18)) return 3; // ebx bit 18: RDSEED available
33 |     }
34 |     cpuid (abcd, 1);                   // call cpuid function 1
35 |     if (abcd[2] & (1 << 30)) return 2; // ecx bit 30: RDRAND available
36 |     if (abcd[3] & (1 <<  4)) return 1; // edx bit  4: RDTSC available
37 |     return 0;
38 | }
39 | 
40 | // Return a random number that comes from a physical process.
41 | // physicalSeedType() tells which source of randomness is used.
42 | static int physicalSeedTypei = -1;     // cached seed type. negative means not determined yet
43 | int physicalSeed() {
44 |     if (physicalSeedTypei < 0) physicalSeedTypei = physicalSeedType(); // first call only
45 |     uint32_t seed = 0;                 // stays 0 if the seed type is none of 1, 2, 3
46 |     if (physicalSeedTypei == 1) {
47 |         // RDTSC: keep the low 32 bits of the value returned by __rdtsc()
48 |         seed = (uint32_t)__rdtsc();
49 |     }
50 |     else if (physicalSeedTypei == 2) {
51 |         // RDRAND: repeat until the step function returns nonzero
52 |         do {} while (_rdrand32_step(&seed) == 0);
53 |     }
54 |     else if (physicalSeedTypei == 3) {
55 |         // RDSEED: repeat until the step function returns nonzero
56 |         do {} while (_rdseed32_step(&seed) == 0);
57 |     }
58 | 
59 |     return (int)seed;                  // the 32-bit value is returned as int
60 | }
61 | 
62 | #ifdef VCL_NAMESPACE
63 | }
64 | #endif
65 | 


--------------------------------------------------------------------------------
/complex/test_complex.lst:
--------------------------------------------------------------------------------
  1 | # Test data for complex1.h under VCL
  2 | # To use with runtest.sh from testbench repository
  3 | 
  4 | $compiler=1
  5 | 
  6 | # Maximum instruction set supported by this compiler
  7 | # Set to 12 if compiler supports AVX512-FP16
  8 | $compilermax=12
  9 | 
 10 | $mode=64
 11 | 
 12 | # Testbench file
 13 | #$testbench=testbench_complex.cpp
 14 | $testbench=/mnt/c/_Public/VectorClass/special/complex/testbench_complex.cpp
 15 | 
 16 | # Path to include files
 17 | #$include=./
 18 | $include=/mnt/c/_Public/VectorClass/src2
 19 | 
 20 | # Intel emulator
 21 | $emulator=/home/agner/emulator/sde/sde
 22 | 
 23 | # Output file name
 24 | $outfile=test_complex.txt
 25 | 
 26 | # Random number seed
 27 | $seed=1
 28 | 
 29 | 
 30 | # test case, vector type, return type, instruction set
 31 | 
 32 | # half precision:
 33 | #################
 34 | 
 35 | 1 2 3 4 5 6 7 8 9 10 11 , Complex1h Complex2h Complex4h Complex8h Complex16h , , 2 4 6 7 8 9 10 12
 36 | 
 37 | # constructors
 38 | 20 21 22 , Complex1h Complex2h Complex4h Complex8h Complex16h , , 6 8 9 10 12
 39 | 23 , Complex2h Complex4h Complex8h Complex16h , , 6 8 9 10 12
 40 | 
 41 | # constructor from two halves, split into two halves
 42 | 23 24 , Complex2h Complex4h Complex8h Complex16h , , 5 8 9 10 12
 43 | # constructor from four complex scalars
 44 | 25 , Complex4h , , 5 8 9 10 12
 45 | # constructor from eight complex scalars
 46 | 26 , Complex8h , , 5 8 9 10 12
 47 | # constructor from 16 complex scalars
 48 | 27 , Complex16h , , 5 8 9 10 12
 49 | 
 50 | # Get real/imag part of complex scalar
 51 | 30 31 , Complex1h , , 4 8 9 10 12
 52 | 
 53 | # Get real/imag parts of complex vector
 54 | 32 33 34 , Complex8h Complex16h , , 4 8 9 10 12
 55 | 35 , Complex2h , , 4 8 9 10 12
 56 | 36 , Complex4h , , 4 8 9 10 12
 57 | 
 58 | # extract and insert
 59 | 39 49 , Complex1h Complex2h Complex4h Complex8h Complex16h , , 6 7 8 9 10 12
 60 | 
 61 | # various functions: extract, ==, !=, select, abs, sqrt, cexp
 62 | 40 41 42 43 50 55 56  500 , Complex1h Complex2h Complex4h Complex8h Complex16h , , 3 7 8 9 10 12
 63 | 103 104  , Complex1h Complex2h Complex4h Complex8h Complex16h , , 8 9 10 12
 64 | 
 65 | # to float
 66 | 51 , Complex1h Complex2h Complex4h Complex8h , , 3 7 8 9 10 12 
 67 | 
 68 | 
 69 | # single and double precision
 70 | #############################
 71 | 
 72 | # operators
 73 | 1 2 3 4 5 6 7 8 9 10 11 ,  Complex1f Complex2f Complex4f Complex8f Complex1d Complex2d Complex4d , , 2 5 7 8 9 10
 74 | 
 75 | # constructors
 76 | 20 21 22 23 , Complex1f Complex2f Complex4f  Complex8f Complex1d Complex2d Complex4d , , 3 6 8 9 10
 77 | 
 78 | # constructor from two halves
 79 | 24 , Complex2f Complex4f  Complex8f Complex2d Complex4d , , 4 5 8 9 10
 80 | 
 81 | # constructor from four complex scalars
 82 | 25 , Complex4f Complex4d , , 4 5 8 9 10
 83 | 
 84 | # constructor from 8 complex scalars
 85 | 26 , Complex8f , , 4 5 8 9 10
 86 | 
 87 | # real, imag scalars
 88 | 30 31 , Complex1f Complex1d  , , 7 8 10
 89 | 
 90 | # real, imag vectors
 91 | 32 33 , Complex2f Complex4f Complex8f Complex2d Complex4d , , 7 8 10
 92 | 
 93 | # extract and insert
 94 | 39 49 , Complex1f Complex2f Complex4f Complex8f Complex1d Complex2d Complex4d , , 6 7 8 9 10 12
 95 | 
 96 | # interleave real and imag parts
 97 | 34 , Complex4f Complex8f Complex2d Complex4d , , 7 8 10
 98 | 35 , Complex2f , , 7 8 10
 99 | 
100 | # to_float
101 | 51 , Complex1d Complex2d Complex4d , , 3 7 8 9 10
102 | 
103 | # to_double
104 | 52 , Complex1f Complex2f Complex4f , , 3 7 8 9 10
105 | 
106 | # various functions: extract, ==, !=, select, abs, sqrt
107 | 40 41 42 43 50 55 56 60  , Complex1f Complex2f Complex4f  Complex8f Complex1d Complex2d Complex4d , , 3 7 8 9 10
108 | 
109 | 103 104  , Complex1f Complex2f Complex4f  Complex8f  , , 8 9 10
110 | # double: no sufficiently accurate reference
111 | # 103 104  , Complex1d Complex2d Complex4d , ,  8 9 10
112 | 
113 | #  cexp, clog
114 | 500 501 , Complex1f Complex2f Complex4f  Complex8f Complex1d Complex2d Complex4d , , 3 7 8 9 10
115 | 


--------------------------------------------------------------------------------
/decimal/decimal.h:
--------------------------------------------------------------------------------
 1 | /***************************  decimal.h   *************************************
 2 | * Author:        Agner Fog
 3 | * Date created:  2012-07-08
 4 | * Last modified: 2019-07-20
 5 | * Version:       2.00
 6 | * Project:       Extension to vector class library
 7 | * Description:
 8 | * Functions for conversion between binary number vectors and comma-separated
 9 | * decimal ASCII lists.
10 | *
11 | * Please see decimal_manual.pdf for instructions
12 | *
13 | * (c) Copyright 2012-2019 Agner Fog. Apache License version 2.0 or later.
14 | ******************************************************************************/
15 | 
16 | #pragma once
17 | #include "vectorclass.h"
18 | 
19 | #ifdef VCL_NAMESPACE
20 | namespace VCL_NAMESPACE {
21 | #endif
22 | 
23 | 
24 | /*****************************************************************************
25 | *
26 | *               Conversion from binary to decimal ASCII string
27 | *
28 | *****************************************************************************/
29 | 
30 | // Convert the integers in vector a to a decimal ASCII string.
31 | // The numbers are written to string as decimal numbers in human-readable format.
32 | // Each number is right-justified with leading spaces in a field of fieldlen characters.
33 | int bin2ascii (              // NOTE(review): meaning of the int return value is not stated here - see decimal_manual.pdf
34 |     Vec4i const a,           // vector of integers to convert
35 |     char * string,           // buffer supplied by the caller to receive the decimal ascii numbers
36 |     int fieldlen = 8,        // length of each field, in characters
37 |     int numdat = 4,          // number of data (presumably how many elements of a are converted - confirm)
38 |     char ovfl = '*',         // overflow indicated by this character.
39 |                              // ovfl = 0 will make the field wide enough to contain the number
40 |     char separator = ',',    // character to separate fields. 0 for no separator
41 |     bool signd = true,       // true: data are interpreted as signed integers
42 |     bool term  = true);      // true: write a zero-terminated string
43 | 
44 | int bin2ascii (              // overload taking a Vec8i. Parameters as documented for the Vec4i overload
45 |     Vec8i const a,           // vector of integers to convert
46 |     char * string,           // buffer supplied by the caller to receive the decimal ascii numbers
47 |     int fieldlen = 8,        // length of each field, in characters
48 |     int numdat = 4,          // number of data. NOTE(review): default is 4, same as the Vec4i overload - confirm this is intended for Vec8i
49 |     char ovfl = '*',         // overflow indicated by this character.
50 |                              // ovfl = 0 will make the field wide enough to contain the number
51 |     char separator = ',',    // character to separate fields. 0 for no separator
52 |     bool signd = true,       // true: data are interpreted as signed integers
53 |     bool term  = true);      // true: write a zero-terminated string
54 | 
55 | 
56 | 
57 | /*****************************************************************************
58 | *
59 | * Conversion from comma-separated decimal ASCII string to binary number vector
60 | *
61 | *****************************************************************************/
62 | 
63 | /*
64 | The function ascii2bin shows how it is possible to parse a string of 
65 | variable-length fields without looping through the characters of the string. 
66 | It is quite a challenge, though. There are many special cases to take care
67 | of and to test. Whether it is worth the effort depends on whether string 
68 | parsing is a bottleneck. In many cases, data transfer is the bottleneck 
69 | that limits the speed, not data parsing. 
70 | This code may serve as a source of inspiration anyway.
71 | */
72 | 
73 | Vec8i ascii2bin(             // returns the converted numbers as an integer vector
74 |     const char * string,     // ASCII string containing numdat integers separated by the separator character
75 |     int * chars_read,        // Output: number of characters read
76 |     int * error,             // Output: errors will be indicated here, using the codes listed below
77 |     int max_stringlen = 64,  // Maximum length of string
78 |     int numdat = 8,          // Expected number of data in string. Max 8
79 |     char separator = ',');   // Character that separates the numbers in string
80 | 
81 | // Error codes returned in *error (NOTE(review): distinct bits, presumably combined when several errors occur - confirm):
82 | // 1:  parameters out of range
83 | // 2:  illegal character.    value will be interpreted as if this was a space
84 | // 4:  misplaced character.  value will be zero
85 | // 8:  too few separators.   value will be zero
86 | // 16: overflow.             value will be INT_MAX or INT_MIN
88 | 
89 | #ifdef VCL_NAMESPACE
90 | }
91 | #endif
92 | 


--------------------------------------------------------------------------------
/containers/general_containers.h:
--------------------------------------------------------------------------------
  1 | /************************  general_containers.h   *****************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2022-07-05
  4 | * Last modified: 2023-09-13
  5 | * Version:       2.02.00
  6 | * Description:
  7 | * Header file for general container classes
  8 | * These containers are independent of the vector class library and intended 
  9 | * for objects that are not VCL vectors.
 10 | * It may not be suitable for objects that have non-standard constructors, 
 11 | * copy constructors, move constructors, or destructors.
 12 | *
 13 | * Example:
 14 | 
 15 |   ContainerG<double> c;      // make container for type double
 16 |   c.set_size(10);            // allocate space for 10 objects
 17 |   c[2] = 88.8;               // change value of one object
 18 |   // print out all objects
 19 |   for (int i = 0; i < c.size(); i++) printf(" %.1f", c[i]); 
 20 |  
 21 | * For further instructions, see containers_manual.pdf
 22 | *
 23 | * (c) Copyright 2022 - 2023 Agner Fog.
 24 | * Apache License version 2.0 or later.
 25 | ******************************************************************************/
 26 | 
 27 | #ifndef GENERAL_CONTAINERS_H
 28 | #define GENERAL_CONTAINERS_H 20200
 29 | 
 30 | // Container class to store a variable number of objects of any type.
 31 | // This container does not rely on the vector class library
 32 | template <typename T>
 33 | class ContainerG {
 34 | protected:
 35 |     T * buf;                                     // allocated memory buffer containing array
 36 |     unsigned int allocatedSize;                  // size of allocated buffer
 37 |     unsigned int nobjects;                       // number of objects currently used
 38 |     void (*errorfunction)(void);                 // pointer to error handling function
 39 | public:
 40 |     ContainerG() {                               // constructor
 41 |         buf = 0;  allocatedSize = 0;  nobjects = 0;  errorfunction = 0;
 42 |     }
 43 |     ~ContainerG() {                              // destructor
 44 |         if (buf) delete[] buf;                   // free allocated memory
 45 |     }
 46 |     ContainerG(ContainerG&) = delete;            // prevent copying entire container (a copy constructor would have to allocate a new buffer)
 47 |     ContainerG operator = (ContainerG&) = delete;// prevent copying entire container
 48 |     int size() const {                           // get size as number of objects
 49 |         return nobjects;
 50 |     }
 51 |     int allocated_size() const {                 // maximum size that can be set without reallocation
 52 |         return allocatedSize;
 53 |     }
 54 |     void set_error_handler(void (*e)(void)) {    // set function pointer to error handler
 55 |         errorfunction = e;
 56 |     }
 57 |     void set_size(int size) {
 58 |         // Allocate, reallocate or deallocate buffer of specified size. size is the number of objects.
 59 |         // Setting size > allocated_size will allocate a bigger buffer, copy the old contents to it,
 60 |         // and value-initialize the remaining objects (zero for built-in types).
 61 |         // Setting size <= allocated_size only changes the used size. No data are moved or cleared.
 62 |         // Setting size <= 0 will discard all data and de-allocate the buffer.
 63 |         if (size <= 0) {                         // discard everything
 64 |             delete[] buf;                        // de-allocate buffer. deleting a null pointer is allowed
 65 |             buf = 0;  allocatedSize = 0;  nobjects = 0;
 66 |         }
 67 |         else if ((unsigned int)size <= allocatedSize) { // grow or shrink within allocated size
 68 |             nobjects = size;
 69 |         }
 70 |         else {                                   // increase allocated size
 71 |             unsigned int newallocsize;           // new size to allocate
 72 |             if ((unsigned int)size >= allocatedSize + allocatedSize/2) {
 73 |                 newallocsize = size;             // first time or big increase. allocate only the specified size
 74 |             }
 75 |             else {                               // small increase. allocate more than requested to avoid frequent reallocations
 76 |                 newallocsize = (unsigned int)size * 2u; // doubled as unsigned: size*2 would overflow int when size > INT_MAX/2
 77 |             }
 78 |             T * buf2 = new T[newallocsize]();    // allocate new buffer. () means value-initialize
 79 |             if (buf) {                           // previously allocated buffer exists
 80 |                 for (unsigned int i = 0; i < allocatedSize; i++) {
 81 |                     buf2[i] = buf[i];            // copy from old to new buffer, including objects beyond the used size
 82 |                 }
 83 |                 delete [] buf;                   // deallocate old buffer
 84 |             }
 85 |             // store pointer to new buffer
 86 |             buf = buf2;  allocatedSize = newallocsize;
 87 |             nobjects = size;                     // new used size
 88 |         }
 89 |     }
 90 |     T & operator [] (int index)  {               // access one object
 91 |         if ((unsigned int)index < nobjects) {
 92 |             return buf[index];                   // get reference to object
 93 |         }
 94 |         else {                                   // index out of range
 95 |             (*errorfunction)();                  // report error
 96 |             return buf[0];
 97 |         }
 98 |     }
 99 |     void load(int n, T const * p) {              // load n objects from array
100 |         if (n <= 0) return;                      // nothing to do
101 |         if ((unsigned int)n > nobjects) n = nobjects;// max size
102 |         for (int i = 0; i < n; i++) {
103 |             buf[i] = p[i];                       // load n objects
104 |         }
105 |     }
106 |     void store(int n, T * p) {                   // store n elements to array
107 |         if (n <= 0) return;                      // nothing to do
108 |         if (uint32_t(n) > nobjects) n = nobjects;// max size
109 |         for (int i = 0; i < n; i++) {
110 |             p[i] = buf[i];                       // store n objects
111 |         }
112 |     }
113 |     T * get_buf() {                              // get address of internal buffer. warning: address may change
114 |         return buf;
115 |     }
116 | };
117 | 
118 | 
119 | #endif // GENERAL_CONTAINERS_H
120 | 


--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |   
179 |    Copyright 2012-2019 Agner Fog.
180 | 
181 |    Licensed under the Apache License, Version 2.0 (the "License");
182 |    you may not use this file except in compliance with the License.
183 |    You may obtain a copy of the License at
184 | 
185 |        http://www.apache.org/licenses/LICENSE-2.0
186 | 
187 |    Unless required by applicable law or agreed to in writing, software
188 |    distributed under the License is distributed on an "AS IS" BASIS,
189 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190 |    See the License for the specific language governing permissions and
191 |    limitations under the License.
192 | 


--------------------------------------------------------------------------------
/physical_processors/physical_processors.cpp:
--------------------------------------------------------------------------------
  1 | /*********************  physical_processors.cpp   *****************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2019-10-29
  4 | * Last modified: 2021-05-04
  5 | * Version:       2.01 
  6 | * Project:       vector class library
  7 | * Description:   Detect number of physical and logical processors on CPU chip.
  8 | *                Compile for C++11 or later
  9 | *
 10 | * (c) Copyright 2019-2021 Agner Fog.
 11 | * Apache License version 2.0 or later.
 12 | *******************************************************************************
 13 | Some modern CPUs can run two threads in each CPU core when simultaneous 
 14 | multithreading (SMT, called hyperthreading by Intel) is enabled.
 15 | 
 16 | The number of physical processors is the number of CPU cores.
 17 | The number of logical processors is the same number multiplied by the number of
 18 | threads that can run simultaneously in each CPU core.
 19 | 
 20 | Simultaneous multithreading will slow down performance when two CPU-intensive 
 21 | threads running in the same physical processor (CPU core) are competing for the
 22 | same resources. Therefore, the optimal number of threads for CPU-intensive
 23 | tasks is most likely to be the number of physical processors. 
 24 | 
 25 | Tasks that are less CPU-intensive but limited by RAM access, disk access, 
 26 | network, etc. may get an advantage by running as many threads as the number of
 27 | logical processors. This will be double the number of physical processors when
 28 | simultaneous multithreading is enabled.
 29 | 
 30 | The physicalProcessors function detects the number of physical processors and
 31 | logical processors on an x86 computer. This is useful for determining the 
 32 | optimal number of threads.
 33 | 
 34 | 
 35 | Note: There are several problems in detecting the number of physical processors:
 36 | 
 37 | 1. The CPUID instruction on Intel CPUs will return a wrong number of logical
 38 |    processors when SMT (hyperthreading) is disabled. It may be necessary to 
 39 |    compare the number of processors returned by the CPUID instruction with the
 40 |    number of processors reported by the operating system to detect if SMT is 
 41 |    enabled (AMD processors do not have this problem).
 42 | 
 43 | 2. It is necessary to rely on system functions to detect if there is more than 
 44 |    one CPU chip installed. It is assumed that the status of SMT is the same on
 45 |    all CPU chips in a system.
 46 | 
 47 | 3. The behavior of VIA processors is undocumented.
 48 |    
 49 | 4. This function is not guaranteed to work on future CPUs. It may need updating
 50 |    when new CPUs with different configurations or different CPUID functionality
 51 |    appear.
 52 | ******************************************************************************/
 53 | 
 54 | #include <thread>     // std::thread functions
 55 | 
 56 | #ifdef _MSC_VER
 57 | #include <intrin.h>   // __cpuidex intrinsic function available on microsoft compilers
 58 | #endif
 59 | 
 60 | #ifdef VCL_NAMESPACE
 61 | namespace VCL_NAMESPACE {
 62 | #endif
 63 | 
 64 | // Define interface to CPUID instruction.
 65 | // input:  leaf = eax, subleaf = ecx
 66 | // output: output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx
 67 | static inline void cpuid(int output[4], int leaf, int subleaf = 0) {
 68 | #if defined(__GNUC__) || defined(__clang__)      // use inline assembly, Gnu/AT&T syntax
 69 |     int a, b, c, d;
 70 |     __asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(leaf), "c"(subleaf) : );
 71 |     output[0] = a;
 72 |     output[1] = b;
 73 |     output[2] = c;
 74 |     output[3] = d;
 75 | 
 76 | #elif defined (_MSC_VER)                         // Microsoft compiler, intrin.h included
 77 |     __cpuidex(output, leaf, subleaf);            // intrinsic function for CPUID
 78 | 
 79 | #else                                            // unknown platform. try inline assembly with masm/intel syntax
 80 |     __asm {
 81 |         mov eax, leaf
 82 |         mov ecx, subleaf
 83 |         cpuid;
 84 |         mov esi, output
 85 |         mov[esi], eax
 86 |         mov[esi + 4], ebx
 87 |         mov[esi + 8], ecx
 88 |         mov[esi + 12], edx
 89 |     }
 90 | #endif
 91 | }
 92 | 
 93 | // Function prototype:
 94 | int physicalProcessors(int * logical_processors = 0);
 95 | 
 96 | 
 97 | // Find the number of physical and logical processors supported by CPU
 98 | // Parameter: 
 99 | // logical_processors: an optional pointer to an integer that will receive the number of logical processors.
100 | // Return value: number of physical processors
101 | int physicalProcessors(int * logical_processors) {
102 |     int vendor = 0;                              // CPU vendor: 1 = Intel, 2 = AMD, 3 = VIA, 0 = other
103 |     int logicalProc = 1;                         // number of logical processor cores
104 |     int physicalProc = 1;                        // number of physical processor cores
105 |     int procPerCore = 1;                         // logical cores per physical core
106 |     bool hyperthreadingSupported = false;        // CPU supports hyperthreading / simultaneous multithreading
107 |     int systemProcessors = std::thread::hardware_concurrency(); // number of processors reported by operating system
108 | 
109 |     int abcd[4] = { 0,0,0,0 };                   // CPUID output
110 |     cpuid(abcd, 0);                              // CPUID function 0
111 | 
112 |     int maxLeaf = abcd[0];                       // maximum eax input for CPUID
113 |     if (abcd[2] == 0x6C65746E) {                 // last 4 chars of "GenuineIntel"
114 |         vendor = 1;
115 |     }
116 |     else if (abcd[2] == 0x444D4163) {            // last 4 chars of "AuthenticAMD"
117 |         vendor = 2;
118 |     }
119 |     else if (abcd[2] == 0x736C7561) {            // last 4 chars of "CentaurHauls"
120 |         vendor = 3;
121 |     }
122 | 
123 |     if (maxLeaf >= 1) {
124 |         cpuid(abcd, 1);
125 |         if (abcd[3] & (1 << 28)) {               // hyperthreading supported
126 |             hyperthreadingSupported = true;
127 |         }
128 |     }
129 | 
130 |     if (vendor == 1) {
131 |         //////////////////
132 |         //    Intel     //
133 |         //////////////////
134 | 
135 |         int hyper = 0;                           // hyperthreading status: 0 = unknown, 1 = disabled, 2 = enabled
136 |         if (maxLeaf >= 0xB) {                    // leaf 0xB or 0x1F: Extended Topology Enumeration
137 |             int num = 0xB;
138 |             // if (maxLeaf >= 0x1F) num = 0x1F;
139 | 
140 |             for (int c = 0; c < 5; c++) {
141 |                 cpuid(abcd, num, c);             // enumeration level c
142 |                 int type = (abcd[2] >> 8) & 0xFF;// enumeration type at level c
143 |                 if (type == 1) {                 // SMT level
144 |                     procPerCore = abcd[1] & 0xFFFF;
145 |                 }
146 |                 else if (type >= 2) {            // core level
147 |                     logicalProc = abcd[1] & 0xFFFF;
148 |                 }
149 |                 else if (type == 0) break;
150 |                 // There are more types/levels to consider if we use num = 0x1F. We may need  
151 |                 // to fix this in the future if CPUs with more complex configurations appear
152 |             }
153 |             physicalProc = logicalProc / procPerCore;
154 | 
155 |             // The number of performance monitor registers depends on hyperthreading status
156 |             // on Intel CPUs with performance monitoring version 3 or 4
157 |             cpuid(abcd, 0xA, 0);                 // performance monitor counters information
158 |             int perfVersion = abcd[0] & 0xFF;    // performance monitoring version
159 |             int perfNum = (abcd[0] >> 8) & 0xFF; // number of performance monitoring registers
160 |             if (perfVersion == 3 || perfVersion == 4) {
161 |                 if (perfNum == 4) {
162 |                     hyper = 2;                   // 4 performance registers when hyperthreading enabled
163 |                 }
164 |                 else if (perfNum == 8) {         // 8 performance registers when hyperthreading disabled
165 |                     hyper = 1;
166 |                     procPerCore = 1;
167 |                     logicalProc = physicalProc;  // reduce the number of logical processors when hyperthreading is disabled
168 |                 }
169 |                 // hyper remains 0 in all other cases, indicating unknown status
170 |             }
171 |         }
172 |         else if (maxLeaf >= 4) {                 // CPUID function 4: cache parameters and cores
173 |             cpuid(abcd, 4);
174 |             logicalProc = (abcd[0] >> 26) + 1;
175 |             if (hyperthreadingSupported) {
176 |                 // number of logical processors per core is not known. Assume 2 if hyperthreading supported
177 |                 procPerCore = 2;
178 |             }
179 |             physicalProc = logicalProc / procPerCore;
180 |         }
181 |         else {
182 |             // no information. Assume 1 processor
183 |         }
184 |         if (systemProcessors > logicalProc) {
185 |             // Multiple CPU chips. Assume that chips are identical with respect to hypethreading
186 |             physicalProc = systemProcessors * physicalProc / logicalProc;
187 |             logicalProc = systemProcessors;
188 |         }
189 |         else if (logicalProc > systemProcessors && systemProcessors > 0 && hyper == 0) {
190 |             // Hyperthreading is disabled
191 |             logicalProc = systemProcessors;
192 |             physicalProc = systemProcessors;        
193 |         }
194 |     }
195 |     else if (vendor == 2) {
196 | 
197 |         //////////////////
198 |         //    AMD       //
199 |         //////////////////
200 | 
201 |         cpuid(abcd, 0x80000000);                 // AMD specific CPUID functions
202 |         int maxLeaf8 = abcd[0] & 0xFFFF;         // maximum eax 0x8000.... input for CPUID
203 | 
204 |         if (maxLeaf8 >= 8) {
205 |             cpuid(abcd, 0x80000008);
206 |             logicalProc = (abcd[2] & 0xFF) + 1;
207 | 
208 |             if (maxLeaf8 >= 0x1E) {
209 |                 cpuid(abcd, 0x8000001E);
210 |                 procPerCore = ((abcd[1] >> 8) & 0x03) + 1;
211 |                 // procPerCore = 2 if simultaneous multithreading is enabled, 1 if disabled
212 |             }
213 |             else {
214 |                 if (hyperthreadingSupported) {
215 |                     procPerCore = 2;
216 |                 }
217 |                 else {
218 |                     procPerCore = 1;
219 |                 }
220 |             }
221 |             physicalProc = logicalProc / procPerCore;
222 |         }
223 |         else if (hyperthreadingSupported) {
224 |             // number of logical processors per core is not known. Assume 2 if SMT supported
225 |             logicalProc = 2;
226 |             physicalProc = 1;
227 |         }
228 |         if (systemProcessors > logicalProc) {
229 |             // Multiple CPU chips. Assume that chips are identical with respect to SMT
230 |             physicalProc = systemProcessors * physicalProc / logicalProc;
231 |             logicalProc = systemProcessors;
232 |         }
233 |     }
234 |     else {
235 |     
236 |         //////////////////////////////
237 |         //    VIA or unknown CPU    //
238 |         //////////////////////////////
239 | 
240 |         // The behavior of VIA processors is undocumented! It is not known how to detect threads on a VIA processor
241 |         physicalProc = logicalProc = systemProcessors;
242 |         if (hyperthreadingSupported && physicalProc > 1) {
243 |             physicalProc /= 2;
244 |         }
245 |     }
246 |     if (logical_processors) {
247 |         // return logical_processors if pointer is not null
248 |         *logical_processors = logicalProc;
249 |     }
250 |     return physicalProc;
251 | }
252 | 
253 | #ifdef VCL_NAMESPACE
254 | }
255 | #endif
256 | 
257 | /* Uncomment this for testing:
258 | 
259 | #include <stdio.h>
260 | 
261 | int main() {
262 | 
263 |     int logicalProc = 0;
264 |     int physicalProc = physicalProcessors(&logicalProc); 
265 | 
266 |     printf("\nlogical processors: %i",  logicalProc);
267 |     printf("\nphysical processors: %i", physicalProc);
268 |     printf("\nlogical processors per core: %i", logicalProc / physicalProc);
269 |     int sysproc = std::thread::hardware_concurrency();
270 |     printf("\nsystem processors: %i", sysproc); 
271 | 
272 |     return 0;
273 | }
274 | */
275 | 


--------------------------------------------------------------------------------
/decimal/decimal_manual.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[11pt,a4paper,oneside,openright]{report}
  2 | 
  3 | \usepackage[bindingoffset=5mm,left=20mm,right=20mm,top=20mm,bottom=20mm,footskip=10mm]{geometry}
  4 | \usepackage[utf8x]{inputenc}
  5 | \usepackage{hyperref}
  6 | \usepackage[english]{babel}
  7 | \usepackage{listings}
  8 | \usepackage{subfiles}
  9 | \usepackage{longtable}
 10 | \usepackage{multirow}
 11 | \usepackage{ragged2e} 
 12 | \usepackage{cmap} % avoid fi ligatures in pdf file
 13 | \usepackage{amsthm} % example numbering
 14 | \usepackage{color}
 15 | %\usepackage{bold-extra} % for bold tt font. Remember to include bold-extra.sty file
 16 | \usepackage{graphicx}
 17 | \usepackage[yyyymmdd]{datetime}
 18 | \usepackage{float}
 19 | 
 20 | % style for code listing
 21 | \renewcommand{\familydefault}{\sfdefault}
 22 | \newtheorem{example}{Example}[chapter]  % example numbering
 23 | \lstset{language=C}                     % formatting for code listing
 24 | \lstset{basicstyle=\ttfamily,breaklines=true}
 25 | \definecolor{darkGreen}{rgb}{0,0.4,0}
 26 | \definecolor{mybrown}{rgb}{0.40,0.10,0.05}
 27 | \lstset{commentstyle=\color{darkGreen}}  % comments color
 28 | \lstset{keywordstyle=\color{blue}}       % keyword color
 29 | \lstset{stringstyle=\color{mybrown}}     % string color
 30 | \lstset{showstringspaces=false}          % don't mark spaces in strings
 31 | 
 32 | \renewcommand{\dateseparator}{-}
 33 | 
 34 | % command for turning indent back on after \flushleft
 35 | \newcommand{\indenton}{\RaggedRight\parindent=15pt}
 36 | 
 37 | % command for vertical space
 38 | \newcommand{\vspacesmall}{\vspace{3mm}}
 39 | \newcommand{\vspacebig}{\vspace{6mm}}
 40 | 
 41 | % style for code inlined in text:
 42 | \newcommand{\codei}[1]{\bfseries \ttfamily{#1}\normalfont}
 43 | 
 44 | 
 45 | \begin{document}
 46 | 
 47 | \begin{titlepage}
 48 |     \centering
 49 |    
 50 |     \null %empty box needed for vfill to work
 51 |     \vfill
 52 | 
 53 |    {\bfseries\Huge
 54 |     decimal.cpp
 55 |     \vspacesmall
 56 |     
 57 |     Functions for conversion of integer vectors to and from
 58 |     comma-separated lists of   
 59 |     decimal ASCII numbers
 60 |     \vspacesmall
 61 |         
 62 |     Extension to C++ vector class library 
 63 |     \vspacebig
 64 |         
 65 |    }        
 66 |     \vspacebig
 67 |     
 68 |    {\Large    
 69 |     Agner Fog
 70 |     \vspacebig
 71 |     
 72 |     \copyright\ \today. Apache license 2.0
 73 |    }
 74 |     
 75 |     \vfill
 76 |     
 77 |     \includegraphics[width=306pt]{freesoftwarelogo.jpg}
 78 |     \vfill
 79 |     
 80 | \end{titlepage}
 81 | 
 82 | \RaggedRight
 83 | 
 84 | \chapter{Introduction}\label{chap:Introduction}
 85 | The decimal ASCII extension to the Vector Class Library contains functions for conversion of integer vectors to and from comma-separated lists of numbers as human-readable decimal ASCII strings. This is useful for efficient reading and writing of comma-separated files.
 86 | \vspacesmall
 87 | 
 88 | These functions cannot read or write floating point numbers. If you have fractional numbers, then you may consider if the numbers can be converted to integers by appropriate scaling. For example, if you have dollars with two decimals, then you can multiply the numbers by 100 to get cents as integer numbers. This will make data handling faster.
 89 | \vspacesmall
 90 | 
 91 | These functions are highly efficient. Whether this efficiency actually shows in the overall program performance depends on whether string processing is a bottleneck. In many applications, the transfer of data files is the limiting bottleneck, rather than string processing. 
 92 | \vspacesmall
 93 | 
 94 | Anyway, the code presented here can serve as an interesting showcase. The code illustrates how strings can be processed or parsed in parallel using vector instructions in a highly efficient way. The ascii2bin function shows that it is possible to parse a string with variable-length fields without looping through the characters of the string.
 95 | \vspacesmall
 96 | 
 97 | 
 98 | \section{Compiling} \label{Compiling}
 99 | The decimal extension to the Vector Class Library consists of the files decimal.cpp and decimal.h. The decimal.cpp file is added to the project that needs it, and the decimal.h file is \#included in C++ files that call these functions.
100 | \vspacesmall
101 | 
102 | The decimal extension to the Vector Class Library is compiled in the same way as the Vector Class Library itself. All x86 and x86-64 platforms are supported, including Windows, Linux, and Mac OS. 
103 | The following C++ compilers can be used: Gnu, Clang, Microsoft, and Intel. 
104 | See the manual for the vector class library for further details.
105 | 
106 | 
107 | \chapter{Binary to decimal ASCII conversion}\label{chap:b2aConversion}
108 | 
109 | The bin2ascii function has the following parameters:
110 | 
111 | \begin {table}[H]
112 | \caption{bin2ascii function}
113 | \label{table:bin2asciiFunction}
114 | \begin{tabular}{|p{24mm}|p{130mm}|}
115 | \hline
116 | \bfseries Parameter & \bfseries Description \\ \hline
117 | Vec4i a \newline Vec8i a & A vector of four or eight signed or unsigned integers.
118 |  \\ \hline
119 | char * string & This char array will receive the ASCII string of decimal numbers. It is the responsibility of the programmer that the array is big enough to contain the resulting string, even in case of overflow. \\ \hline
120 | int fieldlen & Desired length of each field in the output list. \\ \hline
121 | int numdat & Number of data fields to write. The maximum value is 4 or 8 for Vec4i and Vec8i, respectively. \\ \hline
122 | char ovfl & This ASCII character will indicate overflow if a number is too big to fit into a field of size \codei{fieldlen}. The default value is '*'. The field will be extended to fit the number if \codei{ovfl} is set to 0 (without quotes). \\ \hline
123 | char separator & This ASCII character will be used as separator between the number fields. The default value is ','. No separator will be used if \codei{separator} is set to 0 (without quotes). \\ \hline
124 | bool signd & Set this to \codei{true} (default) to write signed numbers. Set to \codei{false} if the input vector should be interpreted as unsigned numbers. \\  \hline
125 | bool term & Indicates whether the written ASCII string should be zero-terminated. The default is true. A terminating zero will not be written if \codei{term} is false. \\ \hline
126 | Return value & The bin2ascii function returns the length of the written string. \\ \hline
127 | \end{tabular}
128 | \end{table}
129 | \vspacebig
130 | 
131 | 
132 | This example shows how to use the bin2ascii function:
133 | 
134 | \begin{example}
135 | \label{example1}
136 | \end{example} % frame disappears if I put this after end lstlisting
137 | \begin{lstlisting}[frame=single]
138 | // Example for binary to decimal ASCII conversion
139 | #include <stdio.h>
140 | #include "vectorclass.h"        
141 | #include "decimal.h" 
142 | #include "decimal.cpp"
143 | 
144 | int main() {
145 |     // make a vector of eight integers
146 |     Vec8i a(1, 20, 300, 4000, -12345, 67890, 1234567890, 8);
147 |     // make a char array big enough to hold the string
148 |     char text[100];
149 |     // convert to human-readable decimal ASCII numbers
150 |     bin2ascii(a, text, 6, 8, '*', ',', true, true);
151 |     // print text
152 |     printf("List of numbers:\n%s\n", text);
153 |     return 0;
154 | }
155 | /* The output will be:
156 | List of numbers:
157 |      1,    20,   300,  4000,-12345, 67890,******,     8
158 | */    
159 | 
160 | \end{lstlisting}
161 | \vspacesmall
162 | 
163 | 
164 | 
165 | \chapter{Decimal ASCII to binary conversion}\label{chap:a2bConversion}
166 | 
167 | The ascii2bin function has the following parameters:
168 | 
169 | \begin {table}[H]
170 | \caption{ascii2bin function}
171 | \label{table:ascii2binFunction}
172 | \begin{tabular}{|p{32mm}|p{120mm}|}
173 | \hline
174 | \bfseries Parameter & \bfseries Description \\ \hline
175 | const char * string & An ASCII string containing integer numbers separated by comma or by some other separator character. \\ \hline
176 | int * chars\_read & This parameter will receive the number of characters that the function has read. In other words, the part of the string that has been used by the function. \\ \hline
177 | int * error & This parameter will receive an indication of any errors. The error codes are listed below. \\ \hline
178 | int max\_stringlen & The maximum length of the string that the function is allowed to read. Any contents after max\_stringlen will be ignored.
179 | The string may be shorter than max\_stringlen if terminated by a zero or newline. \\ \hline
180 | int numdat & The number of data fields to read. The maximum value is 8. \\ \hline
181 | char separator & The character used as separator between numbers. The default is ',' \\ \hline
182 | Return value & The function returns a vector of type Vec8i, containing up to eight signed integers. \\ \hline
183 | \end{tabular}
184 | \end{table}
185 | \vspacesmall
186 | 
187 | The input string must be an ASCII string using the following syntax.
188 | The string can contain up to eight fields, each containing a signed or unsigned integer. The fields are separated by the character indicated as \codei{separator}. The default separator is a comma. Each field can contain an optional sign (\codei{+} or \codei{-}) followed by any number of digits 0 - 9. Spaces are allowed before and after the number, and between the sign and the number. No other characters are allowed. Nothing is allowed between the digits.
189 | \vspacesmall
190 | 
191 | A separator (comma) after the last field is not required, but it can be useful to prevent the function from reading any irrelevant text that comes after the relevant fields, which would cause syntax errors. A terminating separator will be included in \codei{chars\_read}.
192 | \vspacesmall
193 | 
194 | The following error codes are returned in \codei{*error} in case of syntax errors in the string:
195 | 
196 | \begin {table}[H]
197 | \caption{ascii2bin error codes}
198 | \label{table:ascii2binErrorCodes}
199 | \begin{tabular}{|p{24mm}|p{130mm}|}
200 | \hline
201 | \bfseries Error code & \bfseries Description \\ \hline
202 | 1 & Parameters out of range. This happens if numdat \textgreater{} 8 or max\_stringlen \textgreater{} 10000.  \\ \hline
203 | 2 & Illegal character. This happens if a numeric field contains any other characters than +, -, 0-9, or space. The value will be interpreted as if the illegal character was a space, if possible. \\ \hline
204 | 4 & Misplaced character. This happens if a + or - sign is placed after the number rather than before the number, or if there is anything between the digits. \newline The resulting value will be zero. \\ \hline
205 | 8 & Too few separators. The string has fewer than \codei{numdat-1} separators. The remaining values will be zero. \\ \hline
206 | 16 & Overflow. A value is too big to fit into a 32-bit signed integer. The resulting value will be INT\_MAX or INT\_MIN. \\ \hline
207 | 0 & Missing value. An empty field will be interpreted as a zero. This is not indicated as an error. \\ \hline
208 | \end{tabular}
209 | \end{table}
210 | 
211 | The error codes can be combined if the string has multiple syntax errors.
212 | 
213 | \vspacesmall
214 | Any control characters in the string, such as newline or tab, will be interpreted as end of string, unless the same character is indicated as separator. A Windows newline cannot be used as separator because it consists of two control characters (\textbackslash r\textbackslash n).
215 | \vspacebig
216 | 
217 | This example shows how to use the ascii2bin function:
218 | 
219 | \begin{example}
220 | \label{example2}
221 | \end{example} % frame disappears if I put this after end lstlisting
222 | \begin{lstlisting}[frame=single]
223 | // Example for conversion from comma-separated decimal ASCII 
224 | // string to binary vector
225 | #include <stdio.h>
226 | #include "vectorclass.h"        
227 | #include "decimal.h" 
228 | #include "decimal.cpp"
229 | 
230 | int main() {
231 |     // text string to interpret
232 |     char text[] = " 1, -20, 30, , 555, -6, 7000, 88888  ";
233 | 
234 |     // length and error will be returned in these variables
235 |     int length, error;
236 | 
237 |     // interpret the comma-separated string
238 |     Vec8i dat = ascii2bin(text, &length, &error, 64, 8, ',');
239 | 
240 |     // check if syntax error
241 |     if (error) {    
242 |         printf ("\nerror = 0x%X",error);
243 |     }
244 |     else {
245 |         // write results
246 |         for (int i = 0; i < 8; i++) {
247 |             printf("%i ", dat[i]);
248 |         }
249 |     }
250 |     return 0;
251 | }
252 | // Program output:
253 | // 1 -20 30 0 555 -6 7000 88888
254 | \end{lstlisting}
255 | \vspacesmall
256 | 
257 | 
258 | The \codei{chars\_read} variable tells where to begin the next read if the string or line contains more than eight numbers. This is illustrated in the next example:
259 | 
260 | \begin{example}
261 | \label{example3}
262 | \end{example} % frame disappears if I put this after end lstlisting
263 | \begin{lstlisting}[frame=single]
264 | // Example for converting a comma-separated decimal ASCII 
265 | // string containing more than eight numbers
266 | #include <stdio.h>
267 | #include "vectorclass.h"        
268 | #include "decimal.h" 
269 | #include "decimal.cpp"
270 | 
271 | int main() {
272 |     // text string containing twelve numbers
273 |     char text[] = " 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37";
274 | 
275 |     // length and error will be returned in these variables
276 |     int length1, length2, error;
277 | 
278 |     // data vectors of eight integers each
279 |     Vec8i dat1, dat2;
280 | 
281 |     // read first eight numbers
282 |     dat1 = ascii2bin(text, &length1, &error, 64, 8, ',');
283 | 
284 |     // check if syntax error
285 |     if (!error) {
286 |         // read the next four numbers
287 |         dat2 = 
288 |         ascii2bin(text + length1, &length2, &error, 64, 4, ',');
289 |     }
290 | 
291 |     // check if syntax error
292 |     if (error) {    
293 |         printf ("\nerror 0x%X",error);
294 |     }
295 |     else {
296 |         // join the two data vectors
297 |         Vec16i dat12(dat1, dat2);
298 | 
299 |         // write results
300 |         for (int i = 0; i < 12; i++) {
301 |             printf("%i ", dat12[i]);
302 |         }
303 |     }
304 |     return 0;
305 | }
306 | \end{lstlisting}
307 | \vspacesmall
308 | 
309 | 
310 | \section{Efficiency} \label{Efficiency}
311 | 
312 | The ascii2bin function can be highly efficient. The performance is highest if the following conditions are satisfied:
313 | 
314 | \begin{itemize}
315 |   \item The input string is no more than 64 characters long
316 |   \item No number is more than 8 characters long, including sign
317 |   \item The code is compiled for the highest instruction set supported by the CPU it is running on. The following instruction set extensions give particular advantage: AVX2, AVX512BW, and the future AVX512VBMI2\end{itemize}
318 | \vspacesmall
319 | 
320 | 
321 | \end{document}
322 | 


--------------------------------------------------------------------------------
/containers/vector_containers.h:
--------------------------------------------------------------------------------
  1 | /************************  vector_containers.h   ******************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2022-07-04
  4 | * Last modified: 2023-11-25
  5 | * Version:       2.02.00
  6 | * Project:       vector class library
  7 | * Description:
  8 | * Header file for container classes
  9 | * These containers can contain vector class objects and matrixes
 10 | *
 11 | * For instructions, see containers_manual.pdf
 12 | *
 13 | * (c) Copyright 2022 - 2023 Agner Fog.
 14 | * Apache License version 2.0 or later.
 15 | ******************************************************************************/
 16 | 
 17 | #ifndef VECTOR_CONTAINERS_H
 18 | #define VECTOR_CONTAINERS_H 20200
 19 | 
 20 | #ifdef VCL_NAMESPACE
 21 | namespace VCL_NAMESPACE {
 22 | #endif
 23 | 
 24 | 
 25 | // Container class to store n vector class objects of type V
 26 | template <typename V, int n>
 27 | class ContainerV {
 28 | protected:
 29 |     V buf[n];                                    // array of vectors
 30 |     int s_count() const {                        // used internally
 31 |         constexpr int s = V::size();             // vector size
 32 |         static_assert((s & s-1) == 0, "vector size must be power of 2"); // check that vector size is a power of 2
 33 |         return bit_scan_reverse_const(s);        // shift count for fast division by vector size
 34 |     }
 35 |     void (*errorfunction)(void) = 0;             // pointer to error handling function
 36 | public:
 37 |     ContainerV() = default;                      // default constructor
 38 |     void set_error_handler(void (*e)(void)) {    // set function pointer to error handler
 39 |         errorfunction = e;
 40 |     }
 41 |     typedef decltype (buf[0][0]) etype;          // type of vector elements
 42 |     static constexpr int n_vectors() {           // get number of vectors
 43 |         return n;
 44 |     }
 45 |     static constexpr int n_elements() {          // get number of vector elements
 46 |         return n * V::size();
 47 |     }
 48 |     static constexpr int elementtype() {         // info about vector element type and container type
 49 |         return  V::elementtype() | 0x1000;
 50 |     }
 51 |     static constexpr bool is_fp() {              // true if elements are a floating point type
 52 |         return (V::elementtype() & 0x3F) >= 15;
 53 |     }
 54 |     V get_vector(int index) const {              // extract one vector
 55 |         if (uint32_t(index) < n) {
 56 |             return buf[index];                   // get vector
 57 |         }
 58 |         else {                                   // index out of range
 59 |             (*errorfunction)();                  // call error handler
 60 |             if constexpr(is_fp()) {
 61 |                 return nan_vec<V>(2);            // floating point type. return NAN
 62 |             }
 63 |             else {
 64 |                 return V(etype(0));              // integer type. return 0
 65 |             }
 66 |         }
 67 |     }
 68 |     void set_vector(V x, int index) {            // insert one vector
 69 |         if (uint32_t(index) < n) {
 70 |             buf[index] = x;                      // set vector
 71 |         }
 72 |         else {                                   // error
 73 |             (*errorfunction)();                  // call error handler
 74 |         }
 75 |     }
 76 |     etype get_element(uint32_t index) const {    // extract one vector element
 77 |         if (index < (uint32_t)n_elements()) {
 78 |             return buf[index >> s_count()][index & (V::size() - 1)];        
 79 |         }
 80 |         else {                                   // index out of range
 81 |             (*errorfunction)();                  // call error handler
 82 |             if constexpr(is_fp()) {
 83 |                 return nan_vec<V>(2)[0];         // floating point type. return NAN
 84 |             }
 85 |             else {
 86 |                 return 0;                        // integer type. return 0
 87 |             }
 88 |         }
 89 |     }
 90 |     void set_element(etype x, uint32_t index) {  // insert one vector element
 91 |         if (index < (uint32_t)n_elements()) {
 92 |             buf[index >> s_count()].insert(index & (V::size()-1), x);
 93 |         }
 94 |         else {                                   // error
 95 |             (*errorfunction)();                  // call error handler
 96 |         }
 97 |     }
 98 |     void load(int nn, void const * p) {          // load nn elements from array
 99 |         if (nn <= 0) return;                     // nothing to do
100 |         if (nn > n_elements()) nn = n_elements();// max size
101 |         int m = (uint32_t)nn >> s_count();       // number of full vectors to load
102 |         int i;                                   // loop counter
103 |         for (i = 0; i < m; i++) {
104 |             buf[i].load((etype const*)p + i * V::size()); // store one vector
105 |         }
106 |         int partial = nn & (V::size() - 1);      // any partial store needed
107 |         if (partial) {                           // nn is not divisible by vector size
108 |             // load partial vector in the end
109 |             buf[i].load_partial(partial, (etype const*)p + i * V::size()); // load part of last vector       
110 |         }
111 |     }
112 |     void store(int nn, void * p) {               // store nn elements to array
113 |         if (nn <= 0) return;                     // nothing to do
114 |         if (nn > n_elements()) nn = n_elements();// max size
115 |         int m = (uint32_t)nn >> s_count();       // number of full vectors to store
116 |         int i;                                   // loop counter
117 |         for (i = 0; i < m; i++) {
118 |             buf[i].store((etype*)p + i * V::size()); // store one vector
119 |         }
120 |         int partial = nn & (V::size() - 1);      // any partial store needed
121 |         if (partial) {                           // nn is not divisible by vector size
122 |             // store partial vector in the end
123 |             buf[i].store_partial(partial, (etype*)p + i * V::size()); // store part of last vector       
124 |         }
125 |     }
126 |     V * get_buf() {                              // get address of internal buffer
127 |         return buf;
128 |     }
129 |     void zero() {                                // set all contents to zero
130 |         for (int i = 0; i < n; i++) {
131 |             buf[i] = V(etype(0));
132 |         }
133 |     }
134 | };
135 | 
136 | 
137 | 
138 | // Container class to store a variable number of vector class objects of type V
139 | template <typename V>
140 | class ContainerV <V, 0> {
141 | protected:
142 |     V * buf;                                     // allocated memory buffer containing array of vectors
143 |     uint32_t allocatedSize;                      // size of allocated buffer
144 |     uint32_t nvectors;                           // number of vectors currently used (includes partially used)
145 |     uint32_t nelements;                          // number of vector elements currently used
146 |     void (*errorfunction)(void);                 // pointer to error handling function
147 |     int s_count() const {                        // used internally
148 |         constexpr int s = V::size();             // vector size
149 |         static_assert((s & s-1) == 0, "vector size must be a power of 2"); // check that vector size is a power of 2
150 |         return bit_scan_reverse_const(s);        // shift count for fast division by vector size
151 |     }
152 | public:
153 |     ContainerV() {                               // constructor
154 |         buf = 0;  allocatedSize = 0;  nvectors = 0;  nelements = 0;  errorfunction = 0;
155 |     }
156 |     ~ContainerV() {                              // destructor
157 |         if (buf) delete[] buf;                   // free allocated memory
158 |     }
159 |     ContainerV(ContainerV&) = delete;            // prevent copying entire container (a copy constructor would have to allocate a new buffer)
160 |     ContainerV operator = (ContainerV&) = delete;// prevent copying entire container
161 |     void set_error_handler(void (*e)(void)) {    // set function pointer to error handler
162 |         errorfunction = e;
163 |     }
164 |     typedef decltype (buf[0][0]) etype;          // type of vector elements
165 |     static constexpr int elementtype() {         // info about vector element type and container type
166 |         return V::elementtype() | 0x1000;
167 |     }
168 |     static constexpr bool is_fp() {              // true if elements are a floating point type
169 |         return (V::elementtype() & 0x3F) >= 15;
170 |     }
171 |     int n_vectors() const {                      // get number of vectors
172 |         return nvectors;
173 |     }
174 |     int n_elements() const {                     // get number of vector elements
175 |         return nelements;
176 |     }
177 |     int allocated_size() const {                 // maximum size that can be set without reallocation
178 |         return allocatedSize;
179 |     }
180 |     void set_nvectors(int size) {
181 |         // Allocate, reallocate or deallocate buffer of specified size. size is the number of vectors.
182 |         // Setting size > allocated_size will allocate more buffer and fill it with zeroes
183 |         // Setting size < allocated_size will decrease size so that some of the data are inaccessible
184 |         // Setting size = 0 will discard all data and de-allocate the buffer.
185 |         if (size <= 0) {                         // discard everything         
186 |             if (buf) delete[] buf;               // de-allocate buffer
187 |             buf = 0;  allocatedSize = 0;  nvectors = 0;  nelements = 0;
188 |         }
189 |         else if (uint32_t(size) <= allocatedSize) { // grow or shrink within allocated size
190 |             nvectors = size;  nelements = size * V::size();
191 |         }
192 |         else {                                   // increase allocated size
193 |             uint32_t newallocsize;               // new size to allocate
194 |             if (uint32_t(size) >= allocatedSize + allocatedSize/2) {
195 |                 newallocsize = size;             // first time or big increase. allocate only the specified size
196 |             }
197 |             else {
198 |                 newallocsize = size*2;           // small increase. allocate more than requested to avoid frequent reallocations
199 |             }
200 |             V * buf2 = 0;                        // pointer to new buffer
201 |             buf2 = new V[newallocsize]();        // allocate new buffer. () means initialize to zero
202 |             uint32_t i = 0;                      // loop counter
203 |             if (buf) {                           // previously allocated buffer exists
204 |                 for (i = 0; i < allocatedSize; i++) {
205 |                     buf2[i] = buf[i];            // copy from old to new buffer
206 |                 }
207 |                 delete [] buf;                   // deallocate old buffer         
208 |             }
209 |             // store pointer to new buffer
210 |             buf = buf2;  allocatedSize = newallocsize;
211 |             nvectors = size;  nelements = size * V::size(); // new used size        
212 |         }
213 |     }
214 |     void set_nelements(int n) {
215 |         // Allocate, reallocate or deallocate buffer of specified size, not necessarily a multiple of the vector size
216 |         int nv = uint32_t(n + V::size() - 1) >> s_count(); // round up to nearest multiple of the vector size
217 |         set_nvectors(nv);
218 |         nelements = n;
219 |     }
220 |     V get_vector(int index) const {              // extract one vector
221 |         if (uint32_t(index) < nvectors) {
222 |             return buf[index];                   // get vector
223 |         }
224 |         else {                                   // index out of range
225 |             (*errorfunction)();                  // call error handler
226 |             if constexpr(is_fp()) {
227 |                 return nan_vec<V>(2);            // floating point type. return NAN
228 |             }
229 |             else {
230 |                 return V(etype(0));              // integer type. return 0
231 |             }
232 |         }
233 |     }
234 |     void set_vector(V x, int index) {            // insert one vector
235 |         if (uint32_t(index) < nvectors) {
236 |             buf[index] = x;                      // set vector
237 |         }
238 |         else {                                   // error
239 |             (*errorfunction)();                  // call error handler
240 |         }
241 |     }
242 |     etype get_element(uint32_t index) const {    // extract one vector element
243 |         if (index <  uint32_t(nelements)) {
244 |             return buf[index >> s_count()][index & (V::size() - 1)];        
245 |         }
246 |         else {                                   // index out of range
247 |             (*errorfunction)();                  // call error handler
248 |             if constexpr(is_fp()) {
249 |                 return nan_vec<V>(2)[0];         // floating point type. return NAN
250 |             }
251 |             else {
252 |                 return 0;                        // integer type. return 0
253 |             }
254 |         }
255 |     }
256 |     void set_element(etype x, uint32_t index) {  // insert one vector element
257 |         if (index <  uint32_t(nelements)) {
258 |             buf[index >> s_count()].insert(index & (V::size()-1), x);
259 |         }
260 |         else {                                   // error
261 |             (*errorfunction)();                  // call error handler
262 |         }
263 |     }
264 |     void load(int n, void const * p) {           // load n elements from array
265 |         if (n <= 0) return;                      // nothing to do
266 |         if (uint32_t(n) > nelements) n = nelements;// max size
267 |         int m = (uint32_t)n >> s_count();        // number of full vectors to load
268 |         int i;                                   // loop counter
269 |         for (i = 0; i < m; i++) {
270 |             buf[i].load((etype const*)p + i * V::size()); // store one vector
271 |         }
272 |         int partial = n & (V::size() - 1);       // any partial store needed
273 |         if (partial) {                           // n is not divisible by vector size
274 |             // load partial vector in the end
275 |             buf[i].load_partial(partial, (etype const*)p + i * V::size()); // load part of last vector       
276 |         }
277 |     }
278 |     void store(int n, void * p) {                // store n elements to array
279 |         if (n <= 0) return;                      // nothing to do
280 |         if (uint32_t(n) > nelements) n = nelements;// max size
281 |         int m = (uint32_t)n >> s_count();        // number of full vectors to store
282 |         int i;                                   // loop counter
283 |         for (i = 0; i < m; i++) {
284 |             buf[i].store((etype*)p + i * V::size()); // store one vector
285 |         }
286 |         int partial = n & (V::size() - 1);       // any partial store needed
287 |         if (partial) {                           // n is not divisible by vector size
288 |             // store partial vector in the end
289 |             buf[i].store_partial(partial, (etype*)p + i * V::size()); // store part of last vector       
290 |         }
291 |     }
292 |     V * get_buf() {                              // get address of internal buffer. warning: address may change
293 |         return buf;
294 |     }
295 |     void zero() {                                // set all contents to zero
296 |         for (uint32_t i = 0; i < nvectors; i++) {
297 |             buf[i] = V(etype(0));
298 |         }
299 |     }
300 | };
301 | 
302 | #ifdef VCL_NAMESPACE
303 | }
304 | #endif
305 | 
306 | #endif // VECTOR_CONTAINERS_H
307 | 


--------------------------------------------------------------------------------
/quaternion/quaternion_manual.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[11pt,a4paper,oneside,openright]{report}
  2 | 
  3 | \usepackage[bindingoffset=5mm,left=20mm,right=20mm,top=20mm,bottom=20mm,footskip=10mm]{geometry}
  4 | \usepackage[utf8x]{inputenc}
  5 | \usepackage{hyperref}
  6 | \usepackage[english]{babel}
  7 | \usepackage{listings}
  8 | \usepackage{subfiles}
  9 | \usepackage{longtable}
 10 | \usepackage{multirow}
 11 | \usepackage{ragged2e} 
 12 | \usepackage{cmap} % avoid fi ligatures in pdf file
 13 | \usepackage{amsthm} % example numbering
 14 | \usepackage{color}
 15 | %\usepackage{bold-extra} % for bold tt font. Remember to include bold-extra.sty file
 16 | \usepackage{graphicx}
 17 | \usepackage[yyyymmdd]{datetime}
 18 | \usepackage{float}
 19 | 
 20 | % style for code listing
 21 | \renewcommand{\familydefault}{\sfdefault}
 22 | \newtheorem{example}{Example}[chapter]  % example numbering
 23 | \lstset{language=C}                     % formatting for code listing
 24 | \lstset{basicstyle=\ttfamily,breaklines=true}
 25 | \definecolor{darkGreen}{rgb}{0,0.4,0}
 26 | \definecolor{mybrown}{rgb}{0.40,0.10,0.05}
 27 | \lstset{commentstyle=\color{darkGreen}}  % comments color
 28 | \lstset{keywordstyle=\color{blue}}       % keyword color
 29 | \lstset{stringstyle=\color{mybrown}}     % string color
 30 | \lstset{showstringspaces=false}          % don't mark spaces in strings
 31 | 
 32 | \renewcommand{\dateseparator}{-}
 33 | 
 34 | % command for turning indent back on after \flushleft
 35 | \newcommand{\indenton}{\RaggedRight\parindent=15pt}
 36 | 
 37 | % command for vertical space
 38 | \newcommand{\vspacesmall}{\vspace{3mm}}
 39 | \newcommand{\vspacebig}{\vspace{6mm}}
 40 | 
 41 | % style for code inlined in text:
 42 | \newcommand{\codei}[1]{\bfseries \ttfamily{#1}\normalfont}
 43 | 
 44 | 
 45 | \begin{document}
 46 | 
 47 | \begin{titlepage}
 48 |     \centering
 49 |    
 50 |     \null %empty box needed for vfill to work
 51 |     \vfill
 52 | 
 53 |    {\bfseries\Huge
 54 |     Quaternion.h
 55 |     \vspacesmall
 56 |     
 57 |     Quaternion extension for 
 58 |     \vspacesmall
 59 |         
 60 |     C++ vector class library 
 61 |     \vspacebig
 62 |         
 63 |    }        
 64 |     \vspacebig
 65 |     
 66 |    {\Large    
 67 |     Agner Fog
 68 |     \vspacebig
 69 |     
 70 |     \copyright\ \today. Apache license 2.0
 71 |    }
 72 |     
 73 |     \vfill
 74 |     
 75 |     \includegraphics[width=306pt]{freesoftwarelogo.jpg}
 76 |     \vfill
 77 |     
 78 | \end{titlepage}
 79 | 
 80 | \RaggedRight
 81 | 
 82 | \chapter{Introduction}\label{chap:Introduction}
 83 | Quaternions, or hypercomplex numbers, are a topic in theoretical algebra and quantum physics. Applications relating to 3-D geometry and electromagnetism are better served with the vector3d package to VCL.
 84 | \vspacesmall
 85 | 
 86 | The file quaternion.h provides classes, operators, and functions for 
 87 | calculations with quaternions. This is an extension to the Vector Class Library.
 88 | \vspacesmall
 89 | 
 90 | The classes listed below are defined. Common operators and functions are defined for these classes:
 91 | 
 92 | \begin {table}[H]
 93 | \caption{Quaternion classes}
 94 | \label{table:QuaternionClasses}
 95 | \begin{tabular}{|p{24mm}|p{20mm}|p{20mm}|p{22mm}|p{20mm}|p{28mm}|}
 96 | \hline
 97 | \bfseries Quaternion class & \bfseries Precision &  \bfseries Quaternion elements per vector & \bfseries Correspon-ding real vector class & \bfseries Total bits & \bfseries Recommended minimum \newline instruction set \\ \hline
 98 | Quaternion1f  & \centering single & \centering  1 & \centering Vec4f & \centering 128 & SSE2 \\ \hline
 99 | Quaternion1d  & \centering double & \centering 1 & \centering Vec4d & \centering 256 & AVX \\ \hline
100 | \end{tabular}
101 | \end{table}
102 | \vspacebig
103 | 
104 | 
105 | 
106 | \section{Compiling} \label{Compiling}
107 | The quaternion class extension to the Vector Class Library is compiled in the same way as the Vector Class Library itself. All x86 and x86-64 platforms are supported, including Windows, Linux, and Mac OS. 
108 | The following C++ compilers can be used: Gnu, Clang, Microsoft, and Intel. 
109 | See the Vector class library manual for further details.
110 | \vspacesmall
111 | 
112 | This example shows how to use the quaternion classes:
113 | 
114 | \begin{example}
115 | \label{example1}
116 | \end{example} % frame disappears if I put this after end lstlisting
117 | \begin{lstlisting}[frame=single]
118 | // Example for quaternions
119 | #include <stdio.h>
120 | #include "vectorclass.h"  // vector class library
121 | #include "quaternion.h"   // quaternion extension
122 | 
123 | // function to print quaternion
124 | template <typename Q>
125 | void printqx (const char * text, Q a) {
126 |     auto aa = a.to_vector(); // get elements as real vector
127 |     printf("\n%s ", text);   // print text
128 |     printf("(%.3G,%.3G,%.3G,%.3G)", aa[0], aa[1], aa[2], aa[3]);
129 | }
130 | 
131 | int main() {
132 |     // define quaternions
133 |     Quaternion1d a(1,2,3,4);   //  1 + 2*i + 3*j + 4*k
134 |     Quaternion1d b(2,-3,-1,0); //  2 - 3*i - 1*j + 0*k
135 |     Quaternion1d c = a + b;    // add quaternions
136 |     Quaternion1d d = a * b;    // multiply quaternions
137 | 
138 |     // print results
139 |     printqx("a = ", a);        // a = (1,2,3,4)
140 |     printqx("b = ", b);        // b = (2,-3,-1,0)
141 |     printqx("c = ", c);        // c = (3,-1,2,4)
142 |     printqx("d = ", d);        // d = (11,5,-7,15)
143 | }
144 | 
145 | \end{lstlisting}
146 | \vspacesmall
147 | 
148 | 
149 | \chapter{Constructing quaternions and loading data} 
150 | \label{ConstructingQuaternions}
151 | 
152 | There are several ways to create quaternions and put data into them. These methods are listed here.
153 | \vspacebig
154 | 
155 | \begin{tabular}{|p{25mm}|p{100mm}|}
156 | \hline
157 | \bfseries Method & default constructor \\ \hline
158 | \bfseries Defined for & all quaternion classes \\ \hline
159 | \bfseries Description & the quaternion is created but not initialized.\newline
160 | The value is unpredictable \\ \hline
161 | \bfseries Efficiency & good \\ \hline
162 | \end{tabular}
163 | \vspacesmall
164 | 
165 | \begin{lstlisting}[frame=none]
166 | // Example:
167 | Quaternion1f a;    // creates a quaternion of four floats
168 | \end{lstlisting}
169 | \vspacebig
170 | 
171 | 
172 | \begin{tabular}{|p{25mm}|p{100mm}|}
173 | \hline
174 | \bfseries Method & Construct from single real \\ \hline
175 | \bfseries Defined for & all quaternion classes \\ \hline
176 | \bfseries Description & The parameter defines the real part. The imaginary parts are zero. \\ \hline
177 | \bfseries Efficiency & good \\ \hline
178 | \end{tabular}
179 | \vspacesmall
180 | 
181 | \begin{lstlisting}[frame=none]
182 | // Example:
183 | Quaternion1d a(3);  // a = (3,0,0,0)
184 | \end{lstlisting}
185 | \vspacebig
186 | 
187 | 
188 | \begin{tabular}{|p{25mm}|p{100mm}|}
189 | \hline
190 | \bfseries Method & Construct from one real and three imaginary parts \\ \hline
191 | \bfseries Defined for & all quaternion classes \\ \hline
192 | \bfseries Description & The parameters define the real and imaginary parts  \\ \hline
193 | \bfseries Efficiency & good \\ \hline
194 | \end{tabular}
195 | \vspacesmall
196 | 
197 | \begin{lstlisting}[frame=none]
198 | // Example:
199 | Quaternion1d a(1,2,3,4);  // a = (1,2,3,4)
200 | \end{lstlisting}
201 | \vspacebig
202 | 
203 | 
204 | \begin{tabular}{|p{25mm}|p{100mm}|}
205 | \hline
206 | \bfseries Method & Construct from two complex numbers \\ \hline
207 | \bfseries Defined for & all quaternion classes \\ \hline
208 | \bfseries Description & The second parameter is post-multiplied by j \\ \hline
209 | \bfseries Efficiency & good \\ \hline
210 | \bfseries Implementation & complexvec1.h must be included before quaternion.h \\ \hline
211 | \end{tabular}
212 | \vspacesmall
213 | 
214 | \begin{lstlisting}[frame=none]
215 | // Example:
216 | Complex1d a(1,2);    // a = 1 + i*2
217 | Complex1d b(3,4);    // b = 3 + i*4
218 | Quaternion1d c(a,b); // c = a + b*j = 1 + i*2 + j*3 + k*4
219 | \end{lstlisting}
220 | \vspacebig
221 | 
222 | \begin{tabular}{|p{25mm}|p{100mm}|}
223 | \hline
224 | \bfseries Method & member function load(p) \\ \hline
225 | \bfseries Defined for & all quaternion classes \\ \hline
226 | \bfseries Description & Load data from array of same precision. 
227 | Each real part must be followed by the corresponding three imaginary parts. \\ \hline
228 | \bfseries Efficiency & good \\ \hline
229 | \end{tabular}
230 | \vspacesmall
231 | 
232 | \begin{lstlisting}[frame=none]
233 | // Example:
234 | double a[4] = {1,2,3,4};
235 | Quaternion1d b;
236 | b.load(a);  // b = (1,2,3,4)
237 | \end{lstlisting}
238 | \vspacebig
239 | 
240 | 
241 | \begin{tabular}{|p{25mm}|p{100mm}|}
242 | \hline
243 | \bfseries Method & member function store(p) \\ \hline
244 | \bfseries Defined for & all quaternion classes \\ \hline
245 | \bfseries Description & Save data into array of same precision. 
246 | Each real part is followed by the corresponding three imaginary parts. \\ \hline
247 | \bfseries Efficiency & good \\ \hline
248 | \end{tabular}
249 | \vspacesmall
250 | 
251 | \begin{lstlisting}[frame=none]
252 | // Example:
253 | float a[4];
254 | Quaternion1f b(1,2,3,4);
255 | b.store(a);  // a = {1,2,3,4}
256 | \end{lstlisting}
257 | \vspacebig
258 | 
259 | 
260 | \begin{tabular}{|p{25mm}|p{100mm}|}
261 | \hline
262 | \bfseries Method & member function real() \\ \hline
263 | \bfseries Defined for & all quaternion classes \\ \hline
264 | \bfseries Description & Get real part of quaternion \\ \hline
265 | \bfseries Efficiency & good \\ \hline
266 | \end{tabular}
267 | \vspacesmall
268 | 
269 | \begin{lstlisting}[frame=none]
270 | // Example:
271 | Quaternion1d a(1,2,3,4);
272 | double r = a.real();  // r = 1
273 | \end{lstlisting}
274 | \vspacebig
275 | 
276 | 
277 | \begin{tabular}{|p{25mm}|p{100mm}|}
278 | \hline
279 | \bfseries Method & member function imag() \\ \hline
280 | \bfseries Defined for & all quaternion classes \\ \hline
281 | \bfseries Description & Get imaginary parts of quaternion. The real part is set to zero \\ \hline
282 | \bfseries Efficiency & good \\ \hline
283 | \end{tabular}
284 | \vspacesmall
285 | 
286 | \begin{lstlisting}[frame=none]
287 | // Example:
288 | Quaternion1d a(1,2,3,4);
289 | Quaternion1d im = a.imag();  // im = (0,2,3,4)
290 | \end{lstlisting}
291 | \vspacebig
292 | 
293 | 
294 | \begin{tabular}{|p{25mm}|p{100mm}|}
295 | \hline
296 | \bfseries Method & member function get\_low() \\ \hline
297 | \bfseries Defined for & all quaternion classes \\ \hline
298 | \bfseries Description & Get the real and the first imaginary part (i) as a complex vector \\ \hline
299 | \bfseries Efficiency & good \\ \hline
300 | \bfseries Implementation & complexvec1.h must be included before quaternion.h \\ \hline
301 | \end{tabular}
302 | \vspacesmall
303 | 
304 | \begin{lstlisting}[frame=none]
305 | // Example:
306 | Quaternion1d a(1,2,3,4);
307 | Complex1d b = a.get_low();  // b = (1,2)
308 | \end{lstlisting}
309 | \vspacebig
310 | 
311 | 
312 | \begin{tabular}{|p{25mm}|p{100mm}|}
313 | \hline
314 | \bfseries Method & member function get\_high() \\ \hline
315 | \bfseries Defined for & all quaternion classes \\ \hline
316 | \bfseries Description & Get the last two imaginary parts (j and k) as a complex vector \\ \hline
317 | \bfseries Efficiency & good \\ \hline
318 | \bfseries Implementation & complexvec1.h must be included before quaternion.h \\ \hline
319 | \end{tabular}
320 | \vspacesmall
321 | 
322 | \begin{lstlisting}[frame=none]
323 | // Example:
324 | Quaternion1d a(1,2,3,4);
325 | Complex1d b = a.get_low();  // b = (1,2)
326 | Complex1d c = a.get_high(); // c = (3,4)
327 | Quaternion1d d(b,c);        // d = (1,2,3,4)
328 | \end{lstlisting}
329 | \vspacebig
330 | 
331 | 
332 | 
333 | \chapter{Operators}\label{chap:Operators}
334 | 
335 | \begin{tabular}{|p{25mm}|p{100mm}|}
336 | \hline
337 | \bfseries Operator & + \\ \hline
338 | \bfseries Defined for & all quaternion classes  \\ \hline
339 | \bfseries Description & Add two quaternions, or one quaternion and one real scalar of the same precision \\ \hline
340 | \bfseries Efficiency & good \\ \hline
341 | \end{tabular}
342 | \vspacesmall
343 | 
344 | \begin{lstlisting}[frame=none]
345 | // Example:
346 | Quaternion1d a(1,2,3,4);
347 | Quaternion1d b(5,6,7,8);
348 | Quaternion1d c = a + b;    // c = (6,8,10,12)
349 | Quaternion1d d = a + 10.0; // d = (11,2,3,4)
350 | \end{lstlisting}
351 | \vspacebig
352 | 
353 | 
354 | \begin{tabular}{|p{25mm}|p{100mm}|}
355 | \hline
356 | \bfseries Operator & - \\ \hline
357 | \bfseries Defined for & all quaternion classes  \\ \hline
358 | \bfseries Description & Subtract two quaternions, or one quaternion and one real scalar of the same precision \\ \hline
359 | \bfseries Efficiency & good \\ \hline
360 | \end{tabular}
361 | \vspacesmall
362 | 
363 | \begin{lstlisting}[frame=none]
364 | // Example:
365 | Quaternion1d a(12,11,10,9);
366 | Quaternion1d b(5,6,7,8);
367 | Quaternion1d c = a - b;    // c = (7,5,3,1)
368 | Quaternion1d d = a - 10.0; // d = (2,11,10,9)
369 | \end{lstlisting}
370 | \vspacebig
371 | 
372 | 
373 | \begin{tabular}{|p{25mm}|p{100mm}|}
374 | \hline
375 | \bfseries Operator & * \\ \hline
376 | \bfseries Defined for & all quaternion classes  \\ \hline
377 | \bfseries Description & Multiply two quaternions, or one quaternion and one real scalar of the same precision. \newline 
378 | Multiplication of quaternions is not commutative, i.e. a*b and b*a are not the same.
379 | \\ \hline
380 | \bfseries Efficiency & medium \\ \hline
381 | \bfseries Accuracy & Quaternion multiplication involves the calculation of sums of products. Loss of precision may occur if the result is close to zero. \\ \hline
382 | \end{tabular}
383 | \vspacesmall
384 | 
385 | \begin{lstlisting}[frame=none]
386 | // Example:
387 | Quaternion1d a(1,2,3,4);
388 | Quaternion1d b(5,6,7,8);
389 | Quaternion1d c = a * b;    // c = (-60,12,30,24)
390 | Quaternion1d d = b * a;    // d = (-60,20,14,32)
391 | Quaternion1d e = a * 10.;  // e = (10,20,30,40)
392 | 
393 | \end{lstlisting}
394 | \vspacebig
395 | 
396 | 
397 | \begin{tabular}{|p{25mm}|p{100mm}|}
398 | \hline
399 | \bfseries Operator & / \\ \hline
400 | \bfseries Defined for & all quaternion classes  \\ \hline
401 | \bfseries Description & Divide two quaternions, or one quaternion and one real scalar of the same precision. \newline
402 | Division is defined as a / b = a * reciprocal(b) \\ \hline
403 | \bfseries Efficiency & medium \\ \hline
404 | \bfseries Accuracy & Quaternion division involves the calculation of sums of products. Loss of precision may occur if the result is close to zero. \\ \hline
405 | \end{tabular}
406 | \vspacesmall
407 | 
408 | \begin{lstlisting}[frame=none]
409 | // Example:
410 | Quaternion1f a(7,9,-1,7);
411 | Quaternion1f b(1,2,3,2);
412 | Quaternion1f c = a / b;    // c = (2,1,-1,-2)
413 | Quaternion1f d = c * b;    // d = (7,9,-1,7)
414 | Quaternion1f e = b / 2.0f; // e = (0.5,1,1.5,1)
415 | Quaternion1f f = 18.f / b; // f = (1,-2,-3,-2)
416 | Quaternion1f g = f * b;    // g = (18,0,0,0)
417 | \end{lstlisting}
418 | \vspacebig
419 | 
420 | 
421 | \begin{tabular}{|p{25mm}|p{100mm}|}
422 | \hline
423 | \bfseries Operator & $\sim$ \\ \hline
424 | \bfseries Defined for & all quaternion classes  \\ \hline
425 | \bfseries Description & Complex conjugate. The signs of the imaginary parts are inverted \\ \hline
426 | \bfseries Efficiency & good \\ \hline
427 | \end{tabular}
428 | \vspacesmall
429 | 
430 | \begin{lstlisting}[frame=none]
431 | // Example:
432 | Quaternion1f a(1,2,3,4);
433 | Quaternion1f b = ~ a;    // b = (1,-2,-3,-4)
434 | \end{lstlisting}
435 | \vspacebig
436 | 
437 | 
438 | \begin{tabular}{|p{25mm}|p{100mm}|}
439 | \hline
440 | \bfseries Operator & == \\ \hline
441 | \bfseries Defined for & all quaternion classes  \\ \hline
442 | \bfseries Description & Compare for equality.\newline
443 | The result is a boolean scalar. \\ \hline
444 | \bfseries Efficiency & good \\ \hline
445 | \end{tabular}
446 | \vspacesmall
447 | 
448 | \begin{lstlisting}[frame=none]
449 | // Example:
450 | Quaternion1f a(1, 2,3,4);
451 | Quaternion1f b(1,-2,3,4);
452 | bool         c = (a == b);  // c = false
453 | \end{lstlisting}
454 | \vspacebig
455 | 
456 | 
457 | \begin{tabular}{|p{25mm}|p{100mm}|}
458 | \hline
459 | \bfseries Operator & != \\ \hline
460 | \bfseries Defined for & all quaternion classes  \\ \hline
461 | \bfseries Description & Compare for not equal.\newline
462 | The result is a boolean scalar. \\ \hline
463 | \bfseries Efficiency & good \\ \hline
464 | \end{tabular}
465 | \vspacesmall
466 | 
467 | \begin{lstlisting}[frame=none]
468 | // Example:
469 | Quaternion1f a(1, 2,3,4);
470 | Quaternion1f b(1,-2,3,4);
471 | bool         c = (a != b);  // c = true
472 | \end{lstlisting}
473 | \vspacebig
474 | 
475 | 
476 | \chapter{Mathematical functions}\label{chap:MathematicalFunctions}
477 | 
478 | 
479 | \begin{tabular}{|p{25mm}|p{100mm}|}
480 | \hline
481 | \bfseries Function & abs \\ \hline
482 | \bfseries Defined for & all quaternion classes  \\ \hline
483 | \bfseries Description & Gives the norm as a scalar \\ \hline
484 | \bfseries Efficiency & medium \\ \hline
485 | \end{tabular}
486 | \vspacesmall
487 | 
488 | \begin{lstlisting}[frame=none]
489 | // Example:
490 | Quaternion1f a(2,1,0,2);
491 | double       b = abs(a); // b = 3
492 | \end{lstlisting}
493 | \vspacebig
494 | 
495 | 
496 | 
497 | \chapter{Other functions}\label{chap:OtherFunctions}
498 | 
499 | 
500 | \begin{tabular}{|p{25mm}|p{100mm}|}
501 | \hline
502 | \bfseries Function & to\_vector \\ \hline
503 | \bfseries Defined for & all quaternion classes \\ \hline
504 | \bfseries Description & Convert to a vector of the real part and the three imaginary parts. \\ \hline
505 | \bfseries Efficiency & good \\ \hline
506 | \end{tabular}
507 | \vspacesmall
508 | 
509 | \begin{lstlisting}[frame=none]
510 | // Example:
511 | Quaternion1d a(1,2,3,4);
512 | Vec4d        b = a.to_vector(); // b = (1,2,3,4)
513 | \end{lstlisting}
514 | \vspacebig
515 | 
516 | 
517 | \begin{tabular}{|p{25mm}|p{100mm}|}
518 | \hline
519 | \bfseries Function & select \\ \hline
520 | \bfseries Defined for & all quaternion classes  \\ \hline
521 | \bfseries Description & Choose between two quaternions. \\ \hline
522 | \bfseries Efficiency & good \\ \hline
523 | \end{tabular}
524 | \vspacesmall
525 | 
526 | \begin{lstlisting}[frame=none]
527 | // Example:
528 | Quaternion1d a(1,2,3,4);
529 | Quaternion1d b(5,6,7,8);
530 | Quaternion1d c = select(true,a,b);  // c = (1,2,3,4)
531 | Quaternion1d d = select(false,a,b); // d = (5,6,7,8)
532 | \end{lstlisting}
533 | \vspacebig
534 | 
535 | 
536 | \end{document}
537 | 


--------------------------------------------------------------------------------
/vector3d/vector3d.h:
--------------------------------------------------------------------------------
  1 | /****************************  vector3d.h   ***********************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2012-08-01
  4 | * Last modified: 2023-05-14
  5 | * Version:       2.02.00
  6 | * Project:       Extension to vector class library
  7 | * Description:   Classes for 3-dimensional vectors, including operators and functions
  8 | * The following classes are defined:
  9 | * Vec3Df:        A vector of 3 single precision floats
 10 | * Vec3Dd:        A vector of 3 double precision floats
 11 | *
 12 | * (c) Copyright 2012-2023 Apache License version 2.0 or later
 13 | \*****************************************************************************/
 14 | 
 15 | #ifndef VECTOR3D_H
 16 | #define VECTOR3D_H  20200
 17 | 
 18 | #include "vectorclass.h"
 19 | #include <cmath>          // define math library functions
 20 | 
 21 | #if VECTORCLASS_H < 20000
 22 | #error Incompatible version of vector class library. Must use version 2 or later
 23 | #endif
 24 | 
 25 | #ifdef VCL_NAMESPACE
 26 | namespace VCL_NAMESPACE {
 27 | #endif
 28 | 
 29 | /*****************************************************************************
 30 | *
 31 | *               Class Vec3Df: vector of 3 single precision floats
 32 | *
 33 | *****************************************************************************/
 34 | 
 35 | class Vec3Df {
 36 | protected:
 37 |     __m128 xmm; // Float vector
 38 | public:
 39 |     // default constructor
 40 |     Vec3Df() = default;
 41 |     // construct from three coordinates
 42 |     Vec3Df(float x, float y, float z) {
 43 |         xmm = Vec4f(x, y, z, 0.f);
 44 |     }
 45 |     // Constructor to convert from Vec4f
 46 |     Vec3Df(Vec4f const x) {
 47 |         xmm = x;
 48 |         // cutoff(3);
 49 |     }
 50 |     // Constructor to convert from type __m128 used in intrinsics:
 51 |     Vec3Df(__m128 const x) {
 52 |         xmm = x;
 53 |     }
 54 |     // Assignment operator to convert from type __m128 used in intrinsics:
 55 |     Vec3Df & operator = (__m128 const x) {
 56 |         xmm = x;
 57 |         return *this;
 58 |     }
 59 |     // Type cast operator to convert to __m128 used in intrinsics
 60 |     operator __m128() const {
 61 |         return xmm;
 62 |     }
 63 |     // Member function to convert to vector
 64 |     Vec4f to_vector() const {
 65 |         return xmm;
 66 |     }
 67 |     // Member function to load from array
 68 |     Vec3Df & load(float const * p) {
 69 |         xmm = Vec4f().load_partial(3, p);
 70 |         return *this;
 71 |     }
 72 |     // Member function to store into array
 73 |     void store(float * p) const {
 74 |         Vec4f(xmm).store_partial(3, p);
 75 |     }
 76 |     // get x part
 77 |     float get_x() const {
 78 |         return _mm_cvtss_f32(xmm);
 79 |     }
 80 |     // get y part
 81 |     float get_y() const {
 82 |         return Vec4f(xmm).extract(1);
 83 |     }
 84 |     // get z part
 85 |     float get_z() const {
 86 |         return Vec4f(xmm).extract(2);
 87 |     }
 88 |     // Member function to extract one coordinate
 89 |     float extract(int index) const {
 90 |         return Vec4f(xmm).extract(index);
 91 |     }
 92 |     // Operator [] to extract one coordinate
 93 |     // Operator [] can only read an element, not write.
 94 |     float operator [] (uint32_t index) const {
 95 |         return extract(index);
 96 |     }
 97 |     // Insert one coordinate
 98 |     Vec3Df & insert (uint32_t index, float x) {
 99 |         xmm = Vec4f(xmm).insert(index, x);
100 |         return *this;
101 |     }
102 |     static constexpr int size() {
103 |         return 1;
104 |     }
105 |     static constexpr int elementtype() {
106 |         return 0x210;
107 |     }
108 | };
109 | 
110 | /*****************************************************************************
111 | *
112 | *          Operators for Vec3Df
113 | *
114 | *****************************************************************************/
115 | 
116 | // operator + : add
117 | static inline Vec3Df operator + (Vec3Df const a, Vec3Df const b) {
118 |     return Vec3Df(Vec4f(a) + Vec4f(b));
119 | }
120 | 
121 | // operator += : add
122 | static inline Vec3Df & operator += (Vec3Df & a, Vec3Df const b) {
123 |     a = a + b;
124 |     return a;
125 | }
126 | 
127 | // operator - : subtract
128 | static inline Vec3Df operator - (Vec3Df const a, Vec3Df const b) {
129 |     return Vec3Df(Vec4f(a) - Vec4f(b));
130 | }
131 | 
132 | // operator - : unary minus
133 | static inline Vec3Df operator - (Vec3Df const a) {
134 |     return Vec3Df(- Vec4f(a));
135 | }
136 | 
137 | // operator -= : subtract
138 | static inline Vec3Df & operator -= (Vec3Df & a, Vec3Df const b) {
139 |     a = a - b;
140 |     return a;
141 | }
142 | 
143 | // operator * : multiply element-by-element
144 | // (see also cross_product and dot_product)
145 | static inline Vec3Df operator * (Vec3Df const a, Vec3Df const b) {
146 |     return Vec3Df(Vec4f(a) * Vec4f(b));
147 | }
148 | 
149 | // operator *= : multiply element-by-element
150 | static inline Vec3Df & operator *= (Vec3Df & a, Vec3Df const b) {
151 |     a = a * b;
152 |     return a;
153 | }
154 | 
155 | // operator / : divide element-by-element
156 | static inline Vec3Df operator / (Vec3Df const a, Vec3Df const b) {
157 |     return Vec3Df(Vec4f(a) / Vec4f(b));
158 | }
159 | 
160 | // operator /= : divide element-by-element
161 | static inline Vec3Df & operator /= (Vec3Df & a, Vec3Df const b) {
162 |     a = a / b;
163 |     return a;
164 | }
165 | 
166 | // operator == : returns true if a == b
167 | static inline bool operator == (Vec3Df const a, Vec3Df const b) {
168 |     Vec4fb t1 = Vec4f(a) == Vec4f(b);
169 | #if INSTRSET >= 10
170 |     return (uint8_t(t1) & 7) == 7;
171 | #else
172 |     Vec4fb t2 = _mm_shuffle_ps(t1, t1, 0x24);  // ignore unused top element
173 |     return horizontal_and(t2);
174 | #endif
175 | }
176 | 
177 | // operator != : returns true if a != b
178 | static inline bool operator != (Vec3Df const a, Vec3Df const b) {
179 |     Vec4fb t1 = Vec4f(a) != Vec4f(b);
180 | #if INSTRSET >= 10
181 |     return (uint8_t(t1) & 7) != 0;
182 | #else
183 |     Vec4fb t2 = _mm_shuffle_ps(t1, t1, 0x24);  // ignore unused top element
184 |     return horizontal_or(t2);
185 | #endif
186 | }
187 | 
188 | /*****************************************************************************
189 | *
190 | *          Operators mixing Vec3Df and float
191 | *
192 | *****************************************************************************/
193 | 
194 | // operator * : multiply
195 | static inline Vec3Df operator * (Vec3Df const a, float b) {
196 |     return _mm_mul_ps(a, _mm_set1_ps(b));
197 | }
198 | static inline Vec3Df operator * (float a, Vec3Df const b) {
199 |     return b * a;
200 | }
201 | static inline Vec3Df & operator *= (Vec3Df & a, float & b) {
202 |     a = a * b;
203 |     return a;
204 | }
205 | 
206 | // operator / : divide
207 | static inline Vec3Df operator / (Vec3Df const a, float b) {
208 |     return _mm_div_ps(a, _mm_set1_ps(b));
209 | }
210 | 
211 | static inline Vec3Df & operator /= (Vec3Df & a, float b) {
212 |     a = a / b;
213 |     return a;
214 | }
215 | 
216 | 
217 | /*****************************************************************************
218 | *
219 | *          Functions for Vec3Df
220 | *
221 | *****************************************************************************/
222 | 
223 | // function cross_product
224 | static inline Vec3Df cross_product (Vec3Df const a, Vec3Df const b) {
225 |     Vec4f a1 = permute4<1,2,0,V_DC>(Vec4f(a));
226 |     Vec4f b1 = permute4<1,2,0,V_DC>(Vec4f(b));
227 |     Vec4f a2 = permute4<2,0,1,V_DC>(Vec4f(a));
228 |     Vec4f b2 = permute4<2,0,1,V_DC>(Vec4f(b));
229 |     Vec4f c  = a1 * b2 - a2 * b1;
230 |     return c.cutoff(3);
231 | }
232 | 
233 | // function dot_product
234 | static inline float dot_product (Vec3Df const a, Vec3Df const b) {
235 |     Vec4f c = (Vec4f(a) * Vec4f(b)).cutoff(3);
236 |     return horizontal_add(c);
237 | }
238 | 
239 | // function vector_length
240 | static inline float vector_length (Vec3Df const a) {
241 |     return std::sqrt(dot_product(a,a));
242 | }
243 | 
244 | // function normalize_vector
245 | static inline Vec3Df normalize_vector (Vec3Df const a) {
246 |     return a / vector_length(a);
247 | }
248 | 
249 | // function select
250 | static inline Vec3Df select (bool s, Vec3Df const a, Vec3Df const b) {
251 |     return s ? a : b;
252 | }
253 | 
254 | // function rotate
255 | // The vector a is rotated by multiplying by the matrix defined by the three columns col0, col1, col2
256 | static inline Vec3Df rotate (Vec3Df const col0, Vec3Df const col1, Vec3Df const col2, Vec3Df const a) {
257 |     Vec4f xbroad = permute4<0,0,0,V_DC>(Vec4f(a));  // broadcast x
258 |     Vec4f ybroad = permute4<1,1,1,V_DC>(Vec4f(a));  // broadcast y
259 |     Vec4f zbroad = permute4<2,2,2,V_DC>(Vec4f(a));  // broadcast z
260 |     Vec4f r = col0.to_vector() * xbroad + col1.to_vector() * ybroad + col2.to_vector() * zbroad;
261 |     return r.cutoff(3);
262 | }
263 | 
264 | 
265 | /*****************************************************************************
266 | *
267 | *               Class Vec3Dd: vector of 3 double precision floats
268 | *
269 | *****************************************************************************/
270 | 
271 | class Vec3Dd  {
272 | protected:
273 |     Vec4d yy; // vector of 4 doubles
274 | public:
275 |     // default constructor
276 |     Vec3Dd() = default;
277 |     // construct from three coordinates
278 |     Vec3Dd(double x, double y, double z) {
279 |         yy = Vec4d(x, y, z, 0.);
280 |     }
281 |     // Constructor to convert from Vec4d
282 |     Vec3Dd(Vec4d const x) {
283 |         yy = x;
284 |         // cutoff(3);
285 |     }
286 |     // Constructor to convert from type __m256d used in intrinsics or Vec256de used in emulation
287 | #if INSTRSET >= 7  // AVX
288 |     Vec3Dd(__m256d const x) {
289 |         yy = x;
290 |     }
291 | #else
292 |     Vec3Dd(Vec256de const x) {
293 |         yy = x;
294 |     }
295 | #endif
296 |     // Assignment operator to convert from type __m256d used in intrinsics or Vec256de used in emulation
297 | #if INSTRSET >= 7  // AVX
298 |     Vec3Dd & operator = (__m256d const x) {
299 | #else
300 |     Vec3Dd & operator = (Vec256de const x) {
301 | #endif
302 |         yy = x;
303 |         return *this;
304 |     }
305 |     // Type cast operator to convert to __m256d used in intrinsics or Vec256de used in emulation
306 | #if INSTRSET >= 7  // AVX
307 |     operator __m256d() const {
308 |         return yy;
309 |     }
310 | #endif
311 |     // Member function to load from array
312 |     Vec3Dd & load(double const * p) {
313 |         yy.load_partial(3, p);
314 |         return *this;
315 |     }
316 |     // Member function to store into array
317 |     void store(double * p) const {
318 |         yy.store_partial(3, p);
319 |     }
320 |     // Member function to convert to vector
321 |     Vec4d to_vector() const {
322 |         return yy;
323 |     }
324 |     // get x part
325 |     double get_x() const {
326 |         return _mm_cvtsd_f64(yy.get_low());
327 |     }
328 |     // get y part
329 |     double get_y() const {
330 |         return yy.extract(1);
331 |     }
332 |     // get z part
333 |     double get_z() const {
334 |         return yy.extract(2);
335 |     }
336 |     // Member function to extract one coordinate
337 |     double extract(uint32_t index) const {
338 |         return yy.extract(index);
339 |     }
340 |     // Operator [] to extract one coordinate
341 |     // Operator [] can only read an element, not write.
342 |     double operator [] (uint32_t index) const {
343 |         return extract(index);
344 |     }
345 |     // Insert one coordinate
346 |     Vec3Dd & insert (uint32_t index, double x) {
347 |         yy.insert(index, x);
348 |         return *this;
349 |     }
350 |     static constexpr int size() {
351 |         return 1;
352 |     }
353 |     static constexpr int elementtype() {
354 |         return 0x211;
355 |     }
356 | };
357 | 
358 | /*****************************************************************************
359 | *
360 | *          Operators for Vec3Dd
361 | *
362 | *****************************************************************************/
363 | 
364 | // operator + : add
365 | static inline Vec3Dd operator + (Vec3Dd const a, Vec3Dd const b) {
366 |     return Vec3Dd(a.to_vector() + b.to_vector());
367 | }
368 | 
369 | // operator += : add
370 | static inline Vec3Dd & operator += (Vec3Dd & a, Vec3Dd const b) {
371 |     a = a + b;
372 |     return a;
373 | }
374 | 
375 | // operator - : subtract
376 | static inline Vec3Dd operator - (Vec3Dd const a, Vec3Dd const b) {
377 |     return Vec3Dd(a.to_vector() - b.to_vector());
378 | }
379 | 
380 | // operator - : unary minus
381 | static inline Vec3Dd operator - (Vec3Dd const a) {
382 |     return Vec3Dd(- a.to_vector());
383 | }
384 | 
385 | // operator -= : subtract
386 | static inline Vec3Dd & operator -= (Vec3Dd & a, Vec3Dd const b) {
387 |     a = a - b;
388 |     return a;
389 | }
390 | 
391 | // operator * : multiply element-by-element
392 | // (see also cross_product and dot_product)
393 | static inline Vec3Dd operator * (Vec3Dd const a, Vec3Dd const b) {
394 |     return Vec3Dd(a.to_vector() * b.to_vector());
395 | }
396 | 
397 | // operator *= : multiply element-by-element
398 | static inline Vec3Dd & operator *= (Vec3Dd & a, Vec3Dd const b) {
399 |     a = a * b;
400 |     return a;
401 | }
402 | 
403 | // operator / : divide element-by-element
404 | static inline Vec3Dd operator / (Vec3Dd const a, Vec3Dd const b) {
405 |     return Vec3Dd(a.to_vector() / b.to_vector());
406 | }
407 | 
408 | // operator /= : divide element-by-element
409 | static inline Vec3Dd & operator /= (Vec3Dd & a, Vec3Dd const b) {
410 |     a = a / b;
411 |     return a;
412 | }
413 | 
414 | // operator == : returns true if a == b
415 | static inline bool operator == (Vec3Dd const a, Vec3Dd const b) {
416 |     Vec4db t1 = a.to_vector() == b.to_vector();
417 | #if INSTRSET >= 10
418 |     return (uint8_t(t1) & 7) == 7;
419 | #elif INSTRSET >= 7  // AVX
420 |     Vec4db t2 = Vec4db(permute4<0,1,2,2>(Vec4d(t1))); // ignore unused top element
421 |     return horizontal_and(t2);
422 | #else
423 |     Vec2db u0 = t1.get_low();
424 |     Vec2db u1 = t1.get_high();
425 |     u1 = permute2<0,0>(Vec2d(u1));                    // ignore unused top element
426 |     return horizontal_and(u0 & u1);
427 | #endif
428 | }
429 | 
430 | // operator != : returns true if a != b
431 | static inline bool operator != (Vec3Dd const a, Vec3Dd const b) {
432 |     Vec4db t1 = a.to_vector() != b.to_vector();
433 | #if INSTRSET >= 10
434 |     return (uint8_t(t1) & 7) != 0;
435 | #elif INSTRSET >= 7  // AVX
436 |     Vec4db t2 = Vec4db(permute4<0,1,2,2>(Vec4d(t1))); // ignore unused top element
437 |     return horizontal_and(t2);
438 | #else
439 |     Vec2db u0 = t1.get_low();
440 |     Vec2db u1 = t1.get_high();
441 |     u1 = permute2<0,0>(Vec2d(u1));                    // ignore unused top element
442 |     return horizontal_or(u0 | u1);
443 | #endif
444 | }
445 | 
446 | /*****************************************************************************
447 | *
448 | *          Operators mixing Vec3Dd and double
449 | *
450 | *****************************************************************************/
451 | 
452 | // operator * : multiply
453 | static inline Vec3Dd operator * (Vec3Dd const a, double b) {
454 |     return a.to_vector() * Vec4d(b);
455 | }
456 | static inline Vec3Dd operator * (double a, Vec3Dd const b) {
457 |     return b * a;
458 | }
459 | static inline Vec3Dd & operator *= (Vec3Dd & a, double & b) {
460 |     a = a * b;
461 |     return a;
462 | }
463 | 
464 | // operator / : divide
465 | static inline Vec3Dd operator / (Vec3Dd const a, double b) {
466 |     return a.to_vector() / Vec4d(b);
467 | }
468 | 
469 | static inline Vec3Dd & operator /= (Vec3Dd & a, double b) {
470 |     a = a / b;
471 |     return a;
472 | }
473 | 
474 | 
475 | /*****************************************************************************
476 | *
477 | *          Functions for Vec3Dd
478 | *
479 | *****************************************************************************/
480 | 
481 | // function cross_product
482 | static inline Vec3Dd cross_product (Vec3Dd const a, Vec3Dd const b) {
483 |     Vec4d a1 = permute4<1,2,0,V_DC>(a.to_vector());
484 |     Vec4d b1 = permute4<1,2,0,V_DC>(b.to_vector());
485 |     Vec4d a2 = permute4<2,0,1,V_DC>(a.to_vector());
486 |     Vec4d b2 = permute4<2,0,1,V_DC>(b.to_vector());
487 |     Vec4d c  = a1 * b2 - a2 * b1;
488 |     return c.cutoff(3);
489 | }
490 | 
491 | // function dot_product
492 | static inline double dot_product (Vec3Dd const a, Vec3Dd const b) {
493 |     Vec4d c  = (a.to_vector() * b.to_vector()).cutoff(3);
494 |     return horizontal_add(c);
495 | }
496 | 
497 | // function vector_length
498 | static inline double vector_length (Vec3Dd const a) {
499 |     return std::sqrt(dot_product(a,a));
500 | }
501 | 
502 | // function normalize_vector
503 | static inline Vec3Dd normalize_vector (Vec3Dd const a) {
504 |     return a / vector_length(a);
505 | }
506 | 
507 | // function select
508 | static inline Vec3Dd select (bool s, Vec3Dd const a, Vec3Dd const b) {
509 |     return s ? a : b;
510 | }
511 | 
512 | // function rotate
513 | // The vector a is rotated by multiplying by the matrix defined by the three columns col0, col1, col2
514 | static inline Vec3Dd rotate (Vec3Dd const col0, Vec3Dd const col1, Vec3Dd const col2, Vec3Dd const a) {
515 |     Vec3Dd xbroad = permute4<0,0,0,V_DC>(a.to_vector());  // broadcast x
516 |     Vec3Dd ybroad = permute4<1,1,1,V_DC>(a.to_vector());  // broadcast y
517 |     Vec3Dd zbroad = permute4<2,2,2,V_DC>(a.to_vector());  // broadcast z
518 |     Vec3Dd r = col0 * xbroad + col1 * ybroad + col2 * zbroad;
519 |     return r.to_vector().cutoff(3);
520 | }
521 | 
522 | 
523 | /*****************************************************************************
524 | *
525 | *          Conversion functions
526 | *
527 | *****************************************************************************/
528 | 
529 | // function to_single: convert Vec3Dd to Vec3Df
530 | static inline Vec3Df to_float(Vec3Dd const a) {
531 | #if INSTRSET >= 7  // AVX
532 |     return _mm256_cvtpd_ps(a);
533 | #else
534 |     //return Vec3Df(Vec4f(compress(a.to_vector().get_low(), a.to_vector().get_high())));
535 |     return to_float(a.to_vector());
536 | #endif
537 | }
538 | 
539 | // function to_double: convert Vec3Df to Vec3Dd
540 | static inline Vec3Dd to_double(Vec3Df const a) {
541 | #if INSTRSET >= 7  // AVX
542 |     return _mm256_cvtps_pd(a);
543 | #else
544 |     return to_double(a.to_vector());
545 | #endif
546 | }
547 | 
548 | #ifdef VCL_NAMESPACE
549 | }
550 | #endif
551 | 
552 | #endif  // VECTOR3D_H
553 | 


--------------------------------------------------------------------------------
/vector3d/vector3d_manual.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[11pt,a4paper,oneside,openright]{report}
  2 | 
  3 | \usepackage[bindingoffset=5mm,left=20mm,right=20mm,top=20mm,bottom=20mm,footskip=10mm]{geometry}
  4 | \usepackage[utf8x]{inputenc}
  5 | \usepackage{hyperref}
  6 | \usepackage[english]{babel}
  7 | \usepackage{listings}
  8 | \usepackage{subfiles}
  9 | \usepackage{longtable}
 10 | \usepackage{multirow}
 11 | \usepackage{ragged2e} 
 12 | \usepackage{cmap} % avoid fi ligatures in pdf file
 13 | \usepackage{amsthm} % example numbering
 14 | \usepackage{color}
 15 | %\usepackage{bold-extra} % for bold tt font. Remember to include bold-extra.sty file
 16 | \usepackage{graphicx}
 17 | \usepackage[yyyymmdd]{datetime}
 18 | \usepackage{float}
 19 | 
 20 | % style for code listing
 21 | \renewcommand{\familydefault}{\sfdefault}
 22 | \renewcommand{\ttdefault}{pcr} % selects Courier font
 23 | \newtheorem{example}{Example}[chapter]  % example numbering
 24 | \lstset{language=C}                     % formatting for code listing
 25 | \lstset{basicstyle=\ttfamily,breaklines=true}
 26 | \definecolor{darkGreen}{rgb}{0,0.4,0}
 27 | \definecolor{mybrown}{rgb}{0.40,0.10,0.05}
 28 | \lstset{commentstyle=\color{darkGreen}}  % comments color
 29 | \lstset{keywordstyle=\color{blue}}       % keyword color
 30 | \lstset{stringstyle=\color{mybrown}}     % string color
 31 | \lstset{showstringspaces=false}          % don't mark spaces in strings
 32 | 
 33 | \renewcommand{\dateseparator}{-}
 34 | 
 35 | % command for turning indent back on after \flushleft
 36 | \newcommand{\indenton}{\RaggedRight\parindent=15pt}
 37 | 
 38 | % command for vertical space
 39 | \newcommand{\vspacesmall}{\vspace{3mm}}
 40 | \newcommand{\vspacebig}{\vspace{6mm}}
 41 | 
 42 | % style for code inlined in text:
 43 | \newcommand{\codei}[1]{\bfseries \ttfamily{#1}\normalfont}
 44 | 
 45 | 
 46 | \begin{document}
 47 | 
 48 | \begin{titlepage}
 49 |     \centering
 50 |    
 51 |     \null %empty box needed for vfill to work
 52 |     \vfill
 53 | 
 54 |    {\bfseries\Huge
 55 |     Vector3d.h
 56 |     \vspacesmall
 57 |     
 58 |     3-dimensional vector extension for 
 59 |     \vspacesmall
 60 |         
 61 |     C++ vector class library 
 62 |     \vspacebig
 63 |         
 64 |    }        
 65 |     \vspacebig
 66 |     
 67 |    {\Large    
 68 |     Agner Fog
 69 |     \vspacebig
 70 |     
 71 |     \copyright\ \today. Apache license 2.0
 72 |    }
 73 |     
 74 |     \vfill
 75 |     
 76 |     \includegraphics[width=306pt]{freesoftwarelogo.jpg}
 77 |     \vfill
 78 |     
 79 | \end{titlepage}
 80 | 
 81 | \RaggedRight
 82 | 
 83 | \chapter{Introduction}\label{chap:Introduction}
 84 | 3-dimensional vectors are useful in geometry and physics.
 85 | The file vector3d.h provides vector classes, operators, and functions for 
 86 | calculations with 3-D vectors. This is an extension to the Vector Class Library.
 87 | \vspacesmall
 88 | 
 89 | The classes listed below are defined. Common operators and functions are defined for these classes:
 90 | 
 91 | \begin {table}[H]
 92 | \caption{3-D vector classes}
 93 | \label{table:Vector3DClasses}
 94 | \begin{tabular}{|p{24mm}|p{20mm}|p{20mm}|p{22mm}|p{20mm}|p{28mm}|}
 95 | \hline
 96 | \bfseries vector class & \bfseries Precision &  \bfseries 3-D vectors per instance & \bfseries Correspon-ding real vector class & \bfseries Total bits & \bfseries Recommended minimum \newline instruction set \\ \hline
 97 | Vec3Df  & \centering single & \centering  1 & \centering Vec4f & \centering 128 & SSE2 \\ \hline
 98 | Vec3Dd  & \centering double & \centering 1 & \centering Vec4d & \centering 256 & AVX \\ \hline
 99 | \end{tabular}
100 | \end{table}
101 | \vspacebig
102 | 
103 | 
104 | 
105 | \section{Compiling} \label{Compiling}
106 | The 3-D vector class extension to the Vector Class Library is compiled in the same way as the Vector Class Library itself. All x86 and x86-64 platforms are supported, including Windows, Linux, and Mac OS. 
107 | The following C++ compilers can be used: Gnu, Clang, Microsoft, and Intel. 
108 | See the Vector Class Library manual for further details.
109 | \vspacesmall
110 | 
111 | This example shows how to use the 3-D vector classes:
112 | 
113 | \begin{example}
114 | \label{example1}
115 | \end{example} % frame disappears if I put this after end lstlisting
116 | \begin{lstlisting}[frame=single]
117 | // Example for 3-D vectors
118 | #include <stdio.h>
119 | #include "vectorclass.h"  // vector class library
120 | #include "vector3d.h"     // extension for 3-D vectors
121 | 
122 | // function to print 3-D vector:
123 | template <typename V>
124 | void printv3 (const char * text, V a) {
125 |     auto aa = a.to_vector();  // get elements as real vector
126 |     printf("\n%s ", text);    // print text
127 |     printf("(%.3G,%.3G,%.3G)", aa[0], aa[1], aa[2]);
128 | }
129 | 
130 | int main() {
131 |     // define 3-D vectors
132 |     Vec3Dd a(1,2,3);                // x = 1, y = 2, z = 3
133 |     Vec3Dd b(4,5,6);                // x = 4, y = 5, z = 6
134 |     Vec3Dd c = a + b;               // add vectors
135 |     Vec3Dd d = cross_product(a, b); // x-product
136 |     double e = dot_product(a, b);   // dot-product
137 |     // print results
138 |     printv3("a = ", a);             // a = (1,2,3)
139 |     printv3("b = ", b);             // b = (4,5,6)
140 |     printv3("c = ", c);             // c = (5,7,9)
141 |     printv3("d = ", d);             // d = (-3,6,-3)
142 |     printf ("\ne = %f", e);         // e = 32
143 | }
144 | \end{lstlisting}
145 | \vspacesmall
146 | 
147 | 
148 | \chapter{Constructing 3-D vectors and loading data} 
149 | \label{Constructing3Dvectors}
150 | 
151 | There are several ways to create 3-D vectors and put data into them. These methods are listed here.
152 | \vspacebig
153 | 
154 | \begin{tabular}{|p{25mm}|p{100mm}|}
155 | \hline
156 | \bfseries Method & default constructor \\ \hline
\bfseries Defined for & all 3-D vector classes \\ \hline
158 | \bfseries Description & the 3-D vector is created but not initialized.\newline
159 | The value is unpredictable \\ \hline
160 | \bfseries Efficiency & good \\ \hline
161 | \end{tabular}
162 | \vspacesmall
163 | 
164 | \begin{lstlisting}[frame=none]
165 | // Example:
166 | Vec3Dd a;    // creates a 3-D vector
167 | \end{lstlisting}
168 | \vspacebig
169 | 
170 | 
171 | \begin{tabular}{|p{25mm}|p{100mm}|}
172 | \hline
173 | \bfseries Method & Construct from x,y,z coordinates \\ \hline
\bfseries Defined for & all 3-D vector classes \\ \hline
175 | \bfseries Description & The parameters define the x, y, and z coordinates \\ \hline
176 | \bfseries Efficiency & good \\ \hline
177 | \end{tabular}
178 | \vspacesmall
179 | 
180 | \begin{lstlisting}[frame=none]
181 | // Example:
182 | Vec3Dd a(1,2,3);  // a = (1,2,3) (x = 1, y = 2, z = 3)
183 | \end{lstlisting}
184 | \vspacebig
185 | 
186 | \begin{tabular}{|p{25mm}|p{100mm}|}
187 | \hline
188 | \bfseries Method & member function load(p) \\ \hline
\bfseries Defined for & all 3-D vector classes \\ \hline
190 | \bfseries Description & Load data from array of same precision. \\ \hline
191 | \bfseries Efficiency & good \\ \hline
192 | \end{tabular}
193 | \vspacesmall
194 | 
195 | \begin{lstlisting}[frame=none]
196 | // Example:
197 | float a[3] = {2,5,-1};
198 | Vec3Df b;
199 | b.load(a);  // b = (2,5,-1)
200 | \end{lstlisting}
201 | \vspacebig
202 | 
203 | 
204 | \begin{tabular}{|p{25mm}|p{100mm}|}
205 | \hline
206 | \bfseries Method & member function store(p) \\ \hline
\bfseries Defined for & all 3-D vector classes \\ \hline
208 | \bfseries Description & Save data into array of same precision \\ \hline
209 | \bfseries Efficiency & good \\ \hline
210 | \end{tabular}
211 | \vspacesmall
212 | 
213 | \begin{lstlisting}[frame=none]
214 | // Example:
215 | double a[3];
216 | Vec3Dd b(4,0,3);
217 | b.store(a);  // a = {4,0,3}
218 | \end{lstlisting}
219 | \vspacebig
220 | 
221 | 
222 | \begin{tabular}{|p{25mm}|p{100mm}|}
223 | \hline
224 | \bfseries Method & member function get\_x() \\ \hline
\bfseries Defined for & all 3-D vector classes \\ \hline
226 | \bfseries Description & Get the x-coordinate \\ \hline
227 | \bfseries Efficiency & good \\ \hline
228 | \end{tabular}
229 | \vspacesmall
230 | 
231 | \begin{lstlisting}[frame=none]
232 | // Example:
233 | Vec3Dd a(1,2,3);
234 | double b = a.get_x();  // b = 1
235 | \end{lstlisting}
236 | \vspacebig
237 | 
238 | \begin{tabular}{|p{25mm}|p{100mm}|}
239 | \hline
240 | \bfseries Method & member function get\_y() \\ \hline
\bfseries Defined for & all 3-D vector classes \\ \hline
242 | \bfseries Description & Get the y-coordinate \\ \hline
243 | \bfseries Efficiency & good \\ \hline
244 | \end{tabular}
245 | \vspacesmall
246 | 
247 | \begin{lstlisting}[frame=none]
248 | // Example:
249 | Vec3Dd a(1,2,3);
250 | double b = a.get_y();  // b = 2
251 | \end{lstlisting}
252 | \vspacebig
253 | 
254 | \begin{tabular}{|p{25mm}|p{100mm}|}
255 | \hline
256 | \bfseries Method & member function get\_z() \\ \hline
\bfseries Defined for & all 3-D vector classes \\ \hline
258 | \bfseries Description & Get the z-coordinate \\ \hline
259 | \bfseries Efficiency & good \\ \hline
260 | \end{tabular}
261 | \vspacesmall
262 | 
263 | \begin{lstlisting}[frame=none]
264 | // Example:
265 | Vec3Dd a(1,2,3);
266 | double b = a.get_z();  // b = 3
267 | \end{lstlisting}
268 | \vspacebig
269 | 
270 | \begin{tabular}{|p{25mm}|p{100mm}|}
271 | \hline
272 | \bfseries Method & member function extract(index) \\ \hline
\bfseries Defined for & all 3-D vector classes \\ \hline
274 | \bfseries Description & index = 0, 1, 2 give the x, y, or z-coordinate, respectively \\ \hline
275 | \bfseries Efficiency & good \\ \hline
276 | \end{tabular}
277 | \vspacesmall
278 | 
279 | \begin{lstlisting}[frame=none]
280 | // Example:
281 | Vec3Dd a(1,2,3);
282 | double b = a.extract(2);  // b = 3
double c = a[2];          // c = 3 (the same)
284 | \end{lstlisting}
285 | \vspacebig
286 | 
287 | \begin{tabular}{|p{25mm}|p{100mm}|}
288 | \hline
289 | \bfseries Method & member function insert(index, value) \\ \hline
\bfseries Defined for & all 3-D vector classes \\ \hline
291 | \bfseries Description & index = 0, 1, 2 changes the x, y, or z-coordinate, respectively \\ \hline
292 | \bfseries Efficiency & good \\ \hline
293 | \end{tabular}
294 | \vspacesmall
295 | 
296 | \begin{lstlisting}[frame=none]
297 | // Example:
298 | Vec3Dd a(1,2,3);
299 | a.insert(0, 8);  // a = (8, 2, 3)
300 | \end{lstlisting}
301 | \vspacebig
302 | 
303 | 
304 | \chapter{Operators}\label{chap:Operators}
305 | 
306 | \begin{tabular}{|p{25mm}|p{100mm}|}
307 | \hline
308 | \bfseries Operator & + \\ \hline
\bfseries Defined for & all 3-D vector classes  \\ \hline
310 | \bfseries Description & Add two vectors \\ \hline
311 | \bfseries Efficiency & good \\ \hline
312 | \end{tabular}
313 | \vspacesmall
314 | 
315 | \begin{lstlisting}[frame=none]
316 | // Example:
317 | Vec3Dd a(1,2,3);
318 | Vec3Dd b(5,6,7);
319 | Vec3Dd c = a + b;    // c = (6,8,10)
320 | \end{lstlisting}
321 | \vspacebig
322 | 
323 | 
324 | \begin{tabular}{|p{25mm}|p{100mm}|}
325 | \hline
326 | \bfseries Operator & - \\ \hline
\bfseries Defined for & all 3-D vector classes  \\ \hline
328 | \bfseries Description & Subtract two vectors \\ \hline
329 | \bfseries Efficiency & good \\ \hline
330 | \end{tabular}
331 | \vspacesmall
332 | 
333 | \begin{lstlisting}[frame=none]
334 | // Example:
335 | Vec3Dd a(11,10,9);
336 | Vec3Dd b(5,6,7);
337 | Vec3Dd c = a - b;    // c = (6,4,2)
338 | Vec3Dd d = - b;      // d = (-5,-6,-7)
339 | \end{lstlisting}
340 | \vspacebig
341 | 
342 | 
343 | \begin{tabular}{|p{25mm}|p{100mm}|}
344 | \hline
345 | \bfseries Operator & * \\ \hline
\bfseries Defined for & all 3-D vector classes  \\ \hline
347 | \bfseries Description & Multiply two vectors element by element, or one vector and one scalar of the same precision \\ \hline
348 | \bfseries Efficiency & good \\ \hline
349 | \end{tabular}
350 | \vspacesmall
351 | 
352 | \begin{lstlisting}[frame=none]
353 | // Example:
354 | Vec3Dd a(1,2,3);
355 | Vec3Dd b(4,5,6);
356 | Vec3Dd c = a * b;    // c = (4,10,18)
357 | Vec3Dd d = a * 10.0; // d = (10,20,30)
358 | \end{lstlisting}
359 | \vspacebig
360 | 
361 | 
362 | \begin{tabular}{|p{25mm}|p{100mm}|}
363 | \hline
364 | \bfseries Operator & / \\ \hline
\bfseries Defined for & all 3-D vector classes  \\ \hline
366 | \bfseries Description & Divide a vector by a scalar of the same precision \\ \hline
367 | \bfseries Efficiency & good \\ \hline
368 | \end{tabular}
369 | \vspacesmall
370 | 
371 | \begin{lstlisting}[frame=none]
372 | // Example:
373 | Vec3Dd a(10,20,30);
374 | Vec3Dd b = a / 5.0; // b = (2,4,6)
375 | \end{lstlisting}
376 | \vspacebig
377 | 
378 | 
379 | \begin{tabular}{|p{25mm}|p{100mm}|}
380 | \hline
381 | \bfseries Operator & == \\ \hline
\bfseries Defined for & all 3-D vector classes  \\ \hline
383 | \bfseries Description & Compare for equality.\newline
384 | The result is a boolean scalar. \\ \hline
385 | \bfseries Efficiency & good \\ \hline
386 | \end{tabular}
387 | \vspacesmall
388 | 
389 | \begin{lstlisting}[frame=none]
390 | // Example:
391 | Vec3Dd a(1, 2,3);
392 | Vec3Dd b(1,-2,3);
393 | bool   c = (a == b);  // c = false
394 | \end{lstlisting}
395 | \vspacebig
396 | 
397 | 
398 | \begin{tabular}{|p{25mm}|p{100mm}|}
399 | \hline
400 | \bfseries Operator & != \\ \hline
\bfseries Defined for & all 3-D vector classes  \\ \hline
402 | \bfseries Description & Compare for not equal.\newline
403 | The result is a boolean scalar. \\ \hline
404 | \bfseries Efficiency & good \\ \hline
405 | \end{tabular}
406 | \vspacesmall
407 | 
408 | \begin{lstlisting}[frame=none]
409 | // Example:
410 | Vec3Dd a(1, 2,3);
411 | Vec3Dd b(1,-2,3);
412 | bool   c = (a != b);  // c = true
413 | \end{lstlisting}
414 | \vspacebig
415 | 
416 | 
417 | \chapter{Mathematical functions}\label{chap:MathematicalFunctions}
418 | 
419 | 
420 | \begin{tabular}{|p{25mm}|p{100mm}|}
421 | \hline
422 | \bfseries Function & cross\_product \\ \hline
\bfseries Defined for & all 3-D vector classes  \\ \hline
424 | \bfseries Description & Gives the X-product of two vectors \\ \hline
425 | \bfseries Efficiency & medium \\ \hline
426 | \bfseries Accuracy & Calculation of the X-product involves the calculation of sums of products. Loss of precision may occur if the result is close to zero. \\ \hline
427 | \end{tabular}
428 | \vspacesmall
429 | 
430 | \begin{lstlisting}[frame=none]
431 | // Example:
432 | Vec3Dd a(1,2,3);
433 | Vec3Dd b(4,5,6);
434 | Vec3Dd c = cross_product(a,b); // c = (-3,6,-3)
435 | Vec3Dd d = cross_product(b,a); // d = (3,-6,3)
436 | \end{lstlisting}
437 | \vspacebig
438 | 
439 | 
440 | \begin{tabular}{|p{25mm}|p{100mm}|}
441 | \hline
442 | \bfseries Function & dot\_product \\ \hline
\bfseries Defined for & all 3-D vector classes  \\ \hline
444 | \bfseries Description & Gives the dot-product of two vectors. The result is a scalar \\ \hline
445 | \bfseries Efficiency & medium \\ \hline
446 | \end{tabular}
447 | \vspacesmall
448 | 
449 | \begin{lstlisting}[frame=none]
450 | // Example:
451 | Vec3Dd a(1,2,3);
452 | Vec3Dd b(4,5,6);
453 | double c = dot_product(a,b); // c = 32
454 | \end{lstlisting}
455 | \vspacebig
456 | 
457 | 
458 | \begin{tabular}{|p{25mm}|p{100mm}|}
459 | \hline
460 | \bfseries Function & vector\_length \\ \hline
\bfseries Defined for & all 3-D vector classes  \\ \hline
\bfseries Description & Gives the length of the vector (Euclidean norm) \\ \hline
463 | \bfseries Efficiency & medium \\ \hline
464 | \end{tabular}
465 | \vspacesmall
466 | 
467 | \begin{lstlisting}[frame=none]
468 | // Example:
469 | Vec3Dd a(3,0,4);
470 | double b = vector_length(a); // b = 5
471 | \end{lstlisting}
472 | \vspacebig
473 | 
474 | 
475 | \begin{tabular}{|p{25mm}|p{100mm}|}
476 | \hline
477 | \bfseries Function & normalize\_vector \\ \hline
\bfseries Defined for & all 3-D vector classes  \\ \hline
479 | \bfseries Description & Divides the vector by its length to give a vector with the same direction and length one. \\ \hline
480 | \bfseries Efficiency & medium \\ \hline
481 | \end{tabular}
482 | \vspacesmall
483 | 
484 | \begin{lstlisting}[frame=none]
485 | // Example:
486 | Vec3Dd a(3,0,4);
487 | Vec3Dd b = normalize_vector(a); // b = (0.6, 0.0, 0.8)
488 | \end{lstlisting}
489 | \vspacebig
490 | 
491 | 
492 | \begin{tabular}{|p{25mm}|p{100mm}|}
493 | \hline
494 | \bfseries Function & rotate \\ \hline
\bfseries Defined for & all 3-D vector classes  \\ \hline
496 | \bfseries Description & Rotates a vector by multiplying a 3x3 rotation matrix by the column vector. The first three parameters define the columns of the rotation matrix. The last parameter is the vector to rotate. \\ \hline
497 | \bfseries Efficiency & medium \\ \hline
498 | \bfseries Accuracy & Calculation of the rotated vector involves the calculation of sums of products. Loss of precision may occur if the result is close to zero. \\ \hline
499 | \end{tabular}
500 | \vspacesmall
501 | 
502 | \begin{lstlisting}[frame=none]
503 | // Example:
504 | Vec3Dd a(1,2,3);    // vector to rotate
505 | Vec3Dd c0(1,0,0);   // first column of matrix
506 | Vec3Dd c1(0,0,-1);  // second column of matrix
507 | Vec3Dd c2(0,1,0);   // third column of matrix
508 | Vec3Dd d = rotate(c0,c1,c2,a); // d = (1,3,-2)
509 | \end{lstlisting}
510 | \vspacebig
511 | 
512 | 
513 | \chapter{Other functions}\label{chap:OtherFunctions}
514 | 
515 | \begin{tabular}{|p{25mm}|p{100mm}|}
516 | \hline
517 | \bfseries Function & to\_vector \\ \hline
\bfseries Defined for & all 3-D vector classes \\ \hline
519 | \bfseries Description & Convert to a vector of class Vec4f or Vec4d. \\ \hline
520 | \bfseries Efficiency & good \\ \hline
521 | \end{tabular}
522 | \vspacesmall
523 | 
524 | \begin{lstlisting}[frame=none]
525 | // Example:
526 | Vec3Df a(1,2,3);
527 | Vec4f  b = a.to_vector(); // b = (1,2,3,0)
528 | \end{lstlisting}
529 | \vspacebig
530 | 
531 | 
532 | \begin{tabular}{|p{25mm}|p{100mm}|}
533 | \hline
534 | \bfseries Function & select \\ \hline
\bfseries Defined for & all 3-D vector classes  \\ \hline
536 | \bfseries Description & Choose between two vectors. \\ \hline
537 | \bfseries Efficiency & good \\ \hline
538 | \end{tabular}
539 | \vspacesmall
540 | 
541 | \begin{lstlisting}[frame=none]
542 | // Example:
543 | Vec3Df a(1,2,3);
544 | Vec3Df b(4,5,6);
545 | Vec3Df c = select(true,a,b);  // c = (1,2,3)
546 | Vec3Df d = select(false,a,b); // d = (4,5,6)
547 | \end{lstlisting}
548 | \vspacebig
549 | 
550 | 
551 | \begin{tabular}{|p{25mm}|p{100mm}|}
552 | \hline
553 | \bfseries Function & to\_float \\ \hline
554 | \bfseries Defined for & Vec3Dd  \\ \hline
555 | \bfseries Description & Convert to lower precision. The result is a Vec3Df \\ \hline
556 | \bfseries Efficiency & good \\ \hline
557 | \end{tabular}
558 | \vspacesmall
559 | 
560 | \begin{lstlisting}[frame=none]
561 | // Example:
562 | Vec3Dd a(1,2,3);
563 | Vec3Df b = to_float(a);// b = (1,2,3)
564 | \end{lstlisting}
565 | \vspacebig
566 | 
567 | 
568 | \begin{tabular}{|p{25mm}|p{100mm}|}
569 | \hline
570 | \bfseries Function & to\_double \\ \hline
571 | \bfseries Defined for & Vec3Df  \\ \hline
572 | \bfseries Description & Convert to higher precision. The result is a Vec3Dd \\ \hline
573 | \bfseries Efficiency & good \\ \hline
574 | \end{tabular}
575 | \vspacesmall
576 | 
577 | \begin{lstlisting}[frame=none]
578 | // Example:
579 | Vec3Df a(1,2,3);
580 | Vec3Dd b = to_double(a);// b = (1,2,3)
581 | \end{lstlisting}
582 | \vspacebig
583 | 
584 | 
585 | \end{document}
586 | 


--------------------------------------------------------------------------------
/quaternion/quaternion.h:
--------------------------------------------------------------------------------
  1 | /***************************  quaternion.h   *********************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2012-08-01
  4 | * Last modified: 2019-07-13
  5 | * Version:       2.00
  6 | * Project:       Extension to vector class library
  7 | * Description:
  8 | * Quaternions are used in theoretical algebra
  9 | * Classes for quaternions:
 10 | * Quaternion1f:  One quaternion consisting of four single precision floats
 11 | * Quaternion1d:  One quaternion consisting of four double precision floats
 12 | *
 13 | * (c) Copyright 2012-2019 Apache License version 2.0 or later
 14 | ******************************************************************************/
 15 | 
 16 | 
 17 | #ifndef QUATERNION_H
 18 | #define QUATERNION_H  200
 19 | 
 20 | #include "vectorclass.h"
 21 | #include <cmath>
 22 | 
 23 | #ifdef VCL_NAMESPACE
 24 | namespace VCL_NAMESPACE {
 25 | #endif
 26 | 
 27 | /*****************************************************************************
 28 | *
 29 | *                     Class Quaternion1f
 30 | *         One quaternion consisting of four single precision floats
 31 | *
 32 | *****************************************************************************/
 33 | 
 34 | class Quaternion1f {
 35 | protected:
 36 |     __m128 xmm; // vector of 4 single precision floats
 37 | public:
 38 |     // default constructor
 39 |     Quaternion1f() {
 40 |     }
 41 |     // construct from real, no imaginary part
 42 |     Quaternion1f(float re) {
 43 |         xmm = _mm_load_ss(&re);
 44 |     }
 45 |     // construct from real and imaginary parts = re + im0*i + im1*j + im2*k
 46 |     Quaternion1f(float re, float im0, float im1, float im2) {
 47 |         xmm = Vec4f(re, im0, im1, im2);
 48 |     }
 49 |     // Constructor to convert from type __m128 used in intrinsics:
 50 |     Quaternion1f(__m128 const x) {
 51 |         xmm = x;
 52 |     }
 53 |     // Assignment operator to convert from type __m128 used in intrinsics:
 54 |     Quaternion1f & operator = (__m128 const x) {
 55 |         xmm = x;
 56 |         return *this;
 57 |     }
 58 |     // Constructor to convert from Vec4f
 59 |     Quaternion1f(Vec4f const x) {
 60 |         xmm = x;
 61 |     }
 62 |     // Type cast operator to convert to __m128 used in intrinsics
 63 |     operator __m128() const {
 64 |         return xmm;
 65 |     }
 66 |     // Member function to convert to vector
 67 |     Vec4f to_vector() const {
 68 |         return xmm;
 69 |     }
 70 |     // Member function to load from array
 71 |     Quaternion1f & load(float const * p) {
 72 |         xmm = Vec4f().load(p);
 73 |         return *this;
 74 |     }
 75 |     // Member function to store into array
 76 |     void store(float * p) const {
 77 |         Vec4f(xmm).store(p);
 78 |     }
 79 |     // Member function to extract real part
 80 |     float real() const {
 81 |         return _mm_cvtss_f32(xmm);
 82 |     }
 83 |     // Member function to extract imaginary parts, sets real part to 0
 84 |     Quaternion1f imag() const {
 85 |         return Quaternion1f(permute4<-1,1,2,3>(Vec4f(xmm)));
 86 |     }
 87 | #ifdef COMPLEXVEC_H  // relations to complexvec1.h
 88 |     // construct from two Complex1f = a0 + a1 * j
 89 |     Quaternion1f(Complex1f const a0, Complex1f const a1) {
 90 |         xmm = _mm_movelh_ps(a0, a1);
 91 |     }
 92 |     // Member functions to split into two Complex1f:
 93 |     // q = q.get_low() + q.get_high()*j
 94 |     Complex1f get_low() const {
 95 |         return Complex1f(Vec4f(xmm).cutoff(2));
 96 |     }
 97 |     Complex1f get_high() const {
 98 |         __m128 t = _mm_movehl_ps(_mm_setzero_ps(), xmm);
 99 |         return Complex1f(t);
100 |     } 
101 | #endif
102 | #ifdef VECTOR3D_H   // relations to vector3d.h
103 |     // Constructor to convert from Vec3f used in geometrics:
104 |     Quaternion1f(Vec3f const x) {
105 |         xmm = permute4<3,0,1,2>(Vec4f(x));  // rotate elements
106 |     }
107 | 
108 |     // Type cast operator to convert to Vec3f used in geometrics:
109 |     operator Vec3f() const {
110 |         return Vec3f(permute4<1,2,3,0>(Vec4f(xmm)));  // rotate elements
111 |     }
112 | #endif // VECTOR3D_H 
113 | };
114 | 
115 | 
116 | /*****************************************************************************
117 | *
118 | *          Operators for Quaternion1f
119 | *
120 | *****************************************************************************/
121 | 
122 | // operator + : add
123 | static inline Quaternion1f operator + (Quaternion1f const a, Quaternion1f const b) {
124 |     return Quaternion1f(a.to_vector() + b.to_vector());
125 | }
126 | 
127 | // operator += : add
128 | static inline Quaternion1f & operator += (Quaternion1f & a, Quaternion1f const b) {
129 |     a = a + b;
130 |     return a;
131 | }
132 | 
133 | // operator - : subtract
134 | static inline Quaternion1f operator - (Quaternion1f const a, Quaternion1f const b) {
135 |     return Quaternion1f(a.to_vector() - b.to_vector());
136 | }
137 | 
138 | // operator - : unary minus
139 | static inline Quaternion1f operator - (Quaternion1f const a) {
140 |     return Quaternion1f(- a.to_vector());
141 | }
142 | 
143 | // operator -= : subtract
144 | static inline Quaternion1f & operator -= (Quaternion1f & a, Quaternion1f const b) {
145 |     a = a - b;
146 |     return a;
147 | }
148 | 
// operator * : quaternion multiply (not commutative: a * b differs from b * a in general)
// With a = a0 + a1*i + a2*j + a3*k and b = b0 + b1*i + b2*j + b3*k
// (element 0 is the real part) the result r = a * b is
//   r0 = a0*b0 - a1*b1 - a2*b2 - a3*b3
//   r1 = a0*b1 + a1*b0 + a2*b3 - a3*b2
//   r2 = a0*b2 + a2*b0 + a3*b1 - a1*b3
//   r3 = a0*b3 + a3*b0 + a1*b2 - a2*b1
// The digits in the name of each shuffled temporary list which source element
// lands in result elements 0,1,2,3, e.g. a1123 = (a1, a1, a2, a3).
static inline Quaternion1f operator * (Quaternion1f const a, Quaternion1f const b) {
    __m128 a1123 = _mm_shuffle_ps(a,a,0xE5);              // (a1, a1, a2, a3)
    __m128 a2231 = _mm_shuffle_ps(a,a,0x7A);              // (a2, a2, a3, a1)
    __m128 b1000 = _mm_shuffle_ps(b,b,0x01);              // (b1, b0, b0, b0)
    __m128 b2312 = _mm_shuffle_ps(b,b,0x9E);              // (b2, b3, b1, b2)
    __m128 t1    = _mm_mul_ps(a1123, b1000);              // (a1*b1, a1*b0, a2*b0, a3*b0)
    __m128 t2    = _mm_mul_ps(a2231, b2312);              // (a2*b2, a2*b3, a3*b1, a1*b2)
    __m128 t12   = _mm_add_ps(t1, t2);
    __m128 t12m  = change_sign<1,0,0,0>(Vec4f(t12));      // negate element 0 only: -(a1*b1 + a2*b2)
    __m128 a3312 = _mm_shuffle_ps(a,a,0x9F);              // (a3, a3, a1, a2)
    __m128 b3231 = _mm_shuffle_ps(b,b,0x7B);              // (b3, b2, b3, b1)
    __m128 a0000 = _mm_shuffle_ps(a,a,0x00);              // a0 in every element
    __m128 t3    = _mm_mul_ps(a3312, b3231);              // (a3*b3, a3*b2, a1*b3, a2*b1): the subtracted terms
    __m128 t0    = _mm_mul_ps(a0000, b);                  // (a0*b0, a0*b1, a0*b2, a0*b3)
    __m128 t03   = _mm_sub_ps(t0, t3);
    return         _mm_add_ps(t03, t12m);
}
167 | 
168 | // operator *= : multiply
169 | static inline Quaternion1f & operator *= (Quaternion1f & a, Quaternion1f const b) {
170 |     a = a * b;
171 |     return a;
172 | }
173 | 
174 | // operator ~ : complex conjugate
175 | // ~(a + b*i + c*j + d*k) = (a - b*i - c*j - d*k)
176 | static inline Quaternion1f operator ~ (Quaternion1f const a) {
177 |     return Quaternion1f(change_sign<0,1,1,1>(a.to_vector()));
178 | }
179 | 
180 | // function reciprocal: multiplicative inverse
181 | static inline Quaternion1f reciprocal (Quaternion1f const a) {
182 |     Vec4f sq  = _mm_mul_ps(a,a);
183 |     float nsq = horizontal_add(sq);
184 |     return Quaternion1f((~a).to_vector() / Vec4f(nsq));
185 | }
186 | 
187 | // operator / : quaternion divide is defined as
188 | // a / b = a * reciprocal(b)
189 | static inline Quaternion1f operator / (Quaternion1f const a, Quaternion1f const b) {
190 |     return a * reciprocal(b);
191 | }
192 | 
193 | // operator /= : divide
194 | static inline Quaternion1f & operator /= (Quaternion1f & a, Quaternion1f const b) {
195 |     a = a / b;
196 |     return a;
197 | }
198 | 
199 | // operator == : returns true if a == b
200 | static inline bool operator == (Quaternion1f const a, Quaternion1f const b) {
201 |     Vec4fb t1 = a.to_vector() == b.to_vector();
202 |     return horizontal_and(t1);
203 | }
204 | 
205 | // operator != : returns true if a != b
206 | static inline bool operator != (Quaternion1f const a, Quaternion1f const b) {
207 |     Vec4fb t1 = a.to_vector() != b.to_vector();
208 |     return horizontal_or(t1);
209 | }
210 | 
211 | 
212 | /*****************************************************************************
213 | *
214 | *          Operators mixing Quaternion1f and float
215 | *
216 | *****************************************************************************/
217 | 
218 | // operator + : add
219 | static inline Quaternion1f operator + (Quaternion1f const a, float b) {
220 |     return _mm_add_ss(a, _mm_set_ss(b));
221 | }
222 | 
223 | static inline Quaternion1f operator + (float a, Quaternion1f const b) {
224 |     return b + a;
225 | }
226 | 
227 | static inline Quaternion1f & operator += (Quaternion1f & a, float & b) {
228 |     a = a + b;
229 |     return a;
230 | }
231 | 
232 | // operator - : subtract
233 | static inline Quaternion1f operator - (Quaternion1f const a, float b) {
234 |     return _mm_sub_ss(a, _mm_set_ss(b));
235 | }
236 | 
237 | static inline Quaternion1f operator - (float a, Quaternion1f const b) {  // real number minus quaternion
238 |     return Quaternion1f(_mm_sub_ps(_mm_set_ss(a), b));  // (a, 0, 0, 0) - b in all four elements
239 | }
240 | 
241 | static inline Quaternion1f & operator -= (Quaternion1f & a, float b) {  // a -= real number; returns a
242 |     // b is passed by value, as in operator /= below, so literals, temporaries and const floats bind to it
243 |     return a = a - b;
244 | }
245 | 
246 | // operator * : multiply every component by the real number b
247 | static inline Quaternion1f operator * (Quaternion1f const a, float b) {
248 |     return Quaternion1f(_mm_mul_ps(a, _mm_set1_ps(b)));  // b is broadcast to all four elements
249 | }
250 | 
251 | static inline Quaternion1f operator * (float a, Quaternion1f const b) {  // real number times quaternion
252 |     return Quaternion1f(_mm_mul_ps(b, _mm_set1_ps(a)));  // same scaling as quaternion * real number
253 | }
254 | 
255 | static inline Quaternion1f & operator *= (Quaternion1f & a, float b) {  // a *= real number; returns a
256 |     // b is passed by value, as in operator /= below, so literals, temporaries and const floats bind to it
257 |     return a = a * b;
258 | }
259 | 
260 | // operator / : divide every component by the real number b
261 | static inline Quaternion1f operator / (Quaternion1f const a, float b) {
262 |     return Quaternion1f(_mm_div_ps(a, _mm_set1_ps(b)));  // b is broadcast to all four elements
263 | }
264 | 
265 | static inline Quaternion1f operator / (float a, Quaternion1f const b) {  // real number divided by quaternion
266 |     return Quaternion1f(_mm_mul_ps(reciprocal(b), _mm_set1_ps(a)));  // reciprocal(b) scaled by a
267 | }
268 | 
269 | static inline Quaternion1f & operator /= (Quaternion1f & a, float b) {
270 |     // divide a by the real number b in place and return a reference to a
271 |     return a = a / b;
272 | }
273 | 
274 | 
275 | /*****************************************************************************
276 | *
277 | *          Functions for Quaternion1f
278 | *
279 | *****************************************************************************/
280 | 
281 | // function abs: Euclidean norm of the quaternion
282 | // abs(a + b*i + c*j + d*k) = sqrt(a*a + b*b + c*c + d*d)
283 | static inline float abs(Quaternion1f const a) {
284 |     Vec4f const squares = _mm_mul_ps(a, a);                // square each component
285 |     float const norm_sq = horizontal_add(squares);         // sum of the four squares
286 |     return std::sqrt(norm_sq);
287 | }
288 | 
289 | // function select: returns a if s is true, b otherwise
290 | static inline Quaternion1f select (bool s, Quaternion1f const a, Quaternion1f const b) {
291 |     if (s) return a; else return b;
292 | }
293 | 
294 | 
295 | 
296 | /*****************************************************************************
297 | *
298 | *                     Class Quaternion1d
299 | *         One quaternion consisting of four double precision floats
300 | *
301 | *****************************************************************************/
302 | 
303 | class Quaternion1d {
304 | protected:
305 |     Vec4d y; // the four components, element 0 = real part, elements 1..3 = i, j, k parts
306 | public:
307 |     // default constructor: does not assign a value to y
308 |     Quaternion1d() {
309 |     }
310 |     // construct from all four parts: re + im0*i + im1*j + im2*k
311 |     Quaternion1d(double re, double im0, double im1, double im2)
312 |     : y(re, im0, im1, im2) {
313 |     }
314 |     // construct from a real number; the three imaginary parts are zero
315 |     Quaternion1d(double re)
316 |     : y(re, 0., 0., 0.) {
317 |     }
318 |     // Converting constructor from the raw vector type: __m256d with AVX, Vec256de when emulated
319 | #if INSTRSET >= 7  // AVX
320 |     Quaternion1d(__m256d const x) {
321 | #else
322 |     Quaternion1d(Vec256de const x) {
323 | #endif
324 |         y = x;
325 |     }
326 |     // Assignment from the raw vector type: __m256d with AVX, Vec256de when emulated
327 | #if INSTRSET >= 7  // AVX
328 |     Quaternion1d & operator = (__m256d const x) {
329 | #else
330 |     Quaternion1d & operator = (Vec256de const x) {
331 | #endif
332 |         y = x;
333 |         return *this;
334 |     }
335 |     // Converting constructor from Vec4d; the elements are taken over unchanged
336 |     Quaternion1d(Vec4d const x)
337 |     : y(x) {
338 |     }
339 |     // Type cast operator to the raw vector type: __m256d with AVX, Vec256de when emulated
340 | #if INSTRSET >= 7  // AVX
341 |     operator __m256d() const {
342 | #else
343 |     operator Vec256de() const {
344 | #endif
345 |         return y;
346 |     }
347 |     // Member function returning the four components as a Vec4d
348 |     Vec4d to_vector() const {
349 |         return y;
350 |     }
351 |     // Member function to load the components from the array p; returns *this
352 |     Quaternion1d & load(double const * p) {
353 |         y.load(p);
354 |         return *this;
355 |     }
356 |     // Member function to store the components into the array p
357 |     void store(double * p) const {
358 |         y.store(p);
359 |     }
360 | #ifdef COMPLEXVEC_H  // relations to complexvec1.h
361 |     // construct from two Complex1d = a0 + a1 * j (a0 fills the lower half of y, a1 the upper half)
362 |     Quaternion1d(Complex1d const a0, Complex1d const a1)
363 |     : y(Vec2d(a0), Vec2d(a1)) {
364 |     }
365 |     // Member functions to split the quaternion into two Complex1d halves:
366 |     // q = q.get_low() + q.get_high()*j
367 |     Complex1d get_low() const {
368 |         return Complex1d(y.get_low());
369 |     }
370 |     Complex1d get_high() const {
371 |         return Complex1d(y.get_high());
372 |     }
373 | #endif
374 |     // Member function to extract the real part (element 0)
375 |     double real() const {
376 |         return y.extract(0);
377 |     }
378 |     // Member function to extract the imaginary parts; the real part of the result is 0
379 |     Quaternion1d imag() const {
380 |         return Quaternion1d(permute4<-1,1,2,3>(y));  // index -1 puts zero into element 0
381 |     }
382 | #ifdef VECTOR3D_H
383 |     // Constructor to convert from Vec3d used in geometrics:
384 |     Quaternion1d(Vec3d const x)
385 |     : y(permute4<3,0,1,2>(Vec4d(x))) {  // rotate: element 3 becomes the real part, 0..2 become i, j, k
386 |     }
387 |     // Type cast operator to convert to Vec3d used in geometrics:
388 |     operator Vec3d() const {
389 |         return Vec3d(permute4<1,2,3,0>(y));  // rotate back: i, j, k move to 0..2, the real part to 3
390 |     }
391 | #endif // VECTOR3D_H
392 | };
393 | 
394 | 
395 | /*****************************************************************************
396 | *
397 | *          Operators for Quaternion1d
398 | *
399 | *****************************************************************************/
400 | 
401 | // operator + : add two quaternions component by component
402 | static inline Quaternion1d operator + (Quaternion1d const a, Quaternion1d const b) {
403 |     return a.to_vector() + b.to_vector();  // the Vec4d sum converts to Quaternion1d
404 | }
405 | 
406 | // operator += : add the quaternion b to a in place
407 | static inline Quaternion1d & operator += (Quaternion1d & a, Quaternion1d const b) {
408 |     // store the sum in a and return a reference to a
409 |     return a = a + b;
410 | }
411 | 
412 | // operator - : subtract two quaternions component by component
413 | static inline Quaternion1d operator - (Quaternion1d const a, Quaternion1d const b) {
414 |     return a.to_vector() - b.to_vector();  // the Vec4d difference converts to Quaternion1d
415 | }
416 | 
417 | // operator - : unary minus, negates all four components
418 | static inline Quaternion1d operator - (Quaternion1d const a) {
419 |     return -(a.to_vector());  // the negated Vec4d converts to Quaternion1d
420 | }
421 | 
422 | // operator -= : subtract the quaternion b from a in place
423 | static inline Quaternion1d & operator -= (Quaternion1d & a, Quaternion1d const b) {
424 |     // store the difference in a and return a reference to a
425 |     return a = a - b;
426 | }
427 | 
428 | // operator * : quaternion multiply (Hamilton product; the order of the operands matters)
429 | static inline Quaternion1d operator * (Quaternion1d const a, Quaternion1d const b) {
430 |     // re: a0*b0 - a1*b1 - a2*b2 - a3*b3      i: a0*b1 + a1*b0 + a2*b3 - a3*b2
431 |     // j:  a0*b2 + a2*b0 + a3*b1 - a1*b3      k: a0*b3 + a3*b0 + a1*b2 - a2*b1
432 |     Vec4d const av = a.to_vector();                                    // (a0, a1, a2, a3)
433 |     Vec4d const bv = b.to_vector();                                    // (b0, b1, b2, b3)
434 |     Vec4d const prod0 = permute4<0,0,0,0>(av) * bv;                    // a0*b0, a0*b1, a0*b2, a0*b3
435 |     Vec4d const prod1 = permute4<1,1,2,3>(av) * permute4<1,0,0,0>(bv); // a1*b1, a1*b0, a2*b0, a3*b0
436 |     Vec4d const prod2 = permute4<2,2,3,1>(av) * permute4<2,3,1,2>(bv); // a2*b2, a2*b3, a3*b1, a1*b2
437 |     Vec4d const prod3 = permute4<3,3,1,2>(av) * permute4<3,2,3,1>(bv); // a3*b3, a3*b2, a1*b3, a2*b1
438 |     // prod1 + prod2 is added in the imaginary elements but subtracted in the real element
439 |     Vec4d const added     = prod1 + prod2;
440 |     Vec4d const added_sgn = change_sign<1,0,0,0>(added);               // flip the sign of element 0 only
441 |     // prod3 is subtracted in all four elements
442 |     Vec4d const remainder = prod0 - prod3;
443 |     Vec4d const product   = remainder + added_sgn;
444 |     return Quaternion1d(product);
445 | }
446 | 
447 | // operator *= : multiply a by the quaternion b (a = a * b, b is the right-hand factor)
448 | static inline Quaternion1d & operator *= (Quaternion1d & a, Quaternion1d const b) {
449 |     // store the product in a and return a reference to a
450 |     return a = a * b;
451 | }
452 | 
453 | // operator ~ : quaternion conjugate, keeps the real part and negates the i, j and k parts
454 | // ~(a + b*i + c*j + d*k) = (a - b*i - c*j - d*k)
455 | static inline Quaternion1d operator ~ (Quaternion1d const a) {
456 |     return change_sign<0,1,1,1>(a.to_vector());  // the Vec4d result converts to Quaternion1d
457 | }
458 | 
459 | // function reciprocal: multiplicative inverse = conjugate of a divided by the squared norm of a
460 | static inline Quaternion1d reciprocal (Quaternion1d const a) {
461 |     double const norm_sq = horizontal_add(a.to_vector() * a.to_vector());  // sum of the four squares
462 |     Vec4d const conjugate = (~a).to_vector();
463 |     return Quaternion1d(conjugate / Vec4d(norm_sq));
464 | }
465 | 
466 | // operator / : quaternion divide, a / b = a * reciprocal(b); the inverse of b is the right-hand factor
467 | static inline Quaternion1d operator / (Quaternion1d const a, Quaternion1d const b) {
468 |     Quaternion1d const b_inverse = reciprocal(b);
469 |     return a * b_inverse;
470 | }
471 | 
472 | // operator /= : divide a by the quaternion b in place and return a reference to a
473 | static inline Quaternion1d & operator /= (Quaternion1d & a, Quaternion1d const b) {
474 |     Quaternion1d const quotient = a / b;
475 |     return a = quotient;
476 | }
477 | 
478 | // operator == : true only when all four components of a and b compare equal
479 | static inline bool operator == (Quaternion1d const a, Quaternion1d const b) {
480 |     // compare component-wise, then require every element of the mask to be true
481 |     return horizontal_and(a.to_vector() == b.to_vector());
482 | }
483 | 
484 | // operator != : true when at least one of the four components differs
485 | static inline bool operator != (Quaternion1d const a, Quaternion1d const b) {
486 |     // compare component-wise, then test whether any element of the mask is true
487 |     return horizontal_or(a.to_vector() != b.to_vector());
488 | }
489 | 
490 | 
491 | /*****************************************************************************
492 | *
493 | *          Operators mixing Quaternion1d and double
494 | *
495 | *****************************************************************************/
496 | 
497 | // operator + : add the real number b, treated as the quaternion (b, 0, 0, 0)
498 | static inline Quaternion1d operator + (Quaternion1d const a, double b) {
499 |     return Quaternion1d(a.to_vector() + Vec4d(b, 0., 0., 0.));
500 | }
501 | 
502 | static inline Quaternion1d operator + (double a, Quaternion1d const b) {  // real number plus quaternion
503 |     return Quaternion1d(b.to_vector() + Vec4d(a, 0., 0., 0.));  // same sum as quaternion + real number
504 | }
505 | 
506 | static inline Quaternion1d & operator += (Quaternion1d & a, double b) {  // a += real number; returns a
507 |     // b is passed by value, as in operator /= below, so literals, temporaries and const doubles bind to it
508 |     return a = a + b;
509 | }
510 | 
511 | // operator - : subtract the real number b, treated as the quaternion (b, 0, 0, 0)
512 | static inline Quaternion1d operator - (Quaternion1d const a, double b) {
513 |     return Quaternion1d(a.to_vector() - Vec4d(b, 0., 0., 0.));
514 | }
515 | 
516 | static inline Quaternion1d operator - (double a, Quaternion1d const b) {  // real number minus quaternion
517 |     return Quaternion1d(Vec4d(a, 0., 0., 0.) - b.to_vector());  // (a, 0, 0, 0) - b in all four elements
518 | }
519 | 
520 | static inline Quaternion1d & operator -= (Quaternion1d & a, double b) {  // a -= real number; returns a
521 |     // b is passed by value, as in operator /= below, so literals, temporaries and const doubles bind to it
522 |     return a = a - b;
523 | }
524 | 
525 | // operator * : multiply every component by the real number b
526 | static inline Quaternion1d operator * (Quaternion1d const a, double b) {
527 |     return a.to_vector() * b;  // the scaled Vec4d converts to Quaternion1d
528 | }
529 | 
530 | static inline Quaternion1d operator * (double a, Quaternion1d const b) {  // real number times quaternion
531 |     return Quaternion1d(b.to_vector() * a);  // same scaling as quaternion * real number
532 | }
533 | 
534 | static inline Quaternion1d & operator *= (Quaternion1d & a, double b) {  // a *= real number; returns a
535 |     // b is passed by value, as in operator /= below, so a literal selects this scalar multiply directly
536 |     return a = a * b;
537 | }
538 | 
539 | // operator / : divide every component by the real number b
540 | static inline Quaternion1d operator / (Quaternion1d const a, double b) {
541 |     return a.to_vector() / Vec4d(b);  // b is broadcast to all four elements; the result converts to Quaternion1d
542 | }
543 | 
544 | static inline Quaternion1d operator / (double a, Quaternion1d const b) {  // real number divided by quaternion
545 |     return Quaternion1d(reciprocal(b).to_vector() * a);  // reciprocal(b) scaled by a
546 | }
547 | 
548 | static inline Quaternion1d & operator /= (Quaternion1d & a, double b) {
549 |     // divide a by the real number b in place and return a reference to a
550 |     return a = a / b;
551 | }
552 | 
553 | 
554 | /*****************************************************************************
555 | *
556 | *          Functions for Quaternion1d
557 | *
558 | *****************************************************************************/
559 | 
560 | // function abs: Euclidean norm of the quaternion
561 | // abs(a + b*i + c*j + d*k) = sqrt(a*a + b*b + c*c + d*d)
562 | static inline double abs(Quaternion1d const a) {
563 |     Vec4d const v = a.to_vector();
564 |     double const norm_sq = horizontal_add(v * v);          // sum of the four squares
565 |     return std::sqrt(norm_sq);
566 | }
567 | 
568 | // function select: returns a if s is true, b otherwise
569 | static inline Quaternion1d select (bool s, Quaternion1d const a, Quaternion1d const b) {
570 |     if (s) return a; else return b;
571 | }
572 | 
573 | 
574 | #ifdef VCL_NAMESPACE
575 | }
576 | #endif
577 | 
578 | #endif  // QUATERNION_H
579 | 


--------------------------------------------------------------------------------
/decimal/testbench_decimal.cpp:
--------------------------------------------------------------------------------
  1 | /*************************  testbench_decimal.cpp   **************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2019-07-14
  4 | * Last modified: 2019-07-14
  5 | * Version:       2.00
  6 | * Project:       Vector class library add-on package 'decimal'
  7 | * Description:   Testbench for decimal.cpp using vector class library
  8 | * Compile and run this program to test functions in decimal.h package
  9 | *
 10 | * Instructions:
 11 | * The following parameters must be defined on the command line or added in the
 12 | * top of this file:
 13 | *
 14 | * testcase: A number defining a function or operator to test. See the cases in this file.
 15 | *
 16 | * Compile with any compiler supported by VCL.
 17 | * Specify the desired instruction set and optimization options as parameters
 18 | * to the compiler.
 19 | *
 20 | * (c) Copyright 2019 Agner Fog.
 21 | * Apache license 2.0
 22 | ******************************************************************************
 23 | 
 24 | Test cases:
 25 | 1:  bin2ascii
 26 | 2:  ascii2bin
 27 | 
 28 | *****************************************************************************/
 29 | 
 30 | #include <stdio.h>
 31 | #include <cmath>
 32 | 
 33 | #include <string.h>
 34 | 
 35 | #define MAX_VECTOR_SIZE 512
 36 | #ifndef INSTRSET
 37 | #define INSTRSET  8
 38 | #endif
 39 | 
 40 | //#define __AVX512VBMI2__
 41 | 
 42 | #include <vectorclass.h>
 43 | #include "decimal.cpp"
 44 | 
 45 | 
 46 | // ---------------------------------------------------------------------------
 47 | //            Specify input parameters here if running from an IDE
 48 | // ----------------------------------------------------------------------------
 49 | 
 50 | #ifndef testcase
 51 | 
 52 | #define testcase 1
 53 | 
 54 | #endif  // testcase 
 55 | 
 56 | // ----------------------------------------------------------------------------
 57 | //             Declarations
 58 | // ----------------------------------------------------------------------------
 59 | int globalError = 0;       // any error indicated in program return
 60 | 
 61 | 
 62 | /************************************************************************
 63 | *
 64 | *                          Test cases
 65 | *
 66 | ************************************************************************/
 67 | 
 68 | #if testcase == 1   // test bin2ascii
 69 | 
 70 | // check results of bin2ascii: res must equal expected, and len must equal strlen(res)
 71 | void checkb2a (int len, const char * res, const char * expected) {
 72 |     if (strcmp(res, expected) != 0) {               // wrong text: report it and skip the length check
 73 |         printf("\nbin2ascii error. Result:\n  >%s<\nExpected:\n  >%s<", res, expected);
 74 |         globalError++;
 75 |         return;
 76 |     }
 77 |     int const actualLength = (int)strlen(res);
 78 |     if (actualLength == len) return;                // text and length are both as expected
 79 |     // the text matches but its length differs from len: count one more error
 80 |     printf("\nbin2ascii length error. Actual length: %i, Reported length: %i\n  (%s)",
 81 |         actualLength, len, res);
 82 |     globalError++;
 83 | }
 84 | 
 85 | 
 86 | int main() {   // testcase 1: formats fixed vectors with bin2ascii and compares the text with expected strings
 87 |     char text[1024];   // output buffer, reused by every call below
 88 |     int r = 0;         // receives the return value of bin2ascii; only read in the debugging branch
 89 | 
 90 | #if 0  // debugging
 91 |     Vec4i a0 (-87654321,-200000,3000000,40000000);
 92 |     r = bin2ascii(a0, text, 10, 4, '*', ',', true, true);
 93 |     printf("\nr=%i, \ntext=%s", r, text);
 94 | 
 95 | #else
 96 | 
 97 |     // Vec4i version. Argument order as used by the calls below (NOTE(review): confirm against decimal.h):
 98 |     //   bin2ascii(Vec4i a, char * string, int fieldlen, int numdat, char ovfl, char separator, bool signd, bool term)
 99 |     Vec4i a1 (101,- 202,30303,-4040404);
100 |     r = bin2ascii(a1, text, 10, 4, '*', ',', true, true);
101 |     checkb2a(43, text, "       101,      -202,     30303,  -4040404");
102 |     r = bin2ascii(a1, text, 5, 4, 0, ';', true, true);
103 |     checkb2a(26, text, "  101; -202;30303;-4040404");
104 |     r = bin2ascii(a1, text, 5, 4, '*', ',', true, true);
105 |     checkb2a(23, text, "  101, -202,30303,*****");
106 | 
107 |     Vec4i a2 (101,-20202,-30303030,404040404);
108 |     r = bin2ascii(a2, text, 10, 4, '*', ',', true, true);
109 |     checkb2a(43, text, "       101,    -20202, -30303030, 404040404");
110 |     r = bin2ascii(a2, text, 9, 4, '*', ',', true, true);
111 |     checkb2a(39, text, "      101,   -20202,-30303030,404040404");
112 |     r = bin2ascii(a2, text, 8, 4, '*', ',', true, true);
113 |     checkb2a(35, text, "     101,  -20202,********,********");
114 |     r = bin2ascii(a2, text, 6, 4, '*', ',', true, true);
115 |     checkb2a(27, text, "   101,-20202,******,******");
116 | 
117 |     Vec4i a3 (-1,-100,10000,-10000);
118 |     r = bin2ascii(a3, text, 6, 4, '*', ',', true, true);
119 |     checkb2a(27, text, "    -1,  -100, 10000,-10000");
120 |     r = bin2ascii(a3, text, 2, 4, '*', ',', true, true);
121 |     checkb2a(11, text, "-1,**,**,**");
122 |     r = bin2ascii(a3, text, 1, 4, '*', ',', true, true);
123 |     checkb2a(7, text, "*,*,*,*");
124 |     r = bin2ascii(a3, text, 0, 4, '*', ',', true, true);
125 |     checkb2a(0, text, "");
126 |     r = bin2ascii(a3, text, 5, 3, '*', ',', true, true);
127 |     checkb2a(17, text, "   -1, -100,10000");
128 |     r = bin2ascii(a3, text, 5, 3, '*', 0, true, true);
129 |     checkb2a(15, text, "   -1 -10010000");
130 |     r = bin2ascii(a3, text, 1, 3, 0, ',', true, true);
131 |     checkb2a(13, text, "-1,-100,10000");
132 |     r = bin2ascii(a3, text, 1, 2, 0, ',', true, true);
133 |     checkb2a(7, text, "-1,-100");
134 |     
135 |     Vec4i a4 (-100000,-1000000,10000000,-100000000);
136 |     r = bin2ascii(a4, text, 6, 4, 0, ',', true, true);
137 |     checkb2a(36, text, "-100000,-1000000,10000000,-100000000");
138 |     r = bin2ascii(a4, text, 7, 4, '*', ',', true, true);
139 |     checkb2a(31, text, "-100000,*******,*******,*******");
140 |     r = bin2ascii(a4, text, 7, 4, '*', ',', false, true);
141 |     checkb2a(31, text, "*******,*******,*******,*******");
142 |     r = bin2ascii(a4, text, 7, 4, 0, ',', false, true);
143 |     checkb2a(41, text, "4294867296,4293967296,10000000,4194967296");
144 |     
145 |     Vec4i a5 (10000000,1000000000,2000000000,3000000000u);
146 |     r = bin2ascii(a5, text, 8, 4, 0, ',', true, true);
147 |     checkb2a(42, text, "10000000,1000000000,2000000000,-1294967296");
148 |     r = bin2ascii(a5, text, 8, 4, 0, ',', false, true);
149 |     checkb2a(41, text, "10000000,1000000000,2000000000,3000000000");
150 |     
151 |     Vec4i a6 (1,2,3,4);
152 |     r = bin2ascii(a6, text, 2, 4, '*', ',', true, false); // no terminator. the rest of the previous result remains
153 |     checkb2a(41, text, " 1, 2, 3, 400000000,2000000000,3000000000");
154 |     r = bin2ascii(a6, text, 2, 4, 0, ',', true, true);
155 |     checkb2a(11, text, " 1, 2, 3, 4");
156 |     r = bin2ascii(a6, text, 2, 4, 0, 0, true, true);
157 |     checkb2a(8, text, " 1 2 3 4");
158 |     r = bin2ascii(a6, text, 1, 4, 0, 0, true, true);
159 |     checkb2a(4, text, "1234");
160 | 
161 | 
162 | // Vec8i version. Argument order as used by the calls below (NOTE(review): confirm against decimal.h):
163 | //   bin2ascii(Vec8i a, char * string, int fieldlen, int numdat, char ovfl, char separator, bool signd, bool term)
164 |     Vec8i b1 (1,-22,333,-4321,55555,-666,7,8000);
165 |     r = bin2ascii(b1, text, 10, 8, '*', ',', true, true);
166 |     checkb2a(87, text, "         1,       -22,       333,     -4321,     55555,      -666,         7,      8000");
167 |     r = bin2ascii(b1, text, 5, 8, 0, ';', true, true);
168 |     checkb2a(47, text, "    1;  -22;  333;-4321;55555; -666;    7; 8000");
169 |     r = bin2ascii(b1, text, 4, 8, '*', '|', true, true);
170 |     checkb2a(39, text, "   1| -22| 333|****|****|-666|   7|8000");
171 |     r = bin2ascii(b1, text, 6, 7, '*', ',', true, true);
172 |     checkb2a(48, text, "     1,   -22,   333, -4321, 55555,  -666,     7");
173 |     r = bin2ascii(b1, text, 6, 6, '*', ',', true, true);
174 |     checkb2a(41, text, "     1,   -22,   333, -4321, 55555,  -666");
175 |     r = bin2ascii(b1, text, 6, 5, '*', ',', true, true);
176 |     checkb2a(34, text, "     1,   -22,   333, -4321, 55555");    
177 |     r = bin2ascii(b1, text, 6, 4, '*', ',', true, true);
178 |     checkb2a(27, text, "     1,   -22,   333, -4321");    
179 |     r = bin2ascii(b1, text, 6, 3, '*', ',', true, true);
180 |     checkb2a(20, text, "     1,   -22,   333");
181 |     r = bin2ascii(b1, text, 6, 2, '*', ',', true, true);
182 |     checkb2a(13, text, "     1,   -22");
183 |     r = bin2ascii(b1, text, 6, 1, '*', ',', true, true);
184 |     checkb2a(6, text, "     1");
185 |     r = bin2ascii(b1, text, 6, 0, '*', ',', true, true);
186 |     checkb2a(0, text, "");
187 |     
188 |     Vec8i b2 (1,-20,300,4000,50000,654321,7000000,87654321);
189 |     r = bin2ascii(b2, text, 10, 8, '*', ',', true, true);
190 |     checkb2a(87, text, "         1,       -20,       300,      4000,     50000,    654321,   7000000,  87654321");
191 |     r = bin2ascii(b2, text, 9, 8, '*', ',', true, true);
192 |     checkb2a(79, text, "        1,      -20,      300,     4000,    50000,   654321,  7000000, 87654321");
193 |     r = bin2ascii(b2, text, 8, 8, '*', ',', true, true);
194 |     checkb2a(71, text, "       1,     -20,     300,    4000,   50000,  654321, 7000000,87654321");
195 |     r = bin2ascii(b2, text, 7, 8, '*', ',', true, true);
196 |     checkb2a(63, text, "      1,    -20,    300,   4000,  50000, 654321,7000000,*******");
197 |     r = bin2ascii(b2, text, 6, 8, '*', ',', true, true);
198 |     checkb2a(55, text, "     1,   -20,   300,  4000, 50000,654321,******,******");
199 |     r = bin2ascii(b2, text, 5, 8, '*', ',', true, true);
200 |     checkb2a(47, text, "    1,  -20,  300, 4000,50000,*****,*****,*****");
201 |     r = bin2ascii(b2, text, 4, 8, '*', ',', true, true);
202 |     checkb2a(39, text, "   1, -20, 300,4000,****,****,****,****");
203 |     r = bin2ascii(b2, text, 3, 8, '*', ',', true, true);
204 |     checkb2a(31, text, "  1,-20,300,***,***,***,***,***");
205 |     r = bin2ascii(b2, text, 2, 8, '*', ',', true, true);
206 |     checkb2a(23, text, " 1,**,**,**,**,**,**,**");
207 |     r = bin2ascii(b2, text, 1, 8, '*', ',', true, true);
208 |     checkb2a(15, text, "1,*,*,*,*,*,*,*");
209 |     r = bin2ascii(b2, text, 0, 8, '*', ',', true, true);
210 |     checkb2a(0, text, "");
211 | 
212 |     r = bin2ascii(b2, text, 9, 8, '*', 0, true, true);
213 |     checkb2a(72, text, "        1      -20      300     4000    50000   654321  7000000 87654321");
214 |     r = bin2ascii(b2, text, 8, 8, '*', 0, true, true);
215 |     checkb2a(64, text, "       1     -20     300    4000   50000  654321 700000087654321");
216 |     r = bin2ascii(b2, text, 7, 8, '*', 0, true, true);
217 |     checkb2a(56, text, "      1    -20    300   4000  50000 6543217000000*******");
218 |     r = bin2ascii(b2, text, 6, 8, '*', 0, true, true);
219 |     checkb2a(48, text, "     1   -20   300  4000 50000654321************");
220 |     r = bin2ascii(b2, text, 5, 8, '*', 0, true, true);
221 |     checkb2a(40, text, "    1  -20  300 400050000***************");
222 |     r = bin2ascii(b2, text, 4, 8, '*', 0, true, true);
223 |     checkb2a(32, text, "   1 -20 3004000****************");
224 |     r = bin2ascii(b2, text, 3, 8, '*', 0, true, true);
225 |     checkb2a(24, text, "  1-20300***************");
226 |     r = bin2ascii(b2, text, 2, 8, '*', 0, true, true);
227 |     checkb2a(16, text, " 1**************");
228 |     r = bin2ascii(b2, text, 1, 8, '*', 0, true, true);
229 |     checkb2a(8, text, "1*******");
230 |     r = bin2ascii(b2, text, 0, 8, '*', 0, true, true);
231 |     checkb2a(0, text, "");
232 | 
233 |     // numbers that need more characters than the field length
234 |     Vec8i b3 (1000,-200000,3000000,40000000,205050505,3060606060u,-432100000,-87654321);
235 |     r = bin2ascii(b3, text, 8, 8, '*', ',', true, true);
236 |     checkb2a(71, text, "    1000, -200000, 3000000,40000000,********,********,********,********");
237 |     r = bin2ascii(b3, text, 8, 8, 0, ',', true, true);
238 |     checkb2a(78, text, "    1000, -200000, 3000000,40000000,205050505,-1234361236,-432100000,-87654321");
239 |     r = bin2ascii(b3, text, 9, 8, '*', ',', true, true);
240 |     checkb2a(79, text, "     1000,  -200000,  3000000, 40000000,205050505,*********,*********,-87654321");
241 |     r = bin2ascii(b3, text, 10, 8, '*', ',', true, true);
242 |     checkb2a(87, text, "      1000,   -200000,   3000000,  40000000, 205050505,**********,-432100000, -87654321");
243 |     r = bin2ascii(b3, text, 11, 8, '*', ',', true, true);
244 |     checkb2a(95, text, "       1000,    -200000,    3000000,   40000000,  205050505,-1234361236, -432100000,  -87654321");
245 |     r = bin2ascii(b3, text, 12, 8, '*', ',', true, true);
246 |     checkb2a(103, text, "        1000,     -200000,     3000000,    40000000,   205050505, -1234361236,  -432100000,   -87654321");
247 |     r = bin2ascii(b3, text, 10, 8, '*', ',', false, true);
248 |     checkb2a(87, text, "      1000,4294767296,   3000000,  40000000, 205050505,3060606060,3862867296,4207312975");
249 |     r = bin2ascii(b3, text, 5, 1, '*', ',', true, false); // no terminator. overwrite previous string
250 |     checkb2a(87, text, " 1000 1000,4294767296,   3000000,  40000000, 205050505,3060606060,3862867296,4207312975");
251 |     
252 |     if (!globalError) printf("\nsuccess\n");
253 | #endif
254 |     return globalError;   // number of failed checks counted by checkb2a; 0 means success
255 | }
256 | 
257 | 
258 | #else           // test ascii2bin
259 | 
260 | // check results of ascii2bin
261 | void checka2b (Vec8i res, Vec8i expected, int length, int lengthExp, int err, int errExp) {
262 |     bool dataerr = horizontal_or(res != expected);
263 |     bool lengtherr = length != lengthExp;
264 |     bool errorerr  = err != errExp;
265 | 
266 |     if (dataerr || lengtherr || errorerr) {
267 |         printf("\nascii2bin error:");    
268 |     } 
269 |     if (dataerr) {
270 |         globalError |= 1;
271 |         printf("\n  data error:\n    found:  expected:");
272 |         for (int i = 0; i < res.size(); i++) {
273 |             printf("\n%10i %10i", res[i], expected[i]);
274 |         }
275 |     }
276 |     if (lengtherr) {
277 |         globalError |= 2;
278 |         printf("\n  length error: found: %i, expected: %i", length, lengthExp);
279 |     }
280 |     if (errorerr) {
281 |         globalError |= 4;
282 |         printf("\n  error code: found: 0x%X, expected: 0x%X", err, errExp);
283 |     }
284 |     if ((lengtherr || errorerr) && !dataerr) { // print numbers to help identify the case
285 |         printf("\n(");
286 |         for (int i=0; i < res.size(); i++) {
287 |             printf("%i ", res[i]);
288 |         }
289 |         printf(")\n");
290 |     }
291 | }
292 | 
293 | int main() {   // ascii2bin test: parses fixed strings and compares values, length read and error code
294 |     // ascii2bin arguments as used below (NOTE(review): confirm against decimal.h): string, &n_read, &error, max. string length, number of values, separator
295 |     int error = 0;     // error code written by ascii2bin; the expected values below combine 2, 4, 8 and 16
296 |     int n_read = 0;    // length written by ascii2bin; also used below as the offset for a chained read
297 |     Vec8i dat;         // values returned by ascii2bin
298 | 
299 | #if 0  // for debugging only:
300 |     //                      10        20        30        40        50        60
301 |     //                       v         v         v         v         v         v
302 |     //             01234567890123456789012345678901234567890123456789012345678901234567890
303 |     char num0[] = " 1, +2 ,-3, -4321,, 007777, 88888,98765432";
304 | 
305 |     dat = ascii2bin(num0, &n_read, &error, 1000, 7, ',');
306 |     checka2b (dat, Vec8i(1,2,3,-4321,0,7777,88888,98765432), n_read, 41, error, 0);
307 | 
308 |     printf ("\nnread %i, error 0x%X\n", n_read, error);
309 |     for (int i=0; i<8; i++) printf("%i ", dat[i]);
310 |     return 1;
311 | 
312 | #endif
313 |     char num1[] = " 1, +21  , 321, -4321, 55, 7777, 88888,98765432";
314 |     dat = ascii2bin(num1, &n_read, &error, 64, 8, ',');
315 |     checka2b (dat, Vec8i(1,21,321,-4321,55,7777,88888,98765432),n_read, 47, error, 0);
316 | 
317 |     // test no numbers
318 |     dat = ascii2bin(num1, &n_read, &error, 64, 0, ',');
319 |     checka2b (dat, Vec8i(0), n_read, 0, error, 0);
320 |     // test fewer numbers
321 |     dat = ascii2bin(num1, &n_read, &error, 64, 3, ',');
322 |     checka2b (dat, Vec8i(1,21,321,0,0,0,0,0), n_read, 15, error, 0);
323 |     // test fewer numbers
324 |     dat = ascii2bin(num1, &n_read, &error, 64, 7, ',');
325 |     checka2b (dat, Vec8i(1,21,321,-4321,55,7777,88888,0), n_read, 39, error, 0);
326 | 
327 |     // test short string
328 |     dat = ascii2bin(num1, &n_read, &error, 40, 7, ',');
329 |     checka2b (dat, Vec8i(1,21,321,-4321,55,7777,88888,0), n_read, 39, error, 0);
330 |     // test short string
331 |     dat = ascii2bin(num1, &n_read, &error, 26, 7, ',');
332 |     checka2b (dat, Vec8i(1,21,321,-4321,55,0,0,0), n_read, 26, error, 8);
333 | 
334 |     // test string 64 bytes long
335 |     char num2[] = "1     , +22  ,    300, - 4444,   55555, 666666, 7777777,88888888";
336 |     dat = ascii2bin(num2, &n_read, &error, 64, 8, ',');
337 |     checka2b (dat, Vec8i(1,22,300,-4444,55555,666666,7777777,88888888),n_read, 64, error, 0);
338 | 
339 |     // test string > 64 bytes long
340 |     char num3[] = "1   , +22  ,    300, - 4444,   55555, 666666, 7777777,88888888,999,101010,111111";
341 |     dat = ascii2bin(num3, &n_read, &error, 100, 8, ',');
342 |     checka2b (dat, Vec8i(1,22,300,-4444,55555,666666,7777777,88888888), n_read, 63, error, 0);
343 | 
344 |     // test missing numbers
345 |     char num4[] = ",- 321,+,-9876543";
346 |     dat = ascii2bin(num4, &n_read, &error, 17, 8, ',');
347 |     checka2b (dat, Vec8i(0,-321,0,-9876543,0,0,0,0), n_read, 17, error, 8);
348 |     // test unfinished numbers
349 |     dat = ascii2bin(num4, &n_read, &error, 14, 8, ',');
350 |     checka2b (dat, Vec8i(0,-321,0,-9876,0,0,0,0), n_read, 14, error, 8);
351 |     dat = ascii2bin(num4, &n_read, &error, 10, 8, ',');
352 |     checka2b (dat, Vec8i(0,-321,0,0,0,0,0,0), n_read, 10, error, 8);
353 | 
354 |     // test misplaced character and illegal character
355 |     char num5[] = "111 ,  -222 , 333-, 444., 555E6, 6666";
356 |     dat = ascii2bin(num5, &n_read, &error, 37, 6, ',');
357 |     checka2b (dat, Vec8i(111,-222,0,444,0,6666,0,0), n_read, 37, error, 2+4);
358 | 
359 |     dat = ascii2bin(num5, &n_read, &error, 80, 4, ',');
360 |     checka2b (dat, Vec8i(111,-222,0,444,0,0,0,0), n_read, 25, error, 2+4);
361 | 
362 |     // test field too long
363 |     char num6[] = "111 ,1234567890, -1234567890  , 4444, 55555, 666666";
364 |     dat = ascii2bin(num6, &n_read, &error, 51, 6, ',');
365 |     checka2b (dat, Vec8i(111,1234567890,-1234567890,4444,55555,666666,0,0), n_read, 51, error, 0);
366 | 
367 |     // test overflow
368 |     char num7[] = "111 ,12345678901, -1234567890  , 4444, 55555, 666666";
369 |     dat = ascii2bin(num7, &n_read, &error, 64, 6, ',');
370 |     checka2b (dat, Vec8i(111,2147483647,-1234567890,4444,55555,666666,0,0), n_read, 52, error, 16);
371 | 
372 |     // test chain: the second call continues at the position where the first call stopped
373 |     char num8[] = "-111, 222 , -333 , +4444, -55555,+ 666666, -777, 888, -999, 1010, -1111";
374 |     dat = ascii2bin(num8, &n_read, &error, 53, 8, ',');
375 |     checka2b (dat, Vec8i(-111,222,-333,4444,-55555,666666,-777,888), n_read, 53, error, 0);
376 |     dat = ascii2bin(num8 + n_read, &n_read, &error, 64, 3, ',');
377 |     checka2b (dat, Vec8i(-999, 1010, -1111,0,0,0,0,0), n_read, 18, error, 0);
378 | 
379 |     // test garbage after string. multiple signs, tab as separator
380 |     char num9[] = "111\t+-2\t---3\t4444\t55555\t666666\t-7\t8\t 1.2E3\ttext\t'''\t\t%&/()";
381 |     dat = ascii2bin(num9, &n_read, &error, 64, 8, '\t');
382 |     checka2b (dat, Vec8i(111,0,0,4444,55555,666666,-7,8), n_read, 36, error, 4);
383 | 
384 |     // test newline as end of string
385 |     char num9a[] = "111,+2,-3,4444,55555,666666,-7\n8, 1.2E3";
386 |     dat = ascii2bin(num9a, &n_read, &error, 64, 8, ',');
387 |     checka2b (dat, Vec8i(111,2,-3,4444,55555,666666,-7,0), n_read, 30, error, 8);
388 | 
389 |     // test error in first field
390 |     char num10[]= "1 1 1,22,333,4444,55555,666666,7777777,";
391 |     dat = ascii2bin(num10, &n_read, &error, 64, 8, ',');
392 |     checka2b (dat, Vec8i(0,22,333,4444,55555,666666,7777777,0), n_read, 39, error, 4+8);
393 | 
394 |     char num11[]= "+-+-0,22,333,4444,55555,666666,7777777,";
395 |     dat = ascii2bin(num11, &n_read, &error, 64, 8, ',');
396 |     checka2b (dat, Vec8i(0,22,333,4444,55555,666666,7777777,0), n_read, 39, error, 4+8);
397 | 
398 |     // test error in last field
399 |     char num12[]= "1 ,22,333,4444,55555,666666,7777777,+-8";
400 |     dat = ascii2bin(num12, &n_read, &error, 64, 8, ',');
401 |     checka2b (dat, Vec8i(1,22,333,4444,55555,666666,7777777,0), n_read, 39, error, 4);
402 | 
403 |     char num13[]= "1 ,22,333,4444,55555,666666,7777777,8.8";
404 |     dat = ascii2bin(num13, &n_read, &error, 64, 8, ',');
405 |     checka2b (dat, Vec8i(1,22,333,4444,55555,666666,7777777,0), n_read, 39, error, 2+4);
406 | 
407 |     char num14[]= "1 ,22,333,4444,55555,666666,7777777,...garbage 1 more garbage   ";
408 |     dat = ascii2bin(num14, &n_read, &error, 64, 8, ',');
409 |     checka2b (dat, Vec8i(1,22,333,4444,55555,666666,7777777,1), n_read, 48, error, 2);
410 | 
411 |     if (!globalError) printf("\nsuccess\n");
412 |     return globalError;   // bit mask set by checka2b: 1 = data, 2 = length, 4 = error code; 0 means success
413 | }
414 | 
415 | #endif
416 | 
417 | 


--------------------------------------------------------------------------------
/random/ranvec1_manual.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[11pt,a4paper,oneside,openright]{report}
  2 | 
  3 | \usepackage[bindingoffset=5mm,left=20mm,right=20mm,top=20mm,bottom=20mm,footskip=10mm]{geometry}
  4 | \usepackage[utf8x]{inputenc}
  5 | \usepackage{hyperref}
  6 | \usepackage[english]{babel}
  7 | \usepackage{listings}
  8 | \usepackage{subfiles}
  9 | \usepackage{longtable}
 10 | \usepackage{multirow}
 11 | \usepackage{ragged2e} 
 12 | \usepackage{cmap} % avoid fi ligatures in pdf file
 13 | \usepackage{amsthm} % example numbering
 14 | \usepackage{color}
 15 | %\usepackage{bold-extra} % for bold tt font. Remember to include bold-extra.sty file
 16 | \usepackage{graphicx}
 17 | \usepackage[yyyymmdd]{datetime}
 18 | \usepackage{float}
 19 | 
 20 | % style for code listing
 21 | \renewcommand{\familydefault}{\sfdefault}
 22 | \renewcommand{\ttdefault}{pcr} % selects Courier font
 23 | \newtheorem{example}{Example}[chapter]  % example numbering
 24 | \lstset{language=C}                     % formatting for code listing
 25 | \lstset{basicstyle=\ttfamily,breaklines=true}
 26 | \definecolor{darkGreen}{rgb}{0,0.4,0}
 27 | \definecolor{mybrown}{rgb}{0.40,0.10,0.05}
 28 | \lstset{commentstyle=\color{darkGreen}}  % comments color
 29 | \lstset{keywordstyle=\color{blue}}       % keyword color
 30 | \lstset{stringstyle=\color{mybrown}}     % string color
 31 | \lstset{showstringspaces=false}          % don't mark spaces in strings
 32 | 
 33 | \renewcommand{\dateseparator}{-}
 34 | 
 35 | % command for turning indent back on after \flushleft
 36 | \newcommand{\indenton}{\RaggedRight\parindent=15pt}
 37 | 
 38 | % command for vertical space
 39 | \newcommand{\vspacesmall}{\vspace{3mm}}
 40 | \newcommand{\vspacebig}{\vspace{6mm}}
 41 | 
 42 | % style for code inlined in text:
 43 | \newcommand{\codei}[1]{\bfseries \ttfamily{#1}\normalfont}
 44 | 
 45 | 
 46 | \begin{document}
 47 | 
 48 | \begin{titlepage}
 49 |     \centering
 50 |    
 51 |     \null %empty box needed for vfill to work
 52 |     \vfill
 53 | 
 54 |    {\bfseries\Huge
 55 |     Ranvec1
 56 |     \vspacesmall
 57 |     
 58 |     Random number generator for 
 59 |     \vspacesmall
 60 |         
 61 |     C++ vector class library 
 62 |     \vspacebig
 63 |         
 64 |    }        
 65 |     \vspacebig
 66 |     
 67 |    {\Large    
 68 |     Agner Fog
 69 |     \vspacebig
 70 |     
 71 |     \copyright\ \today. Apache license 2.0
 72 |    }
 73 |     
 74 |     \vfill
 75 |     
 76 |     \includegraphics[width=306pt]{freesoftwarelogo.jpg}
 77 |     \vfill
 78 |     
 79 | \end{titlepage}
 80 | 
 81 | \RaggedRight
 82 | 
 83 | \chapter{Introduction}\label{chap:Introduction}
 84 | 
 85 | Ranvec1 is an efficient high quality pseudo random number generator designed for large vector applications and multi-threaded applications in C++ language.
 86 | \vspacesmall
 87 | 
 88 | This generator has been developed based on the following design goals:
 89 | 
 90 | \begin{itemize}
 91 | 
 92 | \item Good randomness, as determined by both theoretical and experimental criteria.
 93 | 
 94 | \item Suitable for vector processors and vector instructions (SIMD).
 95 | 
 96 | \item Suitable for large multi-threaded applications without risk of overlapping subsequences.
 97 | 
 98 | \item Fast generation of large amounts of random numbers.
 99 | 
100 | \end{itemize}
101 | 
102 | This random number generator is designed for large Monte Carlo simulations and Monte Carlo integration. 
103 | It may be useful for cryptographic applications as well, but cryptographic safety has not been a decisive design goal. It will be useful for game applications as well.
104 | \vspacesmall
105 | 
106 | A physical random number generator function is included for the purpose of generating a truly random seed for initializing the pseudo random number generator.
107 | \vspacesmall
108 | 
109 | The code is based on the Vector Class Library, using the x86 or x86-64 instruction set with extensions from SSE2 to AVX512. See the Vector Class Library manual for choice of compiler and compilation options. On Gnu and Clang compilers you need to specify the additional options \textbf{-mrdrnd -mrdseed} in order to enable the physical random number generator instructions.
110 | 
111 | 
112 | 
113 | \chapter{Instructions}\label{chap:Instructions}
114 | \flushleft
115 | The files ranvec1.h and ranvec1.cpp define a high quality pseudo-random number generator with vector output. This generator is useful for producing random numbers for simulation and other Monte Carlo applications. Add the file ranvec1.cpp to your project and compile for the appropriate instruction set. This example shows a simple use of the random number generator:
116 | 
117 | \begin{example}
118 | \label{exampleRandomGeneration}
119 | \end{example} % frame disappears if I put this after end lstlisting
120 | \begin{lstlisting}[frame=single]
121 | // Example for random number generator
122 | // Remember to link ranvec1.cpp into the project
123 | 
124 | #include <stdio.h>
125 | #include "vectorclass.h"
126 | #include "ranvec1.h"
127 | 
128 | int main() {
129 |     // Arbitrary seed
130 |     int seed = 1;
131 |     // Create an instance of Ranvec1 and set the type to 3
132 |     Ranvec1 ran(3);
133 |     // Initialize with the seed
134 |     ran.init(seed);
135 |     // Generate a vector of 8 random integers below 100
136 |     Vec8i ri = ran.random8i(0,99);
137 |     // Generate a vector of 8 random floats
138 |     Vec8f rf = ran.random8f();
139 |     int i;
140 |     // Output the 8 random integers
141 |     printf("\nRandom integers in interval 0 - 99\n");
142 |     for (i=0; i < ri.size(); i++) printf("%3i ", ri[i]);
143 |     
144 |     // Output the 8 random floats
145 |     printf("\nRandom floats in interval 0 - 1\n");
146 |     for (i=0; i < rf.size(); i++) printf("%7.4f ", rf[i]);
147 |     printf("\n");
148 |     return 0;
149 | }
150 | \end{lstlisting}
151 | \vspacesmall
152 | 
153 | The optional parameter for the constructor of the class Ranvec1 defines the type of random number generator to use:
154 | \vspacesmall
155 | 
156 | \begin{tabular}{|p{30mm}|p{120mm}|}
157 | \hline
158 | \bfseries Parameter for\newline constructor & \bfseries Generator type \\ \hline
159 | 1 & MWC. Multiply-With-Carry Generator. Use this for small applications where speed is important. \newline
160 | (cycle length \textgreater{} $4 \cdot 10^{19}$) \\ \hline
161 | 
162 | 2 & MTGP. A variant of Mersenne Twister. Use this for applications with multiple threads. \newline
163 | (cycle length \textgreater{} $10^{3375}$) \\ \hline
164 | 
165 | 3 & MWC + MTGP combined. Use this for the best possible randomness and for large applications with many threads. \newline
166 | (cycle length \textgreater{} $10^{3395}$) \\ \hline
167 | \end{tabular}
168 | \vspacesmall
169 | 
170 | It is necessary to initialize the random number generator with a seed, using either the function \codei{init} or \codei{initByArray}. The generator will produce only zeroes if it has not been initialized with any of the init functions.
171 | \vspacesmall
172 | 
173 | The random number sequence depends on the seed. A different seed will produce a different sequence of random numbers. You can reproduce a random number sequence exactly after initializing again with the same seed. You may use simple values like 1, 2, 3, ... for seeds in a series of simulations if you want to be able to reproduce the results later. If you want a non-reproducible sequence then you need a seed from a source of genuine randomness. The function \codei{physicalSeed} is useful for this purpose.
174 | \vspacesmall
175 | 
176 | The generator can produce vector outputs with different vector sizes. The best performance is obtained when the vector size fits the instruction set: SSE2 or higher for 128 bit vectors. AVX2 or higher for 256 bit vectors. AVX512 or higher for 512 bit vectors. Depending on details of the application, it may or may not be possible to reproduce a simulation result exactly when the vector size is changed.
177 | \vspacesmall
178 | 
179 | The theory of the Ranvec1 package including the different generators, multiprocessing and vector processing is described in the article: 
180 | \label{Fog2015TheoryArticle}
181 | 
182 | Fog, Agner: “Pseudo-Random Number Generators for Vector Processors and Multicore Processors.” Journal of Modern Applied Statistical Methods, vol. 14, no. 1, 2015, article 23. \url{https://digitalcommons.wayne.edu/jmasm/vol14/iss1/23/}
183 | \vspacebig
184 | 
185 | \section{Member functions for class Ranvec1}\label{MemberFunctions}
186 | \vspacesmall
187 | 
188 | \begin{tabular}{|p{30mm}|p{100mm}|}
189 | \hline
190 | \bfseries Constructor & Ranvec1(int gtype) \\ \hline
191 | \bfseries Description & Constructor for Ranvec1 class. See the table above for values of the generator type gtype. \\ \hline
192 | \bfseries Efficiency & medium \\ \hline
193 | \end{tabular}
194 | \begin{lstlisting}[frame=none]
195 | // Example:
196 | Ranvec1 ran(3);   // Create object ran
197 | \end{lstlisting}
198 | \vspacesmall
199 | 
200 | \begin{tabular}{|p{30mm}|p{100mm}|}
201 | \hline
202 | \bfseries Constructor & Ranvec1(int gtype, int seed) \\ \hline
203 | \bfseries Description & Constructor for Ranvec1 class. Initializing with seed. \\ \hline
204 | \bfseries Efficiency & medium \\ \hline
205 | \end{tabular}
206 | \vspacesmall
207 | 
208 | \begin{tabular}{|p{30mm}|p{100mm}|}
209 | \hline
210 | \bfseries Member function & void init(int seed) \\ \hline
211 | \bfseries Description & Initialization with one seed. Any value is allowed for seed. Use a different value of seed each time to get a different random number sequence. \\ \hline
212 | \bfseries Efficiency & poor \\ \hline
213 | \end{tabular}
214 | \begin{lstlisting}[frame=none]
215 | // Example:
216 | ran.init(0);   // Initialize random generator with seed 0
217 | \end{lstlisting}
218 | \vspacesmall
219 | 
220 | \begin{tabular}{|p{30mm}|p{100mm}|}
221 | \hline
222 | \bfseries Member function & void init(int seed1, int seed2) \\ \hline
223 | \bfseries Description & Initialization with two seeds. The random number sequence depends on both seeds. If the generator type is 3, then seed1 is used for the MWC generator and seed2 is used for the MTGP generator. The value of seed2 should be different for each thread in multithreaded applications. \\ \hline
224 | \bfseries Efficiency & poor \\ \hline
225 | \end{tabular}
226 | \begin{lstlisting}[frame=none]
227 | // Example:
228 | ran.init(0,1);   // Initialize random generator with seeds 0 and 1
229 | \end{lstlisting}
230 | \vspacesmall
231 | 
232 | \begin{tabular}{|p{30mm}|p{100mm}|}
233 | \hline
234 | \bfseries Member function & void initByArray(int const seeds[], int numSeeds) \\ \hline
235 | \bfseries Description & Initialization with multiple seeds. The seeds array must contain numSeeds integers. The random number sequence depends on all these integer seeds. This can be useful for security applications in order to make it difficult to guess the seeds. The best security is obtained with generator type 3. \\ \hline
236 | \bfseries Efficiency & poor \\ \hline
237 | \end{tabular}
238 | \begin{lstlisting}[frame=none]
239 | // Example:
240 | // Initialize random generator with four seeds
241 | int seeds[4] = {5,8,12,2};
242 | ran.initByArray(seeds, 4);  
243 | \end{lstlisting}
244 | \vspacesmall
245 | 
246 | \begin{tabular}{|p{30mm}|p{100mm}|}
247 | \hline
248 | \bfseries Member function & uint32\_t random32b()\newline
249 | uint64\_t random64b() \\ \hline
250 | \bfseries Description & returns an integer of 32 or 64 random bits \\ \hline
251 | \bfseries Efficiency & medium \\ \hline
252 | \end{tabular}
253 | \begin{lstlisting}[frame=none]
254 | // Example:
255 | unsigned int r = ran.random32b();  // generate 32 random bits
256 | \end{lstlisting}
257 | \vspacesmall
258 | 
259 | \begin{tabular}{|p{30mm}|p{100mm}|}
260 | \hline
261 | \bfseries Member function & Vec4ui random128b() \newline
262 | Vec8ui random256b() \newline
263 | Vec16ui random512b() \\ \hline
264 | \bfseries Description & Returns an integer vector of 128, 256 or 512 random bits.  \\ \hline
265 | \bfseries Efficiency & medium \\ \hline
266 | \end{tabular}
267 | \begin{lstlisting}[frame=none]
268 | // Example:
269 | Vec8ui v = ran.random256b();  // generate 256 random bits
270 | \end{lstlisting}
271 | \vspacesmall
272 | 
273 | \begin{tabular}{|p{30mm}|p{100mm}|}
274 | \hline
275 | \bfseries Member function & int random1i(int min, int max) \newline
276 | Vec4i random4i(int min, int max) \newline
277 | Vec8i random8i(int min, int max) \newline
278 | Vec16i random16i(int min, int max) \\ \hline
279 | \bfseries Description & Returns a random integer or a vector of random integers
280 | with uniform distribution in the interval min $\leq$ x $\leq$ max. \newline
281 | (The distribution may be slightly inaccurate when the interval size is large and not a power of 2. See below for a more accurate version.) \\ \hline
282 | \bfseries Efficiency & medium \\ \hline
283 | \end{tabular}
284 | \begin{lstlisting}[frame=none]
285 | // Example:
286 | // Generate a random integer in the interval [1,10]
287 | int r = ran.random1i(1, 10);
288 | // Generate eight random integers in the interval [1,10]
289 | Vec8i v = ran.random8i(1, 10);
290 | \end{lstlisting}
291 | \vspacesmall
292 | 
293 | 
294 | \begin{tabular}{|p{30mm}|p{100mm}|}
295 | \hline
296 | \bfseries Member function & int random1ix(int min, int max) \newline
297 | Vec4i random4ix(int min, int max) \newline
298 | Vec8i random8ix(int min, int max) \newline
299 | Vec16i random16ix(int min, int max) \\ \hline
300 | \bfseries Description & Returns a random integer or a vector of random integers with uniform distribution in the interval min $\leq$ x $\leq$ max. \newline
301 | This is the same as random1i, random4i, random8i, random16i, but exact. \newline
302 | The exact versions of these functions use a rejection method as described in the theory article mentioned above. To reproduce a sequence, the same function with the same vector size must be called. \\ \hline
303 | \bfseries Efficiency & medium \\ \hline
304 | \end{tabular}
305 | \begin{lstlisting}[frame=none]
306 | // Example:
307 | // Generate eight random integers in the interval [1,10]
308 | Vec8i v = ran.random8ix(1, 10);
309 | \end{lstlisting}
310 | \vspacesmall
311 | 
312 | \begin{tabular}{|p{30mm}|p{100mm}|}
313 | \hline
314 | \bfseries Member function & float random1f() \\ \hline
315 | \bfseries Description & Returns a random floating point number with uniform distribution in the interval 0 $\leq$ x $<$ 1. The resolution is $2^{-24}$. \newline
316 | (A value in the interval 0 $<$ x $\leq$ 1 can be obtained as 1 - x.) \\ \hline
317 | \bfseries Efficiency & medium \\ \hline
318 | \end{tabular}
319 | \begin{lstlisting}[frame=none]
320 | // Example:
321 | // Generate a random float below 100:
322 | float x = ran.random1f() * 100.f;
323 | \end{lstlisting}
324 | \vspacesmall
325 | 
326 | \begin{tabular}{|p{30mm}|p{100mm}|}
327 | \hline
328 | \bfseries Member function & Vec4f random4f() \newline
329 | Vec8f random8f() \newline
330 | Vec16f random16f() \\ \hline
331 | \bfseries Description & Returns a vector of random floating point numbers with uniform distribution in the interval 0 $\leq$ x $<$ 1. The resolution is $2^{-24}$.  \\ \hline
332 | \bfseries Efficiency & medium \\ \hline
333 | \end{tabular}
334 | \begin{lstlisting}[frame=none]
335 | // Example:
336 | // Generate four random float numbers below 100:
337 | Vec4f v = ran.random4f() * 100.f;
338 | \end{lstlisting}
339 | \vspacesmall
340 | 
341 | \begin{tabular}{|p{30mm}|p{100mm}|}
342 | \hline
343 | \bfseries Member function & double random1d() \\ \hline
344 | \bfseries Description & Returns a random double precision number with uniform distribution in the interval 0 $\leq$ x $<$ 1. The resolution is $2^{-52}$. \\ \hline
345 | \bfseries Efficiency & medium \\ \hline
346 | \end{tabular}
347 | \begin{lstlisting}[frame=none]
348 | // Example:
349 | // Generate random double precision number below 100:
350 | double x = ran.random1d() * 100.;
351 | \end{lstlisting}
352 | \vspacesmall
353 | 
354 | \begin{tabular}{|p{30mm}|p{100mm}|}
355 | \hline
356 | \bfseries Member function & Vec2d random2d() \newline
357 | Vec4d random4d() \newline
358 | Vec8d random8d() \\ \hline
359 | \bfseries Description & Returns a vector of random double precision numbers with uniform distribution in the interval 0 $\leq$ x $<$ 1. The resolution is $2^{-52}$.  \\ \hline
360 | \bfseries Efficiency & medium \\ \hline
361 | \end{tabular}
362 | \begin{lstlisting}[frame=none]
363 | // Example:
364 | // Generate four random double precision numbers below 100:
365 | Vec4d v = ran.random4d() * 100.;
366 | \end{lstlisting}
367 | \vspacebig
368 | 
369 | 
370 | \section{Other functions}\label{OtherFunctions}
371 | 
372 | \begin{tabular}{|p{30mm}|p{100mm}|}
373 | \hline
374 | \bfseries Function & int physicalSeedType()  \\ \hline
375 | \bfseries Description & Finds the best source of non-reproducible randomness on the CPU that the program is running on. Return value: \newline
376 | 0:  No physical seed available \newline
377 | 1:  CPU clock (consecutive calls are not independent)\newline
378 | 2:  RDRAND instruction \newline
379 | 3:  RDSEED instruction   \\ \hline
380 | \bfseries Source file & physseed.cpp \\ \hline
381 | \bfseries Efficiency & medium \\ \hline
382 | \end{tabular}
383 | \vspacebig
384 | 
385 | \begin{tabular}{|p{30mm}|p{100mm}|}
386 | \hline
387 | \bfseries Function & int physicalSeed()  \\ \hline
388 | \bfseries Description & Get a non-reproducible random number based on a physical process. This is intended as a seed for the pseudo random number generator. The source of randomness is indicated by physicalSeedType(). \\ \hline
389 | \bfseries Source file & physseed.cpp \\ \hline
390 | \bfseries Efficiency & medium \\ \hline
391 | \end{tabular}
392 | \begin{lstlisting}[frame=none]
393 | // Example: Generate a random seed
394 | int seed = physicalSeed();
395 | // Make an instance of the pseudo random number generator
396 | Ranvec1 ran(2);
397 | // Initialize it with the random seed
398 | ran.init(seed);
399 | // Generate a vector of 16 random float numbers
400 | Vec16f rf = ran.random16f();
401 | // This code will generate a different random sequence each
402 | // time it runs.
403 | \end{lstlisting}
404 | \vspacebig
405 | 
406 | 
407 | \section{Generating seeds}\label{GeneratingSeeds}
408 | 
409 | Ranvec1 is called a pseudo random number generator because it is deterministic. You can repeat the same sequence of random numbers if you run it again with the same seed. 
410 | You need to initialize Ranvec1 with a random seed if you want a sequence of random numbers that is not predictable or deterministic.
411 | \vspacesmall
412 | 
413 | The \codei{physicalSeed()} function will produce such a random seed.
414 | Newer CPUs have a built-in physical source of randomness based on thermal noise. This is implemented in the RDRAND or RDSEED instruction. The RDSEED instruction is stronger than RDRAND if you want to call it multiple times to get a longer seed. The \codei{physicalSeed()} function will use the best source of randomness available on the CPU it is running on.
415 | \vspacesmall
416 | 
417 | If the program is running on an older CPU without the RDRAND or RDSEED instruction, then you can use the internal CPU clock as a source of randomness. The frequency of this internal clock is typically higher than 1 GHz. The source of randomness here is the exact time at which the function is called.
418 | \vspacesmall
419 | 
420 | Note that if you are calling \codei{physicalSeed()} twice on an older computer where the CPU clock is the only source of randomness, then the second call will not be independent of the first one. It will give a value that is perhaps a few hundred clock counts higher than the first one. To get an independent second value you need to wait for some external event before the second call. This external event can be a keystroke, a mouse move, or a network event. If the function \codei{physicalSeedType()} returns 1 then you need to wait for an external event before every call to \codei{physicalSeed()} except the first one. For example, you may ask the user to press a key. 
421 | \vspacesmall 
422 | 
423 | 
424 | \section{Cryptographic applications}\label{CryptographicApplications}
425 | It is theoretically possible to predict and reproduce the sequence generated by a single pseudo random number generator if you have access to a subsequence longer than the internal state buffer. This is not possible if two random number generators with long cycle lengths are combined. Therefore, you should always use the combined generator (type 3) for cryptographic applications. 
426 | \vspacesmall
427 | 
428 | You should use a seed longer than 32 bits to get a good unpredictable result. Use the \codei{initByArray} function with an array of multiple seeds. Use two or more array elements generated by the \codei{physicalSeed()} function and supplement them with elements from other sources. These other elements do not need to be truly random; they may include date and time, a hash of the user name or password, or any other data. The resulting random number sequence depends on all the elements in the seeds array. The resulting sequence will be unpredictable as long as at least one element of the seeds array is truly unpredictable. Combining seeds from multiple sources makes it more difficult for an attacker to break the security.
429 | \vspacesmall
430 | 
431 | 
432 | \section{Game applications}\label{GameApplications}
433 | The source of randomness does not need to be highly secure for entertainment games. A single seed from the \codei{physicalSeed()} function will provide sufficient randomness. 
434 | \vspacesmall
435 | 
436 | 
437 | \section{Gambling applications}\label{GamblingApplications}
438 | Gambling is a morally dubious exploitation of well-known weaknesses in the human psyche for financial gain, in my opinion. I do not endorse the use of this software in gambling applications.
439 | \vspacesmall
440 | 
441 | 
442 | \section{Monte Carlo simulation}\label{MonteCarloSimulation}
443 | Monte Carlo simulation and Monte Carlo integration are computational techniques that require a very long sequence of random numbers. The Ranvec1 generator was designed to be well suited for this purpose. 
444 | \vspacesmall
445 | 
446 | You do not need truly unpredictable randomness for Monte Carlo applications. On the contrary, it is an advantage to have a deterministic sequence so that it is possible to re-play a particular simulation in case of an interesting event that you want to analyze further. It is quite convenient to use consecutive seeds such as 1, 2, 3, ... for a series of simulation runs.
447 | \vspacesmall
448 | 
449 | 
450 | \section{Multi-threaded applications}\label{MultiThreadedApplications}
451 | The Ranvec1 generator is designed to be suitable for large multi-threaded applications.
452 | You can take advantage of the multiple CPU cores in modern computers by running multiple threads simultaneously in time-consuming applications. The number of threads should not be more than the number of CPU cores. Some microprocessors are able to run two or more threads in each core. In this case, the number of logical processors is higher than the number of physical processors.
453 | Two threads running in the same core are likely to be competing for the same resources, so it may not be efficient to run more threads than CPU cores in this case.
454 | \vspacesmall
455 | 
456 | It is not safe to access a pseudo random number generator from multiple threads simultaneously. Instead, you need to make one instance of Ranvec1 for each thread. Each instance should have a different seed. It is recommended to use the combined generator (type 3) with two seeds. The second seed, or both seeds, should be different for each thread. The theoretical reasons for this are explained in the theory article cited on page \pageref{Fog2015TheoryArticle}.
457 | \vspacesmall
458 | 
459 | Example \ref{exampleMultipleThreads} shows how to generate random numbers in multiple threads. Note that there will be one instance of the random number generator object \codei{Ranvec1} in each thread because it is declared inside the thread function.
460 | 
461 | \vspacesmall
462 | 
463 | 
464 | \begin{example}
465 | \label{exampleMultipleThreads}
466 | \end{example} % frame disappears if I put this after end lstlisting
467 | \begin{lstlisting}[frame=single]
468 | // Example of random number generation with multiple threads
469 | // random_threads.cpp
470 | 
471 | // Example of command line options for g++ and clang:
472 | // g++     -O2 -std=c++17 -mavx2 -mfma -pthread random_threads.cpp
473 | // clang++ -O2 -std=c++17 -mavx2 -mfma -pthread random_threads.cpp
474 | 
475 | // for Visual Studio only: define desired instruction set:
476 | // #define INSTRSET 8
477 | 
478 | #include <stdio.h>
479 | #include <thread>
480 | 
481 | #include "ranvec1.h"    // random number generator
482 | #include "ranvec1.cpp"  // put code in separate module or include
483 | 
484 | // Thread function. Will run one instance for each thread
485 | // This function calculates the mean of 1000 random numbers
486 | void thread_function(int threadnum, int seed, double * result) {
487 | 
488 |     // Make an instance of the random number generator
489 |     // (this instance is local to each thread)
490 |     Ranvec1 ran(3);
491 | 
492 |     // Initialize. Use the thread number as a second seed to get
493 |     // different results in each thread
494 |     ran.init(seed, threadnum);
495 | 
496 |     // Accumulator for eight sums
497 |     Vec8d accum = 0.;
498 | 
499 |     // Generate 1000 random double precision numbers
500 |     for (int i = 0; i < 125; i++) {
501 |         // Vector of eight double precision random numbers
502 |         accum += ran.random8d();
503 |     }
504 |     // Calculate sum and mean
505 |     double sum = horizontal_add(accum);
506 |     double mean = sum * 0.001;
507 | 
508 |     // Return result
509 |     *result = mean;    
510 | }
511 | 
512 | int main() {
513 | 
514 |     // Number of threads
515 |     const int number_of_threads = 4;
516 | 
517 |     // Array of thread objects
518 |     std::thread threads[number_of_threads];
519 | 
520 |     // Array of results
521 |     double results[number_of_threads];
522 | 
523 |     // Arbitrary seed
524 |     int seed = 25;
525 | 
526 |     // Start threads
527 |     for (int t = 0; t < number_of_threads; t++) {
528 |         threads[t] = 
529 |         std::thread(thread_function, t, seed, &results[t]);
530 |     }
531 | 
532 |     // Wait for threads to finish
533 |     for (int t = 0; t < number_of_threads; t++) {
534 |         threads[t].join();
535 |     }
536 | 
537 |     // write results
538 |     for (int i = 0; i < number_of_threads; i++) {     
539 |         printf("%.6f  ", results[i]);
540 |     }
541 | 
542 |     return 0;
543 | }
544 | \end{lstlisting}
545 | \vspacesmall
546 | 
547 | 
548 | 
549 | \end{document}


--------------------------------------------------------------------------------
/vector3d/testbench_vector3d.cpp:
--------------------------------------------------------------------------------
  1 | /*************************  testbench_vector3d.cpp   **************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2019-07-14
  4 | * Last modified: 2022-07-20
  5 | * Version:       2.02.00
  6 | * Project:       Testbench for vector3d.h using vector class library
  7 | * Description:
  8 | * Compile and run this program to test operators and functions in vector3d.h package
  9 | * This file contains test cases for general operators and functions.
 10 | * Each function or operator is tested with many different combinations of input data.
 11 | *
 12 | * Instructions:
 13 | * The following parameters must be defined on the command line or added in the
 14 | * top of this file:
 15 | *
 16 | * vtype: Vector type to test
 17 | * testcase: A number defining a function or operator to test. See the cases in this file.
 18 | * seed:  Seed for random number generator. May be any integer
 19 | *
 20 | * Compile with any compiler supported by VCL.
 21 | * Specify the desired instruction set and optimization options as parameters
 22 | * to the compiler.
 23 | *
 24 | * (c) Copyright 2019-2022 Agner Fog.
 25 | * Apache license 2.0
 26 | ******************************************************************************
 27 | 
 28 | Test cases:
 29 | 1:   operator +
 30 | 2:   operator -
 31 | 3:   operator *
 32 | 4:   operator /
 33 | 5:   unary -
 34 | 8:   vector * real
 35 | 9:   vector / real
 36 | 11:  cross_product
 37 | 12:  dot_product
 38 | 13:  vector_length
 39 | 14:  normalize_vector
 40 | 15:  rotate
 41 | 16:  to_float
 42 | 17:  to_double
 43 | 20:  constructor from three coordinates
 44 | 21:  get_x
 45 | 22:  get_y
 46 | 23:  get_z
 47 | 24:  extract
 48 | 
 49 | *****************************************************************************/
 50 | 
 51 | #include <stdio.h>
 52 | #include <cmath>
 53 | #if defined (__linux__) && !defined(__LP64__)
 54 | #include <fpu_control.h>      // set floating point control word
 55 | #endif
 56 | 
 57 | #define MAX_VECTOR_SIZE 512
 58 | 
 59 | #ifndef INSTRSET
 60 | #define INSTRSET        10
 61 | #endif
 62 | #include <vectorclass.h>
 63 | #include "../special/vector3d/vector3d.h"    // 3-D vectors
 64 | 
 65 | 
 66 | #ifndef testcase 
 67 | // ---------------------------------------------------------------------------
 68 | //            Specify input parameters here if running from an IDE
 69 | // ----------------------------------------------------------------------------
 70 | 
 71 | #define testcase 1
 72 | 
 73 | #define vtype Vec3Dd 
 74 | 
 75 | #define seed 1
 76 | 
 77 | 
 78 | #endif  // testcase
 79 | 
 80 | 
 81 | // ----------------------------------------------------------------------------
 82 | //             Declarations
 83 | // ----------------------------------------------------------------------------
 84 | 
 85 | // dummy vectors used for getting vector types and element type
 86 | vtype dummyc;                                    // complex vector type
 87 | typedef decltype(dummyc.to_vector()) wtype;      // corresponding normal vector type
 88 | wtype dummyv;
 89 | typedef decltype(dummyv[0]) ST;                  // scalar type
 90 | ST a0, a1;                                       // scalar operands
 91 | const int maxvectorsize = 16;                    // max number of elements in a vector
 92 | //ST oplist[maxvectorsize];                      // operand vector
 93 | int jj0;                                         // copy of vector index
 94 | 
 95 | 
 96 | /************************************************************************
 97 | *
 98 | *                          Test cases
 99 | *
100 | ************************************************************************/
101 | 
102 | #if   testcase == 1    // +
103 | inline vtype testFunction(vtype const& a, vtype const& b) { 
104 |     return a + b; 
105 | }
106 | vtype referenceFunction(vtype a, vtype b) {
107 |     ST aa[4], bb[4], cc[4];
108 |     a.store(aa);  b.store(bb);
109 |     for (int i=0; i<4; i++) cc[i] = aa[i] + bb[i];
110 |     return vtype().load(cc);
111 | }
112 | 
113 | #elif testcase == 2    // - 
114 | inline vtype testFunction(vtype const& a, vtype const& b) { 
115 |     return a - b; 
116 | }
117 | vtype referenceFunction(vtype a, vtype b) {
118 |     ST aa[4], bb[4], cc[4];
119 |     a.store(aa);  b.store(bb);
120 |     for (int i=0; i<4; i++) cc[i] = aa[i] - bb[i];
121 |     return vtype().load(cc);
122 | }
123 | 
124 | #elif testcase == 3    // * 
125 | inline vtype testFunction(vtype const& a, vtype const& b) {
126 |     return a * b;
127 | }
128 | vtype referenceFunction(vtype a, vtype b) {
129 |     ST aa[4], bb[4], cc[4];
130 |     a.store(aa);  b.store(bb);
131 |     for (int i=0; i<4; i++) cc[i] = aa[i] * bb[i];
132 |     return vtype().load(cc);
133 | }
134 | 
135 | #elif testcase == 4    // /
136 | inline vtype testFunction(vtype const& a, vtype const& b) {
137 |     return a / b;
138 | }
139 | vtype referenceFunction(vtype a, vtype b) {
140 |     ST aa[4], bb[4], cc[4];
141 |     a.store(aa);  b.store(bb);
142 |     for (int i=0; i<4; i++) cc[i] = aa[i] / bb[i];
143 |     return vtype().load(cc);
144 | }
145 | 
146 | #elif testcase == 5    // unary -
147 | 
148 | inline vtype testFunction(vtype const& a, vtype const& b) { 
149 |     return -a;
150 | }
151 | vtype referenceFunction(vtype a, vtype b) {
152 |     ST aa[4], bb[4], cc[4] = {0};
153 |     a.store(aa);  b.store(bb);
154 |     for (int i=0; i<3; i++) cc[i] = -aa[i];
155 |     return vtype().load(cc);
156 | }
157 | 
158 | #elif testcase == 6    // vector - real
159 | inline vtype testFunction(vtype const& a, vtype const& b) { 
160 |     ST b0 = b.to_vector()[0];
161 |     return a - b0;
162 | }
163 | vtype referenceFunction(vtype a, vtype b) {
164 |     ST aa[4], bb[4], cc[4] = {0};
165 |     a.store(aa);  b.store(bb);
166 |     for (int i=0; i<3; i++) cc[i] = aa[i] - bb[0];
167 |     return vtype().load(cc);
168 | }
169 | 
170 | #elif testcase == 7    // real - vector
171 | inline vtype testFunction(vtype const& a, vtype const& b) { 
172 |     a0 = a.to_vector()[0];
173 |     return a0 - b; 
174 | }
175 | vtype referenceFunction(vtype a, vtype b) {
176 |     ST aa[4], bb[4], cc[4] = {0};
177 |     a.store(aa);  b.store(bb);
178 |     for (int i=0; i<4; i++) cc[i] = aa[0] - bb[i];
179 |     return vtype().load(cc);
180 | }
181 | 
182 | #elif testcase == 8    // vector * real
183 | inline vtype testFunction(vtype const& a, vtype const& b) { 
184 |     ST b0 = b.to_vector()[0];
185 |     return a * b0;
186 | }
187 | vtype referenceFunction(vtype a, vtype b) {
188 |     ST aa[4], bb[4], cc[4] = {0};
189 |     a.store(aa);  b.store(bb);
190 |     for (int i=0; i<4; i++) cc[i] = aa[i] * bb[0];
191 |     return vtype().load(cc);
192 | }
193 | 
194 | #elif testcase == 9    // vector / real
195 | inline vtype testFunction(vtype const& a, vtype const& b) { 
196 |     ST b0 = b.to_vector()[0];
197 |     return a / b0; 
198 | }
199 | vtype referenceFunction(vtype a, vtype b) {
200 |     ST aa[4], bb[4], cc[4] = {0};
201 |     a.store(aa);  b.store(bb);
202 |     for (int i=0; i<3; i++) cc[i] = aa[i] / bb[0];
203 |     return vtype().load(cc);
204 | }
205 | 
206 | #elif testcase == 10   // real / vector
207 | inline vtype testFunction(vtype const& a, vtype const& b) { 
208 |     a0 = a.to_vector()[0];
209 |     return a0 / b; 
210 | }
211 | vtype referenceFunction(vtype a, vtype b) { 
212 |     ST aa[4], bb[4], cc[4] = {0};
213 |     a.store(aa);  b.store(bb);
214 |     for (int i=0; i<3; i++) cc[i] = aa[0] / bb[i];
215 |     return vtype().load(cc);
216 | }
217 | 
218 | #elif testcase == 11    // cross_product
219 | inline vtype testFunction(vtype const& a, vtype const& b) { 
220 |     return cross_product(a, b);
221 | }
222 | vtype referenceFunction(vtype a, vtype b) { 
223 |     ST aa[4], bb[4], cc[4];
224 |     a.store(aa);  b.store(bb);
225 |     cc[0] = aa[1]*bb[2] - aa[2]*bb[1];
226 |     cc[1] = aa[2]*bb[0] - aa[0]*bb[2];
227 |     cc[2] = aa[0]*bb[1] - aa[1]*bb[0];
228 |     cc[3] = 0;
229 |     return vtype().load(cc);
230 | }
231 | #define FACCURACY 100000    // possible loss of precision
232 | 
233 | 
234 | #elif testcase == 12    // dot_product
235 | inline vtype testFunction(vtype const& a, vtype const& b) { 
236 |     return vtype(dot_product(a, b), 0, 0);
237 | }
238 | vtype referenceFunction(vtype a, vtype b) { 
239 |     ST aa[4], bb[4], cc[4] = {0};
240 |     a.store(aa);  b.store(bb);
241 |     ST sum = 0;
242 |     for (int i=0; i<3; i++) sum += aa[i] * bb[i];
243 |     cc[0] = sum;
244 |     return vtype().load(cc);
245 | }
246 | #define FACCURACY 4    // possible loss of precision
247 | 
248 | #elif testcase == 13    // vector_length
249 | inline vtype testFunction(vtype const& a, vtype const& b) { 
250 |     return vtype(vector_length(a), 0, 0);
251 | }
252 | vtype referenceFunction(vtype a, vtype b) { 
253 |     ST aa[4], cc[4] = {0};
254 |     a.store(aa); 
255 |     ST ssum = 0;
256 |     for (int i=0; i<3; i++) ssum += aa[i] * aa[i];
257 |     cc[0] = std::sqrt(ssum);
258 |     return vtype().load(cc);
259 | }
260 | #define FACCURACY 4    // possible loss of precision
261 | 
262 | #elif testcase == 14    // normalize_vector
263 | inline vtype testFunction(vtype const& a, vtype const& b) { 
264 |     return normalize_vector(a);
265 | }
266 | vtype referenceFunction(vtype a, vtype b) { 
267 |     ST aa[4], cc[4] = {0};
268 |     a.store(aa); 
269 |     ST ssum = 0;
270 |     for (int i=0; i<3; i++) ssum += aa[i] * aa[i];
271 |     ssum = std::sqrt(ssum);
272 |     for (int i=0; i<3; i++) cc[i] = aa[i] / ssum;
273 |     return vtype().load(cc);
274 | }
275 | #define FACCURACY 4    // possible loss of precision
276 | 
277 | #elif testcase == 15    // rotate
278 | inline vtype testFunction(vtype const& a, vtype const& b) { 
279 |     return rotate(b, a-vtype(1,1,1), b+vtype(2,2,2), a);
280 | } 
281 | 
282 | vtype referenceFunction(vtype a, vtype b) { 
283 |     ST aa[4], cc[4];
284 |     a.store(aa); 
285 |     ST R[4][4] = {{0}};  // rotation matrix
286 |     vtype c0 = b, c1 = a-vtype(1,1,1), c2 = b+vtype(2,2,2); // columns
287 |     R[0][0] = c0[0]; R[1][0] = c0[1]; R[2][0] = c0[2]; 
288 |     R[0][1] = c1[0]; R[1][1] = c1[1]; R[2][1] = c1[2]; 
289 |     R[0][2] = c2[0]; R[1][2] = c2[1]; R[2][2] = c2[2]; 
290 |     for (int i=0; i<3; i++) { // multiply matrix by column vector a
291 |         cc[i] = 0;
292 |         for (int j=0; j<3; j++) {
293 |             cc[i] += R[i][j] * aa[j];
294 |         }
295 |     }
296 |     cc[3] = 0;
297 |     return vtype().load(cc);
298 | }
299 | #define FACCURACY 1000    // possible loss of precision
300 | 
301 | #elif testcase == 16    // to_float
302 | inline vtype testFunction(vtype const& a, vtype const& b) { 
303 |     auto c = to_float(a);
304 |     return vtype(c[0], c[1], c[2]);
305 | } 
306 | 
307 | vtype referenceFunction(vtype a, vtype b) { 
308 |     return a;
309 | }
310 | #define FACCURACY 1.E9  // loss of precision when converting to single precision
311 | 
312 | 
313 | #elif testcase == 17    // to_double
314 | inline vtype testFunction(vtype const& a, vtype const& b) { 
315 |     auto c = to_double(a);
316 |     return vtype((float)c[0], (float)c[1], (float)c[2]);
317 | }
318 | 
319 | vtype referenceFunction(vtype a, vtype b) { 
320 |     return a;
321 | }
322 | 
323 | #elif testcase == 20    // constructor from three coordinates
324 | inline vtype testFunction(vtype const& a, vtype const& b) { 
325 |     ST aa[4];
326 |     a.store(aa);
327 |     return vtype(aa[0], aa[1], aa[2]);
328 | }
329 | vtype referenceFunction(vtype a, vtype b) { 
330 |     return a;
331 | } 
332 | 
333 | #elif testcase == 21    // get_x
334 | inline vtype testFunction(vtype const& a, vtype const& b) { 
335 |     ST c = a.get_x();
336 |     return vtype(c, 0, 0);
337 | }
338 | vtype referenceFunction(vtype a, vtype b) { 
339 |     ST aa[4];
340 |     a.store(aa);
341 |     return vtype(aa[0], 0, 0);
342 | }
343 | 
344 | #elif testcase == 22    // get_y
345 | inline vtype testFunction(vtype const& a, vtype const& b) { 
346 |     ST c = a.get_y();
347 |     return vtype(c, 0, 0);
348 | }
349 | vtype referenceFunction(vtype a, vtype b) { 
350 |     ST aa[4];
351 |     a.store(aa);
352 |     return vtype(aa[1], 0, 0);
353 | }
354 | 
355 | #elif testcase == 23    // get_z
356 | inline vtype testFunction(vtype const& a, vtype const& b) { 
357 |     ST c = a.get_z();
358 |     return vtype(c, 0, 0);
359 | }
360 | vtype referenceFunction(vtype a, vtype b) { 
361 |     ST aa[4];
362 |     a.store(aa);
363 |     return vtype(aa[2], 0, 0);
364 | }
365 | 
366 | #elif testcase == 24    // extract
367 | inline vtype testFunction(vtype const& a, vtype const& b) { 
368 |     uint32_t bb = uint32_t(b.get_x()) % 3;
369 |     //ST c = a.extract(bb % 3);
370 |     ST c = a[bb];
371 |     return vtype(c, 0, 0);
372 | }
373 | vtype referenceFunction(vtype a, vtype b) { 
374 |     uint32_t bb = uint32_t(b.get_x());
375 |     ST aa[4];
376 |     a.store(aa);
377 |     ST c = aa[bb % 3];
378 |     return vtype(c, 0, 0);
379 | }
380 | 
381 | #elif testcase == 25    // insert
382 | inline vtype testFunction(vtype const& a, vtype const& b) { 
383 |     uint32_t bb = uint32_t(b.get_x()) % 3;
384 |     vtype aa = a;
385 |     return aa.insert(bb, 9.5f);
386 | }
387 | vtype referenceFunction(vtype a, vtype b) { 
388 |     uint32_t bb = uint32_t(b.get_x()) % 3;
389 |     ST aa[4];
390 |     a.store(aa);
391 |     aa[bb] = 9.5f;
392 |     return vtype().load(aa);
393 | }
394 | 
395 | 
396 | 
397 | #else
398 | // End of test cases
399 | #error unknown test case
400 | #endif
401 | 
402 | 
403 | // ----------------------------------------------------------------------------
404 | //                           Overhead functions
405 | // ----------------------------------------------------------------------------
406 | 
int numerr = 0;                // number of errors found so far
const int maxerrors = 10;      // main() stops when numerr exceeds this limit
409 | 
410 | // type-specific load function
// Load the elements at p into x through the load member function of x
template <typename T, typename E>
inline void loadData(T & x, E const* p) {
    E const* const source = p;
    x.load(source);
}
415 | 
// Overload for arrays of bool: the elements are inserted one at a time
// because bool vectors have no load function
template <typename T>
inline void loadData(T & x, bool const* p) {
    int index = 0;
    while (index < x.size()) {
        x.insert(index, p[index]);
        ++index;
    }
}
422 | 
423 | 
424 | // type-specific printing functions
425 | 
// Print a float with 7 significant digits in a field at least 10 characters wide
void printVal(float x) {
    double const widened = x;          // the same promotion that printf applies to a float
    printf("%10.7G", widened);
}
429 | 
// Print a double with 7 significant digits in a field at least 10 characters wide
void printVal(double x) {
    double const value = x;
    printf("%10.7G", value);
}
433 | 
// Print a bool as 0 or 1
void printVal(bool x) {
    int const digit = x ? 1 : 0;
    printf("%i", digit);
}
437 | 
438 | // Random number generator
// Random number generator using the multiply-with-carry method.
// The same seed always gives the same sequence of numbers
class ranGen {
    uint64_t x, carry;                 // state of the multiply-with-carry generator
public:
    // Constructor: initializes the state from Seed and discards the first two numbers
    ranGen(int Seed) : x(Seed), carry(1765) {
        next();
        next();
    }
    // Get the next 32 random bits
    uint32_t next() {
        const uint32_t fac = 3947008974u;            // multiplier
        const uint64_t product = x * fac + carry;    // 64-bit intermediate result
        carry = product >> 32;                       // upper half is the new carry
        x = uint32_t(product);                       // lower half is the new number
        return uint32_t(x);
    }
};
455 | 
456 | template <typename T>  // get random number of type T
457 | T get_random(ranGen & rangen) {
458 |     return (T)rangen.next();
459 | }
460 | 
461 | template <>  // special case uint64_t
462 | uint64_t get_random<uint64_t>(ranGen & rangen) {
463 |     uint64_t xx;
464 |     xx = (uint64_t)rangen.next() << 32;
465 |     xx |= rangen.next();
466 |     return xx;
467 | }
468 | 
469 | template <>  // special case int64_t
470 | int64_t get_random<int64_t>(ranGen & rangen) {
471 |     return (int64_t)get_random<uint64_t>(rangen);
472 | }
473 | 
474 | template <>  // special case float
475 | float get_random<float>(ranGen & rangen) {
476 |     union Uif {
477 |         uint32_t i;
478 |         float f;
479 |     };
480 |     Uif u1, u2;
481 |     uint32_t r = rangen.next();                  // get 32 random bits
482 |     // Insert exponent and random mantissa to get random number in the interval 1 <= x < 2
483 |     // Subtract 1.0 if next bit is 0, or 1.0 - 2^-24 = 0.99999994f if next bit is 1
484 |     u1.i = 0x3F800000 - ((r >> 8) & 1);          // bit 8
485 |     u2.i = (r >> 9) | 0x3F800000;                // bit 9 - 31
486 |     return u2.f - u1.f;
487 | }
488 | 
489 | template <>  // special case float
490 | double get_random<double>(ranGen & rangen) {
491 |     union Uqd {
492 |         uint64_t q;
493 |         double d;
494 |     };
495 |     Uqd u1;
496 |     uint64_t r = get_random<uint64_t>(rangen);   // get 64 random bits
497 |     // Insert exponent and random mantissa to get random number in the interval 1 <= x < 2,
498 |     // then subtract 1.0 to get the interval 0 <= x < 1.
499 |     u1.q = (r >> 12) | 0x3FF0000000000000;       // bit 12 - 63
500 |     return u1.d - 1.0;
501 | }
502 | template <>  // special case bool
503 | bool get_random<bool>(ranGen & rangen) {
504 |     return (rangen.next() & 1) != 0;
505 | }
506 | 
507 | 
// Global random number generator instance, initialized with the seed parameter.
// The TestData constructor draws its random values from this instance.
ranGen ran(seed);
510 | 
511 | // bit_cast function to make special values
// Reinterpret the bits of a 32-bit integer as a float (uint32_t -> float)
float bit_castf(uint32_t x) {
    union {
        float f;
        uint32_t i;
    } pun;
    pun.i = x;
    return pun.f;
}
520 | 
// Reinterpret the bits of a 64-bit integer as a double (uint64_t -> double)
double bit_castd(uint64_t x) {
    union {
        double f;
        uint64_t i;
    } pun;
    pun.i = x;
    return pun.f;
}
529 | 
530 | 
531 | // template to generate list of testdata
532 | template <typename T>
533 | class TestData {
534 | public:
535 |     enum LS {
536 |         // define array size. Must be a multiple of vector size:
537 |         listsize = 1024
538 |     };
539 |     TestData() {                            // constructor
540 |         int i;                              // loop counter
541 |         if (T(1.1f) != 1) {
542 |             // floating point type
543 |             // fill boundary data into array
544 |             for (i = 0; i < 20; i++) {
545 |                 list[i] = T((i - 4) * T(0.25));
546 |             }
547 | #ifdef TESTNAN   // test also with NAN, INF, and other special data
548 |             // additional special values, float:
549 |             if constexpr (sizeof(ST) == 4) {
550 |                 list[i++] = (T)bit_castf(0x80000000);   // -0
551 |                 list[i++] = (T)bit_castf(0x00800000);   // smallest positive normal number
552 |                 list[i++] = (T)bit_castf(0x80800000);   // largest negative normal number
553 |                 list[i++] = (T)bit_castf(0x3F7FFFFF);   // nextafter 1.0, 0
554 |                 list[i++] = (T)bit_castf(0x3F800001);   // nextafter 1.0, 2
555 |                 list[i++] = (T)bit_castf(0x7F800000);   // inf
556 |                 list[i++] = (T)bit_castf(0xFF800000);   // -inf
557 |                 list[i++] = (T)bit_castf(0x7FF00000);   // nan
558 |             }
559 |             else { // double
560 |                 list[i++] = (T)bit_castd(0x8000000000000000);   // -0
561 |                 list[i++] = (T)bit_castd(0x0010000000000000);   // smallest positive normal number
562 |                 list[i++] = (T)bit_castd(0x8010000000000000);   // largest negative normal number
563 |                 list[i++] = (T)bit_castd(0x3FEFFFFFFFFFFFFF);   // nextafter 1.0, 0
564 |                 list[i++] = (T)bit_castd(0x3FF0000000000001);   // nextafter 1.0, 2
565 |                 list[i++] = (T)bit_castd(0x7FF0000000000000);   // inf
566 |                 list[i++] = (T)bit_castd(0xFFF0000000000000);   // -inf
567 |                 list[i++] = (T)bit_castd(0x7FFC000000000000);   // nan
568 |             }
569 | #endif
570 |             // fill random data into rest of array
571 |             for (; i < listsize; i++) {
572 |                 list[i] = get_random<T>(ran) * (T)100;
573 |             }
574 |         }
575 |         else {
576 |             // integer type
577 |             // fill boundary data into array
578 |             for (i = 0; i < 6; i++) {
579 |                 list[i] = T(i - 2);
580 |             }
581 |             // data near mid-point of unsigned integers, or overflow point of signed integers:
582 |             uint64_t m = (uint64_t(1) << (sizeof(T) * 8 - 1)) - 2;
583 |             for (; i < 11; i++) {
584 |                 list[i] = T(m++);
585 |             }
586 |             // fill random data into rest of array
587 |             for (; i < listsize; i++) {
588 |                 list[i] = get_random<T>(ran);
589 |             }
590 |         }
591 |     }
592 |     T list[listsize];                  // array of test data
593 |     int size() {                       // get list size
594 |         return listsize;
595 |     }
596 | };
597 | 
598 | 
599 | // get value of least significant bit
600 | float delta_unit(float x) {
601 |     union {
602 |         float f;
603 |         uint32_t i;
604 |     } u;
605 |     x = fabsf(x);
606 |     Vec4f xv = Vec4f(x);
607 |     if (!(is_finite(xv)[0])) return 1.f;
608 |     if (x == 0.f || is_subnormal(xv)[0]) {
609 |         u.i = 0x00800000;              // smallest positive normal number
610 |         return u.f;
611 |     }
612 |     float x1 = x;
613 |     u.f = x;
614 |     u.i++;
615 |     return u.f - x1;
616 | }
617 | 
618 | double delta_unit(double x) {
619 |     union {
620 |         double f;
621 |         uint64_t i;
622 |     } u;
623 |     x = fabs(x);
624 |     Vec2d xv = Vec2d(x);
625 |     if (!(is_finite(xv)[0])) return 1.;
626 |     if (x == 0. || is_subnormal(xv)[0]) {
627 |         u.i = 0x0010000000000000;      // smallest positive normal number
628 |         return u.f;
629 |     }
630 |     double x1 = x;
631 |     u.f = x;
632 |     u.i++;
633 |     return u.f - x1;
634 | }
635 | 
636 | 
// Compare two scalars. Returns true if they are equal
template <typename T>
inline bool compare_scalars(T const a, T const b) {
    bool const same = (a == b);
    return same;
}
642 | 
643 | // special cases for float and double:
644 | template <>
645 | inline bool compare_scalars<float>(float const a, float const b) {
646 |     if (a == b || (a != a && b != b)) return true; // return false if equal or both are NAN
647 | #ifdef FACCURACY     // accept minor difference
648 |     float dif = fabsf(a - b) / delta_unit(a);
649 |     if (dif <= FACCURACY) return true;
650 |     printf("\n%.0f ULP ", dif);
651 | #endif
652 |     return false;
653 | }
654 | 
655 | template <>
656 | inline bool compare_scalars<double>(double const a, double const b) {
657 |     if (a == b || (a != a && b != b)) return true; // return false if equal or both are NAN
658 | #ifdef FACCURACY     // accept minor difference
659 |     double dif = fabs(a - b) / delta_unit(a);
660 |     if (dif <= FACCURACY) return true;
661 |     printf("\n%.0f ULP ", dif);
662 | #endif
663 |     return false;
664 | }
665 | 
// Compare two vectors element by element with compare_scalars.
// Returns true if all elements are accepted as equal, false at the first
// element that is not
template <typename T>
inline bool compare_vectors(T const& a, T const& b) {
    int const count = a.size();
    for (int k = 0; k < count; ++k) {
        bool const same = compare_scalars(a[k], b[k]);
        if (!same) return false;
    }
    return true;
}
676 | 
677 | #ifndef FACCURACY
678 | #define FACCURACY 1
679 | #endif
680 | 
681 | // compare two vectors. return true if different
682 | inline ST compare_vect3(vtype const& a, vtype const& b) {
683 |     ST alist[4], blist[4];
684 |     a.store(alist);  b.store(blist);
685 |     ST dif, dif0 = 0;
686 |     for (int i = 0; i < 3; i++) {
687 |         ST r = fabs(blist[i]);
688 |         if (r < 1.E-2) r = 1;          // use relative error for results near zero
689 |         dif = ST(fabs(alist[i] - blist[i]) / delta_unit(r));
690 |         if (dif > dif0) dif0 = dif;
691 |     }
692 |     if (dif0 > FACCURACY) return dif0;
693 |     return 0;
694 | }
695 | 
696 | // program entry
697 | int main() {
698 |     //const int vectorsize = vtype::size();
699 | 
700 | #if defined (__linux__) && !defined(__LP64__)
701 |     // Some 32-bit compilers use x87 calculations with long double precision for 
702 |     // the reference function. This may give slightly different results because
703 |     // the value is rounded twice. To get exactly the same value in the test function
704 |     // and the reference function, we change the precision of x87 calculations.
705 |     // (the fpu control function is different in Windows, but the precision is already
706 |     // reduced in Windows anyway)
707 |     fpu_control_t fpcw = 0x27f;
708 |     _FPU_SETCW(fpcw);
709 | #endif
710 | 
711 |     vtype a, b, result, ref;           // complex vectors for operands and result
712 | 
713 |     // make lists of test data
714 |     TestData<ST> adata, bdata;
715 | 
716 |     int i, j, k = 0;                   // loop counters
717 | 
718 |     for (i = 0; i < adata.size(); i += wtype::size()) {
719 |         //a.load(adata.list + i);
720 |         loadData(a, adata.list + i);
721 | 
722 |         for (j = 0; j < bdata.size(); j += wtype::size()) {
723 |             loadData(b, bdata.list + j);
724 |             jj0 = j;
725 | 
726 |             // function under test:
727 |             result = testFunction(a, b);
728 |             ref = referenceFunction(a, b);
729 |             ST dif = compare_vect3(result, ref);
730 |             if (dif != 0) {
731 |                 // values are different. report error 
732 |                 if (++numerr == 1) {
733 |                     printf("\ntest case %i:", testcase);  // print test case first time
734 |                 }
735 |                 ST alist[4], blist[4], tlist[4], rlist[4];
736 |                 a.store(alist); b.store(blist); result.store(tlist); ref.store(rlist);
737 |                 printf("\nError at %i, %i, dif = %.2G:", i, j, dif);
738 |                 for (k = 0; k < 4; k++) {
739 |                     printf("\n%7.4G op %7.4G -> %7.4G, expected %7.4G)",
740 |                         alist[k], blist[k], tlist[k], rlist[k]);
741 |                 }
742 |             }
743 |             if (numerr > maxerrors) {
744 |                 exit(1);               // stop after maxerrors
745 |             }
746 |         }
747 |     }
748 | 
749 |     if (numerr == 0) {
750 |         printf("\nsuccess\n");
751 |     }
752 |     printf("\n");
753 | 
754 |     return numerr;
755 | }
756 | 


--------------------------------------------------------------------------------