├── LICENSE
└── snoise3.c


/LICENSE:
--------------------------------------------------------------------------------
 1 | Redistribution and use in source and binary forms, with or without
 2 | modification, are permitted provided that the following conditions are met:
 3 |     * Redistributions of source code must retain this list of conditions
 4 |       and the following disclaimer.
 5 |     * Redistributions in binary form must reproduce this list of conditions
 6 |       and the following disclaimer in the documentation and/or other
 7 |       materials provided with the distribution.
 8 |     * The names of its contributors may not be used to endorse or
 9 |       promote products derived from this software without specific
10 |       prior written permission.
11 | 
12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
13 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
14 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
15 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
16 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
17 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
18 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
19 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
20 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
22 | 


--------------------------------------------------------------------------------
/snoise3.c:
--------------------------------------------------------------------------------
#include <stdlib.h>

#include "xmmintrin.h"
#include "emmintrin.h"
  3 | 
  4 | typedef float v4sf __attribute__ ((vector_size (16)));
  5 | typedef int v4si __attribute__ ((vector_size (16)));
  6 | 
  7 | typedef struct {
  8 |   int offsets[8][2][4];
  9 |   unsigned char *perm, *mperm; // perm mod 12
 10 | } NoiseContext;
 11 | 
 12 | #define LET(A, B) typeof(B) A = B
 13 | 
 14 | static v4sf vec4f(float a, float b, float c, float d) {
 15 |   return (v4sf) _mm_set_ps(d, c, b, a);
 16 | }
 17 | 
 18 | static v4sf vec1_4f(float f) {
 19 |   return (v4sf) _mm_set1_ps(f);
 20 | }
 21 | 
 22 | static int isum(v4si vec) {
 23 |   int i[4];
 24 |   *(v4si*) &i = vec;
 25 |   return i[0] + i[1] + i[2];
 26 | }
 27 | 
 28 | static float sum3(v4sf vec) {
 29 |   float f[4];
 30 |   *(v4sf*) &f = vec;
 31 |   return f[0] + f[1] + f[2];
 32 | }
 33 | 
 34 | static float sum4(v4sf vec) {
 35 |   float f[4];
 36 |   *(v4sf*) &f = vec;
 37 |   return f[0] + f[1] + f[2] + f[3];
 38 | }
 39 | 
 40 | void permsetup(NoiseContext *nc) {
 41 |   int i, k, l;
 42 |   
 43 |   nc->perm = malloc(sizeof(unsigned char) * 256);
 44 |   nc->mperm = malloc(sizeof(unsigned char) * 256);
 45 |   
 46 |   {
 47 |     unsigned char permfill[256] = {162, 43, 153, 52, 83, 210, 193, 75, 227, 195, 233, 76, 83, 48, 252, 181, 101, 31, 13, 32, 38, 23, 72, 101, 100, 145, 105, 218, 135, 89, 39, 100, 162, 196, 51, 18, 185, 138, 76, 83, 228, 229, 128, 101, 76, 111, 68, 227, 114, 123, 72, 98, 219, 161, 8, 86, 212, 50, 219, 166, 139, 195, 195, 128, 74, 250, 154, 110, 150, 175, 36, 25, 96, 123, 101, 12, 236, 158, 227, 199, 77, 156, 6, 159, 203, 92, 27, 60, 155, 218, 239, 156, 184, 90, 213, 115, 38, 18, 39, 102, 191, 87, 177, 47, 64, 28, 224, 252, 176, 9, 111, 208, 112, 50, 78, 123, 243, 248, 99, 112, 52, 142, 253, 93, 30, 111, 56, 104, 217, 3, 204, 188, 144, 143, 155, 228, 55, 249, 45, 9, 152, 26, 250, 2, 135, 30, 4, 169, 30, 208, 56, 255, 15, 123, 237, 170, 17, 71, 182, 203, 246, 162, 184, 164, 103, 77, 49, 174, 186, 159, 201, 216, 41, 92, 246, 158, 112, 79, 99, 101, 231, 46, 88, 81, 94, 23, 24, 103, 43, 224, 151, 173, 217, 142, 64, 78, 203, 110, 151, 49, 22, 107, 3, 44, 110, 151, 253, 142, 125, 247, 3, 239, 42, 23, 238, 102, 114, 104, 58, 227, 164, 31, 214, 84, 98, 159, 67, 181, 19, 144, 133, 213, 19, 122, 245, 42, 217, 205, 0, 87, 104, 122, 35, 238, 96, 93, 116, 177, 56, 201, 147, 156, 229, 219, 16, 128};
 48 |     for (i = 0; i < 256; ++i) {
 49 |         nc->perm[i] = permfill[i];
 50 |         nc->mperm[i] = nc->perm[i] % 12;
 51 |     }
 52 |   }
 53 |   
 54 |   static int offs_init[8][2][4]
 55 |     = {
 56 |       {{1, 0, 0, 0}, {1, 1, 0, 0}},
 57 |       {{0, 1, 0, 0}, {1, 1, 0, 0}},
 58 |       {{0, 0, 0, 0}, {0, 0, 0, 0}},
 59 |       {{0, 1, 0, 0}, {0, 1, 1, 0}},
 60 |       {{1, 0, 0, 0}, {1, 0, 1, 0}},
 61 |       {{0, 0, 0, 0}, {0, 0, 0, 0}},
 62 |       {{0, 0, 1, 0}, {1, 0, 1, 0}},
 63 |       {{0, 0, 1, 0}, {0, 1, 1, 0}}
 64 |     };
 65 |   for (i = 0; i < 8; ++i)
 66 |     for (k = 0; k < 2; ++k)
 67 |       for (l = 0; l < 4; ++l)
 68 |         nc->offsets[i][k][l] = offs_init[i][k][l];
 69 | }
 70 | 
 71 | float noise3(float x, float y, float z, NoiseContext *nc) __attribute__ ((force_align_arg_pointer));
 72 | float noise3(float x, float y, float z, NoiseContext *nc) {
 73 |   v4sf vs[4], vsum;
 74 |   int gi[4], mask, c;
 75 |   v4sf v0;
 76 |   v4sf v = vec4f(x, y, z, 0);
 77 |   v4si indices;
 78 |   
 79 |   vsum = v + vec1_4f(sum3(v) / 3);
 80 |   indices = __builtin_ia32_psubd128 (__builtin_ia32_cvttps2dq(vsum), __builtin_ia32_psrldi128 ((v4si) vsum, 31));
 81 |   vs[0] = v - __builtin_ia32_cvtdq2ps(indices) + vec1_4f(isum(indices) / 6.0f);
 82 |   vs[1] = vs[0] + vec1_4f(     1.0f/6.0f);
 83 |   vs[2] = vs[0] + vec1_4f(     2.0f/6.0f);
 84 |   vs[3] = vs[0] + vec1_4f(-1.0f + 3.0f/6.0f);
 85 |   v4sf xxy = __builtin_ia32_shufps(vs[0], vs[0], _MM_SHUFFLE(0, 1, 0, 0));
 86 |   v4sf yzz = __builtin_ia32_shufps(vs[0], vs[0], _MM_SHUFFLE(0, 2, 2, 1));
 87 |   mask = __builtin_ia32_movmskps(__builtin_ia32_cmpltps(xxy, yzz));
 88 |   LET(opp, &nc->offsets[mask & 7]);
 89 |   #define op (*opp)
 90 |   #define offs1 (op[0])
 91 |   #define offs2 (op[1])
 92 |   vs[1] -= __builtin_ia32_cvtdq2ps(*(v4si*)&offs1);
 93 |   vs[2] -= __builtin_ia32_cvtdq2ps(*(v4si*)&offs2);
 94 |   int indexfield[4]; *(typeof(indices)*) indexfield = indices;
 95 |   #define ii indexfield[0]
 96 |   #define jj indexfield[1]
 97 |   #define kk indexfield[2]
 98 |   #define i1 offs1[0]
 99 |   #define i2 offs2[0]
100 |   #define j1 offs1[1]
101 |   #define j2 offs2[1]
102 |   #define k1 offs1[2]
103 |   #define k2 offs2[2]
104 |   LET(mperm, nc->mperm);
105 |   LET(perm, nc->perm);
106 |   gi[0] = mperm[(perm[(perm[(kk   )&0xff]+jj   )&0xff]+ii   )&0xff];
107 |   gi[1] = mperm[(perm[(perm[(kk+k1)&0xff]+jj+j1)&0xff]+ii+i1)&0xff];
108 |   gi[2] = mperm[(perm[(perm[(kk+k2)&0xff]+jj+j2)&0xff]+ii+i2)&0xff];
109 |   gi[3] = mperm[(perm[(perm[(kk+1 )&0xff]+jj+1 )&0xff]+ii+1 )&0xff];
110 |   float factors[4];
111 |   float pair[3], res[4];
112 |   pair[0] = 1; pair[1] = -1; pair[2] = -1;
113 |   for (c = 0; c < 4; ++c) {
114 |     LET(vscp, &(vs[c]));
115 |     LET(current, *vscp);
116 |     {
117 |       LET(A, current * current);
118 |       LET(B, __builtin_ia32_shufps(A, A, _MM_SHUFFLE(1, 1, 1, 1)));
119 |       LET(C, __builtin_ia32_shufps(A, A, _MM_SHUFFLE(2, 2, 2, 2)));
120 |       LET(D, A + B + C);
121 |       LET(E, vec1_4f(0.6f) - D);
122 |       factors[c] = *(float*) &E;
123 |     }
124 |     if (factors[c] >= 0) {
125 |       int id = gi[c];
126 |       res[c] = (((float*)vscp)[id >> 3] * pair[id & 1]) + (((float*)vscp)[(((id >> 2) | (id >> 3)) & 1) + 1] * pair[id&2]);
127 |     } else {
128 |       factors[c] = 0;
129 |       res[c] = 0;
130 |     }
131 |   }
132 |   v4sf vfactors = vec4f(factors[0], factors[1], factors[2], factors[3]);
133 |   vfactors *= vfactors;
134 |   vfactors *= vfactors;
135 |   v4sf vres = vec4f(res[0], res[1], res[2], res[3]);
136 |   vres *= vfactors;
137 |   return 0.5f + 16 * sum4(vres);
138 | }
139 | 


--------------------------------------------------------------------------------