├── LICENSE └── snoise3.c /LICENSE: -------------------------------------------------------------------------------- 1 | Redistribution and use in source and binary forms, with or without 2 | modification, are permitted provided that the following conditions are met: 3 | * Redistributions of source code must retain this list of conditions 4 | and the following disclaimer. 5 | * Redistributions in binary form must reproduce this list of conditions 6 | and the following disclaimer in the documentation and/or other 7 | materials provided with the distribution. 8 | * The names of its contributors may not be used to endorse or 9 | promote products derived from this software without specific 10 | prior written permission. 11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 13 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 14 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 15 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY 16 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 17 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 18 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 19 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 20 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* snoise3.c -- 3D gradient noise evaluated on a skewed lattice, using SSE/SSE2 intrinsics. */
#include <stdlib.h>
#include <string.h>

#include <xmmintrin.h>
#include <emmintrin.h>

/* GCC-style vector typedefs, retained for source compatibility. */
typedef float v4sf __attribute__ ((vector_size (16)));
typedef int v4si __attribute__ ((vector_size (16)));

/*
 * Lookup tables used by noise3().
 *
 * offsets: for each of the 8 possible comparison masks, the integer lattice
 *          offsets (x, y, z, unused) of the second and third corner.
 * perm:    256-entry byte table used for hashing lattice coordinates.
 * mperm:   perm reduced modulo 12 (selects one of 12 gradients).
 *
 * perm and mperm are heap blocks obtained with malloc() by permsetup();
 * the owner of the context releases them with free().
 */
typedef struct {
	int offsets[8][2][4];
	unsigned char *perm, *mperm; // perm mod 12
} NoiseContext;

enum { PERM_SIZE = 256 };

/* Sum of lanes 0..2 of an integer vector; lane 3 is ignored. */
static int isum(__m128i vec)
{
	int lane[4];

	/* Unaligned store: a plain int array is not guaranteed 16-byte alignment. */
	_mm_storeu_si128((__m128i *)lane, vec);
	return lane[0] + lane[1] + lane[2];
}

/* Sum of lanes 0..2 of a float vector; lane 3 is ignored. */
static float sum3(__m128 vec)
{
	float lane[4];

	_mm_storeu_ps(lane, vec);
	return lane[0] + lane[1] + lane[2];
}

/* Sum of all four lanes of a float vector, added left to right. */
static float sum4(__m128 vec)
{
	float lane[4];

	_mm_storeu_ps(lane, vec);
	return lane[0] + lane[1] + lane[2] + lane[3];
}

/*
 * Hash a lattice coordinate through the permutation table and return the
 * gradient index (0..11) stored in mperm.
 *
 * The arithmetic is done in unsigned so that extreme coordinates wrap instead
 * of overflowing a signed int; only the low 8 bits are ever used.
 */
static int gradient_index(const NoiseContext *nc, unsigned i, unsigned j, unsigned k)
{
	const unsigned char *perm = nc->perm;
	unsigned h = perm[k & 0xffu];

	h = perm[(h + j) & 0xffu];
	return nc->mperm[(h + i) & 0xffu];
}

/*
 * Initialise a NoiseContext: fills the offsets table and allocates and fills
 * the perm / mperm tables (256 bytes each).
 *
 * If either allocation fails, both pointers are left NULL and nothing is
 * leaked; noise3() must not be called on such a context, so callers that care
 * should check nc->perm after this returns.
 */
void permsetup(NoiseContext *nc)
{
	static const unsigned char permfill[PERM_SIZE] = {
		162,  43, 153,  52,  83, 210, 193,  75, 227, 195, 233,  76,  83,  48, 252, 181,
		101,  31,  13,  32,  38,  23,  72, 101, 100, 145, 105, 218, 135,  89,  39, 100,
		162, 196,  51,  18, 185, 138,  76,  83, 228, 229, 128, 101,  76, 111,  68, 227,
		114, 123,  72,  98, 219, 161,   8,  86, 212,  50, 219, 166, 139, 195, 195, 128,
		 74, 250, 154, 110, 150, 175,  36,  25,  96, 123, 101,  12, 236, 158, 227, 199,
		 77, 156,   6, 159, 203,  92,  27,  60, 155, 218, 239, 156, 184,  90, 213, 115,
		 38,  18,  39, 102, 191,  87, 177,  47,  64,  28, 224, 252, 176,   9, 111, 208,
		112,  50,  78, 123, 243, 248,  99, 112,  52, 142, 253,  93,  30, 111,  56, 104,
		217,   3, 204, 188, 144, 143, 155, 228,  55, 249,  45,   9, 152,  26, 250,   2,
		135,  30,   4, 169,  30, 208,  56, 255,  15, 123, 237, 170,  17,  71, 182, 203,
		246, 162, 184, 164, 103,  77,  49, 174, 186, 159, 201, 216,  41,  92, 246, 158,
		112,  79,  99, 101, 231,  46,  88,  81,  94,  23,  24, 103,  43, 224, 151, 173,
		217, 142,  64,  78, 203, 110, 151,  49,  22, 107,   3,  44, 110, 151, 253, 142,
		125, 247,   3, 239,  42,  23, 238, 102, 114, 104,  58, 227, 164,  31, 214,  84,
		 98, 159,  67, 181,  19, 144, 133, 213,  19, 122, 245,  42, 217, 205,   0,  87,
		104, 122,  35, 238,  96,  93, 116, 177,  56, 201, 147, 156, 229, 219,  16, 128
	};
	/* Indexed by the 3-bit mask (x<y, x<z, y<z) computed in noise3(). */
	static const int offs_init[8][2][4] = {
		{{1, 0, 0, 0}, {1, 1, 0, 0}},
		{{0, 1, 0, 0}, {1, 1, 0, 0}},
		{{0, 0, 0, 0}, {0, 0, 0, 0}},
		{{0, 1, 0, 0}, {0, 1, 1, 0}},
		{{1, 0, 0, 0}, {1, 0, 1, 0}},
		{{0, 0, 0, 0}, {0, 0, 0, 0}},
		{{0, 0, 1, 0}, {1, 0, 1, 0}},
		{{0, 0, 1, 0}, {0, 1, 1, 0}}
	};
	int i;

	memcpy(nc->offsets, offs_init, sizeof nc->offsets);

	nc->perm = malloc(PERM_SIZE);
	nc->mperm = malloc(PERM_SIZE);
	if (nc->perm == NULL || nc->mperm == NULL) {
		free(nc->perm);
		free(nc->mperm);
		nc->perm = NULL;
		nc->mperm = NULL;
		return;
	}

	for (i = 0; i < PERM_SIZE; ++i) {
		nc->perm[i] = permfill[i];
		nc->mperm[i] = permfill[i] % 12;
	}
}

/*
 * Evaluate the noise function at (x, y, z).
 *
 * nc must have been initialised with permsetup(); it may live at any address
 * suitably aligned for NoiseContext (no 16-byte alignment is required).
 * Returns 0.5 + 16 * (sum of the four corner contributions).
 */
float noise3(float x, float y, float z, NoiseContext *nc) __attribute__ ((force_align_arg_pointer));
float noise3(float x, float y, float z, NoiseContext *nc)
{
	static const float pair[3] = { 1.0f, -1.0f, -1.0f };
	__m128 corner[4];
	float factors[4], res[4];
	int gi[4], cell[4];
	int c;

	const __m128 v = _mm_set_ps(0.0f, z, y, x);

	/* Skew the input, then take the containing cell: truncate toward zero and
	 * subtract the sign bit (floor for non-integral values). */
	const __m128 skewed = _mm_add_ps(v, _mm_set1_ps(sum3(v) / 3));
	const __m128i indices = _mm_sub_epi32(_mm_cvttps_epi32(skewed),
	                                      _mm_srli_epi32(_mm_castps_si128(skewed), 31));

	/* Offsets from the input point to the four corners, before the
	 * per-ordering integer offsets of corners 1 and 2 are applied. */
	corner[0] = _mm_add_ps(_mm_sub_ps(v, _mm_cvtepi32_ps(indices)),
	                       _mm_set1_ps(isum(indices) / 6.0f));
	corner[1] = _mm_add_ps(corner[0], _mm_set1_ps( 1.0f/6.0f));
	corner[2] = _mm_add_ps(corner[0], _mm_set1_ps( 2.0f/6.0f));
	corner[3] = _mm_add_ps(corner[0], _mm_set1_ps(-1.0f + 3.0f/6.0f));

	/* mask bit 0: x < y, bit 1: x < z, bit 2: y < z (lane 3 compares x < x). */
	const __m128 xxy = _mm_shuffle_ps(corner[0], corner[0], _MM_SHUFFLE(0, 1, 0, 0));
	const __m128 yzz = _mm_shuffle_ps(corner[0], corner[0], _MM_SHUFFLE(0, 2, 2, 1));
	const int mask = _mm_movemask_ps(_mm_cmplt_ps(xxy, yzz));

	const int *offs1 = nc->offsets[mask & 7][0];
	const int *offs2 = nc->offsets[mask & 7][1];

	/* Unaligned loads: the offsets table only has int alignment. */
	corner[1] = _mm_sub_ps(corner[1],
	                       _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i *)offs1)));
	corner[2] = _mm_sub_ps(corner[2],
	                       _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i *)offs2)));

	_mm_storeu_si128((__m128i *)cell, indices);
	{
		const unsigned ci = (unsigned)cell[0];
		const unsigned cj = (unsigned)cell[1];
		const unsigned ck = (unsigned)cell[2];

		gi[0] = gradient_index(nc, ci, cj, ck);
		gi[1] = gradient_index(nc, ci + (unsigned)offs1[0],
		                           cj + (unsigned)offs1[1],
		                           ck + (unsigned)offs1[2]);
		gi[2] = gradient_index(nc, ci + (unsigned)offs2[0],
		                           cj + (unsigned)offs2[1],
		                           ck + (unsigned)offs2[2]);
		gi[3] = gradient_index(nc, ci + 1u, cj + 1u, ck + 1u);
	}

	for (c = 0; c < 4; ++c) {
		/* factor = 0.6 - (x*x + y*y + z*z), read from lane 0. */
		const __m128 sq   = _mm_mul_ps(corner[c], corner[c]);
		const __m128 sq_y = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(1, 1, 1, 1));
		const __m128 sq_z = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(2, 2, 2, 2));
		const __m128 dist = _mm_add_ps(_mm_add_ps(sq, sq_y), sq_z);

		factors[c] = _mm_cvtss_f32(_mm_sub_ps(_mm_set1_ps(0.6f), dist));

		if (factors[c] >= 0) {
			const int id = gi[c];
			float comp[4];

			_mm_storeu_ps(comp, corner[c]);
			/* id selects two of the three components and a sign for each:
			 * 0..3 -> (x, y), 4..7 -> (x, z), 8..11 -> (y, z). */
			res[c] = (comp[id >> 3] * pair[id & 1])
			       + (comp[(((id >> 2) | (id >> 3)) & 1) + 1] * pair[id & 2]);
		} else {
			factors[c] = 0;
			res[c] = 0;
		}
	}

	{
		/* Weight each corner by factor^4 and accumulate. */
		__m128 vfactors = _mm_set_ps(factors[3], factors[2], factors[1], factors[0]);
		__m128 vres = _mm_set_ps(res[3], res[2], res[1], res[0]);

		vfactors = _mm_mul_ps(vfactors, vfactors);
		vfactors = _mm_mul_ps(vfactors, vfactors);
		vres = _mm_mul_ps(vres, vfactors);
		return 0.5f + 16 * sum4(vres);
	}
}