├── README.md ├── image ├── car.jpg ├── face.jpg ├── peizhi.jpg └── weixin.jpg ├── resources ├── SSE-指令集.pdf └── SSE指令集补充.md ├── speed_bicubic_zoom_sse.cpp ├── speed_box_filter_sse.cpp ├── speed_common_functions.cpp ├── speed_gaussian_filter_sse.cpp ├── speed_histogram_algorithm_framework ├── BoxFilter.h ├── Core.h ├── MaxFilter.h ├── SelectiveBlur.h └── Utility.h ├── speed_integral_graph_sse.cpp ├── speed_max_filter_sse.cpp ├── speed_median_filter_3x3_sse.cpp ├── speed_multi_scale_detail_boosting_see.cpp ├── speed_rgb2gray_sse.cpp ├── speed_rgb2yuv_sse.cpp ├── speed_skin_detection_sse.cpp ├── speed_sobel_edgedetection_sse.cpp ├── speed_vibrance_algorithm.cpp └── sse_implementation_of_common_functions_in_image_processing.cpp /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | ## speed_histogram_algorithm_framework 4 | 5 | - 局部直方图加速框架,内部使用了一些近似计算及指令集加速(SSE),可以快速处理中值滤波、最大值滤波、最小值滤波、表面模糊等算法。 6 | 7 | ## resources 8 | - SSE优化相关的资源。 9 | 10 | #### PC的CPU为I5-3230,64位。 11 | 12 | #### OpenCV版本为3.4.0 13 | 14 | 15 | 16 | - sse_implementation_of_common_functions_in_image_processing.cpp 多个图像处理中常用函数的SSE实现。 17 | - speed_rgb2gray_sse.cpp 使用sse加速RGB和灰度图转换算法,相比于原始实现有接近5倍加速。算法原理:https://mp.weixin.qq.com/s/SagVQ5gfXWWA7NATv-zvBQ 速度测试结果如下: 18 | 19 | >测试CPU型号:Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz 20 | 21 | | 分辨率 | 优化 | 循环次数 | 速度 | 22 | | --------- | ---------------------------------------- | -------- | ---- | 23 | | 4032x3024 | 原始实现 | 1000 | 12.139ms | 24 | | 4032x3024 | 第一版优化(float->INT) | 1000 | 7.629ms | 25 | | 4032x3024 | OpenCV 自带函数 | 1000 | 4.287ms | 26 | | 4032x3024 | 第二版优化(手动4路并行) | 1000 | 10.528ms | 27 | | 4032x3024 | 第三版优化(OpenMP4线程) | 1000 | 7.632ms | 28 | | 4032x3024 | 第四版优化(SSE优化,一次处理12个像素) | 1000 | 5.579ms | 29 | | 4032x3024 | 第五版优化(SSE优化,一次处理15个像素) | 1000 | 5.843ms | 30 | | 4032x3024 | 第六版优化(AVX2优化,一次处理10个像素) | 1000 | 3.576ms | 31 | | 4032x3024 | 第七版优化(AVX2优化+std::async) | 1000 | 2.626ms | 32 | 33 | 34 | 
35 | - speed_vibrance_algorithm.cpp 使用SSE加速自然饱和度算法,加速9倍,算法原理请看: https://mp.weixin.qq.com/s/26UVvqMNLgnquXY21Xu3OQ 。速度测试结果如下: 36 | 37 | |分辨率|优化|循环次数|速度| 38 | |----|----|----|----| 39 | |4032x3024|原始实现|100|115.36ms| 40 | |4032x3024|第一版优化|100|62.43ms| 41 | |4032x3024|第二版优化(4线程)|100|28.89ms| 42 | |4032x3024|第三版优化(SSE)|100|12.69ms| 43 | 44 | 45 | 46 | - speed_sobel_edgedetection_sse.cpp 使用SSE加速Sobel边缘检测算法,加速幅度巨大,算法原理请看:https://mp.weixin.qq.com/s/5lCfO_jmSfP7DbsgM7qbpg 。速度测试结果如下: 47 | 48 | |分辨率|算法优化|循环次数|速度| 49 | |-|-|-|-| 50 | |4032x3024|普通实现|1000|126.54 ms| 51 | |4032x3024|Float->INT+查表法|1000|81.62 ms| 52 | |4032x3024|SSE优化版本1|1000|34.95 ms| 53 | |4032x3024|SSE优化版本2|1000|28.87 ms| 54 | |4032x3024|AVX2优化版本1|1000|15.42 ms | 55 | |4032x3024|AVX2优化+std::async|1000| 5.69 ms | 56 | 57 | - speed_skin_detection_sse.cpp 使用SSE加速肤色检测算法,加速幅度较大,算法原理请看:https://mp.weixin.qq.com/s/UFzY1s6ohTM-dnNg0P4kkw 。速度测试结果如下: 58 | 59 | |分辨率|算法优化|循环次数|速度| 60 | |-|-|-|-| 61 | |4272x2848|普通实现|1000|41.40ms| 62 | |4272x2848|OpenMP 4线程|1000|36.54ms| 63 | |4272x2848|SSE第一版|1000|6.77ms| 64 | |4272x2848|SSE第二版(std::async)|1000|4.73ms| 65 | 66 | - speed_rgb2yuv_sse.cpp SSE极致优化RGB和YUV图像空间互转,算法原理请看:https://mp.weixin.qq.com/s/ryGocz-0YpqZ1CjYXJbd7Q 。速度测试结果如下: 67 | 68 | |分辨率|算法优化|循环次数|速度| 69 | |-|-|-|-| 70 | |4032x3024|普通实现|1000|150.58ms| 71 | |4032x3024|去掉浮点数,除法用位运算代替|1000|76.70ms| 72 | |4032x3024|OpenMP 4线程|1000|50.48ms| 73 | |4032x3024|普通SSE向量化|1000|48.92ms| 74 | |4032x3024|_mm_madd_epi16二次优化|1000|33.04ms| 75 | |4032x3024|SSE+4线程|1000|23.70ms| 76 | 77 | 78 | 79 | - speed_median_filter_3x3_sse.cpp 极致优化3*3中值滤波,算法原理请看:https://blog.csdn.net/just_sort/article/details/98617050 。速度测试效果如下: 80 | 81 | |分辨率|算法优化|循环次数|速度| 82 | |-|-|-|-| 83 | |4032x3024|普通实现|10| 8293.79 ms | 84 | |4032x3024|逻辑优化,更好的流水|10| 83.75 ms | 85 | |4032x3024|SSE优化|10| 11.93 ms | 86 | |4032x3024|AVX优化|10| 9.32 ms | 87 | 88 | ---------------------------------------------------------------------------------- 89 | 90 | - speed_gaussian_filter_sse.cpp 
使用sse加速高斯滤波算法。算法原理:https://blog.csdn.net/just_sort/article/details/95212099 。速度测试效果如下: 91 | 92 | | 优化方式| 图像分辨率 | 速度 | 93 | | ------------------- | ---------- | ---- | 94 | | C语言普通实现+单线程 | 4032*3024 | 290.43ms | 95 | | SSE优化+单线程 | 4032*3024 | 265.96ms | 96 | 97 | - speed_integral_graph_sse.cpp 使用SSE加速积分图运算,但是在PC上并没有速度提升,算法原理请看:https://www.cnblogs.com/Imageshop/p/6897233.html 。速度测试结果如下: 98 | 99 | |优化方式|图像分辨率 |速度| 100 | |---------|----------|-------| 101 | |C语言实现+单线程|4032*3024|66.66ms| 102 | |C语言实现+4线程|4032*3024|65.34ms| 103 | |SSE优化+单线程|4032*3024|66.10ms| 104 | |SSE优化+4线程|4032*3024|66.20ms| 105 | 106 | 107 | - speed_common_functions.cpp 对图像处理的一些常用函数的快速实现,个别使用了SSE优化。 108 | - speed_max_filter_sse.cpp 使用speed_histogram_algorithm_framework框架实现最大值滤波,半径越大加速效果越明显。原理请看:https://blog.csdn.net/just_sort/article/details/97280807 。运行的时候记得把工程属性中的sdl检查关掉,不然会报一个变量未初始化的错误。速度测试效果如下: 109 | 110 | |优化方式|图像分辨率 |半径|速度| 111 | |---------|----------|-------|-------| 112 | |C语言实现+单线程|4272*2848|7|9445.90ms| 113 | |SSE优化+单线程|4272*2848|7|2234.55ms| 114 | |C语言实现+单线程|4272*2848|9|14468.76ms| 115 | |SSE优化+单线程|4272*2848|9|2221.68ms| 116 | |C语言实现+单线程|4272*2848|11|23069.10ms| 117 | |SSE优化+单线程|4272*2848|11|2180.95ms| 118 | 119 | - speed_box_filter_sse.cpp 使用speed_histogram_algorithm_framework框架实现O(1)盒子滤波,使用了SSE优化,算法原理请看:https://blog.csdn.net/just_sort/article/details/98075712 。运行方法和speed_max_filter_sse.cpp相同,速度测试结果如下: 120 | 121 | |优化方式|图像分辨率 |半径|速度| 122 | |---------|----------|-------|-------| 123 | |C语言实现+单线程|4272*2848|11|163.16ms| 124 | |SSE优化+单线程|4272*2848|11|123.83ms| 125 | |C语言实现+单线程|4272*2848|21|167.81ms| 126 | |SSE优化+单线程|4272*2848|21|126.98ms| 127 | |C语言实现+单线程|4272*2848|31|168.62ms| 128 | |SSE优化+单线程|4272*2848|31|126.17ms| 129 | 130 | - speed_multi_scale_detail_boosting_see.cpp 在speed_box_filter_sse.cpp提供的盒子滤波sse优化的基础上,进一步使用指令集实现了对论文《DARK IMAGE ENHANCEMENT BASED ON PAIRWISE TARGET CONTRAST AND MULTI-SCALE DETAIL BOOSTING》的算法优化。算法原理请看:https://blog.csdn.net/just_sort/article/details/98485746 
。在CoreI7-3770速度测试结果如下: 131 | 132 | |优化方式|图像分辨率 |半径|速度| 133 | |---------|----------|-------|-------| 134 | |C语言实现+单线程|4272*2848|7|206.00ms| 135 | |SSE优化+单线程|4272*2848|7|57.12ms| 136 | 137 | - speed_bicubic_zoom_sse.cpp SSE优化三次立方插值算法,算法原理请看:https://blog.csdn.net/just_sort/article/details/100119653 。速度测试结果如下: 138 | 139 | |优化方式|图像分辨率 |插值后大小|速度| 140 | |---------|----------|-------|-------| 141 | |C语言原始算法实现|4272*2848|长宽均为原始1.5倍|1856.29ms| 142 | |C语言实现+查表优化+边界优化|4272*2848|长宽均为原始1.5倍|839.10ms| 143 | |SSE优化+边界优化|4272*2848|长宽均为原始1.5倍|315.70ms| 144 | |OpenCV3.1.0自带的函数|4272*2848|长宽均为原始1.5倍|118.77ms| 145 | 146 | 147 | 148 | 149 | # 维护了一个微信公众号,分享论文,算法,比赛,生活,欢迎加入。 150 | 151 | - 图片要是没加载出来直接搜GiantPandaCV 就好。 152 | 153 | ![](image/weixin.jpg) 154 | -------------------------------------------------------------------------------- /image/car.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/Image-processing-algorithm-Speed/d22063d5c5b45466e80787e33ffe8a430c435854/image/car.jpg -------------------------------------------------------------------------------- /image/face.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/Image-processing-algorithm-Speed/d22063d5c5b45466e80787e33ffe8a430c435854/image/face.jpg -------------------------------------------------------------------------------- /image/peizhi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/Image-processing-algorithm-Speed/d22063d5c5b45466e80787e33ffe8a430c435854/image/peizhi.jpg -------------------------------------------------------------------------------- /image/weixin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/Image-processing-algorithm-Speed/d22063d5c5b45466e80787e33ffe8a430c435854/image/weixin.jpg 
-------------------------------------------------------------------------------- /resources/SSE-指令集.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/Image-processing-algorithm-Speed/d22063d5c5b45466e80787e33ffe8a430c435854/resources/SSE-指令集.pdf -------------------------------------------------------------------------------- /resources/SSE指令集补充.md: -------------------------------------------------------------------------------- 1 | # SSE指令集记录 2 | 3 | - _mm_cvtps_epi32 把四个float变量转换为四个int变量。需要注意它的舍入规则:默认按就近舍入(round-to-nearest-even),小数部分恰好为0.5时舍入到最近的偶数,例如0.5→0、1.5→2、2.5→2。 4 | 5 | - _mm_cvttps_epi32 把四个float变量强转为四个int变量。直接截断,和c/c++中的r = (int)a一样。 6 | 7 | - _mm_cvtpd_ps 将a中的两个双精度浮点值转换为单精度浮点值,存入结果的低两个元素,高两个元素置0。返回值: 8 | 9 | ```c++ 10 | r0 := (float) a0 11 | r1 := (float) a1 12 | r2 := 0.0 ; r3 := 0.0 13 | ``` 14 | 15 | - _mm_movelh_ps 将b的低两个单精度浮点值移动到结果的高两个元素,结果的低两个元素取自a的低两个元素。 16 | 17 | ```c++ 18 | r3 := b1 19 | r2 := b0 20 | r1 := a1 21 | r0 := a0 22 | ``` 23 | 24 | - _mm_cmpneq_ps 逐个比较a和b中对应位置的单精度浮点值,相等则该元素返回0,不相等则该元素返回0xffffffff(全1掩码)。 25 | 26 | - _mm_blendv_ps 混合选择函数,根据mask中每个元素的最高位(符号位)在a和b之间选择: 27 | 28 | ```c++ 29 | __m128 _mm_blendv_ps( 30 | __m128 a, 31 | __m128 b, 32 | __m128 mask 33 | ); 34 | 35 | r0 := (mask0 & 0x80000000) ? b0 : a0 36 | r1 := (mask1 & 0x80000000) ? b1 : a1 37 | r2 := (mask2 & 0x80000000) ? b2 : a2 38 | r3 := (mask3 & 0x80000000) ? 
b3 : a3 39 | ``` 40 | 41 | - _mm_packs_epi32 将a和b中共8个有符号32位整数饱和转换为8个有符号16位整数。 42 | 43 | - _mm_cvtsi128_si32 取出a的最低32位,作为32位整数返回。 44 | 45 | - _mm_packus_epi16 将a和b中共16个有符号16位整数饱和转换为16个无符号8位整数。 46 | 47 | - _mm_cvtsi32_si128 将32位整数a放入128位寄存器的最低32位,高96位清零,即r0=a,r1=r2=r3=0。 48 | 49 | - _mm_loadu_si128表示:Loads 128-bit value;即加载128位值(地址不要求16字节对齐)。 50 | 51 | - _mm_max_epu8 (a,b)表示:比较a和b中对应的无符号的8bits的整数,取其较大值,重复这个过程16次。即:r0=max(a0,b0),...,r15=max(a15,b15)。 52 | 53 | - _mm_min_epi8(a,b)表示:逐个比较a和b中对应的有符号的8bits的整数,取其较小值,即:r0=min(a0,b0),...,r15=min(a15,b15)。 54 | 55 | - _mm_setzero_si128表示:将128bits的值都赋值为0。 56 | 57 | - _mm_subs_epu8(a,b)表示:a和b中对应的8bits数相减,r0= UnsignedSaturate(a0-b0),...,r15= UnsignedSaturate(a15 - b15)。 58 | 59 | - _mm_adds_epi8(a,b)表示:a和b中对应的8bits数相加,r0=SignedSaturate(a0+b0),...,r15=SignedSaturate(a15+b15)。 60 | 61 | - _mm_unpackhi_epi64(a,b)表示:a和b的高64位交错,低64位舍去。 62 | 63 | - _mm_srli_si128(a,imm)表示:将a进行逻辑右移imm个字节(注意单位是字节而不是位),高位填充0。 64 | 65 | - _mm_cvtsi128_si32(a)表示:将a的低32位赋值给一个32bits的整数,返回值为r=a0。 66 | 67 | - _mm_xor_si128(a,b)表示:将a和b进行按位异或,即r=a^b。 68 | 69 | - _mm_or_si128(a,b)表示:将a和b进行或运算,即r=a|b。 70 | 71 | - _mm_and_si128(a,b)表示:将a和b进行与运算,即r=a&b。 72 | 73 | - _mm_cmpgt_epi8(a,b)表示:分别比较a的每个8bits整数是否大于b的对应位置的8bits整数,若大于,则返回0xff,否则返回0x0。即r0=(a0>b0)?0xff:0x0 r1=(a1>b1)?0xff:0x0...r15=(a15>b15)?0xff:0x0 74 | 75 | - _mm_unpacklo_epi64表示: a和b的低64位交错,高64位舍去。 76 | 77 | - _mm_madd_epi16 表示:返回一个__m128i的寄存器,它含有4个有符号的32位整数。 78 | 79 | ```c++ 80 | r0 := (a0 * b0) + (a1 * b1) 81 | r1 := (a2 * b2) + (a3 * b3) 82 | r2 := (a4 * b4) + (a5 * b5) 83 | r3 := (a6 * b6) + (a7 * b7) 84 | ``` 85 | 86 | - _mm_extract_epi16(a, imm) 表示: 返回imm位置上的16位数。 87 | 88 | - _mm_min_epu16 表示:逐个比较a和b中对应的8个无符号16位整数,取两者中的较小值。 89 | 90 | - _mm_minpos_epu16 表示:返回128位值,最低的16位是a的8个无符号16位整数中的最小值,第二个16位存放该最小值在a中的索引,其余位为0。 91 | 92 | - _mm_stream_si32 将32位整数以非临时(non-temporal,绕过缓存)方式存储到指针对应的地址中。 93 | 94 | - _mm_cvtsi128_si32 取出a的最低32位,作为32位整数返回。 95 | 96 | - _mm_packus_epi32 97 | 98 | ```c++ 99 | r0 := (a0 < 0) ? 0 : ((a0 > 0xffff) ? 0xffff : a0) 100 | r1 := (a1 < 0) ? 0 : ((a1 > 0xffff) ? 0xffff : a1) 101 | r2 := (a2 < 0) ? 0 : ((a2 > 0xffff) ? 
0xffff : a2) 102 | r3 := (a3 < 0) ? 0 : ((a3 > 0xffff) ? 0xffff : a3) 103 | r4 := (b0 < 0) ? 0 : ((b0 > 0xffff) ? 0xffff : b0) 104 | r5 := (b1 < 0) ? 0 : ((b1 > 0xffff) ? 0xffff : b1) 105 | r6 := (b2 < 0) ? 0 : ((b2 > 0xffff) ? 0xffff : b2) 106 | r7 := (b3 < 0) ? 0 : ((b3 > 0xffff) ? 0xffff : b3) 107 | ``` 108 | 109 | - _mm_setr_epi32 返回一个__m128i的寄存器,使用4个具体的int类型数据来设置寄存器存放数据。 110 | 111 | - _mm_mullo_epi32 返回一个__m128i的寄存器,分别对a和b的4个int类型数相乘。 112 | 113 | - _mm_hadd_epi32 返回一个__m128i的寄存器,分别对a和b的4个int类型数相加。 114 | 115 | - _mm_madd_epi16 返回一个__m128i的寄存器,分别对a和b先相乘后相加。 116 | 117 | ```c++ 118 | r0 := (a0 * b0) + (a1 * b1) 119 | r1 := (a2 * b2) + (a3 * b3) 120 | r2 := (a4 * b4) + (a5 * b5) 121 | r3 := (a6 * b6) + (a7 * b7) 122 | ``` 123 | 124 | - _mm_unpackhi_epi8 返回一个__m128i的寄存器,对a和b进行交错打包,从高位到低位。 125 | 126 | ```c++ 127 | r0 := a8 ; r1 := b8 128 | r2 := a9 ; r3 := b9 129 | ... 130 | r14 := a15 ; r15 := b15 131 | ``` 132 | 133 | - _mm_unpacklo_epi8 返回一个__m128i的寄存器,对a和b进行交错打包,从低位到高位。 -------------------------------------------------------------------------------- /speed_bicubic_zoom_sse.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | using namespace std; 4 | using namespace cv; 5 | 6 | void debug(__m128i var) { 7 | uint8_t *val = (uint8_t*)&var;//can also use uint32_t instead of 16_t 8 | printf("Numerical: %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i\n", 9 | val[0], val[1], val[2], val[3], val[4], val[5], 10 | val[6], val[7], val[8], val[9], val[10], val[11], val[12], val[13], 11 | val[14], val[15]); 12 | } 13 | 14 | void ConvertBGR8U2BGRAF(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) 15 | { 16 | //#pragma omp parallel for 17 | for (int Y = 0; Y < Height; Y++) 18 | { 19 | unsigned char *LinePS = Src + Y * Stride; 20 | unsigned char *LinePD = Dest + Y * Width * 4; 21 | for (int X = 0; X < Width; X++, LinePS += 3, LinePD += 4) 22 | { 23 | LinePD[0] = LinePS[0]; LinePD[1] = 
LinePS[1]; LinePD[2] = LinePS[2]; LinePD[3] = 0; 24 | } 25 | } 26 | } 27 | 28 | void ConvertBGRAF2BGR8U(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) 29 | { 30 | //#pragma omp parallel for 31 | for (int Y = 0; Y < Height; Y++) 32 | { 33 | unsigned char *LinePS = Src + Y * Width * 4; 34 | unsigned char *LinePD = Dest + Y * Stride; 35 | for (int X = 0; X < Width; X++, LinePS += 4, LinePD += 3) 36 | { 37 | LinePD[0] = LinePS[0]; LinePD[1] = LinePS[1]; LinePD[2] = LinePS[2]; 38 | } 39 | } 40 | } 41 | 42 | void ConvertBGR8U2BGRAF_SSE(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 43 | const int BlockSize = 4; 44 | int Block = (Width - 2) / BlockSize; 45 | __m128i Mask = _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1); 46 | __m128i Mask2 = _mm_setr_epi8(0, 2, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 47 | __m128i Zero = _mm_setzero_si128(); 48 | for (int Y = 0; Y < Height; Y++) { 49 | unsigned char *LinePS = Src + Y * Stride; 50 | unsigned char *LinePD = Dest + Y * Width * 4; 51 | int X = 0; 52 | for (; X < Block * BlockSize; X += BlockSize, LinePS += BlockSize * 3, LinePD += BlockSize * 4) { 53 | __m128i SrcV = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)LinePS), Mask); 54 | __m128i Src16L = _mm_unpacklo_epi8(SrcV, Zero); 55 | __m128i Src16H = _mm_unpackhi_epi8(SrcV, Zero); 56 | 57 | _mm_storeu_si128((__m128i *)(LinePD + 0), _mm_shuffle_epi8(_mm_unpacklo_epi32(Src16L, Zero), Mask2)); 58 | _mm_storeu_si128((__m128i *)(LinePD + 4), _mm_shuffle_epi8(_mm_unpackhi_epi32(Src16L, Zero), Mask2)); 59 | _mm_storeu_si128((__m128i *)(LinePD + 8), _mm_shuffle_epi8(_mm_unpacklo_epi32(Src16H, Zero), Mask2)); 60 | _mm_storeu_si128((__m128i *)(LinePD + 12), _mm_shuffle_epi8(_mm_unpackhi_epi32(Src16H, Zero), Mask2)); 61 | } 62 | for (; X < Width; X++, LinePS += 3, LinePD += 4) { 63 | LinePD[0] = LinePS[0]; LinePD[1] = LinePS[1]; LinePD[2] = LinePS[2]; LinePD[3] = 0; 64 | } 65 | } 66 | } 67 
| 68 | void ConvertBGRAF2BGR8U_SSE(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 69 | const int BlockSize = 4; 70 | int Block = (Width - 2) / BlockSize; 71 | //__m128i Mask = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15); 72 | __m128i MaskB = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 73 | __m128i MaskG = _mm_setr_epi8(1, 5, 9, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 74 | __m128i MaskR = _mm_setr_epi8(2, 6, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 75 | __m128i Zero = _mm_setzero_si128(); 76 | for (int Y = 0; Y < Height; Y++) { 77 | unsigned char *LinePS = Src + Y * Width * 4; 78 | unsigned char *LinePD = Dest + Y * Stride; 79 | int X = 0; 80 | for (; X < Block * BlockSize; X += BlockSize, LinePS += BlockSize * 4, LinePD += BlockSize * 3) { 81 | __m128i SrcV = _mm_loadu_si128((const __m128i*)LinePS); 82 | __m128i B = _mm_shuffle_epi8(SrcV, MaskB); 83 | __m128i G = _mm_shuffle_epi8(SrcV, MaskG); 84 | __m128i R = _mm_shuffle_epi8(SrcV, MaskR); 85 | __m128i Ans1 = Zero, Ans2 = Zero, Ans3 = Zero; 86 | Ans1 = _mm_or_si128(Ans1, _mm_shuffle_epi8(B, _mm_setr_epi8(0, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 87 | Ans1 = _mm_or_si128(Ans1, _mm_shuffle_epi8(G, _mm_setr_epi8(-1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 88 | Ans1 = _mm_or_si128(Ans1, _mm_shuffle_epi8(R, _mm_setr_epi8(-1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 89 | 90 | Ans2 = _mm_or_si128(Ans2, _mm_shuffle_epi8(B, _mm_setr_epi8(-1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 91 | Ans2 = _mm_or_si128(Ans2, _mm_shuffle_epi8(G, _mm_setr_epi8(1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 92 | Ans2 = _mm_or_si128(Ans2, _mm_shuffle_epi8(R, _mm_setr_epi8(-1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 93 | 94 | Ans3 = _mm_or_si128(Ans3, _mm_shuffle_epi8(B, _mm_setr_epi8(-1, 3, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 95 | Ans3 = _mm_or_si128(Ans3, _mm_shuffle_epi8(G, _mm_setr_epi8(-1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 96 | Ans3 = _mm_or_si128(Ans3, _mm_shuffle_epi8(R, _mm_setr_epi8(2, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 97 | 98 | _mm_storeu_si128((__m128i*)(LinePD + 0), Ans1); 99 | _mm_storeu_si128((__m128i*)(LinePD + 4), Ans2); 100 | _mm_storeu_si128((__m128i*)(LinePD + 8), Ans3); 101 | } 102 | for (; X < Width; X++, LinePS += 4, LinePD += 3) { 103 | LinePD[0] = LinePS[0]; LinePD[1] = LinePS[1]; LinePD[2] = LinePS[2]; 104 | } 105 | } 106 | } 107 | 108 | // 将整形的Value值限定在Min和Max内,可取Min或者Max的值 109 | inline int ClampI(int Value, int Min, int Max) { 110 | if (Value < Min) return Min; 111 | else if (Value > Max) return Max; 112 | else return Value; 113 | } 114 | 115 | // 将整数限制到字节数据类型 116 | inline unsigned char ClampToByte(int Value) { 117 | if (Value < 0) return 0; 118 | else if (Value > 255) return 255; 119 | else return (unsigned char)Value; 120 | } 121 | 122 | // 获取PosX, PosY位置的像素 123 | inline unsigned char *GetCheckedPixel(unsigned char *Src, int Width, int Height, int Stride, int Channel, int PosX, int PosY) { 124 | return Src + ClampI(PosY, 0, Height - 1) * Stride + ClampI(PosX, 0, Width - 1) * Channel; 125 | } 126 | 127 | // 该函数计算插值曲线sin(x * PI) / (x * PI)的值,下面是它的近似拟合表达式 128 | float SinXDivX(float X) { 129 | const float a = -1; //a还可以取 a=-2,-1,-0.75,-0.5等等,起到调节锐化或模糊程度的作用 130 | X = abs(X); 131 | float X2 = X * X, X3 = X2 * X; 132 | if (X <= 1) 133 | return (a + 2) * X3 - (a + 3) * X2 + 1; 134 | else if (X <= 2) 135 | return a * X3 - (5 * a) * X2 + (8 * a) * X - (4 * a); 136 | else 137 | return 0; 138 | } 139 | 140 | // 精确计算插值曲线sin(x * PI) / (x * PI) 141 | float SinXDivX_Standard(float X) { 142 | if (abs(X) < 0.000001f) 143 | return 1; 144 | else 145 | return sin(X * 3.1415926f) / (X * 3.1415926f); 146 | } 147 | 148 | void Bicubic_Original(unsigned char *Src, int Width, int 
Height, int Stride, unsigned char *Pixel, float X, float Y) 149 | { 150 | int Channel = Stride / Width; 151 | int PosX = floor(X), PosY = floor(Y); 152 | float PartXX = X - PosX, PartYY = Y - PosY; 153 | 154 | unsigned char *Pixel00 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY - 1); 155 | unsigned char *Pixel01 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY - 1); 156 | unsigned char *Pixel02 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY - 1); 157 | unsigned char *Pixel03 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY - 1); 158 | unsigned char *Pixel10 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY + 0); 159 | unsigned char *Pixel11 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY + 0); 160 | unsigned char *Pixel12 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY + 0); 161 | unsigned char *Pixel13 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY + 0); 162 | unsigned char *Pixel20 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY + 1); 163 | unsigned char *Pixel21 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY + 1); 164 | unsigned char *Pixel22 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY + 1); 165 | unsigned char *Pixel23 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY + 1); 166 | unsigned char *Pixel30 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY + 2); 167 | unsigned char *Pixel31 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY + 2); 168 | unsigned char *Pixel32 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY + 2); 169 | unsigned char *Pixel33 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY + 2); 170 | 171 | float U0 = SinXDivX(1 + PartXX), U1 = SinXDivX(PartXX); 172 | float U2 = SinXDivX(1 - PartXX), U3 = 
SinXDivX(2 - PartXX); 173 | float V0 = SinXDivX(1 + PartYY), V1 = SinXDivX(PartYY); 174 | float V2 = SinXDivX(1 - PartYY), V3 = SinXDivX(2 - PartYY); 175 | 176 | for (int I = 0; I < Channel; I++) 177 | { 178 | float Sum1 = (Pixel00[I] * U0 + Pixel01[I] * U1 + Pixel02[I] * U2 + Pixel03[I] * U3) * V0; 179 | //printf("%.5f\n", Sum1); 180 | float Sum2 = (Pixel10[I] * U0 + Pixel11[I] * U1 + Pixel12[I] * U2 + Pixel13[I] * U3) * V1; 181 | //printf("%.5f\n", Sum2); 182 | float Sum3 = (Pixel20[I] * U0 + Pixel21[I] * U1 + Pixel22[I] * U2 + Pixel23[I] * U3) * V2; 183 | //printf("%.5f\n", Sum3); 184 | float Sum4 = (Pixel30[I] * U0 + Pixel31[I] * U1 + Pixel32[I] * U2 + Pixel33[I] * U3) * V3; // fourth row uses Pixel30..Pixel33 185 | //printf("%.5f\n", Sum4); 186 | // printf("%d %.5f %.5f %.5f %.5f\n", I, Sum1, Sum2, Sum3, Sum4); 187 | Pixel[I] = ClampToByte(Sum1 + Sum2 + Sum3 + Sum4 + 0.5f); 188 | } 189 | } 190 | 191 | // ImageShop notes that hard-coding Channel to a fixed value gives a large speed-up; not yet verified 192 | void Bicubic_Border(unsigned char *Src, int Width, int Height, int Stride, unsigned char *Pixel, short *SinXDivX_Table, int SrcX, int SrcY) { 193 | int Channel = Stride / Width; 194 | int U = (unsigned char)(SrcX >> 8), V = (unsigned char)(SrcY >> 8); 195 | 196 | int U0 = SinXDivX_Table[256 + U], U1 = SinXDivX_Table[U]; 197 | int U2 = SinXDivX_Table[256 - U], U3 = SinXDivX_Table[512 - U]; 198 | int V0 = SinXDivX_Table[256 + V], V1 = SinXDivX_Table[V]; 199 | int V2 = SinXDivX_Table[256 - V], V3 = SinXDivX_Table[512 - V]; 200 | int PosX = SrcX >> 16, PosY = SrcY >> 16; 201 | 202 | unsigned char *Pixel00 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY - 1); 203 | unsigned char *Pixel01 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY - 1); 204 | unsigned char *Pixel02 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY - 1); 205 | unsigned char *Pixel03 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY - 1); 206 | unsigned char *Pixel10 = GetCheckedPixel(Src, Width, Height, 
Stride, Channel, PosX - 1, PosY + 0); 207 | unsigned char *Pixel11 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY + 0); 208 | unsigned char *Pixel12 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY + 0); 209 | unsigned char *Pixel13 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY + 0); 210 | unsigned char *Pixel20 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY + 1); 211 | unsigned char *Pixel21 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY + 1); 212 | unsigned char *Pixel22 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY + 1); 213 | unsigned char *Pixel23 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY + 1); 214 | unsigned char *Pixel30 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY + 2); 215 | unsigned char *Pixel31 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY + 2); 216 | unsigned char *Pixel32 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY + 2); 217 | unsigned char *Pixel33 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY + 2); 218 | 219 | for (int I = 0; I < Channel; I++) 220 | { 221 | int Sum1 = (Pixel00[I] * U0 + Pixel01[I] * U1 + Pixel02[I] * U2 + Pixel03[I] * U3) * V0; 222 | int Sum2 = (Pixel10[I] * U0 + Pixel11[I] * U1 + Pixel12[I] * U2 + Pixel13[I] * U3) * V1; 223 | int Sum3 = (Pixel20[I] * U0 + Pixel21[I] * U1 + Pixel22[I] * U2 + Pixel23[I] * U3) * V2; 224 | int Sum4 = (Pixel30[I] * U0 + Pixel31[I] * U1 + Pixel32[I] * U2 + Pixel33[I] * U3) * V3; // fourth row uses Pixel30..Pixel33 225 | Pixel[I] = ClampToByte((Sum1 + Sum2 + Sum3 + Sum4) >> 16); 226 | } 227 | } 228 | void Bicubic_Center(unsigned char *Src, int Width, int Height, int Stride, unsigned char *Pixel, short *SinXDivX_Table, int SrcX, int SrcY) 229 | { 230 | int Channel = Stride / Width; 231 | int U = (unsigned char)(SrcX >> 8), V = (unsigned char)(SrcY >> 8); 232 | 233 | int U0 = 
SinXDivX_Table[256 + U], U1 = SinXDivX_Table[U]; 234 | int U2 = SinXDivX_Table[256 - U], U3 = SinXDivX_Table[512 - U]; 235 | int V0 = SinXDivX_Table[256 + V], V1 = SinXDivX_Table[V]; 236 | int V2 = SinXDivX_Table[256 - V], V3 = SinXDivX_Table[512 - V]; 237 | int PosX = SrcX >> 16, PosY = SrcY >> 16; 238 | 239 | unsigned char *Pixel00 = Src + (PosY - 1) * Stride + (PosX - 1) * Channel; 240 | unsigned char *Pixel01 = Pixel00 + Channel; 241 | unsigned char *Pixel02 = Pixel01 + Channel; 242 | unsigned char *Pixel03 = Pixel02 + Channel; 243 | unsigned char *Pixel10 = Pixel00 + Stride; 244 | unsigned char *Pixel11 = Pixel10 + Channel; 245 | unsigned char *Pixel12 = Pixel11 + Channel; 246 | unsigned char *Pixel13 = Pixel12 + Channel; 247 | unsigned char *Pixel20 = Pixel10 + Stride; 248 | unsigned char *Pixel21 = Pixel20 + Channel; 249 | unsigned char *Pixel22 = Pixel21 + Channel; 250 | unsigned char *Pixel23 = Pixel22 + Channel; 251 | unsigned char *Pixel30 = Pixel20 + Stride; 252 | unsigned char *Pixel31 = Pixel30 + Channel; 253 | unsigned char *Pixel32 = Pixel31 + Channel; 254 | unsigned char *Pixel33 = Pixel32 + Channel; 255 | for (int I = 0; I < Channel; I++) 256 | { 257 | int Sum1 = (Pixel00[I] * U0 + Pixel01[I] * U1 + Pixel02[I] * U2 + Pixel03[I] * U3) * V0; 258 | int Sum2 = (Pixel10[I] * U0 + Pixel11[I] * U1 + Pixel12[I] * U2 + Pixel13[I] * U3) * V1; 259 | int Sum3 = (Pixel20[I] * U0 + Pixel21[I] * U1 + Pixel22[I] * U2 + Pixel23[I] * U3) * V2; 260 | int Sum4 = (Pixel30[I] * U0 + Pixel31[I] * U1 + Pixel32[I] * U2 + Pixel33[I] * U3) * V3; // fourth row uses Pixel30..Pixel33 261 | Pixel[I] = ClampToByte((Sum1 + Sum2 + Sum3 + Sum4) >> 16); 262 | } 263 | } 264 | 265 | // Original (reference) interpolation algorithm 266 | void IM_Resize_Cubic_Origin(unsigned char *Src, unsigned char *Dest, int SrcW, int SrcH, int StrideS, int DstW, int DstH, int StrideD) { 267 | int Channel = StrideS / SrcW; 268 | if ((SrcW == DstW) && (SrcH == DstH)) { 269 | memcpy(Dest, Src, SrcW * SrcH * Channel * sizeof(unsigned char)); 270 | return; 271 | } 272 | 
printf("%d\n", Channel); 273 | for (int Y = 0; Y < DstH; Y++) 274 | { 275 | unsigned char *LinePD = Dest + Y * StrideD; 276 | float SrcY = (Y + 0.4999999f) * SrcH / DstH - 0.5f; 277 | for (int X = 0; X < DstW; X++) 278 | { 279 | float SrcX = (X + 0.4999999f) * SrcW / DstW - 0.5f; 280 | Bicubic_Original(Src, SrcW, SrcH, StrideS, LinePD, SrcX, SrcY); 281 | LinePD += Channel; 282 | } 283 | } 284 | } 285 | 286 | // C语言实现的查表+插值算法 287 | void IM_Resize_Cubic_Table(unsigned char *Src, unsigned char *Dest, int SrcW, int SrcH, int StrideS, int DstW, int DstH, int StrideD) { 288 | int Channel = StrideS / SrcW; 289 | if ((SrcW == DstW) && (SrcH == DstH)) { 290 | memcpy(Dest, Src, SrcW * SrcH * Channel * sizeof(unsigned char)); 291 | return; 292 | } 293 | short *SinXDivX_Table = (short *)malloc(513 * sizeof(short)); 294 | for (int I = 0; I < 513; I++) 295 | SinXDivX_Table[I] = int(0.5 + 256 * SinXDivX(I / 256.0f)); // 建立查找表,定点化 296 | int AddX = (SrcW << 16) / DstW, AddY = (SrcH << 16) / DstH; 297 | int ErrorX = -(1 << 15) + (AddX >> 1), ErrorY = -(1 << 15) + (AddY >> 1); 298 | 299 | int StartX = ((1 << 16) - ErrorX) / AddX + 1; // 计算出需要特殊处理的边界 300 | int StartY = ((1 << 16) - ErrorY) / AddY + 1; // y0+y*yr>=1; y0=ErrorY => y>=(1-ErrorY)/yr 301 | int EndX = (((SrcW - 3) << 16) - ErrorX) / AddX + 1; 302 | int EndY = (((SrcH - 3) << 16) - ErrorY) / AddY + 1; // y0+y*yr<=(height-3) => y<=(height-3-ErrorY)/yr 303 | if (StartY >= DstH) StartY = DstH; 304 | if (StartX >= DstW) StartX = DstW; 305 | if (EndX < StartX) EndX = StartX; 306 | if (EndY < StartY) EndY = StartY; 307 | // 输出边界 308 | //printf("%d %d %d %d\n", StartX, StartY, EndX, EndY); 309 | int SrcY = ErrorY; 310 | for (int Y = 0; Y < StartY; Y++, SrcY += AddY) // 前面的不是都有效的取样部分数据 311 | { 312 | unsigned char *LinePD = Dest + Y * StrideD; 313 | for (int X = 0, SrcX = ErrorX; X < DstW; X++, SrcX += AddX, LinePD += Channel) 314 | { 315 | Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); 316 | } 317 | } 
318 | for (int Y = StartY; Y < EndY; Y++, SrcY += AddY) 319 | { 320 | int SrcX = ErrorX; 321 | unsigned char *LinePD = Dest + Y * StrideD; 322 | for (int X = 0; X < StartX; X++, SrcX += AddX, LinePD += Channel) 323 | { 324 | Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); 325 | } 326 | for (int X = StartX; X < EndX; X++, SrcX += AddX, LinePD += Channel) 327 | { 328 | Bicubic_Center(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); 329 | } 330 | for (int X = EndX; X < DstW; X++, SrcX += AddX, LinePD += Channel) 331 | { 332 | Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); 333 | } 334 | } 335 | for (int Y = EndY; Y < DstH; Y++, SrcY += AddY) 336 | { 337 | unsigned char *LinePD = Dest + Y * StrideD; 338 | for (int X = 0, SrcX = ErrorX; X < DstW; X++, SrcX += AddX, LinePD += Channel) 339 | { 340 | Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); 341 | } 342 | } 343 | free(SinXDivX_Table); 344 | } 345 | 346 | // 4个有符号的32位的数据相加的和 347 | inline int _mm_hsum_epi32(__m128i V) { //V3 V2 V1 V0 348 | __m128i T = _mm_add_epi32(V, _mm_srli_si128(V, 8)); //V3+V1 V2+V0 V1 V0 349 | T = _mm_add_epi32(T, _mm_srli_si128(T, 4)); //V3+V1+V2+V0 V2+V0+V1 V1+V0 V0 350 | return _mm_cvtsi128_si32(T); //提取低位 351 | } 352 | 353 | // 使用SSE优化立方插值算法 354 | // 最大支持图像大小为: 32767*32767 355 | void IM_Resize_SSE(unsigned char *Src, unsigned char *Dest, int SrcW, int SrcH, int StrideS, int DstW, int DstH, int StrideD) { 356 | int Channel = StrideS / SrcW; 357 | if ((SrcW == DstW) && (SrcH == DstH)) { 358 | memcpy(Dest, Src, SrcW * SrcH * Channel * sizeof(unsigned char)); 359 | return; 360 | } 361 | short *SinXDivX_Table = (short *)malloc(513 * sizeof(short)); 362 | short *Table = (short *)malloc(DstW * 4 * sizeof(short)); 363 | for (int I = 0; I < 513; I++) 364 | SinXDivX_Table[I] = int(0.5 + 256 * SinXDivX(I / 256.0f)); // 建立查找表,定点化 365 | int AddX = (SrcW << 16) / DstW, AddY = (SrcH << 16) / DstH; 366 | 
int ErrorX = -(1 << 15) + (AddX >> 1), ErrorY = -(1 << 15) + (AddY >> 1); 367 | 368 | int StartX = ((1 << 16) - ErrorX) / AddX + 1; // 计算出需要特殊处理的边界 369 | int StartY = ((1 << 16) - ErrorY) / AddY + 1; // y0+y*yr>=1; y0=ErrorY => y>=(1-ErrorY)/yr 370 | int EndX = (((SrcW - 3) << 16) - ErrorX) / AddX + 1; 371 | int EndY = (((SrcH - 3) << 16) - ErrorY) / AddY + 1; // y0+y*yr<=(height-3) => y<=(height-3-ErrorY)/yr 372 | if (StartY >= DstH) StartY = DstH; 373 | if (StartX >= DstW) StartX = DstW; 374 | if (EndX < StartX) EndX = StartX; 375 | if (EndY < StartY) EndY = StartY; 376 | for (int X = StartX, SrcX = ErrorX + StartX * AddX; X < EndY; X++, SrcX += AddX) { 377 | int U = (unsigned char)(SrcX >> 8); 378 | Table[X * 4 + 0] = SinXDivX_Table[256 + U]; //建立一个新表便于SSE操作 379 | Table[X * 4 + 1] = SinXDivX_Table[U]; 380 | Table[X * 4 + 2] = SinXDivX_Table[256 - U]; 381 | Table[X * 4 + 3] = SinXDivX_Table[512 - U]; 382 | } 383 | int SrcY = ErrorY; 384 | for (int Y = 0; Y < StartY; Y++, SrcY += AddY) { // 同IM_Resize_Cubic_Table函数 385 | unsigned char *LinePD = Dest + Y * StrideD; 386 | for (int X = 0, SrcX = ErrorX; X < DstW; X++, SrcX += AddX, LinePD += Channel) { 387 | Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); 388 | } 389 | } 390 | for (int Y = StartY; Y < EndY; Y++, SrcY += AddY) { 391 | int SrcX = ErrorX; 392 | unsigned char *LinePD = Dest + Y * StrideD; 393 | for (int X = 0; X < StartX; X++, SrcX += AddX, LinePD += Channel) { 394 | Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); 395 | } 396 | int V = (unsigned char)(SrcY >> 8); 397 | unsigned char *LineY = Src + ((SrcY >> 16) - 1) * StrideS; 398 | __m128i PartY = _mm_setr_epi32(SinXDivX_Table[256 + V], SinXDivX_Table[V], SinXDivX_Table[256 - V], SinXDivX_Table[512 - V]); 399 | for (int X = StartX; X < EndX; X++, SrcX += AddX, LinePD += Channel) { 400 | __m128i PartX = _mm_loadl_epi64((__m128i *)(Table + X * 4)); 401 | //PartX: U0 U1 U2 U3 U0 U1 U2 U3 402 | 
PartX = _mm_unpacklo_epi64(PartX, PartX); 403 | unsigned char *Pixel0 = LineY + ((SrcX >> 16) - 1) * Channel; 404 | unsigned char *Pixel1 = Pixel0 + StrideS; 405 | unsigned char *Pixel2 = Pixel1 + StrideS; 406 | unsigned char *Pixel3 = Pixel2 + StrideS; 407 | if (Channel == 1) { 408 | __m128i P01 = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*((int *)Pixel0)), _mm_cvtsi32_si128(*((int *)Pixel1)))); // P00 P01 P02 P03 P10 P11 P12 P13 409 | __m128i P23 = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*((int *)Pixel2)), _mm_cvtsi32_si128(*((int *)Pixel3)))); // P20 P21 P22 P23 P30 P31 P32 P33 410 | __m128i Sum01 = _mm_madd_epi16(P01, PartX); // P00 * U0 + P01 * U1 P02 * U2 + P03 * U3 P10 * U0 + P11 * U1 P12 * U2 + P13 * U3 411 | __m128i Sum23 = _mm_madd_epi16(P23, PartX); // P20 * U0 + P21 * U1 P22 * U2 + P23 * U3 P30 * U0 + P31 * U1 P32 * U2 + P33 * U3 412 | __m128i Sum = _mm_hadd_epi32(Sum01, Sum23); // P00 * U0 + P01 * U1 + P02 * U2 + P03 * U3 P10 * U0 + P11 * U1 + P12 * U2 + P13 * U3 P20 * U0 + P21 * U1 + P22 * U2 + P23 * U3 P30 * U0 + P31 * U1 + P32 * U2 + P33 * U3 413 | LinePD[0] = ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(Sum, PartY)) >> 16); 414 | } 415 | else if (Channel == 4) { 416 | __m128i P0 = _mm_loadu_si128((__m128i *)Pixel0), P1 = _mm_loadu_si128((__m128i *)Pixel1); 417 | __m128i P2 = _mm_loadu_si128((__m128i *)Pixel2), P3 = _mm_loadu_si128((__m128i *)Pixel3); 418 | P0 = _mm_shuffle_epi8(P0, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)); // B0 G0 R0 A0 419 | P1 = _mm_shuffle_epi8(P1, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)); // B1 G1 R1 A1 420 | P2 = _mm_shuffle_epi8(P2, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)); // B2 G2 R2 A2 421 | P3 = _mm_shuffle_epi8(P3, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)); // B3 G3 R3 A3 422 | 423 | __m128i BG01 = _mm_unpacklo_epi32(P0, P1); // B0 B1 G0 G1 424 | __m128i RA01 = _mm_unpackhi_epi32(P0, 
P1); // R0 R1 A0 A1 425 | __m128i BG23 = _mm_unpacklo_epi32(P2, P3); // B2 B3 G2 G3 426 | __m128i RA23 = _mm_unpackhi_epi32(P2, P3); // R2 R3 A2 A3 427 | 428 | __m128i B01 = _mm_unpacklo_epi8(BG01, _mm_setzero_si128()); 429 | __m128i B23 = _mm_unpacklo_epi8(BG23, _mm_setzero_si128()); 430 | __m128i SumB = _mm_hadd_epi32(_mm_madd_epi16(B01, PartX), _mm_madd_epi16(B23, PartX)); 431 | 432 | __m128i G01 = _mm_unpackhi_epi8(BG01, _mm_setzero_si128()); 433 | __m128i G23 = _mm_unpackhi_epi8(BG23, _mm_setzero_si128()); 434 | __m128i SumG = _mm_hadd_epi32(_mm_madd_epi16(G01, PartX), _mm_madd_epi16(G23, PartX)); 435 | 436 | __m128i R01 = _mm_unpacklo_epi8(RA01, _mm_setzero_si128()); 437 | __m128i R23 = _mm_unpacklo_epi8(RA23, _mm_setzero_si128()); 438 | __m128i SumR = _mm_hadd_epi32(_mm_madd_epi16(R01, PartX), _mm_madd_epi16(R23, PartX)); 439 | 440 | __m128i A01 = _mm_unpackhi_epi8(RA01, _mm_setzero_si128()); 441 | __m128i A23 = _mm_unpackhi_epi8(RA23, _mm_setzero_si128()); 442 | __m128i SumA = _mm_hadd_epi32(_mm_madd_epi16(A01, PartX), _mm_madd_epi16(A23, PartX)); 443 | 444 | __m128i Result = _mm_setr_epi32(_mm_hsum_epi32(_mm_mullo_epi32(SumB, PartY)), _mm_hsum_epi32(_mm_mullo_epi32(SumG, PartY)), _mm_hsum_epi32(_mm_mullo_epi32(SumR, PartY)), _mm_hsum_epi32(_mm_mullo_epi32(SumA, PartY))); 445 | Result = _mm_srai_epi32(Result, 16); 446 | // *((int *)LinePD) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packus_epi32(Result, Result), Result)); 447 | _mm_stream_si32((int *)LinePD, _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packus_epi32(Result, Result), Result))); 448 | 449 | //LinePD[0] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(SumB, PartY)) >> 16); // 确实有部分存在超出unsigned char范围的,因为定点化的缘故 450 | //LinePD[1] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(SumG, PartY)) >> 16); 451 | //LinePD[2] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(SumR, PartY)) >> 16); 452 | //LinePD[3] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(SumA, PartY)) >> 16); 453 | } 454 | } 455 | for (int X = 
EndX; X < DstW; X++, SrcX += AddX, LinePD += Channel) 456 | { 457 | Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); 458 | } 459 | } 460 | for (int Y = EndY; Y < DstH; Y++, SrcY += AddY) 461 | { 462 | unsigned char *LinePD = Dest + Y * StrideD; 463 | for (int X = 0, SrcX = ErrorX; X < DstW; X++, SrcX += AddX, LinePD += Channel) 464 | { 465 | Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); 466 | } 467 | } 468 | free(Table); 469 | free(SinXDivX_Table); 470 | } 471 | 472 | int main() { 473 | Mat src = imread("F:\\car.jpg"); 474 | int Height = src.rows; 475 | int Width = src.cols; 476 | int Stride = Width * 3; 477 | unsigned char *Src = src.data; 478 | unsigned char *Buffer = new unsigned char[Height * Width * 4]; 479 | ConvertBGR8U2BGRAF(Src, Buffer, Width, Height, Stride); 480 | int SrcW = Width; 481 | int SrcH = Height; 482 | int StrideS = Width * 4; 483 | int DstW = Width * 15 / 10; 484 | int DstH = Height * 15 / 10; 485 | unsigned char *Res = new unsigned char[DstH * DstW * 4]; 486 | unsigned char *Dest = new unsigned char[DstH * DstW * 3]; 487 | int StrideD = DstW * 4; 488 | int64 st = cvGetTickCount(); 489 | for (int i = 0; i < 10; i++) { 490 | IM_Resize_SSE(Buffer, Res, SrcW, SrcH, StrideS, DstW, DstH, StrideD); 491 | } 492 | double duration = (cv::getTickCount() - st) / cv::getTickFrequency() * 100; 493 | printf("%.5f\n", duration); 494 | IM_Resize_Cubic_Origin(Buffer, Res, SrcW, SrcH, StrideS, DstW, DstH, StrideD); 495 | ConvertBGRAF2BGR8U(Res, Dest, DstW, DstH, DstW * 3); 496 | Mat dst(DstH, DstW, CV_8UC3, Dest); 497 | imshow("origin", src); 498 | imshow("result", dst); 499 | imwrite("F:\\res.jpg", dst); 500 | waitKey(0); 501 | } -------------------------------------------------------------------------------- /speed_box_filter_sse.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "../../OpencvTest/OpencvTest/Core.h" 4 | 
#include "../../OpencvTest/OpencvTest/MaxFilter.h" 5 | #include "../../OpencvTest/OpencvTest/Utility.h" 6 | #include "../../OpencvTest/OpencvTest/BoxFilter.h" 7 | using namespace std; 8 | using namespace cv; 9 | 10 | void BoxBlur_1(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Channel, int Radius) { 11 | TMatrix a, b; 12 | TMatrix *p1 = &a, *p2 = &b; 13 | TMatrix **p3 = &p1, **p4 = &p2; 14 | IS_CreateMatrix(Width, Height, IS_DEPTH_8U, Channel, p3); 15 | IS_CreateMatrix(Width, Height, IS_DEPTH_8U, Channel, p4); 16 | (p1)->Data = Src; 17 | (p2)->Data = Dest; 18 | BoxBlur(p1, p2, Radius, EdgeMode::Smear); 19 | } 20 | 21 | void BoxBlur_SSE(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Channel, int Radius) { 22 | TMatrix a, b; 23 | TMatrix *p1 = &a, *p2 = &b; 24 | TMatrix **p3 = &p1, **p4 = &p2; 25 | IS_CreateMatrix(Width, Height, IS_DEPTH_8U, Channel, p3); 26 | IS_CreateMatrix(Width, Height, IS_DEPTH_8U, Channel, p4); 27 | (p1)->Data = Src; 28 | (p2)->Data = Dest; 29 | BoxBlur_SSE(p1, p2, Radius, EdgeMode::Smear); 30 | } 31 | 32 | 33 | int main() { 34 | Mat src = imread("F:\\car.jpg"); 35 | int Height = src.rows; 36 | int Width = src.cols; 37 | unsigned char *Src = src.data; 38 | unsigned char *Dest = new unsigned char[Height * Width * 3]; 39 | int Stride = Width * 3; 40 | int Radius = 11; 41 | int64 st = cvGetTickCount(); 42 | for (int i = 0; i <10; i++) { 43 | //Mat temp = MaxFilter(src, Radius); 44 | BoxBlur_SSE(Src, Dest, Width, Height, Stride, 3, Radius); 45 | } 46 | double duration = (cv::getTickCount() - st) / cv::getTickFrequency() * 100; 47 | printf("%.5f\n", duration); 48 | BoxBlur_SSE(Src, Dest, Width, Height, Stride, 3, Radius); 49 | Mat dst(Height, Width, CV_8UC3, Dest); 50 | imshow("origin", src); 51 | imshow("result", dst); 52 | imwrite("F:\\res.jpg", dst); 53 | waitKey(0); 54 | return 0; 55 | } -------------------------------------------------------------------------------- 
// Gives direct access to the two 32-bit halves of a double.
// The approximations below treat X[1] as the high word (sign, exponent, top mantissa bits),
// which is the layout on little-endian targets.
union Approximation
{
	double Value;
	int X[2];
};

// Function 1: saturate an int to the range of a byte (0..255).
// Reference: http://www.cnblogs.com/zyl910/archive/2012/03/12/noifopex1.html
// Branch-free: the masks are produced with arithmetic right shifts.
unsigned char ClampToByte(int Value) {
	int Over = (signed int)(255 - Value) >> 31;		// all ones when Value > 255
	int Under = (signed int)Value >> 31;			// all ones when Value < 0
	return (unsigned char)((Value | Over) & ~Under);
}

// Function 2: clamp Value into [Min, Max].
int ClampToInt(int Value, int Min, int Max) {
	if (Value < Min) return Min;
	if (Value > Max) return Max;
	return Value;
}

// Function 3: integer division by 255 implemented with shifts.
int Div255(int Value) {
	return (((Value >> 8) + Value + 1) >> 8);
}

// Function 4: absolute value without a branch.
// Reference: https://oi-wiki.org/math/bit/
// n >> 31 is 0 for non-negative n and -1 for negative n; XOR with -1 flips every bit
// (giving |n| - 1) and subtracting -1 adds the missing one.
int Abs(int n) {
	int Sign = n >> 31;
	return (n ^ Sign) - Sign;
}

// Function 5: round to the nearest integer, halves away from zero.
// Negative inputs are handled with ceil(V - 0.5); recursing on V - 0.5 would never terminate.
double Round(double V)
{
	return (V >= 0.0) ? floor(V + 0.5) : ceil(V - 0.5);
}

// Function 6: pseudo-random number in [0, 1).
double Rand()
{
	return (double)rand() / (RAND_MAX + 1.0);
}

// Function 7: fast approximation of pow for double and float.
// Reference: http://www.cvchina.info/2010/03/19/log-pow-exp-approximation/
// Reference: http://martin.ankerl.com/2007/10/04/optimized-pow-approximation-for-java-and-c-c/
// Intended only for speed: the source comment quotes an error of roughly 5%-12%.
double Pow(double X, double Y)
{
	Approximation V = { X };
	V.X[1] = (int)(Y * (V.X[1] - 1072632447) + 1072632447);
	V.X[0] = 0;
	return V.Value;
}

float Pow(float X, float Y)
{
	Approximation V = { X };
	V.X[1] = (int)(Y * (V.X[1] - 1072632447) + 1072632447);
	V.X[0] = 0;
	return (float)V.Value;
}

// Function 8: fast approximation of exp for double and float.
// Writing the high word through the union was found to be the faster variant.
double Exp(double Y)
{
	Approximation V;
	V.X[1] = (int)(Y * 1485963 + 1072632447);
	V.X[0] = 0;
	return V.Value;
}

float Exp(float Y)
{
	Approximation V;
	V.X[1] = (int)(Y * 1485963 + 1072632447);
	V.X[0] = 0;
	return (float)V.Value;
}

// Function 9: a more accurate but slightly slower pow approximation.
// Reference: http://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/
// The integer part of the exponent is applied exactly by repeated squaring; only the
// fractional part goes through the bit-level approximation, so large Y stays much more precise.
double PrecisePow(double X, double Y) {
	// Approximate X^(fractional part of Y).
	int e = (int)Y;
	Approximation V = { X };
	V.X[1] = (int)((Y - e) * (V.X[1] - 1072632447) + 1072632447);
	V.X[0] = 0;
	// Exponentiation by squaring on the magnitude of the integer part. Working on an unsigned
	// magnitude matters: an arithmetic right shift of a negative int sticks at -1 and never
	// reaches zero.
	unsigned int n = (e < 0) ? (0u - (unsigned int)e) : (unsigned int)e;
	double r = 1.0;
	while (n)
	{
		if (n & 1) r *= X;
		X *= X;
		n >>= 1;
	}
	if (e < 0) r = 1.0 / r;		// X^(-k) == 1 / X^k
	return r * V.Value;
}

// Function 10: pseudo-random integer in [Min, Max] (both inclusive).
int Random(int Min, int Max) {
	return rand() % (Max + 1 - Min) + Min;
}

// Function 11: sign function, returns 1, -1 or 0.
int sgn(int X) {
	if (X > 0) return 1;
	if (X < 0) return -1;
	return 0;
}

// Function 12: split a packed colour into its components.
// R is taken from bits 0-7, G from bits 8-15 and B from bits 16-23.
void GetRGB(int Color, int *R, int *G, int *B) {
	*R = Color & 255;
	*G = (Color & 65280) / 256;
	*B = (Color & 16711680) / 65536;
}

// Function 13: approximate square root of a float.
// Reference: https://www.cnblogs.com/qlky/p/7735145.html
// Computes the fast inverse square root (magic-constant initial guess plus three Newton
// steps) and returns its reciprocal. Only valid for float, not double.
float Sqrt(float X)
{
	float HalfX = 0.5f * X;
	// Reinterpret the bits through a union rather than pointer casts, which break strict aliasing.
	union { float F; int I; } Bits;
	Bits.F = X;
	Bits.I = 0x5f375a86 - (Bits.I >> 1);			// initial guess y0
	X = Bits.F;
	X = X * (1.5f - HalfX * X * X);					// Newton step; each repetition increases accuracy
	X = X * (1.5f - HalfX * X * X);
	X = X * (1.5f - HalfX * X * X);
	return 1 / X;
}
// Function 14: add two 256-bin unsigned-short histograms, Y = X + Y.
// Processed as 32 SSE vectors of eight 16-bit lanes; both pointers are accessed with
// aligned loads/stores and therefore must be 16-byte aligned.
void HistgramAddShort(unsigned short *X, unsigned short *Y)
{
	for (int Bin = 0; Bin < 256; Bin += 8)
	{
		__m128i Acc = _mm_load_si128((const __m128i *)(Y + Bin));
		__m128i Inc = _mm_load_si128((const __m128i *)(X + Bin));
		_mm_store_si128((__m128i *)(Y + Bin), _mm_add_epi16(Acc, Inc));
	}
}

// Function 15: subtract two 256-bin unsigned-short histograms, Y = Y - X.
// Same layout and alignment requirements as HistgramAddShort.
void HistgramSubShort(unsigned short *X, unsigned short *Y)
{
	for (int Bin = 0; Bin < 256; Bin += 8)
	{
		__m128i Acc = _mm_load_si128((const __m128i *)(Y + Bin));
		__m128i Dec = _mm_load_si128((const __m128i *)(X + Bin));
		_mm_store_si128((__m128i *)(Y + Bin), _mm_sub_epi16(Acc, Dec));
	}
}

// Function 16: combined update of a 256-bin unsigned-short histogram, Z = Z + Y - X.
// Same layout and alignment requirements as HistgramAddShort.
void HistgramSubAddShort(unsigned short *X, unsigned short *Y, unsigned short *Z)
{
	for (int Bin = 0; Bin < 256; Bin += 8)
	{
		__m128i Added = _mm_add_epi16(_mm_load_si128((const __m128i *)(Y + Bin)), _mm_load_si128((const __m128i *)(Z + Bin)));
		__m128i Removed = _mm_load_si128((const __m128i *)(X + Bin));
		_mm_store_si128((__m128i *)(Z + Bin), _mm_sub_epi16(Added, Removed));
	}
}
*(__m128i*)&X[144]); 264 | *(__m128i*)(Z + 152) = _mm_sub_epi16(_mm_add_epi16(*(__m128i*)&Y[152], *(__m128i*)&Z[152]), *(__m128i*)&X[152]); 265 | *(__m128i*)(Z + 160) = _mm_sub_epi16(_mm_add_epi16(*(__m128i*)&Y[160], *(__m128i*)&Z[160]), *(__m128i*)&X[160]); 266 | *(__m128i*)(Z + 168) = _mm_sub_epi16(_mm_add_epi16(*(__m128i*)&Y[168], *(__m128i*)&Z[168]), *(__m128i*)&X[168]); 267 | *(__m128i*)(Z + 176) = _mm_sub_epi16(_mm_add_epi16(*(__m128i*)&Y[176], *(__m128i*)&Z[176]), *(__m128i*)&X[176]); 268 | *(__m128i*)(Z + 184) = _mm_sub_epi16(_mm_add_epi16(*(__m128i*)&Y[184], *(__m128i*)&Z[184]), *(__m128i*)&X[184]); 269 | *(__m128i*)(Z + 192) = _mm_sub_epi16(_mm_add_epi16(*(__m128i*)&Y[192], *(__m128i*)&Z[192]), *(__m128i*)&X[192]); 270 | *(__m128i*)(Z + 200) = _mm_sub_epi16(_mm_add_epi16(*(__m128i*)&Y[200], *(__m128i*)&Z[200]), *(__m128i*)&X[200]); 271 | *(__m128i*)(Z + 208) = _mm_sub_epi16(_mm_add_epi16(*(__m128i*)&Y[208], *(__m128i*)&Z[208]), *(__m128i*)&X[208]); 272 | *(__m128i*)(Z + 216) = _mm_sub_epi16(_mm_add_epi16(*(__m128i*)&Y[216], *(__m128i*)&Z[216]), *(__m128i*)&X[216]); 273 | *(__m128i*)(Z + 224) = _mm_sub_epi16(_mm_add_epi16(*(__m128i*)&Y[224], *(__m128i*)&Z[224]), *(__m128i*)&X[224]); 274 | *(__m128i*)(Z + 232) = _mm_sub_epi16(_mm_add_epi16(*(__m128i*)&Y[232], *(__m128i*)&Z[232]), *(__m128i*)&X[232]); 275 | *(__m128i*)(Z + 240) = _mm_sub_epi16(_mm_add_epi16(*(__m128i*)&Y[240], *(__m128i*)&Z[240]), *(__m128i*)&X[240]); 276 | *(__m128i*)(Z + 248) = _mm_sub_epi16(_mm_add_epi16(*(__m128i*)&Y[248], *(__m128i*)&Z[248]), *(__m128i*)&X[248]); 277 | } 278 | -------------------------------------------------------------------------------- /speed_gaussian_filter_sse.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace std; 5 | using namespace cv; 6 | 7 | void CalcGaussCof(float Radius, float &B0, float &B1, float &B2, float &B3) 8 | { 9 | float Q, B; 10 | if (Radius >= 2.5) 11 | Q = 
(double)(0.98711 * Radius - 0.96330); // 对应论文公式11b 12 | else if ((Radius >= 0.5) && (Radius < 2.5)) 13 | Q = (double)(3.97156 - 4.14554 * sqrt(1 - 0.26891 * Radius)); 14 | else 15 | Q = (double)0.1147705018520355224609375; 16 | 17 | B = 1.57825 + 2.44413 * Q + 1.4281 * Q * Q + 0.422205 * Q * Q * Q; // 对应论文公式8c 18 | B1 = 2.44413 * Q + 2.85619 * Q * Q + 1.26661 * Q * Q * Q; 19 | B2 = -1.4281 * Q * Q - 1.26661 * Q * Q * Q; 20 | B3 = 0.422205 * Q * Q * Q; 21 | 22 | B0 = 1.0 - (B1 + B2 + B3) / B; 23 | B1 = B1 / B; 24 | B2 = B2 / B; 25 | B3 = B3 / B; 26 | } 27 | 28 | void ConvertBGR8U2BGRAF(unsigned char *Src, float *Dest, int Width, int Height, int Stride) 29 | { 30 | //#pragma omp parallel for 31 | for (int Y = 0; Y < Height; Y++) 32 | { 33 | unsigned char *LinePS = Src + Y * Stride; 34 | float *LinePD = Dest + Y * Width * 3; 35 | for (int X = 0; X < Width; X++, LinePS += 3, LinePD += 3) 36 | { 37 | LinePD[0] = LinePS[0]; LinePD[1] = LinePS[1]; LinePD[2] = LinePS[2]; 38 | } 39 | } 40 | } 41 | 42 | void ConvertBGR8U2BGRAF_SSE(unsigned char *Src, float *Dest, int Width, int Height, int Stride) { 43 | const int BlockSize = 4; 44 | int Block = (Width - 2) / BlockSize; 45 | __m128i Mask = _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1); 46 | __m128i Zero = _mm_setzero_si128(); 47 | for (int Y = 0; Y < Height; Y++) { 48 | unsigned char *LinePS = Src + Y * Stride; 49 | float *LinePD = Dest + Y * Width * 4; 50 | int X = 0; 51 | for (; X < Block * BlockSize; X += BlockSize, LinePS += BlockSize * 3, LinePD += BlockSize * 4) { 52 | __m128i SrcV = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)LinePS), Mask); 53 | __m128i Src16L = _mm_unpacklo_epi8(SrcV, Zero); 54 | __m128i Src16H = _mm_unpackhi_epi8(SrcV, Zero); 55 | _mm_store_ps(LinePD + 0, _mm_cvtepi32_ps(_mm_unpacklo_epi16(Src16L, Zero))); 56 | _mm_store_ps(LinePD + 4, _mm_cvtepi32_ps(_mm_unpackhi_epi16(Src16L, Zero))); 57 | _mm_store_ps(LinePD + 8, _mm_cvtepi32_ps(_mm_unpacklo_epi16(Src16H, Zero))); 58 | 
_mm_store_ps(LinePD + 12, _mm_cvtepi32_ps(_mm_unpackhi_epi16(Src16H, Zero))); 59 | } 60 | for (; X < Width; X++, LinePS += 3, LinePD += 4) { 61 | LinePD[0] = LinePS[0]; LinePD[1] = LinePS[1]; LinePD[2] = LinePS[2]; LinePD[3] = 0; 62 | } 63 | } 64 | } 65 | 66 | void GaussBlurFromLeftToRight(float *Data, int Width, int Height, float B0, float B1, float B2, float B3) 67 | { 68 | //#pragma omp parallel for 69 | for (int Y = 0; Y < Height; Y++) 70 | { 71 | float *LinePD = Data + Y * Width * 3; 72 | //w[n-1], w[n-2], w[n-3] 73 | float BS1 = LinePD[0], BS2 = LinePD[0], BS3 = LinePD[0]; //边缘处使用重复像素的方案 74 | float GS1 = LinePD[1], GS2 = LinePD[1], GS3 = LinePD[1]; 75 | float RS1 = LinePD[2], RS2 = LinePD[2], RS3 = LinePD[2]; 76 | for (int X = 0; X < Width; X++, LinePD += 3) 77 | { 78 | LinePD[0] = LinePD[0] * B0 + BS1 * B1 + BS2 * B2 + BS3 * B3; 79 | LinePD[1] = LinePD[1] * B0 + GS1 * B1 + GS2 * B2 + GS3 * B3; // 进行顺向迭代 80 | LinePD[2] = LinePD[2] * B0 + RS1 * B1 + RS2 * B2 + RS3 * B3; 81 | BS3 = BS2, BS2 = BS1, BS1 = LinePD[0]; 82 | GS3 = GS2, GS2 = GS1, GS1 = LinePD[1]; 83 | RS3 = RS2, RS2 = RS1, RS1 = LinePD[2]; 84 | } 85 | } 86 | } 87 | 88 | void GaussBlurFromLeftToRight_SSE(float *Data, int Width, int Height, float B0, float B1, float B2, float B3) { 89 | const __m128 CofB0 = _mm_set_ps(0, B0, B0, B0); 90 | const __m128 CofB1 = _mm_set_ps(0, B1, B1, B1); 91 | const __m128 CofB2 = _mm_set_ps(0, B2, B2, B2); 92 | const __m128 CofB3 = _mm_set_ps(0, B3, B3, B3); 93 | for (int Y = 0; Y < Height; Y++) { 94 | float *LinePD = Data + Y * Width * 4; 95 | __m128 V1 = _mm_set_ps(LinePD[3], LinePD[2], LinePD[1], LinePD[0]); 96 | __m128 V2 = V1, V3 = V1; 97 | for (int X = 0; X < Width; X++, LinePD += 4) { 98 | __m128 V0 = _mm_load_ps(LinePD); 99 | __m128 V01 = _mm_add_ps(_mm_mul_ps(CofB0, V0), _mm_mul_ps(CofB1, V1)); 100 | __m128 V23 = _mm_add_ps(_mm_mul_ps(CofB2, V2), _mm_mul_ps(CofB3, V3)); 101 | __m128 V = _mm_add_ps(V01, V23); 102 | V3 = V2; V2 = V1; V1 = V; 103 | 
// Backward (right-to-left) recursive pass over every row of a 3-floats-per-pixel image, in place.
//   Data           : Height rows of Width pixels, row pitch Width * 3 floats.
//   B0             : weight of the current sample.
//   B1, B2, B3     : weights of w[n+1], w[n+2], w[n+3] (the previously filtered samples to the right).
// The three history samples start as copies of the last pixel of the row (edge replication).
void GaussBlurFromRightToLeft(float *Data, int Width, int Height, float B0, float B1, float B2, float B3) {
	if (Width <= 0) return;
	for (int Y = 0; Y < Height; Y++) {
		// Start ON the last pixel of the row. An offset of Width * 3 points one pixel past
		// the row, which seeds from and writes to the next row and never filters pixel 0.
		float *LinePD = Data + Y * Width * 3 + (Width - 1) * 3;
		// xS3 = w[n+1], xS2 = w[n+2], xS1 = w[n+3]
		float BS1 = LinePD[0], BS2 = LinePD[0], BS3 = LinePD[0];
		float GS1 = LinePD[1], GS2 = LinePD[1], GS3 = LinePD[1];
		float RS1 = LinePD[2], RS2 = LinePD[2], RS3 = LinePD[2];
		for (int X = Width - 1; X >= 0; X--, LinePD -= 3)
		{
			LinePD[0] = LinePD[0] * B0 + BS3 * B1 + BS2 * B2 + BS1 * B3;
			LinePD[1] = LinePD[1] * B0 + GS3 * B1 + GS2 * B2 + GS1 * B3;		// backward iteration
			LinePD[2] = LinePD[2] * B0 + RS3 * B1 + RS2 * B2 + RS1 * B3;
			BS1 = BS2, BS2 = BS3, BS3 = LinePD[0];
			GS1 = GS2, GS2 = GS3, GS3 = LinePD[1];
			RS1 = RS2, RS2 = RS3, RS3 = LinePD[2];
		}
	}
}

// SSE backward (right-to-left) recursive pass over a 4-floats-per-pixel image, in place.
// The coefficient of the fourth lane is 0, so that lane is written as 0.
// Data is accessed with aligned loads/stores and must be 16-byte aligned.
void GaussBlurFromRightToLeft_SSE(float *Data, int Width, int Height, float B0, float B1, float B2, float B3) {
	if (Width <= 0) return;
	const __m128 CofB0 = _mm_set_ps(0, B0, B0, B0);
	const __m128 CofB1 = _mm_set_ps(0, B1, B1, B1);
	const __m128 CofB2 = _mm_set_ps(0, B2, B2, B2);
	const __m128 CofB3 = _mm_set_ps(0, B3, B3, B3);
	for (int Y = 0; Y < Height; Y++) {
		// Start ON the last pixel of the row (see GaussBlurFromRightToLeft).
		float *LinePD = Data + Y * Width * 4 + (Width - 1) * 4;
		// V3 = w[n+1], V2 = w[n+2], V1 = w[n+3]
		__m128 V1 = _mm_set_ps(LinePD[3], LinePD[2], LinePD[1], LinePD[0]);
		__m128 V2 = V1, V3 = V1;
		for (int X = Width - 1; X >= 0; X--, LinePD -= 4) {
			__m128 V0 = _mm_load_ps(LinePD);
			__m128 V03 = _mm_add_ps(_mm_mul_ps(CofB0, V0), _mm_mul_ps(CofB1, V3));
			__m128 V12 = _mm_add_ps(_mm_mul_ps(CofB2, V2), _mm_mul_ps(CofB3, V1));
			__m128 V = _mm_add_ps(V03, V12);
			V1 = V2; V2 = V3; V3 = V;
			_mm_store_ps(LinePD, V);
		}
	}
}
float B2, float B3) 150 | { 151 | for (int Y = 0; Y < Height; Y++) 152 | { 153 | float *LinePD3 = Data + (Y + 0) * Width * 3; 154 | float *LinePD2 = Data + (Y + 1) * Width * 3; 155 | float *LinePD1 = Data + (Y + 2) * Width * 3; 156 | float *LinePD0 = Data + (Y + 3) * Width * 3; 157 | for (int X = 0; X < Width; X++, LinePD0 += 3, LinePD1 += 3, LinePD2 += 3, LinePD3 += 3) 158 | { 159 | LinePD0[0] = LinePD0[0] * B0 + LinePD1[0] * B1 + LinePD2[0] * B2 + LinePD3[0] * B3; 160 | LinePD0[1] = LinePD0[1] * B0 + LinePD1[1] * B1 + LinePD2[1] * B2 + LinePD3[1] * B3; 161 | LinePD0[2] = LinePD0[2] * B0 + LinePD1[2] * B1 + LinePD2[2] * B2 + LinePD3[2] * B3; 162 | } 163 | } 164 | } 165 | 166 | void GaussBlurFromTopToBottom_SSE(float *Data, int Width, int Height, float B0, float B1, float B2, float B3){ 167 | const __m128 CofB0 = _mm_set_ps(0, B0, B0, B0); 168 | const __m128 CofB1 = _mm_set_ps(0, B1, B1, B1); 169 | const __m128 CofB2 = _mm_set_ps(0, B2, B2, B2); 170 | const __m128 CofB3 = _mm_set_ps(0, B3, B3, B3); 171 | for (int Y = 0; Y < Height; Y++) 172 | { 173 | float *LinePS3 = Data + (Y + 0) * Width * 4; 174 | float *LinePS2 = Data + (Y + 1) * Width * 4; 175 | float *LinePS1 = Data + (Y + 2) * Width * 4; 176 | float *LinePS0 = Data + (Y + 3) * Width * 4; 177 | for (int X = 0; X < Width * 4; X += 4) 178 | { 179 | __m128 V3 = _mm_load_ps(LinePS3 + X); 180 | __m128 V2 = _mm_load_ps(LinePS2 + X); 181 | __m128 V1 = _mm_load_ps(LinePS1 + X); 182 | __m128 V0 = _mm_load_ps(LinePS0 + X); 183 | __m128 V01 = _mm_add_ps(_mm_mul_ps(CofB0, V0), _mm_mul_ps(CofB1, V1)); 184 | __m128 V23 = _mm_add_ps(_mm_mul_ps(CofB2, V2), _mm_mul_ps(CofB3, V3)); 185 | _mm_store_ps(LinePS0 + X, _mm_add_ps(V01, V23)); 186 | } 187 | } 188 | } 189 | //w[n] w[n+1], w[n+2], w[n+3] 190 | void GaussBlurFromBottomToTop(float *Data, int Width, int Height, float B0, float B1, float B2, float B3) { 191 | for (int Y = Height - 1; Y >= 0; Y--) { 192 | float *LinePD3 = Data + (Y + 3) * Width * 3; 193 | float *LinePD2 = 
Data + (Y + 2) * Width * 3; 194 | float *LinePD1 = Data + (Y + 1) * Width * 3; 195 | float *LinePD0 = Data + (Y + 0) * Width * 3; 196 | for (int X = 0; X < Width; X++, LinePD0 += 3, LinePD1 += 3, LinePD2 += 3, LinePD3 += 3) { 197 | LinePD0[0] = LinePD0[0] * B0 + LinePD1[0] * B1 + LinePD2[0] * B2 + LinePD3[0] * B3; 198 | LinePD0[1] = LinePD0[1] * B0 + LinePD1[1] * B1 + LinePD2[1] * B2 + LinePD3[1] * B3; 199 | LinePD0[2] = LinePD0[2] * B0 + LinePD1[2] * B1 + LinePD2[2] * B2 + LinePD3[2] * B3; 200 | } 201 | } 202 | } 203 | 204 | void GaussBlurFromBottomToTop_SSE(float *Data, int Width, int Height, float B0, float B1, float B2, float B3) { 205 | const __m128 CofB0 = _mm_set_ps(0, B0, B0, B0); 206 | const __m128 CofB1 = _mm_set_ps(0, B1, B1, B1); 207 | const __m128 CofB2 = _mm_set_ps(0, B2, B2, B2); 208 | const __m128 CofB3 = _mm_set_ps(0, B3, B3, B3); 209 | for (int Y = Height - 1; Y >= 0; Y--) { 210 | float *LinePS3 = Data + (Y + 3) * Width * 4; 211 | float *LinePS2 = Data + (Y + 2) * Width * 4; 212 | float *LinePS1 = Data + (Y + 1) * Width * 4; 213 | float *LinePS0 = Data + (Y + 0) * Width * 4; 214 | for (int X = 0; X < Width * 4; X += 4) { 215 | __m128 V3 = _mm_load_ps(LinePS3 + X); 216 | __m128 V2 = _mm_load_ps(LinePS2 + X); 217 | __m128 V1 = _mm_load_ps(LinePS1 + X); 218 | __m128 V0 = _mm_load_ps(LinePS0 + X); 219 | __m128 V01 = _mm_add_ps(_mm_mul_ps(CofB0, V0), _mm_mul_ps(CofB1, V1)); 220 | __m128 V23 = _mm_add_ps(_mm_mul_ps(CofB2, V2), _mm_mul_ps(CofB3, V3)); 221 | _mm_store_ps(LinePS0 + X, _mm_add_ps(V01, V23)); 222 | } 223 | } 224 | } 225 | 226 | void ConvertBGRAF2BGR8U(float *Src, unsigned char *Dest, int Width, int Height, int Stride) 227 | { 228 | //#pragma omp parallel for 229 | for (int Y = 0; Y < Height; Y++) 230 | { 231 | float *LinePS = Src + Y * Width * 3; 232 | unsigned char *LinePD = Dest + Y * Stride; 233 | for (int X = 0; X < Width; X++, LinePS += 3, LinePD += 3) 234 | { 235 | LinePD[0] = LinePS[0]; LinePD[1] = LinePS[1]; LinePD[2] = LinePS[2]; 
236 | } 237 | } 238 | } 239 | 240 | 241 | void ConvertBGRAF2BGR8U_SSE(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 242 | const int BlockSize = 4; 243 | int Block = (Width - 2) / BlockSize; 244 | //__m128i Mask = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15); 245 | __m128i MaskB = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 246 | __m128i MaskG = _mm_setr_epi8(1, 5, 9, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 247 | __m128i MaskR = _mm_setr_epi8(2, 6, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); 248 | __m128i Zero = _mm_setzero_si128(); 249 | for (int Y = 0; Y < Height; Y++) { 250 | unsigned char *LinePS = Src + Y * Width * 4; 251 | unsigned char *LinePD = Dest + Y * Stride; 252 | int X = 0; 253 | for (; X < Block * BlockSize; X += BlockSize, LinePS += BlockSize * 4, LinePD += BlockSize * 3) { 254 | __m128i SrcV = _mm_loadu_si128((const __m128i*)LinePS); 255 | __m128i B = _mm_shuffle_epi8(SrcV, MaskB); 256 | __m128i G = _mm_shuffle_epi8(SrcV, MaskG); 257 | __m128i R = _mm_shuffle_epi8(SrcV, MaskR); 258 | __m128i Ans1 = Zero, Ans2 = Zero, Ans3 = Zero; 259 | Ans1 = _mm_or_si128(Ans1, _mm_shuffle_epi8(B, _mm_setr_epi8(0, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 260 | Ans1 = _mm_or_si128(Ans1, _mm_shuffle_epi8(G, _mm_setr_epi8(-1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 261 | Ans1 = _mm_or_si128(Ans1, _mm_shuffle_epi8(R, _mm_setr_epi8(-1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 262 | 263 | Ans2 = _mm_or_si128(Ans2, _mm_shuffle_epi8(B, _mm_setr_epi8(-1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 264 | Ans2 = _mm_or_si128(Ans2, _mm_shuffle_epi8(G, _mm_setr_epi8(1, -1, -1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 265 | Ans2 = _mm_or_si128(Ans2, _mm_shuffle_epi8(R, _mm_setr_epi8(-1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 266 | 267 | Ans3 = 
_mm_or_si128(Ans3, _mm_shuffle_epi8(B, _mm_setr_epi8(-1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 268 | Ans3 = _mm_or_si128(Ans3, _mm_shuffle_epi8(G, _mm_setr_epi8(-1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 269 | Ans3 = _mm_or_si128(Ans3, _mm_shuffle_epi8(R, _mm_setr_epi8(2, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))); 270 | 271 | _mm_storeu_si128((__m128i*)(LinePD + 0), Ans1); 272 | _mm_storeu_si128((__m128i*)(LinePD + 4), Ans2); 273 | _mm_storeu_si128((__m128i*)(LinePD + 8), Ans3); 274 | } 275 | for (; X < Width; X++, LinePS += 4, LinePD += 3) { 276 | LinePD[0] = LinePS[0]; LinePD[1] = LinePS[1]; LinePD[2] = LinePS[2]; 277 | } 278 | } 279 | } 280 | 281 | void GaussBlur(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, float Radius) 282 | { 283 | float B0, B1, B2, B3; 284 | float *Buffer = (float *)malloc(Width * (Height + 6) * sizeof(float) * 3); 285 | CalcGaussCof(Radius, B0, B1, B2, B3); 286 | ConvertBGR8U2BGRAF(Src, Buffer + 3 * Width * 3, Width, Height, Stride); 287 | GaussBlurFromLeftToRight(Buffer + 3 * Width * 3, Width, Height, B0, B1, B2, B3); 288 | GaussBlurFromRightToLeft(Buffer + 3 * Width * 3, Width, Height, B0, B1, B2, B3); // 如果启用多线程,建议把这个函数写到GaussBlurFromLeftToRight的for X循环里,因为这样就可以减少线程并发时的阻力 289 | 290 | memcpy(Buffer + 0 * Width * 3, Buffer + 3 * Width * 3, Width * 3 * sizeof(float)); 291 | memcpy(Buffer + 1 * Width * 3, Buffer + 3 * Width * 3, Width * 3 * sizeof(float)); 292 | memcpy(Buffer + 2 * Width * 3, Buffer + 3 * Width * 3, Width * 3 * sizeof(float)); 293 | 294 | GaussBlurFromTopToBottom(Buffer, Width, Height, B0, B1, B2, B3); 295 | 296 | memcpy(Buffer + (Height + 3) * Width * 3, Buffer + (Height + 2) * Width * 3, Width * 3 * sizeof(float)); 297 | memcpy(Buffer + (Height + 4) * Width * 3, Buffer + (Height + 2) * Width * 3, Width * 3 * sizeof(float)); 298 | memcpy(Buffer + (Height + 5) * Width * 3, Buffer + (Height + 2) * Width * 3, Width * 3 * 
sizeof(float)); 299 | 300 | GaussBlurFromBottomToTop(Buffer, Width, Height, B0, B1, B2, B3); 301 | 302 | ConvertBGRAF2BGR8U(Buffer + 3 * Width * 3, Dest, Width, Height, Stride); 303 | 304 | free(Buffer); 305 | } 306 | 307 | void GaussBlur_SSE(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, float Radius) 308 | { 309 | float B0, B1, B2, B3; 310 | float *Buffer = (float *)_mm_malloc(Width * (Height + 6) * sizeof(float) * 4, 16); 311 | CalcGaussCof(Radius, B0, B1, B2, B3); 312 | ConvertBGR8U2BGRAF_SSE(Src, Buffer + 3 * Width * 4, Width, Height, Stride); 313 | GaussBlurFromLeftToRight_SSE(Buffer + 3 * Width * 4, Width, Height, B0, B1, B2, B3); // 在SSE版本中,这两个函数占用的时间比下面两个要多,不过C语言版本也是一样的 314 | GaussBlurFromRightToLeft_SSE(Buffer + 3 * Width * 4, Width, Height, B0, B1, B2, B3); // 如果启用多线程,建议把这个函数写到GaussBlurFromLeftToRight的for X循环里,因为这样就可以减少线程并发时的阻力 315 | 316 | memcpy(Buffer + 0 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float)); 317 | memcpy(Buffer + 1 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float)); 318 | memcpy(Buffer + 2 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float)); 319 | 320 | GaussBlurFromTopToBottom_SSE(Buffer, Width, Height, B0, B1, B2, B3); 321 | 322 | memcpy(Buffer + (Height + 3) * Width * 4, Buffer + (Height + 2) * Width * 4, Width * 4 * sizeof(float)); 323 | memcpy(Buffer + (Height + 4) * Width * 4, Buffer + (Height + 2) * Width * 4, Width * 4 * sizeof(float)); 324 | memcpy(Buffer + (Height + 5) * Width * 4, Buffer + (Height + 2) * Width * 4, Width * 4 * sizeof(float)); 325 | 326 | GaussBlurFromBottomToTop_SSE(Buffer, Width, Height, B0, B1, B2, B3); 327 | 328 | ConvertBGRAF2BGR8U_SSE(Buffer + 3 * Width * 4, Dest, Width, Height, Stride); 329 | 330 | _mm_free(Buffer); 331 | } 332 | 333 | int main() { 334 | Mat src = imread("F:\\car.jpg"); 335 | int Height = src.rows; 336 | int Width = src.cols; 337 | unsigned char *Src = src.data; 338 | unsigned char *Dest = new unsigned char[Height * Width 
* 3]; 339 | int Stride = Width * 3; 340 | int Radius = 11; 341 | int64 st = cvGetTickCount(); 342 | for (int i = 0; i < 20; i++) { 343 | GaussBlur_SSE(Src, Dest, Width, Height, Stride, Radius); 344 | } 345 | double duration = (cv::getTickCount() - st) / cv::getTickFrequency() * 50; 346 | printf("%.5f\n", duration); 347 | GaussBlur_SSE(Src, Dest, Width, Height, Stride, Radius); 348 | Mat dst(Height, Width, CV_8UC3, Dest); 349 | imshow("origin", src); 350 | imshow("result", dst); 351 | imwrite("F:\\res.jpg", dst); 352 | waitKey(0); 353 | } -------------------------------------------------------------------------------- /speed_histogram_algorithm_framework/BoxFilter.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/Image-processing-algorithm-Speed/d22063d5c5b45466e80787e33ffe8a430c435854/speed_histogram_algorithm_framework/BoxFilter.h -------------------------------------------------------------------------------- /speed_histogram_algorithm_framework/Core.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/Image-processing-algorithm-Speed/d22063d5c5b45466e80787e33ffe8a430c435854/speed_histogram_algorithm_framework/Core.h -------------------------------------------------------------------------------- /speed_histogram_algorithm_framework/MaxFilter.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Core.h" 3 | #include "Utility.h" 4 | 5 | // 函数供能: 在指定半径内,最大值”滤镜用周围像素的最高亮度值替换当前像素的亮度值。 6 | // 参数列表: 7 | // Src: 需要处理的源图像的数据结构 8 | // Dest: 保存处理后的图像的数据结构 9 | // Radius: 半径,有效范围 10 | // 说明: 11 | // 1、程序的执行时间和半径基本无关,但和图像内容有关 12 | // 2、Src和Dest可以相同,不同时执行速度很快 13 | // 3、对于各向异性的图像来说,执行速度很快,对于有大面积相同像素的图像,速度会慢一点 14 | 15 | IS_RET MaxFilter(TMatrix *Src, TMatrix *Dest, int Radius) 16 | { 17 | if (Src == NULL || Dest == NULL) return IS_RET_ERR_NULLREFERENCE; 18 | if (Src->Data == NULL 
|| Dest->Data == NULL) return IS_RET_ERR_NULLREFERENCE; 19 | if (Src->Width != Dest->Width || Src->Height != Dest->Height || Src->Channel != Dest->Channel || Src->Depth != Dest->Depth || Src->WidthStep != Dest->WidthStep) return IS_RET_ERR_PARAMISMATCH; 20 | if (Src->Depth != IS_DEPTH_8U || Dest->Depth != IS_DEPTH_8U) return IS_RET_ERR_NOTSUPPORTED; 21 | if (Radius < 0 || Radius >= 127) return IS_RET_ERR_ARGUMENTOUTOFRANGE; 22 | 23 | IS_RET Ret = IS_RET_OK; 24 | 25 | if (Src->Data == Dest->Data) 26 | { 27 | TMatrix *Clone = NULL; 28 | Ret = IS_CloneMatrix(Src, &Clone); 29 | if (Ret != IS_RET_OK) return Ret; 30 | Ret = MaxFilter(Clone, Dest, Radius); 31 | IS_FreeMatrix(&Clone); 32 | return Ret; 33 | } 34 | if (Src->Channel == 1) 35 | { 36 | TMatrix *Row = NULL, *Col = NULL; 37 | unsigned char *LinePS, *LinePD; 38 | int X, Y, K, Width = Src->Width, Height = Src->Height; 39 | int *RowOffset, *ColOffSet; 40 | 41 | unsigned short *ColHist = (unsigned short *)IS_AllocMemory(256 * (Width + 2 * Radius) * sizeof(unsigned short), true); 42 | if (ColHist == NULL) { Ret = IS_RET_ERR_OUTOFMEMORY; goto Done8; } 43 | unsigned short *Hist = (unsigned short *)IS_AllocMemory(256 * sizeof(unsigned short), true); 44 | if (Hist == NULL) { Ret = IS_RET_ERR_OUTOFMEMORY; goto Done8; } 45 | Ret = GetValidCoordinate(Width, Height, Radius, Radius, Radius, Radius, EdgeMode::Smear, &Row, &Col); // 获取坐标偏移量 46 | if (Ret != IS_RET_OK) goto Done8; 47 | 48 | ColHist += Radius * 256; RowOffset = ((int *)Row->Data) + Radius; 49 | ColOffSet = ((int *)Col->Data) + Radius; // 进行偏移以便操作 50 | 51 | for (Y = 0; Y < Height; Y++) 52 | { 53 | if (Y == 0) // 第一行的列直方图,要重头计算 54 | { 55 | for (K = -Radius; K <= Radius; K++) 56 | { 57 | LinePS = Src->Data + ColOffSet[K] * Src->WidthStep; 58 | for (X = -Radius; X < Width + Radius; X++) 59 | { 60 | ColHist[X * 256 + LinePS[RowOffset[X]]]++; 61 | } 62 | } 63 | } 64 | else // 其他行的列直方图,更新就可以了 65 | { 66 | LinePS = Src->Data + ColOffSet[Y - Radius - 1] * Src->WidthStep; 67 
| for (X = -Radius; X < Width + Radius; X++) // 删除移出范围内的那一行的直方图数据 68 | { 69 | ColHist[X * 256 + LinePS[RowOffset[X]]]--; 70 | } 71 | 72 | LinePS = Src->Data + ColOffSet[Y + Radius] * Src->WidthStep; 73 | for (X = -Radius; X < Width + Radius; X++) // 增加进入范围内的那一行的直方图数据 74 | { 75 | ColHist[X * 256 + LinePS[RowOffset[X]]]++; 76 | } 77 | } 78 | 79 | memset(Hist, 0, 256 * sizeof(unsigned short)); // 每一行直方图数据清零先 80 | 81 | LinePD = Dest->Data + Y * Dest->WidthStep; 82 | 83 | for (X = 0; X < Width; X++) 84 | { 85 | if (X == 0) 86 | { 87 | for (K = -Radius; K <= Radius; K++) // 行第一个像素,需要重新计算 88 | HistgramAddShort(ColHist + K * 256, Hist); 89 | } 90 | else 91 | { 92 | /* HistgramAddShort(ColHist + RowOffset[X + Radius] * 256, Hist); 93 | HistgramSubShort(ColHist + RowOffset[X - Radius - 1] * 256, Hist); 94 | */ 95 | HistgramSubAddShort(ColHist + RowOffset[X - Radius - 1] * 256, ColHist + RowOffset[X + Radius] * 256, Hist); // 行内其他像素,依次删除和增加就可以了 96 | } 97 | for (K = 255; K >= 0; K--) 98 | { 99 | if (Hist[K] != 0) 100 | { 101 | LinePD[X] = K; 102 | break; 103 | } 104 | } 105 | } 106 | } 107 | ColHist -= Radius * 256; // 恢复偏移操作 108 | Done8: 109 | IS_FreeMatrix(&Row); 110 | IS_FreeMatrix(&Col); 111 | IS_FreeMemory(ColHist); 112 | IS_FreeMemory(Hist); 113 | return Ret; 114 | } 115 | else 116 | { 117 | TMatrix *Blue = NULL, *Green = NULL, *Red = NULL, *Alpha = NULL; // 由于C变量如果不初始化,其值是随机值,可能会导致释放时的错误。 118 | IS_RET Ret = SplitRGBA(Src, &Blue, &Green, &Red, &Alpha); 119 | if (Ret != IS_RET_OK) goto Done24; 120 | Ret = MaxFilter(Blue, Blue, Radius); 121 | if (Ret != IS_RET_OK) goto Done24; 122 | Ret = MaxFilter(Green, Green, Radius); 123 | if (Ret != IS_RET_OK) goto Done24; 124 | Ret = MaxFilter(Red, Red, Radius); 125 | if (Ret != IS_RET_OK) goto Done24; // 32位的Alpha不做任何处理,实际上32位的相关算法基本上是不能分通道处理的 126 | CopyAlphaChannel(Src, Dest); 127 | Ret = CombineRGBA(Dest, Blue, Green, Red, Alpha); 128 | Done24: 129 | IS_FreeMatrix(&Blue); 130 | IS_FreeMatrix(&Green); 131 | IS_FreeMatrix(&Red); 132 
| IS_FreeMatrix(&Alpha); 133 | return Ret; 134 | } 135 | } -------------------------------------------------------------------------------- /speed_histogram_algorithm_framework/SelectiveBlur.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Core.h" 3 | #include "Utility.h" 4 | 5 | void Calc(unsigned short *Hist, int Intensity, unsigned char *&Pixel, int Threshold) 6 | { 7 | int K, Low, High, Sum = 0, Weight = 0; 8 | Low = Intensity - Threshold; High = Intensity + Threshold; 9 | if (Low < 0) Low = 0; 10 | if (High > 255) High = 255; 11 | for (K = Low; K <= High; K++) 12 | { 13 | Weight += Hist[K]; 14 | Sum += Hist[K] * K; 15 | } 16 | if (Weight != 0) *Pixel = Sum / Weight; 17 | } 18 | 19 | // 函数供能: 在指定半径内,实现图像选择性模糊效果。 20 | // 参数列表: 21 | // Src: 需要处理的源图像的数据结构 22 | // Dest: 保存处理后的图像的数据结构 23 | // Radius: 半径,有效范围 24 | // 说明: 25 | // 1、程序的执行时间和半径基本无关,但和图像内容有关 26 | // 2、Src和Dest可以相同,不同时执行速度很快 27 | // 3、对于各向异性的图像来说,执行速度很快,对于有大面积相同像素的图像,速度会慢一点 28 | 29 | IS_RET SelectiveBlur(TMatrix *Src, TMatrix *Dest, int Radius, int Threshold, EdgeMode Edge) 30 | { 31 | if (Src == NULL || Dest == NULL) return IS_RET_ERR_NULLREFERENCE; 32 | if (Src->Data == NULL || Dest->Data == NULL) return IS_RET_ERR_NULLREFERENCE; 33 | if (Src->Width != Dest->Width || Src->Height != Dest->Height || Src->Channel != Dest->Channel || Src->Depth != Dest->Depth || Src->WidthStep != Dest->WidthStep) return IS_RET_ERR_PARAMISMATCH; 34 | if (Src->Depth != IS_DEPTH_8U || Dest->Depth != IS_DEPTH_8U) return IS_RET_ERR_NOTSUPPORTED; 35 | if (Radius < 0 || Radius >= 127 || Threshold < 2 || Threshold > 255) return IS_RET_ERR_ARGUMENTOUTOFRANGE; 36 | 37 | IS_RET Ret = IS_RET_OK; 38 | 39 | if (Src->Data == Dest->Data) 40 | { 41 | TMatrix *Clone = NULL; 42 | Ret = IS_CloneMatrix(Src, &Clone); 43 | if (Ret != IS_RET_OK) return Ret; 44 | Ret = SelectiveBlur(Clone, Dest, Radius, Threshold, Edge); 45 | IS_FreeMatrix(&Clone); 46 | return Ret; 47 | } 48 | if 
(Src->Channel == 1) 49 | { 50 | TMatrix *Row = NULL, *Col = NULL; 51 | unsigned char *LinePS, *LinePD; 52 | int X, Y, K, Width = Src->Width, Height = Src->Height; 53 | int *RowOffset, *ColOffSet; 54 | 55 | unsigned short *ColHist = (unsigned short *)IS_AllocMemory(256 * (Width + 2 * Radius) * sizeof(unsigned short), true); 56 | if (ColHist == NULL) { Ret = IS_RET_ERR_OUTOFMEMORY; goto Done8; } 57 | unsigned short *Hist = (unsigned short *)IS_AllocMemory(256 * sizeof(unsigned short), true); 58 | if (Hist == NULL) { Ret = IS_RET_ERR_OUTOFMEMORY; goto Done8; } 59 | 60 | Ret = GetValidCoordinate(Width, Height, Radius, Radius, Radius, Radius, Edge, &Row, &Col); // 获取坐标偏移量 61 | if (Ret != IS_RET_OK) goto Done8; 62 | 63 | ColHist += Radius * 256; RowOffset = ((int *)Row->Data) + Radius; ColOffSet = ((int *)Col->Data) + Radius; // 进行偏移以便操作 64 | 65 | for (Y = 0; Y < Height; Y++) 66 | { 67 | if (Y == 0) // 第一行的列直方图,要重头计算 68 | { 69 | for (K = -Radius; K <= Radius; K++) 70 | { 71 | LinePS = Src->Data + ColOffSet[K] * Src->WidthStep; 72 | for (X = -Radius; X < Width + Radius; X++) 73 | { 74 | ColHist[X * 256 + LinePS[RowOffset[X]]]++; 75 | } 76 | } 77 | } 78 | else // 其他行的列直方图,更新就可以了 79 | { 80 | LinePS = Src->Data + ColOffSet[Y - Radius - 1] * Src->WidthStep; 81 | for (X = -Radius; X < Width + Radius; X++) // 删除移出范围内的那一行的直方图数据 82 | { 83 | ColHist[X * 256 + LinePS[RowOffset[X]]]--; 84 | } 85 | 86 | LinePS = Src->Data + ColOffSet[Y + Radius] * Src->WidthStep; 87 | for (X = -Radius; X < Width + Radius; X++) // 增加进入范围内的那一行的直方图数据 88 | { 89 | ColHist[X * 256 + LinePS[RowOffset[X]]]++; 90 | } 91 | 92 | } 93 | 94 | memset(Hist, 0, 256 * sizeof(unsigned short)); // 每一行直方图数据清零先 95 | 96 | LinePS = Src->Data + Y * Src->WidthStep; 97 | LinePD = Dest->Data + Y * Dest->WidthStep; 98 | 99 | for (X = 0; X < Width; X++) 100 | { 101 | if (X == 0) 102 | { 103 | for (K = -Radius; K <= Radius; K++) // 行第一个像素,需要重新计算 104 | HistgramAddShort(ColHist + K * 256, Hist); 105 | } 106 | else 107 | { 108 | /* 
HistgramAddShort(ColHist + RowOffset[X + Radius] * 256, Hist); 109 | HistgramSubShort(ColHist + RowOffset[X - Radius - 1] * 256, Hist); 110 | */ 111 | HistgramSubAddShort(ColHist + RowOffset[X - Radius - 1] * 256, ColHist + RowOffset[X + Radius] * 256, Hist); // 行内其他像素,依次删除和增加就可以了 112 | } 113 | Calc(Hist, LinePS[0], LinePD, Threshold); 114 | 115 | LinePS++; 116 | LinePD++; 117 | } 118 | } 119 | ColHist -= Radius * 256; // 恢复偏移操作 120 | Done8: 121 | IS_FreeMatrix(&Row); 122 | IS_FreeMatrix(&Col); 123 | IS_FreeMemory(ColHist); 124 | IS_FreeMemory(Hist); 125 | 126 | return Ret; 127 | } 128 | else 129 | { 130 | TMatrix *Blue = NULL, *Green = NULL, *Red = NULL, *Alpha = NULL; // 由于C变量如果不初始化,其值是随机值,可能会导致释放时的错误。 131 | IS_RET Ret = SplitRGBA(Src, &Blue, &Green, &Red, &Alpha); 132 | if (Ret != IS_RET_OK) goto Done24; 133 | Ret = SelectiveBlur(Blue, Blue, Radius, Threshold, Edge); 134 | if (Ret != IS_RET_OK) goto Done24; 135 | Ret = SelectiveBlur(Green, Green, Radius, Threshold, Edge); 136 | if (Ret != IS_RET_OK) goto Done24; 137 | Ret = SelectiveBlur(Red, Red, Radius, Threshold, Edge); 138 | if (Ret != IS_RET_OK) goto Done24; // 32位的Alpha不做任何处理,实际上32位的相关算法基本上是不能分通道处理的 139 | Ret = CombineRGBA(Dest, Blue, Green, Red, Alpha); 140 | Done24: 141 | IS_FreeMatrix(&Blue); 142 | IS_FreeMatrix(&Green); 143 | IS_FreeMatrix(&Red); 144 | IS_FreeMatrix(&Alpha); 145 | return Ret; 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /speed_histogram_algorithm_framework/Utility.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/Image-processing-algorithm-Speed/d22063d5c5b45466e80787e33ffe8a430c435854/speed_histogram_algorithm_framework/Utility.h -------------------------------------------------------------------------------- /speed_integral_graph_sse.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 
using namespace std; 5 | using namespace cv; 6 | 7 | void GetGrayIntegralImage(unsigned char *Src, int *Integral, int Width, int Height, int Stride) 8 | { 9 | memset(Integral, 0, (Width + 1) * sizeof(int)); // 第一行都为0 10 | for (int Y = 0; Y < Height; Y++) 11 | { 12 | unsigned char *LinePS = Src + Y * Stride; 13 | int *LinePL = Integral + Y * (Width + 1) + 1; //上一行的位置 14 | int *LinePD = Integral + (Y + 1) * (Width + 1) + 1; // 当前位置,注意每行的第一列的值都为0 15 | LinePD[-1] = 0; // 第一列的值为0 16 | for (int X = 0, Sum = 0; X < Width; X++) 17 | { 18 | Sum += LinePS[X]; // 行方向累加 19 | LinePD[X] = LinePL[X] + Sum; // 更新积分图 20 | } 21 | } 22 | } 23 | 24 | void GetGrayIntegralImage_SSE(unsigned char *Src, int *Integral, int Width, int Height, int Stride) { 25 | memset(Integral, 0, (Width + 1) * sizeof(int)); //第一行都为0 26 | int BlockSize = 8, Block = Width / BlockSize; 27 | for (int Y = 0; Y < Height; Y++) { 28 | unsigned char *LinePS = Src + Y * Stride; 29 | int *LinePL = Integral + Y * (Width + 1) + 1; //上一行位置 30 | int *LinePD = Integral + (Y + 1) * (Width + 1) + 1; //当前位置,注意每行的第一列都为0 31 | LinePD[-1] = 0; 32 | __m128i PreV = _mm_setzero_si128(); 33 | __m128i Zero = _mm_setzero_si128(); 34 | for (int X = 0; X < Block * BlockSize; X += BlockSize) { 35 | __m128i Src_Shift0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(LinePS + X)), Zero); //A7 A6 A5 A 4 A3 A2 A1 A0 36 | __m128i Src_Shift1 = _mm_slli_si128(Src_Shift0, 2); //A6 A5 A4 A3 A2 A1 A0 0 37 | __m128i Src_Shift2 = _mm_slli_si128(Src_Shift1, 2); //A5 A4 A3 A2 A1 A0 0 0 38 | __m128i Src_Shift3 = _mm_slli_si128(Src_Shift2, 2); //A4 A3 A2 A1 A0 0 0 0 39 | __m128i Shift_Add12 = _mm_add_epi16(Src_Shift1, Src_Shift2); //A6+A5 A5+A4 A4+A3 A3+A2 A2+A1 A1+A0 A0+0 0+0 40 | __m128i Shift_Add03 = _mm_add_epi16(Src_Shift0, Src_Shift3); //A7+A4 A6+A3 A5+A2 A4+A1 A3+A0 A2+0 A1+0 A0+0 41 | __m128i Low = _mm_add_epi16(Shift_Add12, Shift_Add03); //A7+A6+A5+A4 A6+A5+A4+A3 A5+A4+A3+A2 A4+A3+A2+A1 A3+A2+A1+A0 A2+A1+A0+0 A1+A0+0+0 A0+0+0+0 42 | __m128i High 
= _mm_add_epi32(_mm_unpackhi_epi16(Low, Zero), _mm_unpacklo_epi16(Low, Zero)); //A7+A6+A5+A4+A3+A2+A1+A0 A6+A5+A4+A3+A2+A1+A0 A5+A4+A3+A2+A1+A0 A4+A3+A2+A1+A0 43 | __m128i SumL = _mm_loadu_si128((__m128i *)(LinePL + X + 0)); 44 | __m128i SumH = _mm_loadu_si128((__m128i *)(LinePL + X + 4)); 45 | SumL = _mm_add_epi32(SumL, PreV); 46 | SumL = _mm_add_epi32(SumL, _mm_unpacklo_epi16(Low, Zero)); 47 | SumH = _mm_add_epi32(SumH, PreV); 48 | SumH = _mm_add_epi32(SumH, High); 49 | PreV = _mm_add_epi32(PreV, _mm_shuffle_epi32(High, _MM_SHUFFLE(3, 3, 3, 3))); 50 | _mm_storeu_si128((__m128i *)(LinePD + X + 0), SumL); 51 | _mm_storeu_si128((__m128i *)(LinePD + X + 4), SumH); 52 | } 53 | for (int X = Block * BlockSize, V = LinePD[X - 1] - LinePL[X - 1]; X < Width; X++) 54 | { 55 | V += LinePS[X]; 56 | LinePD[X] = V + LinePL[X]; 57 | } 58 | } 59 | } 60 | 61 | void BoxBlur(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Radius) { 62 | int *Integral = (int *)malloc((Width + 1) * (Height + 1) * sizeof(int)); 63 | GetGrayIntegralImage(Src, Integral, Width, Height, Stride); 64 | //#pragma parallel for num_threads(4) 65 | for (int Y = 0; Y < Height; Y++) { 66 | int Y1 = max(Y - Radius, 0); 67 | int Y2 = min(Y + Radius + 1, Height - 1); 68 | int *LineP1 = Integral + Y1 * (Width + 1); 69 | int *LineP2 = Integral + Y2 * (Width + 1); 70 | unsigned char *LinePD = Dest + Y * Stride; 71 | for (int X = 0; X < Height; X++) { 72 | int X1 = max(X - Radius, 0); 73 | int X2 = min(X + Radius + 1, Width); 74 | int Sum = LineP2[X2] - LineP1[X2] - LineP2[X1] + LineP1[X1]; 75 | int PixelCount = (X2 - X1) * (Y2 - Y1); 76 | LinePD[X] = (Sum + (PixelCount >> 1)) / PixelCount; 77 | } 78 | } 79 | free(Integral); 80 | } 81 | 82 | int main() { 83 | Mat src = imread("F:\\car.jpg", 0); 84 | int Height = src.rows; 85 | int Width = src.cols; 86 | unsigned char *Src = src.data; 87 | unsigned char *Dest = new unsigned char[Height * Width]; 88 | int Stride = Width; 89 | int Radius = 
11; 90 | int64 st = cvGetTickCount(); 91 | for (int i = 0; i < 10; i++) { 92 | BoxBlur(Src, Dest, Width, Height, Stride, Radius); 93 | } 94 | double duration = (cv::getTickCount() - st) / cv::getTickFrequency() * 100; 95 | printf("%.5f\n", duration); 96 | BoxBlur(Src, Dest, Width, Height, Stride, Radius); 97 | Mat dst(Height, Width, CV_8UC1, Dest); 98 | imshow("origin", src); 99 | imshow("result", dst); 100 | imwrite("F:\\res.jpg", dst); 101 | waitKey(0); 102 | waitKey(0); 103 | } -------------------------------------------------------------------------------- /speed_max_filter_sse.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "../../OpencvTest/OpencvTest/Core.h" 4 | #include "../../OpencvTest/OpencvTest/MaxFilter.h" 5 | #include "../../OpencvTest/OpencvTest/Utility.h" 6 | using namespace std; 7 | using namespace cv; 8 | 9 | void MaxFilter_SSE(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Channel, int Radius) { 10 | TMatrix a, b; 11 | TMatrix *p1 = &a, *p2 = &b; 12 | TMatrix **p3 = &p1, **p4 = &p2; 13 | IS_CreateMatrix(Width, Height, IS_DEPTH_8U, Channel, p3); 14 | IS_CreateMatrix(Width, Height, IS_DEPTH_8U, Channel, p4); 15 | (p1)->Data = Src; 16 | (p2)->Data = Dest; 17 | MaxFilter(p1, p2, Radius); 18 | } 19 | 20 | Mat MaxFilter(Mat src, int radius) { 21 | int row = src.rows; 22 | int col = src.cols; 23 | int border = (radius - 1) / 2; 24 | Mat dst(row, col, CV_8UC3); 25 | printf("success\n"); 26 | for (int i = border; i + border < row; i++) { 27 | for (int j = border; j + border < col; j++) { 28 | for (int k = 0; k < 3; k++) { 29 | int val = src.at(i, j)[k]; 30 | for (int x = -border; x <= border; x++) { 31 | for (int y = -border; y <= border; y++) { 32 | val = max(val, (int)src.at(i + x, j + y)[k]); 33 | } 34 | } 35 | dst.at(i, j)[k] = val; 36 | } 37 | } 38 | } 39 | printf("success\n"); 40 | return dst; 41 | } 42 | 43 | int main() { 44 | Mat src = 
imread("F:\\car.jpg"); 45 | int Height = src.rows; 46 | int Width = src.cols; 47 | unsigned char *Src = src.data; 48 | unsigned char *Dest = new unsigned char[Height * Width * 3]; 49 | int Stride = Width * 3; 50 | int Radius = 11; 51 | int64 st = cvGetTickCount(); 52 | for (int i = 0; i <10; i++) { 53 | Mat temp = MaxFilter(src, Radius); 54 | //MaxFilter_SSE(Src, Dest, Width, Height, Stride, 3, Radius); 55 | } 56 | double duration = (cv::getTickCount() - st) / cv::getTickFrequency() * 100; 57 | printf("%.5f\n", duration); 58 | MaxFilter_SSE(Src, Dest, Width, Height, Stride, 3, Radius); 59 | Mat dst(Height, Width, CV_8UC3, Dest); 60 | imshow("origin", src); 61 | imshow("result", dst); 62 | imwrite("F:\\res.jpg", dst); 63 | waitKey(0); 64 | return 0; 65 | } -------------------------------------------------------------------------------- /speed_median_filter_3x3_sse.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include 3 | #include 4 | using namespace std; 5 | using namespace cv; 6 | 7 | int ComparisonFunction(const void *X, const void *Y) { 8 | unsigned char Dx = *(unsigned char *)X; 9 | unsigned char Dy = *(unsigned char *)Y; 10 | if (Dx < Dy) return -1; 11 | else if (Dx > Dy) return 1; 12 | else return 0; 13 | } 14 | 15 | void MedianBlur3X3_Ori(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 16 | int Channel = Stride / Width; 17 | if (Channel == 1) { 18 | unsigned char Array[9]; 19 | for (int Y = 1; Y < Height - 1; Y++) { 20 | unsigned char *LineP0 = Src + (Y - 1) * Stride + 1; 21 | unsigned char *LineP1 = LineP0 + Stride; 22 | unsigned char *LineP2 = LineP1 + Stride; 23 | unsigned char *LinePD = Dest + Y * Stride + 1; 24 | for (int X = 1; X < Width - 1; X++) { 25 | Array[0] = LineP0[X - 1]; Array[1] = LineP0[X]; Array[2] = LineP0[X + 1]; 26 | Array[3] = LineP1[X - 1]; Array[4] = LineP1[X]; Array[5] = LineP2[X + 1]; 27 | Array[6] = LineP2[X - 1]; Array[7] = LineP2[X]; 
Array[8] = LineP2[X + 1]; 28 | qsort(Array, 9, sizeof(unsigned char), &ComparisonFunction); 29 | LinePD[X] = Array[4]; 30 | } 31 | } 32 | } 33 | else { 34 | unsigned char ArrayB[9], ArrayG[9], ArrayR[9]; 35 | for (int Y = 1; Y < Height - 1; Y++) { 36 | unsigned char *LineP0 = Src + (Y - 1) * Stride + 3; 37 | unsigned char *LineP1 = LineP0 + Stride; 38 | unsigned char *LineP2 = LineP1 + Stride; 39 | unsigned char *LinePD = Dest + Y * Stride + 3; 40 | for (int X = 1; X < Width - 1; X++) { 41 | ArrayB[0] = LineP0[-3]; ArrayG[0] = LineP0[-2]; ArrayR[0] = LineP0[-1]; 42 | ArrayB[1] = LineP0[0]; ArrayG[1] = LineP0[1]; ArrayR[1] = LineP0[2]; 43 | ArrayB[2] = LineP0[3]; ArrayG[2] = LineP0[4]; ArrayR[2] = LineP0[5]; 44 | 45 | ArrayB[3] = LineP1[-3]; ArrayG[3] = LineP1[-2]; ArrayR[3] = LineP1[-1]; 46 | ArrayB[4] = LineP1[0]; ArrayG[4] = LineP1[1]; ArrayR[4] = LineP1[2]; 47 | ArrayB[5] = LineP1[3]; ArrayG[5] = LineP1[4]; ArrayR[5] = LineP1[5]; 48 | 49 | ArrayB[6] = LineP2[-3]; ArrayG[6] = LineP2[-2]; ArrayR[6] = LineP2[-1]; 50 | ArrayB[7] = LineP2[0]; ArrayG[7] = LineP2[1]; ArrayR[7] = LineP2[2]; 51 | ArrayB[8] = LineP2[3]; ArrayG[8] = LineP2[4]; ArrayR[8] = LineP2[5]; 52 | 53 | qsort(ArrayB, 9, sizeof(unsigned char), &ComparisonFunction); 54 | qsort(ArrayG, 9, sizeof(unsigned char), &ComparisonFunction); 55 | qsort(ArrayR, 9, sizeof(unsigned char), &ComparisonFunction); 56 | 57 | LinePD[0] = ArrayB[4]; 58 | LinePD[1] = ArrayG[4]; 59 | LinePD[2] = ArrayR[4]; 60 | 61 | LineP0 += 3; 62 | LineP1 += 3; 63 | LineP2 += 3; 64 | LinePD += 3; 65 | } 66 | } 67 | } 68 | } 69 | 70 | void Swap(int &X, int &Y) { 71 | X ^= Y; 72 | Y ^= X; 73 | X ^= Y; 74 | } 75 | 76 | void MedianBlur3X3_Faster(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 77 | int Channel = Stride / Width; 78 | if (Channel == 1) { 79 | 80 | for (int Y = 1; Y < Height - 1; Y++) { 81 | unsigned char *LineP0 = Src + (Y - 1) * Stride + 1; 82 | unsigned char *LineP1 = LineP0 + Stride; 83 | 
unsigned char *LineP2 = LineP1 + Stride; 84 | unsigned char *LinePD = Dest + Y * Stride + 1; 85 | for (int X = 1; X < Width - 1; X++) { 86 | int Gray0, Gray1, Gray2, Gray3, Gray4, Gray5, Gray6, Gray7, Gray8; 87 | Gray0 = LineP0[X - 1]; Gray1 = LineP0[X]; Gray2 = LineP0[X + 1]; 88 | Gray3 = LineP1[X - 1]; Gray4 = LineP1[X]; Gray5 = LineP1[X + 1]; 89 | Gray6 = LineP2[X - 1]; Gray7 = LineP2[X]; Gray8 = LineP2[X + 1]; 90 | 91 | if (Gray1 > Gray2) Swap(Gray1, Gray2); 92 | if (Gray4 > Gray5) Swap(Gray4, Gray5); 93 | if (Gray7 > Gray8) Swap(Gray7, Gray8); 94 | if (Gray0 > Gray1) Swap(Gray0, Gray1); 95 | if (Gray3 > Gray4) Swap(Gray3, Gray4); 96 | if (Gray6 > Gray7) Swap(Gray6, Gray7); 97 | if (Gray1 > Gray2) Swap(Gray1, Gray2); 98 | if (Gray4 > Gray5) Swap(Gray4, Gray5); 99 | if (Gray7 > Gray8) Swap(Gray7, Gray8); 100 | if (Gray0 > Gray3) Swap(Gray0, Gray3); 101 | if (Gray5 > Gray8) Swap(Gray5, Gray8); 102 | if (Gray4 > Gray7) Swap(Gray4, Gray7); 103 | if (Gray3 > Gray6) Swap(Gray3, Gray6); 104 | if (Gray1 > Gray4) Swap(Gray1, Gray4); 105 | if (Gray2 > Gray5) Swap(Gray2, Gray5); 106 | if (Gray4 > Gray7) Swap(Gray4, Gray7); 107 | if (Gray4 > Gray2) Swap(Gray4, Gray2); 108 | if (Gray6 > Gray4) Swap(Gray6, Gray4); 109 | if (Gray4 > Gray2) Swap(Gray4, Gray2); 110 | 111 | LinePD[X] = Gray4; 112 | } 113 | } 114 | 115 | } 116 | else { 117 | for (int Y = 1; Y < Height - 1; Y++) { 118 | unsigned char *LineP0 = Src + (Y - 1) * Stride + 3; 119 | unsigned char *LineP1 = LineP0 + Stride; 120 | unsigned char *LineP2 = LineP1 + Stride; 121 | unsigned char *LinePD = Dest + Y * Stride + 3; 122 | for (int X = 1; X < Width - 1; X++) { 123 | int Blue0, Blue1, Blue2, Blue3, Blue4, Blue5, Blue6, Blue7, Blue8; 124 | int Green0, Green1, Green2, Green3, Green4, Green5, Green6, Green7, Green8; 125 | int Red0, Red1, Red2, Red3, Red4, Red5, Red6, Red7, Red8; 126 | Blue0 = LineP0[-3]; Green0 = LineP0[-2]; Red0 = LineP0[-1]; 127 | Blue1 = LineP0[0]; Green1 = LineP0[1]; Red1 = LineP0[2]; 128 | Blue2 = 
LineP0[3]; Green2 = LineP0[4]; Red2 = LineP0[5]; 129 | 130 | Blue3 = LineP1[-3]; Green3 = LineP1[-2]; Red3 = LineP1[-1]; 131 | Blue4 = LineP1[0]; Green4 = LineP1[1]; Red4 = LineP1[2]; 132 | Blue5 = LineP1[3]; Green5 = LineP1[4]; Red5 = LineP1[5]; 133 | 134 | Blue6 = LineP2[-3]; Green6 = LineP2[-2]; Red6 = LineP2[-1]; 135 | Blue7 = LineP2[0]; Green7 = LineP2[1]; Red7 = LineP2[2]; 136 | Blue8 = LineP2[3]; Green8 = LineP2[4]; Red8 = LineP2[5]; 137 | 138 | if (Blue1 > Blue2) Swap(Blue1, Blue2); 139 | if (Blue4 > Blue5) Swap(Blue4, Blue5); 140 | if (Blue7 > Blue8) Swap(Blue7, Blue8); 141 | if (Blue0 > Blue1) Swap(Blue0, Blue1); 142 | if (Blue3 > Blue4) Swap(Blue3, Blue4); 143 | if (Blue6 > Blue7) Swap(Blue6, Blue7); 144 | if (Blue1 > Blue2) Swap(Blue1, Blue2); 145 | if (Blue4 > Blue5) Swap(Blue4, Blue5); 146 | if (Blue7 > Blue8) Swap(Blue7, Blue8); 147 | if (Blue0 > Blue3) Swap(Blue0, Blue3); 148 | if (Blue5 > Blue8) Swap(Blue5, Blue8); 149 | if (Blue4 > Blue7) Swap(Blue4, Blue7); 150 | if (Blue3 > Blue6) Swap(Blue3, Blue6); 151 | if (Blue1 > Blue4) Swap(Blue1, Blue4); 152 | if (Blue2 > Blue5) Swap(Blue2, Blue5); 153 | if (Blue4 > Blue7) Swap(Blue4, Blue7); 154 | if (Blue4 > Blue2) Swap(Blue4, Blue2); 155 | if (Blue6 > Blue4) Swap(Blue6, Blue4); 156 | if (Blue4 > Blue2) Swap(Blue4, Blue2); 157 | 158 | if (Green1 > Green2) Swap(Green1, Green2); 159 | if (Green4 > Green5) Swap(Green4, Green5); 160 | if (Green7 > Green8) Swap(Green7, Green8); 161 | if (Green0 > Green1) Swap(Green0, Green1); 162 | if (Green3 > Green4) Swap(Green3, Green4); 163 | if (Green6 > Green7) Swap(Green6, Green7); 164 | if (Green1 > Green2) Swap(Green1, Green2); 165 | if (Green4 > Green5) Swap(Green4, Green5); 166 | if (Green7 > Green8) Swap(Green7, Green8); 167 | if (Green0 > Green3) Swap(Green0, Green3); 168 | if (Green5 > Green8) Swap(Green5, Green8); 169 | if (Green4 > Green7) Swap(Green4, Green7); 170 | if (Green3 > Green6) Swap(Green3, Green6); 171 | if (Green1 > Green4) Swap(Green1, Green4); 
172 | if (Green2 > Green5) Swap(Green2, Green5); 173 | if (Green4 > Green7) Swap(Green4, Green7); 174 | if (Green4 > Green2) Swap(Green4, Green2); 175 | if (Green6 > Green4) Swap(Green6, Green4); 176 | if (Green4 > Green2) Swap(Green4, Green2); 177 | 178 | if (Red1 > Red2) Swap(Red1, Red2); 179 | if (Red4 > Red5) Swap(Red4, Red5); 180 | if (Red7 > Red8) Swap(Red7, Red8); 181 | if (Red0 > Red1) Swap(Red0, Red1); 182 | if (Red3 > Red4) Swap(Red3, Red4); 183 | if (Red6 > Red7) Swap(Red6, Red7); 184 | if (Red1 > Red2) Swap(Red1, Red2); 185 | if (Red4 > Red5) Swap(Red4, Red5); 186 | if (Red7 > Red8) Swap(Red7, Red8); 187 | if (Red0 > Red3) Swap(Red0, Red3); 188 | if (Red5 > Red8) Swap(Red5, Red8); 189 | if (Red4 > Red7) Swap(Red4, Red7); 190 | if (Red3 > Red6) Swap(Red3, Red6); 191 | if (Red1 > Red4) Swap(Red1, Red4); 192 | if (Red2 > Red5) Swap(Red2, Red5); 193 | if (Red4 > Red7) Swap(Red4, Red7); 194 | if (Red4 > Red2) Swap(Red4, Red2); 195 | if (Red6 > Red4) Swap(Red6, Red4); 196 | if (Red4 > Red2) Swap(Red4, Red2); 197 | 198 | LinePD[0] = Blue4; 199 | LinePD[1] = Green4; 200 | LinePD[2] = Red4; 201 | 202 | LineP0 += 3; 203 | LineP1 += 3; 204 | LineP2 += 3; 205 | LinePD += 3; 206 | } 207 | } 208 | } 209 | } 210 | 211 | inline void _mm_sort_ab(__m128i &a, __m128i &b) { 212 | const __m128i min = _mm_min_epu8(a, b); 213 | const __m128i max = _mm_max_epu8(a, b); 214 | a = min; 215 | b = max; 216 | } 217 | 218 | void MedianBlur3X3_Fastest(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 219 | int Channel = Stride / Width; 220 | int BlockSize = 16, Block = ((Width - 2)* Channel) / BlockSize; 221 | for (int Y = 1; Y < Height - 1; Y++) { 222 | unsigned char *LineP0 = Src + (Y - 1) * Stride + Channel; 223 | unsigned char *LineP1 = LineP0 + Stride; 224 | unsigned char *LineP2 = LineP1 + Stride; 225 | unsigned char *LinePD = Dest + Y * Stride + Channel; 226 | for (int X = 0; X < Block * BlockSize; X += BlockSize, LineP0 += BlockSize, LineP1 += 
BlockSize, LineP2 += BlockSize, LinePD += BlockSize) 227 | { 228 | __m128i P0 = _mm_loadu_si128((__m128i *)(LineP0 - Channel)); 229 | __m128i P1 = _mm_loadu_si128((__m128i *)(LineP0 - 0)); 230 | __m128i P2 = _mm_loadu_si128((__m128i *)(LineP0 + Channel)); 231 | __m128i P3 = _mm_loadu_si128((__m128i *)(LineP1 - Channel)); 232 | __m128i P4 = _mm_loadu_si128((__m128i *)(LineP1 - 0)); 233 | __m128i P5 = _mm_loadu_si128((__m128i *)(LineP1 + Channel)); 234 | __m128i P6 = _mm_loadu_si128((__m128i *)(LineP2 - Channel)); 235 | __m128i P7 = _mm_loadu_si128((__m128i *)(LineP2 - 0)); 236 | __m128i P8 = _mm_loadu_si128((__m128i *)(LineP2 + Channel)); 237 | 238 | _mm_sort_ab(P1, P2); _mm_sort_ab(P4, P5); _mm_sort_ab(P7, P8); 239 | _mm_sort_ab(P0, P1); _mm_sort_ab(P3, P4); _mm_sort_ab(P6, P7); 240 | _mm_sort_ab(P1, P2); _mm_sort_ab(P4, P5); _mm_sort_ab(P7, P8); 241 | _mm_sort_ab(P0, P3); _mm_sort_ab(P5, P8); _mm_sort_ab(P4, P7); 242 | _mm_sort_ab(P3, P6); _mm_sort_ab(P1, P4); _mm_sort_ab(P2, P5); 243 | _mm_sort_ab(P4, P7); _mm_sort_ab(P4, P2); _mm_sort_ab(P6, P4); 244 | _mm_sort_ab(P4, P2); 245 | 246 | _mm_storeu_si128((__m128i *)LinePD, P4); 247 | } 248 | 249 | for (int X = Block * BlockSize; X < (Width - 2) * Channel; X++, LinePD++) { 250 | int Gray0, Gray1, Gray2, Gray3, Gray4, Gray5, Gray6, Gray7, Gray8; 251 | Gray0 = LineP0[X - Block * BlockSize - Channel]; Gray1 = LineP0[X - Block * BlockSize]; Gray2 = LineP0[X - Block * BlockSize + Channel]; 252 | Gray3 = LineP1[X - Block * BlockSize - Channel]; Gray4 = LineP1[X - Block * BlockSize]; Gray5 = LineP1[X - Block * BlockSize + Channel]; 253 | Gray6 = LineP2[X - Block * BlockSize - Channel]; Gray7 = LineP2[X - Block * BlockSize]; Gray8 = LineP2[X - Block * BlockSize + Channel]; 254 | 255 | if (Gray1 > Gray2) Swap(Gray1, Gray2); 256 | if (Gray4 > Gray5) Swap(Gray4, Gray5); 257 | if (Gray7 > Gray8) Swap(Gray7, Gray8); 258 | if (Gray0 > Gray1) Swap(Gray0, Gray1); 259 | if (Gray3 > Gray4) Swap(Gray3, Gray4); 260 | if (Gray6 > Gray7) 
Swap(Gray6, Gray7); 261 | if (Gray1 > Gray2) Swap(Gray1, Gray2); 262 | if (Gray4 > Gray5) Swap(Gray4, Gray5); 263 | if (Gray7 > Gray8) Swap(Gray7, Gray8); 264 | if (Gray0 > Gray3) Swap(Gray0, Gray3); 265 | if (Gray5 > Gray8) Swap(Gray5, Gray8); 266 | if (Gray4 > Gray7) Swap(Gray4, Gray7); 267 | if (Gray3 > Gray6) Swap(Gray3, Gray6); 268 | if (Gray1 > Gray4) Swap(Gray1, Gray4); 269 | if (Gray2 > Gray5) Swap(Gray2, Gray5); 270 | if (Gray4 > Gray7) Swap(Gray4, Gray7); 271 | if (Gray4 > Gray2) Swap(Gray4, Gray2); 272 | if (Gray6 > Gray4) Swap(Gray6, Gray4); 273 | if (Gray4 > Gray2) Swap(Gray4, Gray2); 274 | 275 | LinePD[X] = Gray4; 276 | LineP0 += 1; 277 | LineP1 += 1; 278 | LineP2 += 1; 279 | } 280 | } 281 | } 282 | 283 | inline void _mm_sort_AB(__m256i &a, __m256i &b) { 284 | const __m256i min = _mm256_min_epu8(a, b); 285 | const __m256i max = _mm256_max_epu8(a, b); 286 | a = min; 287 | b = max; 288 | } 289 | 290 | void MedianBlur3X3_Fastest_AVX(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 291 | int Channel = Stride / Width; 292 | int BlockSize = 32, Block = ((Width - 2)* Channel) / BlockSize; 293 | for (int Y = 1; Y < Height - 1; Y++) { 294 | unsigned char *LineP0 = Src + (Y - 1) * Stride + Channel; 295 | unsigned char *LineP1 = LineP0 + Stride; 296 | unsigned char *LineP2 = LineP1 + Stride; 297 | unsigned char *LinePD = Dest + Y * Stride + Channel; 298 | for (int X = 0; X < Block * BlockSize; X += BlockSize, LineP0 += BlockSize, LineP1 += BlockSize, LineP2 += BlockSize, LinePD += BlockSize) 299 | { 300 | __m256i P0 = _mm256_loadu_si256((const __m256i*)(LineP0 - Channel)); 301 | __m256i P1 = _mm256_loadu_si256((const __m256i*)(LineP0 - 0)); 302 | __m256i P2 = _mm256_loadu_si256((const __m256i*)(LineP0 + Channel)); 303 | __m256i P3 = _mm256_loadu_si256((const __m256i*)(LineP1 - Channel)); 304 | __m256i P4 = _mm256_loadu_si256((const __m256i*)(LineP1 - 0)); 305 | __m256i P5 = _mm256_loadu_si256((const __m256i*)(LineP1 + Channel)); 306 
| __m256i P6 = _mm256_loadu_si256((const __m256i*)(LineP2 - Channel)); 307 | __m256i P7 = _mm256_loadu_si256((const __m256i*)(LineP2 - 0)); 308 | __m256i P8 = _mm256_loadu_si256((const __m256i*)(LineP2 + Channel)); 309 | 310 | _mm_sort_AB(P1, P2); _mm_sort_AB(P4, P5); _mm_sort_AB(P7, P8); 311 | _mm_sort_AB(P0, P1); _mm_sort_AB(P3, P4); _mm_sort_AB(P6, P7); 312 | _mm_sort_AB(P1, P2); _mm_sort_AB(P4, P5); _mm_sort_AB(P7, P8); 313 | _mm_sort_AB(P0, P3); _mm_sort_AB(P5, P8); _mm_sort_AB(P4, P7); 314 | _mm_sort_AB(P3, P6); _mm_sort_AB(P1, P4); _mm_sort_AB(P2, P5); 315 | _mm_sort_AB(P4, P7); _mm_sort_AB(P4, P2); _mm_sort_AB(P6, P4); 316 | _mm_sort_AB(P4, P2); 317 | 318 | _mm256_storeu_si256((__m256i *)LinePD, P4); 319 | } 320 | 321 | for (int X = Block * BlockSize; X < (Width - 2) * Channel; X++, LinePD++) { 322 | int Gray0, Gray1, Gray2, Gray3, Gray4, Gray5, Gray6, Gray7, Gray8; 323 | Gray0 = LineP0[X - Block * BlockSize - Channel]; Gray1 = LineP0[X - Block * BlockSize]; Gray2 = LineP0[X - Block * BlockSize + Channel]; 324 | Gray3 = LineP1[X - Block * BlockSize - Channel]; Gray4 = LineP1[X - Block * BlockSize]; Gray5 = LineP1[X - Block * BlockSize + Channel]; 325 | Gray6 = LineP2[X - Block * BlockSize - Channel]; Gray7 = LineP2[X - Block * BlockSize]; Gray8 = LineP2[X - Block * BlockSize + Channel]; 326 | 327 | if (Gray1 > Gray2) Swap(Gray1, Gray2); 328 | if (Gray4 > Gray5) Swap(Gray4, Gray5); 329 | if (Gray7 > Gray8) Swap(Gray7, Gray8); 330 | if (Gray0 > Gray1) Swap(Gray0, Gray1); 331 | if (Gray3 > Gray4) Swap(Gray3, Gray4); 332 | if (Gray6 > Gray7) Swap(Gray6, Gray7); 333 | if (Gray1 > Gray2) Swap(Gray1, Gray2); 334 | if (Gray4 > Gray5) Swap(Gray4, Gray5); 335 | if (Gray7 > Gray8) Swap(Gray7, Gray8); 336 | if (Gray0 > Gray3) Swap(Gray0, Gray3); 337 | if (Gray5 > Gray8) Swap(Gray5, Gray8); 338 | if (Gray4 > Gray7) Swap(Gray4, Gray7); 339 | if (Gray3 > Gray6) Swap(Gray3, Gray6); 340 | if (Gray1 > Gray4) Swap(Gray1, Gray4); 341 | if (Gray2 > Gray5) Swap(Gray2, Gray5); 
342 | if (Gray4 > Gray7) Swap(Gray4, Gray7); 343 | if (Gray4 > Gray2) Swap(Gray4, Gray2); 344 | if (Gray6 > Gray4) Swap(Gray6, Gray4); 345 | if (Gray4 > Gray2) Swap(Gray4, Gray2); 346 | 347 | LinePD[X] = Gray4; 348 | LineP0 += 1; 349 | LineP1 += 1; 350 | LineP2 += 1; 351 | } 352 | } 353 | } 354 | 355 | int main() { 356 | Mat src = imread("F:\\car.jpg"); 357 | int Height = src.rows; 358 | int Width = src.cols; 359 | unsigned char *Src = src.data; 360 | unsigned char *Dest = new unsigned char[Height * Width * 3]; 361 | int Stride = Width * 3; 362 | int Radius = 7; 363 | int64 st = cvGetTickCount(); 364 | for (int i = 0; i <10; i++) { 365 | //Mat temp = MaxFilter(src, Radius); 366 | MedianBlur3X3_Fastest_AVX(Src, Dest, Width, Height, Stride); 367 | } 368 | double duration = (cv::getTickCount() - st) / cv::getTickFrequency() * 100; 369 | printf("%.5f\n", duration); 370 | MedianBlur3X3_Fastest_AVX(Src, Dest, Width, Height, Stride); 371 | Mat dst(Height, Width, CV_8UC3, Dest); 372 | imshow("origin", src); 373 | imshow("result", dst); 374 | imwrite("F:\\res.jpg", dst); 375 | waitKey(0); 376 | return 0; 377 | } 378 | -------------------------------------------------------------------------------- /speed_multi_scale_detail_boosting_see.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "../../OpencvTest/OpencvTest/Core.h" 4 | #include "../../OpencvTest/OpencvTest/MaxFilter.h" 5 | #include "../../OpencvTest/OpencvTest/Utility.h" 6 | #include "../../OpencvTest/OpencvTest/BoxFilter.h" 7 | using namespace std; 8 | using namespace cv; 9 | #define __SSSE3__ 1 10 | 11 | void BoxBlur_SSE(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Channel, int Radius) { 12 | TMatrix a, b; 13 | TMatrix *p1 = &a, *p2 = &b; 14 | TMatrix **p3 = &p1, **p4 = &p2; 15 | IS_CreateMatrix(Width, Height, IS_DEPTH_8U, Channel, p3); 16 | IS_CreateMatrix(Width, Height, IS_DEPTH_8U, Channel, p4); 17 | 
(p1)->Data = Src; 18 | (p2)->Data = Dest; 19 | BoxBlur_SSE(p1, p2, Radius, EdgeMode::Smear); 20 | } 21 | 22 | int IM_Sign(int X) { 23 | return (X >> 31) | (unsigned(-X)) >> 31; 24 | } 25 | 26 | inline unsigned char IM_ClampToByte(int Value) 27 | { 28 | if (Value < 0) 29 | return 0; 30 | else if (Value > 255) 31 | return 255; 32 | else 33 | return (unsigned char)Value; 34 | //return ((Value | ((signed int)(255 - Value) >> 31)) & ~((signed int)Value >> 31)); 35 | } 36 | 37 | 38 | inline __m128i _mm_sgn_epi16(__m128i v) { 39 | #ifdef __SSSE3__ 40 | v = _mm_sign_epi16(_mm_set1_epi16(1), v); // use PSIGNW on SSSE3 and later 41 | #else 42 | v = _mm_min_epi16(v, _mm_set1_epi16(1)); // use PMINSW/PMAXSW on SSE2/SSE3. 43 | v = _mm_max_epi16(v, _mm_set1_epi16(-1)); 44 | //_mm_set1_epi16(1) = _mm_srli_epi16(_mm_cmpeq_epi16(v, v), 15); 45 | //_mm_set1_epi16(-1) = _mm_cmpeq_epi16(v, v); 46 | 47 | #endif 48 | return v; 49 | } 50 | 51 | void MultiScaleSharpen(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Radius) { 52 | int Channel = Stride / Width; 53 | unsigned char *B1 = (unsigned char *)malloc(Height * Stride * sizeof(unsigned char)); 54 | unsigned char *B2 = (unsigned char *)malloc(Height * Stride * sizeof(unsigned char)); 55 | unsigned char *B3 = (unsigned char *)malloc(Height * Stride * sizeof(unsigned char)); 56 | BoxBlur_SSE(Src, B1, Width, Height, Channel, Stride, Radius); 57 | BoxBlur_SSE(Src, B2, Width, Height, Channel, Stride, Radius * 2); 58 | BoxBlur_SSE(Src, B3, Width, Height, Channel, Stride, Radius * 4); 59 | for (int Y = 0; Y < Height * Stride; Y++) { 60 | int DiffB1 = Src[Y] - B1[Y]; 61 | int DiffB2 = B1[Y] - B2[Y]; 62 | int DiffB3 = B2[Y] - B3[Y]; 63 | Dest[Y] = IM_ClampToByte(((4 - 2 * IM_Sign(DiffB1)) * DiffB1 + 2 * DiffB2 + DiffB3) / 4 + Src[Y]); 64 | } 65 | } 66 | 67 | void MultiScaleSharpen_SSE(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Radius) { 68 | int Channel = Stride / Width; 69 | 
unsigned char *B1 = (unsigned char *)malloc(Height * Stride * sizeof(unsigned char)); 70 | unsigned char *B2 = (unsigned char *)malloc(Height * Stride * sizeof(unsigned char)); 71 | unsigned char *B3 = (unsigned char *)malloc(Height * Stride * sizeof(unsigned char)); 72 | BoxBlur_SSE(Src, B1, Width, Height, Channel, Stride, Radius); 73 | BoxBlur_SSE(Src, B2, Width, Height, Channel, Stride, Radius * 2); 74 | BoxBlur_SSE(Src, B3, Width, Height, Channel, Stride, Radius * 4); 75 | int BlockSize = 8, Block = (Height * Stride) / BlockSize; 76 | __m128i Zero = _mm_setzero_si128(); 77 | __m128i Four = _mm_set1_epi16(4); 78 | for (int Y = 0; Y < Block * BlockSize; Y += BlockSize) { 79 | __m128i SrcV = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Src + Y)), Zero); 80 | __m128i SrcB1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(B1 + Y)), Zero); 81 | __m128i SrcB2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(B2 + Y)), Zero); 82 | __m128i SrcB3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(B3 + Y)), Zero); 83 | __m128i DiffB1 = _mm_sub_epi16(SrcV, SrcB1); 84 | __m128i DiffB2 = _mm_sub_epi16(SrcB1, SrcB2); 85 | __m128i DiffB3 = _mm_sub_epi16(SrcB2, SrcB3); 86 | //__m128i Offset = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(Four, _mm_slli_epi16(_mm_sgn_epi16(DiffB1), 1)), DiffB1), _mm_slli_epi16(DiffB2, 1)), DiffB3), 2); 87 | __m128i Offset = _mm_add_epi16(_mm_srai_epi16(_mm_sub_epi16(_mm_slli_epi16(_mm_sub_epi16(SrcB1, _mm_sign_epi16(DiffB1, DiffB1)), 1), _mm_add_epi16(SrcB2, SrcB3)), 2), DiffB1); 88 | _mm_storel_epi64((__m128i *)(Dest + Y), _mm_packus_epi16(_mm_add_epi16(SrcV, Offset), Zero)); 89 | } 90 | for (int Y = Block * BlockSize; Y < Height * Stride; Y++) { 91 | int DiffB1 = Src[Y] - B1[Y]; 92 | int DiffB2 = B1[Y] - B2[Y]; 93 | int DiffB3 = B2[Y] - B3[Y]; 94 | Dest[Y] = IM_ClampToByte(((4 - 2 * IM_Sign(DiffB1)) * DiffB1 + 2 * DiffB2 + DiffB3) / 4 + Src[Y]); 95 | } 96 | } 97 | 98 | int main() { 99 | Mat src = imread("F:\\car.jpg"); 
// Fixed-point BGR -> gray conversion.
//
//   Src    : interleaved 3-byte pixels (byte 0 weighted 0.114, byte 1 weighted 0.587,
//            byte 2 takes the remaining weight); rows are Stride bytes apart.
//   Dest   : one byte per pixel, rows packed Width bytes apart.
//   Width  : pixels per row.   Height : number of rows.
//
// The weights are scaled by 256 and the weighted sum is shifted right by 8
// (truncating, no rounding).
void RGB2Y_1(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) {
	const int Shift = 8;
	const int WeightB = int(0.114 * 256 + 0.5);
	const int WeightG = int(0.587 * 256 + 0.5);
	// The third weight takes the remainder so the three always sum to exactly 256.
	const int WeightR = (1 << Shift) - WeightB - WeightG;

	for (int Row = 0; Row < Height; ++Row) {
		const unsigned char *In = Src + Row * Stride;
		unsigned char *Out = Dest + Row * Width;
		for (int Col = 0; Col < Width; ++Col) {
			const unsigned char *Pixel = In + Col * 3;
			const int Sum = WeightB * Pixel[0] + WeightG * Pixel[1] + WeightR * Pixel[2];
			Out[Col] = Sum >> Shift;
		}
	}
}
//4路并行 33 | void RGB2Y_2(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 34 | const int B_WT = int(0.114 * 256 + 0.5); 35 | const int G_WT = int(0.587 * 256 + 0.5); 36 | const int R_WT = 256 - B_WT - G_WT; // int(0.299 * 256 + 0.5) 37 | for (int Y = 0; Y < Height; Y++) { 38 | unsigned char *LinePS = Src + Y * Stride; 39 | unsigned char *LinePD = Dest + Y * Width; 40 | int X = 0; 41 | for (; X < Width - 4; X += 4, LinePS += 12) { 42 | LinePD[X + 0] = (B_WT * LinePS[0] + G_WT * LinePS[1] + R_WT * LinePS[2]) >> 8; 43 | LinePD[X + 1] = (B_WT * LinePS[3] + G_WT * LinePS[4] + R_WT * LinePS[5]) >> 8; 44 | LinePD[X + 2] = (B_WT * LinePS[6] + G_WT * LinePS[7] + R_WT * LinePS[8]) >> 8; 45 | LinePD[X + 3] = (B_WT * LinePS[9] + G_WT * LinePS[10] + R_WT * LinePS[11]) >> 8; 46 | } 47 | for (; X < Width; X++, LinePS += 3) { 48 | LinePD[X] = (B_WT * LinePS[0] + G_WT * LinePS[1] + R_WT * LinePS[2]) >> 8; 49 | } 50 | } 51 | } 52 | 53 | //openmp 54 | void RGB2Y_3(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 55 | const int B_WT = int(0.114 * 256 + 0.5); 56 | const int G_WT = int(0.587 * 256 + 0.5); 57 | const int R_WT = 256 - B_WT - G_WT; 58 | for (int Y = 0; Y < Height; Y++) { 59 | unsigned char *LinePS = Src + Y * Stride; 60 | unsigned char *LinePD = Dest + Y * Width; 61 | #pragma omp parallel for num_threads(4) 62 | for (int X = 0; X < Width; X++) { 63 | LinePD[X] = (B_WT * LinePS[0 + X*3] + G_WT * LinePS[1 + X*3] + R_WT * LinePS[2 + X*3]) >> 8; 64 | } 65 | } 66 | } 67 | 68 | //sse 一次处理12个 69 | void RGB2Y_4(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 70 | const int B_WT = int(0.114 * 256 + 0.5); 71 | const int G_WT = int(0.587 * 256 + 0.5); 72 | const int R_WT = 256 - B_WT - G_WT; // int(0.299 * 256 + 0.5) 73 | 74 | for (int Y = 0; Y < Height; Y++) { 75 | unsigned char *LinePS = Src + Y * Stride; 76 | unsigned char *LinePD = Dest + Y * Width; 77 | int X = 0; 78 | for (; X < Width - 12; 
X += 12, LinePS += 36) { 79 | __m128i p1aL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 0))), _mm_setr_epi16(B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT)); //1 80 | __m128i p2aL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 1))), _mm_setr_epi16(G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT)); //2 81 | __m128i p3aL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 2))), _mm_setr_epi16(R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT)); //3 82 | 83 | __m128i p1aH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 8))), _mm_setr_epi16(R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT));//4 84 | __m128i p2aH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 9))), _mm_setr_epi16(B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT));//5 85 | __m128i p3aH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 10))), _mm_setr_epi16(G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT));//6 86 | 87 | __m128i p1bL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 18))), _mm_setr_epi16(B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT));//7 88 | __m128i p2bL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 19))), _mm_setr_epi16(G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT));//8 89 | __m128i p3bL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 20))), _mm_setr_epi16(R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT));//9 90 | 91 | __m128i p1bH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 26))), _mm_setr_epi16(R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT));//10 92 | __m128i p2bH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 27))), _mm_setr_epi16(B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT));//11 93 | __m128i p3bH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 28))), _mm_setr_epi16(G_WT, R_WT, B_WT, 
G_WT, R_WT, B_WT, G_WT, R_WT));//12 94 | 95 | __m128i sumaL = _mm_add_epi16(p3aL, _mm_add_epi16(p1aL, p2aL));//13 96 | __m128i sumaH = _mm_add_epi16(p3aH, _mm_add_epi16(p1aH, p2aH));//14 97 | __m128i sumbL = _mm_add_epi16(p3bL, _mm_add_epi16(p1bL, p2bL));//15 98 | __m128i sumbH = _mm_add_epi16(p3bH, _mm_add_epi16(p1bH, p2bH));//16 99 | __m128i sclaL = _mm_srli_epi16(sumaL, 8);//17 100 | __m128i sclaH = _mm_srli_epi16(sumaH, 8);//18 101 | __m128i sclbL = _mm_srli_epi16(sumbL, 8);//19 102 | __m128i sclbH = _mm_srli_epi16(sumbH, 8);//20 103 | __m128i shftaL = _mm_shuffle_epi8(sclaL, _mm_setr_epi8(0, 6, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));//21 104 | __m128i shftaH = _mm_shuffle_epi8(sclaH, _mm_setr_epi8(-1, -1, -1, 18, 24, 30, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));//22 105 | __m128i shftbL = _mm_shuffle_epi8(sclbL, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 0, 6, 12, -1, -1, -1, -1, -1, -1, -1));//23 106 | __m128i shftbH = _mm_shuffle_epi8(sclbH, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, 18, 24, 30, -1, -1, -1, -1));//24 107 | __m128i accumL = _mm_or_si128(shftaL, shftbL);//25 108 | __m128i accumH = _mm_or_si128(shftaH, shftbH);//26 109 | __m128i h3 = _mm_or_si128(accumL, accumH);//27 110 | //__m128i h3 = _mm_blendv_epi8(accumL, accumH, _mm_setr_epi8(0, 0, 0, -1, -1, -1, 0, 0, 0, -1, -1, -1, 1, 1, 1, 1)); 111 | _mm_storeu_si128((__m128i *)(LinePD + X), h3); 112 | } 113 | for (; X < Width; X++, LinePS += 3) { 114 | LinePD[X] = (B_WT * LinePS[0] + G_WT * LinePS[1] + R_WT * LinePS[2]) >> 8; 115 | } 116 | } 117 | } 118 | 119 | //sse 一次处理15个 120 | void RGB2Y_5(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 121 | const int B_WT = int(0.114 * 256 + 0.5); 122 | const int G_WT = int(0.587 * 256 + 0.5); 123 | const int R_WT = 256 - B_WT - G_WT; // int(0.299 * 256 + 0.5) 124 | 125 | for (int Y = 0; Y < Height; Y++) { 126 | unsigned char *LinePS = Src + Y * Stride; 127 | unsigned char *LinePD = Dest + Y * Width; 128 | int X 
= 0; 129 | for (; X < Width - 15; X += 15, LinePS += 45) 130 | { 131 | __m128i p1aL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 0))), _mm_setr_epi16(B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT)); //1 132 | __m128i p2aL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 1))), _mm_setr_epi16(G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT)); //2 133 | __m128i p3aL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 2))), _mm_setr_epi16(R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT)); //3 134 | 135 | __m128i p1aH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 8))), _mm_setr_epi16(R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT)); 136 | __m128i p2aH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 9))), _mm_setr_epi16(B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT)); 137 | __m128i p3aH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 10))), _mm_setr_epi16(G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT)); 138 | 139 | __m128i p1bL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 18))), _mm_setr_epi16(B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT)); 140 | __m128i p2bL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 19))), _mm_setr_epi16(G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT)); 141 | __m128i p3bL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 20))), _mm_setr_epi16(R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT)); 142 | 143 | __m128i p1bH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 26))), _mm_setr_epi16(R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT)); 144 | __m128i p2bH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 27))), _mm_setr_epi16(B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT)); 145 | __m128i p3bH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 28))), 
_mm_setr_epi16(G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT)); 146 | 147 | __m128i p1cH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 36))), _mm_setr_epi16(B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT)); 148 | __m128i p2cH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 37))), _mm_setr_epi16(G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT)); 149 | __m128i p3cH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadu_si128((__m128i *)(LinePS + 38))), _mm_setr_epi16(R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT)); 150 | 151 | __m128i sumaL = _mm_add_epi16(p3aL, _mm_add_epi16(p1aL, p2aL)); 152 | __m128i sumaH = _mm_add_epi16(p3aH, _mm_add_epi16(p1aH, p2aH)); 153 | __m128i sumbL = _mm_add_epi16(p3bL, _mm_add_epi16(p1bL, p2bL)); 154 | __m128i sumbH = _mm_add_epi16(p3bH, _mm_add_epi16(p1bH, p2bH)); 155 | __m128i sumcH = _mm_add_epi16(p3cH, _mm_add_epi16(p1cH, p2cH)); 156 | 157 | __m128i sclaL = _mm_srli_epi16(sumaL, 8); 158 | __m128i sclaH = _mm_srli_epi16(sumaH, 8); 159 | __m128i sclbL = _mm_srli_epi16(sumbL, 8); 160 | __m128i sclbH = _mm_srli_epi16(sumbH, 8); 161 | __m128i sclcH = _mm_srli_epi16(sumcH, 8); 162 | 163 | __m128i shftaL = _mm_shuffle_epi8(sclaL, _mm_setr_epi8(0, 6, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)); 164 | __m128i shftaH = _mm_shuffle_epi8(sclaH, _mm_setr_epi8(-1, -1, -1, 2, 8, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)); 165 | __m128i shftbL = _mm_shuffle_epi8(sclbL, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 0, 6, 12, -1, -1, -1, -1, -1, -1, -1)); 166 | __m128i shftbH = _mm_shuffle_epi8(sclbH, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 8, 14, -1, -1, -1, -1)); 167 | __m128i shftcH = _mm_shuffle_epi8(sclcH, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 6, 12, -1)); 168 | __m128i accumL = _mm_or_si128(shftaL, shftbL); 169 | __m128i accumH = _mm_or_si128(shftaH, shftbH); 170 | __m128i h3 = _mm_or_si128(accumL, accumH); 171 | h3 = _mm_or_si128(h3, shftcH); 172 | 
_mm_storeu_si128((__m128i *)(LinePD + X), h3); 173 | } 174 | for (; X < Width; X++, LinePS += 3) { 175 | LinePD[X] = (B_WT * LinePS[0] + G_WT * LinePS[1] + R_WT * LinePS[2]) >> 8; 176 | } 177 | } 178 | } 179 | 180 | void debug(__m128i var) { 181 | uint8_t *val = (uint8_t*)&var;//can also use uint32_t instead of 16_t 182 | printf("Numerical: %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i\n", 183 | val[0], val[1], val[2], val[3], val[4], val[5], 184 | val[6], val[7], val[8], val[9], val[10], val[11], val[12], val[13], 185 | val[14], val[15]); 186 | } 187 | 188 | void debug2(__m256i var) { 189 | uint8_t *val = (uint8_t*)&var;//can also use uint32_t instead of 16_t 190 | printf("Numerical: %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i %i\n", 191 | val[0], val[1], val[2], val[3], val[4], val[5], 192 | val[6], val[7], val[8], val[9], val[10], val[11], val[12], val[13], 193 | val[14], val[15], val[16], val[17], val[18], val[19], val[20], val[21], val[22], val[23], val[24], val[25], val[26], val[27], 194 | val[28], val[29], val[30], val[31]); 195 | } 196 | 197 | // AVX2 198 | constexpr double B_WEIGHT = 0.114; 199 | constexpr double G_WEIGHT = 0.587; 200 | constexpr double R_WEIGHT = 0.299; 201 | constexpr uint16_t B_WT = static_cast(32768.0 * B_WEIGHT + 0.5); 202 | constexpr uint16_t G_WT = static_cast(32768.0 * G_WEIGHT + 0.5); 203 | constexpr uint16_t R_WT = static_cast(32768.0 * R_WEIGHT + 0.5); 204 | static const __m256i weight_vec = _mm256_setr_epi16(B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT); 205 | 206 | void _RGB2Y(unsigned char* Src, const int32_t Width, const int32_t start_row, const int32_t thread_stride, const int32_t Stride, unsigned char* Dest) 207 | { 208 | for (int Y = start_row; Y < start_row + thread_stride; Y++) 209 | { 210 | //Sleep(1); 211 | unsigned char *LinePS = Src + Y * Stride; 212 | unsigned char *LinePD = Dest + Y * Width; 213 | int X = 0; 214 | 
for (; X < Width - 10; X += 10, LinePS += 30) 215 | { 216 | //B1 G1 R1 B2 G2 R2 B3 G3 R3 B4 G4 R4 B5 G5 R5 B6 217 | __m256i temp = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)(LinePS + 0))); 218 | __m256i in1 = _mm256_mulhrs_epi16(temp, weight_vec); 219 | 220 | //B6 G6 R6 B7 G7 R7 B8 G8 R8 B9 G9 R9 B10 G10 R10 B11 221 | temp = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)(LinePS + 15))); 222 | __m256i in2 = _mm256_mulhrs_epi16(temp, weight_vec); 223 | 224 | 225 | //0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 226 | //B1 G1 R1 B2 G2 R2 B3 G3 B6 G6 R6 B7 G7 R7 B8 G8 R3 B4 G4 R4 B5 G5 R5 B6 R8 B9 G9 R9 B10 G10 R10 B11 227 | __m256i mul = _mm256_packus_epi16(in1, in2); 228 | 229 | __m256i b1 = _mm256_shuffle_epi8(mul, _mm256_setr_epi8( 230 | // B1 B2 B3 -1, -1, -1 B7 B8 -1, -1, -1, -1, -1, -1, -1, -1, 231 | 0, 3, 6, -1, -1, -1, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, 232 | 233 | // -1, -1, -1, B4 B5 B6 -1, -1 B9 B10 -1, -1, -1, -1, -1, -1 234 | -1, -1, -1, 1, 4, 7, -1, -1, 9, 12, -1, -1, -1, -1, -1, -1)); 235 | 236 | __m256i g1 = _mm256_shuffle_epi8(mul, _mm256_setr_epi8( 237 | 238 | // G1 G2 G3 -1, -1 G6 G7 G8 -1, -1, -1, -1, -1, -1, -1, -1, 239 | 1, 4, 7, -1, -1, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, 240 | 241 | // -1, -1, -1 G4 G5 -1, -1, -1 G9 G10 -1, -1, -1, -1, -1, -1 242 | -1, -1, -1, 2, 5, -1, -1, -1, 10, 13, -1, -1, -1, -1, -1, -1)); 243 | 244 | __m256i r1 = _mm256_shuffle_epi8(mul, _mm256_setr_epi8( 245 | 246 | // R1 R2 -1 -1 -1 R6 R7 -1, -1, -1, -1, -1, -1, -1, -1, -1, 247 | 2, 5, -1, -1, -1, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, 248 | 249 | // -1, -1, R3 R4 R5 -1, -1, R8 R9 R10 -1, -1, -1, -1, -1, -1 250 | -1, -1, 0, 3, 6, -1, -1, 8, 11, 14, -1, -1, -1, -1, -1, -1)); 251 | 252 | 253 | 254 | // B1+G1+R1 B2+G2+R2 B3+G3 0 0 G6+R6 B7+G7+R7 B8+G8 0 0 0 0 0 0 0 0 0 0 R3 B4+G4+R4 B5+G5+R5 B6 0 R8 B9+G9+R9 B10+G10+R10 0 0 0 0 0 0 255 | 256 | __m256i accum = _mm256_adds_epu8(r1, 
_mm256_adds_epu8(b1, g1)); 257 | 258 | 259 | // _mm256_castsi256_si128(accum) 260 | // B1+G1+R1 B2+G2+R2 B3+G3 0 0 G6+R6 B7+G7+R7 B8+G8 0 0 0 0 0 0 0 0 261 | 262 | // _mm256_extracti128_si256(accum, 1) 263 | // 0 0 R3 B4+G4+R4 B5+G5+R5 B6 0 R8 B9+G9+R9 B10+G10+R10 0 0 0 0 0 0 264 | 265 | __m128i h3 = _mm_adds_epu8(_mm256_castsi256_si128(accum), _mm256_extracti128_si256(accum, 1)); 266 | 267 | _mm_storeu_si128((__m128i *)(LinePD + X), h3); 268 | } 269 | for (; X < Width; X++, LinePS += 3) { 270 | int tmpB = (B_WT * LinePS[0]) >> 14 + 1; 271 | tmpB = max(min(255, tmpB), 0); 272 | 273 | int tmpG = (G_WT * LinePS[1]) >> 14 + 1; 274 | tmpG = max(min(255, tmpG), 0); 275 | 276 | int tmpR = (R_WT * LinePS[2]) >> 14 + 1; 277 | tmpR = max(min(255, tmpR), 0); 278 | 279 | int tmp = tmpB + tmpG + tmpR; 280 | LinePD[X] = max(min(255, tmp), 0); 281 | } 282 | } 283 | } 284 | 285 | //avx2 286 | void RGB2Y_6(unsigned char *Src, unsigned char *Dest, int width, int height, int stride) 287 | { 288 | _RGB2Y(Src, width, 0, height, stride, Dest); 289 | } 290 | 291 | //avx2 + std::async异步编程 292 | void RGB2Y_7(unsigned char *Src, unsigned char *Dest, int width, int height, int stride) { 293 | const int32_t hw_concur = std::min(height >> 4, static_cast(std::thread::hardware_concurrency())); 294 | std::vector> fut(hw_concur); 295 | const int thread_stride = (height - 1) / hw_concur + 1; 296 | int i = 0, start = 0; 297 | for (; i < std::min(height, hw_concur); i++, start += thread_stride) 298 | { 299 | fut[i] = std::async(std::launch::async, _RGB2Y, Src, width, start, thread_stride, stride, Dest); 300 | } 301 | for (int j = 0; j < i; ++j) 302 | fut[j].wait(); 303 | } 304 | 305 | int main() { 306 | Mat src = imread("F:\\car.jpg"); 307 | int Height = src.rows; 308 | int Width = src.cols; 309 | unsigned char *Src = src.data; 310 | unsigned char *Dest = new unsigned char[Height * Width]; 311 | int Stride = Width * 3; 312 | int Radius = 11; 313 | int64 st = cvGetTickCount(); 314 | for (int i = 0; i 
< 100; i++) { 315 | RGB2Y_3(Src, Dest, Width, Height, Stride); 316 | } 317 | double duration = (cv::getTickCount() - st) / cv::getTickFrequency() * 10; 318 | printf("%.5f\n", duration); 319 | RGB2Y_5(Src, Dest, Width, Height, Stride); 320 | Mat dst(Height, Width, CV_8UC1, Dest); 321 | imshow("origin", src); 322 | imshow("result", dst); 323 | imwrite("F:\\res.jpg", dst); 324 | waitKey(0); 325 | return 0; 326 | } -------------------------------------------------------------------------------- /speed_skin_detection_sse.cpp: -------------------------------------------------------------------------------- 1 | #include "stdafx.h" 2 | #include 3 | #include 4 | #include 5 | using namespace std; 6 | using namespace cv; 7 | 8 | #define IM_Max(a, b) (((a) >= (b)) ? (a): (b)) 9 | #define IM_Min(a, b) (((a) >= (b)) ? (b): (a)) 10 | #define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) 11 | 12 | void IM_GetRoughSkinRegion(unsigned char *Src, unsigned char *Skin, int Width, int Height, int Stride) { 13 | for (int Y = 0; Y < Height; Y++) 14 | { 15 | unsigned char *LinePS = Src + Y * Stride; 16 | unsigned char *LinePD = Skin + Y * Width; 17 | for (int X = 0; X < Width; X++) 18 | { 19 | int Blue = LinePS[0], Green = LinePS[1], Red = LinePS[2]; 20 | if (Red >= 60 && Green >= 40 && Blue >= 20 && Red >= Blue && (Red - Green) >= 10 && IM_Max(IM_Max(Red, Green), Blue) - IM_Min(IM_Min(Red, Green), Blue) >= 10) 21 | LinePD[X] = 255; 22 | else 23 | LinePD[X] = 16; 24 | LinePS += 3; 25 | } 26 | } 27 | } 28 | 29 | void IM_GetRoughSkinRegion_OpenMP(unsigned char *Src, unsigned char *Skin, int Width, int Height, int Stride) { 30 | for (int Y = 0; Y < Height; Y++) 31 | { 32 | unsigned char *LinePS = Src + Y * Stride; 33 | unsigned char *LinePD = Skin + Y * Width; 34 | #pragma omp parallel for num_threads(4) 35 | for (int X = 0; X < Width; X++) 36 | { 37 | int Blue = LinePS[X*3 + 0], Green = LinePS[X*3 + 1], Red = LinePS[X*3 + 2]; 38 | if (Red >= 60 && Green >= 40 && Blue >= 20 && 
Red >= Blue && (Red - Green) >= 10 && IM_Max(IM_Max(Red, Green), Blue) - IM_Min(IM_Min(Red, Green), Blue) >= 10) 39 | LinePD[X] = 255; 40 | else 41 | LinePD[X] = 16; 42 | } 43 | } 44 | } 45 | 46 | 47 | void IM_GetRoughSkinRegion_SSE(unsigned char *Src, unsigned char *Skin, int Width, int Height, int Stride) { 48 | const int NonSkinLevel = 10; //非肤色部分的处理程序,本例取16,最大值取100,那样就是所有区域都为肤色,毫无意义 49 | const int BlockSize = 16; 50 | int Block = Width / BlockSize; 51 | for (int Y = 0; Y < Height; Y++) { 52 | unsigned char *LinePS = Src + Y * Stride; 53 | unsigned char *LinePD = Skin + Y * Width; 54 | for (int X = 0; X < Block * BlockSize; X += BlockSize, LinePS += BlockSize * 3, LinePD += BlockSize) { 55 | __m128i Src1, Src2, Src3, Blue, Green, Red, Result, Max, Min, AbsDiff; 56 | Src1 = _mm_loadu_si128((__m128i *)(LinePS + 0)); 57 | Src2 = _mm_loadu_si128((__m128i *)(LinePS + 16)); 58 | Src3 = _mm_loadu_si128((__m128i *)(LinePS + 32)); 59 | 60 | Blue = _mm_shuffle_epi8(Src1, _mm_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)); 61 | Blue = _mm_or_si128(Blue, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1))); 62 | Blue = _mm_or_si128(Blue, _mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13))); 63 | 64 | Green = _mm_shuffle_epi8(Src1, _mm_setr_epi8(1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)); 65 | Green = _mm_or_si128(Green, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1))); 66 | Green = _mm_or_si128(Green, _mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14))); 67 | 68 | Red = _mm_shuffle_epi8(Src1, _mm_setr_epi8(2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)); 69 | Red = _mm_or_si128(Red, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1))); 70 | Red = _mm_or_si128(Red, 
_mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15))); 71 | 72 | Max = _mm_max_epu8(_mm_max_epu8(Blue, Green), Red); //IM_Max(IM_Max(Red, Green), Blue) 73 | Min = _mm_min_epu8(_mm_min_epu8(Blue, Green), Red); //IM_Min(IM_Min(Red, Green), Blue) 74 | Result = _mm_cmpge_epu8(Blue, _mm_set1_epi8(20)); //Blue >= 20 75 | Result = _mm_and_si128(Result, _mm_cmpge_epu8(Green, _mm_set1_epi8(40))); //Green >= 40 76 | Result = _mm_and_si128(Result, _mm_cmpge_epu8(Red, _mm_set1_epi8(60))); //Red >= 60 77 | Result = _mm_and_si128(Result, _mm_cmpge_epu8(Red, Blue)); //Red >= Blue 78 | Result = _mm_and_si128(Result, _mm_cmpge_epu8(_mm_subs_epu8(Red, Green), _mm_set1_epi8(10))); //(Red - Green) >= 10 79 | Result = _mm_and_si128(Result, _mm_cmpge_epu8(_mm_subs_epu8(Max, Min), _mm_set1_epi8(10))); //IM_Max(IM_Max(Red, Green), Blue) - IM_Min(IM_Min(Red, Green), Blue) >= 10 80 | Result = _mm_or_si128(Result, _mm_set1_epi8(16)); 81 | _mm_storeu_si128((__m128i*)(LinePD + 0), Result); 82 | } 83 | for (int X = Block * BlockSize; X < Width; X++, LinePS += 3, LinePD++) 84 | { 85 | int Blue = LinePS[0], Green = LinePS[1], Red = LinePS[2]; 86 | if (Red >= 60 && Green >= 40 && Blue >= 20 && Red >= Blue && (Red - Green) >= 10 && IM_Max(IM_Max(Red, Green), Blue) - IM_Min(IM_Min(Red, Green), Blue) >= 10) 87 | LinePD[0] = 255; // 全为肤色部分 88 | else 89 | LinePD[0] = 16; 90 | } 91 | } 92 | } 93 | 94 | void _IM_GetRoughSkinRegion(unsigned char* Src, const int32_t Width, const int32_t start_row, const int32_t thread_stride, const int32_t Stride, unsigned char* Dest) { 95 | const int NonSkinLevel = 10; //非肤色部分的处理程序,本例取16,最大值取100,那样就是所有区域都为肤色,毫无意义 96 | const int BlockSize = 16; 97 | int Block = Width / BlockSize; 98 | for (int Y = start_row; Y < start_row + thread_stride; Y++) { 99 | unsigned char *LinePS = Src + Y * Stride; 100 | unsigned char *LinePD = Dest + Y * Width; 101 | for (int X = 0; X < Block * BlockSize; X += BlockSize, LinePS += BlockSize * 3, 
LinePD += BlockSize) { 102 | __m128i Src1, Src2, Src3, Blue, Green, Red, Result, Max, Min, AbsDiff; 103 | Src1 = _mm_loadu_si128((__m128i *)(LinePS + 0)); 104 | Src2 = _mm_loadu_si128((__m128i *)(LinePS + 16)); 105 | Src3 = _mm_loadu_si128((__m128i *)(LinePS + 32)); 106 | 107 | Blue = _mm_shuffle_epi8(Src1, _mm_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)); 108 | Blue = _mm_or_si128(Blue, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1))); 109 | Blue = _mm_or_si128(Blue, _mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13))); 110 | 111 | Green = _mm_shuffle_epi8(Src1, _mm_setr_epi8(1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)); 112 | Green = _mm_or_si128(Green, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1))); 113 | Green = _mm_or_si128(Green, _mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14))); 114 | 115 | Red = _mm_shuffle_epi8(Src1, _mm_setr_epi8(2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)); 116 | Red = _mm_or_si128(Red, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1))); 117 | Red = _mm_or_si128(Red, _mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15))); 118 | 119 | Max = _mm_max_epu8(_mm_max_epu8(Blue, Green), Red); //IM_Max(IM_Max(Red, Green), Blue) 120 | Min = _mm_min_epu8(_mm_min_epu8(Blue, Green), Red); //IM_Min(IM_Min(Red, Green), Blue) 121 | Result = _mm_cmpge_epu8(Blue, _mm_set1_epi8(20)); //Blue >= 20 122 | Result = _mm_and_si128(Result, _mm_cmpge_epu8(Green, _mm_set1_epi8(40))); //Green >= 40 123 | Result = _mm_and_si128(Result, _mm_cmpge_epu8(Red, _mm_set1_epi8(60))); //Red >= 60 124 | Result = _mm_and_si128(Result, _mm_cmpge_epu8(Red, Blue)); //Red >= Blue 125 | Result = _mm_and_si128(Result, 
_mm_cmpge_epu8(_mm_subs_epu8(Red, Green), _mm_set1_epi8(10))); //(Red - Green) >= 10 126 | Result = _mm_and_si128(Result, _mm_cmpge_epu8(_mm_subs_epu8(Max, Min), _mm_set1_epi8(10))); //IM_Max(IM_Max(Red, Green), Blue) - IM_Min(IM_Min(Red, Green), Blue) >= 10 127 | Result = _mm_or_si128(Result, _mm_set1_epi8(16)); 128 | _mm_storeu_si128((__m128i*)(LinePD + 0), Result); 129 | } 130 | for (int X = Block * BlockSize; X < Width; X++, LinePS += 3, LinePD++) 131 | { 132 | int Blue = LinePS[0], Green = LinePS[1], Red = LinePS[2]; 133 | if (Red >= 60 && Green >= 40 && Blue >= 20 && Red >= Blue && (Red - Green) >= 10 && IM_Max(IM_Max(Red, Green), Blue) - IM_Min(IM_Min(Red, Green), Blue) >= 10) 134 | LinePD[0] = 255; // 全为肤色部分 135 | else 136 | LinePD[0] = 16; 137 | } 138 | } 139 | } 140 | 141 | void IM_GetRoughSkinRegion_SSE2(unsigned char *Src, unsigned char *Skin, int width, int height, int stride) { 142 | const int32_t hw_concur = std::min(height >> 4, static_cast(std::thread::hardware_concurrency())); 143 | std::vector> fut(hw_concur); 144 | const int thread_stride = (height - 1) / hw_concur + 1; 145 | int i = 0, start = 0; 146 | for (; i < std::min(height, hw_concur); i++, start += thread_stride) 147 | { 148 | fut[i] = std::async(std::launch::async, _IM_GetRoughSkinRegion, Src, width, start, thread_stride, stride, Skin); 149 | } 150 | for (int j = 0; j < i; ++j) 151 | fut[j].wait(); 152 | } 153 | 154 | void IM_GrayToRGB(unsigned char *Gray, unsigned char *RGB, int Width, int Height, int Stride) 155 | { 156 | for (int Y = 0; Y < Height; Y++) 157 | { 158 | unsigned char *LinePS = Gray + Y * Width; // 源图的第Y行像素的首地址 159 | unsigned char *LinePD = RGB + Y * Stride; // Skin区域的第Y行像素的首地址 160 | int X = 0; 161 | for (int X = 0; X < Width; X++) 162 | { 163 | LinePD[0] = LinePD[1] = LinePD[2] = LinePS[X]; 164 | LinePD += 3; 165 | } 166 | } 167 | } 168 | 169 | int main() { 170 | Mat src = imread("F:\\face.jpg"); 171 | int Height = src.rows; 172 | int Width = src.cols; 173 | unsigned 
char *Src = src.data; 174 | unsigned char *Skin = new unsigned char[Height * Width]; 175 | unsigned char *Dest = new unsigned char[Height * Width * 3]; 176 | int Stride = Width * 3; 177 | int Radius = 11; 178 | int Adjustment = 50; 179 | int64 st = cvGetTickCount(); 180 | for (int i = 0; i <1000; i++) { 181 | IM_GetRoughSkinRegion_SSE2(Src, Skin, Width, Height, Stride); 182 | //IM_GrayToRGB(Skin, Dest, Width, Height, Stride); 183 | } 184 | double duration = (cv::getTickCount() - st) / cv::getTickFrequency(); 185 | printf("%.5f\n", duration); 186 | IM_GetRoughSkinRegion_SSE2(Src, Skin, Width, Height, Stride); 187 | IM_GrayToRGB(Skin, Dest, Width, Height, Stride); 188 | Mat dst(Height, Width, CV_8UC3, Dest); 189 | imshow("origin", src); 190 | imshow("result", dst); 191 | imwrite("F:\\res.jpg", dst); 192 | waitKey(0); 193 | } -------------------------------------------------------------------------------- /speed_sobel_edgedetection_sse.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | using namespace std; 5 | using namespace cv; 6 | 7 | inline unsigned char IM_ClampToByte(int Value) 8 | { 9 | if (Value < 0) 10 | return 0; 11 | else if (Value > 255) 12 | return 255; 13 | else 14 | return (unsigned char)Value; 15 | //return ((Value | ((signed int)(255 - Value) >> 31)) & ~((signed int)Value >> 31)); 16 | } 17 | 18 | void Sobel_FLOAT(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 19 | int Channel = Stride / Width; 20 | unsigned char *RowCopy = (unsigned char*)malloc((Width + 2) * 3 * Channel); 21 | unsigned char *First = RowCopy; 22 | unsigned char *Second = RowCopy + (Width + 2) * Channel; 23 | unsigned char *Third = RowCopy + (Width + 2) * 2 * Channel; 24 | //拷贝第二行数据,边界值填充 25 | memcpy(Second, Src, Channel); 26 | memcpy(Second + Channel, Src, Width*Channel); 27 | memcpy(Second + (Width + 1)*Channel, Src + (Width - 1)*Channel, Channel); 28 | //第一行和第二行一样 29 | 
memcpy(First, Second, (Width + 2) * Channel); 30 | //拷贝第三行数据,边界值填充 31 | memcpy(Third, Src + Stride, Channel); 32 | memcpy(Third + Channel, Src + Stride, Width * Channel); 33 | memcpy(Third + (Width + 1) * Channel, Src + Stride + (Width - 1) * Channel, Channel); 34 | 35 | for (int Y = 0; Y < Height; Y++) { 36 | unsigned char *LinePS = Src + Y * Stride; 37 | unsigned char *LinePD = Dest + Y * Stride; 38 | if (Y != 0) { 39 | unsigned char *Temp = First; 40 | First = Second; 41 | Second = Third; 42 | Third = Temp; 43 | } 44 | if (Y == Height - 1) { 45 | memcpy(Third, Second, (Width + 2) * Channel); 46 | } 47 | else { 48 | memcpy(Third, Src + (Y + 1) * Stride, Channel); 49 | memcpy(Third + Channel, Src + (Y + 1) * Stride, Width * Channel); // 由于备份了前面一行的数据,这里即使Src和Dest相同也是没有问题的 50 | memcpy(Third + (Width + 1) * Channel, Src + (Y + 1) * Stride + (Width - 1) * Channel, Channel); 51 | } 52 | if (Channel == 1) { 53 | for (int X = 0; X < Width; X++) 54 | { 55 | int GX = First[X] - First[X + 2] + (Second[X] - Second[X + 2]) * 2 + Third[X] - Third[X + 2]; 56 | int GY = First[X] + First[X + 2] + (First[X + 1] - Third[X + 1]) * 2 - Third[X] - Third[X + 2]; 57 | LinePD[X] = IM_ClampToByte(sqrtf(GX * GX + GY * GY + 0.0F)); 58 | } 59 | } 60 | else 61 | { 62 | for (int X = 0; X < Width * 3; X++) 63 | { 64 | int GX = First[X] - First[X + 6] + (Second[X] - Second[X + 6]) * 2 + Third[X] - Third[X + 6]; 65 | int GY = First[X] + First[X + 6] + (First[X + 3] - Third[X + 3]) * 2 - Third[X] - Third[X + 6]; 66 | LinePD[X] = IM_ClampToByte(sqrtf(GX * GX + GY * GY + 0.0F)); 67 | } 68 | } 69 | } 70 | free(RowCopy); 71 | } 72 | 73 | void Sobel_INT(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 74 | int Channel = Stride / Width; 75 | unsigned char *RowCopy = (unsigned char*)malloc((Width + 2) * 3 * Channel); 76 | unsigned char *First = RowCopy; 77 | unsigned char *Second = RowCopy + (Width + 2) * Channel; 78 | unsigned char *Third = RowCopy + (Width + 2) * 2 * 
Channel; 79 | //拷贝第二行数据,边界值填充 80 | memcpy(Second, Src, Channel); 81 | memcpy(Second + Channel, Src, Width*Channel); 82 | memcpy(Second + (Width + 1)*Channel, Src + (Width - 1)*Channel, Channel); 83 | //第一行和第二行一样 84 | memcpy(First, Second, (Width + 2) * Channel); 85 | //拷贝第三行数据,边界值填充 86 | memcpy(Third, Src + Stride, Channel); 87 | memcpy(Third + Channel, Src + Stride, Width * Channel); 88 | memcpy(Third + (Width + 1) * Channel, Src + Stride + (Width - 1) * Channel, Channel); 89 | 90 | unsigned char Table[65026]; 91 | for (int Y = 0; Y < 65026; Y++) Table[Y] = (sqrtf(Y + 0.0f) + 0.5f); 92 | for (int Y = 0; Y < Height; Y++) { 93 | unsigned char *LinePS = Src + Y * Stride; 94 | unsigned char *LinePD = Dest + Y * Stride; 95 | if (Y != 0) { 96 | unsigned char *Temp = First; 97 | First = Second; 98 | Second = Third; 99 | Third = Temp; 100 | } 101 | if (Y == Height - 1) { 102 | memcpy(Third, Second, (Width + 2) * Channel); 103 | } 104 | else { 105 | memcpy(Third, Src + (Y + 1) * Stride, Channel); 106 | memcpy(Third + Channel, Src + (Y + 1) * Stride, Width * Channel); // 由于备份了前面一行的数据,这里即使Src和Dest相同也是没有问题的 107 | memcpy(Third + (Width + 1) * Channel, Src + (Y + 1) * Stride + (Width - 1) * Channel, Channel); 108 | } 109 | if (Channel == 1) { 110 | for (int X = 0; X < Width; X++) 111 | { 112 | int GX = First[X] - First[X + 2] + (Second[X] - Second[X + 2]) * 2 + Third[X] - Third[X + 2]; 113 | int GY = First[X] + First[X + 2] + (First[X + 1] - Third[X + 1]) * 2 - Third[X] - Third[X + 2]; 114 | LinePD[X] = Table[min(GX * GX + GY * GY, 65025)]; 115 | } 116 | } 117 | else 118 | { 119 | for (int X = 0; X < Width * 3; X++) 120 | { 121 | int GX = First[X] - First[X + 6] + (Second[X] - Second[X + 6]) * 2 + Third[X] - Third[X + 6]; 122 | int GY = First[X] + First[X + 6] + (First[X + 3] - Third[X + 3]) * 2 - Third[X] - Third[X + 6]; 123 | LinePD[X] = Table[min(GX * GX + GY * GY, 65025)]; 124 | } 125 | } 126 | } 127 | free(RowCopy); 128 | } 129 | 130 | void Sobel_SSE1(unsigned char *Src, 
unsigned char *Dest, int Width, int Height, int Stride) { 131 | int Channel = Stride / Width; 132 | unsigned char *RowCopy = (unsigned char*)malloc((Width + 2) * 3 * Channel); 133 | unsigned char *First = RowCopy; 134 | unsigned char *Second = RowCopy + (Width + 2) * Channel; 135 | unsigned char *Third = RowCopy + (Width + 2) * 2 * Channel; 136 | //拷贝第二行数据,边界值填充 137 | memcpy(Second, Src, Channel); 138 | memcpy(Second + Channel, Src, Width*Channel); 139 | memcpy(Second + (Width + 1)*Channel, Src + (Width - 1)*Channel, Channel); 140 | //第一行和第二行一样 141 | memcpy(First, Second, (Width + 2) * Channel); 142 | //拷贝第三行数据,边界值填充 143 | memcpy(Third, Src + Stride, Channel); 144 | memcpy(Third + Channel, Src + Stride, Width * Channel); 145 | memcpy(Third + (Width + 1) * Channel, Src + Stride + (Width - 1) * Channel, Channel); 146 | 147 | int BlockSize = 8, Block = (Width * Channel) / BlockSize; 148 | 149 | unsigned char Table[65026]; 150 | for (int Y = 0; Y < 65026; Y++) Table[Y] = (sqrtf(Y + 0.0f) + 0.5f); 151 | for (int Y = 0; Y < Height; Y++) { 152 | unsigned char *LinePS = Src + Y * Stride; 153 | unsigned char *LinePD = Dest + Y * Stride; 154 | if (Y != 0) { 155 | unsigned char *Temp = First; 156 | First = Second; 157 | Second = Third; 158 | Third = Temp; 159 | } 160 | if (Y == Height - 1) { 161 | memcpy(Third, Second, (Width + 2) * Channel); 162 | } 163 | else { 164 | memcpy(Third, Src + (Y + 1) * Stride, Channel); 165 | memcpy(Third + Channel, Src + (Y + 1) * Stride, Width * Channel); // 由于备份了前面一行的数据,这里即使Src和Dest相同也是没有问题的 166 | memcpy(Third + (Width + 1) * Channel, Src + (Y + 1) * Stride + (Width - 1) * Channel, Channel); 167 | } 168 | if (Channel == 1) { 169 | for (int X = 0; X < Width; X++) 170 | { 171 | int GX = First[X] - First[X + 2] + (Second[X] - Second[X + 2]) * 2 + Third[X] - Third[X + 2]; 172 | int GY = First[X] + First[X + 2] + (First[X + 1] - Third[X + 1]) * 2 - Third[X] - Third[X + 2]; 173 | //LinePD[X] = Table[min(GX * GX + GY * GY, 65025)]; 174 | } 175 | } 
176 | else 177 | { 178 | __m128i Zero = _mm_setzero_si128(); 179 | for (int X = 0; X < Block * BlockSize; X += BlockSize) 180 | { 181 | __m128i FirstP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(First + X)), Zero); 182 | __m128i FirstP1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(First + X + 3)), Zero); 183 | __m128i FirstP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(First + X + 6)), Zero); 184 | 185 | __m128i SecondP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Second + X)), Zero); 186 | __m128i SecondP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Second + X + 6)), Zero); 187 | 188 | __m128i ThirdP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Third + X)), Zero); 189 | __m128i ThirdP1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Third + X + 3)), Zero); 190 | __m128i ThirdP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Third + X + 6)), Zero); 191 | 192 | __m128i GX16 = _mm_abs_epi16(_mm_add_epi16(_mm_add_epi16(_mm_sub_epi16(FirstP0, FirstP2), _mm_slli_epi16(_mm_sub_epi16(SecondP0, SecondP2), 1)), _mm_sub_epi16(ThirdP0, ThirdP2))); 193 | __m128i GY16 = _mm_abs_epi16(_mm_sub_epi16(_mm_add_epi16(_mm_add_epi16(FirstP0, FirstP2), _mm_slli_epi16(_mm_sub_epi16(FirstP1, ThirdP1), 1)), _mm_add_epi16(ThirdP0, ThirdP2))); 194 | 195 | __m128i GX32L = _mm_unpacklo_epi16(GX16, Zero); 196 | __m128i GX32H = _mm_unpackhi_epi16(GX16, Zero); 197 | __m128i GY32L = _mm_unpacklo_epi16(GY16, Zero); 198 | __m128i GY32H = _mm_unpackhi_epi16(GY16, Zero); 199 | __m128i ResultL = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps(_mm_add_epi32(_mm_mullo_epi32(GX32L, GX32L), _mm_mullo_epi32(GY32L, GY32L))))); 200 | __m128i ResultH = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps(_mm_add_epi32(_mm_mullo_epi32(GX32H, GX32H), _mm_mullo_epi32(GY32H, GY32H))))); 201 | _mm_storel_epi64((__m128i *)(LinePD + X), _mm_packus_epi16(_mm_packus_epi32(ResultL, ResultH), Zero)); 202 | } 203 | 204 | for (int X = Block * BlockSize; X < Width * 3; X++) 205 | { 206 | int GX = First[X] - 
First[X + 6] + (Second[X] - Second[X + 6]) * 2 + Third[X] - Third[X + 6]; 207 | int GY = First[X] + First[X + 6] + (First[X + 3] - Third[X + 3]) * 2 - Third[X] - Third[X + 6]; 208 | LinePD[X] = IM_ClampToByte(sqrtf(GX * GX + GY * GY + 0.0F)); 209 | } 210 | } 211 | } 212 | free(RowCopy); 213 | } 214 | 215 | void Sobel_SSE2(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 216 | int Channel = Stride / Width; 217 | unsigned char *RowCopy = (unsigned char*)malloc((Width + 2) * 3 * Channel); 218 | unsigned char *First = RowCopy; 219 | unsigned char *Second = RowCopy + (Width + 2) * Channel; 220 | unsigned char *Third = RowCopy + (Width + 2) * 2 * Channel; 221 | //拷贝第二行数据,边界值填充 222 | memcpy(Second, Src, Channel); 223 | memcpy(Second + Channel, Src, Width*Channel); 224 | memcpy(Second + (Width + 1)*Channel, Src + (Width - 1)*Channel, Channel); 225 | //第一行和第二行一样 226 | memcpy(First, Second, (Width + 2) * Channel); 227 | //拷贝第三行数据,边界值填充 228 | memcpy(Third, Src + Stride, Channel); 229 | memcpy(Third + Channel, Src + Stride, Width * Channel); 230 | memcpy(Third + (Width + 1) * Channel, Src + Stride + (Width - 1) * Channel, Channel); 231 | 232 | int BlockSize = 8, Block = (Width * Channel) / BlockSize; 233 | 234 | unsigned char Table[65026]; 235 | for (int Y = 0; Y < 65026; Y++) Table[Y] = (sqrtf(Y + 0.0f) + 0.5f); 236 | for (int Y = 0; Y < Height; Y++) { 237 | unsigned char *LinePS = Src + Y * Stride; 238 | unsigned char *LinePD = Dest + Y * Stride; 239 | if (Y != 0) { 240 | unsigned char *Temp = First; 241 | First = Second; 242 | Second = Third; 243 | Third = Temp; 244 | } 245 | if (Y == Height - 1) { 246 | memcpy(Third, Second, (Width + 2) * Channel); 247 | } 248 | else { 249 | memcpy(Third, Src + (Y + 1) * Stride, Channel); 250 | memcpy(Third + Channel, Src + (Y + 1) * Stride, Width * Channel); // 由于备份了前面一行的数据,这里即使Src和Dest相同也是没有问题的 251 | memcpy(Third + (Width + 1) * Channel, Src + (Y + 1) * Stride + (Width - 1) * Channel, Channel); 252 | } 253 | 
if (Channel == 1) { 254 | for (int X = 0; X < Width; X++) 255 | { 256 | int GX = First[X] - First[X + 2] + (Second[X] - Second[X + 2]) * 2 + Third[X] - Third[X + 2]; 257 | int GY = First[X] + First[X + 2] + (First[X + 1] - Third[X + 1]) * 2 - Third[X] - Third[X + 2]; 258 | //LinePD[X] = Table[min(GX * GX + GY * GY, 65025)]; 259 | } 260 | } 261 | else 262 | { 263 | __m128i Zero = _mm_setzero_si128(); 264 | for (int X = 0; X < Block * BlockSize; X += BlockSize) 265 | { 266 | __m128i FirstP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(First + X)), Zero); 267 | __m128i FirstP1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(First + X + 3)), Zero); 268 | __m128i FirstP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(First + X + 6)), Zero); 269 | 270 | __m128i SecondP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Second + X)), Zero); 271 | __m128i SecondP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Second + X + 6)), Zero); 272 | 273 | __m128i ThirdP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Third + X)), Zero); 274 | __m128i ThirdP1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Third + X + 3)), Zero); 275 | __m128i ThirdP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Third + X + 6)), Zero); 276 | 277 | __m128i GX16 = _mm_abs_epi16(_mm_add_epi16(_mm_add_epi16(_mm_sub_epi16(FirstP0, FirstP2), _mm_slli_epi16(_mm_sub_epi16(SecondP0, SecondP2), 1)), _mm_sub_epi16(ThirdP0, ThirdP2))); 278 | __m128i GY16 = _mm_abs_epi16(_mm_sub_epi16(_mm_add_epi16(_mm_add_epi16(FirstP0, FirstP2), _mm_slli_epi16(_mm_sub_epi16(FirstP1, ThirdP1), 1)), _mm_add_epi16(ThirdP0, ThirdP2))); 279 | 280 | __m128i GXYL = _mm_unpacklo_epi16(GX16, GY16); 281 | __m128i GXYH = _mm_unpackhi_epi16(GX16, GY16); 282 | 283 | __m128i ResultL = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps(_mm_madd_epi16(GXYL, GXYL)))); 284 | __m128i ResultH = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps(_mm_madd_epi16(GXYH, GXYH)))); 285 | _mm_storel_epi64((__m128i *)(LinePD + X), 
_mm_packus_epi16(_mm_packus_epi32(ResultL, ResultH), Zero)); 286 | } 287 | 288 | for (int X = Block * BlockSize; X < Width * 3; X++) 289 | { 290 | int GX = First[X] - First[X + 6] + (Second[X] - Second[X + 6]) * 2 + Third[X] - Third[X + 6]; 291 | int GY = First[X] + First[X + 6] + (First[X + 3] - Third[X + 3]) * 2 - Third[X] - Third[X + 6]; 292 | LinePD[X] = IM_ClampToByte(sqrtf(GX * GX + GY * GY + 0.0F)); 293 | } 294 | } 295 | } 296 | free(RowCopy); 297 | } 298 | 299 | unsigned char *RowCopy; 300 | unsigned char *First; 301 | unsigned char *Second; 302 | unsigned char *Third; 303 | int Channel, Block, BlockSize; 304 | void _Sobel(unsigned char* Src, const int32_t Width, const int32_t Height, const int32_t start_row, const int32_t thread_stride, const int32_t Stride, unsigned char* Dest) { 305 | for (int Y = start_row; Y < start_row + thread_stride; Y++) { 306 | unsigned char *LinePS = Src + Y * Stride; 307 | unsigned char *LinePD = Dest + Y * Stride; 308 | if (Y != 0) { 309 | unsigned char *Temp = First; 310 | First = Second; 311 | Second = Third; 312 | Third = Temp; 313 | } 314 | if (Y == Height - 1) { 315 | memcpy(Third, Second, (Width + 2) * Channel); 316 | } 317 | else { 318 | memcpy(Third, Src + (Y + 1) * Stride, Channel); 319 | memcpy(Third + Channel, Src + (Y + 1) * Stride, Width * Channel); // 由于备份了前面一行的数据,这里即使Src和Dest相同也是没有问题的 320 | memcpy(Third + (Width + 1) * Channel, Src + (Y + 1) * Stride + (Width - 1) * Channel, Channel); 321 | } 322 | if (Channel == 1) { 323 | for (int X = 0; X < Width; X++) 324 | { 325 | int GX = First[X] - First[X + 2] + (Second[X] - Second[X + 2]) * 2 + Third[X] - Third[X + 2]; 326 | int GY = First[X] + First[X + 2] + (First[X + 1] - Third[X + 1]) * 2 - Third[X] - Third[X + 2]; 327 | //LinePD[X] = Table[min(GX * GX + GY * GY, 65025)]; 328 | } 329 | } 330 | else 331 | { 332 | __m256i Zero = _mm256_setzero_si256(); 333 | for (int X = 0; X < Block * BlockSize; X += BlockSize) 334 | { 335 | __m256i FirstP0 = 
_mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)(First + X))); 336 | __m256i FirstP1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)(First + X + 3))); 337 | __m256i FirstP2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)(First + X + 6))); 338 | 339 | __m256i SecondP0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)(Second + X))); 340 | __m256i SecondP2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)(Second + X + 6))); 341 | 342 | __m256i ThirdP0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)(Third + X))); 343 | __m256i ThirdP1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)(Third + X + 3))); 344 | __m256i ThirdP2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)(Third + X + 6))); 345 | 346 | //GX0 GX1 GX2 GX3 GX4 GX5 GX6 GX7 GX8 GX9 GX10 GX11 GX12 GX13 GX14 GX15 347 | __m256i GX16 = _mm256_abs_epi16(_mm256_adds_epi16(_mm256_adds_epi16(_mm256_subs_epi16(FirstP0, FirstP2), _mm256_slli_epi16(_mm256_subs_epi16(SecondP0, SecondP2), 1)), _mm256_subs_epi16(ThirdP0, ThirdP2))); 348 | //GY0 GY1 GY2 GY3 GY4 GY5 GY6 GY7 GY8 GY9 GY10 GY11 GY12 GY13 GY14 GY15 349 | __m256i GY16 = _mm256_abs_epi16(_mm256_subs_epi16(_mm256_adds_epi16(_mm256_adds_epi16(FirstP0, FirstP2), _mm256_slli_epi16(_mm256_subs_epi16(FirstP1, ThirdP1), 1)), _mm256_adds_epi16(ThirdP0, ThirdP2))); 350 | //GX0  GY0  GX1  GY1  GX2  GY2  GX3  GY3 GX4 GY4 GX5 GY5 GX6 GY6 GX7 GY7 351 | __m256i GXYL = _mm256_unpacklo_epi16(GX16, GY16); 352 | //GX8  GY8  GX9  GY9  GX10 GY10  GX11 GY11 GX12 GY12 GX13 GY13 GX14 GY14 GX15 GY15 353 | __m256i GXYH = _mm256_unpackhi_epi16(GX16, GY16); 354 | 355 | 356 | __m256i ResultL = _mm256_cvtps_epi32(_mm256_sqrt_ps(_mm256_cvtepi32_ps(_mm256_madd_epi16(GXYL, GXYL)))); 357 | __m256i ResultH = _mm256_cvtps_epi32(_mm256_sqrt_ps(_mm256_cvtepi32_ps(_mm256_madd_epi16(GXYH, GXYH)))); 358 | 359 | //__m256i Result = _mm256_packus_epi16(_mm256_packus_epi32(ResultL, ResultH), Zero); 360 | 361 | __m128i Ans1 = 
_mm256_castsi256_si128(ResultL); 362 | _mm_storeu_si128((__m128i *)(LinePD + X), Ans1); 363 | 364 | __m128i Ans2 = _mm256_castsi256_si128(ResultL); 365 | _mm_storeu_si128((__m128i *)(LinePD + X + 8), Ans2); 366 | } 367 | 368 | for (int X = Block * BlockSize; X < Width * 3; X++) 369 | { 370 | int GX = First[X] - First[X + 6] + (Second[X] - Second[X + 6]) * 2 + Third[X] - Third[X + 6]; 371 | int GY = First[X] + First[X + 6] + (First[X + 3] - Third[X + 3]) * 2 - Third[X] - Third[X + 6]; 372 | LinePD[X] = IM_ClampToByte(sqrtf(GX * GX + GY * GY + 0.0F)); 373 | } 374 | } 375 | } 376 | } 377 | 378 | void Sobel_AVX1(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 379 | Channel = Stride / Width; 380 | RowCopy = (unsigned char*)malloc((Width + 2) * 3 * Channel); 381 | First = RowCopy; 382 | Second = RowCopy + (Width + 2) * Channel; 383 | Third = RowCopy + (Width + 2) * 2 * Channel; 384 | //拷贝第二行数据,边界值填充 385 | memcpy(Second, Src, Channel); 386 | memcpy(Second + Channel, Src, Width*Channel); 387 | memcpy(Second + (Width + 1)*Channel, Src + (Width - 1)*Channel, Channel); 388 | //第一行和第二行一样 389 | memcpy(First, Second, (Width + 2) * Channel); 390 | //拷贝第三行数据,边界值填充 391 | memcpy(Third, Src + Stride, Channel); 392 | memcpy(Third + Channel, Src + Stride, Width * Channel); 393 | memcpy(Third + (Width + 1) * Channel, Src + Stride + (Width - 1) * Channel, Channel); 394 | 395 | BlockSize = 16, Block = (Width * Channel) / BlockSize; 396 | 397 | _Sobel(Src, Width, Height, 0, Height, Stride, Dest); 398 | 399 | free(RowCopy); 400 | } 401 | 402 | void Sobel_AVX2(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) { 403 | //INIT 404 | Channel = Stride / Width; 405 | RowCopy = (unsigned char*)malloc((Width + 2) * 3 * Channel); 406 | First = RowCopy; 407 | Second = RowCopy + (Width + 2) * Channel; 408 | Third = RowCopy + (Width + 2) * 2 * Channel; 409 | //拷贝第二行数据,边界值填充 410 | memcpy(Second, Src, Channel); 411 | memcpy(Second + Channel, Src, 
Width*Channel); 412 | memcpy(Second + (Width + 1)*Channel, Src + (Width - 1)*Channel, Channel); 413 | //第一行和第二行一样 414 | memcpy(First, Second, (Width + 2) * Channel); 415 | //拷贝第三行数据,边界值填充 416 | memcpy(Third, Src + Stride, Channel); 417 | memcpy(Third + Channel, Src + Stride, Width * Channel); 418 | memcpy(Third + (Width + 1) * Channel, Src + Stride + (Width - 1) * Channel, Channel); 419 | 420 | BlockSize = 16, Block = (Width * Channel) / BlockSize; 421 | 422 | //Run 423 | const int32_t hw_concur = std::min(Height >> 4, static_cast(std::thread::hardware_concurrency())); 424 | std::vector> fut(hw_concur); 425 | const int thread_stride = (Height - 1) / hw_concur + 1; 426 | int i = 0, start = 0; 427 | for (; i < std::min(Height, hw_concur); i++, start += thread_stride) 428 | { 429 | fut[i] = std::async(std::launch::async, _Sobel, Src, Width, Height, start, thread_stride, Stride, Dest); 430 | } 431 | for (int j = 0; j < i; ++j) 432 | fut[j].wait(); 433 | 434 | free(RowCopy); 435 | } 436 | 437 | 438 | int main() { 439 | Mat src = imread("F:\\car.jpg"); 440 | int Height = src.rows; 441 | int Width = src.cols; 442 | unsigned char *Src = src.data; 443 | unsigned char *Dest = new unsigned char[Height * Width * 3]; 444 | int Stride = Width * 3; 445 | int Radius = 11; 446 | int Adjustment = 50; 447 | int64 st = cvGetTickCount(); 448 | /*for (int i = 0; i <1000; i++) { 449 | Sobel_SSE3(Src, Dest, Width, Height, Stride); 450 | }*/ 451 | double duration = (cv::getTickCount() - st) / cv::getTickFrequency(); 452 | printf("%.5f\n", duration); 453 | Sobel_SSE1(Src, Dest, Width, Height, Stride); 454 | Mat dst(Height, Width, CV_8UC3, Dest); 455 | imshow("origin", src); 456 | imshow("result", dst); 457 | imwrite("F:\\res.jpg", dst); 458 | waitKey(0); 459 | } 460 | -------------------------------------------------------------------------------- /speed_vibrance_algorithm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 
// Builds the integral image (summed-area table) of a single-channel image.
//
// Src      : Height rows of Width bytes, Stride bytes apart.
// Integral : (Width + 1) * (Height + 1) ints. Row 0 and column 0 are zero, and
//            Integral[(y + 1) * (Width + 1) + (x + 1)] is the sum of all pixels
//            in rows 0..y and columns 0..x.
void GetGrayIntegralImage(unsigned char *Src, int *Integral, int Width, int Height, int Stride)
{
    memset(Integral, 0, (Width + 1) * sizeof(int));                 // first row is all zeros
    for (int Y = 0; Y < Height; Y++)
    {
        unsigned char *LinePS = Src + Y * Stride;
        int *LinePL = Integral + Y * (Width + 1) + 1;               // previous integral row
        int *LinePD = Integral + (Y + 1) * (Width + 1) + 1;         // current integral row
        LinePD[-1] = 0;                                             // first column is zero
        for (int X = 0, Sum = 0; X < Width; X++)
        {
            Sum += LinePS[X];                                       // running sum along the row
            LinePD[X] = LinePL[X] + Sum;                            // add the column total above
        }
    }
}

// SSE2 version of GetGrayIntegralImage with the same inputs and output layout.
// Eight pixels are processed per iteration; the remaining (Width % 8) pixels of
// each row are handled by a scalar loop.
void GetGrayIntegralImage_SSE(unsigned char *Src, int *Integral, int Width, int Height, int Stride) {
    memset(Integral, 0, (Width + 1) * sizeof(int));                 // first row is all zeros
    int BlockSize = 8, Block = Width / BlockSize;
    for (int Y = 0; Y < Height; Y++) {
        unsigned char *LinePS = Src + Y * Stride;
        int *LinePL = Integral + Y * (Width + 1) + 1;               // previous integral row
        int *LinePD = Integral + (Y + 1) * (Width + 1) + 1;         // current integral row
        LinePD[-1] = 0;                                             // first column is zero
        __m128i PreV = _mm_setzero_si128();                         // row sum of all earlier blocks, in every lane
        __m128i Zero = _mm_setzero_si128();
        for (int X = 0; X < Block * BlockSize; X += BlockSize) {
            // Widen 8 pixels to 16 bit, then add three shifted copies so that
            // lane i of Low holds A[i] + A[i-1] + A[i-2] + A[i-3].
            __m128i Src_Shift0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(LinePS + X)), Zero);      // A7 A6 A5 A4 A3 A2 A1 A0
            __m128i Src_Shift1 = _mm_slli_si128(Src_Shift0, 2);                                         // A6 A5 A4 A3 A2 A1 A0 0
            __m128i Src_Shift2 = _mm_slli_si128(Src_Shift1, 2);                                         // A5 A4 A3 A2 A1 A0 0  0
            __m128i Src_Shift3 = _mm_slli_si128(Src_Shift2, 2);                                         // A4 A3 A2 A1 A0 0  0  0
            __m128i Shift_Add12 = _mm_add_epi16(Src_Shift1, Src_Shift2);
            __m128i Shift_Add03 = _mm_add_epi16(Src_Shift0, Src_Shift3);
            __m128i Low = _mm_add_epi16(Shift_Add12, Shift_Add03);
            // Lanes 0..3 of Low are already full prefix sums; adding them to
            // lanes 4..7 completes the prefix sums of pixels 4..7.
            __m128i High = _mm_add_epi32(_mm_unpackhi_epi16(Low, Zero), _mm_unpacklo_epi16(Low, Zero));
            __m128i SumL = _mm_loadu_si128((__m128i *)(LinePL + X + 0));
            __m128i SumH = _mm_loadu_si128((__m128i *)(LinePL + X + 4));
            SumL = _mm_add_epi32(SumL, PreV);
            SumL = _mm_add_epi32(SumL, _mm_unpacklo_epi16(Low, Zero));
            SumH = _mm_add_epi32(SumH, PreV);
            SumH = _mm_add_epi32(SumH, High);
            // Carry the total of this block (top lane of High) into the next one.
            PreV = _mm_add_epi32(PreV, _mm_shuffle_epi32(High, _MM_SHUFFLE(3, 3, 3, 3)));
            _mm_storeu_si128((__m128i *)(LinePD + X + 0), SumL);
            _mm_storeu_si128((__m128i *)(LinePD + X + 4), SumH);
        }
        // Scalar tail. V restarts from the row sum so far, recovered as
        // (current - previous) at the last written column; for Block == 0 this
        // reads column -1, which is zero in both rows.
        for (int X = Block * BlockSize, V = LinePD[X - 1] - LinePL[X - 1]; X < Width; X++)
        {
            V += LinePS[X];
            LinePD[X] = V + LinePL[X];
        }
    }
}

// Box (mean) blur of a single-channel image using an integral image.
//
// Every output pixel is the rounded mean of the window
// [X - Radius, X + Radius] x [Y - Radius, Y + Radius], clipped to the image.
//
// Src / Dest : Height rows of Width bytes, Stride bytes apart.
// Radius     : window half-size; negative values are treated as 0.
//
// Does nothing when the image is empty or the integral buffer cannot be
// allocated. The integral image uses int sums, so Width * Height * 255 must
// fit in an int.
void BoxBlur(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Radius) {
    if (Width <= 0 || Height <= 0)
        return;
    if (Radius < 0)
        Radius = 0;                                 // a negative radius would give an empty window
    int *Integral = (int *)malloc((size_t)(Width + 1) * (Height + 1) * sizeof(int));
    if (!Integral)
        return;
    GetGrayIntegralImage(Src, Integral, Width, Height, Stride);
#pragma omp parallel for num_threads(4)
    for (int Y = 0; Y < Height; Y++) {
        int Y1 = std::max(Y - Radius, 0);
        // Exclusive bottom bound. The integral image has Height + 1 rows, so the
        // clamp is Height (not Height - 1), otherwise the last image row is
        // never summed and the window can become empty.
        int Y2 = std::min(Y + Radius + 1, Height);
        int *LineP1 = Integral + Y1 * (Width + 1);
        int *LineP2 = Integral + Y2 * (Width + 1);
        unsigned char *LinePD = Dest + Y * Stride;
        for (int X = 0; X < Width; X++) {           // iterate over columns
            int X1 = std::max(X - Radius, 0);
            int X2 = std::min(X + Radius + 1, Width);   // exclusive right bound
            int Sum = LineP2[X2] - LineP1[X2] - LineP2[X1] + LineP1[X1];
            int PixelCount = (X2 - X1) * (Y2 - Y1);
            LinePD[X] = (Sum + (PixelCount >> 1)) / PixelCount;    // rounded mean
        }
    }
    free(Integral);
}

// Vibrance adjustment, floating-point reference version (packed 3-byte pixels
// read as Blue, Green, Red).
//
// A positive Adjustment increases saturation, a negative one decreases it.
// Channels below the pixel maximum are moved by
// (Max - channel) * (|Max - Avg| / 127) * (-0.01 * Adjustment) and clamped to
// [0, 255]; the maximum channel is left untouched.
void VibranceAlgorithm_FLOAT(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Adjustment) {
    float VibranceAdjustment = -0.01 * Adjustment;
    for (int Y = 0; Y < Height; Y++) {
        unsigned char *LinePS = Src + Y * Stride;
        unsigned char *LinePD = Dest + Y * Stride;
        for (int X = 0; X < Width; X++) {
            int Blue = LinePS[0], Green = LinePS[1], Red = LinePS[2];
            int Avg = (Blue + Green + Green + Red) >> 2;            // green weighted twice
            int Max = std::max(std::max(Blue, Green), Red);
            float AmtVal = (abs(Max - Avg) / 127.0f) * VibranceAdjustment;
            // int += float: the sum is formed in float and truncated toward zero.
            if (Blue != Max) Blue += (Max - Blue) * AmtVal;
            if (Green != Max) Green += (Max - Green) * AmtVal;
            if (Red != Max) Red += (Max - Red) * AmtVal;
            if (Red < 0) Red = 0;
            else if (Red > 255) Red = 255;
            if (Green < 0) Green = 0;
            else if (Green > 255) Green = 255;
            if (Blue < 0) Blue = 0;
            else if (Blue > 255) Blue = 255;
            LinePD[0] = Blue;
            LinePD[1] = Green;
            LinePD[2] = Red;
            LinePS += 3;
            LinePD += 3;
        }
    }
}
// Vibrance adjustment in fixed-point integer arithmetic (packed 3-byte pixels
// read as Blue, Green, Red).
//
// The strength is (int)(-1.28 * Adjustment); each channel is moved by
// ((Max - channel) * (Max - Avg) * strength) >> 14 and clamped to [0, 255].
// The maximum channel receives a zero offset and therefore stays unchanged.
void VibranceAlgorithm_INT(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Adjustment) {
    const int Strength = (int)(-1.28 * Adjustment);
    for (int Row = 0; Row < Height; ++Row) {
        const unsigned char *In = Src + Row * Stride;
        unsigned char *Out = Dest + Row * Stride;
        for (int Col = 0; Col < Width; ++Col, In += 3, Out += 3) {
            int B = In[0];
            int G = In[1];
            int R = In[2];

            // Green-weighted average and the largest of the three channels.
            const int Mean = (B + G + G + R) >> 2;
            int Peak = (B > G) ? B : G;
            if (R > Peak)
                Peak = R;

            const int Gain = (Peak - Mean) * Strength;
            B += ((Peak - B) * Gain) >> 14;
            G += ((Peak - G) * Gain) >> 14;
            R += ((Peak - R) * Gain) >> 14;

            // Clamp to the byte range before storing.
            R = (R < 0) ? 0 : ((R > 255) ? 255 : R);
            G = (G < 0) ? 0 : ((G > 255) ? 255 : G);
            B = (B < 0) ? 0 : ((B > 255) ? 255 : B);

            Out[0] = B;
            Out[1] = G;
            Out[2] = R;
        }
    }
}
LinePD[1] = Green; 140 | LinePD[2] = Red; 141 | LinePS += 3; 142 | LinePD += 3; 143 | } 144 | } 145 | } 146 | 147 | void VibranceAlgorithm_INT_OpenMP(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Adjustment) { 148 | int VibranceAdjustment = -1.28 * Adjustment; 149 | for (int Y = 0; Y < Height; Y++) { 150 | unsigned char *LinePS = Src + Y * Stride; 151 | unsigned char *LinePD = Dest + Y * Stride; 152 | #pragma omp parallel for num_threads(4) 153 | for (int X = 0; X < Width; X++) { 154 | int Blue, Green, Red, Max; 155 | Blue = LinePS[X*3 + 0], Green = LinePS[X*3 + 1], Red = LinePS[X*3 + 2]; 156 | int Avg = (Blue + Green + Green + Red) >> 2; 157 | if (Blue > Green) 158 | Max = Blue; 159 | else 160 | Max = Green; 161 | if (Red > Max) 162 | Max = Red; 163 | int AmtVal = (Max - Avg) * VibranceAdjustment; 164 | if (Blue != Max) Blue += (((Max - Blue) * AmtVal) >> 14); 165 | if (Green != Max) Green += (((Max - Green) * AmtVal) >> 14); 166 | if (Red != Max) Red += (((Max - Red) * AmtVal) >> 14); 167 | if (Red < 0) Red = 0; 168 | else if (Red > 255) Red = 255; 169 | if (Green < 0) Green = 0; 170 | else if (Green > 255) Green = 255; 171 | if (Blue < 0) Blue = 0; 172 | else if (Blue > 255) Blue = 255; 173 | LinePD[X*3 + 0] = Blue; 174 | LinePD[X*3 + 1] = Green; 175 | LinePD[X*3 + 2] = Red; 176 | } 177 | } 178 | } 179 | 180 | void VibranceAlgorithm_SSE(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Adjustment) { 181 | int VibranceAdjustment = (int)(-1.28 * Adjustment); 182 | __m128i Adjustment128 = _mm_setr_epi16(VibranceAdjustment, VibranceAdjustment, VibranceAdjustment, VibranceAdjustment, 183 | VibranceAdjustment, VibranceAdjustment, VibranceAdjustment, VibranceAdjustment); 184 | int X; 185 | for (int Y = 0; Y < Height; Y++) { 186 | unsigned char *LinePS = Src + Y * Stride; 187 | unsigned char *LinePD = Dest + Y * Stride; 188 | X = 0; 189 | __m128i Src1, Src2, Src3, Dest1, Dest2, Dest3, Blue8, Green8, Red8, 
Max8; 190 | __m128i BL16, BH16, GL16, GH16, RL16, RH16, MaxL16, MaxH16, AvgL16, AvgH16, AmtVal; 191 | __m128i Zero = _mm_setzero_si128(); 192 | for (; X < Width - 16; X += 16, LinePS += 48, LinePD += 48) { 193 | Src1 = _mm_loadu_si128((__m128i *)(LinePS + 0)); //B1,G1,R1,B2,G2,R2,B3,G3,R3,B4,G4,R4,B5,G5,R5,B6 194 | Src2 = _mm_loadu_si128((__m128i *)(LinePS + 16));//G6,R6,B7,G7,R7,B8,G8,R8,B9,G9,R9,B10,G10,R10,B11,G11 195 | Src3 = _mm_loadu_si128((__m128i *)(LinePS + 32));//R11,B12,G12,R12,B13,G13,R13,B14,G14,R14,B15,G15,R15,B16,G16,R16 196 | 197 | Blue8 = _mm_shuffle_epi8(Src1, _mm_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)); 198 | Blue8 = _mm_or_si128(Blue8, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1))); 199 | Blue8 = _mm_or_si128(Blue8, _mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13))); 200 | 201 | Green8 = _mm_shuffle_epi8(Src1, _mm_setr_epi8(1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)); 202 | Green8 = _mm_or_si128(Green8, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1))); 203 | Green8 = _mm_or_si128(Green8, _mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14))); 204 | 205 | Red8 = _mm_shuffle_epi8(Src1, _mm_setr_epi8(2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)); 206 | Red8 = _mm_or_si128(Red8, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1))); 207 | Red8 = _mm_or_si128(Red8, _mm_shuffle_epi8(Src3, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15))); 208 | 209 | Max8 = _mm_max_epu8(_mm_max_epu8(Blue8, Green8), Red8); 210 | 211 | BL16 = _mm_unpacklo_epi8(Blue8, Zero); 212 | BH16 = _mm_unpackhi_epi8(Blue8, Zero); 213 | GL16 = _mm_unpacklo_epi8(Green8, Zero); 214 | GH16 = _mm_unpackhi_epi8(Green8, Zero); 215 | RL16 = 
_mm_unpacklo_epi8(Red8, Zero); 216 | RH16 = _mm_unpackhi_epi8(Red8, Zero); 217 | MaxL16 = _mm_unpacklo_epi8(Max8, Zero); 218 | MaxH16 = _mm_unpackhi_epi8(Max8, Zero); 219 | 220 | AvgL16 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(BL16, RL16), _mm_slli_epi16(GL16, 1)), 2); 221 | AvgH16 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(BH16, RH16), _mm_slli_epi16(GH16, 1)), 2); 222 | 223 | AmtVal = _mm_mullo_epi16(_mm_sub_epi16(MaxL16, AvgL16), Adjustment128); 224 | BL16 = _mm_adds_epi16(BL16, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(MaxL16, BL16), 2), AmtVal)); 225 | GL16 = _mm_adds_epi16(GL16, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(MaxL16, GL16), 2), AmtVal)); 226 | RL16 = _mm_adds_epi16(RL16, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(MaxL16, RL16), 2), AmtVal)); 227 | 228 | AmtVal = _mm_mullo_epi16(_mm_sub_epi16(MaxH16, AvgH16), Adjustment128); 229 | BH16 = _mm_adds_epi16(BH16, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(MaxH16, BH16), 2), AmtVal)); 230 | GH16 = _mm_adds_epi16(GH16, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(MaxH16, GH16), 2), AmtVal)); 231 | RH16 = _mm_adds_epi16(RH16, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(MaxH16, RH16), 2), AmtVal)); 232 | 233 | Blue8 = _mm_packus_epi16(BL16, BH16); 234 | Green8 = _mm_packus_epi16(GL16, GH16); 235 | Red8 = _mm_packus_epi16(RL16, RH16); 236 | 237 | Dest1 = _mm_shuffle_epi8(Blue8, _mm_setr_epi8(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5)); 238 | Dest1 = _mm_or_si128(Dest1, _mm_shuffle_epi8(Green8, _mm_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1))); 239 | Dest1 = _mm_or_si128(Dest1, _mm_shuffle_epi8(Red8, _mm_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1))); 240 | 241 | Dest2 = _mm_shuffle_epi8(Blue8, _mm_setr_epi8(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1)); 242 | Dest2 = _mm_or_si128(Dest2, _mm_shuffle_epi8(Green8, _mm_setr_epi8(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10))); 243 | Dest2 = 
_mm_or_si128(Dest2, _mm_shuffle_epi8(Red8, _mm_setr_epi8(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1))); 244 | 245 | Dest3 = _mm_shuffle_epi8(Blue8, _mm_setr_epi8(-1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1)); 246 | Dest3 = _mm_or_si128(Dest3, _mm_shuffle_epi8(Green8, _mm_setr_epi8(-1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1))); 247 | Dest3 = _mm_or_si128(Dest3, _mm_shuffle_epi8(Red8, _mm_setr_epi8(10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15))); 248 | 249 | _mm_storeu_si128((__m128i *)(LinePD + 0), Dest1); 250 | _mm_storeu_si128((__m128i *)(LinePD + 16), Dest2); 251 | _mm_storeu_si128((__m128i *)(LinePD + 32), Dest3); 252 | } 253 | for (; X < Width; X++) { 254 | int Blue, Green, Red, Max; 255 | Blue = LinePS[0], Green = LinePS[1], Red = LinePS[2]; 256 | int Avg = (Blue + Green + Green + Red) >> 2; 257 | if (Blue > Green) 258 | Max = Blue; 259 | else 260 | Max = Green; 261 | if (Red > Max) 262 | Max = Red; 263 | int AmtVal = (Max - Avg) * VibranceAdjustment; 264 | if (Blue != Max) Blue += (((Max - Blue) * AmtVal) >> 14); 265 | if (Green != Max) Green += (((Max - Green) * AmtVal) >> 14); 266 | if (Red != Max) Red += (((Max - Red) * AmtVal) >> 14); 267 | if (Red < 0) Red = 0; 268 | else if (Red > 255) Red = 255; 269 | if (Green < 0) Green = 0; 270 | else if (Green > 255) Green = 255; 271 | if (Blue < 0) Blue = 0; 272 | else if (Blue > 255) Blue = 255; 273 | LinePD[0] = Blue; 274 | LinePD[1] = Green; 275 | LinePD[2] = Red; 276 | LinePS += 3; 277 | LinePD += 3; 278 | } 279 | } 280 | } 281 | 282 | int main() { 283 | Mat src = imread("F:\\car.jpg"); 284 | int Height = src.rows; 285 | int Width = src.cols; 286 | unsigned char *Src = src.data; 287 | unsigned char *Dest = new unsigned char[Height * Width * 3]; 288 | int Stride = Width * 3; 289 | int Radius = 11; 290 | int Adjustment = 50; 291 | int64 st = cvGetTickCount(); 292 | for (int i = 0; i <100; i++) { 293 | VibranceAlgorithm_SSE(Src, Dest, Width, 
// Function 1: SSE implementation of the logarithm, high-precision version.
// Computes the natural logarithm of four packed floats: the result is
// assembled as poly(mantissa) + e * (q1 + q2), and q1 + q2 equals ln(2).
// Lanes whose input is <= 0 are returned with every bit set (a NaN pattern).
inline __m128 _mm_log_ps(__m128 x)
{
    // 16-byte aligned constant tables; each one is reinterpreted as a vector below.
    static const __declspec(align(16)) int _ps_min_norm_pos[4] = { 0x00800000, 0x00800000, 0x00800000, 0x00800000 };      // smallest positive normalized float
    static const __declspec(align(16)) int _ps_inv_mant_mask[4] = { ~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000 };  // clears the exponent bits
    static const __declspec(align(16)) int _pi32_0x7f[4] = { 0x7f, 0x7f, 0x7f, 0x7f };                                     // exponent bias (127)
    static const __declspec(align(16)) float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
    static const __declspec(align(16)) float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
    static const __declspec(align(16)) float _ps_sqrthf[4] = { 0.707106781186547524f, 0.707106781186547524f, 0.707106781186547524f, 0.707106781186547524f };  // sqrt(0.5)
    // Polynomial coefficients p0..p8, evaluated with Horner's scheme below.
    static const __declspec(align(16)) float _ps_log_p0[4] = { 7.0376836292E-2f, 7.0376836292E-2f, 7.0376836292E-2f, 7.0376836292E-2f };
    static const __declspec(align(16)) float _ps_log_p1[4] = { -1.1514610310E-1f, -1.1514610310E-1f, -1.1514610310E-1f, -1.1514610310E-1f };
    static const __declspec(align(16)) float _ps_log_p2[4] = { 1.1676998740E-1f, 1.1676998740E-1f, 1.1676998740E-1f, 1.1676998740E-1f };
    static const __declspec(align(16)) float _ps_log_p3[4] = { -1.2420140846E-1f, -1.2420140846E-1f, -1.2420140846E-1f, -1.2420140846E-1f };
    static const __declspec(align(16)) float _ps_log_p4[4] = { 1.4249322787E-1f, 1.4249322787E-1f, 1.4249322787E-1f, 1.4249322787E-1f };
    static const __declspec(align(16)) float _ps_log_p5[4] = { -1.6668057665E-1f, -1.6668057665E-1f, -1.6668057665E-1f, -1.6668057665E-1f };
    static const __declspec(align(16)) float _ps_log_p6[4] = { 2.0000714765E-1f, 2.0000714765E-1f, 2.0000714765E-1f, 2.0000714765E-1f };
    static const __declspec(align(16)) float _ps_log_p7[4] = { -2.4999993993E-1f, -2.4999993993E-1f, -2.4999993993E-1f, -2.4999993993E-1f };
    static const __declspec(align(16)) float _ps_log_p8[4] = { 3.3333331174E-1f, 3.3333331174E-1f, 3.3333331174E-1f, 3.3333331174E-1f };
    // ln(2) split into a small correction (q1) and a coarse part (q2).
    static const __declspec(align(16)) float _ps_log_q1[4] = { -2.12194440e-4f, -2.12194440e-4f, -2.12194440e-4f, -2.12194440e-4f };
    static const __declspec(align(16)) float _ps_log_q2[4] = { 0.693359375f, 0.693359375f, 0.693359375f, 0.693359375f };

    __m128 one = *(__m128*)_ps_1;
    // Remember which lanes are <= 0 before x is modified.
    __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
    /* cut off denormalized stuff */
    x = _mm_max_ps(x, *(__m128*)_ps_min_norm_pos);
    // Biased exponent of every lane.
    __m128i emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);

    /* keep only the fractional part */
    x = _mm_and_ps(x, *(__m128*)_ps_inv_mant_mask);
    // OR in the bit pattern of 0.5 as the new exponent: x becomes the mantissa in [0.5, 1).
    x = _mm_or_ps(x, _mm_set1_ps(0.5f));

    // Remove the bias; the + 1 compensates for the [0.5, 1) mantissa range.
    emm0 = _mm_sub_epi32(emm0, *(__m128i *)_pi32_0x7f);
    __m128 e = _mm_cvtepi32_ps(emm0);
    e = _mm_add_ps(e, one);

    // Lanes with x < sqrt(0.5): e -= 1 and x = 2x - 1; all other lanes: x = x - 1.
    __m128 mask = _mm_cmplt_ps(x, *(__m128*)_ps_sqrthf);
    __m128 tmp = _mm_and_ps(x, mask);
    x = _mm_sub_ps(x, one);
    e = _mm_sub_ps(e, _mm_and_ps(one, mask));
    x = _mm_add_ps(x, tmp);

    __m128 z = _mm_mul_ps(x, x);
    // Horner evaluation: y = ((((p0 * x + p1) * x + p2) ... ) * x + p8) * x
    __m128 y = *(__m128*)_ps_log_p0;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(__m128*)_ps_log_p1);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(__m128*)_ps_log_p2);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(__m128*)_ps_log_p3);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(__m128*)_ps_log_p4);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(__m128*)_ps_log_p5);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(__m128*)_ps_log_p6);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(__m128*)_ps_log_p7);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, *(__m128*)_ps_log_p8);
    y = _mm_mul_ps(y, x);

    // result = x + (y * z + e * q1 - 0.5 * z) + e * q2
    y = _mm_mul_ps(y, z);
    tmp = _mm_mul_ps(e, *(__m128*)_ps_log_q1);
    y = _mm_add_ps(y, tmp);
    tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
    y = _mm_sub_ps(y, tmp);
    tmp = _mm_mul_ps(e, *(__m128*)_ps_log_q2);
    x = _mm_add_ps(x, y);
    x = _mm_add_ps(x, tmp);
    x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN

    return x;
}
// Function 2: low-precision natural logarithm (about two decimal places).
// Source of the algorithm: https://stackoverflow.com/questions/9411823/fast-log2float-x-implementation-c
//
// The exponent field supplies the integer part of log2; the mantissa, rebased
// to [1, 2), goes through a quadratic; the sum is scaled by ln(2).
inline float IM_Flog(float val)
{
    union
    {
        float AsFloat;
        int AsBits;
    } Pun = { val };
    const int BiasedExp = (Pun.AsBits >> 23) & 255;
    float Log2 = (float)(BiasedExp - 128);
    Pun.AsBits &= ~(255 << 23);     // drop the exponent field
    Pun.AsBits += (127 << 23);      // exponent of 1.0 -> mantissa in [1, 2)
    const float Mant = Pun.AsFloat;
    Log2 += ((-0.34484843f) * Mant + 2.02466578f) * Mant - 0.67487759f;
    return Log2 * 0.69314718f;
}

// Function 3: SSE version of function 2, four floats at a time.
inline __m128 _mm_flog_ps(__m128 x)
{
    const __m128i Bits = _mm_castps_si128(x);
    const __m128i BiasedExp = _mm_and_si128(_mm_srli_epi32(Bits, 23), _mm_set1_epi32(255));
    const __m128 Log2Int = _mm_cvtepi32_ps(_mm_sub_epi32(BiasedExp, _mm_set1_epi32(128)));
    __m128i MantBits = _mm_and_si128(Bits, _mm_set1_epi32(-2139095041));    // ~(255 << 23): drop the exponent field
    MantBits = _mm_add_epi32(MantBits, _mm_set1_epi32(1065353216));         // 127 << 23: exponent of 1.0
    const __m128 Mant = _mm_castsi128_ps(MantBits);
    __m128 Poly = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(-0.34484843f), Mant), _mm_set1_ps(2.02466578f));
    Poly = _mm_sub_ps(_mm_mul_ps(Poly, Mant), _mm_set1_ps(0.67487759f));
    return _mm_mul_ps(_mm_add_ps(Log2Int, Poly), _mm_set1_ps(0.69314718f));
}

// Function 4: approximate e^x.
// A scaled and offset copy of Y is written into the second 32-bit word of a
// double (its high word on little-endian targets) and the first word is
// cleared; the resulting double is returned as a float.
inline float IM_Fexp(float Y)
{
    union
    {
        double AsDouble;
        int Words[2];
    } Pun;
    Pun.Words[0] = 0;
    Pun.Words[1] = (int)(Y * 1512775 + 1072632447 + 0.5F);
    return (float)Pun.AsDouble;
}
// Function 5: SSE version of the approximate e^x (function 4), four floats at
// a time. Each scaled 32-bit integer becomes the high word of a double whose
// low word is zero; the four doubles are then converted back to floats.
inline __m128 _mm_fexp_ps(__m128 Y)
{
    __m128i T = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(Y, _mm_set1_ps(1512775)), _mm_set1_ps(1072632447)));
    __m128i TL = _mm_unpacklo_epi32(_mm_setzero_si128(), T);    // (0, T0, 0, T1) -> two doubles
    __m128i TH = _mm_unpackhi_epi32(_mm_setzero_si128(), T);    // (0, T2, 0, T3) -> two doubles
    return _mm_movelh_ps(_mm_cvtpd_ps(_mm_castsi128_pd(TL)), _mm_cvtpd_ps(_mm_castsi128_pd(TH)));
}

// Function 6: approximate pow(a, b).
//
// The second 32-bit word of the double representation of `a` (its high word
// on little-endian targets) is rescaled by `b` around the constant 1072632447
// and written back; the first word is cleared.
//
// The union is initialised from `a` before that word is read: previously `a`
// was never used and the result was computed from an indeterminate value.
inline float IM_Fpow(float a, float b)
{
    union
    {
        double Value;
        int X[2];
    } V;
    V.Value = a;
    V.X[1] = (int)(b * (V.X[1] - 1072632447) + 1072632447);
    V.X[0] = 0;
    return (float)V.Value;
}

// Function 7: higher-precision reciprocal. _mm_rcp_ps only delivers about
// 12 bits; one Newton-Raphson step, x1 = x0 * (2 - d * x0) = 2 * x0 - d * x0 * x0,
// roughly doubles that.
__m128 _mm_prcp_ps(__m128 a) {
    __m128 rcp = _mm_rcp_ps(a);     // ~12-bit estimate of 1 / a
    return _mm_sub_ps(_mm_add_ps(rcp, rcp), _mm_mul_ps(a, _mm_mul_ps(rcp, rcp)));
}

// Function 8: fast approximate a / b, computed as a * rcp(b) with the raw
// (about 12-bit) reciprocal estimate.
__m128 _mm_fdiv_ps(__m128 a, __m128 b)
{
    return _mm_mul_ps(a, _mm_rcp_ps(b));
}

// Function 9: division that yields 0 where the divisor is 0.
// SSE has no integer division, so integer division is usually done through the
// float instructions. SSE does not raise an exception on division by zero, but
// most callers want a plain 0 in that case, which is what this returns: a / b
// per lane, and +0.0f in every lane where b compares equal to zero.
//
// The zeroing is done with _mm_andnot_ps on the compare mask (all ones or all
// zeros per lane), which gives the same bits as a blend and only needs SSE.
inline __m128 _mm_divz_ps(__m128 a, __m128 b)
{
    __m128 Mask = _mm_cmpeq_ps(b, _mm_setzero_ps());
    return _mm_andnot_ps(Mask, _mm_div_ps(a, b));
}

// Function 10: converts four 32-bit integers to bytes and stores them.
// The values are saturated to [0, 255] by the two pack steps (signed 32 -> 16,
// then unsigned 16 -> 8) and written to Dest as one 4-byte store.
inline void _mm_storesi128_4char(unsigned char *Dest, __m128i P)
{
    __m128i T = _mm_packs_epi32(P, P);
    *((int *)Dest) = _mm_cvtsi128_si32(_mm_packus_epi16(T, T));
}
// Function 11: loads 12 bytes into an XMM register.
// A 24-bit pixel takes 3 bytes, so a 16-byte load covers 5 1/3 pixels, which
// is awkward to work with. Code usually wants four pixels (12 bytes), later
// expanded to 16 bytes, and this helper loads exactly those 12 bytes from p.
// The top 32 bits of the result are zero.
inline __m128i _mm_loadu_epi96(const __m128i * p)
{
    const __m128i LowQuad = _mm_loadl_epi64(p);                     // bytes 0..7
    const __m128i ThirdInt = _mm_cvtsi32_si128(((int *)p)[2]);      // bytes 8..11
    return _mm_unpacklo_epi64(LowQuad, ThirdInt);
}

// Function 12: stores the low 12 bytes of register Q to P; the top 4 bytes of
// Q are not written.
inline void _mm_storeu_epi96(__m128i *P, __m128i Q)
{
    const int ThirdInt = _mm_cvtsi128_si32(_mm_srli_si128(Q, 8));   // bytes 8..11 of Q
    _mm_storel_epi64(P, Q);                                         // bytes 0..7
    ((int *)P)[2] = ThirdInt;
}

// Function 13: integer division by 255 without a divide instruction,
// ((V >> 8) + V + 1) >> 8. For example 254 -> 0, 255 -> 1, 510 -> 2,
// 65025 -> 255, i.e. the quotient is truncated, not rounded to nearest.
// NOTE(review): the original comment only says V "seems" to be allowed to be
// negative; verify before relying on that.
inline int IM_Div255(int V)
{
    const int Biased = (V >> 8) + V + 1;
    return Biased >> 8;
}

// Function 14: SSE version of function 13 for eight unsigned 16-bit lanes:
// x = ((x + 1) + (x >> 8)) >> 8, using saturating unsigned additions.
inline __m128i _mm_div255_epu16(__m128i x)
{
    const __m128i PlusOne = _mm_adds_epu16(x, _mm_set1_epi16(1));
    const __m128i HighByte = _mm_srli_epi16(x, 8);
    return _mm_srli_epi16(_mm_adds_epu16(PlusOne, HighByte), 8);
}

// Function 15: horizontal sum of the eight signed 16-bit lanes of V, returned
// as a 32-bit integer. Partial results are often accumulated lane-wise in a
// register and only summed at the end; doing that with SSE instructions avoids
// accessing the individual elements of the __m128i.
// https://stackoverflow.com/questions/31382209/computing-the-inner-product-of-vectors-with-allowed-scalar-values-0-1-and-2-usi/31382878#31382878
inline int _mm_hsum_epi16(__m128i V)                        // V7 V6 V5 V4 V3 V2 V1 V0
{
    // Alternative from the original author, who was unsure of its behaviour
    // when the sum exceeds 32768:
    // V = _mm_unpacklo_epi16(_mm_hadd_epi16(V, _mm_setzero_si128()), _mm_setzero_si128());
    __m128i Acc = _mm_madd_epi16(V, _mm_set1_epi16(1));     // V7+V6  V5+V4  V3+V2  V1+V0 (32-bit lanes)
    Acc = _mm_add_epi32(Acc, _mm_srli_si128(Acc, 8));       // lane 0: V3+V2+V1+V0, lane 1: V7+V6+V5+V4... folded by halves
    Acc = _mm_add_epi32(Acc, _mm_srli_si128(Acc, 4));       // lane 0 now holds the full sum
    return _mm_cvtsi128_si32(Acc);                          // extract the low lane
}
// Function 16: minimum of the 16 unsigned bytes in a register.
// When scanning a byte sequence for its minimum, a running per-lane minimum is
// kept in one XMM register; the overall minimum is then one of its 16 bytes,
// and this helper extracts it. Works for byte data only.
//
// Both halves are widened to 16 bit so that _mm_minpos_epu16 can reduce each
// half; the smaller of the two results is returned.
inline int _mm_hmin_epu8(__m128i a)
{
    const __m128i Zero = _mm_setzero_si128();
    const __m128i LoMin = _mm_minpos_epu16(_mm_unpacklo_epi8(a, Zero));
    const __m128i HiMin = _mm_minpos_epu16(_mm_unpackhi_epi8(a, Zero));
    return _mm_extract_epi16(_mm_min_epu16(LoMin, HiMin), 0);
}

// Function 17: maximum of the 16 unsigned bytes in a register (byte data
// only). Computed as 255 - min(255 - a), so the minimum reduction is reused.
inline int _mm_hmax_epu8(__m128i a)
{
    const __m128i Inverted = _mm_subs_epu8(_mm_set1_epi8(255), a);
    return 255 - _mm_hmin_epu8(Inverted);
}

// Empty entry point: this file is only a collection of helper functions.
int main() {

}