├── .gitignore
├── data
    ├── zgr.jpg
    ├── zgr2.jpg
    └── zgrbig.jpg
├── pic
    ├── siftysifty.jpg
    ├── siftymatchsifty2.jpg
    └── siftysiftymatchopencv.jpg
├── include
    ├── boxfilter.h
    ├── filter.h
    ├── iirfilter.h
    ├── gaussfiler.h
    ├── linearfilter.h
    ├── siftysiftytest.h
    ├── structs.h
    ├── utils.h
    ├── siftysifty.h
    └── imageutils.h
├── main.cpp
├── CMakeLists.txt
├── README.md
├── LICENSE
└── src
    ├── gaussfiler.cpp
    ├── boxfilter.cpp
    ├── siftysiftytest.cpp
    ├── iirfilter.cpp
    ├── siftysifty.cpp
    └── linearfilter.cpp


/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea
2 | /cmake-build-debug
3 | 


--------------------------------------------------------------------------------
/data/zgr.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazingyyc/SiftySifty/HEAD/data/zgr.jpg


--------------------------------------------------------------------------------
/data/zgr2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazingyyc/SiftySifty/HEAD/data/zgr2.jpg


--------------------------------------------------------------------------------
/data/zgrbig.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazingyyc/SiftySifty/HEAD/data/zgrbig.jpg


--------------------------------------------------------------------------------
/pic/siftysifty.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazingyyc/SiftySifty/HEAD/pic/siftysifty.jpg


--------------------------------------------------------------------------------
/pic/siftymatchsifty2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazingyyc/SiftySifty/HEAD/pic/siftymatchsifty2.jpg


--------------------------------------------------------------------------------
/pic/siftysiftymatchopencv.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazingyyc/SiftySifty/HEAD/pic/siftysiftymatchopencv.jpg


--------------------------------------------------------------------------------
/include/boxfilter.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Created by yanyuanchi on 2017/3/21.
 3 |  */
 4 | #ifndef SIFTYSIFTY_BOXFILTER_H
 5 | #define SIFTYSIFTY_BOXFILTER_H
 6 | 
 7 | namespace SiftySifty {
 8 | /**
 9 |  * box blur
10 |  */
11 | void boxFilter(int16_t *src, int16_t *dst, int width, int height, int radius);
12 | 
13 | }
14 | 
15 | #endif //SIFTYSIFTY_BOXFILTER_H
16 | 


--------------------------------------------------------------------------------
/include/filter.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Created by yanyuanchi on 2017/3/23.
 3 |  */
 4 | #ifndef SIFTYSIFTY_FILTER_H
 5 | #define SIFTYSIFTY_FILTER_H
 6 | 
 7 | namespace SiftySifty {
 8 | 
 9 | static const int FILTER_SHIFT = 16;
10 | static const int FILTER_SCALE = (1 << FILTER_SHIFT);
11 | static const int FILTER_DELTA = (1 << (FILTER_SHIFT - 1));
12 | 
13 | }
14 | 
15 | #endif //SIFTYSIFTY_FILTER_H
16 | 


--------------------------------------------------------------------------------
/include/iirfilter.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Created by yanyuanchi on 2017/3/23.
 3 |  */
 4 | 
 5 | #ifndef SIFTYSIFTY_IIRFILTER_H
 6 | #define SIFTYSIFTY_IIRFILTER_H
 7 | 
 8 | namespace SiftySifty {
 9 | 
10 | void IIRFilter(int16_t *src, int16_t *dst, int width, int height, float sigma);
11 | 
12 | void IIRFilter(uint8_t *src, uint8_t *dst, int width, int height, float sigma);
13 | 
14 | }
15 | 
16 | #endif //SIFTYSIFTY_IIRFILTER_H
17 | 


--------------------------------------------------------------------------------
/include/gaussfiler.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Created by yanyuanchi on 2017/3/21.
 3 |  */
 4 | #ifndef SIFTYSIFTY_GAUSSFILER_H
 5 | #define SIFTYSIFTY_GAUSSFILER_H
 6 | 
 7 | namespace SiftySifty {
 8 | 
 9 | /**
 10 |  * approximate a gauss filter with 3 successive box filters
11 |  */
12 | void gaussFilterBy3BoxFilter(short *src, short *dst, int width, int height, float sigma);
13 | 
14 | /**
 15 |  * use an IIR filter to approximate a gauss filter
16 |  */
17 | void gaussFilterByIIRFilter(short *src, short *dst, int width, int height, float sigma);
18 | 
19 | }
20 | 
21 | #endif //SIFTYSIFTY_GAUSSFILER_H
22 | 


--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "siftysiftytest.h"
 4 | 
 5 | using namespace std;
 6 | using namespace SiftySifty;
 7 | 
 8 | #define IMAGE_PATH "../data/zgr.jpg"
 9 | #define IMAGE_PATH2 "../data/zgr2.jpg"
10 | #define IMAGE_PATH_BIG "../data/zgrbig.jpg"
11 | 
12 | int main() {
13 |     if (false) {
14 |         drawKeyPoint(IMAGE_PATH);
15 |     }
16 |     
17 |     if (false) {
18 |         drawKeyPointCmpToOpenCV(IMAGE_PATH);
19 |     }
20 |     
21 |     if (false) {
22 |         matchKeyPoint(IMAGE_PATH, IMAGE_PATH);
23 |     }
24 |     
25 |     if (true) {
26 |         matchKeyPoint(IMAGE_PATH, IMAGE_PATH2);
27 |     }
28 |     
29 |     if (false) {
30 |         matchKeyPointSiftySiftyWithOpenCV(IMAGE_PATH, IMAGE_PATH);
31 |     }
32 |     
33 |     if (false) {
34 |         testSpeedSiftySiftyAndOpenCV(IMAGE_PATH_BIG);
35 |     }
36 |     
37 |     return 0;
38 | }
39 | 
40 | 
41 | 
42 | 
43 | 
44 | 
45 | 
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/include/linearfilter.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Created by yanyuanchi on 2017/1/18.
 3 |  */
 4 | #ifndef SIFT_LINEARFILTER_H
 5 | #define SIFT_LINEARFILTER_H
 6 | 
 7 | namespace SiftySifty {
 8 | /**
 9 |  * filter the rows
10 |  */
11 | void linearFilterHorizon(uint8_t *src,
12 |                          uint8_t *dst,
13 |                          int width,
14 |                          int height,
15 |                          int (*mult)[256],
16 |                          int delta,
17 |                          int shift,
18 |                          int size);
19 | 
20 | /**
21 |  * filter the cols
22 |  */
23 | void linearFilterVertical(uint8_t *src,
24 |                           uint8_t *dst,
25 |                           int width,
26 |                           int height,
27 |                           int (*mult)[256],
28 |                           int delta,
29 |                           int shift,
30 |                           int size);
31 | }
32 | #endif //SIFT_LINEARFILTER_H
33 | 


--------------------------------------------------------------------------------
/include/siftysiftytest.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Created by yanyuanchi on 2017/4/3.
 3 |  */
 4 | #ifndef SIFTYSIFTY_SIFTYTEST_H
 5 | #define SIFTYSIFTY_SIFTYTEST_H
 6 | 
 7 | #include <iostream>
 8 | 
 9 | using namespace std;
10 | 
11 | namespace SiftySifty {
12 | 
13 | /**
14 |  * use the opencv to draw the siftysifty keypoint
15 |  * @param path
16 |  */
17 | void drawKeyPoint(string path);
18 | 
19 | /**
20 |  * draw the keypoint on the same pic by SiftySifty and OpenCV
21 |  * @param path
22 |  */
23 | void drawKeyPointCmpToOpenCV(string path);
24 | 
25 | /**
26 |  * match the keypoints that are extracted by SiftySifty
27 |  * @param path
28 |  */
29 | void matchKeyPoint(string path1, string path2);
30 | 
31 | /**
32 |  * match the siftysifty keypoint with opencv keypoint
33 |  * @param path
34 |  */
35 | void matchKeyPointSiftySiftyWithOpenCV(string path1, string path2);
36 | 
37 | /**
38 |  * test the speed of SiftySifty and OpenCV
39 |  * @param path
40 |  */
41 | void testSpeedSiftySiftyAndOpenCV(string path);
42 | 
43 | }
44 | 
45 | #endif //SIFTYSIFTY_SIFTYTEST_H
46 | 
47 | 
48 | 
49 | 
50 | 
51 | 
52 | 
53 | 
54 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.7)
 2 | project(SiftySifty)
 3 | 
 4 | FIND_PACKAGE(OpenCV REQUIRED)
 5 | 
 6 | # for speed test should add openmp support
 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 8 | # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fopenmp -lstdc++ -O3")
 9 | # set(CMAKE_CXX_COMPILER /usr/local/bin/gcc-7)
10 | 
11 | FIND_PACKAGE(OpenCV REQUIRED)
12 | 
13 | include_directories(include)
14 | 
15 | set(SOURCE_FILES
16 |         main.cpp
17 |         include/utils.h
18 |         include/structs.h
19 |         src/linearfilter.cpp
20 |         include/linearfilter.h
21 |         include/boxfilter.h
22 |         src/boxfilter.cpp
23 |         src/gaussfiler.cpp
24 |         include/gaussfiler.h
25 |         include/siftysifty.h
26 |         src/siftysifty.cpp
27 |         src/iirfilter.cpp
28 |         include/iirfilter.h
29 |         include/filter.h
30 |         src/siftysiftytest.cpp
31 |         include/siftysiftytest.h
32 |         include/imageutils.h)
33 | 
34 | add_executable(SiftySifty ${SOURCE_FILES})
35 | 
36 | target_link_libraries(SiftySifty ${OpenCV_LIBS})
37 | 


--------------------------------------------------------------------------------
/include/structs.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Created by yanyuanchi on 2017/1/16.
 3 |  */
 4 | 
 5 | #ifndef SIFT_STRUCTS_H_H
 6 | #define SIFT_STRUCTS_H_H
 7 | 
 8 | #include <iostream>
 9 | #include <cstdint>
10 | 
11 | namespace SiftySifty {
12 | 
/**
 * One keypoint together with its descriptor.
 * Plain aggregate: all fields are public and there are no constructors.
 */
struct KeyPoint {
    /**x coordinate of the keypoint*/
    float x;
    /**y coordinate of the keypoint*/
    float y;

    /**x coordinate in the pyramid*/
    int octaveX;
    /**y coordinate in the pyramid*/
    int octaveY;

    /**marked as "nothing" by the author, presumably unused*/
    float score;

    float response;

    int octave;

    int octaveLayer;

    float octaveLayersShift;

    /**the scale of the keypoint*/
    float size;

    /**the scale of the keypoint, in the pyramid*/
    float octaveSize;

    /**the direction of the keypoint*/
    float angle;

    /**the descriptor of the keypoint; a raw pointer, this struct never releases it*/
    float *descriptor;
};
45 | 
/**
 * Minimal image/matrix header: a raw element pointer plus its dimensions.
 * The struct itself neither allocates nor releases data.
 */
template<typename T>
struct Mat {
    /**pointer to the elements*/
    T *data;

    /**number of columns*/
    int width;
    /**number of rows*/
    int height;
};
53 | 
54 | }
55 | #endif //SIFT_STRUCTS_H_H
56 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # SiftySifty
 2 | SiftySifty is an open source library that extracts SIFT keypoints from an image. It is written in pure C++ and doesn't need any other library.
 3 | 
 4 | ## Include
 5 | The library only extracts SIFT keypoints from an image; it doesn't contain matching or display. The SIFT algorithm ref: Lowe, D. Distinctive image features from scale-invariant keypoints. International Journal of Computer Vision, 60, 2 (2004), pp.91--110. Website: http://www.cs.ubc.ca/~lowe/keypoints/
 6 | 
 7 | ## Demo 
 8 | Running the demo requires OpenCV. The demo displays the keypoints on an image and matches them with OpenCV.
 9 | 
10 | ### Display SIFT keypoints by SiftySifty
11 | The keypoints drawn on the image are detected by SiftySifty and displayed by OpenCV.
12 | 
13 | ![](pic/siftysifty.jpg)
14 | <br>
15 | <br>
16 | ### Match the images using SIFT keypoints by SiftySifty
17 | The keypoints drawn on the image are detected by SiftySifty; matching and display are done by OpenCV.
18 | ![](pic/siftymatchsifty2.jpg)
19 | <br>
20 | <br>
21 | ### Match the images using SIFT keypoints by SiftySifty and OpenCV
22 | Match keypoints between SiftySifty and OpenCV: left is SiftySifty, right is OpenCV. Displayed by OpenCV.
23 | ![](pic/siftysiftymatchopencv.jpg)
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 2-Clause License
 2 | 
 3 | Copyright (c) 2017, 惊奇漫画
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 


--------------------------------------------------------------------------------
/src/gaussfiler.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Created by yanyuanchi on 2017/3/21.
 3 |  */
 4 | 
 5 | #include <cmath>
 6 | #include <cstdlib>
 7 | #include <iostream>
 8 | 
 9 | #include "boxfilter.h"
10 | #include "iirfilter.h"
11 | #include "gaussfiler.h"
12 | 
13 | namespace SiftySifty {
14 | 
15 | /**
16 |  * use 3 box filter to fitting gauss filter
17 |  */
18 | void gaussFilterBy3BoxFilter(int16_t *src, int16_t *dst, int width, int height, float sigma) {
19 |     /**
20 |      * get the radius for 3 box filter
21 |      * ref:http://blog.ivank.net/fastest-gaussian-blur.html
22 |      */
23 |     float wIdeal = sqrt(12.0 * sigma * sigma / 3 + 1.0);
24 |     int wl = floor(wIdeal);
25 |     
26 |     if (0 == wl % 2) {
27 |         wl--;
28 |     }
29 |     
30 |     int wu = wl + 2;
31 |     
32 |     float mIdeal = (12.0 * sigma * sigma - 3 * wl * wl - 4 * 3 * wl - 3 * 3) / (-4 * wl - 4);
33 |     int m = round(mIdeal);
34 |     
35 |     int radius[3];
36 |     for (int i = 0; i < 3; ++i) {
37 |         radius[i] = (i < m ? wl : wu) / 2;
38 |     }
39 |     
40 |     short *tmp = (short *) malloc(sizeof(short) * width * height);
41 |     
42 |     boxFilter(src, dst, width, height, radius[0]);
43 |     boxFilter(dst, tmp, width, height, radius[1]);
44 |     boxFilter(tmp, dst, width, height, radius[2]);
45 |     
46 |     free(tmp);
47 | }
48 | 
49 | /**
50 |  * use the IIR method to fit the gauss filter
51 |  * @param src
52 |  * @param dst
53 |  * @param width
54 |  * @param height
55 |  * @param sigma
56 |  */
57 | void gaussFilterByIIRFilter(short *src, short *dst, int width, int height, float sigma) {
58 |     IIRFilter(src, dst, width, height, sigma);
59 | }
60 | 
61 | }


--------------------------------------------------------------------------------
/include/utils.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Created by yanyuanchi on 2017/1/16.
  3 |  */
  4 | 
  5 | #ifndef SIFT_UTILS_H
  6 | #define SIFT_UTILS_H
  7 | 
  8 | #include <stdlib.h>
  9 | #include <math.h>
 10 | #include <zconf.h>
 11 | #include <sys/time.h>
 12 | #include <string.h>
 13 | 
 14 | #include "structs.h"
 15 | 
 16 | #ifdef _OPENMP
 17 | 
 18 | #include <omp.h>
 19 | 
 20 | #endif
 21 | 
 22 | namespace SiftySifty {
 23 | 
 24 | #ifndef max_value
 25 | #define max_value(a, b) ((a) > ((b)) ? (a) : (b))
 26 | #endif
 27 | 
 28 | #ifndef min_value
 29 | #define min_value(a, b) (((a) < (b)) ? (a) : (b))
 30 | #endif
 31 | 
 32 | /**PI*/
 33 | static float PI = 3.1415926f;
 34 | 
 35 | static int HARDWARE_CPU_NUM = -1;
 36 | 
 37 | static int getHardwareCPUNum() {
 38 |     if (0 >= HARDWARE_CPU_NUM) {
 39 |         HARDWARE_CPU_NUM = static_cast<int>(sysconf(_SC_NPROCESSORS_CONF));
 40 |         
 41 |         if (0 >= HARDWARE_CPU_NUM) {
 42 |             HARDWARE_CPU_NUM = 4;
 43 |         }
 44 |     }
 45 |     
 46 |     return HARDWARE_CPU_NUM;
 47 | }
 48 | 
 49 | static long getCurrentTime() {
 50 |     struct timeval tv;
 51 |     gettimeofday(&tv, NULL);
 52 |     return tv.tv_sec * 1000 + tv.tv_usec / 1000;
 53 | }
 54 | 
 55 | /**
 56 |  * get the angle of [0, 360)
 57 |  * @param y
 58 |  * @param x
 59 |  * @return
 60 |  */
 61 | static float atan2f360(float y, float x) {
 62 |     float aX = fabsf(x);
 63 |     float aY = fabsf(y);
 64 |     
 65 |     if (0 == x) {
 66 |         return (y > 0) ? 90 : 270;
 67 |     }
 68 |     
 69 |     if (0 == y) {
 70 |         return (x >= 0) ? 0 : 180;
 71 |     }
 72 |     
 73 |     float angle = atan2f(aY, aX) * 180.f / PI;
 74 |     
 75 |     if (x > 0) {
 76 |         return (y > 0) ? angle : (360 - angle);
 77 |     } else {
 78 |         return (y > 0) ? (180 - angle) : (180 + angle);
 79 |     }
 80 | }
 81 | 
 82 | template<class T>
 83 | Mat<T> *newMat(int width, int height) {
 84 |     if (0 >= width || 0 >= height) {
 85 |         return nullptr;
 86 |     }
 87 |     
 88 |     Mat<T> *mat = (Mat<T> *) malloc(sizeof(Mat<T>));
 89 |     mat->width = width;
 90 |     mat->height = height;
 91 |     mat->data = (T *) malloc(sizeof(T) * width * height);
 92 |     
 93 |     return mat;
 94 | }
 95 | 
 96 | template<class T>
 97 | void deleteMat(Mat<T> *mat) {
 98 |     if (nullptr != mat) {
 99 |         if (nullptr != mat->data) {
100 |             free(mat->data);
101 |         }
102 |         
103 |         free(mat);
104 |     }
105 | }
106 | 
107 | }
108 | #endif //SIFT_UTILS_H
109 | 


--------------------------------------------------------------------------------
/include/siftysifty.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Created by yanyuanchi on 2017/3/22.
  3 |  */
  4 | #ifndef SIFTYSIFTY_SIFTYSIFTY_H
  5 | #define SIFTYSIFTY_SIFTYSIFTY_H
  6 | 
  7 | #include <iostream>
  8 | #include <vector>
  9 | 
 10 | #include "utils.h"
 11 | #include "structs.h"
 12 | #include "gaussfiler.h"
 13 | #include "siftysifty.h"
 14 | 
 15 | using namespace std;
 16 | 
 17 | namespace SiftySifty {
 18 | /**the gray image will be scale by the SIFT_IMAGE_SCALE*/
 19 | static const int SIFT_IMAGE_SCALE_SHIFT = 6;
 20 | static const int SIFT_IMAGE_SCALE = (1 << SIFT_IMAGE_SCALE_SHIFT);
 21 | 
 22 | /**the default number of images in each pyramid layer on which keypoints are computed*/
 23 | /**the pic's number on one layer*/
 24 | static const int SIFT_OCTAVE_LAYERS = 3;
 25 | 
 26 | /**the sigma of sift*/
 27 | static const float SIFT_SIGMA = 1.6f;
 28 | 
 29 | /**the base pic's sigma*/
 30 | static const float SIFT_INIT_SIGMA = 0.5f;
 31 | 
 32 | /**sift's contrast threshold*/
 33 | static const float SIFT_CONTRAST_THRESHOLD = 0.04f;
 34 | 
 35 | /**the edge threshold*/
 36 | static const float SIFT_EDGE_THESHOLD = 10;
 37 | 
 38 | /**if the inited pic will be doubled*/
 39 | static const bool SIFT_DOUBLE_INITED_IMAGE = true;
 40 | 
 41 | /**the region width of for descriptor*/
 42 | static const int SIFT_DESCRIPTOR_WIDTH = 4;
 43 | 
 44 | /**the number of image region*/
 45 | static const int SIFT_DESCRIPTOR_HIST_BIN = 8;
 46 | 
 47 | static const float SIFT_ORIENTATION_PEAK_RATIO = 0.8f;
 48 | 
 49 | /**360 splited to 36*/
 50 | static const int SIFT_ORIENTATION_HIST_BINS = 36;
 51 | 
 52 | /**adjust 5 time*/
 53 | static const int SIFT_MAX_ADJUST_STEP = 5;
 54 | 
 55 | /**the border of the image*/
 56 | static const int SIFT_IMAGE_BORDER = 5;
 57 | 
 58 | static const float SIFT_ORIENTATION_SIGMA_FCTER = 1.5f;
 59 | 
 60 | /**the radius of hist SIFT_ORIENTATION_RADIUS * sigma*/
 61 | static const float SIFT_ORIENTATION_RADIUS = (3 * SIFT_ORIENTATION_SIGMA_FCTER);
 62 | 
 63 | /**SIFT_DESCRPTOR_SCAE_FCTER * sigma*/
 64 | static const float SIFT_DESCRIPTOR_SCAE_FCTER = 3.0f;
 65 | 
 66 | static const float SIFT_DESCRIPTOR_MAGNITUDE_THRESHOLD = 0.2f;
 67 | 
 68 | static const float SIFT_DESCRIPTOR_FCTOR = 512.0f;
 69 | 
 70 | void sift(uint8_t *image,
 71 |           int width, int height,
 72 |           vector<SiftySifty::KeyPoint> &keyPoints,
 73 |           int octaveLayers,
 74 |           float sigma,
 75 |           float contrastThreshold,
 76 |           int edgeThreshold,
 77 |           bool doubleInitImage,
 78 |           int descriptorWidth,
 79 |           int descriptorHistBin);
 80 | 
 81 | void sift(uint8_t *image, int width, int height, vector<SiftySifty::KeyPoint> &keyPoints);
 82 | 
 83 | void initSpeed(uint8_t *src, int width, int height);
 84 | 
 85 | }
 86 | 
 87 | #endif //SIFTYSIFTY_SIFTYSIFTY_H
 88 | 
 89 | 
 90 | 
 91 | 
 92 | 
 93 | 
 94 | 
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 


--------------------------------------------------------------------------------
/src/boxfilter.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Created by yanyuanchi on 2017/3/21.
  3 |  */
  4 | #include <math.h>
  5 | 
  6 | #include "utils.h"
  7 | #include "filter.h"
  8 | #include "boxfilter.h"
  9 | 
 10 | namespace SiftySifty {
 11 | 
 12 | /**
 13 |  * boxFilter in row
 14 |  * dst[i] = sum(src[i - radius, i + radius]) / (2 * radius + 1)
 15 |  * scale = (int) (1.0f / (2 * radius + 1) * (1 << shift))
 16 |  * delta = (1 << (shift - 1))
 17 |  *
 18 |  * dst[i] = (sum(src[i - radius, i + radius]) * scale + delta) >> shift
 19 |  *
 20 |  * the src have border with radius's cols in left and radius's cols in right
 21 |  *
 22 |  * T can be uint8_t/int8_t/uint16_t/int16_t
 23 |  */
 24 | template<class T>
 25 | void boxFilterRow(T *src, T *dst, int width, int height, int radius, int scale, int delta, int shift) {
 26 |     int radius2 = radius + radius;
 27 |     int size = radius2 + 1;
 28 |     
 29 |     int maxThreadNum = getHardwareCPUNum();
 30 |     int threadIndex = 0;
 31 |     
 32 |     int stride = max_value((int) (roundf(1.f * height / maxThreadNum)), 1);
 33 | 
 34 | #pragma omp parallel for private(threadIndex)
 35 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 36 |         int start = threadIndex * stride;
 37 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : min_value(start + stride, height);
 38 |         
 39 |         T *srcData = src + start * (width + radius2);
 40 |         T *dstData = dst + start * width;
 41 |         
 42 |         int64_t sum;
 43 |         
 44 |         for (int y = start; y < end; ++y) {
 45 |             sum = 0;
 46 |             
 47 |             for (int x = 0; x < size; ++x) {
 48 |                 sum += srcData[x];
 49 |             }
 50 |             
 51 |             dstData[0] = (T) ((sum * scale + delta) >> shift);
 52 |             
 53 |             for (int x = 1; x < width; ++x) {
 54 |                 sum += srcData[x + radius2] - srcData[x - 1];
 55 |                 
 56 |                 dstData[x] = (T) ((sum * scale + delta) >> shift);
 57 |             }
 58 |             
 59 |             srcData += (width + radius2);
 60 |             dstData += width;
 61 |         }
 62 |     }
 63 | }
 64 | 
 65 | /**
 66 |  * the src with radius rows int top and bottom
 67 |  */
 68 | template<class T>
 69 | void boxFilterCol(T *src, T *dst, int width, int height, int radius, int scale, int delta, int shift) {
 70 |     int radius2 = radius + radius;
 71 |     int size = radius2 + 1;
 72 |     
 73 |     int maxThreadNum = getHardwareCPUNum();
 74 |     int threadIndex = 0;
 75 |     
 76 |     /**split the width's cols to maxThreadNum's thread*/
 77 |     int stride = max_value((int) (roundf(1.f * width / maxThreadNum)), 1);
 78 | 
 79 | #pragma omp parallel for private(threadIndex)
 80 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 81 |         int start = threadIndex * stride;
 82 |         int end = (threadIndex == (maxThreadNum - 1)) ? width : min_value(width, start + stride);
 83 |         
 84 |         int range = end - start;
 85 |         
 86 |         int interval = radius2 * width;
 87 |         
 88 |         int64_t *sum = (int64_t *) malloc(sizeof(int64_t) * range);
 89 |         memset(sum, 0, sizeof(int64_t) * range);
 90 |         
 91 |         T *srcData = src + start;
 92 |         T *dstData = dst + start;
 93 |         
 94 |         for (int y = 0; y < radius2; ++y) {
 95 |             for (int x = 0; x < range; ++x) {
 96 |                 sum[x] += srcData[x];
 97 |             }
 98 |             
 99 |             srcData += width;
100 |         }
101 |         
102 |         for (int y = 0; y < height; ++y) {
103 |             for (int x = 0; x < range; ++x) {
104 |                 sum[x] += srcData[x];
105 |                 
106 |                 dstData[x] = (T) ((sum[x] * scale + delta) >> shift);
107 |                 
108 |                 sum[x] -= srcData[x - interval];
109 |             }
110 |             
111 |             srcData += width;
112 |             dstData += width;
113 |         }
114 |         
115 |         free(sum);
116 |     }
117 | }
118 | 
119 | /**
120 |  * box blur
121 |  * first blur int row
122 |  * than blur int cols
123 |  */
124 | template<class T>
125 | void boxFilter(T *src, T *dst, int width, int height, int radius) {
126 |     if (0 >= radius) {
127 |         memcpy(dst, src, sizeof(T) * width * height);
128 |         
129 |         return;
130 |     }
131 |     
132 |     int radius2 = radius + radius;
133 |     int size = radius2 + 1;
134 |     
135 |     int scale = (int) (1.0 / size * FILTER_SCALE);
136 |     
137 |     T *srcTemp = (T *) malloc(sizeof(T) * (width + radius2) * height);
138 |     
139 |     T *srcData = src;
140 |     T *srcTempData = srcTemp;
141 |     
142 |     /**copy memory to srcMediate*/
143 |     for (int y = 0; y < height; ++y) {
144 |         std::fill(srcTempData, srcTempData + radius, srcData[0]);
145 |         
146 |         memcpy(srcTempData + radius, srcData, sizeof(T) * width);
147 |         
148 |         std::fill(srcTempData + radius + width,
149 |                   srcTempData + radius2 + width,
150 |                   srcData[width - 1]);
151 |         
152 |         srcData += width;
153 |         srcTempData += (width + radius2);
154 |     }
155 |     
156 |     T *dstTemp = (T *) malloc(sizeof(T) * width * (height + radius2));
157 |     
158 |     /**blur in row*/
159 |     boxFilterRow<T>(srcTemp, dstTemp + (radius * width),
160 |                     width, height, radius,
161 |                     scale, FILTER_DELTA, FILTER_SHIFT);
162 |     
163 |     for (int y = 0; y < radius; ++y) {
164 |         memcpy(dstTemp + y * width, dstTemp + radius * width, sizeof(T) * width);
165 |         memcpy(dstTemp + (radius + height + y) * width, dstTemp + (radius + height - 1) * width, sizeof(T) * width);
166 |     }
167 |     
168 |     boxFilterCol<T>(dstTemp, dst, width, height, radius, scale, FILTER_DELTA, FILTER_SHIFT);
169 |     
170 |     free(srcTemp);
171 |     free(dstTemp);
172 | }
173 | 
174 | /**
175 |  * box blur
176 |  */
177 | void boxFilter(int16_t *src, int16_t *dst, int width, int height, int radius) {
178 |     boxFilter<int16_t>(src, dst, width, height, radius);
179 | }
180 | 
181 | }
182 | 
183 | 
184 | 
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 
194 | 
195 | 
196 | 
197 | 
198 | 
199 | 
200 | 
201 | 
202 | 


--------------------------------------------------------------------------------
/src/siftysiftytest.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Created by yanyuanchi on 2017/4/3.
  3 |  */
  4 | #include <iostream>
  5 | #include <vector>
  6 | 
  7 | #include <opencv2/core/core.hpp>
  8 | #include <opencv2/highgui/highgui.hpp>
  9 | #include <opencv2/xfeatures2d.hpp>
 10 | #include <opencv2/imgproc.hpp>
 11 | 
 12 | #include "utils.h"
 13 | #include "siftysifty.h"
 14 | #include "siftysiftytest.h"
 15 | 
 16 | using namespace std;
 17 | using namespace cv;
 18 | 
 19 | namespace SiftySifty {
 20 | /**
 21 |  * use the opencv to draw the siftysifty keypoint
 22 |  * @param path
 23 |  */
 24 | void drawKeyPoint(string path)
 25 | {
 26 |     /**read the pic*/
 27 |     cv::Mat originImage = imread(path);
 28 |     cv::Mat grayImage;
 29 |     cvtColor(originImage, grayImage, CV_RGB2GRAY);
 30 | 
 31 |     vector<SiftySifty::KeyPoint> keyPoints;
 32 |     SiftySifty::sift(grayImage.data, grayImage.cols, grayImage.rows, keyPoints);
 33 | 
 34 |     vector<cv::KeyPoint> opencvKeyPoints(keyPoints.size());
 35 |     for (int i = 0; i < keyPoints.size(); ++i) {
 36 |         opencvKeyPoints[i].pt.x = keyPoints[i].x;
 37 |         opencvKeyPoints[i].pt.y = keyPoints[i].y;
 38 |         opencvKeyPoints[i].size = keyPoints[i].size;
 39 |         opencvKeyPoints[i].angle = keyPoints[i].angle;
 40 |     }
 41 | 
 42 |     /**draw it*/
 43 |     cv::Mat output;
 44 |     drawKeypoints(grayImage, opencvKeyPoints, output, Scalar::all(-1), DrawMatchesFlags::DRAW_RICH_KEYPOINTS);
 45 |     imshow("drawKeyPoint", output);
 46 | 
 47 |     cvWaitKey(0);
 48 | }
 49 | 
 50 | /**
 51 |  * draw the keypoint on the same pic by SiftySifty and OpenCV
 52 |  * @param path
 53 |  */
 54 | void drawKeyPointCmpToOpenCV(string path)
 55 | {
 56 |     /**read the pic*/
 57 |     cv::Mat originImage = imread(path);
 58 |     cv::Mat grayImage;
 59 |     cvtColor(originImage, grayImage, CV_RGB2GRAY);
 60 | 
 61 |     vector<SiftySifty::KeyPoint> keyPoints;
 62 |     SiftySifty::sift(grayImage.data, grayImage.cols, grayImage.rows, keyPoints);
 63 | 
 64 |     vector<cv::KeyPoint> opencvKeyPoints1(keyPoints.size());
 65 |     for (int i = 0; i < keyPoints.size(); ++i)
 66 |     {
 67 |         opencvKeyPoints1[i].pt.x  = keyPoints[i].x;
 68 |         opencvKeyPoints1[i].pt.y  = keyPoints[i].y;
 69 |         opencvKeyPoints1[i].size  = keyPoints[i].size;
 70 |         opencvKeyPoints1[i].angle = keyPoints[i].angle;
 71 |     }
 72 | 
 73 |     cv::Mat output1;
 74 |     drawKeypoints(grayImage, opencvKeyPoints1, output1, Scalar::all(-1), DrawMatchesFlags::DRAW_RICH_KEYPOINTS);
 75 |     imshow("SiftySifty", output1);
 76 | 
 77 |     vector<cv::KeyPoint> opencvKeyPoints2;
 78 |     Ptr<Feature2D> f2d = xfeatures2d::SIFT::create();
 79 |     f2d->detect(grayImage, opencvKeyPoints2);
 80 | 
 81 |     cv::Mat output2;
 82 |     drawKeypoints(grayImage, opencvKeyPoints2, output2, Scalar::all(-1), DrawMatchesFlags::DRAW_RICH_KEYPOINTS);
 83 |     imshow("OpenCV", output2);
 84 | 
 85 |     cvWaitKey(0);
 86 | }
 87 | 
 88 | /**
 89 |  * math the keypoint that extracted by SiftySifty
 90 |  * @param path
 91 |  */
 92 | void matchKeyPoint(string path1, string path2)
 93 | {
 94 |     /**read the pic*/
 95 |     cv::Mat originImage1 = imread(path1);
 96 |     cv::Mat grayImage1;
 97 |     cvtColor(originImage1, grayImage1, CV_RGB2GRAY);
 98 | 
 99 |     vector<SiftySifty::KeyPoint> keyPoints1;
100 |     SiftySifty::sift(grayImage1.data, grayImage1.cols, grayImage1.rows, keyPoints1);
101 | 
102 |     vector<cv::KeyPoint> opencvKeyPoints1(keyPoints1.size());
103 |     cv::Mat ds1(keyPoints1.size(), 128, CV_32F);
104 |     for (int i = 0; i < keyPoints1.size(); ++i)
105 |     {
106 |         opencvKeyPoints1[i].pt.x = keyPoints1[i].x;
107 |         opencvKeyPoints1[i].pt.y = keyPoints1[i].y;
108 |         opencvKeyPoints1[i].size = keyPoints1[i].size;
109 |         opencvKeyPoints1[i].angle = keyPoints1[i].angle;
110 | 
111 |         memcpy((float *) ds1.data + i * 128, keyPoints1[i].descriptor, sizeof(float) * 128);
112 |     }
113 | 
114 |     /**read the pic*/
115 |     cv::Mat originImage2 = imread(path2);
116 |     cv::Mat grayImage2;
117 |     cvtColor(originImage2, grayImage2, CV_RGB2GRAY);
118 | 
119 |     vector<SiftySifty::KeyPoint> keyPoints2;
120 |     SiftySifty::sift(grayImage2.data, grayImage2.cols, grayImage2.rows, keyPoints2);
121 | 
122 |     vector<cv::KeyPoint> opencvKeyPoints2(keyPoints2.size());
123 |     cv::Mat ds2(keyPoints2.size(), 128, CV_32F);
124 |     for (int i = 0; i < keyPoints2.size(); ++i)
125 |     {
126 |         opencvKeyPoints2[i].pt.x = keyPoints2[i].x;
127 |         opencvKeyPoints2[i].pt.y = keyPoints2[i].y;
128 |         opencvKeyPoints2[i].size = keyPoints2[i].size;
129 |         opencvKeyPoints2[i].angle = keyPoints2[i].angle;
130 | 
131 |         memcpy((float *) ds2.data + i * 128, keyPoints2[i].descriptor, sizeof(float) * 128);
132 |     }
133 | 
134 |     BFMatcher matcher;
135 |     vector<DMatch> matches;
136 |     matcher.match(ds1, ds2, matches);
137 | 
138 |     cv::Mat img_matches;
139 |     drawMatches(grayImage1, opencvKeyPoints1, grayImage2, opencvKeyPoints2, matches, img_matches);
140 |     imshow("SiftySifty match SiftySifty", img_matches);
141 | 
142 |     cvWaitKey(0);
143 | }
144 | 
145 | /**
146 |  * match the siftysifty keypoint with opencv keypoint
147 |  * @param path
148 |  */
149 | void matchKeyPointSiftySiftyWithOpenCV(string path1, string path2)
150 | {
151 |     /**read the pic*/
152 |     cv::Mat originImage1 = imread(path1);
153 |     cv::Mat grayImage1;
154 |     cvtColor(originImage1, grayImage1, CV_RGB2GRAY);
155 | 
156 |     vector<SiftySifty::KeyPoint> keyPoints1;
157 |     SiftySifty::sift(grayImage1.data, grayImage1.cols, grayImage1.rows, keyPoints1);
158 | 
159 |     vector<cv::KeyPoint> opencvKeyPoints1(keyPoints1.size());
160 |     cv::Mat ds1(keyPoints1.size(), 128, CV_32F);
161 |     for (int i = 0; i < keyPoints1.size(); ++i)
162 |     {
163 |         opencvKeyPoints1[i].pt.x = keyPoints1[i].x;
164 |         opencvKeyPoints1[i].pt.y = keyPoints1[i].y;
165 |         opencvKeyPoints1[i].size = keyPoints1[i].size;
166 |         opencvKeyPoints1[i].angle = keyPoints1[i].angle;
167 | 
168 |         memcpy((float *) ds1.data + i * 128, keyPoints1[i].descriptor, sizeof(float) * 128);
169 |     }
170 | 
171 |     /**read the pic*/
172 |     cv::Mat originImage2 = imread(path2);
173 |     cv::Mat grayImage2;
174 |     cvtColor(originImage2, grayImage2, CV_RGB2GRAY);
175 |     vector<cv::KeyPoint> opencvKeyPoints2;
176 |     Ptr<Feature2D> f2d = xfeatures2d::SIFT::create();
177 |     cv::Mat ds2;
178 |     f2d->detectAndCompute(grayImage2, noArray(), opencvKeyPoints2, ds2);
179 | 
180 |     BFMatcher matcher;
181 |     vector<DMatch> matches;
182 |     matcher.match(ds1, ds2, matches);
183 | 
184 |     cv::Mat img_matches;
185 |     drawMatches(grayImage1, opencvKeyPoints1, grayImage2, opencvKeyPoints2, matches, img_matches);
186 |     imshow("SiftySifty match OpenCV", img_matches);
187 | 
188 |     cvWaitKey(0);
189 | }
190 | 
191 | /**
192 |  * test the speed of SiftySifty and OpenCV
193 |  * @param path
194 |  */
195 | void testSpeedSiftySiftyAndOpenCV(string path) {
196 |     /**read the pic*/
197 |     cv::Mat originImage = imread(path);
198 |     cv::Mat grayImage;
199 |     cvtColor(originImage, grayImage, CV_RGB2GRAY);
200 | 
201 |     long total = 0;
202 |     for (int i = 0; i < 100; ++i) {
203 |         long t1 = getCurrentTime();
204 | 
205 |         vector<SiftySifty::KeyPoint> keyPoints;
206 |         sift(grayImage.data, grayImage.cols, grayImage.rows, keyPoints);
207 | 
208 |         long t2 = getCurrentTime();
209 | 
210 |         long cur = t2 - t1;
211 |         total += cur;
212 | 
213 |         cout << "SiftySifty, time:" << (i + 1) << ", cost:" << cur << "ms" << endl;
214 |     }
215 | 
216 |     float siftysiftyCost = 1.0 * total / 100;
217 | 
218 |     total = 0;
219 |     for (int i = 0; i < 100; ++i) {
220 |         long t1 = getCurrentTime();
221 | 
222 |         vector<cv::KeyPoint> opencvKeyPoints;
223 |         Ptr<Feature2D> f2d = xfeatures2d::SIFT::create();
224 |         cv::Mat ds;
225 |         f2d->detectAndCompute(grayImage, noArray(), opencvKeyPoints, ds);
226 | 
227 |         long t2 = getCurrentTime();
228 | 
229 |         long cur = t2 - t1;
230 |         total += cur;
231 | 
232 |         cout << "OpenCV, time:" << (i + 1) << ", cost:" << cur << "ms" << endl;
233 |     }
234 | 
235 |     float opencvCost = 1.0 * total / 100;
236 | 
237 |     cout << "SiftySifty cost time(the average of 100 times):" << siftysiftyCost << "ms" << endl;
238 |     cout << "OpenCV cost time(the average of 100 times):" << opencvCost << "ms" << endl;
239 | }
240 | 
241 | }
242 | 
243 | 
244 | 
245 | 
246 | 
247 | 
248 | 
249 | 
250 | 
251 | 
252 | 
253 | 
254 | 
255 | 
256 | 
257 | 
258 | 
259 | 


--------------------------------------------------------------------------------
/src/iirfilter.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Created by yanyuanchi on 2017/3/23.
  3 |  */
  4 | 
  5 | #include "utils.h"
  6 | #include "filter.h"
  7 | #include "iirfilter.h"
  8 | 
  9 | #ifdef _OPENMP
 10 | 
 11 | #include <omp.h>
 12 | 
 13 | #endif
 14 | 
 15 | namespace SiftySifty {
 16 | /**
 17 |  * use iir to filter the src
 18 |  * ref:"Recursive Implementation of the gaussian filter."
 19 |  * w[n] = (B * input[n] + b1 * w[n-1] + b2 * w[n-2] + b3 * w[n-3] + delta) >> shift
 20 |  */
 21 | template<class T>
 22 | void IIRFilterRow(T *src, T *dst,
 23 |                   const int width, const int height,
 24 |                   const int32_t B,
 25 |                   const int32_t b1, const int32_t b2, const int32_t b3,
 26 |                   const int32_t delta, const int32_t shift) {
 27 |     int maxThreadNum = getHardwareCPUNum();
 28 |     int threadIndex = 0;
 29 |     
 30 |     int stride = max_value((int) (roundf(1.f * height / maxThreadNum)), 1);
 31 | 
 32 | #pragma omp parallel for private(threadIndex)
 33 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 34 |         int start = threadIndex * stride;
 35 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : min_value(start + stride, height);
 36 |         
 37 |         T *srcData = src + start * width;
 38 |         T *dstData = dst + start * width;
 39 |         
 40 |         int size;
 41 |         
 42 |         T *w = (T *) malloc(sizeof(T) * (width + 3));
 43 |         T tail;
 44 |         
 45 |         for (int y = start; y < end; ++y) {
 46 |             size = width - 1;
 47 |             
 48 |             w[0] = w[1] = w[2] = srcData[0];
 49 |             
 50 |             for (int x = 0, n = 3; x <= size; ++x, ++n) {
 51 |                 w[n] = (T) ((B * srcData[x] + b1 * w[n - 1] + b2 * w[n - 2] + b3 * w[n - 3] + delta) >> shift);
 52 |             }
 53 |             
 54 |             tail = w[size + 3];
 55 |             
 56 |             dstData[size] = (T) ((B * w[size + 3] + b1 * tail + b2 * tail + b3 * tail + delta) >> shift);
 57 |             size--;
 58 |             dstData[size] = (T) ((B * w[size + 3] + b1 * dstData[size + 1] + b2 * tail + b3 * tail + delta) >> shift);
 59 |             size--;
 60 |             dstData[size] = (T) ((B * w[size + 3] + b1 * dstData[size + 1] + b2 * dstData[size + 2] + b3 * tail + delta)
 61 |                     >> shift);
 62 |             size--;
 63 |             
 64 |             for (int x = size; x >= 0; --x) {
 65 |                 dstData[x] = (T) (
 66 |                         (B * w[x + 3] + b1 * dstData[x + 1] + b2 * dstData[x + 2] + b3 * dstData[x + 3] + delta)
 67 |                                 >> shift);
 68 |             }
 69 |             
 70 |             srcData += width;
 71 |             dstData += width;
 72 |         }
 73 |         
 74 |         free(w);
 75 |     }
 76 | }
 77 | 
 78 | /**
 79 |  * filter on col same as the row
 80 |  * @tparam T
 81 |  * @param src
 82 |  * @param dst
 83 |  * @param width
 84 |  * @param height
 85 |  * @param delta
 86 |  * @param shift
 87 |  * @param B
 88 |  * @param b1
 89 |  * @param b2
 90 |  * @param b3
 91 |  */
 92 | template<class T>
 93 | void IIRFilterCol(T *src, T *dst,
 94 |                   const int width, const int height,
 95 |                   const int32_t B,
 96 |                   const int32_t b1, const int32_t b2, const int32_t b3,
 97 |                   const int32_t delta, const int32_t shift) {
 98 |     int maxThreadNum = getHardwareCPUNum();
 99 |     int threadIndex = 0;
100 |     
101 |     int stride = max_value((int) (roundf(1.f * width / maxThreadNum)), 1);
102 | 
103 | #pragma omp parallel for private(threadIndex)
104 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
105 |         int start = threadIndex * stride;
106 |         int end = (threadIndex == (maxThreadNum - 1)) ? width : min_value(start + stride, width);
107 |         int range = end - start;
108 |         
109 |         T *w = (T *) malloc(sizeof(T) * range * (height + 3));
110 |         
111 |         T *srcData = src + start;
112 |         T *dstData = dst + start;
113 |         T *wData = w + 3 * range;
114 |         
115 |         T *srcOffsetData;
116 |         T *dstOffsetData;
117 |         T *wOffsetData;
118 |         
119 |         memcpy(w, srcData, sizeof(T) * range);
120 |         memcpy(w + range, srcData, sizeof(T) * range);
121 |         memcpy(w + 2 * range, srcData, sizeof(T) * range);
122 |         
123 |         int off1 = -range;
124 |         int off2 = off1 - range;
125 |         int off3 = off2 - range;
126 |         
127 |         int size = height - 1;
128 |         
129 |         /**forward pass*/
130 |         for (int y = 0; y <= size; ++y) {
131 |             srcOffsetData = srcData;
132 |             wOffsetData = wData;
133 |             
134 |             for (int x = 0; x < range; ++x) {
135 |                 wOffsetData[0] = (T) ((B * srcOffsetData[0] + b1 * wOffsetData[off1] + b2 * wOffsetData[off2] +
136 |                                        b3 * wOffsetData[off3] + delta) >> shift);
137 |                 
138 |                 srcOffsetData++;
139 |                 wOffsetData++;
140 |             }
141 |             
142 |             srcData += width;
143 |             wData += range;
144 |         }
145 |         
146 |         /**backward pass*/
147 |         T *tail = (T *) malloc(sizeof(T) * range);
148 |         memcpy(tail, w + range * (size + 3), sizeof(T) * range);
149 |         
150 |         off1 = width;
151 |         off2 = off1 + width;
152 |         off3 = off2 + width;
153 |         
154 |         dstData = dst + start + size * width;
155 |         wData = w + (size + 3) * range;
156 |         
157 |         dstOffsetData = dstData;
158 |         wOffsetData = wData;
159 |         
160 |         for (int x = 0; x < range; ++x) {
161 |             dstOffsetData[0] = (T) ((B * wOffsetData[0] + b1 * tail[x] + b2 * tail[x] + b3 * tail[x] + delta) >> shift);
162 |             
163 |             dstOffsetData++;
164 |             wOffsetData++;
165 |         }
166 |         
167 |         dstData -= width;
168 |         wData -= range;
169 |         
170 |         dstOffsetData = dstData;
171 |         wOffsetData = wData;
172 |         
173 |         for (int x = 0; x < range; ++x) {
174 |             dstOffsetData[0] = (T) ((B * wOffsetData[0] + b1 * dstOffsetData[off1] + b2 * tail[x] + b3 * tail[x]
175 |                                      + delta) >> shift);
176 |             
177 |             dstOffsetData++;
178 |             wOffsetData++;
179 |         }
180 |         
181 |         dstData -= width;
182 |         wData -= range;
183 |         
184 |         dstOffsetData = dstData;
185 |         wOffsetData = wData;
186 |         
187 |         for (int x = 0; x < range; ++x) {
188 |             dstOffsetData[0] = (T) (
189 |                     (B * wOffsetData[0] + b1 * dstOffsetData[off1] + b2 * dstOffsetData[off2] + b3 * tail[x]
190 |                      + delta) >> shift);
191 |             
192 |             dstOffsetData++;
193 |             wOffsetData++;
194 |         }
195 |         
196 |         dstData -= width;
197 |         wData -= range;
198 |         
199 |         for (int y = size - 3; y >= 0; --y) {
200 |             dstOffsetData = dstData;
201 |             wOffsetData = wData;
202 |             
203 |             for (int x = 0; x < range; ++x) {
204 |                 dstOffsetData[0] = (T) ((B * wOffsetData[0] + b1 * dstOffsetData[off1] + b2 * dstOffsetData[off2]
205 |                                          + b3 * dstOffsetData[off3] + delta) >> shift);
206 |                 
207 |                 dstOffsetData++;
208 |                 wOffsetData++;
209 |             }
210 |             
211 |             dstData -= width;
212 |             wData -= range;
213 |         }
214 |         
215 |         free(tail);
216 |         free(w);
217 |     }
218 | }
219 | 
220 | template<class T>
221 | void IIRFilter(T *src, T *dst, const int width, const int height, const float sigma) {
222 |     if (nullptr == src || nullptr == dst || 3 > width || 3 > height || 0 > sigma) {
223 |         return;
224 |     }
225 |     
226 |     double_t q, q2, q3;
227 |     
228 |     if (sigma >= 2.5) {
229 |         q = 0.98711 * sigma - 0.96330;
230 |     } else if (sigma >= 0.5 && sigma < 2.5) {
231 |         q = 3.97156 - 4.14554 * sqrt(1.0 - 0.26891 * sigma);
232 |     } else {
233 |         q = 0.1147705018520355224609375;
234 |     }
235 |     
236 |     q2 = q * q;
237 |     q3 = q * q2;
238 |     
239 |     double_t db0 = 1.57825 + 2.44413 * q + 1.4281 * q2 + 0.422205 * q3;
240 |     double_t db1 = 2.44413 * q + 2.85619 * q2 + 1.26661 * q3;
241 |     double_t db2 = -(1.4281 * q2 + 1.26661 * q3);
242 |     double_t db3 = 0.4222205 * q3;
243 |     
244 |     double_t dB = 1.0 - (db1 + db2 + db3) / db0;
245 |     
246 |     int32_t B  = (int32_t) (dB * FILTER_SCALE);
247 |     int32_t b1 = (int32_t) (db1 / db0 * FILTER_SCALE);
248 |     int32_t b2 = (int32_t) (db2 / db0 * FILTER_SCALE);
249 |     int32_t b3 = (int32_t) (db3 / db0 * FILTER_SCALE);
250 |     
251 |     T *tmp = (T *) malloc(sizeof(T) * width * height);
252 |     
253 |     IIRFilterRow(src, tmp, width, height, B, b1, b2, b3, FILTER_DELTA, FILTER_SHIFT);
254 |     IIRFilterCol(tmp, dst, width, height, B, b1, b2, b3, FILTER_DELTA, FILTER_SHIFT);
255 |     
256 |     free(tmp);
257 | }
258 | 
/**
 * non-template entry of the iir gauss filter for int16_t pics,
 * it only forwards to IIRFilter<int16_t>
 */
void IIRFilter(int16_t *src, int16_t *dst, int width, int height, float sigma) {
    IIRFilter<int16_t>(src, dst, width, height, sigma);
}
262 | 
/**
 * non-template entry of the iir gauss filter for uint8_t pics,
 * it only forwards to IIRFilter<uint8_t>
 */
void IIRFilter(uint8_t *src, uint8_t *dst, int width, int height, float sigma) {
    IIRFilter<uint8_t>(src, dst, width, height, sigma);
}
266 | 
267 | }
268 | 
269 | 
270 | 
271 | 
272 | 
273 | 
274 | 
275 | 


--------------------------------------------------------------------------------
/include/imageutils.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Created by yanyuanchi on 2017/4/8.
  3 |  */
  4 | 
  5 | #ifndef SIFTYSIFTY_IMAGEUTILS_H
  6 | #define SIFTYSIFTY_IMAGEUTILS_H
  7 | 
  8 | #include "structs.h"
  9 | #include "utils.h"
 10 | 
 11 | #ifdef _OPENMP
 12 | #include <omp.h>
 13 | #endif
 14 | 
 15 | namespace SiftySifty {
 16 | 
 17 | /**
 18 |  * half sample the src Mat
 19 |  * @tparam T type
 20 |  * @param src src mat
 21 |  * @param dst dst mat
 22 |  * @return true/false
 23 |  */
 24 | template<class T>
 25 | void halfSampleMat(Mat<T> *src, Mat<T> *dst) {
 26 |     if (nullptr == src || nullptr == dst || nullptr == src->data || nullptr == dst->data) {
 27 |         return;
 28 |     }
 29 |     
 30 |     int srcWidth  = src->width;
 31 |     int srcHeight = src->height;
 32 |     
 33 |     int dstWidth  = dst->width;
 34 |     int dstHeight = dst->height;
 35 |     
 36 |     if ((srcWidth >> 1) != dstWidth || (srcHeight >> 1) != dstHeight) {
 37 |         return;
 38 |     }
 39 |     
 40 |     int maxThreadNum = getHardwareCPUNum();
 41 |     int threadIndex  = 0;
 42 |     
 43 |     int stride = max_value((int) (roundf(1.0f * dstHeight / maxThreadNum)), 1);
 44 | 
 45 | #pragma omp parallel for private(threadIndex)
 46 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 47 |         int start = threadIndex * stride;
 48 |         int end   = (threadIndex == (maxThreadNum - 1)) ? dstHeight : min_value(start + stride, dstHeight);
 49 |         
 50 |         T *srcData = src->data + (srcWidth * (start << 1));
 51 |         T *dstData = dst->data + (dstWidth * start);
 52 |         
 53 |         for (int y = start; y < end; ++y) {
 54 |             for (int x = 0; x < dstWidth; ++x) {
 55 |                 dstData[x] = srcData[(x << 1)];
 56 |             }
 57 |             
 58 |             srcData += (srcWidth + srcWidth);
 59 |             dstData += dstWidth;
 60 |         }
 61 |     }
 62 | }
 63 | 
 64 | 
 65 | /**
 66 |  * resize src to dst
 67 |  * @tparam T
 68 |  * @param src
 69 |  * @param dst
 70 |  * @return
 71 |  */
 72 | template<class T>
 73 | void resizeMat(Mat<T> *src, Mat<T> *dst) {
 74 |     if (nullptr == src || nullptr == dst || nullptr == src->data || nullptr == dst->data) {
 75 |         return;
 76 |     }
 77 |     
 78 |     int srcWidth = src->width;
 79 |     int srcHeight = src->height;
 80 |     
 81 |     int dstWidth = dst->width;
 82 |     int dstHeight = dst->height;
 83 |     
 84 |     T *srcData = src->data;
 85 |     T *dstData = dst->data;
 86 |     
 87 |     if (srcWidth == dstWidth && srcHeight == dstHeight) {
 88 |         memcpy(dstData, srcData, sizeof(T) * srcWidth * srcHeight);
 89 |         
 90 |         return;
 91 |     }
 92 |     
 93 |     int32_t shift = 22;
 94 |     int64_t scale = (1 << (shift >> 1));
 95 |     int64_t delta = (1 << (shift - 1));
 96 |     
 97 |     float xRatio = 1.f * (srcWidth - 1.f) / dstWidth;
 98 |     float yRatio = 1.f * (srcHeight - 1.f) / dstHeight;
 99 |     
100 |     int maxThreadNum = getHardwareCPUNum();
101 |     int threadIndex  = 0;
102 |     
103 |     int stride = max_value((int) (roundf(1.0f * dstHeight / maxThreadNum)), 1);
104 | 
105 | #pragma omp parallel for private(threadIndex)
106 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
107 |         int start = threadIndex * stride;
108 |         int end = (threadIndex == (maxThreadNum - 1)) ? dstHeight : min_value(start + stride, dstHeight);
109 |         
110 |         T *dstOffsetData = dstData + start * dstWidth;
111 |         
112 |         for (int y = start; y < end; ++y) {
113 |             float yOffset = (y + 0.5f) * yRatio;
114 |             int yUp = (int) floorf(yOffset);
115 |             
116 |             yOffset -= yUp;
117 |             
118 |             int64_t multUp = (int64_t) (yOffset * scale);
119 |             int64_t multDown = scale - multUp;
120 |             
121 |             for (int x = 0; x < dstWidth; ++x) {
122 |                 float xOffset = (x + 0.5f) * xRatio;
123 |                 int xLeft = (int) floorf(xOffset);
124 |                 
125 |                 xOffset -= xLeft;
126 |                 
127 |                 int64_t multLeft = (int64_t) (xOffset * scale);
128 |                 int64_t multRight = scale - multLeft;
129 |                 
130 |                 T *srcOffsetData = srcData + yUp * srcWidth + xLeft;
131 |                 
132 |                 dstOffsetData[x] = (T) ((srcOffsetData[0] * multRight * multDown
133 |                                          + srcOffsetData[1] * multLeft * multDown
134 |                                          + srcOffsetData[srcWidth] * multRight * multUp
135 |                                          + srcOffsetData[srcWidth + 1] * multLeft * multUp
136 |                                          + delta) >> shift);
137 |             }
138 |             
139 |             dstOffsetData += dstWidth;
140 |         }
141 |     }
142 | }
143 | 
144 | /**
145 |  * resize the src to dst
146 |  * @tparam T
147 |  * @param src
148 |  * @param srcWidth
149 |  * @param srcHeight
150 |  * @param dst
151 |  * @param dstWidth
152 |  * @param dstHeight
153 |  * @return
154 |  */
155 | template<class T>
156 | void resizeMat2(Mat<T> *srcMat, Mat<T> *dstMat) {
157 |     if (nullptr == srcMat || nullptr == dstMat) {
158 |         return;
159 |     }
160 |     
161 |     T *src = srcMat->data;
162 |     T *dst = dstMat->data;
163 |     
164 |     int srcWidth  = srcMat->width;
165 |     int srcHeight = srcMat->height;
166 |     int dstWidth  = dstMat->width;
167 |     int dstHeight = dstMat->height;
168 | 
169 |     if (srcWidth == dstWidth && srcHeight == dstHeight) {
170 |         memcpy(dst, src, sizeof(T) * srcWidth * srcHeight);
171 |         return;
172 |     }
173 |     
174 |     int32_t shift = 22;
175 |     int64_t scale = (1 << (shift >> 1));
176 |     int64_t delta = (1 << (shift - 1));
177 |     
178 |     /**
179 |      * src = (dst + 0.5) * srcWidth / dstWidth - 0.5
180 |      */
181 |     float xRatio = 1.f * srcWidth / dstWidth;
182 |     float yRatio = 1.f * srcHeight / dstHeight;
183 |     
184 |     int *xTable = (int*) malloc(sizeof(int) * 2 * dstWidth);
185 |     int64_t *xMult = (int64_t*) malloc(sizeof(int64_t) * 2 * dstWidth);
186 |     
187 |     for (int x = 0; x < dstWidth; ++x) {
188 |         float xOffset = (x + 0.5f) * xRatio - 0.5f;
189 |         int xLeft;
190 |         
191 |         int64_t multLeft, multRight;
192 |         
193 |         if (0 >= xOffset) {
194 |             xLeft = 0;
195 |             multLeft = 0;
196 |             multRight = scale;
197 |         } else if (xOffset >= (srcWidth - 1)) {
198 |             xLeft = srcWidth - 2;
199 |             multLeft = scale;
200 |             multRight = 0;
201 |         } else {
202 |             xLeft = (int) floorf(xOffset);
203 |             
204 |             xOffset -= xLeft;
205 |             
206 |             multLeft = (int64_t) (xOffset * scale);
207 |             multRight = scale - multLeft;
208 |         }
209 |         
210 |         xTable[(x << 1)] = xLeft;
211 |         xTable[(x << 1) + 1] = xLeft + 1;
212 |         xMult[(x << 1)] = multLeft;
213 |         xMult[(x << 1) + 1] = multRight;
214 |     }
215 |     
216 |     int maxThreadNum = getHardwareCPUNum();
217 |     int threadIndex  = 0;
218 |     
219 |     int stride = max_value((int) (roundf(1.0f * dstHeight / maxThreadNum)), 1);
220 | 
221 | #pragma omp parallel for private(threadIndex)
222 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
223 |         int start = threadIndex * stride;
224 |         int end   = (threadIndex == (maxThreadNum - 1)) ? dstHeight : min_value(start + stride, dstHeight);
225 |         
226 |         T *dstData = dst + start * dstWidth;
227 |         
228 |         for (int y = start; y < end; ++y) {
229 |             float yOffset = (y + 0.5f) * yRatio - 0.5f;
230 |             int yUp;
231 |             int64_t multUp, multDown;
232 |             
233 |             if (0 >= yOffset) {
234 |                 yUp = 0;
235 |                 multUp = 0;
236 |                 multDown = scale;
237 |             } else if (yOffset >= (srcHeight - 1)) {
238 |                 yUp = srcHeight - 2;
239 |                 multUp = scale;
240 |                 multDown = 0;
241 |             } else {
242 |                 yUp = (int) floorf(yOffset);
243 |                 yOffset -= yUp;
244 |                 
245 |                 multUp = (int64_t) (yOffset * scale);
246 |                 multDown = scale - multUp;
247 |             }
248 |             
249 |             T *upSrc = src + yUp * srcWidth;
250 |             T *downSrc = upSrc + srcWidth;
251 |             
252 |             for (int x = 0; x < dstWidth; ++x) {
253 |                 int x2 = (x << 1);
254 |                 
255 |                 dstData[x] = (T)((((upSrc[xTable[x2]] * xMult[x2+1]
256 |                                     + upSrc[xTable[x2+1]]*xMult[x2]) * multDown
257 |                                    + (downSrc[xTable[x2]] * xMult[x2+1]
258 |                                       + downSrc[xTable[x2+1]]*xMult[x2]) * multUp)
259 |                                   + delta) >> shift);
260 |             }
261 |             
262 |             dstData += dstWidth;
263 |         }
264 |     }
265 |     
266 |     free(xTable);
267 |     free(xMult);
268 | }
269 | 
270 | template<class T1, class T2>
271 | void scaleMatByScale(T1 *src, T2 *dst, int width, int height, int scale) {
272 |     int maxThreadNum = getHardwareCPUNum();
273 |     int threadIndex  = 0;
274 |     
275 |     int stride = max_value((int) (roundf(1.0f * height / maxThreadNum)), 1);
276 | 
277 | #pragma omp parallel for private(threadIndex)
278 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
279 |         int start = threadIndex * stride;
280 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : min_value(start + stride, height);
281 |         
282 |         T1 *srcData = src + start * width;
283 |         T2 *dstData = dst + start * width;
284 |         
285 |         int64_t length = (end - start) * width;
286 |         int64_t limit = length - 3;
287 |         
288 |         int64_t i = 0;
289 |         for (; i < limit; i += 4) {
290 |             dstData[i] = (srcData[i] * scale);
291 |             dstData[i + 1] = (srcData[i + 1] * scale);
292 |             dstData[i + 2] = (srcData[i + 2] * scale);
293 |             dstData[i + 3] = (srcData[i + 3] * scale);
294 |         }
295 |         
296 |         for (; i < length; ++i) {
297 |             dstData[i] = (srcData[i] * scale);
298 |         }
299 |     }
300 | }
301 | 
302 | template<class T1, class T2>
303 | void scaleMatByShift(T1 *src, T2 *dst, int width, int height, int shift) {
304 |     int maxThreadNum = getHardwareCPUNum();
305 |     int threadIndex = 0;
306 |     
307 |     int stride = max_value((int) (roundf(1.0f * height / maxThreadNum)), 1);
308 | 
309 | #pragma omp parallel for private(threadIndex)
310 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
311 |         int start = threadIndex * stride;
312 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : min_value(start + stride, height);
313 |         
314 |         T1 *srcData = src + start * width;
315 |         T2 *dstData = dst + start * width;
316 |         
317 |         int64_t length = (end - start) * width;
318 |         int64_t limit = length - 3;
319 |         
320 |         int64_t i = 0;
321 |         for (; i < limit; i += 4) {
322 |             dstData[i] = (srcData[i] << shift);
323 |             dstData[i + 1] = (srcData[i + 1] << shift);
324 |             dstData[i + 2] = (srcData[i + 2] << shift);
325 |             dstData[i + 3] = (srcData[i + 3] << shift);
326 |         }
327 |         
328 |         for (; i < length; ++i) {
329 |             dstData[i] = (srcData[i] << shift);
330 |         }
331 |     }
332 | }
333 | 
334 | template<class T>
335 | void subMat(T *src1, T *src2, T *dst, int width, int height) {
336 |     int maxThreadNum = getHardwareCPUNum();
337 |     int threadIndex  = 0;
338 |     
339 |     int stride = max_value((int) (roundf(1.0f * height / maxThreadNum)), 1);
340 | 
341 | #pragma omp parallel for private(threadIndex)
342 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
343 |         int start = threadIndex * stride;
344 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : min_value(start + stride, height);
345 |         
346 |         T *src1Data = src1 + start * width;
347 |         T *src2Data = src2 + start * width;
348 |         T *dstData  = dst + start * width;
349 |         
350 |         int64_t length = (end - start) * width;
351 |         int64_t limit  = length - 3;
352 |         
353 |         int64_t i = 0;
354 |         for (; i < limit; i += 4) {
355 |             dstData[i] = src1Data[i] - src2Data[i];
356 |             dstData[i + 1] = src1Data[i + 1] - src2Data[i + 1];
357 |             dstData[i + 2] = src1Data[i + 2] - src2Data[i + 2];
358 |             dstData[i + 3] = src1Data[i + 3] - src2Data[i + 3];
359 |         }
360 |         
361 |         for (; i < length; ++i) {
362 |             dstData[i] = src1Data[i] - src2Data[i];
363 |         }
364 |     }
365 | }
366 | 
367 | }
368 | 
369 | 
370 | #endif //SIFTYSIFTY_IMAGEUTILS_H
371 | 


--------------------------------------------------------------------------------
/src/siftysifty.cpp:
--------------------------------------------------------------------------------
   1 | /**
   2 |  * Created by yanyuanchi on 2017/3/22.
   3 |  */
   4 | #include <iostream>
   5 | #include <vector>
   6 | #include <cmath>
   7 | #include <cfloat>
   8 | #include <climits>
   9 | #include <algorithm>
  10 | 
  11 | #include "utils.h"
  12 | #include "gaussfiler.h"
  13 | #include "imageutils.h"
  14 | #include "iirfilter.h"
  15 | #include "siftysifty.h"
  16 | 
  17 | #ifdef _OPENMP
  18 | #include <omp.h>
  19 | #endif
  20 | 
  21 | namespace SiftySifty {
  22 | 
  23 | /**
  24 |  * gauss to Mat<short>
  25 |  */
  26 | void gaussFilter(Mat<int16_t> *src, Mat<short> *dst, float sigma) {
  27 |     if (nullptr == src || nullptr == dst || 0 >= sigma || src->width != dst->width || src->height != dst->height) {
  28 |         return;
  29 |     }
  30 |     
  31 |     gaussFilterByIIRFilter(src->data, dst->data, src->width, src->height, sigma);
  32 | }
  33 | 
  34 | /**
  35 |  * gauss blur on the mat
  36 |  */
  37 | void gaussFilter(Mat<int16_t> *mat, float sigma) {
  38 |     Mat<int16_t> *tmp = newMat<int16_t>(mat->width, mat->height);
  39 |     
  40 |     memcpy(tmp->data, mat->data, sizeof(int16_t) * mat->width * mat->height);
  41 |     
  42 |     gaussFilter(tmp, mat, sigma);
  43 |     
  44 |     deleteMat<int16_t>(tmp);
  45 | }
  46 | 
  47 | /**
  48 |  * create the base image of sift
  49 |  * @param src the gray image with uint8_t
  50 |  * @param width width
  51 |  * @param height height
  52 |  * @param doubleImage if double the image
  53 |  * @param sigma the init sigma
  54 |  * @param shift the base image will be src * (1 << shift)
  55 |  * @return
  56 |  */
  57 | Mat<int16_t> *initBaseImage(uint8_t *src, int width, int height, bool doubleImage, double sigma, int shift) {
  58 |     Mat<int16_t> *base = newMat<int16_t>(width, height);
  59 |     
  60 |     /**scale the src*/
  61 |     scaleMatByShift<uint8_t, int16_t>(src, base->data, width, height, shift);
  62 |     
  63 |     if (doubleImage) {
  64 |         float diffSigma = (float) sqrt(sigma * sigma - 4.0 * SIFT_INIT_SIGMA * SIFT_INIT_SIGMA);
  65 |         
  66 |         Mat<int16_t> *doubleBase = newMat<int16_t>(base->width * 2, base->height * 2);
  67 |         
  68 |         resizeMat2<int16_t>(base, doubleBase);
  69 |         gaussFilter(doubleBase, diffSigma);
  70 |         
  71 |         deleteMat<int16_t>(base);
  72 |         
  73 |         return doubleBase;
  74 |     } else {
  75 |         float diffSigma = (float) sqrt(sigma * sigma - SIFT_INIT_SIGMA * SIFT_SIGMA);
  76 |         
  77 |         gaussFilter(base, diffSigma);
  78 |         
  79 |         return base;
  80 |     }
  81 | }
  82 | 
  83 | vector<vector<SiftySifty::Mat<int16_t> *> > buildGaussPyramid(SiftySifty::Mat<int16_t> *base,
  84 |                                                               const int octave,
  85 |                                                               const int octaveLayers,
  86 |                                                               const float sigma) {
  87 |     double sig[octaveLayers + 3];
  88 |     sig[0] = sigma;
  89 |     
  90 |     double k = pow(2.0, 1.0 / octaveLayers);
  91 |     for (int i = 1; i < octaveLayers + 3; i++) {
  92 |         double sigPrev = pow(k, (double) (i - 1)) * sigma;
  93 |         double sigTotal = sigPrev * k;
  94 |         
  95 |         sig[i] = sqrt(sigTotal * sigTotal - sigPrev * sigPrev);
  96 |     }
  97 |     
  98 |     vector<vector<SiftySifty::Mat<int16_t> *> > gaussPyramid(octave,
  99 |                                                              vector<SiftySifty::Mat<int16_t> *>(octaveLayers + 3));
 100 |     
 101 |     for (int y = 0; y < octave; ++y) {
 102 |         for (int x = 0; x < (octaveLayers + 3); ++x) {
 103 |             if (0 == y && 0 == x) {
 104 |                 gaussPyramid[y][x] = base;
 105 |             } else if (0 == x) {
 106 |                 Mat<int16_t> *pre = gaussPyramid[y - 1][octaveLayers];
 107 |                 Mat<int16_t> *mat = newMat<int16_t>(pre->width >> 1, pre->height >> 1);
 108 |                 
 109 |                 halfSampleMat<int16_t>(pre, mat);
 110 |                 
 111 |                 gaussPyramid[y][x] = mat;
 112 |             } else {
 113 |                 Mat<int16_t> *pre = gaussPyramid[y][x - 1];
 114 |                 Mat<int16_t> *mat = newMat<int16_t>(pre->width, pre->height);
 115 |                 
 116 |                 gaussFilter(pre, mat, sig[x]);
 117 |                 
 118 |                 gaussPyramid[y][x] = mat;
 119 |             }
 120 |         }
 121 |     }
 122 |     
 123 |     return gaussPyramid;
 124 | }
 125 | 
 126 | /**
 127 |  * build dogPyramid
 128 |  */
 129 | vector<vector<SiftySifty::Mat<int16_t> *> > buildDoGPyramid(vector<vector<SiftySifty::Mat<int16_t> *> > &gaussPyramid,
 130 |                                                             const int octave,
 131 |                                                             const int octaveLayers) {
 132 |     vector<vector<SiftySifty::Mat<int16_t> *> > doGPyramid(octave,
 133 |                                                            vector<SiftySifty::Mat<int16_t> *>(octaveLayers + 2));
 134 |     
 135 |     for (int y = 0; y < octave; ++y) {
 136 |         for (int x = 0; x < (octaveLayers + 2); ++x) {
 137 |             SiftySifty::Mat<int16_t> *src1 = gaussPyramid[y][x + 1];
 138 |             SiftySifty::Mat<int16_t> *src2 = gaussPyramid[y][x];
 139 |             
 140 |             SiftySifty::Mat<int16_t> *dst = newMat<int16_t>(src1->width, src1->height);
 141 |             
 142 |             subMat<int16_t>(src1->data, src2->data, dst->data, src1->width, src1->height);
 143 |             
 144 |             doGPyramid[y][x] = dst;
 145 |         }
 146 |     }
 147 |     
 148 |     return doGPyramid;
 149 | }
 150 | 
/**
 * adjust the real extrema point
 *
 * Starting from the candidate (r, c, l) the offset of the true extremum is estimated
 * from the first and second derivatives of the dog images; the candidate is moved by
 * the rounded offset and the estimate is repeated, at most SIFT_MAX_ADJUST_STEP times,
 * until every offset component is below 0.5. The point is then rejected when its
 * response is too small (contrastThreshold) or when it looks like an edge (edgeThreshold).
 *
 * r, c and l are in/out: they are updated while the candidate moves, also when the
 * function ends up returning false.
 * @param doGPyramid dog pyramid
 * @param keyPoint the current keypoint, only written when true is returned
 * @param r the row of keypoint
 * @param c the col of keypoint
 * @param l the layer of the keypoint
 * @param octaveLayers the layers per octave, l has to stay in [1, octaveLayers]
 * @param curOctave the current octave
 * @param contrastThreshold contrast threshold
 * @param edgeThreshold edge threshold
 * @param sigma used to derive the size of the keypoint
 * @param offset 9 pointer offsets of the 3x3 neighbourhood; [3]/[5] are read as the
 *               x neighbours, [1]/[7] as the y neighbours and [0]/[2]/[6]/[8] as the corners
 * @return true: is a keypoint, false: not
 */
bool adjustExtremaPoint(vector<vector<SiftySifty::Mat<int16_t> * > > &doGPyramid,
                        KeyPoint &keyPoint,
                        int &r, int &c, int &l,
                        int octaveLayers,
                        int curOctave,
                        float contrastThreshold,
                        float edgeThreshold,
                        float sigma,
                        int *offset) {
    /**
     * the original sift paper stores the image as float,
     * so the int16_t values have to be rescaled with SIFT_IMAGE_SCALE.
     * NOTE(review): this evaluates as (1 / 255) * SIFT_IMAGE_SCALE; the macro is defined
     * outside of this function - confirm that this is the intended factor.
     */
    const float imageScale = 1.0f / 255.0f * SIFT_IMAGE_SCALE;
    
    /**used to calculate the first-order derivative (central difference)*/
    const float deriveScale = 0.5f * imageScale;
    
    /**second-order derivative*/
    const float secondDeriveScale = imageScale;
    
    /**cross-order derivative*/
    const float crossDeriveScale = 0.25f * imageScale;
    
    /**the sub-pixel offset of the extremum: row, col and layer*/
    float xR = 0, xC = 0, xL = 0;
    
    /**set as soon as one step ends with all offsets below 0.5*/
    bool confirm = false;
    
    for (int i = 0; i < SIFT_MAX_ADJUST_STEP; ++i) {
        int width = doGPyramid[curOctave][l]->width;
        int height = doGPyramid[curOctave][l]->height;
        
        /**the candidate pixel in the current, the previous and the next dog image*/
        int16_t *cur = doGPyramid[curOctave][l]->data + width * r + c;
        int16_t *pre = doGPyramid[curOctave][l - 1]->data + width * r + c;
        int16_t *nex = doGPyramid[curOctave][l + 1]->data + width * r + c;
        
        /**
         * calculate the derivative of x, y, sigma
         * x grows with the column (offset[5] - offset[3]),
         * y grows with the row (offset[7] - offset[1]),
         * s grows with the layer (nex - pre)
         */
        float dx = (cur[offset[5]] - cur[offset[3]]) * deriveScale;
        float dy = (cur[offset[7]] - cur[offset[1]]) * deriveScale;
        float ds = (nex[0] - pre[0]) * deriveScale;
        
        /**
         * calculate dxx dyy dss and the mixed terms dxy dxs dys
         * (the matrix is symmetric, so dyx = dxy, dsx = dxs, dsy = dys)
         */
        float value2 = 2.0f * cur[0];
        float dxx = (cur[offset[5]] + cur[offset[3]] - value2) * secondDeriveScale;
        float dyy = (cur[offset[7]] + cur[offset[1]] - value2) * secondDeriveScale;
        float dss = (nex[0] + pre[0] - value2) * secondDeriveScale;
        
        float dxy = (cur[offset[8]] + cur[offset[0]] - cur[offset[2]] - cur[offset[6]]) * crossDeriveScale;
        float dxs = (nex[offset[5]] + pre[offset[3]] - nex[offset[3]] - pre[offset[5]]) * crossDeriveScale;
        float dys = (nex[offset[7]] + pre[offset[1]] - nex[offset[1]] - pre[offset[7]]) * crossDeriveScale;
        
        /**
         * X = -[dx, dy, ds] ^ T * ([Dxx, Dxy, Dxs]) ^ -1
         *                         ([Dyx, Dyy, Dys])
         *                         ([Dsx, Dsy, Dss])
         * solved in closed form: inverse = adjugate / determinant
         */
        float detD = dxx * dyy * dss + dxy * dys * dxs + dxs * dxy * dys -
                     dxx * dys * dys - dxy * dxy * dss - dxs * dyy * dxs;
        
        /**the matrix is (close to) singular, the offset can not be solved*/
        if (fabsf(detD) < 1e-6) {
            return false;
        }
        
        /**from here on detD holds the reciprocal of the determinant*/
        detD = 1.0 / detD;
        
        /**adjugate rows times the gradient, not yet divided by the determinant*/
        xC = dx * (dyy * dss - dys * dys) + dy * (-dxy * dss + dys * dxs) + ds * (dxy * dys - dxs * dyy);
        xR = dx * (dys * dxs - dss * dxy) + dy * (-dxs * dxs + dss * dxx) + ds * (dxs * dxy - dxx * dys);
        xL = dx * (dxy * dys - dxs * dyy) + dy * (-dxx * dys + dxs * dxy) + ds * (dxx * dyy - dxy * dxy);
        
        xC = -detD * xC;
        xR = -detD * xR;
        xL = -detD * xL;
        
        /**the extremum lies within half a step of the candidate: done*/
        if (fabsf(xC) < 0.5f && fabsf(xR) < 0.5f && fabsf(xL) < 0.5f) {
            confirm = true;
            break;
        }
        
        /**the offset is too large to be rounded to an int safely*/
        if (std::abs(xC) > (float) (INT_MAX / 3) ||
            std::abs(xR) > (float) (INT_MAX / 3) ||
            std::abs(xL) > (float) (INT_MAX / 3)) {
            return false;
        }
        
        /**move the candidate and try again*/
        r += (int) (roundf(xR));
        c += (int) (roundf(xC));
        l += (int) (roundf(xL));
        
        /**out of border*/
        if (l < 1 || l > octaveLayers
            || c < SIFT_IMAGE_BORDER || c >= width - SIFT_IMAGE_BORDER
            || r < SIFT_IMAGE_BORDER || r >= height - SIFT_IMAGE_BORDER) {
            return false;
        }
    }
    
    /**all steps were used without the offset getting small enough*/
    if (!confirm) {
        return false;
    }
    
    int width = doGPyramid[curOctave][l]->width;
    
    int16_t *cur = doGPyramid[curOctave][l]->data + width * r + c;
    int16_t *pre = doGPyramid[curOctave][l - 1]->data + width * r + c;
    int16_t *nex = doGPyramid[curOctave][l + 1]->data + width * r + c;
    
    /**the gradient at the final position, needed for the interpolated response*/
    float dx = (cur[offset[5]] - cur[offset[3]]) * deriveScale;
    float dy = (cur[offset[7]] - cur[offset[1]]) * deriveScale;
    float ds = (nex[0] - pre[0]) * deriveScale;
    
    /**the value of the dog function at the interpolated extremum*/
    float response = cur[0] * imageScale + 0.5f * (dx * xC + dy * xR + ds * xL);
    
    /**low contrast*/
    if (fabsf(response) * octaveLayers < contrastThreshold) {
        return false;
    }
    
    /**the 2x2 spatial hessian, used to reject edge-like points*/
    float value2 = 2.0f * cur[0];
    float dxx = (cur[offset[5]] + cur[offset[3]] - value2) * secondDeriveScale;
    float dyy = (cur[offset[7]] + cur[offset[1]] - value2) * secondDeriveScale;
    float dxy = (cur[offset[8]] + cur[offset[0]] - cur[offset[2]] - cur[offset[6]]) * crossDeriveScale;
    
    float tr = dxx + dyy;
    float det = dxx * dyy - dxy * dxy;
    
    /**reject when tr^2 / det >= (edgeThreshold + 1)^2 / edgeThreshold or det is negative*/
    if (0 > det || tr * tr * edgeThreshold >= (edgeThreshold + 1) * (edgeThreshold + 1) * det) {
        return false;
    }
    
    /**the position scaled by 2^curOctave*/
    keyPoint.x = (c + xC) * (1 << curOctave);
    keyPoint.y = (r + xR) * (1 << curOctave);
    
    /**the integer position inside the octave*/
    keyPoint.octaveX = c;
    keyPoint.octaveY = r;
    
    keyPoint.octave = curOctave;
    keyPoint.octaveLayer = l;
    
    keyPoint.octaveLayersShift = xL;
    
    /**size is octaveSize scaled by 2^curOctave*/
    keyPoint.size = sigma * powf(2.f, (l + xL) / octaveLayers) * (1 << curOctave);
    keyPoint.octaveSize = sigma * powf(2.f, (l + xL) / octaveLayers);
    
    /**[-1, 1]*/
    keyPoint.response = response;
    
    return true;
}
 325 | 
 326 | /**
 327 |  * calculate hist
 328 |  */
 329 | float calculateHist(SiftySifty::Mat<int16_t> *image, int x, int y, int radius, float sigma, float *hist, int n) {
 330 |     int16_t *imageData = image->data;
 331 |     int width = image->width;
 332 |     int height = image->height;
 333 |     
 334 |     int length = (2 * radius + 1) * (2 * radius + 1);
 335 |     
 336 |     float scale = -1.0f / (2.0f * sigma * sigma);
 337 |     
 338 |     float *buffer = (float *) malloc(sizeof(float) * (5 * length + n + 4));
 339 |     
 340 |     float *weights = buffer, *dx = weights + length, *dy = dx + length, *ori = dy + length, *mag = ori + length;
 341 |     float *tmpHist = mag + length + 2;
 342 |     
 343 |     memset(tmpHist, 0, sizeof(float) * n);
 344 |     
 345 |     int r, c;
 346 |     int realLength = 0;
 347 |     for (int i = -radius; i <= radius; ++i) {
 348 |         r = y + i;
 349 |         if (0 >= r || r >= (height - 1)) {
 350 |             continue;
 351 |         }
 352 |         
 353 |         for (int j = -radius; j <= radius; ++j) {
 354 |             c = x + j;
 355 |             if (0 >= c || c >= (width - 1)) {
 356 |                 continue;
 357 |             }
 358 |             /**
 359 |              *             ^ y
 360 |              *             |
 361 |              *             |
 362 |              *             |
 363 |              * -----------------------> x
 364 |              *             |
 365 |              *             |
 366 |              *             |
 367 |              */
 368 |             dx[realLength] = (float) (imageData[r * width + c + 1] - imageData[r * width + c - 1]);
 369 |             dy[realLength] = (float) (imageData[(r - 1) * width + c] - imageData[(r + 1) * width + c]);
 370 |             
 371 |             weights[realLength] = (i * i + j * j) * scale;
 372 |             
 373 |             realLength++;
 374 |         }
 375 |     }
 376 |     
 377 |     for (int i = 0; i < realLength; ++i) {
 378 |         weights[i] = expf(weights[i]);
 379 |         mag[i] = sqrtf(dx[i] * dx[i] + dy[i] * dy[i]);
 380 |         ori[i] = atan2f360(dy[i], dx[i]);
 381 |     }
 382 |     
 383 |     for (int i = 0; i < realLength; ++i) {
 384 |         int index = static_cast<int>(roundf(ori[i] * n / 360.0f));
 385 |         
 386 |         if (index > n) {
 387 |             index -= n;
 388 |         }
 389 |         
 390 |         if (0 > index) {
 391 |             index += n;
 392 |         }
 393 |         
 394 |         tmpHist[index] += (weights[i] * mag[i]);
 395 |     }
 396 |     
 397 |     tmpHist[-1] = tmpHist[n - 1];
 398 |     tmpHist[-2] = tmpHist[n - 2];
 399 |     tmpHist[n] = tmpHist[0];
 400 |     tmpHist[n + 1] = tmpHist[1];
 401 |     
 402 |     for (int i = 0; i < n; ++i) {
 403 |         hist[i] = (tmpHist[i - 2] + tmpHist[i + 2]) * (1.f / 16.f) +
 404 |                   (tmpHist[i - 1] + tmpHist[i + 1]) * (4.f / 16.f) + tmpHist[i] * (6.f / 16.f);
 405 |     }
 406 |     
 407 |     float maxValue = hist[0];
 408 |     for (int i = 0; i < n; ++i) {
 409 |         maxValue = max_value(hist[i], maxValue);
 410 |     }
 411 |     
 412 |     free(buffer);
 413 |     
 414 |     return maxValue;
 415 | }
 416 | 
 417 | void findKeyPoints(vector<KeyPoint> &kpts, KeyPoint &keyPoint, float threshold, float *hist, int n) {
 418 |     for (int i = 0; i < n; ++i) {
 419 |         int left = (i > 0) ? (i - 1) : (n - 1);
 420 |         int right = (i < (n - 1)) ? (i + 1) : 0;
 421 |         
 422 |         if (hist[i] > hist[left] && hist[i] >= hist[right] && hist[i] >= threshold) {
 423 |             float bin = i + 0.5f * (hist[left] - hist[right]) / (hist[left] - 2 * hist[i] + hist[right]);
 424 |             
 425 |             bin = (bin < 0) ? (n + bin) : ((bin >= n) ? (bin - n) : bin);
 426 |             
 427 |             keyPoint.angle = 360.0f - (360.0f / n) * bin;
 428 |             
 429 |             if (fabsf(keyPoint.angle - 360.0f) < FLT_EPSILON) {
 430 |                 keyPoint.angle = 0.f;
 431 |             }
 432 |             
 433 |             kpts.push_back(keyPoint);
 434 |         }
 435 |     }
 436 | }
 437 | 
 438 | /**
 439 |  * find the extrema point on one pic
 440 |  * @param gaussPyramid gauss pyramid
 441 |  * @param doGPyramid dog pyramid
 442 |  * @param keyPoints store the keyPoints
 443 |  * @param octave the size of pyramid
 444 |  * @param octaveLayers the image's number in one layer
 445 |  * @param contrastThreshold the contrast threshold int sift paper
 446 |  * @param edgeThreshold the edge threshold in sift paper
 447 |  * @param valueThreshold the image value threshold
 448 |  * @param curOctave the current octave
 449 |  * @param curLayer the curent layer
 450 |  * @param n
 451 |  * @param sigma
 452 |  * @param offset
 453 |  */
 454 | void findExtremaPointOne(vector<vector<SiftySifty::Mat<int16_t> * > > &gaussPyramid,
 455 |                          vector<vector<SiftySifty::Mat<int16_t> * > > &doGPyramid,
 456 |                          vector<SiftySifty::KeyPoint> &keyPoints,
 457 |                          int octave,
 458 |                          int octaveLayers,
 459 |                          float contrastThreshold,
 460 |                          float edgeThreshold,
 461 |                          int valueThreshold,
 462 |                          int curOctave,
 463 |                          int curLayer,
 464 |                          int n,
 465 |                          float sigma,
 466 |                          int *offset) {
 467 |     /**get the pre current next image*/
 468 |     Mat<int16_t> *pre = doGPyramid[curOctave][curLayer - 1];
 469 |     Mat<int16_t> *cur = doGPyramid[curOctave][curLayer];
 470 |     Mat<int16_t> *nex = doGPyramid[curOctave][curLayer + 1];
 471 |     
 472 |     int width = cur->width;
 473 |     int height = cur->height;
 474 |     
 475 |     int maxThreadNum = getHardwareCPUNum();
 476 |     int threadIndex = 0;
 477 |     
 478 |     /**remove the border*/
 479 |     int stride = max((int) (roundf(1.0f * (height - SIFT_IMAGE_BORDER - SIFT_IMAGE_BORDER) / maxThreadNum)), 1);
 480 | 
 481 | #ifdef _OPENMP
 482 |     omp_lock_t lock;
 483 |     omp_init_lock(&lock);
 484 | #endif
 485 | 
 486 | #pragma omp parallel for private(threadIndex)
 487 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 488 |         int start = threadIndex * stride + SIFT_IMAGE_BORDER;
 489 |         int end = (threadIndex == (maxThreadNum - 1)) ?
 490 |                   (height - SIFT_IMAGE_BORDER) : min_value(start + stride, height - SIFT_IMAGE_BORDER);
 491 |         
 492 |         float hist[n];
 493 |         
 494 |         KeyPoint keyPoint;
 495 |         vector<KeyPoint> kpts;
 496 |         
 497 |         int16_t *prePtr = pre->data + start * width + SIFT_IMAGE_BORDER;
 498 |         int16_t *curPtr = cur->data + start * width + SIFT_IMAGE_BORDER;
 499 |         int16_t *nexPtr = nex->data + start * width + SIFT_IMAGE_BORDER;
 500 |         
 501 |         for (int y = start; y < end; ++y) {
 502 |             int16_t *preData = prePtr - 1;
 503 |             int16_t *curData = curPtr - 1;
 504 |             int16_t *nexData = nexPtr - 1;
 505 |             
 506 |             for (int x = SIFT_IMAGE_BORDER; x < (width - SIFT_IMAGE_BORDER); ++x) {
 507 |                 preData++;
 508 |                 curData++;
 509 |                 nexData++;
 510 |                 
 511 |                 int val = curData[0];
 512 |                 
 513 |                 if (!(abs(val) > valueThreshold &&
 514 |                       ((val > 0 && val >= curData[offset[0]] && val >= curData[offset[1]] &&
 515 |                         val >= curData[offset[2]] && val >= curData[offset[3]] && val >= curData[offset[5]] &&
 516 |                         val >= curData[offset[6]] && val >= curData[offset[7]] && val >= curData[offset[8]] &&
 517 |                         val >= preData[offset[0]] && val >= preData[offset[1]] && val >= preData[offset[2]] &&
 518 |                         val >= preData[offset[3]] && val >= preData[offset[4]] && val >= preData[offset[5]] &&
 519 |                         val >= preData[offset[6]] && val >= preData[offset[7]] && val >= preData[offset[8]] &&
 520 |                         val >= nexData[offset[0]] && val >= nexData[offset[1]] && val >= nexData[offset[2]] &&
 521 |                         val >= nexData[offset[3]] && val >= nexData[offset[4]] && val >= nexData[offset[5]] &&
 522 |                         val >= nexData[offset[6]] && val >= nexData[offset[7]] && val >= nexData[offset[8]]) ||
 523 |                        (val < 0 && val <= curData[offset[0]] && val <= curData[offset[1]] &&
 524 |                         val <= curData[offset[2]] && val <= curData[offset[3]] && val <= curData[offset[5]] &&
 525 |                         val <= curData[offset[6]] && val <= curData[offset[7]] && val <= curData[offset[8]] &&
 526 |                         val <= preData[offset[0]] && val <= preData[offset[1]] && val <= preData[offset[2]] &&
 527 |                         val <= preData[offset[3]] && val <= preData[offset[4]] && val <= preData[offset[5]] &&
 528 |                         val <= preData[offset[6]] && val <= preData[offset[7]] && val <= preData[offset[8]] &&
 529 |                         val <= nexData[offset[0]] && val <= nexData[offset[1]] && val <= nexData[offset[2]] &&
 530 |                         val <= nexData[offset[3]] && val <= nexData[offset[4]] && val <= nexData[offset[5]] &&
 531 |                         val <= nexData[offset[6]] && val <= nexData[offset[7]] && val <= nexData[offset[8]])))) {
 532 |                     continue;
 533 |                 }
 534 |                 
 535 |                 int extremaR = y, extremaC = x, extremaL = curLayer;
 536 |                 
 537 |                 if (!adjustExtremaPoint(doGPyramid,
 538 |                                         keyPoint,
 539 |                                         extremaR, extremaC, extremaL,
 540 |                                         octaveLayers,
 541 |                                         curOctave,
 542 |                                         contrastThreshold,
 543 |                                         edgeThreshold,
 544 |                                         sigma,
 545 |                                         offset)) {
 546 |                     continue;
 547 |                 }
 548 |                 
 549 |                 float octaveSigma = keyPoint.octaveSize;
 550 |                 
 551 |                 float maxValue = calculateHist(gaussPyramid[curOctave][extremaL],
 552 |                                                extremaC,
 553 |                                                extremaR,
 554 |                                                (int) roundf(SIFT_ORIENTATION_SIGMA_FCTER * octaveSigma),
 555 |                                                SIFT_ORIENTATION_RADIUS * octaveSigma,
 556 |                                                hist,
 557 |                                                n);
 558 |                 
 559 |                 findKeyPoints(kpts, keyPoint, maxValue * SIFT_ORIENTATION_PEAK_RATIO, hist, n);
 560 |             }
 561 |             
 562 |             prePtr += width;
 563 |             curPtr += width;
 564 |             nexPtr += width;
 565 |         }
 566 |         
 567 |         if (!kpts.empty()) {
 568 | #ifdef _OPENMP
 569 |             omp_set_lock(&lock);
 570 | #endif
 571 |             for (auto iter = kpts.begin(); iter < kpts.end(); ++iter) {
 572 |                 keyPoints.push_back(*iter);
 573 |             }
 574 | 
 575 | #ifdef _OPENMP
 576 |             omp_unset_lock(&lock);
 577 | #endif
 578 |         }
 579 |     }
 580 | 
 581 | #ifdef _OPENMP
 582 |     omp_destroy_lock(&lock);
 583 | #endif
 584 | }
 585 | 
 586 | /**
 587 |  * find the extrema point
 588 |  */
 589 | void findExtremaPoint(vector<vector<SiftySifty::Mat<int16_t> * >> &gaussPyramid,
 590 |                       vector<vector<SiftySifty::Mat<int16_t> * >> &doGPyramid,
 591 |                       vector<SiftySifty::KeyPoint> &keyPoints,
 592 |                       int octave,
 593 |                       int octaveLayers,
 594 |                       float contrastThreshold,
 595 |                       float edgeThreshold,
 596 |                       float sigma) {
 597 |     int n = SIFT_ORIENTATION_HIST_BINS;
 598 |     int threshold = (int) (0.5f * contrastThreshold / octaveLayers * 255 * SIFT_IMAGE_SCALE);
 599 |     
 600 |     int offset[9];
 601 |     
 602 |     for (int o = 0; o < octave; ++o) {
 603 |         int width = doGPyramid[o][0]->width;
 604 |         
 605 |         offset[0] = -width - 1;
 606 |         offset[1] = -width;
 607 |         offset[2] = -width + 1;
 608 |         offset[3] = -1;
 609 |         offset[4] = 0;
 610 |         offset[5] = 1;
 611 |         offset[6] = width - 1;
 612 |         offset[7] = width;
 613 |         offset[8] = width + 1;
 614 |         
 615 |         for (int l = 1; l <= octaveLayers; ++l) {
 616 |             findExtremaPointOne(gaussPyramid,
 617 |                                 doGPyramid,
 618 |                                 keyPoints,
 619 |                                 octave,
 620 |                                 octaveLayers,
 621 |                                 contrastThreshold,
 622 |                                 edgeThreshold,
 623 |                                 threshold,
 624 |                                 o,
 625 |                                 l,
 626 |                                 n,
 627 |                                 sigma,
 628 |                                 offset);
 629 |         }
 630 |     }
 631 | }
 632 | 
 633 | /**
 634 |  * if the inited image is been doubled, than resize the size
 635 |  */
 636 | void resizeKeyPoints(vector<SiftySifty::KeyPoint> &keyPoints) {
 637 |     auto iter = keyPoints.begin();
 638 |     
 639 |     for (; iter < keyPoints.end(); ++iter) {
 640 |         (*iter).x /= 2.0f;
 641 |         (*iter).y /= 2.0f;
 642 |         (*iter).size /= 2.0f;
 643 |     }
 644 | }
 645 | 
 646 | /**
 647 |  * sort the keypoint
 648 |  */
 649 | struct KeyPointCMP {
 650 |     vector<SiftySifty::KeyPoint> keyPoints;
 651 |     
 652 |     KeyPointCMP(const vector<SiftySifty::KeyPoint> &keyPoints) {
 653 |         this->keyPoints = keyPoints;
 654 |     }
 655 |     
 656 |     bool operator()(int i, int j) const {
 657 |         SiftySifty::KeyPoint kp1 = keyPoints[i];
 658 |         SiftySifty::KeyPoint kp2 = keyPoints[j];
 659 |         
 660 |         if (kp1.x != kp2.x) {
 661 |             return kp1.x < kp2.x;
 662 |         }
 663 |         
 664 |         if (kp1.y != kp2.y) {
 665 |             return kp1.y < kp2.y;
 666 |         }
 667 |         
 668 |         if (kp1.size != kp2.size) {
 669 |             return kp1.size < kp2.size;
 670 |         }
 671 |         
 672 |         if (kp1.angle != kp2.angle) {
 673 |             return kp1.angle < kp2.angle;
 674 |         }
 675 |         
 676 |         return i < j;
 677 |     }
 678 | };
 679 | 
 680 | /**
 681 |  * remove the doubled keypoint
 682 |  * @param keyPoints
 683 |  */
 684 | void removeDoubleKeyPoints(vector<SiftySifty::KeyPoint> &keyPoints) {
 685 |     int i, j;
 686 |     int n = keyPoints.size();
 687 |     
 688 |     /**sorted keypoint*/
 689 |     vector<int> sortedIndex(n);
 690 |     
 691 |     /**mark the keypoint*/
 692 |     vector<uint8_t> map(n, 1);
 693 |     
 694 |     for (int i = 0; i < sortedIndex.size(); ++i) {
 695 |         sortedIndex[i] = i;
 696 |     }
 697 |     
 698 |     /**sort the keypoint*/
 699 |     std::sort(sortedIndex.begin(), sortedIndex.end(), KeyPointCMP(keyPoints));
 700 |     
 701 |     for (i = 1, j = 0; i < n; ++i) {
 702 |         KeyPoint kp1 = keyPoints[sortedIndex[j]];
 703 |         KeyPoint kp2 = keyPoints[sortedIndex[i]];
 704 |         
 705 |         if (kp1.x != kp2.x
 706 |             || kp1.y != kp2.y
 707 |             || kp1.size != kp2.size
 708 |             || kp1.angle != kp2.angle) {
 709 |             j = i;
 710 |         } else {
 711 |             map[i] = 0;
 712 |         }
 713 |     }
 714 |     
 715 |     for (i = 0, j = 0; i < n; ++i) {
 716 |         if (1 == map[i]) {
 717 |             if (i != j) {
 718 |                 keyPoints[j] = keyPoints[i];
 719 |             }
 720 |             
 721 |             j++;
 722 |         }
 723 |     }
 724 |     
 725 |     keyPoints.resize(j);
 726 | }
 727 | 
 728 | /**
 729 |  * calculate the keypoint's descriptor
 730 |  * @param image
 731 |  * @param x
 732 |  * @param y
 733 |  * @param angle
 734 |  * @param scale
 735 |  * @param d
 736 |  * @param n
 737 |  * @param descriptor
 738 |  */
 739 | void calculateDescriptorOne(Mat<int16_t> *image,
 740 |                             int x, int y,
 741 |                             float angle,
 742 |                             float scale,
 743 |                             int d,
 744 |                             int n,
 745 |                             float *descriptor) {
 746 |     int16_t *data = image->data;
 747 |     int width = image->width;
 748 |     int height = image->height;
 749 |     
 750 |     float expScale = -1.f / (d * d * 0.5f);
 751 |     
 752 |     float sin = sinf(angle * PI / 180.0f);
 753 |     float cos = cosf(angle * PI / 180.0f);
 754 |     
 755 |     /**360 to n*/
 756 |     float binPerRad = n / 360.f;
 757 |     
 758 |     float histWidth = SIFT_DESCRIPTOR_SCAE_FCTER * scale;
 759 |     
 760 |     int radius = (int) (roundf(histWidth * 1.4142135623730951f * (d + 1) * 0.5f));
 761 |     
 762 |     radius = min_value(radius, (int) sqrt(width * width + height * height));
 763 |     
 764 |     sin /= histWidth;
 765 |     cos /= histWidth;
 766 |     
 767 |     int len = (radius + radius + 1) * (radius + radius + 1);
 768 |     int histLen = (d + 2) * (d + 2) * (n + 2);
 769 |     
 770 |     float *buffer = (float *) malloc(sizeof(float) * (len * 7 + histLen));
 771 |     float *dx = buffer, *dy = dx + len, *mag = dy + len, *ori = mag + len, *weight = ori + len;
 772 |     float *rBin = weight + len, *cBin = rBin + len, *hist = cBin + len;
 773 |     
 774 |     memset(hist, 0, sizeof(float) * histLen);
 775 |     
 776 |     int realLen = 0;
 777 |     for (int i = -radius; i <= radius; ++i) {
 778 |         for (int j = -radius; j <= radius; ++j) {
 779 |             float cRotate = cos * j - sin * i;
 780 |             float rRotate = sin * j + cos * i;
 781 |             
 782 |             float rBin0 = rRotate + d / 2.0f - 0.5f;
 783 |             float cBin0 = cRotate + d / 2.0f - 0.5f;
 784 |             
 785 |             int realX = x + j;
 786 |             int realY = y + i;
 787 |             
 788 |             if (rBin0 > -1 && rBin0 < d && cBin0 > -1 && cBin0 < d
 789 |                 && realX > 0 && realX < width - 1 && realY > 0 && realY < height - 1) {
 790 |                 dx[realLen] = (float) (data[realY * width + realX + 1] - data[realY * width + realX - 1]);
 791 |                 dy[realLen] = (float) (data[(realY - 1) * width + realX] - data[(realY + 1) * width + realX]);
 792 |                 
 793 |                 rBin[realLen] = rBin0;
 794 |                 cBin[realLen] = cBin0;
 795 |                 
 796 |                 weight[realLen] = (rRotate * rRotate + cRotate * cRotate) * expScale;
 797 |                 
 798 |                 realLen++;
 799 |             }
 800 |         }
 801 |     }
 802 |     
 803 |     for (int i = 0; i < realLen; ++i) {
 804 |         ori[i] = atan2f360(dy[i], dx[i]);
 805 |         mag[i] = sqrtf(dy[i] * dy[i] + dx[i] * dx[i]);
 806 |         weight[i] = exp(weight[i]);
 807 |     }
 808 |     
 809 |     for (int i = 0; i < realLen; ++i) {
 810 |         float rbin = rBin[i];
 811 |         float cbin = cBin[i];
 812 |         float obin = (ori[i] - angle) * binPerRad;
 813 |         
 814 |         float magnitude = mag[i] * weight[i];
 815 |         
 816 |         int r0 = (int) floorf(rbin);
 817 |         int c0 = (int) floorf(cbin);
 818 |         int o0 = (int) floorf(obin);
 819 |         
 820 |         rbin -= r0;
 821 |         cbin -= c0;
 822 |         obin -= o0;
 823 |         
 824 |         if (o0 < 0) {
 825 |             o0 += n;
 826 |         }
 827 |         if (o0 >= n) {
 828 |             o0 -= n;
 829 |         }
 830 |         
 831 |         float v_r1 = magnitude * rbin, v_r0 = magnitude - v_r1;
 832 |         float v_rc11 = v_r1 * cbin, v_rc10 = v_r1 - v_rc11;
 833 |         float v_rc01 = v_r0 * cbin, v_rc00 = v_r0 - v_rc01;
 834 |         float v_rco111 = v_rc11 * obin, v_rco110 = v_rc11 - v_rco111;
 835 |         float v_rco101 = v_rc10 * obin, v_rco100 = v_rc10 - v_rco101;
 836 |         float v_rco011 = v_rc01 * obin, v_rco010 = v_rc01 - v_rco011;
 837 |         float v_rco001 = v_rc00 * obin, v_rco000 = v_rc00 - v_rco001;
 838 |         
 839 |         int idx = ((r0 + 1) * (d + 2) + c0 + 1) * (n + 2) + o0;
 840 |         
 841 |         hist[idx] += v_rco000;
 842 |         hist[idx + 1] += v_rco001;
 843 |         hist[idx + (n + 2)] += v_rco010;
 844 |         hist[idx + (n + 3)] += v_rco011;
 845 |         hist[idx + (d + 2) * (n + 2)] += v_rco100;
 846 |         hist[idx + (d + 2) * (n + 2) + 1] += v_rco101;
 847 |         hist[idx + (d + 3) * (n + 2)] += v_rco110;
 848 |         hist[idx + (d + 3) * (n + 2) + 1] += v_rco111;
 849 |     }
 850 |     
 851 |     for (int i = 0; i < d; ++i) {
 852 |         for (int j = 0; j < d; ++j) {
 853 |             int idx = ((i + 1) * (d + 2) + (j + 1)) * (n + 2);
 854 |             hist[idx] += hist[idx + n];
 855 |             hist[idx + 1] += hist[idx + n + 1];
 856 |             
 857 |             for (int k = 0; k < n; k++) {
 858 |                 descriptor[(i * d + j) * n + k] = hist[idx + k];
 859 |             }
 860 |         }
 861 |     }
 862 |     
 863 |     len = d * d * n;
 864 |     float norm = 0;
 865 |     for (int i = 0; i < len; ++i) {
 866 |         norm += descriptor[i] * descriptor[i];
 867 |     }
 868 |     
 869 |     float threshold = sqrt(norm) * SIFT_DESCRIPTOR_MAGNITUDE_THRESHOLD;
 870 |     
 871 |     norm = 0;
 872 |     for (int i = 0; i < len; ++i) {
 873 |         descriptor[i] = min_value(threshold, descriptor[i]);
 874 |         norm += descriptor[i] * descriptor[i];
 875 |     }
 876 |     
 877 |     norm = SIFT_DESCRIPTOR_FCTOR / max_value(sqrt(norm), FLT_EPSILON);
 878 |     for (int i = 0; i < len; ++i) {
 879 |         descriptor[i] = (uint8_t) (descriptor[i] * norm);
 880 |     }
 881 |     
 882 |     free(buffer);
 883 | }
 884 | 
 885 | void calculateDescriptor(vector<vector<Mat<int16_t> *> > &gaussPyramid, vector<KeyPoint> &keyPoints, int d, int n) {
 886 |     for (auto iter = keyPoints.begin(); iter < keyPoints.end(); ++iter) {
 887 |         int octave = (*iter).octave;
 888 |         int octaveLayer = (*iter).octaveLayer;
 889 |         
 890 |         int x = (*iter).octaveX;
 891 |         int y = (*iter).octaveY;
 892 |         
 893 |         float angle = 360.f - (*iter).angle;
 894 |         float size = (*iter).octaveSize;
 895 |         
 896 |         (*iter).descriptor = (float *) malloc(sizeof(float) * d * d * n);
 897 |         
 898 |         calculateDescriptorOne(gaussPyramid[octave][octaveLayer],
 899 |                                x, y,
 900 |                                angle, size,
 901 |                                d, n,
 902 |                                (*iter).descriptor);
 903 |     }
 904 | }
 905 | 
 906 | /**
 907 |  * calculate the descriptor
 908 |  * @param gaussPyramid
 909 |  * @param keyPoints
 910 |  * @param d
 911 |  * @param n
 912 |  */
 913 | void calculateDescriptor1(vector<vector<Mat<int16_t> *> > &gaussPyramid, vector<KeyPoint> &keyPoints, int d, int n) {
 914 |     int size = keyPoints.size();
 915 |     
 916 |     int maxThreadNum = getHardwareCPUNum();
 917 |     int threadIndex  = 0;
 918 |     
 919 |     /**remove the border*/
 920 |     int stride = max((int) (roundf(1.0f * size / maxThreadNum)), 1);
 921 | 
 922 | #pragma omp parallel for private(threadIndex)
 923 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 924 |         int start = threadIndex * stride;
 925 |         int end = (threadIndex == (maxThreadNum - 1)) ? size : min_value(start + stride, size);
 926 |         
 927 |         for (int i = start; i < end; ++i) {
 928 |             int octave = keyPoints[i].octave;
 929 |             int octaveLayer = keyPoints[i].octaveLayer;
 930 |             
 931 |             int x = keyPoints[i].octaveX;
 932 |             int y = keyPoints[i].octaveY;
 933 |             
 934 |             float angle = 360.f - keyPoints[i].angle;
 935 |             float size = keyPoints[i].octaveSize;
 936 |             
 937 |             keyPoints[i].descriptor = (float *) malloc(sizeof(float) * d * d * n);
 938 |             
 939 |             calculateDescriptorOne(gaussPyramid[octave][octaveLayer],
 940 |                                    x, y,
 941 |                                    angle, size,
 942 |                                    d, n,
 943 |                                    keyPoints[i].descriptor);
 944 |         }
 945 |     }
 946 | }
 947 | 
 948 | template<class T>
 949 | void deleteMatPyramid(vector<vector<SiftySifty::Mat<T> * >> &pyramid) {
 950 |     auto i1 = pyramid.begin();
 951 |     
 952 |     for (; i1 < pyramid.end(); ++i1) {
 953 |         auto i2 = (*i1).begin();
 954 |         
 955 |         for (; i2 < (*i1).end(); ++i2) {
 956 |             deleteMat<T>(*i2);
 957 |         }
 958 |         
 959 |         (*i1).clear();
 960 |     }
 961 |     
 962 |     pyramid.clear();
 963 | }
 964 | 
 965 | void sift(uint8_t *image, int width, int height,
 966 |           vector<SiftySifty::KeyPoint> &keyPoints,
 967 |           int octaveLayers,
 968 |           float sigma,
 969 |           float contrastThreshold,
 970 |           int edgeThreshold,
 971 |           bool doubleInitImage,
 972 |           int descriptorWidth,
 973 |           int descriptorHistBin) {
 974 |     
 975 |     Mat<int16_t> *base = initBaseImage(image, width, height, doubleInitImage, sigma, SIFT_IMAGE_SCALE_SHIFT);
 976 |    
 977 |     int octave = (int) (round(log(min(base->width, base->height)) / log(2) - 2));
 978 |     
 979 |     vector<vector<Mat<int16_t> *> > gaussPyramid = buildGaussPyramid(base, octave, octaveLayers, sigma);
 980 |     
 981 |     vector<vector<Mat<int16_t> *> > dogPyramid = buildDoGPyramid(gaussPyramid, octave, octaveLayers);
 982 |     
 983 |     findExtremaPoint(gaussPyramid,
 984 |                      dogPyramid,
 985 |                      keyPoints,
 986 |                      octave,
 987 |                      octaveLayers,
 988 |                      contrastThreshold,
 989 |                      edgeThreshold,
 990 |                      sigma);
 991 |     
 992 |     if (doubleInitImage) {
 993 |         resizeKeyPoints(keyPoints);
 994 |     }
 995 |     
 996 |     removeDoubleKeyPoints(keyPoints);
 997 | 
 998 |     /**calcaulate descip*/
 999 |     calculateDescriptor1(gaussPyramid, keyPoints, descriptorWidth, descriptorHistBin);
1000 |     
1001 |     deleteMatPyramid(gaussPyramid);
1002 |     deleteMatPyramid(dogPyramid);
1003 | }
1004 | 
1005 | void sift(uint8_t *image, int width, int height, vector<SiftySifty::KeyPoint> &keyPoints) {
1006 |     sift(image, width, height,
1007 |          keyPoints,
1008 |          SIFT_OCTAVE_LAYERS,
1009 |          SIFT_SIGMA,
1010 |          SIFT_CONTRAST_THRESHOLD,
1011 |          SIFT_EDGE_THESHOLD,
1012 |          SIFT_DOUBLE_INITED_IMAGE,
1013 |          SIFT_DESCRIPTOR_WIDTH,
1014 |          SIFT_DESCRIPTOR_HIST_BIN);
1015 | }
1016 | 
1017 | }
1018 | 
1019 | 
1020 | 
1021 | 
1022 | 
1023 | 
1024 | 
1025 | 
1026 | 
1027 | 
1028 | 
1029 | 
1030 | 


--------------------------------------------------------------------------------
/src/linearfilter.cpp:
--------------------------------------------------------------------------------
   1 | /**
   2 |  * Created by yanyuanchi on 2017/1/18.
   3 |  */
   4 | 
   5 | #include <cmath>
   6 | 
   7 | #include "utils.h"
   8 | #include "linearfilter.h"
   9 | 
  10 | namespace SiftySifty {
  11 | void linearFilterHorizonByKernel7(uint8_t *src,
  12 |                                   uint8_t *dst,
  13 |                                   int width,
  14 |                                   int height,
  15 |                                   int (*mult)[256],
  16 |                                   int delta,
  17 |                                   int shift) {
  18 |     
  19 |     int maxThreadNum = getHardwareCPUNum();
  20 |     int threadIndex = 0;
  21 |     
  22 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
  23 | 
  24 | #pragma omp parallel for private(threadIndex)
  25 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
  26 |         int start = threadIndex * stride;
  27 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
  28 |         
  29 |         uint8_t *srcData = src + start * (width + 6);
  30 |         uint8_t *dstData = dst + start * width;
  31 |         
  32 |         uint8_t *realSrcData;
  33 |         
  34 |         int sum;
  35 |         
  36 |         for (int y = start; y < end; ++y) {
  37 |             for (int x = 0; x < width; ++x) {
  38 |                 realSrcData = srcData + x;
  39 |                 sum = delta;
  40 |                 
  41 |                 sum += mult[0][realSrcData[0]];
  42 |                 sum += mult[1][realSrcData[1]];
  43 |                 sum += mult[2][realSrcData[2]];
  44 |                 sum += mult[3][realSrcData[3]];
  45 |                 sum += mult[4][realSrcData[4]];
  46 |                 sum += mult[5][realSrcData[5]];
  47 |                 sum += mult[6][realSrcData[6]];
  48 |                 
  49 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
  50 |             }
  51 |             
  52 |             srcData += (width + 6);
  53 |             dstData += width;
  54 |         }
  55 |     }
  56 | }
  57 | 
  58 | void linearFilterHorizonByKernel9(uint8_t *src,
  59 |                                   uint8_t *dst,
  60 |                                   int width,
  61 |                                   int height,
  62 |                                   int (*mult)[256],
  63 |                                   int delta,
  64 |                                   int shift) {
  65 |     int maxThreadNum = getHardwareCPUNum();
  66 |     int threadIndex = 0;
  67 |     
  68 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
  69 | 
  70 | #pragma omp parallel for private(threadIndex)
  71 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
  72 |         int start = threadIndex * stride;
  73 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
  74 |         
  75 |         uint8_t *srcData = src + start * (width + 8);
  76 |         uint8_t *dstData = dst + start * width;
  77 |         
  78 |         uint8_t *realSrcData;
  79 |         
  80 |         int sum;
  81 |         
  82 |         for (int y = start; y < end; ++y) {
  83 |             for (int x = 0; x < width; ++x) {
  84 |                 realSrcData = srcData + x;
  85 |                 sum = delta;
  86 |                 
  87 |                 sum += mult[0][realSrcData[0]];
  88 |                 sum += mult[1][realSrcData[1]];
  89 |                 sum += mult[2][realSrcData[2]];
  90 |                 sum += mult[3][realSrcData[3]];
  91 |                 sum += mult[4][realSrcData[4]];
  92 |                 sum += mult[5][realSrcData[5]];
  93 |                 sum += mult[6][realSrcData[6]];
  94 |                 sum += mult[7][realSrcData[7]];
  95 |                 sum += mult[8][realSrcData[8]];
  96 |                 
  97 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
  98 |             }
  99 |             
 100 |             srcData += (width + 8);
 101 |             dstData += width;
 102 |         }
 103 |     }
 104 | }
 105 | 
 106 | void linearFilterHorizonByKernel11(uint8_t *src,
 107 |                                    uint8_t *dst,
 108 |                                    int width,
 109 |                                    int height,
 110 |                                    int (*mult)[256],
 111 |                                    int delta,
 112 |                                    int shift) {
 113 |     int maxThreadNum = getHardwareCPUNum();
 114 |     int threadIndex = 0;
 115 |     
 116 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
 117 | 
 118 | #pragma omp parallel for private(threadIndex)
 119 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 120 |         int start = threadIndex * stride;
 121 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
 122 |         
 123 |         uint8_t *srcData = src + start * (width + 10);
 124 |         uint8_t *dstData = dst + start * width;
 125 |         
 126 |         uint8_t *realSrcData;
 127 |         
 128 |         int sum;
 129 |         
 130 |         for (int y = start; y < end; ++y) {
 131 |             for (int x = 0; x < width; ++x) {
 132 |                 realSrcData = srcData + x;
 133 |                 sum = delta;
 134 |                 
 135 |                 sum += mult[0][realSrcData[0]];
 136 |                 sum += mult[1][realSrcData[1]];
 137 |                 sum += mult[2][realSrcData[2]];
 138 |                 sum += mult[3][realSrcData[3]];
 139 |                 sum += mult[4][realSrcData[4]];
 140 |                 sum += mult[5][realSrcData[5]];
 141 |                 sum += mult[6][realSrcData[6]];
 142 |                 sum += mult[7][realSrcData[7]];
 143 |                 sum += mult[8][realSrcData[8]];
 144 |                 sum += mult[9][realSrcData[9]];
 145 |                 sum += mult[10][realSrcData[10]];
 146 |                 
 147 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
 148 |             }
 149 |             
 150 |             srcData += (width + 10);
 151 |             dstData += width;
 152 |         }
 153 |     }
 154 | }
 155 | 
 156 | 
 157 | void linearFilterHorizonByKernel13(uint8_t *src,
 158 |                                    uint8_t *dst,
 159 |                                    int width,
 160 |                                    int height,
 161 |                                    int (*mult)[256],
 162 |                                    int delta,
 163 |                                    int shift) {
 164 |     int maxThreadNum = getHardwareCPUNum();
 165 |     int threadIndex = 0;
 166 |     
 167 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
 168 | 
 169 | #pragma omp parallel for private(threadIndex)
 170 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 171 |         int start = threadIndex * stride;
 172 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
 173 |         
 174 |         uint8_t *srcData = src + start * (width + 12);
 175 |         uint8_t *dstData = dst + start * width;
 176 |         
 177 |         uint8_t *realSrcData;
 178 |         
 179 |         int sum;
 180 |         
 181 |         for (int y = start; y < end; ++y) {
 182 |             for (int x = 0; x < width; ++x) {
 183 |                 realSrcData = srcData + x;
 184 |                 sum = delta;
 185 |                 
 186 |                 sum += mult[0][realSrcData[0]];
 187 |                 sum += mult[1][realSrcData[1]];
 188 |                 sum += mult[2][realSrcData[2]];
 189 |                 sum += mult[3][realSrcData[3]];
 190 |                 sum += mult[4][realSrcData[4]];
 191 |                 sum += mult[5][realSrcData[5]];
 192 |                 sum += mult[6][realSrcData[6]];
 193 |                 sum += mult[7][realSrcData[7]];
 194 |                 sum += mult[8][realSrcData[8]];
 195 |                 sum += mult[9][realSrcData[9]];
 196 |                 sum += mult[10][realSrcData[10]];
 197 |                 sum += mult[11][realSrcData[11]];
 198 |                 sum += mult[12][realSrcData[12]];
 199 |                 
 200 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
 201 |             }
 202 |             
 203 |             srcData += (width + 12);
 204 |             dstData += width;
 205 |         }
 206 |     }
 207 | }
 208 | 
 209 | void linearFilterHorizonByKernel15(uint8_t *src,
 210 |                                    uint8_t *dst,
 211 |                                    int width,
 212 |                                    int height,
 213 |                                    int (*mult)[256],
 214 |                                    int delta,
 215 |                                    int shift) {
 216 |     int maxThreadNum = getHardwareCPUNum();
 217 |     int threadIndex = 0;
 218 |     
 219 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
 220 | 
 221 | #pragma omp parallel for private(threadIndex)
 222 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 223 |         int start = threadIndex * stride;
 224 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
 225 |         
 226 |         uint8_t *srcData = src + start * (width + 14);
 227 |         uint8_t *dstData = dst + start * width;
 228 |         
 229 |         uint8_t *realSrcData;
 230 |         
 231 |         int sum;
 232 |         
 233 |         for (int y = start; y < end; ++y) {
 234 |             for (int x = 0; x < width; ++x) {
 235 |                 realSrcData = srcData + x;
 236 |                 sum = delta;
 237 |                 
 238 |                 sum += mult[0][realSrcData[0]];
 239 |                 sum += mult[1][realSrcData[1]];
 240 |                 sum += mult[2][realSrcData[2]];
 241 |                 sum += mult[3][realSrcData[3]];
 242 |                 sum += mult[4][realSrcData[4]];
 243 |                 sum += mult[5][realSrcData[5]];
 244 |                 sum += mult[6][realSrcData[6]];
 245 |                 sum += mult[7][realSrcData[7]];
 246 |                 sum += mult[8][realSrcData[8]];
 247 |                 sum += mult[9][realSrcData[9]];
 248 |                 sum += mult[10][realSrcData[10]];
 249 |                 sum += mult[11][realSrcData[11]];
 250 |                 sum += mult[12][realSrcData[12]];
 251 |                 sum += mult[13][realSrcData[13]];
 252 |                 sum += mult[14][realSrcData[14]];
 253 |                 
 254 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
 255 |             }
 256 |             
 257 |             srcData += (width + 14);
 258 |             dstData += width;
 259 |         }
 260 |     }
 261 | }
 262 | 
 263 | void linearFilterHorizonByKernel17(uint8_t *src,
 264 |                                    uint8_t *dst,
 265 |                                    int width,
 266 |                                    int height,
 267 |                                    int (*mult)[256],
 268 |                                    int delta,
 269 |                                    int shift) {
 270 |     int maxThreadNum = getHardwareCPUNum();
 271 |     int threadIndex = 0;
 272 |     
 273 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
 274 | 
 275 | #pragma omp parallel for private(threadIndex)
 276 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 277 |         int start = threadIndex * stride;
 278 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
 279 |         
 280 |         uint8_t *srcData = src + start * (width + 16);
 281 |         uint8_t *dstData = dst + start * width;
 282 |         
 283 |         uint8_t *realSrcData;
 284 |         
 285 |         int sum;
 286 |         
 287 |         for (int y = start; y < end; ++y) {
 288 |             for (int x = 0; x < width; ++x) {
 289 |                 realSrcData = srcData + x;
 290 |                 sum = delta;
 291 |                 
 292 |                 sum += mult[0][realSrcData[0]];
 293 |                 sum += mult[1][realSrcData[1]];
 294 |                 sum += mult[2][realSrcData[2]];
 295 |                 sum += mult[3][realSrcData[3]];
 296 |                 sum += mult[4][realSrcData[4]];
 297 |                 sum += mult[5][realSrcData[5]];
 298 |                 sum += mult[6][realSrcData[6]];
 299 |                 sum += mult[7][realSrcData[7]];
 300 |                 sum += mult[8][realSrcData[8]];
 301 |                 sum += mult[9][realSrcData[9]];
 302 |                 sum += mult[10][realSrcData[10]];
 303 |                 sum += mult[11][realSrcData[11]];
 304 |                 sum += mult[12][realSrcData[12]];
 305 |                 sum += mult[13][realSrcData[13]];
 306 |                 sum += mult[14][realSrcData[14]];
 307 |                 sum += mult[15][realSrcData[15]];
 308 |                 sum += mult[16][realSrcData[16]];
 309 |                 
 310 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
 311 |             }
 312 |             
 313 |             srcData += (width + 16);
 314 |             dstData += width;
 315 |         }
 316 |     }
 317 | }
 318 | 
 319 | void linearFilterHorizonByKernel19(uint8_t *src,
 320 |                                    uint8_t *dst,
 321 |                                    int width,
 322 |                                    int height,
 323 |                                    int (*mult)[256],
 324 |                                    int delta,
 325 |                                    int shift) {
 326 |     int maxThreadNum = getHardwareCPUNum();
 327 |     int threadIndex = 0;
 328 |     
 329 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
 330 | 
 331 | #pragma omp parallel for private(threadIndex)
 332 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 333 |         int start = threadIndex * stride;
 334 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
 335 |         
 336 |         uint8_t *srcData = src + start * (width + 18);
 337 |         uint8_t *dstData = dst + start * width;
 338 |         
 339 |         uint8_t *realSrcData;
 340 |         
 341 |         int sum;
 342 |         
 343 |         for (int y = start; y < end; ++y) {
 344 |             for (int x = 0; x < width; ++x) {
 345 |                 realSrcData = srcData + x;
 346 |                 sum = delta;
 347 |                 
 348 |                 sum += mult[0][realSrcData[0]];
 349 |                 sum += mult[1][realSrcData[1]];
 350 |                 sum += mult[2][realSrcData[2]];
 351 |                 sum += mult[3][realSrcData[3]];
 352 |                 sum += mult[4][realSrcData[4]];
 353 |                 sum += mult[5][realSrcData[5]];
 354 |                 sum += mult[6][realSrcData[6]];
 355 |                 sum += mult[7][realSrcData[7]];
 356 |                 sum += mult[8][realSrcData[8]];
 357 |                 sum += mult[9][realSrcData[9]];
 358 |                 sum += mult[10][realSrcData[10]];
 359 |                 sum += mult[11][realSrcData[11]];
 360 |                 sum += mult[12][realSrcData[12]];
 361 |                 sum += mult[13][realSrcData[13]];
 362 |                 sum += mult[14][realSrcData[14]];
 363 |                 sum += mult[15][realSrcData[15]];
 364 |                 sum += mult[16][realSrcData[16]];
 365 |                 sum += mult[17][realSrcData[17]];
 366 |                 sum += mult[18][realSrcData[18]];
 367 |                 
 368 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
 369 |             }
 370 |             
 371 |             srcData += (width + 18);
 372 |             dstData += width;
 373 |         }
 374 |     }
 375 | }
 376 | 
 377 | void linearFilterHorizonByKernel(uint8_t *src,
 378 |                                  uint8_t *dst,
 379 |                                  int width,
 380 |                                  int height,
 381 |                                  int (*mult)[256],
 382 |                                  int delta,
 383 |                                  int shift,
 384 |                                  int size) {
 385 |     int radius = (size - 1) / 2;
 386 |     
 387 |     int maxThreadNum = getHardwareCPUNum();
 388 |     int threadIndex = 0;
 389 |     
 390 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
 391 | 
 392 | #pragma omp parallel for private(threadIndex)
 393 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 394 |         int start = threadIndex * stride;
 395 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
 396 |         
 397 |         uint8_t *srcData = src + start * (width + radius + radius);
 398 |         uint8_t *dstData = dst + start * width;
 399 |         
 400 |         uint8_t *realSrcData;
 401 |         
 402 |         int sum;
 403 |         
 404 |         for (int y = start; y < end; ++y) {
 405 |             for (int x = 0; x < width; ++x) {
 406 |                 realSrcData = srcData + x;
 407 |                 sum = delta;
 408 |                 
 409 |                 for (int k = 0; k < size; ++k) {
 410 |                     sum += mult[k][realSrcData[k]];
 411 |                 }
 412 |                 
 413 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
 414 |             }
 415 |             
 416 |             srcData += width + radius + radius;
 417 |             dstData += width;
 418 |         }
 419 |     }
 420 | }
 421 | 
 422 | void linearFilterHorizon(uint8_t *src,
 423 |                          uint8_t *dst,
 424 |                          int width,
 425 |                          int height,
 426 |                          int (*mult)[256],
 427 |                          int delta,
 428 |                          int shift,
 429 |                          int size) {
 430 |     int radius = (size - 1) / 2;
 431 |     
 432 |     uint8_t *tmp = new uint8_t[(width + radius + radius) * height];
 433 |     
 434 |     /**
 435 |      * --------------------------------
 436 |      * |       |              |       |
 437 |      * |radius |              |radius |
 438 |      * |       |              |       |
 439 |      * |       |              |       |
 440 |      * |       |              |       |
 441 |      * |       |              |       |
 442 |      * --------------------------------
 443 |      */
 444 |     uint8_t *srcOffset = src;
 445 |     uint8_t *tmpOffset = tmp;
 446 |     
 447 |     for (int y = 0; y < height; ++y) {
 448 |         std::fill(tmpOffset,
 449 |                   tmpOffset + radius,
 450 |                   srcOffset[0]);
 451 |         
 452 |         memcpy(tmpOffset + radius,
 453 |                srcOffset,
 454 |                sizeof(uint8_t) * width);
 455 |         
 456 |         std::fill(tmpOffset + radius + width,
 457 |                   tmpOffset + +radius + radius + width,
 458 |                   srcOffset[width - 1]);
 459 |         
 460 |         srcOffset += width;
 461 |         tmpOffset += width + radius + radius;
 462 |     }
 463 |     
 464 |     if (7 == size) {
 465 |         linearFilterHorizonByKernel7(tmp, dst, width, height, mult, delta, shift);
 466 |     } else if (9 == size) {
 467 |         linearFilterHorizonByKernel9(tmp, dst, width, height, mult, delta, shift);
 468 |     } else if (11 == size) {
 469 |         linearFilterHorizonByKernel11(tmp, dst, width, height, mult, delta, shift);
 470 |     } else if (13 == size) {
 471 |         linearFilterHorizonByKernel13(tmp, dst, width, height, mult, delta, shift);
 472 |     } else if (15 == size) {
 473 |         linearFilterHorizonByKernel15(tmp, dst, width, height, mult, delta, shift);
 474 |     } else if (17 == size) {
 475 |         linearFilterHorizonByKernel17(tmp, dst, width, height, mult, delta, shift);
 476 |     } else if (19 == size) {
 477 |         linearFilterHorizonByKernel19(tmp, dst, width, height, mult, delta, shift);
 478 |     } else {
 479 |         linearFilterHorizonByKernel(tmp, dst, width, height, mult, delta, shift, size);
 480 |     }
 481 |     
 482 |     delete[] tmp;
 483 | }
 484 | 
 485 | 
 486 | void linearFilterVerticalByKernel7(uint8_t *src,
 487 |                                    uint8_t *dst,
 488 |                                    int width,
 489 |                                    int height,
 490 |                                    int (*mult)[256],
 491 |                                    int delta,
 492 |                                    int shift) {
 493 |     int maxThreadNum = getHardwareCPUNum();
 494 |     int threadIndex = 0;
 495 |     
 496 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
 497 | 
 498 | #pragma omp parallel for private(threadIndex)
 499 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 500 |         int start = threadIndex * stride;
 501 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
 502 |         
 503 |         uint8_t *srcData = src + start * width;
 504 |         uint8_t *dstData = dst + start * width;
 505 |         
 506 |         uint8_t *realSrcData;
 507 |         
 508 |         int sum;
 509 |         
 510 |         for (int y = start; y < end; ++y) {
 511 |             for (int x = 0; x < width; ++x) {
 512 |                 realSrcData = srcData + x;
 513 |                 
 514 |                 sum = delta;
 515 |                 
 516 |                 sum += mult[0][realSrcData[0]];
 517 |                 
 518 |                 realSrcData += width;
 519 |                 sum += mult[1][realSrcData[0]];
 520 |                 
 521 |                 realSrcData += width;
 522 |                 sum += mult[2][realSrcData[0]];
 523 |                 
 524 |                 realSrcData += width;
 525 |                 sum += mult[3][realSrcData[0]];
 526 |                 
 527 |                 realSrcData += width;
 528 |                 sum += mult[4][realSrcData[0]];
 529 |                 
 530 |                 realSrcData += width;
 531 |                 sum += mult[5][realSrcData[0]];
 532 |                 
 533 |                 realSrcData += width;
 534 |                 sum += mult[6][realSrcData[0]];
 535 |                 
 536 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
 537 |             }
 538 |             
 539 |             srcData += width;
 540 |             dstData += width;
 541 |         }
 542 |     }
 543 | }
 544 | 
 545 | void linearFilterVerticalByKernel9(uint8_t *src,
 546 |                                    uint8_t *dst,
 547 |                                    int width,
 548 |                                    int height,
 549 |                                    int (*mult)[256],
 550 |                                    int delta,
 551 |                                    int shift) {
 552 |     int maxThreadNum = getHardwareCPUNum();
 553 |     int threadIndex = 0;
 554 |     
 555 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
 556 | 
 557 | #pragma omp parallel for private(threadIndex)
 558 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 559 |         int start = threadIndex * stride;
 560 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
 561 |         
 562 |         uint8_t *srcData = src + start * width;
 563 |         uint8_t *dstData = dst + start * width;
 564 |         
 565 |         uint8_t *realSrcData;
 566 |         
 567 |         int sum;
 568 |         
 569 |         for (int y = start; y < end; ++y) {
 570 |             for (int x = 0; x < width; ++x) {
 571 |                 realSrcData = srcData + x;
 572 |                 sum = delta;
 573 |                 
 574 |                 sum += mult[0][realSrcData[0]];
 575 |                 
 576 |                 realSrcData += width;
 577 |                 sum += mult[1][realSrcData[0]];
 578 |                 
 579 |                 realSrcData += width;
 580 |                 sum += mult[2][realSrcData[0]];
 581 |                 
 582 |                 realSrcData += width;
 583 |                 sum += mult[3][realSrcData[0]];
 584 |                 
 585 |                 realSrcData += width;
 586 |                 sum += mult[4][realSrcData[0]];
 587 |                 
 588 |                 realSrcData += width;
 589 |                 sum += mult[5][realSrcData[0]];
 590 |                 
 591 |                 realSrcData += width;
 592 |                 sum += mult[6][realSrcData[0]];
 593 |                 
 594 |                 realSrcData += width;
 595 |                 sum += mult[7][realSrcData[0]];
 596 |                 
 597 |                 realSrcData += width;
 598 |                 sum += mult[8][realSrcData[0]];
 599 |                 
 600 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
 601 |             }
 602 |             
 603 |             srcData += width;
 604 |             dstData += width;
 605 |         }
 606 |     }
 607 | }
 608 | 
 609 | void linearFilterVerticalByKernel11(uint8_t *src,
 610 |                                     uint8_t *dst,
 611 |                                     int width,
 612 |                                     int height,
 613 |                                     int (*mult)[256],
 614 |                                     int delta,
 615 |                                     int shift) {
 616 |     int maxThreadNum = getHardwareCPUNum();
 617 |     int threadIndex = 0;
 618 |     
 619 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
 620 | 
 621 | #pragma omp parallel for private(threadIndex)
 622 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 623 |         int start = threadIndex * stride;
 624 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
 625 |         
 626 |         uint8_t *srcData = src + start * width;
 627 |         uint8_t *dstData = dst + start * width;
 628 |         
 629 |         uint8_t *realSrcData;
 630 |         
 631 |         int sum;
 632 |         
 633 |         for (int y = start; y < end; ++y) {
 634 |             for (int x = 0; x < width; ++x) {
 635 |                 realSrcData = srcData + x;
 636 |                 sum = delta;
 637 |                 
 638 |                 sum += mult[0][realSrcData[0]];
 639 |                 
 640 |                 realSrcData += width;
 641 |                 sum += mult[1][realSrcData[0]];
 642 |                 
 643 |                 realSrcData += width;
 644 |                 sum += mult[2][realSrcData[0]];
 645 |                 
 646 |                 realSrcData += width;
 647 |                 sum += mult[3][realSrcData[0]];
 648 |                 
 649 |                 realSrcData += width;
 650 |                 sum += mult[4][realSrcData[0]];
 651 |                 
 652 |                 realSrcData += width;
 653 |                 sum += mult[5][realSrcData[0]];
 654 |                 
 655 |                 realSrcData += width;
 656 |                 sum += mult[6][realSrcData[0]];
 657 |                 
 658 |                 realSrcData += width;
 659 |                 sum += mult[7][realSrcData[0]];
 660 |                 
 661 |                 realSrcData += width;
 662 |                 sum += mult[8][realSrcData[0]];
 663 |                 
 664 |                 realSrcData += width;
 665 |                 sum += mult[9][realSrcData[0]];
 666 |                 
 667 |                 realSrcData += width;
 668 |                 sum += mult[10][realSrcData[0]];
 669 |                 
 670 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
 671 |             }
 672 |             
 673 |             srcData += width;
 674 |             dstData += width;
 675 |         }
 676 |     }
 677 | }
 678 | 
 679 | void linearFilterVerticalByKernel13(uint8_t *src,
 680 |                                     uint8_t *dst,
 681 |                                     int width,
 682 |                                     int height,
 683 |                                     int (*mult)[256],
 684 |                                     int delta,
 685 |                                     int shift) {
 686 |     int maxThreadNum = getHardwareCPUNum();
 687 |     int threadIndex = 0;
 688 |     
 689 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
 690 | 
 691 | #pragma omp parallel for private(threadIndex)
 692 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 693 |         int start = threadIndex * stride;
 694 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
 695 |         
 696 |         uint8_t *srcData = src + start * width;
 697 |         uint8_t *dstData = dst + start * width;
 698 |         
 699 |         uint8_t *realSrcData;
 700 |         
 701 |         int sum;
 702 |         
 703 |         for (int y = start; y < end; ++y) {
 704 |             for (int x = 0; x < width; ++x) {
 705 |                 realSrcData = srcData + x;
 706 |                 sum = delta;
 707 |                 
 708 |                 sum += mult[0][realSrcData[0]];
 709 |                 
 710 |                 realSrcData += width;
 711 |                 sum += mult[1][realSrcData[0]];
 712 |                 
 713 |                 realSrcData += width;
 714 |                 sum += mult[2][realSrcData[0]];
 715 |                 
 716 |                 realSrcData += width;
 717 |                 sum += mult[3][realSrcData[0]];
 718 |                 
 719 |                 realSrcData += width;
 720 |                 sum += mult[4][realSrcData[0]];
 721 |                 
 722 |                 realSrcData += width;
 723 |                 sum += mult[5][realSrcData[0]];
 724 |                 
 725 |                 realSrcData += width;
 726 |                 sum += mult[6][realSrcData[0]];
 727 |                 
 728 |                 realSrcData += width;
 729 |                 sum += mult[7][realSrcData[0]];
 730 |                 
 731 |                 realSrcData += width;
 732 |                 sum += mult[8][realSrcData[0]];
 733 |                 
 734 |                 realSrcData += width;
 735 |                 sum += mult[9][realSrcData[0]];
 736 |                 
 737 |                 realSrcData += width;
 738 |                 sum += mult[10][realSrcData[0]];
 739 |                 
 740 |                 realSrcData += width;
 741 |                 sum += mult[11][realSrcData[0]];
 742 |                 
 743 |                 realSrcData += width;
 744 |                 sum += mult[12][realSrcData[0]];
 745 |                 
 746 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
 747 |             }
 748 |             
 749 |             srcData += width;
 750 |             dstData += width;
 751 |         }
 752 |     }
 753 | }
 754 | 
 755 | void linearFilterVerticalByKernel15(uint8_t *src,
 756 |                                     uint8_t *dst,
 757 |                                     int width,
 758 |                                     int height,
 759 |                                     int (*mult)[256],
 760 |                                     int delta,
 761 |                                     int shift) {
 762 |     int maxThreadNum = getHardwareCPUNum();
 763 |     int threadIndex = 0;
 764 |     
 765 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
 766 | 
 767 | #pragma omp parallel for private(threadIndex)
 768 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 769 |         int start = threadIndex * stride;
 770 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
 771 |         
 772 |         uint8_t *srcData = src + start * width;
 773 |         uint8_t *dstData = dst + start * width;
 774 |         
 775 |         uint8_t *realSrcData;
 776 |         
 777 |         int sum;
 778 |         
 779 |         for (int y = start; y < end; ++y) {
 780 |             for (int x = 0; x < width; ++x) {
 781 |                 realSrcData = srcData + x;
 782 |                 sum = delta;
 783 |                 
 784 |                 sum += mult[0][realSrcData[0]];
 785 |                 
 786 |                 realSrcData += width;
 787 |                 sum += mult[1][realSrcData[0]];
 788 |                 
 789 |                 realSrcData += width;
 790 |                 sum += mult[2][realSrcData[0]];
 791 |                 
 792 |                 realSrcData += width;
 793 |                 sum += mult[3][realSrcData[0]];
 794 |                 
 795 |                 realSrcData += width;
 796 |                 sum += mult[4][realSrcData[0]];
 797 |                 
 798 |                 realSrcData += width;
 799 |                 sum += mult[5][realSrcData[0]];
 800 |                 
 801 |                 realSrcData += width;
 802 |                 sum += mult[6][realSrcData[0]];
 803 |                 
 804 |                 realSrcData += width;
 805 |                 sum += mult[7][realSrcData[0]];
 806 |                 
 807 |                 realSrcData += width;
 808 |                 sum += mult[8][realSrcData[0]];
 809 |                 
 810 |                 realSrcData += width;
 811 |                 sum += mult[9][realSrcData[0]];
 812 |                 
 813 |                 realSrcData += width;
 814 |                 sum += mult[10][realSrcData[0]];
 815 |                 
 816 |                 realSrcData += width;
 817 |                 sum += mult[11][realSrcData[0]];
 818 |                 
 819 |                 realSrcData += width;
 820 |                 sum += mult[12][realSrcData[0]];
 821 |                 
 822 |                 realSrcData += width;
 823 |                 sum += mult[13][realSrcData[0]];
 824 |                 
 825 |                 realSrcData += width;
 826 |                 sum += mult[14][realSrcData[0]];
 827 |                 
 828 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
 829 |             }
 830 |             
 831 |             srcData += width;
 832 |             dstData += width;
 833 |         }
 834 |     }
 835 | }
 836 | 
 837 | void linearFilterVerticalByKernel17(uint8_t *src,
 838 |                                     uint8_t *dst,
 839 |                                     int width,
 840 |                                     int height,
 841 |                                     int (*mult)[256],
 842 |                                     int delta,
 843 |                                     int shift) {
 844 |     int maxThreadNum = getHardwareCPUNum();
 845 |     int threadIndex = 0;
 846 |     
 847 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
 848 | 
 849 | #pragma omp parallel for private(threadIndex)
 850 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 851 |         int start = threadIndex * stride;
 852 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
 853 |         
 854 |         uint8_t *srcData = src + start * width;
 855 |         uint8_t *dstData = dst + start * width;
 856 |         
 857 |         uint8_t *realSrcData;
 858 |         
 859 |         int sum;
 860 |         
 861 |         for (int y = start; y < end; ++y) {
 862 |             for (int x = 0; x < width; ++x) {
 863 |                 realSrcData = srcData + x;
 864 |                 sum = delta;
 865 |                 
 866 |                 sum += mult[0][realSrcData[0]];
 867 |                 
 868 |                 realSrcData += width;
 869 |                 sum += mult[1][realSrcData[0]];
 870 |                 
 871 |                 realSrcData += width;
 872 |                 sum += mult[2][realSrcData[0]];
 873 |                 
 874 |                 realSrcData += width;
 875 |                 sum += mult[3][realSrcData[0]];
 876 |                 
 877 |                 realSrcData += width;
 878 |                 sum += mult[4][realSrcData[0]];
 879 |                 
 880 |                 realSrcData += width;
 881 |                 sum += mult[5][realSrcData[0]];
 882 |                 
 883 |                 realSrcData += width;
 884 |                 sum += mult[6][realSrcData[0]];
 885 |                 
 886 |                 realSrcData += width;
 887 |                 sum += mult[7][realSrcData[0]];
 888 |                 
 889 |                 realSrcData += width;
 890 |                 sum += mult[8][realSrcData[0]];
 891 |                 
 892 |                 realSrcData += width;
 893 |                 sum += mult[9][realSrcData[0]];
 894 |                 
 895 |                 realSrcData += width;
 896 |                 sum += mult[10][realSrcData[0]];
 897 |                 
 898 |                 realSrcData += width;
 899 |                 sum += mult[11][realSrcData[0]];
 900 |                 
 901 |                 realSrcData += width;
 902 |                 sum += mult[12][realSrcData[0]];
 903 |                 
 904 |                 realSrcData += width;
 905 |                 sum += mult[13][realSrcData[0]];
 906 |                 
 907 |                 realSrcData += width;
 908 |                 sum += mult[14][realSrcData[0]];
 909 |                 
 910 |                 realSrcData += width;
 911 |                 sum += mult[15][realSrcData[0]];
 912 |                 
 913 |                 realSrcData += width;
 914 |                 sum += mult[16][realSrcData[0]];
 915 |                 
 916 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
 917 |             }
 918 |             
 919 |             srcData += width;
 920 |             dstData += width;
 921 |         }
 922 |     }
 923 | }
 924 | 
 925 | void linearFilterVerticalByKernel19(uint8_t *src,
 926 |                                     uint8_t *dst,
 927 |                                     int width,
 928 |                                     int height,
 929 |                                     int (*mult)[256],
 930 |                                     int delta,
 931 |                                     int shift) {
 932 |     int maxThreadNum = getHardwareCPUNum();
 933 |     int threadIndex = 0;
 934 |     
 935 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
 936 | 
 937 | #pragma omp parallel for private(threadIndex)
 938 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
 939 |         int start = threadIndex * stride;
 940 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
 941 |         
 942 |         uint8_t *srcData = src + start * width;
 943 |         uint8_t *dstData = dst + start * width;
 944 |         
 945 |         uint8_t *realSrcData;
 946 |         
 947 |         int sum;
 948 |         
 949 |         for (int y = start; y < end; ++y) {
 950 |             for (int x = 0; x < width; ++x) {
 951 |                 realSrcData = srcData + x;
 952 |                 sum = delta;
 953 |                 
 954 |                 sum += mult[0][realSrcData[0]];
 955 |                 
 956 |                 realSrcData += width;
 957 |                 sum += mult[1][realSrcData[0]];
 958 |                 
 959 |                 realSrcData += width;
 960 |                 sum += mult[2][realSrcData[0]];
 961 |                 
 962 |                 realSrcData += width;
 963 |                 sum += mult[3][realSrcData[0]];
 964 |                 
 965 |                 realSrcData += width;
 966 |                 sum += mult[4][realSrcData[0]];
 967 |                 
 968 |                 realSrcData += width;
 969 |                 sum += mult[5][realSrcData[0]];
 970 |                 
 971 |                 realSrcData += width;
 972 |                 sum += mult[6][realSrcData[0]];
 973 |                 
 974 |                 realSrcData += width;
 975 |                 sum += mult[7][realSrcData[0]];
 976 |                 
 977 |                 realSrcData += width;
 978 |                 sum += mult[8][realSrcData[0]];
 979 |                 
 980 |                 realSrcData += width;
 981 |                 sum += mult[9][realSrcData[0]];
 982 |                 
 983 |                 realSrcData += width;
 984 |                 sum += mult[10][realSrcData[0]];
 985 |                 
 986 |                 realSrcData += width;
 987 |                 sum += mult[11][realSrcData[0]];
 988 |                 
 989 |                 realSrcData += width;
 990 |                 sum += mult[12][realSrcData[0]];
 991 |                 
 992 |                 realSrcData += width;
 993 |                 sum += mult[13][realSrcData[0]];
 994 |                 
 995 |                 realSrcData += width;
 996 |                 sum += mult[14][realSrcData[0]];
 997 |                 
 998 |                 realSrcData += width;
 999 |                 sum += mult[15][realSrcData[0]];
1000 |                 
1001 |                 realSrcData += width;
1002 |                 sum += mult[16][realSrcData[0]];
1003 |                 
1004 |                 realSrcData += width;
1005 |                 sum += mult[17][realSrcData[0]];
1006 |                 
1007 |                 realSrcData += width;
1008 |                 sum += mult[18][realSrcData[0]];
1009 |                 
1010 |                 dstData[x] = static_cast<uint8_t>((sum + delta) >> shift);
1011 |             }
1012 |             
1013 |             srcData += width;
1014 |             dstData += width;
1015 |         }
1016 |     }
1017 | }
1018 | 
1019 | void linearFilterVerticalByKernel(uint8_t *src,
1020 |                                   uint8_t *dst,
1021 |                                   int width,
1022 |                                   int height,
1023 |                                   int (*mult)[256],
1024 |                                   int delta,
1025 |                                   int shift,
1026 |                                   int size) {
1027 |     int maxThreadNum = getHardwareCPUNum();
1028 |     int threadIndex = 0;
1029 |     
1030 |     int stride = std::max(static_cast<int>(roundf(1.f * height / maxThreadNum)), 1);
1031 | 
1032 | #pragma omp parallel for private(threadIndex)
1033 |     for (threadIndex = 0; threadIndex < maxThreadNum; ++threadIndex) {
1034 |         int start = threadIndex * stride;
1035 |         int end = (threadIndex == (maxThreadNum - 1)) ? height : std::min(start + stride, height);
1036 |         
1037 |         uint8_t *srcData = src + start * width;
1038 |         uint8_t *dstData = dst + start * width;
1039 |         
1040 |         uint8_t *realSrcData;
1041 |         
1042 |         int *sum = (int *) malloc(sizeof(int) * width);
1043 |         
1044 |         for (int y = start; y < end; ++y) {
1045 |             std::fill(sum, sum + width, delta);
1046 |             
1047 |             realSrcData = srcData;
1048 |             
1049 |             for (int k = 0; k < size; ++k) {
1050 |                 for (int x = 0; x < width; ++x) {
1051 |                     sum[x] += mult[k][realSrcData[x]];
1052 |                 }
1053 |                 
1054 |                 realSrcData += width;
1055 |             }
1056 |             
1057 |             for (int x = 0; x < width; ++x) {
1058 |                 dstData[x] = static_cast<uint8_t>(((sum[x] + delta) >> shift));
1059 |             }
1060 |             
1061 |             srcData += width;
1062 |             dstData += width;
1063 |         }
1064 |         
1065 |         free(sum);
1066 |     }
1067 | }
1068 | 
1069 | void linearFilterVertical(uint8_t *src,
1070 |                           uint8_t *dst,
1071 |                           int width,
1072 |                           int height,
1073 |                           int (*mult)[256],
1074 |                           int delta,
1075 |                           int shift,
1076 |                           int size) {
1077 |     int radius = (size - 1) / 2;
1078 |     uint8_t *tmp = new uint8_t[width * (height + radius + radius)];
1079 |     
1080 |     memcpy(tmp + radius * width, src, sizeof(uint8_t) * width * height);
1081 |     
1082 |     for (int i = 0; i < radius; ++i) {
1083 |         memcpy(tmp + i * width, src, sizeof(uint8_t) * width);
1084 |         memcpy(tmp + (radius + height + i) * width, src + (height - 1) * width, sizeof(uint8_t) * width);
1085 |     }
1086 |     
1087 |     if (7 == size) {
1088 |         linearFilterVerticalByKernel7(tmp, dst, width, height, mult, delta, shift);
1089 |     } else if (9 == size) {
1090 |         linearFilterVerticalByKernel9(tmp, dst, width, height, mult, delta, shift);
1091 |     } else if (11 == size) {
1092 |         linearFilterVerticalByKernel11(tmp, dst, width, height, mult, delta, shift);
1093 |     } else if (13 == size) {
1094 |         linearFilterVerticalByKernel13(tmp, dst, width, height, mult, delta, shift);
1095 |     } else if (15 == size) {
1096 |         linearFilterVerticalByKernel15(tmp, dst, width, height, mult, delta, shift);
1097 |     } else if (17 == size) {
1098 |         linearFilterVerticalByKernel17(tmp, dst, width, height, mult, delta, shift);
1099 |     } else if (19 == size) {
1100 |         linearFilterVerticalByKernel19(tmp, dst, width, height, mult, delta, shift);
1101 |     } else {
1102 |         linearFilterVerticalByKernel(tmp, dst, width, height, mult, delta, shift, size);
1103 |     }
1104 |     
1105 |     delete[] tmp;
1106 | }
1107 |     
1108 | }
1109 | 
1110 | 
1111 | 
1112 | 
1113 | 
1114 | 
1115 | 
1116 | 
1117 | 
1118 | 
1119 | 
1120 | 
1121 | 
1122 | 
1123 | 
1124 | 
1125 | 
1126 | 
1127 | 
1128 | 
1129 | 
1130 | 
1131 | 
1132 | 
1133 | 
1134 | 
1135 | 
1136 | 


--------------------------------------------------------------------------------