├── README.MD
├── build.gradle
├── gradle
    └── wrapper
    │   ├── gradle-wrapper.jar
    │   └── gradle-wrapper.properties
├── gradlew
├── gradlew.bat
└── src
    ├── main
        └── java
        │   └── benchmark
        │       ├── BenchmarkOperations.java
        │       ├── ImageProcessing.java
        │       └── MatrixMultiplication.java
    └── test
        └── java
            └── benchmark
                ├── ImageProcessingTest.java
                └── MatrixMultiplicationTest.java


/README.MD:
--------------------------------------------------------------------------------
 1 | Proof-of-concept test using the new Vector API ([JEP 338](https://openjdk.java.net/jeps/338)). Vectorized code is compared against already optimized code from
 2 | [EJML](https://ejml.org) and [BoofCV](https://boofcv.org).
 3 | 
 4 | * [Matrix Multiplication](https://en.wikipedia.org/wiki/Matrix_multiplication) IKJ Order (double)
 5 | * [Image Convolution](https://boofcv.org/index.php?title=Example_Image_Blur) (float)
 6 | * [Image Thresholding](https://boofcv.org/index.php?title=Example_Thresholding) (unsigned byte)
 7 | 
 8 | To run the benchmark, just type the command below. The first time you run it there will be a lot of downloads. If you
 9 | don't have JDK 16 installed, it will be downloaded for you automatically. Once the actual benchmark starts running,
10 | it will take about 12 minutes to complete.
11 | ```bash
12 | ./gradlew runtimeBenchmark
13 | ```
14 | 
15 | If you load this up in your favorite IDE (in my case IntelliJ), you're highly likely to experience issues. This
16 | project uses a bleeding-edge version of Gradle with a bleeding-edge JDK and a new API.
17 | 
18 | # Learning About Vector API
19 | 
20 | * https://richardstartin.github.io/posts/vectorised-algorithms-in-java
21 | 
22 | 
23 | # Results
24 | 
25 | Setup
26 | * OpenJDK 64-Bit Server VM AdoptOpenJDK (build 16+36, mixed mode, sharing)
27 | * Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz
28 | * Ubuntu 18.04.5 LTS
29 | 
30 | Summary
31 | ```
32 | Operation                    | Data |     Size     |  Relative   |
33 |                              | Type |              | Performance |
34 | -----------------------------------------------------------------------------------------
35 | Matrix Mult IKJ Real         |   D  | Large Matrix |    1.84     | [1]
36 | Matrix Mult IKJ Real         |   D  | Small Matrix |     .86     | [2]
37 | Matrix Mult IKJ Complex      |   D  | Large Matrix |             | Vector code needed
38 | Matrix Mult IKJ Complex      |   D  | Small Matrix |             | Vector code needed
39 | Image 1D Conv                |   F  | Large kernel |    1.82     | 
40 | Image 1D Conv                |   F  | Small kernel |    1.86     |
41 | Image 1D Conv  BoofCV        |   F  | Small kernel |     .41     | [3] Compared to unrolled
42 | Image 1D Mean                |   F  |              |             | Vector code needed
43 | Image Threshold              |  U8  |              |    6.78     | [4]
44 | Image Histogram              |  U16 |              |             | Vector code needed
45 | YUV 420 888 to RGB           |   D  |              |             | Vector code needed
46 | Image Debayer                |   D  |              |             | Vector code needed
47 | ```
48 | Unless otherwise stated, relative performance is the baseline code's runtime divided by the vectorized code's runtime.
49 | Values > 1 mean the vectorized code was faster and values < 1 mean it was slower. In some cases unrolled code from
50 | EJML and BoofCV has been included to provide a point of comparison.
51 | 
52 | ```
53 | Benchmark                                       (kernelSize)  (size)  Mode  Cnt           Score           Error  Units
54 | BenchmarkOperations.convolve_horizontal                    5     N/A  avgt    5     8080015.121 ±    169251.559  ns/op
55 | BenchmarkOperations.convolve_horizontal                   31     N/A  avgt    5    24767084.561 ±    462767.053  ns/op
56 | BenchmarkOperations.convolve_horizontal_boofcv             5     N/A  avgt    5     1775128.816 ±     13315.269  ns/op
57 | BenchmarkOperations.convolve_horizontal_boofcv            31     N/A  avgt    5    24833110.727 ±    265814.061  ns/op
58 | BenchmarkOperations.convolve_horizontal_vector             5     N/A  avgt    5     4351633.285 ±     29120.843  ns/op
59 | BenchmarkOperations.convolve_horizontal_vector            31     N/A  avgt    5    13615354.944 ±    263422.696  ns/op
60 | BenchmarkOperations.image_threshold                      N/A     N/A  avgt    5      345424.878 ±      6195.410  ns/op
61 | BenchmarkOperations.image_threshold_vector_v1            N/A     N/A  avgt    5      580158.660 ±      8812.190  ns/op
62 | BenchmarkOperations.image_threshold_vector_v2            N/A     N/A  avgt    5       50925.242 ±      2032.203  ns/op
63 | BenchmarkOperations.matrix_mult                          N/A       4  avgt    5         104.410 ±         5.414  ns/op
64 | BenchmarkOperations.matrix_mult                          N/A    1000  avgt    5   606881005.900 ±   3875032.130  ns/op
65 | BenchmarkOperations.matrix_mult_complex                  N/A       4  avgt    5         202.456 ±         1.172  ns/op
66 | BenchmarkOperations.matrix_mult_complex                  N/A    1000  avgt    5  1616543112.600 ± 500480900.857  ns/op
67 | BenchmarkOperations.matrix_mult_ejml                     N/A       4  avgt    5          84.286 ±         2.964  ns/op
68 | BenchmarkOperations.matrix_mult_ejml                     N/A    1000  avgt    5   611016316.100 ±  13669452.444  ns/op
69 | BenchmarkOperations.matrix_mult_vectors                  N/A       4  avgt    5         121.820 ±         1.471  ns/op
70 | BenchmarkOperations.matrix_mult_vectors                  N/A    1000  avgt    5   329232594.200 ±  10054084.639  ns/op
71 | BenchmarkOperations.mean_horizontal                      N/A     N/A  avgt    5     2188929.877 ±     19603.001  ns/op
72 | ```
73 | 
74 | [1] I would expect a well-written C++ port of that same function to run about 2.5x faster than pure Java on large
75 | matrices. That's about the performance difference you get when you compare the top performing pure Java
76 | libraries against Eigen or LAPACK. The code used is designed for medium sized matrices.
77 | 
78 | [2] This result isn't surprising. Optimizing for small matrices requires very different approaches than large ones.
79 | One potential improvement for the Vector API would be to allow recycling of memory. More hand optimization of the
80 | loops could reduce the gap. While the current API is easy to use, it's clobbering the innermost loop with calls to new.
81 | That's a big no-no when writing high performance code. I could be wrong; maybe there's some specialized code that
82 | recognizes what's going on and recycles memory. Small matrix performance is critical in computer vision and signal
83 | processing.
84 | 
85 | [3] BoofCV includes code where, if the kernel is small, it will invoke code which is unrolled. This typically
86 | results in a massive speedup. I wish the JVM were better at recognizing when to unroll a loop, so I wouldn't
87 | need to write all this auto-generated code.
88 | 
89 | [4] Vector doesn't support unsigned bytes yet and the Vector implementation fails the unit test. Based on comments
90 | in the JDK, it looks like that will be added.
91 | 
92 | Author: Peter Abeles
93 | 
94 | https://twitter.com/NotSoOptimal


--------------------------------------------------------------------------------
/build.gradle:
--------------------------------------------------------------------------------
 1 | apply plugin: 'java-library'
 2 | 
 3 | group 'lessthanoptimal'
 4 | version '0.1'
 5 | 
 6 | java {
 7 |     withJavadocJar()
 8 |     withSourcesJar()
 9 |     toolchain { languageVersion = JavaLanguageVersion.of(16) }
10 | }
11 | 
12 | javadoc { configure(options) { enabled=false } }
13 | test { useJUnitPlatform() }
14 | 
15 | var incubatorArguments = ["--enable-preview","--add-modules", "jdk.incubator.vector"]
16 | 
17 | tasks.withType(AbstractCompile) { options.compilerArgs += incubatorArguments }
18 | tasks.withType(Test) { jvmArgs += incubatorArguments }
19 | 
20 | repositories {
21 |     mavenCentral()
22 |     mavenLocal()
23 | }
24 | 
25 | dependencies {
26 |     ['1.27'].each { String a ->
27 |         api('org.openjdk.jmh:jmh-core:' + a)
28 |         annotationProcessor 'org.openjdk.jmh:jmh-generator-annprocess:' + a
29 |     }
30 | 
31 |     ['ejml-core','ejml-ddense'].each { String a ->
32 |         api group: 'org.ejml', name: a, version: '0.40' }
33 | 
34 |     ['boofcv-ip'].each { String a ->
35 |         implementation group: 'org.boofcv', name: a, version: '0.36' }
36 | 
37 |     testImplementation( 'org.junit.jupiter:junit-jupiter-api:5.4.0')
38 |     testRuntimeOnly( 'org.junit.jupiter:junit-jupiter-engine:5.4.0')
39 | }
40 | 
41 | task runtimeBenchmark(type: JavaExec) {
42 |     dependsOn build
43 |     javaLauncher = javaToolchains.launcherFor { languageVersion = JavaLanguageVersion.of(16) }
44 |     jvmArgs += incubatorArguments
45 |     group = "Execution"
46 |     description = "Runs the benchmark code"
47 |     classpath = sourceSets.main.runtimeClasspath
48 |     main = "benchmark.BenchmarkOperations"
49 | }
50 | 
51 | 
52 | wrapper {
53 |     distributionType = Wrapper.DistributionType.BIN
54 |     gradleVersion = '7.0'
55 | }
56 | 


--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lessthanoptimal/VectorPerformance/50c1a39d63fcee82fa2b3a8e88704c45941882fd/gradle/wrapper/gradle-wrapper.jar


--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.0-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 | 


--------------------------------------------------------------------------------
/gradlew:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env sh
  2 | 
  3 | #
  4 | # Copyright 2015 the original author or authors.
  5 | #
  6 | # Licensed under the Apache License, Version 2.0 (the "License");
  7 | # you may not use this file except in compliance with the License.
  8 | # You may obtain a copy of the License at
  9 | #
 10 | #      https://www.apache.org/licenses/LICENSE-2.0
 11 | #
 12 | # Unless required by applicable law or agreed to in writing, software
 13 | # distributed under the License is distributed on an "AS IS" BASIS,
 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | # See the License for the specific language governing permissions and
 16 | # limitations under the License.
 17 | #
 18 | 
 19 | ##############################################################################
 20 | ##
 21 | ##  Gradle start up script for UN*X
 22 | ##
 23 | ##############################################################################
 24 | 
 25 | # Attempt to set APP_HOME
 26 | # Resolve links: $0 may be a link
 27 | PRG="$0"
 28 | # Need this for relative symlinks.
 29 | while [ -h "$PRG" ] ; do
 30 |     ls=`ls -ld "$PRG"`
 31 |     link=`expr "$ls" : '.*-> \(.*\)$'`
 32 |     if expr "$link" : '/.*' > /dev/null; then
 33 |         PRG="$link"
 34 |     else
 35 |         PRG=`dirname "$PRG"`"/$link"
 36 |     fi
 37 | done
 38 | SAVED="`pwd`"
 39 | cd "`dirname \"$PRG\"`/" >/dev/null
 40 | APP_HOME="`pwd -P`"
 41 | cd "$SAVED" >/dev/null
 42 | 
 43 | APP_NAME="Gradle"
 44 | APP_BASE_NAME=`basename "$0"`
 45 | 
 46 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
 47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
 48 | 
 49 | # Use the maximum available, or set MAX_FD != -1 to use that value.
 50 | MAX_FD="maximum"
 51 | 
 52 | warn () {
 53 |     echo "$*"
 54 | }
 55 | 
 56 | die () {
 57 |     echo
 58 |     echo "$*"
 59 |     echo
 60 |     exit 1
 61 | }
 62 | 
 63 | # OS specific support (must be 'true' or 'false').
 64 | cygwin=false
 65 | msys=false
 66 | darwin=false
 67 | nonstop=false
 68 | case "`uname`" in
 69 |   CYGWIN* )
 70 |     cygwin=true
 71 |     ;;
 72 |   Darwin* )
 73 |     darwin=true
 74 |     ;;
 75 |   MINGW* )
 76 |     msys=true
 77 |     ;;
 78 |   NONSTOP* )
 79 |     nonstop=true
 80 |     ;;
 81 | esac
 82 | 
 83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
 84 | 
 85 | 
 86 | # Determine the Java command to use to start the JVM.
 87 | if [ -n "$JAVA_HOME" ] ; then
 88 |     if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
 89 |         # IBM's JDK on AIX uses strange locations for the executables
 90 |         JAVACMD="$JAVA_HOME/jre/sh/java"
 91 |     else
 92 |         JAVACMD="$JAVA_HOME/bin/java"
 93 |     fi
 94 |     if [ ! -x "$JAVACMD" ] ; then
 95 |         die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
 96 | 
 97 | Please set the JAVA_HOME variable in your environment to match the
 98 | location of your Java installation."
 99 |     fi
100 | else
101 |     JAVACMD="java"
102 |     which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
103 | 
104 | Please set the JAVA_HOME variable in your environment to match the
105 | location of your Java installation."
106 | fi
107 | 
108 | # Increase the maximum file descriptors if we can.
109 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
110 |     MAX_FD_LIMIT=`ulimit -H -n`
111 |     if [ $? -eq 0 ] ; then
112 |         if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
113 |             MAX_FD="$MAX_FD_LIMIT"
114 |         fi
115 |         ulimit -n $MAX_FD
116 |         if [ $? -ne 0 ] ; then
117 |             warn "Could not set maximum file descriptor limit: $MAX_FD"
118 |         fi
119 |     else
120 |         warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
121 |     fi
122 | fi
123 | 
124 | # For Darwin, add options to specify how the application appears in the dock
125 | if $darwin; then
126 |     GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
127 | fi
128 | 
129 | # For Cygwin or MSYS, switch paths to Windows format before running java
130 | if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
131 |     APP_HOME=`cygpath --path --mixed "$APP_HOME"`
132 |     CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
133 | 
134 |     JAVACMD=`cygpath --unix "$JAVACMD"`
135 | 
136 |     # We build the pattern for arguments to be converted via cygpath
137 |     ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
138 |     SEP=""
139 |     for dir in $ROOTDIRSRAW ; do
140 |         ROOTDIRS="$ROOTDIRS$SEP$dir"
141 |         SEP="|"
142 |     done
143 |     OURCYGPATTERN="(^($ROOTDIRS))"
144 |     # Add a user-defined pattern to the cygpath arguments
145 |     if [ "$GRADLE_CYGPATTERN" != "" ] ; then
146 |         OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
147 |     fi
148 |     # Now convert the arguments - kludge to limit ourselves to /bin/sh
149 |     i=0
150 |     for arg in "$@" ; do
151 |         CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
152 |         CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
153 | 
154 |         if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
155 |             eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
156 |         else
157 |             eval `echo args$i`="\"$arg\""
158 |         fi
159 |         i=`expr $i + 1`
160 |     done
161 |     case $i in
162 |         0) set -- ;;
163 |         1) set -- "$args0" ;;
164 |         2) set -- "$args0" "$args1" ;;
165 |         3) set -- "$args0" "$args1" "$args2" ;;
166 |         4) set -- "$args0" "$args1" "$args2" "$args3" ;;
167 |         5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
168 |         6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
169 |         7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
170 |         8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
171 |         9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
172 |     esac
173 | fi
174 | 
175 | # Escape application args
176 | save () {
177 |     for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
178 |     echo " "
179 | }
180 | APP_ARGS=`save "$@"`
181 | 
182 | # Collect all arguments for the java command, following the shell quoting and substitution rules
183 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
184 | 
185 | exec "$JAVACMD" "$@"
186 | 


--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
 1 | @rem
 2 | @rem Copyright 2015 the original author or authors.
 3 | @rem
 4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
 5 | @rem you may not use this file except in compliance with the License.
 6 | @rem You may obtain a copy of the License at
 7 | @rem
 8 | @rem      https://www.apache.org/licenses/LICENSE-2.0
 9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 | 
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem  Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 | 
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 | 
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 | 
32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
34 | 
35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
37 | 
38 | @rem Find java.exe
39 | if defined JAVA_HOME goto findJavaFromJavaHome
40 | 
41 | set JAVA_EXE=java.exe
42 | %JAVA_EXE% -version >NUL 2>&1
43 | if "%ERRORLEVEL%" == "0" goto execute
44 | 
45 | echo.
46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
47 | echo.
48 | echo Please set the JAVA_HOME variable in your environment to match the
49 | echo location of your Java installation.
50 | 
51 | goto fail
52 | 
53 | :findJavaFromJavaHome
54 | set JAVA_HOME=%JAVA_HOME:"=%
55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
56 | 
57 | if exist "%JAVA_EXE%" goto execute
58 | 
59 | echo.
60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
61 | echo.
62 | echo Please set the JAVA_HOME variable in your environment to match the
63 | echo location of your Java installation.
64 | 
65 | goto fail
66 | 
67 | :execute
68 | @rem Setup the command line
69 | 
70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
71 | 
72 | 
73 | @rem Execute Gradle
74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
75 | 
76 | :end
77 | @rem End local scope for the variables with windows NT shell
78 | if "%ERRORLEVEL%"=="0" goto mainEnd
79 | 
80 | :fail
81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
82 | rem the _cmd.exe /c_ return code!
83 | if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
84 | exit /b 1
85 | 
86 | :mainEnd
87 | if "%OS%"=="Windows_NT" endlocal
88 | 
89 | :omega
90 | 


--------------------------------------------------------------------------------
/src/main/java/benchmark/BenchmarkOperations.java:
--------------------------------------------------------------------------------
  1 | package benchmark;
  2 | 
  3 | import boofcv.alg.filter.convolve.ConvolveImageNoBorder;
  4 | import boofcv.alg.misc.ImageMiscOps;
  5 | import boofcv.concurrency.BoofConcurrency;
  6 | import boofcv.factory.filter.kernel.FactoryKernelGaussian;
  7 | import boofcv.struct.convolve.Kernel1D_F32;
  8 | import boofcv.struct.image.GrayF32;
  9 | import boofcv.struct.image.GrayU16;
 10 | import boofcv.struct.image.GrayU8;
 11 | import org.ejml.data.DMatrixRMaj;
 12 | import org.ejml.data.ZMatrixRMaj;
 13 | import org.ejml.dense.row.CommonOps_DDRM;
 14 | import org.ejml.dense.row.RandomMatrices_DDRM;
 15 | import org.ejml.dense.row.RandomMatrices_ZDRM;
 16 | import org.openjdk.jmh.annotations.*;
 17 | import org.openjdk.jmh.runner.Runner;
 18 | import org.openjdk.jmh.runner.RunnerException;
 19 | import org.openjdk.jmh.runner.options.Options;
 20 | import org.openjdk.jmh.runner.options.OptionsBuilder;
 21 | import org.openjdk.jmh.runner.options.TimeValue;
 22 | 
 23 | import java.util.Random;
 24 | import java.util.concurrent.TimeUnit;
 25 | 
 26 | @BenchmarkMode(Mode.AverageTime)
 27 | @Warmup(iterations = 2)
 28 | @Measurement(iterations = 5)
 29 | @State(Scope.Benchmark)
 30 | @OutputTimeUnit(TimeUnit.NANOSECONDS)
 31 | @Fork(value = 1)
 32 | public class BenchmarkOperations {
 33 |     final static int MAX_PIXEL_U16 = 2000;
 34 | 
 35 |     @State(Scope.Thread)
 36 |     public static class MatrixState {
 37 |         @Param({"4","1000"}) // 4x4 matrices are very common in computer vision. 1000 is a medium sized matrix
 38 |         public int size;
 39 | 
 40 |         DMatrixRMaj A = new DMatrixRMaj(1, 1);
 41 |         DMatrixRMaj B = new DMatrixRMaj(1, 1);
 42 |         DMatrixRMaj C = new DMatrixRMaj(1, 1);
 43 | 
 44 |         ZMatrixRMaj CA = new ZMatrixRMaj(1, 1);
 45 |         ZMatrixRMaj CB = new ZMatrixRMaj(1, 1);
 46 |         ZMatrixRMaj CC = new ZMatrixRMaj(1, 1);
 47 | 
 48 |         @Setup(Level.Trial)
 49 |         public void setup() {
 50 |             Random rand = new Random(345);
 51 | 
 52 |             A.reshape(size, size);
 53 |             B.reshape(size, size);
 54 |             C.reshape(size, size);
 55 |             RandomMatrices_DDRM.fillUniform(A, -1, 1, rand);
 56 |             RandomMatrices_DDRM.fillUniform(B, -1, 1, rand);
 57 |             RandomMatrices_DDRM.fillUniform(C, -1, 1, rand);
 58 | 
 59 |             CA.reshape(size, size);
 60 |             CB.reshape(size, size);
 61 |             CC.reshape(size, size);
 62 |             RandomMatrices_ZDRM.fillUniform(CA, -1, 1, rand);
 63 |             RandomMatrices_ZDRM.fillUniform(CB, -1, 1, rand);
 64 |             RandomMatrices_ZDRM.fillUniform(CC, -1, 1, rand);
 65 |         }
 66 |     }
 67 | 
 68 |     @State(Scope.Thread)
 69 |     public static class FloatImageState {
 70 |         @Param({"5","31"})
 71 |         public int kernelSize;
 72 | 
 73 |         GrayF32 src = new GrayF32(1200,800);
 74 |         GrayF32 dst = src.createSameShape();
 75 | 
 76 |         Kernel1D_F32 kernel;
 77 | 
 78 |         @Setup(Level.Trial)
 79 |         public void setup() {
 80 |             // When calling a BoofCV function make sure it doesn't run concurrent code
 81 |             BoofConcurrency.USE_CONCURRENT = false;
 82 | 
 83 |             Random rand = new Random(345);
 84 |             ImageMiscOps.fillUniform(src, rand, 0, 255);
 85 | 
 86 |             kernel = FactoryKernelGaussian.gaussian1D(GrayF32.class, -1, kernelSize/2);
 87 |         }
 88 |     }
 89 | 
 90 |     @State(Scope.Thread)
 91 |     public static class ByteImageState {
 92 |         GrayU8 src = new GrayU8(1200,800);
 93 |         GrayU8 dst = src.createSameShape();
 94 | 
 95 |         @Setup(Level.Trial)
 96 |         public void setup() {
 97 |             // When calling a BoofCV function make sure it doesn't run concurrent code
 98 |             BoofConcurrency.USE_CONCURRENT = false;
 99 | 
100 |             Random rand = new Random(345);
101 |             ImageMiscOps.fillUniform(src, rand, 0, 255);
102 |         }
103 |     }
104 | 
105 |     @State(Scope.Thread)
106 |     public static class ShortImageState {
107 |         GrayU16 src = new GrayU16(1200,800);
108 |         GrayU16 dst = src.createSameShape();
109 |         int[] histogram = new int[MAX_PIXEL_U16];
110 | 
111 |         @Setup(Level.Trial)
112 |         public void setup() {
113 |             // When calling a BoofCV function make sure it doesn't run concurrent code
114 |             BoofConcurrency.USE_CONCURRENT = false;
115 | 
116 |             Random rand = new Random(345);
117 |             ImageMiscOps.fillUniform(src, rand, 0, MAX_PIXEL_U16);
118 |         }
119 |     }
120 | 
121 | //    @Benchmark public void matrix_mult_real(MatrixState state) {
122 | //        MatrixMultiplication.mult_ikj(state.A, state.B, state.C);
123 | //    }
124 | //
125 | //    @Benchmark public void matrix_mult_real_ejml(MatrixState state) {
126 | //        // There is specialized code for small matrices here and if large enough a block matrix will kick in
127 | //        CommonOps_DDRM.mult(state.A, state.B, state.C);
128 | //    }
129 | //
130 | //    @Benchmark public void matrix_mult_real_vectors(MatrixState state) {
131 | //        MatrixMultiplication.mult_ikj_vector(state.A, state.B, state.C);
132 | //    }
133 | //
134 | //    @Benchmark public void matrix_mult_complex(MatrixState state) {
135 | //        MatrixMultiplication.mult_ikj(state.CA, state.CB, state.CC);
136 | //    }
137 | //
138 | //    @Benchmark public void matrix_mult_complex_vector(MatrixState state) {
139 | //        MatrixMultiplication.mult_ikj_vector(state.CA, state.CB, state.CC);
140 | //    }
141 | //
142 | //    @Benchmark public void convolve_horizontal(FloatImageState state) {
143 | //        ImageProcessing.horizontal(state.kernel, state.src, state.dst);
144 | //    }
145 | //
146 | //    @Benchmark public void convolve_horizontal_vector(FloatImageState state) {
147 | //        ImageProcessing.horizontal_vector(state.kernel, state.src, state.dst);
148 | //    }
149 | //
150 | //    @Benchmark public void convolve_horizontal_boofcv(FloatImageState state) {
151 | //        // If possible this method will run an unrolled kernel
152 | //        ConvolveImageNoBorder.horizontal(state.kernel, state.src, state.dst);
153 | //    }
154 | 
155 |     @Benchmark public void mean_horizontal(ByteImageState state) {
156 |         ImageProcessing.mean_horizontal(state.src, state.dst, 5, 11);
157 |     }
158 | 
159 |     @Benchmark public void mean_horizontal_vector(ByteImageState state) {
160 |         ImageProcessing.mean_horizontal_vector(state.src, state.dst, 5, 11);
161 |     }
162 | 
163 | //    @Benchmark public void image_threshold(ByteImageState state) {
164 | //        ImageProcessing.threshold(state.src, state.dst, 125);
165 | //    }
166 | //
167 | //    @Benchmark public void image_threshold_vector_v1(ByteImageState state) {
168 | //        ImageProcessing.threshold_vector_v1(state.src, state.dst, 125);
169 | //    }
170 | //
171 | //    @Benchmark public void image_threshold_vector_v2(ByteImageState state) {
172 | //        ImageProcessing.threshold_vector_v2(state.src, state.dst, 125);
173 | //    }
174 | //
175 | //    @Benchmark public void histogram(ShortImageState state) {
176 | //        ImageProcessing.histogram(state.src, 0, state.histogram);
177 | //    }
178 | 
179 |     public static void main(String[] args) throws RunnerException {
180 |         Options opt = new OptionsBuilder()
181 |                 .include(BenchmarkOperations.class.getSimpleName())
182 |                 .warmupTime(TimeValue.seconds(1))
183 |                 .measurementTime(TimeValue.seconds(1))
184 |                 .build();
185 |         new Runner(opt).run();
186 |     }
187 | }
188 | 


--------------------------------------------------------------------------------
/src/main/java/benchmark/ImageProcessing.java:
--------------------------------------------------------------------------------
  1 | package benchmark;
  2 | 
  3 | import boofcv.struct.convolve.Kernel1D_F32;
  4 | import boofcv.struct.image.GrayF32;
  5 | import boofcv.struct.image.GrayI8;
  6 | import boofcv.struct.image.GrayU16;
  7 | import boofcv.struct.image.GrayU8;
  8 | import jdk.incubator.vector.*;
  9 | 
 10 | import java.util.Arrays;
 11 | 
 12 | public class ImageProcessing {
 13 |     public static void horizontal(Kernel1D_F32 kernel,
 14 |                                   GrayF32 image, GrayF32 dest ) {
 15 |         final float[] dataSrc = image.data;
 16 |         final float[] dataDst = dest.data;
 17 |         final float[] dataKer = kernel.data;
 18 | 
 19 |         final int offset = kernel.getOffset();
 20 |         final int kernelWidth = kernel.getWidth();
 21 | 
 22 |         final int width = image.getWidth();
 23 | 
 24 |         //CONCURRENT_BELOW BoofConcurrency.loopFor(0, image.height, i -> {
 25 |         for( int i = 0; i < image.height; i++ ) {
 26 |             int indexDst = dest.startIndex + i*dest.stride + offset;
 27 |             int j = image.startIndex + i*image.stride;
 28 |             final int jEnd = j+width-(kernelWidth-1);
 29 | 
 30 |             for (; j < jEnd; j++) {
 31 |                 float total = 0;
 32 |                 int indexSrc = j;
 33 |                 for (int k = 0; k < kernelWidth; k++) {
 34 |                     total += (dataSrc[indexSrc++])*dataKer[k];
 35 |                 }
 36 |                 dataDst[indexDst++] = total;
 37 |             }
 38 |         }
 39 |         //CONCURRENT_ABOVE });
 40 |     }
 41 | 
 42 |     public static void horizontal_vector(Kernel1D_F32 kernel,
 43 |                                          GrayF32 image, GrayF32 dest ) {
 44 |         final float[] dataSrc = image.data;
 45 |         final float[] dataDst = dest.data;
 46 |         final float[] dataKer = kernel.data;
 47 | 
 48 |         final int offset = kernel.getOffset();
 49 |         final int kernelWidth = kernel.getWidth();
 50 | 
 51 |         final int width = image.getWidth();
 52 | 
 53 |         VectorSpecies<Float> SPECIES = FloatVector.SPECIES_PREFERRED;
 54 | 
 55 |         for( int i = 0; i < image.height; i++ ) {
 56 |             int indexDst = dest.startIndex + i*dest.stride + offset;
 57 |             int j = image.startIndex + i*image.stride;
 58 |             final int jEnd = j+width-(kernelWidth-1);
 59 | 
 60 |             for (; j < jEnd; j++) {
 61 |                 float total = 0;
 62 |                 int k = 0;
 63 |                 for (; k < SPECIES.loopBound(kernelWidth); k += SPECIES.length()) {
 64 |                     var vsrc = FloatVector.fromArray(SPECIES, dataSrc, j+k);
 65 |                     var vker = FloatVector.fromArray(SPECIES, dataKer, k);
 66 |                     total += vsrc.mul(vker).reduceLanes(VectorOperators.ADD);
 67 |                 }
 68 |                 for (; k < kernelWidth; k++) {
 69 |                     total += (dataSrc[j+k])*dataKer[k];
 70 |                 }
 71 |                 dataDst[indexDst++] = total;
 72 |             }
 73 |         }
 74 |     }
 75 | 
 76 |     /**
 77 |      * Horizontal mean (box) filter computed with a running sum. For each row, the rounded mean of every
 78 |      * window of {@code length} consecutive pixels is written to {@code output}, starting at column
 79 |      * {@code offset}. Pixels are read as unsigned bytes.
 80 |      */
 81 |     public static void mean_horizontal(GrayU8 input , GrayI8 output, int offset, int length ) {
 82 |         final int rounding = length/2;
 83 |         //CONCURRENT_BELOW BoofConcurrency.loopFor(0, input.height, y -> {
 84 |         for( int y = 0; y < input.height; y++ ) {
 85 |             int src = input.startIndex + input.stride*y;
 86 |             int dst = output.startIndex + output.stride*y + offset;
 87 |             // Sum of the first window in the row
 88 |             int sum = 0;
 89 |             final int firstWindowEnd = src + length;
 90 |             while (src < firstWindowEnd) {
 91 |                 sum += input.data[src++] & 0xFF;
 92 |             }
 93 |             output.data[dst++] = (byte)((sum + rounding)/length);
 94 |             // Slide the window one pixel at a time: remove the pixel that left, add the one that entered
 95 |             final int rowEnd = src + input.width - length;
 96 |             while (src < rowEnd) {
 97 |                 sum -= input.data[src - length] & 0xFF;
 98 |                 sum += input.data[src++] & 0xFF;
 99 |                 output.data[dst++] = (byte)((sum + rounding)/length);
100 |             }
101 |         }
102 |         //CONCURRENT_ABOVE });
103 |     }
104 | 
105 |     /**
106 |      * Two-pass variant of {@code mean_horizontal}: the change of the running sum at every window shift is first
107 |      * stored in a scratch array by a loop with no dependency between iterations, then accumulated in a second
108 |      * loop. No Vector API call is made here.
109 |      */
110 |     public static void mean_horizontal_vector(GrayU8 input , GrayI8 output, int offset, int length ) {
111 |         final int rounding = length/2;
112 |         // Scratch buffer reused by every row: (pixel entering the window) - (pixel leaving it)
113 |         final short[] delta = new short[input.width];
114 | 
115 |         //CONCURRENT_BELOW BoofConcurrency.loopFor(0, input.height, y -> {
116 |         for( int y = 0; y < input.height; y++ ) {
117 |             int src = input.startIndex + input.stride*y;
118 |             int dst = output.startIndex + output.stride*y + offset;
119 | 
120 |             // Sum of the first window in the row
121 |             int sum = 0;
122 |             final int firstWindowEnd = src + length;
123 |             while (src < firstWindowEnd) {
124 |                 sum += input.data[src++] & 0xFF;
125 |             }
126 |             output.data[dst++] = (byte)((sum + rounding)/length);
127 |             // TODO 1) first pass compute the result of summing the head and the tail. Like the code below
128 |             final int shifts = input.width - length;
129 |             for (int s = 0; s < shifts; s++) {
130 |                 delta[s] = (short)((input.data[src + s] & 0xFF) - (input.data[src + s - length] & 0xFF));
131 |             }
132 | 
133 |             // Second pass: accumulate the differences into the running sum and emit one mean per shift
134 |             for (int s = 0; s < shifts; s++) {
135 |                 sum += delta[s];
136 |                 output.data[dst++] = (byte)((sum + rounding)/length);
137 |             }
138 |         }
139 |         //CONCURRENT_ABOVE });
140 |     }
141 | 
142 |     /** Sets {@code output} to 1 where the unsigned pixel of {@code input} is &lt;= {@code threshold}, else 0. */
143 |     public static GrayU8 threshold( GrayU8 input, GrayU8 output, int threshold ) {
144 |         //CONCURRENT_BELOW BoofConcurrency.loopFor(0, input.height, y -> {
145 |         for( int y = 0; y < input.height; y++ ) {
146 |             final int srcRow = input.startIndex + y*input.stride;
147 |             final int dstRow = output.startIndex + y*output.stride;
148 |             for (int x = 0; x < input.width; x++) {
149 |                 final int value = input.data[srcRow + x] & 0xFF;
150 |                 output.data[dstRow + x] = (byte)(value <= threshold ? 1 : 0);
151 |             }
152 |         }
153 |         //CONCURRENT_ABOVE });
154 |         return output;
155 |     }
156 | 
157 |     /**
158 |      * Vectorized {@code threshold}: each comparison mask is stored in a {@code boolean[]} and then converted to
159 |      * 0/1 bytes. Vector byte comparisons are signed, so the sign bit of the pixels and of the threshold is flipped
160 |      * first, mapping unsigned 0..255 onto signed -128..127. Thresholds outside 0..255 take the scalar path only.
161 |      */
162 |     public static GrayU8 threshold_vector_v1(GrayU8 input, GrayU8 output, int threshold ) {
163 |         VectorSpecies<Byte> SPECIES = ByteVector.SPECIES_PREFERRED;
164 |         final byte flippedThreshold = (byte)(threshold ^ 0x80);
165 |         final int vectorWidth = (threshold < 0 || threshold > 255) ? 0 : SPECIES.loopBound(input.width);
166 |         boolean[] tmp = new boolean[input.width];
167 |         for( int y = 0; y < input.height; y++ ) {
168 |             int indexIn = input.startIndex + y*input.stride;
169 |             int indexOut = output.startIndex + y*output.stride;
170 |             int i = 0;
171 |             for(; i < vectorWidth; i += SPECIES.length() ) {
172 |                 var flipped = ByteVector.fromArray(SPECIES, input.data, indexIn+i).lanewise(VectorOperators.XOR, (byte)0x80);
173 |                 flipped.compare(VectorOperators.LE, flippedThreshold).intoArray(tmp, i);
174 |             }
175 |             for (int vectorIdx = 0; vectorIdx < i; vectorIdx++) {
176 |                 output.data[indexOut+vectorIdx] = (byte)(tmp[vectorIdx] ? 1 : 0);
177 |             }
178 |             for(; i < input.width; i++ ) { // pixels left over after the last whole vector
179 |                 output.data[indexOut+i] = (byte)((input.data[indexIn+i]& 0xFF) <= threshold ? 1 : 0);
180 |             }
181 |         }
182 |         return output;
183 |     }
184 | 
185 |     /**
186 |      * Vectorized {@code threshold} that blends 0/1 bytes into {@code output}. Byte comparisons are signed, so
187 |      * the sign bit of both operands is flipped first; thresholds outside 0..255 take the scalar path only.
188 |      */
189 |     public static GrayU8 threshold_vector_v2(GrayU8 input, GrayU8 output, int threshold ) {
190 |         VectorSpecies<Byte> SPECIES = ByteVector.SPECIES_PREFERRED;
191 |         final int vectorWidth = (threshold < 0 || threshold > 255) ? 0 : SPECIES.loopBound(input.width);
192 |         for( int y = 0; y < input.height; y++ ) {
193 |             int indexIn = input.startIndex + y*input.stride;
194 |             int indexOut = output.startIndex + y*output.stride;
195 |             int i = 0;
196 |             for(; i < vectorWidth; i += SPECIES.length() ) {
197 |                 var vinput = ByteVector.fromArray(SPECIES, input.data, indexIn+i).lanewise(VectorOperators.XOR, (byte)0x80);
198 |                 VectorMask<Byte> compare = vinput.compare(VectorOperators.LE, (byte)(threshold ^ 0x80));
199 |                 ByteVector.zero(SPECIES).blend(1, compare).intoArray(output.data, indexOut+i);
200 |             }
201 |             for(; i < input.width; i++ ) { // pixels left over after the last whole vector
202 |                 output.data[indexOut+i] = (byte)((input.data[indexIn+i]& 0xFF) <= threshold ? 1 : 0);
203 |             }
204 |         }
205 |         return output;
206 |     }
207 | 
208 |     /** Zeroes {@code histogram}, then counts each unsigned pixel value {@code v} in bin {@code v - minValue}. */
209 |     public static void histogram(GrayU16 input, int minValue, int[] histogram ) {
210 |         Arrays.fill(histogram, 0);
211 | 
212 |         for (int row = 0; row < input.height; row++) {
213 |             final int rowStart = input.startIndex + row*input.stride;
214 |             for (int col = 0; col < input.width; col++) {
215 |                 final int bin = (input.data[rowStart + col] & 0xFFFF) - minValue;
216 |                 histogram[bin]++;
217 |             }
218 |         }
219 |     }
220 | 
221 |     /**
222 |      * Same result as {@code histogram}, with the {@code minValue} subtraction done one vector at a time. The
223 |      * increments stay scalar because several lanes may refer to the same bin. The subtraction wraps at 16 bits,
224 |      * so every {@code value - minValue} is assumed to lie in 0..65535.
225 |      */
226 |     public static void histogram_vector(GrayU16 input, int minValue, int[] histogram ) {
227 |         Arrays.fill(histogram,0);
228 |         VectorSpecies<Short> SPECIES = ShortVector.SPECIES_PREFERRED;
229 |         final short[] bins = new short[SPECIES.length()];
230 |         for( int y = 0; y < input.height; y++ ) {
231 |             int index = input.startIndex + y*input.stride;
232 |             final int end = index + input.width;
233 |             final int vectorEnd = index + SPECIES.loopBound(input.width); // relative to this row's start
234 |             for(; index < vectorEnd; index += SPECIES.length() ) {
235 |                 ShortVector.fromArray(SPECIES, input.data, index).sub((short)minValue).intoArray(bins, 0);
236 |                 for (short bin : bins) {
237 |                     histogram[bin & 0xFFFF]++;
238 |                 }
239 |             }
240 |             while( index < end ) { // pixels left over after the last whole vector
241 |                 histogram[(input.data[index++]& 0xFFFF) - minValue ]++;
242 |             }
243 |         }
244 |     }
245 | }
246 | 


--------------------------------------------------------------------------------
/src/main/java/benchmark/MatrixMultiplication.java:
--------------------------------------------------------------------------------
  1 | package benchmark;
  2 | 
  3 | import jdk.incubator.vector.DoubleVector;
  4 | import jdk.incubator.vector.VectorSpecies;
  5 | import org.ejml.data.DMatrix1Row;
  6 | import org.ejml.data.ZMatrixRMaj;
  7 | 
  8 | /**
  9 |  * @author Peter Abeles
 10 |  */
 11 | public class MatrixMultiplication {
 12 |     static final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_PREFERRED;
 13 | 
 14 |     /**
 15 |      * Matrix multiplication (C = A*B) with IKJ ordering from EJML; C is reshaped to A.numRows x B.numCols. This
 16 |      * is designed to minimize cache misses and is a top performer in internal benchmarks. For larger matrices
 17 |      * EJML switches to a block multiplication, which is excessively complex for this benchmark.
 18 |      */
 19 |     public static void mult_ikj(DMatrix1Row A, DMatrix1Row B, DMatrix1Row C) {
 20 |         C.reshape(A.numRows, B.numCols);
 21 | 
 22 |         // Note to people looking at this code. It might look like there is a bunch of unnecessary hand optimizations.
 23 |         // You might be right, but this code goes back to probably Java 1.7 and when you're writing high
 24 |         // performance code that needs to be high performance on a wide range of platforms and JVM versions,
 25 |         // the general pattern is the more hand holding you do for the JVM the better your code will run even in
 26 |         // resource constrained environments.
 27 |         //
 28 |         // Having said that, the code could probably be cleaned up but doing that and proving it won't cause
 29 |         // a performance regression on some ancient JVM on some underpowered ARM processor is not trivial. So if it's
 30 |         // not broken don't fix it. Earlier attempts to follow "good practices" have resulted in improvements
 31 |         // being reverted, even after careful micro benchmarking.
 32 | 
 33 |         final int endOfKLoop = B.numRows * B.numCols; // value of indexB once every row of B has been consumed
 34 | 
 35 |         for (int i = 0; i < A.numRows; i++) {
 36 |             int indexCbase = i * C.numCols; // index of the first element of row i in C
 37 |             int indexA = i * A.numCols; // index of the first element of row i in A
 38 | 
 39 |             // need to assign C.data to a value initially
 40 |             int indexB = 0;
 41 |             int indexC = indexCbase;
 42 |             int end = indexB + B.numCols; // end of the first row of B
 43 | 
 44 |             double valA = A.data[indexA++]; // first element of row i of A
 45 | 
 46 |             while (indexB < end) { // row i of C = valA * (first row of B); plain assignment, so C needs no zeroing
 47 |                 C.data[indexC++] = valA * B.data[indexB++];
 48 |             }
 49 | 
 50 |             // now add to it
 51 |             while (indexB != endOfKLoop) { // k loop: indexB keeps advancing through B, one row per pass
 52 |                 indexC = indexCbase; // rewind to the start of row i in C
 53 |                 end = indexB + B.numCols; // end of the current row of B
 54 | 
 55 |                 valA = A.data[indexA++]; // next element of row i of A
 56 | 
 57 |                 while (indexB < end) { // j loop
 58 |                     C.data[indexC++] += valA * B.data[indexB++];
 59 |                 }
 60 |             }
 61 |         }
 62 |     }
 63 | 
 64 |     /**
 65 |      * Readable version of the IKJ-ordered multiplication C = A*B that uses plain counted loops instead of
 66 |      * manually advanced array indexes. C is reshaped to A.numRows x B.numCols and overwritten.
 67 |      */
 68 |     public static void mult_ikj_simple(DMatrix1Row A, DMatrix1Row B, DMatrix1Row C) {
 69 |         C.reshape(A.numRows, B.numCols);
 70 |         final int cols = B.numCols;
 71 |         for (int row = 0; row < A.numRows; row++) {
 72 |             final int rowC = row * C.numCols;
 73 |             final int rowA = row * A.numCols;
 74 |             // k = 0 assigns the row of C, so C does not have to be zeroed beforehand
 75 |             final double first = A.data[rowA];
 76 |             for (int col = 0; col < cols; col++) {
 77 |                 C.data[rowC + col] = first * B.data[col];
 78 |             }
 79 | 
 80 |             // Every remaining k adds A(row,k) * B(k,:) to the row of C
 81 |             for (int k = 1; k < B.numRows; k++) {
 82 |                 final int rowB = k * cols;
 83 |                 final double scale = A.data[rowA + k];
 84 |                 for (int col = 0; col < cols; col++) {
 85 |                     C.data[rowC + col] += scale * B.data[rowB + col];
 86 |                 }
 87 |             }
 88 |         }
 89 |     }
 90 | 
 91 |     /**
 92 |      * IKJ-ordered multiplication C = A*B whose innermost (column) loop handles one vector of doubles per
 93 |      * iteration; columns left over after the last full vector are processed by a scalar loop.
 94 |      */
 95 |     public static void mult_ikj_vector(DMatrix1Row A, DMatrix1Row B, DMatrix1Row C) {
 96 |         C.reshape(A.numRows, B.numCols);
 97 |         final int cols = B.numCols;
 98 |         final int vectorCols = SPECIES.loopBound(cols);
 99 |         for (int row = 0; row < A.numRows; row++) {
100 |             final int rowC = row * C.numCols;
101 |             final int rowA = row * A.numCols;
102 |             // k = 0 assigns the row of C
103 |             final double first = A.data[rowA];
104 |             int col = 0;
105 |             for (; col < vectorCols; col += SPECIES.length()) {
106 |                 DoubleVector.fromArray(SPECIES, B.data, col).mul(first).intoArray(C.data, rowC + col);
107 |             }
108 |             for (; col < cols; col++) {
109 |                 C.data[rowC + col] = first * B.data[col];
110 |             }
111 |             // Every remaining k adds A(row,k) * B(k,:) to the row of C
112 |             for (int k = 1; k < B.numRows; k++) {
113 |                 final int rowB = k * cols;
114 |                 final double scale = A.data[rowA + k];
115 |                 for (col = 0; col < vectorCols; col += SPECIES.length()) {
116 |                     DoubleVector rowOfB = DoubleVector.fromArray(SPECIES, B.data, rowB + col);
117 |                     DoubleVector rowOfC = DoubleVector.fromArray(SPECIES, C.data, rowC + col);
118 |                     rowOfC.add(rowOfB.mul(scale)).intoArray(C.data, rowC + col);
119 |                 }
120 |                 for (; col < cols; col++) {
121 |                     C.data[rowC + col] += scale * B.data[rowB + col];
122 |                 }
123 |             }
124 |         }
125 |     }
126 | 
127 |     /**
128 |      * Matrix multiplication for a complex matrix, C = A*B, with IKJ ordering. Elements are read as interleaved
129 |      * (real, imaginary) pairs, so every index below advances by two doubles per complex number. C is not
130 |      * reshaped by this method.
131 |      *
132 |      * @param A left operand
133 |      * @param B right operand
134 |      * @param C destination of the product
135 |      */
136 |     public static void mult_ikj(ZMatrixRMaj A, ZMatrixRMaj B, ZMatrixRMaj C) {
137 |         final int strideA = A.getRowStride();
138 |         final int strideB = B.getRowStride();
139 |         final int strideC = C.getRowStride();
140 |         // Value of the B index once every row of B has been consumed; reaching it ends the accumulation loop
141 |         final int lengthB = B.numRows * strideB;
142 | 
143 |         for (int row = 0, rowC = 0; row < A.numRows; row++, rowC += strideC) {
144 |             int posA = row * strideA;
145 |             int posB = 0;
146 | 
147 |             // The first row of B assigns the row of C
148 |             double realA = A.data[posA++];
149 |             double imagA = A.data[posA++];
150 | 
151 |             for (int posC = rowC; posB < strideB; ) {
152 |                 final double realB = B.data[posB++];
153 |                 final double imagB = B.data[posB++];
154 |                 // (a + bi)(c + di) = (ac - bd) + (ad + bc)i
155 |                 C.data[posC++] = realA * realB - imagA * imagB;
156 |                 C.data[posC++] = realA * imagB + imagA * realB;
157 |             }
158 | 
159 |             // Each remaining row of B is scaled by the next element of A's row and accumulated into C
160 |             while (posB != lengthB) {
161 |                 final int rowEndB = posB + strideB;
162 |                 realA = A.data[posA++];
163 |                 imagA = A.data[posA++];
164 | 
165 |                 for (int posC = rowC; posB < rowEndB; ) {
166 |                     final double realB = B.data[posB++];
167 |                     final double imagB = B.data[posB++];
168 | 
169 |                     C.data[posC++] += realA * realB - imagA * imagB;
170 |                     C.data[posC++] += realA * imagB + imagA * realB;
171 |                 }
172 |             }
173 |         }
174 |     }
175 | 
176 |     public static void mult_ikj_vector(ZMatrixRMaj A, ZMatrixRMaj B, ZMatrixRMaj C) {
177 |         double realA, imagA;
178 | 
179 |         int indexCbase = 0;
180 |         int strideA = A.getRowStride();
181 |         int strideB = B.getRowStride();
182 |         int strideC = C.getRowStride();
183 |         int endOfKLoop = B.numRows * strideB;
184 | 
185 |         final int speciesLength = SPECIES.length();
186 |         double[] multiRealA = new double[speciesLength];
187 |         double[] multiImagA = new double[speciesLength];
188 | 
189 |         if (speciesLength % 2 != 0)
190 |             throw new RuntimeException("Code below assumes an even length");
191 | 
192 |         for (int i = 0; i < A.numRows; i++) {
193 |             int indexA = i * strideA;
194 | 
195 |             // need to assign c.data to a value initially
196 |             int indexB = 0;
197 |             int indexC = indexCbase;
198 |             int end = indexB + strideB;
199 | 
200 |             realA = A.data[indexA++];
201 |             imagA = A.data[indexA++];
202 | 
203 |             for (; indexB < SPECIES.loopBound(B.numCols); indexB += SPECIES.length()) {
204 |                 var vb = DoubleVector.fromArray(SPECIES, B.data, indexB);
205 |                 vb.mul(realA).intoArray(multiRealA, 0);
206 |                 vb.mul(imagA).intoArray(multiImagA, 0);
207 | 
208 |                 // TODO figure out how to use shuffle to re-order the arrays quickly
209 |                 for (int j = 0; j < speciesLength; j += 2) {
210 |                     C.data[indexC++] = multiRealA[j] - multiImagA[j + 1];
211 |                     C.data[indexC++] = multiRealA[j + 1] + multiImagA[j];
212 |                 }
213 |             }
214 | 
215 |             while (indexB < end) {
216 |                 double realB = B.data[indexB++];
217 |                 double imagB = B.data[indexB++];
218 | 
219 |                 C.data[indexC++] = realA * realB - imagA * imagB;
220 |                 C.data[indexC++] = realA * imagB + imagA * realB;
221 |             }
222 | 
223 |             // now add to it
224 |             while (indexB != endOfKLoop) { // k loop
225 |                 indexC = indexCbase;
226 |                 end = indexB + strideB;
227 | 
228 |                 realA = A.data[indexA++];
229 |                 imagA = A.data[indexA++];
230 | 
231 |                 for (; indexB < SPECIES.loopBound(B.numCols); indexB += SPECIES.length()) {
232 |                     var vb = DoubleVector.fromArray(SPECIES, B.data, indexB);
233 |                     vb.mul(realA).intoArray(multiRealA, 0);
234 |                     vb.mul(imagA).intoArray(multiImagA, 0);
235 | 
236 |                     for (int j = 0; j < speciesLength; j += 2) {
237 |                         C.data[indexC++] += multiRealA[j] - multiImagA[j + 1];
238 |                         C.data[indexC++] += multiRealA[j + 1] + multiImagA[j];
239 |                     }
240 |                 }
241 | 
242 |                 while (indexB < end) { // j loop
243 |                     double realB = B.data[indexB++];
244 |                     double imgB = B.data[indexB++];
245 | 
246 |                     C.data[indexC++] += realA * realB - imagA * imgB;
247 |                     C.data[indexC++] += realA * imgB + imagA * realB;
248 |                 }
249 |             }
250 |             indexCbase += strideC;
251 |         }
252 |     }
253 | }
254 | 


--------------------------------------------------------------------------------
/src/test/java/benchmark/ImageProcessingTest.java:
--------------------------------------------------------------------------------
 1 | package benchmark;
 2 | 
 3 | import boofcv.alg.misc.ImageMiscOps;
 4 | import boofcv.factory.filter.kernel.FactoryKernel;
 5 | import boofcv.struct.convolve.Kernel1D_F32;
 6 | import boofcv.struct.image.GrayF32;
 7 | import boofcv.struct.image.GrayU8;
 8 | import boofcv.testing.BoofTesting;
 9 | import org.ejml.UtilEjml;
10 | import org.junit.jupiter.api.Disabled;
11 | import org.junit.jupiter.api.Test;
12 | 
13 | import java.util.Random;
14 | 
15 | /** Compares the vectorized image operations against their scalar counterparts on random images. */
16 | class ImageProcessingTest {
17 |     int imageSize = 100;
18 |     int kernelSize = 11;
19 |     Random rand = new Random(345);
20 | 
21 |     /** The vectorized convolution must match the scalar convolution to within float tolerance. */
22 |     @Test void horizontal_vector() {
23 |         GrayF32 input = new GrayF32(imageSize, imageSize);
24 |         GrayF32 reference = input.createSameShape();
25 |         GrayF32 vectorized = input.createSameShape();
26 |         Kernel1D_F32 kernel = FactoryKernel.random1D_F32(kernelSize, kernelSize/2, 0.0f, 1.0f, rand);
27 |         ImageMiscOps.fillUniform(input, rand, -1, 1);
28 | 
29 |         ImageProcessing.horizontal(kernel, input, reference);
30 |         ImageProcessing.horizontal_vector(kernel, input, vectorized);
31 | 
32 |         BoofTesting.assertEquals(reference, vectorized, UtilEjml.TEST_F32);
33 |     }
34 | 
35 |     /** Compares the boolean-mask threshold variant with the scalar threshold (currently disabled). */
36 |     @Disabled
37 |     @Test void threshold_vector_v1() {
38 |         GrayU8 input = new GrayU8(imageSize, imageSize);
39 |         GrayU8 reference = input.createSameShape();
40 |         GrayU8 vectorized = input.createSameShape();
41 |         ImageMiscOps.fillUniform(input, rand, 0, 255);
42 | 
43 |         ImageProcessing.threshold(input, reference, 125);
44 |         ImageProcessing.threshold_vector_v1(input, vectorized, 125);
45 | 
46 |         BoofTesting.assertEquals(reference, vectorized, UtilEjml.TEST_F32);
47 |     }
48 | 
49 |     /** Compares the blend-based threshold variant with the scalar threshold (currently disabled). */
50 |     @Disabled
51 |     @Test void threshold_vector_v2() {
52 |         GrayU8 input = new GrayU8(imageSize, imageSize);
53 |         GrayU8 reference = input.createSameShape();
54 |         GrayU8 vectorized = input.createSameShape();
55 |         ImageMiscOps.fillUniform(input, rand, 0, 255);
56 | 
57 |         ImageProcessing.threshold(input, reference, 125);
58 |         ImageProcessing.threshold_vector_v2(input, vectorized, 125);
59 | 
60 |         BoofTesting.assertEquals(reference, vectorized, UtilEjml.TEST_F32);
61 |     }
62 | }


--------------------------------------------------------------------------------
/src/test/java/benchmark/MatrixMultiplicationTest.java:
--------------------------------------------------------------------------------
 1 | package benchmark;
 2 | 
 3 | import org.ejml.UtilEjml;
 4 | import org.ejml.data.DMatrixRMaj;
 5 | import org.ejml.data.ZMatrixRMaj;
 6 | import org.ejml.dense.row.MatrixFeatures_DDRM;
 7 | import org.ejml.dense.row.MatrixFeatures_ZDRM;
 8 | import org.ejml.dense.row.RandomMatrices_DDRM;
 9 | import org.ejml.dense.row.RandomMatrices_ZDRM;
10 | import org.ejml.sparse.csc.RandomMatrices_DSCC;
11 | import org.junit.jupiter.api.Test;
12 | 
13 | import java.util.Random;
14 | 
15 | import static org.ejml.UtilEjml.assertTrue;
16 | 
17 | /**
18 |  * Verifies that the multiplication variants produce the same product for random matrices. The two destination
19 |  * matrices of each test start out with identical random content.
20 |  *
21 |  * @author Peter Abeles
22 |  */
23 | class MatrixMultiplicationTest {
24 |     Random rand = new Random(3453);
25 | 
26 |     /** The simplified real multiplication must agree with the EJML-style implementation. */
27 |     @Test void simpleCompareToEJML() {
28 |         final int size = 10;
29 |         DMatrixRMaj left = RandomMatrices_DDRM.rectangle(size, size, rand);
30 |         DMatrixRMaj right = RandomMatrices_DDRM.rectangle(size, size, rand);
31 |         DMatrixRMaj actual = RandomMatrices_DDRM.rectangle(size, size, rand);
32 |         DMatrixRMaj reference = actual.copy();
33 |         MatrixMultiplication.mult_ikj_simple(left, right, actual);
34 |         MatrixMultiplication.mult_ikj(left, right, reference);
35 |         assertTrue(MatrixFeatures_DDRM.isIdentical(actual, reference, UtilEjml.TEST_F64));
36 |     }
37 | 
38 |     /** The vectorized real multiplication must agree with the simplified implementation. */
39 |     @Test void vectorCompareToSimple() {
40 |         final int size = 10;
41 |         DMatrixRMaj left = RandomMatrices_DDRM.rectangle(size, size, rand);
42 |         DMatrixRMaj right = RandomMatrices_DDRM.rectangle(size, size, rand);
43 |         DMatrixRMaj actual = RandomMatrices_DDRM.rectangle(size, size, rand);
44 |         DMatrixRMaj reference = actual.copy();
45 |         MatrixMultiplication.mult_ikj_vector(left, right, actual);
46 |         MatrixMultiplication.mult_ikj_simple(left, right, reference);
47 |         assertTrue(MatrixFeatures_DDRM.isIdentical(actual, reference, UtilEjml.TEST_F64));
48 |     }
49 | 
50 |     /** The vectorized complex multiplication must agree with the scalar complex implementation. */
51 |     @Test void vectorCompareToSimple_complex() {
52 |         final int size = 10;
53 |         ZMatrixRMaj left = RandomMatrices_ZDRM.rectangle(size, size, rand);
54 |         ZMatrixRMaj right = RandomMatrices_ZDRM.rectangle(size, size, rand);
55 |         ZMatrixRMaj actual = RandomMatrices_ZDRM.rectangle(size, size, rand);
56 |         ZMatrixRMaj reference = actual.copy();
57 |         MatrixMultiplication.mult_ikj_vector(left, right, actual);
58 |         MatrixMultiplication.mult_ikj(left, right, reference);
59 |         assertTrue(MatrixFeatures_ZDRM.isIdentical(actual, reference, UtilEjml.TEST_F64));
60 |     }
61 | }


--------------------------------------------------------------------------------