├── .gitignore ├── 3rd-party-lib ├── Makefile └── armadillo.cpp ├── notes.txt ├── measured ├── xeon-3.5GHz-4_2 │ ├── armadillo.out │ ├── 8192.out │ ├── 256.out │ ├── 512.out │ ├── 1024.out │ ├── 2048.out │ └── 4096.out ├── celeron-1.8GHz-4 │ ├── armadillo.out │ ├── 256.out │ ├── 512.out │ ├── 1024.out │ ├── 2048.out │ └── 4096.out ├── buldozer-4.2GHz-8 │ ├── armadillo.out │ ├── 8192.out │ ├── 16384.out │ ├── 256.out │ ├── 512.out │ ├── 1024.out │ ├── 2048.out │ └── 4096.out └── xeon-1.8GHz-20_2 │ ├── armadillo.out │ ├── 8192.out │ ├── 16384.out │ ├── 256.out │ ├── 512.out │ ├── 1024.out │ ├── 2048.out │ └── 4096.out ├── README.md ├── presentation ├── mult.bob ├── z-order.bob ├── z-order-2.bob ├── simd.bob ├── Makefile ├── template │ ├── index.html │ ├── style.scss │ └── auto-render.min.js ├── hier.bob ├── recursion.bob ├── numa.bob ├── cache-matrix.bob ├── presentation.md └── fg.svg ├── src ├── lib.rs ├── bin │ ├── manip.rs │ ├── strass.rs │ └── measure.rs ├── simd.rs ├── simple.rs └── znot.rs ├── scripts ├── measure.sh ├── crunch.pl └── plot.pl ├── Cargo.toml ├── LICENSE-MIT └── LICENSE-APACHE /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | tags 4 | 3rd-party-lib/armadillo 5 | presentation/*.svg 6 | *.dat 7 | -------------------------------------------------------------------------------- /3rd-party-lib/Makefile: -------------------------------------------------------------------------------- 1 | armadillo: armadillo.cpp 2 | g++ --std=c++14 -O3 -larmadillo -DARMA_USE_OPENMP -fopenmp -march=native $< -o $@ 3 | -------------------------------------------------------------------------------- /notes.txt: -------------------------------------------------------------------------------- 1 | sudo perf record -e cycles -g ./target/release/measure big* 2 | perf report 3 | perf script | ../FlameGraph/stackcollapse-perf.pl | ../FlameGraph/flamegraph.pl > fg.svg 4 | 
-------------------------------------------------------------------------------- /measured/xeon-3.5GHz-4_2/armadillo.out: -------------------------------------------------------------------------------- 1 | Generating matrices 2 | Starting armadillo 3 | 1024: 0.575 4 | Generating matrices 5 | Starting armadillo 6 | 2048: 4.796 7 | Generating matrices 8 | Starting armadillo 9 | 4096: 37.937 10 | Generating matrices 11 | Starting armadillo 12 | 8192: 302.21 13 | -------------------------------------------------------------------------------- /measured/celeron-1.8GHz-4/armadillo.out: -------------------------------------------------------------------------------- 1 | Generating matrices 2 | Starting armadillo 3 | 256: 0.24 4 | Generating matrices 5 | Starting armadillo 6 | 512: 0.207 7 | Generating matrices 8 | Starting armadillo 9 | 1024: 1.637 10 | Generating matrices 11 | Starting armadillo 12 | 2048: 12.759 13 | Generating matrices 14 | Starting armadillo 15 | 4096: 108.955 16 | Generating matrices 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # This is not the library you're looking for 2 | 3 | This is a case study about how to approach optimizing things. There's a 4 | presentation about that (included here) and a [blog 5 | post](https://vorner.github.io/2018/05/12/Mat-perf.html). 6 | 7 | If you are looking for an actual, practically usable library to multiply 8 | matrices, you are in the wrong place. 
9 | 10 | # Compilation 11 | 12 | Compiles with nightly-2018-04-12 13 | 14 | -------------------------------------------------------------------------------- /presentation/mult.bob: -------------------------------------------------------------------------------- 1 | +-----------+ +-------+--------+ +---------------+ 2 | | | | | | | | 3 | | | | | | | | 4 | +-----------+ | | | -----> | # | 5 | | | | | | | | 6 | | | | | | | | 7 | +-----------+ +-------+--------+ +---------------+ 8 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(nll)] 2 | extern crate bincode; 3 | extern crate failure; 4 | #[macro_use] // tuplify macro ‒ abused somewhere else, but who cares 5 | extern crate faster; 6 | extern crate itertools; 7 | extern crate rand; 8 | extern crate rayon; 9 | extern crate serde; 10 | #[macro_use] 11 | extern crate serde_derive; 12 | extern crate smallvec; 13 | extern crate typenum; 14 | 15 | pub mod simd; 16 | pub mod simple; 17 | pub mod znot; 18 | 19 | pub type Element = f32; 20 | -------------------------------------------------------------------------------- /measured/buldozer-4.2GHz-8/armadillo.out: -------------------------------------------------------------------------------- 1 | Generating matrices 2 | Starting armadillo 3 | 256: 0.10 4 | Generating matrices 5 | Starting armadillo 6 | 512: 0.84 7 | Generating matrices 8 | Starting armadillo 9 | 1024: 0.730 10 | Generating matrices 11 | Starting armadillo 12 | 2048: 5.727 13 | Generating matrices 14 | Starting armadillo 15 | 4096: 48.355 16 | Generating matrices 17 | Starting armadillo 18 | 8192: 381.177 19 | Generating matrices 20 | Starting armadillo 21 | 16384: 3106.686 22 | -------------------------------------------------------------------------------- /measured/xeon-1.8GHz-20_2/armadillo.out: 
-------------------------------------------------------------------------------- 1 | Generating matrices 2 | Starting armadillo 3 | 256: 0.10 4 | Generating matrices 5 | Starting armadillo 6 | 512: 0.78 7 | Generating matrices 8 | Starting armadillo 9 | 1024: 0.623 10 | Generating matrices 11 | Starting armadillo 12 | 2048: 4.971 13 | Generating matrices 14 | Starting armadillo 15 | 4096: 42.107 16 | Generating matrices 17 | Starting armadillo 18 | 8192: 353.211 19 | Generating matrices 20 | Starting armadillo 21 | 16384: 2695.10 22 | -------------------------------------------------------------------------------- /presentation/z-order.bob: -------------------------------------------------------------------------------- 1 | +-----------+-----------+ 2 | | | | 3 | | | | 4 | | | | 5 | | 1 | 2 | 6 | | | | 7 | | | | 8 | | | | 9 | +-----------+-----------+ 10 | | | | 11 | | | | 12 | | | | 13 | | 3 | 4 | 14 | | | | 15 | | | | 16 | | | | 17 | +-----------+-----------+ 18 | -------------------------------------------------------------------------------- /presentation/z-order-2.bob: -------------------------------------------------------------------------------- 1 | +-----+-----+-----+-----+ 2 | | | | | | 3 | | 1 | 2 | 5 | 6 | 4 | | | | | | 5 | +-----+-----+-----+-----+ 6 | | | | | | 7 | | 3 | 4 | 7 | 8 | 8 | | | | | | 9 | +-----+-----+-----+-----+ 10 | | | | | | 11 | | 9 | 10 | 13 | 14 | 12 | | | | | | 13 | +-----+-----+-----+-----+ 14 | | | | | | 15 | | 11 | 12 | 15 | 16 | 16 | | | | | | 17 | +-----+-----+-----+-----+ 18 | -------------------------------------------------------------------------------- /presentation/simd.bob: -------------------------------------------------------------------------------- 1 | Scalar 2 | 3 | +---+ +---+ +---+ 4 | | | + | | --> | | 5 | +---+ +---+ +---+ 6 | 7 | +---+ +---+ +---+ 8 | | | + | | --> | | 9 | +---+ +---+ +---+ 10 | 11 | +---+ +---+ +---+ 12 | | | + | | --> | | 13 | +---+ +---+ +---+ 14 | 15 | SIMD 16 | 17 | +---+ +---+ +---+ 18 
| | | | | | | 19 | +---+ +---+ +---+ 20 | | | | | | | 21 | +---+ + +---+ --> +---+ 22 | | | | | | | 23 | +---+ +---+ +---+ 24 | | | | | | | 25 | +---+ +---+ +---+ 26 | -------------------------------------------------------------------------------- /scripts/measure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -ex 4 | 5 | export RUSTFLAGS='-C target-cpu=native' 6 | export CARGO_INCREMENTAL= 7 | 8 | run() { 9 | SIZE=$1 10 | CHEAP=$2 11 | 12 | if [ -f "$SIZE.out" ] ; then 13 | echo "$SIZE already exists" 14 | else 15 | echo "Running $SIZE" 16 | cargo run --release --bin manip -- generate $SIZE $SIZE a.in 17 | cargo run --release --bin manip -- generate $SIZE $SIZE b.in 18 | cargo run --release --bin measure -- $CHEAP a.in b.in | tee tmp.out 19 | mv tmp.out $SIZE.out 20 | fi 21 | } 22 | 23 | for i in 256 512 1024 2048 4096 ; do 24 | run $i 25 | done 26 | 27 | for i in 8192 16384 ; do 28 | run $i --cheap 29 | done 30 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fastmatmult" 3 | version = "0.1.0" 4 | authors = ["Michal 'vorner' Vaner "] 5 | license = "Apache-2.0/MIT" 6 | publish = false 7 | 8 | [dependencies] 9 | bincode = "~1" 10 | failure = "~0.1" 11 | faster = { git = "https://github.com/AdamNiederer/faster/" } 12 | itertools = "~0.7" 13 | rand = "~0.4" 14 | rayon = "~1" 15 | serde = "~1" 16 | serde_derive = "~1" 17 | smallvec = "~0.6" 18 | structopt = "~0.2" 19 | typenum = "~1" 20 | 21 | [profile.release] 22 | lto = true 23 | codegen-units = 1 24 | #panic = 'abort' # Doesn't seem to work right now :-( 25 | incremental = false 26 | overflow-checks = false 27 | debug-assertions = false 28 | opt-level = 3 29 | -------------------------------------------------------------------------------- /presentation/Makefile: 
-------------------------------------------------------------------------------- 1 | BOB := $(wildcard *.bob) 2 | SVG := $(patsubst %.bob,%.svg,$(BOB)) \ 3 | arm.svg buldozer.svg celeron.svg xeon.svg beast.svg 4 | 5 | all: $(SVG) 6 | 7 | %.svg: %.bob 8 | svgbob $< -o $@ 9 | 10 | arm.svg: ../measured/buldozer-4.2GHz-8/armadillo.out ../scripts/crunch.pl 11 | ../scripts/crunch.pl $< | gnuplot 12 | 13 | define graph 14 | $1.svg: $$(wildcard ../measured/$2/*.out) ../scripts/plot.pl 15 | cd ../measured/$2/ && ../../scripts/plot.pl | gnuplot 16 | mv ../measured/$2/graph.svg $$@ 17 | endef 18 | 19 | $(eval $(call graph,buldozer,buldozer-4.2GHz-8)) 20 | $(eval $(call graph,celeron,celeron-1.8GHz-4)) 21 | $(eval $(call graph,xeon,xeon-3.5GHz-4_2)) 22 | $(eval $(call graph,beast,xeon-1.8GHz-20_2)) 23 | 24 | .PHONY: all 25 | -------------------------------------------------------------------------------- /3rd-party-lib/armadillo.cpp: -------------------------------------------------------------------------------- 1 | #include <armadillo> 2 | #include <chrono> 3 | #include <iostream> 4 | 5 | int main() { 6 | 7 | for (size_t i = 8; i < 15; i ++) { 8 | std::cout << "Generating matrices" << std::endl; 9 | size_t size = 1 << i; 10 | arma::fmat a = arma::randu<arma::fmat>(size, size); 11 | arma::fmat b = arma::randu<arma::fmat>(size, size); 12 | 13 | std::cout << "Starting armadillo" << std::endl; 14 | std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); 15 | 16 | arma::fmat c = a * b; 17 | 18 | size_t millis = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count(); 19 | size_t secs = millis / 1000; 20 | millis %= 1000; 21 | 22 | std::cout << size << ": " << secs << "." << (millis < 100 ? "0" : "") << (millis < 10 ? "0" : "") 
<< millis << std::endl; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /scripts/crunch.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use common::sense; 3 | use Data::Dumper; 4 | 5 | my %data; 6 | 7 | for my $f (@ARGV) { 8 | open my $in, '<', $f or die "Couldn't read $f: $!\n"; 9 | 10 | for (<$in>) { 11 | if (/^(.*): (.*)$/) { 12 | $data{$1} = $2; 13 | } 14 | } 15 | } 16 | 17 | open my $out, '>', "arm.dat" or die "Couldn't write to arm.dat: $!\n"; 18 | for my $size (sort { $a <=> $b } keys %data) { 19 | print $out "$size\t$data{$size}\n"; 20 | } 21 | undef $out; 22 | 23 | $\ = ";\n"; 24 | print "set terminal svg size 400, 400 background rgb 'white'"; 25 | print "set output 'arm.svg'"; 26 | print "set log xyz"; 27 | print "set key right bottom"; 28 | print "set xlabel \"Side of the matrix\""; 29 | print "set ylabel \"Time (seconds)\""; 30 | 31 | print "plot 'arm.dat' title 'Armadillo' with linespoints lt 1 lc rgb 'red'"; 32 | -------------------------------------------------------------------------------- /presentation/template/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{title}} 6 | {{{style}}} 7 | 8 | 9 | 10 | 11 | 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /presentation/hier.bob: -------------------------------------------------------------------------------- 1 | +------+ +------+ +--------------+ +-----------+ +------------------------+ 2 | | Core +-+ L1 +-+ L2 +--+ | | | 3 | +------+ +------+ +--------------+ | | | | 4 | | | | | 5 | +------+ +------+ +--------------+ | | | | 6 | | Core +-+ L1 +-+ L2 +--+ L3 +--+ RAM | 7 | +------+ +------+ +--------------+ | | | | 8 | | | | | 9 | +------+ +------+ +--------------+ | | | | 10 | | Core +-+ L1 +-+ L2 +--+ | | | 11 | +------+ +------+ +--------------+ +-----------+ 
+------------------------+ 12 | -------------------------------------------------------------------------------- /presentation/recursion.bob: -------------------------------------------------------------------------------- 1 | Fits into cache Doesn't fit 2 | +------+ | 3 | | | +--------+ | 4 | | + | | | +-----------+ 5 | | |\ | | | | | 6 | +------+ \| | | | | 7 | + + | | | 8 | +------+ /| |\ | | | 9 | | |/ | | \| | | 10 | | + | | | | | 11 | | | +--------+ |\ | | 12 | +------+ | \| | 13 | | + | 14 | +------+ | /| | 15 | | | +--------+ |/ | | 16 | | + | | | | | 17 | | |\ | | /| | | 18 | +------+ \| |/ | | | 19 | + + | | | 20 | +------+ /| | | | | 21 | | |/ | | | | | 22 | | + | | | +-----------+ 23 | | | +--------+ | 24 | +------+ | 25 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 tokio-jsonrpc developers 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /measured/xeon-1.8GHz-20_2/8192.out: -------------------------------------------------------------------------------- 1 | simd: 188.199 2 | recursive-inner-paral-cutoff-64: 112.215 3 | recursive-paral-cutoff-64: 113.302 4 | recursive-inner-simd-paral-cutoff-64: 12.409 5 | recursive-simd-paral-cutoff-64: 13.522 6 | strassen-inner-64: 7.851 7 | strassen-64: 8.952 8 | recursive-inner-paral-cutoff-128: 120.109 9 | recursive-paral-cutoff-128: 121.250 10 | recursive-inner-simd-paral-cutoff-128: 10.354 11 | recursive-simd-paral-cutoff-128: 11.447 12 | strassen-inner-128: 7.091 13 | strassen-128: 8.137 14 | recursive-inner-paral-cutoff-256: 152.694 15 | recursive-paral-cutoff-256: 153.816 16 | recursive-inner-simd-paral-cutoff-256: 10.060 17 | recursive-simd-paral-cutoff-256: 11.112 18 | strassen-inner-256: 6.980 19 | strassen-256: 7.988 20 | recursive-inner-paral-cutoff-512: 175.066 21 | recursive-paral-cutoff-512: 176.118 22 | recursive-inner-simd-paral-cutoff-512: 9.121 23 | recursive-simd-paral-cutoff-512: 10.116 24 | strassen-inner-512: 8.777 25 | strassen-512: 9.666 26 | recursive-inner-paral-cutoff-1024: 210.018 27 | recursive-paral-cutoff-1024: 210.969 28 | recursive-inner-simd-paral-cutoff-1024: 36.410 29 | recursive-simd-paral-cutoff-1024: 37.415 30 | strassen-inner-1024: 52.313 31 | strassen-1024: 53.298 32 | -------------------------------------------------------------------------------- /measured/xeon-3.5GHz-4_2/8192.out: -------------------------------------------------------------------------------- 1 | simd: 126.505 2 | recursive-inner-paral-cutoff-64: 279.786 3 | recursive-paral-cutoff-64: 
280.346 4 | recursive-inner-simd-paral-cutoff-64: 28.851 5 | recursive-simd-paral-cutoff-64: 29.411 6 | strassen-inner-64: 15.220 7 | strassen-64: 15.831 8 | recursive-inner-paral-cutoff-128: 310.686 9 | recursive-paral-cutoff-128: 311.312 10 | recursive-inner-simd-paral-cutoff-128: 23.760 11 | recursive-simd-paral-cutoff-128: 24.332 12 | strassen-inner-128: 13.593 13 | strassen-128: 14.160 14 | recursive-inner-paral-cutoff-256: 412.515 15 | recursive-paral-cutoff-256: 413.076 16 | recursive-inner-simd-paral-cutoff-256: 24.261 17 | recursive-simd-paral-cutoff-256: 24.818 18 | strassen-inner-256: 14.458 19 | strassen-256: 15.021 20 | recursive-inner-paral-cutoff-512: 458.025 21 | recursive-paral-cutoff-512: 458.592 22 | recursive-inner-simd-paral-cutoff-512: 32.752 23 | recursive-simd-paral-cutoff-512: 33.325 24 | strassen-inner-512: 25.490 25 | strassen-512: 26.074 26 | recursive-inner-paral-cutoff-1024: 697.710 27 | recursive-paral-cutoff-1024: 698.320 28 | recursive-inner-simd-paral-cutoff-1024: 60.533 29 | recursive-simd-paral-cutoff-1024: 61.106 30 | strassen-inner-1024: 54.272 31 | strassen-1024: 54.880 32 | -------------------------------------------------------------------------------- /measured/buldozer-4.2GHz-8/8192.out: -------------------------------------------------------------------------------- 1 | simd: 349.456 2 | recursive-inner-paral-cutoff-64: 337.296 3 | recursive-paral-cutoff-64: 338.123 4 | recursive-inner-simd-paral-cutoff-64: 57.939 5 | recursive-simd-paral-cutoff-64: 58.784 6 | strassen-inner-64: 27.973 7 | strassen-64: 28.830 8 | recursive-inner-paral-cutoff-128: 358.494 9 | recursive-paral-cutoff-128: 359.329 10 | recursive-inner-simd-paral-cutoff-128: 50.062 11 | recursive-simd-paral-cutoff-128: 50.917 12 | strassen-inner-128: 27.005 13 | strassen-128: 27.851 14 | recursive-inner-paral-cutoff-256: 389.003 15 | recursive-paral-cutoff-256: 389.837 16 | recursive-inner-simd-paral-cutoff-256: 44.315 17 | recursive-simd-paral-cutoff-256: 
45.149 18 | strassen-inner-256: 25.915 19 | strassen-256: 26.750 20 | recursive-inner-paral-cutoff-512: 467.797 21 | recursive-paral-cutoff-512: 468.641 22 | recursive-inner-simd-paral-cutoff-512: 49.029 23 | recursive-simd-paral-cutoff-512: 49.863 24 | strassen-inner-512: 31.742 25 | strassen-512: 32.589 26 | recursive-inner-paral-cutoff-1024: 1263.565 27 | recursive-paral-cutoff-1024: 1264.374 28 | recursive-inner-simd-paral-cutoff-1024: 104.393 29 | recursive-simd-paral-cutoff-1024: 105.209 30 | strassen-inner-1024: 83.073 31 | strassen-1024: 83.890 32 | -------------------------------------------------------------------------------- /measured/xeon-1.8GHz-20_2/16384.out: -------------------------------------------------------------------------------- 1 | simd: 1488.711 2 | recursive-inner-paral-cutoff-64: 884.314 3 | recursive-paral-cutoff-64: 888.824 4 | recursive-inner-simd-paral-cutoff-64: 96.614 5 | recursive-simd-paral-cutoff-64: 101.251 6 | strassen-inner-64: 74.418 7 | strassen-64: 78.640 8 | recursive-inner-paral-cutoff-128: 958.728 9 | recursive-paral-cutoff-128: 963.366 10 | recursive-inner-simd-paral-cutoff-128: 80.841 11 | recursive-simd-paral-cutoff-128: 85.303 12 | strassen-inner-128: 48.692 13 | strassen-128: 52.883 14 | recursive-inner-paral-cutoff-256: 1210.017 15 | recursive-paral-cutoff-256: 1214.433 16 | recursive-inner-simd-paral-cutoff-256: 78.325 17 | recursive-simd-paral-cutoff-256: 82.494 18 | strassen-inner-256: 45.153 19 | strassen-256: 48.953 20 | recursive-inner-paral-cutoff-512: 1393.988 21 | recursive-paral-cutoff-512: 1398.223 22 | recursive-inner-simd-paral-cutoff-512: 74.919 23 | recursive-simd-paral-cutoff-512: 78.844 24 | strassen-inner-512: 62.292 25 | strassen-512: 66.107 26 | recursive-inner-paral-cutoff-1024: 1669.125 27 | recursive-paral-cutoff-1024: 1673.940 28 | recursive-inner-simd-paral-cutoff-1024: 380.593 29 | recursive-simd-paral-cutoff-1024: 384.641 30 | strassen-inner-1024: 454.792 31 | strassen-1024: 458.503 32 
| -------------------------------------------------------------------------------- /measured/buldozer-4.2GHz-8/16384.out: -------------------------------------------------------------------------------- 1 | simd: 2835.497 2 | recursive-inner-paral-cutoff-64: 2697.705 3 | recursive-paral-cutoff-64: 2701.215 4 | recursive-inner-simd-paral-cutoff-64: 459.779 5 | recursive-simd-paral-cutoff-64: 463.285 6 | strassen-inner-64: 197.714 7 | strassen-64: 201.374 8 | recursive-inner-paral-cutoff-128: 2860.568 9 | recursive-paral-cutoff-128: 2863.834 10 | recursive-inner-simd-paral-cutoff-128: 396.157 11 | recursive-simd-paral-cutoff-128: 399.079 12 | strassen-inner-128: 185.928 13 | strassen-128: 188.881 14 | recursive-inner-paral-cutoff-256: 2977.628 15 | recursive-paral-cutoff-256: 2980.794 16 | recursive-inner-simd-paral-cutoff-256: 349.393 17 | recursive-simd-paral-cutoff-256: 352.594 18 | strassen-inner-256: 178.627 19 | strassen-256: 181.609 20 | recursive-inner-paral-cutoff-512: 3186.518 21 | recursive-paral-cutoff-512: 3189.609 22 | recursive-inner-simd-paral-cutoff-512: 343.105 23 | recursive-simd-paral-cutoff-512: 346.008 24 | strassen-inner-512: 220.439 25 | strassen-512: 223.339 26 | recursive-inner-paral-cutoff-1024: 9695.138 27 | recursive-paral-cutoff-1024: 9698.073 28 | recursive-inner-simd-paral-cutoff-1024: 839.439 29 | recursive-simd-paral-cutoff-1024: 842.224 30 | strassen-inner-1024: 577.478 31 | strassen-1024: 580.251 32 | -------------------------------------------------------------------------------- /src/bin/manip.rs: -------------------------------------------------------------------------------- 1 | extern crate bincode; 2 | extern crate failure; 3 | extern crate fastmatmult; 4 | extern crate itertools; 5 | #[macro_use] 6 | extern crate structopt; 7 | 8 | use std::path::{Path, PathBuf}; 9 | use std::process; 10 | 11 | use failure::Error; 12 | use itertools::Itertools; 13 | use structopt::StructOpt; 14 | 15 | use fastmatmult::simple::Matrix; 16 | 17 
| #[derive(Debug, StructOpt)] 18 | enum Command { 19 | #[structopt(name = "generate")] 20 | Generate { 21 | width: usize, 22 | height: usize, 23 | #[structopt(parse(from_os_str))] 24 | file: PathBuf, 25 | }, 26 | #[structopt(name = "show")] 27 | Show { 28 | #[structopt(parse(from_os_str))] 29 | file: PathBuf, 30 | }, 31 | } 32 | 33 | #[derive(Debug, StructOpt)] 34 | struct Opts { 35 | #[structopt(subcommand)] 36 | command: Command, 37 | } 38 | 39 | fn generate(width: usize, height: usize, file: &Path) -> Result<(), Error> { 40 | let matrix = Matrix::random(width, height); 41 | matrix.store(file)?; 42 | Ok(()) 43 | } 44 | 45 | fn show(file: &Path) -> Result<(), Error> { 46 | let matrix = Matrix::load(file)?; 47 | for row in matrix.rows() { 48 | println!("{:.3}", row.iter().format(" ")); 49 | } 50 | Ok(()) 51 | } 52 | 53 | fn main() { 54 | let opts = Opts::from_args(); 55 | let result = match opts.command { 56 | Command::Generate { width, height, file } => generate(width, height, &file), 57 | Command::Show { file } => show(&file), 58 | }; 59 | if let Err(e) = result { 60 | eprintln!("{}", e); 61 | process::exit(1); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /scripts/plot.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use common::sense; 3 | use Data::Dumper; 4 | 5 | my %data; 6 | 7 | for my $f (glob '*.out') { 8 | open my $in, '<', $f or die "Couldn't read '$f': $!\n"; 9 | 10 | my ($fbase) = ($f =~ /([^.]*)/); 11 | 12 | for (<$in>) { 13 | chomp; 14 | if (/^(.*): (.*)$/) { 15 | my ($size, $name) = ($fbase, $1); 16 | my $t = $2; 17 | if ($size eq 'armadillo') { 18 | # The armadillo is in a separate file with sizes instead of names 19 | ($size, $name) = ($name, $size); 20 | } 21 | $data{$name}->{$size} = $t; 22 | } 23 | } 24 | } 25 | 26 | my @colors = qw(red blue black orchid green brown purple olivegreen orange #83ffd5 #007f00 #8a0000); 27 | my $cnum; 
28 | 29 | $\ = ";\n"; 30 | print "set terminal svg size 700, 450 background rgb 'white'"; 31 | print "set output 'graph.svg'"; 32 | print "set log xyz"; 33 | print "set key right bottom"; 34 | print "set xlabel \"Side of the matrix\""; 35 | print "set ylabel \"Time (seconds)\""; 36 | 37 | while (my ($algo, $data) = each %data) { 38 | open my $out, '>', "$algo.dat" or die "Couldn't write $algo.dat: $!\n"; 39 | my $data = $data{$algo}; 40 | for my $size (sort { $a <=> $b } keys %$data) { 41 | print $out "$size\t$data->{$size}\n"; 42 | } 43 | } 44 | 45 | sub conv($) { 46 | my ($name) = @_; 47 | $name =~ s/_/\\_/g; 48 | return $name; 49 | } 50 | 51 | my %algos = ( 52 | Armadillo => 'armadillo', 53 | Simple => 'simple', 54 | SIMD => 'simd', 55 | Recursive => 'recursive-16', 56 | Parallel => 'recursive-paral-cutoff-16', 57 | Combined => 'recursive-simd-paral-cutoff-256', 58 | Strassen => 'strassen-256', 59 | ); 60 | 61 | print "plot " . join ', ', (map "'$algos{$_}.dat' title '".conv($_)."' with linespoints lt 1 lc rgb \"".$colors[$cnum ++ % scalar @colors]."\"", sort keys %algos); 62 | -------------------------------------------------------------------------------- /presentation/numa.bob: -------------------------------------------------------------------------------- 1 | +------+ +------+ +--------------+ +-----------+ +------------------------+ 2 | | Core +-+ L1 +-+ L2 +--+ | | | 3 | +------+ +------+ +--------------+ | | | | 4 | | | + | | 5 | +------+ +------+ +--------------+ | | / \ | | 6 | | Core +-+ L1 +-+ L2 +--+ L3 +-+ +-+ RAM | 7 | +------+ +------+ +--------------+ | | \ / | | 8 | | | + | | 9 | +------+ +------+ +--------------+ | | | | | 10 | | Core +-+ L1 +-+ L2 +--+ | | | | 11 | +------+ +------+ +--------------+ +-----------+ | +------------------------+ 12 | | 13 | +------+ +------+ +--------------+ +-----------+ | +------------------------+ 14 | | Core +-+ L1 +-+ L2 +--+ | | | | 15 | +------+ +------+ +--------------+ | | | | | 16 | | | + | | 17 | 
+------+ +------+ +--------------+ | | / \ | | 18 | | Core +-+ L1 +-+ L2 +--+ L3 +-+ +-+ RAM | 19 | +------+ +------+ +--------------+ | | \ / | | 20 | | | + | | 21 | +------+ +------+ +--------------+ | | | | 22 | | Core +-+ L1 +-+ L2 +--+ | | | 23 | +------+ +------+ +--------------+ +-----------+ +------------------------+ 24 | -------------------------------------------------------------------------------- /src/bin/strass.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate failure; 4 | extern crate fastmatmult; 5 | #[macro_use] 6 | extern crate structopt; 7 | extern crate test; 8 | extern crate typenum; 9 | 10 | use std::fmt::Display; 11 | use std::path::PathBuf; 12 | use std::process; 13 | use std::time::Instant; 14 | 15 | use failure::Error; 16 | use structopt::StructOpt; 17 | use typenum::U256; 18 | 19 | use fastmatmult::simple::Matrix; 20 | use fastmatmult::znot::{Matrix as ZMat, RayonDistribute, SimdMultiplyAdd}; 21 | 22 | #[derive(Debug, StructOpt)] 23 | struct Opts { 24 | #[structopt(parse(from_os_str))] 25 | input1: PathBuf, 26 | #[structopt(parse(from_os_str))] 27 | input2: PathBuf, 28 | /// Skip over some expensive computations. 29 | /// 30 | /// This is to be able to measure somewhat larger inputs, so skipping the really slow ones 31 | /// helps. 32 | #[structopt(short = "c", long = "cheap")] 33 | cheap: bool, 34 | 35 | /// Run only the simple multiplication. 
36 | #[structopt(short = "s", long = "simple-only")] 37 | simple_only: bool, 38 | } 39 | 40 | fn measure<N: Display, R, F: FnOnce() -> R>(name: N, f: F) -> R { 41 | let start = Instant::now(); 42 | let result = test::black_box(f()); 43 | let stop = Instant::now(); 44 | let elapsed = stop - start; 45 | println!("{}: {}.{:03}", name, elapsed.as_secs(), elapsed.subsec_nanos() / 1_000_000); 46 | result 47 | } 48 | 49 | fn run() -> Result<(), Error> { 50 | let opts = Opts::from_args(); 51 | let m1 = Matrix::load(&opts.input1)?; 52 | let m2 = Matrix::load(&opts.input2)?; 53 | 54 | measure("strassen-256", || { 55 | let a_z = ZMat::<U256>::from(&m1); 56 | let b_z = ZMat::<U256>::from(&m2); 57 | let r_z = fastmatmult::znot::strassen::<_, RayonDistribute, SimdMultiplyAdd>(&a_z, &b_z); 58 | Matrix::from(&r_z) 59 | }); 60 | 61 | Ok(()) 62 | } 63 | 64 | fn main() { 65 | if let Err(e) = run() { 66 | eprintln!("{}", e); 67 | process::exit(1); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /presentation/cache-matrix.bob: -------------------------------------------------------------------------------- 1 | +--------+--------+--------+--------+--------+ +--------+--------+--------+--------+--------+ 2 | |* * * * |* * * * |* * * * |* * * * |* * * * | | | * | | | | 3 | +--------+--------+--------+--------+--------+ +--------+--------+--------+--------+--------+ 4 | | | | | | | | | * | | | | 5 | +--------+--------+--------+--------+--------+ +--------+--------+--------+--------+--------+ 6 | | | | | | | | | * | | | | 7 | +--------+--------+--------+--------+--------+ +--------+--------+--------+--------+--------+ 8 | | | | | | | | | * | | | | 9 | +--------+--------+--------+--------+--------+ +--------+--------+--------+--------+--------+ 10 | | | | | | | | | * | | | | 11 | +--------+--------+--------+--------+--------+ +--------+--------+--------+--------+--------+ 12 | | | | | | | | | * | | | | 13 | 
+--------+--------+--------+--------+--------+ 14 | | | | | | | | | * | | | | 15 | +--------+--------+--------+--------+--------+ +--------+--------+--------+--------+--------+ 16 | | | | | | | | | * | | | | 17 | +--------+--------+--------+--------+--------+ +--------+--------+--------+--------+--------+ 18 | | | | | | | | | * | | | | 19 | +--------+--------+--------+--------+--------+ +--------+--------+--------+--------+--------+ 20 | | | | | | | | | * | | | | 21 | +--------+--------+--------+--------+--------+ +--------+--------+--------+--------+--------+ 22 | | | | | | | | | * | | | | 23 | +--------+--------+--------+--------+--------+ +--------+--------+--------+--------+--------+ 24 | | | | | | | | | * | | | | 25 | +--------+--------+--------+--------+--------+ +--------+--------+--------+--------+--------+ 26 | -------------------------------------------------------------------------------- /src/simd.rs: -------------------------------------------------------------------------------- 1 | use std::iter; 2 | 3 | use faster::*; 4 | use smallvec::SmallVec; 5 | 6 | use super::simple::{Matrix, Slice, SliceMut}; 7 | /* multiply_add: accumulates the product of a and b into the output slice; every cell is updated with +=, so existing contents of into are kept and added to. Panics (assert_eq!) unless a.width == b.height, a.height == into.height and b.width == into.width. NOTE(review): the turbofish type arguments look truncated in this copy (collect::>() and stride_into::>) - confirm the collection types against the original source. */ 8 | pub(crate) fn multiply_add(into: &mut SliceMut, a: &Slice, b: &Slice) { 9 | assert_eq!(a.width, b.height); 10 | assert_eq!(a.height, into.height); 11 | assert_eq!(b.width, into.width); 12 | 13 | let h = into.height; 14 | let l = a.width; 15 | /* Set up a strided SIMD iterator over b.content (stride b.width, named columns here) plus a scratch buffer of b.height scalars that each column is copied into before use. */ 16 | let pads = iter::repeat(f32s(0.)) 17 | .take(b.width) 18 | .collect::>(); 19 | let columns = b.content 20 | .simd_iter(f32s(0.)); 21 | let columns = columns 22 | .stride_into::>(b.width, &pads); 23 | let mut column_data = iter::repeat(0.0) 24 | .take(b.height) 25 | .collect::>(); 26 | 27 | for (x, mut column) in columns.into_iter().enumerate() { 28 | column.scalar_fill(&mut column_data); 29 | for y in 0..h { /* Row y of a is read as a.content[y * l .. (y + 1) * l], i.e. rows of length a.width stored back to back - presumably Slice is always densely packed; TODO confirm. */ 30 | let row = &a.content[y * l ..
(y + 1) * l]; /* SIMD dot product of that row with the copied column, added onto the existing cell at (x, y). */ 31 | into[(x, y)] += (row.simd_iter(f32s(0.)), column_data.simd_iter(f32s(0.))).zip() 32 | .simd_reduce(f32s(0.0), |acc, (a, b)| acc + a * b) 33 | .sum(); 34 | } 35 | } 36 | } 37 | /* multiply: returns the product of a and b as a new Matrix created with Matrix::sized(b.width(), a.height()) and filled by multiply_add. Because multiply_add only adds, this presumably relies on Matrix::sized returning a zeroed matrix - TODO confirm. */ 38 | pub fn multiply(a: &Matrix, b: &Matrix) -> Matrix { 39 | let mut result = Matrix::sized(b.width(), a.height()); 40 | 41 | multiply_add(&mut result.slice_mut(), &a.slice(), &b.slice()); 42 | 43 | result 44 | } 45 | /* Unit tests: compare this module's multiply against simple::multiply and against the identity matrix, using the rounded comparison in approx_eq. */ 46 | #[cfg(test)] 47 | mod tests { 48 | use super::*; 49 | 50 | use ::simple::{self, Matrix}; 51 | 52 | /* 53 | * By using SIMD vectors to sum many at once, we reorder the additions on floats. It so happens 54 | * this changes the result somewhat, so we put a margin there. 55 | */ /* approx_eq: divides every element of both matrices by 20.0 and rounds it, then requires exact equality. NOTE(review): two values that are close but fall on opposite sides of a rounding boundary would still compare unequal. */ 56 | fn approx_eq(mut a: Matrix, mut b: Matrix) { 57 | for val in a.slice_mut().content { 58 | *val = (*val / 20.0).round(); 59 | } 60 | for val in b.slice_mut().content { 61 | *val = (*val / 20.0).round(); 62 | } 63 | 64 | assert_eq!(a, b); 65 | } 66 | /* Square random matrices with side 1, 2, 4, ... 64 (1 << shift for shift in 0..7). */ 67 | #[test] 68 | fn test_multi() { 69 | for shift in 0..7 { 70 | let s = 1 << shift; 71 | let a = Matrix::random(s, s); 72 | let b = Matrix::random(s, s); 73 | let expected = simple::multiply(&a, &b); 74 | let result = multiply(&a, &b); 75 | approx_eq(expected, result); 76 | } 77 | } 78 | /* Identity times identity must give the identity, for sizes 1, 2 and 3. */ 79 | #[test] 80 | fn id() { 81 | for size in 1..4 { 82 | let id = Matrix::identity(size); 83 | let result = multiply(&id, &id); 84 | approx_eq(result, id); 85 | } 86 | } 87 | /* Non-square operands: random(2, 3) times random(3, 2), checked against simple::multiply. */ 88 | #[test] 89 | fn rect() { 90 | let a = Matrix::random(2, 3); 91 | let b = Matrix::random(3, 2); 92 | let result = multiply(&a, &b); 93 | let expected = simple::multiply(&a, &b); 94 | approx_eq(expected, result); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /measured/buldozer-4.2GHz-8/256.out: -------------------------------------------------------------------------------- 1 | simple: 0.077 2 | colcp: 0.067 3 | simd: 0.007 4 | recursive-inner-1: 0.347 5 | recursive-1: 0.348 6 | recursive-inner-paral-1: 0.234 7 |
recursive-paral-1: 0.235 8 | recursive-inner-paral-cutoff-1: 0.129 9 | recursive-paral-cutoff-1: 0.130 10 | recursive-inner-2: 0.104 11 | recursive-2: 0.105 12 | recursive-inner-paral-2: 0.041 13 | recursive-paral-2: 0.042 14 | recursive-inner-paral-cutoff-2: 0.033 15 | recursive-paral-cutoff-2: 0.034 16 | recursive-inner-4: 0.066 17 | recursive-4: 0.067 18 | recursive-inner-paral-4: 0.014 19 | recursive-paral-4: 0.015 20 | recursive-inner-paral-cutoff-4: 0.018 21 | recursive-paral-cutoff-4: 0.018 22 | recursive-inner-simd-4: 0.892 23 | recursive-simd-4: 0.892 24 | recursive-inner-simd-paral-4: 0.219 25 | recursive-simd-paral-4: 0.219 26 | recursive-inner-simd-paral-cutoff-4: 0.236 27 | recursive-simd-paral-cutoff-4: 0.237 28 | recursive-inner-8: 0.062 29 | recursive-8: 0.063 30 | recursive-inner-paral-8: 0.011 31 | recursive-paral-8: 0.011 32 | recursive-inner-paral-cutoff-8: 0.017 33 | recursive-paral-cutoff-8: 0.017 34 | recursive-inner-simd-8: 0.132 35 | recursive-simd-8: 0.132 36 | recursive-inner-simd-paral-8: 0.030 37 | recursive-simd-paral-8: 0.031 38 | recursive-inner-simd-paral-cutoff-8: 0.035 39 | recursive-simd-paral-cutoff-8: 0.035 40 | recursive-inner-16: 0.066 41 | recursive-16: 0.066 42 | recursive-inner-paral-16: 0.009 43 | recursive-paral-16: 0.009 44 | recursive-inner-paral-cutoff-16: 0.016 45 | recursive-paral-cutoff-16: 0.017 46 | recursive-inner-simd-16: 0.030 47 | recursive-simd-16: 0.031 48 | recursive-inner-simd-paral-16: 0.006 49 | recursive-simd-paral-16: 0.007 50 | recursive-inner-simd-paral-cutoff-16: 0.010 51 | recursive-simd-paral-cutoff-16: 0.010 52 | recursive-inner-32: 0.072 53 | recursive-32: 0.072 54 | recursive-inner-paral-32: 0.010 55 | recursive-paral-32: 0.010 56 | recursive-inner-paral-cutoff-32: 0.018 57 | recursive-paral-cutoff-32: 0.018 58 | recursive-inner-simd-32: 0.013 59 | recursive-simd-32: 0.014 60 | recursive-inner-simd-paral-32: 0.002 61 | recursive-simd-paral-32: 0.003 62 | recursive-inner-simd-paral-cutoff-32: 
0.004 63 | recursive-simd-paral-cutoff-32: 0.005 64 | strassen-inner-32: 0.002 65 | strassen-32: 0.003 66 | recursive-inner-64: 0.071 67 | recursive-64: 0.072 68 | recursive-inner-paral-64: 0.012 69 | recursive-paral-64: 0.013 70 | recursive-inner-paral-cutoff-64: 0.019 71 | recursive-paral-cutoff-64: 0.019 72 | recursive-inner-simd-64: 0.009 73 | recursive-simd-64: 0.010 74 | recursive-inner-simd-paral-64: 0.001 75 | recursive-simd-paral-64: 0.002 76 | recursive-inner-simd-paral-cutoff-64: 0.003 77 | recursive-simd-paral-cutoff-64: 0.004 78 | strassen-inner-64: 0.001 79 | strassen-64: 0.002 80 | recursive-inner-128: 0.070 81 | recursive-128: 0.071 82 | recursive-inner-paral-128: 0.018 83 | recursive-paral-128: 0.018 84 | recursive-inner-paral-cutoff-128: 0.018 85 | recursive-paral-cutoff-128: 0.018 86 | recursive-inner-simd-128: 0.008 87 | recursive-simd-128: 0.009 88 | recursive-inner-simd-paral-128: 0.003 89 | recursive-simd-paral-128: 0.003 90 | recursive-inner-simd-paral-cutoff-128: 0.003 91 | recursive-simd-paral-cutoff-128: 0.003 92 | strassen-inner-128: 0.002 93 | strassen-128: 0.002 94 | recursive-inner-256: 0.078 95 | recursive-256: 0.079 96 | recursive-inner-paral-256: 0.080 97 | recursive-paral-256: 0.081 98 | recursive-inner-paral-cutoff-256: 0.082 99 | recursive-paral-cutoff-256: 0.083 100 | recursive-inner-simd-256: 0.007 101 | recursive-simd-256: 0.008 102 | recursive-inner-simd-paral-256: 0.007 103 | recursive-simd-paral-256: 0.007 104 | recursive-inner-simd-paral-cutoff-256: 0.007 105 | recursive-simd-paral-cutoff-256: 0.007 106 | strassen-inner-256: 0.007 107 | strassen-256: 0.008 108 | -------------------------------------------------------------------------------- /measured/celeron-1.8GHz-4/256.out: -------------------------------------------------------------------------------- 1 | simple: 0.187 2 | colcp: 0.074 3 | simd: 0.020 4 | recursive-inner-1: 0.822 5 | recursive-1: 0.827 6 | recursive-inner-paral-1: 0.395 7 | recursive-paral-1: 0.399 8 
| recursive-inner-paral-cutoff-1: 0.244 9 | recursive-paral-cutoff-1: 0.249 10 | recursive-inner-2: 0.280 11 | recursive-2: 0.284 12 | recursive-inner-paral-2: 0.107 13 | recursive-paral-2: 0.110 14 | recursive-inner-paral-cutoff-2: 0.077 15 | recursive-paral-cutoff-2: 0.080 16 | recursive-inner-4: 0.193 17 | recursive-4: 0.195 18 | recursive-inner-paral-4: 0.055 19 | recursive-paral-4: 0.057 20 | recursive-inner-paral-cutoff-4: 0.054 21 | recursive-paral-cutoff-4: 0.056 22 | recursive-inner-simd-4: 2.027 23 | recursive-simd-4: 2.029 24 | recursive-inner-simd-paral-4: 0.623 25 | recursive-simd-paral-4: 0.625 26 | recursive-inner-simd-paral-cutoff-4: 0.618 27 | recursive-simd-paral-cutoff-4: 0.621 28 | recursive-inner-8: 0.170 29 | recursive-8: 0.173 30 | recursive-inner-paral-8: 0.054 31 | recursive-paral-8: 0.056 32 | recursive-inner-paral-cutoff-8: 0.055 33 | recursive-paral-cutoff-8: 0.057 34 | recursive-inner-simd-8: 0.320 35 | recursive-simd-8: 0.323 36 | recursive-inner-simd-paral-8: 0.096 37 | recursive-simd-paral-8: 0.098 38 | recursive-inner-simd-paral-cutoff-8: 0.098 39 | recursive-simd-paral-cutoff-8: 0.100 40 | recursive-inner-16: 0.156 41 | recursive-16: 0.158 42 | recursive-inner-paral-16: 0.044 43 | recursive-paral-16: 0.046 44 | recursive-inner-paral-cutoff-16: 0.045 45 | recursive-paral-cutoff-16: 0.047 46 | recursive-inner-simd-16: 0.075 47 | recursive-simd-16: 0.077 48 | recursive-inner-simd-paral-16: 0.025 49 | recursive-simd-paral-16: 0.026 50 | recursive-inner-simd-paral-cutoff-16: 0.024 51 | recursive-simd-paral-cutoff-16: 0.026 52 | recursive-inner-32: 0.146 53 | recursive-32: 0.148 54 | recursive-inner-paral-32: 0.041 55 | recursive-paral-32: 0.043 56 | recursive-inner-paral-cutoff-32: 0.042 57 | recursive-paral-cutoff-32: 0.044 58 | recursive-inner-simd-32: 0.034 59 | recursive-simd-32: 0.036 60 | recursive-inner-simd-paral-32: 0.012 61 | recursive-simd-paral-32: 0.014 62 | recursive-inner-simd-paral-cutoff-32: 0.014 63 | 
recursive-simd-paral-cutoff-32: 0.016 64 | strassen-inner-32: 0.008 65 | strassen-32: 0.011 66 | recursive-inner-64: 0.149 67 | recursive-64: 0.151 68 | recursive-inner-paral-64: 0.043 69 | recursive-paral-64: 0.044 70 | recursive-inner-paral-cutoff-64: 0.042 71 | recursive-paral-cutoff-64: 0.044 72 | recursive-inner-simd-64: 0.030 73 | recursive-simd-64: 0.032 74 | recursive-inner-simd-paral-64: 0.012 75 | recursive-simd-paral-64: 0.013 76 | recursive-inner-simd-paral-cutoff-64: 0.012 77 | recursive-simd-paral-cutoff-64: 0.014 78 | strassen-inner-64: 0.008 79 | strassen-64: 0.010 80 | recursive-inner-128: 0.177 81 | recursive-128: 0.179 82 | recursive-inner-paral-128: 0.058 83 | recursive-paral-128: 0.059 84 | recursive-inner-paral-cutoff-128: 0.052 85 | recursive-paral-cutoff-128: 0.055 86 | recursive-inner-simd-128: 0.025 87 | recursive-simd-128: 0.027 88 | recursive-inner-simd-paral-128: 0.009 89 | recursive-simd-paral-128: 0.010 90 | recursive-inner-simd-paral-cutoff-128: 0.007 91 | recursive-simd-paral-cutoff-128: 0.009 92 | strassen-inner-128: 0.008 93 | strassen-128: 0.010 94 | recursive-inner-256: 0.188 95 | recursive-256: 0.190 96 | recursive-inner-paral-256: 0.186 97 | recursive-paral-256: 0.188 98 | recursive-inner-paral-cutoff-256: 0.185 99 | recursive-paral-cutoff-256: 0.186 100 | recursive-inner-simd-256: 0.020 101 | recursive-simd-256: 0.022 102 | recursive-inner-simd-paral-256: 0.020 103 | recursive-simd-paral-256: 0.022 104 | recursive-inner-simd-paral-cutoff-256: 0.020 105 | recursive-simd-paral-cutoff-256: 0.022 106 | strassen-inner-256: 0.021 107 | strassen-256: 0.022 108 | -------------------------------------------------------------------------------- /measured/xeon-1.8GHz-20_2/256.out: -------------------------------------------------------------------------------- 1 | simple: 0.065 2 | colcp: 0.052 3 | simd: 0.004 4 | recursive-inner-1: 0.254 5 | recursive-1: 0.255 6 | recursive-inner-paral-1: 1.199 7 | recursive-paral-1: 1.202 8 | 
recursive-inner-paral-cutoff-1: 0.080 9 | recursive-paral-cutoff-1: 0.084 10 | recursive-inner-2: 0.071 11 | recursive-2: 0.072 12 | recursive-inner-paral-2: 0.183 13 | recursive-paral-2: 0.184 14 | recursive-inner-paral-cutoff-2: 0.023 15 | recursive-paral-cutoff-2: 0.025 16 | recursive-inner-4: 0.047 17 | recursive-4: 0.048 18 | recursive-inner-paral-4: 0.038 19 | recursive-paral-4: 0.039 20 | recursive-inner-paral-cutoff-4: 0.025 21 | recursive-paral-cutoff-4: 0.026 22 | recursive-inner-simd-4: 1.257 23 | recursive-simd-4: 1.258 24 | recursive-inner-simd-paral-4: 0.102 25 | recursive-simd-paral-4: 0.102 26 | recursive-inner-simd-paral-cutoff-4: 0.348 27 | recursive-simd-paral-cutoff-4: 0.349 28 | recursive-inner-8: 0.047 29 | recursive-8: 0.048 30 | recursive-inner-paral-8: 0.008 31 | recursive-paral-8: 0.009 32 | recursive-inner-paral-cutoff-8: 0.014 33 | recursive-paral-cutoff-8: 0.015 34 | recursive-inner-simd-8: 0.157 35 | recursive-simd-8: 0.157 36 | recursive-inner-simd-paral-8: 0.014 37 | recursive-simd-paral-8: 0.015 38 | recursive-inner-simd-paral-cutoff-8: 0.045 39 | recursive-simd-paral-cutoff-8: 0.046 40 | recursive-inner-16: 0.047 41 | recursive-16: 0.048 42 | recursive-inner-paral-16: 0.005 43 | recursive-paral-16: 0.005 44 | recursive-inner-paral-cutoff-16: 0.016 45 | recursive-paral-cutoff-16: 0.017 46 | recursive-inner-simd-16: 0.027 47 | recursive-simd-16: 0.028 48 | recursive-inner-simd-paral-16: 0.003 49 | recursive-simd-paral-16: 0.003 50 | recursive-inner-simd-paral-cutoff-16: 0.009 51 | recursive-simd-paral-cutoff-16: 0.010 52 | recursive-inner-32: 0.052 53 | recursive-32: 0.052 54 | recursive-inner-paral-32: 0.006 55 | recursive-paral-32: 0.007 56 | recursive-inner-paral-cutoff-32: 0.017 57 | recursive-paral-cutoff-32: 0.018 58 | recursive-inner-simd-32: 0.011 59 | recursive-simd-32: 0.011 60 | recursive-inner-simd-paral-32: 0.001 61 | recursive-simd-paral-32: 0.002 62 | recursive-inner-simd-paral-cutoff-32: 0.005 63 | 
recursive-simd-paral-cutoff-32: 0.006 64 | strassen-inner-32: 0.002 65 | strassen-32: 0.003 66 | recursive-inner-64: 0.053 67 | recursive-64: 0.054 68 | recursive-inner-paral-64: 0.011 69 | recursive-paral-64: 0.012 70 | recursive-inner-paral-cutoff-64: 0.025 71 | recursive-paral-cutoff-64: 0.026 72 | recursive-inner-simd-64: 0.007 73 | recursive-simd-64: 0.007 74 | recursive-inner-simd-paral-64: 0.001 75 | recursive-simd-paral-64: 0.002 76 | recursive-inner-simd-paral-cutoff-64: 0.003 77 | recursive-simd-paral-cutoff-64: 0.004 78 | strassen-inner-64: 0.002 79 | strassen-64: 0.003 80 | recursive-inner-128: 0.054 81 | recursive-128: 0.055 82 | recursive-inner-paral-128: 0.028 83 | recursive-paral-128: 0.029 84 | recursive-inner-paral-cutoff-128: 0.018 85 | recursive-paral-cutoff-128: 0.019 86 | recursive-inner-simd-128: 0.006 87 | recursive-simd-128: 0.007 88 | recursive-inner-simd-paral-128: 0.003 89 | recursive-simd-paral-128: 0.004 90 | recursive-inner-simd-paral-cutoff-128: 0.002 91 | recursive-simd-paral-cutoff-128: 0.003 92 | strassen-inner-128: 0.003 93 | strassen-128: 0.005 94 | recursive-inner-256: 0.069 95 | recursive-256: 0.070 96 | recursive-inner-paral-256: 0.069 97 | recursive-paral-256: 0.069 98 | recursive-inner-paral-cutoff-256: 0.067 99 | recursive-paral-cutoff-256: 0.067 100 | recursive-inner-simd-256: 0.003 101 | recursive-simd-256: 0.004 102 | recursive-inner-simd-paral-256: 0.004 103 | recursive-simd-paral-256: 0.004 104 | recursive-inner-simd-paral-cutoff-256: 0.004 105 | recursive-simd-paral-cutoff-256: 0.004 106 | strassen-inner-256: 0.004 107 | strassen-256: 0.004 108 | -------------------------------------------------------------------------------- /measured/xeon-3.5GHz-4_2/256.out: -------------------------------------------------------------------------------- 1 | simple: 0.042 2 | colcp: 0.039 3 | simd: 0.002 4 | recursive-inner-1: 0.179 5 | recursive-1: 0.179 6 | recursive-inner-paral-1: 0.135 7 | recursive-paral-1: 0.136 8 | 
recursive-inner-paral-cutoff-1: 0.067 9 | recursive-paral-cutoff-1: 0.067 10 | recursive-inner-2: 0.052 11 | recursive-2: 0.053 12 | recursive-inner-paral-2: 0.028 13 | recursive-paral-2: 0.028 14 | recursive-inner-paral-cutoff-2: 0.026 15 | recursive-paral-cutoff-2: 0.027 16 | recursive-inner-4: 0.036 17 | recursive-4: 0.036 18 | recursive-inner-paral-4: 0.011 19 | recursive-paral-4: 0.011 20 | recursive-inner-paral-cutoff-4: 0.018 21 | recursive-paral-cutoff-4: 0.018 22 | recursive-inner-simd-4: 0.681 23 | recursive-simd-4: 0.681 24 | recursive-inner-simd-paral-4: 0.161 25 | recursive-simd-paral-4: 0.161 26 | recursive-inner-simd-paral-cutoff-4: 0.265 27 | recursive-simd-paral-cutoff-4: 0.265 28 | recursive-inner-8: 0.036 29 | recursive-8: 0.036 30 | recursive-inner-paral-8: 0.010 31 | recursive-paral-8: 0.010 32 | recursive-inner-paral-cutoff-8: 0.016 33 | recursive-paral-cutoff-8: 0.016 34 | recursive-inner-simd-8: 0.094 35 | recursive-simd-8: 0.095 36 | recursive-inner-simd-paral-8: 0.027 37 | recursive-simd-paral-8: 0.028 38 | recursive-inner-simd-paral-cutoff-8: 0.034 39 | recursive-simd-paral-cutoff-8: 0.035 40 | recursive-inner-16: 0.042 41 | recursive-16: 0.042 42 | recursive-inner-paral-16: 0.012 43 | recursive-paral-16: 0.012 44 | recursive-inner-paral-cutoff-16: 0.016 45 | recursive-paral-cutoff-16: 0.016 46 | recursive-inner-simd-16: 0.015 47 | recursive-simd-16: 0.015 48 | recursive-inner-simd-paral-16: 0.006 49 | recursive-simd-paral-16: 0.006 50 | recursive-inner-simd-paral-cutoff-16: 0.006 51 | recursive-simd-paral-cutoff-16: 0.006 52 | recursive-inner-32: 0.040 53 | recursive-32: 0.040 54 | recursive-inner-paral-32: 0.010 55 | recursive-paral-32: 0.011 56 | recursive-inner-paral-cutoff-32: 0.017 57 | recursive-paral-cutoff-32: 0.017 58 | recursive-inner-simd-32: 0.005 59 | recursive-simd-32: 0.006 60 | recursive-inner-simd-paral-32: 0.001 61 | recursive-simd-paral-32: 0.002 62 | recursive-inner-simd-paral-cutoff-32: 0.002 63 | 
recursive-simd-paral-cutoff-32: 0.003 64 | strassen-inner-32: 0.001 65 | strassen-32: 0.002 66 | recursive-inner-64: 0.040 67 | recursive-64: 0.041 68 | recursive-inner-paral-64: 0.011 69 | recursive-paral-64: 0.011 70 | recursive-inner-paral-cutoff-64: 0.016 71 | recursive-paral-cutoff-64: 0.017 72 | recursive-inner-simd-64: 0.003 73 | recursive-simd-64: 0.003 74 | recursive-inner-simd-paral-64: 0.000 75 | recursive-simd-paral-64: 0.001 76 | recursive-inner-simd-paral-cutoff-64: 0.001 77 | recursive-simd-paral-cutoff-64: 0.002 78 | strassen-inner-64: 0.000 79 | strassen-64: 0.001 80 | recursive-inner-128: 0.041 81 | recursive-128: 0.042 82 | recursive-inner-paral-128: 0.017 83 | recursive-paral-128: 0.017 84 | recursive-inner-paral-cutoff-128: 0.018 85 | recursive-paral-cutoff-128: 0.019 86 | recursive-inner-simd-128: 0.002 87 | recursive-simd-128: 0.003 88 | recursive-inner-simd-paral-128: 0.001 89 | recursive-simd-paral-128: 0.001 90 | recursive-inner-simd-paral-cutoff-128: 0.001 91 | recursive-simd-paral-cutoff-128: 0.001 92 | strassen-inner-128: 0.001 93 | strassen-128: 0.001 94 | recursive-inner-256: 0.047 95 | recursive-256: 0.047 96 | recursive-inner-paral-256: 0.047 97 | recursive-paral-256: 0.047 98 | recursive-inner-paral-cutoff-256: 0.046 99 | recursive-paral-cutoff-256: 0.047 100 | recursive-inner-simd-256: 0.003 101 | recursive-simd-256: 0.003 102 | recursive-inner-simd-paral-256: 0.002 103 | recursive-simd-paral-256: 0.003 104 | recursive-inner-simd-paral-cutoff-256: 0.002 105 | recursive-simd-paral-cutoff-256: 0.003 106 | strassen-inner-256: 0.002 107 | strassen-256: 0.003 108 | -------------------------------------------------------------------------------- /measured/xeon-3.5GHz-4_2/512.out: -------------------------------------------------------------------------------- 1 | simple: 0.390 2 | colcp: 0.320 3 | simd: 0.021 4 | recursive-inner-1: 1.425 5 | recursive-1: 1.430 6 | recursive-inner-paral-1: 1.039 7 | recursive-paral-1: 1.044 8 | 
recursive-inner-paral-cutoff-1: 0.464 9 | recursive-paral-cutoff-1: 0.469 10 | recursive-inner-2: 0.414 11 | recursive-2: 0.417 12 | recursive-inner-paral-2: 0.188 13 | recursive-paral-2: 0.191 14 | recursive-inner-paral-cutoff-2: 0.109 15 | recursive-paral-cutoff-2: 0.112 16 | recursive-inner-4: 0.288 17 | recursive-4: 0.290 18 | recursive-inner-paral-4: 0.087 19 | recursive-paral-4: 0.089 20 | recursive-inner-paral-cutoff-4: 0.074 21 | recursive-paral-cutoff-4: 0.076 22 | recursive-inner-simd-4: 5.219 23 | recursive-simd-4: 5.221 24 | recursive-inner-simd-paral-4: 1.182 25 | recursive-simd-paral-4: 1.184 26 | recursive-inner-simd-paral-cutoff-4: 1.231 27 | recursive-simd-paral-cutoff-4: 1.233 28 | recursive-inner-8: 0.311 29 | recursive-8: 0.313 30 | recursive-inner-paral-8: 0.085 31 | recursive-paral-8: 0.087 32 | recursive-inner-paral-cutoff-8: 0.093 33 | recursive-paral-cutoff-8: 0.095 34 | recursive-inner-simd-8: 0.663 35 | recursive-simd-8: 0.664 36 | recursive-inner-simd-paral-8: 0.152 37 | recursive-simd-paral-8: 0.154 38 | recursive-inner-simd-paral-cutoff-8: 0.150 39 | recursive-simd-paral-cutoff-8: 0.152 40 | recursive-inner-16: 0.296 41 | recursive-16: 0.298 42 | recursive-inner-paral-16: 0.067 43 | recursive-paral-16: 0.070 44 | recursive-inner-paral-cutoff-16: 0.067 45 | recursive-paral-cutoff-16: 0.069 46 | recursive-inner-simd-16: 0.130 47 | recursive-simd-16: 0.132 48 | recursive-inner-simd-paral-16: 0.030 49 | recursive-simd-paral-16: 0.033 50 | recursive-inner-simd-paral-cutoff-16: 0.027 51 | recursive-simd-paral-cutoff-16: 0.029 52 | recursive-inner-32: 0.333 53 | recursive-32: 0.335 54 | recursive-inner-paral-32: 0.072 55 | recursive-paral-32: 0.074 56 | recursive-inner-paral-cutoff-32: 0.072 57 | recursive-paral-cutoff-32: 0.074 58 | recursive-inner-simd-32: 0.045 59 | recursive-simd-32: 0.047 60 | recursive-inner-simd-paral-32: 0.011 61 | recursive-simd-paral-32: 0.014 62 | recursive-inner-simd-paral-cutoff-32: 0.012 63 | 
recursive-simd-paral-cutoff-32: 0.015 64 | strassen-inner-32: 0.008 65 | strassen-32: 0.010 66 | recursive-inner-64: 0.332 67 | recursive-64: 0.334 68 | recursive-inner-paral-64: 0.081 69 | recursive-paral-64: 0.083 70 | recursive-inner-paral-cutoff-64: 0.104 71 | recursive-paral-cutoff-64: 0.106 72 | recursive-inner-simd-64: 0.030 73 | recursive-simd-64: 0.032 74 | recursive-inner-simd-paral-64: 0.009 75 | recursive-simd-paral-64: 0.011 76 | recursive-inner-simd-paral-cutoff-64: 0.011 77 | recursive-simd-paral-cutoff-64: 0.014 78 | strassen-inner-64: 0.009 79 | strassen-64: 0.012 80 | recursive-inner-128: 0.336 81 | recursive-128: 0.339 82 | recursive-inner-paral-128: 0.078 83 | recursive-paral-128: 0.080 84 | recursive-inner-paral-cutoff-128: 0.078 85 | recursive-paral-cutoff-128: 0.080 86 | recursive-inner-simd-128: 0.024 87 | recursive-simd-128: 0.026 88 | recursive-inner-simd-paral-128: 0.007 89 | recursive-simd-paral-128: 0.009 90 | recursive-inner-simd-paral-cutoff-128: 0.008 91 | recursive-simd-paral-cutoff-128: 0.011 92 | strassen-inner-128: 0.008 93 | strassen-128: 0.010 94 | recursive-inner-256: 0.434 95 | recursive-256: 0.437 96 | recursive-inner-paral-256: 0.123 97 | recursive-paral-256: 0.126 98 | recursive-inner-paral-cutoff-256: 0.169 99 | recursive-paral-cutoff-256: 0.172 100 | recursive-inner-simd-256: 0.025 101 | recursive-simd-256: 0.027 102 | recursive-inner-simd-paral-256: 0.012 103 | recursive-simd-paral-256: 0.014 104 | recursive-inner-simd-paral-cutoff-256: 0.012 105 | recursive-simd-paral-cutoff-256: 0.014 106 | strassen-inner-256: 0.007 107 | strassen-256: 0.009 108 | recursive-inner-512: 0.487 109 | recursive-512: 0.490 110 | recursive-inner-paral-512: 0.445 111 | recursive-paral-512: 0.447 112 | recursive-inner-paral-cutoff-512: 0.459 113 | recursive-paral-cutoff-512: 0.461 114 | recursive-inner-simd-512: 0.022 115 | recursive-simd-512: 0.024 116 | recursive-inner-simd-paral-512: 0.021 117 | recursive-simd-paral-512: 0.023 118 | 
recursive-inner-simd-paral-cutoff-512: 0.021 119 | recursive-simd-paral-cutoff-512: 0.023 120 | strassen-inner-512: 0.022 121 | strassen-512: 0.024 122 | -------------------------------------------------------------------------------- /measured/buldozer-4.2GHz-8/512.out: -------------------------------------------------------------------------------- 1 | simple: 0.546 2 | colcp: 0.538 3 | simd: 0.054 4 | recursive-inner-1: 2.780 5 | recursive-1: 2.787 6 | recursive-inner-paral-1: 1.649 7 | recursive-paral-1: 1.656 8 | recursive-inner-paral-cutoff-1: 0.537 9 | recursive-paral-cutoff-1: 0.544 10 | recursive-inner-2: 0.840 11 | recursive-2: 0.845 12 | recursive-inner-paral-2: 0.300 13 | recursive-paral-2: 0.304 14 | recursive-inner-paral-cutoff-2: 0.160 15 | recursive-paral-cutoff-2: 0.164 16 | recursive-inner-4: 0.519 17 | recursive-4: 0.522 18 | recursive-inner-paral-4: 0.111 19 | recursive-paral-4: 0.115 20 | recursive-inner-paral-cutoff-4: 0.077 21 | recursive-paral-cutoff-4: 0.081 22 | recursive-inner-simd-4: 7.090 23 | recursive-simd-4: 7.093 24 | recursive-inner-simd-paral-4: 1.712 25 | recursive-simd-paral-4: 1.716 26 | recursive-inner-simd-paral-cutoff-4: 2.022 27 | recursive-simd-paral-cutoff-4: 2.026 28 | recursive-inner-8: 0.506 29 | recursive-8: 0.510 30 | recursive-inner-paral-8: 0.086 31 | recursive-paral-8: 0.090 32 | recursive-inner-paral-cutoff-8: 0.097 33 | recursive-paral-cutoff-8: 0.101 34 | recursive-inner-simd-8: 1.054 35 | recursive-simd-8: 1.057 36 | recursive-inner-simd-paral-8: 0.233 37 | recursive-simd-paral-8: 0.236 38 | recursive-inner-simd-paral-cutoff-8: 0.227 39 | recursive-simd-paral-cutoff-8: 0.231 40 | recursive-inner-16: 0.515 41 | recursive-16: 0.519 42 | recursive-inner-paral-16: 0.075 43 | recursive-paral-16: 0.078 44 | recursive-inner-paral-cutoff-16: 0.072 45 | recursive-paral-cutoff-16: 0.076 46 | recursive-inner-simd-16: 0.254 47 | recursive-simd-16: 0.257 48 | recursive-inner-simd-paral-16: 0.050 49 | 
recursive-simd-paral-16: 0.054 50 | recursive-inner-simd-paral-cutoff-16: 0.047 51 | recursive-simd-paral-cutoff-16: 0.050 52 | recursive-inner-32: 0.578 53 | recursive-32: 0.582 54 | recursive-inner-paral-32: 0.083 55 | recursive-paral-32: 0.087 56 | recursive-inner-paral-cutoff-32: 0.095 57 | recursive-paral-cutoff-32: 0.098 58 | recursive-inner-simd-32: 0.113 59 | recursive-simd-32: 0.116 60 | recursive-inner-simd-paral-32: 0.021 61 | recursive-simd-paral-32: 0.024 62 | recursive-inner-simd-paral-cutoff-32: 0.027 63 | recursive-simd-paral-cutoff-32: 0.031 64 | strassen-inner-32: 0.016 65 | strassen-32: 0.019 66 | recursive-inner-64: 0.569 67 | recursive-64: 0.572 68 | recursive-inner-paral-64: 0.087 69 | recursive-paral-64: 0.091 70 | recursive-inner-paral-cutoff-64: 0.080 71 | recursive-paral-cutoff-64: 0.083 72 | recursive-inner-simd-64: 0.081 73 | recursive-simd-64: 0.084 74 | recursive-inner-simd-paral-64: 0.014 75 | recursive-simd-paral-64: 0.018 76 | recursive-inner-simd-paral-cutoff-64: 0.014 77 | recursive-simd-paral-cutoff-64: 0.017 78 | strassen-inner-64: 0.013 79 | strassen-64: 0.016 80 | recursive-inner-128: 0.556 81 | recursive-128: 0.559 82 | recursive-inner-paral-128: 0.104 83 | recursive-paral-128: 0.107 84 | recursive-inner-paral-cutoff-128: 0.081 85 | recursive-paral-cutoff-128: 0.084 86 | recursive-inner-simd-128: 0.067 87 | recursive-simd-128: 0.070 88 | recursive-inner-simd-paral-128: 0.017 89 | recursive-simd-paral-128: 0.020 90 | recursive-inner-simd-paral-cutoff-128: 0.015 91 | recursive-simd-paral-cutoff-128: 0.018 92 | strassen-inner-128: 0.013 93 | strassen-128: 0.016 94 | recursive-inner-256: 0.631 95 | recursive-256: 0.634 96 | recursive-inner-paral-256: 0.171 97 | recursive-paral-256: 0.175 98 | recursive-inner-paral-cutoff-256: 0.169 99 | recursive-paral-cutoff-256: 0.172 100 | recursive-inner-simd-256: 0.060 101 | recursive-simd-256: 0.063 102 | recursive-inner-simd-paral-256: 0.020 103 | recursive-simd-paral-256: 0.024 104 | 
recursive-inner-simd-paral-cutoff-256: 0.020 105 | recursive-simd-paral-cutoff-256: 0.024 106 | strassen-inner-256: 0.016 107 | strassen-256: 0.019 108 | recursive-inner-512: 0.647 109 | recursive-512: 0.650 110 | recursive-inner-paral-512: 0.648 111 | recursive-paral-512: 0.652 112 | recursive-inner-paral-cutoff-512: 0.656 113 | recursive-paral-cutoff-512: 0.659 114 | recursive-inner-simd-512: 0.058 115 | recursive-simd-512: 0.061 116 | recursive-inner-simd-paral-512: 0.057 117 | recursive-simd-paral-512: 0.060 118 | recursive-inner-simd-paral-cutoff-512: 0.057 119 | recursive-simd-paral-cutoff-512: 0.060 120 | strassen-inner-512: 0.056 121 | strassen-512: 0.059 122 | -------------------------------------------------------------------------------- /measured/celeron-1.8GHz-4/512.out: -------------------------------------------------------------------------------- 1 | simple: 1.923 2 | colcp: 0.636 3 | simd: 0.210 4 | recursive-inner-1: 6.583 5 | recursive-1: 6.601 6 | recursive-inner-paral-1: 3.089 7 | recursive-paral-1: 3.108 8 | recursive-inner-paral-cutoff-1: 1.704 9 | recursive-paral-cutoff-1: 1.724 10 | recursive-inner-2: 2.210 11 | recursive-2: 2.223 12 | recursive-inner-paral-2: 0.769 13 | recursive-paral-2: 0.780 14 | recursive-inner-paral-cutoff-2: 0.589 15 | recursive-paral-cutoff-2: 0.602 16 | recursive-inner-4: 1.519 17 | recursive-4: 1.531 18 | recursive-inner-paral-4: 0.439 19 | recursive-paral-4: 0.450 20 | recursive-inner-paral-cutoff-4: 0.391 21 | recursive-paral-cutoff-4: 0.401 22 | recursive-inner-simd-4: 16.666 23 | recursive-simd-4: 16.675 24 | recursive-inner-simd-paral-4: 4.996 25 | recursive-simd-paral-4: 5.007 26 | recursive-inner-simd-paral-cutoff-4: 5.002 27 | recursive-simd-paral-cutoff-4: 5.017 28 | recursive-inner-8: 1.306 29 | recursive-8: 1.321 30 | recursive-inner-paral-8: 0.333 31 | recursive-paral-8: 0.343 32 | recursive-inner-paral-cutoff-8: 0.338 33 | recursive-paral-cutoff-8: 0.348 34 | recursive-inner-simd-8: 2.527 35 | 
recursive-simd-8: 2.538 36 | recursive-inner-simd-paral-8: 0.729 37 | recursive-simd-paral-8: 0.739 38 | recursive-inner-simd-paral-cutoff-8: 0.776 39 | recursive-simd-paral-cutoff-8: 0.786 40 | recursive-inner-16: 1.219 41 | recursive-16: 1.228 42 | recursive-inner-paral-16: 0.325 43 | recursive-paral-16: 0.334 44 | recursive-inner-paral-cutoff-16: 0.335 45 | recursive-paral-cutoff-16: 0.344 46 | recursive-inner-simd-16: 0.588 47 | recursive-simd-16: 0.597 48 | recursive-inner-simd-paral-16: 0.167 49 | recursive-simd-paral-16: 0.176 50 | recursive-inner-simd-paral-cutoff-16: 0.174 51 | recursive-simd-paral-cutoff-16: 0.183 52 | recursive-inner-32: 1.178 53 | recursive-32: 1.187 54 | recursive-inner-paral-32: 0.302 55 | recursive-paral-32: 0.313 56 | recursive-inner-paral-cutoff-32: 0.302 57 | recursive-paral-cutoff-32: 0.312 58 | recursive-inner-simd-32: 0.260 59 | recursive-simd-32: 0.270 60 | recursive-inner-simd-paral-32: 0.077 61 | recursive-simd-paral-32: 0.087 62 | recursive-inner-simd-paral-cutoff-32: 0.075 63 | recursive-simd-paral-cutoff-32: 0.086 64 | strassen-inner-32: 0.055 65 | strassen-32: 0.066 66 | recursive-inner-64: 1.168 67 | recursive-64: 1.177 68 | recursive-inner-paral-64: 0.309 69 | recursive-paral-64: 0.319 70 | recursive-inner-paral-cutoff-64: 0.303 71 | recursive-paral-cutoff-64: 0.312 72 | recursive-inner-simd-64: 0.230 73 | recursive-simd-64: 0.240 74 | recursive-inner-simd-paral-64: 0.063 75 | recursive-simd-paral-64: 0.073 76 | recursive-inner-simd-paral-cutoff-64: 0.069 77 | recursive-simd-paral-cutoff-64: 0.077 78 | strassen-inner-64: 0.054 79 | strassen-64: 0.064 80 | recursive-inner-128: 1.421 81 | recursive-128: 1.430 82 | recursive-inner-paral-128: 0.386 83 | recursive-paral-128: 0.395 84 | recursive-inner-paral-cutoff-128: 0.370 85 | recursive-paral-cutoff-128: 0.384 86 | recursive-inner-simd-128: 0.184 87 | recursive-simd-128: 0.193 88 | recursive-inner-simd-paral-128: 0.059 89 | recursive-simd-paral-128: 0.068 90 | 
recursive-inner-simd-paral-cutoff-128: 0.056 91 | recursive-simd-paral-cutoff-128: 0.066 92 | strassen-inner-128: 0.059 93 | strassen-128: 0.070 94 | recursive-inner-256: 1.500 95 | recursive-256: 1.509 96 | recursive-inner-paral-256: 0.409 97 | recursive-paral-256: 0.419 98 | recursive-inner-paral-cutoff-256: 0.423 99 | recursive-paral-cutoff-256: 0.436 100 | recursive-inner-simd-256: 0.162 101 | recursive-simd-256: 0.172 102 | recursive-inner-simd-paral-256: 0.065 103 | recursive-simd-paral-256: 0.075 104 | recursive-inner-simd-paral-cutoff-256: 0.061 105 | recursive-simd-paral-cutoff-256: 0.074 106 | strassen-inner-256: 0.061 107 | strassen-256: 0.072 108 | recursive-inner-512: 7.904 109 | recursive-512: 7.914 110 | recursive-inner-paral-512: 8.612 111 | recursive-paral-512: 8.621 112 | recursive-inner-paral-cutoff-512: 7.171 113 | recursive-paral-cutoff-512: 7.179 114 | recursive-inner-simd-512: 0.225 115 | recursive-simd-512: 0.234 116 | recursive-inner-simd-paral-512: 0.211 117 | recursive-simd-paral-512: 0.220 118 | recursive-inner-simd-paral-cutoff-512: 0.211 119 | recursive-simd-paral-cutoff-512: 0.219 120 | strassen-inner-512: 0.205 121 | strassen-512: 0.214 122 | -------------------------------------------------------------------------------- /measured/xeon-1.8GHz-20_2/512.out: -------------------------------------------------------------------------------- 1 | simple: 0.599 2 | colcp: 0.417 3 | simd: 0.028 4 | recursive-inner-1: 2.021 5 | recursive-1: 2.027 6 | recursive-inner-paral-1: 8.628 7 | recursive-paral-1: 8.637 8 | recursive-inner-paral-cutoff-1: 0.247 9 | recursive-paral-cutoff-1: 0.257 10 | recursive-inner-2: 0.556 11 | recursive-2: 0.561 12 | recursive-inner-paral-2: 1.372 13 | recursive-paral-2: 1.378 14 | recursive-inner-paral-cutoff-2: 0.116 15 | recursive-paral-cutoff-2: 0.123 16 | recursive-inner-4: 0.375 17 | recursive-4: 0.379 18 | recursive-inner-paral-4: 0.284 19 | recursive-paral-4: 0.288 20 | recursive-inner-paral-cutoff-4: 0.065 
21 | recursive-paral-cutoff-4: 0.070 22 | recursive-inner-simd-4: 10.061 23 | recursive-simd-4: 10.065 24 | recursive-inner-simd-paral-4: 0.751 25 | recursive-simd-paral-4: 0.755 26 | recursive-inner-simd-paral-cutoff-4: 1.207 27 | recursive-simd-paral-cutoff-4: 1.213 28 | recursive-inner-8: 0.356 29 | recursive-8: 0.361 30 | recursive-inner-paral-8: 0.058 31 | recursive-paral-8: 0.062 32 | recursive-inner-paral-cutoff-8: 0.057 33 | recursive-paral-cutoff-8: 0.063 34 | recursive-inner-simd-8: 1.232 35 | recursive-simd-8: 1.236 36 | recursive-inner-simd-paral-8: 0.104 37 | recursive-simd-paral-8: 0.109 38 | recursive-inner-simd-paral-cutoff-8: 0.158 39 | recursive-simd-paral-cutoff-8: 0.164 40 | recursive-inner-16: 0.370 41 | recursive-16: 0.374 42 | recursive-inner-paral-16: 0.032 43 | recursive-paral-16: 0.036 44 | recursive-inner-paral-cutoff-16: 0.065 45 | recursive-paral-cutoff-16: 0.070 46 | recursive-inner-simd-16: 0.362 47 | recursive-simd-16: 0.366 48 | recursive-inner-simd-paral-16: 0.020 49 | recursive-simd-paral-16: 0.024 50 | recursive-inner-simd-paral-cutoff-16: 0.030 51 | recursive-simd-paral-cutoff-16: 0.035 52 | recursive-inner-32: 0.401 53 | recursive-32: 0.405 54 | recursive-inner-paral-32: 0.034 55 | recursive-paral-32: 0.039 56 | recursive-inner-paral-cutoff-32: 0.074 57 | recursive-paral-cutoff-32: 0.079 58 | recursive-inner-simd-32: 0.063 59 | recursive-simd-32: 0.067 60 | recursive-inner-simd-paral-32: 0.009 61 | recursive-simd-paral-32: 0.014 62 | recursive-inner-simd-paral-cutoff-32: 0.017 63 | recursive-simd-paral-cutoff-32: 0.023 64 | strassen-inner-32: 0.010 65 | strassen-32: 0.014 66 | recursive-inner-64: 0.448 67 | recursive-64: 0.452 68 | recursive-inner-paral-64: 0.042 69 | recursive-paral-64: 0.047 70 | recursive-inner-paral-cutoff-64: 0.061 71 | recursive-paral-cutoff-64: 0.065 72 | recursive-inner-simd-64: 0.036 73 | recursive-simd-64: 0.041 74 | recursive-inner-simd-paral-64: 0.007 75 | recursive-simd-paral-64: 0.011 76 | 
recursive-inner-simd-paral-cutoff-64: 0.012 77 | recursive-simd-paral-cutoff-64: 0.017 78 | strassen-inner-64: 0.009 79 | strassen-64: 0.014 80 | recursive-inner-128: 0.428 81 | recursive-128: 0.432 82 | recursive-inner-paral-128: 0.088 83 | recursive-paral-128: 0.092 84 | recursive-inner-paral-cutoff-128: 0.068 85 | recursive-paral-cutoff-128: 0.073 86 | recursive-inner-simd-128: 0.028 87 | recursive-simd-128: 0.032 88 | recursive-inner-simd-paral-128: 0.009 89 | recursive-simd-paral-128: 0.013 90 | recursive-inner-simd-paral-cutoff-128: 0.011 91 | recursive-simd-paral-cutoff-128: 0.016 92 | strassen-inner-128: 0.009 93 | strassen-128: 0.013 94 | recursive-inner-256: 0.537 95 | recursive-256: 0.541 96 | recursive-inner-paral-256: 0.151 97 | recursive-paral-256: 0.155 98 | recursive-inner-paral-cutoff-256: 0.141 99 | recursive-paral-cutoff-256: 0.146 100 | recursive-inner-simd-256: 0.032 101 | recursive-simd-256: 0.036 102 | recursive-inner-simd-paral-256: 0.020 103 | recursive-simd-paral-256: 0.024 104 | recursive-inner-simd-paral-cutoff-256: 0.022 105 | recursive-simd-paral-cutoff-256: 0.027 106 | strassen-inner-256: 0.014 107 | strassen-256: 0.018 108 | recursive-inner-512: 0.596 109 | recursive-512: 0.600 110 | recursive-inner-paral-512: 0.602 111 | recursive-paral-512: 0.605 112 | recursive-inner-paral-cutoff-512: 0.600 113 | recursive-paral-cutoff-512: 0.603 114 | recursive-inner-simd-512: 0.028 115 | recursive-simd-512: 0.031 116 | recursive-inner-simd-paral-512: 0.029 117 | recursive-simd-paral-512: 0.033 118 | recursive-inner-simd-paral-cutoff-512: 0.028 119 | recursive-simd-paral-cutoff-512: 0.031 120 | strassen-inner-512: 0.031 121 | strassen-512: 0.035 122 | -------------------------------------------------------------------------------- /measured/xeon-3.5GHz-4_2/1024.out: -------------------------------------------------------------------------------- 1 | simple: 4.433 2 | colcp: 2.652 3 | simd: 0.254 4 | recursive-inner-1: 11.653 5 | recursive-1: 
11.670 6 | recursive-inner-paral-1: 8.008 7 | recursive-paral-1: 8.025 8 | recursive-inner-paral-cutoff-1: 3.598 9 | recursive-paral-cutoff-1: 3.615 10 | recursive-inner-2: 3.391 11 | recursive-2: 3.401 12 | recursive-inner-paral-2: 1.597 13 | recursive-paral-2: 1.607 14 | recursive-inner-paral-cutoff-2: 0.899 15 | recursive-paral-cutoff-2: 0.910 16 | recursive-inner-4: 2.408 17 | recursive-4: 2.416 18 | recursive-inner-paral-4: 0.665 19 | recursive-paral-4: 0.673 20 | recursive-inner-paral-cutoff-4: 0.610 21 | recursive-paral-cutoff-4: 0.619 22 | recursive-inner-simd-4: 43.387 23 | recursive-simd-4: 43.395 24 | recursive-inner-simd-paral-4: 9.455 25 | recursive-simd-paral-4: 9.463 26 | recursive-inner-simd-paral-cutoff-4: 9.604 27 | recursive-simd-paral-cutoff-4: 9.612 28 | recursive-inner-8: 2.374 29 | recursive-8: 2.382 30 | recursive-inner-paral-8: 0.607 31 | recursive-paral-8: 0.615 32 | recursive-inner-paral-cutoff-8: 0.556 33 | recursive-paral-cutoff-8: 0.565 34 | recursive-inner-simd-8: 5.197 35 | recursive-simd-8: 5.206 36 | recursive-inner-simd-paral-8: 1.446 37 | recursive-simd-paral-8: 1.456 38 | recursive-inner-simd-paral-cutoff-8: 1.390 39 | recursive-simd-paral-cutoff-8: 1.399 40 | recursive-inner-16: 2.535 41 | recursive-16: 2.544 42 | recursive-inner-paral-16: 0.651 43 | recursive-paral-16: 0.659 44 | recursive-inner-paral-cutoff-16: 0.645 45 | recursive-paral-cutoff-16: 0.653 46 | recursive-inner-simd-16: 0.960 47 | recursive-simd-16: 0.968 48 | recursive-inner-simd-paral-16: 0.211 49 | recursive-simd-paral-16: 0.220 50 | recursive-inner-simd-paral-cutoff-16: 0.223 51 | recursive-simd-paral-cutoff-16: 0.231 52 | recursive-inner-32: 2.637 53 | recursive-32: 2.645 54 | recursive-inner-paral-32: 0.576 55 | recursive-paral-32: 0.584 56 | recursive-inner-paral-cutoff-32: 0.570 57 | recursive-paral-cutoff-32: 0.578 58 | recursive-inner-simd-32: 0.359 59 | recursive-simd-32: 0.367 60 | recursive-inner-simd-paral-32: 0.089 61 | recursive-simd-paral-32: 
0.098 62 | recursive-inner-simd-paral-cutoff-32: 0.095 63 | recursive-simd-paral-cutoff-32: 0.103 64 | strassen-inner-32: 0.060 65 | strassen-32: 0.068 66 | recursive-inner-64: 2.599 67 | recursive-64: 2.607 68 | recursive-inner-paral-64: 0.545 69 | recursive-paral-64: 0.553 70 | recursive-inner-paral-cutoff-64: 0.550 71 | recursive-paral-cutoff-64: 0.558 72 | recursive-inner-simd-64: 0.227 73 | recursive-simd-64: 0.235 74 | recursive-inner-simd-paral-64: 0.058 75 | recursive-simd-paral-64: 0.067 76 | recursive-inner-simd-paral-cutoff-64: 0.063 77 | recursive-simd-paral-cutoff-64: 0.072 78 | strassen-inner-64: 0.046 79 | strassen-64: 0.054 80 | recursive-inner-128: 2.764 81 | recursive-128: 2.772 82 | recursive-inner-paral-128: 0.640 83 | recursive-paral-128: 0.648 84 | recursive-inner-paral-cutoff-128: 0.629 85 | recursive-paral-cutoff-128: 0.637 86 | recursive-inner-simd-128: 0.180 87 | recursive-simd-128: 0.188 88 | recursive-inner-simd-paral-128: 0.047 89 | recursive-simd-paral-128: 0.055 90 | recursive-inner-simd-paral-cutoff-128: 0.048 91 | recursive-simd-paral-cutoff-128: 0.056 92 | strassen-inner-128: 0.043 93 | strassen-128: 0.051 94 | recursive-inner-256: 2.885 95 | recursive-256: 2.893 96 | recursive-inner-paral-256: 0.780 97 | recursive-paral-256: 0.788 98 | recursive-inner-paral-cutoff-256: 0.873 99 | recursive-paral-cutoff-256: 0.881 100 | recursive-inner-simd-256: 0.194 101 | recursive-simd-256: 0.202 102 | recursive-inner-simd-paral-256: 0.058 103 | recursive-simd-paral-256: 0.066 104 | recursive-inner-simd-paral-cutoff-256: 0.058 105 | recursive-simd-paral-cutoff-256: 0.067 106 | strassen-inner-256: 0.053 107 | strassen-256: 0.061 108 | recursive-inner-512: 3.553 109 | recursive-512: 3.561 110 | recursive-inner-paral-512: 0.949 111 | recursive-paral-512: 0.957 112 | recursive-inner-paral-cutoff-512: 0.956 113 | recursive-paral-cutoff-512: 0.964 114 | recursive-inner-simd-512: 0.175 115 | recursive-simd-512: 0.183 116 | 
recursive-inner-simd-paral-512: 0.083 117 | recursive-simd-paral-512: 0.091 118 | recursive-inner-simd-paral-cutoff-512: 0.063 119 | recursive-simd-paral-cutoff-512: 0.071 120 | strassen-inner-512: 0.064 121 | strassen-512: 0.072 122 | recursive-inner-1024: 3.976 123 | recursive-1024: 3.984 124 | recursive-inner-paral-1024: 3.860 125 | recursive-paral-1024: 3.868 126 | recursive-inner-paral-cutoff-1024: 4.093 127 | recursive-paral-cutoff-1024: 4.102 128 | recursive-inner-simd-1024: 0.251 129 | recursive-simd-1024: 0.259 130 | recursive-inner-simd-paral-1024: 0.256 131 | recursive-simd-paral-1024: 0.265 132 | recursive-inner-simd-paral-cutoff-1024: 0.260 133 | recursive-simd-paral-cutoff-1024: 0.269 134 | strassen-inner-1024: 0.250 135 | strassen-1024: 0.259 136 | -------------------------------------------------------------------------------- /measured/xeon-1.8GHz-20_2/1024.out: -------------------------------------------------------------------------------- 1 | simple: 5.247 2 | colcp: 3.364 3 | simd: 0.206 4 | recursive-inner-1: 16.311 5 | recursive-1: 16.337 6 | recursive-inner-paral-1: 63.167 7 | recursive-paral-1: 63.201 8 | recursive-inner-paral-cutoff-1: 1.678 9 | recursive-paral-cutoff-1: 1.711 10 | recursive-inner-2: 4.441 11 | recursive-2: 4.460 12 | recursive-inner-paral-2: 9.879 13 | recursive-paral-2: 9.900 14 | recursive-inner-paral-cutoff-2: 0.436 15 | recursive-paral-cutoff-2: 0.458 16 | recursive-inner-4: 3.000 17 | recursive-4: 3.015 18 | recursive-inner-paral-4: 2.106 19 | recursive-paral-4: 2.124 20 | recursive-inner-paral-cutoff-4: 0.288 21 | recursive-paral-cutoff-4: 0.305 22 | recursive-inner-simd-4: 80.532 23 | recursive-simd-4: 80.556 24 | recursive-inner-simd-paral-4: 6.018 25 | recursive-simd-paral-4: 6.037 26 | recursive-inner-simd-paral-cutoff-4: 5.904 27 | recursive-simd-paral-cutoff-4: 5.921 28 | recursive-inner-8: 2.918 29 | recursive-8: 2.932 30 | recursive-inner-paral-8: 0.435 31 | recursive-paral-8: 0.451 32 | 
recursive-inner-paral-cutoff-8: 0.281 33 | recursive-paral-cutoff-8: 0.298 34 | recursive-inner-simd-8: 9.952 35 | recursive-simd-8: 9.967 36 | recursive-inner-simd-paral-8: 0.796 37 | recursive-simd-paral-8: 0.813 38 | recursive-inner-simd-paral-cutoff-8: 0.766 39 | recursive-simd-paral-cutoff-8: 0.783 40 | recursive-inner-16: 3.053 41 | recursive-16: 3.067 42 | recursive-inner-paral-16: 0.240 43 | recursive-paral-16: 0.257 44 | recursive-inner-paral-cutoff-16: 0.243 45 | recursive-paral-cutoff-16: 0.260 46 | recursive-inner-simd-16: 1.603 47 | recursive-simd-16: 1.617 48 | recursive-inner-simd-paral-16: 0.140 49 | recursive-simd-paral-16: 0.157 50 | recursive-inner-simd-paral-cutoff-16: 0.131 51 | recursive-simd-paral-cutoff-16: 0.148 52 | recursive-inner-32: 3.200 53 | recursive-32: 3.216 54 | recursive-inner-paral-32: 0.234 55 | recursive-paral-32: 0.252 56 | recursive-inner-paral-cutoff-32: 0.253 57 | recursive-paral-cutoff-32: 0.269 58 | recursive-inner-simd-32: 0.512 59 | recursive-simd-32: 0.527 60 | recursive-inner-simd-paral-32: 0.047 61 | recursive-simd-paral-32: 0.064 62 | recursive-inner-simd-paral-cutoff-32: 0.055 63 | recursive-simd-paral-cutoff-32: 0.071 64 | strassen-inner-32: 0.050 65 | strassen-32: 0.065 66 | recursive-inner-64: 3.442 67 | recursive-64: 3.457 68 | recursive-inner-paral-64: 0.237 69 | recursive-paral-64: 0.253 70 | recursive-inner-paral-cutoff-64: 0.274 71 | recursive-paral-cutoff-64: 0.291 72 | recursive-inner-simd-64: 0.300 73 | recursive-simd-64: 0.315 74 | recursive-inner-simd-paral-64: 0.042 75 | recursive-simd-paral-64: 0.058 76 | recursive-inner-simd-paral-cutoff-64: 0.037 77 | recursive-simd-paral-cutoff-64: 0.053 78 | strassen-inner-64: 0.043 79 | strassen-64: 0.057 80 | recursive-inner-128: 3.431 81 | recursive-128: 3.446 82 | recursive-inner-paral-128: 0.302 83 | recursive-paral-128: 0.319 84 | recursive-inner-paral-cutoff-128: 0.294 85 | recursive-paral-cutoff-128: 0.310 86 | recursive-inner-simd-128: 0.232 87 | 
recursive-simd-128: 0.246 88 | recursive-inner-simd-paral-128: 0.041 89 | recursive-simd-paral-128: 0.056 90 | recursive-inner-simd-paral-cutoff-128: 0.036 91 | recursive-simd-paral-cutoff-128: 0.052 92 | strassen-inner-128: 0.038 93 | strassen-128: 0.052 94 | recursive-inner-256: 4.290 95 | recursive-256: 4.304 96 | recursive-inner-paral-256: 0.467 97 | recursive-paral-256: 0.482 98 | recursive-inner-paral-cutoff-256: 0.492 99 | recursive-paral-cutoff-256: 0.507 100 | recursive-inner-simd-256: 0.252 101 | recursive-simd-256: 0.266 102 | recursive-inner-simd-paral-256: 0.045 103 | recursive-simd-paral-256: 0.060 104 | recursive-inner-simd-paral-cutoff-256: 0.047 105 | recursive-simd-paral-cutoff-256: 0.063 106 | strassen-inner-256: 0.042 107 | strassen-256: 0.055 108 | recursive-inner-512: 4.905 109 | recursive-512: 4.919 110 | recursive-inner-paral-512: 1.286 111 | recursive-paral-512: 1.302 112 | recursive-inner-paral-cutoff-512: 1.313 113 | recursive-paral-cutoff-512: 1.328 114 | recursive-inner-simd-512: 0.225 115 | recursive-simd-512: 0.238 116 | recursive-inner-simd-paral-512: 0.074 117 | recursive-simd-paral-512: 0.089 118 | recursive-inner-simd-paral-cutoff-512: 0.074 119 | recursive-simd-paral-cutoff-512: 0.089 120 | strassen-inner-512: 0.077 121 | strassen-512: 0.090 122 | recursive-inner-1024: 5.254 123 | recursive-1024: 5.268 124 | recursive-inner-paral-1024: 5.234 125 | recursive-paral-1024: 5.247 126 | recursive-inner-paral-cutoff-1024: 5.247 127 | recursive-paral-cutoff-1024: 5.260 128 | recursive-inner-simd-1024: 0.208 129 | recursive-simd-1024: 0.221 130 | recursive-inner-simd-paral-1024: 0.205 131 | recursive-simd-paral-1024: 0.218 132 | recursive-inner-simd-paral-cutoff-1024: 0.206 133 | recursive-simd-paral-cutoff-1024: 0.218 134 | strassen-inner-1024: 0.205 135 | strassen-1024: 0.218 136 | -------------------------------------------------------------------------------- /measured/buldozer-4.2GHz-8/1024.out: 
-------------------------------------------------------------------------------- 1 | simple: 14.739 2 | colcp: 4.389 3 | simd: 0.606 4 | recursive-inner-1: 22.222 5 | recursive-1: 22.253 6 | recursive-inner-paral-1: 12.220 7 | recursive-paral-1: 12.252 8 | recursive-inner-paral-cutoff-1: 4.538 9 | recursive-paral-cutoff-1: 4.570 10 | recursive-inner-2: 6.753 11 | recursive-2: 6.771 12 | recursive-inner-paral-2: 2.155 13 | recursive-paral-2: 2.174 14 | recursive-inner-paral-cutoff-2: 1.136 15 | recursive-paral-cutoff-2: 1.155 16 | recursive-inner-4: 4.242 17 | recursive-4: 4.255 18 | recursive-inner-paral-4: 0.810 19 | recursive-paral-4: 0.824 20 | recursive-inner-paral-cutoff-4: 0.656 21 | recursive-paral-cutoff-4: 0.671 22 | recursive-inner-simd-4: 56.128 23 | recursive-simd-4: 56.141 24 | recursive-inner-simd-paral-4: 13.568 25 | recursive-simd-paral-4: 13.582 26 | recursive-inner-simd-paral-cutoff-4: 14.015 27 | recursive-simd-paral-cutoff-4: 14.028 28 | recursive-inner-8: 4.134 29 | recursive-8: 4.150 30 | recursive-inner-paral-8: 0.624 31 | recursive-paral-8: 0.640 32 | recursive-inner-paral-cutoff-8: 0.652 33 | recursive-paral-cutoff-8: 0.669 34 | recursive-inner-simd-8: 8.520 35 | recursive-simd-8: 8.537 36 | recursive-inner-simd-paral-8: 1.832 37 | recursive-simd-paral-8: 1.848 38 | recursive-inner-simd-paral-cutoff-8: 1.849 39 | recursive-simd-paral-cutoff-8: 1.865 40 | recursive-inner-16: 4.220 41 | recursive-16: 4.235 42 | recursive-inner-paral-16: 0.619 43 | recursive-paral-16: 0.635 44 | recursive-inner-paral-cutoff-16: 0.621 45 | recursive-paral-cutoff-16: 0.636 46 | recursive-inner-simd-16: 1.973 47 | recursive-simd-16: 1.989 48 | recursive-inner-simd-paral-16: 0.378 49 | recursive-simd-paral-16: 0.394 50 | recursive-inner-simd-paral-cutoff-16: 0.384 51 | recursive-simd-paral-cutoff-16: 0.400 52 | recursive-inner-32: 4.679 53 | recursive-32: 4.691 54 | recursive-inner-paral-32: 0.656 55 | recursive-paral-32: 0.668 56 | 
recursive-inner-paral-cutoff-32: 0.667 57 | recursive-paral-cutoff-32: 0.679 58 | recursive-inner-simd-32: 0.884 59 | recursive-simd-32: 0.896 60 | recursive-inner-simd-paral-32: 0.168 61 | recursive-simd-paral-32: 0.180 62 | recursive-inner-simd-paral-cutoff-32: 0.171 63 | recursive-simd-paral-cutoff-32: 0.184 64 | strassen-inner-32: 0.112 65 | strassen-32: 0.124 66 | recursive-inner-64: 4.786 67 | recursive-64: 4.796 68 | recursive-inner-paral-64: 0.701 69 | recursive-paral-64: 0.711 70 | recursive-inner-paral-cutoff-64: 0.673 71 | recursive-paral-cutoff-64: 0.683 72 | recursive-inner-simd-64: 0.630 73 | recursive-simd-64: 0.640 74 | recursive-inner-simd-paral-64: 0.120 75 | recursive-simd-paral-64: 0.131 76 | recursive-inner-simd-paral-cutoff-64: 0.129 77 | recursive-simd-paral-cutoff-64: 0.139 78 | strassen-inner-64: 0.092 79 | strassen-64: 0.102 80 | recursive-inner-128: 4.781 81 | recursive-128: 4.792 82 | recursive-inner-paral-128: 0.765 83 | recursive-paral-128: 0.776 84 | recursive-inner-paral-cutoff-128: 0.752 85 | recursive-paral-cutoff-128: 0.762 86 | recursive-inner-simd-128: 0.545 87 | recursive-simd-128: 0.556 88 | recursive-inner-simd-paral-128: 0.107 89 | recursive-simd-paral-128: 0.117 90 | recursive-inner-simd-paral-cutoff-128: 0.107 91 | recursive-simd-paral-cutoff-128: 0.117 92 | strassen-inner-128: 0.089 93 | strassen-128: 0.100 94 | recursive-inner-256: 5.005 95 | recursive-256: 5.016 96 | recursive-inner-paral-256: 0.739 97 | recursive-paral-256: 0.750 98 | recursive-inner-paral-cutoff-256: 0.781 99 | recursive-paral-cutoff-256: 0.791 100 | recursive-inner-simd-256: 0.470 101 | recursive-simd-256: 0.480 102 | recursive-inner-simd-paral-256: 0.110 103 | recursive-simd-paral-256: 0.120 104 | recursive-inner-simd-paral-cutoff-256: 0.085 105 | recursive-simd-paral-cutoff-256: 0.095 106 | strassen-inner-256: 0.086 107 | strassen-256: 0.097 108 | recursive-inner-512: 5.689 109 | recursive-512: 5.699 110 | recursive-inner-paral-512: 1.488 111 | 
recursive-paral-512: 1.498 112 | recursive-inner-paral-cutoff-512: 1.489 113 | recursive-paral-cutoff-512: 1.500 114 | recursive-inner-simd-512: 0.440 115 | recursive-simd-512: 0.451 116 | recursive-inner-simd-paral-512: 0.155 117 | recursive-simd-paral-512: 0.166 118 | recursive-inner-simd-paral-cutoff-512: 0.154 119 | recursive-simd-paral-cutoff-512: 0.165 120 | strassen-inner-512: 0.097 121 | strassen-512: 0.108 122 | recursive-inner-1024: 14.608 123 | recursive-1024: 14.618 124 | recursive-inner-paral-1024: 14.638 125 | recursive-paral-1024: 14.649 126 | recursive-inner-paral-cutoff-1024: 14.694 127 | recursive-paral-cutoff-1024: 14.704 128 | recursive-inner-simd-1024: 0.625 129 | recursive-simd-1024: 0.635 130 | recursive-inner-simd-paral-1024: 0.620 131 | recursive-simd-paral-1024: 0.631 132 | recursive-inner-simd-paral-cutoff-1024: 0.615 133 | recursive-simd-paral-cutoff-1024: 0.626 134 | strassen-inner-1024: 0.620 135 | strassen-1024: 0.631 136 | -------------------------------------------------------------------------------- /measured/xeon-3.5GHz-4_2/2048.out: -------------------------------------------------------------------------------- 1 | simple: 110.030 2 | colcp: 21.403 3 | simd: 2.130 4 | recursive-inner-1: 93.737 5 | recursive-1: 93.811 6 | recursive-inner-paral-1: 61.583 7 | recursive-paral-1: 61.654 8 | recursive-inner-paral-cutoff-1: 27.381 9 | recursive-paral-cutoff-1: 27.453 10 | recursive-inner-2: 27.145 11 | recursive-2: 27.189 12 | recursive-inner-paral-2: 11.889 13 | recursive-paral-2: 11.933 14 | recursive-inner-paral-cutoff-2: 7.277 15 | recursive-paral-cutoff-2: 7.325 16 | recursive-inner-4: 18.838 17 | recursive-4: 18.875 18 | recursive-inner-paral-4: 5.422 19 | recursive-paral-4: 5.458 20 | recursive-inner-paral-cutoff-4: 4.710 21 | recursive-paral-cutoff-4: 4.746 22 | recursive-inner-simd-4: 342.044 23 | recursive-simd-4: 342.081 24 | recursive-inner-simd-paral-4: 79.819 25 | recursive-simd-paral-4: 79.856 26 | 
recursive-inner-simd-paral-cutoff-4: 77.865 27 | recursive-simd-paral-cutoff-4: 77.903 28 | recursive-inner-8: 18.448 29 | recursive-8: 18.482 30 | recursive-inner-paral-8: 4.468 31 | recursive-paral-8: 4.503 32 | recursive-inner-paral-cutoff-8: 4.513 33 | recursive-paral-cutoff-8: 4.548 34 | recursive-inner-simd-8: 41.383 35 | recursive-simd-8: 41.418 36 | recursive-inner-simd-paral-8: 9.712 37 | recursive-simd-paral-8: 9.746 38 | recursive-inner-simd-paral-cutoff-8: 9.847 39 | recursive-simd-paral-cutoff-8: 9.882 40 | recursive-inner-16: 19.185 41 | recursive-16: 19.221 42 | recursive-inner-paral-16: 4.391 43 | recursive-paral-16: 4.426 44 | recursive-inner-paral-cutoff-16: 4.263 45 | recursive-paral-cutoff-16: 4.298 46 | recursive-inner-simd-16: 7.664 47 | recursive-simd-16: 7.700 48 | recursive-inner-simd-paral-16: 1.700 49 | recursive-simd-paral-16: 1.734 50 | recursive-inner-simd-paral-cutoff-16: 1.769 51 | recursive-simd-paral-cutoff-16: 1.804 52 | recursive-inner-32: 20.538 53 | recursive-32: 20.576 54 | recursive-inner-paral-32: 4.451 55 | recursive-paral-32: 4.486 56 | recursive-inner-paral-cutoff-32: 4.659 57 | recursive-paral-cutoff-32: 4.694 58 | recursive-inner-simd-32: 2.830 59 | recursive-simd-32: 2.865 60 | recursive-inner-simd-paral-32: 0.699 61 | recursive-simd-paral-32: 0.734 62 | recursive-inner-simd-paral-cutoff-32: 0.703 63 | recursive-simd-paral-cutoff-32: 0.738 64 | strassen-inner-32: 0.432 65 | strassen-32: 0.469 66 | recursive-inner-64: 20.946 67 | recursive-64: 20.982 68 | recursive-inner-paral-64: 4.491 69 | recursive-paral-64: 4.526 70 | recursive-inner-paral-cutoff-64: 4.414 71 | recursive-paral-cutoff-64: 4.448 72 | recursive-inner-simd-64: 1.764 73 | recursive-simd-64: 1.798 74 | recursive-inner-simd-paral-64: 0.451 75 | recursive-simd-paral-64: 0.486 76 | recursive-inner-simd-paral-cutoff-64: 0.465 77 | recursive-simd-paral-cutoff-64: 0.500 78 | strassen-inner-64: 0.322 79 | strassen-64: 0.359 80 | recursive-inner-128: 21.455 81 | 
recursive-128: 21.493 82 | recursive-inner-paral-128: 4.884 83 | recursive-paral-128: 4.920 84 | recursive-inner-paral-cutoff-128: 4.992 85 | recursive-paral-cutoff-128: 5.028 86 | recursive-inner-simd-128: 1.443 87 | recursive-simd-128: 1.479 88 | recursive-inner-simd-paral-128: 0.378 89 | recursive-simd-paral-128: 0.414 90 | recursive-inner-simd-paral-cutoff-128: 0.376 91 | recursive-simd-paral-cutoff-128: 0.412 92 | strassen-inner-128: 0.300 93 | strassen-128: 0.338 94 | recursive-inner-256: 25.501 95 | recursive-256: 25.537 96 | recursive-inner-paral-256: 6.613 97 | recursive-paral-256: 6.648 98 | recursive-inner-paral-cutoff-256: 6.509 99 | recursive-paral-cutoff-256: 6.545 100 | recursive-inner-simd-256: 1.775 101 | recursive-simd-256: 1.816 102 | recursive-inner-simd-paral-256: 0.444 103 | recursive-simd-paral-256: 0.484 104 | recursive-inner-simd-paral-cutoff-256: 0.374 105 | recursive-simd-paral-cutoff-256: 0.409 106 | strassen-inner-256: 0.335 107 | strassen-256: 0.372 108 | recursive-inner-512: 28.823 109 | recursive-512: 28.859 110 | recursive-inner-paral-512: 7.474 111 | recursive-paral-512: 7.509 112 | recursive-inner-paral-cutoff-512: 7.262 113 | recursive-paral-cutoff-512: 7.296 114 | recursive-inner-simd-512: 1.390 115 | recursive-simd-512: 1.425 116 | recursive-inner-simd-paral-512: 0.415 117 | recursive-simd-paral-512: 0.449 118 | recursive-inner-simd-paral-cutoff-512: 0.391 119 | recursive-simd-paral-cutoff-512: 0.425 120 | strassen-inner-512: 0.509 121 | strassen-512: 0.545 122 | recursive-inner-1024: 36.068 123 | recursive-1024: 36.104 124 | recursive-inner-paral-1024: 12.769 125 | recursive-paral-1024: 12.804 126 | recursive-inner-paral-cutoff-1024: 14.735 127 | recursive-paral-cutoff-1024: 14.771 128 | recursive-inner-simd-1024: 2.061 129 | recursive-simd-1024: 2.101 130 | recursive-inner-simd-paral-1024: 0.723 131 | recursive-simd-paral-1024: 0.761 132 | recursive-inner-simd-paral-cutoff-1024: 0.710 133 | recursive-simd-paral-cutoff-1024: 
0.753 134 | strassen-inner-1024: 0.999 135 | strassen-1024: 1.039 136 | -------------------------------------------------------------------------------- /measured/celeron-1.8GHz-4/1024.out: -------------------------------------------------------------------------------- 1 | simple: 107.478 2 | colcp: 5.441 3 | simd: 1.377 4 | recursive-inner-1: 52.681 5 | recursive-1: 52.751 6 | recursive-inner-paral-1: 24.554 7 | recursive-paral-1: 24.624 8 | recursive-inner-paral-cutoff-1: 13.412 9 | recursive-paral-cutoff-1: 13.486 10 | recursive-inner-2: 17.574 11 | recursive-2: 17.615 12 | recursive-inner-paral-2: 6.110 13 | recursive-paral-2: 6.151 14 | recursive-inner-paral-cutoff-2: 4.608 15 | recursive-paral-cutoff-2: 4.649 16 | recursive-inner-4: 12.136 17 | recursive-4: 12.169 18 | recursive-inner-paral-4: 3.282 19 | recursive-paral-4: 3.316 20 | recursive-inner-paral-cutoff-4: 3.137 21 | recursive-paral-cutoff-4: 3.174 22 | recursive-inner-simd-4: 132.083 23 | recursive-simd-4: 132.117 24 | recursive-inner-simd-paral-4: 39.803 25 | recursive-simd-paral-4: 39.836 26 | recursive-inner-simd-paral-cutoff-4: 39.344 27 | recursive-simd-paral-cutoff-4: 39.382 28 | recursive-inner-8: 10.448 29 | recursive-8: 10.479 30 | recursive-inner-paral-8: 2.679 31 | recursive-paral-8: 2.711 32 | recursive-inner-paral-cutoff-8: 2.691 33 | recursive-paral-cutoff-8: 2.727 34 | recursive-inner-simd-8: 20.036 35 | recursive-simd-8: 20.068 36 | recursive-inner-simd-paral-8: 5.911 37 | recursive-simd-paral-8: 5.942 38 | recursive-inner-simd-paral-cutoff-8: 5.907 39 | recursive-simd-paral-cutoff-8: 5.946 40 | recursive-inner-16: 9.724 41 | recursive-16: 9.754 42 | recursive-inner-paral-16: 2.469 43 | recursive-paral-16: 2.499 44 | recursive-inner-paral-cutoff-16: 2.477 45 | recursive-paral-cutoff-16: 2.507 46 | recursive-inner-simd-16: 4.675 47 | recursive-simd-16: 4.706 48 | recursive-inner-simd-paral-16: 1.288 49 | recursive-simd-paral-16: 1.318 50 | recursive-inner-simd-paral-cutoff-16: 1.287 
51 | recursive-simd-paral-cutoff-16: 1.317 52 | recursive-inner-32: 9.414 53 | recursive-32: 9.448 54 | recursive-inner-paral-32: 2.395 55 | recursive-paral-32: 2.428 56 | recursive-inner-paral-cutoff-32: 2.454 57 | recursive-paral-cutoff-32: 2.488 58 | recursive-inner-simd-32: 2.079 59 | recursive-simd-32: 2.113 60 | recursive-inner-simd-paral-32: 0.548 61 | recursive-simd-paral-32: 0.582 62 | recursive-inner-simd-paral-cutoff-32: 0.546 63 | recursive-simd-paral-cutoff-32: 0.580 64 | strassen-inner-32: 0.374 65 | strassen-32: 0.409 66 | recursive-inner-64: 9.384 67 | recursive-64: 9.416 68 | recursive-inner-paral-64: 2.409 69 | recursive-paral-64: 2.439 70 | recursive-inner-paral-cutoff-64: 2.396 71 | recursive-paral-cutoff-64: 2.427 72 | recursive-inner-simd-64: 1.792 73 | recursive-simd-64: 1.822 74 | recursive-inner-simd-paral-64: 0.473 75 | recursive-simd-paral-64: 0.504 76 | recursive-inner-simd-paral-cutoff-64: 0.468 77 | recursive-simd-paral-cutoff-64: 0.499 78 | strassen-inner-64: 0.364 79 | strassen-64: 0.395 80 | recursive-inner-128: 11.481 81 | recursive-128: 11.510 82 | recursive-inner-paral-128: 2.969 83 | recursive-paral-128: 2.998 84 | recursive-inner-paral-cutoff-128: 2.974 85 | recursive-paral-cutoff-128: 3.004 86 | recursive-inner-simd-128: 1.434 87 | recursive-simd-128: 1.464 88 | recursive-inner-simd-paral-128: 0.383 89 | recursive-simd-paral-128: 0.413 90 | recursive-inner-simd-paral-cutoff-128: 0.394 91 | recursive-simd-paral-cutoff-128: 0.425 92 | strassen-inner-128: 0.333 93 | strassen-128: 0.365 94 | recursive-inner-256: 11.997 95 | recursive-256: 12.026 96 | recursive-inner-paral-256: 3.196 97 | recursive-paral-256: 3.229 98 | recursive-inner-paral-cutoff-256: 3.198 99 | recursive-paral-cutoff-256: 3.229 100 | recursive-inner-simd-256: 1.455 101 | recursive-simd-256: 1.483 102 | recursive-inner-simd-paral-256: 0.466 103 | recursive-simd-paral-256: 0.495 104 | recursive-inner-simd-paral-cutoff-256: 0.478 105 | 
recursive-simd-paral-cutoff-256: 0.514 106 | strassen-inner-256: 0.441 107 | strassen-256: 0.470 108 | recursive-inner-512: 16.512 109 | recursive-512: 16.540 110 | recursive-inner-paral-512: 26.321 111 | recursive-paral-512: 26.353 112 | recursive-inner-paral-cutoff-512: 26.365 113 | recursive-paral-cutoff-512: 26.398 114 | recursive-inner-simd-512: 1.637 115 | recursive-simd-512: 1.664 116 | recursive-inner-simd-paral-512: 0.565 117 | recursive-simd-paral-512: 0.592 118 | recursive-inner-simd-paral-cutoff-512: 0.571 119 | recursive-simd-paral-cutoff-512: 0.603 120 | strassen-inner-512: 0.531 121 | strassen-512: 0.560 122 | recursive-inner-1024: 107.437 123 | recursive-1024: 107.466 124 | recursive-inner-paral-1024: 107.468 125 | recursive-paral-1024: 107.495 126 | recursive-inner-paral-cutoff-1024: 107.483 127 | recursive-paral-cutoff-1024: 107.511 128 | recursive-inner-simd-1024: 1.363 129 | recursive-simd-1024: 1.391 130 | recursive-inner-simd-paral-1024: 1.344 131 | recursive-simd-paral-1024: 1.372 132 | recursive-inner-simd-paral-cutoff-1024: 1.362 133 | recursive-simd-paral-cutoff-1024: 1.390 134 | strassen-inner-1024: 1.357 135 | strassen-1024: 1.385 136 | -------------------------------------------------------------------------------- /measured/xeon-1.8GHz-20_2/2048.out: -------------------------------------------------------------------------------- 1 | simple: 45.963 2 | colcp: 27.053 3 | simd: 1.876 4 | recursive-inner-1: 129.510 5 | recursive-1: 129.632 6 | recursive-inner-paral-1: 481.157 7 | recursive-paral-1: 481.302 8 | recursive-inner-paral-cutoff-1: 11.446 9 | recursive-paral-cutoff-1: 11.590 10 | recursive-inner-2: 35.535 11 | recursive-2: 35.621 12 | recursive-inner-paral-2: 69.466 13 | recursive-paral-2: 69.556 14 | recursive-inner-paral-cutoff-2: 3.000 15 | recursive-paral-cutoff-2: 3.088 16 | recursive-inner-4: 24.011 17 | recursive-4: 24.073 18 | recursive-inner-paral-4: 15.422 19 | recursive-paral-4: 15.490 20 | 
recursive-inner-paral-cutoff-4: 2.104 21 | recursive-paral-cutoff-4: 2.179 22 | recursive-inner-simd-4: 649.417 23 | recursive-simd-4: 649.482 24 | recursive-inner-simd-paral-4: 46.372 25 | recursive-simd-paral-4: 46.452 26 | recursive-inner-simd-paral-cutoff-4: 40.055 27 | recursive-simd-paral-cutoff-4: 40.130 28 | recursive-inner-8: 23.368 29 | recursive-8: 23.429 30 | recursive-inner-paral-8: 3.082 31 | recursive-paral-8: 3.151 32 | recursive-inner-paral-cutoff-8: 1.851 33 | recursive-paral-cutoff-8: 1.914 34 | recursive-inner-simd-8: 80.225 35 | recursive-simd-8: 80.289 36 | recursive-inner-simd-paral-8: 6.081 37 | recursive-simd-paral-8: 6.148 38 | recursive-inner-simd-paral-cutoff-8: 5.563 39 | recursive-simd-paral-cutoff-8: 5.627 40 | recursive-inner-16: 24.499 41 | recursive-16: 24.560 42 | recursive-inner-paral-16: 1.858 43 | recursive-paral-16: 1.921 44 | recursive-inner-paral-cutoff-16: 1.774 45 | recursive-paral-cutoff-16: 1.837 46 | recursive-inner-simd-16: 12.807 47 | recursive-simd-16: 12.871 48 | recursive-inner-simd-paral-16: 1.071 49 | recursive-simd-paral-16: 1.135 50 | recursive-inner-simd-paral-cutoff-16: 0.838 51 | recursive-simd-paral-cutoff-16: 0.901 52 | recursive-inner-32: 25.529 53 | recursive-32: 25.589 54 | recursive-inner-paral-32: 1.847 55 | recursive-paral-32: 1.912 56 | recursive-inner-paral-cutoff-32: 1.882 57 | recursive-paral-cutoff-32: 1.945 58 | recursive-inner-simd-32: 4.070 59 | recursive-simd-32: 4.131 60 | recursive-inner-simd-paral-32: 0.341 61 | recursive-simd-paral-32: 0.404 62 | recursive-inner-simd-paral-cutoff-32: 0.340 63 | recursive-simd-paral-cutoff-32: 0.402 64 | strassen-inner-32: 0.262 65 | strassen-32: 0.322 66 | recursive-inner-64: 26.962 67 | recursive-64: 27.023 68 | recursive-inner-paral-64: 1.809 69 | recursive-paral-64: 1.871 70 | recursive-inner-paral-cutoff-64: 1.919 71 | recursive-paral-cutoff-64: 1.980 72 | recursive-inner-simd-64: 2.404 73 | recursive-simd-64: 2.463 74 | 
recursive-inner-simd-paral-64: 0.202 75 | recursive-simd-paral-64: 0.265 76 | recursive-inner-simd-paral-cutoff-64: 0.209 77 | recursive-simd-paral-cutoff-64: 0.272 78 | strassen-inner-64: 0.210 79 | strassen-64: 0.269 80 | recursive-inner-128: 27.371 81 | recursive-128: 27.432 82 | recursive-inner-paral-128: 1.951 83 | recursive-paral-128: 2.013 84 | recursive-inner-paral-cutoff-128: 1.973 85 | recursive-paral-cutoff-128: 2.033 86 | recursive-inner-simd-128: 1.847 87 | recursive-simd-128: 1.904 88 | recursive-inner-simd-paral-128: 0.189 89 | recursive-simd-paral-128: 0.250 90 | recursive-inner-simd-paral-cutoff-128: 0.199 91 | recursive-simd-paral-cutoff-128: 0.260 92 | strassen-inner-128: 0.202 93 | strassen-128: 0.259 94 | recursive-inner-256: 33.911 95 | recursive-256: 33.967 96 | recursive-inner-paral-256: 2.616 97 | recursive-paral-256: 2.680 98 | recursive-inner-paral-cutoff-256: 2.581 99 | recursive-paral-cutoff-256: 2.638 100 | recursive-inner-simd-256: 2.552 101 | recursive-simd-256: 2.607 102 | recursive-inner-simd-paral-256: 0.217 103 | recursive-simd-paral-256: 0.275 104 | recursive-inner-simd-paral-cutoff-256: 0.188 105 | recursive-simd-paral-cutoff-256: 0.246 106 | strassen-inner-256: 0.228 107 | strassen-256: 0.283 108 | recursive-inner-512: 38.888 109 | recursive-512: 38.944 110 | recursive-inner-paral-512: 3.378 111 | recursive-paral-512: 3.436 112 | recursive-inner-paral-cutoff-512: 4.333 113 | recursive-paral-cutoff-512: 4.389 114 | recursive-inner-simd-512: 2.013 115 | recursive-simd-512: 2.067 116 | recursive-inner-simd-paral-512: 0.294 117 | recursive-simd-paral-512: 0.351 118 | recursive-inner-simd-paral-cutoff-512: 0.259 119 | recursive-simd-paral-cutoff-512: 0.316 120 | strassen-inner-512: 0.239 121 | strassen-512: 0.293 122 | recursive-inner-1024: 41.918 123 | recursive-1024: 41.972 124 | recursive-inner-paral-1024: 11.030 125 | recursive-paral-1024: 11.087 126 | recursive-inner-paral-cutoff-1024: 11.118 127 | recursive-paral-cutoff-1024: 
11.175 128 | recursive-inner-simd-1024: 1.652 129 | recursive-simd-1024: 1.707 130 | recursive-inner-simd-paral-1024: 0.484 131 | recursive-simd-paral-1024: 0.540 132 | recursive-inner-simd-paral-cutoff-1024: 0.450 133 | recursive-simd-paral-cutoff-1024: 0.507 134 | strassen-inner-1024: 0.446 135 | strassen-1024: 0.500 136 | -------------------------------------------------------------------------------- /measured/buldozer-4.2GHz-8/2048.out: -------------------------------------------------------------------------------- 1 | simple: 322.512 2 | colcp: 34.859 3 | simd: 5.803 4 | recursive-inner-1: 177.891 5 | recursive-1: 178.025 6 | recursive-inner-paral-1: 90.966 7 | recursive-paral-1: 91.098 8 | recursive-inner-paral-cutoff-1: 35.022 9 | recursive-paral-cutoff-1: 35.156 10 | recursive-inner-2: 54.023 11 | recursive-2: 54.099 12 | recursive-inner-paral-2: 16.020 13 | recursive-paral-2: 16.098 14 | recursive-inner-paral-cutoff-2: 8.513 15 | recursive-paral-cutoff-2: 8.590 16 | recursive-inner-4: 34.713 17 | recursive-4: 34.773 18 | recursive-inner-paral-4: 6.144 19 | recursive-paral-4: 6.203 20 | recursive-inner-paral-cutoff-4: 5.188 21 | recursive-paral-cutoff-4: 5.246 22 | recursive-inner-simd-4: 447.524 23 | recursive-simd-4: 447.583 24 | recursive-inner-simd-paral-4: 107.978 25 | recursive-simd-paral-4: 108.036 26 | recursive-inner-simd-paral-cutoff-4: 108.885 27 | recursive-simd-paral-cutoff-4: 108.947 28 | recursive-inner-8: 33.017 29 | recursive-8: 33.086 30 | recursive-inner-paral-8: 4.947 31 | recursive-paral-8: 5.015 32 | recursive-inner-paral-cutoff-8: 4.851 33 | recursive-paral-cutoff-8: 4.920 34 | recursive-inner-simd-8: 67.380 35 | recursive-simd-8: 67.449 36 | recursive-inner-simd-paral-8: 14.634 37 | recursive-simd-paral-8: 14.707 38 | recursive-inner-simd-paral-cutoff-8: 14.598 39 | recursive-simd-paral-cutoff-8: 14.666 40 | recursive-inner-16: 33.465 41 | recursive-16: 33.531 42 | recursive-inner-paral-16: 4.786 43 | recursive-paral-16: 4.856 44 | 
recursive-inner-paral-cutoff-16: 4.853 45 | recursive-paral-cutoff-16: 4.922 46 | recursive-inner-simd-16: 15.825 47 | recursive-simd-16: 15.891 48 | recursive-inner-simd-paral-16: 2.930 49 | recursive-simd-paral-16: 2.996 50 | recursive-inner-simd-paral-cutoff-16: 2.935 51 | recursive-simd-paral-cutoff-16: 3.002 52 | recursive-inner-32: 37.145 53 | recursive-32: 37.197 54 | recursive-inner-paral-32: 5.079 55 | recursive-paral-32: 5.132 56 | recursive-inner-paral-cutoff-32: 5.173 57 | recursive-paral-cutoff-32: 5.226 58 | recursive-inner-simd-32: 7.133 59 | recursive-simd-32: 7.185 60 | recursive-inner-simd-paral-32: 1.317 61 | recursive-simd-paral-32: 1.369 62 | recursive-inner-simd-paral-cutoff-32: 1.319 63 | recursive-simd-paral-cutoff-32: 1.372 64 | strassen-inner-32: 0.759 65 | strassen-32: 0.810 66 | recursive-inner-64: 38.166 67 | recursive-64: 38.208 68 | recursive-inner-paral-64: 5.295 69 | recursive-paral-64: 5.338 70 | recursive-inner-paral-cutoff-64: 5.317 71 | recursive-paral-cutoff-64: 5.357 72 | recursive-inner-simd-64: 5.057 73 | recursive-simd-64: 5.100 74 | recursive-inner-simd-paral-64: 0.908 75 | recursive-simd-paral-64: 0.952 76 | recursive-inner-simd-paral-cutoff-64: 0.919 77 | recursive-simd-paral-cutoff-64: 0.962 78 | strassen-inner-64: 0.625 79 | strassen-64: 0.668 80 | recursive-inner-128: 37.630 81 | recursive-128: 37.672 82 | recursive-inner-paral-128: 5.747 83 | recursive-paral-128: 5.790 84 | recursive-inner-paral-cutoff-128: 5.658 85 | recursive-paral-cutoff-128: 5.701 86 | recursive-inner-simd-128: 4.388 87 | recursive-simd-128: 4.430 88 | recursive-inner-simd-paral-128: 0.791 89 | recursive-simd-paral-128: 0.834 90 | recursive-inner-simd-paral-cutoff-128: 0.791 91 | recursive-simd-paral-cutoff-128: 0.834 92 | strassen-inner-128: 0.590 93 | strassen-128: 0.632 94 | recursive-inner-256: 39.992 95 | recursive-256: 40.035 96 | recursive-inner-paral-256: 6.145 97 | recursive-paral-256: 6.186 98 | recursive-inner-paral-cutoff-256: 6.046 
99 | recursive-paral-cutoff-256: 6.087 100 | recursive-inner-simd-256: 3.706 101 | recursive-simd-256: 3.749 102 | recursive-inner-simd-paral-256: 0.718 103 | recursive-simd-paral-256: 0.759 104 | recursive-inner-simd-paral-cutoff-256: 0.710 105 | recursive-simd-paral-cutoff-256: 0.751 106 | strassen-inner-256: 0.555 107 | strassen-256: 0.599 108 | recursive-inner-512: 45.647 109 | recursive-512: 45.690 110 | recursive-inner-paral-512: 6.251 111 | recursive-paral-512: 6.295 112 | recursive-inner-paral-cutoff-512: 6.284 113 | recursive-paral-cutoff-512: 6.324 114 | recursive-inner-simd-512: 3.458 115 | recursive-simd-512: 3.500 116 | recursive-inner-simd-paral-512: 0.689 117 | recursive-simd-paral-512: 0.731 118 | recursive-inner-simd-paral-cutoff-512: 0.666 119 | recursive-simd-paral-cutoff-512: 0.708 120 | strassen-inner-512: 0.595 121 | strassen-512: 0.637 122 | recursive-inner-1024: 116.501 123 | recursive-1024: 116.541 124 | recursive-inner-paral-1024: 32.653 125 | recursive-paral-1024: 32.699 126 | recursive-inner-paral-cutoff-1024: 31.158 127 | recursive-paral-cutoff-1024: 31.199 128 | recursive-inner-simd-1024: 4.962 129 | recursive-simd-1024: 5.002 130 | recursive-inner-simd-paral-1024: 1.596 131 | recursive-simd-paral-1024: 1.634 132 | recursive-inner-simd-paral-cutoff-1024: 1.558 133 | recursive-simd-paral-cutoff-1024: 1.598 134 | strassen-inner-1024: 1.680 135 | strassen-1024: 1.720 136 | -------------------------------------------------------------------------------- /measured/celeron-1.8GHz-4/2048.out: -------------------------------------------------------------------------------- 1 | simple: 861.452 2 | colcp: 43.175 3 | simd: 10.456 4 | recursive-inner-1: 421.227 5 | recursive-1: 421.508 6 | recursive-inner-paral-1: 196.227 7 | recursive-paral-1: 196.508 8 | recursive-inner-paral-cutoff-1: 106.742 9 | recursive-paral-cutoff-1: 107.023 10 | recursive-inner-2: 141.325 11 | recursive-2: 141.490 12 | recursive-inner-paral-2: 48.682 13 | 
recursive-paral-2: 48.846 14 | recursive-inner-paral-cutoff-2: 36.515 15 | recursive-paral-cutoff-2: 36.684 16 | recursive-inner-4: 97.049 17 | recursive-4: 97.182 18 | recursive-inner-paral-4: 26.127 19 | recursive-paral-4: 26.260 20 | recursive-inner-paral-cutoff-4: 24.758 21 | recursive-paral-cutoff-4: 24.890 22 | recursive-inner-simd-4: 1050.338 23 | recursive-simd-4: 1050.470 24 | recursive-inner-simd-paral-4: 318.628 25 | recursive-simd-paral-4: 318.760 26 | recursive-inner-simd-paral-cutoff-4: 315.719 27 | recursive-simd-paral-cutoff-4: 315.856 28 | recursive-inner-8: 83.576 29 | recursive-8: 83.703 30 | recursive-inner-paral-8: 21.411 31 | recursive-paral-8: 21.538 32 | recursive-inner-paral-cutoff-8: 21.309 33 | recursive-paral-cutoff-8: 21.439 34 | recursive-inner-simd-8: 158.618 35 | recursive-simd-8: 158.744 36 | recursive-inner-simd-paral-8: 46.569 37 | recursive-simd-paral-8: 46.695 38 | recursive-inner-simd-paral-cutoff-8: 46.887 39 | recursive-simd-paral-cutoff-8: 47.017 40 | recursive-inner-16: 77.788 41 | recursive-16: 77.910 42 | recursive-inner-paral-16: 19.742 43 | recursive-paral-16: 19.863 44 | recursive-inner-paral-cutoff-16: 19.745 45 | recursive-paral-cutoff-16: 19.864 46 | recursive-inner-simd-16: 37.812 47 | recursive-simd-16: 37.934 48 | recursive-inner-simd-paral-16: 10.225 49 | recursive-simd-paral-16: 10.346 50 | recursive-inner-simd-paral-cutoff-16: 10.226 51 | recursive-simd-paral-cutoff-16: 10.348 52 | recursive-inner-32: 75.304 53 | recursive-32: 75.439 54 | recursive-inner-paral-32: 19.113 55 | recursive-paral-32: 19.247 56 | recursive-inner-paral-cutoff-32: 19.162 57 | recursive-paral-cutoff-32: 19.301 58 | recursive-inner-simd-32: 16.598 59 | recursive-simd-32: 16.732 60 | recursive-inner-simd-paral-32: 4.318 61 | recursive-simd-paral-32: 4.454 62 | recursive-inner-simd-paral-cutoff-32: 4.298 63 | recursive-simd-paral-cutoff-32: 4.433 64 | strassen-inner-32: 2.623 65 | strassen-32: 2.759 66 | recursive-inner-64: 75.073 67 | 
recursive-64: 75.197 68 | recursive-inner-paral-64: 19.067 69 | recursive-paral-64: 19.191 70 | recursive-inner-paral-cutoff-64: 19.066 71 | recursive-paral-cutoff-64: 19.191 72 | recursive-inner-simd-64: 14.181 73 | recursive-simd-64: 14.304 74 | recursive-inner-simd-paral-64: 3.705 75 | recursive-simd-paral-64: 3.827 76 | recursive-inner-simd-paral-cutoff-64: 3.740 77 | recursive-simd-paral-cutoff-64: 3.863 78 | strassen-inner-64: 2.501 79 | strassen-64: 2.624 80 | recursive-inner-128: 91.706 81 | recursive-128: 91.828 82 | recursive-inner-paral-128: 23.796 83 | recursive-paral-128: 23.921 84 | recursive-inner-paral-cutoff-128: 23.691 85 | recursive-paral-cutoff-128: 23.820 86 | recursive-inner-simd-128: 12.586 87 | recursive-simd-128: 12.707 88 | recursive-inner-simd-paral-128: 3.062 89 | recursive-simd-paral-128: 3.184 90 | recursive-inner-simd-paral-cutoff-128: 3.069 91 | recursive-simd-paral-cutoff-128: 3.191 92 | strassen-inner-128: 2.176 93 | strassen-128: 2.297 94 | recursive-inner-256: 95.722 95 | recursive-256: 95.837 96 | recursive-inner-paral-256: 25.468 97 | recursive-paral-256: 25.591 98 | recursive-inner-paral-cutoff-256: 25.219 99 | recursive-paral-cutoff-256: 25.333 100 | recursive-inner-simd-256: 10.732 101 | recursive-simd-256: 10.845 102 | recursive-inner-simd-paral-256: 3.704 103 | recursive-simd-paral-256: 3.820 104 | recursive-inner-simd-paral-cutoff-256: 3.640 105 | recursive-simd-paral-cutoff-256: 3.754 106 | strassen-inner-256: 2.728 107 | strassen-256: 2.843 108 | recursive-inner-512: 131.094 109 | recursive-512: 131.204 110 | recursive-inner-paral-512: 163.235 111 | recursive-paral-512: 163.352 112 | recursive-inner-paral-cutoff-512: 95.145 113 | recursive-paral-cutoff-512: 95.259 114 | recursive-inner-simd-512: 13.019 115 | recursive-simd-512: 13.129 116 | recursive-inner-simd-paral-512: 3.823 117 | recursive-simd-paral-512: 3.940 118 | recursive-inner-simd-paral-cutoff-512: 4.489 119 | recursive-simd-paral-cutoff-512: 4.608 120 | 
strassen-inner-512: 3.722 121 | strassen-512: 3.833 122 | recursive-inner-1024: 859.533 123 | recursive-1024: 859.642 124 | recursive-inner-paral-1024: 228.785 125 | recursive-paral-1024: 228.903 126 | recursive-inner-paral-cutoff-1024: 224.399 127 | recursive-paral-cutoff-1024: 224.517 128 | recursive-inner-simd-1024: 10.847 129 | recursive-simd-1024: 10.956 130 | recursive-inner-simd-paral-1024: 4.157 131 | recursive-simd-paral-1024: 4.269 132 | recursive-inner-simd-paral-cutoff-1024: 3.794 133 | recursive-simd-paral-cutoff-1024: 3.908 134 | strassen-inner-1024: 3.877 135 | strassen-1024: 3.986 136 | -------------------------------------------------------------------------------- /measured/xeon-3.5GHz-4_2/4096.out: -------------------------------------------------------------------------------- 1 | simple: 1180.623 2 | colcp: 168.238 3 | simd: 15.677 4 | recursive-inner-1: 764.258 5 | recursive-1: 764.545 6 | recursive-inner-paral-1: 482.762 7 | recursive-paral-1: 483.049 8 | recursive-inner-paral-cutoff-1: 216.387 9 | recursive-paral-cutoff-1: 216.672 10 | recursive-inner-2: 215.565 11 | recursive-2: 215.743 12 | recursive-inner-paral-2: 93.789 13 | recursive-paral-2: 93.966 14 | recursive-inner-paral-cutoff-2: 57.691 15 | recursive-paral-cutoff-2: 57.876 16 | recursive-inner-4: 152.099 17 | recursive-4: 152.275 18 | recursive-inner-paral-4: 42.795 19 | recursive-paral-4: 42.940 20 | recursive-inner-paral-cutoff-4: 37.802 21 | recursive-paral-cutoff-4: 37.950 22 | recursive-inner-simd-4: 2763.179 23 | recursive-simd-4: 2763.351 24 | recursive-inner-simd-paral-4: 605.781 25 | recursive-simd-paral-4: 605.927 26 | recursive-inner-simd-paral-cutoff-4: 599.949 27 | recursive-simd-paral-cutoff-4: 600.094 28 | recursive-inner-8: 146.411 29 | recursive-8: 146.549 30 | recursive-inner-paral-8: 35.880 31 | recursive-paral-8: 36.018 32 | recursive-inner-paral-cutoff-8: 34.969 33 | recursive-paral-cutoff-8: 35.109 34 | recursive-inner-simd-8: 328.874 35 | recursive-simd-8: 
329.025 36 | recursive-inner-simd-paral-8: 76.885 37 | recursive-simd-paral-8: 77.049 38 | recursive-inner-simd-paral-cutoff-8: 76.355 39 | recursive-simd-paral-cutoff-8: 76.493 40 | recursive-inner-16: 153.405 41 | recursive-16: 153.545 42 | recursive-inner-paral-16: 34.292 43 | recursive-paral-16: 34.435 44 | recursive-inner-paral-cutoff-16: 34.558 45 | recursive-paral-cutoff-16: 34.699 46 | recursive-inner-simd-16: 60.050 47 | recursive-simd-16: 60.191 48 | recursive-inner-simd-paral-16: 13.065 49 | recursive-simd-paral-16: 13.208 50 | recursive-inner-simd-paral-cutoff-16: 13.130 51 | recursive-simd-paral-cutoff-16: 13.280 52 | recursive-inner-32: 164.084 53 | recursive-32: 164.251 54 | recursive-inner-paral-32: 36.112 55 | recursive-paral-32: 36.258 56 | recursive-inner-paral-cutoff-32: 35.792 57 | recursive-paral-cutoff-32: 35.936 58 | recursive-inner-simd-32: 23.015 59 | recursive-simd-32: 23.159 60 | recursive-inner-simd-paral-32: 5.694 61 | recursive-simd-paral-32: 5.841 62 | recursive-inner-simd-paral-cutoff-32: 5.613 63 | recursive-simd-paral-cutoff-32: 5.757 64 | strassen-inner-32: 2.921 65 | strassen-32: 3.080 66 | recursive-inner-64: 166.780 67 | recursive-64: 166.953 68 | recursive-inner-paral-64: 34.960 69 | recursive-paral-64: 35.105 70 | recursive-inner-paral-cutoff-64: 35.223 71 | recursive-paral-cutoff-64: 35.367 72 | recursive-inner-simd-64: 14.187 73 | recursive-simd-64: 14.340 74 | recursive-inner-simd-paral-64: 3.834 75 | recursive-simd-paral-64: 4.006 76 | recursive-inner-simd-paral-cutoff-64: 3.586 77 | recursive-simd-paral-cutoff-64: 3.732 78 | strassen-inner-64: 2.335 79 | strassen-64: 2.487 80 | recursive-inner-128: 171.203 81 | recursive-128: 171.347 82 | recursive-inner-paral-128: 38.909 83 | recursive-paral-128: 39.054 84 | recursive-inner-paral-cutoff-128: 39.269 85 | recursive-paral-cutoff-128: 39.412 86 | recursive-inner-simd-128: 11.585 87 | recursive-simd-128: 11.728 88 | recursive-inner-simd-paral-128: 2.975 89 | 
recursive-simd-paral-128: 3.118 90 | recursive-inner-simd-paral-cutoff-128: 2.934 91 | recursive-simd-paral-cutoff-128: 3.078 92 | strassen-inner-128: 2.128 93 | strassen-128: 2.275 94 | recursive-inner-256: 199.471 95 | recursive-256: 199.613 96 | recursive-inner-paral-256: 51.124 97 | recursive-paral-256: 51.266 98 | recursive-inner-paral-cutoff-256: 50.832 99 | recursive-paral-cutoff-256: 50.976 100 | recursive-inner-simd-256: 12.558 101 | recursive-simd-256: 12.701 102 | recursive-inner-simd-paral-256: 2.986 103 | recursive-simd-paral-256: 3.128 104 | recursive-inner-simd-paral-cutoff-256: 3.089 105 | recursive-simd-paral-cutoff-256: 3.236 106 | strassen-inner-256: 2.181 107 | strassen-256: 2.351 108 | recursive-inner-512: 235.186 109 | recursive-512: 235.327 110 | recursive-inner-paral-512: 57.566 111 | recursive-paral-512: 57.705 112 | recursive-inner-paral-cutoff-512: 58.041 113 | recursive-paral-cutoff-512: 58.181 114 | recursive-inner-simd-512: 11.326 115 | recursive-simd-512: 11.466 116 | recursive-inner-simd-paral-512: 3.453 117 | recursive-simd-paral-512: 3.593 118 | recursive-inner-simd-paral-cutoff-512: 3.264 119 | recursive-simd-paral-cutoff-512: 3.404 120 | strassen-inner-512: 3.462 121 | strassen-512: 3.615 122 | recursive-inner-1024: 283.807 123 | recursive-1024: 283.958 124 | recursive-inner-paral-1024: 83.778 125 | recursive-paral-1024: 83.928 126 | recursive-inner-paral-cutoff-1024: 90.004 127 | recursive-paral-cutoff-1024: 90.143 128 | recursive-inner-simd-1024: 16.652 129 | recursive-simd-1024: 16.794 130 | recursive-inner-simd-paral-1024: 5.879 131 | recursive-simd-paral-1024: 6.020 132 | recursive-inner-simd-paral-cutoff-1024: 5.740 133 | recursive-simd-paral-cutoff-1024: 5.885 134 | strassen-inner-1024: 7.671 135 | strassen-1024: 7.825 136 | -------------------------------------------------------------------------------- /measured/xeon-1.8GHz-20_2/4096.out: -------------------------------------------------------------------------------- 1 
| simple: 494.976 2 | colcp: 216.626 3 | simd: 23.838 4 | recursive-inner-1: 1043.710 5 | recursive-1: 1044.223 6 | recursive-inner-paral-1: 3430.315 7 | recursive-paral-1: 3430.940 8 | recursive-inner-paral-cutoff-1: 88.616 9 | recursive-paral-cutoff-1: 89.190 10 | recursive-inner-2: 285.257 11 | recursive-2: 285.626 12 | recursive-inner-paral-2: 566.596 13 | recursive-paral-2: 567.040 14 | recursive-inner-paral-cutoff-2: 23.482 15 | recursive-paral-cutoff-2: 23.897 16 | recursive-inner-4: 192.555 17 | recursive-4: 192.852 18 | recursive-inner-paral-4: 115.511 19 | recursive-paral-4: 115.806 20 | recursive-inner-paral-cutoff-4: 15.133 21 | recursive-paral-cutoff-4: 15.527 22 | recursive-inner-simd-4: 5178.543 23 | recursive-simd-4: 5178.813 24 | recursive-inner-simd-paral-4: 368.278 25 | recursive-simd-paral-4: 368.586 26 | recursive-inner-simd-paral-cutoff-4: 316.895 27 | recursive-simd-paral-cutoff-4: 317.207 28 | recursive-inner-8: 187.071 29 | recursive-8: 187.440 30 | recursive-inner-paral-8: 25.932 31 | recursive-paral-8: 26.198 32 | recursive-inner-paral-cutoff-8: 14.198 33 | recursive-paral-cutoff-8: 14.451 34 | recursive-inner-simd-8: 632.247 35 | recursive-simd-8: 632.521 36 | recursive-inner-simd-paral-8: 48.212 37 | recursive-simd-paral-8: 48.511 38 | recursive-inner-simd-paral-cutoff-8: 41.823 39 | recursive-simd-paral-cutoff-8: 42.120 40 | recursive-inner-16: 195.366 41 | recursive-16: 195.686 42 | recursive-inner-paral-16: 14.684 43 | recursive-paral-16: 15.152 44 | recursive-inner-paral-cutoff-16: 14.108 45 | recursive-paral-cutoff-16: 14.376 46 | recursive-inner-simd-16: 102.099 47 | recursive-simd-16: 102.348 48 | recursive-inner-simd-paral-16: 8.530 49 | recursive-simd-paral-16: 8.847 50 | recursive-inner-simd-paral-cutoff-16: 6.673 51 | recursive-simd-paral-cutoff-16: 6.992 52 | recursive-inner-32: 204.687 53 | recursive-32: 204.958 54 | recursive-inner-paral-32: 14.491 55 | recursive-paral-32: 14.757 56 | recursive-inner-paral-cutoff-32: 
14.340 57 | recursive-paral-cutoff-32: 14.593 58 | recursive-inner-simd-32: 32.567 59 | recursive-simd-32: 32.830 60 | recursive-inner-simd-paral-32: 2.640 61 | recursive-simd-paral-32: 2.884 62 | recursive-inner-simd-paral-cutoff-32: 2.489 63 | recursive-simd-paral-cutoff-32: 2.733 64 | strassen-inner-32: 1.526 65 | strassen-32: 1.766 66 | recursive-inner-64: 215.963 67 | recursive-64: 216.206 68 | recursive-inner-paral-64: 14.483 69 | recursive-paral-64: 14.759 70 | recursive-inner-paral-cutoff-64: 14.627 71 | recursive-paral-cutoff-64: 14.869 72 | recursive-inner-simd-64: 19.179 73 | recursive-simd-64: 19.436 74 | recursive-inner-simd-paral-64: 1.559 75 | recursive-simd-paral-64: 1.812 76 | recursive-inner-simd-paral-cutoff-64: 1.609 77 | recursive-simd-paral-cutoff-64: 1.882 78 | strassen-inner-64: 1.225 79 | strassen-64: 1.481 80 | recursive-inner-128: 219.139 81 | recursive-128: 219.376 82 | recursive-inner-paral-128: 15.198 83 | recursive-paral-128: 15.429 84 | recursive-inner-paral-cutoff-128: 14.892 85 | recursive-paral-cutoff-128: 15.129 86 | recursive-inner-simd-128: 14.742 87 | recursive-simd-128: 14.995 88 | recursive-inner-simd-paral-128: 1.358 89 | recursive-simd-paral-128: 1.592 90 | recursive-inner-simd-paral-cutoff-128: 1.350 91 | recursive-simd-paral-cutoff-128: 1.607 92 | strassen-inner-128: 1.154 93 | strassen-128: 1.382 94 | recursive-inner-256: 269.391 95 | recursive-256: 269.611 96 | recursive-inner-paral-256: 19.733 97 | recursive-paral-256: 19.984 98 | recursive-inner-paral-cutoff-256: 19.373 99 | recursive-paral-cutoff-256: 19.622 100 | recursive-inner-simd-256: 17.753 101 | recursive-simd-256: 17.973 102 | recursive-inner-simd-paral-256: 1.289 103 | recursive-simd-paral-256: 1.512 104 | recursive-inner-simd-paral-cutoff-256: 1.355 105 | recursive-simd-paral-cutoff-256: 1.579 106 | strassen-inner-256: 1.316 107 | strassen-256: 1.535 108 | recursive-inner-512: 311.659 109 | recursive-512: 311.884 110 | recursive-inner-paral-512: 22.399 111 
| recursive-paral-512: 22.645 112 | recursive-inner-paral-cutoff-512: 23.337 113 | recursive-paral-cutoff-512: 23.581 114 | recursive-inner-simd-512: 16.148 115 | recursive-simd-512: 16.370 116 | recursive-inner-simd-paral-512: 1.247 117 | recursive-simd-paral-512: 1.466 118 | recursive-inner-simd-paral-cutoff-512: 1.315 119 | recursive-simd-paral-cutoff-512: 1.535 120 | strassen-inner-512: 1.409 121 | strassen-512: 1.625 122 | recursive-inner-1024: 336.774 123 | recursive-1024: 336.998 124 | recursive-inner-paral-1024: 27.816 125 | recursive-paral-1024: 28.062 126 | recursive-inner-paral-cutoff-1024: 37.513 127 | recursive-paral-cutoff-1024: 37.759 128 | recursive-inner-simd-1024: 13.170 129 | recursive-simd-1024: 13.409 130 | recursive-inner-simd-paral-1024: 2.069 131 | recursive-simd-paral-1024: 2.285 132 | recursive-inner-simd-paral-cutoff-1024: 1.935 133 | recursive-simd-paral-cutoff-1024: 2.156 134 | strassen-inner-1024: 7.195 135 | strassen-1024: 7.408 136 | -------------------------------------------------------------------------------- /measured/buldozer-4.2GHz-8/4096.out: -------------------------------------------------------------------------------- 1 | simple: 2928.770 2 | colcp: 277.041 3 | simd: 44.112 4 | recursive-inner-1: 1422.658 5 | recursive-1: 1423.251 6 | recursive-inner-paral-1: 725.342 7 | recursive-paral-1: 725.931 8 | recursive-inner-paral-cutoff-1: 278.657 9 | recursive-paral-cutoff-1: 279.253 10 | recursive-inner-2: 432.606 11 | recursive-2: 432.935 12 | recursive-inner-paral-2: 124.779 13 | recursive-paral-2: 125.100 14 | recursive-inner-paral-cutoff-2: 67.234 15 | recursive-paral-cutoff-2: 67.557 16 | recursive-inner-4: 271.491 17 | recursive-4: 271.743 18 | recursive-inner-paral-4: 47.902 19 | recursive-paral-4: 48.155 20 | recursive-inner-paral-cutoff-4: 40.753 21 | recursive-paral-cutoff-4: 41.001 22 | recursive-inner-simd-4: 3641.101 23 | recursive-simd-4: 3641.347 24 | recursive-inner-simd-paral-4: 863.580 25 | 
recursive-simd-paral-4: 863.833 26 | recursive-inner-simd-paral-cutoff-4: 860.633 27 | recursive-simd-paral-cutoff-4: 860.880 28 | recursive-inner-8: 265.425 29 | recursive-8: 265.749 30 | recursive-inner-paral-8: 39.271 31 | recursive-paral-8: 39.605 32 | recursive-inner-paral-cutoff-8: 38.995 33 | recursive-paral-cutoff-8: 39.321 34 | recursive-inner-simd-8: 544.127 35 | recursive-simd-8: 544.448 36 | recursive-inner-simd-paral-8: 115.386 37 | recursive-simd-paral-8: 115.715 38 | recursive-inner-simd-paral-cutoff-8: 115.688 39 | recursive-simd-paral-cutoff-8: 116.019 40 | recursive-inner-16: 282.668 41 | recursive-16: 282.976 42 | recursive-inner-paral-16: 38.305 43 | recursive-paral-16: 38.619 44 | recursive-inner-paral-cutoff-16: 38.645 45 | recursive-paral-cutoff-16: 38.962 46 | recursive-inner-simd-16: 126.792 47 | recursive-simd-16: 127.092 48 | recursive-inner-simd-paral-16: 23.196 49 | recursive-simd-paral-16: 23.505 50 | recursive-inner-simd-paral-cutoff-16: 23.336 51 | recursive-simd-paral-cutoff-16: 23.647 52 | recursive-inner-32: 298.865 53 | recursive-32: 299.092 54 | recursive-inner-paral-32: 40.854 55 | recursive-paral-32: 41.080 56 | recursive-inner-paral-cutoff-32: 41.384 57 | recursive-paral-cutoff-32: 41.612 58 | recursive-inner-simd-32: 57.804 59 | recursive-simd-32: 58.030 60 | recursive-inner-simd-paral-32: 10.360 61 | recursive-simd-paral-32: 10.584 62 | recursive-inner-simd-paral-cutoff-32: 10.402 63 | recursive-simd-paral-cutoff-32: 10.627 64 | strassen-inner-32: 5.047 65 | strassen-32: 5.271 66 | recursive-inner-64: 301.486 67 | recursive-64: 301.675 68 | recursive-inner-paral-64: 42.352 69 | recursive-paral-64: 42.540 70 | recursive-inner-paral-cutoff-64: 42.771 71 | recursive-paral-cutoff-64: 42.958 72 | recursive-inner-simd-64: 40.356 73 | recursive-simd-64: 40.545 74 | recursive-inner-simd-paral-64: 7.350 75 | recursive-simd-paral-64: 7.540 76 | recursive-inner-simd-paral-cutoff-64: 7.257 77 | recursive-simd-paral-cutoff-64: 7.448 78 
| strassen-inner-64: 4.081 79 | strassen-64: 4.268 80 | recursive-inner-128: 299.983 81 | recursive-128: 300.173 82 | recursive-inner-paral-128: 45.250 83 | recursive-paral-128: 45.443 84 | recursive-inner-paral-cutoff-128: 44.856 85 | recursive-paral-cutoff-128: 45.045 86 | recursive-inner-simd-128: 35.272 87 | recursive-simd-128: 35.460 88 | recursive-inner-simd-paral-128: 6.257 89 | recursive-simd-paral-128: 6.449 90 | recursive-inner-simd-paral-cutoff-128: 6.268 91 | recursive-simd-paral-cutoff-128: 6.458 92 | strassen-inner-128: 3.967 93 | strassen-128: 4.159 94 | recursive-inner-256: 316.579 95 | recursive-256: 316.771 96 | recursive-inner-paral-256: 47.707 97 | recursive-paral-256: 47.898 98 | recursive-inner-paral-cutoff-256: 47.029 99 | recursive-paral-cutoff-256: 47.220 100 | recursive-inner-simd-256: 29.753 101 | recursive-simd-256: 29.944 102 | recursive-inner-simd-paral-256: 5.566 103 | recursive-simd-paral-256: 5.758 104 | recursive-inner-simd-paral-cutoff-256: 5.531 105 | recursive-simd-paral-cutoff-256: 5.722 106 | strassen-inner-256: 3.839 107 | strassen-256: 4.030 108 | recursive-inner-512: 365.487 109 | recursive-512: 365.682 110 | recursive-inner-paral-512: 52.087 111 | recursive-paral-512: 52.280 112 | recursive-inner-paral-cutoff-512: 52.679 113 | recursive-paral-cutoff-512: 52.875 114 | recursive-inner-simd-512: 27.799 115 | recursive-simd-512: 27.991 116 | recursive-inner-simd-paral-512: 5.358 117 | recursive-simd-paral-512: 5.553 118 | recursive-inner-simd-paral-cutoff-512: 5.484 119 | recursive-simd-paral-cutoff-512: 5.679 120 | strassen-inner-512: 4.639 121 | strassen-512: 4.835 122 | recursive-inner-1024: 935.708 123 | recursive-1024: 935.896 124 | recursive-inner-paral-1024: 143.237 125 | recursive-paral-1024: 143.418 126 | recursive-inner-paral-cutoff-1024: 144.542 127 | recursive-paral-cutoff-1024: 144.709 128 | recursive-inner-simd-1024: 39.628 129 | recursive-simd-1024: 39.811 130 | recursive-inner-simd-paral-1024: 10.136 131 | 
recursive-simd-paral-1024: 10.309 132 | recursive-inner-simd-paral-cutoff-1024: 11.232 133 | recursive-simd-paral-cutoff-1024: 11.401 134 | strassen-inner-1024: 12.061 135 | strassen-1024: 12.251 136 | -------------------------------------------------------------------------------- /measured/celeron-1.8GHz-4/4096.out: -------------------------------------------------------------------------------- 1 | simple: 7469.527 2 | colcp: 346.775 3 | simd: 92.743 4 | recursive-inner-1: 3370.916 5 | recursive-1: 3372.189 6 | recursive-inner-paral-1: 1603.735 7 | recursive-paral-1: 1605.009 8 | recursive-inner-paral-cutoff-1: 865.086 9 | recursive-paral-cutoff-1: 866.356 10 | recursive-inner-2: 1131.948 11 | recursive-2: 1132.735 12 | recursive-inner-paral-2: 395.361 13 | recursive-paral-2: 396.150 14 | recursive-inner-paral-cutoff-2: 295.637 15 | recursive-paral-cutoff-2: 296.430 16 | recursive-inner-4: 777.121 17 | recursive-4: 777.781 18 | recursive-inner-paral-4: 210.907 19 | recursive-paral-4: 211.560 20 | recursive-inner-paral-cutoff-4: 199.575 21 | recursive-paral-cutoff-4: 200.224 22 | recursive-inner-simd-4: 8371.820 23 | recursive-simd-4: 8372.469 24 | recursive-inner-simd-paral-4: 2544.586 25 | recursive-simd-paral-4: 2545.234 26 | recursive-inner-simd-paral-cutoff-4: 2526.409 27 | recursive-simd-paral-cutoff-4: 2527.055 28 | recursive-inner-8: 669.354 29 | recursive-8: 669.974 30 | recursive-inner-paral-8: 172.154 31 | recursive-paral-8: 172.772 32 | recursive-inner-paral-cutoff-8: 170.636 33 | recursive-paral-cutoff-8: 171.256 34 | recursive-inner-simd-8: 1270.562 35 | recursive-simd-8: 1271.180 36 | recursive-inner-simd-paral-8: 380.486 37 | recursive-simd-paral-8: 381.107 38 | recursive-inner-simd-paral-cutoff-8: 377.207 39 | recursive-simd-paral-cutoff-8: 377.829 40 | recursive-inner-16: 622.858 41 | recursive-16: 623.460 42 | recursive-inner-paral-16: 158.917 43 | recursive-paral-16: 159.507 44 | recursive-inner-paral-cutoff-16: 158.969 45 | 
recursive-paral-cutoff-16: 159.570 46 | recursive-inner-simd-16: 303.020 47 | recursive-simd-16: 303.610 48 | recursive-inner-simd-paral-16: 82.124 49 | recursive-simd-paral-16: 82.716 50 | recursive-inner-simd-paral-cutoff-16: 81.891 51 | recursive-simd-paral-cutoff-16: 82.485 52 | recursive-inner-32: 603.029 53 | recursive-32: 603.681 54 | recursive-inner-paral-32: 153.627 55 | recursive-paral-32: 154.280 56 | recursive-inner-paral-cutoff-32: 153.556 57 | recursive-paral-cutoff-32: 154.215 58 | recursive-inner-simd-32: 133.075 59 | recursive-simd-32: 133.733 60 | recursive-inner-simd-paral-32: 34.457 61 | recursive-simd-paral-32: 35.108 62 | recursive-inner-simd-paral-cutoff-32: 34.674 63 | recursive-simd-paral-cutoff-32: 35.334 64 | strassen-inner-32: 18.619 65 | strassen-32: 19.275 66 | recursive-inner-64: 601.321 67 | recursive-64: 601.924 68 | recursive-inner-paral-64: 154.064 69 | recursive-paral-64: 154.622 70 | recursive-inner-paral-cutoff-64: 153.791 71 | recursive-paral-cutoff-64: 154.338 72 | recursive-inner-simd-64: 113.344 73 | recursive-simd-64: 113.891 74 | recursive-inner-simd-paral-64: 30.027 75 | recursive-simd-paral-64: 30.574 76 | recursive-inner-simd-paral-cutoff-64: 30.210 77 | recursive-simd-paral-cutoff-64: 30.751 78 | strassen-inner-64: 17.250 79 | strassen-64: 17.794 80 | recursive-inner-128: 732.889 81 | recursive-128: 733.408 82 | recursive-inner-paral-128: 190.462 83 | recursive-paral-128: 190.976 84 | recursive-inner-paral-cutoff-128: 193.690 85 | recursive-paral-cutoff-128: 194.212 86 | recursive-inner-simd-128: 100.763 87 | recursive-simd-128: 101.283 88 | recursive-inner-simd-paral-128: 24.170 89 | recursive-simd-paral-128: 24.687 90 | recursive-inner-simd-paral-cutoff-128: 24.098 91 | recursive-simd-paral-cutoff-128: 24.625 92 | strassen-inner-128: 15.445 93 | strassen-128: 15.962 94 | recursive-inner-256: 770.836 95 | recursive-256: 771.340 96 | recursive-inner-paral-256: 203.800 97 | recursive-paral-256: 204.311 98 | 
recursive-inner-paral-cutoff-256: 204.397 99 | recursive-paral-cutoff-256: 204.909 100 | recursive-inner-simd-256: 87.189 101 | recursive-simd-256: 87.694 102 | recursive-inner-simd-paral-256: 31.540 103 | recursive-simd-paral-256: 32.048 104 | recursive-inner-simd-paral-cutoff-256: 31.349 105 | recursive-simd-paral-cutoff-256: 31.854 106 | strassen-inner-256: 20.176 107 | strassen-256: 20.683 108 | recursive-inner-512: 1481.440 109 | recursive-512: 1481.929 110 | recursive-inner-paral-512: 890.969 111 | recursive-paral-512: 891.467 112 | recursive-inner-paral-cutoff-512: 965.696 113 | recursive-paral-cutoff-512: 966.192 114 | recursive-inner-simd-512: 105.211 115 | recursive-simd-512: 105.701 116 | recursive-inner-simd-paral-512: 36.580 117 | recursive-simd-paral-512: 37.076 118 | recursive-inner-simd-paral-cutoff-512: 35.655 119 | recursive-simd-paral-cutoff-512: 36.152 120 | strassen-inner-512: 25.562 121 | strassen-512: 26.052 122 | recursive-inner-1024: 6827.805 123 | recursive-1024: 6828.284 124 | recursive-inner-paral-1024: 1851.604 125 | recursive-paral-1024: 1852.093 126 | recursive-inner-paral-cutoff-1024: 1865.643 127 | recursive-paral-cutoff-1024: 1866.128 128 | recursive-inner-simd-1024: 87.427 129 | recursive-simd-1024: 87.907 130 | recursive-inner-simd-paral-1024: 33.347 131 | recursive-simd-paral-1024: 33.834 132 | recursive-inner-simd-paral-cutoff-1024: 33.343 133 | recursive-simd-paral-cutoff-1024: 33.832 134 | strassen-inner-1024: 27.206 135 | strassen-1024: 27.685 136 | -------------------------------------------------------------------------------- /src/bin/measure.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate failure; 4 | extern crate fastmatmult; 5 | #[macro_use] 6 | extern crate structopt; 7 | extern crate test; 8 | extern crate typenum; 9 | 10 | use std::fmt::Display; 11 | use std::path::PathBuf; 12 | use std::process; 13 | use std::time::Instant; 14 | 15 | 
use failure::Error; 16 | use structopt::StructOpt; 17 | use typenum::{U1, U2, U4, U8, U16, U32, U64, U128, U256, U512, U1024, Unsigned}; 18 | 19 | use fastmatmult::simple::Matrix; 20 | use fastmatmult::znot::{ 21 | Distribute, DontDistribute, FragMultiplyAdd, Matrix as ZMat, RayonDistribute, SimdMultiplyAdd, 22 | SimpleMultiplyAdd 23 | }; 24 | /* Command-line options of the measurement binary: the two matrix files to multiply plus flags limiting which measurements run. Plain comments are used here on purpose: structopt turns doc comments into help text. */ 25 | #[derive(Debug, StructOpt)] 26 | struct Opts { 27 | /* File the first matrix (left operand) is loaded from. */ #[structopt(parse(from_os_str))] 28 | input1: PathBuf, 29 | /* File the second matrix (right operand) is loaded from. */ #[structopt(parse(from_os_str))] 30 | input2: PathBuf, 31 | /// Skip over some expensive computations. 32 | /// 33 | /// This is to be able to measure somewhat larger inputs, so skipping the really slow ones 34 | /// helps. 35 | #[structopt(short = "c", long = "cheap")] 36 | cheap: bool, 37 | 38 | /// Run only the simple multiplication. 39 | #[structopt(short = "s", long = "simple-only")] 40 | simple_only: bool, 41 | } 42 | /** Runs `f` once, prints `name: <seconds>.<milliseconds>` (wall-clock time between two `Instant::now()` calls) to stdout and returns the value `f` produced. The value is passed through `test::black_box`, presumably so the optimizer cannot discard the measured work. */ 43 | fn measure R>(name: N, f: F) -> R { 44 | let start = Instant::now(); 45 | let result = test::black_box(f()); 46 | let stop = Instant::now(); 47 | let elapsed = stop - start; 48 | println!("{}: {}.{:03}", name, elapsed.as_secs(), elapsed.subsec_nanos() / 1_000_000); 49 | result 50 | } 51 | /** Measures one recursive multiplication variant for fragment size `Frag`. The outer measurement (`recursive{suffix}-{Frag::USIZE}`) includes converting both inputs to `ZMat` and the result back to `Matrix`; the nested one (`recursive-inner{suffix}-{Frag::USIZE}`) covers only the `znot::multiply` call. When `expected` is `Some`, panics through `assert_eq!` if the result differs from it. */ 52 | fn block_inner(suffix: &str, a: &Matrix, b: &Matrix, expected: Option<&Matrix>) 53 | where 54 | Dist: Distribute, 55 | Frag: Unsigned + Default, 56 | Mult: FragMultiplyAdd, 57 | { 58 | let r = measure(format!("recursive{}-{}", suffix, Frag::USIZE), || { 59 | let a_z = ZMat::::from(a); 60 | let b_z = ZMat::::from(b); 61 | let r_z = measure(format!("recursive-inner{}-{}", suffix, Frag::USIZE), || { 62 | fastmatmult::znot::multiply::<_, Dist, Mult>(&a_z, &b_z) 63 | }); 64 | Matrix::from(&r_z) 65 | }); 66 | 67 | if let Some(expected) = expected { 68 | assert_eq!(expected, &r); 69 | } 70 | } 71 | /** Runs the measurements for fragment size `Frag` on `a` and `b`. Does nothing when `a.width()` is smaller than `Frag::USIZE`; the variants guarded by `!cheap` are skipped in cheap mode. */ 72 | fn block(a: &Matrix, b: &Matrix, expected: Option<&Matrix>, cheap: bool) 73 | where 74 | Frag: Unsigned + Default, 75 | { 76 | if a.width() < Frag::USIZE { 77 | return; 78 | } 79 | if !cheap { 80 |
block_inner::("", a, b, expected); 81 | block_inner::, SimpleMultiplyAdd, Frag>("-paral", a, b, expected); 82 | } 83 | block_inner::, SimpleMultiplyAdd, Frag>("-paral-cutoff", a, b, expected); 84 | /* SIMD variants run only for fragments of at least 4 and pass `None`, so their results are not compared with `expected`. */ if Frag::USIZE >= 4 { 85 | if !cheap { 86 | block_inner::("-simd", a, b, None); 87 | block_inner::, SimdMultiplyAdd, Frag>("-simd-paral", a, b, None); 88 | } 89 | block_inner::, SimdMultiplyAdd, Frag>( 90 | "-simd-paral-cutoff", 91 | a, 92 | b, 93 | None 94 | ); 95 | } 96 | /* Strassen is measured only for fragments of 32 and larger; its result is dropped without any comparison. */ if Frag::USIZE >= 32 { 97 | measure(format!("strassen-{}", Frag::USIZE), || { 98 | let a_z = ZMat::::from(a); 99 | let b_z = ZMat::::from(b); 100 | let r_z = measure(format!("strassen-inner-{}", Frag::USIZE), || { 101 | fastmatmult::znot::strassen::<_, RayonDistribute, SimdMultiplyAdd>(&a_z, &b_z) 102 | }); 103 | Matrix::from(&r_z) 104 | }); 105 | } 106 | } 107 | /** Parses the options, loads both matrices and runs the measurements. Unless `cheap` is set, the `simple` product is computed first and kept as the reference result (`colcp` must equal it); `simple_only` is consulted only in that non-cheap branch and makes the function return right after the first measurement. Errors from `Matrix::load` are propagated to the caller. */ 108 | fn run() -> Result<(), Error> { 109 | let opts = Opts::from_args(); 110 | let m1 = Matrix::load(&opts.input1)?; 111 | let m2 = Matrix::load(&opts.input2)?; 112 | /* In cheap mode there is no reference result, so later comparisons are skipped. */ 113 | let simple = if opts.cheap { 114 | None 115 | } else { 116 | let simple = measure("simple", || fastmatmult::simple::multiply(&m1, &m2)); 117 | if opts.simple_only { 118 | return Ok(()); 119 | } 120 | let col_cp = measure("colcp", || fastmatmult::simple::multiply_col_cp(&m1, &m2)); 121 | assert_eq!(simple, col_cp); 122 | Some(simple) 123 | }; 124 | let simple = simple.as_ref(); 125 | 126 | measure("simd", || fastmatmult::simd::multiply(&m1, &m2)); 127 | // Not checking equality, because simd does slightly different results due to reordering of the 128 | // summing 129 | /* The first six block measurements are skipped entirely in cheap mode. */ 130 | if !opts.cheap { 131 | block::(&m1, &m2, simple, opts.cheap); 132 | block::(&m1, &m2, simple, opts.cheap); 133 | block::(&m1, &m2, simple, opts.cheap); 134 | block::(&m1, &m2, simple, opts.cheap); 135 | block::(&m1, &m2, simple, opts.cheap); 136 | block::(&m1, &m2, simple, opts.cheap); 137 | } 138 | block::(&m1, &m2, simple, opts.cheap); 139 | block::(&m1, &m2, simple, opts.cheap); 140 | block::(&m1,
&m2, simple, opts.cheap); 141 | block::(&m1, &m2, simple, opts.cheap); 142 | block::(&m1, &m2, simple, opts.cheap); 143 | 144 | Ok(()) 145 | } 146 | /** Entry point: calls `run`; on error prints it to stderr and exits with status 1. */ 147 | fn main() { 148 | if let Err(e) = run() { 149 | eprintln!("{}", e); 150 | process::exit(1); 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /presentation/template/style.scss: -------------------------------------------------------------------------------- 1 | // Theme 2 | // --------------------------------------------------------------------------- 3 | 4 | $primary : orange; 5 | $secondary : blue; 6 | $tertiary : black; 7 | $light : #FFF; 8 | $dark : #333; 9 | $text-dark : #212121; 10 | $text-light : $light; 11 | $code-background : #FCFCFC; 12 | $overlay : transparentize(#000, .5); 13 | $font-size : 28px; 14 | $font-size-impact : 128px; 15 | $font : Arial, Helvetica, sans-serif; 16 | $font-title : Arial, Helvetica, sans-serif; 17 | $font-fixed : 'Lucida Console', Monaco, monospace; 18 | $margin : 20px; 19 | $iframe-scale : 1.5; 20 | 21 | 22 | table { 23 | border-width: 3px; 24 | } 25 | 26 | .left-column { 27 | width: 48%; 28 | float: left; 29 | } 30 | 31 | 32 | .right-column { 33 | width: 48%; 34 | float: right; 35 | } 36 | 37 | // CSS Base 38 | // --------------------------------------------------------------------------- 39 | 40 | * { box-sizing: border-box; } 41 | body { font-family: $font; } 42 | h1, h2, h3, h4, h5, h6 { 43 | margin: 0 0 $margin 0; 44 | font-family: $font-title; 45 | } 46 | h1 { color: $primary; } 47 | h2 { color: $secondary; } 48 | h3 { color: $tertiary; } 49 | li { margin-bottom: .25em; }; 50 | pre, code { 51 | text-align: left; 52 | font-family: $font-fixed; 53 | color: $secondary; 54 | background: $code-background; 55 | } 56 | a, a:visited, a:hover, a:active { color: $text-dark; } 57 | img { vertical-align: inherit; } 58 | blockquote { 59 | border-left: 8px solid; 60 | padding-left: .5em; 61 | color: $tertiary; 62 | text-align: left;
63 | margin: 1em 0; 64 | & > p { margin: 0; } 65 | } 66 | 67 | 68 | // Remark base 69 | // --------------------------------------------------------------------------- 70 | 71 | .remark-code { font-size: .9em; } 72 | .remark-container { background: $dark; } 73 | .remark-slide-scaler { box-shadow: none; } 74 | .remark-notes { font-size: 1.5em; } 75 | 76 | .remark-slide-content { 77 | font-size: $font-size; 78 | padding: 1em 2em; 79 | color: $text-dark; 80 | background-size: cover; 81 | } 82 | 83 | .remark-slide-number { 84 | color: $text-light; 85 | right: 1em; 86 | opacity: .6; 87 | font-size: 0.8em; 88 | z-index: 2; 89 | .no-counter & { display: none; } 90 | } 91 | 92 | // Additions 93 | .impact { 94 | background-color: $primary; 95 | vertical-align: middle; 96 | text-align: center; 97 | &, h1, h2 { color: $text-light; } 98 | h1 { font-size: $font-size-impact; } 99 | } 100 | 101 | .full { 102 | &, h1, h2 { color: $text-light; } 103 | &iframe { 104 | height: calc(#{100%/$iframe-scale} - 1.2em); 105 | width: 100%/$iframe-scale; 106 | transform: scale($iframe-scale); 107 | transform-origin: 0 0; 108 | border: 0; 109 | } 110 | } 111 | 112 | .bottom-bar { 113 | background-color: $primary; 114 | color: $text-light; 115 | position: absolute; 116 | bottom: 0; 117 | left: 0; 118 | right: 0; 119 | font-size: 20px; 120 | padding: .8em; 121 | text-align: left; 122 | z-index: 1; 123 | p { margin: 0;} 124 | .impact &, .full & { display: none; } 125 | } 126 | 127 | 128 | // Utilities 129 | // --------------------------------------------------------------------------- 130 | 131 | // Positioning 132 | .side-layer { 133 | position: absolute; 134 | left: 0; 135 | width: 100%; 136 | padding: 0 2em; 137 | } 138 | .middle { &, & img, & span { vertical-align: middle; } }; 139 | .top { vertical-align: top; }; 140 | .bottom { vertical-align: bottom; }; 141 | .inline-block { 142 | p, ul, ol, blockquote { 143 | display: inline-block; 144 | text-align: left; 145 | } 146 | } 147 | .no-margin { 
&, & > p, & > pre, & > ul, & > ol { margin: 0; } } 148 | .no-padding { padding: 0; } 149 | .space-left { padding-left: 1em; } 150 | .space-right { padding-right: 1em; } 151 | 152 | // Images 153 | .responsive > img { width: 100%; height: auto; }; 154 | .contain { background-size: contain; }; 155 | .overlay { box-shadow: inset 0 0 0 9999px $overlay; } 156 | 157 | // Text 158 | .left { text-align: left; } 159 | .right { text-align: right; } 160 | .center { text-align: center; } 161 | .justify { text-align: justify; } 162 | .primary { color: $primary; } 163 | .alt { color: $secondary; }; 164 | .em { color: $tertiary; }; 165 | .thin { font-weight: 200; } 166 | .huge { font-size: 2em; } 167 | .big { font-size: 1.5em; } 168 | .small { font-size: .8em; } 169 | .dark-bg { background-color: $dark; } 170 | .alt-bg { background-color: $secondary; }; 171 | 172 | // Simple 12-columns grid system 173 | .row { 174 | width: 100%; 175 | &::after { 176 | content: ''; 177 | display: table; 178 | clear: both; 179 | } 180 | &.table { display: table; }; 181 | &.table [class^="col-"] { 182 | float: none; 183 | display: table-cell; 184 | vertical-align: inherit; 185 | } 186 | } 187 | 188 | [class^="col-"] { 189 | float: left; 190 | &.inline-block { 191 | float: none; 192 | display: inline-block; 193 | } 194 | } 195 | 196 | @for $i from 1 through 12 { 197 | .col-#{$i} {width: 100% / 12 * $i; } 198 | } 199 | 200 | // Animations 201 | @keyframes fadeIn { 202 | from { opacity: 0; } 203 | to { opacity: 1; } 204 | } 205 | 206 | .animation-fade { 207 | animation-duration: 300ms; 208 | animation-fill-mode: both; 209 | animation-timing-function: ease-out; 210 | .remark-visible & { animation-name: fadeIn; } 211 | } 212 | 213 | 214 | // Fix PDF print with chrome 215 | // --------------------------------------------------------------------------- 216 | 217 | @page { 218 | // 908px 681px for 4/3 slides 219 | size: 1210px 681px; 220 | margin: 0; 221 | } 222 | 223 | @media print { 224 | 
.remark-slide-scaler { 225 | width: 100% !important; 226 | height: 100% !important; 227 | transform: scale(1) !important; 228 | top: 0 !important; 229 | left: 0 !important; 230 | } 231 | } 232 | -------------------------------------------------------------------------------- /presentation/template/auto-render.min.js: -------------------------------------------------------------------------------- 1 | !function(t,e){"object"==typeof exports&&"object"==typeof module?module.exports=e(require("katex")):"function"==typeof define&&define.amd?define(["katex"],e):"object"==typeof exports?exports.renderMathInElement=e(require("katex")):t.renderMathInElement=e(t.katex)}(this,function(t){return function(t){var e={};function n(r){if(e[r])return e[r].exports;var o=e[r]={i:r,l:!1,exports:{}};return t[r].call(o.exports,o,o.exports,n),o.l=!0,o.exports}return n.m=t,n.c=e,n.d=function(t,e,r){n.o(t,e)||Object.defineProperty(t,e,{configurable:!1,enumerable:!0,get:r})},n.n=function(t){var e=t&&t.__esModule?function(){return t.default}:function(){return t};return n.d(e,"a",e),e},n.o=function(t,e){return Object.prototype.hasOwnProperty.call(t,e)},n.p="",n(n.s=9)}([function(t,e){var n=t.exports="undefined"!=typeof window&&window.Math==Math?window:"undefined"!=typeof self&&self.Math==Math?self:Function("return this")();"number"==typeof __g&&(__g=n)},function(t,e){t.exports=function(t){return"object"==typeof t?null!==t:"function"==typeof t}},function(t,e,n){t.exports=!n(3)(function(){return 7!=Object.defineProperty({},"a",{get:function(){return 7}}).a})},function(t,e){t.exports=function(t){try{return!!t()}catch(t){return!0}}},function(t,e){var n=t.exports={version:"2.4.0"};"number"==typeof __e&&(__e=n)},function(t,e,n){var r=n(6),o=n(7);t.exports=function(t){return r(o(t))}},function(t,e,n){var r=n(27);t.exports=Object("z").propertyIsEnumerable(0)?Object:function(t){return"String"==r(t)?t.split(""):Object(t)}},function(t,e){t.exports=function(t){if(void 0==t)throw TypeError("Can't call 
method on "+t);return t}},function(t,e){var n=Math.ceil,r=Math.floor;t.exports=function(t){return isNaN(t=+t)?0:(t>0?r:n)(t)}},function(t,e,n){"use strict";Object.defineProperty(e,"__esModule",{value:!0});var r=n(10),o=n.n(r),i=n(38),u=n.n(i),c=n(39),a=function(t,e){for(var n=function(t,e){for(var n=[{type:"text",data:t}],r=0;rf;)for(var p,d=c(arguments[f++]),h=s?r(d).concat(s(d)):r(d),v=h.length,y=0;v>y;)l.call(d,p=h[y++])&&(n[p]=d[p]);return n}:a},function(t,e,n){var r=n(25),o=n(34);t.exports=Object.keys||function(t){return r(t,o)}},function(t,e,n){var r=n(26),o=n(5),i=n(28)(!1),u=n(31)("IE_PROTO");t.exports=function(t,e){var n,c=o(t),a=0,f=[];for(n in c)n!=u&&r(c,n)&&f.push(n);for(;e.length>a;)r(c,n=e[a++])&&(~i(f,n)||f.push(n));return f}},function(t,e){var n={}.hasOwnProperty;t.exports=function(t,e){return n.call(t,e)}},function(t,e){var n={}.toString;t.exports=function(t){return n.call(t).slice(8,-1)}},function(t,e,n){var r=n(5),o=n(29),i=n(30);t.exports=function(t){return function(e,n,u){var c,a=r(e),f=o(a.length),s=i(u,f);if(t&&n!=n){for(;f>s;)if((c=a[s++])!=c)return!0}else for(;f>s;s++)if((t||s in a)&&a[s]===n)return t||s||0;return!t&&-1}}},function(t,e,n){var r=n(8),o=Math.min;t.exports=function(t){return t>0?o(r(t),9007199254740991):0}},function(t,e,n){var r=n(8),o=Math.max,i=Math.min;t.exports=function(t,e){return(t=r(t))<0?o(t+e,0):i(t,e)}},function(t,e,n){var r=n(32)("keys"),o=n(33);t.exports=function(t){return r[t]||(r[t]=o(t))}},function(t,e,n){var r=n(0),o="__core-js_shared__",i=r[o]||(r[o]={});t.exports=function(t){return i[t]||(i[t]={})}},function(t,e){var n=0,r=Math.random();t.exports=function(t){return"Symbol(".concat(void 0===t?"":t,")_",(++n+r).toString(36))}},function(t,e){t.exports="constructor,hasOwnProperty,isPrototypeOf,propertyIsEnumerable,toLocaleString,toString,valueOf".split(",")},function(t,e){e.f=Object.getOwnPropertySymbols},function(t,e){e.f={}.propertyIsEnumerable},function(t,e,n){var r=n(7);t.exports=function(t){return 
Object(r(t))}},function(e,n){e.exports=t},function(t,e,n){"use strict";var r=function(t,e,n){for(var r=n,o=0,i=t.length;r { 15 | matrix: &'a Matrix, 16 | pos: usize, 17 | } 18 | /* Row iterator: each `next` yields the next `width`-long slice of `content` until `height` rows have been produced, then `None`. */ 19 | impl<'a> Iterator for Rows<'a> { 20 | type Item = &'a [Element]; 21 | fn next(&mut self) -> Option { 22 | if self.pos == self.matrix.height { 23 | None 24 | } else { 25 | let start = self.matrix.width * self.pos; 26 | self.pos += 1; 27 | Some(&self.matrix.content[start .. start + self.matrix.width]) 28 | } 29 | } 30 | } 31 | /** Dense matrix kept in a single `Vec`; `content` is expected to hold `width * height` elements (checked by `validate`). */ 32 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] 33 | pub struct Matrix { 34 | width: usize, 35 | height: usize, 36 | content: Vec, 37 | } 38 | 39 | impl Matrix { 40 | /** Panics unless `content.len()` equals `width * height`. */ fn validate(&self) { 41 | assert_eq!(self.content.len(), self.width * self.height); 42 | } 43 | /** Creates a `w` wide, `h` high matrix with every element set to `Element::default()`. */ pub fn sized(w: usize, h: usize) -> Self { 44 | Self { 45 | width: w, 46 | height: h, 47 | content: vec![Element::default(); w * h], 48 | } 49 | } 50 | /** Creates a `w` wide, `h` high matrix with every element drawn from `rng.gen_range(0., 10.)` on the thread-local RNG. */ pub fn random(w: usize, h: usize) -> Self { 51 | let mut result = Self::sized(w, h); 52 | let mut rng = rand::thread_rng(); 53 | for x in 0..w { 54 | for y in 0..h { 55 | result[(x, y)] = rng.gen_range(0., 10.); 56 | } 57 | } 58 | result 59 | } 60 | /** Returns an iterator over the rows, starting at row 0. */ pub fn rows(&self) -> Rows { 61 | Rows { 62 | matrix: self, 63 | pos: 0, 64 | } 65 | } 66 | /** Opens `file` and deserializes a matrix from it with `bincode` through a `BufReader`; open and decode errors are returned to the caller. */ pub fn load(file: &Path) -> Result { 67 | let f = File::open(file)?; 68 | Ok(bincode::deserialize_from(BufReader::new(f))?)
69 | } 70 | /** Creates `file` and serializes the matrix into it with `bincode` through a `BufWriter`; creation and encode errors are returned to the caller. */ pub fn store(&self, file: &Path) -> Result<(), Error> { 71 | let f = File::create(file)?; 72 | bincode::serialize_into(BufWriter::new(f), self)?; 73 | Ok(()) 74 | } 75 | /** Number of rows. */ pub fn height(&self) -> usize { self.height } 76 | /** Number of columns, i.e. the length of one row. */ pub fn width(&self) -> usize { self.width } 77 | /** Borrows the whole matrix as a read-only `Slice` with the same dimensions. */ pub(crate) fn slice(&self) -> Slice { 78 | Slice { 79 | width: self.width, 80 | height: self.height, 81 | content: &self.content, 82 | } 83 | } 84 | /** Borrows the whole matrix as a writable `SliceMut` with the same dimensions. */ pub(crate) fn slice_mut(&mut self) -> SliceMut { 85 | SliceMut { 86 | width: self.width, 87 | height: self.height, 88 | content: &mut self.content, 89 | } 90 | } 91 | } 92 | /* Indexing by `(x, y)`: `x` selects the column, `y` the row; the element lives at `content[x + width * y]`. All index impls below use the same formula. */ 93 | impl Index<(usize, usize)> for Matrix { 94 | type Output = Element; 95 | fn index(&self, index: (usize, usize)) -> &Element { 96 | &self.content[index.0 + self.width * index.1] 97 | } 98 | } 99 | 100 | impl IndexMut<(usize, usize)> for Matrix { 101 | fn index_mut(&mut self, index: (usize, usize)) -> &mut Element { 102 | &mut self.content[index.0 + self.width * index.1] 103 | } 104 | } 105 | /** Borrowed read-only view of matrix data: dimensions plus a shared slice of the elements. */ 106 | pub(crate) struct Slice<'a> { 107 | pub(crate) width: usize, 108 | pub(crate) height: usize, 109 | pub(crate) content: &'a [Element], 110 | } 111 | 112 | impl<'a> Index<(usize, usize)> for Slice<'a> { 113 | type Output = Element; 114 | fn index(&self, index: (usize, usize)) -> &Element { 115 | &self.content[index.0 + self.width * index.1] 116 | } 117 | } 118 | /** Borrowed writable view of matrix data: dimensions plus a mutable slice of the elements. */ 119 | pub(crate) struct SliceMut<'a> { 120 | pub(crate) width: usize, 121 | pub(crate) height: usize, 122 | pub(crate) content: &'a mut [Element], 123 | } 124 | 125 | impl<'a> Index<(usize, usize)> for SliceMut<'a> { 126 | type Output = Element; 127 | fn index(&self, index: (usize, usize)) -> &Element { 128 | &self.content[index.0 + self.width * index.1] 129 | } 130 | } 131 | 132 | impl<'a> IndexMut<(usize, usize)> for SliceMut<'a> { 133 | fn index_mut(&mut self, index: (usize, usize)) -> &mut Element { 134 | &mut self.content[index.0 + self.width * index.1] 135 | } 136 | } 137 | /** Adds the product `a * b` onto the current contents of `into` (it accumulates, it does not overwrite). Panics if `a.width != b.height`. */ 138 | pub(crate) fn multiply_add(into:
&mut SliceMut, a: &Slice, b: &Slice) { 139 | assert_eq!(a.width, b.height); 140 | /* Output dimensions are taken from `into`; only the shared dimension is asserted above. */ 141 | let w = into.width; 142 | let h = into.height; 143 | let l = a.width; 144 | /* Plain triple loop: output cell (x, y) accumulates row y of `a` times column x of `b`. */ 145 | for x in 0..w { 146 | for y in 0..h { 147 | for p in 0..l { 148 | into[(x, y)] += a[(p, y)] * b[(x, p)]; 149 | } 150 | } 151 | } 152 | } 153 | /** Returns the product `a * b` as a new matrix, `b.width` wide and `a.height` high, computed by `multiply_add` into a result created with `Matrix::sized`. Panics if `a.width != b.height`. */ 154 | pub fn multiply(a: &Matrix, b: &Matrix) -> Matrix { 155 | let mut r = Matrix::sized(b.width, a.height); 156 | 157 | // These serve two purposes: 158 | // * Sanity check the matrix implementations. 159 | // * Allow the optimiser to remove the range checks from the below indexing. 160 | a.validate(); 161 | b.validate(); 162 | r.validate(); 163 | 164 | multiply_add(&mut r.slice_mut(), &a.slice(), &b.slice()); 165 | 166 | r 167 | } 168 | /** Computes the same product as `multiply`, but copies each column of `b` into a contiguous buffer before using it. NOTE(review): unlike `multiply_add`, nothing here asserts `a.width == b.height` - confirm callers pass compatible sizes. */ 169 | pub fn multiply_col_cp(a: &Matrix, b: &Matrix) -> Matrix { 170 | let mut r = Matrix::sized(b.width, a.height); 171 | 172 | // These serve two purposes: 173 | // * Sanity check the matrix implementations. 174 | // * Allow the optimiser to remove the range checks from the below indexing. 175 | a.validate(); 176 | b.validate(); 177 | r.validate(); 178 | 179 | let w = r.width; 180 | let h = r.height; 181 | let l = a.width; 182 | /* Scratch buffer of `l` elements that holds one column of `b` at a time. */ 183 | let mut col = iter::repeat(0.)
184 | .take(l) 185 | .collect::>(); 186 | /* `col` is overwritten and reused for every output column `x`. */ 187 | for x in 0..w { 188 | // Copy the column out, so it's more cache-friendly 189 | for p in 0..l { 190 | col[p] = b[(x, p)]; 191 | } 192 | 193 | for y in 0..h { 194 | for p in 0..l { 195 | r[(x, y)] += a[(p, y)] * col[p]; 196 | } 197 | } 198 | } 199 | 200 | r 201 | } 202 | 203 | #[cfg(test)] 204 | mod tests { 205 | use super::*; 206 | 207 | impl Matrix { 208 | /** Test-only constructor: a `size` by `size` matrix from `Matrix::sized` with the diagonal set to 1.0. */ pub(crate) fn identity(size: usize) -> Self { 209 | let mut r = Self::sized(size, size); 210 | for i in 0..size { 211 | r[(i, i)] = 1.0; 212 | } 213 | r 214 | } 215 | } 216 | /** `multiply_add` accumulates: the first identity-times-identity call yields the identity, the second call doubles it. */ 217 | #[test] 218 | fn add_mult() { 219 | let mut result = Matrix::sized(2, 2); 220 | let id = Matrix::identity(2); 221 | multiply_add(&mut result.slice_mut(), &id.slice(), &id.slice()); 222 | assert_eq!(result, id); 223 | multiply_add(&mut result.slice_mut(), &id.slice(), &id.slice()); 224 | let double = Matrix { 225 | width: 2, 226 | height: 2, 227 | content: vec![ 228 | 2., 0., 229 | 0., 2.0, 230 | ], 231 | }; 232 | assert_eq!(result, double); 233 | } 234 | /** Multiplying a 3 by 3 matrix by the identity from either side returns it unchanged. */ 235 | #[test] 236 | fn square_identity() { 237 | let id = Matrix::identity(3); 238 | let other = Matrix { 239 | width: 3, 240 | height: 3, 241 | content: vec![ 242 | 2., 3., 4., 243 | 0., 0., 0., 244 | 5., 6., 7., 245 | ], 246 | }; 247 | let left_id = multiply(&id, &other); 248 | assert_eq!(other, left_id); 249 | let right_id = multiply(&other, &id); 250 | assert_eq!(other, right_id); 251 | } 252 | /** A 2-wide, 3-high matrix is unchanged by multiplication with a matching identity on either side. */ 253 | #[test] 254 | fn rect_identity() { 255 | let rect = Matrix { 256 | width: 2, 257 | height: 3, 258 | content: vec![ 259 | 1., 2., 260 | 3., 4., 261 | 5., 6., 262 | ] 263 | }; 264 | let left_rect = multiply(&Matrix::identity(3), &rect); 265 | assert_eq!(rect, left_rect); 266 | let right_rect = multiply(&rect, &Matrix::identity(2)); 267 | assert_eq!(rect, right_rect); 268 | } 269 | /** `multiply_col_cp` must give exactly the same result as `multiply` on random square matrices of side 1, 2, 4, ..., 64. */ 270 | #[test] 271 | fn col_cp() { 272 | for shift in 0..7 { 273 | let a = Matrix::random(1 << shift, 1 << shift); 274 | let b = Matrix::random(1 << shift, 1 <<
shift); 275 | assert_eq!(multiply(&a, &b), multiply_col_cp(&a, &b)); 276 | } 277 | } 278 | /** Checks both products of a 2-wide/3-high matrix and a 3-wide/2-high matrix against hand-computed results. */ 279 | #[test] 280 | fn arbitrary() { 281 | let a = Matrix { 282 | width: 2, 283 | height: 3, 284 | content: vec![ 285 | 1., 2., 286 | 3., 4., 287 | 5., 6., 288 | ], 289 | }; 290 | let b = Matrix { 291 | width: 3, 292 | height: 2, 293 | content: vec![ 294 | 10., 11., 12., 295 | 13., 14., 15., 296 | ], 297 | }; 298 | let res_a = multiply(&a, &b); 299 | let exp_a = Matrix { 300 | width: 3, 301 | height: 3, 302 | content: vec![ 303 | 36., 39., 42., 304 | 82., 89., 96., 305 | 128., 139., 150., 306 | ], 307 | }; 308 | assert_eq!(res_a, exp_a); 309 | let res_b = multiply(&b, &a); 310 | let exp_b = Matrix { 311 | width: 2, 312 | height: 2, 313 | content: vec![ 314 | 103., 136., 315 | 130., 172., 316 | ], 317 | }; 318 | assert_eq!(res_b, exp_b); 319 | } 320 | } 321 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /presentation/presentation.md: -------------------------------------------------------------------------------- 1 | title: Squeezing CPUs for speed 2 | class: animation-fade 3 | layout: true 4 | 5 | 6 | .bottom-bar[ 7 | {{title}} 8 | ] 9 | 10 | --- 11 | 12 | class: impact 13 | 14 | # {{title}} 15 | ## Optimization case study with matrix multiplication 16 | 17 | Michal Vaner 18 | 19 | [michal.vaner@avast.com](mailto:michal.vaner@avast.com) 20 | 21 | --- 22 | 23 | # Goals 24 | 25 | * Explore modern CPU capabilities for faster execution 26 | - Memory hierarchy & caches 27 | - Parallelism (CPUs, cores, hyper-threading) 28 | - Vector instructions 29 | * Walk through some usual steps to obtain faster programs 30 | * Demonstrate some tools that help with all that 31 | - Will be done in Rust 32 | - Some other languages allow optimizing too, but I like Rust 33 | - Some languages make optimizing *hard* 34 | * Show a lot of graphs 
from unscientific benchmarks 😇 35 | - Numbers for AMD FX-8370 Eight-Core Processor @ 4.2GHz 36 | - Unless stated otherwise 37 | 38 | --- 39 | 40 | # Case study 41 | 42 | .left-column[ 43 | * Two matrices, compute a product 44 | * Composed of floats 45 | * Simplification for educational purposes 46 | - Square size 47 | - Power of two side 48 | * Comparison with the [`armadillo`](http://arma.sourceforge.net) library 49 | - 8192: 350s (almost 6 minutes) 50 | - 16384: 2783s (46 minutes) 51 | - Spoiler: we're going to do better 😈 52 | * Another spoiler: We'll reach 1000× speedup 53 | ] 54 | 55 | .right-column[ 56 | ![Armadillo](arm.svg) 57 | ] 58 | 59 | --- 60 | 61 | # Recap: matrix multiplication 62 | 63 | .left-column[ 64 | * Cell is a dot-product of a row from the left and column from the right 65 | * \\(C = A \times B\\) 66 | * \\(C\_{x, y}\\) = \\(\sum\_{i=1}^n A\_{i, y}\cdot B\_{x, i} \\) 67 | * \\(O(n^3)\\) 68 | ] 69 | 70 | .right-column[![Multiplication](mult.svg)] 71 | 72 | --- 73 | 74 | # Trivial implementation 75 | 76 | ```rust 77 | for x in 0..w { 78 | for y in 0..h { 79 | for p in 0..l { 80 | into[(x, y)] += a[(p, y)] * b[(x, p)]; 81 | } 82 | } 83 | } 84 | ``` 85 | 86 | -- 87 | 88 | * 1024: 135s 89 | * 2048: 1664s 90 | * That's just terrible 91 | * 😭 92 | 93 | --- 94 | 95 | # Step 0: Try to avoid the problem 96 | 97 | * Find a library or ready-made solution 98 | * Buy some better HW 99 | * Switch projects 100 | * Pretend it's normal 101 | * Sell something else to the customer 102 | - Sum the matrices instead of multiplying 103 | - They won't notice, will they? 
104 | * Promise better speed for the next version 105 | - They should have motivation to buy it 106 | * Become a shepherd 107 | - 🐑 🐑 🐐 🐑 108 | 109 | --- 110 | 111 | # Step 1: Let the compiler do its job 112 | 113 | - And remember to turn **the optimizations** on 114 | - `cargo run --release` 115 | - Possibly with CPU-specific features 116 | * `-march=native` 117 | * `-C target-cpu=native` 118 | -- 119 | 120 | * 1024: 15s 121 | * 2048: 323s 122 | * 4096: 2930s 123 | 124 | - That's better, but not enough 125 | - About 5× speedup (and no actual work done) 126 | 127 | --- 128 | 129 | # Step 2: Find the slow part 130 | 131 | * Optimizing takes effort 132 | - We want to optimize where it makes sense 133 | * Guessing is often wrong 134 | * Let's use a profiler 135 | - `perf` is usually a good choice 136 | - `FlameGraph` is a nice extension 137 | 138 | ``` 139 | 99.89% 99.58% measure measure 140 | | 141 | ---fastmatmult::simple::multiply_add 142 | ``` 143 | 144 | --- 145 | 146 | # Step 2: Find the slow part 147 | 148 | .center[] 149 | 150 | --- 151 | 152 | # Step 3: Find why it is slow 153 | 154 | * Doing too much overall 155 | * IO 156 | * Syscalls 157 | * Thread synchronization 158 | * Branch mispredictions 159 | * Waiting for memory 160 | 161 | ??? 162 | 163 | * I'll explain the ones that turn out to be our problems. 164 | * Others on request. 165 | 166 | --- 167 | 168 | # Step 3: Find why it is slow 169 | 170 | * Common sense and `htop` rule out IO, syscalls and threads 171 | * `perf stat` gives more info (1024×1024) 172 | - 60G cycles vs. 30G instructions ‒ 2 cycles per instruction 173 | - 1.5G cache accesses vs. 1G cache misses 174 | - 4G branches vs. 1M mispredicted 175 | ??? 176 | 177 | * Point out the low instruction vs.
cycle count (superscalar processor) 178 | * Maybe a good time to describe how perf works 179 | * Perf stat and summing up 180 | * Counter for events, when it overflows, a sample is taken 181 | -- 182 | 183 | * Mostly cache misses are to blame 184 | * Number of instructions too 185 | 186 | --- 187 | 188 | # Perf 189 | 190 | ``` 191 | Performance counter stats for './target/release/measure -s a.out b.out': 192 | 193 | 60,535,345,914 cycles 194 | 30,287,912,793 instructions # 0.50 insn per cycle 195 | 4,340,795,281 branches 196 | 1,160,771 branch-misses # 0.03% of all branches 197 | 1,433,441,916 cache-references 198 | 1,077,840,136 cache-misses # 75.192 % of all cache refs 199 | 200 | 14.154347044 seconds time elapsed 201 | ``` 202 | 203 | --- 204 | 205 | # Memory hierarchy 206 | 207 | ![Memory hierarchy](hier.svg) 208 | 209 | * Cache lines, pages, predictors... 210 | * Better to access recently accessed data 211 | * Or data close to recently accessed data 212 | * Linear or other predictable pattern is good 213 | 214 | ??? 215 | 216 | * Further away from CPU is more mem, but slower 217 | * From some point shared 218 | * Cache lines, evictions 219 | * Preloading 220 | 221 | --- 222 | 223 | # Memory hierarchy: NUMA 224 | 225 | ![Numa hierarchy](numa.svg) 226 | 227 | ??? 228 | 229 | Note that it can be even worse... 230 | 231 | --- 232 | 233 | # Matrix layout in memory 234 | 235 | .center[ 236 | ![Matrix & cache lines](cache-matrix.svg) 237 | ] 238 | 239 | --- 240 | 241 | # Step 4: Do some research 242 | 243 | * Is there a better algorithm? 244 | * Could I precompute or reuse something? 245 | 246 | -- 247 | 248 | - Z-order layout 249 | * Good for caches 250 | - Strassen algorithm 251 | 252 | ???
253 | 254 | * Let's start with the layout, it looks simpler 255 | * Postpone the Strassen algorithm for later on, it looks complex 256 | 257 | --- 258 | 259 | # Z-Order 260 | 261 | * Split matrix into quarters 262 | - Can be taken as matrix 2×2 263 | - Matrix multiplication works on the quarters 264 | * Each quarter is contiguous in memory 265 | * Each quarter is encoded recursively 266 | * At a certain level, the whole matrix fits into cache 267 | 268 | --- 269 | 270 | # Z-Order: splitting schema 271 | 272 | .left-column[ 273 | .center[![Z Order](z-order.svg)] 274 | ] 275 | 276 | -- 277 | 278 | .right-column[ 279 | .center[![Z order 2](z-order-2.svg)] 280 | ] 281 | 282 | --- 283 | 284 | # Z-Order: multiplication 285 | 286 | ```rust 287 | fn mult(r: &mut [Element], a: &[Element], b: &[Element], size: usize) { 288 | if size == 1 { 289 | r[0] += a[0] * b[0]; 290 | } else { 291 | let s = size / 2; 292 | let (a11, a12, a21, a22) = quads!(a); 293 | let (b11, b12, b21, b22) = quads!(b); 294 | let (r11, r12, r21, r22) = quads!(mut r); 295 | 296 | mult(r11, a11, b11, s); 297 | mult(r11, a12, b21, s); 298 | ... 299 | mult(r22, a22, b22, s); 300 | } 301 | } 302 | ``` 303 | 304 | --- 305 | 306 | # Problems with recursion 307 | 308 | .left-column[ 309 | * 2048: 178s 310 | * 4096: 1423s 311 | * About 2× better 312 | 313 | - There's a cost to recursion 314 | * It dominates on small tasks 315 | * Small tasks already fit into cache 316 | - Let's do a hybrid approach 317 | * Switch to simple from some size 318 | 319 | | size | 4 | 8 | 16 | 32 | 320 | |----------|--------:|-------:|---------:|---------:| 321 | | **2048** | 35s | *33s*| 34s | 37s | 322 | | **4096** | 272s | *266s*| 283s | 299s | 323 | ] 324 | 325 | .right-column[ 326 | ![Recursion](recursion.svg) 327 | 328 | * Another 5× speedup 329 | ] 330 | 331 | ???
332 | 333 | * General pattern of hybrid approach 334 | * Where one algorithm is faster on large inputs and another on small ones 335 | 336 | --- 337 | 338 | # Parallelism 339 | 340 | * Computing of the quarters is independent 341 | * We can distribute the work between cores 342 | * There's a synchronization cost 343 | - Makes no sense to share tiny tasks 344 | * Can't expect N× speedup 345 | - Shared memory bandwidth, caches 346 | - Shared parts (FPU, scheduler) 347 | - Cooling and power units 348 | 349 | --- 350 | 351 | # Thread pools (Rayon) 352 | 353 | ```rust 354 | fn run( 355 | size: usize, 356 | tasks: &mut [I], f: F 357 | ) { 358 | if size >= Limit::USIZE { 359 | tasks 360 | // Potentially runs on multiple threads 361 | * .into_par_iter() 362 | .for_each(f); 363 | } else { 364 | for t in tasks { 365 | f(t); 366 | } 367 | } 368 | } 369 | ``` 370 | 371 | --- 372 | 373 | # Results 374 | 375 | * Distributing down to the small matrices (`s`) 376 | * Not distributing smaller than 256 (`c`) 377 | 378 | | size | 2 | 4 | 8 | 16 | 32 | 379 | |-----------|---------:|---------:|--------:|--------:|-------:| 380 | | **2048s** | 16s | 6s | 5s | 5s | 5s | 381 | | **2048c** | 9s | 5s | 5s | 5s | 5s | 382 | | **4096s** | 152s | 48s | 40s | *38s* | 41s | 383 | | **4096c** | 68s | 41s | 39s | 39s | 42s | 384 | 385 | * Further 6× speedup on 8-core CPU 386 | * Can expect more with more cores 387 | - Measured 14× on machine with 2×10 cores with 2×HT (40 virtual cores, 20 388 | physical) 389 | 390 | --- 391 | 392 | # SIMD (Single Instruction Multiple Data) 393 | 394 | .left-column[ 395 | * Usually, one instruction ‒ one result 396 | * SIMD ‒ a vector in each register 397 | * For example registers for 16 floats 398 | * Fast on long arrays 399 | * Stronger hints for cache pre-loading? 
400 | * Problems with column access 401 | - Acceleration for that, still slow 402 | * Let's use a library (`faster`) 403 | - For portability and ease of use 404 | - Needs nightly Rust now, going to stabilize soon 405 | ] 406 | 407 | .right-column[ 408 | ![SIMD](simd.svg) 409 | ] 410 | 411 | --- 412 | 413 | # SIMD 414 | 415 | ```rust 416 | let columns = b.content.simd_iter(f32s(0.)); 417 | let columns = columns.stride(b.width, &pads); 418 | let mut column_data = iter::repeat(0.0).take(b.height).collect(); 419 | for (x, mut column) in columns.into_iter().enumerate() { 420 | * column.scalar_fill(&mut column_data); 421 | for y in 0..h { 422 | let row = &a.content[y * l .. (y + 1) * l]; 423 | into[(x, y)] += (row.simd_iter(f32s(0.)), 424 | column_data.simd_iter(f32s(0.))) 425 | .zip() 426 | * .simd_reduce(f32s(0.0), |acc, (a, b)| acc + a * b) 427 | .sum(); 428 | } 429 | } 430 | ``` 431 | 432 | ??? 433 | 434 | * Describe the reason for that first highlighted line 435 | * Made it actually much faster 436 | 437 | --- 438 | 439 | # Speeds 440 | 441 | * Can be combined with other solutions 442 | - Goes a bit against the recursive/cache optimisation 443 | - Recursive, parallelized, with 256 sized fragments 444 | * The column-copy trick alone helps only a little 445 | 446 | | size | simple | column | simd | recursive+simd | 447 | |-----------|------------:|-----------:|---------:|---------------:| 448 | | **2048** | 323s | 35s | 6s | 0.7s | 449 | | **4096** | 2930s | 277s | 44s | 6s | 450 | 451 | --- 452 | 453 | # Strassen 454 | 455 | * Similar to the recursive 456 | * Reduces the number of smaller multiplications from 8 to 7 457 | * At the cost of some additions and subtractions 458 | - Isn't worth it for small inputs 459 | - Wins on large ones 460 | * Exact formulae at [Wikipedia](https://en.wikipedia.org/wiki/Strassen_algorithm) or elsewhere 461 | 462 | - 2048: 0.6s 463 | - 4096: 4s 464 | - 8192: 27s 465 | - 16384: 182s 466 | * 2783s for Armadillo 467 | * 348s for theoretical
parallelized Armadillo 468 | 469 | --- 470 | 471 | # Final profile 472 | 473 | ``` 474 | Performance counter stats for './target/release/strass a.out b.out': 475 | 476 | 115,374,630,652 cycles 477 | 187,871,575,042 instructions # 1.63 insn per cycle 478 | 12,756,647,765 branches 479 | 163,876,845 branch-misses # 1.28% of all branches 480 | 3,187,674,343 cache-references 481 | 100,655,288 cache-misses # 3.158 % of all cache refs 482 | 483 | 6.254030767 seconds time elapsed 484 | ``` 485 | 486 | ??? 487 | 488 | * As can be seen, it is not only faster, but we got much better in instructions 489 | per cycle & cache-misses 490 | 491 | --- 492 | 493 | # Source code 494 | 495 | * https://github.com/vorner/fastmatmult 496 | * Somewhat templated to assemble all the measured variants 497 | * Needs specific version of rust nightly 498 | - SIMD is about to stabilize, therefore a lot of last-minute changes. 499 | 500 | --- 501 | 502 | # Buldozer 8-core 503 | 504 | .center[ 505 | ![Buldozer](buldozer.svg) 506 | ] 507 | 508 | --- 509 | 510 | # Xeon 2×10×2HT 511 | 512 | .center[ 513 | ![40-cores](beast.svg) 514 | ] 515 | 516 | --- 517 | 518 | # Celeron 4-core 519 | 520 | .center[ 521 | ![Celeron](celeron.svg) 522 | ] 523 | -------------------------------------------------------------------------------- /src/znot.rs: -------------------------------------------------------------------------------- 1 | use faster::prelude::*; 2 | use rayon::prelude::*; 3 | use typenum::Unsigned; 4 | 5 | use super::Element; 6 | use super::simple::{self, Matrix as Simple, Slice, SliceMut}; 7 | use super::simd; 8 | 9 | #[derive(Clone, Debug, PartialEq)] 10 | pub struct Matrix { 11 | _frag: Frag, 12 | size: usize, 13 | content: Vec, 14 | } 15 | 16 | impl<'a, Frag: Unsigned + Default> From<&'a Simple> for Matrix { 17 | fn from(matrix: &'a Simple) -> Self { 18 | fn convert( 19 | matrix: &Simple, 20 | content: &mut Vec, 21 | x: usize, 22 | y: usize, 23 | s: usize, 24 | frag: usize 25 | ) { 26 | if s == frag 
{ 27 | for j in 0..frag { 28 | for i in 0..frag { 29 | content.push(matrix[(i + x, j + y)]); 30 | } 31 | } 32 | } else { 33 | let s = s / 2; 34 | convert(matrix, content, x, y, s, frag); 35 | convert(matrix, content, x + s, y, s, frag); 36 | convert(matrix, content, x, y + s, s, frag); 37 | convert(matrix, content, x + s, y + s, s, frag); 38 | } 39 | } 40 | 41 | let size = matrix.width(); 42 | 43 | assert_eq!(matrix.width(), matrix.height(), "We support only square matrices"); 44 | assert!(size % Frag::USIZE == 0, "Matrix size must be multiple of {}", Frag::USIZE); 45 | assert_eq!((size / Frag::USIZE).count_ones(), 1, "Matrix size must be power of 2"); 46 | 47 | let mut content = Vec::with_capacity(size * size); 48 | convert(matrix, &mut content, 0, 0, size, Frag::USIZE); 49 | Self { 50 | _frag: Frag::default(), 51 | size, 52 | content, 53 | } 54 | } 55 | } 56 | 57 | impl<'a, Frag: Unsigned> From<&'a Matrix> for Simple { 58 | fn from(matrix: &'a Matrix) -> Self { 59 | fn convert( 60 | matrix: &Matrix, 61 | result: &mut Simple, 62 | x: usize, 63 | y: usize, 64 | s: usize, 65 | pos: &mut usize, 66 | ) { 67 | if s == Frag::USIZE { 68 | for j in 0..Frag::USIZE { 69 | for i in 0..Frag::USIZE { 70 | result[(i + x, j + y)] = matrix.content[*pos]; 71 | *pos += 1; 72 | } 73 | } 74 | } else { 75 | let s = s / 2; 76 | convert(matrix, result, x, y, s, pos); 77 | convert(matrix, result, x + s, y, s, pos); 78 | convert(matrix, result, x, y + s, s, pos); 79 | convert(matrix, result, x + s, y + s, s, pos); 80 | } 81 | } 82 | let mut result = Simple::sized(matrix.size, matrix.size); 83 | convert(matrix, &mut result, 0, 0, matrix.size, &mut 0); 84 | result 85 | } 86 | } 87 | 88 | pub trait Distribute { 89 | fn run(size: usize, tasks: &mut [I], f: F); 90 | } 91 | 92 | pub struct DontDistribute; 93 | 94 | impl Distribute for DontDistribute { 95 | fn run(_: usize, tasks: &mut [I], f: F) { 96 | for task in tasks { 97 | f(task); 98 | } 99 | } 100 | } 101 | 102 | pub struct 
RayonDistribute(pub Limit); 103 | 104 | impl Distribute for RayonDistribute { 105 | fn run(size: usize, tasks: &mut [I], f: F) { 106 | if size >= Limit::USIZE { 107 | tasks 108 | .into_par_iter() 109 | .for_each(f); 110 | } else { 111 | DontDistribute::run(size, tasks, f); 112 | } 113 | } 114 | } 115 | 116 | pub trait FragMultiplyAdd { 117 | fn multiply_add(r: &mut [Element], a: &[Element], b: &[Element], size: usize); 118 | } 119 | 120 | pub struct SimpleMultiplyAdd; 121 | 122 | impl FragMultiplyAdd for SimpleMultiplyAdd { 123 | fn multiply_add(r: &mut [Element], a: &[Element], b: &[Element], size: usize) { 124 | simple::multiply_add( 125 | &mut SliceMut { 126 | width: size, 127 | height: size, 128 | content: r, 129 | }, 130 | &Slice { 131 | width: size, 132 | height: size, 133 | content: a, 134 | }, 135 | &Slice { 136 | width: size, 137 | height: size, 138 | content: b, 139 | }, 140 | ); 141 | } 142 | } 143 | 144 | pub struct SimdMultiplyAdd; 145 | 146 | impl FragMultiplyAdd for SimdMultiplyAdd { 147 | fn multiply_add(r: &mut [Element], a: &[Element], b: &[Element], size: usize) { 148 | simd::multiply_add( 149 | &mut SliceMut { 150 | width: size, 151 | height: size, 152 | content: r, 153 | }, 154 | &Slice { 155 | width: size, 156 | height: size, 157 | content: a, 158 | }, 159 | &Slice { 160 | width: size, 161 | height: size, 162 | content: b, 163 | }, 164 | ); 165 | } 166 | } 167 | 168 | macro_rules! 
quads { 169 | ($slice: expr) => {{ 170 | let len = $slice.len() / 4; 171 | let mut iter = $slice.chunks(len); 172 | tuplify!(4, iter.next().unwrap()) 173 | }}; 174 | (mut $slice: expr) => {{ 175 | let len = $slice.len() / 4; 176 | let mut iter = $slice.chunks_mut(len); 177 | tuplify!(4, iter.next().unwrap()) 178 | }}; 179 | } 180 | 181 | pub fn multiply(a: &Matrix, b: &Matrix) -> Matrix 182 | where 183 | Frag: Unsigned + Default, 184 | Dist: Distribute, 185 | Mult: FragMultiplyAdd, 186 | { 187 | assert_eq!(a.size, b.size); 188 | let mut result = Matrix { 189 | _frag: Frag::default(), 190 | size: a.size, 191 | content: vec![0.; a.size * a.size], 192 | }; 193 | 194 | fn mult_add( 195 | r: &mut [Element], 196 | a: &[Element], 197 | b: &[Element], 198 | size: usize, 199 | frag: usize, 200 | ) { 201 | if size == frag { 202 | Mult::multiply_add(r, a, b, size); 203 | } else { 204 | let s = size / 2; 205 | let (a11, a12, a21, a22) = quads!(a); 206 | let (b11, b12, b21, b22) = quads!(b); 207 | let (r11, r12, r21, r22) = quads!(mut r); 208 | 209 | let mut tasks = [ 210 | (r11, a11, b11, a12, b21), 211 | (r12, a11, b12, a12, b22), 212 | (r21, a21, b11, a22, b21), 213 | (r22, a21, b12, a22, b22), 214 | ]; 215 | Dist::run(size, &mut tasks, |&mut (ref mut r, ref a1, ref b1, ref a2, ref b2)| { 216 | mult_add::(r, a1, b1, s, frag); 217 | mult_add::(r, a2, b2, s, frag); 218 | }); 219 | } 220 | } 221 | mult_add::(&mut result.content, &a.content, &b.content, a.size, Frag::USIZE); 222 | 223 | result 224 | } 225 | 226 | macro_rules! 
op { 227 | ($res: expr => $first: ident $($op: tt $next: ident)*) => {{ 228 | ($first.simd_iter(f32s(0.)), $($next.simd_iter(f32s(0.)),)*).zip() 229 | .simd_map(|($first, $($next,)*)| $first $($op $next)*) 230 | .scalar_fill($res); 231 | }}; 232 | ($buf: expr, $first: ident $($op: tt $next: ident)*) => {{ 233 | let res: &mut [_] = $buf.next().unwrap(); 234 | op!(res => $first $($op $next)*); 235 | // Get rid of mut 236 | &*res 237 | }}; 238 | } 239 | 240 | pub fn strassen(a: &Matrix, b: &Matrix) -> Matrix 241 | where 242 | Frag: Unsigned + Default, 243 | Dist: Distribute, 244 | Mult: FragMultiplyAdd, 245 | { 246 | assert_eq!(a.size, b.size); 247 | let mut result = Matrix { 248 | _frag: Frag::default(), 249 | size: a.size, 250 | content: vec![0.; a.size * a.size], 251 | }; 252 | 253 | fn step( 254 | r: &mut [Element], 255 | a: &[Element], 256 | b: &[Element], 257 | size: usize, 258 | frag: usize, 259 | ) { 260 | if size == frag { 261 | Mult::multiply_add(r, a, b, size); 262 | } else { 263 | let s = size / 2; 264 | let block = s * s; 265 | let (a11, a12, a21, a22) = quads!(a); 266 | let (b11, b12, b21, b22) = quads!(b); 267 | let (r11, r12, r21, r22) = quads!(mut r); 268 | 269 | // We need some auxiliary space (for 17 matrices ‒ or can we optimise? Can we reuse the 270 | // space of the results?). Allocate it in just one chunk and split it up. 271 | let mut buffer = vec![0.; 17 * block]; 272 | let mut bc = buffer.chunks_mut(block); 273 | 274 | // Prepare for the smaller multiplications. These are summed/subtracted with SIMD and 275 | // we don't have to care about the element orders, since both matrices have them the 276 | // same. 
277 | let m1l = op!(bc, a11 + a22); 278 | let m1r = op!(bc, b11 + b22); 279 | let m2l = op!(bc, a21 + a22); 280 | let m3r = op!(bc, b12 - b22); 281 | let m4r = op!(bc, b21 - b11); 282 | let m5l = op!(bc, a11 + a12); 283 | let m6l = op!(bc, a21 - a11); 284 | let m6r = op!(bc, b11 + b12); 285 | let m7l = op!(bc, a12 - a22); // M7 = (A12 - A22) * (B21 + B22); must differ from m6l 286 | let m7r = op!(bc, b21 + b22); 287 | 288 | // Run the sub-multiplications, possibly across multiple threads 289 | let (mut m1, mut m2, mut m3, mut m4, mut m5, mut m6, mut m7) = 290 | tuplify!(7, bc.next().unwrap()); 291 | let mut tasks = [ 292 | (&mut m1, m1l, m1r), 293 | (&mut m2, m2l, b11), 294 | (&mut m3, a11, m3r), 295 | (&mut m4, a22, m4r), 296 | (&mut m5, m5l, b22), 297 | (&mut m6, m6l, m6r), 298 | (&mut m7, m7l, m7r), 299 | ]; 300 | Dist::run(size, &mut tasks, |&mut (ref mut r, ref a, ref b)| { 301 | step::(r, a, b, s, frag); 302 | }); 303 | 304 | // Consolidate the results 305 | op!(r11 => m1 + m4 - m5 + m7); 306 | op!(r12 => m3 + m5); 307 | op!(r21 => m2 + m4); 308 | op!(r22 => m1 - m2 + m3 + m6); 309 | } 310 | } 311 | step::(&mut result.content, &a.content, &b.content, a.size, Frag::USIZE); 312 | 313 | result 314 | } 315 | 316 | #[cfg(test)] 317 | mod tests { 318 | use super::*; 319 | 320 | use typenum::{U1, U2, U7, U16, U32}; 321 | 322 | fn test_tab() { 323 | for shift in 0..7 { 324 | let matrix = Simple::random(Frag::USIZE * 1 << shift, Frag::USIZE * 1 << shift); 325 | let there = Matrix::::from(&matrix); 326 | let back = Simple::from(&there); 327 | assert_eq!(matrix, back); 328 | } 329 | } 330 | 331 | #[test] 332 | fn there_and_back_16() { 333 | test_tab::(); 334 | } 335 | 336 | #[test] 337 | fn there_and_back_1() { 338 | test_tab::(); 339 | } 340 | 341 | #[test] 342 | fn there_and_back_2() { 343 | test_tab::(); 344 | } 345 | 346 | #[test] 347 | fn there_and_back_7() { 348 | test_tab::(); 349 | } 350 | 351 | #[test] 352 | fn no_frag() { 353 | // Matrix 2*2 stays the same 354 | let ar = vec![1., 2., 3., 4.]; 355 | let mut matrix =
Simple::sized(2, 2); 356 | matrix[(0, 0)] = 1.; 357 | matrix[(1, 0)] = 2.; 358 | matrix[(0, 1)] = 3.; 359 | matrix[(1, 1)] = 4.; 360 | let conv = Matrix::::from(&matrix); 361 | let exp = Matrix { 362 | _frag: U1::default(), 363 | size: 2, 364 | content: ar.clone(), 365 | }; 366 | assert_eq!(exp, conv); 367 | let back = Simple::from(&conv); 368 | assert_eq!(matrix, back); 369 | } 370 | 371 | /* 372 | * By using SIMD vectors to sum many at once, we reorder the additions on floats. It so happens 373 | * this changes the result somewhat, so we put a margin there. 374 | */ 375 | fn approx_eq(mut a: Simple, mut b: Simple) { 376 | for matrix in [&mut a, &mut b].iter_mut() { 377 | for val in matrix.slice_mut().content { 378 | *val = (*val / 20.0).round(); 379 | } 380 | } 381 | assert_eq!(a, b); // compare the rounded matrices; without this the helper checks nothing 382 | } 383 | 384 | fn test_multi() { 385 | for shift in 0..5 { 386 | let a = Simple::random(Frag::USIZE * 1 << shift, Frag::USIZE * 1 << shift); 387 | let b = Simple::random(Frag::USIZE * 1 << shift, Frag::USIZE * 1 << shift); 388 | let expected = simple::multiply(&a, &b); 389 | let a_z = Matrix::::from(&a); 390 | let b_z = Matrix::::from(&b); 391 | let r_z = multiply::<_, DontDistribute, Mult>(&a_z, &b_z); 392 | let result = Simple::from(&r_z); 393 | approx_eq(expected.clone(), result); 394 | let ra_z = multiply::<_, RayonDistribute, Mult>(&a_z, &b_z); 395 | let result = Simple::from(&ra_z); 396 | approx_eq(expected.clone(), result); 397 | let rs_z = strassen::<_, RayonDistribute, Mult>(&a_z, &b_z); 398 | let result = Simple::from(&rs_z); 399 | approx_eq(expected, result); 400 | } 401 | } 402 | 403 | #[test] 404 | fn test_multi_1() { 405 | test_multi::(); 406 | } 407 | 408 | #[test] 409 | fn test_multi_2() { 410 | test_multi::(); 411 | } 412 | 413 | #[test] 414 | fn test_multi_7() { 415 | test_multi::(); 416 | } 417 | 418 | #[test] 419 | fn test_multi_16() { 420 | test_multi::(); 421 | } 422 | 423 | #[test] 424 | fn test_multi_1_simd() { 425 | test_multi::(); 426 | } 427 | 428 |
#[test] 429 | fn test_multi_2_simd() { 430 | test_multi::(); 431 | } 432 | 433 | #[test] 434 | fn test_multi_7_simd() { 435 | test_multi::(); 436 | } 437 | 438 | #[test] 439 | fn test_multi_16_simd() { 440 | test_multi::(); 441 | } 442 | } 443 | -------------------------------------------------------------------------------- /presentation/fg.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 15 | 332 | 333 | Flame Graph 334 | 335 | Reset Zoom 336 | Search 337 | 338 | 339 | [vmlinux] (1,297 samples, 0.30%) 340 | 341 | 342 | 343 | [vmlinux] (201 samples, 0.05%) 344 | 345 | 346 | 347 | [vmlinux] (372 samples, 0.09%) 348 | 349 | 350 | 351 | [vmlinux] (1,115 samples, 0.26%) 352 | 353 | 354 | 355 | byteorder::io::ReadBytesExt::read_f32 (214 samples, 0.05%) 356 | 357 | 358 | 359 | [vmlinux] (534 samples, 0.12%) 360 | 361 | 362 | 363 | [libc-2.26.so] (111 samples, 0.03%) 364 | 365 | 366 | 367 | [unknown] (141 samples, 0.03%) 368 | 369 | 370 | 371 | fastmatmult::simple::multiply_add (435,890 samples, 99.88%) 372 | fastmatmult::simple::multiply_add 373 | 374 | 375 | all (436,427 samples, 100%) 376 | 377 | 378 | 379 | [vmlinux] (893 samples, 0.20%) 380 | 381 | 382 | 383 | byteorder::io::ReadBytesExt::read_f32 (38 samples, 0.01%) 384 | 385 | 386 | 387 | [vmlinux] (768 samples, 0.18%) 388 | 389 | 390 | 391 | bincode::internal::deserialize_from (53 samples, 0.01%) 392 | 393 | 394 | 395 | [vmlinux] (1,052 samples, 0.24%) 396 | 397 | 398 | 399 | [vmlinux] (90 samples, 0.02%) 400 | 401 | 402 | 403 | [vmlinux] (1,211 samples, 0.28%) 404 | 405 | 406 | 407 | [vmlinux] (1,264 samples, 0.29%) 408 | 409 | 410 | 411 | measure (436,412 samples, 100.00%) 412 | measure 413 | 414 | 415 | [vmlinux] (1,385 samples, 0.32%) 416 | 417 | 418 | 419 | --------------------------------------------------------------------------------