├── linear-chase.png ├── random-chase.png ├── fused-linear-chase.png ├── .gitmodules ├── LICENSE ├── Makefile ├── chase-pointers.hpp ├── linear-chain.hpp ├── walltime.hpp ├── uniform-int-distribution.hpp ├── linear-chase.cpp ├── chase-pointers.cpp ├── random-chase.cpp ├── linear-chain.cpp ├── fused-linear-chase.cpp └── README.md /linear-chase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/afborchert/pointer-chasing/HEAD/linear-chase.png -------------------------------------------------------------------------------- /random-chase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/afborchert/pointer-chasing/HEAD/random-chase.png -------------------------------------------------------------------------------- /fused-linear-chase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/afborchert/pointer-chasing/HEAD/fused-linear-chase.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "fmt"] 2 | path = fmt 3 | url = https://github.com/afborchert/fmt.git 4 | [submodule "gcc-makedepend"] 5 | path = gcc-makedepend 6 | url = https://github.com/afborchert/gcc-makedepend.git 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Andreas F. 
Borchert 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Build the three pointer-chasing benchmarks: fused-linear-chase, 2 | # linear-chase and random-chase. 3 | 4 | CPPSources := $(wildcard *.cpp) 5 | Objects := $(patsubst %.cpp,%.o,$(CPPSources)) 6 | 7 | # object files linked into each benchmark 8 | fused-linear-chase-objects := fused-linear-chase.o \ 9 | chase-pointers.o linear-chain.o 10 | linear-chase-objects := linear-chase.o \ 11 | chase-pointers.o linear-chain.o 12 | random-chase-objects := random-chase.o \ 13 | chase-pointers.o 14 | 15 | CXX := g++ 16 | # The language standard and the fmt include path are needed by every 17 | # compilation. They are appended with "override" so that macros given 18 | # on the command line, e.g. make CPPFLAGS=-DMAX_STRIDE=256, extend 19 | # these flags instead of replacing them. 20 | override CPPFLAGS += -std=gnu++11 -Ifmt 21 | CXXFLAGS := -g -O2 22 | Targets := fused-linear-chase linear-chase random-chase 23 | 24 | .PHONY: all clean realclean depend 25 | all: $(Objects) $(Targets) 26 | clean: ; rm -f $(Objects) 27 | realclean: clean 28 | rm -f $(Targets) 29 | 30 | # runs gcc-makedepend over all sources; presumably this refreshes the 31 | # dependency list below the DO NOT DELETE marker (verify against the tool) 32 | depend: $(CPPSources) 33 | perl gcc-makedepend/gcc-makedepend.pl $(CPPFLAGS) $(CPPSources) 34 | 35 | fused-linear-chase: $(fused-linear-chase-objects) 36 | $(CXX) $(LDFLAGS) -o $@ $(fused-linear-chase-objects) 37 | linear-chase: $(linear-chase-objects) 38 | $(CXX) $(LDFLAGS) -o $@ $(linear-chase-objects) 39 | random-chase: $(random-chase-objects) 40 | $(CXX) $(LDFLAGS) -o $@ $(random-chase-objects) 41 | 42 | # DO NOT DELETE 43 | random-chase.o: random-chase.cpp fmt/printf.hpp chase-pointers.hpp \ 44 | uniform-int-distribution.hpp 45 | chase-pointers.o: chase-pointers.cpp fmt/printf.hpp chase-pointers.hpp \ 46 | walltime.hpp 47 | fused-linear-chase.o: fused-linear-chase.cpp fmt/printf.hpp \ 48 | linear-chain.hpp walltime.hpp 49 | linear-chase.o: linear-chase.cpp fmt/printf.hpp chase-pointers.hpp \ 50 | linear-chain.hpp 51 | linear-chain.o: linear-chain.cpp linear-chain.hpp 52 | -------------------------------------------------------------------------------- /chase-pointers.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016 Andreas F. Borchert 3 | All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY 17 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 18 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | 27 | #ifndef CHASE_POINTERS_HPP 28 | #define CHASE_POINTERS_HPP 29 | 30 | #include <cstddef> /* std::size_t */ 31 | 32 | /* follow a circular pointer chain a given number of times 33 | and return the real time used in seconds as double; 34 | memory must be the start of a closed chain where each location 35 | holds the address of the next one, count is the number of 36 | pointer dereferences that are timed */ 37 | double chase_pointers(void** memory, std::size_t count); 38 | 39 | /* print pointer chain to std::cout (for debugging); 40 | the chain is followed until it returns to memory */ 41 | void debug_chain(void** memory); 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /linear-chain.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016 Andreas F. Borchert 3 | All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY 17 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 18 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | #ifndef LINEAR_CHAIN_HPP 27 | #define LINEAR_CHAIN_HPP 28 | 29 | #include <cstddef> 30 | 31 | /* create a cyclic pointer chain where the individual locations 32 | are stride bytes apart; 33 | if the stride allows multiple runs within the same buffer, 34 | subsequent runs will operate with different offsets making 35 | it less likely that we hit something still in cache from 36 | the earlier run; 37 | the buffer of size bytes is allocated with new[], release it with delete[] */ 38 | void** create_linear_chain(std::size_t size, std::size_t stride); 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /walltime.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, 2018 Andreas F. Borchert 3 | All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY 17 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 18 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | #ifndef WALLTIME_HPP 27 | #define WALLTIME_HPP 28 | 29 | #include <chrono> 30 | 31 | /* simple stop watch for real time that starts with its construction; 32 | T is the type used to report the elapsed seconds */ 33 | template<typename T> 34 | class WallTime { 35 | public: 36 | WallTime() : t0(std::chrono::high_resolution_clock::now()) { 37 | } 38 | /* return real time in seconds using T since construction 39 | of this object */ 40 | T elapsed() const { 41 | using namespace std::chrono; 42 | auto time_spent = high_resolution_clock::now() - t0; 43 | return duration<T>(time_spent).count(); 44 | } 45 | private: 46 | std::chrono::high_resolution_clock::time_point t0; 47 | }; 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /uniform-int-distribution.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016 Andreas F. Borchert 3 | All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY 17 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 18 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | #ifndef UNIFORM_INT_DISTRIBUTION_HPP 27 | #define UNIFORM_INT_DISTRIBUTION_HPP 28 | 29 | #include <random> 30 | 31 | /* simple class for a pseudo-random generator producing 32 | uniformly distributed integers */ 33 | class UniformIntDistribution { 34 | public: 35 | UniformIntDistribution() : engine(std::random_device()()) {} 36 | /* return number in the range of [0..upper_limit); upper_limit must be > 0 as upper_limit-1 would wrap around otherwise */ 37 | unsigned int draw(unsigned int upper_limit) { 38 | return std::uniform_int_distribution<unsigned int> 39 | (0, upper_limit-1)(engine); 40 | } 41 | private: 42 | std::mt19937 engine; 43 | }; 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /linear-chase.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, 2018 Andreas F. Borchert 3 | All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY 17 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 18 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | /* utility that measures the effect of automatic hardware-driven 27 | prefetches for various stride values */ 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include /* see https://github.com/afborchert/fmt */ 36 | #include "chase-pointers.hpp" 37 | #include "linear-chain.hpp" 38 | 39 | #ifndef MIN_STRIDE 40 | #define MIN_STRIDE (sizeof(void*)) 41 | #endif 42 | #ifndef MAX_STRIDE 43 | #define MAX_STRIDE 1200 44 | #endif 45 | 46 | int main() { 47 | fmt::printf(" stride time in ns\n"); 48 | for (std::size_t stride = MIN_STRIDE; stride <= MAX_STRIDE; 49 | stride += sizeof(void*)) { 50 | size_t memsize = std::min(std::size_t{1}<<26, 51 | stride * 1024 * sizeof(void*)); 52 | void** memory = create_linear_chain(memsize, stride); 53 | std::size_t count = std::size_t{1}<<30; 54 | double t = chase_pointers(memory, count); 55 | delete[] memory; 56 | double ns = t * 1000000000 / count; 57 | fmt::printf(" %8u %10.5lf\n", stride, ns); std::cout.flush(); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /chase-pointers.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, 2018 Andreas F. Borchert 3 | All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY 17 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 18 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* follow a circular pointer chain a given number of times 27 | and return the real time used in seconds as double */ 28 | 29 | #include 30 | #include "chase-pointers.hpp" 31 | #include "walltime.hpp" 32 | 33 | /* this variable must not be declared static */ 34 | volatile void* chase_pointers_global; // to defeat optimizations 35 | 36 | /* follow a pointer chain the given number of times and 37 | return the measured time */ 38 | double chase_pointers(void** memory, std::size_t count) { 39 | WallTime walltime; 40 | // chase the pointers count times 41 | void** p = (void**) memory; 42 | while (count-- > 0) { 43 | p = (void**) *p; 44 | } 45 | auto elapsed = walltime.elapsed(); 46 | chase_pointers_global = *p; 47 | return elapsed; 48 | } 49 | 50 | /* print pointer chain to std::cout (for debugging) */ 51 | void debug_chain(void** memory) { 52 | void** p = memory; 53 | std::size_t count = 0; 54 | fmt::printf("chain at %p:\n", memory); 55 | do { 56 | fmt::printf("[0x%p] 0x%p", p, *p); ++count; 57 | if (*p < p) fmt::printf(" (rewinding)"); 58 | fmt::printf("\n"); 59 | p = (void**) *p; 60 | } while (p != memory); 61 | fmt::printf("# of pointers in chain: %zu\n", count); 62 | } 63 | -------------------------------------------------------------------------------- /random-chase.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016, 2018 Andreas F. Borchert 3 | All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY 17 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 18 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | /* utility to measure cache and memory read access times */ 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include /* see https://github.com/afborchert/fmt */ 33 | #include "chase-pointers.hpp" 34 | #include "uniform-int-distribution.hpp" 35 | 36 | /* create a cyclic pointer chain that covers all words 37 | in a memory section of the given size in a randomized order */ 38 | void** create_random_chain(std::size_t size) { 39 | std::size_t len = size / sizeof(void*); 40 | void** memory = new void*[len]; 41 | 42 | UniformIntDistribution uniform; 43 | 44 | // shuffle indices 45 | size_t* indices = new std::size_t[len]; 46 | for (std::size_t i = 0; i < len; ++i) { 47 | indices[i] = i; 48 | } 49 | for (std::size_t i = 0; i < len-1; ++i) { 50 | std::size_t j = i + uniform.draw(len - i); 51 | if (i != j) { 52 | std::swap(indices[i], indices[j]); 53 | } 54 | } 55 | // fill memory with pointer references 56 | for (std::size_t i = 1; i < len; ++i) { 57 | memory[indices[i-1]] = (void*) &memory[indices[i]]; 58 | } 59 | memory[indices[len-1]] = (void*) &memory[indices[0]]; 60 | delete[] indices; 61 | return memory; 62 | } 63 | 64 | unsigned int log2(std::size_t val) { 65 | unsigned int count = 0; 66 | while (val >>= 1) { 67 | ++count; 68 | } 69 | return count; 70 | } 71 | 72 | #ifndef MIN_SIZE 73 | #define MIN_SIZE 1024 74 | #endif 75 | #ifndef MAX_SIZE 76 | #define MAX_SIZE 1024 * 1024 * 128 77 | #endif 78 | #ifndef GRANULARITY 79 | #define GRANULARITY (1u) 80 | #endif 81 | 82 | int main() { 83 | fmt::printf(" memsize time in ns\n"); 84 | for (std::size_t memsize = MIN_SIZE; memsize <= MAX_SIZE; 85 | memsize += (std::size_t{1} << 86 | (std::max(GRANULARITY, log2(memsize))-GRANULARITY))) { 87 | void** memory = create_random_chain(memsize); 88 | std::size_t count = std::max(memsize * 16, std::size_t{1}<<30); 89 | double t = chase_pointers(memory, count); 90 | delete[] memory; 91 | double ns = t * 1000000000 / count; 92 | fmt::printf(" %9u 
%10.5lf\n", memsize, ns); std::cout.flush(); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /linear-chain.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016 Andreas F. Borchert 3 | All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY 17 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 18 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | #include "linear-chain.hpp" 27 | 28 | /* return the index of the most significant bit which is 1 29 | where the lowest significant bit has index 0, 30 | i.e. 
return minimal n where 2^(n+1) > val 31 | */ 32 | static unsigned int log2(unsigned int val) { 33 | unsigned int count = 0; 34 | while (val >>= 1) { 35 | ++count; 36 | } 37 | return count; 38 | } 39 | 40 | /* reverse the given number of bits within val */ 41 | static unsigned int bit_reverse(unsigned int val, unsigned int bits) { 42 | unsigned int result = 0; 43 | while (bits > 0) { 44 | result = (result << 1) | (val & 1); 45 | val >>= 1; 46 | --bits; 47 | } 48 | return result; 49 | } 50 | 51 | /* generate a bit-reversal permutation; see https://oeis.org/A030109 */ 52 | static void gen_bit_reversal_permutation(unsigned int* seq, 53 | unsigned int bits, unsigned int count) { 54 | /* generate a bit-reversal permutation for integers from 0 to (2^bits)-1 */ 55 | unsigned int maxval = 1< count) { 63 | while (seq[index] < count) ++index; 64 | --current; seq[index] = seq[current]; 65 | } 66 | } 67 | 68 | /* create a cyclic pointer chain where the individual locations 69 | are stride bytes apart */ 70 | void** create_linear_chain(std::size_t size, std::size_t stride) { 71 | std::size_t len = size / sizeof(void*); 72 | void** memory = new void*[len]; 73 | 74 | /* if we have multiple runs through the same buffer 75 | make sure that we operate with offsets where it appears 76 | more likely that the associated lines are not yet in 77 | one of the caches; 78 | to achieve this, we operate with bit reversal permutations, 79 | if runs == 8 we would get following sequence 80 | 81 | 0 4 2 6 1 5 3 7 82 | */ 83 | unsigned int runs = stride / sizeof(void*); 84 | unsigned int bits = log2(runs); 85 | if ((1<= (char*) memory + size) break; 101 | *last = (void*) next; last = (void**) next; 102 | } 103 | } 104 | *last = (void*) memory; /* close the cycle */ 105 | return memory; 106 | } 107 | -------------------------------------------------------------------------------- /fused-linear-chase.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 
Copyright (c) 2016, 2018 Andreas F. Borchert 3 | All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY 17 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 18 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* this utility is an extension of the linear-chase: 27 | where linear-chase just has one pointer chase using a constant stride, 28 | this utility chases n pointers in an interleaved pattern 29 | for n running from 1 to 8 */ 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include /* see https://github.com/afborchert/fmt */ 38 | #include "linear-chain.hpp" 39 | #include "walltime.hpp" 40 | 41 | template 42 | inline void fused_action(Body body, Object& object) { 43 | body(object); 44 | } 45 | template 46 | inline void fused_action(Body body, Object& object, Objects&... 
objects) { 47 | body(object); 48 | fused_action(body, objects...); 49 | } 50 | 51 | /* this variable must not be declared static */ 52 | volatile void* fused_linear_global; // to defeat optimizations 53 | 54 | template 55 | double fused_chase(std::size_t count, Pointers&... ptrs) { 56 | WallTime walltime; 57 | // chase the pointers count times 58 | while (count-- > 0) { 59 | fused_action([](void**& p) { p = (void**) *p; }, ptrs...); 60 | } 61 | auto elapsed = walltime.elapsed(); 62 | // defeat the optimization that removes the chasing 63 | fused_action([](void**& p) { fused_linear_global = *p; }, ptrs...); 64 | return elapsed; 65 | } 66 | 67 | #ifndef MIN_STRIDE 68 | #define MIN_STRIDE (sizeof(void*)) 69 | #endif 70 | #ifndef MAX_STRIDE 71 | #define MAX_STRIDE 120 72 | #endif 73 | 74 | int main() { 75 | void **p1, **p2, **p3, **p4, **p5, **p6, **p7, **p8; 76 | void **m1, **m2, **m3, **m4, **m5, **m6, **m7, **m8; 77 | fmt::printf(" " 78 | "data access speeds in GiB/s\n"); 79 | fmt::printf(" fuse"); 80 | for (int i = 1; i <= 8; ++i) fmt::printf("%12d", i); 81 | fmt::printf("\n stride\n"); 82 | for (std::size_t stride = MIN_STRIDE; stride <= MAX_STRIDE; 83 | stride += sizeof(void*)) { 84 | size_t memsize = std::min(std::size_t{1}<<26, 85 | stride * 1024 * sizeof(void*)); 86 | std::size_t count = std::size_t{1}<<30; 87 | fmt::printf(" %8u", stride); 88 | 89 | auto print_result = [=](int fuse, double t) { 90 | auto volume = static_cast(sizeof(void*)) * count * fuse; 91 | auto speed = volume / t / (1<<30); /* in GiB/s */ 92 | fmt::printf(" %10.5lf", speed); std::cout.flush(); 93 | }; 94 | 95 | fused_action([=](void**& p) { 96 | p = create_linear_chain(memsize, stride); 97 | }, m1, m2, m3, m4, m5, m6, m7, m8); 98 | p1 = m1; p2 = m2; p3 = m3; p4 = m4; 99 | p5 = m5; p6 = m6; p7 = m7; p8 = m8; 100 | 101 | print_result(1, fused_chase(count, p1)); 102 | print_result(2, fused_chase(count, p1, p2)); 103 | print_result(3, fused_chase(count, p1, p2, p3)); 104 | print_result(4, 
fused_chase(count, p1, p2, p3, p4)); 105 | print_result(5, fused_chase(count, p1, p2, p3, p4, p5)); 106 | print_result(6, fused_chase(count, p1, p2, p3, p4, p5, p6)); 107 | print_result(7, fused_chase(count, p1, p2, p3, p4, p5, p6, p7)); 108 | print_result(8, fused_chase(count, p1, p2, p3, p4, p5, p6, p7, p8)); 109 | 110 | fused_action([](void**& p) { 111 | delete[] p; 112 | }, m1, m2, m3, m4, m5, m6, m7, m8); 113 | 114 | fmt::printf("\n"); std::cout.flush(); 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pointer-chasing 2 | Utilities to measure read access times of caches, memory, and hardware prefetches for simple and fused operations 3 | 4 | ## Summary 5 | 6 | This package provides the following three utilities: 7 | 8 | * _random-chase_: measure average read access times of all cache 9 | levels and main memory 10 | * _linear-chase_: measure read access times for a linear access 11 | pattern with a constant stride 12 | * _fused-linear-chase_: like _linear-chase_ but for an interleaved 13 | access pattern of multiple linear sequences, all with the same stride 14 | 15 | All of them work with memory buffers that are organized as an array 16 | of pointers where 17 | * all pointers point into the very same buffer, and where 18 | * beginning from any pointer all other pointers can be reached 19 | following the pointer chain, and where 20 | * all locations are reached. 21 | 22 | Once such a memory buffer has been set up, we measure the time of 23 | 24 | ```C 25 | void** p = (void**) memory[0]; 26 | while (count-- > 0) { 27 | p = (void**) *p; 28 | } 29 | ``` 30 | 31 | The `p = (void**) *p` construct enforces all memory accesses to be 32 | serialized, i.e. the next access can only be scheduled by the processor 33 | when the previous fetch has been finished. 
To defeat optimizers that 34 | tend to optimize the loop away when the result isn't used, the last 35 | pointer value is assigned to a `volatile` global variable which is 36 | otherwise unused. 37 | 38 | In case of _fused-linear-chase_ multiple such buffers are configured 39 | depending on the fuse factor. 40 | 41 | For the sake of simplicity, all utilities are parameterized through 42 | preprocessor macros. 43 | 44 | The idea of pointer chasing is not new, in fact there exist quite 45 | a number of papers and other utilities related to it. 46 | 47 | ## random-chase 48 | 49 | The following preprocessor macros configure this utility: 50 | 51 | * *MIN_SIZE*: Minimal buffer size in bytes which should be small enough 52 | to fit comfortably into the L1 cache. 53 | * *MAX_SIZE*: Maximal buffer size in bytes which should be larger than 54 | the L3 cache. 55 | * *GRANULARITY*: All powers of two between *MIN_SIZE* and *MAX_SIZE* 56 | are tested. The granularity specifies how many sizes are tested 57 | in-between. For a granularity of _n_ we get _2^n - 1_ sizes between two consecutive powers of two. 58 | 59 | The output consists of a header line and then a line for each tested 60 | buffer size from *MIN_SIZE* to *MAX_SIZE* where the memory size and 61 | the measured access time in nanoseconds are given. 62 | 63 | This is the sample output for an Intel Xeon 5650 with three caches 64 | (L1: 32 KiB, L2: 256 KiB, L3: 12 MiB), run with the 65 | parameters 
*MIN_SIZE* = 1024, *MAX_SIZE* = 32 MiB, and 66 | *GRANULARITY* = 1: 67 | 68 | ``` 69 | memsize time in ns 70 | 1024 1.36904 71 | 1536 1.35973 72 | 2048 1.36904 73 | 3072 1.35973 74 | 4096 1.35973 75 | 6144 1.36904 76 | 8192 1.35973 77 | 12288 1.35973 78 | 16384 1.35973 79 | 24576 1.37836 80 | 32768 1.36904 81 | 49152 2.06754 82 | 65536 2.43075 83 | 98304 2.79397 84 | 131072 2.97092 85 | 196608 3.14787 86 | 262144 3.29688 87 | 393216 8.37259 88 | 524288 10.85922 89 | 786432 13.20615 90 | 1048576 14.38893 91 | 1572864 15.59965 92 | 2097152 16.23295 93 | 3145728 18.08628 94 | 4194304 18.34705 95 | 6291456 19.79060 96 | 8388608 22.66839 97 | 12582912 31.55321 98 | 16777216 43.27856 99 | 25165824 59.39975 100 | 33554432 64.04705 101 | ``` 102 | 103 | A gnuplot script may be helpful to visualize this: 104 | 105 | ```gnuplot 106 | set terminal png size 900, 500 107 | set output "random-chase.png" 108 | set xlabel "memory area in bytes" 109 | set logscale x 110 | set ylabel "avg access time in ns" 111 | set title "Access times in dependence of memory area" 112 | set key out 113 | set pointsize 0.5 114 | 115 | # determine maximal y value by plotting to a dummy terminal 116 | set terminal push 117 | set terminal unknown 118 | plot "random-chase.out" using 2 119 | set terminal pop 120 | 121 | # mark L1, L2, and L3: 122 | maxy = GPVAL_Y_MAX 123 | l1 = 32 124 | l2 = 256 125 | l3 = 12288 126 | set arrow from l1*1024,0 to l1*1024,maxy nohead lc rgb 'blue'; 127 | set arrow from l2*1024,0 to l2*1024,maxy nohead lc rgb 'blue'; 128 | set arrow from l3*1024,0 to l3*1024,maxy nohead lc rgb 'blue'; 129 | 130 | plot "random-chase.out" using 1:2 with linespoints lt 2 title "Intel Xeon 5650" 131 | ``` 132 | 133 | Result: 134 | 135 | ![Memory access times with random chain](random-chase.png) 136 | 137 | ## linear-chase 138 | 139 | Following preprocessor macros configure this utility: 140 | 141 | * *MIN_STRIDE*: Minimal stride value. By default, `sizeof(void*)` 142 | is taken. 
143 | * *MAX_STRIDE*: Maximal stride value. 144 | 145 | The output consists of a header line and then a line for each 146 | tested stride value from *MIN_STRIDE* to *MAX_STRIDE* in 147 | steps of `sizeof(void*)` and the measured access time in 148 | nanoseconds. 149 | 150 | This is a sample output for the same Intel Xeon 5650 151 | compiled for a 32-bit address space, i.e. `sizeof(void*) == 4`; 152 | the output has been shortened for brevity: 153 | 154 | ``` 155 | stride time in ns 156 | 4 1.30385 157 | 8 1.31316 158 | 12 1.31316 159 | ... 160 | 1188 13.57868 161 | 1192 13.10371 162 | 1196 13.52280 163 | 1200 13.58800 164 | ``` 165 | 166 | A gnuplot script may be helpful as before: 167 | 168 | ```gnuplot 169 | set terminal png size 900, 500 170 | set output "linear-chase.png" 171 | set xlabel "stride in bytes" 172 | set ylabel "avg access time in ns" 173 | set title "Access times in dependence of stride" 174 | set key out 175 | set pointsize 0.5 176 | 177 | plot "linear-chase.out" using 1:2 with linespoints lt 2 title "Intel Xeon 5650" 178 | ``` 179 | 180 | Result: 181 | 182 | ![Memory access times in dependence of stride](linear-chase.png) 183 | 184 | ## fused-linear-chase 185 | 186 | Like _linear-chase_, the macro parameters *MIN_STRIDE* and 187 | *MAX_STRIDE* are supported. The range of tested fuse factors extends 188 | from 1 to 8. This test makes it possible to analyze how many interleaved 189 | access patterns with a constant stride are supported by the 190 | hardware prefetch. 191 | 192 | The output is a table with a column for each fuse factor from 193 | 1 to 8 and a line for each stride value tested between *MIN_STRIDE* 194 | and *MAX_STRIDE*. For each combination the aggregated data access 195 | speed in GiB/s is given. There are three header lines. 
196 | 197 | This is a sample output for the very same Intel Xeon 5650 as above: 198 | 199 | ``` 200 | data access speeds in GiB/s 201 | fuse 1 2 3 4 5 6 7 8 202 | stride 203 | 4 2.81690 5.71429 7.50000 10.06289 10.63830 9.26641 6.42202 6.47773 204 | 8 2.83688 5.63380 7.27273 9.03955 9.85222 8.79121 6.23608 5.98131 205 | 12 2.85714 5.55556 6.81818 8.24742 9.30233 7.97342 5.85774 5.89319 206 | 16 2.83688 5.55556 6.31579 7.51174 5.79710 5.56845 4.00000 3.65714 207 | 20 2.39521 4.73373 5.74163 4.73373 4.90196 5.02092 4.95575 4.87062 208 | 24 2.07254 4.04040 3.77358 3.85542 4.21941 4.30108 4.20420 4.18301 209 | 28 1.90476 3.75587 3.32410 3.48584 3.68324 3.75000 3.66492 3.76471 210 | 32 1.63934 3.38983 2.94118 3.07102 3.18979 3.23015 3.05344 2.96571 211 | 36 1.54440 2.36686 2.63158 2.76339 2.86123 3.00375 2.96610 2.99345 212 | 40 1.45985 2.14477 2.38569 2.50784 2.67023 2.72109 2.73171 2.74914 213 | 44 1.40351 1.95122 2.17786 2.29555 2.43902 2.49480 2.44541 2.51572 214 | 48 1.33333 1.80180 1.99336 2.11640 2.17155 2.18182 2.14559 2.14909 215 | 52 1.25786 1.69851 1.85759 1.96802 2.08551 2.13144 2.13090 2.16802 216 | 56 1.20482 1.59681 1.72166 1.83276 1.94553 1.97694 1.98020 2.00501 217 | 60 1.15274 1.49533 1.61725 1.72043 1.82482 1.86335 1.84941 1.89798 218 | 64 1.08992 1.40351 1.51324 1.60804 1.64339 1.63154 1.63075 1.65889 219 | 68 1.00503 1.37694 1.47059 1.58730 1.68209 1.75439 1.76879 1.79574 220 | 72 0.93240 1.33556 1.42518 1.55039 1.62866 1.68658 1.68980 1.71306 221 | 76 0.90909 1.29450 1.38090 1.49953 1.58479 1.64948 1.67164 1.69223 222 | 80 0.89888 1.26582 1.35135 1.47738 1.50038 1.49254 1.47446 1.48285 223 | 84 0.88300 1.23267 1.32013 1.47194 1.56617 1.67131 1.68370 1.70758 224 | 88 0.87912 1.21581 1.30293 1.46119 1.49477 1.59893 1.59817 1.62025 225 | 92 0.85106 1.17820 1.26449 1.40105 1.48038 1.59151 1.59091 1.63016 226 | 96 0.83333 1.13636 1.24224 1.43498 1.43885 1.41260 1.39651 1.44993 227 | 100 0.82816 1.11888 1.20846 1.36519 1.46520 1.57274 1.56863 1.60966 228 
| 104 0.79051 1.06809 1.16959 1.38289 1.54440 1.62712 1.61570 1.62850 229 | 108 0.74906 1.04439 1.13422 1.35021 1.47384 1.55440 1.56337 1.64271 230 | 112 0.74488 1.03896 1.11008 1.31796 1.29955 1.25720 1.24611 1.31201 231 | 116 0.68376 1.02828 1.07431 1.34567 1.47820 1.55039 1.55729 1.63850 232 | 120 0.66116 1.02960 1.08794 1.41970 1.47601 1.55039 1.54525 1.56479 233 | ``` 234 | 235 | This can be visualized using a gnuplot script: 236 | 237 | ```gnuplot 238 | set terminal png size 1500, 900 239 | set output "fused-linear-chase.png" 240 | set xlabel "stride in bytes" 241 | set ylabel "data access speed in GiB/s" 242 | set title "Data access speeds in dependence of stride and fuse (on Intel Xeon 5650)" 243 | set pointsize 0.5 244 | plot \ 245 | "fused-linear-chase.out" every ::3::32 using 1:2 title "fuse 1" with linespoints lt 2, \ 246 | "fused-linear-chase.out" every ::3::32 using 1:3 title "fuse 2" with linespoints lt 3, \ 247 | "fused-linear-chase.out" every ::3::32 using 1:4 title "fuse 3" with linespoints lt 4, \ 248 | "fused-linear-chase.out" every ::3::32 using 1:5 title "fuse 4" with linespoints lt 5, \ 249 | "fused-linear-chase.out" every ::3::32 using 1:6 title "fuse 5" with linespoints lt 1, \ 250 | "fused-linear-chase.out" every ::3::32 using 1:7 title "fuse 6" with linespoints lt 7, \ 251 | "fused-linear-chase.out" every ::3::32 using 1:8 title "fuse 7" with linespoints lt 8, \ 252 | "fused-linear-chase.out" every ::3::32 using 1:9 title "fuse 8" with linespoints lt 9 253 | ``` 254 | 255 | Result: 256 | 257 | ![Data access speeds in dependence of stride and fuse](fused-linear-chase.png) 258 | 259 | ## Downloading and testing 260 | 261 | If you want to clone this project, you should do this recursively: 262 | 263 | ``` 264 | git clone --recursive https://github.com/afborchert/pointer-chasing.git 265 | ``` 266 | 267 | To build it, just invoke make. You need g++ supporting C++11 and GNU make 268 | for this to work. 
269 | --------------------------------------------------------------------------------