├── LICENSE ├── Makefile ├── README.md ├── create_load.c ├── gifs └── psi-by-example.gif └── monitor.c /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, shuveb 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | monitor: monitor.c 2 | gcc -o $@ $< 3 | 4 | create_load: create_load.c 5 | gcc -o $@ $< -lpthread 6 | 7 | all: monitor create_load 8 | 9 | .PHONY: clean 10 | 11 | clean: 12 | rm -f monitor create_load 13 | 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Linux Pressure Stall Information (PSI) By Example 2 | This is the source code repository for the article [Linux Pressure Stall Information by Example on unixism.net](https://unixism.net/2019/08/linux-pressure-stall-information-psi-by-example/). 3 | 4 | ## In action 5 | ![PSI](https://github.com/shuveb/psi-by-example/raw/master/gifs/psi-by-example.gif "Linux PSI by example") 6 | -------------------------------------------------------------------------------- /create_load.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | /* 11 | * These defines are the number of seconds for which we load 12 | * CPU or I/O 13 | * */ 14 | #define CPU_LOAD_TIME_SECS 10 15 | #define IO_LOAD_TIME_SECS 10 16 | 17 | /* 18 | * We split a list of directories to traverse between 2 I/O 19 | * Loader threads. This struct is passed to each of them, 20 | * letting them know the starting index of that list and 21 | * number of directories to traverse. 22 | * 23 | * */ 24 | 25 | typedef struct dir_list { 26 | char **dirs; 27 | int begin_idx; 28 | int count; 29 | }dir_list; 30 | 31 | /* 32 | One function that prints the system call and the error details 33 | and then exits with error code 1. Non-zero meaning things didn't go well. 34 | */ 35 | void fatal_error(const char *syscall) 36 | { 37 | perror(syscall); 38 | exit(1); 39 | } 40 | 41 | /* 42 | * Get all the top level directories from the root directory. 43 | * */ 44 | char **get_root_dir_entries() { 45 | char **entries = NULL; 46 | DIR *root_dir = opendir("/"); 47 | if (root_dir == NULL) 48 | fatal_error("readdir()"); 49 | 50 | struct dirent *dir; 51 | int i = 0; 52 | while ((dir = readdir(root_dir)) != NULL) { 53 | /* We only save directories and those with names other than "." or ".." */ 54 | if (dir->d_type != DT_DIR || strcmp(dir->d_name, ".") == 0 || strcmp(dir->d_name, "..") == 0) 55 | continue; 56 | 57 | entries = realloc(entries, sizeof(char *) * (i + 1)); 58 | entries[i] = malloc(strlen(dir->d_name) + 2); 59 | strcpy(entries[i], "/"); 60 | strcat(entries[i], dir->d_name); 61 | i++; 62 | } 63 | closedir(root_dir); 64 | 65 | /* We NULL-terminate the list */ 66 | entries = realloc(entries, sizeof(char *) * (i + 1)); 67 | entries[i] = NULL; 68 | 69 | return entries; 70 | } 71 | 72 | /* 73 | * This function is the one that causes the actual I/O load. 74 | * It recursively traverses the directory passed as an argument. 75 | * */ 76 | 77 | void read_dir_contents(char *dir_path) { 78 | struct dirent *entry; 79 | struct stat st; 80 | char buff[16384]; 81 | DIR *dir = opendir(dir_path); 82 | if (dir == NULL) 83 | return; 84 | 85 | while ((entry = readdir(dir)) != NULL) { 86 | /* Let's get the attributes of this entry. 87 | * Though we don't need it, this generates more I/O. */ 88 | stat(entry->d_name, &st); 89 | 90 | if (entry->d_type == DT_REG) { 91 | /* Regular file. Read a little bit from it. */ 92 | int fd = open(entry->d_name, O_RDONLY); 93 | if (fd > 0) { 94 | read(fd, buff, sizeof(buff)); 95 | close(fd); 96 | } 97 | } 98 | if (entry->d_type == DT_DIR && strcmp(entry->d_name, ".") != 0 && strcmp(entry->d_name, "..") != 0) { 99 | /* Found a directory, let's get into it recursively */ 100 | char new_path[1024]; 101 | snprintf(new_path, sizeof(new_path), "%s/%s", dir_path, entry->d_name ); 102 | read_dir_contents(new_path); 103 | } 104 | } 105 | closedir(dir); 106 | } 107 | 108 | /* 109 | * This function is called in a thread. It it iterates through the list 110 | * of directories passed and calls read_dir_contents() for each directory 111 | * in the list. 112 | * 113 | * Since 2 threads are created and they get passed the same list of 114 | * directories, we pass the starting index and the count of directories 115 | * to traverse so that each thread can, in parallel, act on its own 116 | * unique set of directories. This creates more I/O load since 2 threads 117 | * access the filesystem information / data in parallel. 118 | * 119 | * */ 120 | 121 | void *iterate_dirs(void *data) { 122 | time_t time1 = time(NULL); 123 | time_t time2; 124 | dir_list *dl = (dir_list *) data; 125 | printf("I/O Loader thread starting with %d directories to traverse.\n", dl->count); 126 | char **dirs = dl->dirs; 127 | char *dname; 128 | int i = dl->begin_idx; 129 | while (dl->count--) { 130 | dname = dl->dirs[i++]; 131 | read_dir_contents(dname); 132 | time2 = time(NULL); 133 | if (time2 - time1 >= IO_LOAD_TIME_SECS) 134 | break; 135 | } 136 | 137 | return NULL; 138 | } 139 | 140 | /* 141 | * This function gets the names of top-level directories in the root 142 | * directory, splits up that list and passes it to two threads both 143 | * running the same function, iterate_dirs(). 144 | * */ 145 | 146 | void load_disk() { 147 | int i = 0; 148 | pthread_t pthread1, pthread2; 149 | 150 | char **root_dir_entries = get_root_dir_entries(); 151 | while (root_dir_entries[i++] != NULL); 152 | 153 | dir_list dl1, dl2; 154 | dl1.dirs = root_dir_entries; 155 | dl1.begin_idx = 0; 156 | dl1.count = i/2; 157 | 158 | dl2.dirs = root_dir_entries; 159 | dl2.begin_idx = dl1.count - 1; 160 | dl2.count = i - dl1.count; 161 | 162 | pthread_create(&pthread1, NULL, iterate_dirs, (void *) &dl1); 163 | pthread_create(&pthread2, NULL, iterate_dirs, (void *) &dl2); 164 | 165 | /* Wait for both the threads to run to completion */ 166 | pthread_join(pthread1, NULL); 167 | pthread_join(pthread2, NULL); 168 | 169 | printf("********************************************************************************\n"); 170 | printf("Now that the I/O loader threads have run, disk blocks will be cached in RAM.\n"); 171 | printf("You are unlikely to see further I/O-related PSI notifications should you run\n"); 172 | printf("this again. If you want to however, you can run this again after dropping all\n"); 173 | printf("disk caches like so as root:\n"); 174 | printf("\necho 3 > /proc/sys/vm/drop_caches\n"); 175 | printf("\nOr with sudo:\n"); 176 | printf("echo 3 | sudo tee /proc/sys/vm/drop_caches\n"); 177 | printf("********************************************************************************\n"); 178 | 179 | /* Free allocated memory */ 180 | i = 0; 181 | while (root_dir_entries[i++] != NULL) 182 | free(root_dir_entries[i]); 183 | free(root_dir_entries); 184 | } 185 | 186 | /* 187 | * This routine runs in threads. This creates load on the CPU 188 | * by running a tight loop for CPU_LOAD_TIME_SECS seconds. 189 | * 190 | * We create a thread more than there are CPUs. e.g: If there 191 | * are 2 CPUs, we create 3 threads. This is to ensure that 192 | * the system is loaded *beyond* capacity. This creates 193 | * pressure, which is then notified by the PSI subsystem 194 | * to our monitor.c program. 195 | * 196 | * */ 197 | void *cpu_loader_thread(void *data) { 198 | long tid = (long) data; 199 | time_t time1 = time(NULL); 200 | printf("CPU Loader thread %ld starting...\n", tid); 201 | 202 | while (1) { 203 | for (tid=0; tid < 50000000; tid++); 204 | time_t time2 = time(NULL); 205 | if (time2 - time1 >= CPU_LOAD_TIME_SECS) 206 | break; 207 | } 208 | return NULL; 209 | } 210 | 211 | void load_cpu() { 212 | /* Some crazy future-proofing when this runs 213 | * on a 1024-core Arm CPU. Sorry, Intel.*/ 214 | pthread_t threads[1024]; 215 | 216 | /* Get the number of installed CPUs and create as many +1 threads. */ 217 | long num_cpus = sysconf(_SC_NPROCESSORS_ONLN); 218 | for (long i=0; i < num_cpus + 1; i++) { 219 | pthread_create(&threads[i], NULL, cpu_loader_thread, (void *) i); 220 | } 221 | 222 | /* Wait for all threads to complete */ 223 | for (long i=0; i < num_cpus; i++) { 224 | pthread_join(threads[i], NULL); 225 | } 226 | } 227 | 228 | int main() { 229 | load_cpu(); 230 | load_disk(); 231 | return 0; 232 | } 233 | -------------------------------------------------------------------------------- /gifs/psi-by-example.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuveb/psi-by-example/2c85cc201955008ef90df3f5f4dbb3fe2c096279/gifs/psi-by-example.gif -------------------------------------------------------------------------------- /monitor.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define CPU_TRACKING_WINDOW_SECS 1 10 | #define IO_TRACKING_WINDOW_SECS 1 11 | #define CPU_TRIGGER_THRESHOLD_MS 100 12 | #define IO_TRIGGER_THRESHOLD_MS 100 13 | #define CPU_PRESSURE_FILE "/proc/pressure/cpu" 14 | #define IO_PRESSURE_FILE "/proc/pressure/io" 15 | #define FD_CPU_IDX 0 16 | #define FD_IO_IDX 1 17 | 18 | struct pollfd fds[2]; 19 | 20 | /* 21 | One function that prints the system call and the error details 22 | and then exits with error code 1. Non-zero meaning things didn't go well. 23 | */ 24 | void fatal_error(const char *syscall) 25 | { 26 | perror(syscall); 27 | exit(1); 28 | } 29 | 30 | /* 31 | * PSI allows programs to wait for events related to pressure stalls 32 | * via poll() so that they can avoid continuously polling files in the 33 | * /proc/pressure directory. 34 | * 35 | * We setup to be notified via poll for two types of PSI events, one 36 | * for CPU and the other for I/O. 37 | * 38 | * */ 39 | 40 | void setup_polling() { 41 | /* Let's first setup our CPU PSI trigger */ 42 | fds[FD_CPU_IDX].fd = open(CPU_PRESSURE_FILE, O_RDWR | O_NONBLOCK); 43 | if (fds[FD_CPU_IDX].fd < 0) 44 | fatal_error("open(): " CPU_PRESSURE_FILE); 45 | 46 | /* Next, our I/O PSI trigger */ 47 | fds[FD_IO_IDX].fd = open(IO_PRESSURE_FILE, O_RDWR | O_NONBLOCK); 48 | if (fds[FD_IO_IDX].fd < 0) 49 | fatal_error("open(): " IO_PRESSURE_FILE); 50 | 51 | fds[FD_CPU_IDX].events = fds[FD_IO_IDX].events = POLLPRI; 52 | 53 | char trigger[128]; 54 | snprintf(trigger, 128, "some %d %d", CPU_TRIGGER_THRESHOLD_MS * 1000, CPU_TRACKING_WINDOW_SECS * 1000000); 55 | printf("Trigger: %s\n", trigger); 56 | if (write(fds[FD_CPU_IDX].fd, trigger, strlen(trigger) + 1) < 0) 57 | fatal_error("write(): " CPU_PRESSURE_FILE); 58 | snprintf(trigger, 128, "some %d %d", IO_TRIGGER_THRESHOLD_MS * 1000, IO_TRACKING_WINDOW_SECS * 1000000); 59 | printf("Trigger: %s\n", trigger); 60 | if (write(fds[FD_IO_IDX].fd, trigger, strlen(trigger) + 1) < 0) 61 | fatal_error("write(): " IO_PRESSURE_FILE); 62 | } 63 | 64 | 65 | /* 66 | * This is the main function where we wait for notifications from 67 | * PSI. We increment 2 separate variables that track CPU and I/O 68 | * notification counts separately and print them. 69 | * */ 70 | 71 | void wait_for_notification() { 72 | int cpu_event_counter = 1; 73 | int io_event_counter = 1; 74 | 75 | while (1) { 76 | int n = poll(fds, 2, -1); 77 | if (n < 0) { 78 | fatal_error("poll()"); 79 | } 80 | 81 | for (int i = 0; i < 2; i++) { 82 | 83 | /* If the fd of the current iteration does not have any 84 | * events, move on to the next fd. 85 | * */ 86 | if (fds[i].revents == 0) 87 | continue; 88 | 89 | if (fds[i].revents & POLLERR) { 90 | fprintf(stderr, "Error: poll() event source is gone.\n"); 91 | exit(1); 92 | } 93 | if (fds[i].revents & POLLPRI) { 94 | if (i == FD_CPU_IDX) 95 | printf("CPU PSI event %d triggered.\n", cpu_event_counter++); 96 | else 97 | printf("I/O PSI event %d triggered.\n", io_event_counter++); 98 | } else { 99 | fprintf(stderr, "Unrecognized event: 0x%x.\n", fds[i].revents); 100 | exit(1); 101 | } 102 | } 103 | } 104 | } 105 | 106 | /* 107 | * We check for tell-tale signs of the running kernel supporting PSI. 108 | * Else, we print a friendly message and exit. 109 | * */ 110 | 111 | void check_basics() { 112 | struct stat st; 113 | int sret = stat(CPU_PRESSURE_FILE, &st); 114 | if (sret == -1) { 115 | fprintf(stderr, "Error! Your kernel does not expose pressure stall information.\n"); 116 | fprintf(stderr, "You may want to check if you have Linux Kernel v5.2+ with PSI enabled.\n"); 117 | exit(1); 118 | } 119 | } 120 | 121 | int main() { 122 | check_basics(); 123 | setup_polling(); 124 | wait_for_notification(); 125 | return 0; 126 | } --------------------------------------------------------------------------------