├── 2-io ├── 2-IO.pptx ├── demo-1 │ ├── cat_uring │ ├── readme.md │ └── main.c └── demo-0 │ ├── server │ ├── server │ └── main.c │ └── readme.md ├── 4-containers.pptx ├── 7-Building Linux.pptx ├── 8 - Extending Linux.pptx ├── 0-processes ├── 0-Processes.pptx ├── demo-1 │ ├── hello-world │ │ └── program.c │ ├── setuid │ │ └── program.c │ └── readme.md ├── demo-2 │ ├── readme.md │ └── allocator │ │ └── program.c ├── demo-0 │ ├── crawl-ps-tree.sh │ └── readme.md └── demo-3 │ ├── mmap │ ├── reader.c │ └── writer.c │ ├── readme.md │ └── pipe │ └── program.c ├── 3-networking ├── 3-Networking (Part I).pptx └── 3-Networking (Part II).pptx └── 1-blkdev-and-fs ├── 1-filesystems-blockdevices.pptx └── demo-0 └── readme.md /2-io/2-IO.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khenidak/on-linux/HEAD/2-io/2-IO.pptx -------------------------------------------------------------------------------- /4-containers.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khenidak/on-linux/HEAD/4-containers.pptx -------------------------------------------------------------------------------- /2-io/demo-1/cat_uring: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khenidak/on-linux/HEAD/2-io/demo-1/cat_uring -------------------------------------------------------------------------------- /7-Building Linux.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khenidak/on-linux/HEAD/7-Building Linux.pptx -------------------------------------------------------------------------------- /2-io/demo-0/server/server: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khenidak/on-linux/HEAD/2-io/demo-0/server/server 
-------------------------------------------------------------------------------- /8 - Extending Linux.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khenidak/on-linux/HEAD/8 - Extending Linux.pptx -------------------------------------------------------------------------------- /0-processes/0-Processes.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khenidak/on-linux/HEAD/0-processes/0-Processes.pptx -------------------------------------------------------------------------------- /3-networking/3-Networking (Part I).pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khenidak/on-linux/HEAD/3-networking/3-Networking (Part I).pptx -------------------------------------------------------------------------------- /3-networking/3-Networking (Part II).pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khenidak/on-linux/HEAD/3-networking/3-Networking (Part II).pptx -------------------------------------------------------------------------------- /1-blkdev-and-fs/1-filesystems-blockdevices.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/khenidak/on-linux/HEAD/1-blkdev-and-fs/1-filesystems-blockdevices.pptx -------------------------------------------------------------------------------- /0-processes/demo-1/hello-world/program.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // How many sys call this program would do? 
4 | int main() { 5 | printf("Hello, World!"); 6 | return 0; 7 | } 8 | -------------------------------------------------------------------------------- /2-io/demo-0/readme.md: -------------------------------------------------------------------------------- 1 | # build server 2 | 3 | The example is a simple epoll loop that reads from n sockets. You can extend it for write too. 4 | 5 | ``` 6 | gcc main.c -o server 7 | ``` 8 | 9 | # run server 10 | 11 | ``` 12 | ./server 5505 13 | ``` 14 | 15 | run test clients by running multiple concurrent 16 | 17 | ``` 18 | echo $(date) | nc localhost 5505 19 | ``` 20 | 21 | -------------------------------------------------------------------------------- /2-io/demo-1/readme.md: -------------------------------------------------------------------------------- 1 | # note 2 | the example is based on https://raw.githubusercontent.com/shuveb/io_uring-by-example/master/02_cat_uring/main.c 3 | 4 | 5 | a good write up on io_uring and supports the above example here: https://unixism.net/2020/04/io-uring-by-example-part-1-introduction/ 6 | 7 | # build 8 | 9 | ``` 10 | gcc main.c -o cat_uring 11 | ``` 12 | 13 | # run 14 | 15 | ``` 16 | cat_uring 17 | ``` 18 | -------------------------------------------------------------------------------- /0-processes/demo-1/setuid/program.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | int main( void ) 8 | { 9 | uid_t ruid, euid, suid; 10 | 11 | printf( "my userid is %d\n", getuid() ); 12 | 13 | if (getresuid(&ruid, &euid, &suid) < 0) 14 | perror("failed to get real user id"); 15 | 16 | 17 | printf( "my effective userid is %d\n", euid ); 18 | 19 | return EXIT_SUCCESS; 20 | } 21 | -------------------------------------------------------------------------------- /0-processes/demo-2/readme.md: -------------------------------------------------------------------------------- 1 | ## demo - 20 - memory 
allocation 2 | 3 | build the program 4 | ``` 5 | gcc -Wall ./program.c -o program 6 | ``` 7 | 8 | 9 | whenever it asks you check memory run (on a separate session): `ps aux | grep -E '(USER|program)'` 10 | 11 | while the program is running run `cat /proc//maps` to explore the memory map for this process. this has a lot more details https://gist.github.com/CMCDragonkai/10ab53654b2aa6ce55c11cfc5b2432a4 12 | 13 | the key finding: 14 | each row is a memory with the following: 15 | start-end-offsets {permission} {file map offset if applicable} {device if file mapped} {inode if file mapped} {path} 16 | 17 | 18 | vdso = virtual dynamic shared object (kernel) 19 | vsyscall = unprivileged syscall (that does not require raising irq) such as gettimeofday(); 20 | vvar = variables declared by kernel visible in userspace. 21 | 22 | 23 | note: while we allocate memory in unaligned bytes. memory works in pages. os page size can be identified by `getconf PAGESIZE` 24 | -------------------------------------------------------------------------------- /0-processes/demo-0/crawl-ps-tree.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | set -o pipefail 4 | 5 | 6 | currentPid="$$" 7 | parentPid="${PPID}" 8 | 9 | 10 | formatOutput(){ 11 | local thisPid=$1 12 | local parentPid=2 13 | 14 | echo "* process: ${thisPid}" 15 | echo "** runs: $(file /proc/${thisPid}/exe)" 16 | echo "** cmd: $(cat /proc/${thisPid}/cmdline | tr -d '\0')" 17 | echo "** parent: ${parentPid}" 18 | echo "*******" 19 | } 20 | 21 | 22 | 23 | formatOutput "${currentPid}" "${parentPid}" 24 | # walk the tree 25 | for (( ; ; )) 26 | do 27 | 28 | currentPid="${parentPid}" 29 | if [[ "$currentPid" == "1" ]]; then 30 | echo "* process: ${currentPid}" 31 | echo "** runs: $(file /proc/${currentPid}/exe)" 32 | echo "** cmd: $(cat /proc/${currentPid}/cmdline | tr -d '\0')" 33 | echo "** at root, exiting.." 
34 | break 35 | fi 36 | 37 | parentPid="$(cat /proc/${currentPid}/status | grep PPid | awk '{print $2}')" 38 | formatOutput "${currentPid}" "${parentPid}" 39 | done 40 | 41 | 42 | -------------------------------------------------------------------------------- /0-processes/demo-3/mmap/reader.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0) 10 | 11 | const char* shared_mem_path="/my-mmmap"; 12 | 13 | 14 | 15 | int 16 | main() 17 | { 18 | 19 | size_t total_size = sizeof(size_t) * 2; /* lock and counter*/ 20 | size_t *shared_mem = NULL; 21 | 22 | int fd = shm_open(shared_mem_path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); 23 | if (fd == -1) errExit("shm_open"); 24 | 25 | 26 | shared_mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 27 | if (shared_mem == MAP_FAILED) errExit("mmap"); 28 | 29 | for(;;) 30 | { 31 | while(0 == __sync_bool_compare_and_swap (shared_mem, 0, 1)) 32 | { 33 | sleep(1); 34 | } 35 | // get counter 36 | size_t counter = *(shared_mem + 1); 37 | 38 | printf("counter set to %lu \n", counter); 39 | 40 | // release lock 41 | *shared_mem = 0; 42 | sleep(1); 43 | } 44 | 45 | return 0; 46 | } 47 | -------------------------------------------------------------------------------- /0-processes/demo-2/allocator/program.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | int main() { 8 | printf("main is at %p\n", main); 9 | printf("end of data is at:%p\n", sbrk(0)); 10 | 11 | void *all; 12 | char string[4]; 13 | int *somewhereInTheMiddle; 14 | 15 | 16 | printf("we allocated stack vars and end of data is at:%p\n", sbrk(0)); 17 | printf("data is at:%p\n", sbrk(0)); 18 | 19 | 20 | printf("waiting\n"); 21 | printf("allocating 1 GB\n"); 22 | all = malloc(1024 * 
1024 * 1024); 23 | if (all == NULL) perror("failed to allocate 1gb"); 24 | 25 | printf("waiting while you check the memory ..\n"); 26 | scanf("%1s", string); 27 | 28 | // let us set a mem somewhere in the middle 29 | printf("touch the content in the middle page \n"); 30 | somewhereInTheMiddle = (int *) all + (1024 * 1024 ); 31 | *somewhereInTheMiddle = ~0; 32 | 33 | printf("waiting while you check the memory ..\n"); 34 | scanf("%1s", string); 35 | 36 | 37 | printf("set all \n"); 38 | memset(all, ~0, 1024 * 1024 * 1024); 39 | 40 | printf("waiting while you check the memory ..\n"); 41 | scanf("%1s", string); 42 | 43 | free(all); 44 | return 0; 45 | } 46 | -------------------------------------------------------------------------------- /0-processes/demo-1/readme.md: -------------------------------------------------------------------------------- 1 | ## demo - 10 - syscalls 2 | 3 | explore the ultra basic hello-world 4 | 5 | ``` 6 | cd ./0-processes/demo-1/hello-world 7 | cat ./program.c 8 | ``` 9 | 10 | how many syscalls do you think this program will make? 11 | 12 | build it 13 | 14 | ``` 15 | gcc program.c -o ./program 16 | ``` 17 | 18 | you can test where the program spent its time using 19 | 20 | ``` 21 | time ./program 22 | ``` 23 | 24 | because the program spent very little time in kernel space you get 0.0s in system. That *does not* mean the program didn't spend time in kernel space. 
try running 25 | 26 | ``` 27 | strace ./program 28 | ``` 29 | 30 | the output contains all the syscalls executed by this running process 31 | 32 | 33 | 34 | ## demo - 11 - setuid 35 | 36 | build and run the program 37 | 38 | ``` 39 | cd 0-processes/demo-1/setuid/ 40 | gcc ./program.c -o program 41 | ``` 42 | 43 | get the user by its id 44 | 45 | ``` 46 | id -nu 1000 47 | ``` 48 | 49 | 50 | Now change the owner of the file 51 | ``` 52 | sudo chown root ./program 53 | ``` 54 | 55 | then set the setuid bit (makes the program run with the effective user id of the file's owner, here root, instead of the user who launched it) 56 | 57 | ``` 58 | sudo chmod u+s ./program 59 | ``` 60 | 61 | rerun the program 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /0-processes/demo-3/mmap/writer.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0) 10 | 11 | const char* shared_mem_path="/my-mmmap"; 12 | 13 | 14 | 15 | int 16 | main() 17 | { 18 | 19 | size_t total_size = sizeof(size_t) * 2; /* lock and counter*/ 20 | size_t *shared_mem = NULL; 21 | 22 | int fd = shm_open(shared_mem_path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); 23 | if (fd == -1) errExit("shm_open"); 24 | 25 | if (ftruncate(fd, total_size) == -1) 26 | errExit("ftruncate"); 27 | 28 | 29 | shared_mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 30 | if (shared_mem == MAP_FAILED) errExit("mmap"); 31 | 32 | int counter = *(shared_mem + 1); 33 | for(; counter < 999; counter++) 34 | { 35 | while(0 == __sync_bool_compare_and_swap (shared_mem, 0, 1)) 36 | { 37 | sleep(1); 38 | } 39 | // set counter 40 | *(shared_mem + 1) = counter; 41 | 42 | printf("counter set to %d \n", counter); 43 | 44 | // release lock 45 | *shared_mem = 0; 46 | sleep(1); 47 | } 48 | 49 | printf("counter done!"); 50 | return 0; 51 | } 52 | 
-------------------------------------------------------------------------------- /0-processes/demo-3/readme.md: -------------------------------------------------------------------------------- 1 | ## demo - 31 - memory maps 2 | 3 | the reader/writer are a basic producer/consumer pair using mapped memory as the communication medium 4 | 5 | build and run the writer 6 | 7 | ``` 8 | gcc -Wall writer.c -o writer -lrt # need to link to rt posix extension 9 | ./writer 10 | ``` 11 | 12 | writer creates a named shared memory object that consists of two unsigned longs. one is used 13 | for locking (with a compare-and-swap call); the other is used as a simple counter. 14 | 15 | build and run the reader 16 | 17 | ``` 18 | gcc -Wall reader.c -o reader -lrt 19 | ./reader 20 | ``` 21 | 22 | reader tries to open an existing shared memory object, map it into its own memory, and read the counter while holding the lock. 23 | 24 | > in theory we don't need the lock since this is x64 and we are well under the boundaries of torn read/write. *but* it is always a good idea to have a lock around shared objects. 25 | 26 | 27 | 28 | ## demo - 32 - shared memory as storage 29 | 30 | 1. stop both reader and writer. 31 | 2. init the counter from the shared memory instead of `0`. 32 | 3. build and run the writer 33 | 4. stop the writer 34 | 5. run it again 35 | 36 | a shared memory object's life cycle is detached from the process that created it. This includes the data that sits in this memory. 
37 | 38 | -------------------------------------------------------------------------------- /0-processes/demo-3/pipe/program.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | // code is based on man page example: https://man7.org/linux/man-pages/man2/pipe.2.html 10 | 11 | int 12 | main(int argc, char *argv[]) 13 | { 14 | int pipefd[2]; 15 | pid_t cpid; 16 | char buf; 17 | 18 | if (argc != 2) { 19 | fprintf(stderr, "Usage: %s \n", argv[0]); 20 | exit(EXIT_FAILURE); 21 | } 22 | 23 | if (pipe(pipefd) == -1) { 24 | perror("pipe"); 25 | exit(EXIT_FAILURE); 26 | } 27 | 28 | printf("parent pid:%d\n", getpid()); 29 | cpid = fork(); 30 | if (cpid == -1) { 31 | perror("fork"); 32 | exit(EXIT_FAILURE); 33 | } 34 | 35 | if (cpid == 0) { /* Child reads from pipe */ 36 | printf("child forked to pid:%d\n", getpid()); 37 | close(pipefd[1]); /* Close unused write end */ 38 | 39 | while (read(pipefd[0], &buf, 1) > 0) 40 | write(STDOUT_FILENO, &buf, 1); 41 | 42 | write(STDOUT_FILENO, "\n", 1); 43 | 44 | printf("child CLOSING pid:%d\n", getpid()); 45 | close(pipefd[0]); 46 | _exit(EXIT_SUCCESS); 47 | 48 | } else { /* Parent writes argv[1] to pipe */ 49 | close(pipefd[0]); /* Close unused read end */ 50 | write(pipefd[1], argv[1], strlen(argv[1])); 51 | close(pipefd[1]); /* Reader will see EOF */ 52 | wait(NULL); /* Wait for child DO NOT CREATE ZOMBIE https://man7.org/linux/man-pages/man2/wait.2.html */ 53 | printf("parent CLOSING pid:%d\n", getpid()); 54 | exit(EXIT_SUCCESS); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /0-processes/demo-0/readme.md: -------------------------------------------------------------------------------- 1 | # demo - 00 - exploring processes 2 | 3 | ## explore processes with ps command 4 | 5 | ``` 6 | sudo ps -auxww 7 | ``` 8 | 9 | ### notice: 10 | - pid (second column) is process id 
11 | - Process STAT field 12 | 13 | 14 | ## demo - 01 - explore process tree 15 | 16 | ``` 17 | sudo ps -aux --forest 18 | ``` 19 | 20 | 21 | ### notice: 22 | 23 | how processes start: each process has a parent 24 | 25 | 26 | ## demo - 02 - use proc fs 27 | 28 | > don't worry about proc fs details, we are covering this in the next session (File Systems) 29 | 30 | 31 | ``` 32 | sudo ls -lah /proc/$$ # $$ is current process id (where do you think it points to?) 33 | ``` 34 | 35 | ### notice 36 | - the cwd, root and exe are symlinks 37 | 38 | 39 | ## demo - 03 - scriptable proc fs 40 | 41 | ``` 42 | ./crawl-ps-tree.sh 43 | ``` 44 | 45 | 46 | as an additional exercise check out the content of an elf file. this https://linux-audit.com/elf-binaries-on-linux-understanding-and-analysis/ has a lot more details 47 | 48 | ## demo - 03 - limits 49 | check limits of your system 50 | ``` 51 | ulimit -aH # all hard limits 52 | ulimit -aS # all soft limits 53 | ``` 54 | 55 | to check the limits applied to your shell process 56 | ``` 57 | cat /proc/$$/limits 58 | ``` 59 | 60 | ## demo - 04 -- caps 61 | 62 | run 63 | ``` 64 | ## or for pid 65 | cat /proc/$$/status | grep Cap 66 | 67 | ## decode 68 | cat /proc/$$/status | grep Cap | xargs -I {} echo {} | awk '{print $2}'| xargs -I {} capsh --decode={} 69 | 70 | ## you can also do 71 | capsh --print 72 | ``` 73 | 74 | > `inherited` v `permitted` v `effective` v `bound` v `ambient` are discussed in a different session 75 | 76 | -------------------------------------------------------------------------------- /1-blkdev-and-fs/demo-0/readme.md: -------------------------------------------------------------------------------- 1 | # demo - 00 - a tour around blkdev 2 | 3 | 4 | to list all blk devices use 5 | 6 | ``` 7 | lsblk 8 | ``` 9 | 10 | this command lists all the blkdev (and their partitions) available on the system. Note: 11 | 1. devices don't need to be mounted/formatted, these are the devices that are *plugged* into the system 12 | 2. 
notice the major:minor (and also where minor is assigned for partitions - same major) 13 | 14 | 15 | pick one of the block devices and do 16 | 17 | ``` 18 | cat /sys/block/<device>/queue/scheduler 19 | ``` 20 | 21 | this will tell you 1) all the schedulers in the system and 2) which one is assigned to this block device. Note that the scheduler applies to all partitions (you cannot select a scheduler per partition). 22 | 23 | 24 | files that identify write back throttling are wbt_lat_usec and wb_window_usec. Other files that identify sector, max, min etc are there in .../queue; this https://www.kernel.org/doc/html/latest/block/queue-sysfs.html covers them in good detail 25 | 26 | # demo - 01 - file attributes 27 | 28 | > it is unlikely that your file system has xattrs, unless you have enabled and correctly configured 29 | selinux. for reference, to get xattrs from the shell use 30 | 31 | ``` 32 | getfattr -d -m 33 | ``` 34 | 35 | to get standard attributes (which are available for any file on any file system) 36 | 37 | ``` 38 | stat <file> 39 | ``` 40 | 41 | notice the inode number, the file type, and the blocks. The device is the hex- and decimal-encoded major:minor 42 | 43 | try the following 44 | 45 | ``` 46 | stat /dev/<device> 47 | ``` 48 | 49 | can you tell the difference? 50 | 51 | let us test this with symlinks and hard links 52 | 53 | ``` 54 | cd /tmp 55 | echo "Hello, World!" >> ./hello 56 | stat ./hello # take a note of the inode # 57 | # symlink 58 | ln -s ./hello h_s 59 | stat h_s # notice how they carry different inode #? 
60 | # hardlink 61 | ln ./hello h_h 62 | stat h_h # notice how it uses the same inode # as the original file 63 | ``` 64 | 65 | # demo - 02 - file systems 66 | 67 | One of the way to learn about inode stats for file system is 68 | ``` 69 | df -i /dev/ 70 | ``` 71 | 72 | you can also get superblock by 73 | 74 | ``` 75 | dumpe2fs -h /dev/ 76 | # if you want to learn where they -super blocks - are saved 77 | sudo dumpe2fs /dev/sda | grep -i superblock 78 | ``` 79 | 80 | dropping dentry and inode cache can be done via 81 | 82 | ``` 83 | echo 2 > /proc/sys/vm/drop_caches 84 | ``` 85 | 86 | running `slabtop` will get you all the information about how big inode/dentry caches are 87 | 88 | 89 | understanding your i/o utilization/latency 90 | run 91 | ``` 92 | iostat -x -p 93 | ``` 94 | 95 | The output is explained here: https://manpages.debian.org/testing/sysstat/iostat.1.en.html -- FYI: the tool parses /sys/block//stat 96 | -------------------------------------------------------------------------------- /2-io/demo-0/server/main.c: -------------------------------------------------------------------------------- 1 | // modified example from: https://github.com/millken/c-example/blob/master/epoll-example.c 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #define MAXEVENTS 64 14 | 15 | static int 16 | make_socket_non_blocking (int sfd) 17 | { 18 | int flags; 19 | //when setting flags, you have to always read-modify-write 20 | if((flags = fcntl (sfd, F_GETFL, 0)) == -1) 21 | { 22 | perror ("fcntl"); 23 | return -1; 24 | } 25 | 26 | flags |= O_NONBLOCK; 27 | if(fcntl (sfd, F_SETFL, flags) == -1) 28 | { 29 | perror ("fcntl"); 30 | return -1; 31 | } 32 | 33 | return 0; 34 | } 35 | 36 | static int 37 | create_and_bind (char *port) 38 | { 39 | struct addrinfo hints; 40 | struct addrinfo *result, *rp; 41 | int s, sfd; 42 | 43 | memset (&hints, 0, sizeof (struct addrinfo)); 44 | 
hints.ai_family = AF_UNSPEC; /* Return IPv4 and IPv6 choices */ 45 | hints.ai_socktype = SOCK_STREAM; /* We want a TCP socket */ 46 | hints.ai_flags = AI_PASSIVE; /* All interfaces */ 47 | 48 | if((s = getaddrinfo(NULL, port, &hints, &result)) != 0){ 49 | fprintf (stderr, "getaddrinfo: %s\n", gai_strerror (s)); 50 | return -1; 51 | } 52 | 53 | // bind to any of the addresses 54 | for (rp = result; rp != NULL; rp = rp->ai_next) 55 | { 56 | // can we create a socket on this address? 57 | if((sfd = socket (rp->ai_family, rp->ai_socktype, rp->ai_protocol)) == -1) continue; 58 | 59 | // can we bind? 60 | if(bind(sfd, rp->ai_addr, rp->ai_addrlen) == 0) break; 61 | 62 | close (sfd); 63 | } 64 | 65 | // couldn't bind. return fail 66 | if (rp == NULL) 67 | { 68 | fprintf (stderr, "Could not bind\n"); 69 | return -1; 70 | } 71 | 72 | freeaddrinfo(result); 73 | return sfd; 74 | } 75 | 76 | int 77 | main (int argc, char *argv[]) 78 | { 79 | int sfd, s; 80 | int efd; 81 | struct epoll_event event; 82 | struct epoll_event *events; 83 | 84 | if (argc != 2) 85 | { 86 | fprintf (stderr, "Usage: %s [port]\n", argv[0]); 87 | exit (EXIT_FAILURE); 88 | } 89 | 90 | sfd = create_and_bind (argv[1]); 91 | if (sfd == -1)abort(); 92 | 93 | s = make_socket_non_blocking (sfd); 94 | if (s == -1)abort(); 95 | 96 | s = listen(sfd, SOMAXCONN); 97 | if (s == -1) 98 | { 99 | perror ("listen"); 100 | abort (); 101 | } 102 | 103 | fprintf (stdout, "Listing on: %s \n", argv[1]); 104 | efd = epoll_create1 (0); 105 | if (efd == -1) 106 | { 107 | perror ("epoll_create"); 108 | abort (); 109 | } 110 | 111 | event.data.fd = sfd; 112 | event.events = EPOLLIN | EPOLLET; 113 | s = epoll_ctl(efd, EPOLL_CTL_ADD, sfd, &event); 114 | if (s == -1) 115 | { 116 | perror ("epoll_ctl"); 117 | abort (); 118 | } 119 | 120 | /* Buffer where events are returned */ 121 | events = calloc (MAXEVENTS, sizeof event); 122 | 123 | /* The event loop */ 124 | while (1) 125 | { 126 | int n, i; 127 | 128 | printf("about to wait\n"); 
129 | n = epoll_wait(efd, events, MAXEVENTS, -1); 130 | printf("got events %d\n", n); 131 | for (i = 0; i < n; i++) 132 | { 133 | if ((events[i].events & EPOLLERR) || (events[i].events & EPOLLHUP) || (!(events[i].events & EPOLLIN))) 134 | { 135 | /* An error has occured on this fd, or the socket is not 136 | ready for reading (why were we notified then?) */ 137 | fprintf (stderr, "epoll error\n"); 138 | close(events[i].data.fd); 139 | continue; 140 | } 141 | else if (sfd == events[i].data.fd) 142 | { 143 | /* We have a notification on the listening socket, which 144 | means one or more incoming connections. */ 145 | while (1) 146 | { 147 | struct sockaddr in_addr; 148 | socklen_t in_len; 149 | int infd; 150 | char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV]; 151 | 152 | in_len = sizeof in_addr; 153 | infd = accept(sfd, &in_addr, &in_len); 154 | if (infd == -1) 155 | { 156 | if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) 157 | { 158 | /* We have processed all incoming 159 | connections. */ 160 | break; 161 | } 162 | else 163 | { 164 | perror ("accept"); 165 | abort(); 166 | break; 167 | } 168 | } 169 | 170 | s = getnameinfo (&in_addr, in_len, 171 | hbuf, sizeof hbuf, 172 | sbuf, sizeof sbuf, 173 | NI_NUMERICHOST | NI_NUMERICSERV); 174 | if (s == 0) 175 | { 176 | printf("Accepted connection on descriptor %d (host=%s, port=%s)\n", infd, hbuf, sbuf); 177 | } 178 | 179 | /* Make the incoming socket non-blocking and add it to the 180 | list of fds to monitor. */ 181 | s = make_socket_non_blocking (infd); 182 | if (s == -1) abort(); 183 | 184 | event.data.fd = infd; 185 | event.events = EPOLLIN | EPOLLET; 186 | s = epoll_ctl (efd, EPOLL_CTL_ADD, infd, &event); 187 | if (s == -1) 188 | { 189 | perror ("epoll_ctl"); 190 | abort (); 191 | } 192 | } 193 | continue; 194 | } 195 | else 196 | { 197 | /* We have data on the fd waiting to be read. Read and 198 | display it. 
We must read whatever data is available 199 | completely, as we are running in edge-triggered mode 200 | and won't get a notification again for the same 201 | data. */ 202 | int done = 0; 203 | 204 | while (1) 205 | { 206 | ssize_t count; 207 | char buf[512]; 208 | 209 | count = read (events[i].data.fd, buf, sizeof buf); 210 | if (count == -1) 211 | { 212 | /* If errno == EAGAIN, that means we have read all 213 | data. So go back to the main loop. */ 214 | if (errno != EAGAIN) 215 | { 216 | perror ("read"); 217 | done = 1; 218 | } 219 | break; 220 | } 221 | else if (count == 0) 222 | { 223 | /* End of file. The remote has closed the 224 | connection. */ 225 | done = 1; 226 | break; 227 | } 228 | 229 | /* Write the buffer to standard output */ 230 | s = write (1, buf, count); 231 | if (s == -1) 232 | { 233 | perror ("write"); 234 | abort (); 235 | } 236 | /* THINK: How can we nonblock write to this socket? */ 237 | } 238 | 239 | if (done) 240 | { 241 | printf ("Closed connection on descriptor %d\n", events[i].data.fd); 242 | 243 | /* Closing the descriptor will make epoll remove it 244 | from the set of descriptors which are monitored. */ 245 | close (events[i].data.fd); 246 | } 247 | } 248 | } 249 | } 250 | 251 | free (events); 252 | close (sfd); 253 | return EXIT_SUCCESS; 254 | } 255 | 256 | -------------------------------------------------------------------------------- /2-io/demo-1/main.c: -------------------------------------------------------------------------------- 1 | // code is based on https://unixism.net/2020/04/io-uring-by-example-part-1-introduction/ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | /* If your compilation fails because the header file below is missing, 15 | * your kernel is probably too old to support io_uring. 
16 | * */ 17 | #include 18 | 19 | #define QUEUE_DEPTH 1 20 | #define BLOCK_SZ 1024 21 | 22 | /* This is x86 specific */ 23 | #define read_barrier() __asm__ __volatile__("":::"memory") 24 | #define write_barrier() __asm__ __volatile__("":::"memory") 25 | 26 | struct app_io_sq_ring { 27 | unsigned *head; 28 | unsigned *tail; 29 | unsigned *ring_mask; 30 | unsigned *ring_entries; 31 | unsigned *flags; 32 | unsigned *array; 33 | }; 34 | 35 | struct app_io_cq_ring { 36 | unsigned *head; 37 | unsigned *tail; 38 | unsigned *ring_mask; 39 | unsigned *ring_entries; 40 | struct io_uring_cqe *cqes; 41 | }; 42 | 43 | struct submitter { 44 | int ring_fd; 45 | struct app_io_sq_ring sq_ring; 46 | struct io_uring_sqe *sqes; 47 | struct app_io_cq_ring cq_ring; 48 | }; 49 | 50 | struct file_info { 51 | off_t file_sz; 52 | struct iovec iovecs[]; /* Referred by readv/writev */ 53 | }; 54 | 55 | /* 56 | * This code is written in the days when io_uring-related system calls are not 57 | * part of standard C libraries. So, we roll our own system call wrapper 58 | * functions. 59 | * */ 60 | 61 | int io_uring_setup(unsigned entries, struct io_uring_params *p) 62 | { 63 | return (int) syscall(__NR_io_uring_setup, entries, p); 64 | } 65 | 66 | int io_uring_enter(int ring_fd, unsigned int to_submit, 67 | unsigned int min_complete, unsigned int flags) 68 | { 69 | return (int) syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete, 70 | flags, NULL, 0); 71 | } 72 | 73 | /* 74 | * Returns the size of the file whose open file descriptor is passed in. 75 | * Properly handles regular file and block devices as well. Pretty. 
76 | * */ 77 | 78 | off_t get_file_size(int fd) { 79 | struct stat st; 80 | 81 | if(fstat(fd, &st) < 0) { 82 | perror("fstat"); 83 | return -1; 84 | } 85 | if (S_ISBLK(st.st_mode)) { 86 | unsigned long long bytes; 87 | if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) { 88 | perror("ioctl"); 89 | return -1; 90 | } 91 | return bytes; 92 | } else if (S_ISREG(st.st_mode)) 93 | return st.st_size; 94 | 95 | return -1; 96 | } 97 | 98 | /* 99 | * io_uring requires a lot of setup which looks pretty hairy, but isn't all 100 | * that difficult to understand. Because of all this boilerplate code, 101 | * io_uring's author has created liburing, which is relatively easy to use. 102 | * However, you should take your time and understand this code. It is always 103 | * good to know how it all works underneath. Apart from bragging rights, 104 | * it does offer you a certain strange geeky peace. 105 | * */ 106 | 107 | int app_setup_uring(struct submitter *s) { 108 | struct app_io_sq_ring *sring = &s->sq_ring; 109 | struct app_io_cq_ring *cring = &s->cq_ring; 110 | struct io_uring_params p; 111 | void *sq_ptr, *cq_ptr; 112 | 113 | /* 114 | * We need to pass in the io_uring_params structure to the io_uring_setup() 115 | * call zeroed out. We could set any flags if we need to, but for this 116 | * example, we don't. 117 | * */ 118 | memset(&p, 0, sizeof(p)); 119 | s->ring_fd = io_uring_setup(QUEUE_DEPTH, &p); 120 | if (s->ring_fd < 0) { 121 | perror("io_uring_setup"); 122 | return 1; 123 | } 124 | 125 | /* 126 | * io_uring communication happens via 2 shared kernel-user space ring buffers, 127 | * which can be jointly mapped with a single mmap() call in recent kernels. 128 | * While the completion queue is directly manipulated, the submission queue 129 | * has an indirection array in between. We map that in as well. 
130 | * */ 131 | 132 | int sring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned); 133 | int cring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe); 134 | 135 | /* In kernel version 5.4 and above, it is possible to map the submission and 136 | * completion buffers with a single mmap() call. Rather than check for kernel 137 | * versions, the recommended way is to just check the features field of the 138 | * io_uring_params structure, which is a bit mask. If the 139 | * IORING_FEAT_SINGLE_MMAP is set, then we can do away with the second mmap() 140 | * call to map the completion ring. 141 | * */ 142 | if (p.features & IORING_FEAT_SINGLE_MMAP) { 143 | if (cring_sz > sring_sz) { 144 | sring_sz = cring_sz; 145 | } 146 | cring_sz = sring_sz; 147 | } 148 | 149 | /* Map in the submission and completion queue ring buffers. 150 | * Older kernels only map in the submission queue, though. 151 | * */ 152 | sq_ptr = mmap(0, sring_sz, PROT_READ | PROT_WRITE, 153 | MAP_SHARED | MAP_POPULATE, 154 | s->ring_fd, IORING_OFF_SQ_RING); 155 | if (sq_ptr == MAP_FAILED) { 156 | perror("mmap"); 157 | return 1; 158 | } 159 | 160 | if (p.features & IORING_FEAT_SINGLE_MMAP) { 161 | cq_ptr = sq_ptr; 162 | } else { 163 | /* Map in the completion queue ring buffer in older kernels separately */ 164 | cq_ptr = mmap(0, cring_sz, PROT_READ | PROT_WRITE, 165 | MAP_SHARED | MAP_POPULATE, 166 | s->ring_fd, IORING_OFF_CQ_RING); 167 | if (cq_ptr == MAP_FAILED) { 168 | perror("mmap"); 169 | return 1; 170 | } 171 | } 172 | /* Save useful fields in a global app_io_sq_ring struct for later 173 | * easy reference */ 174 | sring->head = sq_ptr + p.sq_off.head; 175 | sring->tail = sq_ptr + p.sq_off.tail; 176 | sring->ring_mask = sq_ptr + p.sq_off.ring_mask; 177 | sring->ring_entries = sq_ptr + p.sq_off.ring_entries; 178 | sring->flags = sq_ptr + p.sq_off.flags; 179 | sring->array = sq_ptr + p.sq_off.array; 180 | 181 | /* Map in the submission queue entries array */ 182 | s->sqes = mmap(0, 
p.sq_entries * sizeof(struct io_uring_sqe), 183 | PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, 184 | s->ring_fd, IORING_OFF_SQES); 185 | if (s->sqes == MAP_FAILED) { 186 | perror("mmap"); 187 | return 1; 188 | } 189 | 190 | /* Save useful fields in a global app_io_cq_ring struct for later 191 | * easy reference */ 192 | cring->head = cq_ptr + p.cq_off.head; 193 | cring->tail = cq_ptr + p.cq_off.tail; 194 | cring->ring_mask = cq_ptr + p.cq_off.ring_mask; 195 | cring->ring_entries = cq_ptr + p.cq_off.ring_entries; 196 | cring->cqes = cq_ptr + p.cq_off.cqes; 197 | 198 | return 0; 199 | } 200 | 201 | /* 202 | * Output a string of characters of len length to stdout. 203 | * We use buffered output here to be efficient, 204 | * since we need to output character-by-character. 205 | * */ 206 | void output_to_console(char *buf, int len) { 207 | while (len--) { 208 | fputc(*buf++, stdout); 209 | } 210 | } 211 | 212 | /* 213 | * Read from completion queue. 214 | * In this function, we read completion events from the completion queue, get 215 | * the data buffer that will have the file data and print it to the console. 216 | * */ 217 | 218 | void read_from_cq(struct submitter *s) { 219 | struct file_info *fi; 220 | struct app_io_cq_ring *cring = &s->cq_ring; 221 | struct io_uring_cqe *cqe; 222 | unsigned head, reaped = 0; 223 | 224 | head = *cring->head; 225 | 226 | do { 227 | read_barrier(); 228 | /* 229 | * Remember, this is a ring buffer. If head == tail, it means that the 230 | * buffer is empty. 
231 | * */ 232 | if (head == *cring->tail) 233 | break; 234 | 235 | /* Get the entry */ 236 | cqe = &cring->cqes[head & *s->cq_ring.ring_mask]; 237 | fi = (struct file_info*) cqe->user_data; 238 | if (cqe->res < 0) 239 | fprintf(stderr, "Error: %s\n", strerror(abs(cqe->res))); 240 | 241 | int blocks = (int) fi->file_sz / BLOCK_SZ; 242 | if (fi->file_sz % BLOCK_SZ) blocks++; 243 | 244 | for (int i = 0; i < blocks; i++) 245 | output_to_console(fi->iovecs[i].iov_base, fi->iovecs[i].iov_len); 246 | 247 | head++; 248 | } while (1); 249 | 250 | *cring->head = head; 251 | write_barrier(); 252 | } 253 | /* 254 | * Submit to submission queue. 255 | * In this function, we submit requests to the submission queue. You can submit 256 | * many types of requests. Ours is going to be the readv() request, which we 257 | * specify via IORING_OP_READV. 258 | * 259 | * */ 260 | int submit_to_sq(char *file_path, struct submitter *s) { 261 | struct file_info *fi; 262 | 263 | int file_fd = open(file_path, O_RDONLY); 264 | if (file_fd < 0 ) { 265 | perror("open"); 266 | return 1; 267 | } 268 | 269 | struct app_io_sq_ring *sring = &s->sq_ring; 270 | unsigned index = 0, current_block = 0, tail = 0, next_tail = 0; 271 | 272 | off_t file_sz = get_file_size(file_fd); 273 | if (file_sz < 0) 274 | return 1; 275 | off_t bytes_remaining = file_sz; 276 | int blocks = (int) file_sz / BLOCK_SZ; 277 | if (file_sz % BLOCK_SZ) blocks++; 278 | 279 | fi = malloc(sizeof(*fi) + sizeof(struct iovec) * blocks); 280 | if (!fi) { 281 | fprintf(stderr, "Unable to allocate memory\n"); 282 | return 1; 283 | } 284 | fi->file_sz = file_sz; 285 | 286 | /* 287 | * For each block of the file we need to read, we allocate an iovec struct 288 | * which is indexed into the iovecs array. This array is passed in as part 289 | * of the submission. If you don't understand this, then you need to look 290 | * up how the readv() and writev() system calls work. 
291 | * */ 292 | while (bytes_remaining) { 293 | off_t bytes_to_read = bytes_remaining; 294 | if (bytes_to_read > BLOCK_SZ) 295 | bytes_to_read = BLOCK_SZ; 296 | 297 | fi->iovecs[current_block].iov_len = bytes_to_read; 298 | 299 | void *buf; 300 | if( posix_memalign(&buf, BLOCK_SZ, BLOCK_SZ)) { 301 | perror("posix_memalign"); 302 | return 1; 303 | } 304 | fi->iovecs[current_block].iov_base = buf; 305 | 306 | current_block++; 307 | bytes_remaining -= bytes_to_read; 308 | } 309 | 310 | /* Add our submission queue entry to the tail of the SQE ring buffer */ 311 | next_tail = tail = *sring->tail; 312 | next_tail++; 313 | read_barrier(); 314 | index = tail & *s->sq_ring.ring_mask; 315 | struct io_uring_sqe *sqe = &s->sqes[index]; 316 | sqe->fd = file_fd; 317 | sqe->flags = 0; 318 | sqe->opcode = IORING_OP_READV; 319 | sqe->addr = (unsigned long) fi->iovecs; 320 | sqe->len = blocks; 321 | sqe->off = 0; 322 | sqe->user_data = (unsigned long long) fi; 323 | sring->array[index] = index; 324 | tail = next_tail; 325 | 326 | /* Update the tail so the kernel can see it. */ 327 | if(*sring->tail != tail) { 328 | *sring->tail = tail; 329 | write_barrier(); 330 | } 331 | 332 | /* 333 | * Tell the kernel we have submitted events with the io_uring_enter() system 334 | * call. We also pass in the IOURING_ENTER_GETEVENTS flag which causes the 335 | * io_uring_enter() call to wait until min_complete events (the 3rd param) 336 | * complete. 
337 | * */ 338 | int ret = io_uring_enter(s->ring_fd, 1,1, 339 | IORING_ENTER_GETEVENTS); 340 | if(ret < 0) { 341 | perror("io_uring_enter"); 342 | return 1; 343 | } 344 | 345 | return 0; 346 | } 347 | 348 | int main(int argc, char *argv[]) { 349 | struct submitter *s; 350 | 351 | if (argc < 2) { 352 | fprintf(stderr, "Usage: %s \n", argv[0]); 353 | return 1; 354 | } 355 | 356 | s = malloc(sizeof(*s)); 357 | if (!s) { 358 | perror("malloc"); 359 | return 1; 360 | } 361 | memset(s, 0, sizeof(*s)); 362 | 363 | if(app_setup_uring(s)) { 364 | fprintf(stderr, "Unable to setup uring!\n"); 365 | return 1; 366 | } 367 | 368 | for (int i = 1; i < argc; i++) { 369 | if(submit_to_sq(argv[i], s)) { 370 | fprintf(stderr, "Error reading file\n"); 371 | return 1; 372 | } 373 | read_from_cq(s); 374 | } 375 | 376 | return 0; 377 | } 378 | --------------------------------------------------------------------------------