├── LICENSE ├── README.md ├── lab ├── 01_module_programming │ ├── 01_guide.pdf │ ├── 01_report.pdf │ ├── mod1 │ │ ├── Makefile │ │ └── mod1.c │ ├── mod2 │ │ ├── Makefile │ │ └── mod2.c │ ├── mod3 │ │ ├── Makefile │ │ └── mod3.c │ └── mod4 │ │ ├── Makefile │ │ └── mod4.c ├── 02_process_management │ ├── 02_guide.pdf │ ├── 02_report.pdf │ └── code │ │ ├── Makefile │ │ ├── base.c │ │ ├── core.c │ │ ├── fork.c │ │ ├── origin │ │ ├── base.c │ │ ├── core.c │ │ ├── fork.c │ │ └── sched.h │ │ ├── sched.h │ │ └── test.c ├── 03_memory_management │ ├── 03_guide.pdf │ ├── 03_report.pdf │ └── code │ │ ├── Makefile │ │ ├── mtest.c │ │ └── ref │ │ ├── mm.h │ │ ├── mm_types.h │ │ ├── pgtable-types.h │ │ └── pgtable.h ├── 04_file_system │ ├── 04_guide.pdf │ ├── 04_report.pdf │ └── code │ │ ├── Kconfig │ │ ├── Makefile │ │ ├── internal.h │ │ ├── mmap-nommu.c │ │ ├── ref │ │ └── fs.h │ │ ├── src │ │ ├── aa │ │ ├── bb │ │ └── ft │ │ ├── storage.c │ │ ├── super.c │ │ └── test.sh └── 05_syscall_hijack │ ├── 05_guide.pdf │ ├── 05_report.pdf │ └── code │ ├── Makefile │ ├── samples │ ├── arm64 │ │ ├── bench.o │ │ └── test.o │ └── x86-64 │ │ ├── bench.o │ │ └── test.o │ └── sys_clone_hook.c └── notes ├── LK_note_00.md ├── LK_note_01.md ├── LK_note_02.md ├── LK_note_03.md ├── LK_note_04.md ├── LK_note_05.md ├── LK_note_06.md ├── LK_note_07.md ├── LK_note_08.md ├── LK_note_09.md ├── LK_note_10.md ├── LK_note_11.md ├── LK_note_12.md ├── imgs ├── 0 │ ├── 1.png │ ├── 10.png │ ├── 11.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ └── 9.png ├── 1 │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ └── 6.png ├── 2 │ ├── 1.png │ ├── 2.png │ └── 3.png ├── 3 │ └── 1.png ├── 4 │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ └── 7.png ├── 5 │ ├── 1.png │ ├── 2.png │ ├── 3.png │ └── 4.png ├── 6 │ ├── 1.png │ └── 2.png ├── 7 │ ├── 1.png │ └── 2.png ├── 8 │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ 
├── 7.png │ └── 8.png ├── 9 │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ └── 6.png ├── 10 │ ├── 1.png │ ├── 10.png │ ├── 11.png │ ├── 12.png │ ├── 13.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ └── 9.png ├── 11 │ ├── 1.png │ └── 2.png └── 12 │ ├── 1.png │ ├── 10.png │ ├── 11.png │ ├── 12.png │ ├── 13.png │ ├── 14.png │ ├── 15.png │ ├── 16.png │ ├── 17.png │ ├── 18.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── 5.png │ ├── 6.png │ ├── 7.png │ ├── 8.png │ └── 9.png └── pdfs ├── linux_source.pdf └── linux_system_installed.pdf /README.md: -------------------------------------------------------------------------------- 1 | # Linux-Kernel-notes 2 | :electron: Notes and codes for Linux Kernel (SJTU-CS353) 3 | 4 | 🌟 *Please star this repo if it helps!* 5 | 6 | 🧐 *Please let me know in Issues if there exists any problems!* 7 | 8 | ## Lab 9 | 10 | - [x] 01: [Module Programming](https://github.com/zhliuworks/Linux-Kernel-notes/tree/master/lab/01_module_programming) 11 | - [x] 02: [Process Management](https://github.com/zhliuworks/Linux-Kernel-notes/tree/master/lab/02_process_management) 12 | - [x] 03: [Memory Management](https://github.com/zhliuworks/Linux-Kernel-notes/tree/master/lab/03_memory_management) 13 | - [x] 04: [File System](https://github.com/zhliuworks/Linux-Kernel-notes/tree/master/lab/04_file_system) 14 | - [x] 05: [Syscall Hijack](https://github.com/zhliuworks/Linux-Kernel-notes/tree/master/lab/05_syscall_hijack) 15 | 16 | ## Notes 17 | 18 | | Lecture | Content | 19 | | ------- | ------------------------------------------------------------ | 20 | | Lec. 1 | [Introduction to Linux Kernel](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_01.md) | 21 | | Lec. 2 | [Linux Booting](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_02.md) | 22 | | Lec. 
3 | [Module Programming and `/proc` Filesystem](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_03.md) | 23 | | Lec. 4 | [Process Management](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_04.md) | 24 | | Lec. 5 | [Process Management: Scheduling](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_05.md) | 25 | | * | [Process Management on Multi-processor System: Scheduling Domain](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_00.md#process-management-on-multi-processor-system-scheduling-domain) | 26 | | Lec. 6 | [Interrupt Handling](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_06.md) | 27 | | Lec. 7 | [Kernel Synchronization](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_07.md) | 28 | | Lec. 8 | [Symmetric Multiprocessing](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_08.md) | 29 | | Lec. 9 | [Memory Management: Addressing](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_09.md) | 30 | | Lec. 10 | [Memory Management: Methods](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_10.md) | 31 | | Lec. 11 | [Virtual File System](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_11.md) | 32 | | Lec. 
12 | [Linux File System Implementations](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_12.md) | 33 | | * | [Power Management: From Linux Kernel to Android](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_00.md#power-management-from-linux-kernel-to-android) | 34 | | * | [New Directions of Operating System Kernel](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/LK_note_00.md#new-directions-of-operating-system-kernel) | 35 | 36 | ## License 37 | 38 | [GPL-3.0 License](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/LICENSE) 39 | -------------------------------------------------------------------------------- /lab/01_module_programming/01_guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/lab/01_module_programming/01_guide.pdf -------------------------------------------------------------------------------- /lab/01_module_programming/01_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/lab/01_module_programming/01_report.pdf -------------------------------------------------------------------------------- /lab/01_module_programming/mod1/Makefile: -------------------------------------------------------------------------------- 1 | obj-m:=mod1.o 2 | KDIR:=/lib/modules/$(shell uname -r)/build 3 | PWD:=$(shell pwd) 4 | all: 5 | make -C $(KDIR) M=$(PWD) modules 6 | clean: 7 | make -C $(KDIR) M=$(PWD) clean 8 | -------------------------------------------------------------------------------- /lab/01_module_programming/mod1/mod1.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | static int __init mod1_init(void) { 6 | 
printk(KERN_INFO "Insert Module 1\n"); 7 | return 0; 8 | } 9 | 10 | static void __exit mod1_exit(void) { 11 | printk(KERN_INFO "Remove Module 1\n"); 12 | } 13 | 14 | module_init(mod1_init); 15 | module_exit(mod1_exit); 16 | -------------------------------------------------------------------------------- /lab/01_module_programming/mod2/Makefile: -------------------------------------------------------------------------------- 1 | obj-m:=mod2.o 2 | KDIR:=/lib/modules/$(shell uname -r)/build 3 | PWD:=$(shell pwd) 4 | all: 5 | make -C $(KDIR) M=$(PWD) modules 6 | clean: 7 | make -C $(KDIR) M=$(PWD) clean 8 | -------------------------------------------------------------------------------- /lab/01_module_programming/mod2/mod2.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | static int a; 7 | static char* b; 8 | static int n_para = 1; // 4 parameters 9 | static int c[4]; // 4 parameters 10 | 11 | module_param(a, int, 0644); 12 | module_param(b, charp, 0644); 13 | module_param_array(c, int, &n_para, 0644); 14 | 15 | static int __init mod2_init(void) { 16 | printk(KERN_INFO "Insert Module 2:\na = %d, b = %s, c = [%d, %d, %d, %d]\n", a, b, c[0], c[1], c[2], c[3]); 17 | return 0; 18 | } 19 | 20 | static void __exit mod2_exit(void) { 21 | printk(KERN_INFO "Remove Module 2\n"); 22 | } 23 | 24 | module_init(mod2_init); 25 | module_exit(mod2_exit); 26 | -------------------------------------------------------------------------------- /lab/01_module_programming/mod3/Makefile: -------------------------------------------------------------------------------- 1 | obj-m:=mod3.o 2 | KDIR:=/lib/modules/$(shell uname -r)/build 3 | PWD:=$(shell pwd) 4 | all: 5 | make -C $(KDIR) M=$(PWD) modules 6 | clean: 7 | make -C $(KDIR) M=$(PWD) clean 8 | -------------------------------------------------------------------------------- /lab/01_module_programming/mod3/mod3.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #define BUFSIZE 100 7 | 8 | MODULE_LICENSE("GPL"); 9 | 10 | static struct proc_dir_entry* entry; 11 | 12 | static ssize_t myread(struct file *file, char __user *ubuf, size_t count, loff_t *ppos) { 13 | char buf[BUFSIZE]; 14 | int len = 0; 15 | if(*ppos > 0 || count < BUFSIZE) 16 | return 0; 17 | len += sprintf(buf, "mod3: %s\n", "hello world"); 18 | 19 | if(copy_to_user(ubuf, buf, len)) 20 | return -EFAULT; 21 | *ppos = len; 22 | return len; 23 | } 24 | 25 | static struct file_operations myops = { 26 | .owner = THIS_MODULE, 27 | .read = myread 28 | }; 29 | 30 | static int __init mod3_init(void) { 31 | entry = proc_create("mod3_proc", 0444, NULL, &myops); 32 | return 0; 33 | } 34 | 35 | static void __exit mod3_exit(void) { 36 | proc_remove(entry); 37 | } 38 | 39 | module_init(mod3_init); 40 | module_exit(mod3_exit); 41 | -------------------------------------------------------------------------------- /lab/01_module_programming/mod4/Makefile: -------------------------------------------------------------------------------- 1 | obj-m:=mod4.o 2 | KDIR:=/lib/modules/$(shell uname -r)/build 3 | PWD:=$(shell pwd) 4 | all: 5 | make -C $(KDIR) M=$(PWD) modules 6 | clean: 7 | make -C $(KDIR) M=$(PWD) clean 8 | -------------------------------------------------------------------------------- /lab/01_module_programming/mod4/mod4.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #define BUFSIZE 100 7 | #define MAXSIZE 1024 8 | 9 | MODULE_LICENSE("GPL"); 10 | 11 | static char str[MAXSIZE]; 12 | 13 | static struct proc_dir_entry *entry; 14 | static struct proc_dir_entry *base; 15 | 16 | static ssize_t myread(struct file *file, char __user *ubuf, size_t count, loff_t *ppos) { 17 | char buf[BUFSIZE]; 18 | int len = 0; 19 | if(*ppos > 0 
|| count < BUFSIZE) 20 | return 0; 21 | len += sprintf(buf, "mod4: %s\n", str); 22 | printk(KERN_INFO "read from proc file: %s", str); 23 | 24 | if(copy_to_user(ubuf, buf, len)) 25 | return -EFAULT; 26 | *ppos = len; 27 | return len; 28 | } 29 | 30 | static ssize_t mywrite(struct file *file, const char __user *ubuf, size_t count, loff_t *ppos) 31 | { 32 | char buf[BUFSIZE]; 33 | if(*ppos > 0 || count > BUFSIZE) 34 | return -EFAULT; 35 | if(copy_from_user(buf, ubuf, count)) 36 | return -EFAULT; 37 | char tmp[MAXSIZE]; 38 | sscanf(buf, "%s", tmp); 39 | strcpy(str, tmp); 40 | printk(KERN_INFO "write (%s) to proc file", str); 41 | int c = strlen(buf); 42 | *ppos = c; 43 | return c; 44 | } 45 | 46 | static struct file_operations myops = { 47 | .owner = THIS_MODULE, 48 | .read = myread, 49 | .write = mywrite 50 | }; 51 | 52 | static int __init mod4_init(void) { 53 | base = proc_mkdir("proc4_folder", NULL); 54 | entry = proc_create("mod4_proc", 0666, base, &myops); 55 | return 0; 56 | } 57 | 58 | static void __exit mod4_exit(void) { 59 | proc_remove(entry); 60 | proc_remove(base); 61 | } 62 | 63 | module_init(mod4_init); 64 | module_exit(mod4_exit); 65 | -------------------------------------------------------------------------------- /lab/02_process_management/02_guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/lab/02_process_management/02_guide.pdf -------------------------------------------------------------------------------- /lab/02_process_management/02_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/lab/02_process_management/02_report.pdf -------------------------------------------------------------------------------- /lab/02_process_management/code/Makefile: 
--------------------------------------------------------------------------------
# Installs the modified (or original) kernel sources into the kernel tree,
# rebuilds and installs the kernel, then reboots the machine.
# WARNING: both targets overwrite files under $(SRC) and end with `reboot`.
SRC=/usr/src/linux-$(shell uname -r)

.PHONY: all restore

# Copy the lab's patched files into the tree, rebuild, install, reboot.
all:
	cp sched.h $(SRC)/include/linux/sched.h
	cp fork.c $(SRC)/kernel/fork.c
	cp core.c $(SRC)/kernel/sched/core.c
	cp base.c $(SRC)/fs/proc/base.c
	$(MAKE) -j2 -C $(SRC)
	$(MAKE) -C $(SRC) install
	reboot

# Copy the pristine files from origin/ back into the tree, rebuild, install, reboot.
restore:
	cp origin/sched.h $(SRC)/include/linux/sched.h
	cp origin/fork.c $(SRC)/kernel/fork.c
	cp origin/core.c $(SRC)/kernel/sched/core.c
	cp origin/base.c $(SRC)/fs/proc/base.c
	$(MAKE) -j2 -C $(SRC)
	$(MAKE) -C $(SRC) install
	reboot
--------------------------------------------------------------------------------
/lab/02_process_management/code/test.c:
--------------------------------------------------------------------------------
/*
 * test.c -- long-running userspace process for the process-management lab.
 *
 * Loops forever reading characters from stdin.
 * NOTE(review): the header name was missing from this copy of the file;
 * <stdio.h> is restored because getchar() is the only library call.
 * NOTE(review): once stdin reaches EOF, getchar() returns immediately and the
 * loop spins -- presumably intended to be run from an interactive terminal.
 */
#include <stdio.h>

int main() {
    while(1) getchar();
    return 0;
}
--------------------------------------------------------------------------------
/lab/03_memory_management/03_guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/lab/03_memory_management/03_guide.pdf
--------------------------------------------------------------------------------
/lab/03_memory_management/03_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/lab/03_memory_management/03_report.pdf
--------------------------------------------------------------------------------
/lab/03_memory_management/code/Makefile:
--------------------------------------------------------------------------------
# Out-of-tree build of mtest.ko against the running kernel's build directory.
obj-m := mtest.o
KDIR  := /lib/modules/$(shell uname -r)/build
PWD   := $(shell pwd)

.PHONY: all clean

# $(MAKE) (not a literal `make`) so that flags and the jobserver propagate.
all:
	$(MAKE) -C $(KDIR) M=$(PWD) modules

clean:
	$(MAKE) -C $(KDIR) M=$(PWD) clean
-------------------------------------------------------------------------------- /lab/03_memory_management/code/mtest.c: -------------------------------------------------------------------------------- 1 | /* 2 | * mtest.c -- memory management lab `mtest` module 3 | */ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #define BUFSIZE 1024 14 | 15 | static struct proc_dir_entry *mtest_proc_entry; 16 | 17 | 18 | /* Print all vma of the current process */ 19 | static void mtest_list_vma(void) { 20 | struct mm_struct *mm = current->mm; 21 | struct vm_area_struct *vma = mm->mmap; 22 | 23 | down_read(&(mm->mmap_sem)); // lock the read critical section 24 | 25 | // traverse list of VMAs 26 | while (vma) { 27 | printk("VMA 0x%lx - 0x%lx\n", vma->vm_start, vma->vm_end); 28 | // permission flags in `mm.h` 29 | (vma->vm_flags & VM_READ) ? printk("r\n") : printk("-\n"); 30 | (vma->vm_flags & VM_WRITE) ? printk("w\n") : printk("-\n"); 31 | (vma->vm_flags & VM_EXEC) ? printk("x\n") : printk("-\n"); 32 | printk("\n"); 33 | vma = vma->vm_next; 34 | } 35 | 36 | up_read(&(mm->mmap_sem)); // unlock the read critical section 37 | } 38 | 39 | 40 | /* find page of va */ 41 | static struct page *_find_page(unsigned long vaddr) { 42 | struct mm_struct *mm = current->mm; 43 | struct page *curr_page; 44 | 45 | pgd_t *pgd; 46 | pud_t *pud; 47 | pmd_t *pmd; 48 | pte_t *pte; 49 | 50 | // walk the page table 51 | // 1. get [page global directory, pgd] 52 | pgd = pgd_offset(mm, vaddr); 53 | // printk("pgd: %llx\n", pgd_val(*pgd)); 54 | if (pgd_none(*pgd) || pgd_bad(*pgd)) { 55 | printk("[pgd] not available\n"); 56 | return NULL; 57 | } 58 | // 2. get [page upper directory, pud] 59 | pud = pud_offset(pgd, vaddr); 60 | // printk("pud: %llx\n", pud_val(*pud)); 61 | if (pud_none(*pud) || pud_bad(*pud)) { 62 | printk("[pud] not available\n"); 63 | return NULL; 64 | } 65 | // 3. 
get [page middle directory, pmd] 66 | pmd = pmd_offset(pud, vaddr); 67 | // printk("pmd: %llx\n", pmd_val(*pmd)); 68 | if (pmd_none(*pmd) || pmd_bad(*pmd)) { 69 | printk("[pmd] not available\n"); 70 | return NULL; 71 | } 72 | // 4. get [page table entry, pte] 73 | pte = pte_offset_kernel(pmd, vaddr); 74 | // printk("pte: %llx\n", pte_val(*pte)); 75 | if (pte_none(*pte)) { 76 | printk("[pte] not available\n"); 77 | return NULL; 78 | } 79 | 80 | curr_page = pte_page(*pte); 81 | return curr_page; 82 | } 83 | 84 | 85 | /* Find va->pa translation */ 86 | static void mtest_find_page(unsigned long vaddr) { 87 | unsigned long paddr; 88 | unsigned long page_addr; 89 | unsigned long page_offset; 90 | 91 | // get current page of vaddr 92 | struct page *curr_page = _find_page(vaddr); 93 | 94 | if (!curr_page) { 95 | printk("translation not found\n"); 96 | return; 97 | } 98 | 99 | page_addr = page_to_phys(curr_page) & PAGE_MASK; 100 | page_offset = vaddr & (~PAGE_MASK); 101 | paddr = page_addr | page_offset; 102 | 103 | printk("vma 0x%lx -> pma 0x%lx\n", vaddr, paddr); 104 | } 105 | 106 | 107 | /* Write val to the specified address */ 108 | static void mtest_write_val(unsigned long vaddr, unsigned long val) { 109 | // look up the first VMA which statisfies vaddr < vm_end, NULL if none 110 | struct vm_area_struct *vma = find_vma(current->mm, vaddr); 111 | // get current page of vaddr 112 | struct page *curr_page = _find_page(vaddr); 113 | 114 | // whether the page is existed 115 | if (!curr_page) { 116 | printk("unexisted page\n"); 117 | return; 118 | } 119 | 120 | // whether the vma is valid 121 | if (!vma || vma->vm_start > vaddr) { 122 | printk("invalid vma\n"); 123 | return; 124 | } 125 | 126 | // whether the page is writable 127 | if (!(vma->vm_flags & VM_WRITE)) { 128 | printk("unwritable page\n"); 129 | return; 130 | } 131 | 132 | // write value 133 | unsigned long *kernel_addr; 134 | kernel_addr = (unsigned long*)page_address(curr_page); 135 | kernel_addr += vaddr & 
(~PAGE_MASK); 136 | *kernel_addr = val; 137 | printk("written 0x%lx to address 0x%lx\n", val, (unsigned long)kernel_addr); 138 | } 139 | 140 | 141 | /* proc write interface */ 142 | static ssize_t mtest_proc_write(struct file *file, 143 | const char __user *ubuf, 144 | size_t count, 145 | loff_t *ppos) { 146 | char buf[BUFSIZE]; 147 | char data[BUFSIZE]; 148 | unsigned long addr, val; 149 | unsigned short offset; 150 | 151 | if (*ppos > 0 || count > BUFSIZE) 152 | return -EFAULT; 153 | if (copy_from_user(buf, ubuf, count)) 154 | return -EFAULT; 155 | sscanf(buf, "%s", data); 156 | 157 | if (!strcmp(data, "listvma")) { 158 | /* listvma */ 159 | mtest_list_vma(); 160 | } else if (!strcmp(data, "findpage")) { 161 | /* findpage */ 162 | offset = 9; 163 | sscanf(buf + offset, "%s", data); 164 | kstrtoul(data, 16, &addr); 165 | mtest_find_page(addr); 166 | } else if (!strcmp(data, "writeval")) { 167 | /* writeval */ 168 | offset = 9; 169 | sscanf(buf + offset, "%s", data); 170 | kstrtoul(data, 16, &addr); 171 | while (*(buf + offset) != ' ') offset ++; 172 | offset ++; 173 | sscanf(buf + offset, "%s", data); 174 | kstrtoul(data, 16, &val); 175 | mtest_write_val(addr, val); 176 | } 177 | 178 | *ppos = strlen(buf); 179 | return *ppos; 180 | } 181 | 182 | 183 | /* proc file_operations struct */ 184 | static struct file_operations proc_mtest_operations = { 185 | .owner = THIS_MODULE, 186 | .write = mtest_proc_write 187 | }; 188 | 189 | 190 | static int __init mtest_init(void) { 191 | mtest_proc_entry = proc_create("mtest", 0666, NULL, &proc_mtest_operations); 192 | return 0; 193 | } 194 | 195 | 196 | static void __exit mtest_exit(void) { 197 | proc_remove(mtest_proc_entry); 198 | } 199 | 200 | MODULE_LICENSE("GPL"); 201 | MODULE_DESCRIPTION("Memory Management Lab Test Module"); 202 | MODULE_AUTHOR("Zihan Liu"); 203 | 204 | module_init(mtest_init); 205 | module_exit(mtest_exit); -------------------------------------------------------------------------------- 
/lab/03_memory_management/code/ref/mm_types.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0 */ 2 | #ifndef _LINUX_MM_TYPES_H 3 | #define _LINUX_MM_TYPES_H 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | #ifndef AT_VECTOR_SIZE_ARCH 21 | #define AT_VECTOR_SIZE_ARCH 0 22 | #endif 23 | #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) 24 | 25 | 26 | struct address_space; 27 | struct mem_cgroup; 28 | 29 | /* 30 | * Each physical page in the system has a struct page associated with 31 | * it to keep track of whatever it is we are using the page for at the 32 | * moment. Note that we have no way to track which tasks are using 33 | * a page, though if it is a pagecache page, rmap structures can tell us 34 | * who is mapping it. 35 | * 36 | * If you allocate the page using alloc_pages(), you can use some of the 37 | * space in struct page for your own purposes. The five words in the main 38 | * union are available, except for bit 0 of the first word which must be 39 | * kept clear. Many users use this word to store a pointer to an object 40 | * which is guaranteed to be aligned. If you use the same storage as 41 | * page->mapping, you must restore it to NULL before freeing the page. 42 | * 43 | * If your page will not be mapped to userspace, you can also use the four 44 | * bytes in the mapcount union, but you must call page_mapcount_reset() 45 | * before freeing it. 46 | * 47 | * If you want to use the refcount field, it must be used in such a way 48 | * that other CPUs temporarily incrementing and then decrementing the 49 | * refcount does not cause problems. On receiving the page from 50 | * alloc_pages(), the refcount will be positive. 
51 | * 52 | * If you allocate pages of order > 0, you can use some of the fields 53 | * in each subpage, but you may need to restore some of their values 54 | * afterwards. 55 | * 56 | * SLUB uses cmpxchg_double() to atomically update its freelist and 57 | * counters. That requires that freelist & counters be adjacent and 58 | * double-word aligned. We align all struct pages to double-word 59 | * boundaries, and ensure that 'freelist' is aligned within the 60 | * struct. 61 | */ 62 | #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE 63 | #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) 64 | #else 65 | #define _struct_page_alignment 66 | #endif 67 | 68 | struct page { 69 | unsigned long flags; /* Atomic flags, some possibly 70 | * updated asynchronously */ 71 | /* 72 | * Five words (20/40 bytes) are available in this union. 73 | * WARNING: bit 0 of the first word is used for PageTail(). That 74 | * means the other users of this union MUST NOT use the bit to 75 | * avoid collision and false-positive PageTail(). 76 | */ 77 | union { 78 | struct { /* Page cache and anonymous pages */ 79 | /** 80 | * @lru: Pageout list, eg. active_list protected by 81 | * pgdat->lru_lock. Sometimes used as a generic list 82 | * by the page owner. 83 | */ 84 | struct list_head lru; 85 | /* See page-flags.h for PAGE_MAPPING_FLAGS */ 86 | struct address_space *mapping; 87 | pgoff_t index; /* Our offset within mapping. */ 88 | /** 89 | * @private: Mapping-private opaque data. 90 | * Usually used for buffer_heads if PagePrivate. 91 | * Used for swp_entry_t if PageSwapCache. 92 | * Indicates order in the buddy system if PageBuddy. 93 | */ 94 | unsigned long private; 95 | }; 96 | struct { /* page_pool used by netstack */ 97 | /** 98 | * @dma_addr: might require a 64-bit value even on 99 | * 32-bit architectures. 
100 | */ 101 | dma_addr_t dma_addr; 102 | }; 103 | struct { /* slab, slob and slub */ 104 | union { 105 | struct list_head slab_list; 106 | struct { /* Partial pages */ 107 | struct page *next; 108 | #ifdef CONFIG_64BIT 109 | int pages; /* Nr of pages left */ 110 | int pobjects; /* Approximate count */ 111 | #else 112 | short int pages; 113 | short int pobjects; 114 | #endif 115 | }; 116 | }; 117 | struct kmem_cache *slab_cache; /* not slob */ 118 | /* Double-word boundary */ 119 | void *freelist; /* first free object */ 120 | union { 121 | void *s_mem; /* slab: first object */ 122 | unsigned long counters; /* SLUB */ 123 | struct { /* SLUB */ 124 | unsigned inuse:16; 125 | unsigned objects:15; 126 | unsigned frozen:1; 127 | }; 128 | }; 129 | }; 130 | struct { /* Tail pages of compound page */ 131 | unsigned long compound_head; /* Bit zero is set */ 132 | 133 | /* First tail page only */ 134 | unsigned char compound_dtor; 135 | unsigned char compound_order; 136 | atomic_t compound_mapcount; 137 | }; 138 | struct { /* Second tail page of compound page */ 139 | unsigned long _compound_pad_1; /* compound_head */ 140 | unsigned long _compound_pad_2; 141 | /* For both global and memcg */ 142 | struct list_head deferred_list; 143 | }; 144 | struct { /* Page table pages */ 145 | unsigned long _pt_pad_1; /* compound_head */ 146 | pgtable_t pmd_huge_pte; /* protected by page->ptl */ 147 | unsigned long _pt_pad_2; /* mapping */ 148 | union { 149 | struct mm_struct *pt_mm; /* x86 pgds only */ 150 | atomic_t pt_frag_refcount; /* powerpc */ 151 | }; 152 | #if ALLOC_SPLIT_PTLOCKS 153 | spinlock_t *ptl; 154 | #else 155 | spinlock_t ptl; 156 | #endif 157 | }; 158 | struct { /* ZONE_DEVICE pages */ 159 | /** @pgmap: Points to the hosting device page map. 
*/ 160 | struct dev_pagemap *pgmap; 161 | void *zone_device_data; 162 | /* 163 | * ZONE_DEVICE private pages are counted as being 164 | * mapped so the next 3 words hold the mapping, index, 165 | * and private fields from the source anonymous or 166 | * page cache page while the page is migrated to device 167 | * private memory. 168 | * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also 169 | * use the mapping, index, and private fields when 170 | * pmem backed DAX files are mapped. 171 | */ 172 | }; 173 | 174 | /** @rcu_head: You can use this to free a page by RCU. */ 175 | struct rcu_head rcu_head; 176 | }; 177 | 178 | union { /* This union is 4 bytes in size. */ 179 | /* 180 | * If the page can be mapped to userspace, encodes the number 181 | * of times this page is referenced by a page table. 182 | */ 183 | atomic_t _mapcount; 184 | 185 | /* 186 | * If the page is neither PageSlab nor mappable to userspace, 187 | * the value stored here may help determine what this page 188 | * is used for. See page-flags.h for a list of page types 189 | * which are currently stored here. 190 | */ 191 | unsigned int page_type; 192 | 193 | unsigned int active; /* SLAB */ 194 | int units; /* SLOB */ 195 | }; 196 | 197 | /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ 198 | atomic_t _refcount; 199 | 200 | #ifdef CONFIG_MEMCG 201 | struct mem_cgroup *mem_cgroup; 202 | #endif 203 | 204 | /* 205 | * On machines where all RAM is mapped into kernel address space, 206 | * we can simply calculate the virtual address. On machines with 207 | * highmem some memory is mapped into kernel virtual memory 208 | * dynamically, so we need a place to store that address. 209 | * Note that this field could be 16 bits on x86 ... ;) 210 | * 211 | * Architectures with slow multiplication can define 212 | * WANT_PAGE_VIRTUAL in asm/page.h 213 | */ 214 | #if defined(WANT_PAGE_VIRTUAL) 215 | void *virtual; /* Kernel virtual address (NULL if 216 | not kmapped, ie. 
highmem) */ 217 | #endif /* WANT_PAGE_VIRTUAL */ 218 | 219 | #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS 220 | int _last_cpupid; 221 | #endif 222 | } _struct_page_alignment; 223 | 224 | static inline atomic_t *compound_mapcount_ptr(struct page *page) 225 | { 226 | return &page[1].compound_mapcount; 227 | } 228 | 229 | /* 230 | * Used for sizing the vmemmap region on some architectures 231 | */ 232 | #define STRUCT_PAGE_MAX_SHIFT (order_base_2(sizeof(struct page))) 233 | 234 | #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) 235 | #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) 236 | 237 | #define page_private(page) ((page)->private) 238 | #define set_page_private(page, v) ((page)->private = (v)) 239 | 240 | struct page_frag_cache { 241 | void * va; 242 | #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 243 | __u16 offset; 244 | __u16 size; 245 | #else 246 | __u32 offset; 247 | #endif 248 | /* we maintain a pagecount bias, so that we dont dirty cache line 249 | * containing page->_refcount every time we allocate a fragment. 250 | */ 251 | unsigned int pagecnt_bias; 252 | bool pfmemalloc; 253 | }; 254 | 255 | typedef unsigned long vm_flags_t; 256 | 257 | /* 258 | * A region containing a mapping of a non-memory backed file under NOMMU 259 | * conditions. These are held in a global tree and are pinned by the VMAs that 260 | * map parts of them. 
261 | */ 262 | struct vm_region { 263 | struct rb_node vm_rb; /* link in global region tree */ 264 | vm_flags_t vm_flags; /* VMA vm_flags */ 265 | unsigned long vm_start; /* start address of region */ 266 | unsigned long vm_end; /* region initialised to here */ 267 | unsigned long vm_top; /* region allocated to here */ 268 | unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ 269 | struct file *vm_file; /* the backing file or NULL */ 270 | 271 | int vm_usage; /* region usage count (access under nommu_region_sem) */ 272 | bool vm_icache_flushed : 1; /* true if the icache has been flushed for 273 | * this region */ 274 | }; 275 | 276 | #ifdef CONFIG_USERFAULTFD 277 | #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, }) 278 | struct vm_userfaultfd_ctx { 279 | struct userfaultfd_ctx *ctx; 280 | }; 281 | #else /* CONFIG_USERFAULTFD */ 282 | #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {}) 283 | struct vm_userfaultfd_ctx {}; 284 | #endif /* CONFIG_USERFAULTFD */ 285 | 286 | /* 287 | * This struct defines a memory VMM memory area. There is one of these 288 | * per VM-area/task. A VM area is any part of the process virtual memory 289 | * space that has a special rule for the page-fault handlers (ie a shared 290 | * library, the executable area etc). 291 | */ 292 | struct vm_area_struct { 293 | /* The first cache line has the info for VMA tree walking. */ 294 | 295 | unsigned long vm_start; /* Our start address within vm_mm. */ 296 | unsigned long vm_end; /* The first byte after our end address 297 | within vm_mm. */ 298 | 299 | /* linked list of VM areas per task, sorted by address */ 300 | struct vm_area_struct *vm_next, *vm_prev; 301 | 302 | struct rb_node vm_rb; 303 | 304 | /* 305 | * Largest free memory gap in bytes to the left of this VMA. 306 | * Either between this VMA and vma->vm_prev, or between one of the 307 | * VMAs below us in the VMA rbtree and its ->vm_prev. 
This helps 308 | * get_unmapped_area find a free area of the right size. 309 | */ 310 | unsigned long rb_subtree_gap; 311 | 312 | /* Second cache line starts here. */ 313 | 314 | struct mm_struct *vm_mm; /* The address space we belong to. */ 315 | pgprot_t vm_page_prot; /* Access permissions of this VMA. */ 316 | unsigned long vm_flags; /* Flags, see mm.h. */ 317 | 318 | /* 319 | * For areas with an address space and backing store, 320 | * linkage into the address_space->i_mmap interval tree. 321 | */ 322 | struct { 323 | struct rb_node rb; 324 | unsigned long rb_subtree_last; 325 | } shared; 326 | 327 | /* 328 | * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma 329 | * list, after a COW of one of the file pages. A MAP_SHARED vma 330 | * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack 331 | * or brk vma (with NULL file) can only be in an anon_vma list. 332 | */ 333 | struct list_head anon_vma_chain; /* Serialized by mmap_sem & 334 | * page_table_lock */ 335 | struct anon_vma *anon_vma; /* Serialized by page_table_lock */ 336 | 337 | /* Function pointers to deal with this struct. */ 338 | const struct vm_operations_struct *vm_ops; 339 | 340 | /* Information about our backing store: */ 341 | unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE 342 | units */ 343 | struct file * vm_file; /* File we map to (can be NULL). 
*/ 344 | void * vm_private_data; /* was vm_pte (shared mem) */ 345 | 346 | #ifdef CONFIG_SWAP 347 | atomic_long_t swap_readahead_info; 348 | #endif 349 | #ifndef CONFIG_MMU 350 | struct vm_region *vm_region; /* NOMMU mapping region */ 351 | #endif 352 | #ifdef CONFIG_NUMA 353 | struct mempolicy *vm_policy; /* NUMA policy for the VMA */ 354 | #endif 355 | struct vm_userfaultfd_ctx vm_userfaultfd_ctx; 356 | } __randomize_layout; 357 | 358 | struct core_thread { 359 | struct task_struct *task; 360 | struct core_thread *next; 361 | }; 362 | 363 | struct core_state { 364 | atomic_t nr_threads; 365 | struct core_thread dumper; 366 | struct completion startup; 367 | }; 368 | 369 | struct kioctx_table; 370 | struct mm_struct { 371 | struct { 372 | struct vm_area_struct *mmap; /* list of VMAs */ 373 | struct rb_root mm_rb; 374 | u64 vmacache_seqnum; /* per-thread vmacache */ 375 | #ifdef CONFIG_MMU 376 | unsigned long (*get_unmapped_area) (struct file *filp, 377 | unsigned long addr, unsigned long len, 378 | unsigned long pgoff, unsigned long flags); 379 | #endif 380 | unsigned long mmap_base; /* base of mmap area */ 381 | unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ 382 | #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES 383 | /* Base addresses for compatible mmap() */ 384 | unsigned long mmap_compat_base; 385 | unsigned long mmap_compat_legacy_base; 386 | #endif 387 | unsigned long task_size; /* size of task vm space */ 388 | unsigned long highest_vm_end; /* highest vma end address */ 389 | pgd_t * pgd; 390 | 391 | #ifdef CONFIG_MEMBARRIER 392 | /** 393 | * @membarrier_state: Flags controlling membarrier behavior. 394 | * 395 | * This field is close to @pgd to hopefully fit in the same 396 | * cache-line, which needs to be touched by switch_mm(). 397 | */ 398 | atomic_t membarrier_state; 399 | #endif 400 | 401 | /** 402 | * @mm_users: The number of users including userspace. 403 | * 404 | * Use mmget()/mmget_not_zero()/mmput() to modify.
When this 405 | * drops to 0 (i.e. when the task exits and there are no other 406 | * temporary reference holders), we also release a reference on 407 | * @mm_count (which may then free the &struct mm_struct if 408 | * @mm_count also drops to 0). 409 | */ 410 | atomic_t mm_users; 411 | 412 | /** 413 | * @mm_count: The number of references to &struct mm_struct 414 | * (@mm_users count as 1). 415 | * 416 | * Use mmgrab()/mmdrop() to modify. When this drops to 0, the 417 | * &struct mm_struct is freed. 418 | */ 419 | atomic_t mm_count; 420 | 421 | #ifdef CONFIG_MMU 422 | atomic_long_t pgtables_bytes; /* PTE page table pages */ 423 | #endif 424 | int map_count; /* number of VMAs */ 425 | 426 | spinlock_t page_table_lock; /* Protects page tables and some 427 | * counters 428 | */ 429 | struct rw_semaphore mmap_sem; 430 | 431 | struct list_head mmlist; /* List of maybe swapped mm's. These 432 | * are globally strung together off 433 | * init_mm.mmlist, and are protected 434 | * by mmlist_lock 435 | */ 436 | 437 | 438 | unsigned long hiwater_rss; /* High-watermark of RSS usage */ 439 | unsigned long hiwater_vm; /* High-water virtual memory usage */ 440 | 441 | unsigned long total_vm; /* Total pages mapped */ 442 | unsigned long locked_vm; /* Pages that have PG_mlocked set */ 443 | atomic64_t pinned_vm; /* Refcount permanently increased */ 444 | unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ 445 | unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ 446 | unsigned long stack_vm; /* VM_STACK */ 447 | unsigned long def_flags; 448 | 449 | spinlock_t arg_lock; /* protect the below fields */ 450 | unsigned long start_code, end_code, start_data, end_data; 451 | unsigned long start_brk, brk, start_stack; 452 | unsigned long arg_start, arg_end, env_start, env_end; 453 | 454 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ 455 | 456 | /* 457 | * Special counters, in some configurations protected by the 458 | * page_table_lock, in other 
configurations by being atomic. 459 | */ 460 | struct mm_rss_stat rss_stat; 461 | 462 | struct linux_binfmt *binfmt; 463 | 464 | /* Architecture-specific MM context */ 465 | mm_context_t context; 466 | 467 | unsigned long flags; /* Must use atomic bitops to access */ 468 | 469 | struct core_state *core_state; /* coredumping support */ 470 | 471 | #ifdef CONFIG_AIO 472 | spinlock_t ioctx_lock; 473 | struct kioctx_table __rcu *ioctx_table; 474 | #endif 475 | #ifdef CONFIG_MEMCG 476 | /* 477 | * "owner" points to a task that is regarded as the canonical 478 | * user/owner of this mm. All of the following must be true in 479 | * order for it to be changed: 480 | * 481 | * current == mm->owner 482 | * current->mm != mm 483 | * new_owner->mm == mm 484 | * new_owner->alloc_lock is held 485 | */ 486 | struct task_struct __rcu *owner; 487 | #endif 488 | struct user_namespace *user_ns; 489 | 490 | /* store ref to file /proc//exe symlink points to */ 491 | struct file __rcu *exe_file; 492 | #ifdef CONFIG_MMU_NOTIFIER 493 | struct mmu_notifier_mm *mmu_notifier_mm; 494 | #endif 495 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS 496 | pgtable_t pmd_huge_pte; /* protected by page_table_lock */ 497 | #endif 498 | #ifdef CONFIG_NUMA_BALANCING 499 | /* 500 | * numa_next_scan is the next time that the PTEs will be marked 501 | * pte_numa. NUMA hinting faults will gather statistics and 502 | * migrate pages to new nodes if necessary. 503 | */ 504 | unsigned long numa_next_scan; 505 | 506 | /* Restart point for scanning and setting pte_numa */ 507 | unsigned long numa_scan_offset; 508 | 509 | /* numa_scan_seq prevents two threads setting pte_numa */ 510 | int numa_scan_seq; 511 | #endif 512 | /* 513 | * An operation with batched TLB flushing is going on. Anything 514 | * that can move process memory needs to flush the TLB when 515 | * moving a PROT_NONE or PROT_NUMA mapped page. 
516 | */ 517 | atomic_t tlb_flush_pending; 518 | #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH 519 | /* See flush_tlb_batched_pending() */ 520 | bool tlb_flush_batched; 521 | #endif 522 | struct uprobes_state uprobes_state; 523 | #ifdef CONFIG_HUGETLB_PAGE 524 | atomic_long_t hugetlb_usage; 525 | #endif 526 | struct work_struct async_put_work; 527 | } __randomize_layout; 528 | 529 | /* 530 | * The mm_cpumask needs to be at the end of mm_struct, because it 531 | * is dynamically sized based on nr_cpu_ids. 532 | */ 533 | unsigned long cpu_bitmap[]; 534 | }; 535 | 536 | extern struct mm_struct init_mm; 537 | 538 | /* Pointer magic because the dynamic array size confuses some compilers. */ 539 | static inline void mm_init_cpumask(struct mm_struct *mm) 540 | { 541 | unsigned long cpu_bitmap = (unsigned long)mm; 542 | 543 | cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap); 544 | cpumask_clear((struct cpumask *)cpu_bitmap); 545 | } 546 | 547 | /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ 548 | static inline cpumask_t *mm_cpumask(struct mm_struct *mm) 549 | { 550 | return (struct cpumask *)&mm->cpu_bitmap; 551 | } 552 | 553 | struct mmu_gather; 554 | extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, 555 | unsigned long start, unsigned long end); 556 | extern void tlb_finish_mmu(struct mmu_gather *tlb, 557 | unsigned long start, unsigned long end); 558 | 559 | static inline void init_tlb_flush_pending(struct mm_struct *mm) 560 | { 561 | atomic_set(&mm->tlb_flush_pending, 0); 562 | } 563 | 564 | static inline void inc_tlb_flush_pending(struct mm_struct *mm) 565 | { 566 | atomic_inc(&mm->tlb_flush_pending); 567 | /* 568 | * The only time this value is relevant is when there are indeed pages 569 | * to flush. And we'll only flush pages after changing them, which 570 | * requires the PTL. 571 | * 572 | * So the ordering here is: 573 | * 574 | * atomic_inc(&mm->tlb_flush_pending); 575 | * spin_lock(&ptl); 576 | * ... 
577 | * set_pte_at(); 578 | * spin_unlock(&ptl); 579 | * 580 | * spin_lock(&ptl) 581 | * mm_tlb_flush_pending(); 582 | * .... 583 | * spin_unlock(&ptl); 584 | * 585 | * flush_tlb_range(); 586 | * atomic_dec(&mm->tlb_flush_pending); 587 | * 588 | * Where the increment is constrained by the PTL unlock, it thus 589 | * ensures that the increment is visible if the PTE modification is 590 | * visible. After all, if there is no PTE modification, nobody cares 591 | * about TLB flushes either. 592 | * 593 | * This very much relies on users (mm_tlb_flush_pending() and 594 | * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and 595 | * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc 596 | * locks (PPC) the unlock of one doesn't order against the lock of 597 | * another PTL. 598 | * 599 | * The decrement is ordered by the flush_tlb_range(), such that 600 | * mm_tlb_flush_pending() will not return false unless all flushes have 601 | * completed. 602 | */ 603 | } 604 | 605 | static inline void dec_tlb_flush_pending(struct mm_struct *mm) 606 | { 607 | /* 608 | * See inc_tlb_flush_pending(). 609 | * 610 | * This cannot be smp_mb__before_atomic() because smp_mb() simply does 611 | * not order against TLB invalidate completion, which is what we need. 612 | * 613 | * Therefore we must rely on tlb_flush_*() to guarantee order. 614 | */ 615 | atomic_dec(&mm->tlb_flush_pending); 616 | } 617 | 618 | static inline bool mm_tlb_flush_pending(struct mm_struct *mm) 619 | { 620 | /* 621 | * Must be called after having acquired the PTL; orders against that 622 | * PTL's release and therefore ensures that if we observe the modified 623 | * PTE we must also observe the increment from inc_tlb_flush_pending(). 624 | * 625 | * That is, it only guarantees to return true if there is a flush 626 | * pending for _this_ PTL.
627 | */ 628 | return atomic_read(&mm->tlb_flush_pending); 629 | } 630 | 631 | static inline bool mm_tlb_flush_nested(struct mm_struct *mm) 632 | { 633 | /* 634 | * Similar to mm_tlb_flush_pending(), we must have acquired the PTL 635 | * for which there is a TLB flush pending in order to guarantee 636 | * we've seen both that PTE modification and the increment. 637 | * 638 | * (no requirement on actually still holding the PTL, that is irrelevant) 639 | */ 640 | return atomic_read(&mm->tlb_flush_pending) > 1; 641 | } 642 | 643 | struct vm_fault; 644 | 645 | /** 646 | * typedef vm_fault_t - Return type for page fault handlers. 647 | * 648 | * Page fault handlers return a bitmask of %VM_FAULT values. 649 | */ 650 | typedef __bitwise unsigned int vm_fault_t; 651 | 652 | /** 653 | * enum vm_fault_reason - Page fault handlers return a bitmask of 654 | * these values to tell the core VM what happened when handling the 655 | * fault. Used to decide whether a process gets delivered SIGBUS or 656 | * just gets major/minor fault counters bumped up. 657 | * 658 | * @VM_FAULT_OOM: Out Of Memory 659 | * @VM_FAULT_SIGBUS: Bad access 660 | * @VM_FAULT_MAJOR: Page read from storage 661 | * @VM_FAULT_WRITE: Special case for get_user_pages 662 | * @VM_FAULT_HWPOISON: Hit poisoned small page 663 | * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. 
Index encoded 664 | * in upper bits 665 | * @VM_FAULT_SIGSEGV: segmentation fault 666 | * @VM_FAULT_NOPAGE: ->fault installed the pte, not return page 667 | * @VM_FAULT_LOCKED: ->fault locked the returned page 668 | * @VM_FAULT_RETRY: ->fault blocked, must retry 669 | * @VM_FAULT_FALLBACK: huge page fault failed, fall back to small 670 | * @VM_FAULT_DONE_COW: ->fault has fully handled COW 671 | * @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs 672 | * fsync() to complete (for synchronous page faults 673 | * in DAX) 674 | * @VM_FAULT_HINDEX_MASK: mask HINDEX value 675 | * 676 | */ 677 | enum vm_fault_reason { 678 | VM_FAULT_OOM = (__force vm_fault_t)0x000001, 679 | VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, 680 | VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, 681 | VM_FAULT_WRITE = (__force vm_fault_t)0x000008, 682 | VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, 683 | VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, 684 | VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, 685 | VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100, 686 | VM_FAULT_LOCKED = (__force vm_fault_t)0x000200, 687 | VM_FAULT_RETRY = (__force vm_fault_t)0x000400, 688 | VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800, 689 | VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000, 690 | VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000, 691 | VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000, 692 | }; 693 | 694 | /* Encode hstate index for a hwpoisoned large page */ 695 | #define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16)) 696 | #define VM_FAULT_GET_HINDEX(x) (((__force unsigned int)(x) >> 16) & 0xf) 697 | 698 | #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | \ 699 | VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON | \ 700 | VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK) 701 | 702 | #define VM_FAULT_RESULT_TRACE \ 703 | { VM_FAULT_OOM, "OOM" }, \ 704 | { VM_FAULT_SIGBUS, "SIGBUS" }, \ 705 | { VM_FAULT_MAJOR, "MAJOR" }, \ 706 | { VM_FAULT_WRITE, "WRITE" }, \ 707 | { 
VM_FAULT_HWPOISON, "HWPOISON" }, \ 708 | { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ 709 | { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ 710 | { VM_FAULT_NOPAGE, "NOPAGE" }, \ 711 | { VM_FAULT_LOCKED, "LOCKED" }, \ 712 | { VM_FAULT_RETRY, "RETRY" }, \ 713 | { VM_FAULT_FALLBACK, "FALLBACK" }, \ 714 | { VM_FAULT_DONE_COW, "DONE_COW" }, \ 715 | { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" } 716 | 717 | struct vm_special_mapping { 718 | const char *name; /* The name, e.g. "[vdso]". */ 719 | 720 | /* 721 | * If .fault is not provided, this points to a 722 | * NULL-terminated array of pages that back the special mapping. 723 | * 724 | * This must not be NULL unless .fault is provided. 725 | */ 726 | struct page **pages; 727 | 728 | /* 729 | * If non-NULL, then this is called to resolve page faults 730 | * on the special mapping. If used, .pages is not checked. 731 | */ 732 | vm_fault_t (*fault)(const struct vm_special_mapping *sm, 733 | struct vm_area_struct *vma, 734 | struct vm_fault *vmf); 735 | 736 | int (*mremap)(const struct vm_special_mapping *sm, 737 | struct vm_area_struct *new_vma); 738 | }; 739 | 740 | enum tlb_flush_reason { 741 | TLB_FLUSH_ON_TASK_SWITCH, 742 | TLB_REMOTE_SHOOTDOWN, 743 | TLB_LOCAL_SHOOTDOWN, 744 | TLB_LOCAL_MM_SHOOTDOWN, 745 | TLB_REMOTE_SEND_IPI, 746 | NR_TLB_FLUSH_REASONS, 747 | }; 748 | 749 | /* 750 | * A swap entry has to fit into a "unsigned long", as the entry is hidden 751 | * in the "index" field of the swapper address space. 752 | */ 753 | typedef struct { 754 | unsigned long val; 755 | } swp_entry_t; 756 | 757 | #endif /* _LINUX_MM_TYPES_H */ 758 | -------------------------------------------------------------------------------- /lab/03_memory_management/code/ref/pgtable-types.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0-only */ 2 | /* 3 | * Page table types definitions. 4 | * 5 | * Copyright (C) 2014 ARM Ltd. 
6 | * Author: Catalin Marinas 7 | */ 8 | 9 | #ifndef __ASM_PGTABLE_TYPES_H 10 | #define __ASM_PGTABLE_TYPES_H 11 | 12 | #include 13 | 14 | typedef u64 pteval_t; 15 | typedef u64 pmdval_t; 16 | typedef u64 pudval_t; 17 | typedef u64 pgdval_t; 18 | 19 | /* 20 | * These are used to make use of C type-checking.. 21 | */ 22 | typedef struct { pteval_t pte; } pte_t; 23 | #define pte_val(x) ((x).pte) 24 | #define __pte(x) ((pte_t) { (x) } ) 25 | 26 | #if CONFIG_PGTABLE_LEVELS > 2 27 | typedef struct { pmdval_t pmd; } pmd_t; 28 | #define pmd_val(x) ((x).pmd) 29 | #define __pmd(x) ((pmd_t) { (x) } ) 30 | #endif 31 | 32 | #if CONFIG_PGTABLE_LEVELS > 3 33 | typedef struct { pudval_t pud; } pud_t; 34 | #define pud_val(x) ((x).pud) 35 | #define __pud(x) ((pud_t) { (x) } ) 36 | #endif 37 | 38 | typedef struct { pgdval_t pgd; } pgd_t; 39 | #define pgd_val(x) ((x).pgd) 40 | #define __pgd(x) ((pgd_t) { (x) } ) 41 | 42 | typedef struct { pteval_t pgprot; } pgprot_t; 43 | #define pgprot_val(x) ((x).pgprot) 44 | #define __pgprot(x) ((pgprot_t) { (x) } ) 45 | 46 | #if CONFIG_PGTABLE_LEVELS == 2 47 | #define __ARCH_USE_5LEVEL_HACK 48 | #include 49 | #elif CONFIG_PGTABLE_LEVELS == 3 50 | #define __ARCH_USE_5LEVEL_HACK 51 | #include 52 | #elif CONFIG_PGTABLE_LEVELS == 4 53 | #include 54 | #endif 55 | 56 | #endif /* __ASM_PGTABLE_TYPES_H */ 57 | -------------------------------------------------------------------------------- /lab/03_memory_management/code/ref/pgtable.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0-only */ 2 | /* 3 | * Copyright (C) 2012 ARM Ltd. 4 | */ 5 | #ifndef __ASM_PGTABLE_H 6 | #define __ASM_PGTABLE_H 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | /* 17 | * VMALLOC range. 
18 | * 19 | * VMALLOC_START: beginning of the kernel vmalloc space 20 | * VMALLOC_END: extends to the available space below vmemmap, PCI I/O space 21 | * and fixed mappings 22 | */ 23 | #define VMALLOC_START (MODULES_END) 24 | #define VMALLOC_END (- PUD_SIZE - VMEMMAP_SIZE - SZ_64K) 25 | 26 | #define FIRST_USER_ADDRESS 0UL 27 | 28 | #ifndef __ASSEMBLY__ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | extern struct page *vmemmap; 37 | 38 | extern void __pte_error(const char *file, int line, unsigned long val); 39 | extern void __pmd_error(const char *file, int line, unsigned long val); 40 | extern void __pud_error(const char *file, int line, unsigned long val); 41 | extern void __pgd_error(const char *file, int line, unsigned long val); 42 | 43 | /* 44 | * ZERO_PAGE is a global shared page that is always zero: used 45 | * for zero-mapped memory areas etc.. 46 | */ 47 | extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; 48 | #define ZERO_PAGE(vaddr) phys_to_page(__pa_symbol(empty_zero_page)) 49 | 50 | #define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte)) 51 | 52 | /* 53 | * Macros to convert between a physical address and its placement in a 54 | * page table entry, taking care of 52-bit addresses. 
55 | */ 56 | #ifdef CONFIG_ARM64_PA_BITS_52 57 | #define __pte_to_phys(pte) \ 58 | ((pte_val(pte) & PTE_ADDR_LOW) | ((pte_val(pte) & PTE_ADDR_HIGH) << 36)) 59 | #define __phys_to_pte_val(phys) (((phys) | ((phys) >> 36)) & PTE_ADDR_MASK) 60 | #else 61 | #define __pte_to_phys(pte) (pte_val(pte) & PTE_ADDR_MASK) 62 | #define __phys_to_pte_val(phys) (phys) 63 | #endif 64 | 65 | #define pte_pfn(pte) (__pte_to_phys(pte) >> PAGE_SHIFT) 66 | #define pfn_pte(pfn,prot) \ 67 | __pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) 68 | 69 | #define pte_none(pte) (!pte_val(pte)) 70 | #define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0)) 71 | #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) 72 | 73 | /* 74 | * The following only work if pte_present(). Undefined behaviour otherwise. 75 | */ 76 | #define pte_present(pte) (!!(pte_val(pte) & (PTE_VALID | PTE_PROT_NONE))) 77 | #define pte_young(pte) (!!(pte_val(pte) & PTE_AF)) 78 | #define pte_special(pte) (!!(pte_val(pte) & PTE_SPECIAL)) 79 | #define pte_write(pte) (!!(pte_val(pte) & PTE_WRITE)) 80 | #define pte_user_exec(pte) (!(pte_val(pte) & PTE_UXN)) 81 | #define pte_cont(pte) (!!(pte_val(pte) & PTE_CONT)) 82 | #define pte_devmap(pte) (!!(pte_val(pte) & PTE_DEVMAP)) 83 | 84 | #define pte_cont_addr_end(addr, end) \ 85 | ({ unsigned long __boundary = ((addr) + CONT_PTE_SIZE) & CONT_PTE_MASK; \ 86 | (__boundary - 1 < (end) - 1) ? __boundary : (end); \ 87 | }) 88 | 89 | #define pmd_cont_addr_end(addr, end) \ 90 | ({ unsigned long __boundary = ((addr) + CONT_PMD_SIZE) & CONT_PMD_MASK; \ 91 | (__boundary - 1 < (end) - 1) ? 
__boundary : (end); \ 92 | }) 93 | 94 | #define pte_hw_dirty(pte) (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY)) 95 | #define pte_sw_dirty(pte) (!!(pte_val(pte) & PTE_DIRTY)) 96 | #define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte)) 97 | 98 | #define pte_valid(pte) (!!(pte_val(pte) & PTE_VALID)) 99 | #define pte_valid_not_user(pte) \ 100 | ((pte_val(pte) & (PTE_VALID | PTE_USER)) == PTE_VALID) 101 | #define pte_valid_young(pte) \ 102 | ((pte_val(pte) & (PTE_VALID | PTE_AF)) == (PTE_VALID | PTE_AF)) 103 | #define pte_valid_user(pte) \ 104 | ((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) 105 | 106 | /* 107 | * Could the pte be present in the TLB? We must check mm_tlb_flush_pending 108 | * so that we don't erroneously return false for pages that have been 109 | * remapped as PROT_NONE but are yet to be flushed from the TLB. 110 | */ 111 | #define pte_accessible(mm, pte) \ 112 | (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid_young(pte)) 113 | 114 | /* 115 | * p??_access_permitted() is true for valid user mappings (subject to the 116 | * write permission check). PROT_NONE mappings do not have the PTE_VALID bit 117 | * set. 
118 | */ 119 | #define pte_access_permitted(pte, write) \ 120 | (pte_valid_user(pte) && (!(write) || pte_write(pte))) 121 | #define pmd_access_permitted(pmd, write) \ 122 | (pte_access_permitted(pmd_pte(pmd), (write))) 123 | #define pud_access_permitted(pud, write) \ 124 | (pte_access_permitted(pud_pte(pud), (write))) 125 | 126 | static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) 127 | { 128 | pte_val(pte) &= ~pgprot_val(prot); 129 | return pte; 130 | } 131 | 132 | static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot) 133 | { 134 | pte_val(pte) |= pgprot_val(prot); 135 | return pte; 136 | } 137 | 138 | static inline pte_t pte_wrprotect(pte_t pte) 139 | { 140 | pte = clear_pte_bit(pte, __pgprot(PTE_WRITE)); 141 | pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); 142 | return pte; 143 | } 144 | 145 | static inline pte_t pte_mkwrite(pte_t pte) 146 | { 147 | pte = set_pte_bit(pte, __pgprot(PTE_WRITE)); 148 | pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); 149 | return pte; 150 | } 151 | 152 | static inline pte_t pte_mkclean(pte_t pte) 153 | { 154 | pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY)); 155 | pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); 156 | 157 | return pte; 158 | } 159 | 160 | static inline pte_t pte_mkdirty(pte_t pte) 161 | { 162 | pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); 163 | 164 | if (pte_write(pte)) 165 | pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); 166 | 167 | return pte; 168 | } 169 | 170 | static inline pte_t pte_mkold(pte_t pte) 171 | { 172 | return clear_pte_bit(pte, __pgprot(PTE_AF)); 173 | } 174 | 175 | static inline pte_t pte_mkyoung(pte_t pte) 176 | { 177 | return set_pte_bit(pte, __pgprot(PTE_AF)); 178 | } 179 | 180 | static inline pte_t pte_mkspecial(pte_t pte) 181 | { 182 | return set_pte_bit(pte, __pgprot(PTE_SPECIAL)); 183 | } 184 | 185 | static inline pte_t pte_mkcont(pte_t pte) 186 | { 187 | pte = set_pte_bit(pte, __pgprot(PTE_CONT)); 188 | return set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE)); 189 | } 190 | 191 | static 
inline pte_t pte_mknoncont(pte_t pte) 192 | { 193 | return clear_pte_bit(pte, __pgprot(PTE_CONT)); 194 | } 195 | 196 | static inline pte_t pte_mkpresent(pte_t pte) 197 | { 198 | return set_pte_bit(pte, __pgprot(PTE_VALID)); 199 | } 200 | 201 | static inline pmd_t pmd_mkcont(pmd_t pmd) 202 | { 203 | return __pmd(pmd_val(pmd) | PMD_SECT_CONT); 204 | } 205 | 206 | static inline pte_t pte_mkdevmap(pte_t pte) 207 | { 208 | return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); 209 | } 210 | 211 | static inline void set_pte(pte_t *ptep, pte_t pte) 212 | { 213 | WRITE_ONCE(*ptep, pte); 214 | 215 | /* 216 | * Only if the new pte is valid and kernel, otherwise TLB maintenance 217 | * or update_mmu_cache() have the necessary barriers. 218 | */ 219 | if (pte_valid_not_user(pte)) { 220 | dsb(ishst); 221 | isb(); 222 | } 223 | } 224 | 225 | extern void __sync_icache_dcache(pte_t pteval); 226 | 227 | /* 228 | * PTE bits configuration in the presence of hardware Dirty Bit Management 229 | * (PTE_WRITE == PTE_DBM): 230 | * 231 | * Dirty Writable | PTE_RDONLY PTE_WRITE PTE_DIRTY (sw) 232 | * 0 0 | 1 0 0 233 | * 0 1 | 1 1 0 234 | * 1 0 | 1 0 1 235 | * 1 1 | 0 1 x 236 | * 237 | * When hardware DBM is not present, the software PTE_DIRTY bit is updated via 238 | * the page fault mechanism. Checking the dirty status of a pte becomes: 239 | * 240 | * PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY) 241 | */ 242 | 243 | static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep, 244 | pte_t pte) 245 | { 246 | pte_t old_pte; 247 | 248 | if (!IS_ENABLED(CONFIG_DEBUG_VM)) 249 | return; 250 | 251 | old_pte = READ_ONCE(*ptep); 252 | 253 | if (!pte_valid(old_pte) || !pte_valid(pte)) 254 | return; 255 | if (mm != current->active_mm && atomic_read(&mm->mm_users) <= 1) 256 | return; 257 | 258 | /* 259 | * Check for potential race with hardware updates of the pte 260 | * (ptep_set_access_flags safely changes valid ptes without going 261 | * through an invalid entry).
262 | */ 263 | VM_WARN_ONCE(!pte_young(pte), 264 | "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", 265 | __func__, pte_val(old_pte), pte_val(pte)); 266 | VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte), 267 | "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx", 268 | __func__, pte_val(old_pte), pte_val(pte)); 269 | } 270 | 271 | static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, 272 | pte_t *ptep, pte_t pte) 273 | { 274 | if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) 275 | __sync_icache_dcache(pte); 276 | 277 | __check_racy_pte_update(mm, ptep, pte); 278 | 279 | set_pte(ptep, pte); 280 | } 281 | 282 | /* 283 | * Huge pte definitions. 284 | */ 285 | #define pte_mkhuge(pte) (__pte(pte_val(pte) & ~PTE_TABLE_BIT)) 286 | 287 | /* 288 | * Hugetlb definitions. 289 | */ 290 | #define HUGE_MAX_HSTATE 4 291 | #define HPAGE_SHIFT PMD_SHIFT 292 | #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) 293 | #define HPAGE_MASK (~(HPAGE_SIZE - 1)) 294 | #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) 295 | 296 | static inline pte_t pgd_pte(pgd_t pgd) 297 | { 298 | return __pte(pgd_val(pgd)); 299 | } 300 | 301 | static inline pte_t pud_pte(pud_t pud) 302 | { 303 | return __pte(pud_val(pud)); 304 | } 305 | 306 | static inline pud_t pte_pud(pte_t pte) 307 | { 308 | return __pud(pte_val(pte)); 309 | } 310 | 311 | static inline pmd_t pud_pmd(pud_t pud) 312 | { 313 | return __pmd(pud_val(pud)); 314 | } 315 | 316 | static inline pte_t pmd_pte(pmd_t pmd) 317 | { 318 | return __pte(pmd_val(pmd)); 319 | } 320 | 321 | static inline pmd_t pte_pmd(pte_t pte) 322 | { 323 | return __pmd(pte_val(pte)); 324 | } 325 | 326 | static inline pgprot_t mk_pud_sect_prot(pgprot_t prot) 327 | { 328 | return __pgprot((pgprot_val(prot) & ~PUD_TABLE_BIT) | PUD_TYPE_SECT); 329 | } 330 | 331 | static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot) 332 | { 333 | return __pgprot((pgprot_val(prot) & ~PMD_TABLE_BIT) | PMD_TYPE_SECT); 334 | } 335 | 336 | #ifdef 
CONFIG_NUMA_BALANCING 337 | /* 338 | * See the comment in include/asm-generic/pgtable.h 339 | */ 340 | static inline int pte_protnone(pte_t pte) 341 | { 342 | return (pte_val(pte) & (PTE_VALID | PTE_PROT_NONE)) == PTE_PROT_NONE; 343 | } 344 | 345 | static inline int pmd_protnone(pmd_t pmd) 346 | { 347 | return pte_protnone(pmd_pte(pmd)); 348 | } 349 | #endif 350 | 351 | /* 352 | * THP definitions. 353 | */ 354 | 355 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE 356 | #define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT)) 357 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 358 | 359 | #define pmd_present(pmd) pte_present(pmd_pte(pmd)) 360 | #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) 361 | #define pmd_young(pmd) pte_young(pmd_pte(pmd)) 362 | #define pmd_valid(pmd) pte_valid(pmd_pte(pmd)) 363 | #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) 364 | #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) 365 | #define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) 366 | #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) 367 | #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) 368 | #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) 369 | #define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID)) 370 | 371 | #define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) 372 | 373 | #define pmd_write(pmd) pte_write(pmd_pte(pmd)) 374 | 375 | #define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT)) 376 | 377 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE 378 | #define pmd_devmap(pmd) pte_devmap(pmd_pte(pmd)) 379 | #endif 380 | static inline pmd_t pmd_mkdevmap(pmd_t pmd) 381 | { 382 | return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP))); 383 | } 384 | 385 | #define __pmd_to_phys(pmd) __pte_to_phys(pmd_pte(pmd)) 386 | #define __phys_to_pmd_val(phys) __phys_to_pte_val(phys) 387 | #define pmd_pfn(pmd) ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT) 388 | #define pfn_pmd(pfn,prot) 
__pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) 389 | #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) 390 | 391 | #define pud_young(pud) pte_young(pud_pte(pud)) 392 | #define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) 393 | #define pud_write(pud) pte_write(pud_pte(pud)) 394 | 395 | #define pud_mkhuge(pud) (__pud(pud_val(pud) & ~PUD_TABLE_BIT)) 396 | 397 | #define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud)) 398 | #define __phys_to_pud_val(phys) __phys_to_pte_val(phys) 399 | #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) 400 | #define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) 401 | 402 | #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) 403 | 404 | #define __pgd_to_phys(pgd) __pte_to_phys(pgd_pte(pgd)) 405 | #define __phys_to_pgd_val(phys) __phys_to_pte_val(phys) 406 | 407 | #define __pgprot_modify(prot,mask,bits) \ 408 | __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) 409 | 410 | /* 411 | * Mark the prot value as uncacheable and unbufferable. 412 | */ 413 | #define pgprot_noncached(prot) \ 414 | __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRnE) | PTE_PXN | PTE_UXN) 415 | #define pgprot_writecombine(prot) \ 416 | __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN) 417 | #define pgprot_device(prot) \ 418 | __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_PXN | PTE_UXN) 419 | /* 420 | * DMA allocations for non-coherent devices use what the Arm architecture calls 421 | * "Normal non-cacheable" memory, which permits speculation, unaligned accesses 422 | * and merging of writes. This is different from "Device-nGnR[nE]" memory which 423 | * is intended for MMIO and thus forbids speculation, preserves access size, 424 | * requires strict alignment and can also force write responses to come from the 425 | * endpoint. 
426 | */ 427 | #define pgprot_dmacoherent(prot) \ 428 | __pgprot_modify(prot, PTE_ATTRINDX_MASK, \ 429 | PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN) 430 | 431 | #define __HAVE_PHYS_MEM_ACCESS_PROT 432 | struct file; 433 | extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 434 | unsigned long size, pgprot_t vma_prot); 435 | 436 | #define pmd_none(pmd) (!pmd_val(pmd)) 437 | 438 | #define pmd_bad(pmd) (!(pmd_val(pmd) & PMD_TABLE_BIT)) 439 | 440 | #define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ 441 | PMD_TYPE_TABLE) 442 | #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ 443 | PMD_TYPE_SECT) 444 | 445 | #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 446 | static inline bool pud_sect(pud_t pud) { return false; } 447 | static inline bool pud_table(pud_t pud) { return true; } 448 | #else 449 | #define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \ 450 | PUD_TYPE_SECT) 451 | #define pud_table(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \ 452 | PUD_TYPE_TABLE) 453 | #endif 454 | 455 | extern pgd_t init_pg_dir[PTRS_PER_PGD]; 456 | extern pgd_t init_pg_end[]; 457 | extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; 458 | extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; 459 | extern pgd_t tramp_pg_dir[PTRS_PER_PGD]; 460 | 461 | extern void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd); 462 | 463 | static inline bool in_swapper_pgdir(void *addr) 464 | { 465 | return ((unsigned long)addr & PAGE_MASK) == 466 | ((unsigned long)swapper_pg_dir & PAGE_MASK); 467 | } 468 | 469 | static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 470 | { 471 | #ifdef __PAGETABLE_PMD_FOLDED 472 | if (in_swapper_pgdir(pmdp)) { 473 | set_swapper_pgd((pgd_t *)pmdp, __pgd(pmd_val(pmd))); 474 | return; 475 | } 476 | #endif /* __PAGETABLE_PMD_FOLDED */ 477 | 478 | WRITE_ONCE(*pmdp, pmd); 479 | 480 | if (pmd_valid(pmd)) { 481 | dsb(ishst); 482 | isb(); 483 | } 484 | } 485 | 486 | static inline void pmd_clear(pmd_t *pmdp) 487 | { 488 | set_pmd(pmdp, __pmd(0)); 489 | 
} 490 | 491 | static inline phys_addr_t pmd_page_paddr(pmd_t pmd) 492 | { 493 | return __pmd_to_phys(pmd); 494 | } 495 | 496 | static inline void pte_unmap(pte_t *pte) { } 497 | 498 | /* Find an entry in the third-level page table. */ 499 | #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) 500 | 501 | #define pte_offset_phys(dir,addr) (pmd_page_paddr(READ_ONCE(*(dir))) + pte_index(addr) * sizeof(pte_t)) 502 | #define pte_offset_kernel(dir,addr) ((pte_t *)__va(pte_offset_phys((dir), (addr)))) 503 | 504 | #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) 505 | 506 | #define pte_set_fixmap(addr) ((pte_t *)set_fixmap_offset(FIX_PTE, addr)) 507 | #define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr)) 508 | #define pte_clear_fixmap() clear_fixmap(FIX_PTE) 509 | 510 | #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(__pmd_to_phys(pmd))) 511 | 512 | /* use ONLY for statically allocated translation tables */ 513 | #define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr)))) 514 | 515 | /* 516 | * Conversion functions: convert a page and protection to a page entry, 517 | * and a page entry and page directory to the page they refer to. 
518 | */ 519 | #define mk_pte(page,prot) pfn_pte(page_to_pfn(page),prot) 520 | 521 | #if CONFIG_PGTABLE_LEVELS > 2 522 | 523 | #define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd)) 524 | 525 | #define pud_none(pud) (!pud_val(pud)) 526 | #define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT)) 527 | #define pud_present(pud) pte_present(pud_pte(pud)) 528 | #define pud_valid(pud) pte_valid(pud_pte(pud)) 529 | 530 | static inline void set_pud(pud_t *pudp, pud_t pud) 531 | { 532 | #ifdef __PAGETABLE_PUD_FOLDED 533 | if (in_swapper_pgdir(pudp)) { 534 | set_swapper_pgd((pgd_t *)pudp, __pgd(pud_val(pud))); 535 | return; 536 | } 537 | #endif /* __PAGETABLE_PUD_FOLDED */ 538 | 539 | WRITE_ONCE(*pudp, pud); 540 | 541 | if (pud_valid(pud)) { 542 | dsb(ishst); 543 | isb(); 544 | } 545 | } 546 | 547 | static inline void pud_clear(pud_t *pudp) 548 | { 549 | set_pud(pudp, __pud(0)); 550 | } 551 | 552 | static inline phys_addr_t pud_page_paddr(pud_t pud) 553 | { 554 | return __pud_to_phys(pud); 555 | } 556 | 557 | /* Find an entry in the second-level page table. 
*/ 558 | #define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) 559 | /* pmd_offset_phys(): physical address of the PMD entry for addr inside the table referenced by *dir; pmd_offset(): that same entry as a pointer obtained through __va() */ 560 | #define pmd_offset_phys(dir, addr) (pud_page_paddr(READ_ONCE(*(dir))) + pmd_index(addr) * sizeof(pmd_t)) 561 | #define pmd_offset(dir, addr) ((pmd_t *)__va(pmd_offset_phys((dir), (addr)))) 562 | 563 | #define pmd_set_fixmap(addr) ((pmd_t *)set_fixmap_offset(FIX_PMD, addr)) 564 | #define pmd_set_fixmap_offset(pud, addr) pmd_set_fixmap(pmd_offset_phys(pud, addr)) 565 | #define pmd_clear_fixmap() clear_fixmap(FIX_PMD) 566 | 567 | #define pud_page(pud) pfn_to_page(__phys_to_pfn(__pud_to_phys(pud))) 568 | 569 | /* use ONLY for statically allocated translation tables */ 570 | #define pmd_offset_kimg(dir,addr) ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr)))) 571 | 572 | #else 573 | 574 | #define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) 575 | 576 | /* Match the pmd_offset folding of the generic folded-PMD header (the header name is missing from this comment; presumably asm-generic/pgtable-nopmd.h -- TODO confirm) */ 577 | #define pmd_set_fixmap(addr) NULL 578 | #define pmd_set_fixmap_offset(pudp, addr) ((pmd_t *)pudp) 579 | #define pmd_clear_fixmap() 580 | 581 | #define pmd_offset_kimg(dir,addr) ((pmd_t *)dir) 582 | 583 | #endif /* CONFIG_PGTABLE_LEVELS > 2 */ 584 | 585 | #if CONFIG_PGTABLE_LEVELS > 3 586 | 587 | #define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud)) 588 | /* NOTE(review): pgd_bad() tests a literal 2 where pmd_bad()/pud_bad() test PMD_TABLE_BIT/PUD_TABLE_BIT; presumably the same table bit -- confirm */ 589 | #define pgd_none(pgd) (!pgd_val(pgd)) 590 | #define pgd_bad(pgd) (!(pgd_val(pgd) & 2)) 591 | #define pgd_present(pgd) (pgd_val(pgd)) 592 | /* Install a PGD entry: an entry located in the page of swapper_pg_dir is handed to set_swapper_pgd(); any other entry is stored with WRITE_ONCE() and followed by dsb(ishst) and isb() */ 593 | static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) 594 | { 595 | if (in_swapper_pgdir(pgdp)) { 596 | set_swapper_pgd(pgdp, pgd); 597 | return; 598 | } 599 | 600 | WRITE_ONCE(*pgdp, pgd); 601 | dsb(ishst); 602 | isb(); 603 | } 604 | /* Clear a PGD entry by installing an all-zero value through set_pgd() */ 605 | static inline void pgd_clear(pgd_t *pgdp) 606 | { 607 | set_pgd(pgdp, __pgd(0)); 608 | } 609 | /* Return the physical address that __pgd_to_phys() extracts from a PGD entry */ 610 | static inline phys_addr_t pgd_page_paddr(pgd_t pgd) 611 | { 612 | return __pgd_to_phys(pgd); 613 | } 614 | 615 | /* Find an entry in the first-level page table.
*/ 616 | #define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) 617 | /* pud_offset_phys(): physical address of the PUD entry for addr inside the table referenced by *dir; pud_offset(): that same entry as a pointer obtained through __va() */ 618 | #define pud_offset_phys(dir, addr) (pgd_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t)) 619 | #define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) 620 | 621 | #define pud_set_fixmap(addr) ((pud_t *)set_fixmap_offset(FIX_PUD, addr)) 622 | #define pud_set_fixmap_offset(pgd, addr) pud_set_fixmap(pud_offset_phys(pgd, addr)) 623 | #define pud_clear_fixmap() clear_fixmap(FIX_PUD) 624 | 625 | #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(__pgd_to_phys(pgd))) 626 | 627 | /* use ONLY for statically allocated translation tables */ 628 | #define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr)))) 629 | 630 | #else 631 | 632 | #define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;}) 633 | 634 | /* Match the pud_offset folding of the generic folded-PUD header (the header name is missing from this comment; presumably asm-generic/pgtable-nopud.h -- TODO confirm) */ 635 | #define pud_set_fixmap(addr) NULL 636 | #define pud_set_fixmap_offset(pgdp, addr) ((pud_t *)pgdp) 637 | #define pud_clear_fixmap() 638 | 639 | #define pud_offset_kimg(dir,addr) ((pud_t *)dir) 640 | 641 | #endif /* CONFIG_PGTABLE_LEVELS > 3 */ 642 | 643 | #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) 644 | 645 | /* to find an entry in a page-table-directory */ 646 | #define pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) 647 | /* pgd_offset_raw(): pointer to the entry for addr in the table that starts at pgd; pgd_offset(): the same lookup applied to mm->pgd */ 648 | #define pgd_offset_raw(pgd, addr) ((pgd) + pgd_index(addr)) 649 | 650 | #define pgd_offset(mm, addr) (pgd_offset_raw((mm)->pgd, (addr))) 651 | 652 | /* to find an entry in a kernel page-table-directory */ 653 | #define pgd_offset_k(addr) pgd_offset(&init_mm, addr) 654 | 655 | #define pgd_set_fixmap(addr) ((pgd_t *)set_fixmap_offset(FIX_PGD, addr)) 656 | #define pgd_clear_fixmap() clear_fixmap(FIX_PGD) 657 | /* pte_modify(): take the bits selected by mask from newprot and keep every other bit of pte; pte_mkdirty() is applied first when pte_hw_dirty() reports a hardware-dirty entry */ 658 | static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) 659 | { 660 | const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | 661 | PTE_PROT_NONE | PTE_VALID | PTE_WRITE; 662 | /* preserve the hardware
dirty information */ 663 | if (pte_hw_dirty(pte)) 664 | pte = pte_mkdirty(pte); 665 | pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask); 666 | return pte; 667 | } 668 | 669 | static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) 670 | { 671 | return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); 672 | } 673 | 674 | #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS 675 | extern int ptep_set_access_flags(struct vm_area_struct *vma, 676 | unsigned long address, pte_t *ptep, 677 | pte_t entry, int dirty); 678 | 679 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE 680 | #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS 681 | static inline int pmdp_set_access_flags(struct vm_area_struct *vma, 682 | unsigned long address, pmd_t *pmdp, 683 | pmd_t entry, int dirty) 684 | { 685 | return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty); 686 | } 687 | 688 | static inline int pud_devmap(pud_t pud) 689 | { 690 | return 0; 691 | } 692 | 693 | static inline int pgd_devmap(pgd_t pgd) 694 | { 695 | return 0; 696 | } 697 | #endif 698 | 699 | /* 700 | * Atomic pte/pmd modifications. 
701 | */ 702 | #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG 703 | static inline int __ptep_test_and_clear_young(pte_t *ptep) 704 | { 705 | pte_t old_pte, pte; 706 | 707 | pte = READ_ONCE(*ptep); 708 | do { 709 | old_pte = pte; 710 | pte = pte_mkold(pte); 711 | pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), 712 | pte_val(old_pte), pte_val(pte)); 713 | } while (pte_val(pte) != pte_val(old_pte)); 714 | 715 | return pte_young(pte); 716 | } 717 | 718 | static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, 719 | unsigned long address, 720 | pte_t *ptep) 721 | { 722 | return __ptep_test_and_clear_young(ptep); 723 | } 724 | 725 | #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH 726 | static inline int ptep_clear_flush_young(struct vm_area_struct *vma, 727 | unsigned long address, pte_t *ptep) 728 | { 729 | int young = ptep_test_and_clear_young(vma, address, ptep); 730 | 731 | if (young) { 732 | /* 733 | * We can elide the trailing DSB here since the worst that can 734 | * happen is that a CPU continues to use the young entry in its 735 | * TLB and we mistakenly reclaim the associated page. The 736 | * window for such an event is bounded by the next 737 | * context-switch, which provides a DSB to complete the TLB 738 | * invalidation. 
739 | */ 740 | flush_tlb_page_nosync(vma, address); 741 | } 742 | 743 | return young; 744 | } 745 | /* PMD variant: casts pmdp to a pte_t pointer and reuses ptep_test_and_clear_young() */ 746 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE 747 | #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG 748 | static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, 749 | unsigned long address, 750 | pmd_t *pmdp) 751 | { 752 | return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); 753 | } 754 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 755 | /* Exchange the entry at ptep with 0 using xchg_relaxed() and return the previous entry; mm and address are not used */ 756 | #define __HAVE_ARCH_PTEP_GET_AND_CLEAR 757 | static inline pte_t ptep_get_and_clear(struct mm_struct *mm, 758 | unsigned long address, pte_t *ptep) 759 | { 760 | return __pte(xchg_relaxed(&pte_val(*ptep), 0)); 761 | } 762 | /* PMD variant: casts pmdp to a pte_t pointer, calls ptep_get_and_clear() and converts the result with pte_pmd() */ 763 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE 764 | #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR 765 | static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, 766 | unsigned long address, pmd_t *pmdp) 767 | { 768 | return pte_pmd(ptep_get_and_clear(mm, address, (pte_t *)pmdp)); 769 | } 770 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 771 | 772 | /* 773 | * ptep_set_wrprotect - mark read-only while transferring potential hardware 774 | * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. 775 | */ 776 | #define __HAVE_ARCH_PTEP_SET_WRPROTECT 777 | static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) 778 | { 779 | pte_t old_pte, pte; 780 | 781 | pte = READ_ONCE(*ptep); 782 | do { 783 | old_pte = pte; 784 | /* 785 | * If hardware-dirty (PTE_WRITE/DBM bit set and PTE_RDONLY 786 | * clear), set the PTE_DIRTY bit.
787 | */ 788 | if (pte_hw_dirty(pte)) 789 | pte = pte_mkdirty(pte); 790 | pte = pte_wrprotect(pte); 791 | pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), 792 | pte_val(old_pte), pte_val(pte)); 793 | } while (pte_val(pte) != pte_val(old_pte)); 794 | } 795 | 796 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE 797 | #define __HAVE_ARCH_PMDP_SET_WRPROTECT 798 | static inline void pmdp_set_wrprotect(struct mm_struct *mm, 799 | unsigned long address, pmd_t *pmdp) 800 | { 801 | ptep_set_wrprotect(mm, address, (pte_t *)pmdp); 802 | } 803 | 804 | #define pmdp_establish pmdp_establish 805 | static inline pmd_t pmdp_establish(struct vm_area_struct *vma, 806 | unsigned long address, pmd_t *pmdp, pmd_t pmd) 807 | { 808 | return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd))); 809 | } 810 | #endif 811 | 812 | /* 813 | * Encode and decode a swap entry: 814 | * bits 0-1: present (must be zero) 815 | * bits 2-7: swap type 816 | * bits 8-57: swap offset 817 | * bit 58: PTE_PROT_NONE (must be zero) 818 | */ 819 | #define __SWP_TYPE_SHIFT 2 820 | #define __SWP_TYPE_BITS 6 821 | #define __SWP_OFFSET_BITS 50 822 | #define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1) 823 | #define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) 824 | #define __SWP_OFFSET_MASK ((1UL << __SWP_OFFSET_BITS) - 1) 825 | 826 | #define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK) 827 | #define __swp_offset(x) (((x).val >> __SWP_OFFSET_SHIFT) & __SWP_OFFSET_MASK) 828 | #define __swp_entry(type,offset) ((swp_entry_t) { ((type) << __SWP_TYPE_SHIFT) | ((offset) << __SWP_OFFSET_SHIFT) }) 829 | 830 | #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) 831 | #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) 832 | 833 | /* 834 | * Ensure that there are not more swap files than can be encoded in the kernel 835 | * PTEs. 
836 | */ 837 | #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS) 838 | 839 | extern int kern_addr_valid(unsigned long addr); 840 | 841 | #include 842 | 843 | /* 844 | * On AArch64, the cache coherency is handled via the set_pte_at() function. 845 | */ 846 | static inline void update_mmu_cache(struct vm_area_struct *vma, 847 | unsigned long addr, pte_t *ptep) 848 | { 849 | /* 850 | * We don't do anything here, so there's a very small chance of 851 | * us retaking a user fault which we just fixed up. The alternative 852 | * is doing a dsb(ishst), but that penalises the fastpath. 853 | */ 854 | } 855 | 856 | #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) 857 | 858 | #ifdef CONFIG_ARM64_PA_BITS_52 859 | #define phys_to_ttbr(addr) (((addr) | ((addr) >> 46)) & TTBR_BADDR_MASK_52) 860 | #else 861 | #define phys_to_ttbr(addr) (addr) 862 | #endif 863 | 864 | /* 865 | * On arm64 without hardware Access Flag, copying from user will fail because 866 | * the pte is old and cannot be marked young. So we always end up with zeroed 867 | * page after fork() + CoW for pfn mappings. We don't always have a 868 | * hardware-managed access flag on arm64. 
869 | */ 870 | static inline bool arch_faults_on_old_pte(void) 871 | { 872 | WARN_ON(preemptible()); 873 | 874 | return !cpu_has_hw_af(); 875 | } 876 | #define arch_faults_on_old_pte arch_faults_on_old_pte 877 | 878 | #endif /* !__ASSEMBLY__ */ 879 | 880 | #endif /* __ASM_PGTABLE_H */ 881 | -------------------------------------------------------------------------------- /lab/04_file_system/04_guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/lab/04_file_system/04_guide.pdf -------------------------------------------------------------------------------- /lab/04_file_system/04_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/lab/04_file_system/04_report.pdf -------------------------------------------------------------------------------- /lab/04_file_system/code/Kconfig: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: GPL-2.0-only 2 | config ROMFS_FS 3 | tristate "ROM file system support" 4 | depends on BLOCK || MTD 5 | ---help--- 6 | This is a very small read-only file system mainly intended for 7 | initial ram disks of installation disks, but it could be used for 8 | other read-only media as well. Read 9 | for details. 10 | 11 | To compile this file system support as a module, choose M here: the 12 | module will be called romfs. Note that the file system of your 13 | root partition (the one containing the directory /) cannot be a 14 | module. 15 | 16 | If you don't know whether you need it, then you don't need it: 17 | answer N. 
18 | 19 | # 20 | # Select the backing stores to be supported 21 | # 22 | choice 23 | prompt "RomFS backing stores" 24 | depends on ROMFS_FS 25 | default ROMFS_BACKED_BY_BLOCK 26 | help 27 | Select the backing stores to be supported. 28 | 29 | config ROMFS_BACKED_BY_BLOCK 30 | bool "Block device-backed ROM file system support" 31 | depends on BLOCK 32 | help 33 | This permits ROMFS to use block devices buffered through the page 34 | cache as the medium from which to retrieve data. It does not allow 35 | direct mapping of the medium. 36 | 37 | If unsure, answer Y. 38 | 39 | config ROMFS_BACKED_BY_MTD 40 | bool "MTD-backed ROM file system support" 41 | depends on MTD=y || (ROMFS_FS=m && MTD) 42 | help 43 | This permits ROMFS to use MTD based devices directly, without the 44 | intercession of the block layer (which may have been disabled). It 45 | also allows direct mapping of MTD devices through romfs files under 46 | NOMMU conditions if the underlying device is directly addressable by 47 | the CPU. 48 | 49 | If unsure, answer Y. 50 | 51 | config ROMFS_BACKED_BY_BOTH 52 | bool "Both the above" 53 | depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD)) 54 | endchoice 55 | 56 | 57 | config ROMFS_ON_BLOCK 58 | bool 59 | default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH 60 | 61 | config ROMFS_ON_MTD 62 | bool 63 | default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH 64 | -------------------------------------------------------------------------------- /lab/04_file_system/code/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Based on `linux-5.5.11/fs/romfs` 3 | # 4 | # Makefile for the linux RomFS filesystem routines. 
5 | # 6 | 7 | obj-$(CONFIG_ROMFS_FS) += romfs.o 8 | romfs-y := storage.o super.o 9 | KDIR := /lib/modules/$(shell uname -r)/build 10 | PWD := $(shell pwd) 11 | 12 | ifneq ($(CONFIG_MMU),y) 13 | romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o 14 | endif 15 | 16 | all: 17 | make -C $(KDIR) M=$(PWD) modules 18 | clean: 19 | make -C $(KDIR) M=$(PWD) clean -------------------------------------------------------------------------------- /lab/04_file_system/code/internal.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 | /* RomFS internal definitions 3 | * 4 | * Copyright © 2007 Red Hat, Inc. All Rights Reserved. 5 | * Written by David Howells (dhowells@redhat.com) 6 | */ 7 | 8 | #include 9 | 10 | struct romfs_inode_info { 11 | struct inode vfs_inode; 12 | unsigned long i_metasize; /* size of non-data area */ 13 | unsigned long i_dataoffset; /* from the start of fs */ 14 | }; 15 | 16 | static inline size_t romfs_maxsize(struct super_block *sb) 17 | { 18 | return (size_t) (unsigned long) sb->s_fs_info; 19 | } 20 | 21 | static inline struct romfs_inode_info *ROMFS_I(struct inode *inode) 22 | { 23 | return container_of(inode, struct romfs_inode_info, vfs_inode); 24 | } 25 | 26 | /* 27 | * mmap-nommu.c 28 | */ 29 | #if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD) 30 | extern const struct file_operations romfs_ro_fops; 31 | #else 32 | #define romfs_ro_fops generic_ro_fops 33 | #endif 34 | 35 | /* 36 | * storage.c 37 | */ 38 | extern int romfs_dev_read(struct super_block *sb, unsigned long pos, 39 | void *buf, size_t buflen); 40 | extern ssize_t romfs_dev_strnlen(struct super_block *sb, 41 | unsigned long pos, size_t maxlen); 42 | extern int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, 43 | const char *str, size_t size); 44 | -------------------------------------------------------------------------------- /lab/04_file_system/code/mmap-nommu.c: 
-------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0-or-later 2 | /* NOMMU mmap support for RomFS on MTD devices 3 | * 4 | * Copyright © 2007 Red Hat, Inc. All Rights Reserved. 5 | * Written by David Howells (dhowells@redhat.com) 6 | */ 7 | 8 | #include 9 | #include 10 | #include "internal.h" 11 | 12 | /* 13 | * try to determine where a shared mapping can be made 14 | * - only supported for NOMMU at the moment (MMU doesn't copy private 15 | * mappings) 16 | * - attempts to map through to the underlying MTD device; returns the value from mtd_get_unmapped_area(), or a negative errno cast to unsigned long (-ENOSYS when there is no MTD device or the MTD layer reports -EOPNOTSUPP, -EINVAL when the request is rejected) 17 | */ 18 | static unsigned long romfs_get_unmapped_area(struct file *file, 19 | unsigned long addr, 20 | unsigned long len, 21 | unsigned long pgoff, 22 | unsigned long flags) 23 | { 24 | struct inode *inode = file->f_mapping->host; 25 | struct mtd_info *mtd = inode->i_sb->s_mtd; 26 | unsigned long isize, offset, maxpages, lpages; 27 | int ret; 28 | 29 | if (!mtd) 30 | return (unsigned long) -ENOSYS; 31 | 32 | /* the mapping mustn't extend beyond the EOF */ 33 | lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; 34 | isize = i_size_read(inode); 35 | offset = pgoff << PAGE_SHIFT; 36 | 37 | maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; 38 | if ((pgoff >= maxpages) || (maxpages - pgoff < lpages)) 39 | return (unsigned long) -EINVAL; 40 | /* only addr == 0 is accepted; any other requested address is rejected */ 41 | if (addr != 0) 42 | return (unsigned long) -EINVAL; 43 | 44 | if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) 45 | return (unsigned long) -EINVAL; 46 | /* convert the file-relative offset into an offset from the start of the image */ 47 | offset += ROMFS_I(inode)->i_dataoffset; 48 | if (offset >= mtd->size) 49 | return (unsigned long) -EINVAL; 50 | /* clamp the length so the mapping does not run past the end of the MTD device */ 51 | if ((offset + len) > mtd->size) 52 | len = mtd->size - offset; 53 | /* NOTE(review): ret is an int that is cast back to unsigned long on return; confirm that every value mtd_get_unmapped_area() can produce survives that round trip on the target */ 54 | ret = mtd_get_unmapped_area(mtd, len, offset, flags); 55 | if (ret == -EOPNOTSUPP) 56 | ret = -ENOSYS; 57 | return (unsigned long) ret; 58 | } 59 | 60 | /* 61 | * permit a R/O mapping to be made directly through onto an MTD device if 62 | * possible
63 | */ 64 | static int romfs_mmap(struct file *file, struct vm_area_struct *vma) 65 | { 66 | return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; 67 | } 68 | 69 | static unsigned romfs_mmap_capabilities(struct file *file) 70 | { 71 | struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd; 72 | 73 | if (!mtd) 74 | return NOMMU_MAP_COPY; 75 | return mtd_mmap_capabilities(mtd); 76 | } 77 | 78 | const struct file_operations romfs_ro_fops = { 79 | .llseek = generic_file_llseek, 80 | .read_iter = generic_file_read_iter, 81 | .splice_read = generic_file_splice_read, 82 | .mmap = romfs_mmap, 83 | .get_unmapped_area = romfs_get_unmapped_area, 84 | .mmap_capabilities = romfs_mmap_capabilities, 85 | }; 86 | -------------------------------------------------------------------------------- /lab/04_file_system/code/src/aa: -------------------------------------------------------------------------------- 1 | aaaaaa -------------------------------------------------------------------------------- /lab/04_file_system/code/src/bb: -------------------------------------------------------------------------------- 1 | abc123 -------------------------------------------------------------------------------- /lab/04_file_system/code/src/ft: -------------------------------------------------------------------------------- 1 | ftftft -------------------------------------------------------------------------------- /lab/04_file_system/code/storage.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0-or-later 2 | /* RomFS storage access routines 3 | * 4 | * Copyright © 2007 Red Hat, Inc. All Rights Reserved. 
5 | * Written by David Howells (dhowells@redhat.com) 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include "internal.h" 12 | 13 | #if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK) 14 | #error no ROMFS backing store interface configured 15 | #endif 16 | 17 | #ifdef CONFIG_ROMFS_ON_MTD 18 | #define ROMFS_MTD_READ(sb, ...) mtd_read((sb)->s_mtd, ##__VA_ARGS__) 19 | 20 | /* 21 | * read data from an romfs image on an MTD device 22 | */ 23 | static int romfs_mtd_read(struct super_block *sb, unsigned long pos, 24 | void *buf, size_t buflen) 25 | { 26 | size_t rlen; 27 | int ret; 28 | 29 | ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf); 30 | return (ret < 0 || rlen != buflen) ? -EIO : 0; 31 | } 32 | 33 | /* 34 | * determine the length of a string in a romfs image on an MTD device 35 | */ 36 | static ssize_t romfs_mtd_strnlen(struct super_block *sb, 37 | unsigned long pos, size_t maxlen) 38 | { 39 | ssize_t n = 0; 40 | size_t segment; 41 | u_char buf[16], *p; 42 | size_t len; 43 | int ret; 44 | 45 | /* scan the string up to 16 bytes at a time */ 46 | while (maxlen > 0) { 47 | segment = min_t(size_t, maxlen, 16); 48 | ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); 49 | if (ret < 0) 50 | return ret; 51 | p = memchr(buf, 0, len); 52 | if (p) 53 | return n + (p - buf); 54 | maxlen -= len; 55 | pos += len; 56 | n += len; 57 | } 58 | 59 | return n; 60 | } 61 | 62 | /* 63 | * compare a string to one in a romfs image on MTD 64 | * - return 1 if matched, 0 if differ, -ve if error 65 | */ 66 | static int romfs_mtd_strcmp(struct super_block *sb, unsigned long pos, 67 | const char *str, size_t size) 68 | { 69 | u_char buf[17]; 70 | size_t len, segment; 71 | int ret; 72 | 73 | /* scan the string up to 16 bytes at a time, and attempt to grab the 74 | * trailing NUL whilst we're at it */ 75 | buf[0] = 0xff; 76 | 77 | while (size > 0) { 78 | segment = min_t(size_t, size + 1, 17); 79 | ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); 80 | if (ret < 0) 
81 | return ret; 82 | len--; 83 | if (memcmp(buf, str, len) != 0) 84 | return 0; 85 | buf[0] = buf[len]; 86 | size -= len; 87 | pos += len; 88 | str += len; 89 | } 90 | 91 | /* check the trailing NUL was present: buf[0] holds the byte read just past the last compared segment */ 92 | if (buf[0]) 93 | return 0; 94 | 95 | return 1; 96 | } 97 | #endif /* CONFIG_ROMFS_ON_MTD */ 98 | 99 | #ifdef CONFIG_ROMFS_ON_BLOCK 100 | /* 101 | * read data from a romfs image on a block device; returns 0 on success or -EIO when sb_bread() fails 102 | */ 103 | static int romfs_blk_read(struct super_block *sb, unsigned long pos, 104 | void *buf, size_t buflen) 105 | { 106 | struct buffer_head *bh; 107 | unsigned long offset; 108 | size_t segment; 109 | 110 | /* copy the data in segments that never cross a ROMBSIZE boundary */ 111 | while (buflen > 0) { 112 | offset = pos & (ROMBSIZE - 1); 113 | segment = min_t(size_t, buflen, ROMBSIZE - offset); 114 | bh = sb_bread(sb, pos >> ROMBSBITS); 115 | if (!bh) 116 | return -EIO; 117 | memcpy(buf, bh->b_data + offset, segment); 118 | brelse(bh); 119 | buf += segment; 120 | buflen -= segment; 121 | pos += segment; 122 | } 123 | 124 | return 0; 125 | } 126 | 127 | /* 128 | * determine the length of a string in romfs on a block device; returns the number of bytes before the first NUL, or limit when no NUL lies within limit bytes, or -EIO when sb_bread() fails 129 | */ 130 | static ssize_t romfs_blk_strnlen(struct super_block *sb, 131 | unsigned long pos, size_t limit) 132 | { 133 | struct buffer_head *bh; 134 | unsigned long offset; 135 | ssize_t n = 0; 136 | size_t segment; 137 | u_char *buf, *p; 138 | 139 | /* scan the string in segments that never cross a ROMBSIZE boundary */ 140 | while (limit > 0) { 141 | offset = pos & (ROMBSIZE - 1); 142 | segment = min_t(size_t, limit, ROMBSIZE - offset); 143 | bh = sb_bread(sb, pos >> ROMBSBITS); 144 | if (!bh) 145 | return -EIO; 146 | buf = bh->b_data + offset; 147 | p = memchr(buf, 0, segment); 148 | brelse(bh); /* p and buf are only used for pointer arithmetic from here on, never dereferenced */ 149 | if (p) 150 | return n + (p - buf); 151 | limit -= segment; 152 | pos += segment; 153 | n += segment; 154 | } 155 | 156 | return n; 157 | } 158 | 159 | /* 160 | * compare a string to one in a romfs image on a block device 161 | * - return 1 if matched, 0 if differ,
-ve if error 162 | */ 163 | static int romfs_blk_strcmp(struct super_block *sb, unsigned long pos, 164 | const char *str, size_t size) 165 | { 166 | struct buffer_head *bh; 167 | unsigned long offset; 168 | size_t segment; 169 | bool matched, terminated = false; 170 | 171 | /* compare string up to a block at a time */ 172 | while (size > 0) { 173 | offset = pos & (ROMBSIZE - 1); 174 | segment = min_t(size_t, size, ROMBSIZE - offset); 175 | bh = sb_bread(sb, pos >> ROMBSBITS); 176 | if (!bh) 177 | return -EIO; 178 | matched = (memcmp(bh->b_data + offset, str, segment) == 0); 179 | 180 | size -= segment; 181 | pos += segment; 182 | str += segment; 183 | if (matched && size == 0 && offset + segment < ROMBSIZE) { 184 | if (!bh->b_data[offset + segment]) 185 | terminated = true; 186 | else 187 | matched = false; 188 | } 189 | brelse(bh); 190 | if (!matched) 191 | return 0; 192 | } 193 | 194 | if (!terminated) { 195 | /* the terminating NUL must be on the first byte of the next 196 | * block */ 197 | BUG_ON((pos & (ROMBSIZE - 1)) != 0); 198 | bh = sb_bread(sb, pos >> ROMBSBITS); 199 | if (!bh) 200 | return -EIO; 201 | matched = !bh->b_data[0]; 202 | brelse(bh); 203 | if (!matched) 204 | return 0; 205 | } 206 | 207 | return 1; 208 | } 209 | #endif /* CONFIG_ROMFS_ON_BLOCK */ 210 | 211 | /* 212 | * read data from the romfs image 213 | */ 214 | int romfs_dev_read(struct super_block *sb, unsigned long pos, 215 | void *buf, size_t buflen) 216 | { 217 | size_t limit; 218 | 219 | limit = romfs_maxsize(sb); 220 | if (pos >= limit) 221 | return -EIO; 222 | if (buflen > limit - pos) 223 | buflen = limit - pos; 224 | 225 | #ifdef CONFIG_ROMFS_ON_MTD 226 | if (sb->s_mtd) 227 | return romfs_mtd_read(sb, pos, buf, buflen); 228 | #endif 229 | #ifdef CONFIG_ROMFS_ON_BLOCK 230 | if (sb->s_bdev) 231 | return romfs_blk_read(sb, pos, buf, buflen); 232 | #endif 233 | return -EIO; 234 | } 235 | 236 | /* 237 | * determine the length of a string in romfs 238 | */ 239 | ssize_t 
romfs_dev_strnlen(struct super_block *sb, 240 | unsigned long pos, size_t maxlen) 241 | { 242 | size_t limit; 243 | 244 | limit = romfs_maxsize(sb); 245 | if (pos >= limit) 246 | return -EIO; 247 | if (maxlen > limit - pos) 248 | maxlen = limit - pos; 249 | 250 | #ifdef CONFIG_ROMFS_ON_MTD 251 | if (sb->s_mtd) 252 | return romfs_mtd_strnlen(sb, pos, maxlen); 253 | #endif 254 | #ifdef CONFIG_ROMFS_ON_BLOCK 255 | if (sb->s_bdev) 256 | return romfs_blk_strnlen(sb, pos, maxlen); 257 | #endif 258 | return -EIO; 259 | } 260 | 261 | /* 262 | * compare a string to one in romfs 263 | * - the string to be compared to, str, may not be NUL-terminated; instead the 264 | * string is of the specified size 265 | * - return 1 if matched, 0 if differ, -ve if error 266 | */ 267 | int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, 268 | const char *str, size_t size) 269 | { 270 | size_t limit; 271 | 272 | limit = romfs_maxsize(sb); 273 | if (pos >= limit) 274 | return -EIO; 275 | if (size > ROMFS_MAXFN) 276 | return -ENAMETOOLONG; 277 | if (size + 1 > limit - pos) 278 | return -EIO; 279 | 280 | #ifdef CONFIG_ROMFS_ON_MTD 281 | if (sb->s_mtd) 282 | return romfs_mtd_strcmp(sb, pos, str, size); 283 | #endif 284 | #ifdef CONFIG_ROMFS_ON_BLOCK 285 | if (sb->s_bdev) 286 | return romfs_blk_strcmp(sb, pos, str, size); 287 | #endif 288 | return -EIO; 289 | } 290 | -------------------------------------------------------------------------------- /lab/04_file_system/code/super.c: -------------------------------------------------------------------------------- 1 | /* Block- or MTD-based romfs 2 | * 3 | * Copyright © 2007 Red Hat, Inc. All Rights Reserved. 
4 | * Written by David Howells (dhowells@redhat.com) 5 | * 6 | * Derived from: ROMFS file system, Linux implementation 7 | * 8 | * Copyright © 1997-1999 Janos Farkas 9 | * 10 | * Using parts of the minix filesystem 11 | * Copyright © 1991, 1992 Linus Torvalds 12 | * 13 | * and parts of the affs filesystem additionally 14 | * Copyright © 1993 Ray Burr 15 | * Copyright © 1996 Hans-Joachim Widmaier 16 | * 17 | * Changes 18 | * Changed for 2.1.19 modules 19 | * Jan 1997 Initial release 20 | * Jun 1997 2.1.43+ changes 21 | * Proper page locking in readpage 22 | * Changed to work with 2.1.45+ fs 23 | * Jul 1997 Fixed follow_link 24 | * 2.1.47 25 | * lookup shouldn't return -ENOENT 26 | * from Horst von Brand: 27 | * fail on wrong checksum 28 | * double unlock_super was possible 29 | * correct namelen for statfs 30 | * spotted by Bill Hawes: 31 | * readlink shouldn't iput() 32 | * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir() 33 | * exposed a problem in readdir 34 | * 2.1.107 code-freeze spellchecker run 35 | * Aug 1998 2.1.118+ VFS changes 36 | * Sep 1998 2.1.122 another VFS change (follow_link) 37 | * Apr 1999 2.2.7 no more EBADF checking in 38 | * lookup/readdir, use ERR_PTR 39 | * Jun 1999 2.3.6 d_alloc_root use changed 40 | * 2.3.9 clean up usage of ENOENT/negative 41 | * dentries in lookup 42 | * clean up page flags setting 43 | * (error, uptodate, locking) in 44 | * in readpage 45 | * use init_special_inode for 46 | * fifos/sockets (and streamline) in 47 | * read_inode, fix _ops table order 48 | * Aug 1999 2.3.16 __initfunc() => __init change 49 | * Oct 1999 2.3.24 page->owner hack obsoleted 50 | * Nov 1999 2.3.27 2.3.25+ page->offset => index change 51 | * 52 | * 53 | * This program is free software; you can redistribute it and/or 54 | * modify it under the terms of the GNU General Public Licence 55 | * as published by the Free Software Foundation; either version 56 | * 2 of the Licence, or (at your option) any later version. 
57 | */ 58 | 59 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 60 | 61 | #include 62 | #include 63 | #include 64 | #include 65 | #include 66 | #include 67 | #include 68 | #include 69 | #include 70 | #include 71 | #include 72 | #include 73 | #include 74 | #include 75 | #include 76 | #include 77 | #include 78 | #include "internal.h" 79 | 80 | // @author: zihan 81 | /* module parameters: three string (charp) values registered with mode 0644. NOTE(review): only encrypted_file_name is consumed by the helpers right below; the other two presumably select a file to hide and a file to mark executable -- confirm against their users */ 82 | #include 83 | 84 | static char *hided_file_name; 85 | static char *encrypted_file_name; 86 | static char *exec_file_name; 87 | 88 | module_param(hided_file_name, charp, 0644); 89 | module_param(encrypted_file_name, charp, 0644); 90 | module_param(exec_file_name, charp, 0644); 91 | 92 | /* 93 | * is_encrypted_file -- report whether inode i is the file named by the encrypted_file_name module parameter. The on-image name is read from ROMFH_SIZE bytes past the start of the inode's non-data area (i_dataoffset - i_metasize), limited to sizeof(fsname) - 1 bytes, NUL-terminated locally and compared with strcmp(). Returns false when either device access fails or when the parameter is unset. 94 | */ 95 | static bool is_encrypted_file(struct inode *i) { 96 | int j, ret; 97 | unsigned long offset; 98 | struct romfs_inode_info *inode; 99 | char fsname[ROMFS_MAXFN]; 100 | 101 | inode = ROMFS_I(i); 102 | offset = (inode->i_dataoffset) - (inode->i_metasize); 103 | /* j is the name length, capped so that the terminator written below always fits in fsname */ 104 | j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE, sizeof(fsname) - 1); 105 | if (j < 0) 106 | return false; 107 | 108 | ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j); 109 | if (ret < 0) 110 | return false; 111 | /* romfs_dev_read() copies raw bytes without a terminator; add one before strcmp() */ 112 | fsname[j] = '\0'; 113 | 114 | if (encrypted_file_name && !strcmp(encrypted_file_name, fsname)) 115 | return true; 116 | else 117 | return false; 118 | } 119 | 120 | /* 121 | * encrypt -- obfuscate buf in place by adding 1 to each of its first fillsize bytes 122 | */ 123 | static void encrypt(char *buf, int fillsize) { 124 | int i; 125 | for (i = 0; i < fillsize; i ++) 126 | buf[i] += 1; 127 | } 128 | // 129 | 130 | static struct kmem_cache *romfs_inode_cachep; 131 | 132 | static const umode_t romfs_modemap[8] = { 133 | 0, /* hard link */ 134 | S_IFDIR | 0644, /* directory */ 135 | S_IFREG | 0644, /* regular file */ 136 | S_IFLNK | 0777, /* symlink */ 137 | S_IFBLK | 0600, /* blockdev */ 138 | S_IFCHR | 0600, /* chardev */ 139 | S_IFSOCK |
0644, /* socket */ 140 | S_IFIFO | 0644 /* FIFO */ 141 | }; 142 | 143 | static const unsigned char romfs_dtype_table[] = { 144 | DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO 145 | }; 146 | 147 | static struct inode *romfs_iget(struct super_block *sb, unsigned long pos); 148 | 149 | /* 150 | * read a page worth of data from the image 151 | */ 152 | static int romfs_readpage(struct file *file, struct page *page) 153 | { 154 | struct inode *inode = page->mapping->host; 155 | loff_t offset, size; 156 | unsigned long fillsize, pos; 157 | void *buf; 158 | int ret; 159 | 160 | // @author: zihan 161 | bool flag; 162 | // 163 | 164 | buf = kmap(page); 165 | if (!buf) 166 | return -ENOMEM; 167 | 168 | /* 32 bit warning -- but not for us :) */ 169 | offset = page_offset(page); 170 | size = i_size_read(inode); 171 | fillsize = 0; 172 | ret = 0; 173 | 174 | // @author: zihan 175 | flag = is_encrypted_file(inode); 176 | // 177 | 178 | if (offset < size) { 179 | size -= offset; 180 | fillsize = size > PAGE_SIZE ? 
PAGE_SIZE : size; 181 | 182 | pos = ROMFS_I(inode)->i_dataoffset + offset; 183 | 184 | ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize); 185 | if (ret < 0) { 186 | SetPageError(page); 187 | fillsize = 0; 188 | ret = -EIO; 189 | } 190 | 191 | // @author: zihan 192 | if (flag && fillsize > 0) 193 | encrypt((char*)buf, fillsize); 194 | // 195 | } 196 | 197 | if (fillsize < PAGE_SIZE) 198 | memset(buf + fillsize, 0, PAGE_SIZE - fillsize); 199 | if (ret == 0) 200 | SetPageUptodate(page); 201 | 202 | flush_dcache_page(page); 203 | kunmap(page); 204 | unlock_page(page); 205 | return ret; 206 | } 207 | 208 | static const struct address_space_operations romfs_aops = { 209 | .readpage = romfs_readpage 210 | }; 211 | 212 | /* 213 | * read the entries from a directory 214 | */ 215 | static int romfs_readdir(struct file *file, struct dir_context *ctx) 216 | { 217 | struct inode *i = file_inode(file); 218 | struct romfs_inode ri; 219 | unsigned long offset, maxoff; 220 | int j, ino, nextfh; 221 | char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ 222 | int ret; 223 | 224 | maxoff = romfs_maxsize(i->i_sb); 225 | 226 | offset = ctx->pos; 227 | if (!offset) { 228 | offset = i->i_ino & ROMFH_MASK; 229 | ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); 230 | if (ret < 0) 231 | goto out; 232 | offset = be32_to_cpu(ri.spec) & ROMFH_MASK; 233 | } 234 | 235 | /* Not really failsafe, but we are read-only... 
*/ 236 | for (;;) { 237 | if (!offset || offset >= maxoff) { 238 | offset = maxoff; 239 | ctx->pos = offset; 240 | goto out; 241 | } 242 | ctx->pos = offset; 243 | 244 | /* Fetch inode info */ 245 | ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); 246 | if (ret < 0) 247 | goto out; 248 | 249 | j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE, 250 | sizeof(fsname) - 1); 251 | if (j < 0) 252 | goto out; 253 | 254 | ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j); 255 | if (ret < 0) 256 | goto out; 257 | fsname[j] = '\0'; 258 | 259 | ino = offset; 260 | nextfh = be32_to_cpu(ri.next); 261 | 262 | // @author: zihan 263 | /* skip fsname == hided_file_name */ 264 | if (hided_file_name && !strcmp(hided_file_name, fsname)) 265 | goto skip; 266 | // 267 | 268 | if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) 269 | ino = be32_to_cpu(ri.spec); 270 | if (!dir_emit(ctx, fsname, j, ino, 271 | romfs_dtype_table[nextfh & ROMFH_TYPE])) 272 | goto out; 273 | // @author: zihan 274 | skip: 275 | // 276 | offset = nextfh & ROMFH_MASK; 277 | } 278 | out: 279 | return 0; 280 | } 281 | 282 | /* 283 | * look up an entry in a directory 284 | */ 285 | static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, 286 | unsigned int flags) 287 | { 288 | unsigned long offset, maxoff; 289 | struct inode *inode = NULL; 290 | struct romfs_inode ri; 291 | const char *name; /* got from dentry */ 292 | int len, ret; 293 | 294 | offset = dir->i_ino & ROMFH_MASK; 295 | ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE); 296 | if (ret < 0) 297 | goto error; 298 | 299 | /* search all the file entries in the list starting from the one 300 | * pointed to by the directory's special data */ 301 | maxoff = romfs_maxsize(dir->i_sb); 302 | offset = be32_to_cpu(ri.spec) & ROMFH_MASK; 303 | 304 | name = dentry->d_name.name; 305 | len = dentry->d_name.len; 306 | 307 | for (;;) { 308 | if (!offset || offset >= maxoff) 309 | break; 310 | 311 | ret = romfs_dev_read(dir->i_sb, 
offset, &ri, sizeof(ri)); 312 | if (ret < 0) 313 | goto error; 314 | 315 | /* try to match the first 16 bytes of name */ 316 | ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name, 317 | len); 318 | if (ret < 0) 319 | goto error; 320 | if (ret == 1) { 321 | /* Hard link handling */ 322 | if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) 323 | offset = be32_to_cpu(ri.spec) & ROMFH_MASK; 324 | inode = romfs_iget(dir->i_sb, offset); 325 | 326 | // @author: zihan 327 | /* add execution permission */ 328 | if (exec_file_name && !strcmp(exec_file_name, name)) 329 | inode->i_mode |= S_IXUGO; 330 | // 331 | 332 | break; 333 | } 334 | 335 | /* next entry */ 336 | offset = be32_to_cpu(ri.next) & ROMFH_MASK; 337 | } 338 | 339 | return d_splice_alias(inode, dentry); 340 | error: 341 | return ERR_PTR(ret); 342 | } 343 | 344 | static const struct file_operations romfs_dir_operations = { 345 | .read = generic_read_dir, 346 | .iterate_shared = romfs_readdir, 347 | .llseek = generic_file_llseek, 348 | }; 349 | 350 | static const struct inode_operations romfs_dir_inode_operations = { 351 | .lookup = romfs_lookup, 352 | }; 353 | 354 | /* 355 | * get a romfs inode based on its position in the image (which doubles as the 356 | * inode number) 357 | */ 358 | static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) 359 | { 360 | struct romfs_inode_info *inode; 361 | struct romfs_inode ri; 362 | struct inode *i; 363 | unsigned long nlen; 364 | unsigned nextfh; 365 | int ret; 366 | umode_t mode; 367 | 368 | /* we might have to traverse a chain of "hard link" file entries to get 369 | * to the actual file */ 370 | for (;;) { 371 | ret = romfs_dev_read(sb, pos, &ri, sizeof(ri)); 372 | if (ret < 0) 373 | goto error; 374 | 375 | /* XXX: do romfs_checksum here too (with name) */ 376 | 377 | nextfh = be32_to_cpu(ri.next); 378 | if ((nextfh & ROMFH_TYPE) != ROMFH_HRD) 379 | break; 380 | 381 | pos = be32_to_cpu(ri.spec) & ROMFH_MASK; 382 | } 383 | 384 | /* determine 
the length of the filename */ 385 | nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN); 386 | if (IS_ERR_VALUE(nlen)) 387 | goto eio; 388 | 389 | /* get an inode for this image position */ 390 | i = iget_locked(sb, pos); 391 | if (!i) 392 | return ERR_PTR(-ENOMEM); 393 | 394 | if (!(i->i_state & I_NEW)) 395 | return i; 396 | 397 | /* precalculate the data offset */ 398 | inode = ROMFS_I(i); 399 | inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; 400 | inode->i_dataoffset = pos + inode->i_metasize; 401 | 402 | set_nlink(i, 1); /* Hard to decide.. */ 403 | i->i_size = be32_to_cpu(ri.size); 404 | i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; 405 | i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; 406 | 407 | /* set up mode and ops */ 408 | mode = romfs_modemap[nextfh & ROMFH_TYPE]; 409 | 410 | switch (nextfh & ROMFH_TYPE) { 411 | case ROMFH_DIR: 412 | i->i_size = ROMFS_I(i)->i_metasize; 413 | i->i_op = &romfs_dir_inode_operations; 414 | i->i_fop = &romfs_dir_operations; 415 | if (nextfh & ROMFH_EXEC) 416 | mode |= S_IXUGO; 417 | break; 418 | case ROMFH_REG: 419 | i->i_fop = &romfs_ro_fops; 420 | i->i_data.a_ops = &romfs_aops; 421 | if (nextfh & ROMFH_EXEC) 422 | mode |= S_IXUGO; 423 | break; 424 | case ROMFH_SYM: 425 | i->i_op = &page_symlink_inode_operations; 426 | inode_nohighmem(i); 427 | i->i_data.a_ops = &romfs_aops; 428 | mode |= S_IRWXUGO; 429 | break; 430 | default: 431 | /* depending on MBZ for sock/fifos */ 432 | nextfh = be32_to_cpu(ri.spec); 433 | init_special_inode(i, mode, MKDEV(nextfh >> 16, 434 | nextfh & 0xffff)); 435 | break; 436 | } 437 | 438 | i->i_mode = mode; 439 | 440 | unlock_new_inode(i); 441 | return i; 442 | 443 | eio: 444 | ret = -EIO; 445 | error: 446 | pr_err("read error for inode 0x%lx\n", pos); 447 | return ERR_PTR(ret); 448 | } 449 | 450 | /* 451 | * allocate a new inode 452 | */ 453 | static struct inode *romfs_alloc_inode(struct super_block *sb) 454 | { 455 | struct 
romfs_inode_info *inode; 456 | 457 | inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); 458 | return inode ? &inode->vfs_inode : NULL; 459 | } 460 | 461 | /* 462 | * return a spent inode to the slab cache 463 | */ 464 | static void romfs_free_inode(struct inode *inode) 465 | { 466 | kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); 467 | } 468 | 469 | /* 470 | * get filesystem statistics 471 | */ 472 | static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) 473 | { 474 | struct super_block *sb = dentry->d_sb; 475 | u64 id = 0; 476 | 477 | /* When calling huge_encode_dev(), 478 | * use sb->s_bdev->bd_dev when, 479 | * - CONFIG_ROMFS_ON_BLOCK defined 480 | * use sb->s_dev when, 481 | * - CONFIG_ROMFS_ON_BLOCK undefined and 482 | * - CONFIG_ROMFS_ON_MTD defined 483 | * leave id as 0 when, 484 | * - CONFIG_ROMFS_ON_BLOCK undefined and 485 | * - CONFIG_ROMFS_ON_MTD undefined 486 | */ 487 | if (sb->s_bdev) 488 | id = huge_encode_dev(sb->s_bdev->bd_dev); 489 | else if (sb->s_dev) 490 | id = huge_encode_dev(sb->s_dev); 491 | 492 | buf->f_type = ROMFS_MAGIC; 493 | buf->f_namelen = ROMFS_MAXFN; 494 | buf->f_bsize = ROMBSIZE; 495 | buf->f_bfree = buf->f_bavail = buf->f_ffree; 496 | buf->f_blocks = 497 | (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS; 498 | buf->f_fsid.val[0] = (u32)id; 499 | buf->f_fsid.val[1] = (u32)(id >> 32); 500 | return 0; 501 | } 502 | 503 | /* 504 | * remounting must involve read-only 505 | */ 506 | static int romfs_reconfigure(struct fs_context *fc) 507 | { 508 | sync_filesystem(fc->root->d_sb); 509 | fc->sb_flags |= SB_RDONLY; 510 | return 0; 511 | } 512 | 513 | static const struct super_operations romfs_super_ops = { 514 | .alloc_inode = romfs_alloc_inode, 515 | .free_inode = romfs_free_inode, 516 | .statfs = romfs_statfs, 517 | }; 518 | 519 | /* 520 | * checksum check on part of a romfs filesystem 521 | */ 522 | static __u32 romfs_checksum(const void *data, int size) 523 | { 524 | const __be32 *ptr = data; 525 | 
__u32 sum; 526 | 527 | sum = 0; 528 | size >>= 2; 529 | while (size > 0) { 530 | sum += be32_to_cpu(*ptr++); 531 | size--; 532 | } 533 | return sum; 534 | } 535 | 536 | /* 537 | * fill in the superblock 538 | */ 539 | static int romfs_fill_super(struct super_block *sb, struct fs_context *fc) 540 | { 541 | struct romfs_super_block *rsb; 542 | struct inode *root; 543 | unsigned long pos, img_size; 544 | const char *storage; 545 | size_t len; 546 | int ret; 547 | 548 | #ifdef CONFIG_BLOCK 549 | if (!sb->s_mtd) { 550 | sb_set_blocksize(sb, ROMBSIZE); 551 | } else { 552 | sb->s_blocksize = ROMBSIZE; 553 | sb->s_blocksize_bits = blksize_bits(ROMBSIZE); 554 | } 555 | #endif 556 | 557 | sb->s_maxbytes = 0xFFFFFFFF; 558 | sb->s_magic = ROMFS_MAGIC; 559 | sb->s_flags |= SB_RDONLY | SB_NOATIME; 560 | sb->s_time_min = 0; 561 | sb->s_time_max = 0; 562 | sb->s_op = &romfs_super_ops; 563 | 564 | #ifdef CONFIG_ROMFS_ON_MTD 565 | /* Use same dev ID from the underlying mtdblock device */ 566 | if (sb->s_mtd) 567 | sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, sb->s_mtd->index); 568 | #endif 569 | /* read the image superblock and check it */ 570 | rsb = kmalloc(512, GFP_KERNEL); 571 | if (!rsb) 572 | return -ENOMEM; 573 | 574 | sb->s_fs_info = (void *) 512; 575 | ret = romfs_dev_read(sb, 0, rsb, 512); 576 | if (ret < 0) 577 | goto error_rsb; 578 | 579 | img_size = be32_to_cpu(rsb->size); 580 | 581 | if (sb->s_mtd && img_size > sb->s_mtd->size) 582 | goto error_rsb_inval; 583 | 584 | sb->s_fs_info = (void *) img_size; 585 | 586 | if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || 587 | img_size < ROMFH_SIZE) { 588 | if (!(fc->sb_flags & SB_SILENT)) 589 | errorf(fc, "VFS: Can't find a romfs filesystem on dev %s.\n", 590 | sb->s_id); 591 | goto error_rsb_inval; 592 | } 593 | 594 | if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { 595 | pr_err("bad initial checksum on dev %s.\n", sb->s_id); 596 | goto error_rsb_inval; 597 | } 598 | 599 | storage = sb->s_mtd ? 
"MTD" : "the block layer"; 600 | 601 | len = strnlen(rsb->name, ROMFS_MAXFN); 602 | if (!(fc->sb_flags & SB_SILENT)) 603 | pr_notice("Mounting image '%*.*s' through %s\n", 604 | (unsigned) len, (unsigned) len, rsb->name, storage); 605 | 606 | kfree(rsb); 607 | rsb = NULL; 608 | 609 | /* find the root directory */ 610 | pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; 611 | 612 | root = romfs_iget(sb, pos); 613 | if (IS_ERR(root)) 614 | return PTR_ERR(root); 615 | 616 | sb->s_root = d_make_root(root); 617 | if (!sb->s_root) 618 | return -ENOMEM; 619 | 620 | return 0; 621 | 622 | error_rsb_inval: 623 | ret = -EINVAL; 624 | error_rsb: 625 | kfree(rsb); 626 | return ret; 627 | } 628 | 629 | /* 630 | * get a superblock for mounting 631 | */ 632 | static int romfs_get_tree(struct fs_context *fc) 633 | { 634 | int ret = -EINVAL; 635 | 636 | #ifdef CONFIG_ROMFS_ON_MTD 637 | ret = get_tree_mtd(fc, romfs_fill_super); 638 | #endif 639 | #ifdef CONFIG_ROMFS_ON_BLOCK 640 | if (ret == -EINVAL) 641 | ret = get_tree_bdev(fc, romfs_fill_super); 642 | #endif 643 | return ret; 644 | } 645 | 646 | static const struct fs_context_operations romfs_context_ops = { 647 | .get_tree = romfs_get_tree, 648 | .reconfigure = romfs_reconfigure, 649 | }; 650 | 651 | /* 652 | * Set up the filesystem mount context. 
653 | */ 654 | static int romfs_init_fs_context(struct fs_context *fc) 655 | { 656 | fc->ops = &romfs_context_ops; 657 | return 0; 658 | } 659 | 660 | /* 661 | * destroy a romfs superblock in the appropriate manner 662 | */ 663 | static void romfs_kill_sb(struct super_block *sb) 664 | { 665 | #ifdef CONFIG_ROMFS_ON_MTD 666 | if (sb->s_mtd) { 667 | kill_mtd_super(sb); 668 | return; 669 | } 670 | #endif 671 | #ifdef CONFIG_ROMFS_ON_BLOCK 672 | if (sb->s_bdev) { 673 | kill_block_super(sb); 674 | return; 675 | } 676 | #endif 677 | } 678 | 679 | static struct file_system_type romfs_fs_type = { 680 | .owner = THIS_MODULE, 681 | .name = "romfs", 682 | .init_fs_context = romfs_init_fs_context, 683 | .kill_sb = romfs_kill_sb, 684 | .fs_flags = FS_REQUIRES_DEV, 685 | }; 686 | MODULE_ALIAS_FS("romfs"); 687 | 688 | /* 689 | * inode storage initialiser 690 | */ 691 | static void romfs_i_init_once(void *_inode) 692 | { 693 | struct romfs_inode_info *inode = _inode; 694 | 695 | inode_init_once(&inode->vfs_inode); 696 | } 697 | 698 | /* 699 | * romfs module initialisation 700 | */ 701 | static int __init init_romfs_fs(void) 702 | { 703 | int ret; 704 | 705 | pr_info("ROMFS MTD (C) 2007 Red Hat, Inc.\n"); 706 | 707 | romfs_inode_cachep = 708 | kmem_cache_create("romfs_i", 709 | sizeof(struct romfs_inode_info), 0, 710 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | 711 | SLAB_ACCOUNT, romfs_i_init_once); 712 | 713 | if (!romfs_inode_cachep) { 714 | pr_err("Failed to initialise inode cache\n"); 715 | return -ENOMEM; 716 | } 717 | ret = register_filesystem(&romfs_fs_type); 718 | if (ret) { 719 | pr_err("Failed to register filesystem\n"); 720 | goto error_register; 721 | } 722 | return 0; 723 | 724 | error_register: 725 | kmem_cache_destroy(romfs_inode_cachep); 726 | return ret; 727 | } 728 | 729 | /* 730 | * romfs module removal 731 | */ 732 | static void __exit exit_romfs_fs(void) 733 | { 734 | unregister_filesystem(&romfs_fs_type); 735 | /* 736 | * Make sure all delayed rcu free inodes 
are flushed before we 737 | * destroy cache. 738 | */ 739 | rcu_barrier(); 740 | kmem_cache_destroy(romfs_inode_cachep); 741 | } 742 | 743 | module_init(init_romfs_fs); 744 | module_exit(exit_romfs_fs); 745 | 746 | MODULE_DESCRIPTION("Direct-MTD Capable RomFS"); 747 | MODULE_AUTHOR("Red Hat, Inc."); 748 | MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */ 749 | -------------------------------------------------------------------------------- /lab/04_file_system/code/test.sh: -------------------------------------------------------------------------------- 1 | genromfs -V "vromfs" -f lab4.img -d src 2 | insmod romfs.ko hided_file_name=aa encrypted_file_name=bb exec_file_name=ft 3 | mount -o loop lab4.img tgt 4 | ls -l tgt 5 | umount tgt 6 | rmmod romfs -------------------------------------------------------------------------------- /lab/05_syscall_hijack/05_guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/lab/05_syscall_hijack/05_guide.pdf -------------------------------------------------------------------------------- /lab/05_syscall_hijack/05_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/lab/05_syscall_hijack/05_report.pdf -------------------------------------------------------------------------------- /lab/05_syscall_hijack/code/Makefile: -------------------------------------------------------------------------------- 1 | obj-m := sys_clone_hook.o 2 | KDIR:=/lib/modules/$(shell uname -r)/build 3 | PWD:=$(shell pwd) 4 | all: 5 | make -C $(KDIR) M=$(PWD) modules 6 | clean: 7 | make -C $(KDIR) M=$(PWD) clean -------------------------------------------------------------------------------- /lab/05_syscall_hijack/code/samples/arm64/bench.o: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/lab/05_syscall_hijack/code/samples/arm64/bench.o -------------------------------------------------------------------------------- /lab/05_syscall_hijack/code/samples/arm64/test.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/lab/05_syscall_hijack/code/samples/arm64/test.o -------------------------------------------------------------------------------- /lab/05_syscall_hijack/code/samples/x86-64/bench.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/lab/05_syscall_hijack/code/samples/x86-64/bench.o -------------------------------------------------------------------------------- /lab/05_syscall_hijack/code/samples/x86-64/test.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/lab/05_syscall_hijack/code/samples/x86-64/test.o -------------------------------------------------------------------------------- /lab/05_syscall_hijack/code/sys_clone_hook.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | MODULE_LICENSE("GPL"); 7 | MODULE_AUTHOR("Zihan Liu"); 8 | MODULE_DESCRIPTION("a LKM to hook sys_clone"); 9 | 10 | typedef void (*sys_call_ptr_t)(void); 11 | typedef asmlinkage long (*sys_clone_t)(unsigned long, unsigned long, int __user *, 12 | int __user *, unsigned long); 13 | 14 | sys_call_ptr_t *syscall_table = NULL; 15 | sys_clone_t orig_clone = NULL; 16 | unsigned int level; 17 | pte_t 
*pte; 18 | 19 | 20 | asmlinkage long hooked_sys_clone(unsigned long x1, unsigned long x2, int __user *x3, 21 | int __user *x4, unsigned long x5) { 22 | long ret_val = orig_clone(x1, x2, x3, x4, x5); 23 | printk(KERN_INFO "hello, I have hacked this syscall"); 24 | return ret_val; 25 | } 26 | 27 | 28 | static int __init sys_clone_hook_init(void) { 29 | syscall_table = (sys_call_ptr_t *)kallsyms_lookup_name("sys_call_table"); 30 | 31 | // save the original syscall handler 32 | orig_clone = (sys_clone_t)syscall_table[__NR_clone]; 33 | 34 | // unprotect syscall_table memory page 35 | pte = lookup_address((unsigned long)syscall_table, &level); 36 | 37 | // change PTE to allow writing 38 | set_pte_atomic(pte, pte_mkwrite(*pte)); 39 | 40 | // overwrite the __NR_clone entry with address to our hook 41 | syscall_table[__NR_clone] = (sys_call_ptr_t)hooked_sys_clone; 42 | 43 | // reprotect page 44 | set_pte_atomic(pte, pte_clear_flags(*pte, _PAGE_RW)); 45 | 46 | printk(KERN_INFO "installed sys_clone_hook"); 47 | return 0; 48 | } 49 | 50 | 51 | static void __exit sys_clone_hook_exit(void) { 52 | // change PTE to allow writing 53 | set_pte_atomic(pte, pte_mkwrite(*pte)); 54 | 55 | // restore syscall_table to the original state 56 | syscall_table[__NR_clone] = (sys_call_ptr_t)orig_clone; 57 | 58 | // reprotect page 59 | set_pte_atomic(pte, pte_clear_flags(*pte, _PAGE_RW)); 60 | 61 | printk(KERN_INFO "uninstalled sys_clone_hook"); 62 | } 63 | 64 | 65 | module_init(sys_clone_hook_init); 66 | module_exit(sys_clone_hook_exit); 67 | -------------------------------------------------------------------------------- /notes/LK_note_00.md: -------------------------------------------------------------------------------- 1 | # LK Note 00 2 | > SJTU-CS353 Linux Kernel 3 | 4 | > Refer to the slides of Prof. Quan Chen, Dept. of CSE, SJTU. 
5 | ## Process Management on Multi-processor System: Scheduling Domain 6 | * 单 CPU 上的进程调度 7 | * CPU 使用 `schedule()` 函数从本地 runqueue 中挑选进程运行 8 | * 考虑的是进程对 CPU 使用的需求,通过设置 priority 实现 9 | * 多 CPU 上的进程调度 10 | * 一个 CPU 只从本地 runqueue 中挑选进程,一个可运行进程只在一个队列中存在 11 | * 内核周期性检查各 CPU 运行队列是否平衡,必要时将进程迁往其它队列 12 | * 考虑的是各 CPU 的**负载均衡**,通过**调度域**实现 13 | ### (1)调度域的组成 14 | * 调度域是一个 CPU 集合,这些 CPU 的工作量由内核保持平衡 15 | * **调度域分级组织** 16 | * 每一级包含具有相同属性的 CPU(同一个物理核,同一个 NUMA 节点) 17 | * 每一级 CPU 之间进行负载均衡的开销不同 18 | * 同一物理核的逻辑 CPU < 不同物理核的逻辑 CPU 19 | * 同一 NUMA 节点的 CPU < 不同 NUMA 节点的 CPU 20 | * 内核自下而上进行 CPU 负载均衡(优先进行开销小的均衡) 21 | * 考虑一个 2 sockets 的机器(2 NUMA nodes),每个 socket 上安装了一块 2 cores 的处理器,每个物理 core 支持 2-SMT 超线程 22 | * *SMT(Simultaneous Multithreading,同步多线程),在一个 CPU 的时钟周期内执行来自多个线程的指令的硬件多线程技术。本质上,SMT 是一种将多 CPU 上的 TLP 转化为单 CPU 上的 ILP 的方法。SMT 是单 CPU 从多个硬件线程上下文同时分派指令的能力* 23 | 24 |

25 | 26 | ### (2)基于调度域的负载均衡过程 27 | * 每个调度域被划分为一个或多个**调度组**,每个组是该域 CPU 集合的一个子集 28 | * 负载均衡是在调度组之间完成的 29 | * 负载均衡过程 30 | * 注册软中断 `SCHED_SOFTIRQ`,当一个 CPU 的软中断触发时,**该 CPU** 检查是否需要进行负载均衡 31 | * 该 CPU 从最底层调度域开始,自下而上遍历调度域(保证各层级组间负载均衡) 32 | * 计算该层域内所有调度组的平均负载,找到最繁忙的调度组,以及该组最繁忙的队列 33 | * 若本 CPU 在该域中**所在调度组平均负载**低于最繁忙的组,**从最繁忙的队列中迁移一些进程到本 CPU 上** 34 | 35 |

36 | 37 | 38 | 39 | ## Power Management: From Linux Kernel to Android 40 | 41 | ### (1)Linux 电源管理 42 | 43 | * 两种电源管理标准: 44 | * 高级能源管理(APM,Advanced Power Management) 45 | * 高级配置和能源接口(ACPI,Advanced Configuration and Power Interface) 46 | * **高级能源管理(APM)** 47 | * 由 BIOS 控制,通过活动超时来确定何时关闭设备 48 | * 两种方式: 49 | * 从 APM 驱动程序发起函数调用,向 BIOS 请求更改电源状态 50 | * 根据设备的活动自动进行管理 51 | 52 |

53 | 54 | * **APM 中的电源状态**:设备响应度由高到低、能耗由高到低 55 | * Full On 56 | * APM Enabled(开启节能) 57 | * APM Standby(待机) 58 | * APM Suspend(挂起) 59 | * Sleep/Hibernation(睡眠) 60 | * Off 61 | * **高级配置和能源接口(ACPI)** 62 | * 由 BIOS 和 OS 协同完成,通过 OS 进行调整决策 63 | * **4 种全局状态 G,6 种睡眠状态 S**:设备响应度由高到低、能耗由高到低 64 | * **G0**(S0)**:Working**,包括离开模式(awaymode) 65 | * **G1:Sleeping** 66 | * S1:CPU 和 RAM 持续供电,不使用的 devices 停止供电。刷新 CPU 的 Cache,CPU 停止执行指令 67 | * S2:CPU 停止供电,将 dirty cache 刷新到 RAM 中 68 | * S3:待机,系统挂起到 RAM,只有 RAM 在供电 69 | * S4:休眠,挂起到磁盘,RAM 停止供电,全部保存到非易失性存储器(硬盘)上,并断电 70 | * **G2**(S5)**:Soft Off**(网卡供电,可通过网络唤醒机器) 71 | * **G3:Mechanical Off**(彻底断电) 72 | * Legacy State:OS 不支持 ACPI 的状态,实际上已经禁用了 ACPI 73 | 74 |

75 | 76 | * **Linux 电源管理命令** 77 | * 电源模式接口在 sysfs 中(虚拟文件系统):`/sys/power/state` 78 | 79 | ```shell 80 | echo standby > /sys/power/state 81 | echo mem > /sys/power/state -S3 82 | ``` 83 | 84 |

85 | 86 | ### (2)Android 电源管理 87 | 88 | * Android 电源管理为移动设备(默认处于关闭状态)设计,建立在 Linux 电源管理之上,目标是延长电池寿命 89 | * 应用程序和服务,必须通过 Android 应用程序框架和本机 Linux 库使用 **唤醒锁(Wake Lock)** 来请求 CPU 资源,以保持通电,否则 Android 关闭 CPU 90 | * Android 电源管理使用**唤醒锁**和**超时机制**来切换系统电源状态,从而降低系统能耗 91 | * 唤醒锁分为 92 | * **完全唤醒锁**(Full Wake Lock):包括 CPU、LCD 屏幕、键盘等 93 | * **部分唤醒锁**(Partial Wake Lock):仅 CPU 94 | * **Android 电源管理架构** 95 | * 用户空间的应用程序,使用电源管理框架的 PowerManager Java 类,来控制设备电源状态 96 | * 创建、获取唤醒锁,执行操作,释放唤醒锁 97 | * 若没有活动的唤醒锁,CPU 会被关掉;若有部分唤醒锁,屏幕和键盘会被关掉 98 | 99 |

100 | 101 | * **电源管理状态机:AWAKE / NOTIFICATION / SLEEP 三态** 102 | * 获得完全唤醒锁 or 发生屏幕/键盘触摸活动事件 -> AWAKE 状态 103 | * 超时 or 按下电源键 -> NOTIFICATION 状态 104 | * 获得部分唤醒锁 -> 保持 NOTIFICATION 状态 105 | * 释放所有部分唤醒锁 -> SLEEP 状态 106 | 107 |

108 | 109 | * **内核唤醒锁**:内核中一个 Full Wake Lock,一个 Partial Wake Lock,用于防止系统进入挂起或低功耗状态,可以从内核获取/释放 110 | 111 | ### (3)预挂起 & 后唤醒 112 | 113 | * 内核挂起前,首先调用**预挂起(Early Suspend)**,用于关闭屏幕等能耗大的设备 114 | * 内核唤醒后,最后调用**后唤醒(Late Resume)**,用于开启屏幕等能耗大的设备 115 | * 有助于进一步节能 116 | * 任何驱动程序都可以注册自己的预挂起和后唤醒处理程序 117 | 118 |

119 | 120 | ## New Directions of Operating System Kernel 121 | 122 | * 内核分类: 123 | * 宏内核(Monolithic Kernel) 124 | * 微内核(Microkernel) 125 | * 外核(Exokernel) 126 | * 单内核(Unikernel) 127 | * 多内核(Multikernel) 128 | 129 |

130 | 131 | * **宏内核**:所有的内核代码编译成一个二进制文件,运行在一个大的内核地址空间中。内核代码可以直接访问和调用,效率高、性能好 132 | 133 |

134 | 135 | * **微内核**:把 OS 分成多个独立的功能模块(类似 microservice),模块之间的通信通过消息来完成(通信性能差,需要硬件支持) 136 | 137 |

138 | 139 | * 微内核的优缺点 140 | * 优点 141 | 1. 易扩展:直接添加一个用户进程,即可为 OS 添加服务 142 | 2. 易移植:大部分模块与底层硬件无关 143 | 3. 更可靠:在内核模式运行的代码量大大减少 144 | 4. 更安全:即使存在漏洞,服务之间也存在进程粒度隔离 145 | 5. 更健壮:单个模块出现问题,不会影响系统整体 146 | * 缺点: 147 | 1. 性能差:内核中的模块交互由函数调用变成了进程间通信 148 | 2. 生态差:尚未形成 Linux 这样的具有广泛开发者的社区 149 | 3. 难兼容:重用宏内核 OS 提供兼容性,但带来新问题 150 | * **外核 Exokernel + LibOS** 151 | * **Exokernel 不提供硬件抽象**,只有应用才知道最适合的抽象(end-to-end 原则) 152 | * **Exokernel 不管理资源,只管理应用** 153 | * 负责资源与应用之间的绑定(Resource Binding),以及资源的回收 154 | * 保证多个应用之间的隔离 155 | * **OS = 服务应用 + 管理应用** 156 | * **服务应用:LibOS,用户态** 157 | * **管理应用:Exokernel,内核态** 158 | * Exokernel 的功能:追踪资源的拥有权,保证资源的保护,回收对资源的访问权(应用管理的角色) 159 | * **End-to-End —— 将 LibOS 与 Resource 绑定** 160 | * 可用性:允许某个 LibOS 访问某些 Resource 161 | * 隔离性:防止这些资源被其它 LibOS 访问 162 | * **LibOS** 163 | * 策略与机制分离:对硬件的抽象以库的形式提供 164 | * 高度定制化:不同应用使用不同的 LibOS,或完全自定义 165 | * 更高性能:LibOS 与应用的其它代码之间通过函数调用直接交互 166 | * 优缺点 167 | * 优点: 168 | * OS 无抽象,能在理论上提供最佳性能 169 | * 应用对计算有更精确的实时控制 170 | * LibOS 在用户态更易调试,调试周期更短 171 | * 缺点: 172 | * **对计算资源的利用效率主要由应用决定** 173 | * 定制化过多,导致维护难度增加 174 | 175 |

176 | 177 | * **单内核 Unikernel** 178 | * 虚拟化环境下的 LibOS 179 | * 每个虚拟机都使用内核态:运行一个应用 + LibOS 180 | * 通过虚拟化层实现不同实例之间的隔离 181 | * 适合容器(Container)等新的应用场景 182 | * 每个容器是一个虚拟机,运行定制的 LibOS 来提高性能 183 | * **多内核 Multikernel** 184 | * OS 内部需要维护很多共享状态,多内核的思路是划分而不是共享:维护多份状态的 Copy 185 | * 显式的核间通信机制,每个 core 上运行一个小内核(CPU/GPU/...),整体上 OS 是一个分布式系统,应用程序依然运行在 OS 上 186 | 187 |

188 | -------------------------------------------------------------------------------- /notes/LK_note_01.md: -------------------------------------------------------------------------------- 1 | # LK Note 01 2 | > SJTU-CS353 Linux Kernel 3 | 4 | > Refer to the slides of Prof. Quan Chen, Dept. of CSE, SJTU. 5 | ## Lec 1. Introduction to Linux Kernel 6 | ### (1)What is Linux ? 7 | * UNIX-like OS 8 | * Features: 9 | * 抢占式多任务(preemptive multitasking) 10 | * 虚拟内存(virtual memory)、分页机制(paging) 11 | * 共享库(shared libraries) 12 | * 对称多处理支持(symmetric multi-processing) 13 | * 按需加载的模块(demand-loading kernel modules) 14 | * 写时拷贝(copy-on-write,shared page between parent and child) 15 | * TCP/IP 网络(TCP/IP networking) 16 | * 开源(open source) 17 | ### (2)Linux 现状 18 |

19 | 20 | * **Linux 发行版(Distribution)** 21 | * **Red Hat 系列** 22 | * CentOS、Fedora Core 等 23 | * **Debian 系列** 24 | * Debian、Ubuntu 等 25 | * 特有的 apt-get/dpkg 软件包管理工具 26 | * ***Huawei OpenEuler*** 27 |

28 | 29 | ### (3)What is Linux Kernel ? 30 | * **Linux 两种运行模式**:用户态、内核态 31 |

32 | 33 | * **Linux Kernel 功能**: 34 | * 执行体、系统监视器 35 | * 控制对硬件资源的访问 36 | * 实现文件和设备的基本抽象 37 | * process/file/devices/... 38 | * 调度和分配系统资源 39 | * memory/CPU/disk/descriptors/... 40 | * 增强系统的安全性 41 | * 响应用户的服务请求:syscall 42 | 43 | * **Linux Kernel 设计目标** 44 | * performance, efficiency, speed, utilize resources to capacity with low overhead 45 | * robustness, resilience, graceful degradation 46 | * features, flexibility 47 | * security, protect user from each other & system from bad users 48 | * scalability, compatibility 49 | 50 | * **微内核(Microkernel)**:把 OS 分成多个独立的功能模块,模块之间的访问通过消息来完成(HarmonyOS) 51 | 52 | 

53 | 54 | * **宏内核(Monolithic Kernel)**:把所有的 Kernel 代码编译成一个二进制文件,运行在一个大的内核地址空间中,Kernel 代码可以直接访问和调用,效率高、性能好(**Linux**) 55 | * Linux Kernel 设计融合了宏内核和微内核的优点:模块化设计、抢占式内核、动态加载的内核模块 56 | 57 |

58 | 59 | ### (4)Linux 源码树 60 | * **Linux 源码树结构**(`/usr/src/linux` 2.6 or earlier) 61 |

62 | 63 | * **[Linux 源代码解析](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/pdfs/linux_source.pdf)** 64 | 65 | * **[Linux 系统代码解析(安装后)](https://github.com/zhliuworks/Linux-Kernel-notes/blob/master/notes/pdfs/linux_system_installed.pdf)** 66 | -------------------------------------------------------------------------------- /notes/LK_note_02.md: -------------------------------------------------------------------------------- 1 | # LK Note 02 2 | > SJTU-CS353 Linux Kernel 3 | 4 | > Refer to the slides of Prof. Quan Chen, Dept. of CSE, SJTU. 5 | ## Lec 2. Linux Booting 6 | 7 |

8 | 9 | ### (1)系统启动(System Startup) 10 | * **Boot Sequence** 是计算机加载操作系统的一系列操作 11 | 1. Turn on 12 | 2. CPU 跳转到特定 BIOS 地址(ROM 预加载到 RAM 0xffff0) 13 | 3. BIOS 运行 POST 指令(Power-On Self Test) 14 | 4. 寻找可启动设备(e.g. 磁盘) 15 | 5. 从 MBR 中加载和执行启动扇区 16 | 6. 加载 OS 17 | 18 | * **BIOS(Basic Input/Output System)** 19 | * 计算机加电启动后,首先执行的软件代码段 20 | * 嵌入在主板上的特定芯片中 21 | * 功能:识别和控制组成计算机的各种设备 22 | 23 | ### (2)引导加载程序(BootLoader) 24 | * **主引导记录(Master Boot Record,MBR)** 25 | * OS 从包含 MBR 的硬盘启动,而 MBR 包含 Master BootLoader(主引导加载程序) 26 | * **MBR 是一个 512 B 的扇区,位于磁盘的第一个扇区** 27 | * 如果一个硬盘的主引导扇区有故障,不能作为启动盘,但可以作为数据盘 28 | * MBR 加载到内存后,BIOS 将控制权交给 MBR 29 | 30 | * **MBR 结构** 31 | * 前 446 字节 —— **引导加载程序**(BootLoader),包含可执行的代码,以及错误信息 32 | * 又称:Kernel 加载程序,主要任务是将 Linux Kernel 加载到内存中 33 | * 常见 Linux BootLoader:**GRUB**、**LILO** 34 | * 64 字节 —— **磁盘分区表**,4 个分区中每一个均有一个记录项 35 | * 2 字节 —— **Magic Number**(0xaa55/0x55aa),检查 MBR 的有效性 36 | 37 | 

38 | 39 | * **GRUB** 40 | * 与 OS 无关的 BootLoader 41 | * 启动流程 42 | * 从 Boot Sequence 的第 4 步 —— 寻找可启动设备开始 43 | 44 | | 阶段 | 控制权 | 描述 | 45 | | ---- | ------ | ------------------------------------------------------------------------------------------------- | 46 | | 1 | BIOS | BIOS 寻找可启动设备,将计算机控制权交给 MBR | 47 | | 2 | MBR | GRUB 阶段 1 :由于 MBR 非常小(446 B),阶段 1 仅仅加载 GRUB 下一阶段的代码 | 48 | | 3 | MBR | GRUB 阶段 1.5 :阶段 1.5 代码位于硬盘上紧接着 MBR 后面的 30 KB 空间中,阶段 1.5 加载阶段 2 的代码 | 49 | | 4 | GRUB | GRUB 阶段 2 :得到计算机的控制权,向用户显示 GRUB 启动菜单,用户可以手动指定启动参数 | 50 | | 5 | Kernel | GRUB 加载用户选择的/默认的 Kernel 到内存中,将控制权交给 Kernel | 51 | 52 | ### (3)内核加载 53 | * **内核映像(Kernel Image)** 54 | * Kernel 常驻内存,直到关机 55 | * Kernel Image 不是一个可执行的 Kernel,而是一个压缩后的文件 56 | * Kernel Image 读取到内存后快速解压 57 | * 如果不压缩,则文件较大,读取时间长,而且嵌入式设备空间不足 58 | * zImage、bzImage 压缩后分别小于、大于 512 KB 59 | * 主要函数调用流程 60 | 61 |

62 | 63 | ### (4)用户态启动:启动更多服务进程 64 | * 三种启动方式:SysV、Upstart、Systemd 65 | 66 | * **Sys-V** 67 | * **init 进程(1 号进程)** 68 | * 注意:idle 进程为 0 号进程(以前为 scheduler) 69 | * **init 进程是 Linux 中所有进程的根进程**(进程树的根节点),用来启动所有其它进程,关机时控制其它进程的回收顺序(处理父子进程依赖关系) 70 | * init 进程启动的第一个进程是:`/etc/rc.d/rc.sysinit`(启动守护进程) 71 | * 基于合适的运行级别,开始执行不同的脚本启动各种进程 72 | * init 进程启动 `/etc/inittab` 文件中所有需要启动的系统进程 73 | 74 | * **运行级别(Run-level)** 75 | * `/etc/rc.d/rc#.d` 脚本(0-6) 76 | * 给定的运行级别,在启动和关闭时,运行的脚本文件 77 | * Run-level:3 —— `rc3.d` —— 多用户、命令行界面 78 | * Run-level:5 —— `rc5.d` —— 图形界面 79 | 80 | * **`init.d` 目录** 81 | * **守护进程(Daemon)**:后台运行的进程 82 | * 管理员可以改变 `init.d` 目录的内容,来启动/停止特定的守护进程 83 | * `/etc/rc.d/init.d` or `/etc/init.d` 84 | 85 | * **Upstart** for Ubuntu 86 | * Event-Driven 87 | * Upstart 在感知到事件后触发相应等待任务,可以更快地启动系统,新硬件被发现时动态启动服务,硬件被拔除时动态停止服务 88 | 89 | * **Systemd** 90 | * 管理其它守护进程的守护进程 91 | * 替代 SysV 和 Upstart,提高启动速度 92 | * 大多数 Linux distribution 开始使用该方式,兼容 SysV 93 | * **按需启动**,尽可能启动更少的进程,尽可能并行启动多个进程 94 | * 核心组件 95 | * systemd:Linux 的服务管理器 96 | * systemctl:监测和控制 systemd 的状态 97 | * systemd-analyze:检查系统启动性能以及统计信息 98 | * Systemd 使用 Target 代替了 Run-level 的概念,有 SysV Run-level 和 Systemd Target 的对应关系,例: 99 | * 3 <-> multi-user.target(多用户、命令行) 100 | * 5 <-> graphical.target(多用户、图形化) 101 | 102 | ### (5)Linux Booting 源码分析 103 | * Linux 源码中的汇编代码(x86) 104 | * 完全的汇编代码:`.S` 文件 105 | * 嵌入在 C 程序中的汇编代码 106 | * x86 汇编采用 AT\&T 格式:小写字母,寄存器名以 \% 为前缀,源操作数在前,目标操作数在后 107 | * 访问内存指令的操作数大小:b(8)、w(16)、l(32) 108 | * 直接操作数以 \$ 作为前缀 109 | 110 | * Linux Booting 指从系统加电到控制台显示登录的运行阶段,主要相关代码在 `arch/i386/boot` 中 111 | * `bootsect.S`:Linux 引导扇区源码 112 | * `setup.S` 113 | * `video.S` 114 | * `compressed/` 115 | * `head.S`、`misc.S`:用于 Kernel Image 的解压缩 116 | * 经过编译、汇编、链接形成 3 个部分:引导扇区映像 bootsect、辅助程序 setup、内核映像 117 | * 小映像 zImage,大映像 bzImage -------------------------------------------------------------------------------- /notes/LK_note_03.md: -------------------------------------------------------------------------------- 1 | # LK Note 03 
2 | > SJTU-CS353 Linux Kernel 3 | 4 | > Refer to the slides of Prof. Quan Chen, Dept. of CSE, SJTU. 5 | ## Lec 3. Module Programming and `/proc` Filesystem 6 | ### (1)模块编程基础 7 | * **模块**(Loadable Kernel Module,LKM) 8 | * 扩展内核功能,按需动态加载/卸载的对象文件 9 | * 优点:① 允许内核在运行过程中动态插入/移除代码;② 按需加载,节省内存开销 10 | * 缺点:① 碎片化(降低内存性能);② 不兼容性容易导致系统崩溃 11 | * 编译好的模块为 .ko 文件 12 | * 基本命令 13 | * `insmod xxx.ko` 插入模块 14 | * `rmmod xxx` 删除模块 15 | * `lsmod | grep xxx` 列出已加载模块 16 | * `modinfo xxx.ko` 列出模块详细信息 17 | * 模块加载流程: 18 | 19 |

20 | 21 | * 模块编程注意事项 22 | * 模块中只可以使用 kernel 开放的 API(没有 libc) 23 | * 模块运行在内核态(ring 0) 24 | * 模块是内核的一部分,可以控制整个系统 25 | 26 | * 模块编程简单示例 27 | 28 | ```c 29 | #include <linux/init.h> 30 | #include <linux/module.h> 31 | #include <linux/kernel.h> 32 | 33 | // init function 34 | static int __init my_init(void) 35 | { 36 | printk(KERN_INFO "hello world\n"); 37 | return 0; 38 | } 39 | 40 | // cleanup function 41 | static void __exit my_exit(void) 42 | { 43 | printk(KERN_INFO "bye world\n"); 44 | } 45 | 46 | module_init(my_init); // run `my_init()` when loaded 47 | module_exit(my_exit); // run `my_exit()` when unloaded 48 | ``` 49 | 50 | ### (2)`/proc` 文件系统 51 | * **`/proc` 文件系统特点** 52 | * **虚拟文件系统** 53 | * 实时变化,**驻留在内存中**,在硬盘上不占据存储空间(`/proc` 文件夹大小为 0) 54 | * 追踪和记录**系统状态、进程状态** 55 | * 每次 Linux 重启后,会创建新的 `/proc` 文件系统(`/proc` 文件夹修改时间是上次启动的时间) 56 | * `/proc` 文件系统中的文件可以由具有一定权限的用户读写 57 | * **用户态访问内核数据结构的接口** 58 | * 很多指令从 `/proc` 文件系统的特定目录抓取数据,并显示给用户 59 | * `top`,`ps`,`dmesg` ... 60 | 61 | * **`/proc` 部分文件介绍** 62 | 63 | | 文件 | 描述 | 64 | | ------------- | -------------------------------------------------------------------- | 65 | | `buddyinfo` | 内核 Buddy System 中各个大小空闲区域的数量 | 66 | | `cmdline` | 内核 Command Line | 67 | | `cpuinfo` | 处理器信息 | 68 | | `devices` | 当前运行的设备驱动列表(块设备、字符设备) | 69 | | `dma` | 当前正在使用的 DMA 通道 | 70 | | `fb` | Frame Buffer 设备 | 71 | | `filesystems` | 内核配置/支持的文件系统 | 72 | | `interrupts` | x86 中每个 IRQ(中断请求)支持的中断数量 | 73 | | `iomem` | 当前各种设备的 I/O 内存资源分布 | 74 | | `ioports` | 已注册的 I/O 端口列表 | 75 | | `kcore` | 系统物理内存,不占硬盘空间(`kcore` 存在大小) | 76 | | `kmsg` | 由内核生成的各种信息 | 77 | | `loadavg` | 系统的平均负载(average load,CPU 利用率,进程数等) | 78 | | `locks` | 当前被内核锁定的文件 | 79 | | `mdstat` | 磁盘和 RAID 的实时信息 | 80 | | `meminfo` | 当前内存的各种信息及使用情况 | 81 | | `misc` | 系统中注册的各种混杂设备驱动 | 82 | | `modules` | 系统中所有已加载模块的列表 | 83 | | `mounts` | 系统中所有已挂载设备的列表 | 84 | | `partitions` | 系统中可用的各种分区的详细信息 | 85 | | `pci` | 系统中所有 PCI 设备 | 86 | | `self` | 指向当前正在运行进程的符号链接 | 87 | | `slabinfo` | Slab 系统中内存使用的各种信息 | 88 | | `stat` | 系统重启后的各种统计信息 | 89 | | `swap` | 系统中交换空间(swap
space)及其利用率 | 90 | | `uptime` | 系统自上次启动后的运行时间 | 91 | | `version` | 当前使用的 Linux Kernel 版本及 GCC 版本,以及系统中安装的 Linux 版本 | 92 | 93 | * **`/proc` 部分子目录介绍** 94 | 95 | | 子目录 | 描述 | 96 | | --------- | -------------------------------------------------- | 97 | | `bus/` | 系统中可用的各种总线的相关信息(PCI、USB、ISA...) | 98 | | `driver/` | 内核正在使用的驱动 | 99 | | `fs/` | 特定文件系统、文件句柄、Inode、Dentry、Quota 信息 | 100 | | `irq/` | 设置 IRQ 和 CPU 的亲和性/分配(affinity) | 101 | | `net/` | 网络参数及其统计信息 | 102 | | `sys/` | 内核的相关配置(可使用 `echo` 修改,系统重启所有配置修改都消失) | 103 | 104 | * **`/proc` 中以 PID 命名的进程信息目录** 105 | * `/proc` 文件系统做快照瞬间的、正在运行的进程相关信息 106 | * 目录内容类别相同,但值不同,对应相应进程的各种参数及运行状态 107 | * 各用户仅对自身启动的进程具有完全的访问权限 108 | 109 | | 文件 | 描述 | 110 | | --------- | ----------------------------------------------- | 111 | | `cmdline` | 调用该进程的完整命令行指令信息(指令+所有参数) | 112 | | `cwd` | 指向当前工作目录的符号链接 | 113 | | `environ` | 该进程特定的所有环境变量 | 114 | | `exe` | 可执行文件的符号链接 | 115 | | `maps` | 该进程的部分地址空间 | 116 | | `fd` | 该进程打开的所有文件描述符 | 117 | | `root` | 指向该进程根文件系统的符号链接 | 118 | | `status` | 该进程的相关状态信息 | 119 | 120 | * **`/proc` 文件系统的优缺点** 121 | * 优点: 122 | * 获取 Linux 内核信息的统一接口 123 | * 调整和收集状态信息 124 | * 容易使用和编程 125 | * 缺点: 126 | * 具有部分开销(使用 fs 调用) 127 | * 用户可能导致系统不稳定 128 | 129 | ### (3)`/proc` 文件系统编程基础 130 | 131 | ```c 132 | #include <linux/proc_fs.h> 133 | // 在 `/proc` 文件系统创建、删除文件和文件夹 134 | proc_create() 135 | proc_mkdir() 136 | proc_remove() 137 | ``` 138 | -------------------------------------------------------------------------------- /notes/LK_note_04.md: -------------------------------------------------------------------------------- 1 | # LK Note 04 2 | > SJTU-CS353 Linux Kernel 3 | 4 | > Refer to the slides of Prof. Quan Chen, Dept. of CSE, SJTU. 5 | ## Lec 4. Process Management 6 | ### (1)进程和线程 7 | * **进程**:程序运行的一个实例,是资源分配的基本单位 8 | * **线程**:进程的一个执行流,是 CPU 调度、程序执行的基本单位 9 | * 为什么使用线程?
10 | * 需要并行、单线程资源利用率低、多进程上下文切换开销大 11 | * 缺陷:需要同步 12 | 13 | * **内核线程(Kernel Thread)** 14 | * 处理器调度基本单位 15 | * 内核的一个分身(clone),每个内核线程提供一个特定的功能(异步 IO) 16 | * 内核线程仅在内核态运行,使用的线性地址大于 `PAGE_OFFSET` 17 | * 低开销:仅需要内核栈、寄存器用于上下文切换 18 | * 创建内核线程:`kernel_thread()` 19 | 20 | * **轻量级进程(Lightweight Process, LWP)** 21 | * 建立在内核之上,由内核支持的用户级线程,是内核线程的高度抽象 22 | * 每个进程可拥有多个 LWP,而每个 LWP 与一个内核线程相关联 23 | * LWP 之间共享资源(虚拟地址空间、打开的文件、信号处理方式等) 24 | * 限制:需要频繁的系统调用(内核线程的创建和销毁,依赖系统调用) 25 |

26 | 27 | * **识别进程** 28 | * 进程描述符指针( `struct task_struct*` ):32-bit 29 | * PID:16-bit 30 | * 每一个进程 / LWP 与一个 PID 关联 31 | * 线程组(Thread Group) 32 | * 希望同一组的线程具有相同的 PID,引入线程组的概念 33 | * LWP 集合,PID 为线程组内第一个 LWP 的 PID 34 | * 进程描述符中的 `tgid` 字段,使用 `getpid()` 系统调用 35 | ### (2)进程描述符(Process Descriptor) 36 | * **`struct task_struct`** 37 | * `state`:进程状态 38 | * `thread_info`:进程底层信息 39 | * `run_list`:进程按状态组织成链表便于调度 40 | * `mm`:内存管理(指向内存区描述符的指针) 41 | * `parent`:进程派生关系 42 | * `fs`:当前目录 43 | * `files`:文件描述符指针 44 | * `signal`:接收到的信号 45 | * ... ... 46 | 47 |

48 | 49 | * **进程状态** 50 | * `TASK_RUNNING`:可以运行(运行态 + 就绪态) 51 | * `TASK_INTERRUPTIBLE`:挂起态(睡眠态),当前不能运行,等待被信号唤醒 52 | * `TASK_UNINTERRUPTIBLE`:同上,但是不能被信号唤醒 53 | * `TASK_STOPPED` 54 | * `TASK_TRACED`:调试态 55 | * `EXIT_ZOMBIE`:僵死态(进程已终止,但其进程描述符尚未被父进程回收) 56 | * `EXIT_DEAD` 57 | 58 | * **进程描述符管理** 59 | * 包含两个数据结构,占 8 KB,即两个内存页 60 | * `thread_info` 结构体 + 内核态的进程堆栈 61 | * 根据 esp 寄存器,获取 `thread_info` 结构体的地址 62 | * 当前正在 CPU 上运行进程的 `task_struct` 指针:`current` 宏 63 | 64 |

65 | 66 | * **进程链表(Process List)** 67 | * `task_struct` 结构中的 `tasks` 字段 68 | * 类型:`list_head` 69 | * 组织成双向循环链表,`prev` 和 `next` 字段分别指向前/后一个 `task_struct` 70 | * `TASK_RUNNING` 状态的进程链表(runqueue):`run_list` 字段,类型是 `list_head` 71 | * 为加快调度,Linux 2.6 将 runqueue 切分为 140 个链表,每个优先级 1 个 72 | * 进程描述符中的 `array` 字段:指向 `prio_array_t` 结构的指针 73 | 74 |

75 | 76 | * **进程间的派生关系** 77 | * process 0 和 process 1 由内核创建 78 | * 相关字段:`real_parent` (由哪个进程 fork 的)、`parent` (可变的)、`children`、`sibling` 79 | 80 |

81 | 82 | ### (3)PID 83 | * **PID 哈希表** 84 | * **根据 PID 寻找对应的 `task_struct`** 85 | * 如果组织成数组,不是所有 PID 都有对应的进程存在,造成空间浪费 86 | * 顺序搜索进程链表,效率低 87 | * `pid_hash` 数组包含 4 个哈希表,由内核维护,对应进程描述符中相应字段 88 | * `PID` 89 | * `TGID`(thread group leader) 90 | * `PGID`(process group leader) 91 | * `SID`(session leader) 92 | * 使用 Chaining 来处理 PID 冲突 93 | 94 |

95 | 96 | * `task_struct` 中的 `pids` 字段 97 | * `nr`:PID 的值 98 | * `pid_chain`:指向哈希链表(Hash chain list)前后元素的指针 99 | * `pid_list`:指向线程组链表(Thread group list)前后元素的指针 100 | 101 |

102 | 103 | ### (4)等待队列(Wait Queue) 104 | * 处于 `TASK_INTERRUPTIBLE` 和 `TASK_UNINTERRUPTIBLE` 状态的进程 105 | * 两种睡眠状态的进程 106 | * Exclusive Process 107 | * Nonexclusive Process:总是在事件发生时被内核唤醒 108 | 109 | * **进程切换(上下文切换)** 110 | * 硬件上下文切换:a far JMP 111 | * 软件上下文切换:一组 MOV 指令,可以更好地控制和验证加载数据的有效性 112 | * 切换流程: 113 | * 切换页全局目录(Page Global Directory) 114 | * 切换内核栈(Kernel Stack) 115 | * 切换硬件上下文(Hardware Context,即各种寄存器的值) 116 | * 任务状态段(Task State Segment,TSS):x86 架构中用于存储硬件上下文的特定段类型 117 | ### (5)进程的创建和销毁 118 | * 进程创建 119 | * 实质上是对父进程的拷贝,在传统 UNIX 中直接将父进程资源拷贝一份,非常慢且低效 120 | * Linux 解决方案: 121 | * **copy-on-write(写时拷贝)**:子进程创建时与父进程共享同一物理页(标记为仅可读),当父进程或子进程任一方试图改写物理页,会产生缺页异常(page fault),内核将这个页复制到一个新的物理页并标记为可写。此时,原来的页面仍然是写保护的,如果有进程再次访问该页,内核检查该进程是否为该页的唯一属主,如果是则改为可写 122 | * **LWP** :允许父子进程共享 per-process(每进程)内核数据结构(页表、打开的文件、信号处理方式等) 123 | * **`vfork()` 系统调用**:创建子进程时,会阻塞父进程,直到子进程退出或 `execve` 一个新进程,父进程才恢复。子进程访问的是父进程的内存地址空间,父进程不会访问 124 | 125 | * `clone()`、`fork()`、`vfork()` 系统调用 126 | * `clone()`:创建 LWP,`fork()` 和 `vfork()` 使用不同的参数基于 `clone()` 的实现 127 | * 调用 `do_fork()` 函数 128 | 129 | * 进程销毁 130 | * `_exit()`:由 `do_exit()` 函数处理 131 | * `exit_group()`:由 `do_group_exit()` 函数处理 132 | * 使用 `release_task()` 释放僵尸进程的进程描述符 133 | -------------------------------------------------------------------------------- /notes/LK_note_05.md: -------------------------------------------------------------------------------- 1 | # LK Note 05 2 | > SJTU-CS353 Linux Kernel 3 | 4 | > Refer to the slides of Prof. Quan Chen, Dept. of CSE, SJTU. 5 | ## Lec 5.
Process Management: Scheduling 6 | ### (1)调度策略 7 | * 基于分时技术(time-sharing),即时间分片,时分复用 8 | * 基于优先级排序;如果某进程的 CPU 使用频率较低,可以适当提高其优先级(动态优先级),防止进程饥饿 9 | * 进程可分为:实时进程、交互式进程、批处理进程(按优先级从高到低) 10 | * 时间片长度,太短则进程切换开销过大(上下文保存/恢复),太长则进程看起来并不是并发运行;寻求折中,在保证低响应时间的前提下,尽量选择较长的时间片 11 | * Linux 用户进程可抢占: 12 | * 当前进程的时间片用完 13 | * 新进程比当前进程具有更高的优先级 14 | * 被抢占进程仍然处于 `TASK_RUNNING` 状态(`TASK_RUNNING` 表示可运行,但这里实际并未运行,处于就绪态),并未进入挂起态 15 | ### (2)调度算法 16 | * 3 种调度类别: 17 | * `SCHED_FIFO`:先进先出(实时) 18 | * `SCHED_RR`:Round-Robin,时间片轮转(实时) 19 | * `SCHED_NORMAL`:传统的时分共享进程(交互式 & 批处理) 20 | 21 | * **静态优先数**:100-139(优先数越小,优先级越高,时间片越长) 22 | * 可由 `nice()`,`setpriority()` 系统调用改变 23 | 24 | * **基本时间片(Base Time Quantum)** 25 | * 一个进程消耗完先前的时间片后,调度器分配给该进程的时间片量;以下公式单位为 milliseconds 26 | 27 |

28 | 29 | * **动态优先数 = 静态优先数 - bonus + 5** 30 | * 需要保证介于 [100,139] 之间,与 100 取 MAX,与 139 取 MIN 31 | * bonus:0-10 32 | * <5 降低优先级(优先数增大) 33 | * >5 提升优先级(优先数减小) 34 | * bonus 由进程平均睡眠时间(Average Sleep Time)决定 35 | * 平均睡眠时间越小,进程切换太频繁,适当降低优先级,即 bonus 较小(< 5) 36 | * 平均睡眠时间越大,进程容易发生饥饿,适当提高优先级,即 bonus 较大(> 5) 37 | * **活动进程、过期进程** 38 | * 活动进程(Active Process):未耗尽时间片的可运行进程 39 | * 过期进程(Expired Process):已耗尽时间片的可运行进程 40 | * 各进程周期性地在活动进程和过期进程之间转变 41 | * 静态优先级用于时间片分配,动态优先级用于调度 42 | * **实时进程的调度** 43 | * 优先数:1-99 44 | * 总是处于活动状态 45 | * 一个实时进程在如下情况被切换: 46 | * 另一个进程有一个更高的实时优先级 47 | * 因阻塞操作进入睡眠(等待某种事件) 48 | * stopped or killed 49 | * 通过 `sched_yield()` 系统调用主动释放 CPU 所有权 50 | * 采用 `SCHED_RR` 调度,且时间片耗尽时 51 | ### (3)实现支持 52 | * **多级反馈队列算法(Multi-level Feedback Queue)** 53 | 1. 如果进程 A 的优先级 > 进程 B 的优先级,那么调度器选择进程 A 54 | 2. 如果进程 A 的优先级 = 进程 B 的优先级,那么它们同属一个队列里,使用 Round-Robin 调度算法来选择(时间片轮转,保证公平) 55 | 3. 当一个新进程进入调度器时,把它放入到最高优先级的队列里 56 | 4. 当一个进程用完时间片,那么需要把优先级降一级,从高优先级队列中迁移到低一级的队列里;当一个进程在时间片还没有用完就放弃 CPU ,那么优先级保持不变,维持原来的高优先级,直到用完时间片 57 | * **Runqueue 数据结构,两组可运行进程 `arrays`** 58 | * `active`:指向活动进程列表 59 | * `expired`:指向过期进程列表 60 | * 所有进程从 `active` 变成 `expired` 之后,可以改变指针指向,`active->arrays[0]` 变为 `active->arrays[1]` 61 | * 140 个链表,每个优先级对应一个 62 |

63 | 64 | ### (4)完全公平调度器(CFS) 65 | * **虚拟运行时间 `vruntime`** 66 | * `vruntime` 表示进程虚拟的运行时间,每个任务具有相同的 `vruntime` 67 | * `nice_0_weight` 表示 `nice=0` 的权重值 68 | * 该进程权重值越大,实际运行时间越长,保证公平调度 69 | 70 |

71 | 72 | * **CFS 调度器的实现** 73 | * 选择红黑树记录任务的运行时间(红黑树:平衡二叉搜索树,大多操作复杂度 `O(logn)`) 74 | * 选择红黑树最左边节点(最少时间)的进程运行 75 | * 选择的进程从树中移除,更新执行时间,重新插入红黑树中 76 |

77 | 78 | * 进程调度时机 79 | * 阻塞操作:互斥量(mutex)、信号量(semaphore)、等待队列(wait queue) 80 | * 在中断返回前和系统调用返回用户空间时,检查 `TIF_NEED_RESCHED` 标志位以判断是否需要调度 81 | * 将要被唤醒的进程(Wakeups) 82 | 83 | * 进程切换 `switch_to()` 84 | * 调度器的职责是选择下一个进程运行,而进程切换负责具体落实该进程的执行 85 | * 切换的本质: 86 | * 保存上一个进程的上下文 87 | * 装载下一个进程的上下文到 CPU 88 | -------------------------------------------------------------------------------- /notes/LK_note_06.md: -------------------------------------------------------------------------------- 1 | # LK Note 06 2 | > SJTU-CS353 Linux Kernel 3 | 4 | > Refer to the slides of Prof. Quan Chen, Dept. of CSE, SJTU. 5 | ## Lec 6. Interrupt Handling 6 | ### (1)中断和异常 7 | * **轮询 v.s. 中断** 8 | * 轮询(Polling):CPU 主动查询设备状态,从而进行必要的 I/O 9 | * 优点:简单、易实现、易控制 10 | * 缺点:系统开销大,无法感知设备状态变化、浪费 CPU 资源 11 | * 中断(Interrupt):CPU 被动响应中断信号(设备在发生状态改变时,比如读操作结束),被定义为一个事件,该事件改变 CPU 指令执行顺序 12 | * 优点:CPU 无需忙等查询,可及时响应设备状态变化 13 | * 缺点:难实现、可能遗漏信息;I/O 吞吐量大时,效率比轮询低 14 | * **中断的分类** 15 | * 同步中断(异常、内中断):由 CPU 控制单元在执行指令时产生 16 | * 陷阱(Trap):有意而为之的系统调用,用户程序请求内核服务 17 | * 故障(Fault):可能可以解决的错误,解决后重新开始,如缺页异常(page fault) 18 | * 终止(Abort):无法解决的致命错误,导致程序终止 19 | * 异步中断(中断、外中断):由其它硬件设备产生,产生的时刻与 CPU 时钟信号无关,所以称为异步 20 | * 可屏蔽中断 21 | * 不可屏蔽中断:CPU 必须响应 22 | * 中断处理流程:① 外设发中断;② CPU 响应和处理中断;③ 中断返回 23 | * 中断随时到来,应尽可能快地处理完中断,尽可能把更多处理推迟 24 | * 中断处理程序必须支持嵌套中断,高优先级中断可以中断低优先级中断,维持更多 I/O 设备处于忙状态 25 | * 被中断的函数必须是可重入的(Reentrant) 26 | > **可重入函数(Reentrant Function)**:主要用于多任务环境下,重入即可以重复进入,意味着这个函数可以被中断,在这个函数执行的任何时刻中断它,内核切换执行另一段代码,返回控制不会发生错误。因为它只使用了自己栈上的变量,不依赖于系统外部环境(全局和静态变量区、中断向量表等),所以允许多个函数的副本并发运行,因为它们使用的是独立的栈,不会互相干扰。如果确实需要访问全局变量,同时保证函数的可重入性,则需要实施互斥手段(关中断、信号量等操作) 27 | * 临界区中必须禁止中断,临界区可能改变全局变量而不可重入 28 | * **中断处理函数** 29 | * 不可被阻塞 30 | * 分为:上半部、下半部 31 | * **上半部**:必须立即执行的代码(可很快执行),响应硬件(比如向硬件确认已收到中断号) 32 | * **下半部**:可以延迟执行的工作 33 | * 不能直接与用户空间进行数据交换 34 | * 不需要可重入 35 | * 不可休眠 36 | ### (2)中断处理 37 | * **中断描述符表(Interrupt Descriptor Table, IDT)** 38 | * 每一种中断对应一个中断号,x86 最多支持 256 种中断 39 | * 中断是 CPU 的机制,只要运行 x86 架构,IDT必然存在 40 | * IDT 存放各中断对应的中断处理程序(由操作系统提供) 41 | * IDT 是一个最大为 256 项的表,每个表项占 8
字节,称为**中断门** 42 | >任务门(Task Gate):当中断信号发生时,必须取代当前进程的那个进程的 TSS 选择符存放在任务门中 43 | 44 | > 中断门(Interrupt Gate):包含段选择符和中断处理程序的段内偏移量 45 | 46 | > 陷阱门(Trap Gate):大部分 Linux 异常处理程序都通过陷阱门激活 47 | * 设中断号为 n ,则中断门地址为 IDT.base + n * 8 48 | * **中断上下文保存** 49 | * 保存中断现场(如 `lr_irq` 寄存器、`spsr_irq` 寄存器) 50 | * 跳转到中断处理函数(在 IDT 中查找) 51 | * 恢复中断现场,返回到被中断点继续运行 52 | 53 | ### (3)Linux 中断处理过程 54 | * **硬件中断号 v.s. 软件中断号** 55 | * 硬件中断号:硬件 SoC 设计时就确定下来的中断号 56 | * 软件中断号(Linux 中断号、IRQ 中断号):Linux 系统映射的中断号,可用于注册中断函数(即 `... request_irq(unsigned int irq, ...` 中的 `irq`) 57 | * `cat /proc/interrupts` 查看中断分配情况(包括软件中断号对应的硬件中断号) 58 | * 两个相同设备具有相同的硬件中断号,但在 Linux 系统中需要映射为两个不同的软件中断号,方便区别二者 59 | * 分配软件中断号(IRQ 中断号) 60 | * 宏 `NR_IRQS` 定义了硬件平台支持的中断数量最大值 61 | * 位图变量 `allocated_irqs` 用来分配软件中断号 62 | * 中断映射过程在 `irq_domain_alloc_irqs()` 函数中,从 `allocated_irqs` 位图里分配一个空闲的比特位作为软件中断号,并分配一个 `struct irq_desc` 数据结构,即中断描述符 63 | * `struct irq_desc[NR_IRQS]` 是中断描述符数组,数组下标即 IRQ 中断号,对应一个 `struct irq_desc` 64 | 65 |

66 | 67 | * 注册中断 API : `request_irq(irq, handler, flags, dev_id, name)` 68 | * 每个请求的中断都有一个 `irqaction` 描述符,用于封装中断处理函数,由驱动人员负责分配,包含中断号、中断处理函数指针、中断标志位、中断名等 69 | * 对于共享中断,多个 `irqaction` 描述符串成一个链表 70 | 71 |

72 | 73 | ### (4)中断的上半部和下半部 74 | 75 | * **中断上半部:中断处理程序** 76 | * 完成尽可能少的紧急硬件操作 77 | * 关中断(不允许被其它中断打断) 78 | * 内核立即运行 79 | * **中断下半部:延迟处理程序** 80 | * 完成延缓的耗时操作(中断大部分任务) 81 | * 可以开中断,允许中断请求 82 | * 稍后完成 83 | * **为什么要有中断上半部?** 84 | * 硬件中断处理程序以异步方式执行,通常需要关中断,会打断其它重要代码的执行 85 | * 为避免被打断程序停止太长,中断处理应尽快完成,所以将可以尽快完成的紧急操作作为中断上半部由内核立即执行,然后返回到被中断的程序 86 | * 任务放置原则:**以下任务应放在上半部** 87 | * 对时间敏感 88 | * 和硬件相关 89 | * 需要保证不被其它中断所打断 90 | * **中断线程化** 91 | * 下半部由一个内核线程完成,接受处理器调度,优先级高于普通进程 92 | * 可以减小高优先级进程延迟,提高响应性 93 | * SMP IRQ Affinity:把中断均衡分布在不同的 core 上 94 | * **中断下半部执行时机** 95 | * 上半部完成后,即将返回中断现场时(SoftIRQ & Tasklet) 96 | * 下半部执行的部分放在一个内核线程中,中断返回之后,调度这个内核线程来执行(Workqueue,中断线程化) 97 | ### (5)软中断、Tasklet、工作队列 Workqueue 98 | * 进程上下文 99 | * 用户进程执行系统调用触发异常时,传递给内核的参数、内核要保存的一整套变量和寄存器值、当时的环境等 100 | * 中断上下文 101 | * 中断发生时,硬件传递给内核的参数和内核需要保存的环境(被中断进程的环境) 102 | * 包括:硬中断上下文、软中断上下文 103 | * **软中断(SoftIRQ)** 104 | * 执行点:中断处理函数完成之后,**返回中断之前** 105 | * 类型是静态定义的 106 | * 回调函数是在开中断下执行的(不能被自己打断,单个 CPU 软中断不可嵌套),只能被上半部打断 107 | * 软中断算中断上下文,因此软中断总是抢占进程上下文 108 | * 同一类型的软中断,可以在多 CPU 上并行执行,可重入 109 | * **硬中断 v.s. 软中断 v.s. 
信号** 110 | * 硬中断:外部设备对 CPU 的中断; 111 | * 软中断:硬中断服务程序对内核的中断; 112 | * 信号:内核或其它进程对某个进程的中断 113 | * **Tasklet** 114 | * **基于软中断实现的一种下半部机制**,软中断的变种 115 | * 一种特定类型的 Tasklet 只运行在一个 CPU 上,不能并行,只能串行执行 116 | * 多种不同类型的 Tasklet 可以并行运行在多个 CPU 上 117 | * Tasklet 可以在运行时改变 118 | * 每个 CPU 维护两个 Tasklet 链表(每 CPU 变量),一个用于普通优先级,一个用于高优先级 119 | * **工作队列(Workqueue)** 120 | * **把中断下半部放到进程上下文,交由一个内核线程来执行** 121 | * 延迟执行任务:工作(work),数据结构 `work_struct` 122 | * 工作以队列结构组织成工作队列,数据结构 `workqueue_struct` 123 | * 工作线程负责执行工作队列里的工作 124 | * 可解决软中断和 Tasklet 执行时间过长,导致实时性下降的问题 125 | * 为解决内核线程数量多、并发性差、可能死锁等问题,引入 CMWQ(并发管理工作队列),设置 Bound 类型(指定分配到哪个 CPU)和 Unbound 类型(不指定分配到哪个 CPU)的工作线程池(含高低优先级两个池) 126 | * **如果延迟执行的任务...** 127 | * 需要睡眠,只能选择工作队列 128 | * 需要延迟指定的时间再触发,使用工作队列(利用内核定时器) 129 | * 需要在短时间内处理,使用软中断/Tasklet,因为其可以抢占普通进程和内核线程,同时不可睡眠,时效性更好 130 | * 对延迟时间没有要求,使用工作队列(通常为无关紧要的任务) 131 | -------------------------------------------------------------------------------- /notes/LK_note_07.md: -------------------------------------------------------------------------------- 1 | # LK Note 07 2 | > SJTU-CS353 Linux Kernel 3 | 4 | > Refer to the slides of Prof. Quan Chen, Dept. of CSE, SJTU. 5 | ## Lec 7. Kernel Synchronization 6 | ### (1)内核控制路径(Kernel Control Paths) 7 | * **内核控制路径**:当前进程在**内核态**执行的一个指令序列(中断/异常) 8 | 9 |

10 | 11 | * **内核抢占** 12 | * 可抢占内核:一个运行在内核态的进程,在执行内核函数时,可以被另一个进程所抢占 13 | * 主要目的:减少用户态进程的分派延迟(进程从可运行态到真正运行的延迟) 14 | * 内核只有在执行异常处理函数(系统调用)时才可能被抢占 15 | 16 | ### (2)同步场景 17 | * 竞争条件发生场景:计算的输出结果取决于两个及以上交错运行的内核控制路径的嵌套情况 18 | * 示例:test 和 set 不能一气呵成,线程发生调度,导致同时进入临界区,计算结果有误 19 | * 线程之间必须互斥访问临界区 20 | * 在**单核计算机**上,可以**禁止中断**来实现临界区 21 | * 若数据仅在系统调用的服务函数间共享,可以**禁止内核抢占**来实现临界区 22 | * 多核系统的同步更为复杂 23 | * 简化同步的内核设计 24 | * 中断处理程序完成之前不允许产生同类中断事件 25 | * 中断处理程序、软中断不可以被抢占和阻塞 26 | * 中断处理程序不必同步 27 | * 执行中断处理的内核控制路径,不能被执行可延迟函数/系统调用的内核控制路径中断 28 | * 仅被软中断访问的每 CPU 变量不需要同步 29 | * 软中断不能在一个 CPU 上交错执行 30 | 31 | ### (3)同步原语(Synchronization Primitives) 32 | 33 | | 技术 | 描述 | 适用范围 | 34 | | ----------------------------------------- | ------------------------------- | -------- | 35 | | **per-CPU 变量**(per-CPU variable) | 每个 CPU 存有一个数据结构的副本 | All | 36 | | **原子操作**(Atomic operation) | 原子 read-modify-write 指令 | All | 37 | | **内存屏障**(Memory barrier) | 避免指令重排序 | Local | 38 | | **自旋锁**(Spin lock) | 使用忙等的锁 | All | 39 | | **信号量**(Semaphore) | 使用阻塞等的锁 | All | 40 | | **顺序锁**(Seq lock) | 基于访问计数器的锁 | All | 41 | | **禁止本地中断**(Local interrupt disabling) | 单 CPU 上禁止中断 | Local | 42 | | **禁止本地软中断**(Local softirq disabling) | 单 CPU 上禁止可延迟函数 | Local | 43 | | **读-拷贝-更新**(Read-copy-update,RCU) | 通过指针无锁访问共享数据 | All | 44 | 45 | * **per-CPU 变量** 46 | * 一个内核变量对应一个数组,每个 CPU 使用其中一个变量副本;一个 CPU 不应访问数组中对应于其它 CPU 的变量副本。在多 CPU 系统中,当 CPU 操作属于它的副本时,不需要考虑与其它 CPU 的竞争问题,还可以充分利用 CPU 本地的硬件缓存来提高访问速度 47 | * 静态 per-CPU 变量:存储空间在代码编译时静态分配 48 | * 动态 per-CPU 变量:存储空间在代码运行时动态分配 49 | * per-CPU 变量在于保护多个 CPU 对一个变量的同步访问,但不保护异步函数(中断处理函数、可延迟函数)对变量的同步访问 50 | * 内核抢占,可能使 per-CPU 变量产生竞争条件 51 | * 一个 CPU 上的两条路径首先拿到了同一副本的指针,此时发生内核抢占,其中一条路径调度到另一个 CPU 上,然而此时两条路径仍然访问同一副本 52 | 53 | * **原子操作** 54 | * 80x86 原子指令:进行 0 次或者 1 次对齐内存访问的指令 55 | * `read-modify-write` 指令 56 | * `atomic_t` 类型:24-bit 原子访问计数器 57 | 58 | * **内存屏障**:内存屏障原语前面的指令完成后,才会执行原语后面的指令,避免了指令重排序,保障了同步 59 | 60 | * **自旋锁** 61 | * 适用于多处理器环境 62 | * `spinlock_t` 类型 63 | * 进程检查锁是无序的,不用排队 64 | * **读写自旋锁** 65 | * 
增加内核的并发度:多个 read,单个 write(允许多个 read 的路径进入临界区) 66 | * `rwlock_t` 类型 67 | * `read_lock()`,`read_unlock()`,`write_lock()`,`write_unlock()` 68 | 69 |

70 | 71 | * **顺序锁**:类似 read/write 自旋锁,给 write 锁更高的优先级,即使临界区加了 read 锁,一个进程仍然可以对其加 write 锁(将其中的 read 路径强制移出临界区,可能需要多次 read 才能得到有效副本) 72 | 73 | * **读-拷贝-更新(RCU)** 74 | * 针对 “读多写少” 的共享数据的同步机制 75 | * 随意读,但更新数据的时候,需要先复制一份副本,在副本上完成修改,再一次性地替换旧数据 76 | * 它允许多个读者同时访问共享数据,而且读者的性能不会受影响( “随意读” ),读者与写者之间也不需要同步机制(但需要 “复制后再写” ),但如果存在多个写者时,在写者把更新后的 “副本” 覆盖到原数据时,写者与写者之间需要利用其他同步机制保证同步。 77 | * 特点: 78 | * 保护由多个 CPU 读取访问的数据结构,允许多个进程对数据并发读写,通过指针无锁访问数据 79 | * 只有通过指针动态分配和利用的数据结构,才可以使用 RCU 保护 80 | * RCU 保护的临界区不可以有内核控制路径进入睡眠状态 81 | 82 | * **信号量** 83 | * 不会自旋忙等,而是挂到信号量的等待队列上 84 | * 两种信号量: 85 | * 内核信号量:内核控制路径使用 86 | * System V IPC 信号量:用户进程使用 87 | * 内核信号量 `semaphore` 88 | * `down()`:获取信号量锁(wait,P,- -) 89 | * `up()`:释放信号量锁(signal,V,++) 90 | * 读写信号量 `rw_semaphore` 91 | * `init_rwsem()`:初始化读/写信号量 92 | * `down_read(),down_write()`:获取读/写信号量 93 | * `up_read(),up_write()`:释放读/写信号量 94 | 95 | ### (4)内核数据结构的同步访问 96 | * 指导原则:系统支持的并发度越高越好 97 | * 提升目标: 98 | * 提升并发运转的 I/O 设备数量 99 | * 提升有效工作的 CPU 数量 100 | * `int` 型的共享数据结构可以声明为 `atomic_t` 类型,使用原子操作进行更新 101 | -------------------------------------------------------------------------------- /notes/LK_note_08.md: -------------------------------------------------------------------------------- 1 | # LK Note 08 2 | > SJTU-CS353 Linux Kernel 3 | 4 | > Refer to the slides of Prof. Quan Chen, Dept. of CSE, SJTU. 5 | ## Lec 8. Symmetric Multiprocessing 6 | ### (1)SMP 简介 7 | * 并行体系结构 8 | 9 |

10 | 11 | * **SMP(Symmetric Multiprocessing,对称多处理)** 12 | * 多个对称的处理器连接到一个共享内存(紧耦合) 13 | * 所有处理器对所有 I/O 设备具有完全控制权限 14 | * 由同一个操作系统控制,系统平等对待所有处理器 15 | * 当前多处理器系统大多使用 SMP 架构。多核处理器的核也可以视为不同处理器,从而构成 SMP 架构 16 | 17 |

18 | 19 | * **NUMA(Non-uniform Memory Access,非一致内存访问)** 20 | * 一种多处理器系统使用的内存设计 21 | * 处理器访问内存的延迟,取决于内存相对于处理器的位置: 22 | * 访问本地内存数据,比访问非本地内存数据更快 23 | * 非本地内存:连接在其它处理器上的内存节点 24 | * NUMA 优势在于存取具有本地化特征的数据 25 | 26 |

27 | 28 | * SMP 的特征 29 | * 允许 Linux 内核不同部分在不同处理器上并发运行 30 | * 每个处理器从可运行进程/线程中自主进行调度,以获取所执行的进程/线程 31 | 32 | * 多处理器操作系统设计 33 | * 进程/线程的并发 -> 可重入 34 | * 调度 -> 避免冲突,线程分布到不同处理器,负载均衡 35 | * 同步 -> 对共享数据的并发访问 36 | * 内存管理 -> 多端口内存、分页机制一致性等 37 | * 可靠性与容错 -> 处理器失效 38 | 39 | ### (2)SMP 进程调度 40 | * **时分共享** 41 | 42 |

43 | 44 | * **空分共享** 45 | 46 |

47 | 48 | ### (3)SMP 同步问题 49 | * 同步难题 —— TSL 指令(Test and Set Lock) 50 | * 在 SMP 系统中,如果仅对内存区域加锁,不同处理器的 TSL 指令会出现同时对一块内存区域加锁的情况 51 | * 考虑对总线加锁,一个处理器对总线加锁后,另一个处理器执行 TSL 指令失败(无法读总线) 52 | 53 |

54 | 55 | * 同步难题 —— 缓存颠簸(Cache Thrashing) 56 | * 使用多个锁避免缓存颠簸 57 | 58 |

59 | 60 | ### (4)SMP 源码分析 61 | #### 【1】SMP:Linux 启动过程 62 | * 基本概念 63 | * ***BSP***(Bootstrap Processor,启动 CPU):在操作系统启动过程前期,只有 BSP 在执行指令(随机选取的一个处理器) 64 | * ***AP***(Application Processor,应用 CPU) 65 | * ***APIC***(Advanced Programmable Interrupt Controller,高级可编程中断控制器):分为 Local APIC 和 I/O APIC 66 | * ***IPI***(Inter-Processor Interrupt,处理器间中断):用于处理器之间的通信 67 | 68 | * 由于 BIOS 代码不支持多线程,所以 SMP 中必须让所有 AP 进入中断屏蔽状态,不与 BSP 一起执行 BIOS 代码。BIOS 程序将其它 AP 置于中断屏蔽状态,只选择 BSP 执行 BIOS 代码中的后继部分 69 | 70 | * 主要流程 71 | * BIOS 初始化(屏蔽 AP,建立系统配置表格) 72 | * MBR 里面的引导程序(Grub,Lilo 等)将内核加载到内存 73 | * 执行 `head.S` 中的 `startup_32` 函数(最后将调用 `start_kernel`) 74 | * 执行 `start_kernel`(相当于 main 程序),进行一系列初始化,最后将执行: 75 | * `smp_init()`:启动各个 AP 76 | * `smp_boot_cpus()`:初始化各 AP,设置为待命模式,等待 BSP 发送 IPI 指令,并为之建立 0 号进程 77 | * `rest_init()`:调用 `init()` 创建 1 号进程,自身执行 `cpu_idle()` 成为 0 号进程 78 | * 1 号进程(init 进程)完成余下的工作 79 | 80 | * 注意:在 SMP 中,有几个 CPU,就有几个 0 号进程(idle 进程),而 1 号进程(init 进程)只有一个 81 | 82 | * **【总结】BSP 负责操作系统的启动,在启动的最后阶段,BSP 通过 IPI 激活各个 AP,在系统的正常运行过程中,BSP 和 AP 基本上无差别** 83 | 84 | #### 【2】SMP:Linux 进程调度 85 | 86 | * 多处理器间调度时,切换的下一个处理器最好是上一个处理器(由局部性原理,上次使用的处理器的 Cache 中仍保存较多程序使用到的数据,hot cache) 87 | ```c 88 | struct task_struct { 89 | // ... 90 | int processor; // 正在使用的 CPU 91 | int last_processor; // 上次使用的 CPU 92 | // ... 93 | }; 94 | ``` 95 | 96 | * SMP 进程调度主要函数和宏 97 | * `schedule()`:进程调度主函数 98 | * `switch_to()`:上下文切换 99 | * `reschedule_idle()`:在 SMP 系统中,如果被切换下来的进程仍然是可运行的(TASK_RUNNING),调用该函数重新调度,以选择一个空闲的或者运行低优先级进程的 CPU 来运行这个进程 100 | * `goodness()`:优先级计算函数,选择一个最合适的进程投入运行 101 | 102 | * **【总结】与单处理器系统主要差别是,执行进程切换后,被换下的进程可能换到其它 CPU 上继续运行。在计算优先权时,对进程上次运行的 CPU 适当提高其优先权,从而更有效地利用 Cache** 103 | 104 | #### 【3】SMP:Linux 中断系统 105 | 106 |

107 | 108 | * **【总结】为支持 SMP,在硬件上需要 APIC,Linux 定义了各种 IPI 的中断向量以及传送 IPI 的函数** 109 | -------------------------------------------------------------------------------- /notes/LK_note_09.md: -------------------------------------------------------------------------------- 1 | # LK Note 09 2 | > SJTU-CS353 Linux Kernel 3 | 4 | > Refer to the slides of Prof. Quan Chen, Dept. of CSE, SJTU. 5 | ## Lec 9. Memory Management: Addressing 6 | ### (1)内存地址 7 | * 内存地址分类 8 | * **逻辑地址**(logical address):机器语言指令中使用 9 | * **虚拟地址**(virtual address):又称线性地址(linear address) 10 | * **物理地址**(physical address):在内存芯片中寻址内存单元 11 | * **分段**:逻辑地址 -> 虚拟地址 12 | * 给每个进程分配不同的线性地址空间 13 | * **分页**:虚拟地址 -> 物理地址 14 | * 基于页表机制,把线性地址空间映射到不同的物理空间 15 | ### (2)段式寻址 16 | * **逻辑地址 = { 段选择符,偏移量 }** 17 | * **段选择符(segment selector):16-bit** 18 | * index:段描述符的 index 19 | * TI:指明 index 是 GDT(全局描述符表)还是 LDT(局部描述符表) 20 | * RPL:特权等级 21 | * **段寄存器:用于存储段选择符** 22 | * cs:代码段;ss:堆栈段;ds:数据段;es/fs/gs:通用 23 | * 偏移量:32-bit 24 | 25 |

26 | 27 | * **段描述符(segment descriptor):8-byte** 28 | * 存放在 GDT 或 LDT 中 29 | * GDT 和 LDT 在内存中存放的起始地址和大小 -> 控制寄存器 gdtr,ldtr 30 | * 字段: 31 | * Base:32-bit 的线性地址 32 | * Limit:20-bit 的偏移量 33 | * ... ... 34 | * 段描述符快速访问:80x86 的 6 个可编程段寄存器 -> 不可编程的寄存器作为段描述符缓存:每次加载段选择符时,把相应的段描述符(8 bytes)由内存的 GDT/LDT 加载到不可编程的寄存器中 35 | 36 |

37 | 38 | * **逻辑地址 -> 线性地址** 39 | 40 |

41 | 42 | * **Linux 段式寻址** 43 | * Linux 倾向于分页,仅在必要时分段 44 | * 4 个主要的段:内核代码段、内核数据段、用户代码段、用户数据段 45 | * Linux GDT:每个 CPU 一个 GDT,包含 18 个段描述符 46 | * 4 个 内核/用户 代码/数据 段 47 | * 任务状态段(TSS,task state segment) 48 | * ... ... 49 | ### (3)页式寻址 50 | * **页(page)**:固定大小的线性地址区间(Linux 4KB) 51 | * **页框(page frame)**:物理内存被切分成相同大小的内存页/物理页 52 | * 在 80x86 微处理器中,设置控制寄存器 cr0 的 PG 标志,启用分页机制 53 | * **规则分页** 54 | * 4 KB 的页,对应页内地址/偏移量 12-bit 55 | * 线性地址:32-bit <-> 可寻址 4GB 虚拟内存空间 56 | * 页目录(Directory):10-bit 57 | * 页表(Table):10-bit 58 | * 偏移量(Offset):12-bit 59 | * 二级转换机制 60 | * 页目录表的起始物理地址 -> cr3 寄存器 61 | * 设计思想 62 | * 32 位线性地址 -> 4 B 页目录项/页表项(页目录、页表都在内存中) 63 | * 页目录表/页表均为 10 位,共 1024 项,每项 4 B,总共 4 KB,刚好一个页的大小 64 | * 每个进程都有独立的分页转换 65 | * 进程之间的虚拟地址空间相互独立 66 | * 两个进程不同的虚拟页,通过页表映射至相同的物理页框,可实现共享内存 67 | * 若进程发生缺页时,可能换出自己或其它进程的物理页到磁盘上,这时需要在页表中记录每个进程的页是否在内存中 68 | * 页目录项/页表项结构 69 | * 下列提到的页:页目录项指向页表所在的页/页表指向的页 70 | * 20-MSB:页框物理地址的高 20 位,与虚拟地址的 12-LSB(偏移量)拼接在一起,组成物理地址 71 | * present:页是否在内存中(涉及内存页的换入换出) 72 | * accessed:页最近访问时间,访问几次?(页面置换算法考虑) 73 | * dirty:(仅页表项)页调入内存后是否被修改 74 | * 分页的硬件保护 75 | * read/write:页的存取访问权限 76 | * user/supervisor:特权等级 77 | * ... ... 78 | 79 |

80 | 81 | * **扩展分页** 82 | * 32-bit = { 页目录 10-bit,偏移量 22-bit } 83 | * 页的大小:4 MB 84 | 85 | * **物理地址扩展(PAE)** 86 | * 36 位地址 = { 24,12 },设置 cr4 控制寄存器的 PAE 标志位来启动该机制 87 | * 每个页表项 8 B,一个页表中有 4 KB / 8 B = 512 个表项,对应 9 位 88 | * 每个进程仍然只能看到 4 GB 的内存(机器仍为 32 bit) 89 | * 4 KB 页:{ 2,9,9,12 } 90 | * PDPT(page directory pointer table,页目录指针表):cr3 指向起始地址 91 | * 页目录表 92 | * 页表 93 | * 偏移量 94 | 95 | * **TLB**:快表,硬件缓存,存储近期访问过的页 96 | 97 |

98 | 99 | * **Linux 页式寻址** 100 | * 5 级分页机制 101 | 1. 页全局目录(Page Global Directory,**PGD**) 102 | 2. *页四级目录(Page 4th Directory,**P4D**)* 103 | 3. 页上级目录(Page Upper Directory,**PUD**) 104 | 4. 页中级目录(Page Middle Directory,**PMD**) 105 | 5. 页表(Page Table Entry,**PTE**) 106 | * 页表处理相关的数据结构:`pgd_t` `pud_t` `pmd_t` `pte_t` 107 | 108 |

109 | 110 | -------------------------------------------------------------------------------- /notes/LK_note_10.md: -------------------------------------------------------------------------------- 1 | # LK Note 10 2 | > SJTU-CS353 Linux Kernel 3 | 4 | > Refer to the slides of Prof. Quan Chen, Dept. of CSE, SJTU. 5 | ## Lec 10. Memory Management: Methods 6 | ### (1)页框管理 7 | * 标准页框大小:4 KB 8 | * **页描述符 `page`**:32 B,存储在 `mem_map` 数组中 9 | 10 | * **NUMA(非一致内存访问)** 11 | * Physical Memory : Node : Zone : Page 12 | * 物理内存被切分为多个节点(Nodes),每个节点有: 13 | * **节点描述符** `pg_data_t` 14 | * `struct zone[] node_zones;` 节点的管理区描述符数组 15 | * `int nr_zones;` 节点的管理区数量 16 | * 每个节点的物理内存进一步划分为多个管理区(Zone),每个区有: 17 | * **管理区描述符** `zone` 18 | * `unsigned long free_pages;` *Number of free pages in the zone* 19 | * `unsigned long pages_min;` *Number of reserved pages in the zone (Pool of Reserved Page Frames)* 20 | * `unsigned long pages_low/pages_high;` *Low/High watermark for page frame reclaiming; also used by zone allocator as a threshold value. 
(Zone Allocator)* 21 | * `struct free_area[] free_area;` *Blocks of free page frames in the zone (Buddy System)* 22 | * `struct list_head active/inactive_list;` *List of active/inactive pages in the zone* 23 | * 三种管理区 24 | * `ZONE_DMA`、`ZONE_NORMAL`、`ZONE_HIGHMEM` 25 | 26 | * **保留页框池(Pool of Reserved Page Frames)** 27 | * 有些内核控制路径在请求内存时不允许被阻塞 —— 原子分配请求 28 | * 内核保留了部分页框,供原子内存分配请求使用 —— 保留页框池 29 | 30 | * **分区页框分配器(Zoned Page Frame Allocator)** 31 | * 注:保留页框池是留给内核自己使用的,分区页框分配器不是 32 | * 请求页框 `alloc_pages`,释放页框 `free_pages` 33 | * Per-CPU 页框缓存 34 | * 每个 Per-CPU 页框缓存,包含了一些预先申请的页框,用以加速单个内存页的分配请求 35 | * Hot Cache:内容很可能存在硬件缓存(TLB)的页框 36 | * Cold Cache 37 | * `per_cpu_pages` 描述符 38 | * `int count;` *Number of page frames in the cache* 39 | * `int low;` *Low watermark for cache replenishing* 40 | * `int high;` *High watermark for cache depletion* 41 | * `int batch;` *Number of page frames to be added/subtracted from the cache* 42 | * `struct list_head list;` *List of descriptors of page frames in the cache* 43 | * 如果 `count < low`,kernel 从伙伴系统中申请 `batch` 个单独的页框(4 KB)来补充对应的高速缓存 44 | * 如果 `count > high`,kernel 从缓存中释放 `batch` 个页框回到伙伴系统 45 | 46 |

47 | 48 | * **伙伴系统** 49 | * 避免外碎片的方法: 50 | * 使用分页单元(Paging Unit)将不连续的页框映射为连续的线性地址 51 | * 尽量避免切分大的连续空闲内存来满足小的空间需求 52 | * Linux 不使用分页技术避免外碎片 53 | * 有时候分配连续的内存页框是有必要的 54 | * 保持内核页表不变的优势 55 | * 内核可以通过使用 4 MB 的页来访问大块连续物理内存 56 | * **伙伴系统(Buddy System)** 57 | * 11 个内存页块链表,大小为 2^0 - 2^10 个连续页框的内存页块 58 | * Linux 2.6 使用 3 个伙伴系统,分别管理:DMA 页框、普通页框、高内存区页框 59 | * **分配一块内存**:比如需要分配一块具有 13 个页框的连续内存,需要用掉一个具有 2^4=16 个页框的内存页块,然后归还 2^0=1 和 2^1=2 的内存页块各一个,挂到相应链表上 60 | * **释放一块内存**:尽量合并成大的内存页块(伙伴内存块之间合并) 61 | * 伙伴内存块:内存页块大小一致、物理地址连续 62 | * 如下图中的 P0 和 P1 为伙伴内存块,如果 P1、C1、B1 空闲,释放 P0 ,即可逐级合并直到释放 A0 63 | 64 |

65 | 66 | * **伙伴系统(续)** 67 | * 伙伴系统的数据结构 68 | * 管理区描述符 `zone` 中的 `struct free_area free_area[11];` 字段 69 | * `free_area[k]` 管理所有大小为 2^k 个页框的空闲内存页块 70 | * `free_area[k]->free_list` 字段,是一个双向循环链表的头指针,管理所有大小为 2^k 个页框的空闲内存页块的页描述符 71 | * 仍然存在外碎片化 72 | 73 |

74 | 75 | * **伙伴系统(续)** 76 | * 碎片化解决方案:迁移类型 77 | * 迁移类型按照伙伴系统的最大分配大小进行划分,即 2^10=1 K 个页框,即 4 MB 78 | * 三种类型: 79 | * 不可移动类型(UNMOVABLE):在内存中有固定位置,不能移动到其它地方 80 | * 可移动类型(MOVABLE):可以随意移动的页面 81 | * 可回收类型(RECLAIMABLE):不能直接移动,但是可以回收 82 | * 不可移动页面不允许在可移动页面中申请,避免导致碎片 83 | 84 |

85 | 86 | ### (2)内存区管理 87 | 88 |

89 | 90 | * Buddy System 分配的连续页框,可能不能全部用完,导致内碎片(小的 Object,比如进程描述符、各种锁等) 91 | * Linux 引入 Slab 分配器,以避免内碎片 92 | * Slab 分配器的假设 93 | * Slab 分配器将内存区看作对象 objects 94 | * 内核函数通常反复申请相同类型的内存区,申请请求可以根据它们的频率分类 —— 按申请对象划分不同的 cache 95 | * Buddy System 的函数调用会 “污染” 硬件缓存,从而增加平均内存访问时间 96 | * **Slab 分配器(Slab Allocator)** 97 | * Slab 分配器组织为 caches 98 | * 每个 cache 存储同种类型的 object,划分为多个 slab 99 | * 每个 slab 由一个或多个连续页框组成,包括 allocated objects 和 free objects 100 | 101 |

102 | 103 |

104 | 105 | * **Slab 分配器(续)** 106 | * **cache 描述符**类型 `kmem_cache_t` 107 | * `struct kmem_list3 lists;` 字段 108 | * `struct list_head slabs_partial;` **部分满 slab** 对应的 slab descriptor 组织成的双向循环链表 109 | * `struct list_head slabs_full;` **全满 slab** 对应的 slab descriptor 组织成的双向循环链表 110 | * `struct list_head slabs_free;` **全空 slab** 对应的 slab descriptor 组织成的双向循环链表 111 | * `unsigned long free_objects;` cache 中的 free objects 数量 112 | * **slab 描述符**类型 `slab` 113 | * `struct list_head list;` 该 slab 描述符对应的 `slabs_partial/full/free` 链表指针 114 | * `void *s_mem;` 第一个 slab object 的地址(allocated or free) 115 | * `unsigned int inuse;` slab 中 allocated objects 数量 116 | * `unsigned int free;` slab 中下一个 free object 的索引,如果没有则为 `BUFCTL_END` 117 | * **object 描述符**类型 `kmem_bufctl_t` 118 | 119 |

120 | 121 | * **Slab 分配器(续)** 122 | * **通用 cache & 专用 cache** 123 | * 通用 cache 124 | * `kmem_cache_init()` 125 | * 26 caches:13 种不同大小的 cache,每种 2 个 126 | * 一个 `malloc_sizes` 表指向 26 个 cache 描述符 127 | * 专用 cache 128 | * `kmem_cache_create()` 129 | * **slab 创建 & 释放** 130 | * slab 创建:当请求分配一个新的 object,但 cache 中没有 free object 131 | ```c 132 | struct kmem_cache_t *cache; 133 | if ((cache->lists).free_objects == 0) { 134 | // create a new slab: cache_grow() 135 | } 136 | ``` 137 | * slab 释放:当 cache 中存在太多 free object 时,有一个定时函数周期性检查 cache 中是否有可释放的完全空闲的 slab 138 | ```c 139 | // periodic function to check whether to free a slab 140 | void check() { 141 | struct kmem_cache_t *cache; 142 | if ((cache->lists).free_objects > threshold) { 143 | // free a slab: slab_destroy() 144 | // free a slab from `cache->slabs_free` 145 | } 146 | } 147 | ``` 148 | * **free object 队列**: 149 | * allocated object 描述符 -> allocated object 属性 150 | * free object 描述符 -> 下一个 free object 的 index,这样实现了 slab 内部 free object 的一个队列 151 | 152 |

153 | 154 |

155 | 156 | * **Slab 分配器(续)** 157 | * object 分配 & 释放 158 | * slab object 159 | * 分配:`kmem_cache_alloc()` 160 | * 释放:`kmem_cache_free()` 161 | * general object 162 | * 分配:`kmalloc()` 163 | * 释放:`kfree()` 164 | * 内存池 Memory Pool:`mempool_t` 165 | * 分配:`mempool_alloc()` 166 | * 释放:`mempool_free()` 167 | ### (3)非连续内存区管理 168 | * 为了避免外碎片 169 | * 描述符 `vm_struct` 170 | * `get_vm_area()`:寻找线性地址的一个空闲区域 171 | * `vmalloc()` 分配一个非连续内存区 172 | * `vfree()` 释放一个非连续内存区 173 | 174 | ### (4)页替换策略 175 | * 页面替换:当内存已装满,又要装入新页时,必须按一定的算法选择将内存中的某些页调出 176 | * **抖动(thrashing)**:刚调出的页又需要调入,刚调入的页又调出,频繁的调入调出带来时间开销,称为抖动 177 | * **最佳算法**(Belady 算法,理论算法) 178 | * 换出的页:距现在最长时间后再次被访问的页(∞ 时间 = 不再访问的页) 179 | * 缺页中断率最低 180 | * 现实中无法实现,无法对以后访问的页面作出断言 181 | * 可以作为衡量其它页替换算法的标准(baseline) 182 | * **FIFO 算法** 183 | * 换出的页:最先调入内存的页/内存中驻留时间最长的那一页 184 | * 基于程序总是按线性顺序访问物理空间的假设 185 | * **LRU 算法** 186 | * 换出的页:最近一段时间内,较久未被访问的页 187 | * 基于程序执行的局部性特征,刚被使用过的页可能马上还要被用到 188 | * 实现:维护一个 LRU 队列 189 | * LRU 队列存放当前在内存中的页号 190 | * 每访问一次页面,将该页面调整至队尾 191 | * 当发生缺页中断时,换出队头的页面 192 | * **Second Chance 算法** 193 | * 改进 FIFO 算法,结合页表中的引用位 194 | * 最先进入内存的页面,如果最近仍在被使用,还是有机会像新调入页面一样留在内存中 195 | * 检查 FIFO 队首: 196 | * 引用位 = 0,直接换出 197 | * 引用位 = 1,引用位 <- 0,重新移到队尾(给予 second chance) 198 | * **Clock 算法** 199 | * 内存中的页面组织成一个循环队列 200 | * 一个页面调入内存,引用位 <- 0 201 | * 一个页面被访问,引用位 <- 1 202 | * 换出页面时,**从指针当前指向页面开始**,扫描循环队列 203 | * 引用位 = 0,直接换出,指针推进一步 204 | * 引用位 = 1,引用位 <- 0,跳过该页面 205 | * 如果所有页面引用位均为 1,指针绕循环队列一圈,将所有引用位设置为 0,指针重新回到起始位置,换出这一页,再推进一步 206 | 207 |

208 | 209 | * **改进的 Clock 算法** 210 | * 页表中:引用位(r),修改位(m),换出次序 211 | * r = 0,m = 0 212 | * r = 0,m = 1 213 | * r = 1,m = 0 214 | * r = 1,m = 1 215 | * 具体步骤: 216 | 1. 从指针当前位置开始,扫描循环队列,将遇到的第一个 **r = 0,m = 0** 的页面换出,扫描过程**不改变引用位(r)** 217 | 2. 若步骤 1 失败(回到起始位置),重新扫描循环队列,将遇到的第一个 **r = 0,m = 1** 的页面换出,扫描过程**将引用位(r)设置为 0** 218 | 3. 若步骤 2 失败(回到起始位置,引用位全部为 0),重新进行步骤 1 操作,若失败则进行步骤 2 操作,一定可以选出淘汰页面 219 | 220 | * **LRU-K 算法** 221 | * 维护两个队列:短期访问 LRU 队列、长期访问 LRU 队列 222 | * 被访问页面调整至相应 LRU 队列队尾,K 次访问后从短期队列移至长期队列队尾,首次访问页面放在短期队列队尾 223 | * 首先考虑淘汰短期队列队头页面,如果短期队列为空,再考虑淘汰长期队列队头页面 224 | 225 |

226 | 227 | * **2Q 算法** 228 | * 维护两个队列:FIFO 队列、LRU 队列 229 | * 被访问页面如果在 LRU 队列则调整至 LRU 队列队尾,两次访问后从 FIFO 队列移至 LRU 队列 230 | * 首先考虑淘汰 FIFO 队列队头页面,如果 FIFO 队列为空,再考虑淘汰 LRU 队列队头页面 231 | 232 |

233 | -------------------------------------------------------------------------------- /notes/LK_note_11.md: -------------------------------------------------------------------------------- 1 | # LK Note 11 2 | > SJTU-CS353 Linux Kernel 3 | 4 | > Refer to the slides of Prof. Quan Chen, Dept. of CSE, SJTU. 5 | ## Lec 11. Virtual File System 6 | ### (1)虚拟文件系统概述 7 | * VFS 位于系统调用之下,物理文件系统之上,是多种物理文件系统的通用接口 8 | * VFS 支持磁盘文件系统(如 Ext2)、网络文件系统(如 NFS)、其它特殊文件系统(如 /proc) 9 | * 一些术语 10 | * 目录(Directory),是一种特殊类型的文件,包含指向其它文件(包括目录)的 “指针”,目录连接在一起形成分层的命名空间 11 | * 元数据(Metadata),描述文件的信息 12 | * 索引节点(Inode),与文件一一对应,具有唯一性,保存与文件有关的文件属性和数据块的位置,存放在硬盘中(硬盘上的索引节点) 13 | * 数据块(Data Blocks),包含文件数据,在物理上可能不是连续的 14 | * 通用文件模型 15 | * 定义通用文件模型接口和相关的数据结构 16 | * 要实现某个具体的文件系统,必须将其物理组织结构转换为虚拟文件系统的通用文件模型 17 | * 面向对象:数据 + 方法(Linux 内核软件对象,存放在内存中) 18 | * **superblock**(超级块对象):一个挂载的文件系统 19 | * **inode**(索引节点对象):文件元数据,唯一标识文件系统中的文件 20 | * **file**(文件对象):与进程进行交互的信息 21 | * **dentry**(目录项对象):目录条目,路径名的单个组件 22 |

23 | 24 | * 进程与文件的交互 25 | 26 |

27 | 28 | * 虚拟文件系统的操作 29 | * `super_operations`:在特定文件系统上调用 30 | * `inode_operations`:在特定(指向文件的)索引节点上调用 31 | * `dentry_operations`:在特定目录项上调用 32 | * `file_operations`:在文件上调用 33 | * 底层文件系统对上述函数实现自己的版本,需要时虚拟文件系统调用该版本;若底层文件系统未定义,则虚拟文件系统调用该方法的通用版本 34 | 35 | ### (2)虚拟文件系统的数据结构 36 | #### 【 superblock 对象】 37 | * 由每个文件系统实现,描述特定文件系统信息 38 | * 物理位置通常在每个分区开头 —— 文件系统控制块 39 | * **`struct super_block`** 40 | * `struct super_operations *s_op;` 超级块操作 41 | * `struct dentry *s_root;` 挂载目录 42 | * `unsigned char s_dirt;` dirty flag,整个文件系统是否有文件被修改 43 | * ... ... 44 | #### 【 inode 对象】 45 | * 操作文件/目录所需的所有信息 46 | * 在内存中创建(大多从硬盘索引节点中直接读入) 47 | * 超级块对象对应 3 个双向循环链表: 48 | * 没有被任何进程使用的 inode 链表 49 | * 当前被某些进程使用的 inode 链表 50 | * 脏 inode 链表 51 | * **`struct inode`** 52 | * `struct inode_operations *i_op;` 索引节点操作 53 | * `struct list_head i_dentry;` 目录项列表 54 | * `atomic_t i_count;` 引用计数 55 | * `struct super_block *i_sb;` 关联的超级块 56 | * ... ... 57 | #### 【dentry 对象】 58 | * VFS 把每个目录看作若干子目录和文件组成的普通文件 59 | * `/bin/vi` 包括 3 个 dentry 对象:`/`,`bin`,`vi`,都是文件,对应一个 inode 60 | * **`struct dentry`** 61 | * `struct dentry_operations *d_op;` 目录项操作 62 | * `atomic_t d_count;` 使用计数(多少进程正在访问该对象) 63 | * `struct inode *d_inode;` 关联的索引节点 64 | * ... ... 
65 | * dentry 对象状态 66 | * `Free` 67 | * 没有分配 inode 对象 68 | * `Unused` 69 | * 分配了有效 inode 对象(`->d_inode` 指向对应 inode 对象) 70 | * 用户数量为 0(`->d_count` 为 0 ,没有进程通过该 dentry 对象访问文件) 71 | * 由于指向了有效 inode 对象,它放在高速缓存中(更快地进行路径查找,必要时被丢弃以回收内存) 72 | * `In use` 73 | * 分配了有效 inode 对象 74 | * 用户数量 > 0 75 | * 由于正在被使用,不能从高速缓存中释放 76 | * `Negative` 77 | * 分配了无效 inode 对象(`->d_inode` 为 NULL) 78 | * 可能原因:程序尝试打开不存在的文件;该文件的索引节点被删除 79 | * 可能被保存在目录项高速缓存中 80 | * 目录项高速缓存 81 | * 存储目录项对象 82 | * 包括三个部分: 83 | * `In use` 目录项双向链表 84 | * LRU 的 `Unused` 和 `Negative` 目录项双向链表 85 | * 散列表(哈希函数),快速解析给定路径的目录项对象 86 | #### 【File 对象】 87 | * 表示一个被进程打开的文件:在进程打开文件时创建 File 对象,放在内存中 88 | * File 对象通过名为 filp 的 slab 缓存分配 89 | * **`struct file`** 90 | * `struct file_operations *f_op;` 文件操作 91 | * `struct dentry *f_dentry;` 与文件关联的目录项对象 92 | * `unsigned int f_flags;` 打开文件标志 93 | * 文件相关的系统调用 94 | * 进程描述符 `task_struct` 中的: 95 | * `struct fs_struct *fs;` 指向当前目录 96 | * `struct files_struct *files;` 指向文件描述符 97 | * `files->fd` 文件描述符 98 | * `fd[0]:stdin` 99 | * `fd[1]:stdout` 100 | * `fd[2]:stderr` 101 | * `NR_OPEN` 一个进程的文件描述符最大数量 102 | 103 | ### (3)文件系统的类型 104 | * 特殊文件系统,例如: 105 | * `/proc`:对内核数据结构的访问点 106 | * `/sys`:对系统数据的访问点 107 | * 文件系统类型的注册 108 | * VFS 通过此来跟踪内核所包含的文件系统类型 109 | * 文件系统类型对象 `file_system_type` 110 | * 所有的 `file_system_type` 对象被插入到一个单向链表中 111 | 112 | ### (4)文件系统的管理 113 | > **挂载(mount)**:操作系统将存储设备分区挂载到某个目录(挂载点)下,用户可以通过访问该目录访问存储在设备上的文件 114 | 115 | * 根文件系统,/proc 虚拟文件系统是根文件系统的子节点 116 | * 命名空间:Linux 2.6 中,每个进程都有自己的挂载文件系统树 117 | * 文件系统的安装与卸载 118 | * 挂载一个 ext2 类型的文件系统,其物理位置在 /dev/fd0,挂载点在 /flp 目录 119 | * `mount -t ext2 /dev/fd0 /flp` 120 | * Linux 可以多次安装相同的文件系统,但始终只有一个超级块对象(与文件系统一一对应) 121 | * 已安装文件系统描述符:类型 `vfsmount` 122 | * 安装过程: 123 | * ① 调用 `path_lookup()` 查找挂载点路径名;② 检查挂载标志,确定必须执行的操作;③ `do_kern_mount()`;④ 终止挂载点的路径名查找;⑤ 返回 124 | * 卸载过程: 125 | * ① 调用 `path_lookup()` 查找挂载点路径名;② 检查正确性和权限;③ `do_umount()`;④ 减少相应的计数器;⑤ 返回 126 | 127 | ### (5)路径查找 128 | * 目标:路径名 -> 索引节点 129 | * 
难点:检查每个目录的访问权限,符号链接(可能引发循环引用),文件名可以是已安装文件系统的挂载点,路径查找必须在发出系统调用的进程的命名空间内完成 130 | * 分析路径名,将其分解为一系列文件名 131 | * 绝对路径:从 `current->fs->root` 开始 132 | * 相对路径:从 `current->fs->pwd` 开始 133 | * `path_lookup()` 返回 `nameidata` 结构 134 | * `struct dentry *dentry;` :目录项对象 135 | * `struct vfsmount *mnt;` :路径名中最后一个已解析组件的、已安装文件系统对象 136 | * 查找操作真正目标,是倒数第二个组件,最后一个不会被解释 137 | * 符号链接:必须在继续进行原始路径查找之前,对符号链接进行解释 138 | 139 | ### (6)文件加锁 140 | * 劝告锁:基于 `fcntl()` 系统调用,可以锁定文件的任意区域 141 | * 非强制,实际上其它进程仍然可以读写 142 | * 强制锁:内核检查每个系统调用,确保每个操作不能违反强制锁 143 | * Linux 支持对整个文件、文件的某一部分进行加锁 144 | 145 | -------------------------------------------------------------------------------- /notes/LK_note_12.md: -------------------------------------------------------------------------------- 1 | # LK Note 12 2 | > SJTU-CS353 Linux Kernel 3 | 4 | > Refer to the slides of Prof. Quan Chen, Dept. of CSE, SJTU. 5 | ## Lec 12. Linux File System Implementations 6 | * Linux 文件系统发展:Minix (0.11) -> Ext -> Ext2 (2.4) -> Ext3 -> Ext4 7 | * 硬盘上的分区:主引导扇区、FAT32、NTFS、MINIX、EXT2 8 | 9 | ### Minix 文件系统 10 | * **Minix 文件系统组成** 11 | * **引导块**(Boot Block) 12 | * **超级块**(Super Block) 13 | * **Inode 位图**(Inode Bitmap) 14 | * **数据块位图**(Data Block Bitmap) 15 | * **Inode 表**(Inode Table) 16 | * **数据区盘块**(Data Block) 17 | * Minix 文件系统的源代码在 0.11 内核的 `linux/fs` 目录下 18 | * SuperBlock / Inode 数据结构包括:磁盘和内存共有字段(内存部分由磁盘导入)、内存特有字段 19 | * **Inode 的 `i_mode` 字段** 20 | * 保存文件类型、访问权限属性 21 | * short 类型,2 B 22 | * 字符设备文件 —— 键盘、鼠标;块设备文件 —— 磁盘 23 | * 执行时设置用户 ID 举例: 24 | 25 | > 普通用户对文件 `/etc/passwd` 没有 write 权限,但是可以通过 `passwd` 命令修改密码,这一操作写了 `/etc/passwd` 文件。原因是,`passwd` 程序(如 `/usr/bin/passwd`)的属主是 root,而 root 对 `/etc/passwd` 文件有写权限,并且 `passwd` 程序文件设置了执行时设置用户 ID(SUID)的标志位,这样普通用户在执行 `passwd` 程序时,进程的有效用户 ID 变为 root,是以 root 身份打开 `/etc/passwd` 文件的,因此也有了 write 的权限,但实际上这一权限是受控的,用户只能修改自己的密码,不能查看和修改别人的密码。 26 | 27 |

28 | 29 | * **Inode 的 `i_zone[9]` 字段** 30 | * 文件的混合索引分配 31 | * `i_zone[0] - i_zone[6]` 存放文件开始的 7 个磁盘块号 —— **直接块** 32 | * `i_zone[7]` 存放 512 个磁盘块号,寻址 512 个磁盘块 —— **一级间接块** 33 | * `i_zone[8]` 存放 512×512 个磁盘块号,寻址 512×512 个磁盘块 —— **二级间接块** 34 | * 文件的最大大小 = (7 + 512 + 512×512) × 1 KB = 256.5 MB(假设磁盘块大小为 1 KB) 35 | 36 |

37 | 38 | * **文件类型和属性:`ls -l`** 39 | 40 |

41 | 42 | * **目录项结构(Dentry)** 43 | ```c 44 | #define NAME_LEN 14 45 | #define ROOT_INO 1 46 | 47 | struct dir_entry { 48 | unsigned short inode; // inode 号 49 | char name[NAME_LEN]; // 文件名 50 | }; 51 | ``` 52 | 53 |

54 | 55 | * **`hexdump` 查看目录项数据块** 56 | * `hexdump .` 当前目录 57 | * **Linux 文件系统底层函数** —— 文件系统自身管理(分配 inode/盘块...) 58 | 59 |

60 | 61 |

62 | 63 | * **Linux 文件系统数据访问操作** 64 | * 顶层 —— 文件读写系统调用:`read_write.c` 65 | * 底层 —— 块设备、普通文件、字符设备、管道设备的读写 66 | * 块设备:`block_dev.c` 67 | * 普通文件:`file_dev.c` 68 | * 字符设备:`char_dev.c` 69 | * 管道设备:`pipe.c` 70 | 71 |

72 | 73 |

74 | 75 | * **文件访问模式 `f_mode`** 76 | 77 |

78 | 79 | * **进程打开文件使用的内核数据结构** 80 | 81 |

82 | 83 | * **Linux 文件系统上层函数** —— 文件和目录系统调用,面向应用层 84 | 85 |

86 | 87 |

88 | 89 | * **高速缓冲区** 90 | * `buffer.c` 用于对高速缓冲区进行操作和管理 91 | * 缓冲头 `buffer_head` -> 缓冲块(一个磁盘块大小) 92 | 93 |

94 | 95 |

96 | 97 | * 缓冲头组成的 LRU 双向循环链表(空闲表指针 `b_prev_free/b_next_free`) 98 | 99 |

100 | 101 | * 缓冲头 Hash 队列(Hash 表项指针 `b_prev/b_next`) 102 | 103 |

104 | 105 | * 块设备访问操作 106 |

107 | 108 | * **Ext2 文件系统** 109 |

110 | -------------------------------------------------------------------------------- /notes/imgs/0/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/0/1.png -------------------------------------------------------------------------------- /notes/imgs/0/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/0/10.png -------------------------------------------------------------------------------- /notes/imgs/0/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/0/11.png -------------------------------------------------------------------------------- /notes/imgs/0/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/0/2.png -------------------------------------------------------------------------------- /notes/imgs/0/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/0/3.png -------------------------------------------------------------------------------- /notes/imgs/0/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/0/4.png -------------------------------------------------------------------------------- /notes/imgs/0/5.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/0/5.png -------------------------------------------------------------------------------- /notes/imgs/0/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/0/6.png -------------------------------------------------------------------------------- /notes/imgs/0/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/0/7.png -------------------------------------------------------------------------------- /notes/imgs/0/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/0/8.png -------------------------------------------------------------------------------- /notes/imgs/0/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/0/9.png -------------------------------------------------------------------------------- /notes/imgs/1/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/1/1.png -------------------------------------------------------------------------------- /notes/imgs/1/2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/1/2.png -------------------------------------------------------------------------------- /notes/imgs/1/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/1/3.png -------------------------------------------------------------------------------- /notes/imgs/1/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/1/4.png -------------------------------------------------------------------------------- /notes/imgs/1/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/1/5.png -------------------------------------------------------------------------------- /notes/imgs/1/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/1/6.png -------------------------------------------------------------------------------- /notes/imgs/10/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/10/1.png -------------------------------------------------------------------------------- /notes/imgs/10/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/10/10.png 
-------------------------------------------------------------------------------- /notes/imgs/10/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/10/11.png -------------------------------------------------------------------------------- /notes/imgs/10/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/10/12.png -------------------------------------------------------------------------------- /notes/imgs/10/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/10/13.png -------------------------------------------------------------------------------- /notes/imgs/10/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/10/2.png -------------------------------------------------------------------------------- /notes/imgs/10/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/10/3.png -------------------------------------------------------------------------------- /notes/imgs/10/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/10/4.png -------------------------------------------------------------------------------- /notes/imgs/10/5.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/10/5.png -------------------------------------------------------------------------------- /notes/imgs/10/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/10/6.png -------------------------------------------------------------------------------- /notes/imgs/10/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/10/7.png -------------------------------------------------------------------------------- /notes/imgs/10/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/10/8.png -------------------------------------------------------------------------------- /notes/imgs/10/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/10/9.png -------------------------------------------------------------------------------- /notes/imgs/11/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/11/1.png -------------------------------------------------------------------------------- /notes/imgs/11/2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/11/2.png -------------------------------------------------------------------------------- /notes/imgs/12/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/1.png -------------------------------------------------------------------------------- /notes/imgs/12/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/10.png -------------------------------------------------------------------------------- /notes/imgs/12/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/11.png -------------------------------------------------------------------------------- /notes/imgs/12/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/12.png -------------------------------------------------------------------------------- /notes/imgs/12/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/13.png -------------------------------------------------------------------------------- /notes/imgs/12/14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/14.png 
-------------------------------------------------------------------------------- /notes/imgs/12/15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/15.png -------------------------------------------------------------------------------- /notes/imgs/12/16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/16.png -------------------------------------------------------------------------------- /notes/imgs/12/17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/17.png -------------------------------------------------------------------------------- /notes/imgs/12/18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/18.png -------------------------------------------------------------------------------- /notes/imgs/12/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/2.png -------------------------------------------------------------------------------- /notes/imgs/12/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/3.png -------------------------------------------------------------------------------- /notes/imgs/12/4.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/4.png -------------------------------------------------------------------------------- /notes/imgs/12/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/5.png -------------------------------------------------------------------------------- /notes/imgs/12/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/6.png -------------------------------------------------------------------------------- /notes/imgs/12/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/7.png -------------------------------------------------------------------------------- /notes/imgs/12/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/8.png -------------------------------------------------------------------------------- /notes/imgs/12/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/12/9.png -------------------------------------------------------------------------------- /notes/imgs/2/1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/2/1.png -------------------------------------------------------------------------------- /notes/imgs/2/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/2/2.png -------------------------------------------------------------------------------- /notes/imgs/2/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/2/3.png -------------------------------------------------------------------------------- /notes/imgs/3/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/3/1.png -------------------------------------------------------------------------------- /notes/imgs/4/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/4/1.png -------------------------------------------------------------------------------- /notes/imgs/4/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/4/2.png -------------------------------------------------------------------------------- /notes/imgs/4/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/4/3.png 
-------------------------------------------------------------------------------- /notes/imgs/4/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/4/4.png -------------------------------------------------------------------------------- /notes/imgs/4/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/4/5.png -------------------------------------------------------------------------------- /notes/imgs/4/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/4/6.png -------------------------------------------------------------------------------- /notes/imgs/4/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/4/7.png -------------------------------------------------------------------------------- /notes/imgs/5/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/5/1.png -------------------------------------------------------------------------------- /notes/imgs/5/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/5/2.png -------------------------------------------------------------------------------- /notes/imgs/5/3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/5/3.png -------------------------------------------------------------------------------- /notes/imgs/5/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/5/4.png -------------------------------------------------------------------------------- /notes/imgs/6/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/6/1.png -------------------------------------------------------------------------------- /notes/imgs/6/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/6/2.png -------------------------------------------------------------------------------- /notes/imgs/7/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/7/1.png -------------------------------------------------------------------------------- /notes/imgs/7/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/7/2.png -------------------------------------------------------------------------------- /notes/imgs/8/1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/8/1.png -------------------------------------------------------------------------------- /notes/imgs/8/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/8/2.png -------------------------------------------------------------------------------- /notes/imgs/8/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/8/3.png -------------------------------------------------------------------------------- /notes/imgs/8/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/8/4.png -------------------------------------------------------------------------------- /notes/imgs/8/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/8/5.png -------------------------------------------------------------------------------- /notes/imgs/8/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/8/6.png -------------------------------------------------------------------------------- /notes/imgs/8/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/8/7.png 
-------------------------------------------------------------------------------- /notes/imgs/8/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/8/8.png -------------------------------------------------------------------------------- /notes/imgs/9/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/9/1.png -------------------------------------------------------------------------------- /notes/imgs/9/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/9/2.png -------------------------------------------------------------------------------- /notes/imgs/9/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/9/3.png -------------------------------------------------------------------------------- /notes/imgs/9/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/9/4.png -------------------------------------------------------------------------------- /notes/imgs/9/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/9/5.png -------------------------------------------------------------------------------- /notes/imgs/9/6.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/imgs/9/6.png -------------------------------------------------------------------------------- /notes/pdfs/linux_source.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/pdfs/linux_source.pdf -------------------------------------------------------------------------------- /notes/pdfs/linux_system_installed.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhliuworks/Linux-Kernel-notes/8bb5a1c396b8a475881f7547ab948c468cf28968/notes/pdfs/linux_system_installed.pdf --------------------------------------------------------------------------------