├── .gitignore ├── 00-introduction.md ├── 11-process.md ├── 12-scheduling.md ├── 13-segmentation.md ├── 14-paging.md ├── 15-virtualization.md ├── 16-midterm.md ├── 21-locking.md ├── 22-semaphores.md ├── 23-concurrency.md ├── 31-drivers.md ├── 32-filesystem.md ├── 33-ffs.md ├── 34-journaling.md ├── 35-persistence.md ├── 41-testing.md ├── 42-mitigations.md ├── 43-security.md ├── 50-os.md ├── Makefile ├── README.md ├── demos ├── 11-fork.c ├── 13-heap.c ├── 13-quiz.c ├── 21-race.c ├── 21-race0no.c ├── 21-race1sw.c ├── 21-race2gp.c ├── 21-race3tas.c ├── 21-race4cas.c ├── 21-race5pth.c ├── 22-producer.c ├── 22-semaphore.c ├── 22-thread_exit.c ├── 22-workers.c ├── 32-tempfile.c └── Makefile ├── figures ├── 00-hamilton_kernighan.png ├── 00-puppets.jpg ├── 00-safe_space.jpg ├── 00-waiter.png ├── 12-scheduling.jpg ├── 12-timetable.png ├── 12-tractor.png ├── 31-filestack.png ├── 31-pdp11.jpg ├── 32-nyancat.gif ├── 32-winfs.png ├── 33-portal.jpg ├── 33-thankyou.jpg ├── 34-journaling.jpg ├── 41-asan.gif ├── 41-coveragewall.png ├── 41-fuzzing.png ├── 42-aslr.png ├── 42-cfi.png ├── 42-dep.png ├── 42-dep_and_aslr.png ├── 42-mem_safety_overview.png ├── 50-done1.jpg ├── 50-done2.jpg └── 50-learn.jpg ├── pdf └── README.md └── preamble.tex /.gitignore: -------------------------------------------------------------------------------- 1 | pdf/*.pdf 2 | -------------------------------------------------------------------------------- /00-introduction.md: -------------------------------------------------------------------------------- 1 | --- 2 | subtitle: Introduction 3 | --- 4 | 5 | # Topics covered in this lecture 6 | 7 | * What you will learn in this course 8 | * What an OS is and why you want one 9 | * Why you should know about OSes 10 | 11 | --- 12 | 13 | # This class provides a safe space 14 | 15 | ![Equality, Diversity, Dialogue, Responsibility, Tolerance, and Inclusion form 16 | the basis for a safe space.](./figures/00-safe_space.jpg) 17 | 18 | --- 19 | 20 | # Class organization 
21 | 22 | * Lectures cover OS design 23 | * Book: [OSTEP](http://pages.cs.wisc.edu/~remzi/OSTEP/) 24 | * Five (graded) labs focus on practical OS aspects 25 | - C programming (out: 09/22; in: 10/05) 26 | - Threading (out: 10/06; in: 10/26) 27 | - Concurrency (out: 10/27; in: 11/16) 28 | - File systems and storage (out: 10/17; in: 12/07) 29 | - Security (out: 12/08; in: 12/22) 30 | * TAs handle all labs/homework questions 31 | * Grading 32 | - Quizzes after each class (10%) 33 | - Labs during the semester (50%) 34 | - Final exam in the exam session (40%) 35 | * Feedback: through questions, quizzes, emails, office hours. 36 | 37 | --- 38 | 39 | # Time management 40 | 41 | * 5 ECTS points map to, on average, 7 hours/week 42 | * Divide and conquer: theory and labs 43 | 44 | . . . 45 | 46 | * 3 hours of theory/lectures 47 | * 2 hours class and reading 48 | * 30 minutes quiz 49 | * 30 minutes exercise 50 | 51 | . . . 52 | 53 | * 4 hours of programming 54 | * 2 hours of lab session and Q&A 55 | * 2 hours implementation on your own 56 | 57 | --- 58 | 59 | # What is an Operating System? 60 | 61 | . . .
62 | 63 | \Begin{multicols}{2} 64 | 65 | \begin{tikzpicture} 66 | \node (A) at (0,6) [draw,orange,ultra thick,minimum width=4cm,minimum height=1.5cm] {User}; 67 | \node (B) at (0,4) [draw,green,ultra thick,minimum width=4cm,minimum height=1.5cm] {Application}; 68 | \node (C) at (0,2) [draw,blue,ultra thick,minimum width=4cm,minimum height=1.5cm] {Operating System}; 69 | \node (D) at (0,0) [draw,brown,ultra thick,minimum width=4cm,minimum height=1.5cm] {Hardware}; 70 | 71 | \path (A.south) -- (A.south west) coordinate[pos=0.5] (a00); 72 | \path (B.north) -- (B.north west) coordinate[pos=0.5] (b01); 73 | \draw[latex-] (a00) -- (b01); 74 | 75 | \path (B.south) -- (B.south west) coordinate[pos=0.5] (b00); 76 | \path (C.north) -- (C.north west) coordinate[pos=0.5] (c01); 77 | \draw[latex-] (b00) -- (c01); 78 | 79 | \path (C.south) -- (C.south west) coordinate[pos=0.5] (c00); 80 | \path (D.north) -- (D.north west) coordinate[pos=0.5] (d01); 81 | \draw[latex-] (c00) -- (d01); 82 | 83 | \path (A.south) -- (A.south east) coordinate[pos=0.5] (a10); 84 | \path (B.north) -- (B.north east) coordinate[pos=0.5] (b11); 85 | \draw[latex-] (b11) -- (a10); 86 | 87 | \path (B.south) -- (B.south east) coordinate[pos=0.5] (b10); 88 | \path (C.north) -- (C.north east) coordinate[pos=0.5] (c11); 89 | \draw[latex-] (c11) -- (b10); 90 | 91 | \path (C.south) -- (C.south east) coordinate[pos=0.5] (c10); 92 | \path (D.north) -- (D.north east) coordinate[pos=0.5] (d11); 93 | \draw[latex-] (d11) -- (c10); 94 | 95 | \end{tikzpicture} 96 | 97 | OS is middleware between applications and hardware. 98 | 99 | * Provides standardized interface to resources 100 | * Manages hardware 101 | * Orchestrates currently executing processes 102 | * Responds to resource access requests 103 | * Handles access control 104 | 105 | \End{multicols} 106 | 107 | --- 108 | 109 | # OS role #1: Standardized interface 110 | 111 | > The OS provides common functionality to access resources. 
112 | > The OS abstracts hardware, provides a unified interface (e.g., 113 | > network chips A and B are accessed using the same network API that allows 114 | > sending and receiving packets). 115 | 116 | * Challenges: 117 | * Defining the correct abstractions (e.g., at what level) 118 | * Deciding which hardware aspects should be exposed, and to what extent 119 | * Discussion: how to abstract GPUs 120 | 121 | --- 122 | 123 | # OS role #2: Resource management 124 | 125 | > The OS shares (limited) resources between applications. 126 | 127 | * Isolation: protect applications from each other 128 | * Scheduling: provide efficient and fair access to resources 129 | * Limits: restrict how much of each resource an application can use 130 | 131 | --- 132 | 133 | # OS role analogy 134 | 135 | The OS is like a waiter that serves individual clients. The waiter knows the 136 | menu, records orders, and delivers food to the right table while keeping track 137 | of the bill. 138 | 139 | ![OS as a waiter for processes](./figures/00-waiter.png) 140 | 141 | --- 142 | 143 | # What management services does an OS provide? 144 | 145 | * ***CPU:*** initializes program counter/registers, shares CPU 146 | * ***Program memory:*** initializes process address space, loads program (code, 147 | data, heap, stack) 148 | * ***Devices:*** read/write from/to disk; the device driver is hardware 149 | specific and abstracts devices to a common interface 150 | 151 | --- 152 | 153 | # (Short) History of Operating Systems 154 | 155 | * Started as a convenience library of common functions 156 | * Evolved from procedure calls to system calls 157 | * OS code executes at a higher privilege level 158 | * Moved from a single process to concurrently executing processes 159 | 160 | ![](./figures/00-hamilton_kernighan.png){width=300px} 161 | 162 | --- 163 | 164 | # OS building blocks 165 | 166 | OS design nicely separates into three pillars, with security as a 167 | cross-cutting layer spanning all pillars.
168 | 169 | \begin{tikzpicture} 170 | \draw [orange, ultra thick] (0,0) rectangle (2,4); 171 | \node[text width=3cm, rotate=90] at (1, 2.5) {Virtualization}; 172 | \draw [blue, ultra thick] (3,0) rectangle (5,4); 173 | \node[text width=3cm, rotate=90] at (4, 2.5) {Concurrency}; 174 | \draw [green, ultra thick] (6,0) rectangle (8,4); 175 | \node[text width=3cm, rotate=90] at (7, 2.5) {Persistence}; 176 | 177 | \draw [red, ultra thick] (0,4.5) -- (8,4.5); 178 | \draw [red, ultra thick] (0,4.5) -- (4,6); 179 | \draw [red, ultra thick] (4,6) -- (8,4.5); 180 | \node[text width=3cm] at (5, 5) {Security}; 181 | 182 | \end{tikzpicture} 183 | 184 | --- 185 | 186 | # Building block: Virtualization 187 | 188 | Each application believes it has all resources for itself 189 | 190 | * ***CPU:*** unlimited amount of instructions, continuous execution 191 | * ***Memory:*** unlimited memory is available 192 | * ***Challenge:*** how to share constrained resources 193 | 194 | --- 195 | 196 | # Building block: Concurrency 197 | 198 | OS must handle ***concurrent events*** and untangle them as necessary. 199 | 200 | * Hide concurrency from ***independent*** processes 201 | * Manage concurrency from ***dependent*** processes by providing 202 | synchronization and communication primitives 203 | * ***Challenge:*** providing the right primitives 204 | 205 | --- 206 | 207 | # Building block: Persistence 208 | 209 | Lifetime of information is greater than lifetime of a process. 210 | 211 | * Enable processes to access ***non-volatile information*** 212 | * Abstract how data is stored (through a file system) 213 | * Be ***resilient to failures*** (e.g., power loss) 214 | * Provide ***access control*** 215 | * ***Challenge:*** authentication and permissions 216 | 217 | --- 218 | 219 | # Building block: Security 220 | 221 | OS is a gatekeeper, it ensures and enforces security. OS is also privileged 222 | and therefore frequently attacked. 
223 | 224 | * ***Isolate*** processes from each other and the OS 225 | * ***Authenticate*** users (who is allowed to do what) 226 | * Protect itself against malicious network/user input 227 | * Harden program execution (through mitigations) 228 | * ***Challenge:*** performance versus security 229 | 230 | --- 231 | 232 | # Why you should study OS! 233 | 234 | * Build, modify, or administer an operating system. 235 | * Understand design decisions 236 | * Understand system performance 237 | * Enables understanding of complex systems 238 | * Turns you into a better (systems) programmer 239 | 240 | -------------------------------------------------------------------------------- /11-process.md: -------------------------------------------------------------------------------- 1 | --- 2 | subtitle: Virtual CPU (Processes and Threads) 3 | --- 4 | 5 | # Virtualization 6 | 7 | \begin{tikzpicture} 8 | \draw [fill, orange, ultra thick] (0,0) rectangle (2,4); 9 | \node[text width=3cm, rotate=90] at (1, 2.5) {Virtualization}; 10 | \draw [blue, ultra thick] (3,0) rectangle (5,4); 11 | \node[text width=3cm, rotate=90] at (4, 2.5) {Concurrency}; 12 | \draw [green, ultra thick] (6,0) rectangle (8,4); 13 | \node[text width=3cm, rotate=90] at (7, 2.5) {Persistence}; 14 | 15 | \draw [red, ultra thick] (0,4.5) -- (8,4.5); 16 | \draw [red, ultra thick] (0,4.5) -- (4,6); 17 | \draw [red, ultra thick] (4,6) -- (8,4.5); 18 | \node[text width=3cm] at (5, 5) {Security}; 19 | 20 | \end{tikzpicture} 21 | 22 | --- 23 | 24 | # Topics covered in this lecture 25 | 26 | * The (virtual) process abstraction 27 | * A notion on address spaces 28 | * How processes are created 29 | * Interaction between processes and the OS 30 | 31 | This slide deck covers chapters 4--6 in OSTEP. 32 | 33 | --- 34 | 35 | # What is a Process? 
36 | 37 | ![Processes are controlled by the Operating System](./figures/00-puppets.jpg) 38 | 39 | --- 40 | 41 | # CPU, memory, and disk: limitations 42 | 43 | Status quo[^1]: 44 | 45 | * CPUs execute an endless stream of instructions (in memory) 46 | * All system memory is in a contiguous physical address space 47 | * The disk is a finite set of blocks 48 | * All instructions execute in privileged mode 49 | 50 | To handle concurrent programs, the OS must *separate* the execution of different 51 | programs, providing each program the *illusion* that it is 52 | ***the only running program***. 53 | 54 | . . . 55 | 56 | The ***virtual*** **process abstraction** provides this illusion. 57 | 58 | [^1]: Some simplifying assumptions apply to make our life easier. 59 | 60 | --- 61 | 62 | # Process abstraction 63 | 64 | * A ***program*** consists of static code and data, e.g., on the disk. 65 | * A ***process*** is an instance of a program (at any time there may be 0 or 66 | more instances of a program running, e.g., a user may run multiple concurrent 67 | shells). 68 | 69 | --- 70 | 71 | # Process definition 72 | 73 | > A ***process*** is an ***execution stream*** in the context of a 74 | > ***process state***. 75 | > The execution stream is the sequence of executing instructions (i.e., the 76 | > "thread of control"). The process state encompasses everything that executing 77 | > instructions can affect or are affected by (e.g., registers, address space, 78 | > persistent state such as files). 79 | 80 | Note: state has two sides, the process view and the OS view. The OS keeps track 81 | of the address space and persistence.
82 | 83 | --- 84 | 85 | # Process creation process (1/2) 86 | 87 | \begin{tikzpicture} 88 | 89 | \node at (0,5) {CPU}; 90 | \node (A) at (0,3) [draw,ultra thick,minimum width=3cm, minimum height=3cm] {}; 91 | \node at (5,5) {Memory}; 92 | \node (B) at (5,3) [draw,ultra thick,minimum width=3cm, minimum height=3cm] {}; 93 | 94 | \node at (4.5,-2) {Disk}; 95 | \draw (2.5,-1) [ultra thick] circle (1.5cm); 96 | 97 | \node at (2.5,-1.9) {Program}; 98 | \node at (2.5,-1) [red,draw,ultra thick, text width=1.5cm, minimum width=1.8cm, 99 | minimum height=1.2cm] {code data}; 100 | 101 | \draw [ultra thick] (0.2, 1) -- (4.8, 1); 102 | \draw [ultra thick] (0.2, 1) -- (0.2, 1.5); 103 | \draw [ultra thick] (4.8, 1) -- (4.8, 1.5); 104 | \draw [ultra thick] (2.5, 1) -- (2.5, 0.5); 105 | 106 | 107 | \end{tikzpicture} 108 | 109 | --- 110 | 111 | # Process creation process (2/2) 112 | 113 | \begin{tikzpicture} 114 | 115 | \node at (0,5) {CPU}; 116 | \node (A) at (0,3) [draw,ultra thick,minimum width=3cm, minimum height=3cm] {}; 117 | \node at (5,5) {Memory}; 118 | \node (B) at (5,3) [draw,ultra thick,minimum width=3cm, minimum height=3cm] {}; 119 | 120 | \node at (4.5,-2) {Disk}; 121 | \draw (2.5,-1) [ultra thick] circle (1.5cm); 122 | 123 | \node at (2.5,-1.9) {Program}; 124 | \node (PRG) at (2.5,-1) [red,draw,ultra thick, text width=1.5cm, minimum width=1.8cm, 125 | minimum height=1.2cm] {code data}; 126 | 127 | \node at (5,2) {Process}; 128 | \node (PRC) at (5,3.2) [blue,draw,ultra thick, text width=1.5cm, minimum width=1.5cm, 129 | minimum height=1.2cm] {code data heap stack}; 130 | 131 | \draw [ultra thick] (0.2, 1) -- (4.8, 1); 132 | \draw [ultra thick] (0.2, 1) -- (0.2, 1.5); 133 | \draw [ultra thick] (4.8, 1) -- (4.8, 1.5); 134 | \draw [ultra thick] (2.5, 1) -- (2.5, 0.5); 135 | 136 | 137 | \draw [ultra thick, red] (2.5, -0.4) -- (2.5, 3.2); 138 | \draw [ultra thick, red, ->] (2.5, 3.2) -- (4, 3.2); 139 | 140 | \end{tikzpicture} 141 | 142 | --- 143 | 144 | # Comparison of 
terms: 145 | 146 | * A ***program*** is an on-disk application, consisting of code and data; 147 | a program becomes a process when it is executed 148 | * A ***process*** is a running instance of a program. A process starts with a 149 | single thread of execution and an address space. 150 | * A process can launch multiple ***threads*** of execution in the same address 151 | space. Each thread receives its own stack but they share global data, code, 152 | and heap. 153 | 154 | --- 155 | 156 | # Sharing resources: two forms 157 | 158 | . . . 159 | 160 | \Begin{multicols}{2} 161 | \begin{tikzpicture} 162 | \fill [blue] (1.4,1) rectangle (1.6,0.2); 163 | \fill [blue] (0,0) rectangle (3,0.2); 164 | 165 | \draw [ultra thick, purple] (0.2,0) -- (3,0); 166 | 167 | \fill [blue] (0,0) rectangle (0.2,-1); 168 | 169 | 170 | \draw [ultra thick, black] (0,0.2) -- (1.4,0.2) -- (1.4,1); 171 | \draw [ultra thick, black] (1.6,1) -- (1.6,0.2) -- (3,0.2); 172 | \draw [ultra thick, black] (0,0.2) -- (0,-1); 173 | \draw [ultra thick, black] (3,0.2) -- (3,-1); 174 | \draw [ultra thick, black] (0.2,-1) -- (0.2,0) -- (0.7,0) -- (0.7,-1); 175 | \draw [ultra thick, black] (0.9,-1) -- (0.9,0) -- (1.4,0) -- (1.4,-1); 176 | \draw [ultra thick, black] (1.6,-1) -- (1.6,0) -- (2.1,0) -- (2.1,-1); 177 | \draw [ultra thick, black] (2.3,-1) -- (2.3,0) -- (2.8,0) -- (2.8,-1); 178 | 179 | \end{tikzpicture} 180 | 181 | Time sharing (one at a time) 182 | 183 | \begin{tikzpicture} 184 | \fill [blue] (1.4,1) rectangle (1.6,0.2); 185 | \fill [blue] (0,0) rectangle (3,0.2); 186 | 187 | \draw [ultra thick, purple] (0,0) -- (3,0); 188 | 189 | \fill [blue!50] (0,0.03) rectangle (0.1,-1); 190 | \fill [blue!50] (0.7,0.03) rectangle (0.8,-1); 191 | \fill [blue!50] (1.4,0.03) rectangle (1.5,-1); 192 | \fill [blue!50] (2.1,0.03) rectangle (2.2,-1); 193 | \fill [blue!50] (2.8,0.03) rectangle (2.9,-1); 194 | 195 | \draw [ultra thick, black] (0,0.2) -- (1.4,0.2) -- (1.4,1); 196 | \draw [ultra thick, black] (1.6,1) --
(1.6,0.2) -- (3,0.2); 197 | \draw [ultra thick, black] (0,0.2) -- (0,-1); 198 | \draw [ultra thick, black] (3,0.2) -- (3,-1); 199 | \draw [ultra thick, black] (0.2,-1) -- (0.2,0) -- (0.7,0) -- (0.7,-1); 200 | \draw [ultra thick, black] (0.9,-1) -- (0.9,0) -- (1.4,0) -- (1.4,-1); 201 | \draw [ultra thick, black] (1.6,-1) -- (1.6,0) -- (2.1,0) -- (2.1,-1); 202 | \draw [ultra thick, black] (2.3,-1) -- (2.3,0) -- (2.8,0) -- (2.8,-1); 203 | 204 | \end{tikzpicture} 205 | 206 | Space sharing (all a little) 207 | 208 | \End{multicols} 209 | 210 | . . . 211 | 212 | * Shared in time (I get to use the toolbox exclusively) 213 | * Shared in space (I get to pick the two screwdrivers I need) 214 | 215 | --- 216 | 217 | # Virtualizing the CPU 218 | 219 | * Goal: give each process the illusion of exclusive CPU access 220 | * Reality: the CPU is a shared resource among all processes 221 | 222 | * Two approaches: shared in time or space 223 | * ***time sharing***: exclusive use, one at a time 224 | * ***space sharing***: everyone gets a small chunk all the time 225 | 226 | . . . 227 | 228 | * Different strategies for CPU, memory, and disk 229 | * ***CPU:*** time sharing, alternate between tasks 230 | * ***Memory:*** space sharing (more later) 231 | * ***Disk:*** space sharing (more later) 232 | 233 | --- 234 | 235 | # OS provides process abstraction 236 | 237 | * When the user executes a program, the OS creates a process. 238 | * OS time-shares CPU across multiple processes. 239 | * OS scheduler picks ***one*** of the executable processes to run. 240 | * Scheduler must keep a list of processes 241 | * Scheduler must keep metadata for policy 242 | 243 | --- 244 | 245 | # Difference between policy and mechanism 246 | 247 | * ***Policy:*** which process to run 248 | * ***Mechanism:*** how to switch from one process to another 249 | 250 | Distinction between policy and mechanism enables modularity. The scheduling 251 | policy is independent of the context switch functionality. 
252 | 253 | --- 254 | 255 | # Process creation 256 | 257 | * OS allocates internal data structures 258 | * OS allocates an address space 259 | * Loads code, data from disk 260 | * Creates runtime stack, heap 261 | * OS opens basic files (STDIN, STDOUT, STDERR) 262 | * OS initializes CPU registers 263 | 264 | --- 265 | 266 | # Process states 267 | 268 | * ***Running***: this process is currently executing 269 | * ***Ready***: this process is ready to execute (and will be scheduled when 270 | the policy decides so) 271 | * ***Blocked***: this process is suspended (e.g., waiting for some action; OS 272 | will unblock it when that action is complete) 273 | * ***New***: this process is being created (to ensure it will not be scheduled) 274 | * ***Dead***: this process has terminated (e.g., if the parent process has not 275 | read out the return value yet) 276 | 277 | --- 278 | 279 | # Process state transitions 280 | 281 | \begin{tikzpicture} 282 | 283 | \node [draw, circle, ultra thick, minimum width=2cm] at (0,0) {Blocked}; 284 | \node [draw, circle, ultra thick, minimum width=2cm] at (-3,3) {Running}; 285 | \node [draw, circle, ultra thick, minimum width=2cm] at (3,3) {Ready}; 286 | 287 | \draw [ultra thick, ->] (-2.3, 2.3) -- (-0.7, 0.7); 288 | \node at (-2.8, 1.2) {I/O: start}; 289 | \draw [ultra thick, ->] (0.7, 0.7) -- (2.3, 2.3); 290 | \node at (2.5, 1.2) {I/O: done}; 291 | \draw [ultra thick, ->] (-2, 3.1) -- (2, 3.1); 292 | \node at (0, 3.5) {Deschedule}; 293 | \draw [ultra thick, ->] (2, 2.9) -- (-2, 2.9); 294 | \node at (0, 2.5) {Schedule}; 295 | 296 | \end{tikzpicture} 297 | 298 | --- 299 | 300 | # Example: process state transitions 301 | 302 | | Time | Process 0 | Process 1 | Notes | 303 | |-------:|-----------|-----------|------------------------| 304 | | 1 | Running | Ready | | 305 | | 2 | Running | Ready | | 306 | | 3 | Running | Ready | P0 initiates I/O | 307 | | 4 | Blocked | Running | P0 is blocked, P1 runs | 308 | | 5 | Blocked | Running | | 309 | | 
6 | Blocked | Running | | 310 | | 7 | Blocked | Running | I/O completes | 311 | | 8 | Ready | Running | P1 is complete/exits | 312 | | 9 | Running | - | | 313 | 314 | 315 | --- 316 | 317 | # Tangent: idling 318 | 319 | What process should be scheduled if all processes are blocked? 320 | 321 | . . . 322 | 323 | The ***idle*** process. 324 | 325 | Modern kernels use a low-priority idle process that is scheduled and executes 326 | if no other process is ready. The idle process never blocks or executes any 327 | I/O. 328 | 329 | . . . 330 | 331 | The idle process is a simple solution to a challenging problem. 332 | Without the idle process, the scheduler would have to detect that no 333 | process is ready to run and handle that case specially. The 334 | idle process guarantees that there is always ***at least one*** process to run. 335 | 336 | --- 337 | 338 | # OS data structures 339 | 340 | * OS maintains a data structure (array/list) of active processes. 341 | * Information for each process is stored in a process control block (on Linux, 342 | this is called `task_struct`) that contains: 343 | * Process identifier (PID) 344 | * Process state (e.g., ready) 345 | * Pointer to parent process (`cat /proc/self/status`) 346 | * CPU context (if process is not running) 347 | * Pointer to address space (`cat /proc/self/maps`) 348 | * Pointer to list of open files (file descriptors, `cat /proc/self/fdinfo/*`) 349 | 350 | --- 351 | 352 | # Distinction program / process / thread 353 | 354 | * ***Program:*** consists of an executable on disk.
Contains all information 355 | to bootstrap a process 356 | * ***Process:*** a running instance of a program; has data section and stack 357 | initialized 358 | * ***Thread:*** a process can have multiple threads in the same address space 359 | (computing on the same data) 360 | 361 | --- 362 | 363 | # Distinction between processes and threads 364 | 365 | * A thread is a "lightweight process" (LWP) 366 | * A thread consists of a stack and register state (stack pointer, code pointer, other registers). 367 | * Each process has one or more threads. 368 | 369 | For example, two processes reading address `0xc0f3` may read different values, 370 | while two threads in the same process will read the same value. 371 | 372 | --- 373 | 374 | # Requesting OS services 375 | 376 | * Processes can request services through the system call API (Application 377 | Programming Interface). 378 | * System calls transfer execution to the OS (the OS generally runs at higher 379 | privileges, enabling privileged operations). 380 | * Sensitive operations (e.g., hardware access, raw memory access) require 381 | (execution) privileges. 382 | * Some system calls (e.g., `read`, `write`) may cause the process to block, 383 | allowing the OS to schedule other processes. 384 | * Libraries (the libc) hide system call complexity and export OS functionality as 385 | regular function calls. 386 | 387 | --- 388 | 389 | # Process API 390 | 391 | The process API enables a process to control itself and other processes through 392 | a set of system calls: 393 | 394 | * `fork()` creates a new child process (a copy of the process) 395 | * `exec()` executes a new program 396 | * `exit()` terminates the current process 397 | * `wait()` blocks the parent until the child terminates 398 | * This is a small subset of the complex process API (more later) 399 | 400 | --- 401 | 402 | # Process API: `fork()`, creating a new process 403 | 404 | * The OS allocates data structures for the new process (child).
405 | * The OS makes a copy of the caller's (parent's) address space. 406 | * The child is made ready and added to the list of processes. 407 | * `fork()` returns different values for parent/child. 408 | * Parent and child continue execution in ***their own separate copy*** of their 409 | address space (next week: how can we efficiently handle the copy of address 410 | spaces?) 411 | 412 | 413 | 414 | --- 415 | 416 | # Process API: `fork()` demo! 417 | 418 | ```.C 419 | #include <stdio.h> 420 | #include <stdlib.h> 421 | #include <unistd.h> 422 | 423 | int main(int argc, char* argv[]) { 424 | printf("Hello, I'm PID %d (%d, %s)\n", (int)getpid(), 425 | argc, argv[0]); 426 | int pid = fork(); 427 | if (pid < 0) exit(-1); // fork failed 428 | if (pid == 0) { 429 | printf("o/ I'm PID %d\n", (int)getpid()); 430 | } else { 431 | printf("\\o, my child is PID %d\n", pid); 432 | } 433 | return 0; 434 | } 435 | ``` 436 | 437 | --- 438 | 439 | # Process API: `exec()`, executing a (new) program 440 | 441 | * Always executing the same program is boring 442 | (we would need one massive program with all functionality, e.g., `emacs`). 443 | * `exec()` replaces address space, loads new program from disk. 444 | * Program can pass command line arguments and environment. 445 | * Old address space/state is destroyed except for STDIN, STDOUT, STDERR which 446 | are kept, allowing the parent to redirect/rewire child's output! 447 | 448 | --- 449 | 450 | # Why do we need `fork()` and `exec()`? 451 | 452 | Assume a user wants to start a different program. For that, the operating 453 | system needs to create a new process and create a new address space to load 454 | the program. 455 | 456 | . . .
457 | 458 | Let's use divide and conquer: 459 | 460 | * `fork()` creates a new process with a copy of this address space 461 | * `exec()` creates a new address space for a program 462 | * `clone()` adds a thread (of execution) to this address space 463 | 464 | --- 465 | 466 | # Process API: `wait()`, waiting for a child 467 | 468 | * Child processes are tied to their parent. 469 | * `exit(int retval)` takes a return value argument. 470 | * Parent can `wait()` for termination of child and read child's return value. 471 | 472 | --- 473 | 474 | # A tree of processes 475 | 476 | * Each process has a parent process 477 | * A process can have many child processes 478 | * Each child process can in turn have child processes 479 | 480 | ``` 481 | 3621 ? Ss \_ tmux 482 | 3645 pts/2 Ss+ | \_ -zsh 483 | 3673 pts/3 Ss+ | \_ -zsh 484 | 4455 pts/4 Ss+ | \_ -zsh 485 | 27124 pts/1 Ss+ | \_ -zsh 486 | 21093 pts/5 Ss | \_ -zsh 487 | 10589 pts/5 T | | \_ vim 02-review.md 488 | 10882 pts/5 R+ | | \_ ps -auxwf 489 | 10883 pts/5 S+ | | \_ less 490 | 21264 pts/7 Ss | \_ -zsh 491 | 1382 pts/7 T | | \_ vim /home/gannimo/notes.txt 492 | 14368 pts/9 Ss | \_ -zsh 493 | 29963 pts/9 S+ | \_ python 494 | ``` 495 | 496 | --- 497 | 498 | # Ensuring efficient execution 499 | 500 | > A process executes instructions *directly* on the CPU. 501 | 502 | . . . 503 | 504 | Issues with running directly on hardware: 505 | 506 | * Process could do something illegal (read/write to memory that does not belong 507 | to the process, access hardware directly) 508 | * Process could run forever (OS must stay in control) 509 | * Process could do something slow, e.g., I/O (OS may want to switch to another 510 | process) 511 | 512 | . . . 513 | 514 | ***Solution:*** OS maintains some control with help from hardware. 515 | For example, the OS maintains timers to intercept the execution at regular 516 | intervals and the process may not execute privileged instructions that access
518 | 519 | --- 520 | 521 | # Process isolation policy 522 | 523 | * On most operating systems, processes are: 524 | * Isolated from each other 525 | * Isolated from the OS 526 | * Isolation is a core requirement for security: 527 | * Constrains bugs to the process 528 | * Enables privilege isolation 529 | * Enables compartmentalization (breaking complex systems into independent 530 | fault domains) 531 | 532 | What *mechanism* allows process isolation? 533 | 534 | --- 535 | 536 | # Process isolation mechanism 537 | 538 | * Virtual memory: one (virtual) address space per process 539 | * Different execution modes: OS executes at higher privileges 540 | * Process executes in user mode (ring 3 on x86) 541 | * OS executes in super mode (ring 0 on x86) 542 | 543 | --- 544 | 545 | # Summary 546 | 547 | * Processes are a purely ***virtual concept*** 548 | * Separating policies and mechanisms enables modularity 549 | * OS is a server, ***reacts to requests*** from hardware and processes 550 | * Processes are ***isolated*** from the OS/other processes 551 | * Processes have no direct access to devices 552 | * Processes run in virtual memory 553 | * OS provides functionality through system calls 554 | * A process consists of an address space, associated kernel state (e.g., open 555 | files, network channels), and one or more threads of execution 556 | 557 | Don't forget the Moodle quiz! 558 | -------------------------------------------------------------------------------- /13-segmentation.md: -------------------------------------------------------------------------------- 1 | --- 2 | subtitle: Virtual Memory (Segmentation) 3 | --- 4 | 5 | # Topics covered in this lecture 6 | 7 | * Abstraction: address space 8 | * Policy: isolation 9 | * Mechanism: address translation 10 | * Mechanism: heap management 11 | 12 | This slide deck covers chapters 13--17 in OSTEP. 13 | 14 | --- 15 | 16 | # Virtualization 17 | 18 | Goal: isolate processes (and their faults) from each other. 
19 | 20 | . . . 21 | 22 | Virtualization enables isolation, but isolation requires separation. 23 | A process must be prohibited from accessing memory/registers of another process. 24 | 25 | * Step 1: Virtual CPU provides illusion of private CPU registers (mechanisms and 26 | policy) 27 | * Step 2: Virtual RAM provides illusion of private memory 28 | 29 | --- 30 | 31 | # History: uniprogramming 32 | 33 | * Initially the OS was a set of library routines 34 | * Issue 1: only one task at a time 35 | * Issue 2: no isolation between OS / task 36 | 37 | \begin{tikzpicture} 38 | 39 | \draw [ultra thick] (0,0) rectangle (3,2) node[pos=.5] {Task}; 40 | \draw [ultra thick] (0,2) rectangle (3,3) node[pos=.5] {OS}; 41 | \node at (-1,0) {$2^{n}-1$}; 42 | \node at (-1,3) {$0$}; 43 | 44 | \draw [ultra thick] (3.2, 0) -- (4.8, 0); 45 | \draw [ultra thick] (3.2, 2) -- (4.8, 3.5); 46 | 47 | \draw [ultra thick] (5,0) rectangle (8,1) node[pos=.5] {Stack}; 48 | \draw [ultra thick, ->] (6.5, 1) -- (6.5, 1.4); 49 | \draw [ultra thick] (5,1) rectangle (8,2.5); 50 | \draw [ultra thick, ->] (6.5, 2.5) -- (6.5, 2.1); 51 | \draw [ultra thick] (5,2.5) rectangle (8,3) node[pos=.5] {Heap}; 52 | \draw [ultra thick] (5,3) rectangle (8,3.5) node[pos=.5] {Code}; 53 | \end{tikzpicture} 54 | 55 | --- 56 | 57 | # Goals for multiprogramming 58 | 59 | * **Transparency:** processes are unaware of memory sharing and the existence of other processes 60 | * **Protection:** OS/other processes are isolated from process (read/write) 61 | * **Efficiency (1):** do not waste resources (e.g., fragmentation) 62 | * **Efficiency (2):** run as close to the metal as possible 63 | * **Sharing:** processes *may* share part of address space 64 | 65 | --- 66 | 67 | # Abstraction: address space 68 | 69 | ***Address space***: each process has a set of addresses that map to data 70 | (i.e., a map from pointers to bytes) 71 | 72 | * Static: code and global variables 73 | * Dynamic: stack, heap 74 | 75 | . . .
76 | 77 | Why do we need dynamic memory? 78 | 79 | . . . 80 | 81 | * The amount of required memory may be task dependent 82 | * Input size may be unknown at compile time 83 | * Conservative pre-allocation would be wasteful 84 | * Recursive functions (invocation frames) 85 | 86 | --- 87 | 88 | # Dynamic data structure: stack 89 | 90 | * Data is returned in reverse order from insertion 91 | * `push(1); push(2); push(3);` 92 | * `pop()->3; pop()->2; pop()->1;` 93 | * Memory is freed in reverse order from allocation 94 | * `a=alloc(20); b=alloc(10);` 95 | * `free(b); free(a);` 96 | 97 | . . . 98 | 99 | * Straightforward implementation: bump or decrement a pointer 100 | * Advantage: no fragmentation, no metadata 101 | * Note: deallocations ***must*** be in ***reverse order*** 102 | 103 | --- 104 | 105 | # Excursion: procedure invocation frames 106 | 107 | Calling a function allocates an invocation frame to 108 | store all local variables and the necessary context to return to the caller. 109 | 110 | ```.C 111 | int called(int a, int b) { 112 | int tmp = a * b; 113 | return tmp / 42; 114 | } 115 | int main(int argc, char *argv[]) { 116 | int tmp = called(argc, argc); 117 | } 118 | ``` 119 | 120 | What data is stored in the invocation frame of `called`? 121 | 122 | . . . 123 | 124 | * Slot for `int tmp` 125 | * Slots for the parameters a, b 126 | * Slot for the return code pointer 127 | * Order in most ABIs: b, a, RIP, tmp 128 | 129 | . . . 130 | 131 | The compiler creates the necessary code, according to the ABI. 132 | 133 | --- 134 | 135 | # Stack for procedure invocation frames 136 | 137 | * The stack enables simple storage of function invocation frames 138 | * Stores calling context and sequence of active parent frames 139 | * Memory allocated in function prologue, freed on return 140 | 141 | . . . 142 | 143 | What happens to the data when a function returns? 144 | 145 | . . .
146 | 
147 | * Data from previous function lingers, overwritten when the next function
148 |   initializes its data
149 | 
150 | ---
151 | 
152 | # Quiz: scopes, stack, and persistence
153 | 
154 | ```.C
155 | int a = 2;
156 | int called(int b) {
157 |   int c = a * b;
158 |   printf("a: %d b: %d c: %d\n", a, b, c);
159 |   a = 5;
160 |   return c;
161 | }
162 | int main(int argc, char *argv[]) {
163 |   int b = 2, c = 3;
164 |   printf("a: %d b: %d c: %d\n", a, b, c);
165 |   b = called(c);
166 |   printf("a: %d b: %d c: %d\n", a, b, c);
167 |   return 0;
168 | }
169 | ```
170 | 
171 | 
176 | 
177 | ---
178 | 
179 | # Dynamic data structure: heap
180 | 
181 | A heap of randomly allocated memory objects with *statically unknown size* and
182 | *statically unknown allocation patterns*. The size and lifetime of each
183 | allocated object is unknown.
184 | 
185 | API: `alloc` creates an object, `free` indicates it is no longer used.
186 | 
187 | . . .
188 | 
189 | How would you manage such a data structure?
190 | 
191 | ---
192 | 
193 | # Heap: straw man implementation
194 | 
195 | ```.C
196 | char storage[4096], *heap = storage;
197 | char *alloc(size_t len) {
198 |   char *tmp = heap;
199 |   heap = heap + len;
200 |   return tmp;
201 | }
202 | 
203 | void free(char *ptr) {}
204 | 
205 | ```
206 | 
207 | * Advantage: simple
208 | * Disadvantage: no reuse, will run out of memory
209 | 
210 | ---
211 | 
212 | # Heap: free list
213 | 
214 | Idea: abstract heap into list of free blocks.
215 | 
216 | * Keep track of free space, program handles allocated space
217 | * Keep a list of all available memory objects and their size
218 | 
219 | Implementation:
220 | 
221 | * `alloc`: take a free block, split, put remainder back on free list
222 | * `free`: add block to free list
223 | 
224 | . . .
225 | 
226 | What are the advantages/disadvantages of this implementation?
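The scheme above can be sketched in a few lines of C. This is an illustrative sketch only (a fixed arena, a first-fit scan, no splitting and no coalescing); the names `fl_alloc`/`fl_free` and the header layout are assumptions, not the allocator the course labs use:

```c
#include <assert.h>
#include <stddef.h>

/* Illustrative free-list sketch: fixed arena, first fit, no split/merge.
   Each block starts with a header that records its usable size. */
typedef struct block {
    size_t size;
    struct block *next; /* valid only while the block is on the free list */
} block_t;

static char arena[4096];
static block_t *free_list;

void heap_init(void) {
    free_list = (block_t *)arena;
    free_list->size = sizeof(arena) - sizeof(block_t);
    free_list->next = NULL;
}

void *fl_alloc(size_t len) {
    for (block_t **cur = &free_list; *cur; cur = &(*cur)->next) {
        if ((*cur)->size >= len) {   /* first fit */
            block_t *b = *cur;
            *cur = b->next;          /* unlink the whole block (no split) */
            return (char *)b + sizeof(block_t);
        }
    }
    return NULL;                     /* no block is large enough */
}

void fl_free(void *ptr) {
    block_t *b = (block_t *)((char *)ptr - sizeof(block_t));
    b->next = free_list;             /* push back (no coalescing) */
    free_list = b;
}
```

Because blocks are never split here, a small allocation consumes an entire free block; splitting on `alloc` and merging on `free` address exactly that waste.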
227 | 
228 | ---
229 | 
230 | # Heap: better implementations
231 | 
232 | * Allocation: find a fitting object (first, best, worst fit)
233 | * first fit: take the first object that is large enough and split it
234 | * best fit: find the object whose size is closest to (but at least) the requested size
235 | * worst fit: find the largest object and split it
236 | 
237 | . . .
238 | 
239 | * Free: merge adjacent blocks
240 | * if the adjacent region is free, merge the two blocks
241 | 
242 | 
243 | 
244 | ---
245 | 
246 | # Heap and OS interaction
247 | 
248 | * The OS hands the process a large chunk of memory to store heap objects
249 | * A runtime library (the libc) manages this chunk
250 | * Memory allocators aim for performance, reliability, or security
251 | 
252 | ---
253 | 
254 | # Quiz: where is it?
255 | 
256 | ```.C
257 | int g;
258 | int main(int argc, char *argv[]) {
259 |   int foo;
260 |   char *c = (char*)malloc(argc*sizeof(int));
261 |   free(c);
262 | }
263 | ```
264 | 
265 | Possible storage locations: stack, heap, globals, code
266 | 
267 | . . .
268 | 
269 | * Stack: argc, argv, foo, c
270 | * Heap: \*c
271 | * Globals: g
272 | * Code: main
273 | 
274 | ---
275 | 
276 | # Virtualizing memory
277 | 
278 | * Challenge: how can we run multiple programs in parallel?
279 | * Addresses are hard-coded in code
280 | * Static allocation? What about executing the same task twice?
281 | 
282 | * Possible sharing mechanisms:
283 | * Time sharing
284 | * Static relocation/allocation
285 | * Base (+ bounds)
286 | * Segmentation
287 | * Virtual memory
288 | 
289 | ---
290 | 
291 | # Virtualizing memory: time sharing
292 | 
293 | * Reuse idea from CPU virtualization
294 | * OS virtualizes CPU by storing register state to memory
295 | * Could virtualize memory by storing state to disk
296 | 
297 | * Disadvantage: incredibly bad performance due to I/O latency
298 | * Better: space sharing (divide memory among processes)
299 | 
300 | 
302 | 
303 | ---
304 | 
305 | # Tangent: track that memory access
306 | 
307 | * How many memory accesses are executed?
308 | * What kind of memory accesses (read or write)?
309 | 
310 | ```.ASM
311 | 0x10: mov -0x4(%rbp),%edx
312 | 0x13: mov -0x8(%rbp),%eax
313 | 0x16: add %edx,%eax
314 | 0x18: mov %eax,-0x8(%rbp)
315 | ```
316 | 
317 | . . .
318 | 
319 | ```.ASM
320 | 0x10: mov -0x4(%rbp),%edx    # Load 0x10 Exe Load *(%rbp-4)
321 | 0x13: mov -0x8(%rbp),%eax    # Load 0x13 Exe Load *(%rbp-8)
322 | 0x16: add %edx,%eax          # Load 0x16 Exe
323 | 0x18: mov %eax,-0x8(%rbp)    # Load 0x18 Exe Store *(%rbp-8)
324 | ```
325 | 
326 | ---
327 | 
328 | # Virtualizing memory: static relocation
329 | 
330 | ```.ASM
331 | 0x10: mov -0x4(%rbp),%edx
332 | 0x13: mov -0x8(%rbp),%eax
333 | 0x16: add %edx,%eax
334 | 0x18: call 60
335 | ```
336 | 
337 | OS relocates text segment (code area) when new task is started:
338 | 
339 | . . .
340 | 
341 | ```.ASM
342 | # Task 1                        # Task 2
343 | 0x1010: mov -0x4(%rbp),%edx     0x5010: mov -0x4(%rbp),%edx
344 | 0x1013: mov -0x8(%rbp),%eax     0x5013: mov -0x8(%rbp),%eax
345 | 0x1016: add %edx,%eax           0x5016: add %edx,%eax
346 | 0x1018: call 1060               0x5018: call 5060
347 | ```
348 | 
349 | ---
350 | 
351 | # Virtualizing memory: static relocation
352 | 
353 | * When loading a program, relocate it to an assigned area
354 | * Carefully adjust all pointers in code and globals, and set the stack pointer to
355 |   the assigned stack
356 | 
357 | . . .
358 | 
359 | * There is only one address space, no physical/virtual separation
360 | * Issue 1: no separation between processes (no integrity or confidentiality)
361 | * Issue 2: fragmentation, address space remains fixed as long as program runs
362 | * Issue 3: programs have to be adjusted when loaded (e.g., the target of a jump
363 |   will be at a different address depending on the program's location in the
364 |   address space)
365 | 
366 | ---
367 | 
368 | # Challenge: illusion of private address space
369 | 
370 | > How can the OS provide the illusion of a *private* address space to
371 | > each process?
372 | 
373 | ---
374 | 
375 | # Virtualizing memory: dynamic relocation
376 | 
377 | * What if, instead of relocating the memory accesses ahead of time, the hardware
378 |   could help us relocate accesses just-in-time?
379 | * In dynamic relocation, a hardware mechanism translates each memory address
380 |   from the program's viewpoint to the hardware's viewpoint.
381 | 
382 | ***Interposition:*** the hardware intercepts each memory access and
383 | dynamically and transparently translates the program's virtual addresses (VA)
384 | to physical addresses (PA). The OS manages the bookkeeping of which physical
385 | addresses are associated with which process.
386 | 
387 | 
388 | ---
389 | 
390 | # Indirection
391 | 
392 | > We can solve any problem by introducing an extra level of indirection. [Except
393 | > for the problem of too many layers of indirection.]
394 | 
395 | (Andrew Koenig attributed the quote to Butler Lampson who attributed it to David
396 | J. Wheeler, adding another layer of indirection.)
397 | 398 | --- 399 | 400 | # MMU: Memory Management Unit 401 | 402 | \begin{tikzpicture} 403 | 404 | \draw [ultra thick] (0,0) rectangle (1,1) node[pos=.5] {CPU}; 405 | \draw [ultra thick, <->] (1, 0.5) -- (2, 0.5); 406 | \draw [ultra thick] (2,0) rectangle (3,1) node[pos=.5] {MMU}; 407 | \draw [ultra thick, <->] (3, 0.5) -- (4, 0.5); 408 | \draw [ultra thick] (4,0) rectangle (5.5,2) node[pos=.5] {Memory}; 409 | \end{tikzpicture} 410 | 411 | * Process runs on the CPU 412 | * OS controls CPU and MMU 413 | * MMU translates virtual addresses (logical addresses) to physical addresses 414 | 415 | --- 416 | 417 | # Privilege modes 418 | 419 | How do you keep the process from modifying the MMU configuration? 420 | 421 | . . . 422 | 423 | * Separation: OS runs at higher privileges than process 424 | * OS privileges include special instructions for MMU config 425 | * Switch from user-space (process) to kernel space through system call (special 426 | call instruction) 427 | * OS returns to unprivileged user mode (with special return) 428 | * Exceptions in user space (e.g., illegal memory access, division by 0) switch 429 | to privileged mode, OS handles the exception 430 | 431 | --- 432 | 433 | # A simple MMU: base register 434 | 435 | * Idea: translate virtual to physical addresses by adding offset. 436 | * Store offset in special register (OS controlled, used by MMU). 437 | * Each process has a different offset in their base register 438 | 439 | --- 440 | 441 | # A simple MMU: base register 442 | 443 | \begin{tikzpicture} 444 | 445 | % Changed these to multiples of 4KiB so addresses would be 0x1000 etc... 
446 | \node at (-1, 5) {0 KiB}; 447 | \node at (-1, 4) {4 KiB}; 448 | \node at (-1, 3) {8 KiB}; 449 | \node at (-1, 2) {12 KiB}; 450 | \node at (-1, 1) {16 KiB}; 451 | \node at (-1, 0) {20 KiB}; 452 | \draw [ultra thick, fill=gray] (0,0) rectangle (3,5); 453 | \draw [ultra thick, fill=green] (0,1) rectangle (3,2) node[pos=.5] {P2}; 454 | \draw [ultra thick, fill=teal] (0,3) rectangle (3,4) node[pos=.5] {P1}; 455 | 456 | \draw [ultra thick, ->] (4, 4) -- (3.2, 4); 457 | \node at (5.2, 4) {base register}; 458 | 459 | \end{tikzpicture} 460 | 461 | --- 462 | 463 | # A simple MMU: base register 464 | 465 | * Set base register to $0x1000$ for P1 466 | * Load of address $0x100_{v}$ becomes $0x1100_{p}$ 467 | * Set base register to $0x3000$ for P2 468 | * Load of address $0x52_{v}$ becomes $0x3052_{p}$ 469 | 470 | --- 471 | 472 | # A simple MMU: base register 473 | 474 | 475 | * Is this design free from security issues? 476 | * Are processes P1 and P2 truly separated? 477 | 478 | . . . 479 | 480 | No! P1 can access the memory of P2 as the base register is simply added. In the 481 | previous example, with `base=0x1000`, accessing address $0x2000_{v}$ will 482 | access the first byte of memory of P2 while P1 is executing! 483 | 484 | --- 485 | 486 | # A simple MMU: base and bounds 487 | 488 | * Simple solution: base and bounds 489 | * Base register sets minimum address 490 | * Bounds register sets (virtual) limit of the address space, highest 491 | physical address that is accessible becomes `base+bounds` 492 | 493 | * New concept: access check 494 | 495 | ```.C 496 | if (addr < bounds) { 497 | return *(base+addr); 498 | } else { 499 | throw new SegFaultException(); 500 | } 501 | ``` 502 | 503 | Note: bounds can either store the size of the address space or the upper 504 | memory address; this is an implementation choice. 
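The check above can be made concrete as a small software model (a sketch only: a real MMU performs this check in hardware on every access, and the names here are illustrative; following the note, `bounds` stores the size of the address space):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Software model of a base-and-bounds MMU; `bounds` holds the size
   of the virtual address space. */
typedef struct {
    uintptr_t base;
    uintptr_t bounds;
} mmu_t;

/* Translate va to a physical address; false signals a fault
   (real hardware would trap into the OS instead). */
bool mmu_translate(const mmu_t *m, uintptr_t va, uintptr_t *pa) {
    if (va >= m->bounds)
        return false;
    *pa = m->base + va;
    return true;
}
```

With `base = 0x1000` and `bounds = 0x1000`, the earlier P1 example translates $0x100_{v}$ to $0x1100_{p}$, and the out-of-bounds access $0x2000_{v}$ now faults instead of reaching P2's memory.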
505 | 
506 | ---
507 | 
508 | # A simple MMU: base and bounds
509 | 
510 | * Achieves security (isolation property is satisfied)
511 | * Achieves performance (translation and check are cheap)
512 | * What's the remaining problem?
513 | 
514 | . . .
515 | 
516 | * All memory must be contiguously allocated
517 | * Waste of physical memory (the full address space must be allocated)
518 | * No (easy) sharing between processes
519 | 
520 | ---
521 | 
522 | # A simple MMU: segmentation
523 | 
524 | Instead of a single base/bounds register pair, have one pair per memory area:
525 | 
526 | * Code Segment (CS on x86, default for instructions)
527 | * Data Segment (DS on x86, default for data accesses)
528 | * Stack Segment (SS on x86, default for push/pop)
529 | * Extra Segments (ES, FS, and GS on x86, for anything else)
530 | 
531 | Allow a process to have several regions of contiguous memory mapped from a
532 | virtual address space to a physical address space.
533 | 
534 | Note that the hardware also allows overriding the default segment registers,
535 | letting the programmer specify which segment should be used, e.g., loading
536 | data from the code segment.
537 | 
538 | ---
539 | 
540 | # Summary
541 | 
542 | * OS manages access to constrained resources
543 | * Principle: limited direct execution (bare metal when possible, intercept
544 |   when needed)
545 | * CPU: time sharing between processes (low switching cost)
546 | * Memory: space sharing (disk I/O is slow, so time sharing is expensive)
547 | * Programs use dynamic data
548 | * Stack: program invocation frames
549 | * Heap: unordered data, managed by user-space library (allocator)
550 | * Time sharing: one process uses all of memory
551 | * Base register: share space, calculate address through offset
552 | * Base + bounds: share space, limit process' address space
553 | * Segments: movable segments, virtual offsets to segment base
554 | 
555 | Don't forget to fill out the Moodle quiz!
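---

# Sketch: per-segment translation

Generalizing base and bounds to one pair per segment can be modeled as follows. This is an illustrative sketch, not the x86 mechanism: on real hardware the segment is selected implicitly by the instruction type or a segment-override prefix, not by an explicit argument, and all names and sizes here are assumptions.

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* One base/bounds pair per segment (code, data, stack). */
enum { SEG_CODE, SEG_DATA, SEG_STACK, NSEGS };

typedef struct {
    uintptr_t base;
    uintptr_t bounds; /* segment size */
} seg_t;

/* Translate an offset within a segment; false models a fault. */
bool seg_translate(const seg_t segs[NSEGS], int seg,
                   uintptr_t off, uintptr_t *pa) {
    if (seg < 0 || seg >= NSEGS || off >= segs[seg].bounds)
        return false;
    *pa = segs[seg].base + off;
    return true;
}
```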
556 | -------------------------------------------------------------------------------- /15-virtualization.md: -------------------------------------------------------------------------------- 1 | --- 2 | subtitle: Virtualization Summary 3 | --- 4 | 5 | # Virtualization 6 | 7 | \begin{tikzpicture} 8 | \draw [fill, orange, ultra thick] (0,0) rectangle (2,4); 9 | \node[text width=3cm, rotate=90] at (1, 2.5) {Virtualization}; 10 | \draw [blue, ultra thick] (3,0) rectangle (5,4); 11 | \node[text width=3cm, rotate=90] at (4, 2.5) {Concurrency}; 12 | \draw [green, ultra thick] (6,0) rectangle (8,4); 13 | \node[text width=3cm, rotate=90] at (7, 2.5) {Persistence}; 14 | 15 | \draw [red, ultra thick] (0,4.5) -- (8,4.5); 16 | \draw [red, ultra thick] (0,4.5) -- (4,6); 17 | \draw [red, ultra thick] (4,6) -- (8,4.5); 18 | \node[text width=3cm] at (5, 5) {Security}; 19 | 20 | \end{tikzpicture} 21 | 22 | --- 23 | 24 | # Virtualization: Summary 25 | 26 | \begin{tikzpicture}[level distance=1.5cm, 27 | level 1/.style={sibling distance=6cm}, 28 | level 2/.style={sibling distance=2.5cm}] 29 | \node {Virtualization} 30 | child {node {CPU} 31 | child {node {Processes}} 32 | child {node {Scheduling}} 33 | } 34 | child {node {Memory} 35 | child {node {Allocation}} 36 | child {node {Segmentation}} 37 | child {node {Paging}} 38 | }; 39 | \end{tikzpicture} 40 | 41 | --- 42 | 43 | # CPU Virtualization: Processes 44 | 45 | * Processes are a purely virtual concept 46 | * Separating policies and mechanisms enables modularity 47 | * OS is a server, reacts to requests from hardware and processes 48 | * Processes are isolated from the OS/other processes 49 | * Processes have no direct hardware access 50 | * Processes run in virtual memory 51 | * OS provides functionality through system calls 52 | * A process consists of an address space, associated kernel state (e.g., open 53 | files, network channels) and one or more threads of execution 54 | 55 | --- 56 | 57 | # CPU Virtualization: Scheduling 58 | 
59 | * Context switch and preemption are fundamental mechanisms that allow the
60 |   OS to remain in control and to implement higher-level scheduling policies.
61 | * Schedulers need to optimize for different metrics: utilization, turnaround,
62 |   response time, fairness, and forward progress
63 | * FIFO: simple, non-preemptive scheduler
64 | * SJF: non-preemptive, prevents process jams
65 | * STCF: preemptive, prevents jams of late-arriving processes
66 | * RR: preemptive, great response time, bad turnaround
67 | * MLFQ: preemptive, most realistic
68 | * CFS: fair scheduler by virtualizing time
69 | * Past behavior is a good predictor of future behavior
70 | 
71 | ---
72 | 
73 | # Memory Virtualization: Segmentation
74 | 
75 | * OS manages access to constrained resources
76 | * Principle: limited direct execution (bare metal when possible, intercept
77 |   when needed)
78 | * CPU: time sharing between processes (low switching cost)
79 | * Memory: space sharing (disk I/O is slow, so time sharing is expensive)
80 | * Programs use dynamic data
81 | * Stack: program invocation frames
82 | * Heap: unordered data, managed by user-space library (allocator)
83 | * Time sharing: one process uses all of memory
84 | * Base register: share space, calculate process address through offset
85 | * Base + bounds: share space, limit process' address space
86 | * Segments: movable segments, virtual offsets to segment base
87 | 
88 | ---
89 | 
90 | # Memory Virtualization: Paging
91 | 
92 | * Fragmentation: space lost to internal padding or external gaps
93 | * Paging: MMU fully translates between virtual and physical addresses
94 | * One flat page table (array)
95 | * Multi-level page table
96 | * Pros? Cons? What are size requirements?
97 | * Paging and swapping allows process to execute with only the working set 98 | resident in memory, remaining pages can be stored on disk 99 | 100 | --- 101 | 102 | # Book chapters 103 | 104 | * Virtual CPU (Processes and Threads): OSTEP 4--6 105 | * Virtual CPU (Scheduling): OSTEP 7--10 106 | * Virtual Memory (Segmentation): OSTEP 13--17 107 | * Virtual Memory (Paging and Swapping): OSTEP 18--22 108 | 109 | This concludes the first pillar of OS. 110 | -------------------------------------------------------------------------------- /16-midterm.md: -------------------------------------------------------------------------------- 1 | --- 2 | title : CS-323 Operating Systems 3 | subtitle: Midterm 4 | author : Mathias Payer 5 | date : EPFL, Fall 2019 6 | --- 7 | 8 | # Scheduling (1a, 1b, 1c) 9 | 10 | * Pros/cons of implementing scheduling in user land? 11 | * Pros: lightweight, flexible policies 12 | * Cons: No preemption support, must reimplement synchronization, imprecise time measurements 13 | * Context switch: why cheaper between threads than processes? 14 | * No need to change address space 15 | * Context switch: why is thread switching not free? 16 | * CPU state/thread context must still be swapped 17 | * Why do we need an idle process? 18 | * Simplifies scheduling, always ready to run 19 | 20 | --- 21 | 22 | # Scheduling (1d) 23 | 24 | ```.C 25 | int a = 0; 26 | __thread int b = 1; 27 | void doit() { 28 | pid_t pid = spawn(); 29 | if (pid != 0) { 30 | wait(pid); 31 | a = 2; 32 | b = 3; 33 | } else { 34 | a = 4; 35 | b = 5; 36 | } 37 | printf("a: %d, b: %d\n", a, b); 38 | exit(); 39 | } 40 | ``` 41 | 42 | * Code: thread: `4, 5; 2, 3`; process: `4, 5; 2, 3` 43 | 44 | --- 45 | 46 | # Scheduling (2) 47 | 48 | Tasks: (0, 2, 1), (0, 4, 2), (1, 3, 3), (4, 1, 4), (4, 3, 5), (5, 1, 6), (6, 2, 7). 
49 | 
50 | * First In First Out: 1, 2, 3, 4, 5, 6, 7
51 | * Shortest Job First: 1, 3, 4, 6, 7, 5, 2
52 | * Round Robin: 1, 2, 1, 3, 2, 3, 4, 5, 2, 6, 3, 7, 5, 2, 7, 5
53 | 
54 | ---
55 | 
56 | # Virtual memory (3a)
57 | 
58 | * Fragmentation
59 | 
60 | > Definition: free memory that cannot be used. External fragmentation: visible
61 | > to allocator (OS), e.g., between segments. Internal fragmentation: visible to
62 | > requester, e.g., rounding if segment size is a power of 2
63 | 
64 | ---
65 | 
66 | # Virtual memory (3b, 3c)
67 | 
68 | * Assume 12 bit pages, 32 bit virtual and 32 bit physical address space, 4 byte
69 |   page table entries.
70 | * What is the size of a page table?
71 | * Page table size: entries * size of entry
72 | * entries: $2^{log(virtual address space) - log(page size)}$
73 | * entries: $2^{32-12} = 2^{20}$ (1 Mi entries)
74 | * Page table size: $2^{20} * 4 B = 4 MiB$
75 | 
76 | * Offset: 6b, physical/virtual page number: 10b
77 | * First level: 5b, second level: 5b, offset: 6b
78 | 
79 | ---
80 | 
81 | # Virtual memory (3d)
82 | 
83 | Assume that virtual addresses (not page numbers) 0x0000 -- 0x02FF and
84 | 0x1FC0 -- 0x2FFF are allocated. How many pages of memory are required for a
85 | single, flat page table? How many are needed for a two-level page table?
86 | How many pages are required to hold the data?
87 | 88 | ``` 89 | 0000.0|000.00|00.0000 90 | 0000.0|010.11|11.1111 0x02FF 91 | 0001.1|111.11|00.0000 0x1FC0 92 | 0010.1|111.11|11.1111 0x2FFF 93 | ``` 94 | 95 | * Flat: 2^10 (#entries)/2^5 (entries per page) = 2^5 pages 96 | * 2 layer: 1 top, 4 2nd = 5 pages 97 | * Data: 12 + 1 + 32 + 32 = 77 pages 98 | 99 | --- 100 | 101 | # Concurrent programming (4a) 102 | 103 | * 1-1 copy of the slides 104 | 105 | --- 106 | 107 | # Concurrent programming (4b) 108 | 109 | * Dining philosophers 110 | * `T1: lock(l1)`, `T2: lock(l2)`, `T3: lock(l3)`, `T4: lock(l4)` 111 | * Solution: lock in increasing order (e.g., T4 locks l1 then l4) 112 | 113 | --- 114 | 115 | # Lab1: memory allocator 116 | 117 | Use the provided test cases to double check, ping us for questions 118 | 119 | --- 120 | 121 | # Moodle quizzes: 122 | 123 | * Each completed quiz will give you 1% credit for up to 5% of assignment credit. 124 | -------------------------------------------------------------------------------- /21-locking.md: -------------------------------------------------------------------------------- 1 | --- 2 | subtitle: Locking 3 | --- 4 | 5 | # Concurrency 6 | 7 | \begin{tikzpicture} 8 | \draw [orange, ultra thick] (0,0) rectangle (2,4); 9 | \node[text width=3cm, rotate=90] at (1, 2.5) {Virtualization}; 10 | \draw [fill, blue, ultra thick] (3,0) rectangle (5,4); 11 | \node[text width=3cm, rotate=90] at (4, 2.5) {Concurrency}; 12 | \draw [green, ultra thick] (6,0) rectangle (8,4); 13 | \node[text width=3cm, rotate=90] at (7, 2.5) {Persistence}; 14 | 15 | \draw [red, ultra thick] (0,4.5) -- (8,4.5); 16 | \draw [red, ultra thick] (0,4.5) -- (4,6); 17 | \draw [red, ultra thick] (4,6) -- (8,4.5); 18 | \node[text width=3cm] at (5, 5) {Security}; 19 | 20 | \end{tikzpicture} 21 | 22 | --- 23 | 24 | # Topics covered in this lecture 25 | 26 | * Review of threading and mutual exclusion 27 | * Abstraction: locks to protect shared data structures 28 | * Mechanism: interrupt-based locks 29 | * Mechanism: 
atomic hardware locks
30 | * Busy waiting (spin locks) versus wait queues
31 | 
32 | This slide deck covers chapters 28, 29, 30 in OSTEP.
33 | 
34 | ---
35 | 
36 | # Threads: execution contexts
37 | 
38 | * Threads are independent execution contexts
39 | * Similar to processes
40 | 
41 | . . .
42 | 
43 | * Except that they share the same address space
44 | 
45 | . . .
46 | 
47 | * Why do we need threads?
48 | 
49 | . . .
50 | 
51 | * CPUs run very fast but may block while fetching data
52 | * Multiple CPUs are available that can work on a job in parallel
53 | 
54 | # For parallelism and concurrency
55 | 
56 | * **Parallelism**: multiple threads (or processes) working on a single task using
57 |   multiple CPU cores
58 | * **Concurrency**: tasks can start, run, and complete in overlapping time periods,
59 |   e.g., through time multiplexing by interleaving their executions, or through
60 |   parallelism when they are executed at the same time
61 | 
62 | Note that processes can share information through partially overlapping address
63 | spaces or by communicating (future lectures).
64 | 
65 | ---
66 | 
67 | 
68 | # Race conditions
69 | 
70 | ```.C
71 | int cnt = 0;
72 | void *incer(void *arg) {
73 |   printf("%s starts\n", (char*)arg);
74 |   for (int i=0; i < 1000000; ++i) {
75 |     cnt = cnt + 1;
76 |   }
77 |   return NULL;
78 | }
79 | int main(int argc, char *argv[]) {
80 |   pthread_t t1, t2;
81 |   pthread_create(&t1, NULL, incer, "T1");
82 |   pthread_create(&t2, NULL, incer, "T2");
83 |   pthread_join(t1, NULL);
84 |   pthread_join(t2, NULL);
85 |   printf("Counter: %d (expected: %d)\n", cnt, 1000000*2);
86 |   return 0;
87 | }
88 | ```
89 | 
90 | ---
91 | 
92 | # Race conditions: what is happening?
93 | 
94 | ```
95 | $ ./21-race
96 | T1 starts
97 | T2 starts
98 | T1 is done
99 | T2 is done
100 | Counter: 1150897 (expected: 2000000)
101 | $
102 | ```
103 | 
104 | . . .
105 | 106 | Assembly of `incer`: 107 | ```.ASM 108 | mov 0x601044,%eax ; load value 109 | add $0x1,%eax ; increment 110 | mov %eax,0x601044 ; store value 111 | ``` 112 | 113 | . . . 114 | 115 | Both threads load the same value, increment, and write back. The addition of one 116 | thread is lost! 117 | 118 | --- 119 | 120 | # Race conditions 121 | 122 | * Concurrent execution leads to race conditions 123 | * Access to shared data must be mediated 124 | * **Critical section:** part of code that accesses shared data 125 | * **Mutual exclusion:** only one process is allowed to execute critical section at 126 | any point in time 127 | * **Atomicity:** critical section executes as an uninterruptible block 128 | 129 | A **mechanism** to achieve atomicity is through locking. 130 | 131 | --- 132 | 133 | # Locks: basic idea 134 | 135 | * Lock variable protects critical section 136 | * All threads competing for *critical section* share a lock 137 | * Only one thread succeeds at acquiring the lock (at a time) 138 | * Other threads must wait until lock is released 139 | 140 | ```.C 141 | lock_t mutex; 142 | ... 143 | lock(&mutex); 144 | cnt = cnt + 1; 145 | unlock(&mutex); 146 | ``` 147 | 148 | --- 149 | 150 | # Locks: basic idea 151 | 152 | * Requirements: mutual exclusion, fairness, and performance 153 | * **Mutual exclusion**: only one thread in critical section 154 | * **Fairness**: all threads should eventually get the lock 155 | * **Performance**: low overhead for acquiring/releasing lock 156 | * Lock implementation requires hardware support 157 | * ... and OS support for performance 158 | 159 | --- 160 | 161 | # Lock operations 162 | 163 | * `void lock(lock_t *lck)`: acquires the lock, current thread owns the lock when function returns 164 | * `void unlock(lock_t *lck)`: releases the lock 165 | 166 | . . . 167 | 168 | Note that we assume that the application *correctly* uses locks for *each* 169 | access to the critical section. 
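Applied to the racy counter from before, correct use per access looks like the sketch below (using the standard pthread mutex API; the helper `run_two_threads` is illustrative, and the course demos include a pthread-based variant, `demos/21-race5pth.c`, for comparison):

```c
#include <pthread.h>

static int cnt = 0;
static pthread_mutex_t cnt_lock = PTHREAD_MUTEX_INITIALIZER;

/* The load-increment-store sequence is now a critical section:
   only one thread at a time may execute it. */
static void *incer(void *arg) {
    (void)arg;
    for (int i = 0; i < 1000000; ++i) {
        pthread_mutex_lock(&cnt_lock);
        cnt = cnt + 1;
        pthread_mutex_unlock(&cnt_lock);
    }
    return NULL;
}

int run_two_threads(void) {
    pthread_t t1, t2;
    pthread_create(&t1, NULL, incer, NULL);
    pthread_create(&t2, NULL, incer, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    return cnt; /* always 2000000: no increment is lost */
}
```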
170 | 
171 | ---
172 | 
173 | # Interrupting locks
174 | 
175 | * Turn off interrupts when executing critical sections
176 | * Neither hardware nor timer can interrupt execution
177 | * Prevent scheduler from switching to another thread
178 | * Code between interrupts executes atomically
179 | 
180 | ```.C
181 | void acquire(lock_t *l) {
182 |   disable_interrupts();
183 | }
184 | 
185 | void release(lock_t *l) {
186 |   enable_interrupts();
187 | }
188 | ```
189 | 
190 | ---
191 | 
192 | # Interrupting locks (disadvantages)
193 | 
194 | * No support for locking multiple locks
195 | * Only works on uniprocessors (no support for locking across cores in multicore system)
196 | * Process may keep lock for an arbitrary amount of time
197 | * Hardware interrupts may get lost (hardware only stores information that
198 |   interrupt X happened, not how many times it happened)
199 | 
200 | ---
201 | 
202 | # Interrupting locks (perspective)
203 | 
204 | * Interrupt-based locks are extremely simple
205 | * Work well for low-complexity code
206 | 
207 | . . .
208 | 
209 | * Implementing locks through interrupts is great for MCUs
210 | 
211 | ---
212 | 
213 | # (Faulty) spin lock
214 | 
215 | * Use a shared variable to synchronize access to the critical section
216 | 
217 | ```.C
218 | bool lock1 = false;
219 | 
220 | void acquire(bool *lock) {
221 |   while (*lock); /* spin until we grab the lock */
222 |   *lock = true;
223 | }
224 | 
225 | void release(bool *lock) {
226 |   *lock = false;
227 | }
228 | ```
229 | 
230 | . . .
231 | 
232 | Bug: both threads can grab the lock if a thread is preempted before setting the
233 | lock but after the `while` loop completes.
234 | 
235 | ---
236 | 
237 | # Required hardware support
238 | 
239 | Locking requires an atomic *test-and-set* instruction.
240 | 
241 | ```.C
242 | int tas(int *addr, int val) {
243 |   int old = *addr;
244 |   *addr = val;
245 |   return old;
246 | }
247 | ```
248 | 
249 | . . .
250 | 
251 | ```.C
252 | int tas(int *addr, int val) {
253 |   int old;
254 |   asm volatile("lock; xchgl %0, %1" :
255 |                "+m" (*addr), "=a" (old) :
256 |                "1" (val) : "cc");
257 |   return old;
258 | }
259 | ```
260 | 
261 | 
262 | ---
263 | 
264 | # Required hardware support
265 | 
266 | * Hardware support is required for (i) an instruction that updates a memory location and returns the old value, and (ii) atomic execution of that instruction.
267 | * Directly encoding inline assembly is error-prone, use intrinsics instead:
268 | 
269 | `type __sync_lock_test_and_set(type *ptr, type val);`
270 | 
271 | ---
272 | 
273 | # Test-and-set spin lock
274 | 
275 | ```.C
276 | int lock1;
277 | 
278 | void acquire(int *l) {
279 |   while (__sync_lock_test_and_set(l, 1) == 1); /* spin */
280 | }
281 | 
282 | void release(int *l) {
283 |   *l = 0;
284 | }
285 | 
286 | acquire(&lock1);
287 | critical_section();
288 | release(&lock1);
289 | ```
290 | 
291 | ---
292 | 
293 | # Compare-and-swap spin lock
294 | 
295 | ```.C
296 | bool cas(T *ptr, T expt, T new) {
297 |   if (*ptr == expt) {
298 |     *ptr = new;
299 |     return true;
300 |   }
301 |   return false;
302 | }
303 | ```
304 | 
305 | The function compares the value at `*ptr` and, if it is equal to `expt`, overwrites it with `new`. The function returns `true` if the swap happened.
306 | 
307 | ---
308 | 
309 | # Compare-and-swap spin lock
310 | 
311 | ```.C
312 | __sync_bool_compare_and_swap(T *ptr, T expt, T new);
313 | ```
314 | 
315 | How would you implement the lock `acquire` operation?
316 | 
317 | . . .
318 | 
319 | 
320 | ```.C
321 | void acquire_cas(bool *lck) {
322 |   while (__sync_bool_compare_and_swap(lck, false, true)
323 |          == false);
324 | }
325 | ```
326 | 
327 | ---
328 | 
329 | # Spin lock: reduce spinning
330 | 
331 | * A simple way to reduce the cost of spinning is to `yield()` whenever lock acquisition fails
332 | * This is no longer a "strict" spin lock as we give up control to the scheduler on every loop iteration
333 | 
334 | ```.C
335 | void acquire(int *lck) {
336 |   while (__sync_lock_test_and_set(lck, 1) == 1) {
337 |     yield();
338 |   }
339 | }
340 | ```
341 | 
342 | ---
343 | 
344 | # Lock requirements: spin locks
345 | 
346 | - **Correctness:** mutual exclusion, progress, and bounded waiting
347 |   * Mutual exclusion: $\leq$ one thread in critical section at a time
348 |   * Progress (deadlock freedom): one waiting process will proceed
349 |   * Bounded (no starvation): eventually each process will proceed
350 | - Fairness: each thread waits for the same amount of time
351 | - Performance: CPU is not used unnecessarily
352 | 
353 | . . .
354 | 
355 | Spin locks are unfair (threads race for the lock) and hurt performance (spinning burns CPU time)!
356 | 
357 | ---
358 | 
359 | # Queue lock
360 | 
361 | * Idea: instead of spinning, put threads on a queue
362 | * Wake up thread(s) when lock is released
363 | * Wake up all threads to have them race for the lock
364 | * Selectively wake one thread up for fairness
365 | 
366 | ---
367 | 
368 | # Queue lock implementation: nptl
369 | 
370 | ```.C
371 | /* Bit 31 clear means unlocked; bit 31 set means locked.
372 |    Remaining bits encode num. interested threads. */
373 | static inline void mutex_lock(int *mutex) {
374 |   int v;
375 |   /* Bit 31 was clear, we got the mutex. (fastpath). */
376 |   if (atomic_bit_test_set(mutex, 31) == 0) return;
377 |   atomic_increment(mutex);
378 |   while (1) {
379 |     if (atomic_bit_test_set(mutex, 31) == 0) {
380 |       atomic_decrement(mutex); return;
381 |     }
382 |     /* We have to wait. Make sure futex is actually
locked */ 383 | v = *mutex; 384 | if (v >= 0) continue; 385 | futex_wait(mutex, v); 386 | } 387 | } 388 | ``` 389 | 390 | --- 391 | 392 | # Queue lock implementation: nptl 393 | 394 | ```.C 395 | static inline void mutex_unlock(int *mutex) { 396 | /* Adding 0x80000000 to the counter results in 0 iff 397 | there are no other waiting threads (fastpath). */ 398 | if (atomic_add_zero(mutex, 0x80000000)) return; 399 | 400 | /* There are other threads waiting, wake one up. */ 401 | futex_wake(mutex, 1); 402 | } 403 | ``` 404 | 405 | Do you want to know more? Check out the [Linux futex system call](https://linux.die.net/man/2/futex). 406 | 407 | --- 408 | 409 | # Comparison spinlock / queue lock 410 | 411 | * Spinlock works well when critical section is short and rare and we execute on more than one CPU (i.e., no context switch, likely to acquire lock soon) 412 | * Queue locks work well when critical section is longer or more frequent (i.e., high contention, likelihood that thread must wait) 413 | 414 | . . . 
415 | 416 | * Hybrid approach: spin for a while, then yield and enqueue 417 | 418 | --- 419 | 420 | # Lock principles 421 | 422 | * Locks protect access to shared data structures 423 | * Shared kernel data structures rely on locks 424 | * Locking strategy: coarse-grained (one lock) versus fine-grained (many locks) 425 | * OS only provides locks, locking strategy is up to programmer 426 | 427 | --- 428 | 429 | # Lock best practices 430 | 431 | * When acquiring a lock, recheck assumptions 432 | * Ensure that all shared information is refreshed (and not stale) 433 | * Multiple threads may wake up and race for the lock (i.e., loop if unsuccessful) 434 | 435 | --- 436 | 437 | # Summary 438 | 439 | * Locks enforce mutual exclusion for critical section (i.e., an object that can only be owned by a single thread) 440 | * Trade-offs between spinlock and queue lock 441 | * Time lock is held 442 | * Contention for lock 443 | * How many concurrent cores execute 444 | * Locking requires kernel support or atomic instructions 445 | * **test-and-set** atomically modifies the contents of a memory location, returning its old value 446 | * **compare-and-swap** atomically compares the contents of a memory location to a given value and, iff they are equal, modifies the contents of that memory location to a given new value. 447 | 448 | Don't forget to get your learning feedback through the Moodle quiz! 449 | -------------------------------------------------------------------------------- /22-semaphores.md: -------------------------------------------------------------------------------- 1 | --- 2 | subtitle: Semaphores 3 | --- 4 | 5 | # Topics covered in this lecture 6 | 7 | * Condition variables 8 | * Semaphores 9 | * Signaling through condition variables and semaphores 10 | * Concurrency bugs 11 | 12 | This slide deck covers chapters 30, 31, 32 in OSTEP. 
13 | 14 | --- 15 | 16 | # Condition variables (CV) 17 | 18 | In concurrent programming, a common scenario is one thread waiting for 19 | another thread to complete an action. 20 | 21 | ```.C++ 22 | 1 bool done = false; 23 | 2 24 | 3 /* called in the child to signal termination */ 25 | 4 void thr_exit() { 26 | 5 done = true; 27 | 6 } 28 | 7 /* called in the parent to wait for a child thread */ 29 | 8 void thr_join() { 30 | 9 while (!done); 31 | 10 } 32 | ``` 33 | 34 | --- 35 | 36 | # Condition variables (CV) 37 | 38 | * Locks enable mutual exclusion of a shared region. 39 | * Unfortunately they are oblivious to ordering 40 | * Waiting and signaling (i.e., T2 waits until T1 completes a given task) 41 | could be implemented by spinning until the value changes 42 | 43 | . . . 44 | 45 | * But spinning is incredibly _inefficient_ 46 | 47 | . . . 48 | 49 | * New synchronization primitive: ***condition variables*** 50 | 51 | --- 52 | 53 | # Condition variables (CV) 54 | 55 | * A CV allows: 56 | * A thread to wait for a condition 57 | * Another thread signals the waiting thread 58 | 59 | * Implement CV using queues 60 | 61 | . . . 
62 |
63 | * API: `wait`, `signal` or `broadcast`
64 | * `wait`: wait until a condition is satisfied
65 | * `signal`: wake up one waiting thread
66 | * `broadcast`: wake up all waiting threads
67 | * On Linux, `pthreads` provides CV implementation
68 |
69 | ---
70 |
71 | # Signal parent that child has exited
72 |
73 | ```.C++
74 | 1 bool done = false;
75 | 2 pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
76 | 3 pthread_cond_t c = PTHREAD_COND_INITIALIZER;
77 | 4 /* called in the child to signal termination */
78 | 5 void thr_exit() {
79 | 6 pthread_mutex_lock(&m);
80 | 7 done = true;
81 | 8 pthread_cond_signal(&c);
82 | 9 pthread_mutex_unlock(&m);
83 | 10 }
84 | 11 /* called in the parent to wait for a child thread */
85 | 12 void thr_join() {
86 | 13 pthread_mutex_lock(&m);
87 | 14 while (!done)
88 | 15 pthread_cond_wait(&c, &m);
89 | 16 pthread_mutex_unlock(&m);
90 | 17 }
91 | ```
92 |
93 | ---
94 |
95 | # Signal parent that child has exited (2)
96 |
97 | * `pthread_cond_wait(pthread_cond_t *c, pthread_mutex_t *m)`
98 | * Assume mutex `m` is held; *atomically* unlock mutex when waiting, retake it when waking up
99 | * Question: Why do we need to check a condition before sleeping?
100 |
101 | . . .
102 |
103 | * Thread may have already exited, i.e., no need to wait
104 | * Principle: Check the condition before sleeping
105 |
106 | . . .
107 |
108 | * Question: Why can't we use `if` when waiting?
109 |
110 | . . .
111 |
112 | * Multiple threads could be woken up, racing for `done` flag
113 | * Principle: `while` instead of `if` when waiting
114 |
115 | ---
116 |
117 | # Signal parent that child has exited (3)
118 |
119 | * Question: Why do we need to protect `done` with mutex `m`?
120 |
121 | . . .
122 | 123 | * Mutex `m` allows one thread to access `done` for protecting against missed updates 124 | * Parent reads `done == false` but is interrupted 125 | * Child sets `done = true` and signals but no one is waiting 126 | * Parent continues and goes to sleep (forever) 127 | * Lock is therefore required for wait/signal synchronization 128 | 129 | 130 | --- 131 | 132 | # Producer/consumer synchronization 133 | 134 | * Producer/consumer is a common programming pattern 135 | * For example: map (producers) / reduce (consumer) 136 | * For example: a concurrent database (consumers) handling parallel requests from clients (producers) 137 | * Clients produce new requests (encoded in a queue) 138 | * Handlers consume these requests (popping from the queue) 139 | 140 | . . . 141 | 142 | * Strategy: use CV to synchronize 143 | * Make producers wait if buffer is full 144 | * Make consumers wait if buffer is empty (nothing to consume) 145 | 146 | --- 147 | 148 | # Condition variables 149 | 150 | * Programmer must keep state, orthogonal to locks 151 | * CV enables access to critical section with a thread wait queue 152 | * Always wait/signal while holding lock 153 | * Whenever thread wakes, recheck state 154 | 155 | --- 156 | 157 | # Semaphore 158 | 159 | * A semaphore extends a CV with an integer as internal state 160 | * `int sem_init(sem_t *sem, unsigned int value)`: creates a new semaphore with `value` slots 161 | * `int sem_wait(sem_t *sem)`: waits until the semaphore has at least one slot, decrements the number of slots 162 | * `int sem_post(sem_t *sem)`: increments the semaphore (and wakes one waiting thread) 163 | * `int sem_destroy(sem_t *sem)`: destroys the semaphore and releases any waiting threads 164 | 165 | --- 166 | 167 | # Concurrent programming: producer consumer 168 | 169 | * One or more producers create items, store them in buffer 170 | * One or more consumers process items from buffer 171 | 172 | . . . 
173 |
174 | * Need synchronization for buffer
175 | * Want concurrent production and consumption
176 | * Use as many cores as available
177 | * Minimize access time to shared data structure
178 |
179 | ---
180 |
181 | # Concurrent programming: producer consumer
182 |
183 | ```.C
184 | 1 void *producer(void *arg) {
185 | 2 unsigned int max = (unsigned int)arg;
186 | 3 for (unsigned int i = 0; i < max; i++) {
187 | 4 put(i); // store in shared buffer
188 | 5 }
189 | 6 return NULL;
190 | 7 }
191 | 8 void *consumer(void *arg) {
192 | 9 unsigned int max = (unsigned int)arg;
193 | 10 for (unsigned int i = 0; i < max; i++) {
194 | 11 printf("%u\n", get()); // recv from buffer
195 | 12 }
196 | 13 return NULL;
197 | 14 }
198 | pthread_t p, c;
199 | pthread_create(&p, NULL, &producer, (void*)NUMITEMS);
200 | pthread_create(&c, NULL, &consumer, (void*)NUMITEMS);
201 | ```
202 |
203 | ---
204 |
205 | # Concurrent programming: producer consumer
206 |
207 | ```.C
208 | 1 unsigned int buffer[BUFSIZE] = { 0 };
209 | 2 unsigned int cpos = 0, ppos = 0;
210 | 3
211 | 4 void put(unsigned int val) {
212 | 5 buffer[ppos] = val;
213 | 6 ppos = (ppos + 1) % BUFSIZE;
214 | 7 }
215 | 8
216 | 9 unsigned int get() {
217 | 10 unsigned int val = buffer[cpos];
218 | 11 cpos = (cpos + 1) % BUFSIZE;
219 | 12 return val;
220 | 13 }
221 | ```
222 |
223 | What are the issues in this code?
224 |
225 | . . .
226 |
227 | * Producers may overwrite unconsumed entries
228 | * Consumers may consume uninitialized or stale entries
229 |
230 | ---
231 |
232 | # Producer/consumer: use semaphores!
233 |
234 | ```.C
235 | sem_t csem, psem;
236 |
237 | /* BUFSIZE items are available for producer to create */
238 | sem_init(&psem, 0, BUFSIZE);
239 |
240 | /* 0 items are available for consumer */
241 | sem_init(&csem, 0, 0);
242 | ```
243 |
244 | ---
245 |
246 | # Producer: semaphores
247 |
248 | ```.C
249 | 1 void put(unsigned int val) {
250 | 2 /* we wait until there is buffer space available */
251 | 3 sem_wait(&psem);
252 | 4
253 | 5 /* store element in buffer */
254 | 6 buffer[ppos] = val;
255 | 7 ppos = (ppos + 1) % BUFSIZE;
256 | 8
257 | 9 /* notify consumer that data is available */
258 | 10 sem_post(&csem);
259 | 11 }
260 | ```
261 |
262 | ---
263 |
264 | # Consumer: semaphores
265 |
266 | ```.C
267 | 1 unsigned int get() {
268 | 2 /* wait until data is produced */
269 | 3 sem_wait(&csem);
270 | 4
271 | 5 /* consume entry */
272 | 6 unsigned int val = buffer[cpos];
273 | 7 cpos = (cpos + 1) % BUFSIZE;
274 | 8
275 | 9 /* notify producer that a space has freed up */
276 | 10 sem_post(&psem);
277 | 11 return val;
278 | 12 }
279 | ```
280 |
281 | ---
282 |
283 | # Producer/consumer: remaining issues?
284 |
285 | * We now synchronize between consumers and producers
286 | * Producer waits until buffer space is available
287 | * Consumer waits until data is ready
288 |
289 | . . .
290 |
291 | * How would you handle multiple producers/consumers?
292 | * Currently no synchronization between producers (or consumers)
293 |
294 |
295 | ---
296 |
297 | # Multiple producers: use locking!
298 | 299 | ```.C 300 | /* mutex handling mutual exclusive access to ppos */ 301 | 1 pthread_mutex_t pmutex = PTHREAD_MUTEX_INITIALIZER; 302 | 2 303 | 3 void put(unsigned int val) { 304 | 4 unsigned int mypos; 305 | 5 /* we wait until there is buffer space available */ 306 | 6 sem_wait(&psem); 307 | 7 /* ppos is shared between all producers */ 308 | 8 pthread_mutex_lock(&pmutex); 309 | 9 mypos = ppos; 310 | 10 ppos = (ppos + 1) % BUFSIZE; 311 | 11 /* store information in buffer */ 312 | 12 buffer[mypos] = val; 313 | 13 pthread_mutex_unlock(&pmutex); 314 | 14 sem_post(&csem); 315 | 15 } 316 | ``` 317 | 318 | --- 319 | 320 | # Semaphores/spin locks/CVs are interchangeable 321 | 322 | * Each is implementable through a combination of the others 323 | * Depending on the use-case one is faster than the other 324 | * How often is the critical section executed? 325 | * How many threads compete for a critical section? 326 | * How long is the lock taken? 327 | 328 | --- 329 | 330 | # Implementing a mutex with a semaphore 331 | 332 | ```.C 333 | 1 sem_t sem; 334 | 2 sem_init(&sem, 1); 335 | 3 336 | 4 sem_wait(&sem); 337 | 5 ... 
// critical section 338 | 6 sem_post(&sem); 339 | ``` 340 | 341 | --- 342 | 343 | # Implementing a semaphore with CV/locks 344 | 345 | ```.C 346 | 1 typedef struct { 347 | 2 int value; // sem value 348 | 3 pthread_mutex_t lock; // access to sem 349 | 4 pthread_cond_t cond; // wait queue 350 | 5 } sem_t; 351 | 6 352 | 7 void sem_init(sem_t *s, int val) { 353 | 8 s->value = val; 354 | 9 pthread_mutex_init(&(s->lock), NULL); 355 | 10 pthread_cond_init(&(s->cond), NULL); 356 | 11 } 357 | ``` 358 | 359 | --- 360 | 361 | # Implementing a semaphore with CV/locks 362 | 363 | ```.C 364 | 1 void sem_wait(sem_t *s) { 365 | 2 pthread_mutex_lock(&(s->lock)); 366 | 3 while (s->value <= 0) 367 | 4 pthread_cond_wait(&(s->cond), &(s->lock)); 368 | 5 s->value--; 369 | 6 pthread_mutex_unlock(&(s->lock)); 370 | 7 } 371 | 8 372 | 9 void sem_post(sem_t *s) { 373 | 10 pthread_mutex_lock(&(s->lock)); 374 | 11 s->value++; 375 | 12 pthread_cond_signal(&(s->cond)); 376 | 13 pthread_mutex_unlock(&(s->lock)); 377 | 14 } 378 | ``` 379 | 380 | 381 | --- 382 | 383 | # Reader/writer locks 384 | 385 | * A single (exclusive) writer, multiple (N) concurrent readers 386 | * Implement using two semaphores: `lock` for the data structure, `wlock` for the writer 387 | * Both semaphores initialized with (1) 388 | * Writer only waits/posts on `wlock` when acquiring/releasing 389 | * Reader waits on `lock`, increments/decrements reader count 390 | * If number of `readers==0`, must wait/post on `wlock` 391 | 392 | --- 393 | 394 | # Reader/writer locks 395 | 396 | ```.C 397 | 1 void rwlock_acquire_readlock(rwlock_t *rw) { 398 | 2 sem_wait(&rw->lock); 399 | 3 rw->readers++; 400 | 4 if (rw->readers == 1) 401 | 5 sem_wait(&rw->wlock); // first r, also grab wlock 402 | 6 sem_post(&rw->lock); 403 | 7 } 404 | 8 405 | 9 void rwlock_release_readlock(rwlock_t *rw) { 406 | 10 sem_wait(&rw->lock); 407 | 11 rw->readers--; 408 | 13 if (rw->readers == 0) 409 | 14 sem_post(&rw->wlock); // last r, also release wlock 410 | 15 
sem_post(&rw->lock); 411 | 16 } 412 | ``` 413 | 414 | --- 415 | 416 | # Bugs in concurrent programs 417 | 418 | * Writing concurrent programs is hard! 419 | * ***Atomicity bug:*** concurrent, unsynchronized modification (lock!) 420 | * ***Order-violating bug:*** data is accessed in wrong order (use CV!) 421 | * ***Deadlock:*** program no longer makes progress (locking order) 422 | 423 | --- 424 | 425 | # Atomicity bugs 426 | 427 | One thread checks value and prints it while another thread concurrently modifies it. 428 | 429 | ```.C 430 | 1 int shared = 24; 431 | 2 432 | 3 void T1() { 433 | 4 if (shared > 23) { 434 | 5 printf("Shared is >23: %d\n", shared); 435 | 6 } 436 | 7 } 437 | 8 void T2() { 438 | 9 shared = 12; 439 | 10 } 440 | ``` 441 | 442 | . . . 443 | 444 | * T2 may modify `shared` between `if` check and `printf` in T1. 445 | * Fix: use a common mutex between both threads when accessing the shared resource. 446 | 447 | --- 448 | 449 | # Order-violating bug 450 | 451 | One thread assumes the other has already updated a value. 452 | 453 | ```.C 454 | Thread 1:: 455 | void init() { 456 | mThread = PR_CreateThread(mMain, ...); 457 | mThread->State = ...; 458 | } 459 | 460 | Thread 2:: 461 | void mMain(...) { 462 | mState = mThread->State; 463 | } 464 | ``` 465 | 466 | . . . 467 | 468 | * Thread 2 may run before `mThread` is assigned in T1. 469 | * Fix: use a CV to signal that `mThread` has been initialized. 470 | 471 | --- 472 | 473 | # Deadlock 474 | 475 | Locks are taken in conflicting order. 476 | 477 | ```.C 478 | void T1() { 479 | lock(L1); 480 | lock(L2); 481 | } 482 | 483 | void T2() { 484 | lock(L2); 485 | lock(L1); 486 | } 487 | ``` 488 | 489 | . . . 490 | 491 | * Threads 1/2 may be stuck after taking the first lock, program makes no more progress 492 | * Fix: acquire locks in increasing (global) order. 
493 | 494 | --- 495 | 496 | # Summary 497 | 498 | * Spin lock, CV, and semaphore synchronize multiple threads 499 | * Spin lock: atomic access, no ordering, spinning 500 | * Condition variable: atomic access, queue, OS primitive 501 | * Semaphore: shared access to critical section with (int) state 502 | * All three primitives are equally powerful 503 | * Each primitive can be used to implement both other primitives 504 | * Performance may differ! 505 | * Synchronization is challenging and may introduce different types of 506 | bugs such as atomicity violation, order violation, or deadlocks. 507 | 508 | Don't forget to get your learning feedback through the Moodle quiz! 509 | -------------------------------------------------------------------------------- /23-concurrency.md: -------------------------------------------------------------------------------- 1 | --- 2 | subtitle: Concurrency Summary 3 | --- 4 | 5 | # Concurrency 6 | 7 | \begin{tikzpicture} 8 | \draw [orange, ultra thick] (0,0) rectangle (2,4); 9 | \node[text width=3cm, rotate=90] at (1, 2.5) {Virtualization}; 10 | \draw [fill, blue, ultra thick] (3,0) rectangle (5,4); 11 | \node[text width=3cm, rotate=90] at (4, 2.5) {Concurrency}; 12 | \draw [green, ultra thick] (6,0) rectangle (8,4); 13 | \node[text width=3cm, rotate=90] at (7, 2.5) {Persistence}; 14 | 15 | \draw [red, ultra thick] (0,4.5) -- (8,4.5); 16 | \draw [red, ultra thick] (0,4.5) -- (4,6); 17 | \draw [red, ultra thick] (4,6) -- (8,4.5); 18 | \node[text width=3cm] at (5, 5) {Security}; 19 | 20 | \end{tikzpicture} 21 | 22 | --- 23 | 24 | # Concurrency topics 25 | 26 | * Abstraction: locks to protect shared data structures 27 | * Mechanism: interrupt-based locks 28 | * Mechanism: atomic hardware locks 29 | * Busy waiting (spin locks) versus wait queues 30 | * Condition variables 31 | * Semaphores 32 | * Signaling through condition variables and semaphores 33 | 34 | --- 35 | 36 | # Difference parallelism and concurrency 37 | 38 | * 
***Parallelism:*** multiple threads (or processes) working on a single task 39 | using multiple CPU cores (i.e., stuff happens at the same physical time) 40 | * ***Concurrency:*** tasks can start, run, and complete in overlapping time 41 | periods (i.e., tasks run at the same virtual time) 42 | 43 | \begin{tikzpicture} 44 | \node at (3, 5) {Parallelism}; 45 | \node[rotate=90] at (1, 3) {Processes}; 46 | \draw (1.5, 2) -- (1.5, 4.5); 47 | \node at (3, 1.5) {Time}; 48 | \draw (1.5, 2) -- (4.5, 2); 49 | \draw (1.6, 2.5) -- (4.5, 2.5); 50 | \draw (1.6, 3) -- (4.5, 3); 51 | \draw (1.6, 3.5) -- (4.5, 3.5); 52 | \draw (1.6, 4) -- (4.5, 4); 53 | 54 | \node at (8, 5) {Concurrency}; 55 | \node[rotate=90] at (6, 3) {Processes}; 56 | \draw (6.5, 2) -- (6.5, 4.5); 57 | \node at (8, 1.5) {Time}; 58 | \draw (6.5, 2) -- (9.5, 2); 59 | \draw (6.6, 2.5) -- (7, 2.5); 60 | \draw (7, 3) -- (7.5, 3); 61 | \draw (7.5, 3.5) -- (8, 3.5); 62 | \draw (8, 4) -- (8.5, 4); 63 | \draw (8.5, 3.5) -- (9, 3.5); 64 | \draw (9, 2.5) -- (9.5, 2.5); 65 | \end{tikzpicture} 66 | 67 | --- 68 | 69 | # Locks: basic idea 70 | 71 | * Requirements: mutual exclusion, fairness, and performance 72 | * ***Mutual exclusion***: only one thread in critical section 73 | * ***Fairness***: all threads should eventually get the lock 74 | * ***Performance***: low overhead for acquiring/releasing lock 75 | * Lock implementation requires hardware support 76 | * ... 
and OS support for performance 77 | 78 | --- 79 | 80 | # Lock types 81 | 82 | * Interrupts 83 | * (Buggy) software lock 84 | * (Buggy) Peterson's lock 85 | * Spin lock: test-and-set 86 | * Spin lock: compare-and-swap 87 | * Queue lock 88 | 89 | --- 90 | 91 | # Lock best practices 92 | 93 | * When acquiring a lock, recheck assumptions 94 | * Ensure that all shared information is refreshed (and not stale) 95 | * Multiple threads may wake up and race for the lock (i.e., loop if unsuccessful) 96 | 97 | --- 98 | 99 | # Lock summary 100 | 101 | * Locks enforce mutual exclusion for critical section (i.e., an object that can only be owned by a single thread) 102 | * Trade-offs between spinlock and queue lock 103 | * Time lock is held 104 | * Contention for lock 105 | * How many concurrent cores execute 106 | * Locking requires kernel support or atomic instructions 107 | * **test-and-set** atomically modifies the contents of a memory location, returning its old value 108 | * **compare-and-swap** atomically compares the contents of a memory location to a given value and, iff they are equal, modifies the contents of that memory location to a given new value. 109 | 110 | --- 111 | 112 | # Condition Variables (CVs) 113 | 114 | * A CV allows a thread to wait for a condition 115 | * Usually implemented as queues 116 | * Another thread signals the waiting thread 117 | * API: `wait`, `signal` or `broadcast` 118 | * `wait`: wait until a condition is satisfied 119 | * `signal`: wake up one waiting thread 120 | * `broadcast`: wake up all waiting threads 121 | 122 | --- 123 | 124 | # Locks vs. 
CVs 125 | 126 | * Lock an object that can only be owned by a single thread 127 | * Enforces mutual exclusion 128 | * `acquire(lock_t *lck)`: acquire the lock, wait if needed 129 | * `release(lock_t *lck)`: release the lock 130 | * CVs allow a thread to wait for an event (condition) 131 | * Lock for mutual exclusion, condition to signal event has passed 132 | * `wait(cond_t *cond, lock_t *lck)`: wait until cond is true 133 | * `signal(cond_t *cond, lock_t *lck)`: signal one thread 134 | * `broadcast(cond_t *cond, lock_t *lck)`: signal all threads 135 | 136 | --- 137 | 138 | # Semaphores 139 | 140 | * A semaphore extends a CV with an integer as internal state 141 | * `int sem_init(sem_t *sem, unsigned int value)`: creates a new semaphore with `value` slots 142 | * `int sem_wait(sem_t *sem)`: waits until the semaphore has at least one slot 143 | * `int sem_post(sem_t *sem)`: increments the semaphore (and wakes one waiting thread) 144 | * `int sem_destroy(sem_t *sem)`: destroys the semaphore and releases any waiting threads 145 | 146 | --- 147 | 148 | # Semaphores/spin locks/CVs are equivalent 149 | 150 | * Each can be implemented through a combination of the others 151 | * Depending on the use-case, performance will vary 152 | * How often is the critical section executed? 153 | * How many threads compete for a critical section? 154 | * How long is the lock taken? 155 | 156 | --- 157 | 158 | # Book chapters 159 | 160 | * Concurrency/Locking: OSTEP 28-30 161 | * Concurrency/Semaphores: OSTEP 30-32 162 | 163 | --- 164 | 165 | # Concurrency summary 166 | 167 | * Spin lock, CV, and semaphore synchronize multiple threads 168 | * Spin lock: atomic access, no ordering, spinning 169 | * Condition variable: atomic access, queue, OS primitive 170 | * Semaphore: shared access to critical section with (int) state 171 | * All three primitives are equally powerful 172 | * Each primitive can be used to implement both other primitives 173 | * Performance may differ! 
174 | * Synchronization is challenging and may introduce different types of 175 | bugs such as atomicity violation, order violation, or deadlocks. -------------------------------------------------------------------------------- /33-ffs.md: -------------------------------------------------------------------------------- 1 | --- 2 | subtitle: Filesystem implementation 3 | --- 4 | 5 | # Topics covered in this lecture 6 | 7 | * Filesystem implementation 8 | * Special requirements (write back) 9 | 10 | This slide deck covers chapters 40, 41 in OSTEP. 11 | 12 | --- 13 | 14 | # Last week: API and abstractions 15 | 16 | * Highlevel API and abstractions 17 | * Filesystem API 18 | * Different names for different use cases 19 | * Inodes and devices 20 | * Path 21 | * File descriptor 22 | 23 | --- 24 | 25 | # Different names for different use cases 26 | 27 | * inode/device id (`53135/2`) 28 | * Unique internal name 29 | * Records metadata about the file (size, permissions, owner) 30 | * path (`/foo/bar/baz`) 31 | * Human readable name 32 | * Organizes files in a hierarchical layout 33 | * file descriptor (`5`) 34 | * Process internal view 35 | * Avoids frequent path to inode traversal 36 | * Remembers offset for next read/write 37 | 38 | --- 39 | 40 | # Common file API 41 | 42 | * `int open(char *path, int flag, mode_t mode)` 43 | * `size_t read(int fd, char *buf, size_t nbyte)` 44 | * `size_t write(int fd, char *buf, size_t nbyte)` 45 | * `int close(int fd)` 46 | 47 | What kind of on disk data structures do we need? 48 | How is data accessed? 
49 |
50 | ---
51 |
52 | # This week: Abstractions -> Implementation
53 |
54 | ![](./figures/33-portal.jpg){width=400px}
55 |
56 | ---
57 |
58 | # Virtual File System
59 |
60 | * File systems (EXT4, NTFS, FAT) use different data structures
61 | * A Virtual File System (VFS) abstracts from the real filesystem
62 | * VFS abstracts the FS as objects with specific operations
63 | * Superblock (mount): a live filesystem
64 | * File (open): a file opened by a process ("open file description")
65 | * Directory entry cache: speeds up path to inode translation
66 | * Inode (lookup): a filesystem object (e.g., file or directory)
67 | * System call logic (open, seek, ...) maps to VFS operations
68 | * When implementing a new FS, implement the VFS API
69 | * System calls are now independent of FS implementation
70 |
71 | ---
72 |
73 | # Challenge: renaming files
74 |
75 | * How would you implement `rename`?
76 |
77 | . . .
78 |
79 | * Renaming only changes the name of the file
80 | * Directory contains the name of the file
81 | * No data needs to be moved, inode remains unchanged
82 |
83 | . . .
84 |
85 | * Note, you may need to move the data if it is on another disk/partition!
86 |
87 | ---
88 |
89 | # Filesystem implementation
90 |
91 | * A filesystem is an exercise in data management
92 | * Given: a large set (N) of blocks
93 | * Need: data structures to encode (i) file hierarchy and (ii) per-file metadata
94 | * Overhead (metadata size versus file data) should be low
95 | * Internal fragmentation should be low
96 | * File contents must be accessed efficiently (external fragmentation, number of metadata accesses)
97 | * Define operations for file API
98 |
99 | . . .
100 |
101 | * Many different choices are possible!
102 | * Similar to virtual memory!
103 | * Software implementation enables experimentation with strategies 104 | 105 | --- 106 | 107 | # Allocating file data 108 | 109 | * Contiguous 110 | * Linked blocks (blocks end with a next pointer) 111 | * File-allocation tables (table that contains block references) 112 | * Indexed (inode contains data pointers) 113 | * Multi-level indexed (tree of pointers) 114 | 115 | For each approach, think about fragmentation, ability to grow/shrink files, sequential access performance, random access performance, overhead of meta data. 116 | 117 | --- 118 | 119 | # File allocation: contiguous 120 | 121 | Each file is allocated contiguously 122 | 123 | \begin{tikzpicture} 124 | \node (F1) at (0,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 125 | \node (F1) at (0.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 126 | \node (F1) at (1,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {A}; 127 | \node (F1) at (1.5,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {A}; 128 | \node (F1) at (2,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {A}; 129 | \node (F1) at (2.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 130 | \node (F1) at (3,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {B}; 131 | \node (F1) at (3.5,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {B}; 132 | \node (F1) at (4,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 133 | \node (F1) at (4.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 134 | \end{tikzpicture} 135 | 136 | . . . 137 | 138 | * Terrible external fragmentation (OS must anticipate) 139 | * Likely unable to grow file 140 | * Excellent read and seek performance 141 | * Small overhead for metadata 142 | 143 | . . . 
144 |
145 | * Great for read-only file systems (CD/DVD/Blu-ray)
146 |
147 | ---
148 |
149 | # File allocation: linked blocks
150 |
151 | Each file consists of a linked list of blocks
152 |
153 | \begin{tikzpicture}
154 | \node (F1) at (0,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {};
155 | \node (F1) at (0.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {};
156 | \node (A1) at (1,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {A};
157 | \node (A2) at (1.5,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {A};
158 | \node (B2) at (2,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {B};
159 | \node (F1) at (2.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {};
160 | \node (B1) at (3,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {B};
161 | \node (A3) at (3.5,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {A};
162 | \node (F1) at (4,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {};
163 | \node (F1) at (4.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {};
164 |
165 | \draw [->,thick] (A1.north) to [out=30,in=150] (A2.north);
166 | \draw [->,thick] (A2.north) to [out=30,in=150] (A3.north);
167 | \draw [->,thick] (B1.south) to [out=-150,in=-30] (B2.south);
168 |
169 | \end{tikzpicture}
170 |
171 | . . .
172 |
173 | * No external fragmentation
174 | * Files can grow easily
175 | * Reasonable read cost (depending on layout), high seek cost
176 | * One pointer per block metadata overhead
177 |
178 | ---
179 |
180 | # File allocation: File Allocation Table (FAT)
181 |
182 | Idea: keep linked list information in a single table.
183 | Instead of storing the next pointer at the end of the block, store all next pointers in a central table 184 | 185 | \begin{tikzpicture} 186 | \node (F1) at (0,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 187 | \node (F1) at (0.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 188 | \node (A1) at (1,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {A}; 189 | \node (A2) at (1.5,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {A}; 190 | \node (B2) at (2,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {B}; 191 | \node (F1) at (2.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 192 | \node (B1) at (3,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {B}; 193 | \node (A3) at (3.5,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {A}; 194 | \node (F1) at (4,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 195 | \node (F1) at (4.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 196 | \node (F1) at (0,-0.5) {-1}; 197 | \node (F1) at (0.5,-0.5) {-1}; 198 | \node (A1) at (1,-0.5) {3}; 199 | \node (A2) at (1.5,-0.5) {7}; 200 | \node (B2) at (2,-0.5) {0}; 201 | \node (F1) at (2.5,-0.5) {-1}; 202 | \node (B1) at (3,-0.5) {4}; 203 | \node (A3) at (3.5,-0.5) {0}; 204 | \node (F1) at (4,-0.5) {-1}; 205 | \node (F1) at (4.5,-0.5) {-1}; 206 | \node (F1) at (6.1,-0.5) {Block pointer}; 207 | 208 | \node (F1) at (0,0.5) {0}; 209 | \node (F1) at (0.5,0.5) {1}; 210 | \node (A1) at (1,0.5) {2}; 211 | \node (A2) at (1.5,0.5) {3}; 212 | \node (B2) at (2,0.5) {4}; 213 | \node (F1) at (2.5,0.5) {5}; 214 | \node (B1) at (3,0.5) {6}; 215 | \node (A3) at (3.5,0.5) {7}; 216 | \node (F1) at (4,0.5) {8}; 217 | \node (F1) at (4.5,0.5) {9}; 218 | \node (F1) at (6.1,0.5) {Block number}; 219 | 220 | \end{tikzpicture} 
221 | 222 | . . . 223 | 224 | * No external fragmentation 225 | * Files can grow easily 226 | * Reasonable read and seek cost 227 | * One pointer per block metadata overhead 228 | 229 | --- 230 | 231 | # File allocation: indexed 232 | 233 | Idea: metadata contains an array of block pointers 234 | 235 | \begin{tikzpicture} 236 | \node (F1) at (0,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 237 | \node (F1) at (0.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 238 | \node (A1) at (1,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {A}; 239 | \node (A2) at (1.5,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {A}; 240 | \node (B2) at (2,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {B}; 241 | \node (F1) at (2.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 242 | \node (B1) at (3,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {B}; 243 | \node (A3) at (3.5,0) [draw,thick, fill=gray, rectangle,minimum width=0.5cm, minimum height=0.5cm] {A}; 244 | \node (F1) at (4,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 245 | \node (F1) at (4.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 246 | 247 | \node (F1) at (0,0.5) {0}; 248 | \node (F1) at (0.5,0.5) {1}; 249 | \node (A1) at (1,0.5) {2}; 250 | \node (A2) at (1.5,0.5) {3}; 251 | \node (B2) at (2,0.5) {4}; 252 | \node (F1) at (2.5,0.5) {5}; 253 | \node (B1) at (3,0.5) {6}; 254 | \node (A3) at (3.5,0.5) {7}; 255 | \node (F1) at (4,0.5) {8}; 256 | \node (F1) at (4.5,0.5) {9}; 257 | \node (F1) at (6.1,0.5) {Block number}; 258 | 259 | \node (F1) at (8,-0.2) [draw,thick, rectangle,minimum width=3.5cm, minimum height=0.5cm] {File A: 2, 3, 7, -1}; 260 | \node (F1) at (8,-1.2) [draw,thick, rectangle,minimum width=3.5cm, minimum height=0.5cm] {File B: 6, 4, -1, -1}; 261 | 262 
| \end{tikzpicture} 263 | 264 | . . . 265 | 266 | * No external fragmentation 267 | * Files can grow easily up to maximum size 268 | * Reasonable read and low seek cost 269 | * *Large metadata overhead* (wastes space as most files are small) 270 | 271 | --- 272 | 273 | # File allocation: multi-level indexing (1/3) 274 | 275 | Idea: have a mix of direct, indirect, double indirect, and triple indirect pointers 276 | 277 | \begin{tikzpicture} 278 | \node (D1) at (0,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 279 | \node (D2) at (0.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 280 | \node (D3) at (1,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 281 | \node (D4) at (1.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 282 | \node (D5) at (2,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 283 | \node (D6) at (2.5,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 284 | \node (D7) at (3,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 285 | \node (I1) at (3.5,0) [draw,thick, fill=red, rectangle,minimum width=0.5cm, minimum height=0.5cm] {I}; 286 | \node (II1) at (4,0) [draw,thick, fill=blue, rectangle,minimum width=0.5cm, minimum height=0.5cm] {D}; 287 | \node (III1) at (4.5,0) [draw,thick, fill=green, rectangle,minimum width=0.5cm, minimum height=0.5cm] {T}; 288 | 289 | \node (DB1) at (0,-1) [draw,thick, fill=brown, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 290 | \node (DB2) at (0.8,-1) [draw,thick, fill=brown, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 291 | \node (DB3) at (3,-1) [draw,thick, fill=brown, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 292 | 293 | \draw [->,thick] (D1.south) to (DB1.north); 294 | \draw [->,thick] (D2.south) to (DB2.north); 295 | \draw [->,thick] (D7.south) to (DB3.north); 296 | 297 | \node (IB1) at (3.5,-2) [draw,thick, 
fill=red, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 298 | 299 | \node (IDB1) at (3.5,-3) [draw,thick, fill=brown, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 300 | \node (IDB2) at (4.2,-3) [draw,thick, fill=brown, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 301 | 302 | \draw [->,thick] (I1.south) to (IB1.north); 303 | \draw [->,thick] (IB1.south) to (IDB1.north); 304 | \draw [->,thick] (IB1.south) to (IDB2.north); 305 | 306 | 307 | \node (IIB1) at (5,-2) [draw,thick, fill=blue, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 308 | 309 | \node (IB2) at (5,-4) [draw,thick, fill=red, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 310 | 311 | \node (IDB3) at (5,-5) [draw,thick, fill=brown, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 312 | \node (IDB4) at (5.8,-5) [draw,thick, fill=brown, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 313 | 314 | \node (IB3) at (7,-4) [draw,thick, fill=red, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 315 | 316 | \node (IDB5) at (7,-5) [draw,thick, fill=brown, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 317 | \node (IDB6) at (7.8,-5) [draw,thick, fill=brown, rectangle,minimum width=0.5cm, minimum height=0.5cm] {}; 318 | 319 | \draw [->,thick] (II1.south) to (IIB1.north); 320 | 321 | \draw [->,thick] (IIB1.south) to (IB2.north); 322 | \draw [->,thick] (IIB1.south) to (IB3.north); 323 | 324 | \draw [->,thick] (IB2.south) to (IDB3.north); 325 | \draw [->,thick] (IB2.south) to (IDB4.north); 326 | 327 | \draw [->,thick] (IB3.south) to (IDB5.north); 328 | \draw [->,thick] (IB3.south) to (IDB6.north); 329 | 330 | \node (IIIB1) at (8,-2) [draw,thick, fill=green, rectangle,minimum width=0.5cm, minimum height=0.5cm] {triple indirect}; 331 | \draw [->,thick] (III1.south) to (IIIB1.north); 332 | 333 | \end{tikzpicture} 334 | 335 | --- 336 | 337 | # File allocation: multi-level indexing (2/3) 338 | 339 | Idea: have a mix of direct, indirect, 
double indirect, and triple indirect pointers 340 | 341 | ```.C 342 | struct inode { 343 |   umode_t   i_mode; 344 |   unsigned short  i_opflags; 345 |   kuid_t    i_uid; 346 |   kgid_t    i_gid; 347 |   unsigned int  i_flags; 348 |   ... 349 |   // direct pointers to data blocks 350 |   struct dblock *direct[10]; 351 |   // indirect: block of N ptrs to data blocks 352 |   struct dblock **indirect; 353 |   // double indirect: block of N ptrs to indirect blocks 354 |   struct dblock ***dindirect; 355 | }; 356 | ``` 357 | 358 | --- 359 | 360 | # File allocation: multi-level indexing (3/3) 361 | 362 | Idea: have a mix of direct, indirect, double indirect, and triple indirect pointers 363 | 364 | * No external fragmentation 365 | * Files can grow easily up to a maximum size 366 | * Reasonable read and low seek cost 367 | * Low metadata overhead but needs extra reads for indirect/double indirect access 368 | 369 | --- 370 | 371 | # Simple FS 372 | 373 | \begin{tikzpicture} 374 | \node (F1) at (0,0) [draw,thick, rectangle,minimum width=0.5cm, minimum height=0.5cm] {S}; 375 | \node (F1) at (0.5,0) [draw,thick, fill=teal, rectangle,minimum width=0.5cm, minimum height=0.5cm] {i}; 376 | \node (F1) at (1,0) [draw,thick, fill=cyan, rectangle,minimum width=0.5cm, minimum height=0.5cm] {d}; 377 | \foreach \x in {0,...,4} { 378 |   \node (F1) at (0.5*\x+1.5,0) [draw,thick,fill=green,rectangle,minimum width=0.5cm, minimum height=0.5cm] {I}; 379 | } 380 | \foreach \x in {0,...,8} {% 381 |   \node (F1) at (0.5*\x+3.5,0) [draw,thick,fill=blue,rectangle,minimum width=0.5cm, minimum height=0.5cm] {D}; 382 | } 383 | \end{tikzpicture} 384 | 385 | * Superblock (S): file system metadata 386 | * Bitmaps (i, d): indicate free inodes/data blocks 387 | * Inodes (I): hold file/directory metadata, reference data blocks 388 | * Data blocks (D): file contents, referenced by an inode 389 | 390 | The inode size may be different (smaller) from the data block size.
391 | 392 | --- 393 | 394 | # Simple FS: superblock 395 | 396 | * The superblock stores the characteristics of the filesystem 397 | * What do you store in the superblock? 398 | 399 | . . . 400 | 401 | * Magic number and revision level 402 | * Mount count and maximum mount count 403 | * Block size of the filesystem (1, 2, 4, 8, 16, 32, 64K for ext4) 404 | * Name of the filesystem 405 | * Number of inodes/data blocks 406 | * Number of free inodes/data blocks 407 | * Number of "first" inode (i.e., root directory) 408 | 409 | --- 410 | 411 | # Simple FS: inode 412 | 413 | * The inode stores all file metadata 414 | * What would you store in an inode? 415 | 416 | . . . 417 | 418 | * File type 419 | * File uid, gid 420 | * File permissions (for user, group, others) 421 | * Size 422 | * Access time 423 | * Create time 424 | * Number of links 425 | 426 | --- 427 | 428 | # Maximum file size 429 | 430 | * Maximum file size is related to 431 |     * Block size 432 |     * Number of direct pointers 433 |     * Number of indirect pointers 434 |     * Number of double indirect pointers 435 |     * Number of triple indirect pointers 436 | * `blocksize * (direct + ptrs_per_block + ptrs_per_block^2 + ptrs_per_block^3)` 437 | 438 | --- 439 | 440 | # Directories 441 | 442 | * Directories are special files (`inode->type`) 443 | * Store a set of file name to inode mappings 444 | * Special entries `.` for current directory and `..` for parent directory 445 | 446 | --- 447 | 448 | # File operation: create /foo/bar 449 | 450 | * Read root inode (locate directory data) 451 | * Read root data (read directory) 452 | * Read foo inode (locate directory data) 453 | * Read foo data (read directory) 454 | * Read/write inode bitmap (allocate inode) 455 | * Write foo data (add file name) 456 | * Read/write bar inode (create file) 457 | * Write foo inode (update date, maybe allocate data block) 458 | 459 | --- 460 | 461 | # File operation: open /foo/bar 462 | 463 | * Read root inode (locate directory data) 464 | * Read root data (read
directory) 465 | * Read foo inode (locate directory data) 466 | * Read foo data (read directory) 467 | * Read bar inode (read file metadata) 468 | 469 | --- 470 | 471 | # File operation: write to /foo/bar 472 | 473 | * First: `open("/foo/bar")` 474 | * Read bar inode (read file metadata) 475 | * Read/write data bitmap (allocate data blocks) 476 | * Write bar data (write data) 477 | * Write bar inode (update inode) 478 | 479 | --- 480 | 481 | # File operation: read from /foo/bar 482 | 483 | * First: `open("/foo/bar")` 484 | * Read bar inode (read file metadata) 485 | * Read bar data (read data) 486 | * Write bar inode (update time) 487 | 488 | --- 489 | 490 | # File operation: close /foo/bar 491 | 492 | * No disk I/O 493 | 494 | --- 495 | 496 | # File operation: observations 497 | 498 | * Path traversal and translation are costly 499 |     * Reduce number of lookups (file descriptors!) 500 |     * Introduce caching (dcache) 501 | * Lookup aside, operations are cheap and local 502 | 503 | --- 504 | 505 | # Summary 506 | 507 | * Filesystem implementation 508 |     * Inodes for metadata 509 |     * Bitmaps for inodes/data blocks 510 |     * Superblock for global metadata 511 | 512 | Don't forget to get your learning feedback through the Moodle quiz! 513 | -------------------------------------------------------------------------------- /34-journaling.md: -------------------------------------------------------------------------------- 1 | --- 2 | subtitle: Filesystem journaling 3 | --- 4 | 5 | # Topics covered in this lecture 6 | 7 | * Crash resistance 8 | * Journaling 9 | 10 | This slide deck covers chapter 42 in OSTEP.
11 | 12 | --- 13 | 14 | # Last two weeks: API, abstractions, disk layout 15 | 16 | * Highlevel API and abstractions 17 | * Filesystem API 18 | * Different names for different use cases 19 | * Inodes and devices 20 | * Path 21 | * File descriptor 22 | * Disk layout and inode/data block implementations 23 | 24 | --- 25 | 26 | # This week: Crash resistance and recovery 27 | 28 | 29 | 30 | --- 31 | 32 | # Recall atomic file update challenge 33 | 34 | * Assume you want to update `important.txt` atomically 35 | 36 | . . . 37 | 38 | * If the application or the system crashes, the old version must remain 39 | * Write data to `./gener8 > important.txt.tmp` 40 | * Flush data to disk: `fsync important.txt.tmp` 41 | * Rename atomically: `mv important.txt.tmp important.txt`, replacing it 42 | * What could still go wrong? 43 | 44 | . . . 45 | 46 | * File system metadata may not be written back to disk! 47 | 48 | --- 49 | 50 | # Crash resistance 51 | 52 | * Power loss during writing 53 | * Mechanical failure 54 | * Magnetization failure 55 | * [Mechanical destruction (link)](https://www.youtube.com/watch?v=-bpX8YvNg6Y&t=1815) 56 | 57 | --- 58 | 59 | # Redundancy 60 | 61 | > Given A and B. If knowing A allows you to infer some or all values of B then 62 | > there is redundancy between A and B. 
63 | 64 | * RAID1: mirrored disks (complete redundancy) 65 | * RAID5 or 6: parity blocks (partial redundancy) 66 | 67 | --- 68 | 69 | # Redundancy in a filesystem 70 | 71 | * Directory entries and inode table 72 | * Directory entries and inode link count 73 | * Data bitmap and inode pointers 74 | * Data bitmap and group descriptor (for sets of blocks) 75 | * Inode file size and inode/indirect pointers 76 | 77 | --- 78 | 79 | # Advantages of redundancy 80 | 81 | * Can improve reliability (recover from failures) 82 | * Can improve performance (easier to read file size from inode than parsing 83 | the full structure) 84 | * Requires more storage (inefficient encoding) 85 | * Requires consistency (all sides must agree) 86 | 87 | --- 88 | 89 | # Consistency 90 | 91 | > Redundant data must be consistent to ensure correctness. 92 | > Otherwise functionality may break. 93 | 94 | * Keeping redundant data consistent is challenging 95 | * Filesystem may perform several writes to redundant blocks 96 | * The sequence of writes is not atomic 97 | * Interrupts due to power loss, kernel bugs, hardware failure 98 | 99 | --- 100 | 101 | # Consistency scenario (1/2) 102 | 103 | * Filesystem appends to a file 104 | * Must write to inode, data bitmap, data block 105 | * What happens if only some writes succeed? 106 | * `001` Bitmap 107 | * `010` Data 108 | * `100` Inode 109 | * `011` Bitmap and data 110 | * `101` Bitmap and inode 111 | * `110` Data and inode 112 | 113 | --- 114 | 115 | # Consistency scenario (2/2) 116 | 117 | * Filesystem appends to a file 118 | * Must write to inode, data bitmap, data block 119 | * What happens if only some writes succeed? 
120 | * `001` Bitmap: lost block 121 | * `010` Data: lost data write (i.e., file is not updated) 122 | * `100` Inode: references garbage (another file may use) 123 | * `011` Bitmap and data: lost block 124 | * `101` Bitmap and inode: reference garbage (from previous usage) 125 | * `110` Data and inode: another file may grab the block 126 | * How would you order the writes? 127 | 128 | . . . 129 | 130 | * Data (nothing bad happens), bitmap (lost block is detectable), then inode 131 | 132 | --- 133 | 134 | # Consistency through filesystem check (1/3) 135 | 136 | * After a certain number of mount operations (remember the mount count in the 137 | super block?) or after a crash, check the consistency of the filesystem! 138 | * Hundreds of consistency checks across different fields 139 | * Do superblocks match? 140 | * Are all '.' and '..' linked correctly? 141 | * Are link counts equal to number of directory entries? 142 | * Do different inodes point to the same block? 143 | 144 | --- 145 | 146 | # Consistency through filesystem check (2/3) 147 | 148 | * Q: Two directory entries point to the same inode, link count is 1 149 | 150 | . . . 151 | 152 | * A: Update the link count to 2 153 | 154 | . . . 155 | 156 | * Q: Inode link count is 1 but no directory links this file 157 | 158 | . . . 159 | 160 | * A: Link the file in a `lost+found` directory 161 | 162 | . . . 163 | 164 | * Q: A referenced block is marked as free in the bitmap 165 | 166 | . . . 167 | 168 | * A: Update the bitmap to 1 169 | 170 | . . . 171 | 172 | * Q: Two inodes reference the same data block 173 | 174 | . . . 175 | 176 | * A: Make a copy of the data block 177 | 178 | . . . 179 | 180 | * Q: An inode points to an inexistent block 181 | 182 | . . . 183 | 184 | * A: Remove the reference 185 | 186 | 187 | --- 188 | 189 | # Consistency through filesystem check (3/3) 190 | 191 | * Are these operations correct? 192 | 193 | . . . 194 | 195 | * The file system is inconsistent, so all we do is best effort! 
196 | * It's not obvious how to fix filesystem corruption 197 | * The correct state is unknown; we only know it is inconsistent 198 | * FSCK is slow and may take hours (must read full disk) 199 | 200 | . . . 201 | 202 | * Are there better approaches? 203 | 204 | --- 205 | 206 | # Consistency through journaling 207 | 208 | * Goal: limit the amount of required work after a crash 209 | * Goal: get correct state, not just consistent state 210 | * Strategy: atomicity 211 | 212 | * Atomicity: being composed of indivisible units 213 |     * *Concurrency*: operations in critical sections are not interrupted 214 |     * *Persistence*: collections of writes are not interrupted by crashes 215 |       (i.e., either all new or all old data is visible) 216 | 217 | --- 218 | 219 | # Consistency versus correctness 220 | 221 | * Given: filesystem in state A, set of writes, resulting in state B 222 | * Assume it crashes somewhere between the writes from A to B 223 | * Filesystem check (FSCK) gives consistency 224 | * Atomicity gives A or B 225 | 226 | \begin{tikzpicture} 227 | 228 | \draw (0,-0.2) [fill=gray] ellipse (3cm and 1.3cm); 229 | \node at (0,0.8) {consistent states}; 230 | 231 | \draw (0,0) ellipse (4cm and 2cm); 232 | \node at (0,1.8) {all states}; 233 | 234 | \node at (-1.5,-0.5) [draw,fill=red,circle](0.5cm) {}; 235 | \node at (-1.5,-0.9) {empty}; 236 | 237 | \node at (0,-0.5) [draw,fill=blue,circle](0.5cm) (A) {A}; 238 | \node at (1.5,-0.5) [draw,fill=green,circle](0.5cm) (B) {B}; 239 | \path (A) edge[bend left,->] node [left] {} (B); 240 | 241 | \end{tikzpicture} 242 | 243 | --- 244 | 245 | # Journaling strategy 246 | 247 | * Never delete (or overwrite) ANY old data until you have received confirmation 248 |   that ALL new data is committed 249 | * Add redundancy to fix the problem with redundancy 250 | 251 | . . .
252 | 253 | ![](./figures/34-journaling.jpg){width=350px} 254 | 255 | --- 256 | 257 | # Journaling strategy (1/4) 258 | 259 | * Goal update file X with contents Y 260 | * Write Y, update metadata f(Y) 261 | 262 | . . . 263 | 264 | * Classic strategy 265 | * Overwrite f(X) with f(Y), overwrite X with Y; or 266 | * Overwrite X with Y, overwrite f(X) with f(Y) 267 | * No matter the order, crash in the middle is bad! 268 | 269 | . . . 270 | 271 | * Journaling strategy 272 | * Commit Y and f(Y) to journal 273 | * Update X with Y 274 | * Update f(X) with f(Y) 275 | * Delete journal entries 276 | * Resilient to crash in the middle, journal allows recovery 277 | 278 | --- 279 | 280 | # Journaling strategy (2/4) 281 | 282 | * Goal: write 10 to block 0 and 5 to block 1 *atomically* 283 | 284 | | Time | Block 0 | Block 1 | Extra | Extra | Extra | 285 | |-----:|--------:|--------:|------:|------:|------:| 286 | | 0 | 12 | 3 | 0 | 0 | 0 | 287 | | 1 | 10 | 3 | 0 | 0 | 0 | 288 | | 2 | 10 | 5 | 0 | 0 | 0 | 289 | 290 | . . . 291 | 292 | * This does not work! Must not crash between time 1 and 2! 293 | 294 | --- 295 | 296 | # Journaling strategy (3/4) 297 | 298 | * Goal: write 10 to block 0 and 5 to block 1 *atomically* 299 | 300 | | Time | Block 0 | Block 1 | Block 0' | Block 1' | Valid? | 301 | |-----:|--------:|--------:|---------:|---------:|-------:| 302 | | 0 | 12 | 3 | 0 | 0 | 0 | 303 | | 1 | 12 | 3 | 10 | 0 | 0 | 304 | | 2 | 12 | 3 | 10 | 5 | 0 | 305 | | 3 | 12 | 3 | 10 | 5 | 1 | 306 | | 4 | 10 | 3 | 10 | 5 | 1 | 307 | | 5 | 10 | 5 | 10 | 5 | 1 | 308 | | 6 | 10 | 5 | 10 | 5 | 0 | 309 | 310 | . . . 
311 | 312 | * Crash before 3: old data 313 | * Crash after 3: new data (need recovery) 314 | * Crash after 6: new data 315 | 316 | --- 317 | 318 | # Journaling strategy (4/4) 319 | 320 | ```.C 321 | // Pseudocode, assume we operate on blocks 322 | void recovery() { 323 | if (*valid == 1) { 324 | *block0 = *block0p; 325 | *block1 = *block1p; 326 | *valid = 0; 327 | fsync(); 328 | } 329 | } 330 | ``` 331 | 332 | --- 333 | 334 | # Journaling terminology 335 | 336 | * Extra blocks are called 'journal' 337 | * Writes to the journal are a 'journal transaction' 338 | * The valid bit is a 'journal commit block' 339 | 340 | --- 341 | 342 | # Journal optimizations 343 | 344 | * Dedicated (small) journal area 345 | * Write barriers 346 | * Checksums 347 | * Circular journal 348 | * Logical journal 349 | * Ordered journal 350 | 351 | --- 352 | 353 | # Journal optimization: small journal 354 | 355 | * Allocating a shadow block per data block is wasteful 356 | * Recovery cost and lost storage 357 | 358 | . . . 359 | 360 | * Dedicate a small area of blocks to the journal 361 | * Store block number along with data 362 | * At the start of the transaction, mark which blocks are modified 363 | * Store the data blocks in the journal 364 | * Commit the transaction 365 | 366 | --- 367 | 368 | # Journal optimization: write barriers 369 | 370 | * Enforcing total write order is costly (remember seek cost?) 371 | * Idea: only wait until blocks of writes have completed 372 | * Wait before journal commit (journal data blocks were written) 373 | * Wait after journal commit (journal was committed) 374 | * Wait after data blocks are written (journal can be freed) 375 | 376 | --- 377 | 378 | # Journal optimization: checksums 379 | 380 | * Can we get rid of the write barrier after journal commit? 
381 | * Idea: replace valid/invalid bit with checksum of written blocks 382 | * Checksum mismatch: one of the blocks was not written 383 | * Checksum match: all blocks were committed correctly 384 | * We now only have two write barriers for each transaction 385 | * After writing the journal (make sure data ended up in journal) 386 | * Before clearing the journal entry (data was written to disk) 387 | 388 | --- 389 | 390 | # Journal optimization: circular buffer 391 | 392 | * After data is written to journal, there is no rush to update/write back 393 | * Journaled data can be recovered 394 | * Delay journaling for some time for better performance 395 | * Keep journal transactions in circular buffer 396 | * Flush when buffer space is used up 397 | 398 | --- 399 | 400 | # Journal optimization: logical journal 401 | 402 | * Appending a block to the file causes writes to the data block, the inode, 403 | the data bitmap 404 | * Many small writes 405 | * Writing full blocks to journal is wasteful 406 | * Idea: keep track how data changed (diff between old and new) 407 | * Logical journals record changes to bytes, not blocks 408 | * Save lots of journal space 409 | * Must read original block during recovery 410 | 411 | --- 412 | 413 | # Journal optimization: ordered journal 414 | 415 | * How can we avoid writing all data twice? 416 | * Idea: store only metadata in journal 417 | * Write data to new block 418 | * Store updates to metadata in logical journal 419 | * Commit journal (and new data blocks) 420 | * Update metadata 421 | * Free journal 422 | 423 | --- 424 | 425 | # Summary 426 | 427 | * Crash resistance: filesystem check (FSCK) 428 | * Journaling: keep track of metadata, enforce atomicity 429 | * All modern filesystems use journaling 430 | * FSCK still useful due to bitflips/bugs 431 | 432 | Don't forget to get your learning feedback through the Moodle quiz! 
433 | -------------------------------------------------------------------------------- /35-persistence.md: -------------------------------------------------------------------------------- 1 | --- 2 | subtitle: Persistence Summary 3 | --- 4 | 5 | # Persistence 6 | 7 | \begin{tikzpicture} 8 | \draw [orange, ultra thick] (0,0) rectangle (2,4); 9 | \node[text width=3cm, rotate=90] at (1, 2.5) {Virtualization}; 10 | \draw [blue, ultra thick] (3,0) rectangle (5,4); 11 | \node[text width=3cm, rotate=90] at (4, 2.5) {Concurrency}; 12 | \draw [fill, green, ultra thick] (6,0) rectangle (8,4); 13 | \node[text width=3cm, rotate=90] at (7, 2.5) {Persistence}; 14 | 15 | \draw [red, ultra thick] (0,4.5) -- (8,4.5); 16 | \draw [red, ultra thick] (0,4.5) -- (4,6); 17 | \draw [red, ultra thick] (4,6) -- (8,4.5); 18 | \node[text width=3cm] at (5, 5) {Security}; 19 | 20 | \end{tikzpicture} 21 | 22 | --- 23 | 24 | # Persistence Topics 25 | 26 | * Device interaction and device drivers 27 | * IO Scheduling and harddrive throughput 28 | * Filesystem API 29 | * Filesystem implementation 30 | * Inodes and devices 31 | * File descriptors 32 | * File names 33 | * Crash resistance 34 | * Journaling 35 | 36 | --- 37 | 38 | # Motivation 39 | 40 | * So far we have talked about the CPU and about RAM 41 | * How do we get data into RAM? 42 | * Load programs and data from storage 43 | * Read and write packets from the network (maybe even streams?) 44 | * Write data to a terminal or the screen 45 | * Read data from input devices such as keyboard/mouse/camera 46 | * Devices provide input/output (IO) to a system 47 | 48 | * IO allows information to *persist* (RAM is volatile)! 49 | * Enables interesting computation! 
50 | 51 | --- 52 | 53 | # IO buses 54 | 55 | \begin{tikzpicture} 56 | 57 | \node [draw, rectangle, ultra thick, minimum width=2cm, minimum height=1cm] at (0,3) {CPU}; 58 | \draw [red, ultra thick] (0,2.5) -- (0,2); 59 | \node [draw, rectangle, ultra thick, minimum width=2cm, minimum height=1cm] at (3,3) {RAM}; 60 | \draw [red, ultra thick] (3,2.5) -- (3,2); 61 | 62 | 63 | \draw [red, ultra thick] (-1,2) -- (5,2); 64 | \node at (7.3, 2) {Northbridge, \textasciitilde 19,600 MB/s}; 65 | \node at (6.8, 1.5) {memory bus / PCI-E}; 66 | 67 | \node [draw, rectangle, ultra thick, minimum width=2cm, minimum height=1cm] at (3,1) {GPU}; 68 | \draw [red, ultra thick] (3,1.5) -- (3,2); 69 | 70 | \draw [orange, ultra thick] (-1,0) -- (5,0); 71 | \node at (7.15, 0) {Southbridge \textasciitilde 4,000 MB/s}; 72 | \node at (6.65, -0.5) {I/O bus (e.g., PCI)}; 73 | 74 | \draw [orange, ultra thick] (1.5,0) -- (1.5,2); 75 | 76 | \node [draw, rectangle, ultra thick, minimum width=2cm, minimum height=1cm] at (0,1) {PCI Dev}; 77 | \draw [orange, ultra thick] (0,0.5) -- (0,0); 78 | 79 | 80 | \draw [yellow, ultra thick] (-1,-2) -- (5,-2); 81 | \node at (7.8, -2) {Peripheral I/O bus, \textasciitilde 2,000 MB/s}; 82 | \node at (6.7, -2.5) {(e.g., SATA, USB)}; 83 | 84 | \draw [yellow, ultra thick] (1.5,-2) -- (1.5,0); 85 | 86 | \node [draw, rectangle, ultra thick, minimum width=1cm, minimum height=1cm] at (0,-1) {HDD}; 87 | \draw [yellow, ultra thick] (0,-1.5) -- (0,-2); 88 | 89 | \node [draw, rectangle, ultra thick, minimum width=1cm, minimum height=1cm] at (2.5,-1) {Cam}; 90 | \draw [yellow, ultra thick] (2.5,-1.5) -- (2.5,-2); 91 | 92 | \node [draw, rectangle, ultra thick, minimum width=1cm, minimum height=1cm] at (4,-1) {WiFi}; 93 | \draw [yellow, ultra thick] (4,-1.5) -- (4,-2); 94 | 95 | \end{tikzpicture} 96 | 97 | --- 98 | 99 | # Canonical IO device 100 | 101 | \begin{tikzpicture} 102 | 103 | \node [draw, rectangle, ultra thick, minimum width=6cm, minimum height=4cm] at (0,3) {}; 104 | 
\node [draw, rectangle, ultra thick, minimum width=2cm, minimum height=0.8cm] at (-1.5,4.6) {Status}; 105 | \node [draw, rectangle, ultra thick, minimum width=1cm, minimum height=0.8cm] at (0.5,4.6) {CMD}; 106 | \node [draw, rectangle, ultra thick, minimum width=1cm, minimum height=0.8cm] at (2,4.6) {DTA}; 107 | \node at (5, 4.6) {(Device registers)}; 108 | 109 | \draw [red, ultra thick, dotted] (-2.95,4) -- (2.95,4); 110 | 111 | \node at (0, 3) {Microcontroller (CPU+RAM)}; 112 | \node at (0, 2.3) {Storage (RAM/Flash)}; 113 | \node at (0, 1.6) {Special purpose chips}; 114 | 115 | \node at (5, 3.6) {(Device internals)}; 116 | 117 | \end{tikzpicture} 118 | 119 | * OS communicates based on agreed protocol (through "driver") 120 | * Device signals OS through memory or interrupt 121 | 122 | --- 123 | 124 | # IO data transfer 125 | 126 | \begin{tikzpicture} 127 | 128 | %\draw [decorate,decoration={brace,amplitude=4pt}] (0.5,0.5) -- (2,0.5) node[midway,yshift=1em]{1}; 129 | \draw[<-] (0.5,0.4) -- (0.5,1) node[midway,yshift=1.3em]{1}; 130 | \draw [color=red,line width=0.5mm,decorate,decoration={brace,amplitude=4pt}] (2,0.5) -- (2.5,0.5) node[midway,yshift=1em]{2}; 131 | \draw[<-] (2.5,0.4) -- (2.5,1) node[midway,yshift=1.3em]{3}; 132 | %\draw [decorate,decoration={brace,amplitude=4pt}] (2.5,0.5) -- (4,0.5) node[midway,yshift=1em]{4}; 133 | \draw[<-] (4,0.4) -- (4,1) node[midway,yshift=1.3em]{4}; 134 | 135 | %\draw [cyan, ultra thick] (0,-0.4) rectangle (2,0.4) node[midway] {A}; 136 | \node [draw, rectangle, fill, color=cyan, minimum width=0.5cm, minimum height=0.8cm, text=black] at (0.25,0) {A}; 137 | \node [draw, rectangle, fill, color=green, minimum width=1.5cm, minimum height=0.8cm, text=black] at (1.25,0) {B}; 138 | \node [draw, rectangle, fill, color=cyan, minimum width=0.5cm, minimum height=0.8cm, text=black] at (2.25,0) {A}; 139 | \node [draw, rectangle, fill, color=green, minimum width=1.5cm, minimum height=0.8cm, text=black] at (3.25,0) {B}; 140 | \node [draw, 
rectangle, fill, color=cyan, minimum width=1cm, minimum height=0.8cm, text=black] at (4.5,0) {A}; 141 | \node at (-1,0) {CPU}; 142 | 143 | \node [draw, rectangle, fill, color=teal, minimum width=2cm, minimum height=0.8cm, text=black] at (1,-1) {C}; 144 | \node [draw, rectangle, fill, color=cyan, minimum width=2cm, minimum height=0.8cm, text=black] at (3,-1) {A}; 145 | 146 | \node at (-1,-1) {Disk}; 147 | 148 | \end{tikzpicture} 149 | 150 | * **PIO (Programmed IO)**: CPU tells the device *what* data is 151 | * One instruction for each byte/word 152 | * Efficient for a few bytes, scales terribly 153 | 154 | * **DMA (Direct Memory Access)**: tell device *where* data is 155 | * One instruction to send a pointer 156 | * Efficient for large data transfers 157 | 158 | --- 159 | 160 | # Support for different devices 161 | 162 | * Challenge: different devices have *different protocols* 163 | * Drivers are specialized pieces of code for a particular device 164 | * Low end communicates with the device 165 | * High end exposes generic interface to OS 166 | * Drivers are an example of *encapsulation* 167 | * Different drivers adhere to the same API 168 | * OS only implements support for APIs based on device class 169 | * Requirement: well-designed interface/API 170 | * Trade-off between versatility and over-specialization 171 | * Due to device class complexity, OS ends with layers of APIs 172 | 173 | --- 174 | 175 | # Hard disk 176 | 177 | \begin{tikzpicture} 178 | 179 | \node [draw, circle, ultra thick, minimum width=1cm] at (0,0) {}; 180 | \node [draw, circle, ultra thick, minimum width=2cm] at (0,0) {}; 181 | \node [draw, circle, ultra thick, minimum width=3cm] at (0,0) {}; 182 | \node [draw, circle, ultra thick, minimum width=4cm] at (0,0) {}; 183 | \node [draw, circle, ultra thick, minimum width=5cm] at (0,0) {}; 184 | \node [draw, circle, ultra thick, minimum width=6cm] at (0,0) {}; 185 | 186 | \draw [ultra thick] (-3,0) -- (3,0); 187 | \draw [ultra thick] (0,-3) -- 
(0,3); 188 | \draw [ultra thick, rotate=45] (0,-3) -- (0,3); 189 | \draw [ultra thick, rotate=135] (0,-3) -- (0,3); 190 | 191 | 192 | \node [draw, fill, color=teal, rectangle, ultra thick, minimum width=4cm, minimum height=0.3cm, rotate=-44] at (3.05,0.2) {}; 193 | \node [color=red, thick] at (1.85,1.4) {A}; 194 | 195 | \end{tikzpicture} 196 | 197 | * IO: seek time, rotation time, transfer time 198 | 199 | --- 200 | 201 | # RAID: Redundant Array of Inexpensive Disks 202 | 203 | * Idea: build logical disk from (many) physical disks 204 | * RAID0: Striping (no mirroring or parity) 205 | * n performance, n capacity, 0/n can fail 206 | * RAID1: Data mirroring (no parity or striping) 207 | * n performance, (n-1)/n can fail 208 | * RAID2: bit level striping (historic, sync'd, one parity drive) 209 | * RAID3: byte level striping (historic, sync'd, one parity drive) 210 | * RAID4: block level striping (historic, one drive holds parity) 211 | * RAID5: block level striping, distributed parity 212 | * n performance, n-1 capacity, 1/n can fail 213 | * RAID6: block level striping, distributed parity 214 | * n performance, n-2 capacity, 2/n can fail 215 | * RAID 01: two stripes (RAID0) that are mirrored (RAID1) 216 | * RAID 10: stripe (RAID0) a set of mirrored devices (RAID1) 217 | 218 | --- 219 | 220 | # IO/Driver Summary 221 | 222 | * Overlap IO and computation as much as possible! 223 | * Use interrupts 224 | * Use DMA 225 | * Driver classes provide common interface 226 | * Storage: read/write/seek of blocks 227 | * Minimize random IO (i.e., quick sort is really bad on HDDs) 228 | * Carefully schedule IO on slow devices 229 | * RAID virtualizes disks 230 | 231 | --- 232 | 233 | # Purpose of a file system 234 | 235 | * Given: set of persistent blocks 236 | * Goal: manage these blocks efficiently. How? 237 | 238 | * Provide mechanisms to organize files and their metadata (e.g., owner, permissions, or type) 239 | * Share files (concurrently?) 
among users and processes 240 | * Decide on locking granularity and binding operations 241 | * Semantics of operations like truncating in a shared world 242 | 243 | --- 244 | 245 | # The `file` abstraction 246 | 247 | * A file is a linear persistent array of bytes 248 | * Operations: read or write 249 | * Metaoperations: create, delete, modify permissions/user/... 250 | * Different perspectives 251 | * File name (human readable) 252 | * Inode and device number (persistent ID) 253 | * File descriptor (process view) 254 | * Directory contains subdirectories 255 | * List of directories, files, inode mappings 256 | 257 | --- 258 | 259 | # Different names for different use cases 260 | 261 | * inode/device id (`53135/2`) 262 | * Unique internal name 263 | * Records metadata about the file (size, permissions, owner) 264 | * path (`/foo/bar/baz`) 265 | * Human readable name 266 | * Organizes files in a hierarchical layout 267 | * file descriptor (`5`) 268 | * Process internal view 269 | * Avoids frequent path to inode traversal 270 | * Remembers offset for next read/write 271 | 272 | --- 273 | 274 | # File API 275 | 276 | * `int open(char *path, int flag, mode_t mode)` 277 | * `size_t read(int fd, char *buf, size_t nbyte)` 278 | * `size_t write(int fd, char *buf, size_t nbyte)` 279 | * `int close(int fd)` 280 | 281 | Open translates a string name to an inode. OS allocates a file descriptor that points to that inode and returns the file descriptor table index. 282 | The path is only traversed once, the OS can cache inodes and each process keeps track of its open files. 
283 | 284 | --- 285 | 286 | # Virtual File System 287 | 288 | * File systems (EXT4, NTFS, FAT) use different data structures 289 | * A Virtual File System (VFS) abstracts from the real filesystem 290 | * VFS abstracts the FS as objects with specific operations 291 |     * Superblock (mount): a live filesystem 292 |     * File (open): a file opened by a process ("open file description") 293 |     * Directory entry cache: speeds up path to inode translation 294 |     * Inode (lookup): a filesystem object (e.g., file or directory) 295 | * System call logic (open, seek, ...) maps to VFS operations 296 | * When implementing a new FS, implement the VFS API 297 | * System calls are now independent of FS implementation 298 | 299 | --- 300 | 301 | # Allocating file data 302 | 303 | * Contiguous 304 | * Linked blocks (blocks end with a next pointer) 305 | * File-allocation tables (table that contains block references) 306 | * Indexed (inode contains data pointers) 307 | * Multi-level indexed (tree of pointers) 308 | 309 | For each approach, think about fragmentation, ability to grow/shrink files, sequential access performance, random access performance, and metadata overhead. 310 | 311 | --- 312 | 313 | # Inodes 314 | 315 | * The inode stores all file metadata 316 |     * File type 317 |     * File uid, gid 318 |     * File permissions (for user, group, others) 319 |     * Size 320 |     * Access time 321 |     * Create time 322 |     * Number of links 323 | 324 | --- 325 | 326 | # Consistency 327 | 328 | > Redundant data must be consistent to ensure correctness. 329 | > Otherwise functionality may break.
330 | 331 | * Keeping redundant data consistent is challenging 332 | * Filesystem may perform several writes to redundant blocks 333 | * The sequence of writes is not atomic 334 | * Interrupts due to power loss, kernel bugs, hardware failure 335 | 336 | --- 337 | 338 | # Journaling strategy 339 | 340 | * Never delete (or overwrite) ANY old data until you have received confirmation 341 | that ALL new data is committed 342 | * Add redundancy to fix the problem with redundancy 343 | 344 | ![](./figures/34-journaling.jpg){width=350px} 345 | 346 | --- 347 | 348 | # Persistence summary 349 | 350 | * Drivers and IO allow abstraction and persistence 351 | * Filesystem API: handle interaction with the file system 352 | * Three ways to identify a file 353 | * File names (for humans) 354 | * Inodes and devices (on the disk) 355 | * File descriptors (for a process) 356 | * Filesystem implementation 357 | * Inodes for metadata 358 | * Bitmaps for inodes/data blocks 359 | * Superblock for global metadata 360 | * Crash resistance: filesystem check (FSCK) 361 | * Journaling: keep track of metadata, enforce atomicity 362 | * All modern filesystems use journaling 363 | * FSCK still useful due to bitflips/bugs 364 | -------------------------------------------------------------------------------- /41-testing.md: -------------------------------------------------------------------------------- 1 | --- 2 | subtitle: Testing (Fuzzing/Sanitization) 3 | --- 4 | 5 | # Security 6 | 7 | \begin{tikzpicture} 8 | \draw [orange, ultra thick] (0,0) rectangle (2,4); 9 | \node[text width=3cm, rotate=90] at (1, 2.5) {Virtualization}; 10 | \draw [blue, ultra thick] (3,0) rectangle (5,4); 11 | \node[text width=3cm, rotate=90] at (4, 2.5) {Concurrency}; 12 | \draw [green, ultra thick] (6,0) rectangle (8,4); 13 | \node[text width=3cm, rotate=90] at (7, 2.5) {Persistence}; 14 | 15 | \filldraw[draw=red, fill=red] (0,4.5) -- (8,4.5) -- (4,6) -- (0,4.5) -- cycle; 16 | \node[text width=3cm] at (5, 5) 
{Security}; 17 | 18 | \end{tikzpicture} 19 | 20 | --- 21 | 22 | # Topics covered in this lecture 23 | 24 | * Software testing 25 | * Fuzzing 26 | * Sanitization 27 | 28 | This slide deck covers [chapter 5.3 in SS3P](https://nebelwelt.net/SS3P/softsec.pdf). 29 | 30 | --- 31 | 32 | # Why testing? 33 | 34 | > Testing is the process of ***executing code*** to ***find errors***. 35 | 36 | An error is a deviation between observed behavior and specified behavior, i.e., 37 | a violation of the underlying specification: 38 | 39 | * Functional requirements (features a, b, c) 40 | * Operational requirements (performance, usability) 41 | * Security requirements? 42 | 43 | --- 44 | 45 | # Limitations of testing 46 | 47 | > Testing can only show the presence of bugs, never their absence. (Edsger W. Dijkstra) 48 | 49 | A successful test finds a deviation. Testing is a form of dynamic analysis. 50 | Code is executed; the testing environment observes the behavior of the code, 51 | detecting violations. 52 | 53 | * Key advantage: reproducibility; testing generally gives you the concrete input 54 | for failed test cases. 55 | * Key disadvantage: complete testing of all control-flow/data-flow paths reduces 56 | to the halting problem; in practice, testing is hindered by state explosion. 57 | 58 | --- 59 | 60 | # Forms of testing 61 | 62 | * Manual testing 63 | * Fuzz testing 64 | * Symbolic and concolic testing 65 | 66 | . . . 67 | 68 | We focus on *security* testing or testing to find *security* bugs, i.e., bugs 69 | that are reachable through attacker-controlled inputs. 70 | 71 | Recommended reading: [A Few Billion Lines of Code Later: Using Static Analysis to Find Bugs in the Real World](https://cacm.acm.org/magazines/2010/2/69354-a-few-billion-lines-of-code-later/fulltext) 72 | 73 | --- 74 | 75 | # Coverage as completeness metric 76 | 77 | > Intuition: A software flaw is only detected if the flawed statement is 78 | > executed.
The effectiveness of a test suite therefore depends on how many statements 79 | > are executed. 80 | 81 | --- 82 | 83 | # How to measure code coverage? 84 | 85 | Several approaches exist; all rely on instrumentation: 86 | 87 | * gcov: 88 | * SanitizerCoverage: 89 | 90 | Sampling may reduce collection cost at a slight loss of precision. 91 | 92 | --- 93 | 94 | # Fuzzing 95 | 96 | Fuzz testing (fuzzing) is an automated software testing technique. 97 | Key idea: execute the target program with an input and check if it crashes. 98 | The fuzzing engine automatically generates new inputs based on some criteria: 99 | 100 | * Random mutation 101 | * Leveraging input structure 102 | * Leveraging program structure 103 | 104 | The inputs are then run on the test program and, if it crashes, a crash report 105 | is generated. 106 | 107 | --- 108 | 109 | # Fuzzing effectiveness 110 | 111 | * Fuzzing finds bugs effectively (measured in CVEs, i.e., unique vulnerability identifiers) 112 | * Proactive defense during software development/testing 113 | * Preparing offense, as part of exploit development 114 | 115 | --- 116 | 117 | # Fuzz input generation 118 | 119 | Fuzzers generate new inputs through generation or mutation. 120 | 121 | * ***Generation-based*** input generation produces new input seeds in each round, 122 | independent from each other. 123 | 124 | * ***Mutation-based*** input generation leverages existing inputs and modifies 125 | them based on feedback from previous rounds. 126 | 127 | --- 128 | 129 | # Fuzz input structure awareness 130 | 131 | Programs accept some form of *input/output*. Generally, the input/output is 132 | structured and follows some form of protocol. 133 | 134 | * ***Dumb fuzzing*** is unaware of the underlying structure. 135 | 136 | * ***Smart fuzzing*** is aware of the protocol and modifies the input accordingly. 137 | 138 | Example: a checksum at the end of the input. A dumb fuzzer will likely fail the 139 | checksum.
140 | 141 | --- 142 | 143 | # Fuzz program structure awareness 144 | 145 | Input is processed by the program; based on the *program structure* (observed in past executions), input can be adapted to trigger new conditions. 146 | 147 | * ***White-box*** fuzzing leverages (expensive) semantic program analysis to 148 | mutate input; often does not scale 149 | * ***Grey-box*** fuzzing leverages program instrumentation based on previous inputs; 150 | light runtime cost, scales to large programs 151 | * ***Black-box*** fuzzing is unaware of the program structure; often cannot 152 | explore beyond simple/early functionality 153 | 154 | --- 155 | 156 | # Coverage-guided grey-box fuzzing 157 | 158 | ![](./figures/41-fuzzing.png){ width=400px } 159 | 160 | --- 161 | 162 | # American Fuzzy Lop (++) 163 | 164 | * AFL++ is a well-established fuzzer 165 | * AFL uses grey-box instrumentation to track branch coverage and mutate fuzzing 166 | seeds based on previous branch coverage 167 | * Branch coverage tracks frequency of executed edges between basic blocks 168 | * Global coverage map keeps track of "power of two" changes 169 | * AFL: 170 | * AFL++ 171 | 172 | --- 173 | 174 | # Fuzzer challenges: coverage wall 175 | 176 | * After a certain number of iterations, the fuzzer no longer makes progress 177 | * Hard-to-satisfy checks 178 | * Chains of checks 179 | * Leaps in input changes 180 | 181 | --- 182 | 183 | # Fuzzer challenges: coverage wall 184 | 185 | ![](./figures/41-coveragewall.png){ width=280px } 186 | 187 | --- 188 | 189 | # Fuzzer challenges: coverage wall 190 | 191 | 192 | Bypassing the coverage wall is hard; the following lists some approaches: 193 | 194 | * Better input (seeds) can mitigate the coverage wall 195 | * Fuzz individual components by writing fuzzer stubs (LibFuzzer) 196 | * Better mutation operators (help the fuzzer guide exploration) 197 | * Stateful fuzzing (teach the fuzzer about different program states) 198 | * Grammar-aware fuzzing (teach the fuzzer about the input grammar) 199 | 200 | --- 201
| 202 | # Fault detection 203 | 204 | * How do we detect program faults? 205 | 206 | . . . 207 | 208 | * Test cases detect bugs through 209 | * Assertions (`assert(var != 0x23 && "var has illegal value");`) detect violations 210 | * Segmentation faults 211 | * Division by zero traps 212 | * Uncaught exceptions 213 | * Mitigations triggering termination 214 | * How can you increase the chances of detecting a bug? 215 | 216 | --- 217 | 218 | # Sanitization 219 | 220 | Sanitizers enforce a given policy; they detect bugs earlier and increase the 221 | effectiveness of testing. 222 | Most sanitizers rely on a combination of static analysis, instrumentation, and 223 | dynamic analysis. 224 | 225 | * The program is analyzed during compilation (e.g., to learn properties such as 226 | type graphs or to enable optimizations) 227 | * The program is instrumented, often to record metadata at certain places and to 228 | enforce metadata checks at other places. 229 | * At runtime, the instrumentation constantly verifies that the policy is not 230 | violated. 231 | 232 | What policies are interesting? What metadata do you need? Where would you check? 233 | 234 | --- 235 | 236 | # AddressSanitizer (1/2) 237 | 238 | AddressSanitizer (ASan) detects memory errors. It places red zones around 239 | objects and checks those objects on trigger events. ASan detects the 240 | following types of bugs: 241 | 242 | * Out-of-bounds accesses to heap, stack and globals 243 | * Use-after-free 244 | * Use-after-return (configurable) 245 | * Use-after-scope (configurable) 246 | * Double-free, invalid free 247 | * Memory leaks (experimental) 248 | 249 | Typical slowdown introduced by AddressSanitizer is 2x. 250 | 251 | --- 252 | 253 | # AddressSanitizer (2/2) 254 | 255 | Goal: detect memory safety violations (both spatial and temporal) 256 | 257 | Key idea: allocate redzones (prohibited areas around memory objects) and check 258 | whether each memory access targets a redzone.
259 | 260 | * What kind of metadata would you record? Where? 261 | 262 | * What kind of operations would you instrument? 263 | 264 | * What kind of optimizations could you think of? 265 | 266 | --- 267 | 268 | # ASan Metadata 269 | 270 | Record live objects, guard them by placing redzones around them. 271 | 272 | ASan uses a table that maps each 8-byte word in memory to one byte in the 273 | table. Advantage: simple address calculation (`offset + (address>>3)`); 274 | disadvantage: memory overhead. 275 | 276 | ASan stores accessibility of each word as metadata (i.e., is a given address 277 | accessible or not). 278 | 279 | * An 8-byte aligned word of memory has only 9 states 280 | * First N bytes are accessible, 8-N are not 281 | * 0: all accessible 282 | * 7: first 7 accessible 283 | * 6: first 6 accessible 284 | * ... 285 | * 1: first byte accessible 286 | * -1: no byte accessible 287 | * Trade-off between alignment and encoding (extreme: 128 byte alignment, per 288 | byte) 289 | 290 | 291 | 292 | --- 293 | 294 | # ASan instrumentation: memory access 295 | 296 | ```C 297 | long *addr = getAddr(); 298 | long val; 299 | // shadow_base is the start of the shadow table 300 | char *shadow = (char *)(((uintptr_t)addr >> 3) + shadow_base); 301 | 302 | // 8-byte access (read/write) 303 | if (*shadow) 304 | ReportError(addr); 305 | val = *addr; 306 | 307 | // N-byte access instead: 308 | if (*shadow && *shadow <= (char)(((uintptr_t)addr & 7) + N - 1)) 309 | ``` 310 | 311 | --- 312 | 313 | # ASan instrumentation: asm 314 | 315 | ```ASM 316 | shr $0x3,%rax # shift by 3 317 | mov $0x100000000000,%rcx 318 | or %rax,%rcx # add offset 319 | cmpb $0x0,(%rcx) # load shadow 320 | je .out 321 | ud2 # generate SIGILL 322 | .out: 323 | movq $0x1234,(%rdi) # original store 324 | ``` 325 | 326 | --- 327 | 328 | # ASan instrumentation: stack 329 | 330 | Insert red zones around objects on stack, poison them when entering stack
331 | 332 | ```C 333 | void foo() { 334 | char rz1[32]; // 32-byte aligned 335 | char a[8]; 336 | char rz2[24]; 337 | char rz3[32]; 338 | int *shadow = (&rz1 >> 3) + kOffset; 339 | shadow[0] = 0xffffffff; // poison rz1 340 | shadow[1] = 0xffffff00; // poison rz2 341 | shadow[2] = 0xffffffff; // poison rz3 342 | <------------- CODE -------------> 343 | shadow[0] = shadow[1] = shadow[2] = 0; 344 | } 345 | ``` 346 | 347 | --- 348 | 349 | # ASan instrumentation: globals 350 | 351 | Insert red zone after global object, poison in init. 352 | 353 | ```C 354 | int a; 355 | // translates to 356 | struct { 357 | int original; 358 | char redzone[60]; 359 | } a; // again, 32-byte aligned 360 | ``` 361 | 362 | --- 363 | 364 | # ASan runtime library 365 | 366 | * Initializes shadow map at startup 367 | * Replaces malloc/free to update metadata (and pad allocations with redzones) 368 | * Intercepts special functions such as `memset` 369 | 370 | --- 371 | 372 | # ASan report (1/2) 373 | 374 | ```C++ 375 | int main(int argc, char **argv) { 376 | int *array = new int[100]; 377 | delete [] array; 378 | return array[argc]; // BOOM 379 | } 380 | ``` 381 | 382 | --- 383 | 384 | # ASan report (2/2) 385 | 386 | ![ASan failure report](./figures/41-asan.gif){width=450px} 387 | 388 | --- 389 | 390 | # ASan policy 391 | 392 | * Instrument every single access, check for poison value in shadow table 393 | * Advantage: fast checks 394 | * Disadvantage: large memory overhead (especially on 64 bit), still slow (2x) 395 | 396 | --- 397 | 398 | # LeakSanitizer 399 | 400 | LeakSanitizer detects run-time memory leaks. It can be combined with 401 | AddressSanitizer to get both memory error and leak detection, or used in a 402 | stand-alone mode. 403 | 404 | LSan adds almost no performance overhead until process termination, when the 405 | extra leak detection phase runs. 406 | 407 | --- 408 | 409 | # MemorySanitizer 410 | 411 | MemorySanitizer detects uninitialized reads. 
Memory allocations are tagged and 412 | uninitialized reads are flagged. 413 | 414 | Typical slowdown of MemorySanitizer is 3x. 415 | 416 | Note: do not confuse MemorySanitizer and AddressSanitizer. 417 | 418 | --- 419 | 420 | # UndefinedBehaviorSanitizer 421 | 422 | UndefinedBehaviorSanitizer (UBSan) detects undefined behavior. It instruments 423 | code to trap on typical undefined behavior in C/C++ programs. Detectable errors 424 | are: 425 | 426 | * Misaligned pointers 427 | * Signed integer overflow 428 | * Conversion between floating point types leading to overflow 429 | * Illegal use of NULL pointers 430 | * Illegal pointer arithmetic 431 | * ... 432 | 433 | Slowdown depends on the amount and frequency of checks. This is the only 434 | sanitizer that can be used in production. For production use, a special minimal 435 | runtime library is used with minimal attack surface. 436 | 437 | --- 438 | 439 | # ThreadSanitizer 440 | 441 | ThreadSanitizer detects data races between threads. It instruments writes to 442 | global and heap variables and records which thread wrote the value last, 443 | allowing detection of WAW, RAW, WAR data races. 444 | 445 | Typical slowdown is 5-15x with 5-15x memory overhead. 446 | 447 | --- 448 | 449 | # HexType 450 | 451 | HexType detects type safety violations. It records the true type of allocated 452 | objects and makes all type casts explicit. 453 | 454 | Typical overhead is low (about 0.5%). 455 | 456 | --- 457 | 458 | # Sanitizers 459 | 460 | * AddressSanitizer: 461 | * LeakSanitizer: 462 | * MemorySanitizer: 463 | * UndefinedBehaviorSanitizer: 464 | * ThreadSanitizer: 465 | * HexType: 466 | 467 | Use sanitizers to test your code. More sanitizers are in development.
468 | 469 | --- 470 | 471 | 472 | # Summary and conclusion 473 | 474 | * Software testing finds bugs before an attacker can exploit them 475 | * Manual testing: write test cases to trigger exceptions 476 | * Fuzz testing automates and randomizes testing 477 | * Sanitizers allow early bug detection, not just on exceptions 478 | * AddressSanitizer is the most commonly used sanitizer and enforces 479 | probabilistic memory safety by recording metadata for every allocated object 480 | and checking every memory read/write. 481 | 482 | Don't forget the Moodle quiz! -------------------------------------------------------------------------------- /42-mitigations.md: -------------------------------------------------------------------------------- 1 | --- 2 | subtitle: Mitigations 3 | --- 4 | 5 | # Topics covered in this lecture 6 | 7 | * Data Execution Prevention 8 | * Address Space Layout Randomization 9 | * Stack canaries 10 | * Control-Flow Integrity (CFI) 11 | 12 | This slide deck covers [chapter 6.4 in SS3P](https://nebelwelt.net/SS3P/softsec.pdf). 
13 | 14 | --- 15 | 16 | # Model for Control-Flow Hijack Attacks 17 | 18 | ![](./figures/42-mem_safety_overview.png) 19 | 20 | --- 21 | 22 | # Widely-adopted defense mechanisms 23 | 24 | * Hundreds of defense mechanisms were proposed 25 | * Only a few mitigations were adopted 26 | * Factors that increase chances of adoption: 27 | * Mitigation of the most imminent problem 28 | * (Very) low performance overhead 29 | * Fits into the development cycle 30 | 31 | --- 32 | 33 | # Attack vector: code injection 34 | 35 | * Simplest form of code execution 36 | * Generally consists of two steps: 37 | * Inject `code` somewhere into the process 38 | * Redirect control-flow to injected code 39 | 40 | --- 41 | 42 | # Data Execution Prevention (DEP) 43 | 44 | * No distinction between code and data (e.g., x86, ARM) 45 | * Any data in the process could be interpreted as code (code injection: an 46 | attacker redirects control-flow to a buffer that contains attacker-controlled 47 | data as shellcode) 48 | * ***Defense assumption:*** if an attacker cannot inject code (as data), then a 49 | code execution attack is not possible. 50 | 51 | --- 52 | 53 | # DEP: process layout 54 | 55 | ![](./figures/42-dep.png) 56 | 57 | --- 58 | 59 | # DEP implementation 60 | 61 | * HW extension: add an NX bit (No eXecute) to each page table entry 62 | * Intel calls this per-page bit XD (eXecute Disable) 63 | * AMD calls it Enhanced Virus Protection 64 | * ARM calls it XN (eXecute Never) 65 | * This is an additional bit for every mapped virtual page. If the bit is 66 | set, then data on that page cannot be interpreted as code and the processor 67 | will trap if control flow reaches that page.
68 | 69 | --- 70 | 71 | # DEP summary 72 | 73 | * DEP is now widely enabled by default (whenever hardware support is 74 | available, such as on x86 and ARM) 75 | * Stops all code injection 76 | * Check for DEP with [checksec.sh](https://github.com/slimm609/checksec.sh) 77 | * DEP may be disabled through the gcc/linker flag `-z execstack` 78 | 79 | --- 80 | 81 | # Attacks evolve: from code injection to reuse 82 | 83 | * Did DEP solve all code execution attacks? 84 | 85 | . . . 86 | 87 | * Unfortunately not! But attacks got (much?) harder 88 | * A code injection attack consists of two stages: 89 | a) redirecting control flow 90 | b) to injected code 91 | * DEP prohibits execution of injected code 92 | * DEP does not stop the redirection of control flow 93 | * Attackers can still hijack control flow to ***existing*** code 94 | 95 | --- 96 | 97 | # Code reuse 98 | 99 | * The attacker can overwrite a code pointer (e.g., a function pointer, or a 100 | return pointer on the stack) 101 | * Prepare the right parameters on the stack, reuse a full function (or part of 102 | a function) 103 | 104 | --- 105 | 106 | # From Code Reuse to full ROP 107 | 108 | Instead of targeting a simple function, we can target a gadget 109 | 110 | * Gadgets are a sequence of instructions ending in an indirect control-flow 111 | transfer (e.g., return, indirect call, indirect jump) 112 | * Prepare data and environment so that, e.g., pop instructions load data into 113 | registers 114 | * A gadget invocation frame consists of a sequence of 0 to n data values and a
The gadget uses the data values and transfers 116 | control to the next gadget 117 | 118 | [Link to simple ROP tutorial](https://crypto.stanford.edu/~blynn/rop/) 119 | 120 | --- 121 | 122 | # Address Space Randomization (ASR) 123 | 124 | > The security improvement of ASR depends on (i) the available entropy 125 | > for randomized locations, (ii) the completeness of randomization (i.e., are 126 | > all objects randomized), and (iii) the absence of information leaks. 127 | 128 | . . . 129 | 130 | * Successful control-flow hijack attacks depend on the attacker overwriting 131 | a code pointer with a known alternate target 132 | * ASR changes (randomizes) the process memory layout 133 | * If the attacker does not know where a piece of code (or data) is, then it 134 | cannot be reused in an attack 135 | * Attacker must first ***learn*** or recover the address layout 136 | 137 | --- 138 | 139 | # Candidates for randomization 140 | 141 | * Trade-off between overhead, complexity, and security benefit. 142 | * Randomize start of heap 143 | * Randomize start of stack 144 | * Randomize start of code (PIE for executable, PIC each library) 145 | * Randomize mmap allocated regions 146 | * Randomize individual allocations (malloc) 147 | * Randomize the code itself, e.g., gap between functions, order of 148 | functions, basic blocks, ... 149 | * Randomize members of structs, e.g., padding, order. 150 | 151 | Different forms of fine-grained randomization exist. Software diversity is a 152 | related concept. 153 | 154 | --- 155 | 156 | # Address Space *Layout* Randomization (ASLR) 157 | 158 | > ASLR is a practical form of ASR. 
159 | 160 | * ASLR focuses on blocks of memory 161 | * Heap, stack, code, executable, mmap regions 162 | * ASLR is inherently page-based 163 | 164 | --- 165 | 166 | # ASLR entropy 167 | 168 | * Assume start addresses of all sections are randomized 169 | * Entropy of each section is key to security 170 | * Attacker targets section with lowest entropy 171 | * Early ASLR implementations had low entropy on the stack and no entropy on 172 | x86 for the executable (non-PIE executables) 173 | * Linux (through Exec Shield) uses 19 bits of entropy for the stack (on 16 174 | byte period) and 8 bits of mmap entropy (on 4096 byte period). 175 | 176 | --- 177 | 178 | # ASLR changes to the address space 179 | 180 | ![](./figures/42-aslr.png) 181 | 182 | --- 183 | 184 | # ASLR and DEP changes to the address space 185 | 186 | ![](./figures/42-dep_and_aslr.png) 187 | 188 | --- 189 | 190 | # Stack canaries 191 | 192 | * Attacks relied on a stack-based buffer overflow to inject code 193 | * Memory safety would mitigate this problem but adding full safety checks is 194 | not feasible due to high performance overhead 195 | * Key insight: buffer overflows require pointer arithmetic 196 | * Instead of checking each memory dereference during function execution, 197 | we check the integrity of a variable once 198 | * ***Assumption:*** we only prevent RIP control-flow hijack attacks 199 | * We therefore only need to protect the integrity of the return instruction 200 | pointer 201 | 202 | 203 | --- 204 | 205 | # Stack canaries 206 | 207 | * Place a canary after a potentially vulnerable buffer 208 | * Check the integrity of the canary before the function returns 209 | * The compiler may place all buffers at the end of the stack frame and the 210 | canary just before the first buffer. This way, all non-buffer local variables 211 | are protected as well. 
212 | * Limitation: the stack canary only protects against ***continuous overwrites*** 213 | iff the attacker does ***not know*** the canary 214 | * An alternative is to encrypt the return instruction pointer by xoring it 215 | with a secret 216 | 217 | --- 218 | 219 | # Stack protector: code 220 | 221 | ```C 222 | char unsafe(char *vuln) { 223 | char foo[12]; 224 | strcpy(foo, vuln); 225 | return foo[1]; 226 | } 227 | 228 | int main(int ac, 229 | char* av[]) { 230 | unsafe(argv[0]); 231 | return 0; 232 | } 233 | ``` 234 | 235 | --- 236 | 237 | # Stack protector: assembly 238 | 239 | ```ASM 240 | push %rbp 241 | mov %rsp,%rbp 242 | sub $0x30,%rsp 243 | mov %rdi,-0x28(%rbp) 244 | mov %fs:0x28,%rax ; load secret canary 245 | mov %rax,-0x8(%rbp) ; store canary on stack 246 | xor %eax,%eax ; clear register 247 | mov -0x28(%rbp),%rsi 248 | lea -0x20(%rbp),%rdi 249 | callq 250 | movzbl -0x1f(%rbp),%eax 251 | mov -0x8(%rbp),%rcx ; load canary from stack 252 | xor %fs:0x28,%rcx ; check canary 253 | je 254 | callq <__stack_chk_fail@plt> ; terminate if check failed 255 | out: leaveq, retq 256 | ``` 257 | 258 | --- 259 | 260 | # Other mitigations 261 | 262 | * Fortify source: protect against format string attacks 263 | * Safe exception handling: protect against popping exception frames 264 | 265 | 266 | --- 267 | 268 | # Control-Flow Integrity 269 | 270 | > CFI is a defense mechanism that protects applications against control-flow 271 | > hijack attacks. A successful CFI mechanism ensures that the control-flow of 272 | > the application never leaves the predetermined, valid control-flow that is 273 | > defined at the source code/application level. This means that an attacker 274 | > cannot redirect control-flow to alternate or new locations. 
275 | 276 | ![CFI target restriction](./figures/42-cfi.png){width=8cm} 277 | 278 | --- 279 | 280 | # Basics of a CFI mechanism 281 | 282 | Core idea: restrict the dynamic control flow of the application to the 283 | control-flow graph of the application. 284 | 285 | * Target set construction 286 | * Dynamic enforcement mechanism to execute runtime checks 287 | 288 | --- 289 | 290 | # CFI: target set construction 291 | 292 | How do we infer the control-flow graph (for C/C++ programs)? 293 | A static analysis (on source code or binary) can recover an approximation of the 294 | control-flow graph. Precision of the analysis is crucial! 295 | 296 | * Valid functions 297 | * Arity 298 | * Function prototypes 299 | * Class hierarchy analysis 300 | 301 | --- 302 | 303 | # CFI: target set construction 304 | 305 | * Trade-off between precision and compatibility. 306 | 307 | * A single set of ***valid functions*** is highly compatible with other software 308 | but results in imprecision due to the large set size 309 | 310 | * ***Class hierarchy analysis*** results in small sets but may be incompatible with 311 | other source code and some programmer patterns (e.g., casting to void or not 312 | passing all parameters) 313 | 314 | --- 315 | 316 | # CFI: limitations 317 | 318 | * CFI allows the underlying bug to fire and the memory corruption can be 319 | controlled by the attacker. 
The defense only detects the deviation after the 320 | fact, i.e., when a corrupted pointer is used in the program 321 | * Over-approximation in the static analysis reduces security guarantees 322 | * Some attacks remain possible 323 | * An attacker is free to modify the outcome of any conditional jump (e.g., 324 | `if` clauses depend on unprotected data values) 325 | * An attacker can choose any allowed target at each indirect control-flow 326 | transfer location 327 | * For return instructions: one set of return targets is too broad and even 328 | localized return sets are too broad for most cases 329 | * For indirect calls and jumps, attacks like COOP (Counterfeit Object 330 | Oriented Programming) have shown that full functions serve as gadgets 331 | 332 | --- 333 | 334 | # OS support for mitigation and sanitization 335 | 336 | * Fault or trap signal: a segmentation fault serves as a fast and efficient way 337 | to interrupt and stop execution. Instead of adding `if (x)` to each 338 | instruction, an illegal access quickly and efficiently stops program 339 | execution. 340 | * Virtual address space: the OS controls this important abstraction and during 341 | program instantiation the OS can introduce randomness and diversity to make 342 | exploitation more costly (and requires the attacker to first recover 343 | information). 344 | * Segments: the OS enables thread-local data by repurposing segment registers, 345 | stack canaries are stored in thread-local data 346 | * Virtual address space: not all memory needs to be mapped to physical memory, 347 | enabling shadow data structures as used for sanitization 348 | * Access to new architecture features such as Intel MPK (memory protection 349 | keys), ARM PAC (pointer authentication codes), shadow stacks, ... 
350 | 351 | --- 352 | 353 | # Summary and conclusion 354 | 355 | * Deployed mitigations do not stop all attacks 356 | * ***Data Execution Prevention*** stops code injection attacks, but does not 357 | stop code reuse attacks 358 | * ***Address Space Layout Randomization*** is probabilistic, shuffles memory 359 | space, prone to information leaks 360 | * ***Stack Canaries*** are probabilistic, do not protect against direct 361 | overwrites, prone to information leaks 362 | * ***CFI*** restricts control-flow hijack attacks, does not protect against 363 | data-only attacks 364 | 365 | Don't forget the Moodle quiz! 366 | -------------------------------------------------------------------------------- /43-security.md: -------------------------------------------------------------------------------- 1 | --- 2 | subtitle: Security Summary 3 | --- 4 | 5 | # Security 6 | 7 | \begin{tikzpicture} 8 | \draw [orange, ultra thick] (0,0) rectangle (2,4); 9 | \node[text width=3cm, rotate=90] at (1, 2.5) {Virtualization}; 10 | \draw [blue, ultra thick] (3,0) rectangle (5,4); 11 | \node[text width=3cm, rotate=90] at (4, 2.5) {Concurrency}; 12 | \draw [green, ultra thick] (6,0) rectangle (8,4); 13 | \node[text width=3cm, rotate=90] at (7, 2.5) {Persistence}; 14 | 15 | \filldraw[draw=red, fill=red] (0,4.5) -- (8,4.5) -- (4,6) -- (0,4.5) -- cycle; 16 | \node[text width=3cm] at (5, 5) {Security}; 17 | 18 | \end{tikzpicture} 19 | 20 | --- 21 | 22 | # Two topics: testing and mitigations 23 | 24 | * Developers find as many bugs as possible by testing software 25 | * Fuzzing generates test cases 26 | * Sanitization detects policy violations 27 | * Mitigations detect policy violations at runtime, stop exploits 28 | * OS support both testing and mitigations, making them efficient 29 | 30 | --- 31 | 32 | # Fuzzing 33 | 34 | Fuzz testing (fuzzing) is an automated software testing technique. 
The fuzzing 34 | engine generates inputs based on some criteria: 35 | 36 | * Random mutation 37 | * Leveraging input structure 38 | * Leveraging program structure 39 | 40 | The inputs are then run on the test program and, if it crashes, a crash report 41 | is generated. 42 | 43 | --- 44 | 45 | # Fuzzing dimensions 46 | 47 | Fuzzers can be classified along three dimensions: 48 | 49 | * How are new inputs created? 50 | * Generational fuzzers create input based on a description 51 | * Mutational fuzzers modify an existing input 52 | * Aware of input structure (e.g., the PNG format)? 53 | * Smart fuzzers mutate according to valid format 54 | * Dumb fuzzers mutate randomly 55 | * Aware of program structure (e.g., execution path)? 56 | * White-box fuzzers are aware of full path constraints 57 | * Grey-box fuzzers are aware of coverage 58 | * Black-box fuzzers have no clue 59 | 60 | A mutational dumb grey-box fuzzer uses coverage feedback from program execution 61 | to randomly modify seeds that have resulted in new coverage. 62 | 63 | --- 64 | 65 | # Coverage-guided grey box fuzzing 66 | 67 | ![](./figures/41-fuzzing.png){ width=400px } 68 | 69 | --- 70 | 71 | # Sanitization 72 | 73 | Sanitizers enforce a given policy; they detect bugs earlier and increase the 74 | effectiveness of testing. 75 | 76 | Most sanitizers rely on a combination of static analysis, instrumentation, and 77 | dynamic analysis. 78 | 79 | * The program is analyzed during compilation (e.g., to learn properties such as 80 | type graphs or to enable optimizations) 81 | * The program is instrumented, often to record metadata at certain places and to 82 | enforce metadata checks at other places. 83 | * At runtime, the instrumentation constantly verifies that the policy is not 84 | violated. 85 | 86 | What policies are interesting? What metadata do you need? Where would you check? 87 | 88 | --- 89 | 90 | # AddressSanitizer 91 | 92 | AddressSanitizer (ASan) detects memory errors.
It places red zones (padding
94 | areas) around objects and checks that no memory operation accesses these
95 | padding areas. The tool can detect the following types of bugs:
96 |
97 | * Out-of-bounds accesses to heap, stack and globals
98 | * Use-after-free
99 | * Use-after-return (configurable)
100 | * Use-after-scope (configurable)
101 | * Double-free, invalid free
102 | * Memory leaks (experimental)
103 |
104 | The typical slowdown introduced by AddressSanitizer is about 2x.
105 |
106 | ---
107 |
108 | # Sanitizers
109 |
110 | * AddressSanitizer: finds memory errors
111 | * LeakSanitizer: finds memory leaks (extends ASan)
112 | * MemorySanitizer: finds uninitialized reads
113 | * UndefinedBehaviorSanitizer: finds undefined behavior such as integer overflows
114 | * ThreadSanitizer: finds thread race conditions
115 | * HexType: finds type violations
116 |
117 | ---
118 |
119 | # Testing summary
120 |
121 | * Software testing finds bugs before an attacker can exploit them
122 | * Manual testing: write test cases to trigger exceptions
123 | * Fuzz testing automates and randomizes testing
124 | * Sanitizers allow early bug detection, not just on exceptions
125 | * AddressSanitizer is the most commonly used sanitizer and enforces
126 | probabilistic memory safety by recording metadata for every allocated object
127 | and checking every memory read/write.
128 |
129 | ---
130 |
131 | # Data Execution Prevention (DEP)
132 |
133 | * No distinction between code and data (e.g., x86, ARM)
134 | * Any data in the process could be interpreted as code (code injection: an
135 | attacker redirects control-flow to a buffer that contains attacker-controlled
136 | data as shellcode)
137 | * ***Defense assumption:*** if an attacker cannot inject code (as data), then a
138 | code execution attack is not possible.
139 |
140 | ---
141 |
142 | # Address Space ***Layout*** Randomization (ASLR)
143 |
144 | * Successful control-flow hijack attacks depend on the attacker overwriting
145 | a code pointer with a known alternate target
146 | * ASLR changes (randomizes) the process memory layout
147 | * If the attacker does not know where a piece of code (or data) is, then it
148 | cannot be reused in an attack
149 | * The attacker must first *learn* or recover the address layout
150 | * ASLR focuses on blocks of memory
151 | * Heap, stack, code, executable, mmap regions
152 | * ASLR is inherently page-based
153 |
154 | ---
155 |
156 | # Stack canaries
157 |
158 | * Place a canary after a potentially vulnerable buffer
159 | * Check the integrity of the canary before the function returns
160 | * The compiler may place all buffers at the end of the stack frame and the
161 | canary just before the first buffer. This way, all non-buffer local variables
162 | are protected as well.
163 | * Limitation: the stack canary only protects against ***continuous overwrites***
164 | iff the attacker does ***not know*** the canary
165 | * An alternative is to encrypt the return instruction pointer by XORing it
166 | with a secret
167 |
168 | ---
169 |
170 | # Control-Flow Integrity
171 |
172 | > CFI is a defense mechanism that protects applications against control-flow
173 | > hijack attacks. A successful CFI mechanism ensures that the control-flow of
174 | > the application never leaves the predetermined, valid control-flow that is
175 | > defined at the source code/application level. This means that an attacker
176 | > cannot redirect control-flow to alternate or new locations.
177 | 178 | ![CFI target restriction](./figures/42-cfi.png){width=8cm} 179 | 180 | --- 181 | 182 | # Mitigations Summary 183 | 184 | * Deployed mitigations do not stop all attacks 185 | * Data Execution Prevention stops code injection attacks, but does not 186 | stop code reuse attacks 187 | * Address Space Layout Randomization is probabilistic, shuffles memory space, 188 | prone to information leaks 189 | * Stack Canaries are probabilistic, do not protect against direct 190 | overwrites, prone to information leaks 191 | * CFI restricts control-flow hijack attacks, does not protect against 192 | data-only attacks 193 | -------------------------------------------------------------------------------- /50-os.md: -------------------------------------------------------------------------------- 1 | --- 2 | subtitle: Operating Systems Summary 3 | --- 4 | 5 | # Virtualization 6 | 7 | \begin{tikzpicture} 8 | \draw [fill, orange, ultra thick] (0,0) rectangle (2,4); 9 | \node[text width=3cm, rotate=90] at (1, 2.5) {Virtualization}; 10 | \draw [blue, ultra thick] (3,0) rectangle (5,4); 11 | \node[text width=3cm, rotate=90] at (4, 2.5) {Concurrency}; 12 | \draw [green, ultra thick] (6,0) rectangle (8,4); 13 | \node[text width=3cm, rotate=90] at (7, 2.5) {Persistence}; 14 | 15 | \draw [red, ultra thick] (0,4.5) -- (8,4.5); 16 | \draw [red, ultra thick] (0,4.5) -- (4,6); 17 | \draw [red, ultra thick] (4,6) -- (8,4.5); 18 | \node[text width=3cm] at (5, 5) {Security}; 19 | 20 | \end{tikzpicture} 21 | 22 | --- 23 | 24 | # Virtualization: Summary 25 | 26 | \begin{tikzpicture}[level distance=1.5cm, 27 | level 1/.style={sibling distance=6cm}, 28 | level 2/.style={sibling distance=2.5cm}] 29 | \node {Virtualization} 30 | child {node {CPU} 31 | child {node {Processes}} 32 | child {node {Scheduling}} 33 | } 34 | child {node {Memory} 35 | child {node {Allocation}} 36 | child {node {Segmentation}} 37 | child {node {Paging}} 38 | }; 39 | \end{tikzpicture} 40 | 41 | --- 42 | 43 | # CPU 
Virtualization: Processes and Scheduling
44 |
45 | * Processes are a purely virtual concept
46 | * Separating policies and mechanisms enables modularity
47 | * Schedulers need to optimize for different metrics: utilization, turnaround,
48 | response time, fairness and forward progress
49 | * FIFO: simple, non-preemptive scheduler
50 | * SJF: non-preemptive, prevents process jams
51 | * STCF: preemptive, prevents jams of late processes
52 | * RR: preemptive, great response time, bad turnaround
53 | * MLFQ: preemptive, most realistic
54 | * CFS: fair scheduler by virtualizing time
55 | * Past behavior is a good predictor of future behavior
56 |
57 | ---
58 |
59 | # Memory Virtualization: Segmentation and Paging
60 |
61 | * The OS manages access to constrained resources
62 | * Principle: limited direct execution (bare metal when possible, intercept
63 | when needed)
64 | * ***CPU:*** time sharing between processes (low switching cost)
65 | * ***Memory:*** space sharing (disk I/O is slow, so time sharing is expensive)
66 | * ***Fragmentation:*** space lost due to internal or external padding
67 | * ***Paging:*** MMU fully translates between virtual and physical addresses
68 | * One flat page table (array)
69 | * Multi-level page table
70 | * Pros? Cons? What are size requirements?
71 | * Paging and swapping allow a process to execute with only the working set
72 | resident in memory, remaining pages can be stored on disk
73 |
74 | ---
75 |
76 | # Concurrency
77 |
78 | \begin{tikzpicture}
79 | \draw [orange, ultra thick] (0,0) rectangle (2,4);
80 | \node[text width=3cm, rotate=90] at (1, 2.5) {Virtualization};
81 | \draw [fill, blue, ultra thick] (3,0) rectangle (5,4);
82 | \node[text width=3cm, rotate=90] at (4, 2.5) {Concurrency};
83 | \draw [green, ultra thick] (6,0) rectangle (8,4);
84 | \node[text width=3cm, rotate=90] at (7, 2.5) {Persistence};
85 |
86 | \draw [red, ultra thick] (0,4.5) -- (8,4.5);
87 | \draw [red, ultra thick] (0,4.5) -- (4,6);
88 | \draw [red, ultra thick] (4,6) -- (8,4.5);
89 | \node[text width=3cm] at (5, 5) {Security};
90 |
91 | \end{tikzpicture}
92 |
93 | ---
94 |
95 | # Concurrency topics
96 |
97 | * Abstraction: locks to protect shared data structures
98 | * Mechanism: interrupt-based locks
99 | * Mechanism: atomic hardware locks
100 | * Busy waiting (spin locks) versus wait queues
101 | * Condition variables
102 | * Semaphores
103 | * Signaling through condition variables and semaphores
104 |
105 | ---
106 |
107 | # Difference between parallelism and concurrency
108 |
109 | * ***Parallelism:*** multiple threads (or processes) working on a single task using
110 | multiple CPU cores (i.e., stuff happens at the same physical time)
111 | * ***Concurrency:*** tasks can start, run, and complete in overlapping time periods
112 | (i.e., tasks run at the same virtual time)
113 |
114 | ---
115 |
116 | # Concurrency summary
117 |
118 | * Spin lock, CV, and semaphore synchronize multiple threads/processes
119 | * ***Spin lock:*** atomic access, no ordering, spinning
120 | * ***Condition variable:*** atomic access, queue, OS primitive
121 | * ***Semaphore:*** shared access to critical section with (int) state
122 | * All three primitives are equally powerful
123 | * Each primitive can be used to implement the other two primitives
124 |
* Performance may differ!
125 | * Synchronization is challenging and may introduce different types of
126 | bugs such as atomicity violation, order violation, or deadlocks.
127 |
128 |
129 | ---
130 |
131 | # Persistence
132 |
133 | \begin{tikzpicture}
134 | \draw [orange, ultra thick] (0,0) rectangle (2,4);
135 | \node[text width=3cm, rotate=90] at (1, 2.5) {Virtualization};
136 | \draw [blue, ultra thick] (3,0) rectangle (5,4);
137 | \node[text width=3cm, rotate=90] at (4, 2.5) {Concurrency};
138 | \draw [fill, green, ultra thick] (6,0) rectangle (8,4);
139 | \node[text width=3cm, rotate=90] at (7, 2.5) {Persistence};
140 |
141 | \draw [red, ultra thick] (0,4.5) -- (8,4.5);
142 | \draw [red, ultra thick] (0,4.5) -- (4,6);
143 | \draw [red, ultra thick] (4,6) -- (8,4.5);
144 | \node[text width=3cm] at (5, 5) {Security};
145 |
146 | \end{tikzpicture}
147 |
148 | ---
149 |
150 | # Persistence Topics
151 |
152 | * Device interaction and device drivers
153 | * IO scheduling and hard-drive throughput
154 | * Disk layout
155 | * Disk virtualization (RAID)
156 | * Filesystem API
157 | * Filesystem implementation
158 | * Inodes and devices
159 | * File descriptors
160 | * File names
161 | * Crash resistance
162 | * Journaling
163 |
164 | ---
165 |
166 | # IO/Driver Summary
167 |
168 | * Overlap IO and computation as much as possible!
169 | * Use interrupts 170 | * Use DMA 171 | * Driver classes provide common interface 172 | * Storage: read/write/seek of blocks 173 | * Minimize random IO (i.e., quick sort is really bad on HDDs) 174 | * Carefully schedule IO on slow devices 175 | * RAID virtualizes disks 176 | 177 | --- 178 | 179 | # Filesystem summary 180 | 181 | * Filesystem API: handle interaction with the file system 182 | * Three ways to identify a file 183 | * File names (for humans) 184 | * Inodes and devices (on the disk) 185 | * File descriptors (for a process) 186 | * Filesystem implementation 187 | * Inodes for metadata 188 | * Bitmaps for inodes/data blocks 189 | * Superblock for global metadata 190 | * Crash resistance: filesystem check (FSCK) 191 | * Journaling: keep track of metadata, enforce atomicity 192 | * All modern filesystems use journaling 193 | * FSCK still useful due to bitflips/bugs 194 | 195 | --- 196 | 197 | # Security 198 | 199 | \begin{tikzpicture} 200 | \draw [orange, ultra thick] (0,0) rectangle (2,4); 201 | \node[text width=3cm, rotate=90] at (1, 2.5) {Virtualization}; 202 | \draw [blue, ultra thick] (3,0) rectangle (5,4); 203 | \node[text width=3cm, rotate=90] at (4, 2.5) {Concurrency}; 204 | \draw [green, ultra thick] (6,0) rectangle (8,4); 205 | \node[text width=3cm, rotate=90] at (7, 2.5) {Persistence}; 206 | 207 | \filldraw[draw=red, fill=red] (0,4.5) -- (8,4.5) -- (4,6) -- (0,4.5) -- cycle; 208 | \node[text width=3cm] at (5, 5) {Security}; 209 | 210 | \end{tikzpicture} 211 | 212 | --- 213 | 214 | # Two topics: testing and mitigations 215 | 216 | * Testing helps developers find as many bugs as possible 217 | * Fuzzing generates test cases 218 | * Sanitization detects policy violations 219 | * Mitigations detect policy violations at runtime, stop exploits 220 | 221 | --- 222 | 223 | # Testing summary 224 | 225 | * Software testing finds bugs before an attacker can exploit them 226 | * Manual testing: write test cases to trigger exceptions 227 | * Fuzz testing 
automates and randomizes testing
228 | * Sanitizers allow early bug detection, not just on exceptions
229 | * AddressSanitizer is the most commonly used sanitizer and enforces
230 | probabilistic memory safety by recording metadata for every allocated object
231 | and checking every memory read/write.
232 |
233 | ---
234 |
235 | # Mitigations Summary
236 |
237 | * Deployed mitigations do not stop all attacks
238 | * ***Data Execution Prevention*** stops code injection attacks, but does not
239 | stop code reuse attacks
240 | * ***Address Space Layout Randomization*** is probabilistic, shuffles memory space,
241 | prone to information leaks
242 | * ***Stack Canaries*** are probabilistic, do not protect against direct
243 | overwrites, prone to information leaks
244 | * ***Control-Flow Integrity*** restricts control-flow hijack attacks, does not protect against
245 | data-only attacks
246 |
247 | ---
248 |
249 | # Learning goals
250 |
251 | ![Understanding OSes will help your career!](./figures/50-learn.jpg){width=400px}
252 |
253 | ---
254 |
255 | # Learning goals: class
256 |
257 | * Learn core concepts
258 | * Become aware of design decisions and policies
259 | * Virtualization: CPU and Memory
260 | * Concurrency: performance trade-offs
261 | * Persistence: correctness and recovery
262 | * Security: software testing versus mitigations
263 |
264 | ---
265 |
266 | # Learning goals: labs
267 |
268 | * Lab 0: practice C programming and debugging
269 | * Lab 1: thread scheduling and memory allocation
270 | * Lab 2: concurrency and message passing
271 | * Lab 3: simple file system
272 | * Lab 4: software security testing
273 |
274 | But the main goal was to become better programmers, i.e., using a specification
275 | to implement and test a prototype, then integrate it into the overall system.
276 |
277 | ---
278 |
279 | # Final Exam: Outline
280 |
281 | * Saturday 22.01.2022 from 16h15 to 19h15 (SG0211, SG1)
282 |
283 | . . .
284 |
285 | * Several questions for each topic
286 | * Theory: based on the lectures
287 | * Practice: based on the labs
288 | * 2-3 questions per topic (between 8 and 10 questions)
289 |
290 | . . .
291 |
292 | * Answers will be
293 | * Numbers (e.g., 15 pages)
294 | * Code (merging for buddy allocator)
295 | * Prose (why you don't temporally separate memory)
296 |
297 | ---
298 |
299 | # Prepare for the exam
300 |
301 | * Watch the lectures and read the book chapters
302 | * Solve the exercises
303 | * Solve the practice midterm/final
304 | * Create summaries and indexes
305 | * Ask questions on Moodle
306 |
307 | ---
308 |
309 | # All done?
310 |
311 | ![I hope your class was not like that!](./figures/50-done1.jpg){width=400px}
312 |
313 | ---
314 |
315 | # Feedback
316 |
317 | * Feedback is appreciated, be as detailed as possible
318 | * For good statistics, I need all of you to respond!
319 | * Be open and positive!
320 | * What was great? What can be improved?
321 | * Be as detailed as possible
322 | * How can I improve the class?
323 | * How can I improve the labs?
324 | * Was the workload and distribution reasonable?
325 | * Let me know what else you were missing!
326 |
327 | ---
328 |
329 | # All done?
330 |
331 | ![Keep your curiosity going, the HexHive lab offers fun {BSc|MSc} software/systems security projects.
Talk to us!](./figures/50-done2.jpg){width=400px}
332 |
-------------------------------------------------------------------------------- /Makefile: --------------------------------------------------------------------------------
1 | TITLE="CS323 Operating Systems"
2 | AUTHOR="Mathias Payer and Sanidhya Kashyap"
3 | DATE="EPFL, Fall 2021"
4 |
5 | #SRC=$(shell ls ??-*.md ???-*.md)
6 | SRC=$(shell ls ??-*.md)
7 | DSThtml=$(SRC:%.md=%.html)
8 | DSTpdf=$(SRC:%.md=./pdf/%.pdf)
9 |
10 | .PHONY: all pdf clean
11 |
12 | all: ${DSTpdf}
13 |
14 | pdf: ${DSTpdf}
15 |
16 | ./pdf/%.pdf: %.md
17 | 	pandoc -f markdown+emoji -t beamer -s -o $@ -V theme:Warsaw \
18 | 	--metadata=title:${TITLE} --metadata=author:${AUTHOR} --metadata=date:${DATE} -H preamble.tex $<
19 |
20 | clean:
21 | 	rm -f ${DSTpdf}
22 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Introduction to Operating Systems (CS-323, EPFL)
2 |
3 | This class is a gentle introduction to operating systems concepts at EPFL for undergraduate students in their third year.
4 | The students come with a light background in C programming from a mandatory class on C concepts and an optional C programming project in their second year.
5 | Generally, the C background of the students is rather light and many have not really worked with Linux environments yet.
6 |
7 | The class follows the excellent [Operating Systems: Three Easy Pieces](https://pages.cs.wisc.edu/~remzi/OSTEP/) by Remzi H. Arpaci-Dusseau and Andrea C. Arpaci-Dusseau.
8 | We break the class into four components:
9 |
10 | * Virtualization
11 | * Concurrency
12 | * Persistence
13 | * Security
14 |
15 | While recent editions of the book have added classic systems security with a heavy focus on authentication, access control, and cryptography, we focus on software security concepts.
16 | The software security component enables students to reason about bugs, understand vulnerabilities, and gives them tools to handle security issues.
17 | We feel that this prepares them better for software development, and the security aspects connect different pieces from the earlier pillars in a different light.
18 |
19 |
20 | ## Building the slides
21 |
22 | The slides are written in Markdown and can be translated to PDF through `pandoc`.
23 | We leverage inline TikZ figures in LaTeX environments for visual figures.
24 | If you have the necessary packages installed, you can create the slides through a hearty
25 |
26 | ```
27 | $ make
28 |
29 | ```
30 |
31 | If your LaTeX and pandoc environments are not up to par, you may have to install the following:
32 |
33 | ```
34 | $ sudo apt install pandoc graphviz dot2tex texlive-full
35 |
36 | ```
37 |
38 | Note that `texlive-full` is overkill but storage is almost free nowadays.
39 |
40 |
41 | ## Changelog
42 |
43 | * 2021: Updated slides and handout material (Mathias Payer and Sanidhya Kashyap)
44 | * 2020: Heavily reworked slides in the 2nd year (Mathias Payer)
45 | * 2019: First version of the slides, created from scratch (Mathias Payer)
46 |
47 |
48 | ## References and license
49 |
50 | Feel free to reuse these slides but credit [Mathias Payer](https://nebelwelt.net) when reusing the material.
51 | If you have suggestions on improvements, please let me know before you fire off a huge pull request.
52 | Notes on small improvements and typos are always welcome!
53 |
-------------------------------------------------------------------------------- /demos/11-fork.c: --------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <unistd.h>
4 |
5 | int main(int argc, char* argv[]) {
6 | printf("Hello, I'm PID %d (%d, %s)\n", (int)getpid(), argc, argv[0]);
7 | int pid = fork();
8 | if (pid < 0) exit(-1); // fork failed
9 | if (pid == 0) {
10 | printf("o/ I'm PID %d\n", (int)getpid());
11 | } else {
12 | printf("\\o, my child is PID %d\n", pid);
13 | }
14 | return 0;
15 | }
16 |
-------------------------------------------------------------------------------- /demos/13-heap.c: --------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <stddef.h>
5 |
6 | /**
7 | * Heap area, provided by the OS
8 | */
9 | #define HEAPSIZE 1024*1024
10 | char heap[HEAPSIZE];
11 |
12 | /****************************
13 | * Allocator implementation *
14 | ****************************/
15 |
16 | /**
17 | * Data structure for free memory objects.
18 | * The object size is first so that allocated objects can use the same slot.
19 | */
20 | struct mitem {
21 | size_t size;
22 | struct mitem *next;
23 | };
24 |
25 |
26 | /**
27 | * Global pointer to first free memory object.
28 | */ 29 | struct mitem *first = (struct mitem*)heap; 30 | 31 | 32 | /** 33 | * Initialize heap memory allocator 34 | */ 35 | void init() { 36 | first->next = NULL; 37 | first->size = HEAPSIZE - sizeof(struct mitem); 38 | } 39 | 40 | 41 | /** 42 | * alloc a memory chunk of size bytes 43 | */ 44 | void *xalloc(size_t size) { 45 | struct mitem *ptr = first, *prev = NULL, *new; 46 | if (size == 0) return NULL; 47 | printf("alloc %ld bytes", size); 48 | // round up to nearest 16 bytes (at 15 and round down) 49 | size = (size + sizeof(size_t) + sizeof(struct mitem) - 1) 50 | & (long)(-sizeof(struct mitem)); 51 | printf(" (%ld effectively used)", size); 52 | 53 | // find empty slot or return NULL 54 | while (ptr->size < size) { 55 | if (ptr->next == NULL) { 56 | printf(" not enough memory, bailing.\n"); 57 | return NULL; 58 | } 59 | prev = ptr; 60 | ptr = ptr->next; 61 | } 62 | printf(" at ptr %p\n", (void*)ptr); 63 | 64 | // split slot 65 | if (ptr->size > size) { 66 | new = (struct mitem*)(((char*)(ptr)+size)); 67 | new->next = ptr->next; 68 | new->size = ptr->size - size; 69 | ptr->size = size; 70 | } else { 71 | // perfect fit, just return next pointer 72 | new = ptr->next; 73 | } 74 | 75 | // reconnect list 76 | if (prev == NULL) 77 | first = new; 78 | else 79 | prev->next = new; 80 | 81 | // return adjusted pointer 82 | return (char*)(ptr)+sizeof(size_t); 83 | } 84 | 85 | 86 | /** 87 | * Free an object, read size from LHS. 
88 | */
89 | void xfree(void *vptr) {
90 | // get pointer to size
91 | struct mitem *ptr = (struct mitem*)(((char*)vptr)-sizeof(size_t));
92 | printf("free %p (metadata at %p), %ld\n", vptr, (void*)ptr, ptr->size);
93 | // link back to list
94 | ptr->next = first;
95 | first = ptr;
96 | }
97 |
98 |
99 | /************************************
100 | * Some simple test vectors in main *
101 | ************************************/
102 | int main(int argc __attribute__((unused)), char *argv[] __attribute__((unused))) {
103 | char *ptrs[64] = {0};
104 | init();
105 | // simple test
106 | for (int i = 0; i < 64; i++) {
107 | ptrs[i] = (char*)xalloc(i);
108 | for (int j = 0; j < i; j++)
109 | ptrs[i][j] = 0x41;
110 | }
111 | for (int i = 63; i >= 0; i--) {
112 | xfree(ptrs[i]);
113 | ptrs[i] = 0;
114 | }
115 |
116 | // fuzz a bit
117 | for (int i=0; i<1000000; i++) {
118 | int item = rand()%64;
119 | if (ptrs[item] == NULL) {
120 | int size = rand()%256;
121 | ptrs[item] = xalloc(size);
122 | if (ptrs[item] != NULL && size >= 1) {
123 | ptrs[item][0] = (unsigned char)size;
124 | memset(ptrs[item]+1, 0x41, size-1);
125 | }
126 | } else {
127 | xfree(ptrs[item]);
128 | ptrs[item] = NULL;
129 | }
130 | }
131 |
132 | return 0;
133 | }
134 |
-------------------------------------------------------------------------------- /demos/13-quiz.c: --------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 |
4 | int a = 2;
5 | int called(int b) {
6 | int c = a * b;
7 | printf("a: %d b: %d c: %d\n", a, b, c);
8 | a = 5;
9 | return c;
10 | }
11 |
12 |
13 | int main(int argc __attribute__((unused)), char* argv[] __attribute__((unused))) {
14 | int b = 2, c = 3;
15 | printf("a: %d b: %d c: %d\n", a, b, c);
16 | b = called(c);
17 | printf("a: %d b: %d c: %d\n", a, b, c);
18 | return 0;
19 | }
20 |
-------------------------------------------------------------------------------- /demos/21-race.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <stdbool.h>
4 | #include <pthread.h>
5 | #include <time.h>
6 |
7 | #define ITER 10000000
8 |
9 | struct lock {
10 | /* generic lock variable */
11 | bool taken;
12 | /* special fields for peterson */
13 | bool peterson[2];
14 | unsigned int turn;
15 | /* special field for pthread mutex */
16 | pthread_mutex_t pmutex;
17 | };
18 |
19 |
20 | void nop(struct lock *);
21 | void acquire_sw(struct lock *);
22 | void release_sw(struct lock *);
23 | void acquire_peterson(struct lock *);
24 | void release_peterson(struct lock *);
25 | void acquire_tas(struct lock *);
26 | void release_tas(struct lock *);
27 | void acquire_cas(struct lock *);
28 | void release_cas(struct lock *);
29 | void acquire_pth(struct lock *);
30 | void release_pth(struct lock *);
31 |
32 | unsigned long counter = 0;
33 |
34 | struct lock lock1 = { false, { false, false}, 0, {{0, 0, 0, 0, 0, 0, 0, {0, 0}}} };
35 |
36 | void (*acquire)(struct lock *) = &nop;
37 | void (*release)(struct lock *) = &nop;
38 |
39 | __thread unsigned int tid = -1;
40 |
41 | void *incer(void *ltid) {
42 | tid = (unsigned int)ltid;
43 | for (unsigned long i = 0; i < ITER; ++i) {
44 | acquire(&lock1);
45 | counter = counter + 1;
46 | release(&lock1);
47 | }
48 | // printf("%s is done", (char*)arg);
49 | return NULL;
50 | }
51 |
52 | void launch_test(const char *str) {
53 | pthread_t t1, t2;
54 | clock_t start, end;
55 |
56 | printf("Launching test '%s'\n", str);
57 | counter = 0;
58 |
59 | start = clock();
60 |
61 | /* create worker threads */
62 | if (pthread_create(&t1, NULL, incer, (void*)0) != 0) exit(-1);
63 | if (pthread_create(&t2, NULL, incer, (void*)1) != 0) exit(-1);
64 |
65 | /* work is done, merge threads */
66 | if (pthread_join(t1, NULL) != 0) exit(-1);
67 | if (pthread_join(t2, NULL) != 0) exit(-1);
68 |
69 | end = clock();
70 |
71 | printf("Counter: %lu (expected: %lu), elapsed: %fs\n", counter,
72 | ((unsigned long)ITER)*2,
(double)(end-start)/CLOCKS_PER_SEC); 73 | } 74 | 75 | 76 | /** 77 | * No locking 78 | */ 79 | void nop(struct lock *l __attribute__((unused))) { 80 | } 81 | 82 | 83 | /** 84 | * Software locking, no hardware support 85 | */ 86 | void acquire_sw(struct lock *l) { 87 | while (l->taken); /* spin until we grab the lock */ 88 | l->taken = true; 89 | } 90 | 91 | void release_sw(struct lock *l) { 92 | l->taken = false; 93 | } 94 | 95 | 96 | /** 97 | * Peterson's locking mechanism, assumes atomic hardware instr and no caching 98 | */ 99 | void acquire_peterson(struct lock *l) { 100 | l->peterson[tid] = true; 101 | l->turn = 1-tid; 102 | while (l->peterson[1-tid] && l->turn == 1-tid); /* wait */ 103 | } 104 | 105 | void release_peterson(struct lock *l) { 106 | l->peterson[tid] = false; 107 | } 108 | 109 | 110 | /** 111 | * Test and Set 112 | */ 113 | bool tas(bool *addr, bool val) { 114 | /* This code is equivalent to the intrinsic: 115 | bool old; 116 | __asm__ volatile("lock; xchgb %0, %1" : 117 | "+m" (*addr), "=a" (old) : 118 | "1" (val) : "cc"); 119 | return old; */ 120 | return __sync_lock_test_and_set(addr, val); 121 | } 122 | 123 | void acquire_tas(struct lock *l) { 124 | while (tas(&(l->taken), true) == true); /* spin */ 125 | } 126 | 127 | void release_tas(struct lock *l) { 128 | l->taken = false; 129 | } 130 | 131 | 132 | /** 133 | * Compare and Swap 134 | */ 135 | void acquire_cas(struct lock *l) { 136 | /* instruction on x86: lock; cmpxchgb */ 137 | while (__sync_bool_compare_and_swap(&(l->taken), false, true) == false); /* spin */ 138 | } 139 | 140 | void release_cas(struct lock *l) { 141 | l->taken = false; 142 | } 143 | 144 | 145 | /** 146 | * pthread mutex 147 | */ 148 | void acquire_pth(struct lock *l) { 149 | pthread_mutex_lock(&(l->pmutex)); 150 | } 151 | 152 | void release_pth(struct lock *l) { 153 | pthread_mutex_unlock(&(l->pmutex)); 154 | 155 | } 156 | 157 | 158 | /** 159 | * Simple main that launches tests for each 160 | * type of locking mechanism. 
161 | */
162 | int main(int argc __attribute__((unused)),
163 | char* argv[] __attribute__((unused))) {
164 |
165 | launch_test("No locking");
166 |
167 | acquire = &acquire_sw;
168 | release = &release_sw;
169 | launch_test("SW only, no HW support");
170 |
171 | acquire = &acquire_peterson;
172 | release = &release_peterson;
173 | launch_test("Peterson's locking");
174 |
175 | acquire = &acquire_tas;
176 | release = &release_tas;
177 | launch_test("Test and Set");
178 |
179 | acquire = &acquire_cas;
180 | release = &release_cas;
181 | launch_test("Compare and Swap");
182 |
183 | /* prepare mutex */
184 | if (pthread_mutex_init(&(lock1.pmutex), NULL) != 0) exit(-1);
185 | acquire = &acquire_pth;
186 | release = &release_pth;
187 | launch_test("pthread mutex");
188 |
189 | return 0;
190 | }
191 |
192 |
-------------------------------------------------------------------------------- /demos/21-race0no.c: --------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <stdbool.h>
4 | #include <pthread.h>
5 | #include <time.h>
6 |
7 | #define ITER 10000000
8 |
9 | unsigned long counter = 0;
10 |
11 | __thread unsigned int tid = -1;
12 |
13 | void *incer(void *ltid) {
14 | tid = (unsigned int)ltid;
15 | for (unsigned long i = 0; i < ITER; ++i) {
16 | counter = counter + 1;
17 | }
18 | return NULL;
19 | }
20 |
21 |
22 | /**
23 | * Simple main that launches tests for each
24 | * type of locking mechanism.
25 | */
26 | int main(int argc __attribute__((unused)),
27 | char* argv[] __attribute__((unused))) {
28 |
29 | pthread_t t1, t2;
30 | clock_t start, end;
31 |
32 | printf("Launching test 'no locking'\n");
33 | counter = 0;
34 |
35 | start = clock();
36 |
37 | /* create worker threads */
38 | if (pthread_create(&t1, NULL, incer, (void*)0) != 0) exit(-1);
39 | if (pthread_create(&t2, NULL, incer, (void*)1) != 0) exit(-1);
40 |
41 | /* work is done, merge threads */
42 | if (pthread_join(t1, NULL) != 0) exit(-1);
43 | if (pthread_join(t2, NULL) != 0) exit(-1);
44 |
45 | end = clock();
46 |
47 | printf("Counter: %lu (expected: %lu), elapsed: %fs\n", counter,
48 | ((unsigned long)ITER)*2, (double)(end-start)/CLOCKS_PER_SEC);
49 |
50 | return 0;
51 | }
52 |
-------------------------------------------------------------------------------- /demos/21-race1sw.c: --------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <stdbool.h>
4 | #include <pthread.h>
5 | #include <time.h>
6 |
7 | #define ITER 10000000
8 |
9 | struct lock {
10 | /* generic lock variable */
11 | bool taken;
12 | };
13 |
14 |
15 | void acquire_sw(struct lock *);
16 | void release_sw(struct lock *);
17 |
18 | unsigned long counter = 0;
19 |
20 | __thread unsigned int tid = -1;
21 |
22 | struct lock lock1 = { false };
23 |
24 | void *incer(void *ltid) {
25 | tid = (unsigned int)ltid;
26 | for (unsigned long i = 0; i < ITER; ++i) {
27 | acquire_sw(&lock1);
28 | counter = counter + 1;
29 | release_sw(&lock1);
30 | }
31 | // printf("%s is done", (char*)arg);
32 | return NULL;
33 | }
34 |
35 |
36 | /**
37 | * Software locking, no hardware support
38 | */
39 | void acquire_sw(struct lock *l) {
40 | while (l->taken); /* spin until we grab the lock */
41 | l->taken = true;
42 | }
43 |
44 | void release_sw(struct lock *l) {
45 | l->taken = false;
46 | }
47 |
48 |
49 | /**
50 | * Simple main that launches tests for each
51 | * type of locking mechanism.
52 | */
53 | int main(int argc __attribute__((unused)),
54 | char* argv[] __attribute__((unused))) {
55 |
56 | pthread_t t1, t2;
57 | clock_t start, end;
58 |
59 | printf("Launching test 'SW locking'\n");
60 | counter = 0;
61 |
62 | start = clock();
63 |
64 | /* create worker threads */
65 | if (pthread_create(&t1, NULL, incer, (void*)0) != 0) exit(-1);
66 | if (pthread_create(&t2, NULL, incer, (void*)1) != 0) exit(-1);
67 |
68 | /* work is done, merge threads */
69 | if (pthread_join(t1, NULL) != 0) exit(-1);
70 | if (pthread_join(t2, NULL) != 0) exit(-1);
71 |
72 | end = clock();
73 |
74 | printf("Counter: %lu (expected: %lu), elapsed: %fs\n", counter,
75 | ((unsigned long)ITER)*2, (double)(end-start)/CLOCKS_PER_SEC);
76 |
77 | return 0;
78 | }
79 |
80 |
-------------------------------------------------------------------------------- /demos/21-race2gp.c: --------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <stdbool.h>
4 | #include <pthread.h>
5 | #include <time.h>
6 |
7 | #define ITER 10000000
8 |
9 | struct lock {
10 | /* generic lock variable */
11 | bool taken;
12 | /* special fields for peterson */
13 | bool peterson[2];
14 | unsigned int turn;
15 | };
16 |
17 | void acquire_peterson(struct lock *);
18 | void release_peterson(struct lock *);
19 |
20 | unsigned long counter = 0;
21 |
22 | __thread unsigned int tid = -1;
23 |
24 | struct lock lock1 = { false, {false, false}, 0 };
25 |
26 | void *incer(void *ltid) {
27 | tid = (unsigned int)ltid;
28 | for (unsigned long i = 0; i < ITER; ++i) {
29 | acquire_peterson(&lock1);
30 | counter = counter + 1;
31 | release_peterson(&lock1);
32 | }
33 | return NULL;
34 | }
35 |
36 |
37 | /**
38 | * Peterson's locking mechanism, assumes atomic hardware instr and no caching
39 | */
40 | void acquire_peterson(struct lock *l) {
41 | l->peterson[tid] = true;
42 | l->turn = 1-tid;
43 | while (l->peterson[1-tid] && l->turn == 1-tid); /* wait */
44 | }
45 |
46 | void release_peterson(struct lock
*l) { 47 | l->peterson[tid] = false; 48 | } 49 | 50 | /** 51 | * Simple main that launches tests for each 52 | * type of locking mechanism. 53 | */ 54 | int main(int argc __attribute__((unused)), 55 | char* argv[] __attribute__((unused))) { 56 | 57 | pthread_t t1, t2; 58 | clock_t start, end; 59 | 60 | printf("Launching test 'Gary Peterson locking'\n"); 61 | counter = 0; 62 | 63 | start = clock(); 64 | 65 | /* create worker threads */ 66 | if (pthread_create(&t1, NULL, incer, (void*)0) != 0) exit(-1); 67 | if (pthread_create(&t2, NULL, incer, (void*)1) != 0) exit(-1); 68 | 69 | /* work is done, merge threads */ 70 | if (pthread_join(t1, NULL) != 0) exit(-1); 71 | if (pthread_join(t2, NULL) != 0) exit(-1); 72 | 73 | end = clock(); 74 | 75 | printf("Counter: %lu (expected: %lu), elapsed: %fs\n", counter, 76 | ((unsigned long)ITER)*2, (double)(end-start)/CLOCKS_PER_SEC); 77 | 78 | return 0; 79 | } 80 | 81 | -------------------------------------------------------------------------------- /demos/21-race3tas.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <stdbool.h> 4 | #include <pthread.h> 5 | #include <time.h> 6 | 7 | #define ITER 10000000 8 | 9 | struct lock { 10 | /* generic lock variable */ 11 | bool taken; 12 | }; 13 | 14 | void acquire_tas(struct lock *); 15 | void release_tas(struct lock *); 16 | 17 | unsigned long counter = 0; 18 | 19 | __thread unsigned int tid = -1; 20 | 21 | struct lock lock1 = { false }; 22 | 23 | void *incer(void *ltid) { 24 | tid = (unsigned int)ltid; 25 | for (unsigned long i = 0; i < ITER; ++i) { 26 | acquire_tas(&lock1); 27 | counter = counter + 1; 28 | release_tas(&lock1); 29 | } 30 | return NULL; 31 | } 32 | 33 | 34 | /** 35 | * Test and Set 36 | */ 37 | bool tas(bool *addr, bool val) { 38 | /* This code is equivalent to the intrinsic: 39 | bool old; 40 | __asm__ volatile("lock; xchgb %0, %1" : 41 | "+m" (*addr), "=a" (old) : 42 | "1" (val) : "cc"); 43 | return old; */ 44 | return
__sync_lock_test_and_set(addr, val); 45 | } 46 | 47 | void acquire_tas(struct lock *l) { 48 | while (tas(&(l->taken), true) == true); /* spin */ 49 | } 50 | 51 | void release_tas(struct lock *l) { 52 | l->taken = false; 53 | } 54 | 55 | /** 56 | * Simple main that launches tests for each 57 | * type of locking mechanism. 58 | */ 59 | int main(int argc __attribute__((unused)), 60 | char* argv[] __attribute__((unused))) { 61 | 62 | pthread_t t1, t2; 63 | clock_t start, end; 64 | 65 | printf("Launching test 'Test-and-Set locking'\n"); 66 | counter = 0; 67 | 68 | start = clock(); 69 | 70 | /* create worker threads */ 71 | if (pthread_create(&t1, NULL, incer, (void*)0) != 0) exit(-1); 72 | if (pthread_create(&t2, NULL, incer, (void*)1) != 0) exit(-1); 73 | 74 | /* work is done, merge threads */ 75 | if (pthread_join(t1, NULL) != 0) exit(-1); 76 | if (pthread_join(t2, NULL) != 0) exit(-1); 77 | 78 | end = clock(); 79 | 80 | printf("Counter: %lu (expected: %lu), elapsed: %fs\n", counter, 81 | ((unsigned long)ITER)*2, (double)(end-start)/CLOCKS_PER_SEC); 82 | 83 | return 0; 84 | } 85 | 86 | -------------------------------------------------------------------------------- /demos/21-race4cas.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <stdbool.h> 4 | #include <pthread.h> 5 | #include <time.h> 6 | 7 | #define ITER 10000000 8 | 9 | struct lock { 10 | /* generic lock variable */ 11 | bool taken; 12 | }; 13 | 14 | void acquire_cas(struct lock *); 15 | void release_cas(struct lock *); 16 | 17 | unsigned long counter = 0; 18 | 19 | __thread unsigned int tid = -1; 20 | 21 | struct lock lock1 = { false }; 22 | 23 | void *incer(void *ltid) { 24 | tid = (unsigned int)ltid; 25 | for (unsigned long i = 0; i < ITER; ++i) { 26 | acquire_cas(&lock1); 27 | counter = counter + 1; 28 | release_cas(&lock1); 29 | } 30 | return NULL; 31 | } 32 | 33 | 34 | /** 35 | * Compare and Swap 36 | */ 37 | void acquire_cas(struct lock *l) { 38 | /* instruction
on x86: lock; cmpxchgb */ 39 | while (__sync_bool_compare_and_swap(&(l->taken), false, true) == false); /* spin */ 40 | } 41 | 42 | void release_cas(struct lock *l) { 43 | l->taken = false; 44 | } 45 | 46 | 47 | /** 48 | * Simple main that launches tests for each 49 | * type of locking mechanism. 50 | */ 51 | int main(int argc __attribute__((unused)), 52 | char* argv[] __attribute__((unused))) { 53 | 54 | pthread_t t1, t2; 55 | clock_t start, end; 56 | 57 | printf("Launching test 'Compare-and-Swap locking'\n"); 58 | counter = 0; 59 | 60 | start = clock(); 61 | 62 | /* create worker threads */ 63 | if (pthread_create(&t1, NULL, incer, (void*)0) != 0) exit(-1); 64 | if (pthread_create(&t2, NULL, incer, (void*)1) != 0) exit(-1); 65 | 66 | /* work is done, merge threads */ 67 | if (pthread_join(t1, NULL) != 0) exit(-1); 68 | if (pthread_join(t2, NULL) != 0) exit(-1); 69 | 70 | end = clock(); 71 | 72 | printf("Counter: %lu (expected: %lu), elapsed: %fs\n", counter, 73 | ((unsigned long)ITER)*2, (double)(end-start)/CLOCKS_PER_SEC); 74 | 75 | return 0; 76 | } 77 | 78 | -------------------------------------------------------------------------------- /demos/21-race5pth.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <stdbool.h> 4 | #include <pthread.h> 5 | #include <time.h> 6 | 7 | #define ITER 10000000 8 | 9 | struct lock { 10 | /* special field for pthread mutex */ 11 | pthread_mutex_t pmutex; 12 | }; 13 | 14 | void acquire_pth(struct lock *); 15 | void release_pth(struct lock *); 16 | 17 | unsigned long counter = 0; 18 | 19 | __thread unsigned int tid = -1; 20 | 21 | struct lock lock1 = { PTHREAD_MUTEX_INITIALIZER }; 22 | 23 | void *incer(void *ltid) { 24 | tid = (unsigned int)ltid; 25 | for (unsigned long i = 0; i < ITER; ++i) { 26 | acquire_pth(&lock1); 27 | counter = counter + 1; 28 | release_pth(&lock1); 29 | } 30 | return NULL; 31 | } 32 | 33 | 34 | /** 35 | * pthread mutex 36 | */ 37 | void acquire_pth(struct lock *l)
{ 38 | pthread_mutex_lock(&(l->pmutex)); 39 | } 40 | 41 | void release_pth(struct lock *l) { 42 | pthread_mutex_unlock(&(l->pmutex)); 43 | } 44 | 45 | 46 | /** 47 | * Simple main that launches tests for each 48 | * type of locking mechanism. 49 | */ 50 | int main(int argc __attribute__((unused)), 51 | char* argv[] __attribute__((unused))) { 52 | 53 | pthread_t t1, t2; 54 | clock_t start, end; 55 | 56 | printf("Launching test 'pthread mutex locking'\n"); 57 | counter = 0; 58 | 59 | start = clock(); 60 | 61 | /* create worker threads */ 62 | if (pthread_create(&t1, NULL, incer, (void*)0) != 0) exit(-1); 63 | if (pthread_create(&t2, NULL, incer, (void*)1) != 0) exit(-1); 64 | 65 | /* work is done, merge threads */ 66 | if (pthread_join(t1, NULL) != 0) exit(-1); 67 | if (pthread_join(t2, NULL) != 0) exit(-1); 68 | 69 | end = clock(); 70 | 71 | printf("Counter: %lu (expected: %lu), elapsed: %fs\n", counter, 72 | ((unsigned long)ITER)*2, (double)(end-start)/CLOCKS_PER_SEC); 73 | 74 | return 0; 75 | } 76 | -------------------------------------------------------------------------------- /demos/22-producer.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <stdbool.h> 4 | #include <string.h> 5 | #include <pthread.h> 6 | #include <semaphore.h> 7 | #include <unistd.h> 8 | 9 | /** 10 | * Example of 2 producers / 1 consumer 11 | */ 12 | 13 | #define BUFSIZE 2 14 | #define NUMITEMS 8 15 | 16 | void put(unsigned int); 17 | unsigned int get(); 18 | 19 | unsigned int buffer[BUFSIZE] = { 0 }; 20 | 21 | /* semaphores handling *concurrent* access to buffer */ 22 | sem_t csem, psem; 23 | 24 | /* mutex handling mutual exclusive access to ppos */ 25 | pthread_mutex_t pmutex = PTHREAD_MUTEX_INITIALIZER; 26 | 27 | unsigned int cpos = 0, ppos = 0; 28 | 29 | void *producer(void *arg) { 30 | unsigned int max = (unsigned int)arg; 31 | for (unsigned int i = 0; i < max; i++) { 32 | printf("Produced %d\n", i); 33 | put(i); 34 | } 35 | return NULL; 36 | } 37 | 38 | void
*consumer(void *arg) { 39 | unsigned int max = (unsigned int)arg; 40 | for (unsigned int i = 0; i < max; i++) { 41 | printf("Consumed %d\n", get()); 42 | } 43 | return NULL; 44 | } 45 | 46 | void put(unsigned int val) { 47 | unsigned int mypos; 48 | 49 | /* we wait until there is buffer space available */ 50 | sem_wait(&psem); 51 | 52 | /* ppos is shared between all producers */ 53 | pthread_mutex_lock(&pmutex); 54 | mypos = ppos; 55 | ppos = (ppos + 1) % BUFSIZE; 56 | 57 | /* store information in buffer */ 58 | buffer[mypos] = val; 59 | pthread_mutex_unlock(&pmutex); 60 | sem_post(&csem); 61 | } 62 | 63 | unsigned int get() { 64 | sem_wait(&csem); 65 | unsigned int val = buffer[cpos]; 66 | cpos = (cpos + 1) % BUFSIZE; 67 | sem_post(&psem); 68 | return val; 69 | } 70 | 71 | int main(int argc, char *argv[]) { 72 | printf("main thread (%s, %d)\n", argv[0], argc); 73 | pthread_t p1, p2, c; 74 | 75 | // BUFSIZE items are available for producer to create 76 | sem_init(&psem, 0, BUFSIZE); 77 | // 0 items are available for consumer 78 | sem_init(&csem, 0, 0); 79 | 80 | pthread_create(&p1, NULL, &producer, (void*)NUMITEMS); 81 | pthread_create(&p2, NULL, &producer, (void*)NUMITEMS); 82 | pthread_create(&c, NULL, &consumer, (void*)(2*NUMITEMS)); 83 | 84 | pthread_join(p1, NULL); 85 | pthread_join(p2, NULL); 86 | pthread_join(c, NULL); 87 | printf("main ends\n"); 88 | return 0; 89 | } 90 | -------------------------------------------------------------------------------- /demos/22-semaphore.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <stdbool.h> 4 | #include <string.h> 5 | #include <pthread.h> 6 | #include <unistd.h> 7 | 8 | /** 9 | * Same example as 22-producer.c but this time we 10 | * implement the semaphore ourselves!
11 | */ 12 | 13 | typedef struct { 14 | int value; 15 | pthread_mutex_t lock; 16 | pthread_cond_t cond; 17 | } sem_t; 18 | 19 | void sem_init(sem_t *s, int val) { 20 | s->value = val; 21 | pthread_mutex_init(&(s->lock), NULL); 22 | pthread_cond_init(&(s->cond), NULL); 23 | } 24 | 25 | void sem_wait(sem_t *s) { 26 | pthread_mutex_lock(&(s->lock)); 27 | while (s->value <= 0) 28 | pthread_cond_wait(&(s->cond), &(s->lock)); 29 | s->value--; 30 | pthread_mutex_unlock(&(s->lock)); 31 | } 32 | 33 | void sem_post(sem_t *s) { 34 | pthread_mutex_lock(&(s->lock)); 35 | s->value++; 36 | pthread_cond_signal(&(s->cond)); 37 | pthread_mutex_unlock(&(s->lock)); 38 | } 39 | 40 | #define BUFSIZE 2 41 | #define NUMITEMS 8 42 | 43 | void put(unsigned int); 44 | unsigned int get(); 45 | 46 | unsigned int buffer[BUFSIZE] = { 0 }; 47 | 48 | /* semaphores handling *concurrent* access to buffer */ 49 | sem_t csem, psem; 50 | 51 | /* mutex handling mutual exclusive access to ppos */ 52 | pthread_mutex_t pmutex = PTHREAD_MUTEX_INITIALIZER; 53 | 54 | unsigned int cpos = 0, ppos = 0; 55 | 56 | void *producer(void *arg) { 57 | unsigned int max = (unsigned int)arg; 58 | for (unsigned int i = 0; i < max; i++) { 59 | printf("Produced %d\n", i); 60 | put(i); 61 | } 62 | return NULL; 63 | } 64 | 65 | void *consumer(void *arg) { 66 | unsigned int max = (unsigned int)arg; 67 | for (unsigned int i = 0; i < max; i++) { 68 | printf("Consumed %d\n", get()); 69 | } 70 | return NULL; 71 | } 72 | 73 | void put(unsigned int val) { 74 | unsigned int mypos; 75 | 76 | /* we wait until there is buffer space available */ 77 | sem_wait(&psem); 78 | 79 | /* ppos is shared between all producers */ 80 | pthread_mutex_lock(&pmutex); 81 | mypos = ppos; 82 | ppos = (ppos + 1) % BUFSIZE; 83 | 84 | /* store information in buffer */ 85 | buffer[mypos] = val; 86 | pthread_mutex_unlock(&pmutex); 87 | sem_post(&csem); 88 | } 89 | 90 | unsigned int get() { 91 | sem_wait(&csem); 92 | unsigned int val = buffer[cpos]; 93 | cpos
= (cpos + 1) % BUFSIZE; 94 | sem_post(&psem); 95 | return val; 96 | } 97 | 98 | int main(int argc, char *argv[]) { 99 | printf("main thread (%s, %d)\n", argv[0], argc); 100 | pthread_t p1, p2, c; 101 | 102 | // BUFSIZE items are available for producer to create 103 | sem_init(&psem, BUFSIZE); 104 | // 0 items are available for consumer 105 | sem_init(&csem, 0); 106 | 107 | pthread_create(&p1, NULL, &producer, (void*)NUMITEMS); 108 | pthread_create(&p2, NULL, &producer, (void*)NUMITEMS); 109 | pthread_create(&c, NULL, &consumer, (void*)(2*NUMITEMS)); 110 | 111 | pthread_join(p1, NULL); 112 | pthread_join(p2, NULL); 113 | pthread_join(c, NULL); 114 | printf("main ends\n"); 115 | return 0; 116 | } 117 | -------------------------------------------------------------------------------- /demos/22-thread_exit.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <stdbool.h> 4 | #include <string.h> 5 | #include <pthread.h> 6 | #include <unistd.h> 7 | 8 | bool done = false; 9 | pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; 10 | pthread_cond_t c = PTHREAD_COND_INITIALIZER; 11 | 12 | void thr_exit() { 13 | pthread_mutex_lock(&m); 14 | done = true; 15 | pthread_cond_signal(&c); 16 | pthread_mutex_unlock(&m); 17 | } 18 | 19 | void thr_join() { 20 | pthread_mutex_lock(&m); 21 | while (!done) 22 | pthread_cond_wait(&c, &m); 23 | pthread_mutex_unlock(&m); 24 | } 25 | 26 | void *runner(void *arg) { 27 | printf("child: %s\n", (char*)arg); 28 | sleep(1); 29 | thr_exit(); 30 | return NULL; 31 | } 32 | 33 | int main(int argc, char *argv[]) { 34 | printf("main thread (%s, %d)\n", argv[0], argc); 35 | pthread_t p; 36 | pthread_create(&p, NULL, &runner, "1"); 37 | thr_join(); 38 | printf("main ends\n"); 39 | return 0; 40 | } 41 | -------------------------------------------------------------------------------- /demos/22-workers.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <stdbool.h> 4 | #include <string.h> 5 |
#include <pthread.h> 6 | #include <unistd.h> 7 | #include <time.h> 8 | 9 | #define BUFSIZE 4 10 | #define NRITEMS 5000 11 | 12 | pthread_mutex_t box[BUFSIZE] = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER }; 13 | int buf[BUFSIZE] = { 0 }; 14 | 15 | void *worker(void *arg) { 16 | int tid = (int)(long)arg; 17 | int nritems = NRITEMS; 18 | printf("Worker %d starting, need to pass on %d items.\n", tid, nritems); 19 | pthread_mutex_t *l1, *l2; 20 | l1 = &(box[tid]); 21 | l2 = &(box[(tid+1) % BUFSIZE]); 22 | if (tid == BUFSIZE - 1) { 23 | pthread_mutex_t *tmp = l2; 24 | l2 = l1; 25 | l1 = tmp; 26 | } 27 | while (nritems != 0) { 28 | pthread_mutex_lock(l1); 29 | pthread_mutex_lock(l2); 30 | if (buf[tid] > 0) { 31 | buf[(tid+1) % BUFSIZE]++; 32 | buf[tid]--; 33 | nritems--; 34 | } 35 | pthread_mutex_unlock(l2); 36 | pthread_mutex_unlock(l1); 37 | } 38 | return NULL; 39 | } 40 | 41 | int main(int argc, char *argv[]) { 42 | printf("main thread (%s, %d)\n", argv[0], argc); 43 | pthread_t p[BUFSIZE]; 44 | 45 | for (int i = 0; i < BUFSIZE; i++) 46 | pthread_create(&p[i], NULL, &worker, (void*)(long)i); 47 | buf[0] = 25; 48 | 49 | for (int i = 0; i < BUFSIZE; i++) 50 | pthread_join(p[i], NULL); 51 | for (int i = 0; i < BUFSIZE; i++) 52 | printf("%d ", buf[i]); 53 | printf("\nmain ends\n"); 54 | return 0; 55 | } 56 | -------------------------------------------------------------------------------- /demos/32-tempfile.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <unistd.h> 5 | #include <fcntl.h> 6 | #include <sys/types.h> 7 | #include <sys/stat.h> 8 | 9 | int main(int argc __attribute__((unused)), 10 | char* argv[] __attribute__((unused))) { 11 | int fd = open("test", O_CREAT | O_RDWR, 0600); 12 | //unlink(FILENAME); 13 | char *data = "test"; 14 | char rdata[64] = { 0 }; 15 | write(fd, data, strlen(data)); 16 | sleep(10); 17 | lseek(fd, 0, SEEK_SET); 18 | read(fd, rdata, 63); 19 | rdata[63] = 0; 20 | printf("We read '%s'\n", rdata); 21 | close(fd); 22 | return 0; 23 | } 24 |
-------------------------------------------------------------------------------- /demos/Makefile: -------------------------------------------------------------------------------- 1 | CC=clang 2 | CFLAGS=-std=gnu99 -Wall -Wextra -Wpedantic -lpthread 3 | SRC=$(wildcard *.c) 4 | DST=$(SRC:%.c=%.out) 5 | 6 | .PHONY: all clean 7 | all: $(DST) 8 | 9 | %.out: %.c 10 | $(CC) $(CFLAGS) -o $@ $< 11 | 12 | clean: 13 | rm -f $(DST) 14 | -------------------------------------------------------------------------------- /figures/00-hamilton_kernighan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/00-hamilton_kernighan.png -------------------------------------------------------------------------------- /figures/00-puppets.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/00-puppets.jpg -------------------------------------------------------------------------------- /figures/00-safe_space.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/00-safe_space.jpg -------------------------------------------------------------------------------- /figures/00-waiter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/00-waiter.png -------------------------------------------------------------------------------- /figures/12-scheduling.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/12-scheduling.jpg
-------------------------------------------------------------------------------- /figures/12-timetable.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/12-timetable.png -------------------------------------------------------------------------------- /figures/12-tractor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/12-tractor.png -------------------------------------------------------------------------------- /figures/31-filestack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/31-filestack.png -------------------------------------------------------------------------------- /figures/31-pdp11.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/31-pdp11.jpg -------------------------------------------------------------------------------- /figures/32-nyancat.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/32-nyancat.gif -------------------------------------------------------------------------------- /figures/32-winfs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/32-winfs.png -------------------------------------------------------------------------------- /figures/33-portal.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/33-portal.jpg -------------------------------------------------------------------------------- /figures/33-thankyou.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/33-thankyou.jpg -------------------------------------------------------------------------------- /figures/34-journaling.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/34-journaling.jpg -------------------------------------------------------------------------------- /figures/41-asan.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/41-asan.gif -------------------------------------------------------------------------------- /figures/41-coveragewall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/41-coveragewall.png -------------------------------------------------------------------------------- /figures/41-fuzzing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/41-fuzzing.png -------------------------------------------------------------------------------- /figures/42-aslr.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/42-aslr.png -------------------------------------------------------------------------------- /figures/42-cfi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/42-cfi.png -------------------------------------------------------------------------------- /figures/42-dep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/42-dep.png -------------------------------------------------------------------------------- /figures/42-dep_and_aslr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/42-dep_and_aslr.png -------------------------------------------------------------------------------- /figures/42-mem_safety_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/42-mem_safety_overview.png -------------------------------------------------------------------------------- /figures/50-done1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/50-done1.jpg -------------------------------------------------------------------------------- /figures/50-done2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/50-done2.jpg 
-------------------------------------------------------------------------------- /figures/50-learn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HexHive/OSTEP-slides/2ed49850de72e48c36b46b31f0330823e59986d2/figures/50-learn.jpg -------------------------------------------------------------------------------- /pdf/README.md: -------------------------------------------------------------------------------- 1 | # Directory holding all compiled PDFs 2 | -------------------------------------------------------------------------------- /preamble.tex: -------------------------------------------------------------------------------- 1 | \usepackage{tikz} 2 | \usetikzlibrary{calc,decorations.pathreplacing,snakes} 3 | \usepackage{multicol} 4 | 5 | \hypersetup{colorlinks=true,linkcolor=,urlcolor=blue} 6 | 7 | \expandafter\def\expandafter\insertshorttitle\expandafter{% 8 | \insertshorttitle\hfill% 9 | \insertframenumber\,/\,\inserttotalframenumber} 10 | 11 | \newcommand{\hideFromPandoc}[1]{#1} 12 | \hideFromPandoc{ 13 | \let\Begin\begin 14 | \let\End\end 15 | } 16 | --------------------------------------------------------------------------------