├── vercel.json ├── tsconfig.json ├── public ├── stats │ └── js │ │ └── script.js ├── banner.png ├── robots.txt ├── favicon-on-dark.png ├── images │ ├── the-end.png │ ├── cpu-pleading-face.png │ ├── elf-data-section.png │ ├── elf-file-structure.png │ ├── init-process-tree.png │ ├── binprm-buf-changelog.png │ ├── fetch-execute-cycle.png │ ├── instruction-pointer.png │ ├── writing-this-article.png │ ├── gnu-linux-elf-drawing.jpg │ ├── hardware-interrupt-meme.png │ ├── interrupt-vector-table.png │ ├── elf-program-header-types.png │ ├── kernel-mode-vs-user-mode.png │ ├── linux-shebang-truncation.png │ ├── static-vs-dynamic-linking.png │ ├── keyboard-hardware-interrupt.png │ ├── multilevel-paging-explainer.png │ ├── page-table-entry-permissions.png │ ├── virtual-memory-mmu-example.png │ ├── 4kib-paging-address-breakdown.png │ ├── higher-half-kernel-memory-map.png │ ├── linux-scheduler-target-latency.png │ ├── process-virtual-memory-mapping.png │ ├── elf-section-header-table-diagram.png │ ├── linux-program-execution-process.png │ ├── syscall-architecture-differences.png │ ├── assembly-to-machine-code-translation.png │ └── demand-paging-with-page-faults-comic.png ├── favicon-on-light.png ├── editions │ └── printable.pdf ├── github-images │ ├── banner-dark.png │ └── banner-light.png ├── orpheus-flag.svg └── squiggles │ └── bottom.svg ├── postcss.config.cjs ├── src ├── env.d.ts ├── metadata.ts ├── content │ ├── config.ts │ └── chapters │ │ ├── 0-intro.mdx │ │ ├── 7-epilogue.mdx │ │ ├── 2-slice-dat-time.mdx │ │ ├── 4-becoming-an-elf-lord.mdx │ │ ├── 1-the-basics.mdx │ │ ├── 6-lets-talk-about-forks-and-cows.mdx │ │ ├── 5-the-translator-in-your-computer.mdx │ │ └── 3-how-to-run-a-program.mdx ├── components │ ├── EditButton.astro │ ├── DowngradeHeadings.astro │ ├── TOCList.astro │ ├── ScrollPadding.astro │ ├── OldNav.astro │ ├── ColoredTitle.astro │ ├── CodeBlock.astro │ ├── ExternalNav.astro │ └── SEO.astro ├── styles │ ├── 404.css │ ├── home.css │ ├── chapter.css │ ├── 
one-pager.css │ └── global.css └── pages │ ├── 404.astro │ ├── editions │ └── one-pager.astro │ ├── index.astro │ └── [...slug].astro ├── .gitignore ├── package.json ├── astro.config.mjs ├── LICENSE ├── .github └── workflows │ └── deploy.yml ├── README.md └── scripts ├── pdfgen.js └── run-pdf-ci.mjs /vercel.json: -------------------------------------------------------------------------------- 1 | { 2 | "trailingSlash": false 3 | } 4 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "astro/tsconfigs/base" 3 | } 4 | -------------------------------------------------------------------------------- /public/stats/js/script.js: -------------------------------------------------------------------------------- 1 | // This is a Vercel rewrite in production, see vercel.json. -------------------------------------------------------------------------------- /postcss.config.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | plugins: [ require('postcss-nesting') ] 3 | } -------------------------------------------------------------------------------- /public/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/banner.png -------------------------------------------------------------------------------- /public/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Allow: / 3 | 4 | Sitemap: https://cpu.land/sitemap-index.xml 5 | -------------------------------------------------------------------------------- /src/env.d.ts: -------------------------------------------------------------------------------- 1 | /// 2 | /// 3 | -------------------------------------------------------------------------------- 
/public/favicon-on-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/favicon-on-dark.png -------------------------------------------------------------------------------- /public/images/the-end.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/the-end.png -------------------------------------------------------------------------------- /public/favicon-on-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/favicon-on-light.png -------------------------------------------------------------------------------- /public/editions/printable.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/editions/printable.pdf -------------------------------------------------------------------------------- /public/github-images/banner-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/github-images/banner-dark.png -------------------------------------------------------------------------------- /public/images/cpu-pleading-face.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/cpu-pleading-face.png -------------------------------------------------------------------------------- /public/images/elf-data-section.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/elf-data-section.png 
-------------------------------------------------------------------------------- /public/images/elf-file-structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/elf-file-structure.png -------------------------------------------------------------------------------- /public/images/init-process-tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/init-process-tree.png -------------------------------------------------------------------------------- /public/github-images/banner-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/github-images/banner-light.png -------------------------------------------------------------------------------- /public/images/binprm-buf-changelog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/binprm-buf-changelog.png -------------------------------------------------------------------------------- /public/images/fetch-execute-cycle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/fetch-execute-cycle.png -------------------------------------------------------------------------------- /public/images/instruction-pointer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/instruction-pointer.png -------------------------------------------------------------------------------- /public/images/writing-this-article.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/writing-this-article.png -------------------------------------------------------------------------------- /public/images/gnu-linux-elf-drawing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/gnu-linux-elf-drawing.jpg -------------------------------------------------------------------------------- /public/images/hardware-interrupt-meme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/hardware-interrupt-meme.png -------------------------------------------------------------------------------- /public/images/interrupt-vector-table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/interrupt-vector-table.png -------------------------------------------------------------------------------- /public/images/elf-program-header-types.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/elf-program-header-types.png -------------------------------------------------------------------------------- /public/images/kernel-mode-vs-user-mode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/kernel-mode-vs-user-mode.png -------------------------------------------------------------------------------- /public/images/linux-shebang-truncation.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/linux-shebang-truncation.png -------------------------------------------------------------------------------- /public/images/static-vs-dynamic-linking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/static-vs-dynamic-linking.png -------------------------------------------------------------------------------- /public/images/keyboard-hardware-interrupt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/keyboard-hardware-interrupt.png -------------------------------------------------------------------------------- /public/images/multilevel-paging-explainer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/multilevel-paging-explainer.png -------------------------------------------------------------------------------- /public/images/page-table-entry-permissions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/page-table-entry-permissions.png -------------------------------------------------------------------------------- /public/images/virtual-memory-mmu-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/virtual-memory-mmu-example.png -------------------------------------------------------------------------------- /public/images/4kib-paging-address-breakdown.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/4kib-paging-address-breakdown.png -------------------------------------------------------------------------------- /public/images/higher-half-kernel-memory-map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/higher-half-kernel-memory-map.png -------------------------------------------------------------------------------- /public/images/linux-scheduler-target-latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/linux-scheduler-target-latency.png -------------------------------------------------------------------------------- /public/images/process-virtual-memory-mapping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/process-virtual-memory-mapping.png -------------------------------------------------------------------------------- /public/images/elf-section-header-table-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/elf-section-header-table-diagram.png -------------------------------------------------------------------------------- /public/images/linux-program-execution-process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/linux-program-execution-process.png -------------------------------------------------------------------------------- /public/images/syscall-architecture-differences.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/syscall-architecture-differences.png -------------------------------------------------------------------------------- /public/images/assembly-to-machine-code-translation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/assembly-to-machine-code-translation.png -------------------------------------------------------------------------------- /public/images/demand-paging-with-page-faults-comic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Revisto/putting-the-you-in-cpu/main/public/images/demand-paging-with-page-faults-comic.png -------------------------------------------------------------------------------- /src/metadata.ts: -------------------------------------------------------------------------------- 1 | export const abstract = 'Curious exactly what happens when you run a program on your computer? Learn how multiprocessing works, what system calls really are, how computers manage memory with hardware interrupts, and how Linux loads executables.' 
-------------------------------------------------------------------------------- /src/content/config.ts: -------------------------------------------------------------------------------- 1 | import { z, defineCollection } from 'astro:content' 2 | 3 | export const collections = { 4 | chapters: defineCollection({ 5 | schema: z.object({ 6 | chapter: z.number(), 7 | title: z.string(), 8 | shortname: z.string(), 9 | updatedAt: z.date() 10 | }) 11 | }) 12 | } 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # build output 2 | dist 3 | 4 | # dependencies 5 | node_modules/ 6 | .snowpack/ 7 | 8 | # logs 9 | npm-debug.log* 10 | yarn-debug.log* 11 | yarn-error.log* 12 | 13 | # environment variables 14 | .env 15 | .env.production 16 | 17 | # macOS-specific files 18 | .DS_Store 19 | 20 | .vercel 21 | .astro 22 | -------------------------------------------------------------------------------- /src/components/EditButton.astro: -------------------------------------------------------------------------------- 1 | --- 2 | import { getEntryBySlug } from 'astro:content' 3 | 4 | export interface Props { 5 | chapterSlug: string 6 | } 7 | 8 | const { chapterSlug } = Astro.props 9 | const chapter = await getEntryBySlug('chapters', chapterSlug) 10 | 11 | const url = 'https://github.com/tehlug/putting-the-you-in-cpu/tree/main/src/content/chapters/' 12 | + chapter.data.chapter + '-' + chapterSlug + '.mdx' 13 | --- 14 | 15 | ویرایش در گیت‌هاب 16 | -------------------------------------------------------------------------------- /src/components/DowngradeHeadings.astro: -------------------------------------------------------------------------------- 1 | --- 2 | import { JSDOM } from 'jsdom' 3 | 4 | const html = await Astro.slots.render('default') 5 | const dom = new JSDOM(html) 6 | 7 | // Downgrade all headings by one level 8 | for (let i = 5; i >= 1; i--) { 9 | 
const headings = dom.window.document.querySelectorAll(`h${i}`) 10 | for (const heading of headings) { 11 | const newHeading = dom.window.document.createElement(`h${i + 1}`) 12 | for (const attribute of heading.attributes) { 13 | newHeading.setAttribute(attribute.name, attribute.value) 14 | } 15 | newHeading.innerHTML = heading.innerHTML 16 | heading.replaceWith(newHeading) 17 | } 18 | } 19 | 20 | const newHtml = dom.serialize() 21 | --- 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/components/TOCList.astro: -------------------------------------------------------------------------------- 1 | --- 2 | interface Props { 3 | headings: { depth: number, slug: string, text: string }[] 4 | } 5 | 6 | const firstLayer = [] 7 | let depth 8 | 9 | for (const heading of Astro.props.headings) { 10 | if (!firstLayer.length || heading.depth <= depth) { 11 | depth = heading.depth 12 | firstLayer.push({ ...heading, children: [] }) 13 | } else if (heading.depth > depth) { 14 | firstLayer.at(-1).children.push(heading) 15 | } 16 | } 17 | --- 18 | 19 |
    20 | {firstLayer.map(heading => ( 21 |
  • 22 | 23 | {heading.text} 24 | 25 | {heading.children.length > 0 ? : null} 26 |
  • 27 | ))} 28 |
29 | -------------------------------------------------------------------------------- /src/styles/404.css: -------------------------------------------------------------------------------- 1 | html { 2 | background: #000000; 3 | color: #00ff00; 4 | } 5 | 6 | main { 7 | font-family: var(--font-mono); 8 | font-size: 1.1rem; 9 | text-align: left; 10 | padding: 60px; 11 | line-height: 2.2; 12 | } 13 | 14 | h1 { 15 | font-weight: bold; 16 | font-size: inherit; 17 | } 18 | 19 | a { 20 | --color: currentColor; 21 | --hover-background: #004800; 22 | text-underline-offset: 6px; 23 | text-decoration: underline; 24 | } 25 | 26 | h1, p { 27 | margin: 0; 28 | line-height: inherit; 29 | } 30 | 31 | @media (max-width: 600px) { 32 | main { 33 | padding: 30px 30px; 34 | font-size: 0.9rem; 35 | } 36 | 37 | h1, p { 38 | margin: 16px 0; 39 | } 40 | } 41 | 42 | @media (max-width: 400px) { 43 | main { 44 | font-size: 0.8rem; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/components/ScrollPadding.astro: -------------------------------------------------------------------------------- 1 | --- 2 | export interface Props { 3 | hideEditions?: boolean 4 | } 5 | 6 | const { hideEditions } = Astro.props 7 | --- 8 | 9 | 29 | -------------------------------------------------------------------------------- /src/components/OldNav.astro: -------------------------------------------------------------------------------- 1 | --- 2 | import { getCollection, CollectionEntry } from 'astro:content' 3 | 4 | export interface Props { 5 | chapterSlug: CollectionEntry<'chapters'>['slug'] 6 | } 7 | 8 | const { chapterSlug } = Astro.props 9 | const allChapters = await getCollection('chapters') 10 | --- 11 | 12 |
13 | همه فصل‌ها 14 | 15 |
    16 |
  1. 17 | Intro 18 |
  2. 19 | {allChapters 20 | .filter(chapter => chapter.data.chapter !== 0) // Skip the intro 21 | .map(otherChapter => ( 22 |
  3. 23 | 24 | {otherChapter.data.title} 25 | 26 |
  4. 27 | )) 28 | } 29 |
30 |
31 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "putting-the-you-in-cpu", 3 | "version": "0.0.1", 4 | "private": true, 5 | "license": "MIT", 6 | "scripts": { 7 | "dev": "astro dev", 8 | "fix-images": "node scripts/fix-image-paths.mjs", 9 | "build": "astro build", 10 | "preview": "astro preview", 11 | "generate-pdf:local": "node scripts/pdfgen.js", 12 | "generate-pdf:ci": "node scripts/run-pdf-ci.mjs" 13 | }, 14 | "devDependencies": { 15 | "@astrojs/mdx": "^4.3.0", 16 | "@astrojs/sitemap": "^3.4.0", 17 | "@astrojs/vercel": "^8.0.4", 18 | "@types/jsdom": "^21.1.7", 19 | "astro": "^5.8.1", 20 | "glob": "^11.0.2", 21 | "jsdom": "^26.1.0", 22 | "postcss-nesting": "^13.0.1", 23 | "puppeteer": "^21.0.2", 24 | "rehype-external-links": "^3.0.0", 25 | "rehype-preset-minify": "^7.0.1", 26 | "shiki": "^0.14.2", 27 | "start-server-and-test": "^2.0.12" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /astro.config.mjs: -------------------------------------------------------------------------------- 1 | import { defineConfig } from 'astro/config' 2 | import mdx from '@astrojs/mdx' 3 | import sitemap from '@astrojs/sitemap' 4 | import rehypeExternalLinks from 'rehype-external-links' 5 | import rehypePresetMinify from 'rehype-preset-minify' 6 | 7 | const rehypeExternalLinksConfig = [ 8 | rehypeExternalLinks, 9 | { target: '_blank', rel: ['noopener', 'noreferrer'] } 10 | ] 11 | 12 | export default defineConfig({ 13 | site: 'https://tehlug.github.io/putting-the-you-in-cpu/', 14 | base: '/putting-the-you-in-cpu/', 15 | trailingSlash: 'never', 16 | output: 'static', 17 | server: { 18 | port: parseInt(process.env.PORT || '3000') 19 | }, 20 | integrations: [ 21 | mdx({ 22 | rehypePlugins: [ rehypeExternalLinksConfig, rehypePresetMinify ] 23 | }), 24 | sitemap({ 25 | filter: page => page 
!== 'https://cpu.land/404' 26 | }) 27 | ], 28 | markdown: { 29 | smartypants: true, 30 | rehypePlugins: [ rehypeExternalLinksConfig ], 31 | shikiConfig: { 32 | theme: 'one-dark-pro' 33 | } 34 | } 35 | }) 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Lexi Mattick 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/pages/404.astro: -------------------------------------------------------------------------------- 1 | --- 2 | import '../styles/global.css' 3 | import '../styles/404.css' 4 | 5 | const url = new URL(Astro.url.pathname, Astro.site) 6 | url.pathname = url.pathname.replace(/\/$/, '') // Strip slash from the end of the path 7 | 8 | const baseUrl = import.meta.env.BASE_URL && import.meta.env.BASE_URL !== '/' ? `${import.meta.env.BASE_URL}/` : '/' 9 | --- 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | Not Found | Putting the "You" in CPU 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 33 | 34 | 35 | 36 | 37 | 38 | 45 | 88 | -------------------------------------------------------------------------------- /src/pages/index.astro: -------------------------------------------------------------------------------- 1 | --- 2 | import '../styles/global.css' 3 | import '../styles/home.css' 4 | 5 | import { getEntryBySlug } from 'astro:content' 6 | import SEO from '../components/SEO.astro' 7 | import ColoredTitle from '../components/ColoredTitle.astro' 8 | import ExternalNav from '../components/ExternalNav.astro' 9 | import EditButton from '../components/EditButton.astro' 10 | import ScrollPadding from '../components/ScrollPadding.astro' 11 | 12 | const intro = await getEntryBySlug('chapters', 'intro') 13 | const chapterOne = await getEntryBySlug('chapters', 'the-basics') 14 | const { Content: Intro } = await intro.render() 15 | const baseUrl = import.meta.env.BASE_URL && import.meta.env.BASE_URL !== '/' ? 
`${import.meta.env.BASE_URL}/` : '/' 16 | 17 | --- 18 | 19 | 20 | 21 | 22 | 23 | 24 | 34 | 35 | 36 | 37 | 38 | 39 | 52 | 53 | 54 | 55 | 56 | 57 | 122 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /public/orpheus-flag.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/styles/global.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --font-serif: 'Vazirmatn', serif; 3 | --font-sans: 'Vazirmatn', sans-serif; 4 | --font-mono: 'Vazir Code', monospace; 5 | --font-main-title: 'Vazirmatn', serif; 6 | } 7 | 8 | * { 9 | box-sizing: border-box; 10 | } 11 | 12 | html { 13 | background: #f8f9fa; 14 | font-size: 1.1rem; 15 | font-family: var(--font-serif); 16 | overflow-wrap: break-word; 17 | line-height: 1.8; 18 | -webkit-text-size-adjust: 100%; 19 | } 20 | 21 | body { 22 | margin: 0; 23 | } 24 | 25 | img, video, iframe { 26 | width: 100%; 27 | height: auto; 28 | display: block; 29 | } 30 | 31 | h1 { 32 | line-height: 1.3; 33 | } 34 | 35 | h2 { 36 | line-height: 1.4; 37 | margin: 0; 38 | margin-top: 80px; 39 | } 40 | 41 | h3 { 42 | line-height: 1.5; 43 | margin-top: 60px; 44 | } 45 | 46 | h4 { 47 | margin-top: 40px; 48 | } 49 | 50 | pre { 51 | tab-size: 8; 52 | } 53 | 54 | code { 55 | font-size: 0.9em; 56 | font-family: var(--font-mono); 57 | } 58 | 59 | code:not(pre code) { 60 | color: #8f310c; 61 | background: #fff4e6; 62 | padding: 1px 2px; 63 | border-radius: 2px; 64 | } 65 | 66 | pre code { 67 | font-size: 0.85em; 68 | } 69 | 70 | p, ul:not(li ul), ol:not(li ol), pre, .code-block { 71 | margin: 20px 0; 72 | } 73 | 74 | .code-block { 75 | direction: ltr; 76 | } 77 | 78 | blockquote { 79 | background: #e9ecef; 80 | border-left: 4px solid #868e96; 81 | margin: 30px 0; 82 | padding: 10px 20px; 83 | padding-left: 30px; 84 | 85 | & p, & ul:not(li ul), 
& ol:not(li ol) { 86 | margin: 14px 0; 87 | } 88 | } 89 | 90 | a { 91 | --color: #6741d9; 92 | --hover-background: #e5dbff; 93 | color: var(--color); 94 | text-decoration: none; 95 | padding: 0 2px; 96 | margin: 0 -2px; 97 | 98 | &:hover { 99 | background: var(--hover-background); 100 | } 101 | 102 | &:not([href^=\#]):visited { 103 | --color: #9c36b5; 104 | --hover-background: #f3d9fa; 105 | } 106 | } 107 | 108 | hr { 109 | border: none; 110 | height: 1px; 111 | background: #ced4da; 112 | border: none; 113 | margin: 40px 0; 114 | } 115 | 116 | .sr-only { 117 | position: absolute; 118 | width: 1px; 119 | height: 1px; 120 | padding: 0; 121 | margin: -1px; 122 | overflow: hidden; 123 | clip: rect(0, 0, 0, 0); 124 | white-space: nowrap; 125 | border-width: 0; 126 | } 127 | 128 | .big { 129 | position: relative; 130 | left: calc((100% - var(--width)) / 2); 131 | margin: 40px auto; 132 | } 133 | 134 | .scroll-padding { 135 | padding-top: 60vh; 136 | padding-bottom: 20px; 137 | font-size: 0.9em; 138 | text-align: center; 139 | color: #adb5bd; 140 | line-height: 1.5; 141 | 142 | & p { 143 | margin: 6px 0; 144 | } 145 | 146 | & p:not(:first-child):not(:last-child) { 147 | font-style: italic; 148 | } 149 | 150 | & a { 151 | --color: #b197fc; 152 | } 153 | 154 | & a:not([href^="#"]):visited { 155 | --color: #da77f2; 156 | } 157 | 158 | & hr { 159 | margin: 16px 0; 160 | } 161 | } 162 | 163 | .content { 164 | width: 100%; 165 | max-width: 700px; 166 | padding: 10px; 167 | } 168 | 169 | summary { 170 | cursor: default; 171 | 172 | &:hover { 173 | color: #6741d9; 174 | } 175 | } 176 | 177 | .continue { 178 | --color: #000000; 179 | --hover-background: transparent; 180 | border-top: 2px solid #ced4da; 181 | border-bottom: 2px solid #ced4da; 182 | padding: 8px 16px; 183 | margin: 0 auto; 184 | text-align: center; 185 | margin-top: 40px; 186 | display: block; 187 | width: fit-content; 188 | font-style: italic; 189 | 190 | &:hover { 191 | border-color: #845ef7; 192 | } 193 | } 194 | 
195 | nav.external { 196 | font-size: 0.95em; 197 | width: 100%; 198 | position: fixed; 199 | top: 0; 200 | left: 0; 201 | background: #f8f9fa; 202 | transform: translateY(0); 203 | transition: transform 180ms ease-out; 204 | z-index: 99; 205 | 206 | & ul { 207 | padding: 0; 208 | padding-top: 3px; 209 | padding-bottom: 4px; 210 | margin: 0; 211 | list-style-type: none; 212 | display: flex; 213 | gap: 16px; 214 | align-items: center; 215 | justify-content: center; 216 | 217 | &:hover li { 218 | opacity: 1 !important; 219 | } 220 | } 221 | 222 | & li { 223 | position: relative; 224 | transition: opacity 200ms ease-in-out; 225 | 226 | &.active::after { 227 | content: ""; 228 | position: absolute; 229 | top: calc(100% - 6px); 230 | left: 50%; 231 | transform: translateX(-50%); 232 | display: block; 233 | border: 5px solid transparent; 234 | border-bottom-color: #fa5252; 235 | } 236 | } 237 | 238 | & a, & a:not([href^=\#]):visited { 239 | --color: currentColor; 240 | --hover-background: #dee2e6; 241 | display: block; 242 | text-align: center; 243 | line-height: 1.4; 244 | padding: 2px 6px; 245 | 246 | & .chapter { 247 | font-size: 0.8em; 248 | font-weight: 400; 249 | color: #868e96; 250 | } 251 | } 252 | } 253 | 254 | .old-nav { 255 | display: none; 256 | 257 | & a { 258 | --color: #000000; 259 | --hover-background: #ced4da; 260 | } 261 | 262 | & .active { 263 | font-weight: bold; 264 | } 265 | 266 | & ol { 267 | margin: 5px 0; 268 | } 269 | } 270 | 271 | .edit-button { 272 | --color: #adb5bd; 273 | --hover-background: #e9ecef; 274 | vertical-align: baseline; 275 | padding: 0 3px; 276 | margin-left: 4px; 277 | font-size: 0.95rem; 278 | font-weight: normal; 279 | 280 | &:hover { 281 | color: #868e96; 282 | } 283 | } 284 | 285 | @media (max-width: 720px) { 286 | .old-nav { 287 | display: block; 288 | } 289 | 290 | .md-hide { 291 | display: none; 292 | } 293 | } 294 | 295 | @media (max-width: 420px) { 296 | .sm-hide { 297 | display: none; 298 | } 299 | } 300 | 301 | 
::selection { 302 | background: #adb5bd9b; 303 | } 304 | 305 | .orpheus-flag { 306 | position: absolute; 307 | top: 0; 308 | left: 20px; 309 | display: block; 310 | width: 140px; 311 | transform-origin: top left; 312 | z-index: 999; 313 | 314 | & img { 315 | width: 100%; 316 | } 317 | 318 | &:hover { 319 | background: none; 320 | animation: orpheus-flag-wave 0.5s linear infinite alternate; 321 | } 322 | } 323 | 324 | @keyframes orpheus-flag-wave { 325 | 0% { 326 | transform: rotate(0deg); 327 | } 328 | 329 | 100% { 330 | transform: rotate(-5deg); 331 | } 332 | } 333 | 334 | @page { 335 | size: auto; 336 | margin: 0.8in; 337 | } 338 | 339 | @media print { 340 | html { 341 | background: #ffffff; 342 | font-size: 0.9em; 343 | } 344 | 345 | .orpheus-flag, nav.external, .scroll-padding, .edit-button, .continue { 346 | display: none; 347 | } 348 | 349 | .content { 350 | max-width: none; 351 | } 352 | 353 | h1, h2, h3, h4, h5, h6 { 354 | page-break-after: avoid; 355 | page-break-inside: avoid; 356 | } 357 | 358 | p { 359 | orphans: 2; 360 | widows: 2; 361 | } 362 | 363 | blockquote { 364 | background: none; 365 | border-left: 3px solid #adb5bd; 366 | padding: 0 8px; 367 | padding-left: 20px; 368 | page-break-inside: avoid; 369 | } 370 | 371 | code { 372 | color: #000000; 373 | } 374 | 375 | code:not(pre code) { 376 | color: #8f310c; 377 | background: none; 378 | padding: 0; 379 | border-radius: 0; 380 | } 381 | 382 | pre span { 383 | color: inherit !important; 384 | } 385 | 386 | h2 { 387 | margin-top: 50px; 388 | } 389 | 390 | hr { 391 | background: #ced4da; 392 | } 393 | 394 | a { 395 | text-decoration: underline; 396 | } 397 | 398 | a[href^=\#], a[href^=\/] { 399 | text-decoration: none; 400 | font-weight: bold; 401 | color: #000000; 402 | 403 | &::before { 404 | content: "["; 405 | color: #495057; 406 | } 407 | 408 | &::after { 409 | content: "]"; 410 | color: #495057; 411 | } 412 | } 413 | } 
-------------------------------------------------------------------------------- /src/content/chapters/2-slice-dat-time.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | chapter: 2 3 | title: Slice Dat Time 4 | shortname: Multitasking 5 | slug: slice-dat-time 6 | updatedAt: 2023-08-02T18:14:02.296Z 7 | --- 8 | 9 | Let's say you're building an operating system and you want users to be able to run multiple programs at once. You don’t have a fancy multi-core processor though, so your CPU can only run one instruction at a time! 10 | 11 | Luckily, you’re a very smart OS developer. You figure out that you can fake parallelism by letting processes take turns on the CPU. If you cycle through the processes and run a couple instructions from each one, they can all be responsive without any single process hogging the CPU. 12 | 13 | But how do you take control back from program code to switch processes? After a bit of research, you discover that most computers come with timer chips. You can program a timer chip to trigger a switch to an OS interrupt handler after a certain amount of time passes. 14 | 15 | ## Hardware Interrupts 16 | 17 | Earlier, we talked about how software interrupts are used to hand control from a userland program to the OS. These are called “software” interrupts because they’re voluntarily triggered by a program — machine code executed by the processor in the normal fetch-execute cycle tells it to switch control to the kernel. 18 | 19 | A drawing illustrating how hardware interrupts break normal execution. On top: a drawing of a keyboard with a highlighted key, with a lightning bolt drawn to a CPU on the right. On the bottom: some binary labeled "program code," a similar lightning bolt, and some more binary labeled "kernel code." The lightning bolt is labeled "interrupt triggers context switch." 
20 | 21 | OS schedulers use *timer chips* like [PITs](https://en.wikipedia.org/wiki/Programmable_interval_timer) to trigger hardware interrupts for multitasking: 22 | 23 | 1. Before jumping to program code, the OS sets the timer chip to trigger an interrupt after some period of time. 24 | 2. The OS switches to user mode and jumps to the next instruction of the program. 25 | 3. When the timer elapses, it triggers a hardware interrupt to switch to kernel mode and jump to OS code. 26 | 4. The OS can now save where the program left off, load a different program, and repeat the process. 27 | 28 | This is called *preemptive multitasking*; the interruption of a process is called [*preemption*](https://en.wikipedia.org/wiki/Preemption_(computing)). If you’re, say, reading this article on a browser and listening to music on the same machine, your very own computer is probably following this exact cycle thousands of times a second. 29 | 30 | ## Timeslice Calculation 31 | 32 | A *timeslice* is the duration an OS scheduler allows a process to run before preempting it. The simplest way to pick timeslices is to give every process the same timeslice, perhaps in the 10 ms range, and cycle through tasks in order. This is called *fixed timeslice round-robin* scheduling. 33 | 34 | > **Aside: fun jargon facts!** 35 | > 36 | > Did you know that timeslices are often called "quantums?" Now you do, and you can impress all your tech friends. I think I deserve heaps of praise for not saying quantum in every other sentence in this article. 37 | > 38 | > Speaking of timeslice jargon, Linux kernel devs use the [jiffy](https://github.com/torvalds/linux/blob/22b8cc3e78f5448b4c5df00303817a9137cd663f/include/linux/jiffies.h) time unit to count fixed frequency timer ticks. Among other things, jiffies are used for measuring the lengths of timeslices. Linux's jiffy frequency is typically 1000 Hz but can be configured when compiling the kernel. 
39 | 40 | A slight improvement to fixed timeslice scheduling is to pick a *target latency* — the ideal longest time for a process to respond. The target latency is the time it takes for a process to resume execution after being preempted, assuming a reasonable number of processes. *This is pretty hard to visualize! Don't worry, a diagram is coming soon.* 41 | 42 | Timeslices are calculated by dividing the target latency by the total number of tasks; this is better than fixed timeslice scheduling because it eliminates wasteful task switching with fewer processes. With a target latency of 15 ms and 10 processes, each process would get 15/10 or 1.5 ms to run. With only 3 processes, each process gets a longer 5 ms timeslice while still hitting the target latency. 43 | 44 | Process switching is computationally expensive because it requires saving the entire state of the current program and restoring a different one. Past a certain point, too small a timeslice can result in performance problems with processes switching too rapidly. It's common to give the timeslice duration a lower bound (*minimum granularity*). This does mean that the target latency is exceeded when there are enough processes for the minimum granularity to take effect. 45 | 46 | At the time of writing this article, Linux's scheduler uses a target latency of 6 ms and a minimum granularity of 0.75 ms. 47 | 48 | A diagram titled "Naive Dynamic Timeslice Round-Robin Scheduling." It depicts a time series of 3 different processes getting time to execute in a repeated cycle. In between the execution blocks of each process is a much shorter block labeled "kernel scheduler." The length of each program execution block is labeled "timeslice (2ms)." The distance from the start of process 1 executing to the next start of process 1 executing, encompassing the execution time of processes 2 and 3, is labeled as "target latency (6ms)." 
49 | 50 | Round-robin scheduling with this basic timeslice calculation is close to what most computers do nowadays. It's still a bit naive; most operating systems tend to have more complex schedulers which take process priorities and deadlines into account. Since 2007, Linux has used a scheduler called [Completely Fair Scheduler](https://docs.kernel.org/scheduler/sched-design-CFS.html). CFS does a bunch of very fancy computer science things to prioritize tasks and divvy up CPU time. 51 | 52 | Every time the OS preempts a process it needs to load the new program's saved execution context, including its memory environment. This is accomplished by telling the CPU to use a different *page table*, the mapping from "virtual" to physical addresses. This is also the system that prevents programs from accessing each other's memory; we'll go down this rabbit hole in chapters [5](/the-translator-in-your-computer) and [6](/lets-talk-about-forks-and-cows) of this article. 53 | 54 | ## Note #1: Kernel Preemptability 55 | 56 | So far, we've been only talking about the preemption and scheduling of userland processes. Kernel code might make programs feel laggy if it took too long handling a syscall or executing driver code. 57 | 58 | Modern kernels, including Linux, are [preemptive kernels](https://en.wikipedia.org/wiki/Kernel_preemption). This means they're programmed in a way that allows kernel code itself to be interrupted and scheduled just like userland processes. 59 | 60 | This isn't very important to know about unless you're writing a kernel or something, but basically every article I've read has mentioned it so I thought I would too! Extra knowledge is rarely a bad thing. 61 | 62 | ## Note #2: A History Lesson 63 | 64 | Ancient operating systems, including classic Mac OS and versions of Windows long before NT, used a predecessor to preemptive multitasking. Rather than the OS deciding when to preempt programs, the programs themselves would choose to yield to the OS. 
They would trigger a software interrupt to say, "hey, you can let another program run now." These explicit yields were the only way for the OS to regain control and switch to the next scheduled process. 65 | 66 | This is called [*cooperative multitasking*](https://en.wikipedia.org/wiki/Cooperative_multitasking). It has a couple major flaws: malicious or just poorly designed programs can easily freeze the entire operating system, and it's nigh impossible to ensure temporal consistency for realtime/time-sensitive tasks. For these reasons, the tech world switched to preemptive multitasking a long time ago and never looked back. 67 | -------------------------------------------------------------------------------- /src/content/chapters/4-becoming-an-elf-lord.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | chapter: 4 3 | title: Becoming an Elf-Lord 4 | shortname: ELF 5 | slug: becoming-an-elf-lord 6 | updatedAt: 2023-07-17T17:16:18.079Z 7 | --- 8 | 9 | import CodeBlock from '../../components/CodeBlock.astro' 10 | 11 | We pretty thoroughly understand `execve` now. At the end of most paths, the kernel will reach a final program containing machine code for it to launch. Typically, a setup process is required before actually jumping to the code — for example, different parts of the program have to be loaded into the right places in memory. Each program needs different amounts of memory for different things, so we have standard file formats that specify how to set up a program for execution. While Linux supports many such formats, the most common format by far is *ELF* (executable and linkable format). 12 | 13 |

14 | A marker drawing on paper. A wizard elf is shown meditating, holding the head of a gnu in one hand and a Linux penguin in the other. The elf trails off, saying "Well, actually, Linux is just the kernel, the operating system is..." The drawing is captioned in red marker: "You've heard of elf on a shelf! Now, get ready for... elf on a GNU/Linux." The drawing is signed "Nicky." 15 |

16 |
17 |

18 | (Thank you to Nicky Case for the adorable drawing.) 19 |

20 |
21 | 22 | > **Aside: are elves everywhere?** 23 | > 24 | > When you run an app or command-line program on Linux, it's exceedingly likely that it's an ELF binary. However, on macOS the de-facto format is [Mach-O](https://en.wikipedia.org/wiki/Mach-O) instead. Mach-O does all the same things as ELF but is structured differently. On Windows, .exe files use the [Portable Executable](https://en.wikipedia.org/wiki/Portable_Executable) format which is, again, a different format with the same concept. 25 | 26 | In the Linux kernel, ELF binaries are handled by the `binfmt_elf` handler, which is more complex than many other handlers and contains thousands of lines of code. It's responsible for parsing out certain details from the ELF file and using them to load the process into memory and execute it. 27 | 28 | *I ran some command-line kung fu to sort binfmt handlers by line count:* 29 | 30 | 31 | ``` 32 | $ wc -l binfmt_* | sort -nr | sed 1d 33 | 2181 binfmt_elf.c 34 | 1658 binfmt_elf_fdpic.c 35 | 944 binfmt_flat.c 36 | 836 binfmt_misc.c 37 | 158 binfmt_script.c 38 | 64 binfmt_elf_test.c 39 | ``` 40 | 41 | 42 | ## File Structure 43 | 44 | Before looking more deeply at how `binfmt_elf` executes ELF files, let's take a look at the file format itself. ELF files are typically made up of four parts: 45 | 46 | A diagram showing an overview of the structure of ELF files, with four sequential sections. Section 1, ELF Header: basic information about the binary, and locations of PHT and SHT. Section 2, Program Header Table (PHT): describes how and where to load the ELF file's data into memory. Section 3, Section Header Table (SHT): optional "map" of the data to assist in debugging. Section 4, Data: all of the binary's data. The PHT and SHT point into this section. 47 | 48 | ### ELF Header 49 | 50 | Every ELF file has an [ELF header](https://refspecs.linuxfoundation.org/elf/gabi4+/ch4.eheader.html). 
It has the very important job of conveying basic information about the binary such as: 51 | 52 | - What processor it's designed to run on. ELF files can contain machine code for different processor types, like ARM and x86. 53 | - Whether the binary is meant to be run on its own as an executable, or whether it's meant to be loaded by other programs as a "dynamically linked library." We'll go into details about what dynamic linking is soon. 54 | - The entry point of the executable. Later sections specify exactly where to load data contained in the ELF file into memory. The entry point is a memory address pointing to where the first machine code instruction is in memory after the entire process has been loaded. 55 | 56 | The ELF header is always at the start of the file. It specifies the locations of the program header table and section header table, which can be anywhere within the file. Those tables, in turn, point to data stored elsewhere in the file. 57 | 58 | ### Program Header Table 59 | 60 | The [program header table](https://refspecs.linuxbase.org/elf/gabi4+/ch5.pheader.html) is a series of entries containing specific details for how to load and execute the binary at runtime. Each entry has a type field that says what detail it's specifying — for example, `PT_LOAD` means it contains data that should be loaded into memory, but `PT_NOTE` means the segment contains informational text that shouldn't necessarily be loaded anywhere. 61 | 62 | A table showing four different common program header types. Type 1, PT_LOAD: data to be loaded into memory. Type 2, PT_NOTE: freeform text like copyright notices, version info, etc. Type 3, PT_DYNAMIC: Info about dynamic linking. Type 4, PT_INTERP: Path to the location of an "ELF interpreter." 63 | 64 | Each entry specifies information about where its data is in the file and, sometimes, how to load the data into memory: 65 | 66 | - It points to the position of its data within the ELF file.
67 | - It can specify what virtual memory address the data should be loaded into memory at. This is typically left blank if the segment isn't meant to be loaded into memory. 68 | - Two fields specify the length of the data: one for the length of the data in the file, and one for the length of the memory region to be created. If the memory region length is longer than the length in the file, the extra memory will be filled with zeroes. This is beneficial for programs that might want a static segment of memory to use at runtime; these empty segments of memory are typically called [BSS](https://en.wikipedia.org/wiki/.bss) segments. 69 | - Finally, a flags field specifies what operations should be permitted if it's loaded into memory: `PF_R` makes it readable, `PF_W` makes it writable, and `PF_X` means it's code that should be allowed to execute on the CPU. 70 | 71 | ### Section Header Table 72 | 73 | The [section header table](https://refspecs.linuxbase.org/elf/gabi4+/ch4.sheader.html) is a series of entries containing information about *sections*. This section information is like a map, charting the data inside the ELF file. It makes it easy for [programs like debuggers](https://www.sourceware.org/gdb/) to understand the intended uses of different portions of the data. 74 | 75 | An old treasure map with islands, rivers, palm trees, and a compass rose. Some of the islands are labeled with ELF section names such as ".text", ".data", ".shstrtab", and ".bss". The drawing is captioned "The section header table is like a map for binary data." 76 | 77 | For example, the program header table can specify a large swath of data to be loaded into memory together. That single `PT_LOAD` block might contain both code and global variables! There's no reason those have to be specified separately to *run* the program; the CPU just starts at the entry point and steps forward, accessing data when and where the program requests it. 
However, software like a debugger for *analyzing* the program needs to know exactly where each area starts and ends, otherwise it might try to decode some text that says "hello" as code (and since that isn't valid code, explode). This information is stored in the section header table. 78 | 79 | While it's usually included, the section header table is actually optional. ELF files can run perfectly well with the section header table completely removed, and developers who want to hide what their code does will sometimes intentionally strip or mangle the section header table from their ELF binaries to [make them harder to decode](https://binaryresearch.github.io/2019/09/17/Analyzing-ELF-Binaries-with-Malformed-Headers-Part-1-Emulating-Tiny-Programs.html). 80 | 81 | Each section has a name, a type, and some flags that specify how it's intended to be used and decoded. Standard names usually start with a dot by convention. The most common sections are: 82 | 83 | - `.text`: machine code to be loaded into memory and executed on the CPU. `SHT_PROGBITS` type with the `SHF_EXECINSTR` flag to mark it as executable, and the `SHF_ALLOC` flag which means it's loaded into memory for execution. (Don't get confused by the name, it's still just binary machine code! I always found it somewhat strange that it's called `.text` despite not being readable "text.") 84 | - `.data`: initialized data hardcoded in the executable to be loaded into memory. For example, a global variable containing some text might be in this section. If you write low-level code, this is the section where statics go. This also has the type `SHT_PROGBITS`, which just means the section contains "information for the program." Its flags are `SHF_ALLOC` and `SHF_WRITE` to mark it as writable memory. 85 | - `.bss`: I mentioned earlier that it's common to have some allocated memory that starts out zeroed. It would be a waste to include a bunch of empty bytes in the ELF file, so a special segment type called BSS is used. 
It's helpful to know about BSS segments during debugging, so there's also a section header table entry that specifies the length of the memory to be allocated. It's of type `SHT_NOBITS`, and is flagged `SHF_ALLOC` and `SHF_WRITE`. 86 | - `.rodata`: this is like `.data` except it's read-only. In a very basic C program that runs `printf("Hello, world!")`, the string "Hello, world!" would be in a `.rodata` section, while the actual printing code would be in a `.text` section. 87 | - `.shstrtab`: this is a fun implementation detail! The names of sections themselves (like `.text` and `.shstrtab`) aren't included directly in the section header table. Instead, each entry contains an offset to a location in the ELF file that contains its name. This way, each entry in the section header table can be the same size, making them easier to parse — an offset to the name is a fixed-size number, whereas including the name in the table would use a variable-size string. All of this name data is stored in its own section called `.shstrtab`, of type `SHT_STRTAB`. 88 | 89 | ### Data 90 | 91 | The program and section header table entries all point to blocks of data within the ELF file, whether to load them into memory, to specify where program code is, or just to name sections. All of these different pieces of data are contained in the data section of the ELF file. 92 | 93 | A diagram demonstrating how different parts of the ELF file reference locations within the data block. A continuous collection of data is depicted, fading out at the end, containing some clearly recognizable things such as the path to an ELF interpreter, the section title ".rodata", and the string "Hello, world!" A couple example ELF sections float above the data block, with arrows pointing to their data. For example, the data sections from both the PHT and SHT entry examples point to the same "Hello, world!" text. The SHT entry's label is also stored in the data block.
94 | 95 | ## A Brief Explanation of Linking 96 | 97 | Back to the `binfmt_elf` code: the kernel cares about two types of entries in the program header table. 98 | 99 | `PT_LOAD` segments specify where all the program data, like the `.text` and `.data` sections, need to be loaded into memory. The kernel reads these entries from the ELF file to load the data into memory so the program can be executed by the CPU. 100 | 101 | The other type of program header table entry that the kernel cares about is `PT_INTERP`, which specifies a "dynamic linking runtime." 102 | 103 | Before we talk about what dynamic linking is, let's talk about "linking" in general. Programmers tend to build their programs on top of libraries of reusable code — for example, libc, which we talked about earlier. When turning your source code into an executable binary, a program called a linker resolves all these references by finding the library code and copying it into the binary. This process is called *static linking*, which means external code is included directly in the file that's distributed. 104 | 105 | However, some libraries are super common. You'll find libc is used by basically every program under the sun, since it's the canonical interface for interacting with the OS through syscalls. It would be a terrible use of space to include a separate copy of libc in every single program on your computer. Also, it might be nice if bugs in libraries could be fixed in one place rather than having to wait for each program that uses the library to be updated. Dynamic linking is the solution to these problems. 106 | 107 | If a statically linked program needs a function `foo` from a library called `bar`, the program would include a copy of the entirety of `foo`. However, if it's dynamically linked it would only include a reference saying "I need `foo` from library `bar`." 
When the program is run, `bar` is hopefully installed on the computer and the `foo` function's machine code can be loaded into memory on-demand. If the computer's installation of the `bar` library is updated, the new code will be loaded the next time the program runs without needing any change in the program itself. 108 | 109 | A diagram showing the difference between static and dynamic linking. On the left, static linking is shown with the contents of some code called "foo" being separately copied into two programs. This is accompanied with text saying that library functions are copied from the developer's computer into each binary at build time. On the right side, dynamic linking is shown: each program contains the name of the "foo" function, with arrows pointing outside the programs into the foo program lying on the user's computer. This is paired with accompanying text stating that binaries reference the names of library functions, which are loaded from the user's computer at runtime. 110 | 111 | ## Dynamic Linking in the Wild 112 | 113 | On Linux, dynamically linkable libraries like `bar` are typically packaged into files with the .so (Shared Object) extension. These .so files are ELF files just like programs — you may recall that the ELF header includes a field to specify whether the file is an executable or a library. In addition, shared objects have a `.dynsym` section in the section header table which contains information on what symbols are exported from the file and can be dynamically linked to. 114 | 115 | On Windows, libraries like `bar` are packaged into .dll (**d**ynamic **l**ink **l**ibrary) files. macOS uses the .dylib (**dy**namically linked **lib**rary) extension. Just like macOS apps and Windows .exe files, these are formatted slightly differently from ELF files but are the same concept and technique.
116 | 117 | An interesting distinction between the two types of linking is that with static linking, only the portions of the library that are used are included in the executable and thus loaded into memory. With dynamic linking, the *entire library* is loaded into memory. This might initially sound less efficient, but it actually allows modern operating systems to save *more* space by loading a library into memory once and then sharing that code between processes. Only code can be shared as the library needs different state for different programs, but the savings can still be on the order of tens to hundreds of megabytes of RAM. 118 | 119 | ## Execution 120 | 121 | Let's hop on back to the kernel running ELF files: if the binary it's executing is dynamically linked, the OS can't just jump to the binary's code right away because there would be missing code — remember, dynamically linked programs only have references to the library functions they need! 122 | 123 | To run the binary, the OS needs to figure out what libraries are needed, load them, replace all the named pointers with actual jump instructions, and *then* start the actual program code. This is very complex code that interacts deeply with the ELF format, so it's usually a standalone program rather than part of the kernel. ELF files specify the path to the program they want to use (typically something like `/lib64/ld-linux-x86-64.so.2`) in a `PT_INTERP` entry in the program header table. 124 | 125 | After reading the ELF header and scanning through the program header table, the kernel can set up the memory structure for the new program. It starts by loading all `PT_LOAD` segments into memory, populating the program's static data, BSS space, and machine code. 
If the program is dynamically linked, the kernel will have to execute the [ELF interpreter](https://unix.stackexchange.com/questions/400621/what-is-lib64-ld-linux-x86-64-so-2-and-why-can-it-be-used-to-execute-file) (`PT_INTERP`), so it also loads the interpreter's data, BSS, and code into memory. 126 | 127 | Now the kernel needs to set the instruction pointer for the CPU to restore when returning to userland. If the executable is dynamically linked, the kernel sets the instruction pointer to the start of the ELF interpreter's code in memory. Otherwise, the kernel sets it to the start of the executable. 128 | 129 | The kernel is almost ready to return from the syscall (remember, we're still in `execve`). It pushes the `argc`, `argv`, and environment variables to the stack for the program to read when it begins. 130 | 131 | The registers are now cleared. Before handling a syscall, the kernel stores the current value of registers to the stack to be restored when switching back to user space. Before returning to user space, the kernel zeroes this part of the stack. 132 | 133 | Finally, the syscall is over and the kernel returns to userland. It restores the registers, which are now zeroed, and jumps to the stored instruction pointer. That instruction pointer is now the starting point of the new program (or the ELF interpreter) and the current process has been replaced! 134 | -------------------------------------------------------------------------------- /src/content/chapters/1-the-basics.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | chapter: 1 3 | title: مقدمات 4 | shortname: مقدمات 5 | slug: the-basics 6 | updatedAt: 2023-07-19T18:57:54.630Z 7 | --- 8 | 9 | یه چیزی که موقع نوشتن این کتاب منو بارها و بارها شگفت‌زده کرد این بود که کامپیوترها چقدر ساده‌ان. هنوزم برام سخته که ذهنمو درگیر نکنم و انتظار پیچیدگی و انتزاع بیشتری نداشته باشم. 
اگه نکته‌ای وجود داره که قبل از ادامه دادن باید توی ذهنت حک کنی اینه که هر چیزی که ساده به نظر می‌رسه، واقعا به همون سادگیه. سادگی خیلی قشنگه و بعضی وقت‌ها هم خیلی...خیلی رو اعصابه. 10 | 11 | بیاین از پایه‌ای‌ترین بخش کارکرد کامپیوتر توی عمیق‌ترین قسمتش شروع کنیم. 12 | 13 | ## کامپیوترها چطوری طراحی شدن؟ 14 | 15 | *واحد پردازش مرکزی* (CPU) توی کامپیوتر مسئول انجام همه‌ی محاسباته. همون رئیس بزرگه، جادوگر بزرگ. از لحظه‌ای که کامپیوتر رو روشن می‌کنی شروع می‌کنه به کار کردن و دستورات رو پشت سر همدیگه یکی‌یکی اجرا می‌کنه. 16 | 17 | اولین CPUیی که به تولید انبوه رسید، مدل Intel 4004 بود که اواخر دهه ۶۰ توسط فیزیکدان و مهندس ایتالیایی به نام فدریکو فاگین طراحی شد. برخلاف سیستم‌های ۶۴ بیتی که ما امروز استفاده می‌کنیم، این پردازنده معماری ۴ بیتی داشت و از پیچیدگی کمتری نسبت به پردازنده‌های مدرن امروزی برخوردار بود اما بخش زیادی از سادگی طراحی اون هنوز توی پردازنده‌های امروزی ما دیده می‌شه. 18 | 19 | «دستوراتی» که CPU اجرا می‌کنه، فقط داده‌های باینری هستن: یک یا دو بایت به مشخص کردن اینکه چه دستوری باید اجرا بشه (opcode) اختصاص داره و بعدش هم هر داده‌ای که برای اجرای اون دستور لازمه. چیزی که بهش می‌گیم کد ماشین، فقط یه سری دستور باینریه که پشت سر هم قرار گرفتن. اسمبلی هم یه زبان کمکیه که باعث می‌شه خوندن و نوشتن کد ماشین برای ما آدما راحت‌تر از کار کردن با بیت‌های خام باشه. وقتی کد اسمبلی می‌نویسیم هم در نهایت همیشه به همون کد باینری تبدیل می‌شه که CPU بلده بخونه. 20 | 21 | A diagram demonstrating how machine code translates to assembly and back. A bidirectional arrow connects three examples: Machine Code (Binary) followed by 3 bytes of binary numbers, Machine Code (Hex) followed by those 3 bytes translated to hex (0x83, 0xC3, 0x0A), and Assembly followed by "add ebx, 10". The Assembly and Machine Code are color-coded so it is clear that each byte of the machine code translates to one word in the assembly. 22 | 23 | > نکته: دستورها همیشه به‌صورت یک به یک توی کد ماشین تبدیل نمی‌شن، مثل مثالی که بالا دیدی.
مثلاً دستور اسمبلی `add eax, 512` به این کد تبدیل می‌شه: `05 00 02 00 00`. 24 | > 25 | > بایت اول (`05`) یه opcode خاصه که مشخصاً به دستور اضافه کردن یه عدد ۳۲ بیتی به رجیستر EAX اشاره داره. بقیه بایت‌ها عدد ۵۱۲ (یا `0x200`) هستن که به صورت little-endian ذخیره شدن. 26 | > 27 | > سایت Defuse Security یه ابزار مفید ساخته برای تبدیل کد اسمبلی و کد ماشین به همدیگه که می‌تونید ازش استفاده کنید. 28 | 29 | RAM is your computer's main memory bank, a large multi-purpose space which stores all the data used by programs running on your computer. That includes the program code itself as well as the code at the core of the operating system. The CPU always reads machine code directly from RAM, and code can't be run if it isn't loaded into RAM. 30 | 31 | The CPU stores an *instruction pointer* which points to the location in RAM where it's going to fetch the next instruction. After executing each instruction, the CPU moves the pointer and repeats. This is the *fetch-execute cycle*. 32 | 33 | A diagram demonstrating the fetch-execute cycle. There are two bubbles of text. The first is labeled "Fetch" and has the text "Read instruction from memory at the current instruction pointer." The second is titled "Execute" and has the text "Run the instruction and then move the instruction pointer." The fetch bubble has an arrow pointing to the execute bubble, and the execute bubble has an arrow pointing back to the fetch bubble, implying a repeated process. 34 | 35 | After executing an instruction, the pointer moves forward to immediately after the instruction in RAM so that it now points to the next instruction. That's why code runs! The instruction pointer just keeps chugging forward, executing machine code in the order in which it has been stored in memory. Some instructions can tell the instruction pointer to jump somewhere else instead, or jump to different places depending on a certain condition; this makes reusable code and conditional logic possible.
36 | 37 | This instruction pointer is stored in a [*register*](https://en.wikipedia.org/wiki/Processor_register). Registers are small storage buckets that are extremely fast for the CPU to read and write to. Each CPU architecture has a fixed set of registers, used for everything from storing temporary values during computations to configuring the processor. 38 | 39 | Some registers are directly accessible from machine code, like `ebx` in the earlier diagram. 40 | 41 | Other registers are only used internally by the CPU, but can often be updated or read using specialized instructions. One example is the instruction pointer, which can't be read directly but can be updated with, for example, a jump instruction. 42 | 43 | ## Processors Are Naive 44 | 45 | Let's go back to the original question: what happens when you run an executable program on your computer? First, a bunch of magic happens to get ready to run it — we’ll work through all of this later — but at the end of the process there’s machine code in a file somewhere. The operating system loads this into RAM and instructs the CPU to jump the instruction pointer to that position in RAM. The CPU continues running its fetch-execute cycle as usual, so the program begins executing! 46 | 47 | (This was one of those psyching-myself-out moments for me — seriously, this is how the program you are using to read this article is running! Your CPU is fetching your browser's instructions from RAM in sequence and directly executing them, and they're rendering this article.) 48 | 49 | A diagram depicting a series of bytes of machine code in RAM. A highlighted byte is pointed to by an arrow labeled "Instruction Pointer," and there are arrows representing how the instruction pointer moves forward in RAM. 50 | 51 | It turns out CPUs have a super basic worldview; they only see the current instruction pointer and a bit of internal state. 
Processes are entirely operating system abstractions, not something CPUs natively understand or keep track of. 52 | 53 | *\*waves hands\* processes are abstractions made up by ~~os devs~~ big byte to sell more computers* 54 | 55 | For me, this raises more questions than it answers: 56 | 57 | 1. If the CPU doesn’t know about multiprocessing and just executes instructions sequentially, why doesn’t it get stuck inside whatever program it’s running? How can multiple programs run at once? 58 | 2. If programs run directly on the CPU, and the CPU can directly access RAM, why can't code access memory from other processes, or, god forbid, the kernel? 59 | 3. Speaking of which, what's the mechanism that prevents every process from running any instruction and doing anything to your computer? AND WHAT'S A DAMN SYSCALL? 60 | 61 | The question about memory deserves its own section and is covered in [chapter 5](/the-translator-in-your-computer) — the TL;DR is that most memory accesses actually go through a layer of misdirection that remaps the entire address space. For now, we're going to pretend that programs can access all RAM directly and computers can only run one process at once. We'll explain away both of these assumptions in time. 62 | 63 | It's time to leap through our first rabbit hole into a land filled with syscalls and security rings. 64 | 65 | > **Aside: what is a kernel, btw?** 66 | > 67 | > Your computer's operating system, like macOS, Windows, or Linux, is the collection of software that runs on your computer and makes all the basic stuff work. "Basic stuff" is a really general term, and so is "operating system" — depending on who you ask, it can include such things as the apps, fonts, and icons that come with your computer by default. 68 | > 69 | > The kernel, however, is the core of the operating system. When you boot up your computer, the instruction pointer starts at a program somewhere. That program is the kernel. 
The kernel has near-full access to your computer's memory, peripherals, and other resources, and is in charge of running software installed on your computer (known as userland programs). We'll learn about how the kernel has this access — and how userland programs don't — over the course of this article. 70 | > 71 | > Linux is just a kernel and needs plenty of userland software like shells and display servers to be usable. The kernel in macOS is called [XNU](https://en.wikipedia.org/wiki/XNU) and is Unix-like, and the modern Windows kernel is called the [NT Kernel](https://en.wikipedia.org/wiki/Architecture_of_Windows_NT). 72 | 73 | ## Two Rings to Rule Them All 74 | 75 | The *mode* (sometimes called privilege level or ring) a processor is in controls what it's allowed to do. Modern architectures have at least two options: kernel/supervisor mode and user mode. While an architecture might support more than two modes, only kernel mode and user mode are commonly used these days. 76 | 77 | In kernel mode, anything goes: the CPU is allowed to execute any supported instruction and access any memory. In user mode, only a subset of instructions is allowed, I/O and memory access is limited, and many CPU settings are locked. Generally, the kernel and drivers run in kernel mode while applications run in user mode. 78 | 79 | Processors start in kernel mode. Before executing a program, the kernel initiates the switch to user mode. 80 | 81 | Two fake iMessage screenshots demonstrating the difference between user and kernel mode protections. The first, labeled Kernel Mode: right side says "Read this protected memory!", left side replies "Here you go, dear :)". The second, labeled User Mode: right side says "Read this protected memory!", left side replies "No! Segmentation fault!" 82 | 83 | An example of how processor modes manifest in a real architecture: on x86-64, the current privilege level (CPL) can be read from a register called `cs` (code segment).
Specifically, the CPL is contained in the two [least significant bits](https://en.wikipedia.org/wiki/Bit_numbering) of the `cs` register. Those two bits can store x86-64's four possible rings: ring 0 is kernel mode and ring 3 is user mode. Rings 1 and 2 are designed for running drivers but are only used by a handful of older niche operating systems. If the CPL bits are `11`, for example, the CPU is running in ring 3: user mode. 84 | 85 | ## What Even is a Syscall? 86 | 87 | Programs run in user mode because they can't be trusted with full access to the computer. User mode does its job, preventing access to most of the computer — but programs need to be able to access I/O, allocate memory, and interact with the operating system *somehow*! To do so, software running in user mode has to ask the operating system kernel for help. The OS can then implement its own security protections to prevent programs from doing anything malicious. 88 | 89 | If you've ever written code that interacts with the OS, you'll probably recognize functions like `open`, `read`, `fork`, and `exit`. Below a couple of layers of abstraction, these functions all use *system calls* to ask the OS for help. A system call is a special procedure that lets a program start a transition from user space to kernel space, jumping from the program's code into OS code. 90 | 91 | User space to kernel space control transfers are accomplished using a processor feature called [*software interrupts*](https://en.wikipedia.org/wiki/Interrupt#Software_interrupts): 92 | 93 | 1. During the boot process, the operating system stores a table called an [*interrupt vector table*](https://en.wikipedia.org/wiki/Interrupt_vector_table) (IVT; x86-64 calls this the [interrupt descriptor table](https://en.wikipedia.org/wiki/Interrupt_descriptor_table)) in RAM and registers it with the CPU. The IVT maps interrupt numbers to handler code pointers. 94 | 95 | An image of a table captioned "Interrupt Vector Table".
The first column, labeled with a number sign, has a series of numbers starting at 01 and going to 04. The corresponding second column of the table, labeled "Handler Address", contains a random 8-byte-long hex number per entry. The bottom of the table has the text "So on and such forth..." 96 | 97 | 2. Then, userland programs can use an instruction like [INT](https://www.felixcloutier.com/x86/intn:into:int3:int1) which tells the processor to look up the given interrupt number in the IVT, switch to kernel mode, and then jump the instruction pointer to the memory address stored in the IVT. 98 | 99 | When this kernel code finishes, it uses an instruction like [IRET](https://www.felixcloutier.com/x86/iret:iretd:iretq) to tell the CPU to switch back to user mode and return the instruction pointer to where it was when the interrupt was triggered. 100 | 101 | (If you were curious, the interrupt ID used for system calls on Linux is `0x80`. You can read a list of Linux system calls on [Michael Kerrisk's online manpage directory](https://man7.org/linux/man-pages/man2/syscalls.2.html).) 102 | 103 | ### Wrapper APIs: Abstracting Away Interrupts 104 | 105 | Here's what we know so far about system calls: 106 | 107 | - User mode programs can't access I/O or memory directly. They have to ask the OS for help interacting with the outside world. 108 | - Programs can delegate control to the OS with special machine code instructions like INT and IRET. 109 | - Programs can't directly switch privilege levels; software interrupts are safe because the processor has been preconfigured *by the OS* with where in the OS code to jump to. The interrupt vector table can only be configured from kernel mode. 110 | 111 | Programs need to pass data to the operating system when triggering a syscall; the OS needs to know which specific system call to execute alongside any data the syscall itself needs, for example, what filename to open. 
The mechanism for passing this data varies by operating system and architecture, but it's usually done by placing data in certain registers or on the stack before triggering the interrupt. 112 | 113 | The variance in how system calls are called across devices means it would be wildly impractical for programmers to implement system calls themselves for every program. This would also mean operating systems couldn't change their interrupt handling for fear of breaking every program that was written to use the old system. Finally, we typically don't write programs in raw assembly anymore — programmers can't be expected to drop down to assembly any time they want to read a file or allocate memory. 114 | 115 | A drawing captioned "System calls are implemented differently across architectures." On the left is a smiling CPU receiving some binary and spitting out a filename, file.txt. Separated on the right is a different CPU receiving the same binary data but with a confused and nauseous facial expression. 116 | 117 | So, operating systems provide an abstraction layer on top of these interrupts. Reusable higher-level library functions that wrap the necessary assembly instructions are provided by [libc](https://www.gnu.org/software/libc/) on Unix-like systems and part of a library called [ntdll.dll](https://learn.microsoft.com/en-us/windows-hardware/drivers/kernel/libraries-and-headers) on Windows. Calls to these library functions themselves don't cause switches to kernel mode, they're just standard function calls. Inside the libraries, assembly code does actually transfer control to the kernel, and is a lot more platform-dependent than the wrapping library subroutine. 118 | 119 | When you call `exit(1)` from C running on a Unix-like system, that function is internally running machine code to trigger an interrupt, after placing the system call's opcode and arguments in the right registers/stack/whatever. Computers are so cool! 
120 | 121 | ## The Need for Speed / Let's Get CISC-y 122 | 123 | Many [CISC](https://en.wikipedia.org/wiki/Complex_instruction_set_computer) architectures like x86-64 contain instructions designed for system calls, created due to the prevalence of the system call paradigm. 124 | 125 | Intel and AMD managed not to coordinate very well on x86-64; it actually has *two* sets of optimized system call instructions. [SYSCALL](https://www.felixcloutier.com/x86/syscall.html) and [SYSENTER](https://www.felixcloutier.com/x86/sysenter) are optimized alternatives to instructions like `INT 0x80`. Their corresponding return instructions, [SYSRET](https://www.felixcloutier.com/x86/sysret.html) and [SYSEXIT](https://www.felixcloutier.com/x86/sysexit), are designed to transition quickly back to user space and resume program code. 126 | 127 | (AMD and Intel processors have slightly different compatibility with these instructions. `SYSCALL` is generally the best option for 64-bit programs, while `SYSENTER` has better support with 32-bit programs.) 128 | 129 | Representative of the style, [RISC](https://en.wikipedia.org/wiki/Reduced_instruction_set_computer) architectures tend not to have such special instructions. AArch64, the RISC architecture Apple Silicon is based on, uses only [one interrupt instruction](https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SVC--Supervisor-Call-) for syscalls and software interrupts alike. I think Mac users are doing fine :) 130 | 131 | --- 132 | 133 | Whew, that was a lot! Let's do a brief recap: 134 | 135 | - Processors execute instructions in an infinite fetch-execute loop and don't have any concept of operating systems or programs. The processor's mode, usually stored in a register, determines what instructions may be executed. Operating system code runs in kernel mode and switches to user mode to run programs. 
136 | - To run a binary, the operating system switches to user mode and points the processor to the code's entry point in RAM. Because they only have the privileges of user mode, programs that want to interact with the world need to jump to OS code for help. System calls are a standardized way for programs to switch from user mode to kernel mode and into OS code. 137 | - Programs typically use these syscalls by calling shared library functions. These wrap machine code for either software interrupts or architecture-specific syscall instructions that transfer control to the OS kernel and switch rings. The kernel does its business and switches back to user mode and returns to the program code. 138 | 139 | Let’s figure out how to answer my first question from earlier: 140 | 141 | > If the CPU doesn't keep track of more than one process and just executes instruction after instruction, why doesn't it get stuck inside whatever program it's running? How can multiple programs run at once? 142 | 143 | The answer to this, my dear friend, is also the answer to why Coldplay is so popular... clocks! (Well, technically timers. I just wanted to shoehorn that joke in.) 144 | -------------------------------------------------------------------------------- /src/content/chapters/6-lets-talk-about-forks-and-cows.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | chapter: 6 3 | title: Let's Talk About Forks and Cows 4 | shortname: Fork-Exec 5 | slug: lets-talk-about-forks-and-cows 6 | updatedAt: 2023-07-17T17:16:18.079Z 7 | --- 8 | 9 | import CodeBlock from '../../components/CodeBlock.astro' 10 | 11 | The final question: how did we get here? Where do the first processes come from? 12 | 13 | This article is almost done. We're on the final stretch. About to hit a home run. Moving on to greener pastures. 
And various other terrible idioms that mean you are a single *Length of Chapter 6* away from touching grass or whatever you do with your time when you aren't reading 15,000 word articles about CPU architecture. 14 | 15 | If `execve` starts a new program by replacing the current process, how do you start a new program separately, in a new process? This is a pretty important ability if you want to do multiple things on your computer; when you double-click an app to start it, the app opens separately while the program you were previously on continues running. 16 | 17 | The answer is another system call: `fork`, the system call fundamental to all multiprocessing. `fork` is quite simple, actually — it clones the current process and its memory, leaving the saved instruction pointer exactly where it is, and then allows both processes to proceed as usual. Without intervention, the programs continue to run independently from each other and all computation is doubled. 18 | 19 | The newly running process is referred to as the "child," with the process originally calling `fork` the "parent." Processes can call `fork` multiple times, thus having multiple children. Each child is numbered with a *process ID* (PID), starting with 1. 20 | 21 | Cluelessly doubling the same code is pretty useless, so `fork` returns a different value on the parent vs the child. On the parent, it returns the PID of the new child process, while on the child it returns 0. This makes it possible to do different work on the new process so that forking is actually helpful. 22 | 23 | 24 | ```c 25 | pid_t pid = fork(); 26 | 27 | // Code continues from this point as usual, but now across 28 | // two "identical" processes. 29 | // 30 | // Identical... except for the PID returned from fork! 31 | // 32 | // This is the only indicator to either program that they 33 | // are not one of a kind. 34 | 35 | if (pid == 0) { 36 | // We're in the child. 37 | // Do some computation and feed results to the parent! 
38 | } else { 39 | // We're in the parent. 40 | // Probably continue whatever we were doing before. 41 | } 42 | ``` 43 | 44 | 45 | Process forking can be a bit hard to wrap your head around. From this point on I will assume you've figured it out; if you have not, check out [this hideous-looking website](https://www.csl.mtu.edu/cs4411.ck/www/NOTES/process/fork/create.html) for a pretty good explainer. 46 | 47 | Anyways, Unix programs launch new programs by calling `fork` and then immediately running `execve` in the child process. This is called the *fork-exec pattern*. When you run a program, your computer executes code similar to the following: 48 | 49 | 50 | ```c 51 | pid_t pid = fork(); 52 | 53 | if (pid == 0) { 54 | // Immediately replace the child process with the new program. 55 | execve(...); 56 | } 57 | 58 | // Since we got here, the process didn't get replaced. We're in the parent! 59 | // Helpfully, we also now have the PID of the new child process in the PID 60 | // variable, if we ever need to kill it. 61 | 62 | // Parent program continues here... 63 | ``` 64 | 65 | 66 | ## Mooooo! 67 | 68 | You might've noticed that duplicating a process's memory only to immediately discard all of it when loading a different program sounds a bit inefficient. Luckily, we have an MMU. Duplicating data in physical memory is the slow part, not duplicating page tables, so we simply *don't* duplicate any RAM: we create a copy of the old process's page table for the new process and keep the mapping pointing to the same underlying physical memory. 69 | 70 | But the child process is supposed to be independent and isolated from the parent! It's not okay for the child to write to the parent's memory, or vice versa! 71 | 72 | Introducing *COW* (copy on write) pages. With COW pages, both processes read from the same physical addresses as long as they don't attempt to write to the memory. As soon as one of them tries to write to memory, that page is copied in RAM. 
COW pages allow both processes to have memory isolation without an upfront cost of cloning the entire memory space. This is why the fork-exec pattern is efficient; since none of the old process's memory is written to before loading a new binary, no memory copying is necessary. 73 | 74 | COW is implemented, like many fun things, with paging hacks and hardware interrupt handling. After `fork` clones the parent, it flags all of the pages of both processes as read-only. When a program writes to memory, the write fails because the memory is read-only. This triggers a segfault (the hardware interrupt kind) which is handled by the kernel. The kernel then duplicates the memory, updates the page to allow writing, and returns from the interrupt to reattempt the write. 75 | 76 | > *A: Knock, knock! 77 | > B: Who's there? 78 | > A: Interrupting cow. 79 | > B: Interrupting cow wh — 80 | > A: **MOOOOO!*** 81 | 82 | ## In the Beginning (Not Genesis 1:1) 83 | 84 | Every process on your computer was fork-execed by a parent program, except for one: the *init process*. The init process is set up manually, directly by the kernel. It is the first userland program to run and the last to be killed at shutdown. 85 | 86 | Want to see a cool instant blackscreen? If you're on macOS or Linux, save your work, open a terminal, and kill the init process (PID 1): 87 | 88 | 89 | ``` 90 | $ sudo kill 1 91 | ``` 92 | 93 | 94 | > *Author's note: knowledge about init processes, unfortunately, only applies to Unix-like systems like macOS and Linux. Most of what you learn from now on will not apply to understanding Windows, which has a very different kernel architecture.* 95 | > 96 | > *Just like the section on `execve`, I am explicitly addressing this — I could write another entire article on the NT kernel, but I am holding myself back from doing so. (For now.)* 97 | 98 | The init process is responsible for spawning all of the programs and services that make up your operating system.
Many of those, in turn, spawn their own services and programs. 99 | 100 | A tree of processes. The root node is labeled "init." All child nodes are unlabeled but implied to be spawned by the init process. 101 | 102 | Killing the init process kills all of its children and all of their children, shutting down your OS environment. 103 | 104 | ## Back to the Kernel 105 | 106 | We had a lot of fun looking at Linux kernel code [back in chapter 3](/how-to-run-a-program), so we're gonna do some more of that! This time we'll start with a look at how the kernel starts the init process. 107 | 108 | Your computer boots up in a sequence like the following: 109 | 110 | 1. The motherboard is bundled with a tiny piece of software that searches your connected disks for a program called a *bootloader*. It picks a bootloader, loads its machine code into RAM, and executes it. 111 | 112 | Keep in mind that we are not yet in the world of a running OS. Until the OS kernel starts an init process, multiprocessing and syscalls don’t really exist. In the pre-init context, "executing" a program means directly jumping to its machine code in RAM without expectation of return. 113 | 2. The bootloader is responsible for finding a kernel, loading it into RAM, and executing it. Some bootloaders, like [GRUB](https://www.gnu.org/software/grub/), are configurable and/or let you select between multiple operating systems. BootX and Windows Boot Manager are the built-in bootloaders of macOS and Windows, respectively. 114 | 3. The kernel is now running and begins a large routine of initialization tasks including setting up interrupt handlers, loading drivers, and creating the initial memory mapping. Finally, the kernel switches the privilege level to user mode and starts the init program. 115 | 4. We're finally in userland in an operating system! The init program begins running init scripts, starting services, and executing programs like the shell/UI. 
116 | 117 | ### Initializing Linux 118 | 119 | On Linux, the bulk of step 3 (kernel initialization) occurs in the `start_kernel` function in [init/main.c](https://github.com/torvalds/linux/blob/22b8cc3e78f5448b4c5df00303817a9137cd663f/init/main.c). This function is over 200 lines of calls to various other init functions, so I won't include [the whole thing](https://github.com/torvalds/linux/blob/22b8cc3e78f5448b4c5df00303817a9137cd663f/init/main.c#L880-L1091) in this article, but I do recommend scanning through it! At the end of `start_kernel` a function named `arch_call_rest_init` is called: 120 | 121 | 122 | ```c 123 | /* Do the rest non-__init'ed, we're now alive */ 124 | arch_call_rest_init(); 125 | ``` 126 | 127 | 128 | > **What does non-\_\_init'ed mean?** 129 | > 130 | > The `start_kernel` function is defined as `asmlinkage __visible void __init __no_sanitize_address start_kernel(void)`. The weird keywords like `__visible`, `__init`, and `__no_sanitize_address` are all C preprocessor macros used in the Linux kernel to add various code or behaviors to a function. 131 | > 132 | > In this case, `__init` is a macro that instructs the kernel to free the function and its data from memory as soon as the boot process is completed, simply to save space. 133 | > 134 | > How does it work? Without getting too deep into the weeds, the Linux kernel is itself packaged as an ELF file. The `__init` macro expands to `__section(".init.text")`, which is a compiler directive to place the code in a section called `.init.text` instead of the usual `.text` section. Other macros allow data and constants to be placed in special init sections as well, such as `__initdata` that expands to `__section(".init.data")`. 
135 | 136 | `arch_call_rest_init` is nothing but a wrapper function: 137 | 138 | 139 | ```c 140 | void __init __weak arch_call_rest_init(void) 141 | { 142 | rest_init(); 143 | } 144 | ``` 145 | 146 | 147 | The comment said "do the rest non-\_\_init'ed" because `rest_init` is not defined with the `__init` macro. This means it is not freed when cleaning up init memory: 148 | 149 | 150 | ```c 151 | noinline void __ref rest_init(void) 152 | { 153 | ``` 154 | 155 | 156 | `rest_init` now creates a thread for the init process: 157 | 158 | 159 | ```c 160 | /* 161 | * We need to spawn init first so that it obtains pid 1, however 162 | * the init task will end up wanting to create kthreads, which, if 163 | * we schedule it before we create kthreadd, will OOPS. 164 | */ 165 | pid = user_mode_thread(kernel_init, NULL, CLONE_FS); 166 | ``` 167 | 168 | 169 | The `kernel_init` parameter passed to `user_mode_thread` is a function that finishes some initialization tasks and then searches for a valid init program to execute it. This procedure starts with some basic setup tasks; I will skip through these for the most part, except for where `free_initmem` is called. This is where the kernel frees our `.init` sections! 170 | 171 | 172 | ```c 173 | free_initmem(); 174 | ``` 175 | 176 | 177 | Now the kernel can find a suitable init program to run: 178 | 179 | 180 | ```c 181 | /* 182 | * We try each of these until one succeeds. 183 | * 184 | * The Bourne shell can be used instead of init if we are 185 | * trying to recover a really broken machine. 
186 | */ 187 | if (execute_command) { 188 | ret = run_init_process(execute_command); 189 | if (!ret) 190 | return 0; 191 | panic("Requested init %s failed (error %d).", 192 | execute_command, ret); 193 | } 194 | 195 | if (CONFIG_DEFAULT_INIT[0] != '\0') { 196 | ret = run_init_process(CONFIG_DEFAULT_INIT); 197 | if (ret) 198 | pr_err("Default init %s failed (error %d)\n", 199 | CONFIG_DEFAULT_INIT, ret); 200 | else 201 | return 0; 202 | } 203 | 204 | if (!try_to_run_init_process("/sbin/init") || 205 | !try_to_run_init_process("/etc/init") || 206 | !try_to_run_init_process("/bin/init") || 207 | !try_to_run_init_process("/bin/sh")) 208 | return 0; 209 | 210 | panic("No working init found. Try passing init= option to kernel. " 211 | "See Linux Documentation/admin-guide/init.rst for guidance."); 212 | ``` 213 | 214 | 215 | On Linux, the init program is almost always located at or symbolic-linked to `/sbin/init`. Common inits include [systemd](https://systemd.io/) (which has an abnormally good website), [OpenRC](https://wiki.gentoo.org/wiki/OpenRC/openrc-init), and [runit](http://smarden.org/runit/). `kernel_init` will default to `/bin/sh` if it can't find anything else — and if it can't find `/bin/sh`, something is TERRIBLY wrong. 216 | 217 | *MacOS has an init program, too! It's called launchd and is located at `/sbin/launchd`. Try running that in a terminal to get yelled at for not being a kernel.* 218 | 219 | From this point on, we're at step 4 in the boot process: the init process is running in userland and begins launching various programs using the fork-exec pattern. 220 | 221 | ### Fork Memory Mapping 222 | 223 | I was curious how the Linux kernel remaps the bottom half of memory when forking processes, so I poked around a bit. [kernel/fork.c](https://github.com/torvalds/linux/blob/22b8cc3e78f5448b4c5df00303817a9137cd663f/kernel/fork.c) seems to contain most of the code for forking processes.
The start of that file helpfully pointed me to the right place to look: 224 | 225 | 226 | ```c 227 | /* 228 | * 'fork.c' contains the help-routines for the 'fork' system call 229 | * (see also entry.S and others). 230 | * Fork is rather simple, once you get the hang of it, but the memory 231 | * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' 232 | */ 233 | ``` 234 | 235 | 236 | It looks like this `copy_page_range` function takes some information about a memory mapping and copies the page tables. Quickly skimming through the functions it calls, this is also where pages are set to be read-only to make them COW pages. It checks whether it should do this by calling a function called `is_cow_mapping`. 237 | 238 | `is_cow_mapping` is defined back in [include/linux/mm.h](https://github.com/torvalds/linux/blob/22b8cc3e78f5448b4c5df00303817a9137cd663f/include/linux/mm.h), and returns true if the memory mapping has [flags](http://books.gigatux.nl/mirror/kerneldevelopment/0672327201/ch14lev1sec2.html) that indicate the memory is writeable and isn't shared between processes. Shared memory doesn't need to be COWed because it is designed to be shared. Admire the slightly incomprehensible bitmasking: 239 | 240 | 241 | ```c 242 | static inline bool is_cow_mapping(vm_flags_t flags) 243 | { 244 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 245 | } 246 | ``` 247 | 248 | 249 | Back in [kernel/fork.c](https://github.com/torvalds/linux/blob/22b8cc3e78f5448b4c5df00303817a9137cd663f/kernel/fork.c), doing a simple Command-F for `copy_page_range` yields one call from the `dup_mmap` function... which is in turn called by `dup_mm`... which is called by `copy_mm`... which is finally called by the massive `copy_process` function! `copy_process` is the core of the fork function, and, in a way, the centerpoint of how Unix systems execute programs — always copying and editing a template created for the first process at startup. 250 | 251 |