├── .github ├── ISSUE_TEMPLATE │ └── project-proposal.md └── workflows │ └── build.yaml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── config.toml ├── content ├── _index.md ├── blog │ ├── 2025-01-14-welcome.md │ ├── 2025-03-06-path-profiling.md │ ├── 2025-03-11-Alive-Peephole-Optimizations │ ├── 2025-03-11-efficient_ssa │ │ ├── image-27.png │ │ ├── image-28.png │ │ ├── image-29.png │ │ ├── image-30.png │ │ └── index.md │ ├── 2025-04-10-fancy-memory-management.md │ ├── 2025-04-16-unified-gc-theory.md │ ├── 2025-04-17-blog.md │ ├── 2025-04-24-super-optimization │ │ ├── egraph-a.png │ │ ├── egraph-d.png │ │ └── index.md │ ├── 2025-05-013-parallel-dataflow │ │ ├── averages_by_bmark_ReachingDefinitions_runtime.png │ │ ├── averages_by_bmark_ReachingDefinitions_runtime_par.png │ │ ├── averages_runtime.png │ │ ├── averages_runtime_par.png │ │ ├── index.md │ │ ├── violin_runtime_AvailableExpr.png │ │ ├── violin_runtime_LiveVariables.png │ │ └── violin_runtime_ReachingDefinitions.png │ ├── 2025-05-13-Final Project.txt │ ├── 2025-05-13-Final_Project │ │ ├── evaluation.png │ │ ├── graph_new.pdf │ │ ├── graph_new.png │ │ ├── graph_old.pdf │ │ └── graph_old.png │ ├── 2025-05-13-bril-mlir.md │ ├── 2025-05-13-bril2wasm.md │ ├── 2025-05-13-brilooped │ │ ├── index.md │ │ ├── log_scale_comparison.png │ │ ├── merge-nodes.png │ │ ├── relative_change.png │ │ ├── size_vs_change.png │ │ ├── speedup_comparison.png │ │ ├── top_improvements.png │ │ └── top_regressions.png │ ├── 2025-05-13-flat-bril │ │ ├── bench_results.png │ │ ├── heap_figure1.png │ │ ├── heap_figure2.png │ │ ├── index.md │ │ ├── stack_chart1.png │ │ └── stack_chart2.png │ ├── 2025-05-13-global-value-numbering │ │ ├── index.md │ │ └── plot.png │ ├── 2025-05-13-imperative-quantifier-elimination.md │ ├── 2025-05-13-polyhedral.md │ ├── 2025-05-13-superopt │ │ ├── canonical-example-after.svg │ │ ├── canonical-example-before.svg │ │ ├── canonical-example.svg │ │ ├── index.md │ │ ├── simple_2_output_before.svg │ │ └── simple_2_output_lvv_after.svg │ ├── 2025-05-13-zihan-ethan-project.md │ ├── 2025-05-14-bril-to-x86.md │ ├── 2025-05-bril2c.md │ └── _index.md ├── lesson │ ├── 1.md │ ├── 10.md │ ├── 11.md │ ├── 12.md │ ├── 13.md │ ├── 14.md │ ├── 2.md │ ├── 3.md │ ├── 4.md │ ├── 5.md │ ├── 6.md │ ├── 7.md │ ├── 8.md │ ├── 9.md │ └── _index.md ├── schedule.md ├── self-guided.md └── syllabus.md ├── data ├── blog_footer.md ├── content.toml ├── reading.toml └── schedule.toml ├── sass └── main.scss ├── static └── img │ ├── book.svg │ ├── calendar.svg │ ├── construction.gif │ ├── discussion.svg │ ├── favicon.ico │ ├── favicon152.png │ ├── rss.svg │ └── video.svg └── templates ├── base.html ├── blog.html ├── home.html ├── index.html ├── lesson.html ├── lessons.html ├── macros.html ├── page.html ├── post.html ├── rss.xml ├── schedule.html ├── section.html └── sg.html /.github/ISSUE_TEMPLATE/project-proposal.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Project Proposal 3 | about: Fill this out to propose your course project. 
4 | title: 'Project Proposal: [TITLE]' 5 | labels: 'proposal' 6 | --- 7 | 8 | **What will you do?** 9 | 10 | **How will you do it?** 11 | 12 | **How will you empirically measure success?** 13 | 14 | **Team members:** 15 | [@mention their GitHub usernames, if it's not just you] 16 | -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - 2025sp 5 | pull_request: 6 | branches: 7 | - 2025sp 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: 'Install Zola' 15 | run: sudo snap install --edge zola 16 | - name: 'Build Zola site' 17 | run: zola build --drafts 18 | - name: Upload artifact 19 | uses: actions/upload-artifact@v4 20 | with: 21 | name: site 22 | path: public 23 | deploy: 24 | needs: build 25 | runs-on: ubuntu-latest 26 | if: ${{github.event_name=='push' && github.ref=='refs/heads/2025sp'}} 27 | steps: 28 | - name: Tailscale 29 | uses: tailscale/github-action@v2 30 | with: 31 | oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }} 32 | oauth-secret: ${{ secrets.TS_OAUTH_SECRET }} 33 | tags: tag:ci 34 | - name: Download artifact 35 | uses: actions/download-artifact@v4 36 | with: 37 | name: site 38 | path: public 39 | - run: ls -R 40 | - name: rsync 41 | env: 42 | DEPLOY_HOST: cslinux.cs.cornell.edu 43 | DEPLOY_USER: als485 44 | DEPLOY_KEY: ${{ secrets.DEPLOY_KEY }} 45 | DEPLOY_KNOWN_HOSTS: ${{ secrets.DEPLOY_KNOWN_HOSTS }} 46 | DEPLOY_SRC: ./public/ 47 | DEPLOY_DEST: /courses/cs6120/2025sp/site 48 | run: | 49 | echo "$DEPLOY_KEY" > pk 50 | echo "$DEPLOY_KNOWN_HOSTS" > kh 51 | chmod 600 pk 52 | rsync --compress --recursive --checksum --itemize-changes --delete \ 53 | --perms --chmod=Du=rwx,Dgo=rx,Fu=rw,Fog=r \ 54 | -e "ssh -i pk -o 'UserKnownHostsFile kh'" \ 55 | $DEPLOY_SRC $DEPLOY_USER@$DEPLOY_HOST:$DEPLOY_DEST 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | public/ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2019-2023 Adrian Sampson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: site serve deploy 2 | 3 | # Clean build. 4 | site: 5 | rm -rf public 6 | zola build --drafts 7 | 8 | serve: 9 | zola serve --drafts 10 | 11 | # Deployment. 12 | RSYNCARGS := --compress --recursive --checksum --itemize-changes \ 13 | --delete -e ssh --perms --chmod=Du=rwx,Dgo=rx,Fu=rw,Fog=r 14 | DEST := cslinux:/courses/cs6120/2025sp/site 15 | deploy: site 16 | rsync $(RSYNCARGS) ./public/ $(DEST) 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | CS 6120 2 | ======= 3 | 4 | This is the [website][cs6120] for a new grad course at Cornell on compilers. 5 | It uses [Zola][]. 6 | 7 | [zola]: https://www.getzola.org 8 | [cs6120]: https://www.cs.cornell.edu/courses/cs6120/2025sp/ 9 | 10 | 11 | Adding Blog Posts 12 | ----------------- 13 | 14 | To add a blog post (which you must do for discussion leading and project reports), use a [pull request][pr]. 15 | 16 | You'll want to create a text file in the `content/blog/` directory with your new post. 17 | Use a filename like `YYYY-MM-DD-title.md`, where the date is the discussion day or the project deadline and the title is up to you. 18 | Include Zola-style "[TOML][] front matter" at the top, which looks like this: 19 | 20 | +++ 21 | title = "Welcome to CS 6120!" 22 | [extra] 23 | bio = """ 24 | Grace Hopper made the first compiler. [Adrian Sampson](https://www.cs.cornell.edu/~asampson/) is an associate professor of computer science, so that's pretty cool too I guess. 25 | """ 26 | [[extra.authors]] 27 | name = "Adrian Sampson" 28 | link = "https://www.cs.cornell.edu/~asampson/" # Links are optional. 29 | [[extra.authors]] 30 | name = "Grace Hopper" 31 | +++ 32 | 33 | List all the authors of your post. 34 | Include a link to your homepage if you have one, but it's optional. 35 | Also write a short bio for yourselves (using [Markdown][]), which will appear at the bottom of the post. 36 | Then, the rest of the text file is the Markdown text of your blog post. 37 | 38 | If you want to use math in your blog post, put `latex = true` in your `[extra]` section to enable [KaTeX][]. Then you can use `$\pi$` for inline math and `\[ e^{i\pi} + 1 = 0 \]` for display math. 39 | 40 | To include images or other resources in your post, make your post into a directory. 41 | That is, make a new directory called `YYYY-MM-DD-title` inside `content/blog/`. 42 | Then, put your text in a file called `index.md` inside that. 43 | Put your images in the same directory and refer to them with relative paths. 44 | See [the Zola docs on assets][zola-assets] for more details. 45 | 46 | You can preview your writing with any Markdown renderer. 47 | To see what it will look like when published, [install Zola][zola-install] and type `zola serve` to preview the entire site. 
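As a concrete example of that directory layout (the post and image names here are just placeholders):

    content/blog/2025-01-01-my-project/
    ├── index.md       # your post, with the TOML front matter at the top
    └── results.png    # referenced from index.md as ![results](results.png)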
48 | 49 | [pr]: https://help.github.com/en/articles/about-pull-requests 50 | [toml]: https://github.com/toml-lang/toml 51 | [markdown]: https://daringfireball.net/projects/markdown/ 52 | [zola-install]: https://www.getzola.org/documentation/getting-started/installation/ 53 | [zola-assets]: https://www.getzola.org/documentation/content/overview/#assets-colocation 54 | [katex]: https://katex.org 55 | -------------------------------------------------------------------------------- /config.toml: -------------------------------------------------------------------------------- 1 | base_url = "https://www.cs.cornell.edu/courses/cs6120/2025sp" 2 | compile_sass = true 3 | build_search_index = false 4 | 5 | generate_feeds = true 6 | 7 | title = "CS 6120" 8 | 9 | [markdown] 10 | smart_punctuation = true 11 | highlight_code = true 12 | highlight_theme = "subway-madrid" 13 | 14 | [extra] 15 | favicon = "/img/favicon.ico" 16 | bigicon = "/img/favicon152.png" 17 | 18 | [[extra.links]] 19 | name = "Zulip" 20 | url = "https://cs6120.zulipchat.com" 21 | 22 | [[extra.links]] 23 | name = "Discussions" 24 | url = "https://github.com/sampsyo/cs6120/discussions" 25 | -------------------------------------------------------------------------------- /content/_index.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Advanced Compilers" 3 | template = "home.html" 4 | +++ 5 | # CS 6120: Advanced Compilers 6 | 7 | **Instructor:** [Adrian Sampson][adrian] 8 | **When:** Tuesday and Thursday, 10:10–11:25am 9 | **Where:** [Hollister][] 206 10 | **Office hours:** by appointment (DM Adrian on [Zulip][] to find a time) 11 | 12 | Read & [subscribe][rss] to our [course blog][blog]! 13 | You can also clone the [source code][gh] for this course. 14 | While you're at it, check out the old blogs from [2019][blog2019], [2020][blog2020], [2022][blog2022], and [2023][blog2023]. 15 | The videos are also available on [Kaltura][] and [Box][box]. 16 | 17 | ## Self-Guided Version 18 | 19 | Anyone can follow along with [a self-guided version of the course][self-guided]. 20 | If you do, please fill out [this feedback survey][form] when you're done. 21 | 22 | [adrian]: https://www.cs.cornell.edu/~asampson/ 23 | [zulip]: https://cs6120.zulipchat.com 24 | [zoom]: https://www.cs.cornell.edu/courses/cs6120/2025sp/private/zoom.html 25 | [blog]: @/blog/_index.md 26 | [rss]: rss.xml 27 | [gh]: https://github.com/sampsyo/cs6120 28 | [blog2019]: https://www.cs.cornell.edu/courses/cs6120/2019fa/blog/ 29 | [blog2020]: https://www.cs.cornell.edu/courses/cs6120/2020fa/blog/ 30 | [blog2022]: https://www.cs.cornell.edu/courses/cs6120/2022sp/blog/ 31 | [blog2023]: https://www.cs.cornell.edu/courses/cs6120/2022fa/blog/ 32 | [kaltura]: https://vod.video.cornell.edu/channel/CS%2B6120/179754792 33 | [box]: https://cornell.box.com/s/wb3387ebfbte9btx3weekmc8nij5glep 34 | [lesson]: @/lesson/_index.md 35 | [self-guided]: @/self-guided.md 36 | [hollister]: https://www.fs.cornell.edu/fs/facinfo/fs_facilInfo.cfm?facil_cd=2046 37 | [form]: https://forms.gle/GuRiMa728DUvTbZQ7 38 | -------------------------------------------------------------------------------- /content/blog/2025-01-14-welcome.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Welcome to CS 6120!" 
3 | [[extra.authors]]
4 | name = "Adrian Sampson"
5 | link = "https://www.cs.cornell.edu/~asampson/"
6 | +++
7 | I'm incredibly excited to teach CS 6120, an experimental, open-source, hacking-oriented PhD-level course about compilers at Cornell!
8 | We're using a broad definition of "compilers" that covers all aspects of language implementation.
9 | 
10 | We'll use this course blog for paper discussion and project reports throughout the semester.
11 | 
--------------------------------------------------------------------------------
/content/blog/2025-03-11-Alive-Peephole-Optimizations:
--------------------------------------------------------------------------------
1 | +++
2 | title = "Alive: Provably Correct Peephole Optimizations"
3 | [extra]
4 | [[extra.authors]]
5 | name = "Annabel Baniak"
6 | [[extra.authors]]
7 | name = "Katherine Wu"
8 | [[extra.authors]]
9 | name = "Max Fan"
10 | [[extra.authors]]
11 | name = "Stephanie Ma"
12 | +++
13 | 
14 | ## Alive
15 | [“Provably Correct Peephole Optimizations with Alive”](https://dl.acm.org/doi/pdf/10.1145/2737924.2737965), by Lopes et al., introduces Alive, a verification framework and [Domain Specific Language](https://www.geeksforgeeks.org/domain-specific-languages-in-scala/) for expressing [peephole optimizations](https://www.geeksforgeeks.org/peephole-optimization-in-compiler-design/). As input, Alive takes an optimization expressed in this DSL, which covers a certain class of peephole optimizations. As output, Alive either verifies the optimization and provides LLVM code to perform it, or provides a counterexample that witnesses a miscompile. Alive was intended for an audience of LLVM developers, so the syntax of the DSL is designed to resemble LLVM as much as possible to maximize familiarity and thus usability.
16 | 
17 | One substantial challenge in modeling and verifying LLVM optimizations is the presence of undefined behavior; the essential aim of Alive is to codify and encode this undefined behavior so it can check that optimizations preserve correctness even in its presence. To do this, the paper runs through the typing rules of Alive and the undefined-behavior features of the tool that non-LLVM programmers may be unfamiliar with. LLVM supports both undefined values, which represent undefined behavior and let the compiler pick a convenient value for each appearance of *undef* to enable efficient optimizations, and poison values, which are similar to undefined values but differ in that a poison value only triggers undefined behavior when it is used by an instruction with side effects, and taints all subsequent operations.
18 | 
19 | These values allow the developers of Alive to codify correctness: an optimization is correct when, for a given source expression and the target expression that replaces it post-optimization, the target is defined whenever the source is defined, the target is poison-free whenever the source is poison-free, and, if the source is defined and poison-free, the source and the target agree on the result.
20 | 
21 | To carry out this verification procedure, [SMT solvers](https://ocamlpro.github.io/verification_for_dummies/smt/) are employed to check whether each given (source, target) pair meets the correctness conditions outlined above for undefined and poison values.
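As a rough illustration of the flavor of such a query (this is our own toy encoding using the Z3 solver's Python bindings, not Alive's actual one, which additionally models types, undef, poison, and preconditions), here is how one might check the classic rewrite of `mul %x, 2` into `shl %x, 1`:

```python
import z3

x = z3.BitVec("x", 8)    # an arbitrary 8-bit input value
src = x * 2              # source:  %r = mul i8 %x, 2
tgt = x << 1             # target:  %r = shl i8 %x, 1

s = z3.Solver()
s.add(src != tgt)        # ask for any input where source and target disagree
print(s.check())         # "unsat": no such input exists, so the
                         # rewrite is correct for every 8-bit value
```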
If an optimization fails the correctness check, then a counterexample is generated automatically by Alive as well, typically at a four- or eight-bit bitwidth, as those are the easiest for human developers to understand. Once an optimization has been proven correct, Alive can also automatically generate C++ code which can be used as an LLVM optimization pass.
22 | 
23 | The authors display the real-world usefulness of their tool by testing it on the LLVM InstCombine transformations, in which they found multiple bugs. The buggiest file in their testing suite on its own made up *six of forty-four*, or 14%, of the total errors found by Alive. They also tested their tool on proposed LLVM patches, successfully proving correctness of LLVM patches before the PR was accepted, or locating bugs in the code which could be fixed before the patch was accepted.
24 | 
25 | ## Follow-up work regarding Alive:
26 | There have been several variations of Alive following this paper, including floating-point support by [Alive-FP](https://people.cs.rutgers.edu/~santosh.nagarakatte/papers/alive-fp-sas16.pdf), and precondition inference for peephole optimizations expressed in Alive by [Alive-Infer](https://people.cs.rutgers.edu/~santosh.nagarakatte/papers/pldi2017-alive-infer.pdf).
27 | 
28 | Six years after the release of this paper, in 2021, Alive was updated to the tool [Alive2](https://dl.acm.org/doi/pdf/10.1145/3453483.3454030), which expands Alive to include loop handling (by partially unrolling loops) and goes beyond simple local optimizations to handle function calls as well. It also uses translation validation, which takes two IR files and checks whether the translation from one to the other is correct, so that an optimization can be tried on a test case before it is implemented. Alive2 also covers floats and aggregate types, which were missing from the implementation of the original Alive.
29 | 
30 | Alive2 has essentially replaced the original Alive in the LLVM development sphere, but it is still in use today, and the contributions of Alive were essential in facilitating its creation.
31 | 
32 | ## Criticisms/Concerns:
33 | The authors do a lot to justify the usefulness of their tool; the anecdote about finding bugs and then eventually verifying correctness in a real developer's actual LLVM patch is particularly effective. Reading the article, one can't help but feel bad for that poor embarrassed developer for having bugs caught in the first two revisions of their PR, but it is a convincing argument to use the tool; certainly from a software development perspective, you'd rather use Alive in-house and catch those bugs yourself, rather than send your patch out into the world and have someone else embarrass you by catching the bugs in your code!
34 | 
35 | We also think it makes a lot of sense to design Alive with LLVM in mind, to find an already thriving developer community and create a tool specifically for them, rather than create something radically new and different that people are then resistant to actually use in real life. Tools with a concrete audience in mind are usually better loved and more impactful overall than even the most groundbreaking tool that people resist using. One can argue that since Alive can be rather difficult to understand for people without an LLVM background, there is some loss in impact with broader audiences for the language, but this is not really a shortcoming.
36 | 
37 | However, there were a lot of concerns raised in the online discussion about both the scope and correctness of the DSL. Some argued that the scope of Alive, covering only simple peephole optimizations, was not sufficient to provide confidence in correctness. As such, there was a lot of interest in trying to expand a verification tool like Alive to more complex optimizations. However, through our discussion in class, we came to the conclusion that this kind of expansion was impossible, or at least prohibitively difficult to realistically achieve. And even from the start, there were some dissenting voices saying that focusing too much on changing Alive to tackle more complex optimizations was not relevant, or would have diminishing returns; that the evidence the paper presents about catching bugs in real-life LLVM programs using only Alive's peephole-optimization-focused process proves how useful even local optimization verification can be.
38 | 
39 | There were also some concerns about correctness in Alive's verification procedures, since they rely on the use of SMT solvers, which have [known bugs in them](https://testsmt.github.io/). However, again in in-class discussion, we couldn't come up with any better solution; to paraphrase Prof. Sampson in our in-class discussion, “It seems to me that SMT solvers are the worst solution, besides all the other solutions.”
40 | 
41 | Alive is designed to work closely alongside LLVM, but the authors do have a note acknowledging that, while the tool follows the semantics described in the documentation for LLVM, there could be human misunderstandings which create disparities between the actual semantics of LLVM and the semantics written in the documentation. While it is obviously not Alive's fault if LLVM's actual semantics diverge from its documentation, this does introduce another degree of uncertainty about the correctness of Alive.
42 | 
43 | Overall, while there were some concerns raised about Alive in the discussion posts, we concluded in class that these concerns were either impossible to solve or not that big of a deal; Alive is one tool in the compiler developer's toolbox, useful for debugging with its counterexample generation, and useful in that it provides a guarantee of some correctness. Even if it's not the magic bullet to automatically solve the problem of compiler correctness all in one go, it's still a useful tool and has been in use in the LLVM community since its release.
44 | 
45 | 
46 | ## Discussion Questions, derived from our in-class discussion:
47 | * Many compiler passes are not as simple as peephole optimizations, spanning basic blocks if not function calls. Alive does not support control flow analysis, but could a similar solver-based architecture be used to verify control-flow-modifying passes? How? (It may be helpful to refer to [Alive2](https://dl.acm.org/doi/pdf/10.1145/3453483.3454030) here)
48 | * To what extent can we trust SMT solvers to check correctness of program transformations? It seems Alive defers the proof obligation to the SMT solver, although it is well-known that SMT solvers have bugs — there's a [line of research](https://testsmt.github.io/) showing that popular SMT solvers have >500 soundness bugs. With this in mind, is it still ok to rely on SMT solvers? If not, how could SMT solvers be replaced? They do note at the end (and parts within) that SMT solvers are expensive and some of their constraints grow uncomfortably superlinearly.
49 | * It's good for the developers that Alive is designed to resemble LLVM as much as possible, but does it make it hard to design, or limit the expressiveness of, an automated verification tool like Alive in some way?
50 | * What's the best way to ensure correctness for sources with floats and aggregate types? Alive currently does not implement these. (Again, to answer this, it may be helpful to refer to external work, in this case [Alive-FP](https://people.cs.rutgers.edu/~santosh.nagarakatte/papers/alive-fp-sas16.pdf))
51 | 
52 | 
53 | 
54 | 
55 | 
--------------------------------------------------------------------------------
/content/blog/2025-03-11-efficient_ssa/image-27.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-03-11-efficient_ssa/image-27.png
--------------------------------------------------------------------------------
/content/blog/2025-03-11-efficient_ssa/image-28.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-03-11-efficient_ssa/image-28.png
--------------------------------------------------------------------------------
/content/blog/2025-03-11-efficient_ssa/image-29.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-03-11-efficient_ssa/image-29.png
--------------------------------------------------------------------------------
/content/blog/2025-03-11-efficient_ssa/image-30.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-03-11-efficient_ssa/image-30.png
--------------------------------------------------------------------------------
/content/blog/2025-03-11-efficient_ssa/index.md:
--------------------------------------------------------------------------------
1 | +++
2 | title = "Simple and Efficient Construction of Static Single Assignment Form"
3 | [[extra.authors]]
4 | name = "Dev Patel"
5 | [[extra.authors]]
6 | name = "Neel Patel"
7 | +++
8 | # Background
9 | 
10 | [Static single assignment (SSA) form](https://compilers.cs.uni-saarland.de/ssasem/talks/Kenneth.Zadeck.pdf) and the [first efficient conversion algorithm](http://www.cs.utexas.edu/~pingali/CS380C/2010/papers/ssaCytron.pdf) emerged in the 1980s. A program in SSA form has the property that every variable assignment has a unique name. It provides a convenient way of thinking about programs which makes some optimizations more efficient. Generally, compilers put programs into SSA form before performing optimizations.
11 | 
12 | 
13 | 
14 | The [first published algorithm](https://dl.acm.org/doi/pdf/10.1145/75277.75280) for SSA construction, written by Cytron et al., proceeds in two steps. The first places phi functions throughout the program, indicating ambiguities in assignments due to control flow. The second renames variables to ensure SSA's single assignment property is satisfied.
15 | Importantly, Cytron et al.'s algorithm takes the CFG representation of the program as input.
To create phi functions, it relies on calculation of the dominance frontier -- “the set of all CFG nodes Y such that X dominates a predecessor of Y but does not strictly dominate Y”.
16 | 
17 | 
18 | 
19 | # Contributions of [Braun et al.'s algorithm](https://c9x.me/compile/bib/braun13cc.pdf)
20 | 
21 | In “Simple and Efficient Construction of Static Single Assignment Form”, Braun et al. present an alternative algorithm for SSA construction. The main benefit of their algorithm is that it goes straight from the abstract syntax tree (AST) representation of a program to SSA form, and eschews calculation of some auxiliary data structures, like the dominance frontier.
22 | 
23 | The algorithm calculates phi nodes lazily using recursion. The main steps are (1) local value numbering to look up values defined in the same basic block and (2) global value numbering to recursively look up values defined in the predecessors of a basic block.
24 | 
25 | 
26 | ![alt text](image-27.png)
27 | 
28 | 
29 | ![alt text](image-28.png)
30 | 
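To make the recursive lookup concrete, here is a minimal Python sketch in the spirit of the paper's pseudocode. The `Block` and `Phi` classes and helper names are our own simplifications, and block sealing and trivial-phi removal are elided:

```python
class Block:
    def __init__(self, preds=()):
        self.preds = list(preds)          # predecessor blocks in the CFG

class Phi:
    def __init__(self, block):
        self.block = block
        self.operands = []                # filled in lazily, one per pred

current_def = {}                          # variable -> {block -> value}

def write_variable(var, block, value):
    current_def.setdefault(var, {})[block] = value

def read_variable(var, block):
    if block in current_def.get(var, {}):
        return current_def[var][block]    # (1) local value numbering
    return read_variable_recursive(var, block)

def read_variable_recursive(var, block):  # (2) global value numbering
    if len(block.preds) == 1:
        # A single predecessor: no phi is needed, keep looking upward.
        val = read_variable(var, block.preds[0])
    else:
        # A join point: place a phi, record it first to break cycles,
        # then fill in one operand per predecessor recursively.
        val = Phi(block)
        write_variable(var, block, val)
        for pred in block.preds:
            val.operands.append(read_variable(var, pred))
    write_variable(var, block, val)
    return val
```

In the full algorithm, blocks are also "sealed" once all their predecessors are known, which is what lets the construction run during a single pass over the AST.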
31 | 
32 | During global value numbering, trivial phi functions that just reference themselves and one other value are removed. The algorithm also enables local, on-the-fly optimizations such as constant folding, copy propagation, arithmetic simplification, and common subexpression elimination. If arbitrary control flow is possible (e.g., goto statements), strongly connected components of redundant phi functions (groups of phis that only reference each other and one other incoming definition from outside the group) are also removed.
33 | 
34 | # Discussion
35 | ### An efficient algorithm with nice properties
36 | It is often the case that constructing alternative program representations and data structures incurs significant overhead -- memory for the data structures must be allocated and the corresponding construction algorithm must be executed. By eliminating the construction of the dominance-related data structures, there is an opportunity for reducing the end-to-end compilation time.
37 | Moreover, the on-the-fly optimizations could be useful when compiling in time-constrained scenarios (e.g., just-in-time compilers), where there may not be time to run a separate pass to perform an optimization.
38 | Some compiler enthusiasts in CS6120 found the algorithm to be unique and thought the proofs of some of its properties were elegant. Specifically, we appreciated the use of recursion to calculate phi nodes lazily and the use of strongly connected components to prove that the algorithm minimized the number of phi functions placed throughout the SSA IR.
39 | 
40 | ### Potential for maintenance challenges
41 | Because Braun et al.'s algorithm goes straight from the AST to SSA form, the compiler's front-end may become more complex. Specifically, any modularity afforded by first converting a language-dependent AST into a CFG is lost. While this may be fine for compilers targeting a single language, compiler frameworks, like LLVM, implement front-ends for multiple languages. It may become burdensome for front-end developers to always have to implement Braun et al.'s algorithm. Some developers may prefer to transform the AST into a different IR, like a non-SSA CFG, and a framework that separates these concerns seems simpler to maintain.
42 | 
43 | ### Loss of data structures which may be useful for other passes
44 | Since Braun et al.'s algorithm does not rely on the dominance frontier or dominator tree, these data structures would not be created at SSA construction time. If later analyses or optimizations that use these data structures are performed, much of the speedup from Braun et al.'s algorithm may be lost to subsequent passes. For example, the "dominates" relation is used to determine whether it is safe to perform [loop-invariant code motion](https://www.cs.cornell.edu/courses/cs6120/2025sp/lesson/8/) and the dominator tree is used for [contification](https://dl.acm.org/doi/10.1145/507635.507639), which can expose function inlining opportunities.
45 | 
46 | ### Compile-time speedups? - Probably
47 | It certainly seems reasonable that a front-end based on Braun et al.'s algorithm could significantly speed up end-to-end compilation. However, Braun et al.'s implementation in LLVM 3.1 only executed 0.28% fewer instructions when compiling all programs in the SPEC CINT2000 suite. It is worth noting that their implementation was not as highly tuned as the baseline LLVM implementation they compared against.
48 | It is also worth noting that [some compilers and tools](#historical-context) have begun adopting Braun et al.'s algorithm for SSA conversion, lending credence to the authors' claim that direct AST-to-SSA conversion could provide non-negligible speedups.
49 | 
50 | # Historical Context
51 | SSA form and the first efficient conversion algorithm emerged in the 1980s, whereas the simple and efficient algorithm discussed in class was published in 2013. One question discussed in class was why Cytron et al.'s implementation has been the de facto SSA conversion scheme, used in the [LLVM](https://llvm.org/docs/LangRef.html) IR and other languages' compiler toolchains (e.g., [Rust](https://github.com/rust-lang/rust/blob/master/compiler/rustc_mir_transform/src/ssa.rs)). Its ~25-year head start is one likely reason.
52 | A few projects have adopted Braun et al.'s algorithm for SSA construction, suggesting that the algorithm may be gaining traction in the compiler community.
53 | For example, a [SPIR-V-Tools](https://github.com/KhronosGroup/SPIRV-Tools/blob/ada1771a9f7a125573aa94fe551fdc44b45769bd/source/opt/ssa_rewrite_pass.h#L40C3-L40C3) pass converts SPIR-V functions directly into SSA form using Braun et al.'s algorithm. The [Memory SSA analysis](https://github.com/llvm/llvm-project/blob/main/llvm/lib/Analysis/MemorySSAUpdater.cpp#L29), which builds an SSA-like representation for LLVM memory operations, also uses the marker algorithm presented by Braun et al. Also, the [MIR project](https://github.com/vnmakarov/mir), which focuses on building JIT compilers, currently uses Braun et al.'s algorithm for SSA construction.
54 | 
55 | Perhaps in spring of 2049, CS6120 students will be debating whether a new SSA construction algorithm will replace the marker algorithm in their favorite JIT compilers.
--------------------------------------------------------------------------------
/content/blog/2025-04-16-unified-gc-theory.md:
--------------------------------------------------------------------------------
1 | +++
2 | title = "Exploring the Merits of the Garbage Collection Spectrum"
3 | [extra]
4 | bio = """
5 | This week, we read [A Unified Theory of Garbage Collection](https://dl.acm.org/doi/10.1145/1028976.1028982) by Bacon et al. from OOPSLA 2004.
6 | To what extent is its conclusion, specifically that garbage collection algorithms lie on a spectrum between reference counting and tracing,
7 | practical, useful, or even true?
8 | """
9 | latex = true
10 | [[extra.authors]]
11 | name = "Ethan Gabizon"
12 | [[extra.authors]]
13 | name = "Ernest Ng"
14 | [[extra.authors]]
15 | name = "Parth Sarkar"
16 | +++
17 | 
18 | # Background
19 | 
20 | Garbage collection, a form of automatic memory management, frees programmers from needing to deallocate memory themselves.
21 | The two main garbage collection algorithms are tracing (also known as mark-and-sweep) and reference counting (RC). They each present distinct performance tradeoffs -- a tracing collector (e.g. the [Java Virtual Machine](https://stackoverflow.com/questions/65312024/why-are-jvm-garbage-collectors-trace-based)'s collector) will pause execution of the program to scan the entire heap at once, whereas a reference counting collector (e.g. [CPython](https://github.com/python/cpython/blob/main/InternalDocs/garbage_collector.md)'s collector) incurs shorter pause times by incrementally tracking objects. However, reference counting collectors require some extra machinery for tracking cycles.
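Before getting into the paper's argument, it may help to see the two algorithms side by side. The sketch below is our own toy illustration (not code from the paper), with the heap represented as an adjacency map; note that the reference-counting half simply leaks cycles, which is exactly the gap the paper goes on to formalize:

```python
def trace(roots, heap):
    """Tracing: start all counts at zero and increment them upward while
    walking forward from the roots; whatever is never reached is garbage."""
    counts = {obj: 0 for obj in heap}
    live = set(roots)
    work = list(roots)
    while work:
        for dst in heap.get(work.pop(), ()):
            counts[dst] = counts.get(dst, 0) + 1   # climb toward true count
            if dst not in live:
                live.add(dst)
                work.append(dst)
    return live

def collect_dead(anti_roots, heap, counts):
    """Reference counting: start from maintained (over)counts and decrement
    downward while walking forward from objects that just became dead."""
    work, dead = list(anti_roots), []
    while work:
        obj = work.pop()
        dead.append(obj)
        for dst in heap.get(obj, ()):
            counts[dst] -= 1
            if counts[dst] == 0:          # no remaining references
                work.append(dst)
    return dead                            # cyclic garbage never hits zero
```

Note how `trace` touches only live objects while `collect_dead` touches only dead ones, which is the "matter"/"anti-matter" framing developed below.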
22 | 
23 | The paper's central thesis is that tracing and RC, although traditionally viewed as being entirely distinct, are actually algorithmic duals. Moreover, the authors demonstrate how various (more sophisticated) GC strategies, such as generational garbage collection, can be viewed as hybrids of tracing and RC. The authors argue that this notion of duality allows one to "systematically" explore the
24 | design space for GCs and select the best GC strategy for a particular application.
25 | 
26 | # Contributions
27 | 
28 | ## Qualitative analysis
29 | By presenting tracing and RC as duals, the authors introduce a novel mental model for approaching garbage collectors.
30 | Specifically, tracing operates on live objects ("matter"), while RC operates on dead objects ("anti-matter"). Concretely, tracing initializes reference counts to 0 (an underestimate of the true count), *incrementing* them during graph traversal until they reach the true count. On the other hand, RC initializes reference counts to an overestimate of the true count, *decrementing* them during graph traversal until they reach the true count (ignoring cycles). (With RC, we start with an overestimate since we count in-edges from objects that are no longer live.)
31 | 
32 | In addition, the authors formulate garbage collection as a fix-point problem, and they demonstrate that
33 | tracing computes the *least* fix point, while RC computes the *greatest* fix point, with their set difference being cyclic garbage.
34 | 
35 | The authors also identify several key characteristics of each algorithm:
36 | | | Tracing | RC |
37 | | --- | --- | --- |
38 | | Starting point | Roots | Anti-roots |
39 | | Graph traversal | Forward from roots | Forward from anti-roots |
40 | | Objects traversed | Live | Dead |
41 | 
42 | With this in mind, the authors leave the reader with 3 considerations to keep in mind when designing a new GC algorithm:
43 | 1. **Partition**: Should memory be divided into different regions which are each subjected to (possibly different) strategies?
44 | 2. **Traversal**: For each partition, should tracing or RC be used?
45 | 3. **Trade-offs**: For each partition, decide how to handle space-time trade-offs
46 | 
47 | ## Hybrid collectors
48 | The authors demonstrate that in practice, various GC strategies lie on a continuum between tracing and RC.
49 | 
50 | For example, for collectors where there is a *unified heap* (i.e. a single heap in which all data resides), we have:
51 | - **Deferred RC (DRC)**: References from the stack to the heap are traced, while references within the heap are reference-counted
52 | - **Partial tracing**: Converse of DRC, i.e. reference-count roots and trace the heap
53 | 
54 | For generational GCs, where the heap is split into a nursery and a mature space, we have variations where:
55 | - both the nursery and mature space are traced (standard generational GC)
56 | - the nursery is RCed and the mature space is traced
57 | - the nursery is traced and the mature space is reference-counted (Ulterior Reference Counting)
58 | 
59 | ## Cost model
60 | The authors also present a formal cost model for comparing the performance characteristics of different collectors.
Specifically, they introduce:
61 | - $\kappa$, the time overhead for a single garbage collection
62 | - $\sigma$, the space overhead for a single garbage collection
63 | - $\phi$, the frequency of collection
64 | - $\mu$, the mutation overhead
65 | - $\tau$, the total time overhead for an entire program
66 | 
67 | Notably, the authors mention that these quantities represent "real costs with implied coefficients", as opposed to
68 | idealized big-Oh notation. Crucially, the authors claim that their cost model accounts for space-time tradeoffs and allows for a somewhat realistic, "apples-to-apples" comparison of different collectors' performance.
69 | 
70 | # Merits & Shortcomings
71 | Many people in class found the formulation of tracing and RC as algorithmic duals to be particularly elegant.
72 | Additionally, we found their classification of different GC strategies based on how they use tracing/RC on different regions to be a
73 | succinct way to highlight the conceptual differences between different non-trivial GC regimes.
74 | 
75 | Part of our discussion focused on two aspects of the authors' cost model: (1) its accuracy, and (2) its utility. Regarding (1), we decided we would have liked to see a quantitative analysis of the cost model with actual benchmarks. While it seems like an elegant abstraction, we can't know for sure if the introduced model accounts for all variations between collectors, or at least the important ones. Additionally, we were interested in the impact of the model's assumptions -- it treats the allocation rate and the garbage fraction of the heap as constants. We would have liked to know how accurate the cost model is in cases where these assumptions don't hold. Evaluating various hybrid collectors on a set of benchmarks with varying allocation rates and garbage fractions would have answered these questions.
76 | 
77 | Regarding (2), we wondered how useful an abstract cost model is given that programmers would likely benchmark their application with different collectors if they were really concerned about garbage collection as a performance bottleneck.
78 | 
79 | Both of these concerns make us doubtful about the practicality of the proposed cost model. While it is an elegant abstraction and we appreciated the way the authors used it to compare their collectors, it would have been nice to see quantitative support for their comparisons.
80 | 
81 | # Connections to state of the art
82 | An interesting thread of the discussion led some folks to the idea that the programmer
83 | could play a bigger role in garbage collection, instead of the more abstract interface
84 | that the mainstream paradigm provides. If a language implementation breaks
85 | down some of the existing abstractions, collection could be more "domain-specific",
86 | letting the compiler know which algorithms and hybrids to use on particular data structures
87 | or sections of memory, how often to collect, or generally providing useful compile-time
88 | guarantees.
89 | 
90 | A great example was the idea that standard library data structures could come
91 | with programmer-facing guarantees on the GC's behavior. In this language, the user
92 | might explicitly choose to use specific data structures because they have the
93 | guarantee of being reference-counted, for example, avoiding larger pauses.
94 | 95 | One other instance of breaking existing abstractions, this time relying slightly more on 96 | the programmer, is the idea that there should be a fundamental separation between 97 | "regions of memory" and "objects to free". In particular, the user should be able to 98 | operate on some allocated space with the guarantee that the collector will not 99 | collect objects in that space until directed by the programmer. For example, 100 | if the program is operating on a graph, and all nodes are constantly active, 101 | it would be wasteful to increment and decrement reference counts for every 102 | change in the graph (or, indeed, mark-and-sweeping every so often); instead, 103 | there could be pointers into and out of the _space as a whole_, clearly outlining 104 | when to free this large chunk of space. 105 | 106 | (Side note: a while after this point was brought up, it occurred to us that this 107 | is close to what a programmer would do to manually manage memory. It seems buggy 108 | to free individual nodes during processing, so they might just free the whole 109 | section when they're sure they've finished their computation. This goes back to 110 | a meta-discussion-point about how there's still GC research to be done to fill the gap 111 | between automatic, GC-managed memory and manual memory management.) 112 | -------------------------------------------------------------------------------- /content/blog/2025-04-17-blog.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "SELF love: The origins of dynamic compilers" 3 | [extra] 4 | bio = """ 5 | Ananya Goenka is an undergraduate studying CS at Cornell. While she isn’t nerding out over programming languages, or weird ISA quirks, she’s writing for Creme de Cornell, or getting way too invested in obscure books. 6 | 7 | Serena Duncan is an undergraduate studying CS at Cornell. When she isn't crying over Rust, she can be found taking long walks around campus and performing with the Shakespeare Troupe! 8 | """ 9 | [[extra.authors]] 10 | name = "Ananya Goenka" 11 | 12 | [[extra.authors]] 13 | name = "Serena Duncan" 14 | +++ 15 | 16 | # Background 17 | In the late 1980s, dynamic languages were admired for their expressive power yet dismissed as impractical for performance-critical work. The fastest Smalltalk runtimes of the day still ran an order of magnitude slower than optimized C, leaving object-oriented enthusiasts to choose between elegance and efficiency. It is in this landscape that Chambers, Ungar, and Lee introduced SELF—a language that eliminates classes entirely, treats every state access as a message send, and relies on prototypes for inheritance. Their 1989 OOPSLA paper, [An Efficient Implementation of SELF, a Dynamically-Typed Object-Oriented Language Based on Prototypes](http://portal.acm.org/citation.cfm?id=74884), is not a language manifesto so much as a bold engineering claim: with the right virtual-machine architecture, these radical semantics need not carry a performance tax. 18 | 19 | # Highlights 20 | 21 | The authors begin with a discussion of downsides to typical implementations of SELF. First, the object prototype model, while it allows incredible flexibility, drastically increases the storage required for any SELF program. Secondly, message passing induces large penalties on runtime, as searching for a matching object slot can be very costly due to SELF’s multiple inheritance. 
The authors attack these twin costs of space and time with a set of mutually reinforcing techniques:
- **Maps.** Maps provide implicit "hidden classes" for prototypes. Instead of storing slot names in every object, the VM factors layout and constant slots into a shared map. A Cartesian point shrinks from ten machine words to three, matching the compactness of class-based objects without reviving classes.
23 | 
24 | - **Segregated heap for faster scanning.** Byte arrays live in their own region, so the garbage collector can sweep ordinary objects word-by-word without pausing to distinguish raw bytes from tagged pointers. This allowed SELF to scan memory at nearly double the rate of the fastest Smalltalk-80 implementation.
25 | 
26 | - **Customized compilation.** At first send, SELF compiles a receiver-specific native method, locking in the receiver's map and propagating that static knowledge through the body of the method.
27 | 
- **Message splitting and static type anticipation.** When control-flow merges blur types, the compiler clones hot call-sites, keeps a "likely" branch where the receiver's type is known, and leaves a slower fallback branch for rarer cases. The same mechanism lets the compiler anticipate that "+" or "<" will probably see integers and inlines those primitives aggressively.
28 | 
29 | - **Zero-friction tooling.** Each compiled method records its dependents; thus, a slot update invalidates only the affected machine code, leaving the rest of the system untouched. Rich metadata lets the debugger reconstruct inlined frames so developers see a clean, source-level stack even after heavy optimization.
30 | 
31 | Taken together, these ideas deliver a dynamic language that is only four to five times slower than C on the same hardware — a leap forward at the time — and lay the conceptual groundwork for the hidden-class JITs that now power JavaScript, Ruby, and more.
32 | 
33 | # Analysis
34 | 
35 | In the more than 30 years since this paper was published, there have been significant advances in the world of fast, efficient JIT compilers. Many of these developments stemmed directly from the groundwork laid in this paper, giving rise to the JIT compilers we know today - including the Java JIT compiler. After this paper was published, the described VM morphed into the [Strongtalk](https://www.strongtalk.org) engine which, after Sun bought Animorphic Systems in 1997, became the core of Java's HotSpot VM. HotSpot still identifies "hot" call-sites, speculatively inlines them, and rolls back if reality diverges — precisely the speculative compilation playbook SELF introduced.
36 | 
37 | Beyond the HotSpot VM, there are many other patterns in modern JIT compilers that can be attributed to this paper.
38 | 
39 | The idea of utilizing maps to provide hidden classes lives on inside every modern JavaScript engine. V8, SpiderMonkey, and Chakra assign each object a lightweight shape (often literally named Map) so that property look-ups devolve to a pointer compare plus a constant offset. Chrome's V8 documentation even cites maps as the cornerstone of its object model.
40 | 
41 | This paper's monomorphic inline cache cut the cost of a send to a single indirect jump. A few years later, the same group extended the idea to polymorphic inline caches, now standard in V8, HotSpot, PyPy, and Graal. The core intuition—let the first few executions specialize the call-site—remains a workhorse optimization.
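The mechanics behind both ideas are easy to sketch. The toy Python model below (our own simplification, not SELF's or V8's actual representation) shows why a map check plus a fixed offset can replace a per-object dictionary lookup:

```python
class Map:
    """Shared layout descriptor: one per object shape, not per object."""
    def __init__(self, slot_names):
        self.offsets = {n: i for i, n in enumerate(slot_names)}

class Obj:
    def __init__(self, map_, values):
        self.map = map_                 # which layout this object has
        self.slots = list(values)       # compact storage, no slot names

def cached_load(obj, cache, name):
    """A monomorphic inline cache for one property-access site."""
    cached_map, offset = cache
    if obj.map is cached_map:           # hit: one compare plus one index
        return obj.slots[offset]
    offset = obj.map.offsets[name]      # miss: full lookup; a real inline
    return obj.slots[offset]            # cache would patch the site with
                                        # (obj.map, offset) for next time

point_map = Map(("x", "y"))             # shared by every point-shaped object
p = Obj(point_map, (3, 4))
print(cached_load(p, (point_map, 1), "y"))   # -> 4
```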
42 | 
43 | Other ideas that are still used today include: speculative compilation, as profile-guided inlining with cheap de-optimizations fuels every tiered JIT pipeline today; code invalidation via dependency lists, which enables the edit-refresh cycles developers now take for granted; and object shape sharing, which is critical for memory as programs continue to get larger (for example: Chrome routinely juggles millions of JS objects; sharing one descriptor per shape tames this footprint).
44 | 
45 | However, that is not to say that modern JIT compilers are perfect. Indeed, many of the downsides from SELF's implementation continue to be an issue today. Most notable is the extra startup latency and overhead that JIT compilers incur. Customized machine code multiplies binary size, thereby increasing startup time. On mobile, cold-start budgets are measured in tens of milliseconds, prompting projects (such as V8's [Sparkplug](https://v8.dev/blog/sparkplug) baseline) to curb SELF-style mega-methods. Furthermore, the rich PC-to-byte-code maps that make de-optimization and on-the-fly debugging possible also swell metadata. That footprint is incurred even for users who never open DevTools. Finally, prototype mutation remains a fertile ground for attacks such as [prototype pollution](https://www.netspi.com/blog/technical-blog/web-application-pentesting/ultimate-guide-to-prototype-pollution/). Static languages dodge whole classes of such issues by construction.
46 | 
47 | ## Looking towards the future
48 | 
49 | The era of web development was characterized by a constant tug-of-war between static and dynamic compilers. During the 2000s and 2010s, developers embraced the "let the JIT sort it out" mindset and overwhelmingly used JavaScript. Today, the majority of developers are switching to TypeScript, which layers a static type system on top of JavaScript. Yet despite this top-level switch in language, at runtime those .ts files are converted to .js files and still funnel into the same hidden-class JS JIT compiler that SELF inspired. The major languages of the 2020s therefore inhabit a middle ground: dynamic under the hood, statically flavored at the surface. That compromise echoes the paper's own wager that runtime type knowledge, harnessed smartly, can rival compile-time guarantees.
50 | 
51 | Recently, ARM's big-little cores, WebAssembly's rise, and the carbon cost of always-on optimisation are reopening the question of whether speculation is worth its wattage. Yet the conceptual toolkit SELF gifted — inline caches, speculative inlining, object shapes, and more — keeps resurfacing. Though today's dynamic languages and JIT compilers look very different, the DNA is still recognisably SELF.
52 | 
53 | 
54 | # Discussion Questions:
55 | 1. Section 6 shows SELF's compiler emitting scope descriptions and a bidirectional PC↔byte‑code map so the debugger can rebuild inlined stack frames and update them after hot re‑compilation. This metadata adds memory overhead and tooling complexity—problems today's HotSpot safepoints and V8's de‑opt paths still wrestle with. Is heavyweight JIT‑aware debugging worth the complexity, or is line‑by‑line interpretation enough?
56 | 2. One of the main benefits of SELF is that it enables a lot more flexibility and expressiveness.
However, the prototype-based system also enables new security bugs, such as [prototype pollution](https://www.netspi.com/blog/technical-blog/web-application-pentesting/ultimate-guide-to-prototype-pollution/), and exhibits generally slower performance than static languages (as evidenced by this paper). Where should language designers draw the line between prototype flexibility and performance/security? When would a prototype language be worth these tradeoffs? 57 | 3. SELF’s compiler emits a new machine‑code body for each receiver map but the evaluation focuses only on execution time. Modern systems hit instruction cache limits (Android dex, WebAssembly bundle sizes). Customized compilation duplicates code; when does code‑size blow‑up outweigh speed? Have processors’ larger caches and better branch predictors made code bloat less worrisome? 58 | 4. The authors argue that, with aggressive JITing, “runtime type information is just as good as static type information” for performance. Since 1989, we’ve seen gradual‐typing hybrids (TypeScript, Python mypy), powerful inference (Rust, Swift) and optional dynamic escape hatches in C#. Will the equilibrium keep favoring mixed static/dynamic systems, or will one paradigm “win” the way static scoping did? -------------------------------------------------------------------------------- /content/blog/2025-04-24-super-optimization/egraph-a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-04-24-super-optimization/egraph-a.png -------------------------------------------------------------------------------- /content/blog/2025-04-24-super-optimization/egraph-d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-04-24-super-optimization/egraph-d.png -------------------------------------------------------------------------------- /content/blog/2025-05-013-parallel-dataflow/averages_by_bmark_ReachingDefinitions_runtime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-013-parallel-dataflow/averages_by_bmark_ReachingDefinitions_runtime.png -------------------------------------------------------------------------------- /content/blog/2025-05-013-parallel-dataflow/averages_by_bmark_ReachingDefinitions_runtime_par.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-013-parallel-dataflow/averages_by_bmark_ReachingDefinitions_runtime_par.png -------------------------------------------------------------------------------- /content/blog/2025-05-013-parallel-dataflow/averages_runtime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-013-parallel-dataflow/averages_runtime.png -------------------------------------------------------------------------------- /content/blog/2025-05-013-parallel-dataflow/averages_runtime_par.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-013-parallel-dataflow/averages_runtime_par.png
--------------------------------------------------------------------------------
/content/blog/2025-05-013-parallel-dataflow/violin_runtime_AvailableExpr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-013-parallel-dataflow/violin_runtime_AvailableExpr.png
--------------------------------------------------------------------------------
/content/blog/2025-05-013-parallel-dataflow/violin_runtime_LiveVariables.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-013-parallel-dataflow/violin_runtime_LiveVariables.png
--------------------------------------------------------------------------------
/content/blog/2025-05-013-parallel-dataflow/violin_runtime_ReachingDefinitions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-013-parallel-dataflow/violin_runtime_ReachingDefinitions.png
--------------------------------------------------------------------------------
/content/blog/2025-05-13-Final Project.txt:
--------------------------------------------------------------------------------
1 | +++
2 | title = "Optimizing code with trigonometric functions"
3 | [extra]
4 | bio = "Mariia Soroka is a second-year PhD student at Cornell working on differentiable rendering."
5 | latex = true
6 | [[extra.authors]]
7 | name = "Mariia Soroka"
8 | +++
9 | My project was based on the Dr.Jit codebase. Here is the [paper](https://dl.acm.org/doi/10.1145/3528223.3530099) that describes the compiler. In short, Dr.Jit traces the program to compute an AST, performs some optimizations on this representation, then assembles either LLVM IR or PTX code depending on the backend used, and finally compiles it into a kernel. The LLVM IR or PTX code assembly is explicit in the sense that each Dr.Jit operation comes with a rule on how it should be translated to LLVM or PTX code. For example, `dr.sum` is mapped to LLVM using the following rule
10 | ```
11 | case VarKind::Add:
12 | fmt(jitc_is_float(v) ? " $v = fadd $V, $v\n"
13 | : " $v = add $V, $v\n",
14 | v, a0, a1);
15 | break;
16 | ```
17 | 
18 | My first goal was to familiarize myself with the internals of the codebase by implementing a simple optimization: replacing computation of $\sin^2(x)$ with $1 - \cos^2(x)$, if $\cos(x)$ is already computed elsewhere in the program.
19 | 
20 | To accomplish this, I made modifications to both the [drjit](https://github.com/mariasoroka/drjit_CS6120) and [drjit-core](https://github.com/mariasoroka/drjit-core_CS6120) codebases. The implementation is straightforward: I examine the trace to identify the nodes where the optimization is applicable and edit the trace accordingly. It was not trivial, however, to navigate the codebase and to figure out which functions to use. For example, at first, I did not decrement the reference counters correctly, which led to variable leaks. This was resolved by using the appropriate function for reference counters.
21 | 
22 | 
I first verified that my implementation is correct on several hand-crafted programs.
Here is an example of one of them: 24 | ``` 25 | w = Float(1, 2) 26 | dr.make_opaque(w) 27 | cos = dr.cos(w) 28 | tmp_1 = 1 + cos**2 29 | sin = dr.sin(w) 30 | tmp_2 = 1 + dr.sin(w)**2 31 | tmp_3 = tmp_1 - tmp_2 32 | ``` 33 | 34 | Below is the trace of this code snippet before and after the modification. Clearly, the optimization worked, and $\sin^2$ is now computed as $1 - \cos^2$. Note that the $\sin$ node still appears in the trace because $\sin$ was also created as a separate variable. 35 | 36 | 37 | *(figure: the trace before the optimization)* 38 | *(figure: the trace after the optimization)* 39 | 40 | To better test the optimization and evaluate the performance, I planned to render the three scenes shown in Fig. 6 of the Dr.Jit paper. However, I noticed that during rendering, the optimization was never invoked. To address this, I modified the renderer code to make it less efficient, ensuring that there would be nodes to which the optimization could be applied. I changed the way the microfacet normal distribution function is evaluated. The expression that should be computed is as follows: 41 | $$D(\mathbf{m}) = \frac{1}{\pi \alpha_x \alpha_y (\sin^2\theta (\frac{\cos^2\phi}{\alpha_x^2} + \frac{\sin^2\phi}{\alpha_y^2}) + \cos^2\theta)^2}, \ (1)$$ 42 | where $\mathbf{m}$ is a unit vector: $\mathbf{m} = (\sin\theta \cos\phi, \sin\theta \sin\phi, \cos\theta)^T = (m_x, m_y, m_z)^T$. 43 | 44 | $D(\mathbf{m})$ can be efficiently evaluated without explicitly computing $\theta$ and $\phi$: 45 | 46 | $$D(\mathbf{m}) = \frac{1}{\pi \alpha_x \alpha_y ((\frac{m_x^2}{\alpha_x^2} + \frac{m_y^2}{\alpha_y^2}) + m_z^2)^2}. \ (2)$$ 47 | 48 | This computation, however, does not trigger my optimization. Therefore, I replaced (2) in the Mitsuba 3 code with a computation that explicitly evaluates $\theta$ and $\phi$ from the vector $\mathbf{m}$ and then computes expression (1). 49 | 50 | 51 | I performed my experiment on a machine with an Intel(R) Xeon(R) Silver 4214 CPU running Ubuntu 20.04 and an NVIDIA GeForce RTX 3090 GPU. I used Mitsuba 3 (version 3.6.4) and a modified Dr.Jit v1.0.5 to render all the scenes. Each experiment was run five times to get the average execution times reported below. Each rendering used a relatively large number of samples per pixel (2048 spp) to ensure that the measured time is reliable. 52 | 53 | *(figure: measured rendering times with and without the optimization)* 54 | 55 | Well, my optimization did not improve the performance, but at least I know that the trace modification was correct, since the produced images were identical. 56 | 57 | The second part of the project was much less straightforward, and I was not able to figure it out. As described in [the project proposal](https://github.com/sampsyo/cs6120/issues/514#issue-2958547735), the idea was to trace functions that lack hardware support and cannot be represented by a single node into a separate trace, and then redirect the main AST to that newly created trace. I was unable to find a way to achieve this using the existing tools in the codebase. Implementing this optimization would require introducing a new type of node (e.g., `call`) and writing the corresponding PTX or LLVM IR code to support it.
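To give a flavor of what this might involve, here is a hypothetical lowering rule written in the style of the `VarKind::Add` rule shown earlier. The `VarKind::Call` tag, the format string, and the callee handling are all assumptions for illustration, not actual Dr.Jit code:

```
case VarKind::Call:
    // Hypothetical sketch only: emit an LLVM call into the separately
    // traced function; the callee symbol would live on the new node type.
    fmt(" $v = call @sub_trace($v)\n", v, a0);
    break;
```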
58 | -------------------------------------------------------------------------------- /content/blog/2025-05-13-Final_Project/evaluation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-Final_Project/evaluation.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-Final_Project/graph_new.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-Final_Project/graph_new.pdf -------------------------------------------------------------------------------- /content/blog/2025-05-13-Final_Project/graph_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-Final_Project/graph_new.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-Final_Project/graph_old.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-Final_Project/graph_old.pdf -------------------------------------------------------------------------------- /content/blog/2025-05-13-Final_Project/graph_old.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-Final_Project/graph_old.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-bril2wasm.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "It's About (wasm)Time: A Bril to Wasm Translator" 3 | [extra] 4 | bio = """ 5 | Annabel Baniak is a Master of Science student studying CS at Cornell. In her free time, she enjoys participating in theatre and collecting antique books! 6 | 7 | Serena Duncan is an undergraduate studying CS at Cornell. When she isn't crying over Rust, she can be found taking long walks around campus and performing with the Shakespeare Troupe! 8 | 9 | Michael Xing is a Master of Science student studying CS at Cornell and a Software Engineer at Microsoft. When not writing code, he enjoys travelling the world, riding roller coasters, and playing the piano. And video games. 10 | """ 11 | [[extra.authors]] 12 | name = "Annabel Baniak" 13 | 14 | [[extra.authors]] 15 | name = "Michael Xing" 16 | 17 | [[extra.authors]] 18 | name = "Serena Duncan" 19 | +++ 20 | 21 | # Project Goal 22 | Our goal was to build a compiler from Bril to WebAssembly. To do this, we collaborated with another group consisting of Gerardo Teruel and Dev Patel, who worked on the “Brilooped” project; this project took Bril, a CFG-based language with unstructured control flow, and turned it into a language with structured control flow by creating a relooper for Bril. This means that all the branch and jump statements in Bril were replaced by while/continue/break statements.
After agreeing on a shared syntax for the output of their project as the input of ours, we were able to take their project as finished and compile the Brilooped syntax to WebAssembly, which also uses structured control flow. 23 | 24 | # Implementation 25 | We built a tool to compile Bril programs into wasm, and then executed the resulting program using [Wasmtime](https://docs.Wasmtime.dev/). This allowed us to test our output WebAssembly programs for correctness and performance against the original equivalent Bril programs. 26 | 27 | The bulk of the project was matching across the input JSON Brilooped instruction types and converting them linearly into WebAssembly instructions. For the most part, we used a simple system of storing all variables in WebAssembly locals and shuttling them onto and off the stack only when needed for each instruction. This may not be as efficient as a more sophisticated register allocation algorithm, but it allowed us to guarantee correctness with high confidence. 28 | 29 | We anticipated that the most difficult instructions to translate would be those involving control flow: namely, `if`, `while`, `break`, and `continue`, as these would have to be translated while keeping in mind the context of the rest of the program; for example, how many nested `while`/`if` statements we were breaking out of. Luckily, we were able to convert the Brilooped `if` into the WebAssembly `if` instruction directly, simply changing the syntax, as the two constructs otherwise behave the same. For `while` commands, there was a slight wrinkle to address: WebAssembly only allows code to jump to the top of a loop, and not out of one, which made implementing `break` difficult. To solve this problem, we wrapped an outer WebAssembly `block` around an inner `loop`, since `block` constructs can be arbitrarily exited from. Thus, any `continue` statements could jump back to the top of the inner loop, while `break` could jump out of the outer block (a concrete sketch of this pattern appears at the end of this post). To implement the proper semantics of nested loops, we maintained a stack of label names at compile time, one label for each nested loop, so when we needed to break multiple layers back, we could simply pop an appropriate number of entries off our stack to find the proper jump target. 30 | 31 | ## Printing 32 | One thing to note is that we used Gemini to generate a runtime library, which provides functions to print base values (ints and bools) to the terminal. We could then statically link this into the output WebAssembly program that we generated. We made this choice because printing WebAssembly values is highly nontrivial but also quite uninteresting for the aims of the project, namely the translation between Bril (in Brilooped form) and WebAssembly. However, we found that using Gemini was nearly as much of a struggle as writing the code ourselves; it took nearly 10 back-and-forth interactions to get something that even compiled. 33 | 34 | # Challenges 35 | In general, we found that the earliest parts of the project were the most difficult. At first, we had to decide what language to work in, as the three of us had been using different languages for our 6120 homework assignments all semester long: Serena using Rust, Annabel using Python, and Michael using TypeScript. Eventually, we decided TypeScript would be the most natural choice for this project due to its robust type system and ease of learning. However, it was still difficult, as Serena and Annabel had limited experience using this language.
36 | 37 | Crucially, no one on our team had substantial prior experience with WebAssembly itself. This made it difficult at the very start of the project to know how to even begin. We went crawling through the WebAssembly documentation, but unfortunately most of the examples and documentation we found online did not cover what we needed; the general use case for WebAssembly is compiling into it with an existing compiler, so much of the documentation is geared toward that use case rather than writing WebAssembly directly. Much of the first half of the time we spent working on the project was simply sitting with the WebAssembly documentation, trying to understand the way the system worked (we spent, for example, a good amount of time pursuing an implementation that emitted WebAssembly's binary format rather than the text format, before deciding not to continue in this direction, nullifying a good deal of work). 38 | 39 | Another challenge was working with code created by another group in the class, who were working toward the same deadline we were. Throughout the course of their Brilooped project, they naturally ran into adjustments to the Brilooped format, which then cascaded into our project as well; as the two projects were developed concurrently, we did not have extensive Briloop documentation to refer back to when developing our programs. However, we did have the advantage of being able to talk to Dev and Gerardo in real time. We could, and did, ask for help directly whenever we ran into trouble, and they were extremely helpful in getting our input from Briloop working. We're incredibly grateful for their help throughout this process. 40 | 41 | One more challenging aspect of the project was, for the limited capacity in which we used Gemini, wrangling the LLM into producing correct (or even executable) code; though it still probably saved us the time it would have taken to do the uninteresting bits ourselves, it was certainly not an effortless "get out of jail free" card. 42 | 43 | # Analysis 44 | 45 | ## Testing methodology 46 | We tested our code thoroughly on small examples as we implemented core functionality. This allowed us to catch errors before our code became too complex to fix. After completing our implementation, we tested on the Bril benchmarks core test suite. To do this, we first took [Briloop core benchmark translations](https://github.com/gerardogtn/cs6120-compilers/tree/main/project/core), provided by the Briloop group, and translated them into JSON files using their Briloop2JSON command. After that, we ran the files through our code to output the WebAssembly translation, then ran this translation using Wasmtime. We compared the output of our WebAssembly files to the output from the original Bril files. With this method, we were able to confirm correctness, in the sense that the WebAssembly output program behaved identically on our test cases to the original Bril programs (1 test per benchmark). 47 | 48 | Note that our WebAssembly runtime prints all arguments one per line, instead of multiple arguments on the same line. Thus, our test cases ignored whitespace differences. 49 | 50 | ## Performance 51 | Fortunately (and perhaps unsurprisingly), the WebAssembly programs executed significantly faster under Wasmtime than the original Bril files ran under the Bril interpreter.
Using a script that runs all of the core benchmarks in sequence using the provided arguments, we benchmarked performance using [Hyperfine](https://github.com/sharkdp/hyperfine) on a Windows machine with an AMD Ryzen 9 6900HS processor with Deno 2.3.1 and Wasmtime 34.0.0. The original Bril interpreter took a mean of 3.883 seconds with a standard deviation of 0.106 seconds to run through the entire folder. The WebAssembly versions took only 1.043 seconds with a standard deviation of 0.044 seconds. This is a roughly 73% reduction in total runtime, or about a 3.7x speedup. 52 | 53 | ## Notes and Future Work 54 | There is some future work to explore. For example, we currently do not support Bril programs that have variables change type after declaration and during execution; this kind of instruction is available in both Bril and WebAssembly syntax. 55 | 56 | Also, to replace the Gemini-generated runtime library that we used to print test cases, we could, instead of writing print functions directly in WebAssembly S-expressions, write them in some other language and then compile them to wasm using Clang or a similar toolchain. This would perhaps be a cleaner way to implement this step of execution. 57 | 58 | As we noted above, our WebAssembly output prints arguments each on their own line, rather than on the same line. To make our project print arguments on the same line, using the method of compiling our printing functions with Clang or something similar, we could simply take a command like `print a b c` and compile it to print calls that thread spaces between the arguments. 59 | 60 | We currently only support core Bril instructions. Implementing floats proved to be rather tricky, as we would have needed to write our own float printing function in WebAssembly. Since the WebAssembly documentation is rather sparse on how to actually write code in the language, we decided to leave this as work for the future. As for memory, the way WebAssembly handles memory is very different from how Bril handles it. Support for memory would require digging much further into the WebAssembly (and Bril) internals than we had time for, so this was also left for the future.
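To make the `block`/`loop` wrapping from the Implementation section concrete, here is a minimal hand-written sketch of how a simple counting loop might translate. The label names and the loop body are our own invention for illustration (assuming an `i64` local `$i`), not actual output of our tool:

```
;; Roughly: while (i < 10) { i = i + 1; }
(block $break            ;; a branch targeting $break exits the loop
  (loop $continue        ;; a branch targeting $continue starts the next iteration
    (br_if $break        ;; leave the loop once the condition fails
      (i64.ge_s (local.get $i) (i64.const 10)))
    (local.set $i
      (i64.add (local.get $i) (i64.const 1)))
    (br $continue)))
```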
61 | -------------------------------------------------------------------------------- /content/blog/2025-05-13-brilooped/log_scale_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-brilooped/log_scale_comparison.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-brilooped/merge-nodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-brilooped/merge-nodes.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-brilooped/relative_change.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-brilooped/relative_change.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-brilooped/size_vs_change.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-brilooped/size_vs_change.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-brilooped/speedup_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-brilooped/speedup_comparison.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-brilooped/top_improvements.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-brilooped/top_improvements.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-brilooped/top_regressions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-brilooped/top_regressions.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-flat-bril/bench_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-flat-bril/bench_results.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-flat-bril/heap_figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-flat-bril/heap_figure1.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-flat-bril/heap_figure2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-flat-bril/heap_figure2.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-flat-bril/stack_chart1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-flat-bril/stack_chart1.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-flat-bril/stack_chart2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-flat-bril/stack_chart2.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-global-value-numbering/index.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Global Value Numbering for Bril" 3 | [extra] 4 | bio = """ 5 | Allen Wang is a CS M.Eng student at Cornell University. He's pretty tired right now. 6 | """ 7 | [[extra.authors]] 8 | name = "Allen Wang" 9 | +++ 10 | 11 | ### Overview 12 | 13 | The goal of this project was to implement global value numbering for Bril programs using value partitioning. I then used this to perform redundancy elimination using available expressions and benchmarked the performance impact. 14 | 15 | ### Value Numbering 16 | 17 | Value numbering is a family of program analysis techniques that involve assigning an identifying (value) number to each expression, where expressions that are guaranteed to evaluate to the same value receive the same number. By separating values from expressions, we can find duplicate expressions that are syntactically different but evaluate to the same value as already-existing expressions, then remove them. For example, we can use value numbering on this program: 18 | ``` 19 | sum1 : int = add a b; 20 | sum2 : int = add b a; 21 | prod: int = mul sum1 sum2; 22 | ``` 23 | to find out that `sum1` and `sum2` evaluate to the same value, then optimize it to: 24 | ``` 25 | sum1 : int = add a b; 26 | prod: int = mul sum1 sum1; 27 | ``` 28 | We went over one value numbering algorithm [here](https://www.cs.cornell.edu/courses/cs6120/2025sp/lesson/3/). However, that algorithm assumes a single linear control flow, which means it's only suitable for local value numbering within blocks. 29 | 30 | ### Global Value Numbering 31 | 32 | Global value numbering is a set of techniques that perform value numbering at the level of a function, rather than a single block. [This paper](https://www.cs.tufts.edu/~nr/cs257/archive/keith-cooper/value-numbering.pdf) goes over hash-based and partitioning implementations of global value numbering. There's already a hash-based implementation for Bril [here](https://www.cs.cornell.edu/courses/cs6120/2019fa/blog/global-value-numbering/), and it's conceptually very similar to local value numbering, so I decided to implement value partitioning instead. 33 | 34 | ### Value partitioning 35 | 36 | Instead of hashing expressions to values like local value numbering, value partitioning works by directly computing congruence classes of expressions, where two expressions are congruent if they have the same opcode and all their arguments are congruent with each other.
To perform value partitioning, we first put a program into SSA form to ensure that each value has a unique variable associated with it. We optimistically assume that all operations with the same opcode are in the same congruence class, then repeatedly split the classes for which this cannot be true until we obtain a maximum fixed point. 37 | 38 | I implemented the partitioning algorithm given in the paper: 39 | ``` 40 | Initial partition: all values computed by the same opcode are in the same congruence class 41 | 42 | worklist = classes in initial partition 43 | while worklist is not empty: 44 | select a class c from worklist 45 | for each possible arg position p: 46 | touched = ∅ 47 | for each value v: 48 | if arg p of v is in c, add v to touched 49 | for each class s where some but not all members are touched: 50 | n = s & touched 51 | s = s - n 52 | if s in worklist: 53 | add n to worklist 54 | else: 55 | add smaller of n and s to worklist 56 | ``` 57 | After this, we pick a representative for each congruence class, then replace every operation in the class with the representative. 58 | 59 | ### Redundancy Elimination 60 | After standardizing our program to use values instead of expressions, we still need to convert this into a performance improvement. To do this, we use an available expressions dataflow analysis to calculate which values are available at each point in the program. Fortunately, the properties of the renaming algorithm make it very easy to define the analysis for calculating available expressions: 61 | 62 | - The initial input is the empty set. 63 | - The transfer function takes the union of a block's input and every expression in the block. If an expression is already available, it's redundant and can be removed. 64 | - The merge function takes the intersection of all the outputs of a block's predecessors. 65 | 66 | I also tried implementing partial redundancy elimination, which moves computations that are redundant along some execution paths back through the control-flow graph to make them fully redundant and optimize them away. This can be accomplished by performing global analyses to determine where computations can be safely moved and where moving them would save time, but I didn't have time to fully wrap my head around this and fix the bugs. 67 | 68 | 69 | 70 | #### Implementation Notes 71 | 72 | Getting GVN right was very finicky and required reading the text very carefully. My biggest struggles in the end were first understanding the processing algorithm, then figuring out and debugging all the edge cases that arose from not reading the paper carefully enough. The most annoying edge case I ran into was handling phi statements. I kept getting differences in code execution, and after going through the source code and a log of instructions executed, I discovered that phi statements were mysteriously being removed. After this, I managed to pinpoint the problem: phi statements must not be treated as congruent with phi statements in other blocks. Immediately after fixing this, I read through the paper again and found that it mentioned this in an aside ... I also had a lot of trouble implementing copy propagation to match the hash-based implementation, then discovered that the paper, in a later section, explicitly uses copy propagation as an example of an optimization that cannot be done using value partitioning. 73 | 74 | ### Evaluation 75 | 76 | For correctness, I ran my optimizations on the core benchmarks with different inputs to test whether they would cause problems.
I also wrote a series of test cases for various edge cases and optimizations GVN should be able to identify. Each of the benchmarks and hand-written test cases produced the same outputs before and after optimization. I also verified that the benchmarks were able to catch incorrect implementations while I was fixing bugs. 77 | 78 | For performance, I tested against the core benchmarks, using the same inputs as the correctness tests. I found that using only the AVAIL-based removal resulted in a minimum improvement of 0% fewer instructions executed (global value numbering doesn't add any operations to a program), a median improvement of 1.5% fewer instructions executed compared to base SSA, and a maximum improvement of around 58% fewer instructions executed. Most of the benchmarks were written directly in Bril, so they were relatively optimized and there were few opportunities to identify congruence classes across blocks. 79 | 80 | 81 | -------------------------------------------------------------------------------- /content/blog/2025-05-13-global-value-numbering/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/content/blog/2025-05-13-global-value-numbering/plot.png -------------------------------------------------------------------------------- /content/blog/2025-05-13-polyhedral.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "A Simplified Polyhedral IR for Bril" 3 | [[extra.authors]] 4 | name = "Mark Barbone" 5 | +++ 6 | 7 | In [this project][github], I set out to build a polyhedral IR for Bril 8 | programs. My original goal was to implement polyhedral optimizations on it; 9 | however, simply getting into and back out of a polyhedral IR ended up being 10 | complex enough that it filled my budget. 11 | 12 | [github]: https://github.com/mb64/cs6120-work/tree/main/poly 13 | 14 | ## Background 15 | 16 | Here's an example program: 17 | 18 | ```c 19 | for (i = 0; i < N; i++) { 20 | for (j = i; j < N; j++) { 21 | foo(i, j); 22 | } 23 | } 24 | ``` 25 | 26 | This program has a pretty complex control flow graph -- each `for` loop is 27 | hiding a whole host of basic blocks that it compiles to. Often, this is great 28 | for a compiler, as breaking up the control flow into small but regular pieces 29 | means that general-purpose optimizations can handle all kinds of programs 30 | uniformly. 31 | 32 | However, sometimes we want to do more high-level optimizations. For example, it 33 | may be possible to interchange the two loops: 34 | 35 | ```c 36 | for (j = 0; j < N; j++) { 37 | for (i = 0; i <= j; i++) { 38 | foo(i, j); 39 | } 40 | } 41 | ``` 42 | 43 | Figuring out how to do this interchange correctly is extremely non-trivial, 44 | even on the high-level `for`-loop-containing program text! For this, it would 45 | be very convenient to have some intermediate language that explicitly accounts 46 | for iteration spaces like `{ [i, j] : 0 <= i and i <= j and j < N }`, so that 47 | these transformations would be possible. This is precisely the goal of the 48 | _polyhedral model_. In my IR, this is represented as: 49 | 50 | ```c 51 | for (i ∈ { [i] : 0 <= i < N }) { 52 | for (j ∈ { [i, j] : 0 <= i <= j < N }) { 53 | foo(i, j); 54 | } 55 | } 56 | ``` 57 | 58 | 59 | Here, the iteration domain appears as a first-class entity in the IR's looping constructs.
I don't exactly use the classic polyhedral framework, but rather a 60 | simplified form inspired by [MLIR's simplified form][mlir], and I'll explain 61 | the details in the next section. 62 | 63 | 64 | ## The intermediate representation 65 | 66 | A classic polyhedral IR has a bag of ordered statements, each with an 67 | associated domain of points on which it's run, and an associated schedule of 68 | when to run it, in the form of an affine function outputting a timestamp. 69 | 70 | With this classical representation, turning it all back into code at the end is 71 | very hard: you need to come up with an optimal set of loops and `if` statements 72 | that correctly implements the ordering requested by the schedule, all without 73 | incurring too much blow-up or obscuring hot loops with too many conditionals. 74 | Here are a few papers I found on how to do this, roughly ordered by how complex 75 | their algorithms are: [IJPP'00][ijpp-00], [PACT'04][pact-04], [TOPLAS'15][toplas-15]. 76 | 77 | [ijpp-00]: https://link.springer.com/article/10.1023/A:1007554627716 78 | [pact-04]: https://icps.u-strasbg.fr/~bastoul/research/papers/Bas04-PACT.pdf 79 | [toplas-15]: https://lirias.kuleuven.be/retrieve/319112 80 | 81 | Because of this (and other reasons), [MLIR introduced][mlir] a _simplified_ 82 | polyhedral representation, which has an explicit representation of iteration 83 | domains, and the analysis / transformation benefits that affords, but also a 84 | preferred way of iterating over it. 85 | 86 | I copied their ideas to design my own IR, which has iteration domains 87 | represented as integer sets (using ISL, the Integer Set Library), but also has 88 | a relatively straightforward interpretation as code: 89 | 90 | ``` 91 | s ::= for (i ∈ affine domain) { ss } (single-variable for loop) 92 | | if (affine condition) { ss } (conditionals) 93 | | instr_1; ...; instr_N (basic blocks) 94 | 95 | ss ::= s_1; ...; s_N 96 | ``` 97 | 98 | Here, both `affine domain` and `affine condition` are integer sets; the only 99 | difference is that `affine domain` introduces a new variable, while `affine 100 | condition` does not. 101 | 102 | A core insight is that we have _static control flow_: there is a subset of 103 | variables, which I'll call "affine variables", which are only transformed in 104 | very easy-to-reason-about ways, and moreover, control flow only depends on 105 | these variables. Of course, not every program has this format, but we still 106 | want to be able to optimize the programs that do, while leaving alone those that 107 | don't. 108 | 109 | ## To `affine`-ity, and beyond! 110 | 111 | We've made the process of emitting code out of our IR easier, with the 112 | simplified polyhedral representation. But how do we get a program into it in the 113 | first place? This is still fairly tricky! 114 | 115 | Static control flow is a subset of structured control flow, so it makes sense 116 | to start with a program with structured control flow. Bril doesn't have 117 | structured control flow, but we know that it's possible to construct it when the 118 | control flow is reducible. So the first step is to reconstruct the structured 119 | control flow of the input Bril program. This is the job of the relooper. 120 | 121 | Given a Bril function as input, the relooper either: 122 | 123 | * raises an exception, if it has irreducible control flow, or 124 | * returns a Bril AST with structured control flow.
125 | 126 | Here's what this structured control flow IR looks like: 127 | 128 | ``` 129 | c ::= block .l { c1 } c2 (l is in scope for c1 but not c2) 130 | | loop .l { c } (l is in scope for c) 131 | | instr_1; ...; instr_N; c 132 | | if b then c else c 133 | | jmp .l 134 | | ret v 135 | ``` 136 | 137 | A jump to label `.l` is only allowed when `.l` is in scope (this is what makes 138 | it structured!). Jumping to a `block` label is like a `break`, escaping to the 139 | end of the block, while jumping to a `loop` label is like a `continue`, going 140 | back to the loop header for the next iteration (a concrete example appears at the end of this post). 141 | 142 | I mostly follow the algorithm from [Beyond Relooper (PLDI'22 functional 143 | pearl)][beyond-relooper], though my structured IR looks a little different, and 144 | I don't make any effort to do anything with irreducible CFGs. 145 | 146 | [beyond-relooper]: https://dl.acm.org/doi/abs/10.1145/3547621 147 | 148 | This is far more structured than a sea of basic blocks, but still a far cry 149 | from our nice static control flow! The next step is to recursively process the 150 | structured IR, eventually getting to the simplified polyhedral representation. 151 | This step is by far the most complex, accounting for a large portion of the 152 | code, and with many tricky cases to consider. The biggest difference between 153 | the structured IR and the polyhedral IR, other than that the polyhedral IR has 154 | static control flow, is that code in the structured IR can have multiple exits: 155 | you can branch to any label that's in scope, depending on whatever conditions 156 | you want. By contrast, the polyhedral IR can only go on to the next statement. 157 | The combination of both multiple exits and recovering static control flow makes 158 | this a complex implementation task. 159 | 160 | [mlir]: https://mlir.llvm.org/docs/Rationale/RationaleSimplifiedPolyhedralForm/ 161 | 162 | ## Evaluation 163 | 164 | I evaluated the code in three different ways: 165 | 166 | 1. Correctness 167 | 2. Performance overhead incurred by round-tripping through the IR 168 | 3. Applicability of potential polyhedral optimizations 169 | 170 | For all three, I used both the core Bril benchmark suite and the memory 171 | benchmark suite. Sadly, as I didn't get to implement any real polyhedral 172 | optimizations, I don't have speed-ups to measure (yet?). 173 | 174 | **Correctness.** I tested correctness of round-tripping just through the 175 | relooper, and then also through the polyhedral IR, by comparing against 176 | interpreting the original code. 177 | 178 | **Performance.** On the combined mem and core benchmarks, round-tripping 179 | through the polyhedral IR gives a mean 0.78x speedup (i.e., a 22% slowdown on 180 | average). However, my compiler at the moment does zero backend optimizations, 181 | which would be quite useful to clean up the code. I expect this to go back to 182 | close to 1x with simple backend clean-up optimizations. 183 | 184 | **Applicability.** Not every program has static control flow, so only some 185 | could potentially benefit from future polyhedral optimizations. In the combined 186 | mem and core benchmarks, 176 out of 272 functions had fully static control 187 | flow, or around 65%. Considering just the relooper, interestingly, all 272 188 | functions in these combined test suites exhibit reducible control flow, and 189 | successfully round-trip through just the structured control flow IR.
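To make that structured IR concrete, here is roughly how a simple `while` loop could be expressed in it. This is a hand-written sketch in the grammar above, not actual relooper output:

```
block .exit {
  loop .head {
    body_instrs;
    if cond then jmp .head    (like continue: back to the header)
    else jmp .exit            (like break: escape past the loop)
  }
}
rest_of_program
```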
190 | 191 | -------------------------------------------------------------------------------- /content/blog/2025-05-13-superopt/canonical-example-after.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | 11 | 12 | outer_cluster_0 13 | 14 | 15 | cluster_0 16 | 17 | 18 | 19 | outer_cluster_2 20 | 21 | 22 | cluster_2 23 | 24 | 25 | 26 | outer_cluster_5 27 | 28 | 29 | cluster_5 30 | 31 | 32 | 33 | outer_cluster_4 34 | 35 | 36 | cluster_4 37 | 38 | 39 | 40 | outer_cluster_1 41 | 42 | 43 | cluster_1 44 | 45 | 46 | 47 | 48 | 2.0:s->0.0 49 | 50 | 51 | 52 | 53 | 54 | 2.0:s->1.0 55 | 56 | 57 | 58 | 59 | 60 | 5.0:s->0.0 61 | 62 | 63 | 64 | 65 | 66 | 5.0:s->4.0 67 | 68 | 69 | 70 | 71 | 72 | 4.0:s->0.0 73 | 74 | 75 | 76 | 77 | 78 | 4.0:s->1.0 79 | 80 | 81 | 82 | 83 | 84 | 5.1:s->0.0 85 | 86 | 87 | 88 | 89 | 90 | 5.1:s->2.0 91 | 92 | 93 | 94 | 95 | 96 | 0.0 97 | 98 | 99 | 2 100 | 101 | 102 | 103 | 104 | 105 | 106 | 2.0 107 | 108 | 109 | * 110 | 111 | 112 | 113 | 114 | 115 | 116 | 1.0 117 | 118 | 119 | a 120 | 121 | 122 | 123 | 124 | 125 | 126 | 5.0 127 | 128 | 129 | * 130 | 131 | 132 | 133 | 134 | 135 | 136 | 4.0 137 | 138 | 139 | / 140 | 141 | 142 | 143 | 144 | 145 | 146 | 5.1 147 | 148 | 149 | / 150 | 151 | 152 | 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /content/blog/2025-05-13-superopt/canonical-example-before.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | 11 | 12 | outer_cluster_1 13 | 14 | 15 | cluster_1 16 | 17 | 18 | 19 | outer_cluster_3 20 | 21 | 22 | cluster_3 23 | 24 | 25 | 26 | outer_cluster_0 27 | 28 | 29 | cluster_0 30 | 31 | 32 | 33 | outer_cluster_2 34 | 35 | 36 | cluster_2 37 | 38 | 39 | 40 | 41 | 3.0:s->2.0 42 | 43 | 44 | 45 | 46 | 47 | 3.0:s->0.0 48 | 49 | 50 | 51 | 52 | 53 | 2.0:s->1.0 54 | 55 | 56 | 57 | 58 | 59 | 2.0:s->0.0 60 | 61 | 62 | 63 | 64 | 65 | 1.0 66 | 67 | 68 | a 69 | 70 | 71 | 72 | 73 | 74 | 75 | 3.0 76 | 77 | 78 | / 79 | 80 | 81 | 82 | 83 | 84 | 85 | 2.0 86 | 87 | 88 | * 89 | 90 | 91 | 92 | 93 | 94 | 95 | 0.0 96 | 97 | 98 | 2 99 | 100 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /content/blog/2025-05-13-superopt/simple_2_output_before.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | 11 | 12 | outer_cluster_2 13 | 14 | 15 | cluster_2 16 | 17 | 18 | 19 | outer_cluster_3 20 | 21 | 22 | cluster_3 23 | 24 | 25 | 26 | outer_cluster_4 27 | 28 | 29 | cluster_4 30 | 31 | 32 | 33 | outer_cluster_0 34 | 35 | 36 | cluster_0 37 | 38 | 39 | 40 | outer_cluster_1 41 | 42 | 43 | cluster_1 44 | 45 | 46 | 47 | 48 | 2.0:s->0.0 49 | 50 | 51 | 52 | 53 | 54 | 2.0:s->1.0 55 | 56 | 57 | 58 | 59 | 60 | 3.0:s->0.0 61 | 62 | 63 | 64 | 65 | 66 | 3.0:s->1.0 67 | 68 | 69 | 70 | 71 | 72 | 4.0:s->2.0 73 | 74 | 75 | 76 | 77 | 78 | 4.0:s->3.0 79 | 80 | 81 | 82 | 83 | 84 | 2.0 85 | 86 | 87 | AND 88 | 89 | 90 | 91 | 92 | 93 | 94 | 0.0 95 | 96 | 97 | a 98 | 99 | 100 | 101 | 102 | 103 | 104 | 1.0 105 | 106 | 107 | b 108 | 109 | 110 | 111 | 112 | 113 | 114 | 3.0 115 | 116 | 117 | XOR 118 | 119 | 120 | 121 | 122 | 123 | 124 | 4.0 125 | 126 | 127 | BUS 128 | 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /content/blog/2025-05-14-bril-to-x86.md: 
-------------------------------------------------------------------------------- 1 | +++ 2 | title = "A Bril to x86 Compiler" 3 | [[extra.authors]] 4 | name = "Tean Lai" 5 | +++ 6 | 7 | ## Goal 8 | The goal of [the project](https://github.com/tean-lai/bril-to-x86) was to build a Bril compiler targeting x86_64. The priority of the project was to implement something working, not necessarily something optimal. So for this project, things like register allocation were out of scope. 9 | 10 | ## Design and Implementation 11 | The current implementation is written in Python and supports all the core features of Bril. I decided to go with Python despite its slowness because it's fast to iterate with, but I would expect the process of converting it to another language to be extremely smooth. 12 | 13 | The overall design of the compiler is, at a high level, relatively simple. The design of Bril resembles assembly, so the compiler just needs to make linear passes, converting one instruction at a time into the relevant code. The program makes two main passes: first it converts all the Bril instructions into another intermediate representation whose instructions correspond one-to-one with x86 instructions. Then the second pass actually produces the assembly program and formats it. The reason I decided to split it into two passes, as opposed to printing the program directly, is that there are multiple different syntaxes for x86; the intention was that if different syntax styles were ever supported, the changes would be more isolated. For the project, I implemented support for Intel syntax. 14 | 15 | 16 | ## The Tricky Parts 17 | Although at a high level things were simple, the details were tricky to get right. The first big hurdle was actually getting a compiled program to run. This required support for several Bril instructions: constants, plus either printing or returning, in order to observe that anything had happened. It also required supporting some calling conventions; otherwise the program wouldn't terminate. What got me over this hurdle proved to be quite useful in general for developing this compiler: modeling Bril programs as C programs and comparing my Bril -> x86 results with Clang's C -> x86 results. This helped me find stuff like the following snippet of text, which I would include at the beginning of every file when generating code targeting macOS: 18 | 19 | ``` 20 | .section __TEXT,__text,regular,pure_instructions 21 | .build_version macos, 15, 0 sdk_version 15, 2 22 | ``` 23 | 24 | These directives seemed to vary quite a bit from device to device, but I found that comparing with a reference compiler was a reliable way of finding out which directives are necessary. Online sources often lack the specific things you need for your own device. 25 | 26 | It was also quite tricky to get printing right. I implemented Prof. Sampson's suggestion of linking with a helper [rt.c file](https://github.com/sampsyo/bril/blob/main/brilift/rt.c) to avoid printing Booleans by hand. To get linking right, it again helped a lot to model the behavior I would expect from an equivalent C program and guide my implementation to match that output. 27 | 28 | Another tricky part was getting command line arguments handled correctly.
A particularly tricky case is the following Bril program: 29 | ``` 30 | @main(x: int) { 31 | n: int = const 10; 32 | b: bool = lt x n; 33 | br b .loop .done; 34 | .loop: 35 | one: int = const 1; 36 | print x; 37 | y: int = add x one; 38 | call @main y; 39 | .done: 40 | ret; 41 | } 42 | ``` 43 | This program takes in a single int as a command line argument, and keeps printing its value and incrementing it until it hits 10. The tricky part comes from the fact that this function also calls itself. This means the assembly code for this function must somehow handle both command line parsing and proper calling conventions when it's called as a normal function. More concretely, in x86, it's expected that the main function's first and second parameters are `argc` and `argv` respectively. But we also want the main function's first parameter to be `x`. 44 | 45 | The solution to this problem was to wrap the main function with a separate function that first handles command line arguments, and then calls the real main function with the proper arguments. However, the current implementation of this might cause bugs for 0.1% of Bril programs, because it creates another function called "main_main", so any Bril program with such a function name might run into issues. 46 | 47 | However, one big downside of this compiler is that it doesn't work on plenty of devices. There are many cases that need to be hard-coded differently for different devices. For example, macOS starts the program at the `_main` label, but Linux looks for a `_start` label. Since I primarily developed this on a Mac, it was hard to get support for more devices working. Extending support to different devices doesn't seem too hard; it seems like a matter of adding different formatting and wrapping the program in different assembly preambles, but this seems hard to do without access to multiple devices. 48 | 49 | ## What wasn't tricky 50 | It was nice to see how smoothly most instructions could be converted to x86. Most instructions had a very close correspondence, with the only exception being the boolean comparison operators. That seemed to be because x86, unlike RISC-V, doesn't have single instructions for things like less than, but this wasn't too bad overall to implement (a sketch of the lowering appears at the end of this post). 51 | 52 | ## Evaluation 53 | I had wanted to evaluate several things, like comparing this compiler to other AOT compilers like [Brilift](https://capra.cs.cornell.edu/bril/tools/brilift.html), but I didn't end up with enough time for that. 54 | 55 | To test for correctness, I have an automated script that runs every core benchmark against brili and uses that as a reference. And fortunately, every output matches! It checks for both matching standard output and matching exit codes. I can confidently say this compiler preserves correctness in 95% of programs, the estimated missing 5% being programs that have a "main_main" function or functions with more than 6 arguments. The current implementation does not support more than 6 arguments, since the calling convention only passes 6 arguments in registers. 56 | 57 | Overall, this was a pretty fun project. I definitely learned a lot about x86, and I am glad I don't normally program at such a low level.
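As an illustration of the comparison-operator wrinkle from the "What wasn't tricky" section, here is roughly how an instruction like `b: bool = lt x n` can be lowered using `cmp` and `setl`. The stack-slot assignments are made up for illustration and are not this compiler's actual output:

```
mov   rax, qword ptr [rbp - 8]     # load x from its stack slot
cmp   rax, qword ptr [rbp - 16]    # compare x against n
setl  al                           # al = 1 if x < n (signed less-than)
movzx eax, al                      # zero-extend the flag byte to a full register
mov   qword ptr [rbp - 24], rax    # store b to its stack slot
```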
-------------------------------------------------------------------------------- /content/blog/2025-05-bril2c.md: -------------------------------------------------------------------------------- 1 | 2 | +++ 3 | title = "Bril2C" 4 | [extra] 5 | bio = """ 6 | Mahmoud is an MEng student enrolled in CS6120 as of Spring 2025. 7 | """ 8 | [[extra.authors]] 9 | name = "Mahmoud Elsharawy" 10 | +++ 11 | 12 | This project is intended to translate code from [Bril](https://capra.cs.cornell.edu/bril/) (Big Red Intermediate Language) to C. Bril is an educational intermediate language used in the CS6120 course at Cornell University. As an intermediate language, it is low-level, which would make it difficult to translate to most high-level languages. However, C, with features such as manual memory management and `goto` statements, is a natural choice. In my initial proposal, I thought creating a complete and correct translation from Bril to C would be trivial, but quite a bit of hacking was indeed needed, due to restrictions in C that Bril does not share. 13 | 14 | ## Implementation 15 | Bril2C is written in Rust, using the "bril-rs" package. Most of the actual translation work happens in a single trait, `Crep`, which contains a single function `crep(self) -> String`. Each piece of Bril implements this function with how it translates itself to C, calling `crep` on smaller pieces, effectively inducting on the structure of the program. For example, `crep` for a program will, along with other things, call `crep` for each function in that program, which will call `crep` for each instruction, which will call `crep` for each operation and variable. In this way, adding additional features of Bril becomes as easy as implementing a single function for a specific type. 16 | 17 | As it turns out, quite some fiddling is required to get even a majority of Bril code working. Almost all of the benchmarks have a `main` function which takes in arguments, something not directly possible in C. We get around this by creating our own `main` function in C, which parses arguments from the command line and passes them into a copy that represents the Bril main function, called `main_f`. Taking an example, `sum-bits.bril`, with a main function that takes in a single integer argument, this is represented as: 18 | ```C 19 | int main(int argc, char *argv[]) { 20 | int64_t input = atoi(argv[1]); 21 | main_f(input); 22 | return 0; 23 | } 24 | ``` 25 | This way the resulting program can take in arguments from the command line for the main function in the same way as `brili`. 26 | 27 | Additionally, Bril's print statement isn't type-annotated, meaning some form of polymorphism is needed in C. We accomplish this by creating our own generic print macro, allowing us to call `print` as easily as it is called in Bril: 28 | ```C 29 | #define print(x) \ 30 | _Generic((x), \ 31 | int64_t: printf("%" PRId64 " ", x), \ 32 | uint8_t: printf("%s ", (x) ? "true" : "false"), \ 33 | double: printf("%.17f ", x)) 34 | ``` 35 | For ease of programming, each function declares all of the variables used in it at the beginning of the function. After that, it's a "simple" translation from Bril instructions to C statements.
Therefore, the following function in Bril: 36 | ``` 37 | @mod(dividend : int, divisor : int) : int { 38 | quotient : int = div dividend divisor; 39 | two : int = const 2; 40 | prod : int = mul two quotient; 41 | diff : int = sub dividend prod; 42 | ret diff; 43 | } 44 | ``` 45 | becomes this C function: 46 | ```C 47 | int64_t mod_f(int64_t dividend_, int64_t divisor_) { 48 | int64_t quotient_; 49 | int64_t two_; 50 | int64_t prod_; 51 | int64_t diff_; 52 | quotient_ = dividend_ / divisor_; 53 | two_ = 2; 54 | prod_ = two_ * quotient_; 55 | diff_ = dividend_ - prod_; 56 | return diff_; 57 | } 58 | ``` 59 | At this point, you may have noticed the additional characters appended to each variable and function name. These are inserted to prevent Bril registers from being confused with C keywords, which are surprisingly popular as register names in Bril code, especially words like `if` and `continue`. 60 | 61 | In order to capture the maximum subset of Bril, we also implemented the `memory` and `float` extensions to Bril. Adding these was surprisingly simple, as Bril's `alloc`, `free`, `load`, and `store` map rather painlessly to C's `malloc`, `free`, and referencing/dereferencing. To show a full example, `Bril2C` successfully translated this Bril code: 62 | ``` 63 | # ARGS: 42 64 | @main(input : int) { 65 | sum : int = const 0; 66 | two : int = const 2; 67 | zero : int = const 0; 68 | .loop: 69 | cond : bool = eq input zero; 70 | br cond .done .body; 71 | .body: 72 | bit : int = call @mod input two; 73 | input : int = div input two; 74 | sum : int = add sum bit; 75 | jmp .loop; 76 | .done: 77 | print sum; 78 | ret; 79 | } 80 | 81 | @mod(dividend : int, divisor : int) : int { 82 | quotient : int = div dividend divisor; 83 | two : int = const 2; 84 | prod : int = mul two quotient; 85 | diff : int = sub dividend prod; 86 | ret diff; 87 | } 88 | ``` 89 | into this C code: 90 | ```C 91 | #include <stdint.h> 92 | #include <stdio.h> 93 | #include <inttypes.h> 94 | #include <stdlib.h> 95 | 96 | uint8_t true = 1; 97 | uint8_t false = 0; 98 | #define print(x) \ 99 | _Generic((x), \ 100 | int64_t: printf("%" PRId64 " ", x), \ 101 | uint8_t: printf("%s ", (x) ? "true" : "false"), \ 102 | double: printf("%.17f ", x)) 103 | 104 | void main_f(int64_t input_); 105 | 106 | int64_t mod_f(int64_t dividend_, int64_t divisor_); 107 | 108 | void main_f(int64_t input_) { 109 | int64_t sum_; 110 | int64_t two_; 111 | int64_t zero_; 112 | uint8_t cond_; 113 | int64_t bit_; 114 | sum_ = 0; 115 | two_ = 2; 116 | zero_ = 0; 117 | loop_: 118 | cond_ = input_ == zero_; 119 | if (cond_) { 120 | goto done_; 121 | } else { 122 | goto body_; 123 | } 124 | body_: 125 | bit_ = mod_f(input_, two_); 126 | input_ = input_ / two_; 127 | sum_ = sum_ + bit_; 128 | goto loop_; 129 | done_: 130 | print(sum_); 131 | printf("\n"); 132 | return; 133 | } 134 | int64_t mod_f(int64_t dividend_, int64_t divisor_) { 135 | int64_t quotient_; 136 | int64_t two_; 137 | int64_t prod_; 138 | int64_t diff_; 139 | quotient_ = dividend_ / divisor_; 140 | two_ = 2; 141 | prod_ = two_ * quotient_; 142 | diff_ = dividend_ - prod_; 143 | return diff_; 144 | } 145 | int main(int argc, char *argv[]) { 146 | int64_t input = atoi(argv[1]); 147 | main_f(input); 148 | return 0; 149 | } 150 | ``` 151 | 152 | ## Evaluation 153 | Creating a program to translate a single Bril program into C is easy. Creating a program to translate _every_ Bril program into C is much more difficult.
I tested my code with a simple script, `test.sh`, which runs every `.bril` file in `benchmarks` and compares its output with that of the code translated by `Bril2C`, then compiled and run. Many times, I thought I had completed my project, only to discover a Bril program that broke my seemingly correct implementation, such as by naming a register the same name as a function, or by using the same register with two different types in two different functions. I did manage to resist the temptation of simply declaring that I would only consider a subset of Bril programs with "nice" formatting, however, and powered on, handling edge case after edge case. After enough bug-fixing, I was able to successfully produce a correct program for every single Bril program in the benchmarks (as shown in [test_results.txt](https://github.com/mse63/bril2c/blob/master/test_results.txt)). 154 | 155 | ## Future Work 156 | I believe this project still has room for improvement. Although my code did pass every Bril program in the benchmarks, it would not surprise me if someone could create a nefarious Bril program on which Bril2C would fail, perhaps by coming up with cursed enough variable naming. 157 | 158 | Additionally, I believe there is room for improvement in the generated C code. I'm not a fan of the use of `goto` in C, and I'm confident that much if not most of the control logic in Bril code could be represented with `if` statements or `while` loops. Unfortunately, I did not end up having the time to explore this idea further. 159 | -------------------------------------------------------------------------------- /content/blog/_index.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "The CS 6120 Course Blog" 3 | sort_by = "date" 4 | template = "blog.html" 5 | page_template = "post.html" 6 | +++ 7 | Testing? 8 | -------------------------------------------------------------------------------- /content/lesson/1.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Welcome & Overview" 3 | [extra] 4 | due = "January 28" 5 | index = 1 6 | thread = 443 7 | [[extra.videos]] 8 | id = "0_bug89uok" 9 | [[extra.readings]] 10 | name = "On Proebsting’s Law" 11 | url = "https://gwern.net/doc/cs/algorithm/2001-scott.pdf" 12 | details = "Kevin Scott" 13 | [[extra.readings]] 14 | name = "On Proebsting’s Law" 15 | url = "https://zeux.io/2022/01/08/on-proebstings-law/" 16 | details = "Arseny Kapoulkine" 17 | +++ 18 | 19 | ## Gist 20 | 21 | * Welcome to CS 6120! I'm really excited for the semester. 22 | * What is a compiler, and why should we care? 23 | * A simplistic answer is all about making programs go fast. And optimization is important, but compare Moore's law to Proebsting's law. 24 | * This class is about *language implementation* more broadly. What else goes into a good language implementation other than optimizations? (Not parsing.) Post your ideas in the [discussion topic][topic]. 25 | * By the way, [lessons][] in 6120 have an associated discussion thread. We'll use these for just generally talking about stuff and also for reporting our results. 26 | * New languages + new hardware = lots of interesting problems. 27 | * Either one alone would be plenty to keep compiler engineers busy for a long time. 28 | * Course overview and the [syllabus][]. 29 | * Communication will be on [Zulip][]. [Sign up][zulip-signup] and stay on top of it! 30 | * Class sessions: 31 | * "Lesson" days.
32 | * I'll do some traditional "lecturing" in class, but that stuff is (mostly) mirrored by the videos attached to each lesson. 33 | * Most (but not all) lessons have associated tasks for you to complete. 34 | * "Discussion" days. 35 | * Everyone's job is to read the paper ahead of time and post a thoughtful comment. 36 | * Small-group and larger-group discussions. 37 | * The discussion leader's job is to write a blog post. 38 | * Remember to keep track of the [schedule][] so you know when to do these things! 39 | * The first discussion day is the next class, so plan to do the reading and submit a comment by 10pm on the day before. 40 | * The course project. 41 | * "Michelin star" grading. 42 | * Policies, academic integrity, generative AI, respect, accessibility. 43 | * This lesson's task is about debugging the course structure. 44 | * Ask questions about the course setup in this lesson's [discussion topic][topic]! 45 | * Your first task is below: introduce yourself in the [introductions topic][intro] right now, and (offline) pick a paper discussion to lead. 46 | * This website and the [blog][], including the [GitHub repository][gh] where you'll send PRs. Work for 6120 is "real," open source, and for the world. 47 | * When you've done those things, submit the L1 marker assignment on [CMS][]. 48 | 49 | ## Tasks 50 | 51 | * Read the [syllabus][]! Seriously, please [actually read it][s]. 52 | * [Sign up for Zulip][zulip-signup] and introduce yourself in [the introductions topic][intro]. Mention a compilers topic you'd like to learn about someday, either in this class or beyond. 53 | * Pick a paper from the [schedule][] whose discussion you want to lead. 54 | Claim it by opening a pull request that modifies [`content.toml`][cont-gh] to fill in your name on one of the `leader = "TK"` lines. 55 | (Feel free to ask questions about the papers on Zulip to help you decide!) 56 | * Submit any text file you like to the L1 [CMS][] assignment to indicate that you've done the introduction and claimed a paper. 57 | * For this task, submit an empty text file or whatever. 58 | * For others, include a link to your code! I encourage (but don't require) you to do all your work as open source. If you don't want to open-source your solutions for some reason, you can upload your code instead. 
59 | 60 | [s]: https://www.cameo.com/v/5f2b392a0299b100202e624a 61 | [intro]: https://cs6120.zulipchat.com/#narrow/stream/254729-general/topic/introductions.202023 62 | [flipped]: https://en.wikipedia.org/wiki/Flipped_classroom 63 | [zulip-signup]: https://www.cs.cornell.edu/courses/cs6120/2025sp/private/zulip.html 64 | [syllabus]: @/syllabus.md 65 | [zulip]: https://cs6120.zulipchat.com 66 | [blog]: @/blog/_index.md 67 | [gh]: https://github.com/sampsyo/cs6120 68 | [topic]: https://github.com/sampsyo/cs6120/discussions/443 69 | [cms]: https://cmsx.cs.cornell.edu/ 70 | [schedule]: @/schedule.md 71 | [cont-gh]: https://github.com/sampsyo/cs6120/blob/2025sp/data/content.toml 72 | [lessons]: @/lesson/_index.md 73 | -------------------------------------------------------------------------------- /content/lesson/10.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Alias Analysis" 3 | [extra] 4 | index = 10 5 | [[extra.readings]] 6 | name = "Pointer Analysis" 7 | url = "https://yanniss.github.io/points-to-tutorial15.pdf" 8 | details = "tutorial by Yannis Smaragdakis and George Balatsouras" 9 | [[extra.videos]] 10 | id = "1_7ngps985" 11 | +++ 12 | ## Gist 13 | 14 | ### Motivation 15 | 16 | Lots of languages have pointers! Whether you call them that or not. 17 | 18 | * C, obviously. 19 | * In Java, most values are pointers—everything but the [primitive types][javaprim]. 20 | * In ML, [pointers are everywhere implicitly][ocamlptr], and you can create them yourself explicitly with the `'a ref` type. 21 | * Bril has pointers too, with the [memory extension][mem]. 22 | 23 | Tricky quiz to emphasize the point that pointers make semantics hard: what does this C program return? 24 | 25 | bool foo(char *a, char *b) { 26 | *a = 'a'; 27 | *b = 'b'; 28 | return *a == 'a'; 29 | } 30 | 31 | The answer is that it depends on *aliasing*, i.e., whether `a` and `b` point to the same location in memory. 32 | Really, doing *anything* with pointers depends on aliasing! 33 | Basically anything that you can normally do with non-aliasable local (stack) variables is extra hard to do on heap variables (i.e., pointers) without aliasing information. 34 | 35 | An example especially close to my heart is parallelization. 36 | Aliasing is a major impediment (the major impediment?) to automatic parallelization of code—you need to be sure that two things running in parallel can't be interfering with each other's data, which requires knowing that they don't touch pointers that alias. 37 | 38 | ### Stating the Alias Analysis Problem 39 | 40 | The problem: "For every program point, and for every pair of pointer-typed variables `p` and `q`, do `p` and `q` point to the same memory location at that point in time?" 41 | 42 | * Of course, this problem is undecidable. And even when it is computable, it can be very expensive. So we will have to make do with partial information: sometimes (often), the answer will be "maybe." 43 | * Useful answers are "must" alias vs. "must not" alias. 44 | * A common form of question to ask is a "may alias" query, which says "yes" (not very useful, and optimizations must be conservative) or "no" (we *know* these things must not alias, which is often useful for optimization). 45 | 46 | ### Alias Analysis with Data Flow 47 | 48 | Let's try to concoct a simple alias analysis using the data flow framework! 49 | 50 | * Direction: Forward. 51 | * Domain: A map from variable names to *sets of locations* that the variable *may* refer to.
(You can use this data structure to answer may-alias queries by checking whether two variables map to sets with a nonempty intersection.) (What's a "location"? See "Heap Models," below.) 52 | * Initial value: Every variable has an empty set. 53 | * Merge function: Union for every variable. 54 | * Transfer function: do these things to the mapping for pointer-relevant Bril instructions. 55 | * `x = const K`: `map[x] = {}` 56 | * `x = id y`: `map[x] = map[y]` 57 | * `x = alloc y`: `map[x] = {fresh location}` 58 | 59 | ### Heap Models 60 | 61 | Any alias analysis needs a definition of what a "memory location" is. 62 | A common answer is that there is one location per *static allocation site*. 63 | In Bril, for example, every `alloc` instruction becomes a memory location. 64 | 65 | For realistic languages, it often helps to disambiguate memory locations further: 66 | for example, to give every offset in an array a different location, 67 | or to give every field in an object a different location. 68 | 69 | ### Context-Sensitive Alias Analysis 70 | 71 | See [last time's discussion][l8] about context sensitivity in general. 72 | Context sensitivity is a big topic in alias analysis: it's common to use some limited calling context to disambiguate memory locations and alias information. 73 | 74 | Of course, there is a sharp trade-off between cost and precision. 75 | Scalable & precise alias analysis remains an open problem. 76 | Seriously, it's its own world of ongoing research. 77 | For much more, [see Smaragdakis and Balatsouras][patut]. 78 | 79 | ## Tasks 80 | 81 | There are no implementation tasks for this lesson. 82 | If alias analysis is your bag, you can start by using your data flow implementation to implement a straightforward may-alias analysis for [Bril pointers][mem], then proceed to the literature to find and implement more and more interesting pointer analyses. 83 | 84 | [mem]: https://capra.cs.cornell.edu/bril/lang/memory.html 85 | [patut]: https://yanniss.github.io/points-to-tutorial15.pdf 86 | [ocamlptr]: https://github.com/ocaml/v2.ocaml.org/blob/8414244e502ca04e014ff772a224f9cb31472708/site/learn/tutorials/pointers.md 87 | [javaprim]: https://docs.oracle.com/javase/tutorial/java/nutsandbolts/datatypes.html 88 | [l8]: @/lesson/8.md 89 | -------------------------------------------------------------------------------- /content/lesson/11.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Memory Management" 3 | [extra] 4 | due = "April 15" 5 | index = 11 6 | thread = 457 7 | [[extra.videos]] 8 | id = "1_21p8mjsw" 9 | +++ 10 | ## Gist 11 | 12 | * A [silly tweet][tweet] by [Steve Blackburn][steve], who is one of the world's preeminent GC researchers. 13 | * The calamitous failure of manual memory management. 14 | * See, for example, this [finding from Microsoft that 70% of their CVEs are from memory safety bugs][ms-bugs]. 15 | * The vast majority of languages today have either (a) manual memory management, or (b) automatic, dynamic memory management. Automatic, static memory management is mostly the domain of [Rust][]. 16 | * Some terminology: 17 | * *collector:* Short for *garbage collector*. The runtime system that calls `free` for you. 18 | * *mutator:* The actual program you're trying to run, whose garbage is being collected. 19 | * *live:* Will be accessed again in the future. 20 | * *dead:* Otherwise. 21 | * Approaches to garbage collection. 22 | * Reference counting vs. mark/sweep a.k.a.
tracing (sometimes just called "garbage collection" unto itself, but that's kind of confusing). 23 | * What metadata do they maintain, when do they run, and what do they do when they run? 24 | * Reference cycles. 25 | * Refinements under the mark/sweep umbrella. 26 | * Conservative GC. 27 | * Its applicability to memory-unsafe languages like C/C++, where the compiler does not necessarily know which values are pointers. 28 | * A [cool blog post about Riptide][riptide], the GC in Apple's JavaScript compiler, mentions that it uses a conservative GC. In fact, counterintuitively, lots of implementations of safe languages use conservative GCs—try to think of reasons why this is the case! 29 | * The wasted memory seems bad, but ["Bounding Space Usage of Conservative Garbage Collectors"][boehm] by Hans-J. Boehm shows how data structures often don't have such a big problem. 30 | * A [thread on the Go mailing list][gothread] demonstrates why conservative GC often works much better on 64-bit than 32-bit architectures: because valid pointers are far more sparse in the space of all integers. 31 | * Parallel collectors use multithreaded tracing algorithms. (Useful but not *that* interesting.) 32 | * Some aspects of "when" the GC runs: 33 | * Concurrent collection: the mutator runs in parallel with the collector. 34 | * Incremental collection. 35 | * Moving/copying/compacting. 36 | * Semispace. 37 | * Generational. 38 | * The "generational hypothesis": pithily, that *most objects die young*. More usefully, when a given GC run occurs, recently-allocated objects are more likely to be unreachable than objects that have already lasted a long time. 39 | 40 | [gothread]: https://groups.google.com/g/golang-nuts/c/qxlxu5RZAl0/ 41 | [boehm]: https://dl.acm.org/doi/abs/10.1145/565816.503282 42 | [riptide]: https://webkit.org/blog/7122/introducing-riptide-webkits-retreating-wavefront-concurrent-garbage-collector/ 43 | [steve]: http://users.cecs.anu.edu.au/~steveb/ 44 | [mem]: https://capra.cs.cornell.edu/bril/lang/memory.html 45 | [ms-bugs]: https://msrc-blog.microsoft.com/2019/07/16/a-proactive-approach-to-more-secure-code/ 46 | [rust]: https://www.rust-lang.org 47 | [tweet]: https://twitter.com/stevemblackburn/status/1075127763739144192?lang=en 48 | 49 | ## Tasks 50 | 51 | **These tasks are optional. Try them if you want, and I'll send you feedback! Or don't, and that's fine.** 52 | 53 | * Implement a garbage collector for a Bril interpreter and the Bril [memory extension][mem], eliminating the need for the `free` instruction. 54 | * Most people will want to start with [the reference interpreter][brili], which is written in TypeScript. But if you like Rust a lot, you might consider modifying [brilirs][] instead. 55 | * Stick with something simple, like reference counting or a semispace collector. Because Bril lacks objects/structs, it's not really that meaningful to implement anything much fancier than that. 56 | * Check that it works by running the [benchmarks][] that use memory after deleting all their `free` instructions. 57 | * First, check that the programs still work (and produce the same answers) when you delete `free`. 58 | * Second, add some tracking to your implementation to be sure that, when the program finishes, all the garbage that should be collected is collected. For example: 59 | * If you implement typical reference counting: After you finish returning from any function, the RC scheme should have freed anything that is only reachable from that stack frame (except for cycles).
So when you return from `main`, *everything* should be freed (again, modulo cycles). You can check that this is true and print a warning about memory leaks if it's not. 60 | * If you implement a typical tracing scheme: You'll probably choose some heuristic to determine how often GC runs. Consider amending the heuristic to run one more time after `main` returns. After this final GC run, all the garbage should be collected. Consider printing an error if the heap is nonempty at this point. 61 | 62 | [benchmarks]: https://capra.cs.cornell.edu/bril/tools/bench.html 63 | [prop]: @/syllabus.md#proposal 64 | [brili]: https://capra.cs.cornell.edu/bril/tools/interp.html 65 | [brilirs]: https://capra.cs.cornell.edu/bril/tools/brilirs.html 66 | -------------------------------------------------------------------------------- /content/lesson/13.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Concurrency & Parallelism" 3 | [extra] 4 | index = 14 5 | [[extra.readings]] 6 | name = "Threads cannot be implemented as a library" 7 | url = "https://dl.acm.org/citation.cfm?id=1065042" 8 | details = "Hans-J. Boehm. PLDI 2005." 9 | [[extra.videos]] 10 | id = "1_8cpusna2" 11 | +++ 12 | ## Gist 13 | 14 | * The end of Moore's law rears its cliché head. The need for parallelism to sustain advances in computing. 15 | * Kinds of parallel programming models. 16 | * Message passing. 17 | * Shared memory. (The predominant model in the early 21st century.) 18 | * Data parallelism. 19 | * Task parallelism. 20 | * Actors. 21 | * The problem of semantics in shared-memory multithreading, as demonstrated by an imaginary extension to Bril. 22 | 23 | Imagine this setup code: 24 | 25 | zero: int = const 0; 26 | one: int = const 1; 27 | a: ptr<int> = alloc one; 28 | b: ptr<int> = alloc one; 29 | store a zero; 30 | store b zero; 31 | 32 | that stores 0 in two pointers, `a` and `b`. Then imagine these two threads running concurrently: 33 | 34 | # thread 1 35 | store a one; 36 | bv: int = load b; 37 | 38 | # thread 2 39 | store b one; 40 | av: int = load a; 41 | 42 | What are the possible values of `av` and `bv` at the end? 43 | If you said the set {(0, 1), (1, 0), (1, 1)}, then you're implicitly imagining that parallel execution looks like an *interleaving* of the instructions from parallel threads. 44 | That would be nice! 45 | It's a *memory model* called *sequential consistency*. 46 | (For more on memory models, I strongly recommend [a primer by Sorin, Hill, and Wood][primer]. For a quick sanity check on that set of outcomes, see the interleaving-enumeration sketch at the end of this lesson's list.) 47 | 48 | * Sequential consistency and its discontents. 49 | * The happens-before relation, due to [Lamport][]. 50 | * Demonstrating a [C program][scviol] that violates SC. 51 | * Memory consistency models, more generally. 52 | * Defining data races. 53 | * The C++11 memory model and DRF⇒SC. 54 | * The Java memory model's long tale of woe. 55 | * [Threads cannot be implemented as a library.][boehm] 56 | * Attempt 1: Just compile programs normally, I guess? The lesson is that there need to be special "privileged" operations for synchronization. 57 | * Attempt 2: Synchronization operations are "black-box" calls that can't be analyzed interprocedurally. Some very tricky examples [due to Boehm][boehm] where things can go wrong: when optimizations that are *perfectly fine* in a single-threaded context are *completely incorrect* in a multi-threaded context, even when there are no threading calls anywhere in sight.
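To sanity-check the sequential consistency claim above, here is a small, throwaway Python sketch (my own illustration, not part of the lesson materials) that enumerates every interleaving of the two threads and collects the resulting `(av, bv)` pairs:

    # Each op is (kind, location), mirroring the Bril-flavored threads above.
    THREAD1 = [("store", "a"), ("load", "b")]  # store a one; bv = load b
    THREAD2 = [("store", "b"), ("load", "a")]  # store b one; av = load a

    def interleavings(t1, t2):
        """Yield every merge of t1 and t2 that preserves each thread's order."""
        if not t1:
            yield list(t2)
            return
        if not t2:
            yield list(t1)
            return
        for rest in interleavings(t1[1:], t2):
            yield [t1[0]] + rest
        for rest in interleavings(t1, t2[1:]):
            yield [t2[0]] + rest

    outcomes = set()
    for schedule in interleavings(THREAD1, THREAD2):
        mem = {"a": 0, "b": 0}  # both pointers start at zero
        loaded = {}
        for kind, loc in schedule:
            if kind == "store":
                mem[loc] = 1
            else:
                loaded[loc] = mem[loc]
        outcomes.add((loaded["a"], loaded["b"]))  # (av, bv)

    print(sorted(outcomes))  # [(0, 1), (1, 0), (1, 1)] -- never (0, 0)

Under sequential consistency, (0, 0) is impossible; real hardware and real compilers can produce exactly that outcome, which is what the SC-violating C demo in the list above is about.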
58 | 59 | [lamport]: https://lamport.azurewebsites.net/pubs/time-clocks.pdf 60 | [boehm]: https://dl.acm.org/citation.cfm?id=1065042 61 | [primer]: https://course.ece.cmu.edu/~ece847c/S15/lib/exe/fetch.php?media=part2_2_sorin12.pdf 62 | [scviol]: https://github.com/cs4110/2016fa-web/blob/4407be7203a56ddcdaae32362c2e172663678d3a/src/lectures/code32/scviol.c 63 | 64 | ## Tasks 65 | 66 | There are no tasks for this lesson other than pondering the deep weirdness of programming with shared-memory multithreading. 67 | -------------------------------------------------------------------------------- /content/lesson/14.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Fast Compilers" 3 | [extra] 4 | index = 15 5 | [[extra.readings]] 6 | name = "Flattening ASTs (and Other Compiler Data Structures)" 7 | url = "https://www.cs.cornell.edu/~asampson/blog/flattening.html" 8 | +++ 9 | ## Gist 10 | 11 | * The motivation for fast compilers. 12 | * The motivation in [dynamic compilers][jit] is probably self-evident. 13 | * In AOT compilers, I personally think compiler performance matters in two regimes: 14 | * For small projects, it affects programmer productivity via the compile-edit-run cycle. (Can you compare how you felt while programming in (for example) Scala or C++ vs. Go or OCaml?) 15 | * For large projects, compilation time can be a huge resource expenditure. If it takes multiple hours to rebuild your main product, can you think of "business" reasons why a company might care about compiler efficiency? 16 | * The ingredients in compiler performance. 17 | * Asymptotic complexity of the algorithms. 18 | * Host-language efficiency taxes. 19 | * Complexity of the source language's grammar (people love to focus on this one). 20 | * Data structures. 21 | * Some evidence-free generalities about compiler performance. 22 | * The terrible thing is that compilers tend to have a *flat profile:* there is no single performance bottleneck. Making a compiler go faster usually requires a huge number of small optimizations. 23 | * See, for example, Nicholas Nethercote's series of blog posts titled ["How to speed up the Rust compiler in…"][nnrust], which illustrate some of these small wins in a production compiler. 24 | * This is related to why compilers are so often single-threaded, and why there are almost no (but [not zero][co-dfns]) GPU-accelerated compilers. 25 | * It is therefore a good idea to proactively focus on performance at the design stage (even though this is ["the root of all evil"][knuth-evil]). 26 | * Let's focus on data structures. 27 | * One reason: It's a cross-cutting way to address performance across a flat profile. 28 | * We will learn about one extremely simple trick that you can apply in many ways. It is definitely not the only ingredient in a fast compiler, nor is it even clearly the most important one. But it is probably an important one. 29 | * Proceed to [this blog post about *data structure flattening*][flat-blog]. 30 | * Construct a simple interpreter, and try it out. 31 | * Introduce a flat representation, and adjust the `interp` function accordingly. 32 | * Discuss: 33 | * Why *should* this change be good for performance? 34 | * Are there any implications for ergonomics? 35 | * A little performance evaluation. 36 | * A random program generator, so we can skip I/O.
37 | * I endorse [Hyperfine][] for quick-and-dirty measurements like this: `hyperfine -w1 -r3 './target/release/flatcalc gen_interp 1234'` 38 | * I endorse [samply][] as a simple way to see where the time goes: `samply record --no-open -- ./target/profiling/flatcalc gen_interp 1234` 39 | * Use [my `bench.sh`][bench.sh] to reproduce a full set of results. 40 | * Consider a different kind of interpreter that exploits the flat representation. Reflect on how we have essentially reinvented the idea of a bytecode interpreter. 41 | 42 | [jit]: @/lesson/12.md 43 | [knuth-evil]: https://en.wikipedia.org/wiki/Program_optimization#When_to_optimize 44 | [co-dfns]: https://github.com/Co-dfns/Co-dfns 45 | [nnrust]: https://nnethercote.github.io/2025/03/19/how-to-speed-up-the-rust-compiler-in-march-2025.html 46 | [flat-blog]: https://www.cs.cornell.edu/~asampson/blog/flattening.html 47 | [hyperfine]: https://github.com/sharkdp/hyperfine 48 | [bench.sh]: https://github.com/sampsyo/flatcalc/blob/main/bench.sh 49 | [samply]: https://github.com/mstange/samply 50 | 51 | ## Tasks 52 | 53 | There are no tasks for this lesson. 54 | Good luck on your course project! 55 | 56 | This is the last lesson of CS 6120. If you're taking the [self-guided version of the course][sg], please fill out [this feedback form][form]. I would love to know who has taken the class---and I want your opinions about how to make it better! 57 | 58 | [sg]: @/self-guided.md 59 | [form]: https://forms.gle/GuRiMa728DUvTbZQ7 60 | -------------------------------------------------------------------------------- /content/lesson/2.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Representing Programs" 3 | [extra] 4 | due = "January 30" 5 | index = 2 6 | thread = 450 7 | [[extra.videos]] 8 | id = "1_vnx6laq9" 9 | name = "representing programs" 10 | [[extra.videos]] 11 | id = "1_jc91ke0h" 12 | name = "getting started with Bril" 13 | +++ 14 | 15 | ## Gist 16 | 17 | * How do you represent a program? The classic options: 18 | * Concrete syntax. 19 | * Abstract syntax (AST). 20 | * Instructions. 21 | * We like the instruction representation for its regularity. To do anything useful with it, however, we need to extract higher-level representations. 22 | * Control flow graph (CFG). 23 | * Basic blocks. 24 | * Terminators (`jmp` and `br`, here). 25 | * Derive the algorithm for forming basic blocks. 26 | * Successors & predecessors. 27 | * Derive the algorithm for forming a CFG of basic blocks. 28 | * [Bril][], the Big Red Intermediate Language, is a language invented just for 6120. 29 | * Philosophy. 30 | * The canonical representation is JSON. There is a text format, but that's just a detail. (Remember, this class is not about parsing. If you ever find yourself doing something fancy with the text format, stop—that's not your job.) 31 | * This way, you can use any language you want to work with Bril (and for 6120 work). 32 | * A consequence is that you can (and should!) start "from scratch" with Bril. Do not attempt to use my garbage Python stuff; write your own! 33 | * Getting started with working with Bril. 34 | * The [documentation][bril-docs]. 35 | * The [git repository][bril]. 36 | * Part of the Bril philosophy is that it is language-neutral. As a consequence, the tools you get in the Bril repository are written in a hodgepodge of different languages. 37 | * **The video's installation instructions are stale:** it refers to using `yarn` to install stuff. It's now somewhat easier: 38 | 1. 
Install [Deno][]. `brew install deno` on macOS, for example. 39 | 2. Type `deno install -g brili.ts`. 40 | 3. As Deno instructs, add `$HOME/.deno/bin` to your `$PATH`. 41 | That's it! 42 | * Also, there is another (maybe easier) way to install the Python text-format tools, using [uv][]. Try `uv tool install .` in the `bril-txt` directory. 43 | * Lots of examples in the `tests` and `benchmarks` directories. 44 | * An extremely simple example: [`add.json`](https://github.com/sampsyo/bril/blob/main/test/print/add.json) in the canonical representation, or [the equivalent text for your human convenience](https://github.com/sampsyo/bril/blob/main/test/print/add.bril). 45 | * Interacting with Bril programs using Unix pipelines. For example, to run programs you write in the text format, use `bril2json < stuff.bril | brili`. 46 | * How to load up and process a Bril program, using Python as an example, but remember that you can use any language you like. 47 | * Getting started with generating the CFG. 48 | * Notably, `call` instructions are (usually) not considered terminators. 49 | * Try it yourself! 50 | * Afterward, if you're curious, check out a completed implementation (with some added fanciness) in [the examples directory](https://github.com/sampsyo/bril/tree/main/examples) (see `form_blocks.py`, `cfg.py`, and `cfg_dot.py`). 51 | * [Turnt][] is a tool you might like for testing compiler tools. 52 | * Turnt does *snapshot testing*, also known as *golden* or *expect* testing. It's different from other kinds of testing you might have done in the past, like unit testing, in that you don't have to write a spec: you just run the program and "lock in" its output as the expected output. 53 | * It might feel weird, but I personally think snapshot testing represents a wonderful effort/reward trade-off for compilers. Check out a gentler introduction in [a blog post about snapshot testing for compilers][turnt-blog]. 54 | 55 | 56 | ## Tasks 57 | 58 | Your goal is to get familiar with [Bril][]. 59 | 60 | * Write a new benchmark. 61 | * You can write it by hand, use the [TypeScript compiler][ts2bril], or generate it some other way. 62 | * Try running it with [brili][]. 63 | * Open a pull request to add your new benchmark. 64 | * Add your code to the [the `benchmarks` directory][benchdir]. 65 | * Use `turnt --save yours.bril` to create the test outputs for your new benchmark. (See the [Turnt][] README for details.) 66 | * If your `@main` function takes arguments, you can specify ones to use in testing with an `ARGS:` comment, [like this][args-example]. 67 | * Mention it in [the docs][bmdocs]. 68 | * Write a program to analyze or transform Bril programs in some other small way that you invent. 69 | * Pick your favorite programming language—there is no "starter code," so you can start from scratch. 70 | * Load up a JSON file. You can start with [this tiny one][add]! 71 | * Read [the docs][bril-docs]. 72 | * Do something unambitious with it: count the number of add instructions, or add a `print` instruction before every jump, or whatever. Pick something small and contrived! 73 | * Use [Turnt][] to test your new tool. Think carefully about what makes a good test for your tool, no matter how trivial. 74 | * Implement the algorithms to form basic blocks and build a control flow graph. 75 | * Along the way, you may run into problems! Ask questions on [Zulip][], and open issues and pull requests to describe or fix problems. 
For example, some benchmarks you might dream up probably can't be written easily because Bril is too simple. Mention this on Zulip, and consider pitching in to help add features. 76 | * As with all implementation tasks: 77 | * Summarize your work in the discussion thread associated with this lesson (see the link above). Check out [the relevant section in the syllabus][syl-tasks] to know more about what this post should consist of. 78 | * Submit the URL for your source code on [CMS][]. 79 | 80 | [bril]: https://github.com/sampsyo/bril 81 | [bril-docs]: https://capra.cs.cornell.edu/bril/ 82 | [add]: https://github.com/sampsyo/bril/blob/main/test/parse/add.json 83 | [turnt]: https://github.com/cucapra/turnt 84 | [ts2bril]: https://capra.cs.cornell.edu/bril/tools/ts2bril.html 85 | [brili]: https://capra.cs.cornell.edu/bril/tools/interp.html 86 | [benchdir]: https://github.com/sampsyo/bril/tree/main/benchmarks 87 | [bmdocs]: https://github.com/sampsyo/bril/blob/main/docs/tools/bench.md 88 | [zulip]: https://cs6120.zulipchat.com 89 | [cms]: https://cmsx.cs.cornell.edu/ 90 | [args-example]: https://github.com/sampsyo/bril/blob/06ed7bd18324fbb8902f1ebc43fd71deac8bfb03/benchmarks/fizz-buzz.bril#L1-L2 91 | [deno]: https://deno.land 92 | [syl-tasks]: @/syllabus.md#tasks 93 | [turnt-blog]: https://www.cs.cornell.edu/~asampson/blog/turnt.html 94 | [uv]: https://docs.astral.sh/uv/ 95 | -------------------------------------------------------------------------------- /content/lesson/3.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Local Analysis & Optimization" 3 | [extra] 4 | due = "February 6" 5 | index = 3 6 | thread = 451 7 | [[extra.readings]] 8 | name = "slides from Phil Gibbons at CMU" 9 | url = "http://www.cs.cmu.edu/afs/cs/academic/class/15745-s19/www/lectures/L3-Local-Opts.pdf" 10 | details = "for more details and context on LVN" 11 | [[extra.videos]] 12 | name = "simple dead code elimination" 13 | id = "1_6k52flbg" 14 | [[extra.videos]] 15 | name = "local value numbering" 16 | id = "1_i2gnhw41" 17 | +++ 18 | ## Gist 19 | 20 | ### Intro & Simple DCE Passes 21 | 22 | Contrasting local vs. global vs. interprocedural analysis. 23 | 24 | Then, our first optimizations! 25 | 26 | * Definition of dead code elimination (DCE). 27 | * Globally unused instructions. 28 | * Derive an algorithm for deleting them. 29 | * Iterating to convergence. 30 | * Then implement it. 31 | * Locally killed instructions. 32 | * The limits of local reasoning: why can't we do this globally? 33 | * Derive an algorithm for removing them. 34 | * Then implement that too. 35 | * Let's try our new optimization pass out on the Bril benchmarks. 36 | * When working on your implementations, try them on [`simple.bril`](https://github.com/sampsyo/bril/blob/main/examples/test/tdce/simple.bril), [`reassign.bril`](https://github.com/sampsyo/bril/blob/main/examples/test/tdce/reassign.bril), and other examples in [the DCE test directory](https://github.com/sampsyo/bril/tree/main/examples/test/tdce) to see if they actually work. 37 | * First, combine all your implementations into one command somehow (including iterating to convergence).
Try out your pass—something like this: 38 | 39 | $ bril2json < bench.bril | python3 tdce.py | bril2txt 40 | 41 | * Next, try using `wc` to check static code size differences: 42 | 43 | $ bril2json < bench.bril | wc -l 44 | $ bril2json < bench.bril | python3 tdce.py | wc -l 45 | 46 | * Then profiling to measure dynamic instruction count: 47 | 48 | $ bril2json < bench.bril | brili -p 5 49 | $ bril2json < bench.bril | python3 tdce.py | brili -p 5 50 | 51 | You can see my implementation in `tdce.py` in [the examples directory][ex] in the Bril repository. 52 | 53 | ### Local Value Numbering 54 | 55 | * Local value numbering. 56 | * Consider the common thread between dead code elimination (DCE), copy propagation, and common subexpression elimination. In some compilers classes/textbooks, these are all individual optimizations. 57 | * Value numbering is a general framework for understanding & optimizing computations. 58 | * If you can deeply understand the mystical metaphysics of value numbering, you will have gotten most of what you need to get out of this part of 6120. 59 | * Extending LVN. 60 | * LVN can subsume constant folding, copy propagation, and algebraic identities. You will need to extend it with language semantics. 61 | * Write complete pseudocode for the base LVN algorithm, and work out where the "extension points" need to be to capture those optimizations. 62 | 63 | Here's the pseudocode from the video: 64 | 65 | table = mapping from value tuples to canonical variables, 66 | with each row numbered 67 | var2num = mapping from variable names to their current 68 | value numbers (i.e., rows in table) 69 | 70 | for instr in block: 71 | value = (instr.op, var2num[instr.args[0]], ...) 72 | 73 | if value in table: 74 | # The value has been computed before; reuse it. 75 | num, var = table[value] 76 | replace instr with copy of var 77 | 78 | else: 79 | # A newly computed value. 80 | num = fresh value number 81 | 82 | dest = instr.dest 83 | if instr will be overwritten later: 84 | dest = fresh variable name 85 | instr.dest = dest 86 | else: 87 | dest = instr.dest 88 | 89 | table[value] = num, dest 90 | 91 | for a in instr.args: 92 | replace a with table[var2num[a]].var 93 | 94 | var2num[instr.dest] = num 95 | 96 | You can see my implementation in `lvn.py` in [the examples directory][ex] in the Bril repository. But seriously, don't be tempted! You want to write your implementation without looking at mine! 97 | 98 | [ex]: https://github.com/sampsyo/bril/tree/main/examples 99 | 100 | ### Testing Your Optimizations 101 | 102 | As part of your tasks for this lesson, you will implement your first two optimizations. 103 | The two main things you want your optimizations to do are: 104 | 105 | 1. Not break programs. 106 | 2. Make programs faster, most of the time. 107 | 108 | As with every task in this class, part of the work is checking that you have done what you set out to do---in this case, that your optimizations do those two things. 109 | Think carefully about how to make a convincing case for each of those criteria. 110 | 111 | One tempting methodology might be to handwrite a few small test-case Bril programs (or, worse, borrow the woefully inadequate ones sitting around in the Bril git repository), run them through your optimizations, and look at them to check whether they look right. 112 | This does not amount to convincing evidence (maybe you can think of a few specific reasons why). 
While there are many ways to be convincing, a pretty good way might be to run your optimization on *every single available [Bril benchmark][bench]*, 115 | systematically check that it still produces the right output for at least one input, 116 | and collect aggregate statistics about some metric you're interested in. 117 | This is a nice way to check for unexpected behavior in programs that you didn't carefully write yourself to test the cases you're thinking of. 118 | 119 | If this is the route you choose, you can do it however you like, but I have made a simple tool that you can consider using, called [Brench][]. 120 | Brench is not very fancy; it does three things: 121 | 122 | 1. It makes it easy to run a long list of inputs through several different commands. (For example, you can run a long list of Bril benchmarks through an "interpret" command and an "optimize-and-then-interpret" command.) 123 | 2. It checks that all the commands agree on their output. (So, in our use case, it checks that optimizing the benchmark doesn't change its output when interpreted.) 124 | 3. It can collect a statistic from each command for comparison. (Like the number of dynamic instructions the interpreter executed, which is a pretty good metric for standard optimizations.) 125 | 126 | Those three things are probably what you want to do to make a convincing case for an optimization's correctness and effectiveness, whether or not you use Brench. It's there if you want it, but feel free to go your own way! 127 | 128 | [bench]: https://capra.cs.cornell.edu/bril/tools/bench.html 129 | 130 | 131 | ## Tasks 132 | 133 | * Implement "trivial" dead code elimination, in which you delete instructions that are never used before they are reassigned. Remember to iterate to convergence. (This should not take you long.) 134 | * Implement local value numbering. Make sure it eliminates some common subexpressions. Try pairing it with trivial dead code elimination as a post-processing step. (This might take you quite a while.) 135 | * In your summary on the GitHub Discussions thread, briefly write up the evidence you have that your LVN implementation is correct and actually optimizes programs. (Hint: To be convincing, do not just use a few handcrafted test cases. One good way to do this is to run it on all the benchmarks in the Bril repository, perhaps using [Brench][]. But this is truly open ended; you do you!) 136 | * For bonus "points," extend your LVN implementation to optimize the trickier examples given in class.
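If you'd rather roll your own harness than use Brench, it doesn't take much code. Here's a hedged Python sketch of the idea; it assumes your optimizer is a stdin-to-stdout JSON filter (a `tdce.py` stands in here) and that `brili -p` reports a dynamic instruction count on stderr in a `total_dyn_inst: N` format, so adjust the parsing to whatever your interpreter actually prints:

    import subprocess

    def run(bril_path, opt_cmd=None):
        """Interpret a Bril program, optionally optimizing it first.
        Returns (program output, dynamic instruction count)."""
        with open(bril_path) as f:
            prog = subprocess.run(["bril2json"], stdin=f,
                                  capture_output=True, text=True).stdout
        if opt_cmd:  # e.g., ["python3", "tdce.py"]
            prog = subprocess.run(opt_cmd, input=prog,
                                  capture_output=True, text=True).stdout
        result = subprocess.run(["brili", "-p"], input=prog,
                                capture_output=True, text=True)
        count = int(result.stderr.split(":")[1])  # parse "total_dyn_inst: N"
        return result.stdout, count

    base_out, base_count = run("bench.bril")
    opt_out, opt_count = run("bench.bril", opt_cmd=["python3", "tdce.py"])
    assert base_out == opt_out, "optimization changed the program's output!"
    print(f"dynamic instructions: {base_count} -> {opt_count}")

Loop that over every file in `benchmarks` and you have the three Brench features described above, minus the conveniences.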
137 | 138 | [brench]: https://capra.cs.cornell.edu/bril/tools/brench.html 139 | -------------------------------------------------------------------------------- /content/lesson/4.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Data Flow" 3 | [extra] 4 | due = "February 13" 5 | index = 4 6 | thread = 452 7 | [[extra.readings]] 8 | name = "the CS 4120 notes" 9 | url = "http://www.cs.cornell.edu/courses/cs4120/2019sp/lectures/20dataflow/lec20-sp16.pdf" 10 | details = "for a longer introduction to data flow" 11 | [[extra.readings]] 12 | name = "Section 5.3 of SPA" 13 | url = "https://cs.au.dk/~amoeller/spa/spa.pdf" 14 | details = "has more on fixed point algorithms for solving data flow" 15 | [[extra.videos]] 16 | name = "data flow" 17 | id = "1_72tqupsb" 18 | [[extra.videos]] 19 | name = "implementation task" 20 | id = "1_mjy6lamo" 21 | +++ 22 | ## Gist 23 | 24 | ### The Data Flow Framework 25 | 26 | * *Reaching definitions* are an example of a global property that requires you to look at an entire CFG. 27 | * Terminology: 28 | * *Use:* An instruction uses all of its arguments. (So any instruction with arguments is a "use.") 29 | * *Definition:* An instruction defines the variable it writes to. (You can call every instruction that writes to a variable a "definition.") 30 | * *Available:* Definitions that reach a given point in the program are available at that point. 31 | * *Kills:* Any definition kills all of the currently available definitions of the same variable. 32 | * The data flow framework. Here's how you solve a global analysis problem by imbuing a local analysis with a dose of data-flow magic: 33 | 1. Figure out the thing you want to know at the entry and exit of each block. 34 | 2. Write an equation for every block relating that thing at the entry to that thing at the exit. (In general, this is a *local analysis* for the block.) 35 | 3. Generate equalities according to the edges in the CFG. 36 | 4. Solve the system of equations! (Using a general solver algorithm that you don't need to adapt to every problem.) 37 | * Instantiating the data flow framework for reaching definitions. 38 | * Initial value: the empty set. 39 | * Transfer function: `out(in) = def(b) ∪ (in - kills(b))` 40 | * Merge function: union. 41 | * The worklist algorithm for solving data flow. 42 | 43 | Here's the pseudocode for solving a forward data flow problem with a worklist algorithm: 44 | 45 | in[entry] = init 46 | out[*] = init 47 | 48 | worklist = all blocks 49 | while worklist is not empty: 50 | b = pick any block from worklist 51 | in[b] = merge(out[p] for every predecessor p of b) 52 | out[b] = transfer(b, in[b]) 53 | if out[b] changed: 54 | worklist += successors of b 55 | 56 | For the backward version, flip predecessors & successors, and flip `in` and `out`. 57 | 58 | 59 | ### Instantiating Data Flow 60 | 61 | * Theory interlude: the requirements for a general data flow analysis. How do you know the worklist algorithm terminates and gives you the right answer? 62 | * The domain of values you're trying to compute needs to form a *partial order* with a unique lower bound. The rough idea is that the worklist algorithm should only "move" values monotonically in the order, so it's guaranteed to eventually terminate. 63 | * In terms of a partial order ⊑, the merge function is the *meet* (greatest lower bound) operator ⊓; the initial value is the top value ⊤; and the transfer function must be a monotonic function, so `x ⊑ y` implies `f(x) ⊑ f(y)`.
64 | * The usual definition of a "correct" solution to a data-flow problem is the *meet-over-all-paths* solution: the meet of chained applications of the transfer functions for every path in the CFG from the entry block to any given block. 65 | * For more on the theory, I recommend Chapter 5 of [*Static Program Analysis* by Møller and Schwartzbach][spa]. 66 | * More examples of things you can do with the data flow framework. 67 | * Reaching definitions. 68 | * Live variables: which variables are both defined and going to be used at some point in the future? 69 | * Constant propagation: which variables have statically knowable constant values? 70 | * Available expressions: which *expressions* have already been computed in the past? (Useful in CSE.) 71 | * Initialized variables: like in Java, which variables have had *something* written to them? 72 | * Interval analysis: what is the numerical range of values that a given variable might hold? 73 | * Demonstrating a simple generic implementation. 74 | * If you want, you can use the `{args}` feature of [Turnt][] and its `-a` command-line flag to quickly switch between different analyses. 75 | 76 | [spa]: https://cs.au.dk/~amoeller/spa/spa.pdf 77 | [turnt]: https://github.com/cucapra/turnt 78 | 79 | ## Tasks 80 | 81 | * Implement at least one data flow analysis. You choose which. 82 | * For bonus "points," implement a generic solver that supports multiple analyses. 83 | * As always: Try to convince yourself that your analyses are working how you want them to. 84 | -------------------------------------------------------------------------------- /content/lesson/5.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Global Analysis" 3 | [extra] 4 | due = "February 25" 5 | index = 5 6 | thread = 453 7 | [[extra.videos]] 8 | name = "global analysis & optimization" 9 | id = "1_i5apfx6t" 10 | +++ 11 | ## Gist 12 | 13 | ### Dominators 14 | 15 | Lots of definitions! 16 | 17 | * Reminders: Successors & predecessors. Paths in CFGs. 18 | * *A* *dominates* *B* iff all paths from the entry to *B* include *A*. 19 | * The *dominator tree* is a convenient data structure for storing the dominance relationships in an entire function. The recursive children of a given node in a tree are the nodes that that node dominates. 20 | * *A* *strictly dominates* *B* iff *A* dominates *B* and *A ≠ B*. (Dominance is reflexive, so "strict" dominance just takes that part away.) 21 | * *A* *immediately dominates* *B* iff *A* strictly dominates *B* but *A* does not strictly dominate any other node that strictly dominates *B*. (In which case *A* is *B*'s direct parent in the dominator tree.) 22 | * A *dominance frontier* is the set of nodes that are just "one edge away" from being dominated by a given node. Put differently, *A*'s dominance frontier contains *B* iff *A* does not strictly dominate *B*, but *A* does dominate some predecessor of *B*. 23 | * *Post-dominance* is the reverse of dominance. *A* post-dominates *B* iff all paths from *B* to the exit include *A*. (You can extend the strict version, the immediate version, trees, etc. to post-dominance.) 24 | 25 | An algorithm for finding dominators: 26 | 27 | dom = {every block -> all blocks} 28 | dom[entry] = {entry} 29 | while dom is still changing: 30 | for vertex in CFG except entry: 31 | dom[vertex] = {vertex} ∪ ⋂(dom[p] for p in vertex.preds) 32 | 33 | The `dom` relation will, in the end, map each block to its set of dominators.
34 | We initialize it as the "complete" relation, i.e., mapping every block to the set of _all_ blocks. 35 | The exception is the entry block, which we ensure *always* only has itself as a dominator. 36 | (This keeps the algorithm from being confused by blocks that jump back to the entry node.) 37 | The loop pares down the sets by iterating to convergence. 38 | 39 | The running time is O(n²) in the worst case. 40 | But there's a trick: if you iterate over the CFG in *reverse post-order*, and the CFG is well behaved (reducible), it runs in linear time—the outer loop runs a constant number of times. 41 | 42 | ### Natural Loops 43 | 44 | Some things about loops: 45 | 46 | * *Natural loops* are strongly connected components in the CFG with a single entry. 47 | * Natural loops are formed around *backedges*, which are edges from *A* to *B* where *B* dominates *A*. 48 | * (Side note: There are actually two common definitions of *backedges:* this one, and one that relies on a depth-first search (DFS). By the other definition, a backedge is any edge that takes you to an already-visited node during DFS. The relationship between these two definitions is not 100% clear to me, although they are certainly not equivalent, at least for irreducible CFGs.) 49 | * A natural loop is the smallest set of vertices *L* including *A* and *B* such that, for every *v* in *L*, either all the predecessors of *v* are in *L* or *v*=*B*. 50 | * A CFG is *reducible* iff every backedge has a natural loop. 51 | * A language that only has `for`, `while`, `if`, `break`, `continue`, etc. can only generate reducible CFGs. You need `goto` or something to generate irreducible CFGs. 52 | 53 | ### Loop-Invariant Code Motion 54 | 55 | Here's a preview of what we'll do with natural loops. 56 | The *loop-invariant code motion* (LICM) optimization transforms code like this: 57 | 58 | let a = ...; 59 | let b = ...; 60 | for (let i = 0; i < 100; ++i) { 61 | f(a * b); 62 | } 63 | 64 | Into this, by moving code that does the same thing on every iteration to the loop's *preheader* block: 65 | 66 | let a = ...; 67 | let b = ...; 68 | let c = a * b; 69 | for (let i = 0; i < 100; ++i) { 70 | f(c); 71 | } 72 | 73 | That is, we want to move code from inside the loop to before it---when that computation always results in the same value. 74 | We'll return to LICM in [Lesson 8][l8]. 75 | 76 | ## Tasks 77 | 78 | * Implement some dominance utilities: 79 | * Find dominators for a function. 80 | * Construct the dominance tree. 81 | * Compute the dominance frontier. 82 | * Devise a way to test your implementations. For example, is there a way you can algorithmically confirm that a block *A* dominates a block *B*? While *computing* these sets should be cheap, *checking* their output could use slow, naive algorithms. 83 | 84 | [is_ssa]: https://github.com/sampsyo/bril/blob/main/examples/is_ssa.py 85 | [l8]: @/lesson/8.md 86 | -------------------------------------------------------------------------------- /content/lesson/7.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "LLVM" 3 | [extra] 4 | due = "March 13" 5 | index = 7 6 | thread = 455 7 | [[extra.readings]] 8 | name = "LLVM for Grad Students" 9 | url = "https://www.cs.cornell.edu/~asampson/blog/llvm.html" 10 | [[extra.videos]] 11 | name = "introduction to LLVM" 12 | id = "1_f231lwkz" 13 | [[extra.videos]] 14 | name = "writing an LLVM pass" 15 | id = "1_4nrtmvc9" 16 | +++ 17 | ## Gist 18 | 19 | ### What is LLVM? 
20 | 21 | - [LLVM][] is a very popular compiler infrastructure for C and other languages, including Rust. 22 | - It has a nice SSA-form IR and a nice API for manipulating it. 23 | - In one sense, it is like a supercharged version of Bril. (Not coincidental because I modeled Bril after it.) 24 | - It is also, in a sense, the *opposite* of Bril because it is *real* infrastructure that the world *really* runs on for *lots* of software out there. It’s likely that some of the software you’re using to watch this video, somewhere along the chain, was compiled with LLVM. 25 | - So it is 10,000 times more realistic. Which is great, but it means that some ways Bril is “idealized” will be ruthlessly not-idealized in LLVM. 26 | - Did I mention that LLVM is written in C++? So we’re going to write in C++ (a sharp contrast from Bril, where any language will do). 27 | 28 | ### Getting Started 29 | 30 | - We will follow an old [tutorial][blog] I wrote. 31 | - You will need to install LLVM. (And also [CMake][].) 32 | - Not covered in detail here. It’s probably in your package manager. There are also tips in [that blog post][blog]. 33 | - For example, try `brew install cmake llvm` using [Homebrew][brew] on macOS. (On macOS, please install from Homebrew instead of using the version that comes with Xcode.) 34 | - Make sure you can do `clang --version`. 35 | - Compile C programs to LLVM IR: `clang -emit-llvm -S -o - something.c` 36 | - Show off the [language reference][langref]. (RTFM often!) 37 | - Show off the Doxygen pages (and how to Google for them). 38 | 39 | ### Writing a Pass 40 | 41 | Please follow the [“LLVM for Grad Students”][blog] tutorial. 42 | (The second video walks through the same steps.) 43 | 44 | The command in the video for *running* your pass is out of date. 45 | (It's correct in [the blog post][blog].) 46 | Instead of that `-Xclang` business, you want: 47 | 48 | $ clang -fpass-plugin=build/skeleton/SkeletonPass.so something.c 49 | 50 | ...or possibly `.dylib` instead of `.so`, if you're on macOS. 51 | 52 | ## Tasks 53 | 54 | - Follow the LLVM tutorial blog post far enough to implement a pass that changes program execution. 55 | - This is intentionally open-ended. You can be as ambitious or as unambitious as you want. 56 | - An example of an unambitious but acceptable task would be to print out a message every time the program uses floating-point division. 57 | - An example of an ambitious task would be to implement a fancy optimization on LLVM IR and make sure it speeds things up in actual wall-clock time execution. 58 | - Find a real-ish C/C++ program somewhere and run your pass on it to observe the results. 59 | 60 | [blog]: https://www.cs.cornell.edu/~asampson/blog/llvm.html 61 | [llvm]: https://llvm.org 62 | [langref]: https://llvm.org/docs/LangRef.html 63 | [cmake]: https://cmake.org 64 | [brew]: https://brew.sh 65 | -------------------------------------------------------------------------------- /content/lesson/9.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Interprocedural Analysis" 3 | [extra] 4 | index = 9 5 | [[extra.videos]] 6 | id = "1_9csov2la" 7 | +++ 8 | ## Gist 9 | 10 | Recap: we have done *local* (within a basic block) and *global* (within a function) analyses & optimizations. But what about optimizations that have to cross function boundaries? Those are *interprocedural*. 11 | 12 | Like, what if we want to do LICM to this code? 13 | 14 | main() { 15 | let x = ...; 16 | for (...) 
{ 17 | g(f(x)); 18 | } 19 | } 20 | 21 | f(x) { 22 | return x * 42; 23 | } 24 | 25 | The multiplication in `f` is clearly loop invariant, but no global optimization can tell that. 26 | And worse still, we can't very well move the multiplication "out of" the body of `f` because there might be other places that call `f` that expect it to work as written. 27 | 28 | ### The Call Graph 29 | 30 | * In interprocedural analysis, you often want to know which functions call which other functions. Like a CFG but for functions, in a way. 31 | * In a call graph, every function is a vertex and every edge represents a call from one function to another. (A tiny sketch of building one for Bril appears at the end of the "Inlining" section below.) 32 | 33 | ### Open vs. Closed World 34 | 35 | * Practically speaking, most interprocedural analyses need to assume they work in an *open world*, i.e., they don't get to see *all* the code that will eventually turn into the final program. Some code is "hidden," and the analysis must assume it can do anything. 36 | * In more exotic scenarios, you get to make a *closed-world assumption*, i.e., that your analysis gets to see everything. That's often called a *whole-program* analysis. 37 | * Assuming a closed world, for example, lets you delete a function if you see that it is never called. In an open world, you can't be sure the function is actually dead. (You don't get to see the whole call graph.) 38 | * While whole-program analysis is strictly more powerful, there are many reasons why realistic analyses need to assume an open world: 39 | * Separate compilation. People want the ability to compile `*.c` files to `*.o` files independently—for many good reasons. 40 | * Speed: it can be impractical to analyze whole programs if they're really big. 41 | * In many more-dynamic languages (and even in Java), it's possible for the program to load more code at run time. So assumptions you make about the "whole program" may be invalidated when the application loads a plugin. 42 | * Practically speaking, whole-program analysis generally happens in these scenarios: 43 | * [Link-time optimization (LTO)][lto] is an *extra* optimization phase that happens *after* you independently compile all those `*.c` files to `*.o` files, when you want to link them together into an executable, so you do get to see all the code. 44 | * [Just-in-time (JIT)][jit] compilers get to see a snapshot of the code for the entire program right before it runs. They can temporarily apply closed-world optimizations and then invalidate them if the program loads more code later on. 45 | 46 | ### Inlining 47 | 48 | * Inlining is a pretty simple idea: take a function call, and replace it with a copy of the called function's body. You eliminate the call and just do the computation right there. 49 | * Inlining is the "ur-optimization" because it gives interprocedural superpowers to local & global optimizations. If you can do a good job with inlining, you unlock many more downstream global optimizations. 50 | * Of course, you can't inline *everything*. 51 | * Inlining the entire call graph into `main` would make the code exponentially large. And it's impossible when there's recursion, of course. 52 | * In general, inlining has a cost (code size increases, worse instruction locality) and a benefit (remove the cost of the call, enable more optimization). Inliners need to decide when the benefit outweighs the cost. 53 | * Inevitably, you need some kind of heuristic. For example, an easy one is to only inline functions that are small enough, i.e., below a fixed instruction-count threshold.
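Interprocedural work like inlining starts from the call graph described above, and for Bril that graph is cheap to build. Here's a hedged sketch over Bril's JSON form; it assumes the documented `functions`, `name`, `instrs`, and (on `call` instructions) `funcs` fields, and it ignores anything fancier:

    import json
    import sys

    def call_graph(prog):
        """Map each function's name to the set of function names it calls."""
        graph = {}
        for func in prog["functions"]:
            callees = set()
            for instr in func.get("instrs", []):
                if instr.get("op") == "call":
                    callees.update(instr.get("funcs", []))
            graph[func["name"]] = callees
        return graph

    if __name__ == "__main__":
        # Usage: bril2json < program.bril | python3 callgraph.py
        for caller, callees in call_graph(json.load(sys.stdin)).items():
            print(caller, "->", ", ".join(sorted(callees)) or "(none)")

A size-threshold inliner could then walk this graph and, say, only inline callees whose `instrs` lists are shorter than some fixed bound.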
54 | 55 | ### Devirtualization 56 | 57 | * Lots of languages (but not Bril—yet!) have virtual function calls. 58 | * Every method call in Java is a virtual call, for example: the actual code you invoke for `o.m()` depends on the run-time type of `o` and whether the subclass overrides the `m` method. 59 | * In assembly, these show up as [indirect jumps][ij]. 60 | * It would be *really nice* to inline virtual function calls, but doing it in a straightforward way is impossible because we don't know which function is being called! 61 | * *Devirtualization* is an optimization that turns virtual (indirect) calls into direct calls, when possible. Then inlining and other interprocedural optimizations can work on those direct calls. 62 | * To use Java as an example again, an easy case is when you initialize an object with `Foo o = new Baz()` and then *immediately*, like on the next line of code, call `o.m()`. You know that `o` will be a `Baz` object at that call site, so you know it *must* call `Baz`'s version of the `m` method. 63 | * In general, you want to do a data flow analysis to propagate information about the dynamic types of objects from assignments to method calls, and then use that information to decide whether there is exactly one possibility for the function you need to invoke directly. 64 | * I recommend [this blog post about devirtualization in LLVM][llvm-devirt]. 65 | 66 | ### Context Sensitivity 67 | 68 | Sometimes the answer to a question about a function depends on another question: *which call are we talking about?* 69 | 70 | For example, imagine we are trying to optimize this somewhat funky program that uses lots of evil mutable global state: 71 | 72 | bool b; // global variables 73 | int i = 0; 74 | 75 | main() { 76 | g1(); 77 | print(i); 78 | 79 | g2(); 80 | print(i); 81 | } 82 | 83 | g1() { 84 | b = true; 85 | f(); 86 | } 87 | 88 | g2() { 89 | b = false; 90 | f(); 91 | } 92 | 93 | f() { 94 | if (b) { 95 | i++; 96 | } 97 | } 98 | 99 | The call to `f` in `g1` matters, but the one in `g2` is "dead code" that can't affect anything the program will do. 100 | Inlining could reveal this fact, of course, but we know it's not always practical to inline everything. 101 | And any self-respecting (i.e., sound) interprocedural analysis that asks *does `f` modify `i`?* must say *yes, it might!* 102 | 103 | So is there a way to tell that the second `f` can be eliminated? 104 | For that, we need a *context-sensitive* analysis. 105 | The idea is to use some kind of contextual information to distinguish different invocations of the same code. 106 | For example, one common kind of context is the call stack: 107 | a context-sensitive analysis could draw different conclusions about calls to `f` from `g1` versus calls to `f` from `g2`. 108 | 109 | Context-sensitive analyses can get expensive quickly. 110 | For example, in our imaginary *does the function modify `i`?* analysis, a context-insensitive analysis would need to answer *n* queries where *n* is the number of functions in the program. 111 | But a context-sensitive analysis that uses the calling function as the context needs *n²* queries. 112 | And you can even use deeper call stacks as context to get even more precise answers—if you use the *i* most recent calls as context, you now need to answer *nⁱ* queries. 113 | So in general, context sensitivity represents a pretty steep trade-off between analysis precision and cost.
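To put rough numbers on that blowup, here's a quick back-of-the-envelope Python sketch (my own illustration, not from the lesson) that counts the distinct call-stack contexts per function for the four-function example above:

    from itertools import product

    funcs = ["main", "g1", "g2", "f"]  # n = 4 functions

    # With the last `depth` callers as context, each function can be
    # analyzed under n^depth distinct contexts.
    for depth in range(1, 4):
        contexts = list(product(funcs, repeat=depth))
        print(f"context depth {depth}: {len(contexts)} contexts per function")
    # context depth 1: 4, depth 2: 16, depth 3: 64 -- i.e., n^i growth

Multiply by the *n* functions themselves and the query count grows accordingly, which is why real analyses typically bound the context depth aggressively.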
114 | 115 | [jit]: https://en.wikipedia.org/wiki/Just-in-time_compilation 116 | [lto]: https://en.wikipedia.org/wiki/Interprocedural_optimization#WPO_and_LTO 117 | [ij]: https://en.wikipedia.org/wiki/Indirect_branch 118 | [llvm-devirt]: https://blog.llvm.org/2017/03/devirtualization-in-llvm-and-clang.html 119 | 120 | ## Tasks 121 | 122 | There are no tasks to turn in for this lesson. 123 | For "fun," you can consider implementing inlining for Bril function calls! 124 | -------------------------------------------------------------------------------- /content/lesson/_index.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Lessons" 3 | template = "lessons.html" 4 | page_template = "lesson.html" 5 | +++ 6 | # Lessons 7 | -------------------------------------------------------------------------------- /content/schedule.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Schedule" 3 | template = "schedule.html" 4 | [extra] 5 | hide = false 6 | class = "wide" 7 | +++ 8 | -------------------------------------------------------------------------------- /content/self-guided.md: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "The Self-Guided Course" 3 | template = "sg.html" 4 | [extra] 5 | hide = true 6 | [extra.readings] 7 | "1" = ["wrongdata", "eeg"] 8 | "5" = ["ball-larus"] 9 | "6" = ["alive"] 10 | "9" = ["tbaa"] 11 | "10" = ["ugc", "consgc"] 12 | "11" = ["self", "tracemonkey"] 13 | "12" = ["super", "chlorophyll"] 14 | "13" = ["notlib", "slp", "dpj", "compcert"] 15 | +++ 16 | # CS 6120: Advanced Compilers: The Self-Guided Online Course 17 | 18 | CS 6120 is a PhD-level [Cornell CS][cs] course by [Adrian Sampson][adrian] on programming language implementation. 19 | It covers universal compilers topics like intermediate representations, data flow, and “classic” optimizations as well as more research-flavored topics such as parallelization, just-in-time compilation, and garbage collection. 20 | The work consists of reading papers and open-source hacking tasks, which use [LLVM][] and [an educational IR invented just for this class][bril]. 21 | 22 | This page lists the curriculum for following this course at the university of your imagination, for four imagination credits (ungraded). 23 | There’s a linear timeline of lessons interspersed with papers to read. 24 | Each lesson has videos and written notes, and some have *implementation tasks* for you to complete. 25 | Tasks are all open-ended, to one degree or another, and are meant to solidify your understanding of the abstract concepts by turning them into real code. 26 | The order represents a suggested interleaving of video-watching and paper-reading. 27 | 28 | Some differences with the “real” CS 6120 are that you can ignore the task deadlines and you can’t participate in our discussion threads on Zulip. 29 | Real 6120 also has an end-of-semester course project—in the self-guided version, your end-of-semester assignment is to change the world through the magic of compilers. 30 | 31 | The instructor is a video production neophyte, so please excuse the production values, especially in the early lessons. 32 | CS 6120 is [open source and on GitHub][gh], so please file bugs if you find problems. 33 | 34 | When you finish the course, please fill out [this feedback form][form]. 
35 | 
36 | [gh]: https://github.com/sampsyo/cs6120
37 | [cs]: https://www.cs.cornell.edu/
38 | [adrian]: https://www.cs.cornell.edu/~asampson/
39 | [bril]: https://capra.cs.cornell.edu/bril/
40 | [llvm]: https://llvm.org/
41 | [form]: https://forms.gle/GuRiMa728DUvTbZQ7
42 | 
--------------------------------------------------------------------------------
/data/blog_footer.md:
--------------------------------------------------------------------------------
1 | This is the course blog for [CS 6120][cs6120], the PhD-level compilers course in the computer science department at Cornell.
2 | You can subscribe to [posts on the blog][blog] with [RSS][].
3 | 
4 | [cs6120]: https://www.cs.cornell.edu/courses/cs6120/2025sp/
5 | [blog]: https://www.cs.cornell.edu/courses/cs6120/2025sp/blog/
6 | [rss]: https://www.cs.cornell.edu/courses/cs6120/2025sp/rss.xml
7 | 
--------------------------------------------------------------------------------
/data/content.toml:
--------------------------------------------------------------------------------
1 | # Introductions.
2 | 
3 | [[classes]]
4 | lesson = 1
5 | 
6 | [[classes]]
7 | title = "Performance and Measurement"
8 | leader = "Adrian"
9 | readings = ["wrongdata", "eeg"]
10 | 
11 | [[classes]]
12 | lesson = 2
13 | 
14 | 
15 | # "Classic" optimization.
16 | 
17 | [[classes]]
18 | lesson = 3
19 | 
20 | [[classes]]
21 | lesson = 3
22 | 
23 | [[classes]]
24 | lesson = 4
25 | 
26 | [[classes]]
27 | lesson = 4
28 | 
29 | [[classes]]
30 | lesson = 5
31 | 
32 | [[classes]]
33 | title = "Profiling"
34 | leader = "Simon, Noah Schiff, Kabir Samsi, Allen"
35 | readings = ["ball-larus"]
36 | 
37 | [[classes]]
38 | lesson = 6
39 | 
40 | [[classes]]
41 | lesson = 6
42 | 
43 | [[classes]]
44 | title = "SSA"
45 | leader = "Neel, Dev Patel (dp673)"
46 | readings = ["ssa-construct"]
47 | 
48 | 
49 | # LLVM.
50 | 
51 | [[classes]]
52 | lesson = 7
53 | 
54 | [[classes]]
55 | title = "Automatic Verification"
56 | leader = "Annabel Baniak, Katherine Wu, Stephanie, Max"
57 | readings = ["alive"]
58 | 
59 | 
60 | # "Fancy" optimization.
61 | 
62 | [[classes]]
63 | lesson = 8
64 | 
65 | [[classes]]
66 | lesson = 9
67 | 
68 | [[classes]]
69 | lesson = 10
70 | 
71 | [[classes]]
72 | title = "Alias-Based Optimization"
73 | leader = "Michael Xing, Gerardo Teruel (gg525), Arnav Muthiayen"
74 | readings = ["tbaa"]
75 | 
76 | 
77 | # Memory management.
78 | 
79 | [[classes]]
80 | lesson = 11
81 | 
82 | [[classes]]
83 | title = "GC & Reference Counting"
84 | leader = "Ethan Gabizon, Parth Sarkar, Ernest Ng"
85 | readings = ["ugc"]
86 | 
87 | [[classes]]
88 | title = "Fancy Memory Management"
89 | leader = "Bryant Park, David Han, Mariia Soroka, Lisa Li"
90 | readings = ["mesh"]
91 | 
92 | 
93 | # Dynamic languages.
94 | 
95 | [[classes]]
96 | lesson = 12
97 | 
98 | [[classes]]
99 | title = "Dynamic Languages"
100 | leader = "Serena Duncan and Ananya Goenka"
101 | readings = ["self"]
102 | 
103 | [[classes]]
104 | title = "Tracing"
105 | leader = "Ethan, Tean Lai (tml95)"
106 | readings = ["tracemonkey"]
107 | 
108 | 
109 | # Potpourri.
110 | 
111 | [[classes]]
112 | title = "Superoptimization"
113 | leader = "Mark Barbone, Zihan Li, Sam Breckenridge"
114 | readings = ["denali"]
115 | 
116 | [[classes]]
117 | lesson = 13
118 | 
119 | [[classes]]
120 | lesson = 14
121 | 
122 | [[classes]]
123 | title = "Interactive Verification"
124 | leader = "Edmund, Jonah Bernard, Mahmoud Elsharawy"
125 | readings = ["compcert"]
126 | 
--------------------------------------------------------------------------------
/data/reading.toml:
--------------------------------------------------------------------------------
1 | [wrongdata]
2 | name = "Producing Wrong Data Without Doing Anything Obviously Wrong!"
3 | link = "https://dl.acm.org/citation.cfm?id=1508275"
4 | details = "Todd Mytkowicz, Amer Diwan, Matthias Hauswirth, and Peter F. Sweeney. ASPLOS 2009."
5 | 
6 | [eeg]
7 | name = "SIGPLAN Empirical Evaluation Guidelines"
8 | link = "https://www.sigplan.org/Resources/EmpiricalEvaluation/"
9 | 
10 | [ball-larus]
11 | name = "Efficient Path Profiling"
12 | link = "https://dl.acm.org/citation.cfm?id=243857"
13 | details = "Thomas Ball and James R. Larus. MICRO 1996."
14 | 
15 | [alive]
16 | name = "Provably Correct Peephole Optimizations with Alive"
17 | link = "https://dl.acm.org/citation.cfm?id=2737965"
18 | details = "Nuno P. Lopes, David Menendez, Santosh Nagarakatte, and John Regehr. PLDI 2015."
19 | 
20 | [tbaa]
21 | name = "Type-Based Alias Analysis"
22 | link = "https://dl.acm.org/citation.cfm?id=277670"
23 | details = "Amer Diwan, Kathryn S. McKinley, and J. Eliot B. Moss. PLDI 1998."
24 | 
25 | [ugc]
26 | name = "A Unified Theory of Garbage Collection"
27 | link = "https://dl.acm.org/citation.cfm?id=1028982"
28 | details = "David F. Bacon, Perry Cheng, and V. T. Rajan. OOPSLA 2004."
29 | 
30 | [consgc]
31 | name = "Fast Conservative Garbage Collection"
32 | link = "http://www.cs.utexas.edu/~mckinley/papers/conservative-gc-oopsla-2014.pdf"
33 | details = "Rifat Shahriyar, Stephen M. Blackburn, and Kathryn S. McKinley. OOPSLA 2014."
34 | 
35 | [mesh]
36 | name = "Mesh: Compacting Memory Management for C/C++ Applications"
37 | link = "https://dl.acm.org/doi/10.1145/3314221.3314582"
38 | details = "Bobby Powers, David Tench, Emery D. Berger, and Andrew McGregor. PLDI 2019."
39 | 
40 | [self]
41 | name = "An Efficient Implementation of SELF, a Dynamically-Typed Object-Oriented Language Based on Prototypes"
42 | link = "http://portal.acm.org/citation.cfm?id=74884"
43 | details = "C. Chambers, D. Ungar, and E. Lee. OOPSLA 1989."
44 | 
45 | [tracemonkey]
46 | name = "Trace-Based Just-in-Time Type Specialization for Dynamic Languages"
47 | details = "Andreas Gal, Brendan Eich, Mike Shaver, David Anderson, David Mandelin, Mohammad R. Haghighat, Blake Kaplan, Graydon Hoare, Boris Zbarsky, Jason Orendorff, Jesse Ruderman, Edwin W. Smith, Rick Reitmaier, Michael Bebenita, Mason Chang, and Michael Franz. PLDI 2009."
48 | link = "https://dl.acm.org/citation.cfm?id=1542476.1542528"
49 | 
50 | [super]
51 | name = "Superoptimizer: A Look at the Smallest Program"
52 | details = "Alexia Massalin. ASPLOS 1987."
53 | link = "https://courses.cs.washington.edu/courses/cse501/15sp/papers/massalin.pdf"
54 | 
55 | [chlorophyll]
56 | name = "Chlorophyll: Synthesis-Aided Compiler for Low-Power Spatial Architectures"
57 | link = "https://dl.acm.org/citation.cfm?id=2594339"
58 | details = "Phitchaya Mangpo Phothilimthana, Tikhon Jelvis, Rohin Shah, Nishant Totla, Sarah Chasins, and Rastislav Bodik. PLDI 2014."
59 | 
60 | [notlib]
61 | name = "Threads Cannot Be Implemented as a Library"
62 | link = "https://dl.acm.org/doi/10.1145/1065010.1065042"
63 | details = "Hans-J. Boehm. PLDI 2005."
64 | 
65 | [slp]
66 | name = "Exploiting Superword Level Parallelism with Multimedia Instruction Sets"
67 | link = "https://dl.acm.org/doi/10.1145/358438.349320"
68 | details = "Samuel Larsen and Saman Amarasinghe. PLDI 2000."
69 | 
70 | [dpj]
71 | name = "A Type and Effect System for Deterministic Parallel Java"
72 | link = "http://dpj.cs.illinois.edu/DPJ/Publications_files/DPJ-OOPSLA-2009.pdf"
73 | details = "Robert L. Bocchino, Vikram S. Adve, Danny Dig, Sarita V. Adve, Stephen Heumann, Rakesh Komuravelli, Jeffrey Overbey, Patrick Simmons, Hyojin Sung, and Mohsen Vakilian. OOPSLA 2009."
74 | 
75 | [compcert]
76 | name = "Formal Verification of a Realistic Compiler"
77 | link = "https://dl.acm.org/citation.cfm?id=1538814"
78 | details = "Xavier Leroy. CACM 2009."
79 | 
80 | [mlir]
81 | name = "MLIR: A Compiler Infrastructure for the End of Moore’s Law"
82 | link = "https://arxiv.org/abs/2002.11054"
83 | details = "Chris Lattner, Mehdi Amini, Uday Bondhugula, Albert Cohen, Andy Davis, Jacques Pienaar, River Riddle, Tatiana Shpeisman, Nicolas Vasilache, and Oleksandr Zinenko. arXiv preprint, 2020."
84 | 
85 | [ssa-construct]
86 | name = "Simple and Efficient Construction of Static Single Assignment Form"
87 | link = "https://c9x.me/compile/bib/braun13cc.pdf"
88 | details = "Matthias Braun, Sebastian Buchwald, Sebastian Hack, Roland Leißa, Christoph Mallon, and Andreas Zwinkau. CC 2013."
89 | 
90 | [denali]
91 | name = "Denali: A Goal-Directed Superoptimizer"
92 | link = "https://dl.acm.org/doi/10.1145/543552.512566"
93 | details = "Rajeev Joshi, Greg Nelson, and Keith Randall. PLDI 2002."
94 | 
95 | [eqsat]
96 | name = "Equality Saturation: a New Approach to Optimization"
97 | link = "https://rosstate.org/publications/eqsat/eqsat_tate_popl09.pdf"
98 | details = "Ross Tate, Michael Stepp, Zachary Tatlock, and Sorin Lerner. POPL 2009."
99 | -------------------------------------------------------------------------------- /data/schedule.toml: -------------------------------------------------------------------------------- 1 | [[days]] 2 | date = "January 21" 3 | month = true 4 | 5 | [[days]] 6 | date = "January 23" 7 | 8 | [[days]] 9 | date = "January 28" 10 | 11 | [[days]] 12 | date = "January 30" 13 | 14 | [[days]] 15 | date = "February 4" 16 | month = true 17 | 18 | [[days]] 19 | date = "February 6" 20 | 21 | [[days]] 22 | date = "February 11" 23 | 24 | [[days]] 25 | date = "February 13" 26 | 27 | [[days]] 28 | date = "February 18" 29 | canceled = true 30 | event = "February break" 31 | 32 | [[days]] 33 | date = "February 20" 34 | 35 | [[days]] 36 | date = "February 25" 37 | 38 | [[days]] 39 | date = "February 27" 40 | 41 | [[days]] 42 | date = "March 4" 43 | month = true 44 | 45 | [[days]] 46 | date = "March 6" 47 | 48 | [[days]] 49 | date = "March 11" 50 | 51 | [[days]] 52 | date = "March 13" 53 | 54 | [[days]] 55 | date = "March 18" 56 | 57 | [[days]] 58 | date = "March 20" 59 | 60 | [[days]] 61 | date = "March 25" 62 | 63 | [[days]] 64 | date = "March 27" 65 | notes = ["Due: [Project](../syllabus/#project) proposal."] 66 | 67 | [[days]] 68 | date = "April 1" 69 | month = true 70 | canceled = true 71 | event = "Spring break" 72 | 73 | [[days]] 74 | date = "April 3" 75 | canceled = true 76 | event = "Spring break" 77 | 78 | [[days]] 79 | date = "April 8" 80 | 81 | [[days]] 82 | date = "April 10" 83 | 84 | [[days]] 85 | date = "April 15" 86 | 87 | [[days]] 88 | date = "April 17" 89 | 90 | [[days]] 91 | date = "April 22" 92 | 93 | [[days]] 94 | date = "April 24" 95 | 96 | [[days]] 97 | date = "April 29" 98 | 99 | [[days]] 100 | date = "May 1" 101 | month = true 102 | 103 | [[days]] 104 | date = "May 6" 105 | 106 | [[days]] 107 | date = "May 13" 108 | event = "Final project deadline (12pm)" 109 | canceled = true 110 | notes = ["Due: [Project](../syllabus/#project)."] 111 | month = true 112 | -------------------------------------------------------------------------------- /sass/main.scss: -------------------------------------------------------------------------------- 1 | $main-color: rgb(179, 27, 27); 2 | $disabled-color: #666; 3 | $exception-color: #ccc; 4 | $max-width: 40rem; 5 | $max-width-wide: 60rem; 6 | 7 | 8 | // Layout. 9 | body { 10 | margin: 0; 11 | } 12 | 13 | header { 14 | width: 100%; 15 | padding: 0.5rem 0; 16 | nav { 17 | display: block; 18 | margin: 0.5rem auto; 19 | max-width: $max-width; 20 | padding: 0 1rem; 21 | } 22 | } 23 | 24 | main { 25 | margin: 1rem auto; 26 | max-width: $max-width; 27 | 28 | padding-left: 1rem; 29 | padding-right: 1rem; 30 | } 31 | 32 | // On wide pages, allow wider column. 33 | body.wide { 34 | display: flex; 35 | flex-direction: column; 36 | align-items: center; 37 | justify-content: center; 38 | main { 39 | margin: 1rem; 40 | max-width: $max-width-wide; 41 | } 42 | } 43 | 44 | 45 | // Fonts. 46 | body { 47 | font-family: 'Source Sans Pro', sans-serif; 48 | line-height: 140%; 49 | } 50 | 51 | a { 52 | color: $main-color; 53 | } 54 | 55 | h1 { 56 | line-height: 110%; 57 | margin-top: 0; 58 | } 59 | 60 | h1, h2 { 61 | a { 62 | color: inherit; 63 | text-decoration: none; 64 | } 65 | } 66 | 67 | 68 | // Header links. 
69 | header { 70 | background: $main-color; 71 | color: white; 72 | a { 73 | color: inherit; 74 | text-decoration: none; 75 | } 76 | h1 { 77 | font-size: inherit; 78 | font-weight: bold; 79 | margin-right: 1rem; 80 | } 81 | nav { 82 | h1 { 83 | display: inline-block; 84 | } 85 | p { 86 | display: inline-block; 87 | } 88 | h1, p { 89 | margin: 0 1rem 0 0; 90 | } 91 | } 92 | } 93 | 94 | 95 | // Footer links. 96 | footer { 97 | text-align: center; 98 | margin: 1rem; 99 | color: $disabled-color; 100 | font-size: 80%; 101 | a { 102 | color: inherit; 103 | text-decoration: none; 104 | } 105 | } 106 | 107 | 108 | // Schedule table. 109 | table { 110 | border-collapse: collapse; 111 | th { 112 | text-align: left; 113 | font-weight: bold; 114 | } 115 | thead th { 116 | padding: 0 0.5rem; 117 | } 118 | td, th { 119 | padding: 0.5rem 0.5rem; 120 | } 121 | } 122 | 123 | 124 | // Course schedule. 125 | .schedule { 126 | .canceled { 127 | background: $exception-color; 128 | .month { 129 | color: inherit; 130 | } 131 | } 132 | .event { 133 | font-weight: bold; 134 | } 135 | .mon > td, .mon > th { 136 | // Spacing between weeks. 137 | border-top: 1.5rem solid white; 138 | } 139 | ul { 140 | margin: 0; 141 | padding-left: 1em; 142 | list-style: square; 143 | } 144 | .short { 145 | white-space: nowrap; 146 | } 147 | .num { 148 | text-align: right; 149 | } 150 | .month { 151 | color: $exception-color; 152 | &.first { 153 | color: inherit; 154 | } 155 | } 156 | } 157 | 158 | 159 | // Logo. 160 | img.logo { 161 | width: 40%; 162 | height: auto; 163 | float: right; 164 | 165 | margin-left: 0.5rem; 166 | margin-bottom: 0.5rem; 167 | } 168 | 169 | article { 170 | h1 a { 171 | text-decoration: none; 172 | color: $main-color; 173 | } 174 | h1 { 175 | margin-bottom: 0; 176 | } 177 | footer { 178 | font-style: italic; 179 | margin: 1em 0; 180 | text-align: inherit; 181 | color: $disabled-color; 182 | font-size: inherit; 183 | line-height: 130%; 184 | border-top: 1px solid $disabled-color; 185 | a { 186 | color: inherit; 187 | text-decoration: underline; 188 | } 189 | } 190 | 191 | &.small { 192 | display: flex; 193 | align-items: baseline; 194 | 195 | margin-bottom: 0.4rem; 196 | 197 | time { 198 | width: 4rem; 199 | flex: none; 200 | text-align: right; 201 | margin-right: 0.5rem; 202 | } 203 | h1 { 204 | font-size: inherit; 205 | line-height: 130%; 206 | &:hover { 207 | text-decoration: underline; 208 | } 209 | } 210 | } 211 | 212 | code { 213 | background-color: #eff0f1; 214 | padding: 2px 4px; 215 | border-radius: 2px; 216 | } 217 | 218 | p > img { 219 | width: 100% 220 | } 221 | } 222 | 223 | pre { 224 | line-height: 120%; 225 | overflow-x: auto; 226 | code { 227 | background-color: inherit; 228 | padding: 0; 229 | border-radius: 0; 230 | } 231 | } 232 | 233 | .details { 234 | margin-top: 0; 235 | time { 236 | margin-left: 1em; 237 | } 238 | } 239 | 240 | img.rss { 241 | width: 0.8em; 242 | height: 0.8em; 243 | } 244 | 245 | a.icon:before { 246 | content: ""; 247 | padding-right: 1em; 248 | margin-right: 0.3em; 249 | background-repeat: no-repeat; 250 | background-size: 0.9em 0.9em; 251 | background-position-y: 2px; 252 | filter: invert(17%) sepia(66%) saturate(4981%) hue-rotate(355deg) 253 | brightness(77%) contrast(85%); 254 | } 255 | 256 | a.discussion:before { 257 | background-image: url(img/discussion.svg); 258 | } 259 | 260 | a.video:before { 261 | background-image: url(img/video.svg); 262 | } 263 | 264 | a.reading:before { 265 | background-image: url(img/book.svg); 266 | } 267 | 268 | a.due:before { 269 | 
background-image: url(img/calendar.svg); 270 | } 271 | 272 | ul.links { 273 | list-style: none; 274 | padding-left: 1.5em; 275 | li { 276 | margin: 1ex 0; 277 | } 278 | a.icon:before { 279 | margin-left: -1.3em; 280 | } 281 | } 282 | 283 | ul.compact { 284 | margin: 0 0 1rem; 285 | list-style: none; 286 | padding-left: 0; 287 | li { 288 | display: inline-block; 289 | margin-right: 1em; 290 | } 291 | } 292 | 293 | ul.inline { 294 | margin: 1.5rem 0; 295 | list-style: none; 296 | padding-left: 0; 297 | li { 298 | margin: 0.5rem 0; 299 | } 300 | } 301 | 302 | ul.simple { 303 | list-style: none; 304 | padding-left: 0; 305 | } 306 | 307 | // Funky responsive video embedding. 308 | .videos { 309 | .video { 310 | position: relative; 311 | margin: 1rem auto; 312 | max-width: 100%; 313 | height: 0; 314 | padding-bottom: 56.25%; 315 | overflow: hidden; 316 | iframe { 317 | position: absolute; 318 | width: 100%; 319 | height: 100%; 320 | border: 0; 321 | } 322 | } 323 | } 324 | 325 | .ornament { 326 | max-width: 15rem; 327 | margin: 0 auto; 328 | text-align: center; 329 | } 330 | 331 | .footnote-definition { 332 | color: $disabled-color; 333 | .footnote-definition-label { 334 | float: left; 335 | padding-right: 1em; 336 | font-size: 0.8em; 337 | &:before { content: '['; } 338 | &:after { content: ']'; } 339 | } 340 | } 341 | -------------------------------------------------------------------------------- /static/img/book.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /static/img/calendar.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /static/img/construction.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/static/img/construction.gif -------------------------------------------------------------------------------- /static/img/discussion.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/static/img/favicon.ico -------------------------------------------------------------------------------- /static/img/favicon152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sampsyo/cs6120/8d343c04f6a6bd3ff783de0abda6b282c96791ae/static/img/favicon152.png -------------------------------------------------------------------------------- /static/img/rss.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /static/img/video.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | {{ config.title }}{% if page.title %}: {{ page.title }}{% endif %} 9 | 10 | 11 | 12 | 13 | 14 | {% block header %}{% endblock header %} 
15 | 16 | 17 |
18 | 42 |
43 |
44 | {% block main %} 45 | Hello, world! 46 | {% endblock main %} 47 |
48 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /templates/blog.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block main %} 3 |

4 | {{ section.title }} 5 | 6 | 7 | 8 |

9 | {% for post in section.pages %} 10 |
11 |

{{ post.title }}

12 |

13 | {% if post.extra.authors %} 14 | by 15 | {% for author in post.extra.authors %} 16 | {{ author.name }}{% if not loop.last %},{% endif %} 17 | {% endfor %} 18 | 19 | {% else %} 20 | by {{ post.extra.author }} 21 | {% endif %} 22 | 25 |

26 |
27 | {% endfor %} 28 | {% endblock main %} 29 | -------------------------------------------------------------------------------- /templates/home.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block main %} 3 | {{ section.content | safe }} 4 | 5 | {% set blog = get_section(path="blog/_index.md") %} 6 |

7 | Latest Blog Posts 8 | 9 | 10 | 11 |

12 | 13 | {% for post in blog.pages | slice(end=7) %} 14 |
15 | 18 |

{{ post.title }}

19 |
20 | {% endfor %} 21 | 22 |

23 | See all posts on the course blog. 24 |

25 | 26 |

Acknowledgment

27 |

28 | Many thanks to Zulip for sponsoring a free hosting plan for us. Zulip is a wonderful, open-source communication tool that works great for discussion-focused classes like ours. 29 |

30 | {% endblock main %} 31 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block main %} 3 | {{ section.content | safe }} 4 | {% endblock main %} 5 | -------------------------------------------------------------------------------- /templates/lesson.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block main %} 3 |

4 | Lesson {{ page.slug }}: 5 | {{ page.title }} 6 | {% if page.draft %}(Draft){% endif %} 7 |

8 | 33 | {% if page.extra.videos %} 34 |
35 | {% for video in page.extra.videos %} 36 | {% if video.id %} 37 |
38 | 39 |
40 | {% elif video.box_id %} 41 |
42 | 43 |
44 | {% elif video.pt_id %} 45 |
46 | 47 |
48 | {% endif %} 49 | {% endfor %} 50 |
51 | {% endif %} 52 | {{ page.content | safe }} 53 | {% endblock main %} 54 | -------------------------------------------------------------------------------- /templates/lessons.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block main %} 3 | {{ section.content | safe }} 4 | {% for page in section.pages | sort(attribute="extra.index") %} 5 | {% if not page.draft %} 6 | 27 | {% endif %} 28 | {% endfor %} 29 | {% endblock main %} 30 | -------------------------------------------------------------------------------- /templates/macros.html: -------------------------------------------------------------------------------- 1 | {% macro lesson_link(id, title=false) %} 2 | 3 | {% set lesson = false %} 4 | {% set lessons = get_section(path="lesson/_index.md") %} 5 | {% for cand in lessons.pages %} 6 | {% if cand.slug == id | as_str %} 7 | {% set_global lesson = cand %} 8 | {% break %} 9 | {% endif %} 10 | {% endfor %} 11 | 12 | {% if lesson %} 13 | {% if not lesson.draft %}{% endif %}Lesson {{id}}{% if not lesson.draft %}{% endif %}{% if title %}: {{ lesson.title }}{% endif %} 14 | {% else %} 15 | Lesson {{id}} 16 | {% endif %} 17 | {% endmacro lesson_link %} 18 | -------------------------------------------------------------------------------- /templates/page.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block main %} 3 | {{ page.content | safe }} 4 | {% endblock main %} 5 | -------------------------------------------------------------------------------- /templates/post.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block header %} 3 | 4 | 5 | 6 | 8 | {% if page.extra.latex %} 9 | 10 | 11 | 12 | 13 | {% endif %} 14 | {% endblock header %} 15 | {% block main %} 16 | {% set secpath = page.ancestors | last %} 17 | {% set section = get_section(path=secpath, metadata_only=true) %} 18 |

19 | 20 | {{ section.title }} 21 | 22 |

23 |
24 |

{{ page.title }}

25 |

26 | {% if page.extra.authors %} 27 | by 28 | {% for author in page.extra.authors %} 29 | {% if author.link %}{% endif %}{{ author.name }}{% if author.link %}{% endif %}{% if not loop.last %},{% endif %} 30 | {% endfor %} 31 | 32 | {% else %} 33 | by {% if page.extra.author_link %}{% endif %}{{ page.extra.author }}{% if page.extra.author_link %}{% endif %} 34 | {% endif %} 35 | 38 |

39 | {{ page.content | safe }} 40 |
41 | {% if page.extra.bio %} 42 | {{ page.extra.bio | markdown | safe }} 43 | {% endif %} 44 | {{ load_data(path="data/blog_footer.md") | markdown | safe }} 45 |
46 |
47 | {% endblock main %} 48 | -------------------------------------------------------------------------------- /templates/rss.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {{ config.title }} 5 | {{ config.base_url | urlencode | safe }} 6 | {{ config.description }} 7 | Zola 8 | {{ lang }} 9 | 10 | {% if config.extra.favicon %}{{ get_url(path=config.extra.favicon) | safe | urlencode | safe }}{% endif %} 11 | {{ last_updated | date(format="%a, %d %b %Y %H:%M:%S %z") }} 12 | {% for page in pages %} 13 | 14 | {{ page.title }} 15 | {{ page.date | date(format="%a, %d %b %Y %H:%M:%S %z") }} 16 | {{ page.permalink | urlencode | safe }} 17 | {{ page.permalink | urlencode | safe }} 18 | {% if page.summary %}{{ page.summary }}{% else %}{{ page.content }}{% endif %} 19 | 20 | {% endfor %} 21 | 22 | 23 | -------------------------------------------------------------------------------- /templates/schedule.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% import "macros.html" as macros %} 3 | {% block main %} 4 | {% set schedule = load_data(path="data/schedule.toml") %} 5 | {% set content = load_data(path="data/content.toml") %} 6 | {% set readings = load_data(path="data/reading.toml") %} 7 | {% set lessons = get_section(path="lesson/_index.md") %} 8 | 9 |

{{ page.title }}

10 | 11 | {% set count = schedule.days | length %} 12 | {% set class_count = content.classes | length %} 13 | 14 | 15 | {% set_global class_idx = 0 %} 16 | {% for idx in range(end=count) %} 17 | {% set day = schedule.days[idx] %} 18 | 19 | 27 | 60 | 74 | 75 | {% endfor %} 76 | 77 |
20 | 21 | {{ day.date | split(pat=" ") | nth(n=0) }} 22 | 23 | 24 | {{ day.date | split(pat=" ") | nth(n=1) }} 25 | 26 | 28 | {% if day.event %} 29 | {{ day.event | markdown(inline=true) | safe }} 30 | {% else %} 31 | {% if class_idx < class_count %} 32 | {% set collapse = day.collapse | default(value=1) %} 33 | {% for collapse_idx in range(end=collapse) %} 34 | {% set class = content.classes[class_idx] %} 35 | {% set_global class_idx = class_idx + 1 %} 36 |
37 | {% if class.lesson %} 38 | {{ macros::lesson_link(id=class.lesson, title=true) }} 39 | {% endif %} 40 | {{ class.title | default(value='') }} 41 | {% if class.leader %}({{class.leader}}){% endif %} 42 | {% if class.readings %} 43 |
    44 | {% for rdkey in class.readings %} 45 | {% set reading = readings[rdkey] %} 46 |
  • 47 | {{reading.name}} 48 | {% if reading.details %} 49 |
    {{reading.details}} 50 | {% endif %} 51 |
  • 52 | {% endfor %} 53 |
54 | {% endif %} 55 |
56 | {% endfor %} 57 | {% endif %} 58 | {% endif %} 59 |
61 |
    62 | {% for note in day.notes | default(value=[]) %} 63 |
  • {{ note | markdown(inline=true) | safe }}
  • 64 | {% endfor %} 65 | {% for lesson in lessons.pages %} 66 | {% if lesson.extra.due | default(value='') == day.date %} 67 |
  • Due: 68 | {% if not lesson.draft %}{% endif %}Lesson {{lesson.slug}}{% if not lesson.draft %}{% endif %} 69 | tasks.
  • 70 | {% endif %} 71 | {% endfor %} 72 |
73 |
78 | 79 | {% endblock main %} 80 | -------------------------------------------------------------------------------- /templates/section.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block main %} 3 | {{ section.content | safe }} 4 | {% endblock main %} 5 | -------------------------------------------------------------------------------- /templates/sg.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block main %} 3 | {% set lessons = get_section(path="lesson/_index.md") %} 4 | {% set readings = load_data(path="data/reading.toml") %} 5 | 6 | {{ page.content | safe }} 7 | {% for lesson in lessons.pages | sort(attribute="extra.index") %} 8 | {% if not lesson.draft %} 9 | 22 | {% if page.extra.readings[lesson.slug] %} 23 |
    24 | {% for rdkey in page.extra.readings[lesson.slug] %} 25 | {% set reading = readings[rdkey] %} 26 |
  • 27 | {{reading.name}} 28 | {% if reading.details %} 29 |
    {{reading.details}} 30 | {% endif %} 31 |
  • 32 | {% endfor %} 33 |
34 | {% endif %} 35 | {% endif %} 36 | {% endfor %} 37 | 38 | {% endblock main %} 39 | --------------------------------------------------------------------------------