├── .gitignore
├── .python-version
├── README.md
├── hulk
    └── __init__.py
├── pages
    ├── .gitignore
    ├── .python-version
    ├── _quarto.yml
    ├── building
    │   └── index.qmd
    ├── cover.png
    ├── extra
    │   └── index.qmd
    ├── hulk
    │   ├── conditionals.md
    │   ├── expressions.md
    │   ├── functions.md
    │   ├── functors.md
    │   ├── index.qmd
    │   ├── inference.md
    │   ├── iterables.md
    │   ├── loops.md
    │   ├── macros.md
    │   ├── protocols.md
    │   ├── types.md
    │   ├── typing.md
    │   ├── variables.md
    │   └── vectors.md
    ├── index.qmd
    ├── instructors.qmd
    ├── intro.qmd
    ├── principles
    │   ├── grammars.qmd
    │   ├── index.qmd
    │   └── intro.qmd
    └── references.bib
├── pyproject.toml
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Created by https://www.toptal.com/developers/gitignore/api/python
  2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python
  3 | 
  4 | ### Python ###
  5 | # Byte-compiled / optimized / DLL files
  6 | __pycache__/
  7 | *.py[cod]
  8 | *$py.class
  9 | 
 10 | # C extensions
 11 | *.so
 12 | 
 13 | # Distribution / packaging
 14 | .Python
 15 | build/
 16 | develop-eggs/
 17 | dist/
 18 | downloads/
 19 | eggs/
 20 | .eggs/
 21 | lib/
 22 | lib64/
 23 | parts/
 24 | sdist/
 25 | var/
 26 | wheels/
 27 | share/python-wheels/
 28 | *.egg-info/
 29 | .installed.cfg
 30 | *.egg
 31 | MANIFEST
 32 | 
 33 | # PyInstaller
 34 | #  Usually these files are written by a python script from a template
 35 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 36 | *.manifest
 37 | *.spec
 38 | 
 39 | # Installer logs
 40 | pip-log.txt
 41 | pip-delete-this-directory.txt
 42 | 
 43 | # Unit test / coverage reports
 44 | htmlcov/
 45 | .tox/
 46 | .nox/
 47 | .coverage
 48 | .coverage.*
 49 | .cache
 50 | nosetests.xml
 51 | coverage.xml
 52 | *.cover
 53 | *.py,cover
 54 | .hypothesis/
 55 | .pytest_cache/
 56 | cover/
 57 | 
 58 | # Translations
 59 | *.mo
 60 | *.pot
 61 | 
 62 | # Django stuff:
 63 | *.log
 64 | local_settings.py
 65 | db.sqlite3
 66 | db.sqlite3-journal
 67 | 
 68 | # Flask stuff:
 69 | instance/
 70 | .webassets-cache
 71 | 
 72 | # Scrapy stuff:
 73 | .scrapy
 74 | 
 75 | # Sphinx documentation
 76 | docs/_build/
 77 | 
 78 | # PyBuilder
 79 | .pybuilder/
 80 | target/
 81 | 
 82 | # Jupyter Notebook
 83 | .ipynb_checkpoints
 84 | 
 85 | # IPython
 86 | profile_default/
 87 | ipython_config.py
 88 | 
 89 | # pyenv
 90 | #   For a library or package, you might want to ignore these files since the code is
 91 | #   intended to run in multiple environments; otherwise, check them in:
 92 | # .python-version
 93 | 
 94 | # pipenv
 95 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 96 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 97 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 98 | #   install all needed dependencies.
 99 | #Pipfile.lock
100 | 
101 | # poetry
102 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
104 | #   commonly ignored for libraries.
105 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106 | #poetry.lock
107 | 
108 | # pdm
109 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110 | #pdm.lock
111 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112 | #   in version control.
113 | #   https://pdm.fming.dev/#use-with-ide
114 | .pdm.toml
115 | 
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 | 
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 | 
123 | # SageMath parsed files
124 | *.sage.py
125 | 
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 | 
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 | 
139 | # Rope project settings
140 | .ropeproject
141 | 
142 | # mkdocs documentation
143 | /site
144 | 
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 | 
150 | # Pyre type checker
151 | .pyre/
152 | 
153 | # pytype static type analyzer
154 | .pytype/
155 | 
156 | # Cython debug symbols
157 | cython_debug/
158 | 
159 | # PyCharm
160 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
163 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 | 
166 | # End of https://www.toptal.com/developers/gitignore/api/python
167 | 
168 | .quarto/
169 | *.pdf
170 | *.epub
171 | 


--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.13
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # HULK -- Havana University Language for Kompilers
 2 | 
 3 | > Reference definition for the HULK language.
 4 | 
 5 | This repository contains the reference definition for the HULK programming language.
 6 | 
 7 | HULK is a didactic, type-safe, object-oriented and incremental language designed for teaching compilers at college level.
 8 | You can read all about the language [here](https://matcom.github.io/hulk).
 9 | 
10 | ## License
11 | 
12 | <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">Havana University Language for Kompilers</span> by <a xmlns:cc="http://creativecommons.org/ns#" href="https://matcom.github.io/hulk" property="cc:attributionName" rel="cc:attributionURL">University of Havana</a> is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.
13 | 


--------------------------------------------------------------------------------
/hulk/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matcom/hulk/300fae34dd8d10dd74e2b3756bfcb44a7ed4cf3c/hulk/__init__.py


--------------------------------------------------------------------------------
/pages/.gitignore:
--------------------------------------------------------------------------------
1 | /.quarto/
2 | 


--------------------------------------------------------------------------------
/pages/.python-version:
--------------------------------------------------------------------------------
1 | 3.11
2 | 


--------------------------------------------------------------------------------
/pages/_quarto.yml:
--------------------------------------------------------------------------------
 1 | project:
 2 |   type: book
 3 |   output-dir: "../docs"
 4 | 
 5 | book:
 6 |   title: "Compiler Construction with HULK"
 7 |   author: "Alejandro Piad Morffis"
 8 |   date: "7/12/2024"
 9 |   cover-image: cover.png
10 |   chapters:
11 |     - index.qmd
12 |     - intro.qmd
13 |     - part: principles/index.qmd
14 |       chapters:
15 |         - "principles/intro.qmd"
16 |         - "principles/grammars.qmd"
17 |     - part: building/index.qmd
18 |       chapters: []
19 |     - part: extra/index.qmd
20 |       chapters: []
21 |     - part: hulk/index.qmd
22 |       chapters:
23 |         - hulk/expressions.md
24 |         - hulk/functions.md
25 |         - hulk/variables.md
26 |         - hulk/conditionals.md
27 |         - hulk/loops.md
28 |         - hulk/types.md
29 |         - hulk/typing.md
30 |         - hulk/inference.md
31 |         - hulk/protocols.md
32 |         - hulk/iterables.md
33 |         - hulk/vectors.md
34 |         - hulk/functors.md
35 |         - hulk/macros.md
36 |   appendices:
37 |     - instructors.qmd
38 | 
39 | bibliography: references.bib
40 | 
41 | format:
42 |   html:
43 |     theme: cosmo
44 |   pdf:
45 |     documentclass: scrreprt
46 |   epub:
47 |     epub-cover-image: cover.png
48 | 


--------------------------------------------------------------------------------
/pages/building/index.qmd:
--------------------------------------------------------------------------------
1 | # Building a Compiler
2 | 
3 | This is the main part of the book. In the following chapters, we will build a HULK compiler from the ground up, in an incremental way. Each chapter is a practical guide to implementing some new feature of the HULK language or the compiler, or some optimization of previous features.
4 | 
5 | These chapters are based on Jupyter notebooks distributed with the source code of the book, available in the [public Github repository](https://github.com/matcom/hulk). Additionally, the notebooks this part is based on are used to generate a fully working reference implementation of the HULK compiler, which can be installed with `pip install hulk-compiler`.
6 | 


--------------------------------------------------------------------------------
/pages/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matcom/hulk/300fae34dd8d10dd74e2b3756bfcb44a7ed4cf3c/pages/cover.png


--------------------------------------------------------------------------------
/pages/extra/index.qmd:
--------------------------------------------------------------------------------
1 | # Extensions and Advanced Techniques


--------------------------------------------------------------------------------
/pages/hulk/conditionals.md:
--------------------------------------------------------------------------------
 1 | # Conditionals
 2 | 
 3 | The `if` expression allows evaluating different expressions based on a condition.
 4 | 
 5 | ```js
 6 | let a = 42 in if (a % 2 == 0) print("Even") else print("odd");
 7 | ```
 8 | 
 9 | Since `if` is itself an expression, returning the value of the branch that evaluated true, the previous program can be rewritten as follows:
10 | 
11 | ```js
12 | let a = 42 in print(if (a % 2 == 0) "even" else "odd");
13 | ```
14 | 
15 | Conditions are just expressions of boolean type. The following are the valid boolean expressions:
16 | 
17 | - Boolean literals: `true` and `false`.
18 | - Arithmetic comparison operators: `<`, `>`, `<=`, `>=`, `==`, `!=`, with their usual semantics.
19 | - Boolean operators: `&` (and), `|` (or), and `!` (not) with their usual semantics.
20 | 
21 | ## Expression blocks in conditionals
22 | 
23 | The body of the `if` or the `else` part of a conditional (or both) can be an expression block as well:
24 | 
25 | ```js
26 | let a = 42 in
27 |     if (a % 2 == 0) {
28 |         print(a);
29 |         print("Even");
30 |     }
31 |     else print("Odd");
32 | ```
33 | 
34 | ## Multiple branches
35 | 
36 | The `if` expression supports multiple branches with the `elif` construction, which introduces another conditioned branch:
37 | 
38 | ```js
39 | let a = 42, let mod = a % 3 in
40 |     print(
41 |         if (mod == 0) "Magic"
42 |         elif (mod % 3 == 1) "Woke"
43 |         else "Dumb"
44 |     );
45 | ```
46 | 


--------------------------------------------------------------------------------
/pages/hulk/expressions.md:
--------------------------------------------------------------------------------
 1 | # Expressions
 2 | 
 3 | HULK is ultimately an expression-based language. Most of the syntactic constructions in HULK are expressions, including the body of all functions, loops, and any other block of code.
 4 | 
 5 | The body of a program in HULK always ends with a single global expression (and, if necessary, a final semicolon) that serves as the entrypoint of the program. This means that, of course, a program in HULK can consist of just one global expression.
 6 | 
 7 | For example, the following is a valid program in HULK:
 8 | 
 9 | ```js
10 | 42;
11 | ```
12 | 
13 | Obviously, this program has no side effects. A slightly more complicated program, probably the first one that does something, is this:
14 | 
15 | ```js
16 | print(42);
17 | ```
18 | 
19 | In this program, `print` refers to a builtin function that prints the result of any expression in the output stream. We will talk about functions in a later section.
20 | 
21 | The rest of this section explains the basic expressions in HULK.
22 | 
23 | ## Arithmetic expressions
24 | 
25 | HULK defines three types of literal values: **numbers**, **strings**, and **booleans**.
26 | We will leave strings and booleans for later.
27 | 
 28 | Numbers are 32-bit floating-point and support all basic arithmetic operations with the usual semantics: `+` (addition), `-` (subtraction), `*` (multiplication), `/` (floating-point division), `^` (power), and parenthesized sub-expressions.
29 | 
30 | The following is a valid HULK program that computes and prints the result of a rather useless arithmetic expression:
31 | 
32 | ```js
33 | print((((1 + 2) ^ 3) * 4) / 5);
34 | ```
35 | 
36 | All usual syntactic and precedence rules apply.
37 | 
38 | ## Strings
39 | 
40 | String literals in HULK are defined within enclosed double-quotes (`"`), such as in:
41 | 
42 | ```js
43 | print("Hello World");
44 | ```
45 | 
46 | A double-quote can be included literally by escaping it:
47 | 
48 | ```js
49 | print("The message is \"Hello World\"");
50 | ```
51 | 
52 | Other escaped characters are `\n` for line endings, and `\t` for tabs.
53 | 
54 | Strings can be concatenated with other strings (or the string representation of numbers) using the `@` operator:
55 | 
56 | ```js
57 | print("The meaning of life is " @ 42);
58 | ```
59 | 
60 | ## Builtin math functions and constants
61 | 
62 | Besides `print`, HULK also provides some common mathematical operations encapsulated as builtin functions with their usual semantics. The list of builtin math functions is the following:
63 | 
 64 | - `sqrt(<value>)` computes the square root of a value.
65 | - `sin(<angle>)` computes the sine of an angle in radians.
66 | - `cos(<angle>)` computes the cosine of an angle in radians.
67 | - `exp(<value>)` computes the value of `e` raised to a value.
68 | - `log(<base>, <value>)` computes the logarithm of a value in a given base.
69 | - `rand()` returns a random uniform number between 0 and 1 (both inclusive).
70 | 
71 | Besides these functions, HULK also ships with two global constants: `PI` and `E` which represent the floating-point value of these mathematical constants.
72 | 
73 | As expected, functions can be nested in HULK (provided the use of types is consistent, but so far all we care about is functions from numbers to numbers, so we can forget about types until later on). Hence, the following is a valid HULK program.
74 | 
75 | ```js
76 | print(sin(2 * PI) ^ 2 + cos(3 * PI / log(4, 64)));
77 | ```
78 | 
 79 | More formally, function invocation is also an expression in HULK, so everywhere you expect an expression you can also put a call to a builtin function, and you can freely mix arithmetic expressions and mathematical functions, as you would expect in any programming language.
80 | 
81 | ## Expression blocks
82 | 
83 | Anywhere an expression is allowed (or almost), you can also use an expression block, which is nothing but a series of expressions between curly braces (`{` and `}`), and separated by `;`.
84 | 
85 | The most trivial usage of expression blocks is to allow multiple `print` statements as the body of a program. For example, the following is a valid HULK program:
86 | 
87 | ```js
88 | {
89 |     print(42);
90 |     print(sin(PI/2));
91 |     print("Hello World");
92 | }
93 | ```
94 | 
95 | When you use an expression block instead of a single expression, it is often not necessary to end with a semicolon (`;`), but it is not erroneous to do so either.
96 | 


--------------------------------------------------------------------------------
/pages/hulk/functions.md:
--------------------------------------------------------------------------------
 1 | # Functions
 2 | 
 3 | HULK also lets you define your own functions (of course!). A program in HULK can have an arbitrary number of functions defined before the final global expression (or expression block).
 4 | 
 5 | A function's body is always an expression (or expression block), hence all functions have a return value (and type), that is, the return value (and type) of its body.
 6 | 
 7 | ## Inline functions
 8 | 
 9 |  The easiest way to define a function is the inline form. Here's an example:
10 | 
11 | ```js
12 | function tan(x) => sin(x) / cos(x);
13 | ```
14 | 
 15 | An inline function is defined by an identifier followed by arguments between parentheses, then the `=>` symbol, and then a simple expression (not an expression block) as body, ending in `;`.
16 | 
17 | In HULK, all functions must be defined before the final global expression. All these functions live in a single global namespace, hence it is not allowed to repeat function names. Similarly, there are no overloads in HULK (at least in "basic" HULK).
18 | 
19 | Finally, the body of any function can use other functions, regardless of whether they are defined before or after the corresponding function. Thus, the following is a valid HULK program:
20 | 
21 | ```js
22 | function cot(x) => 1 / tan(x);
23 | function tan(x) => sin(x) / cos(x);
24 | 
 25 | print(tan(PI) ^ 2 + cot(PI) ^ 2);
26 | ```
27 | 
28 | And of course, inline functions (and any other type of function) can call themselves recursively.
29 | 
30 | ## Full-form functions
31 | 
32 | Since inline functions only allow for a single expression as body (as complex as that may be), HULK also allows full-form functions, in which the body is an expression block.
33 | 
34 | Here's an example of a rather useless function that prints 4 times:
35 | 
36 | ```js
37 | function operate(x, y) {
38 |     print(x + y);
39 |     print(x - y);
40 |     print(x * y);
41 |     print(x / y);
42 | }
43 | ```
44 | 
45 | Note that the following form is discouraged for stylistic reasons:
46 | 
47 | ```js
48 | function id(<args>) => {
49 |     <...>
50 | }
51 | ```
52 | 
53 | That is, you should either use the inline form with `=>` and a simple expression, or the full form with `{}` and an expression block.
54 | 


--------------------------------------------------------------------------------
/pages/hulk/functors.md:
--------------------------------------------------------------------------------
  1 | # Functors
  2 | 
  3 | A functor in HULK is an object that encapsulates a function, which means it supports the `obj()` syntax. This can be accomplished with protocols easily, via transpilation. If you have a type that implements a functor protocol, then HULK will allow you to use the functor syntax. A functor protocol is any protocol that has an `invoke` method with appropriate type annotations.
  4 | 
  5 | For example, suppose you declare the following protocol in HULK:
  6 | 
  7 | ```go
  8 | protocol NumberFilter {
  9 |     invoke(x: Number): Boolean;
 10 | }
 11 | ```
 12 | 
 13 | Then, you can annotate a function to receive an object that implements this protocol:
 14 | 
 15 | ```go
 16 | function count_when(numbers: Number*, filter: NumberFilter) {
 17 |     let total = 0 in
 18 |         for (x in numbers)
 19 |             total := total + if (filter.invoke(x)) 1 else 0;
 20 | }
 21 | ```
 22 | 
 23 | But, since that protocol is a functor (it contains an `invoke` method), you can also use it directly as if it were a method, with the following syntax:
 24 | 
 25 | ```go
 26 | function count_when(numbers: Number*, filter: NumberFilter) {
 27 |     let total = 0 in
 28 |         for (x in numbers)
 29 |             total := total + if (filter(x)) 1 else 0;
 30 | }
 31 | ```
 32 | 
 33 | To implement a functor protocol, you simply define a type that implements the protocol, as usual, and then you can use it:
 34 | 
 35 | ```go
 36 | type IsOdd {
 37 |     invoke(x: Number): Boolean => x % 2 == 0;
 38 | }
 39 | 
 40 | let numbers = range(0, 100) in
 41 |     print(count_when(numbers, IsOdd())); // prints `50`
 42 | ```
 43 | 
 44 | But this syntax is extremely cumbersome, so HULK provides lots of syntax sugar to simplify the declaration and usage of functors.
 45 | 
 46 | ## Implicit functor implementation
 47 | 
 48 | The first aid that HULK provides is implicitly wrapping functions as functor types upon usage. For example, instead of defining the `IsOdd` type like before, you can simply define an `is_odd` function like the following, and pass it directly to the `count_when` function:
 49 | 
 50 | ```go
 51 | function is_odd(x: Number) => x % 2 == 0;
 52 | 
 53 | let numbers = range(0, 100) in
 54 |     print(count_when(numbers, is_odd));
 55 | ```
 56 | 
 57 | And then HULK will automatically create an appropriate functor type that implements the desired protocol, which means the previous code is transpiled to something like the following:
 58 | 
 59 | ```go
 60 | function is_odd(x: Number) => x % 2 == 0;
 61 | 
 62 | type _IsOddWrapper {
 63 |     invoke(x: Number): Boolean => is_odd(x);
 64 | }
 65 | 
 66 | let numbers = range(0, 100) in
 67 |     print(count_when(numbers, _IsOddWrapper()));
 68 | ```
 69 | 
 70 | Naturally, this syntax sugar extends to variable assignment as well, which means the following is valid:
 71 | 
 72 | ```go
 73 | let numbers = range(0, 100), filter: NumberFilter = is_odd in
 74 |     print(count_when(numbers, filter));
 75 | ```
 76 | 
 77 | ## Lambda expressions
 78 | 
 79 | Keeping up with the previous example, we can eliminate the explicit `is_odd` definition and pass a lambda expression, which is an anonymous function defined directly in the place where the functor is needed:
 80 | 
 81 | ```go
 82 | let numbers = range(0, 100) in
 83 |     print(count_when(numbers, (x: Number): Boolean => x % 2 == 0));
 84 | ```
 85 | 
 86 | The general syntax for lambda expressions is very similar to the syntax for inline functions, except that you don't need to name the function.
 87 | 
 88 | Also, if the type inferrer is good enough, you can almost always drop the explicit type annotations:
 89 | 
 90 | 
 91 | ```go
 92 | let numbers = range(0, 100) in
 93 |     print(count_when(numbers, (x) => x % 2 == 0));
 94 | ```
 95 | 
 96 | And of course, lambda expressions can be stored in appropriately typed variables:
 97 | 
 98 | ```go
 99 | let numbers = range(0, 100), filter: NumberFilter = (x) => x % 2 == 0 in
100 |     print(count_when(numbers, filter));
101 | ```
102 | 
103 | And if the type inferrer is good enough, since `count_when` requires a `NumberFilter`, you can drop the explicit type annotation:
104 | 
105 | ```go
106 | let numbers = range(0, 100), filter = (x) => x % 2 == 0 in
107 |     print(count_when(numbers, filter));
108 | ```
109 | 
110 | ## Typing functors
111 | 
112 | And finally, we can also skip the protocol definition and use a special syntax for typing functors directly in the type annotation:
113 | 
114 | ```go
115 | function count_when(numbers: Number*, filter: (Number) -> Boolean) {
116 |     // same code
117 | }
118 | ```
119 | 
120 | The syntax `(Number) -> Boolean` indicates that we expect a functor with a single input of type `Number` and an output of type `Boolean`. Upon finding this definition, HULK will transpile that into something that is very similar to our explicit protocol definition:
121 | 
122 | ```go
123 | protocol _Functor0 {
124 |     invoke(_arg0: Number) : Boolean;
125 | }
126 | 
127 | function count_when(numbers: Number*, filter: _Functor0) {
128 |     // same code
129 | }
130 | ```
131 | 


--------------------------------------------------------------------------------
/pages/hulk/index.qmd:
--------------------------------------------------------------------------------
 1 | # The HULK Programming Language
 2 | 
 3 | In this final part of the book, we present a straightforward and comprehensive definition of the HULK programming language, along with a set of possible extensions. This part formalizes the language we've been working on the entire book, and should serve as a reference for anyone wanting to build their own HULK compiler.
 4 | 
 5 | ## HULK in a nutshell
 6 | 
 7 | **HULK** (**H**avana **U**niversity **L**anguage for **K**ompilers) is a didactic, type-safe, object-oriented and incremental programming language, designed for the course Introduction to Compilers in the Computer Science major at University of Havana.
 8 | 
 9 | A simple "Hello World" in HULK looks like this:
10 | 
11 | ```js
12 | print("Hello World");
13 | ```
14 | 
15 | In a bird's eye view HULK is an object-oriented programming language, with simple inheritance, polymorphism, and encapsulation at the class level. Also, in HULK it is possible to define global functions outside the scope of all classes. It is also possible to define a _single global expression_ that constitutes the entry point to the program.
16 | 
17 | Most of the syntactic constructions in HULK are expressions, including conditional instructions and cycles. HULK is a statically typed language with optional type inference, which means that some (or all) parts of a program can be annotated with types, and the compiler will verify the consistency of all operations.
18 | 
19 | ## A didactic language
20 | 
21 | The HULK language has been designed as a mechanism for learning and evaluating a college course about compilers. For this reason, certain language design decisions respond more to didactic questions than to theoretical or pragmatic questions. An illustrative example is the inclusion of a single basic numerical type. In practice, programming languages have several numeric types (`int`, `float`, `double`, `decimal`) to cover the wide range of trade-off between efficiency and expressivity. However, from the didactic point of view, it is enough complexity to have to deal with a numerical type, and the inclusion of others does not bring anything new from our point of view.
22 | 
23 | Another important decision is the static typing with type inference, which will be explained later in detail. The motivation behind this feature is to allow students to first implement an evaluator for the language, and then worry about type verification. Likewise, the decision to have global expressions, global functions, and classes, responds to the need to introduce the various elements of language little by little. By having global expressions, it is possible to implement an expression interpreter without the need to solve context-sensitive problems. Later, students can implement functions and finally the object-oriented features. In this way students can learn on the fly as they add characteristics to the language, always having a valid subset of the language implemented.
24 | 
25 | ## An incremental language
26 | 
 27 | As its name indicates, HULK is a huge language. Actually, the HULK language is not really a single programming language, but a set of programming languages. That is, HULK is designed as a set of layers, each with a new language feature that adds increasingly more complex functionalities on top of the previous layers. It starts with a basic syntax for expressions, then global functions, and then a unified type system with simple inheritance. Afterwards, HULK grows to contain arrays, delegates, type inference, iterators, among other characteristics. All these language features have been designed to be compatible with each other. Furthermore, each language feature clearly describes on which other language features it depends.
28 | 
29 | This design has been conceived to allow the use of HULK at a wide range of learning levels. As a language of expressions and functions, it is useful for introductory courses on parsing and basic compilation techniques. Object orientation introduces a whole universe of semantic complexities; however, the HULK type system is simple enough to illustrate the most common problems in semantic type verification. Vectors introduce problems related to memory management, while anonymous functions and iterators are fundamentally problems of transpilation and code generation. The inference of types and the verification of null-safety is an exercise in logical inference, which can be used in advanced courses. The idea is that each course defines its objectives of interest, and can use an appropriate subset of HULK to illustrate and evaluate them.
30 | 
31 | ## BANNER: Intermediate Representation
32 | 
 33 | Even though HULK can be defined without specific compilation details, we also provide a didactic 3-address code for intermediate representation that is convenient to use with HULK. For obvious reasons, it's called BANNER -- **B**asic 3-**A**ddress li**N**ear i**N**t**E**rmediate **R**epresentation.
34 | 


--------------------------------------------------------------------------------
/pages/hulk/inference.md:
--------------------------------------------------------------------------------
  1 | # Type inference
  2 | 
  3 | Since every program in HULK is statically type-checked, and type annotations are optional in most cases, this means that HULK infers types for most of the symbols in a program.
  4 | 
  5 | Because the problem of type inference is computationally complex, and ultimately unsolvable in the general case, the HULK reference definition doesn't give precise semantics about how the type inferer must work. Rather, we will give only a set of minimal constraints that the type inferer must assert if a type is inferred at all for a given symbol, or otherwise it must fail to infer types.
  6 | 
  7 | ## Type inference vs type checking
  8 | 
  9 | The type inferer works before the type checker, and assigns type annotations to all symbols that are not explicitly annotated, and to all the expressions. Afterwards, the type checker verifies that all semantic rules are valid.
 10 | 
 11 | Thus, even if a program is fully annotated, the type inferer still needs to work, since it needs to infer the type of all expressions. When some symbols are not explicitly annotated, the type inferer must also assign types for them.
 12 | 
 13 | Hence, there are two different moments when a semantic error can be reported. First, if the type inferer cannot infer the type of some symbol, a semantic error will be thrown to indicate to the programmer that some symbol must be explicitly typed. Second, if the type inferer finished without errors, the type checker will verify that all types are consistent, and will report a semantic error if there is some incompatibility.
 14 | 
 15 | ## Type inference of expressions
 16 | 
 17 | The first task of the type inferer is to infer the runtime type of any expression that appears in a HULK program. This process is performed bottom-up, starting from atomic sub-expressions (e.g., literals) and working up the AST. The exact rules for type inference of expressions are given in the section about [type semantics](/guide/type_semantics), but an intuitive introduction can be given at this point.
 18 | 
 19 | Literals are the easiest to type-infer, because their type comes directly from the parser. Arithmetic expressions are also easy, because their type is always `Number`. Likewise, string and boolean operators are straightforward.
 20 | 
 21 | The type of complex expressions that have an expression body is determined by the type of the body. This is the case of `let`, `while`, and `for`. The type of an expression block is the type of the last expression of the block. The type of a function or method invocation is the type of its body. The type of expressions that have more than one branch (`if`) is the lowest common ancestor of the types of each branch, or ultimately `Object`.
 22 | 
 23 | ## Type inference of symbols
 24 | 
 25 | Once all expressions have been type-inferred, the type inferer will attempt to assign a type to each symbol declaration that is not explicitly annotated. Instead of providing an exact algorithm, we will define a set of constraints that the type inferer must satisfy whenever it succeeds in assigning a type.
 26 | 
 27 | Specific implementations of HULK can choose different methods to attempt the type inference of symbols. According to the order in which symbols are processed, and the sophistication of each method, some implementations may succeed where others fail. However, if two type inference algorithms are correct, they must agree on all types for which both succeed in the inference.
 28 | 
 29 | These are the constraints a type inference algorithm must satisfy to be correct, or otherwise it must report a failed inference.
 30 | 
 31 | - In a `let` expression, whenever a variable is not type-annotated, the type inferer must assign a type for the variable that is equivalent to the type inferred for its initialization expression.
 32 | - Similarly, in an attribute declaration that is not type-annotated, the type inferer must assign a type that is equivalent to the type inferred for its initialization expression.
 33 | - In a function or method, whenever an argument is not type-annotated, the type inferer must assign the lowest (most specific) type that would be consistent with the use of that argument in the method or function body. If more than one type in different branches of the type hierarchy would be consistent, the type inferer must fail.
 34 | - Similarly, in a type argument, the type inferer must assign the lowest type that is consistent with the use of that argument in all attribute initialization expressions where it is referenced.
 35 | 
 36 | If a type inferer satisfies those constraints, we will say it is *sound*. This means that, for example, the simplest sound strategy for type inference is to infer types for all expressions and fail for all symbols. We will call this the *basic inference* strategy.
 37 | 
 38 | ## Examples of ad-hoc type inference
 39 | 
 40 | These are some programs where a sufficiently sophisticated type inference strategy should work.
 41 | 
 42 | In the following program, the type of the variable `x` should be inferred as `Number` because the type of `42` is trivially `Number`:
 43 | 
 44 | ```js
 45 | let x = 42 in print(x);
 46 | ```
 47 | 
 48 | In the following function, the type of the argument `n` should be inferred as `Number` because it is the only possible type where arithmetic operators (i.e., `+`) are defined, as there is no operator overloading in HULK:
 49 | 
 50 | ```js
 51 | function fib(n) => if (n == 0 | n == 1) 1 else fib(n-1) + fib(n-2);
 52 | ```
 53 | 
 54 | If you implement operator overloading, then the inferred type should be the appropriate protocol.
 55 | 
 56 | For the same reason, in the following function, the type of the argument `x` should be inferred as `Number`. Likewise, the type of the variable `f` should be inferred as `Number` because the initialization expression is a literal `Number`.
 57 | 
 58 | ```js
 59 | function fact(x) => let f = 1 in for (i in range(1, x+1)) f := f * i;
 60 | ```
 61 | 
 62 | ## A general strategy for type inference
 63 | 
 64 | If you implement protocols (explained later), then a general strategy for type inference consists in synthesizing appropriate protocols for all non-annotated symbols, based on their use. Since protocols support structural type checking, this should allow the type checker to detect any inconsistencies in a later pass.
 65 | 
 66 | For example, consider the following code:
 67 | 
 68 | ```go
 69 | type A {
 70 |     f() => "Hello";
 71 |     g() => "World";
 72 | }
 73 | 
 74 | function h(x) => x.f() @@ x.g();
 75 | 
 76 | let x = new A() in print(h(x));
 77 | ```
 78 | 
 79 | In the previous code, the type inferrer can determine that, whatever type `x` has, it should support two methods, `f` and `g`. Furthermore, given the use of the `@@` operator, the return value of both methods should support the `@@` operation (in principle, only `String` does, but if you implement operator overloading, there is a specific protocol for that operator).
 80 | 
 81 | Thus, the type inferrer can synthesize the following protocol:
 82 | 
 83 | ```go
 84 | protocol _P1 {
 85 |     f(): String;
 86 |     g(): String;
 87 | }
 88 | ```
 89 | 
 90 | And it should annotate the code (actually, the AST) in a way that is equivalent to the following:
 91 | 
 92 | ```go
 93 | // type A and Protocol _P1
 94 | 
 95 | function h(x: _P1): String => x.f() @@ x.g();
 96 | 
 97 | let x: A = new A() in print(h(x));
 98 | ```
 99 | 
100 | From the point of view of the type checker, the previous code is semantically correct, since `A` conforms to the protocol `_P1`.
101 | 
102 | Note that the process of synthesizing protocols could require several iterations, since not all types in a synthesized protocol may be known at a first glance. For example:
103 | 
104 | ```go
105 | function f(x) => x.a();
106 | 
107 | function g(x) => x.b();
108 | 
109 | let x = new T() in print(g(f(x)));
110 | ```
111 | 
112 | Regardless of what `T` looks like, the type inferrer here must first define a protocol for `f` that has a method `a()`, and analogously for `g`. But crucially, at this point, it is not clear from either `f` or `g` what the return type of these methods is.
113 | 
114 | Thus, at this point, the best a type inferrer can do is claim `f` receives something like:
115 | 
116 | ```go
117 | protocol _P1 {
118 |     a(): Any;
119 | }
120 | 
121 | // ...
122 | 
123 | function f(x: _P1): Any => x.a();
124 | ```
125 | 
126 | And similarly for `g`:
127 | 
128 | 
129 | ```go
130 | protocol _P2 {
131 |     b(): Any;
132 | }
133 | 
134 | // ...
135 | 
136 | function g(x: _P2): Any => x.b();
137 | ```
138 | 
139 | Then, a series of passes on the AST starts to refine these protocols. For example, the call to `g` in the last line of the code above will force the type inferrer to refine `_P1` to:
140 | 
141 | ```go
142 | protocol _P1 {
143 |     a(): _P2;
144 | }
145 | ```
146 | 
147 | Which, in turn, makes `f` now return `_P2`. Likewise, the call to `print` makes the type inferrer refine `_P2` to:
148 | 
149 | ```go
150 | protocol _P2 {
151 |     b(): Object;
152 | }
153 | ```
154 | 
155 | Once no new information can be inferred, the type inferrer will stop and the program will be type checked. All types left as `Any` will be reported as errors.
156 | 
157 | > **NOTE:** Coding a robust type inferrer is much harder than the previous explanation might suggest. There are plenty of corner cases and heuristics. This section is just an initial suggestion to guide the implementation.
158 | 


--------------------------------------------------------------------------------
/pages/hulk/iterables.md:
--------------------------------------------------------------------------------
  1 | # Iterables
  2 | 
  3 | An iterable in HULK is any object that follows the iterable protocol, which is defined as follows:
  4 | 
  5 | ```js
  6 | protocol Iterable {
  7 |     next() : Boolean;
  8 |     current() : Object;
  9 | }
 10 | ```
 11 | 
 12 | An example of an iterable is the builtin `range` function, which returns an instance of the builtin `Range` type, defined as follows:
 13 | 
 14 | ```js
 15 | type Range(min:Number, max:Number) {
 16 |     min = min;
 17 |     max = max;
 18 |     current = min - 1;
 19 | 
 20 |     next(): Boolean => (self.current := self.current + 1) < max;
 21 |     current(): Number => self.current;
 22 | }
 23 | ```
 24 | 
 25 | Notice that since protocols are covariant in the return types of the methods, the `Range` type correctly implements the `Iterable` protocol.
 26 | 
 27 | ## Using iterables with the `for` loop
 28 | 
 29 | As explained in [the loops section](/guide/loops), the `for` loop works with the `Iterable` protocol, which means you can apply `for` on any instance of a type that implements the protocol.
 30 | 
 31 | At compile time, `for` is transpiled to equivalent code that explicitly uses the `Iterable` protocol members.
 32 | 
 33 | For example, the code:
 34 | 
 35 | ```js
 36 | for (x in range(0,10)) {
 37 |     // code that uses `x`
 38 | }
 39 | ```
 40 | 
 41 | Is transpiled to:
 42 | 
 43 | ```js
 44 | let iterable = range(0, 10) in
 45 |     while (iterable.next())
 46 |         let x = iterable.current() in {
 47 |             // code that uses `x`
 48 |         }
 49 | ```
 50 | 
 51 | This transpilation guarantees that even though the `Iterable` protocol defines the `current` method with return type `Object`, when you use a `for` loop you will get the exact covariant type inferred in `x`.
 52 | 
 53 | As a matter of fact, due to the transpilation process, the `Iterable` protocol itself is not even necessary, since nowhere is a symbol annotated as `Iterable`. However, the protocol is explicitly defined as a builtin type so that you can explicitly use it if you need to annotate a method to receive a black-box iterable.
 54 | 
 55 | Keep in mind, though, that when you annotate something explicitly as `Iterable`, you are effectively forcing the type inferrer to assign `Object` as the type of the iteration variable (`x` in this example). This is one of the reasons it is often better to let HULK infer types than to annotate them yourself.
 56 | 
 57 | ## Typing iterables
 58 | 
 59 | Since in the `Iterable` protocol we can only define (at this point) the return value of `current()` as `Object`, it is cumbersome to type arguments of a function or method as `Iterable`, because doing so will force you to downcast the elements to a desired type.
 60 | 
 61 | For this reason, HULK allows a special syntax for typing iterables of a specific type `T` using the format `T*`:
 62 | 
 63 | ```js
 64 | function sum(numbers: Number*): Number =>
 65 |     let total = 0 in
 66 |         for (x in numbers)
 67 |             total := total + x;
 68 | ```
 69 | 
 70 | What happens under the hood is that when you use `T*` anywhere in a HULK program, the compiler will insert an implicit protocol definition that looks like this:
 71 | 
 72 | ```js
 73 | protocol Iterable_T extends Iterable {
 74 |     current(): T;
 75 | }
 76 | ```
 77 | 
 78 | Since protocols can be extended by [overriding some methods with the correct variance constraints](/guide/protocols), the previous code will compile correctly.
 79 | 
 80 | ## Implementing collections
 81 | 
 82 | The iterable protocol defined so far encapsulates the concept of making *a single iteration* over the sequence of elements. In contrast, most collection types you will define allow for multiple iterations, even simultaneously, over the same sequence of elements.
 83 | 
 84 | To accommodate this kind of behaviour, we can define an *enumerable* protocol that simply provides one method to create an iterable for one specific iteration every time one is needed:
 85 | 
 86 | ```js
 87 | protocol Enumerable {
 88 |     iter(): Iterable;
 89 | }
 90 | ```
 91 | 
 92 | With this protocol defined, the `for` loop is extended such that, when used with an enumerable instead of directly an iterable, it will transpile to a slightly different code:
 93 | 
 94 | ```js
 95 | let iterable = enumerable.iter() in
 96 |     while (iterable.next())
 97 |         let x = iterable.current() in {
 98 |             // ..
 99 |         }
100 | ```
101 | 


--------------------------------------------------------------------------------
/pages/hulk/loops.md:
--------------------------------------------------------------------------------
 1 | # Loops
 2 | 
 3 | HULK defines two kinds of loops, the `while` expression and the `for` expression.
 4 | Both loop constructions are expressions, returning the last value of their body expression.
 5 | 
 6 | ## The `while` loop
 7 | 
 8 | A `while` loop evaluates a condition and its body while the condition is true. The body can be a simple expression or an expression block.
 9 | 
10 | ```js
11 | let a = 10 in while (a >= 0) {
12 |     print(a);
13 |     a := a - 1;
14 | }
15 | ```
16 | 
17 | Since the return value of the `while` loop is the return value of its expression body, it can often be used directly as the body of a function.
18 | 
19 | ```js
20 | function gcd(a, b) => while (a > 0)
21 |     let m = a % b in {
22 |         b := a;
23 |         a := m;
24 |     };
25 | ```
26 | 
27 | ## The `for` loop
28 | 
29 | A `for` loop iterates over an _iterable_ of elements of a certain type. We will [talk about iterables](/iterables) later on, but for now it suffices to say that if some expression evaluates to a collection, then the `for` loop can be used to iterate it.
30 | 
31 | For example, the builtin `range(<start>, <end>)` function evaluates to an iterable of numbers between `<start>` (inclusive) and `<end>` (non-inclusive).
32 | 
33 | ```js
34 | for (x in range(0, 10)) print(x);
35 | ```
36 | 
37 | The `for` loop is semantically and operationally equivalent to the following:
38 | 
39 | ```js
40 | let iterable = range(0, 10) in
41 |     while (iterable.next())
42 |         let x = iterable.current() in
43 |             print(x);
44 | ```
45 | 
46 | In fact, what the reference implementation of the HULK compiler does in `for` loops is to transpile them into their `while` equivalent. This also effectively means that, just like the `while` loop, the `for` loop returns the last value of its body expression.
47 | 


--------------------------------------------------------------------------------
/pages/hulk/macros.md:
--------------------------------------------------------------------------------
  1 | # Macros
  2 | 
  3 | Macros are a way to extend HULK with "functions" that are transpiled at compilation time to standard HULK, instead of executed at runtime.
  4 | But macros are considerably more powerful than functions, both syntactically and semantically.
  5 | Macros in HULK are extremely powerful because they work at the syntactic level, which means they perform transformations directly over the abstract syntax tree.
  6 | Besides that, their syntax allows you to define constructs that look like language keywords.
  7 | 
  8 | Since macros are a complex topic, let's start with a simple scenario.
  9 | 
 10 | Suppose you want to have something like the following in HULK:
 11 | 
 12 | ```js
 13 | repeat(10) {
 14 |     // expressions
 15 | }
 16 | ```
 17 | 
 18 | You quickly see that this code is equivalent to the (arguably a lot more verbose) following syntax:
 19 | 
 20 | ```js
 21 | let total = n in
 22 |     while (total >= 0) {
 23 |         total := total - 1;
 24 |         // expressions
 25 |     };
 26 | ```
 27 | 
 28 | You can easily encapsulate this pattern in a `repeat` function that takes a number and a general expression (as a [functor](../functors)):
 29 | 
 30 | ```js
 31 | function repeat(times: Number, expr: () -> Object): Object {
 32 |     let total = n in
 33 |         while (total >= 0) {
 34 |             total := total - 1;
 35 |             expr();
 36 |         };
 37 | }
 38 | ```
 39 | 
 40 | And while this may work for your case, it has a couple of downsides. First, you don't exactly get the desired syntax, instead of:
 41 | 
 42 | ```js
 43 | repeat(10) {
 44 |     // expressions
 45 | }
 46 | ```
 47 | 
 48 | You have to write something like the following, which is close, but still slightly more cumbersome and dirty.
 49 | 
 50 | ```js
 51 | repeat(10, () => {
 52 |     // expressions
 53 | });
 54 | ```
 55 | 
 56 | The second, and most important one, is that the `expr` here encapsulates a computation that, from the point of view of the `repeat` function, is a black box. We will focus on why this matters later on.
 57 | 
 58 | ## Defining macros
 59 | 
 60 | Instead of a function, you can use a *macro*, which has a very similar syntax in HULK:
 61 | 
 62 | ```js
 63 | def repeat(n: Number, *expr: Object): Object =>
 64 |     let total = n in
 65 |         while (total >= 0) {
 66 |             total := total - 1;
 67 |             expr;
 68 |         };
 69 | ```
 70 | 
 71 | But this change makes macros exceedingly more powerful than functions in a lot of cases, for a few reasons. First, notice the use of the `*expr: Object` syntax, instead of the `expr: () -> Object`. Here the `*` denotes that this `expr` is **not** a regular argument, instead it is a special argument that refers to the code inside the brackets *after* the macro invocation. Thus, you can use the following syntax:
 72 | 
 73 | ```js
 74 | repeat(10) {
 75 |     print("Hello World");
 76 | }
 77 | ```
 78 | 
 79 | The `{ print("Hello World"); }` expression block is precisely what is passed on in the special argument `*expr`.
 80 | 
 81 | However, there is much more going on under that macro invocation. Instead of calling a functor at runtime, macros are expanded at compile time and transpiled into their bodies, which means there is no real `repeat` function anywhere in the compiled code. Instead, the actual code that is executed is something like:
 82 | 
 83 | ```js
 84 | let _total = 10 in
 85 |     while (_total >= 0) {
 86 |         _total := _total - 1;
 87 |         {
 88 |             print("Hello World");
 89 |         };
 90 |     }
 91 | ```
 92 | 
 93 | This is the reason why you don't see `expr();` in the macro body, but `expr;`. That is, the body is not *executed* but *interpolated* inside the macro. This transpilation step often makes macros faster than functions because there is no extra overhead for passing arguments. However, you must be careful when thinking about the operational semantics of a macro, especially where they differ from a regular function call.
 94 | 
 95 | ## Variable sanitization
 96 | 
 97 | Upon macro expansion, the variables inside the body of a macro are replaced with a special unique name generated by the compiler. This ensures that no variable in the context of the macro invocation can be accidentally hidden or used in unpredictable ways.
 98 | 
 99 | Take for example the following code:
100 | 
101 | ```js
102 | let total = 10 in repeat(total) {
103 |     print(total);
104 | };
105 | ```
106 | 
107 | If variables inside the body of the `repeat` macro weren't sanitized, then the `print` statement would print `9`, `8`, etc., which is kind of unexpected unless you happen to know how the `repeat` macro is implemented, violating the principle of encapsulation. Even worse, this would happen if your variable is named `total`, but not if it's named something else, which again is surprising and inconsistent. However, since the variable `total` inside the body of `repeat` will be renamed to something completely different upon macro expansion, you can be certain that the `print` statement will work as expected, regardless of the name you happen to choose for your variable.
108 | 
109 | ## Symbolic arguments
110 | 
111 | There are times, though, when you want the macro to reuse a symbol that comes from its external context (a variable or attribute).
112 | In these cases, you can use the special syntax `@symbol` to define a *symbolic argument* in the macro, and then bind a specific symbol upon macro expansion.
113 | 
114 | This is best explained with an example. Let's suppose we want to implement a `swap` macro that swaps the content of two variables. This cannot be done unless the macro can actually assign to the variables we want to swap. We would define the macro as:
115 | 
116 | ```js
117 | def swap(@a: Object, @b: Object) {
118 |     let temp: Object = a in {
119 |         a := b;
120 |         b := temp;
121 |     }
122 | }
123 | ```
124 | 
125 | And we invoke the macro as:
126 | 
127 | ```js
128 | let x: Object = 5, y: Object = "Hello World" in {
129 |     swap(@x, @y);
130 |     print(x);
131 |     print(y);
132 | };
133 | ```
134 | 
135 | Which will be expanded to something like (except that `_temp` will be a generated name):
136 | 
137 | ```js
138 | let x: Object = 5, y: Object = "Hello World" in {
139 |     let _temp = x in {
140 |         x := y;
141 |         y := _temp;
142 |     };
143 |     print(x);
144 |     print(y);
145 | };
146 | ```
147 | 
148 | Notice how the actual names of the `x` and `y` variables are interpolated in the macro expansion. Of course, the type checker will guarantee that on invocation the `x` and `y` symbols are variables of the corresponding type.
149 | 
150 | ## Variable placeholders
151 | 
152 | Macros can also introduce a new symbol into the scope in which they are expanded, which can then be used in the body argument (or the other arguments).
153 | The syntax for this is `$symbol`. We call this a "variable placeholder", because it holds the name for a variable that will be introduced upon macro expansion.
154 | 
155 | Again, this is best explained with an example. Let's add a variable to the `repeat` macro that indicates the current iteration. We would define the macro as:
156 | 
157 | ```js
158 | def repeat($iter: Number, n: Number, *expr:Object) {
159 |     let iter: Number = 0, total:Number = n in {
160 |         while (total >= 0) {
161 |             total := total - 1;
162 |             expr;
163 |             iter := iter + 1
164 |         };
165 |     }
166 | }
167 | ```
168 | 
169 | Now when calling the macro, you can specify a name for the `$iter` variable placeholder:
170 | 
171 | ```js
172 | repeat(current, 10) {
173 |     print(current);
174 | };
175 | ```
176 | 
177 | The effect is that upon macro expansion, the variable placeholder `$iter` will be renamed to `current` and thus the body of the macro will correctly reference it. The actual expansion looks similar to the following code:
178 | 
179 | ```js
180 | let current: Number = 0, _total:Number = n in {
181 |     while (_total >= 0) {
182 |         _total := _total - 1;
183 |         {
184 |             print(current);
185 |         };
186 |         current := current + 1
187 |     };
188 | };
189 | ```
190 | 
191 | The compiler ensures that the use of the new variable in the body of the macro is consistent with the type declared for the variable placeholder in the macro. However, it is entirely possible for the macro not to define the variable, or to define it conditioned on some structure of the body (we will see how that's achieved in the [pattern matching section](#pattern-matching)). In any case, since macro expansion is performed at compile time, any inconsistency that may arise will be captured by the compiler.
192 | 
193 | ## Pattern matching
194 | 
195 | By far the most powerful feature of macros is structural pattern matching. This feature allows you to deconstruct an argument and generate specific code depending on the argument's *structure*. This is possible because macros run at compile time, so when you declare an argument of type `Number`, for example, what you'll get in the macro body is the actual expression tree of the argument, and not just the final evaluated object.
196 | 
197 | As everything else with macros, this feature is much better understood with examples. Let's suppose you want to define a macro called `simplify`, for no better use than to illustrate how powerful macros are compared to regular functions. This is how you would do it:
198 | 
199 | ```js
200 | def simplify(expr:Number) {
201 |     match(expr) {
202 |         case (x1:Number + x2:Number) => simplify(x1) + simplify(x2);
203 |         case (x1:Number + 0) => simplify(x1);
204 |         case (x1:Number - x2:Number) => simplify(x1) + simplify(x2);
205 |         case (x1:Number - 0) => simplify(x1);
206 |         case (x1:Number * x2:Number) => simplify(x1) * simplify(x2);
207 |         case (x1:Number * 1) => simplify(x1);
208 |         // ... you get the idea
209 |         default => expr;
210 |     };
211 | }
212 | ```
213 | 
214 | You would use the macro as follows:
215 | 
216 | ```js
217 | print(simplify((42+0)*1));
218 | ```
219 | 
220 | And the actual generated code would be:
221 | 
222 | ```js
223 | print(42);
224 | ```
225 | 
226 | Notice that this transformation happens during compilation time, not execution. The actual code that gets compiled is the simplified expression.
227 | 


--------------------------------------------------------------------------------
/pages/hulk/protocols.md:
--------------------------------------------------------------------------------
 1 | # Protocols
 2 | 
 3 | Protocols are special types which support a limited form of structural typing in HULK. The difference between structural and nominal typing in HULK is that the latter is explicit while the former is implicitly defined. That is, a type doesn't need to explicitly declare that it conforms to a protocol.
 4 | 
 5 | Protocols have a syntax similar to that of types, except that they only have method declarations, and they have no body, only signatures. Hence, protocols define the methods that a type must have in order to support some operation.
 6 | 
 7 | Protocols don't exist at runtime; they are a compile-time-only concept that helps you write more flexible programs. After type checking, all information about protocols can be safely removed.
 8 | 
 9 | ## Defining protocols
10 | 
11 | A protocol is defined with the keyword `protocol` followed by a collection of method declarations:
12 | 
13 | ```js
14 | protocol Hashable {
15 |     hash(): Number;
16 | }
17 | ```
18 | 
19 | A protocol can have any number of method declarations. For obvious reasons, all method declarations in protocol definitions must be fully typed, as it is impossible to infer any types since they have no body.
20 | 
21 | A protocol can extend another protocol by adding new methods, but never overriding (since there is no actual body) or removing any method (although you can override the types of some method arguments or return types, subject to some restrictions explained below).
22 | 
23 | ```js
24 | protocol Equatable extends Hashable {
25 |     equals(other: Object): Boolean;
26 | }
27 | ```
28 | 
29 | ## Implementing protocols
30 | 
31 | A type implements a protocol implicitly, simply by having methods with the right signature. There is no need to explicitly declare which types implement which protocols.
32 | 
33 | Thus, you can annotate a variable or argument with a protocol type, and the type checker will correctly verify the consistency of both the method body and the invocation.
34 | 
35 | ```js
36 | type Person {
37 |     // ...
38 | 
39 |     hash() : Number {
40 |         // ...
41 |     }
42 | }
43 | 
44 | let x : Hashable = new Person() in print(x.hash());
45 | ```
46 | 
47 | Anywhere you can annotate a symbol with a type (variables, attributes, function, method and type arguments, and return values), you can also use a protocol. For the purpose of type inference, protocols are treated as types.
48 | 
49 | ## Variance in protocol implementation
50 | 
51 | In order to implement a protocol, a type doesn't necessarily have to match the exact signature of the protocol. Instead, method and type arguments are considered *contravariant*, and return values *covariant*. This means that arguments can be of the same type or higher, and return values of the same type or lower, than those defined in the protocol.
52 | 
53 | Similarly, when you extend a protocol, you can override some of the methods as long as you respect the variance constraints.
54 | 
55 | ## Conforming with protocols
56 | 
57 | More formally, protocols extend the notion of type conforming by adding the following rules:
58 | 
59 | - A type `T` conforms to a protocol `P` if `T` has all the methods defined in `P` with the appropriate types (respecting the variance constraints explained before).
60 | - If a protocol `P1` extends a protocol `P2`, then trivially `P1 <= P2`.
61 | - A protocol `P1` also conforms to another protocol `P2` if any type that conforms to `P1` would also conform to `P2`, even if there is no explicit extension declared.
62 | 


--------------------------------------------------------------------------------
/pages/hulk/types.md:
--------------------------------------------------------------------------------
  1 | # Types
  2 | 
  3 | HULK is ultimately an object-oriented language with simple inheritance and nominal typing. It also has features of structural typing via [protocols](/guide/protocols), which support language features such as [iterables](/guide/iterables), which we will explain later.
  4 | 
  5 | This section explains the basics of HULK's nominal typing system.
  6 | 
  7 | A type in HULK is basically a collection of attributes and methods, encapsulated under a type name. Attributes are always private, which means they can't be read or written to from any code outside the type in which they are defined (not even inheritors), while methods are always public and virtual.
  8 | 
  9 | ## Declaring types
 10 | 
 11 | A new type is declared using the `type` keyword followed by a name, and a body composed of attribute definitions and method definitions. All attributes must be given an initialization expression. Methods, like functions, can have a single expression or an expression block as body:
 12 | 
 13 | ```js
 14 | type Point {
 15 |     x = 0;
 16 |     y = 0;
 17 | 
 18 |     getX() => self.x;
 19 |     getY() => self.y;
 20 | 
 21 |     setX(x) => self.x := x;
 22 |     setY(y) => self.y := y;
 23 | }
 24 | ```
 25 | 
 26 | The body of every method is evaluated in a namespace that contains global symbols plus a special symbol named `self` that references the current instance. The `self` symbol is **not** a keyword, which means it can be hidden by a `let` expression, or by a method argument.
 27 | 
 28 | However, when referring to the current instance, `self` is not a valid assignment target, so the following code should fail with a semantic error:
 29 | 
 30 | ```js
 31 | type A {
 32 |     // ...
 33 |     f() {
 34 |         self := new A(); // <-- Semantic error, `self` is not a valid assignment target
 35 |     }
 36 | }
 37 | ```
 38 | 
 39 | ## Instantiating types
 40 | 
 41 | To instantiate a type you use the keyword `new` followed by the type name:
 42 | 
 43 | ```js
 44 | let pt = new Point() in
 45 |     print("x: " @ pt.getX() @ "; y: " @ pt.getY());
 46 | ```
 47 | 
 48 | As you can see, type members are accessed by dot notation (`instance.member`).
 49 | 
 50 | You can pass arguments to a type, that you can use in the initialization expressions. This achieves an effect similar to having a single constructor.
 51 | 
 52 | ```js
 53 | type Point(x, y) {
 54 |     x = x;
 55 |     y = y;
 56 | 
 57 |     // ...
 58 | }
 59 | ```
 60 | 
 61 | Then, at instantiation time, you can pass specific values:
 62 | 
 63 | ```js
 64 | let pt = new Point(3,4) in
 65 |     print("x: " @ pt.getX() @ "; y: " @ pt.getY());
 66 | ```
 67 | 
 68 | Each attribute initialization expression is evaluated in a namespace that contains the global symbols and the type arguments, but not the `self` symbol. This means you cannot use other attributes of the same instance in an attribute initialization expression. This also means that you cannot assume any specific order of initialization of attributes.
 69 | 
 70 | ## Inheritance
 71 | 
 72 | Types in HULK can inherit from other types. The base of the type hierarchy is a type named `Object`, which has no public members and is the type you implicitly inherit from by default. To inherit from a specific type, you use the `inherits` keyword followed by the type name:
 73 | 
 74 | ```js
 75 | type PolarPoint inherits Point {
 76 |     rho() => sqrt(self.getX() ^ 2 + self.getY() ^ 2);
 77 |     // ...
 78 | }
 79 | ```
 80 | 
 81 | By default, a type inherits its parent type arguments, which means that to construct a `PolarPoint` you have to pass the `x` and `y` that `Point` is expecting:
 82 | 
 83 | ```js
 84 | let pt = new PolarPoint(3,4) in
 85 |     print("rho: " @ pt.rho());
 86 | ```
 87 | 
 88 | If you want to define a different set of type arguments, then you have to provide initialization expressions for the parent type at the declaration:
 89 | 
 90 | ```js
 91 | type PolarPoint(phi, rho) inherits Point(rho * sin(phi), rho * cos(phi)) {
 92 |     // ...
 93 | }
 94 | ```
 95 | 
 96 | During construction, the expressions for type arguments of the parent are evaluated in a namespace that contains global symbols plus the type arguments of the inheritor. Like before, you cannot assume a specific order of evaluation.
 97 | 
 98 | In HULK, the three builtin types (`Number`, `String`, and `Boolean`) implicitly inherit from `Object`, but it is a semantic error to inherit from these types.
 99 | 
100 | ## Polymorphism
101 | 
102 | All type methods in HULK are virtual by definition, and can be redefined by an inheritor provided the exact same signature is used:
103 | 
104 | ```js
105 | type Person(firstname, lastname) {
106 |     firstname = firstname;
107 |     lastname = lastname;
108 | 
109 |     name() => self.firstname @@ self.lastname;
110 | }
111 | ```
112 | 
113 | > **NOTE**: `@@` is equivalent to `@ " " @`. It is a shorthand to insert a whitespace between two concatenated strings. There is no `@@@` or beyond, we're not savages.
114 | 
115 | ```js
116 | type Knight inherits Person {
117 |     name() => "Sir" @@ base();
118 | }
119 | 
120 | let p = new Knight("Phil", "Collins") in
121 |     print(p.name()); // prints 'Sir Phil Collins'
122 | ```
123 | 
124 | The `base` symbol in every method refers to the implementation of the parent (or the closest ancestor that has an implementation).
125 | 


--------------------------------------------------------------------------------
/pages/hulk/typing.md:
--------------------------------------------------------------------------------
  1 | # Type checking
  2 | 
  3 | HULK is a statically-typed language with optional type annotations. So far you haven't seen any because HULK has a powerful [type inference system](/guide/inference) which we will talk about later on. However, all symbols in HULK have a static type, and all programs in HULK are statically checked during compilation.
  4 | 
  5 | Type annotations can be added anywhere a symbol is defined, that is:
  6 | 
  7 | - in variable declarations with `let` expressions;
  8 | - in function or method arguments and return type;
  9 | - in type attributes; and,
 10 | - in type arguments.
 11 | 
 12 | Let's see an example of each case.
 13 | 
 14 | ## Typing variables
 15 | 
 16 | Variables can be explicitly type-annotated in `let` expressions with the following syntax:
 17 | 
 18 | ```js
 19 | let x: Number = 42 in print(x);
 20 | ```
 21 | 
 22 | The type checker will verify that the type inferred for the initialization expression is compatible with (formally, [conforms to](/#type-conforming)) the annotated type.
 23 | 
 24 | ## Typing functions and methods
 25 | 
 26 | All or a subset of a function's or method's arguments, and its return value, can be type-annotated with a similar syntax:
 27 | 
 28 | ```js
 29 | function tan(x: Number): Number => sin(x) / cos(x);
 30 | ```
 31 | 
 32 | On the declaration side, the type checker will verify that the body of the method uses the types in a way that is consistent with their declaration. The exact meaning of this consistency is defined in the section about [type semantics](/guide/type_semantics). The type checker will also verify that the return type of the body conforms to the annotated return type.
 33 | 
 34 | On the invocation side, the type checker will verify that the values passed as parameters conform to the annotated types.
 35 | 
 36 | Inside methods of a type `T`, the implicitly defined `self` symbol is always assumed as if annotated with type `T`.
 37 | 
 38 | ## Typing attributes and type arguments
 39 | 
 40 | In type definitions, attributes and type arguments can be type-annotated as follows:
 41 | 
 42 | ```js
 43 | type Point(x: Number, y: Number) {
 44 |     x: Number = x;
 45 |     y: Number = y;
 46 | 
 47 |     // ...
 48 | }
 49 | ```
 50 | 
 51 | The type checker will verify that type arguments are used consistently inside attribute initialization expressions, and that the inferred type for each attribute initialization expression conforms to the attribute annotation.
 52 | 
 53 | ## Type conforming
 54 | 
 55 | The basic type relation in HULK is called *conforming* (`<=`). A type `T1` is said to *conform to* another type `T2` (written as `T1 <= T2`) if a variable of type `T2` can hold a value of type `T1` such that every possible operation that is semantically valid with `T2` is guaranteed to be semantically valid with `T1`.
 56 | 
 57 | In general, this means that the type checker will verify that the inferred type for any expression conforms to the corresponding type declared for that expression (e.g., the type of a variable, or the return type of a function).
 58 | 
 59 | The following rules provide an initial definition for the *conforming* relationship. The formal definition is given in the section about [type semantics](/guide/type_semantics).
 60 | 
 61 | - Every type conforms to `Object`.
 62 | - Every type conforms to itself.
 63 | - If `T1` inherits `T2` then `T1` conforms to `T2`.
 64 | - If `T1` conforms to `T2` and `T2` conforms to `T3` then `T1` conforms to `T3`.
 65 | - The only types that conform to `Number`, `String`, and `Boolean`, are respectively those same types.
 66 | 
 67 | Types in HULK form a single hierarchy rooted at `Object`. In this hierarchy the *conforming* relationship is equivalent to the *descendant* relationship. Thus, if `T1` conforms to `T2` that means that `T1` is a descendant of `T2` (or trivially the same type). Thus, we can talk of the lowest common ancestor of a set of types `T1`, `T2`, ..., `Tn`, which is the most specific type `T` such that all `Ti` conform to `T`. When two types are in different branches of the type hierarchy, they are effectively incomparable.
 68 | 
 69 | > **NOTE**: this conforming relationship is extended when we add [protocols](/guide/protocols).
 70 | 
 71 | ## Testing for dynamic types
 72 | 
 73 | The `is` operator allows you to test whether the dynamic type of an object conforms to a specific static type.
 74 | 
 75 | ```js
 76 | type Bird {
 77 |     // ...
 78 | }
 79 | 
 80 | type Plane {
 81 |     // ...
 82 | }
 83 | 
 84 | type Superman {
 85 |     // ...
 86 | }
 87 | 
 88 | let x = new Superman() in
 89 |     print(
 90 |         if (x is Bird) "It's bird!"
 91 |         elif (x is Plane) "It's a plane!"
 92 |         else "No, it's Superman!"
 93 |     );
 94 | ```
 95 | 
 96 | In general, before the `is` operator you can put any expression, not just a variable.
 97 | 
 98 | ## Downcasting
 99 | 
100 | You can use the `as` operator to downcast an expression to a given static type. The result is a runtime error if the expression is not a suitable dynamic type, which means you should always test if you're unsure:
101 | 
102 | 
103 | ```js
104 | type A {
105 |     // ...
106 | }
107 | 
108 | type B inherits A {
109 |     // ...
110 | }
111 | 
112 | type C inherits A {
113 |     // ...
114 | }
115 | 
116 | let x : A = if (rand() < 0.5) new B() else new C() in
117 |     if (x is B)
118 |         let y : B = x as B in {
119 |             // you can use y with static type B
120 |         }
121 |     else {
122 |         // x cannot be downcasted to B
123 |     }
124 | ```
125 | 


--------------------------------------------------------------------------------
/pages/hulk/variables.md:
--------------------------------------------------------------------------------
  1 | # Variables
  2 | 
  3 | Variables in HULK are lexically-scoped, which means that their scope is explicitly defined by the syntax. You use the `let` expression to introduce one or more variables and evaluate an expression in a new scope where those variables are defined.
  4 | 
  5 | The simplest form is introducing a single variable and using a single expression as body.
  6 | 
  7 | ```js
  8 | let msg = "Hello World" in print(msg);
  9 | ```
 10 | 
 11 | Here `msg` is a new symbol that is defined *only* within the expression that goes after `in`.
 12 | 
 13 | ## Multiple variables
 14 | 
 15 | The `let` expression admits defining multiple variables at once like this:
 16 | 
 17 | ```js
 18 | let number = 42, text = "The meaning of life is" in
 19 |     print(text @ number);
 20 | ```
 21 | 
 22 | This is semantically equivalent to the following long form:
 23 | 
 24 | ```js
 25 | let number = 42 in
 26 |     let text = "The meaning of life is" in
 27 |         print(text @ number);
 28 | ```
 29 | 
 30 | As you can notice, `let` associates to the right, so the previous is also equivalent to:
 31 | 
 32 | ```js
 33 | let number = 42 in (
 34 |     let text = "The meaning of life is" in (
 35 |             print(text @ number)
 36 |         )
 37 |     );
 38 | ```
 39 | 
 40 | ## Scoping rules
 41 | 
 42 | Since the binding is performed left-to-right (or equivalently starting from the outer let), and every variable is effectively bound in a new scope, you can safely use one variable when defining another:
 43 | 
 44 | ```js
 45 | let a = 6, b = a * 7 in print(b);
 46 | ```
 47 | 
 48 | Which is equivalent to (and thus valid):
 49 | 
 50 | ```js
 51 | let a = 6 in
 52 |     let b = a * 7 in
 53 |         print(b);
 54 | ```
 55 | 
 56 | ## Expression block body
 57 | 
 58 | You can also use an expression block as the body of a `let` expression:
 59 | 
 60 | ```js
 61 | let a = 5, b = 10, c = 20 in {
 62 |     print(a+b);
 63 |     print(b*c);
 64 |     print(c/a);
 65 | }
 66 | ```
 67 | 
 68 | As we said before, semicolons (`;`) are seldom necessary after an expression block, but they are never wrong.
 69 | 
 70 | ## The `let` return value
 71 | 
 72 | As with almost everything in HULK, `let` is an expression, so it has a return value, which is obviously the return value of its body. This means the following is a valid HULK program:
 73 | 
 74 | ```js
 75 | let a = (let b = 6 in b * 7) in print(a);
 76 | ```
 77 | 
 78 | Or more directly:
 79 | 
 80 | ```js
 81 | print(let b = 6 in b * 7);
 82 | ```
 83 | 
 84 | This can be of course nested ad infinitum.
 85 | 
 86 | ## Redefining symbols
 87 | 
 88 | In HULK every new scope hides the symbols from the parent scope, which means you can redefine a variable name in an inner `let` expression:
 89 | 
 90 | ```js
 91 | let a = 20 in {
 92 |     let a = 42 in print(a);
 93 |     print(a);
 94 | }
 95 | ```
 96 | 
 97 | The previous code prints `42` then `20`, since the inner `let` redefines the value of `a` inside its scope, but the value outside is still the one defined by the outer `let`.
 98 | 
 99 | And because of the [scoping rules](#scoping-rules), the following is also valid:
100 | 
101 | ```js
102 | let a = 7, a = 7 * 6 in print(a);
103 | ```
104 | 
105 | Which is equivalent to:
106 | 
107 | ```js
108 | let a = 7 in
109 |     let a = 7 * 6 in
110 |         print(a);
111 | ```
112 | 
113 | ## Destructive assignment
114 | 
115 | Most of the time in HULK you won't need to overwrite a variable, but there are cases where you do. In those cases, you can use the destructive assignment operator `:=`, like this:
116 | 
117 | ```js
118 | let a = 0 in {
119 |     print(a);
120 |     a := 1;
121 |     print(a);
122 | }
123 | ```
124 | 
125 | The previous program prints `0` and then `1`, since the value of `a` is overwritten before the second `print`.
126 | This is the **only** way in which a variable can be written to outside of a `let`.
127 | 
128 | As you would expect, the `:=` operator defines an expression too, which returns the value just assigned, so you can do the following:
129 | 
130 | ```js
131 | let a = 0 in
132 |     let b = a := 1 in {
133 |         print(a);
134 |         print(b);
135 |     };
136 | ```
137 | 
138 | This is useful if you want to evaluate a complex expression to both test it (e.g., to see if it's greater than zero) and store it for later use.
139 | 
140 | ## Rules for naming identifiers
141 | 
142 | Variables (and identifiers in general) in HULK can be named with any sequence of alphanumeric characters, plus underscore `_`, but must *always* begin with a letter (not a digit or `_`), hence the following are all valid identifiers:
143 | 
144 | - `x`
145 | - `x0`
146 | - `x_0`
147 | - `lowercase`
148 | - `TitleCase`
149 | - `snake_case`
150 | - `camelCase`
151 | 
152 | The following are invalid HULK identifiers:
153 | 
154 | - `_x`
155 | - `x+y`
156 | - `some method`
157 | - `8ball`
158 | 
159 | And many others of course!
160 | 
161 | Since starting with an underscore `_` is invalid in user-produced HULK code, you will notice that when we talk about transpilation in HULK, variables and identifiers in transpiled code always start with `_`.
162 | 


--------------------------------------------------------------------------------
/pages/hulk/vectors.md:
--------------------------------------------------------------------------------
 1 | # Vectors
 2 | 
 3 | The builtin vector type provides a simple but powerful abstraction for creating collections of objects of the same type. In terms of functionality, a vector is close to plain arrays as defined in most programming languages. Vectors implement the [iterable protocol](/iterables) so they can be iterated with a `for` syntax.
 4 | 
 5 | Vectors in HULK can be defined with two different syntactic forms: explicit and implicit.
 6 | 
 7 | ## Explicit syntax
 8 | 
 9 | An explicit vector of `Number`, for example, can be defined as follows:
10 | 
11 | ```js
12 | let numbers = [1,2,3,4,5,6,7,8,9] in
13 |     for (x in numbers)
14 |         print(x);
15 | ```
16 | 
17 | Because vectors implement the iterable protocol, you can explicitly call the `next` and `current` methods in case you ever need them. Besides that, vectors also have a `size(): Number` method that returns the number of items in the vector.
18 | 
19 | Vectors also support an indexing syntax using square brackets `[]`, as in the following example:
20 | 
21 | ```js
22 | let numbers = [1,2,3,4,5,6,7,8,9] in print(numbers[7]);
23 | ```
24 | 
25 | ## Implicit syntax
26 | 
27 | An implicit vector can be created using what we call a generator pattern, which is always an expression.
28 | 
29 | Here's one example:
30 | 
31 | ```js
32 | let squares = [x^2 | x in range(1,10)] in for (s in squares) print(s);
33 | // prints 1, 4, 9, 16, 25, ...
34 | ```
35 | 
36 | In general, the syntax has the form `[<expr> | <symbol> in <iterable>]`, where `<expr>` is evaluated in a new scope where `<symbol>` is iteratively bound to each element of `<iterable>`.
37 | 
38 | ## Typing vectors
39 | 
40 | Since vectors are iterables, you can safely pass a vector as an argument to a method that expects an iterable:
41 | 
42 | ```js
43 | function sum(numbers: Number*): Number =>
44 |     let total = 0 in
45 |         for (x in numbers)
46 |             total := total + x;
47 | 
48 | let numbers = [1,2,3,4,5] in
49 |     print(sum(numbers));
50 | ```
51 | 
52 | However, inside `sum` you cannot use the indexing operator `[]` or the `size` method, because the argument is typed as an iterable, and not explicitly as a vector. To fix this, HULK provides another special syntax for vectors, using the `T[]` notation:
53 | 
54 | ```js
55 | function mean(numbers: Number[]): Number =>
56 |     let total = 0 in {
57 |         for (x in numbers)
58 |             total := total + x;
59 | 
60 |         // here `numbers` is known to be vector
61 |         total / numbers.size();
62 |     };
63 | 
64 | let numbers = [1,2,3,4,5] in
65 |     print(mean(numbers));
66 | ```
67 | 
68 | Like with iterables, what happens under the hood is that the compiler implicitly defines a type with the following structure:
69 | 
70 | ```js
71 | type Vector_T {
72 |     size() {
73 |         // implementation of size ...
74 |     }
75 | 
76 |     iter(): Iterable_T {
77 |         // implementation of iter
78 |     }
79 | }
80 | ```
81 | 


--------------------------------------------------------------------------------
/pages/index.qmd:
--------------------------------------------------------------------------------
 1 | # Preface {.unnumbered}
 2 | 
 3 | This book is primarily about making compilers, but it is also so much more. A compiler is one of the most exciting (and complex) projects you could attempt, and one of the most interesting pieces of software you can examine. Building a compiler requires a combination of deep theoretical foundations, robust software engineering practices, and clever algorithm design and optimization. In a way, a compiler is the quintessential Computer Science application. This is why, in the process of building a compiler from scratch, you can learn a whole lot about many interrelated areas in Computer Science.
 4 | 
 5 | But why do we need compilers at all? You see, there is a large distance between the level of reasoning that occurs in the brain and the level of reasoning that occurs on a computer---at least, modern, traditional electronic computers like the one where you're reading this. Compilers are our best tools so far to bridge this gap. Here's why.
 6 | 
 7 | Problems in any domain are solved by thinking at a level of abstraction with a language that describes the rules of that domain. For example, if you're sending a rocket to the moon, you will think in terms of the physics and chemistry of rocket propulsion, the differential equations that model orbital mechanics, and the logistics and scheduling involved.
 8 | 
 9 | On the other hand, you have to explain all these things to a computer. And computers are very dumb. At their core, computers are just complex state machines that can do some basic arithmetic and move bits from one part of the memory to another. One of the most surprising insights in all of science is that, it turns out, this is all you need to be able to solve _any_ solvable problem---an idea we will revisit in some detail in later chapters.
10 | 
11 | But let's go back to the core of the problem. The issue is that we have to deal with two widely different levels of abstraction: the higher level where you can talk and reason about rockets and planets and physics---the domain language---and the lower level where you have to talk and reason about bits and registers and arithmetic operations---the machine language.
12 | 
13 | There was a time when these levels of abstraction---these two languages---had to be connected by the programmer. In fact, at that time, the difference between analyst and programmer was precisely that the analyst designed the solution in his domain language, and the programmer translated it into an executable program in machine language. (This was, incidentally, also the time when women were mostly programmers and men mostly analysts, because many considered "programming" just a low-level translation task not worthy of intellectual pursuit. Oh, the irony! But I digress.)
14 | 
15 | Then, in 1952, Grace Hopper came up with a brilliant idea. She was working on the simulation of ballistic trajectories. To direct a projectile to its target, physical models are described in a language of differential equations and Newtonian mechanics. However, in order to implement these models in a computing device, it is necessary to speak in a language of registers, stacks and interrupts.
16 | 
17 | This gap made programming extremely difficult, and greatly slowed the development of new models, because at every step there could be errors in both modeling and coding. When something went wrong, whose fault was it? The analyst's or the programmer's? Or worse, the computer system's?
18 | 
19 | But here is the kicker. Seeing that the process of converting differential equations into concrete programs was fundamentally mechanical---Hopper thought---why not let the computer itself do this conversion? And thus, the notion of a high level _programming language_ was born!
20 | 
21 | The idea seems straightforward in hindsight: let's design a language that allows analysts to express their solutions to problems---their algorithms---as close as possible to the problem domain---e.g., using standard mathematical notation, functions, collections of numbers, and other relevant abstractions. Then, let's write another program that will translate this high level program into a low level _equivalent_ program, taking care of all the complicated bits and registry manipulation, abstracting away the machine language so the analyst doesn't need to learn it at all.
22 | 
23 | This genius idea would take several years to perfect to the point of becoming a reality. Grace Hopper's first compiler for the A-0 language was practically a linker with some basic functions. The first high-level languages to have "serious" compilers were FORTRAN (1957, John Backus), ALGOL (1958, Friedrich Bauer), and COBOL (1960, Grace Hopper). An additional advantage, in addition to reducing development time, was the possibility of compiling the same program for multiple platforms. In 1960, for the first time, the same COBOL program was compiled for two different machines: UNIVAC II and RCA 501.
24 | 
25 | At this point the languages became sufficiently complicated that compilers could no longer be written "by hand." So it was necessary to turn to theory, and develop a science about what types of programming languages could be compiled, and with what compilers. This gave birth, in 1960, to the science that we know today as Compilation.
26 | 
27 | Motivated not only by a practical reason, but also based on the most solid theoretical principles, building compilers became one of the first justifications for Computer Science to question its own problems and limitations, and stop being seen as a mere calculation tool. Problems as distant as natural language processing and the nature of computable functions have fallen under the scope of the problems studied in this field. Today compilation is a solid science, founded on years of formal theory and engineering practice.
28 | 
29 | Hidden beneath all this formal apparatus and the full range of theoretical and practical experiences and results of the last 60 years, we can find a more fundamental question, a question that perhaps goes back to Alan Turing himself, or even further, to Ada Lovelace and Charles Babbage with his analytical engine. The question is this:
30 | 
31 | **How to talk to a computer?**
32 | 
33 | All attempts to design languages, all algorithms and techniques discovered, all design patterns and architectures, are ultimately tied to the desire to be able to ask a *question* to the computer, and get an *answer* in return. It doesn't matter if the question is to calculate a certain projectile trajectory, or to find the sequence of parameters that minimize a certain function. Every program is in a way a conversation with the computer, a communication channel, which we want to be powerful enough to be able to express our most complex ideas, and simple enough to be understood by a Turing machine. As we will see in this book, finding the right balance is an extremely interesting problem, and trying to answer it will take us down a path that will raise many other questions, including the following:
34 | 
35 | - What types of languages is a computer capable of *understanding*?
36 | - How much of a language must be *understood* in order to have a conversation?
37 | - What is *understanding* a language?
38 | - Is it as easy or difficult to *understand* as to *speak* a language?
39 | - Can we characterize languages in computational terms according to their complexity to be *understood* by a computer?
40 | - How are these languages related to human language?
41 | - What can we learn about the nature of computers and computable problems from the languages they are able to recognize?
42 | - What can we learn about human language to make computers smarter?
43 | - What can we learn about human language, and the very nature of our intelligence, from studying languages understandable by different types of machines?
44 | 
45 | These questions, although not all will be directly answered in the following chapters, form the backbone of the book content, in the sense that everything presented is with the intention of, at least, shedding a little light on these topics. We hope that at the end of the book, students will be able to discuss the philosophical implications of the possible answers to these questions, and not just the technical or more practical issues that the book attacks. For this reason, we will try as far as possible, in addition to the technical content, to occasionally add some comments or more philosophical discussions regarding these and similar questions.
46 | 
47 | So this book is primarily about making compilers. But it is also about some of the most profound questions in Computer Science and some of the most surprising answers---including the lack of answers for many of these questions.
48 | 


--------------------------------------------------------------------------------
/pages/instructors.qmd:
--------------------------------------------------------------------------------
1 | # The Instructor's Manual
2 | 


--------------------------------------------------------------------------------
/pages/intro.qmd:
--------------------------------------------------------------------------------
 1 | # Introduction
 2 | 
 3 | We will begin this journey by dissecting the canonical computational system of formal language theory: a compiler. Broadly speaking, a compiler is nothing more than a program, whose input and output also happen to be programs. The input is a program in a language that we will call "high level", and the output in a "low level" language, which is equivalent to the first. Exactly what is high and low will depend on many factors, and there is no formal definition. In general, a high-level language is one that is comfortable for us as programmers to express the operations that we are interested in executing. Likewise, a low-level language is one that a computing device can execute efficiently. Perhaps the most typical examples are an object-oriented language and an assembly language respectively, but there are many other input and output language combinations of interest.
 4 | 
 5 | ![Abstract representation of the compilation process.](../graphics/mountain.svg){ width=100% }
 6 | 
 7 | Now, before diving headlong into the anatomy of a compiler, it is worth mentioning some related language processing systems. We can try to categorize them according to the "type" of the input and output language. First of all, the classic example is when we want to convert a high-level language to a low-level language, and we call this system a **compiler**. The opposite case, when we want to convert from a low-level language to a high-level language, we can call it a **decompiler** by analogy. These types of tools are useful for analyzing and reverse engineering programs for which, perhaps, we no longer have the source code, and we need to understand or modify. The other two cases, high level to high level and low level to low level, are basically **translators**; and sometimes they are also called **transpilers**. For example, TypeScript is a high-level language that "transpiles" to JavaScript, another high-level language. Among low-level languages we can also have translators. An example is the so-called **JIT** (*just-in-time*) compilers, which are used to translate a program compiled in a generic low-level language (for example **.NET IL**) into a machine language specific to the architecture where it is executed.
 8 | 
 9 | Let us then return to the classic case, the **compiler**. In this course we are going to use as a teaching guide the design of a compiler for the HULK language, which will compile to a machine language called MIPS. Details of both languages will be introduced as appropriate, but for now it can be said that HULK is an object-oriented language, with automatic garbage collection, single inheritance, polymorphism, and a unified type system. MIPS is a stack assembly language for a 32-bit architecture with registers and arithmetic, logic, and string-oriented operations.
10 | 
11 | Let us then try to define this machinery step by step. Abstractly our compiler is a "black box" that converts programs written in HULK to programs written in MIPS:
12 | 
13 | ```python echo=False, results="plain"
14 | Pipeline(['HULK', 'Compiler', 'MIPS'], [
15 |  (0, 1, ""),
16 |  (1, 2, ""),
17 | ], startshape='plaintext', endshape='plaintext').print(width="50%", float=False)
18 | ```
19 | 
20 | To begin to uncover this black box, let's notice that we have at least two independent components: one that operates in HULK language and another that operates in MIPS language. We need to be able to "read" a program in HULK and "write" it in MIPS. We will call the first module, which "reads", *parser*, or syntax analyzer, for historical reasons that we will see later. We will simply call the second component the *generator*.
21 | 
22 | ```python echo=False, results="plain"
23 | Pipeline(['HULK', 'Parser', 'Generator', 'MIPS'], [
24 |  (0, 1, ""),
25 |  (1, 2, ""),
26 |  (2, 3, ""),
27 | ], startshape='plaintext', endshape='plaintext').print(width="70%", float=False)
28 | ```
29 | 
30 | From here a question immediately arises: what communication protocol do these modules have? It is necessary to design a kind of intermediate language, a representation mechanism that is neither HULK nor MIPS, but something that is "halfway" between the two. That is, it is necessary to translate the HULK program into some form of abstract representation, independent of the syntax, which can then be interpreted by the generator and written in MIPS. Let's call it *intermediate representation (IR)* for now.
31 | 
32 | ```python echo=False, results="plain"
33 | Pipeline(['HULK', 'Parser', 'Generator', 'MIPS'], [
34 |  (0, 1, ""),
35 |  (1, 2, "GO"),
36 |  (2, 3, ""),
37 | ], startshape='plaintext', endshape='plaintext').print(width="75%", float=False)
38 | ```
39 | 


--------------------------------------------------------------------------------
/pages/principles/grammars.qmd:
--------------------------------------------------------------------------------
1 | # Formal grammars


--------------------------------------------------------------------------------
/pages/principles/index.qmd:
--------------------------------------------------------------------------------
1 | # Principles of Compiler Construction
2 | 
3 | In this first part of the book we will lay out most of the necessary theory to understand how a compiler works. This includes a significant chunk of formal language theory, but not all of it.


--------------------------------------------------------------------------------
/pages/principles/intro.qmd:
--------------------------------------------------------------------------------
  1 | # Introduction to Formal Languages
  2 | 
  3 | At its core, a compiler is a translator between two languages: the source language (Python, C#, Java, HULK, etc.) and the target language (Assembly, C, LLVM, MIPS, etc.). Thus, it will pay off to study languages from a computational perspective.
  4 | 
  5 | In this first part of the book, we introduce Formal Language Theory, a major field in Computer Science that deals with an abstract notion of language. Formal Language Theory is one of the foundational fields in CS, and its early development during the 60s and 70s laid the grounds for many of the most important theoretical results in Computer Science.
  6 | 
  7 | So, although our focus in this book is on building compilers, in the next few chapters we will forget about them for a while, and just look at languages as mathematical constructions. We will prove a bunch of surprising theorems and discover a breadth of theory that touches upon all parts of Computer Science. Towards the end of this part, we will peek outside formal languages and look at some of the most interesting connections with computability theory, computational complexity, artificial intelligence, and everything else.
  8 | 
  9 | But let's start at the basics.
 10 | 
 11 | ## What is a language
 12 | 
 13 | Intuitively, a language is just a collection of correct sentences. In natural languages (Spanish, English, etc.), each sentence is made up of words, which have some intrinsic meaning, and there are rules that describe which sequences of words are valid.
 14 | 
 15 | Some of these rules, which we often call "syntactic", are just about the structure of words and sentences, and not their meaning--like how nouns and adjectives must match in gender and number or how verbs connect to adverbs and other modifiers. Other rules, which we call "semantic", deal with the valid meanings of collections of words--the reason why the sentence "the salad was happy" is perfectly valid syntactically but makes no sense. In linguistics, the set of rules that determine which sentences are valid is called a "grammar".
 16 | 
 17 | In formal language theory, we want to make all these notions as precise as possible in mathematical terms. To achieve so, we will have to make some simplifications which, ultimately, will imply that natural languages fall outside the scope of what formal language theory can fully study. But these simplifications will enable us to define a very robust notion of language for which we can make pretty strong theoretical claims.
 18 | 
 19 | So let's build this definition from the ground up, starting with our notion of words, or, formally, symbols:
 20 | 
 21 | ::: {#def-symbol}
 22 | ### Symbol
 23 | 
 24 | A symbol is an atomic element that has an intrinsic meaning.
 25 | :::
 26 | 
 27 | Examples of symbols in abstract languages might be single letters like `a`, `b` or `c`. In programming languages, a symbol might be a variable name, a number, or a keyword like `for` or `class`. The next step is to define sentences:
 28 | 
 29 | ::: {#def-sentence}
 30 | ### Sentence
 31 | 
 32 | A sentence (alternatively called a _string_) is a finite sequence of symbols.
 33 | :::
 34 | 
 35 | An example of a sentence formed with the symbols `a` and `b` is `abba`. In a programming language like C# or Python, a sentence can be anything from a single expression to a full program.
 36 | 
 37 | One special string is the _empty string_, which has zero symbols, and will often bite us in proofs. It is often denoted as $\epsilon$.
 38 | 
 39 | We are almost ready to define a language. But before, we need to define a "vocabulary", which is just a collection of valid symbols.
 40 | 
 41 | ::: {#def-vocabulary}
 42 | ### Vocabulary
 43 | 
 44 | A vocabulary $V$ is a finite set of symbols.
 45 | :::
 46 | 
 47 | An example of a vocabulary is $\{ a,b,c \}$, which contains three symbols. In a programming language like Python, a sensible vocabulary would be something like $\{ \mathrm{for}, \mathrm{while}, \mathrm{def}, \mathrm{class}, ... \}$ containing all keywords, but also symbols like `+`, `.`, etc.
 48 | 
 49 | ::: {.callout-note}
 50 | # What about identifiers?
 51 | 
 52 | If you think about our definition of vocabulary for a little bit, you'll notice we defined it as a _finite_ set of symbols. At the same time, I'm claiming that things like variable and function names, and all identifiers in general, will end up being part of the vocabulary in programming languages. However, there are infinitely many valid identifiers, so... how does that work?
 53 | 
 54 | The solution to this problem is that we will actually deal with _two_ different languages, on two different levels. We will define a first language for the _tokens_, which just determines what types of identifiers, numbers, etc., are valid. Then the actual programming language will be defined based on the _types_ of tokens available. So, all numbers are the same token, all identifiers are another token, and so on.
 55 | :::
 56 | 
 57 | Given a concrete vocabulary, we can then define a language as a (possibly infinite) subset of all the sentences that can be formed with the symbols from that vocabulary.
 58 | 
 59 | ::: {#def-language}
 60 | ### Language
 61 | 
 62 | Given a vocabulary $V$, a language $L$ is a set of sentences with symbols taken from $V$.
 63 | :::
 64 | 
 65 | Let's see some examples.
 66 | 
 67 | ## Examples of languages
 68 | 
 69 | To illustrate how rich languages can be, let's define a simple vocabulary with just two symbols, $V = \{a,b\}$, and see how many interesting languages we can come up with.
 70 | 
 71 | The simplest possible language in any vocabulary is the singleton language whose only sentence is formed by a single symbol from the vocabulary. For example, $L_a=\{a\}$ or $L_b = \{b\}$. This is, of course, rather useless, so let's keep up.
 72 | 
 73 | We can also define what's called a _finite_ language, which is just a collection of a few (or perhaps many) specific strings. For example,
 74 | $$L_1 = \{bab, abba, ababa, babba\}$$
 75 | 
 76 | ::: {.callout-note}
 77 | Since languages are sets, there is no intrinsic order to the sentences in a language. For visualization purposes, we will often sort sentences in a language in shortest-to-longest, and then lexicographic order, assuming there is a natural order for the symbols. But this is just one arbitrary way of doing it.
 78 | :::
 79 | 
 80 | Now we can enter the realm of _infinite_ languages. Even when the vocabulary is finite, and each sentence itself is also a finite sequence of symbols, we can have infinitely many different sentences in a language. If you need to convince yourself of this claim, think about the language of natural numbers: every natural number is a finite sequence of, at most, 10 different digits, and yet, we have infinitely many natural numbers because we can always take a number and add a digit at the end to make a new one.
 81 | 
 82 | In the same sense, we can have infinite languages simply by concatenating symbols from the vocabulary _ad infinitum_. The most straightforward infinite language we can make from an arbitrary vocabulary $V$ is called the _universe_ language, and it's just the collection of all possible strings one can form with symbols from $V$.
 83 | 
 84 | ::: {#def-universe}
 85 | ### Universe language
 86 | 
 87 | Given a vocabulary $V$, the universe language, denoted $V^*$, is the set of all possible strings that can be formed with symbols from $V$.
 88 | :::
 89 | 
 90 | An extensional representation of a finite portion of $V^*$ would be:
 91 | 
 92 | $$V^* = \{\epsilon,a,b,aa,ab,ba,bb,aaa,aab,aba,abb,baa,bab,bba,bbb,...\}$$
 93 | 
 94 | We can now easily see that an alternative definition of language could be any subset of the universe language of a given vocabulary $V$.
 95 | 
 96 | Now let's take it up a notch. We can come up with a gazillion languages just involving $a$ and $b$, by concocting different relationships between the symbols.
 97 | For this, we will need some way to describe the languages that doesn't require listing all the elements--as they are infinitely many. We can do it with natural language, of course, but in the long run it will pay to be slightly more formal when describing infinite languages.
 98 | 
 99 | For example, let $L_2$ be the language of strings over the alphabet $V=\{a,b\}$ that have the exact same number of $a$'s and $b$'s.
100 | 
101 | $$L_2 = \{\epsilon, ab, aabb, abab, baba, baab, abba, ...\}$$
102 | 
103 | We can define it with a bit of math syntax sugar as follows:
104 | 
105 | $$L_2 = \{ \omega \in \{a,b\}^* | \#(a,\omega) = \#(b,\omega) \}$$
106 | 
107 | Let's unpack this definition. We start by saying, $\omega \in \{a,b\}^*$, which literally parses as "strings $\omega$ in the universe language of the vocabulary $\{a,b\}$", but is just standard jargon to say "strings made out of $a$ and $b$". Then we add the conditional part $\#(a,\omega) = \#(b,\omega)$ which should be pretty straightforward: we are using the $\#(\mathrm{<symbol>},\mathrm{<string>})$ notation to denote the function that counts a given symbol in a string.
108 | 
109 | $L_2$ is slightly more interesting than $V^*$ because it introduces the notion that _a formal language is equivalent to some computation_. This insight is the fundamental idea that links formal languages and computability theory, and we will formalize this idea in the next section. But first, let's see other, even more interesting languages, to solidify this intuition that _languages equal computation_.
110 | 
111 | Let's define $L_3$ as the language of all strings in $V^*$ where the number of $a$'s is a prime factor of the number of $b$'s. Intuitively, working with this language--e.g., finding valid strings--will require us to solve prime factoring, as any question about $L_3$ that has different answers for strings in $L_3$ than for strings not in $L_3$ will necessarily go through what it means for a number to be a prime factor of another.
112 | 
113 | But it gets better. We can define the language of all strings made out of $a$ and $b$ such that, when interpreting $a$ as $0$ and $b$ as $1$, the resulting binary number has any property we want. We can thus codify all problems in number theory as problems in formal language theory.
114 | 
115 | And, as you can probably understand already, we can easily codify _any_ mathematical problem, not just about number theory. Ultimately, we can define a language as the set of strings that are valid input/output pairs for any specific problem we can come up with. Let's make this intuition formal.
116 | 
117 | ## Recognizing a language
118 | 
119 | The central problem in formal language theory is called _the word problem_. Intuitively, it is about determining whether a given string is part of a language, or not. Formally:
120 | 
121 | ::: {#def-word-problem}
122 | ### The Word Problem
123 | 
124 | Given a language $L$ on some vocabulary $V$, the word problem is defined as devising a procedure that, for any string $\omega \in V^*$, determines whether $\omega \in L$.
125 | :::
126 | 
127 | Notice that we didn't define the word problem simply as "given a language $L$ and a string $\omega$, is $\omega \in L$". Why? Because we might be able to answer that question correctly _only_ for some $\omega$, but not all. Instead, the word problem is coming up with an algorithm that answers for _all_ possible strings $\omega$--technically, a _procedure_, which is not exactly the same; we will see the details in @sec-computability.
128 | 
129 | The word problem is the most important question in formal language theory, and one of the central problems in computer science in general. So much so, that we actually classify languages (and by extension, all computer science problems) according to how easy or hard it is to solve their related word problem.
130 | 
131 | In the next few chapters, we will review different _classes_ of languages that have certain common characteristics which make them, in a sense, equally complex. But first, let's see what it would take to solve the word problem in our example languages.
132 | 
133 | Solving the word problem in any finite language is trivial. You only need to iterate through all of the strings in the language. The word problem becomes way more interesting when we have infinite languages. In these cases, we need to define a _recognizer mechanism_, that is, some sort of computational algorithm or procedure to determine whether any particular string is part of the language.
134 | 
135 | For example, language $L_2$ has a very simple solution to the word problem. The following Python program gets the job done:
136 | 
137 | ```{python}
138 | #| lst-label: lst-recognize-l2
139 | #| lst-cap: Recognizing strings from $L_2$.
def recognize_l2(s):
    """Solve the word problem for L_2 in a single pass over ``s``.

    L_2 is the language of strings over the vocabulary {a, b} that contain
    exactly as many ``a`` symbols as ``b`` symbols.

    Returns True if ``s`` belongs to L_2 and False otherwise. A string that
    contains any symbol other than ``a`` or ``b`` is not in the universe
    language of the vocabulary, so it cannot be in L_2 either.
    """
    a, b = 0, 0

    for c in s:
        if c == "a":
            a += 1
        elif c == "b":
            b += 1
        else:
            # Symbol outside the vocabulary: reject instead of silently
            # counting it as a "b".
            return False

    return a == b
150 | ```
151 | 
152 | A fundamental question in formal language theory is not only coming up with a solution to the word problem for a given language but, actually, coming up with the _simplest_ solution--for a very specific definition of _simple_: how much do you need to remember. In other words: _what kind of algorithms can solve the word problem for what kind of languages?_
153 | 
154 | For example, we can solve $L_2$ with $O(n)$ memory. That is, we need to remember something that is proportional to how many $a$'s and $b$'s are in the string. And we cannot solve it with anything less than that, as we will prove a couple chapters down the road.
155 | 
156 | Now, let's turn to the opposite problem, that of generating strings from a given language, and wonder what, if any, is the connection between these two.
157 | 
158 | ## Generating a language
159 | 
160 | Suppose you want to generate all strings from a language like $L_2$. To make things simpler, let's redefine it as $L_2'$, the language of strings over $\{a,b\}$ with the same number of $a$'s and $b$'s but where all $a$'s come before all $b$'s. This means $aabb$ is a valid string in $L_2'$, but $abba$ is not. This language is also called $a^n b^n$, that is, $n$ symbols $a$ followed by $n$ symbols $b$.
161 | 
162 | Here is a simple Python method that generates infinitely many strings from $L_2'$:
163 | 
164 | ```{python}
165 | #| lst-cap: Generating language $L_2'$.
166 | #| lst-label: lst-generate-l2
def generate_l2():
    """Yield the strings a^n b^n for n = 0, 1, 2, ... without ever stopping.

    The first string produced is the empty string; each following string
    has one more "a" at the front and one more "b" at the back.
    """
    n = 0

    while True:
        yield "a" * n + "b" * n
        n += 1
173 | ```
174 | 
175 | Here are the first few iterations:
176 | 
177 | ```{python}
178 | #| echo: false
# Print the first five generated strings, one per line, wrapped in repr()
# so that the empty string is visible in the output.
l2 = generate_l2()

print("\n".join(repr(next(l2)) for _ in range(5)))
183 | ```
184 | 
185 | 
186 | Let's unpack this. We start with the empty string $\epsilon$, defined in code as `s = ""`. Then, we enter an infinite cycle where we yield the current string, and then attach an $a$ to the front and a $b$ to the back.
187 | Take a moment to convince yourself that _any_ string in the form $a^n b^n$ is eventually generated by this method and, furthermore, _only_ those strings are generated by the method.
188 | 
189 | This method is actually pretty neat because it not only generates (eventually) all of $a^n b^n$; it does so in increasing length order. It isn't immediately obvious why this is such a good thing but here's a bold claim: if you have a generating method for any language $L$, then you have a recognizing method too.
190 | 
191 | Wait, what!? Yep, you heard it right. And actually, it goes both ways. If you have a recognizing algorithm, you also have a generating one. Let's make this our first theorem in formal language theory.
192 | 
193 | ::: {#thm-generation-recognition}
194 | Let $L$ be a formal language. There exists an algorithm $A$ for generating all strings in $L$ (in increasing length order) if and only if there also exists another algorithm $A'$ for solving the word problem in $L$.
195 | :::
196 | 
197 | ::: {.proof}
198 | To prove this, let's first understand what the theorem is saying. If we have an algorithm $A$ that generates all strings in a language, we can also come up with another algorithm $A'$ (presumably using $A$) that solves the word problem, and vice versa.
199 | 
200 | To prove this type of theorem, the most usual approach is to assume you have $A$ (or $A'$) as some kind of abstract, black-box algorithm, and try to construct the other. Let's do it from generation to recognition first, as the other way around will be fairly easy once this is done.
201 | 
202 | $\Rightarrow$ Suppose we have an algorithm $A$ that generates all strings in $L$, and we are given an arbitrary string $\omega$. Let $n = |\omega|$ be the length of $\omega$. We just need to run $A$ until we either see $\omega$, in which case the answer is true ($\omega \in L$) or until we see one string with length greater than $n$, in which case the answer is false ($\omega \notin L$). Since $A$ generates strings in increasing length order, one of these must happen in a finite time for any $\omega$.
203 | 
204 | Here is a suitable code:
205 | 
206 | ```python
def recognize_L(w):
    """Decide whether ``w`` is in L by scanning the output of ``generate_L``.

    Because the generator produces strings in increasing length order, the
    scan can stop with a negative answer as soon as a string longer than
    ``w`` shows up.
    """
    limit = len(w)

    for candidate in generate_L():
        if len(candidate) > limit:
            # Every remaining string is longer than w, so w never appears.
            return False

        if candidate == w:
            return True

    # The generator ran out of strings without producing w.
    return False
216 | ```
217 | 
218 | Now let's do the other way around.
219 | 
220 | $\Leftarrow$ Suppose we have an algorithm $A'$ that solves the word problem for $L$. Then we do the following. Let $V$ be the vocabulary of $L$, and $V^*$ the universe language associated with it. We can very easily code a generating algorithm $A^*$ for $V^*$ in increasing length order, simply by enumerating all combinations of symbols of length $0, 1, 2, \ldots$ Now, run $A^*$ and, for each string $\omega$ generated, run $A'(\omega)$. If the output is true, then yield $\omega$. Otherwise, skip it.
221 | 
222 | Here is a suitable code:
223 | 
224 | ```python
def generate_L():
    """Generate every string of L in increasing length order.

    Enumerates the universe language V* over the vocabulary of L, shortest
    strings first, and yields only the strings accepted by ``recognize_L``
    (the black-box solution to the word problem for L).
    """
    from itertools import count, product

    V = ... # vocabulary of L

    def generate_L_star():
        # Enumerate V* length by length: the empty string first, then all
        # strings of length 1, then all strings of length 2, and so on.
        # This is done iteratively so the call depth stays constant no
        # matter how long the generated strings become.
        yield ""

        symbols = tuple(V)
        if not symbols:
            # With an empty vocabulary, V* contains only the empty string.
            return

        for n in count(1):
            for combination in product(symbols, repeat=n):
                yield "".join(combination)

    for s in generate_L_star():
        if recognize_L(s):
            yield s
246 | ```
247 | 
248 | :::
249 | 
250 | So there you have it. Generating (in increasing order) and recognizing are two faces of the same problem. Cool, right? But why does this matter? For starters, it gives us a tremendously powerful connection between two sub-branches of formal language theory that we will explore in the following chapters.
251 | 
252 | ## Moving on
253 | 
254 | We are just scratching the surface of what formal language theory can do, and we have already touched upon several areas of computer science.
255 | 
256 | We have defined a super general notion (language) that is ultimately as profound and powerful as the very notion of algorithm. We have identified a central problem in formal language theory (the word problem) that is as deep as the very question of what problems can be solved, _at all_, with a computer. We connected two fundamental problems in languages (recognizing and generating) and discovered they are but two sides of the same coin. And we left hanging the question of which languages can be solved with which types of algorithms, which is ultimately a question about complexity theory. Phew!
257 | 
258 | In the next few chapters we will continue exploring the world of formal languages. We will dive into the different classes of languages, according to the complexity of their generating and recognizing algorithms. We will find many intriguing unsolvable problems that have deep connections with other areas in computer science, from the most practical to the most esoteric. When we finish this dive, we will have a much more solid understanding of what computers can ultimately do. And then, we will turn to programming languages and apply all these ideas to solving the more practical problem of actually building a compiler.
259 | 
260 | Buckle up!
261 | 
262 | ## Questions and Exercises {.unnumbered}
263 | 
264 | 1. For each of the following languages, write the following:
265 |    - A formal definition in mathematical notation.
266 |    - A generating algorithm (in any programming language) that produces all strings in increasing length order.
267 |    - A recognizing algorithm (in any programming language) that requires only one pass for any string.
268 | 
269 |    a) Strings made with $a$ and $b$ where the number of $a$'s is strictly less than the number of $b$'s.
270 |    b) Strings that represent binary numbers divisible by two (without converting the string to a number.)
271 |    c) Strings that represent binary numbers divisible by three (without converting the string to a number.)
272 |    d) Strings of well-formed parentheses (every open parenthesis has its matching closing parenthesis, and there are no prefixes with more closing than opening parentheses).
273 |    e) Strings made of $a$, $b$, and $c$ with the same number of each symbol.
274 | 
275 | 2. Argue why, in @thm-generation-recognition, we require that a generating algorithm produces the strings in length-increasing order. If that wasn't the case, could we find a recognizing algorithm? Why, or why not? In any case, what is the best we could have?
276 | 


--------------------------------------------------------------------------------
/pages/references.bib:
--------------------------------------------------------------------------------
 1 | @article{knuth84,
 2 |   author = {Knuth, Donald E.},
 3 |   title = {Literate Programming},
 4 |   year = {1984},
 5 |   issue_date = {May 1984},
 6 |   publisher = {Oxford University Press, Inc.},
 7 |   address = {USA},
 8 |   volume = {27},
 9 |   number = {2},
10 |   issn = {0010-4620},
11 |   url = {https://doi.org/10.1093/comjnl/27.2.97},
12 |   doi = {10.1093/comjnl/27.2.97},
13 |   journal = {Comput. J.},
14 |   month = may,
15 |   pages = {97–111},
16 |   numpages = {15}
17 | }
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "hulk"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.13"
7 | dependencies = []
8 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | certifi==2022.9.24
 2 | charset-normalizer==2.1.1
 3 | click==8.1.3
 4 | ghp-import==2.1.0
 5 | idna==3.4
 6 | Jinja2==3.1.2
 7 | Markdown==3.3.7
 8 | MarkupSafe==2.1.1
 9 | mergedeep==1.3.4
10 | mkdocs==1.4.0
11 | mkdocs-material==8.5.6
12 | mkdocs-material-extensions==1.0.3
13 | packaging==21.3
14 | Pygments==2.13.0
15 | pymdown-extensions==9.6
16 | pyparsing==3.0.9
17 | python-dateutil==2.8.2
18 | PyYAML==6.0
19 | pyyaml_env_tag==0.1
20 | requests==2.28.1
21 | six==1.16.0
22 | urllib3==1.26.12
23 | watchdog==2.1.9
24 | 


--------------------------------------------------------------------------------