├── .gitignore
├── Gemfile
├── Gemfile.lock
├── LICENSE
├── Makefile
├── README.md
├── _config.yml
├── _includes
    └── image.html
├── _layouts
    └── default.html
├── _parts
    ├── part1.md
    ├── part10.md
    ├── part11.md
    ├── part12.md
    ├── part13.md
    ├── part14.md
    ├── part15.md
    ├── part2.md
    ├── part3.md
    ├── part4.md
    ├── part5.md
    ├── part6.md
    ├── part7.md
    ├── part8.md
    └── part9.md
├── assets
    ├── css
    │   └── style.scss
    └── images
    │   ├── B-tree.png
    │   ├── arch-part5.gif
    │   ├── arch1.gif
    │   ├── arch2.gif
    │   ├── btree1.png
    │   ├── btree2.png
    │   ├── btree3.png
    │   ├── btree4.png
    │   ├── btree5.png
    │   ├── btree6.png
    │   ├── code-crafters.jpeg
    │   ├── file-format.png
    │   ├── internal-node-format.png
    │   ├── leaf-node-format.png
    │   ├── splitting-internal-node.png
    │   └── updating-internal-node.png
├── db.c
├── feed.xml
├── index.md
└── spec
    └── main_spec.rb


/.gitignore:
--------------------------------------------------------------------------------
1 | db
2 | *.db
3 | _site/
4 | TODO
5 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | gem 'github-pages', group: :jekyll_plugins
3 | gem 'webrick'
4 | gem "jekyll-theme-minimal"
5 | gem "rspec"
6 | 


--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
  1 | GEM
  2 |   remote: https://rubygems.org/
  3 |   specs:
  4 |     activesupport (7.0.4.3)
  5 |       concurrent-ruby (~> 1.0, >= 1.0.2)
  6 |       i18n (>= 1.6, < 2)
  7 |       minitest (>= 5.1)
  8 |       tzinfo (~> 2.0)
  9 |     addressable (2.8.4)
 10 |       public_suffix (>= 2.0.2, < 6.0)
 11 |     coffee-script (2.4.1)
 12 |       coffee-script-source
 13 |       execjs
 14 |     coffee-script-source (1.11.1)
 15 |     colorator (1.1.0)
 16 |     commonmarker (0.23.9)
 17 |     concurrent-ruby (1.2.2)
 18 |     diff-lcs (1.5.0)
 19 |     dnsruby (1.70.0)
 20 |       simpleidn (~> 0.2.1)
 21 |     em-websocket (0.5.3)
 22 |       eventmachine (>= 0.12.9)
 23 |       http_parser.rb (~> 0)
 24 |     ethon (0.16.0)
 25 |       ffi (>= 1.15.0)
 26 |     eventmachine (1.2.7)
 27 |     eventmachine (1.2.7-x86-mingw32)
 28 |     execjs (2.8.1)
 29 |     faraday (2.7.4)
 30 |       faraday-net_http (>= 2.0, < 3.1)
 31 |       ruby2_keywords (>= 0.0.4)
 32 |     faraday-net_http (3.0.2)
 33 |     ffi (1.15.5)
 34 |     ffi (1.15.5-x86-mingw32)
 35 |     forwardable-extended (2.6.0)
 36 |     gemoji (3.0.1)
 37 |     github-pages (228)
 38 |       github-pages-health-check (= 1.17.9)
 39 |       jekyll (= 3.9.3)
 40 |       jekyll-avatar (= 0.7.0)
 41 |       jekyll-coffeescript (= 1.1.1)
 42 |       jekyll-commonmark-ghpages (= 0.4.0)
 43 |       jekyll-default-layout (= 0.1.4)
 44 |       jekyll-feed (= 0.15.1)
 45 |       jekyll-gist (= 1.5.0)
 46 |       jekyll-github-metadata (= 2.13.0)
 47 |       jekyll-include-cache (= 0.2.1)
 48 |       jekyll-mentions (= 1.6.0)
 49 |       jekyll-optional-front-matter (= 0.3.2)
 50 |       jekyll-paginate (= 1.1.0)
 51 |       jekyll-readme-index (= 0.3.0)
 52 |       jekyll-redirect-from (= 0.16.0)
 53 |       jekyll-relative-links (= 0.6.1)
 54 |       jekyll-remote-theme (= 0.4.3)
 55 |       jekyll-sass-converter (= 1.5.2)
 56 |       jekyll-seo-tag (= 2.8.0)
 57 |       jekyll-sitemap (= 1.4.0)
 58 |       jekyll-swiss (= 1.0.0)
 59 |       jekyll-theme-architect (= 0.2.0)
 60 |       jekyll-theme-cayman (= 0.2.0)
 61 |       jekyll-theme-dinky (= 0.2.0)
 62 |       jekyll-theme-hacker (= 0.2.0)
 63 |       jekyll-theme-leap-day (= 0.2.0)
 64 |       jekyll-theme-merlot (= 0.2.0)
 65 |       jekyll-theme-midnight (= 0.2.0)
 66 |       jekyll-theme-minimal (= 0.2.0)
 67 |       jekyll-theme-modernist (= 0.2.0)
 68 |       jekyll-theme-primer (= 0.6.0)
 69 |       jekyll-theme-slate (= 0.2.0)
 70 |       jekyll-theme-tactile (= 0.2.0)
 71 |       jekyll-theme-time-machine (= 0.2.0)
 72 |       jekyll-titles-from-headings (= 0.5.3)
 73 |       jemoji (= 0.12.0)
 74 |       kramdown (= 2.3.2)
 75 |       kramdown-parser-gfm (= 1.1.0)
 76 |       liquid (= 4.0.4)
 77 |       mercenary (~> 0.3)
 78 |       minima (= 2.5.1)
 79 |       nokogiri (>= 1.13.6, < 2.0)
 80 |       rouge (= 3.26.0)
 81 |       terminal-table (~> 1.4)
 82 |     github-pages-health-check (1.17.9)
 83 |       addressable (~> 2.3)
 84 |       dnsruby (~> 1.60)
 85 |       octokit (~> 4.0)
 86 |       public_suffix (>= 3.0, < 5.0)
 87 |       typhoeus (~> 1.3)
 88 |     html-pipeline (2.14.3)
 89 |       activesupport (>= 2)
 90 |       nokogiri (>= 1.4)
 91 |     http_parser.rb (0.8.0)
 92 |     i18n (1.13.0)
 93 |       concurrent-ruby (~> 1.0)
 94 |     jekyll (3.9.3)
 95 |       addressable (~> 2.4)
 96 |       colorator (~> 1.0)
 97 |       em-websocket (~> 0.5)
 98 |       i18n (>= 0.7, < 2)
 99 |       jekyll-sass-converter (~> 1.0)
100 |       jekyll-watch (~> 2.0)
101 |       kramdown (>= 1.17, < 3)
102 |       liquid (~> 4.0)
103 |       mercenary (~> 0.3.3)
104 |       pathutil (~> 0.9)
105 |       rouge (>= 1.7, < 4)
106 |       safe_yaml (~> 1.0)
107 |     jekyll-avatar (0.7.0)
108 |       jekyll (>= 3.0, < 5.0)
109 |     jekyll-coffeescript (1.1.1)
110 |       coffee-script (~> 2.2)
111 |       coffee-script-source (~> 1.11.1)
112 |     jekyll-commonmark (1.4.0)
113 |       commonmarker (~> 0.22)
114 |     jekyll-commonmark-ghpages (0.4.0)
115 |       commonmarker (~> 0.23.7)
116 |       jekyll (~> 3.9.0)
117 |       jekyll-commonmark (~> 1.4.0)
118 |       rouge (>= 2.0, < 5.0)
119 |     jekyll-default-layout (0.1.4)
120 |       jekyll (~> 3.0)
121 |     jekyll-feed (0.15.1)
122 |       jekyll (>= 3.7, < 5.0)
123 |     jekyll-gist (1.5.0)
124 |       octokit (~> 4.2)
125 |     jekyll-github-metadata (2.13.0)
126 |       jekyll (>= 3.4, < 5.0)
127 |       octokit (~> 4.0, != 4.4.0)
128 |     jekyll-include-cache (0.2.1)
129 |       jekyll (>= 3.7, < 5.0)
130 |     jekyll-mentions (1.6.0)
131 |       html-pipeline (~> 2.3)
132 |       jekyll (>= 3.7, < 5.0)
133 |     jekyll-optional-front-matter (0.3.2)
134 |       jekyll (>= 3.0, < 5.0)
135 |     jekyll-paginate (1.1.0)
136 |     jekyll-readme-index (0.3.0)
137 |       jekyll (>= 3.0, < 5.0)
138 |     jekyll-redirect-from (0.16.0)
139 |       jekyll (>= 3.3, < 5.0)
140 |     jekyll-relative-links (0.6.1)
141 |       jekyll (>= 3.3, < 5.0)
142 |     jekyll-remote-theme (0.4.3)
143 |       addressable (~> 2.0)
144 |       jekyll (>= 3.5, < 5.0)
145 |       jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0)
146 |       rubyzip (>= 1.3.0, < 3.0)
147 |     jekyll-sass-converter (1.5.2)
148 |       sass (~> 3.4)
149 |     jekyll-seo-tag (2.8.0)
150 |       jekyll (>= 3.8, < 5.0)
151 |     jekyll-sitemap (1.4.0)
152 |       jekyll (>= 3.7, < 5.0)
153 |     jekyll-swiss (1.0.0)
154 |     jekyll-theme-architect (0.2.0)
155 |       jekyll (> 3.5, < 5.0)
156 |       jekyll-seo-tag (~> 2.0)
157 |     jekyll-theme-cayman (0.2.0)
158 |       jekyll (> 3.5, < 5.0)
159 |       jekyll-seo-tag (~> 2.0)
160 |     jekyll-theme-dinky (0.2.0)
161 |       jekyll (> 3.5, < 5.0)
162 |       jekyll-seo-tag (~> 2.0)
163 |     jekyll-theme-hacker (0.2.0)
164 |       jekyll (> 3.5, < 5.0)
165 |       jekyll-seo-tag (~> 2.0)
166 |     jekyll-theme-leap-day (0.2.0)
167 |       jekyll (> 3.5, < 5.0)
168 |       jekyll-seo-tag (~> 2.0)
169 |     jekyll-theme-merlot (0.2.0)
170 |       jekyll (> 3.5, < 5.0)
171 |       jekyll-seo-tag (~> 2.0)
172 |     jekyll-theme-midnight (0.2.0)
173 |       jekyll (> 3.5, < 5.0)
174 |       jekyll-seo-tag (~> 2.0)
175 |     jekyll-theme-minimal (0.2.0)
176 |       jekyll (> 3.5, < 5.0)
177 |       jekyll-seo-tag (~> 2.0)
178 |     jekyll-theme-modernist (0.2.0)
179 |       jekyll (> 3.5, < 5.0)
180 |       jekyll-seo-tag (~> 2.0)
181 |     jekyll-theme-primer (0.6.0)
182 |       jekyll (> 3.5, < 5.0)
183 |       jekyll-github-metadata (~> 2.9)
184 |       jekyll-seo-tag (~> 2.0)
185 |     jekyll-theme-slate (0.2.0)
186 |       jekyll (> 3.5, < 5.0)
187 |       jekyll-seo-tag (~> 2.0)
188 |     jekyll-theme-tactile (0.2.0)
189 |       jekyll (> 3.5, < 5.0)
190 |       jekyll-seo-tag (~> 2.0)
191 |     jekyll-theme-time-machine (0.2.0)
192 |       jekyll (> 3.5, < 5.0)
193 |       jekyll-seo-tag (~> 2.0)
194 |     jekyll-titles-from-headings (0.5.3)
195 |       jekyll (>= 3.3, < 5.0)
196 |     jekyll-watch (2.2.1)
197 |       listen (~> 3.0)
198 |     jemoji (0.12.0)
199 |       gemoji (~> 3.0)
200 |       html-pipeline (~> 2.2)
201 |       jekyll (>= 3.0, < 5.0)
202 |     kramdown (2.3.2)
203 |       rexml
204 |     kramdown-parser-gfm (1.1.0)
205 |       kramdown (~> 2.0)
206 |     liquid (4.0.4)
207 |     listen (3.8.0)
208 |       rb-fsevent (~> 0.10, >= 0.10.3)
209 |       rb-inotify (~> 0.9, >= 0.9.10)
210 |     mercenary (0.3.6)
211 |     mini_portile2 (2.8.2)
212 |     minima (2.5.1)
213 |       jekyll (>= 3.5, < 5.0)
214 |       jekyll-feed (~> 0.9)
215 |       jekyll-seo-tag (~> 2.1)
216 |     minitest (5.18.0)
217 |     nokogiri (1.15.1)
218 |       mini_portile2 (~> 2.8.2)
219 |       racc (~> 1.4)
220 |     nokogiri (1.15.1-arm64-darwin)
221 |       racc (~> 1.4)
222 |     nokogiri (1.15.1-x86-mingw32)
223 |       racc (~> 1.4)
224 |     nokogiri (1.15.1-x86_64-linux)
225 |       racc (~> 1.4)
226 |     octokit (4.25.1)
227 |       faraday (>= 1, < 3)
228 |       sawyer (~> 0.9)
229 |     pathutil (0.16.2)
230 |       forwardable-extended (~> 2.6)
231 |     public_suffix (4.0.7)
232 |     racc (1.6.2)
233 |     rb-fsevent (0.11.2)
234 |     rb-inotify (0.10.1)
235 |       ffi (~> 1.0)
236 |     rexml (3.2.5)
237 |     rouge (3.26.0)
238 |     rspec (3.12.0)
239 |       rspec-core (~> 3.12.0)
240 |       rspec-expectations (~> 3.12.0)
241 |       rspec-mocks (~> 3.12.0)
242 |     rspec-core (3.12.2)
243 |       rspec-support (~> 3.12.0)
244 |     rspec-expectations (3.12.3)
245 |       diff-lcs (>= 1.2.0, < 2.0)
246 |       rspec-support (~> 3.12.0)
247 |     rspec-mocks (3.12.5)
248 |       diff-lcs (>= 1.2.0, < 2.0)
249 |       rspec-support (~> 3.12.0)
250 |     rspec-support (3.12.0)
251 |     ruby2_keywords (0.0.5)
252 |     rubyzip (2.3.2)
253 |     safe_yaml (1.0.5)
254 |     sass (3.7.4)
255 |       sass-listen (~> 4.0.0)
256 |     sass-listen (4.0.0)
257 |       rb-fsevent (~> 0.9, >= 0.9.4)
258 |       rb-inotify (~> 0.9, >= 0.9.7)
259 |     sawyer (0.9.2)
260 |       addressable (>= 2.3.5)
261 |       faraday (>= 0.17.3, < 3)
262 |     simpleidn (0.2.1)
263 |       unf (~> 0.1.4)
264 |     terminal-table (1.8.0)
265 |       unicode-display_width (~> 1.1, >= 1.1.1)
266 |     typhoeus (1.4.0)
267 |       ethon (>= 0.9.0)
268 |     tzinfo (2.0.6)
269 |       concurrent-ruby (~> 1.0)
270 |     unf (0.1.4)
271 |       unf_ext
272 |     unf_ext (0.0.8.2)
273 |     unf_ext (0.0.8.2-x86-mingw32)
274 |     unicode-display_width (1.8.0)
275 |     webrick (1.8.1)
276 | 
277 | PLATFORMS
278 |   arm64-darwin-21
279 |   x86-mingw32
280 |   x86-mswin32-60
281 |   x86_64-linux
282 | 
283 | DEPENDENCIES
284 |   github-pages
285 |   jekyll-theme-minimal
286 |   rspec
287 |   webrick
288 | 
289 | BUNDLED WITH
290 |    2.2.33
291 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Connor Stack
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | db: db.c
 2 | 	gcc db.c -o db
 3 | 
 4 | run: db
 5 | 	./db mydb.db
 6 | 
 7 | clean:
 8 | 	rm -f db *.db
 9 | 
10 | test: db
11 | 	bundle exec rspec
12 | 
13 | format: *.c
14 | 	clang-format -style=Google -i *.c


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Let's Build a Simple Database
 2 | 
 3 | [View rendered tutorial](https://cstack.github.io/db_tutorial/) (with more details on what this is.)
 4 | 
 5 | ## Notes to myself
 6 | 
 7 | Run site locally:
 8 | ```
 9 | bundle exec jekyll serve
10 | ```


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
 1 | theme: jekyll-theme-minimal
 2 | title: Let's Build a Simple Database
 3 | description: Writing a sqlite clone from scratch in C
 4 | google_analytics: UA-105767942-1
 5 | collections:
 6 |   parts:
 7 |     output: true
 8 | defaults:
 9 |   - scope:
10 |       path: ""
11 |       type: parts
12 |     values:
13 |       layout: default
14 | 


--------------------------------------------------------------------------------
/_includes/image.html:
--------------------------------------------------------------------------------
1 | <table class="image">
2 | <caption align="bottom">{{ include.description }}</caption>
3 | <tr><td><a href="{{site.url}}{{site.baseurl}}/{{ include.url }}"><img src="{{site.url}}{{site.baseurl}}/{{ include.url }}" alt="{{ include.description }}"/></a></td></tr>
4 | </table>
5 | 


--------------------------------------------------------------------------------
/_layouts/default.html:
--------------------------------------------------------------------------------
  1 | <!doctype html>
  2 | <html lang="{{ site.lang | default: "en-US" }}">
  3 |   <head>
  4 |     <meta charset="utf-8">
  5 |     <meta http-equiv="X-UA-Compatible" content="chrome=1">
  6 | 
  7 | {% seo %}
  8 | 
  9 |     <link rel="stylesheet" href="{{ '/assets/css/style.css?v=' | append: site.github.build_revision | relative_url }}">
 10 |     <meta name="viewport" content="width=device-width">
 11 |     <!--[if lt IE 9]>
 12 |     <script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
 13 |     <![endif]-->
 14 |   </head>
 15 |   <body>
 16 |     <div class="wrapper">
 17 |       <header>
 18 |         <h1>{{ site.title | default: site.github.repository_name }}</h1>
 19 |         <p>{{ site.description | default: site.github.project_tagline }}</p>
 20 | 
 21 |         <p class="view"><a href="{{ site.baseurl }}/">Overview</a></p>
 22 | 
 23 |         {% if site.github.is_project_page %}
 24 |           <p class="view"><a href="{{ site.github.repository_url }}">View on GitHub (pull requests welcome)</a></p>
 25 |         {% endif %}
 26 | 
 27 |         {% if site.github.is_user_page %}
 28 |           <p class="view"><a href="{{ site.github.owner_url }}">View My GitHub Profile</a></p>
 29 |         {% endif %}
 30 | 
 31 |         {% if site.show_downloads %}
 32 |           <ul>
 33 |             <li><a href="{{ site.github.zip_url }}">Download <strong>ZIP File</strong></a></li>
 34 |             <li><a href="{{ site.github.tar_url }}">Download <strong>TAR Ball</strong></a></li>
 35 |             <li><a href="{{ site.github.repository_url }}">View On <strong>GitHub</strong></a></li>
 36 |           </ul>
 37 |         {% endif %}
 38 | 
 39 |         <div style="color:#721c24;background-color:#f8d7da;border-color:#f5c6cb;padding:.75rem 1.25rem;border-radius:.25rem;margin-bottom:1rem">
 40 |           <p><b>This project is no longer under active development.</b> You can read more <a href="{{site.baseurl}}/parts/part15.html">here</a>. But if you'd like to keep learning how to make your own SQLite clone from scratch, or one of many other projects like Docker, Redis, Git or BitTorrent, try <a href="https://app.codecrafters.io/join?via=cstack"><b>CodeCrafters</b></a>.</p>
 41 |         </div>
 42 |         <div>
 43 |           <a href="https://app.codecrafters.io/join?via=cstack"><img src="{{site.url}}{{site.baseurl}}/assets/images/code-crafters.jpeg" alt="{{ include.description }}"/></a>
 44 |         </div>
 45 |       </header>
 46 |       <section>
 47 | 
 48 |       {% for part in site.parts %}
 49 |           {% if part.url == page.url %}
 50 |               {% assign is_part = true %}
 51 |               {% assign part_index0 = forloop.index0 %}
 52 |               {% assign part_index1 = forloop.index %}
 53 |           {% endif %}
 54 |       {% endfor %}
 55 |       {% for part in site.parts %}
 56 |           {% if part_index0 == forloop.index %}
 57 |               {% assign prev_part = part %}
 58 |           {% endif %}
 59 |           {% if part_index1 == forloop.index0 %}
 60 |               {% assign next_part = part %}
 61 |           {% endif %}
 62 |       {% endfor %}
 63 | 
 64 |       <h1>{{ page.title }}</h1>
 65 |       {% if prev_part %}
 66 |           <p><a class="prev" href="{{site.baseurl}}{{ prev_part.url }}">{{prev_part.title}}</a> <br></p>
 67 |       {% endif %}
 68 |       {% if next_part %}
 69 |           <p><a class="next" href="{{site.baseurl}}{{ next_part.url }}">{{next_part.title}}</a> <br></p>
 70 |       {% endif %}
 71 | 
 72 |       {{ content }}
 73 | 
 74 |       {% if prev_part %}
 75 |           <p><a class="prev" href="{{site.baseurl}}{{ prev_part.url }}">{{prev_part.title}}</a> <br></p>
 76 |       {% endif %}
 77 |       {% if next_part %}
 78 |           <p><a class="next" href="{{site.baseurl}}{{ next_part.url }}">{{next_part.title}}</a> <br></p>
 79 |       {% endif %}
 80 | 
 81 |       </section>
 82 |       <footer>
 83 |         {% if site.github.is_project_page %}
 84 |         <p><a href="{{site.baseurl}}/feed.xml">rss</a> | <a href="https://ifttt.com/applets/60708433d-rss-feed-to-email">subscribe by email</a></p>
 85 |         <p>This project is maintained by <a href="{{ site.github.owner_url }}">{{ site.github.owner_name }}</a></p>
 86 |         {% endif %}
 87 |         <p><small>Hosted on GitHub Pages &mdash; Theme by <a href="https://github.com/orderedlist">orderedlist</a></small></p>
 88 |       </footer>
 89 |     </div>
 90 |     <script src="{{ '/assets/js/scale.fix.js' | relative_url }}"></script>
 91 | 
 92 | 
 93 |   {% if site.google_analytics %}
 94 |     <script>
 95 |         (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
 96 |         (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
 97 |         m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
 98 |         })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
 99 | 
100 |         ga('create', '{{ site.google_analytics }}', 'auto');
101 |         ga('send', 'pageview');
102 |     </script>
103 |   {% endif %}
104 |   </body>
105 | </html>
106 | 


--------------------------------------------------------------------------------
/_parts/part1.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Part 1 - Introduction and Setting up the REPL
  3 | date: 2017-08-30
  4 | ---
  5 | 
  6 | As a web developer, I use relational databases every day at my job, but they're a black box to me. Some questions I have:
  7 | - What format is data saved in? (in memory and on disk)
  8 | - When does it move from memory to disk?
  9 | - Why can there only be one primary key per table?
 10 | - How does rolling back a transaction work?
 11 | - How are indexes formatted?
 12 | - When and how does a full table scan happen?
 13 | - What format is a prepared statement saved in?
 14 | 
 15 | In other words, how does a database **work**?
 16 | 
 17 | To figure things out, I'm writing a database from scratch. It's modeled off sqlite because it is designed to be small with fewer features than MySQL or PostgreSQL, so I have a better hope of understanding it. The entire database is stored in a single file!
 18 | 
 19 | # Sqlite
 20 | 
 21 | There's lots of [documentation of sqlite internals](https://www.sqlite.org/arch.html) on their website, plus I've got a copy of [SQLite Database System: Design and Implementation](https://play.google.com/store/books/details?id=9Z6IQQnX1JEC).
 22 | 
 23 | {% include image.html url="assets/images/arch1.gif" description="sqlite architecture (https://www.sqlite.org/zipvfs/doc/trunk/www/howitworks.wiki)" %}
 24 | 
 25 | A query goes through a chain of components in order to retrieve or modify data. The **front-end** consists of the:
 26 | - tokenizer
 27 | - parser
 28 | - code generator
 29 | 
 30 | The input to the front-end is a SQL query. the output is sqlite virtual machine bytecode (essentially a compiled program that can operate on the database).
 31 | 
 32 | The _back-end_ consists of the:
 33 | - virtual machine
 34 | - B-tree
 35 | - pager
 36 | - os interface
 37 | 
 38 | The **virtual machine** takes bytecode generated by the front-end as instructions. It can then perform operations on one or more tables or indexes, each of which is stored in a data structure called a B-tree. The VM is essentially a big switch statement on the type of bytecode instruction.
 39 | 
 40 | Each **B-tree** consists of many nodes. Each node is one page in length. The B-tree can retrieve a page from disk or save it back to disk by issuing commands to the pager.
 41 | 
 42 | The **pager** receives commands to read or write pages of data. It is responsible for reading/writing at appropriate offsets in the database file. It also keeps a cache of recently-accessed pages in memory, and determines when those pages need to be written back to disk.
 43 | 
 44 | The **os interface** is the layer that differs depending on which operating system sqlite was compiled for. In this tutorial, I'm not going to support multiple platforms.
 45 | 
 46 | [A journey of a thousand miles begins with a single step](https://en.wiktionary.org/wiki/a_journey_of_a_thousand_miles_begins_with_a_single_step), so let's start with something a little more straightforward: the REPL.
 47 | 
 48 | ## Making a Simple REPL
 49 | 
 50 | Sqlite starts a read-execute-print loop when you start it from the command line:
 51 | 
 52 | ```shell
 53 | ~ sqlite3
 54 | SQLite version 3.16.0 2016-11-04 19:09:39
 55 | Enter ".help" for usage hints.
 56 | Connected to a transient in-memory database.
 57 | Use ".open FILENAME" to reopen on a persistent database.
 58 | sqlite> create table users (id int, username varchar(255), email varchar(255));
 59 | sqlite> .tables
 60 | users
 61 | sqlite> .exit
 62 | ~
 63 | ```
 64 | 
 65 | To do that, our main function will have an infinite loop that prints the prompt, gets a line of input, then processes that line of input:
 66 | 
 67 | ```c
 68 | int main(int argc, char* argv[]) {
 69 |   InputBuffer* input_buffer = new_input_buffer();
 70 |   while (true) {
 71 |     print_prompt();
 72 |     read_input(input_buffer);
 73 | 
 74 |     if (strcmp(input_buffer->buffer, ".exit") == 0) {
 75 |       close_input_buffer(input_buffer);
 76 |       exit(EXIT_SUCCESS);
 77 |     } else {
 78 |       printf("Unrecognized command '%s'.\n", input_buffer->buffer);
 79 |     }
 80 |   }
 81 | }
 82 | ```
 83 | 
 84 | We'll define `InputBuffer` as a small wrapper around the state we need to store to interact with [getline()](http://man7.org/linux/man-pages/man3/getline.3.html). (More on that in a minute)
 85 | ```c
 86 | typedef struct {
 87 |   char* buffer;
 88 |   size_t buffer_length;
 89 |   ssize_t input_length;
 90 | } InputBuffer;
 91 | 
 92 | InputBuffer* new_input_buffer() {
 93 |   InputBuffer* input_buffer = (InputBuffer*)malloc(sizeof(InputBuffer));
 94 |   input_buffer->buffer = NULL;
 95 |   input_buffer->buffer_length = 0;
 96 |   input_buffer->input_length = 0;
 97 | 
 98 |   return input_buffer;
 99 | }
100 | ```
101 | 
102 | Next, `print_prompt()` prints a prompt to the user. We do this before reading each line of input.
103 | 
104 | ```c
105 | void print_prompt() { printf("db > "); }
106 | ```
107 | 
108 | To read a line of input, use [getline()](http://man7.org/linux/man-pages/man3/getline.3.html):
109 | ```c
110 | ssize_t getline(char **lineptr, size_t *n, FILE *stream);
111 | ```
112 | `lineptr` : a pointer to the variable we use to point to the buffer containing the read line. If it set to `NULL` it is mallocatted by `getline` and should thus be freed by the user, even if the command fails.
113 | 
114 | `n` : a pointer to the variable we use to save the size of allocated buffer.
115 | 
116 | `stream` : the input stream to read from. We'll be reading from standard input.
117 | 
118 | `return value` : the number of bytes read, which may be less than the size of the buffer.
119 | 
120 | We tell `getline` to store the read line in `input_buffer->buffer` and the size of the allocated buffer in `input_buffer->buffer_length`. We store the return value in `input_buffer->input_length`.
121 | 
122 | `buffer` starts as null, so `getline` allocates enough memory to hold the line of input and makes `buffer` point to it.
123 | 
124 | ```c
125 | void read_input(InputBuffer* input_buffer) {
126 |   ssize_t bytes_read =
127 |       getline(&(input_buffer->buffer), &(input_buffer->buffer_length), stdin);
128 | 
129 |   if (bytes_read <= 0) {
130 |     printf("Error reading input\n");
131 |     exit(EXIT_FAILURE);
132 |   }
133 | 
134 |   // Ignore trailing newline
135 |   input_buffer->input_length = bytes_read - 1;
136 |   input_buffer->buffer[bytes_read - 1] = 0;
137 | }
138 | ```
139 | 
140 | Now it is proper to define a function that frees the memory allocated for an
141 | instance of `InputBuffer *` and the `buffer` element of the respective
142 | structure (`getline` allocates memory for `input_buffer->buffer` in
143 | `read_input`).
144 | 
145 | ```c
146 | void close_input_buffer(InputBuffer* input_buffer) {
147 |     free(input_buffer->buffer);
148 |     free(input_buffer);
149 | }
150 | ```
151 | 
152 | Finally, we parse and execute the command. There is only one recognized command right now : `.exit`, which terminates the program. Otherwise we print an error message and continue the loop.
153 | 
154 | ```c
155 | if (strcmp(input_buffer->buffer, ".exit") == 0) {
156 |   close_input_buffer(input_buffer);
157 |   exit(EXIT_SUCCESS);
158 | } else {
159 |   printf("Unrecognized command '%s'.\n", input_buffer->buffer);
160 | }
161 | ```
162 | 
163 | Let's try it out!
164 | ```shell
165 | ~ ./db
166 | db > .tables
167 | Unrecognized command '.tables'.
168 | db > .exit
169 | ~
170 | ```
171 | 
172 | Alright, we've got a working REPL. In the next part, we'll start developing our command language. Meanwhile, here's the entire program from this part:
173 | 
174 | ```c
175 | #include <stdbool.h>
176 | #include <stdio.h>
177 | #include <stdlib.h>
178 | #include <string.h>
179 | 
180 | typedef struct {
181 |   char* buffer;
182 |   size_t buffer_length;
183 |   ssize_t input_length;
184 | } InputBuffer;
185 | 
186 | InputBuffer* new_input_buffer() {
187 |   InputBuffer* input_buffer = malloc(sizeof(InputBuffer));
188 |   input_buffer->buffer = NULL;
189 |   input_buffer->buffer_length = 0;
190 |   input_buffer->input_length = 0;
191 | 
192 |   return input_buffer;
193 | }
194 | 
195 | void print_prompt() { printf("db > "); }
196 | 
197 | void read_input(InputBuffer* input_buffer) {
198 |   ssize_t bytes_read =
199 |       getline(&(input_buffer->buffer), &(input_buffer->buffer_length), stdin);
200 | 
201 |   if (bytes_read <= 0) {
202 |     printf("Error reading input\n");
203 |     exit(EXIT_FAILURE);
204 |   }
205 | 
206 |   // Ignore trailing newline
207 |   input_buffer->input_length = bytes_read - 1;
208 |   input_buffer->buffer[bytes_read - 1] = 0;
209 | }
210 | 
211 | void close_input_buffer(InputBuffer* input_buffer) {
212 |     free(input_buffer->buffer);
213 |     free(input_buffer);
214 | }
215 | 
216 | int main(int argc, char* argv[]) {
217 |   InputBuffer* input_buffer = new_input_buffer();
218 |   while (true) {
219 |     print_prompt();
220 |     read_input(input_buffer);
221 | 
222 |     if (strcmp(input_buffer->buffer, ".exit") == 0) {
223 |       close_input_buffer(input_buffer);
224 |       exit(EXIT_SUCCESS);
225 |     } else {
226 |       printf("Unrecognized command '%s'.\n", input_buffer->buffer);
227 |     }
228 |   }
229 | }
230 | ```
231 | 


--------------------------------------------------------------------------------
/_parts/part10.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Part 10 - Splitting a Leaf Node
  3 | date: 2017-10-09
  4 | ---
  5 | 
  6 | Our B-Tree doesn't feel like much of a tree with only one node. To fix that, we need some code to split a leaf node in twain. And after that, we need to create an internal node to serve as a parent for the two leaf nodes.
  7 | 
  8 | Basically our goal for this article is to go from this:
  9 | 
 10 | {% include image.html url="assets/images/btree2.png" description="one-node btree" %}
 11 | 
 12 | to this:
 13 | 
 14 | {% include image.html url="assets/images/btree3.png" description="two-level btree" %}
 15 | 
 16 | First things first, let's remove the error handling for a full leaf node:
 17 | 
 18 | ```diff
 19 |  void leaf_node_insert(Cursor* cursor, uint32_t key, Row* value) {
 20 |    void* node = get_page(cursor->table->pager, cursor->page_num);
 21 |  
 22 |    uint32_t num_cells = *leaf_node_num_cells(node);
 23 |    if (num_cells >= LEAF_NODE_MAX_CELLS) {
 24 |      // Node full
 25 | -    printf("Need to implement splitting a leaf node.\n");
 26 | -    exit(EXIT_FAILURE);
 27 | +    leaf_node_split_and_insert(cursor, key, value);
 28 | +    return;
 29 |    }
 30 | ```
 31 | 
 32 | ```diff
 33 | ExecuteResult execute_insert(Statement* statement, Table* table) {
 34 |    void* node = get_page(table->pager, table->root_page_num);
 35 |    uint32_t num_cells = (*leaf_node_num_cells(node));
 36 | -  if (num_cells >= LEAF_NODE_MAX_CELLS) {
 37 | -    return EXECUTE_TABLE_FULL;
 38 | -  }
 39 |  
 40 |    Row* row_to_insert = &(statement->row_to_insert);
 41 |    uint32_t key_to_insert = row_to_insert->id;
 42 | ```
 43 | 
 44 | ## Splitting Algorithm
 45 | 
 46 | Easy part's over. Here's a description of what we need to do from [SQLite Database System: Design and Implementation](https://play.google.com/store/books/details/Sibsankar_Haldar_SQLite_Database_System_Design_and?id=9Z6IQQnX1JEC&hl=en)
 47 | 
 48 | > If there is no space on the leaf node, we would split the existing entries residing there and the new one (being inserted) into two equal halves: lower and upper halves. (Keys on the upper half are strictly greater than those on the lower half.) We allocate a new leaf node, and move the upper half into the new node.
 49 | 
 50 | 
 51 | Let's get a handle to the old node and create the new node:
 52 | 
 53 | ```diff
 54 | +void leaf_node_split_and_insert(Cursor* cursor, uint32_t key, Row* value) {
 55 | +  /*
 56 | +  Create a new node and move half the cells over.
 57 | +  Insert the new value in one of the two nodes.
 58 | +  Update parent or create a new parent.
 59 | +  */
 60 | +
 61 | +  void* old_node = get_page(cursor->table->pager, cursor->page_num);
 62 | +  uint32_t new_page_num = get_unused_page_num(cursor->table->pager);
 63 | +  void* new_node = get_page(cursor->table->pager, new_page_num);
 64 | +  initialize_leaf_node(new_node);
 65 | ```
 66 | 
 67 | Next, copy every cell into its new location:
 68 | 
 69 | ```diff
 70 | +  /*
 71 | +  All existing keys plus new key should be divided
 72 | +  evenly between old (left) and new (right) nodes.
 73 | +  Starting from the right, move each key to correct position.
 74 | +  */
 75 | +  for (int32_t i = LEAF_NODE_MAX_CELLS; i >= 0; i--) {
 76 | +    void* destination_node;
 77 | +    if (i >= LEAF_NODE_LEFT_SPLIT_COUNT) {
 78 | +      destination_node = new_node;
 79 | +    } else {
 80 | +      destination_node = old_node;
 81 | +    }
 82 | +    uint32_t index_within_node = i % LEAF_NODE_LEFT_SPLIT_COUNT;
 83 | +    void* destination = leaf_node_cell(destination_node, index_within_node);
 84 | +
 85 | +    if (i == cursor->cell_num) {
 86 | +      serialize_row(value, destination);
 87 | +    } else if (i > cursor->cell_num) {
 88 | +      memcpy(destination, leaf_node_cell(old_node, i - 1), LEAF_NODE_CELL_SIZE);
 89 | +    } else {
 90 | +      memcpy(destination, leaf_node_cell(old_node, i), LEAF_NODE_CELL_SIZE);
 91 | +    }
 92 | +  }
 93 | ```
 94 | 
 95 | Update cell counts in each node's header:
 96 | 
 97 | ```diff
 98 | +  /* Update cell count on both leaf nodes */
 99 | +  *(leaf_node_num_cells(old_node)) = LEAF_NODE_LEFT_SPLIT_COUNT;
100 | +  *(leaf_node_num_cells(new_node)) = LEAF_NODE_RIGHT_SPLIT_COUNT;
101 | ```
102 | 
103 | Then we need to update the nodes' parent. If the original node was the root, it had no parent. In that case, create a new root node to act as the parent. I'll stub out the other branch for now:
104 | 
105 | ```diff
106 | +  if (is_node_root(old_node)) {
107 | +    return create_new_root(cursor->table, new_page_num);
108 | +  } else {
109 | +    printf("Need to implement updating parent after split\n");
110 | +    exit(EXIT_FAILURE);
111 | +  }
112 | +}
113 | ```
114 | 
115 | ## Allocating New Pages
116 | 
117 | Let's go back and define a few new functions and constants. When we created a new leaf node, we put it in a page decided by `get_unused_page_num()`:
118 | 
119 | ```diff
120 | +/*
121 | +Until we start recycling free pages, new pages will always
122 | +go onto the end of the database file
123 | +*/
124 | +uint32_t get_unused_page_num(Pager* pager) { return pager->num_pages; }
125 | ```
126 | 
127 | For now, we're assuming that in a database with N pages, page numbers 0 through N-1 are allocated. Therefore we can always allocate page number N for new pages. Eventually after we implement deletion, some pages may become empty and their page numbers unused. To be more efficient, we could re-allocate those free pages.
128 | 
129 | ## Leaf Node Sizes
130 | 
131 | To keep the tree balanced, we evenly distribute cells between the two new nodes. If a leaf node can hold N cells, then during a split we need to distribute N+1 cells between two nodes (N original cells plus one new one). I'm arbitrarily choosing the left node to get one more cell if N+1 is odd.
132 | 
133 | ```diff
134 | +const uint32_t LEAF_NODE_RIGHT_SPLIT_COUNT = (LEAF_NODE_MAX_CELLS + 1) / 2;
135 | +const uint32_t LEAF_NODE_LEFT_SPLIT_COUNT =
136 | +    (LEAF_NODE_MAX_CELLS + 1) - LEAF_NODE_RIGHT_SPLIT_COUNT;
137 | ```
138 | 
139 | ## Creating a New Root
140 | 
141 | Here's how [SQLite Database System](https://play.google.com/store/books/details/Sibsankar_Haldar_SQLite_Database_System_Design_and?id=9Z6IQQnX1JEC&hl=en) explains the process of creating a new root node:
142 | 
143 | > Let N be the root node. First allocate two nodes, say L and R. Move lower half of N into L and the upper half into R. Now N is empty. Add 〈L, K,R〉 in N, where K is the max key in L. Page N remains the root. Note that the depth of the tree has increased by one, but the new tree remains height balanced without violating any B+-tree property.
144 | 
145 | At this point, we've already allocated the right child and moved the upper half into it. Our function takes the right child as input and allocates a new page to store the left child.
146 | 
147 | ```diff
148 | +void create_new_root(Table* table, uint32_t right_child_page_num) {
149 | +  /*
150 | +  Handle splitting the root.
151 | +  Old root copied to new page, becomes left child.
152 | +  Address of right child passed in.
153 | +  Re-initialize root page to contain the new root node.
154 | +  New root node points to two children.
155 | +  */
156 | +
157 | +  void* root = get_page(table->pager, table->root_page_num);
158 | +  void* right_child = get_page(table->pager, right_child_page_num);
159 | +  uint32_t left_child_page_num = get_unused_page_num(table->pager);
160 | +  void* left_child = get_page(table->pager, left_child_page_num);
161 | ```
162 | 
163 | The old root is copied to the left child so we can reuse the root page:
164 | 
165 | ```diff
166 | +  /* Left child has data copied from old root */
167 | +  memcpy(left_child, root, PAGE_SIZE);
168 | +  set_node_root(left_child, false);
169 | ```
170 | 
171 | Finally we initialize the root page as a new internal node with two children.
172 | 
173 | ```diff
174 | +  /* Root node is a new internal node with one key and two children */
175 | +  initialize_internal_node(root);
176 | +  set_node_root(root, true);
177 | +  *internal_node_num_keys(root) = 1;
178 | +  *internal_node_child(root, 0) = left_child_page_num;
179 | +  uint32_t left_child_max_key = get_node_max_key(left_child);
180 | +  *internal_node_key(root, 0) = left_child_max_key;
181 | +  *internal_node_right_child(root) = right_child_page_num;
182 | +}
183 | ```
184 | 
185 | ## Internal Node Format
186 | 
187 | Now that we're finally creating an internal node, we have to define its layout. It starts with the common header, then the number of keys it contains, then the page number of its rightmost child. Internal nodes always have one more child pointer than they have keys. That extra child pointer is stored in the header.
188 | 
189 | ```diff
190 | +/*
191 | + * Internal Node Header Layout
192 | + */
193 | +const uint32_t INTERNAL_NODE_NUM_KEYS_SIZE = sizeof(uint32_t);
194 | +const uint32_t INTERNAL_NODE_NUM_KEYS_OFFSET = COMMON_NODE_HEADER_SIZE;
195 | +const uint32_t INTERNAL_NODE_RIGHT_CHILD_SIZE = sizeof(uint32_t);
196 | +const uint32_t INTERNAL_NODE_RIGHT_CHILD_OFFSET =
197 | +    INTERNAL_NODE_NUM_KEYS_OFFSET + INTERNAL_NODE_NUM_KEYS_SIZE;
198 | +const uint32_t INTERNAL_NODE_HEADER_SIZE = COMMON_NODE_HEADER_SIZE +
199 | +                                           INTERNAL_NODE_NUM_KEYS_SIZE +
200 | +                                           INTERNAL_NODE_RIGHT_CHILD_SIZE;
201 | ```
202 | 
203 | The body is an array of cells where each cell contains a child pointer and a key. Every key should be the maximum key contained in the child to its left.
204 | 
205 | ```diff
206 | +/*
207 | + * Internal Node Body Layout
208 | + */
209 | +const uint32_t INTERNAL_NODE_KEY_SIZE = sizeof(uint32_t);
210 | +const uint32_t INTERNAL_NODE_CHILD_SIZE = sizeof(uint32_t);
211 | +const uint32_t INTERNAL_NODE_CELL_SIZE =
212 | +    INTERNAL_NODE_CHILD_SIZE + INTERNAL_NODE_KEY_SIZE;
213 | ```
214 | 
215 | Based on these constants, here's how the layout of an internal node will look:
216 | 
217 | {% include image.html url="assets/images/internal-node-format.png" description="Our internal node format" %}
218 | 
219 | Notice our huge branching factor. Because each child pointer / key pair is so small, we can fit 510 keys and 511 child pointers in each internal node. That means we'll never have to traverse many layers of the tree to find a given key!
220 | 
221 | | # internal node layers | max # leaf nodes    | Size of all leaf nodes |
222 | |------------------------|---------------------|------------------------|
223 | | 0                      | 511^0 = 1           | 4 KB                   |
224 | | 1                      | 511^1 = 512         | ~2 MB                   |
225 | | 2                      | 511^2 = 261,121     | ~1 GB                   |
226 | | 3                      | 511^3 = 133,432,831 | ~550 GB                 |
227 | 
228 | In actuality, we can't store a full 4 KB of data per leaf node due to the overhead of the header, keys, and wasted space. But we can search through something like 500 GB of data by loading only 4 pages from disk. This is why the B-Tree is a useful data structure for databases.
229 | 
230 | Here are the methods for reading and writing to an internal node:
231 | 
232 | ```diff
233 | +uint32_t* internal_node_num_keys(void* node) {
234 | +  return node + INTERNAL_NODE_NUM_KEYS_OFFSET;
235 | +}
236 | +
237 | +uint32_t* internal_node_right_child(void* node) {
238 | +  return node + INTERNAL_NODE_RIGHT_CHILD_OFFSET;
239 | +}
240 | +
241 | +uint32_t* internal_node_cell(void* node, uint32_t cell_num) {
242 | +  return node + INTERNAL_NODE_HEADER_SIZE + cell_num * INTERNAL_NODE_CELL_SIZE;
243 | +}
244 | +
245 | +uint32_t* internal_node_child(void* node, uint32_t child_num) {
246 | +  uint32_t num_keys = *internal_node_num_keys(node);
247 | +  if (child_num > num_keys) {
248 | +    printf("Tried to access child_num %d > num_keys %d\n", child_num, num_keys);
249 | +    exit(EXIT_FAILURE);
250 | +  } else if (child_num == num_keys) {
251 | +    return internal_node_right_child(node);
252 | +  } else {
253 | +    return internal_node_cell(node, child_num);
254 | +  }
255 | +}
256 | +
257 | +uint32_t* internal_node_key(void* node, uint32_t key_num) {
258 | +  return internal_node_cell(node, key_num) + INTERNAL_NODE_CHILD_SIZE;
259 | +}
260 | ```
261 | 
262 | For an internal node, the maximum key is always its right key. For a leaf node, it's the key at the maximum index:
263 | 
264 | ```diff
265 | +uint32_t get_node_max_key(void* node) {
266 | +  switch (get_node_type(node)) {
267 | +    case NODE_INTERNAL:
268 | +      return *internal_node_key(node, *internal_node_num_keys(node) - 1);
269 | +    case NODE_LEAF:
270 | +      return *leaf_node_key(node, *leaf_node_num_cells(node) - 1);
271 | +  }
272 | +}
273 | ```
274 | 
275 | ## Keeping Track of the Root
276 | 
277 | We're finally using the `is_root` field in the common node header. Recall that we use it to decide how to split a leaf node:
278 | 
279 | ```c
280 |   if (is_node_root(old_node)) {
281 |     return create_new_root(cursor->table, new_page_num);
282 |   } else {
283 |     printf("Need to implement updating parent after split\n");
284 |     exit(EXIT_FAILURE);
285 |   }
286 | }
287 | ```
288 | 
289 | Here are the getter and setter:
290 | 
291 | ```diff
292 | +bool is_node_root(void* node) {
293 | +  uint8_t value = *((uint8_t*)(node + IS_ROOT_OFFSET));
294 | +  return (bool)value;
295 | +}
296 | +
297 | +void set_node_root(void* node, bool is_root) {
298 | +  uint8_t value = is_root;
299 | +  *((uint8_t*)(node + IS_ROOT_OFFSET)) = value;
300 | +}
301 | ```
302 | 
303 | 
304 | Initializing both types of nodes should default to setting `is_root` to false:
305 | 
306 | ```diff
307 |  void initialize_leaf_node(void* node) {
308 |    set_node_type(node, NODE_LEAF);
309 | +  set_node_root(node, false);
310 |    *leaf_node_num_cells(node) = 0;
311 |  }
312 | 
313 | +void initialize_internal_node(void* node) {
314 | +  set_node_type(node, NODE_INTERNAL);
315 | +  set_node_root(node, false);
316 | +  *internal_node_num_keys(node) = 0;
317 | +}
318 | ```
319 | 
320 | We should set `is_root` to true when creating the first node of the table:
321 | 
322 | ```diff
323 |      // New database file. Initialize page 0 as leaf node.
324 |      void* root_node = get_page(pager, 0);
325 |      initialize_leaf_node(root_node);
326 | +    set_node_root(root_node, true);
327 |    }
328 |  
329 |    return table;
330 | ```
331 | 
332 | ## Printing the Tree
333 | 
334 | To help us visualize the state of the database, we should update our `.btree` metacommand to print a multi-level tree.
335 | 
336 | I'm going to replace the current `print_leaf_node()` function
337 | 
338 | ```diff
339 | -void print_leaf_node(void* node) {
340 | -  uint32_t num_cells = *leaf_node_num_cells(node);
341 | -  printf("leaf (size %d)\n", num_cells);
342 | -  for (uint32_t i = 0; i < num_cells; i++) {
343 | -    uint32_t key = *leaf_node_key(node, i);
344 | -    printf("  - %d : %d\n", i, key);
345 | -  }
346 | -}
347 | ```
348 | 
349 | with a new recursive function that takes any node, then prints it and its children. It takes an indentation level as a parameter, which increases with each recursive call. I'm also adding a tiny helper function to indent.
350 | 
351 | ```diff
352 | +void indent(uint32_t level) {
353 | +  for (uint32_t i = 0; i < level; i++) {
354 | +    printf("  ");
355 | +  }
356 | +}
357 | +
358 | +void print_tree(Pager* pager, uint32_t page_num, uint32_t indentation_level) {
359 | +  void* node = get_page(pager, page_num);
360 | +  uint32_t num_keys, child;
361 | +
362 | +  switch (get_node_type(node)) {
363 | +    case (NODE_LEAF):
364 | +      num_keys = *leaf_node_num_cells(node);
365 | +      indent(indentation_level);
366 | +      printf("- leaf (size %d)\n", num_keys);
367 | +      for (uint32_t i = 0; i < num_keys; i++) {
368 | +        indent(indentation_level + 1);
369 | +        printf("- %d\n", *leaf_node_key(node, i));
370 | +      }
371 | +      break;
372 | +    case (NODE_INTERNAL):
373 | +      num_keys = *internal_node_num_keys(node);
374 | +      indent(indentation_level);
375 | +      printf("- internal (size %d)\n", num_keys);
376 | +      for (uint32_t i = 0; i < num_keys; i++) {
377 | +        child = *internal_node_child(node, i);
378 | +        print_tree(pager, child, indentation_level + 1);
379 | +
380 | +        indent(indentation_level + 1);
381 | +        printf("- key %d\n", *internal_node_key(node, i));
382 | +      }
383 | +      child = *internal_node_right_child(node);
384 | +      print_tree(pager, child, indentation_level + 1);
385 | +      break;
386 | +  }
387 | +}
388 | ```
389 | 
390 | And update the call to the print function, passing an indentation level of zero.
391 | 
392 | ```diff
393 |    } else if (strcmp(input_buffer->buffer, ".btree") == 0) {
394 |      printf("Tree:\n");
395 | -    print_leaf_node(get_page(table->pager, 0));
396 | +    print_tree(table->pager, 0, 0);
397 |      return META_COMMAND_SUCCESS;
398 | ```
399 | 
400 | Here's a test case for the new printing functionality!
401 | 
402 | ```diff
403 | +  it 'allows printing out the structure of a 3-leaf-node btree' do
404 | +    script = (1..14).map do |i|
405 | +      "insert #{i} user#{i} person#{i}@example.com"
406 | +    end
407 | +    script << ".btree"
408 | +    script << "insert 15 user15 person15@example.com"
409 | +    script << ".exit"
410 | +    result = run_script(script)
411 | +
412 | +    expect(result[14...(result.length)]).to match_array([
413 | +      "db > Tree:",
414 | +      "- internal (size 1)",
415 | +      "  - leaf (size 7)",
416 | +      "    - 1",
417 | +      "    - 2",
418 | +      "    - 3",
419 | +      "    - 4",
420 | +      "    - 5",
421 | +      "    - 6",
422 | +      "    - 7",
423 | +      "  - key 7",
424 | +      "  - leaf (size 7)",
425 | +      "    - 8",
426 | +      "    - 9",
427 | +      "    - 10",
428 | +      "    - 11",
429 | +      "    - 12",
430 | +      "    - 13",
431 | +      "    - 14",
432 | +      "db > Need to implement searching an internal node",
433 | +    ])
434 | +  end
435 | ```
436 | 
437 | The new format is a little simplified, so we need to update the existing `.btree` test:
438 | 
439 | ```diff
440 |        "db > Executed.",
441 |        "db > Executed.",
442 |        "db > Tree:",
443 | -      "leaf (size 3)",
444 | -      "  - 0 : 1",
445 | -      "  - 1 : 2",
446 | -      "  - 2 : 3",
447 | +      "- leaf (size 3)",
448 | +      "  - 1",
449 | +      "  - 2",
450 | +      "  - 3",
451 |        "db > "
452 |      ])
453 |    end
454 | ```
455 | 
456 | Here's the `.btree` output of the new test on its own:
457 | 
458 | ```
459 | Tree:
460 | - internal (size 1)
461 |   - leaf (size 7)
462 |     - 1
463 |     - 2
464 |     - 3
465 |     - 4
466 |     - 5
467 |     - 6
468 |     - 7
469 |   - key 7
470 |   - leaf (size 7)
471 |     - 8
472 |     - 9
473 |     - 10
474 |     - 11
475 |     - 12
476 |     - 13
477 |     - 14
478 | ```
479 | 
480 | On the least indented level, we see the root node (an internal node). It says `size 1` because it has one key. Indented one level, we see a leaf node, a key, and another leaf node. The key in the root node (7) is is the maximum key in the first leaf node. Every key greater than 7 is in the second leaf node.
481 | 
482 | ## A Major Problem
483 | 
484 | If you've been following along closely you may notice we've missed something big. Look what happens if we try to insert one additional row:
485 | 
486 | ```
487 | db > insert 15 user15 person15@example.com
488 | Need to implement searching an internal node
489 | ```
490 | 
491 | Whoops! Who wrote that TODO message? :P
492 | 
493 | Next time we'll continue the epic B-tree saga by implementing search on a multi-level tree.
494 | 


--------------------------------------------------------------------------------
/_parts/part11.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Part 11 - Recursively Searching the B-Tree
  3 | date: 2017-10-22
  4 | ---
  5 | 
  6 | Last time we ended with an error inserting our 15th row:
  7 | 
  8 | ```
  9 | db > insert 15 user15 person15@example.com
 10 | Need to implement searching an internal node
 11 | ```
 12 | 
 13 | First, replace the code stub with a new function call.
 14 | 
 15 | ```diff
 16 |    if (get_node_type(root_node) == NODE_LEAF) {
 17 |      return leaf_node_find(table, root_page_num, key);
 18 |    } else {
 19 | -    printf("Need to implement searching an internal node\n");
 20 | -    exit(EXIT_FAILURE);
 21 | +    return internal_node_find(table, root_page_num, key);
 22 |    }
 23 |  }
 24 | ```
 25 | 
 26 | This function will perform binary search to find the child that should contain the given key. Remember that the key to the right of each child pointer is the maximum key contained by that child.
 27 | 
 28 | {% include image.html url="assets/images/btree6.png" description="three-level btree" %}
 29 | 
 30 | So our binary search compares the key to find and the key to the right of the child pointer:
 31 | 
 32 | ```diff
 33 | +Cursor* internal_node_find(Table* table, uint32_t page_num, uint32_t key) {
 34 | +  void* node = get_page(table->pager, page_num);
 35 | +  uint32_t num_keys = *internal_node_num_keys(node);
 36 | +
 37 | +  /* Binary search to find index of child to search */
 38 | +  uint32_t min_index = 0;
 39 | +  uint32_t max_index = num_keys; /* there is one more child than key */
 40 | +
 41 | +  while (min_index != max_index) {
 42 | +    uint32_t index = (min_index + max_index) / 2;
 43 | +    uint32_t key_to_right = *internal_node_key(node, index);
 44 | +    if (key_to_right >= key) {
 45 | +      max_index = index;
 46 | +    } else {
 47 | +      min_index = index + 1;
 48 | +    }
 49 | +  }
 50 | ```
 51 | 
 52 | Also remember that the children of an internal node can be either leaf nodes or more internal nodes. After we find the correct child, call the appropriate search function on it:
 53 | 
 54 | ```diff
 55 | +  uint32_t child_num = *internal_node_child(node, min_index);
 56 | +  void* child = get_page(table->pager, child_num);
 57 | +  switch (get_node_type(child)) {
 58 | +    case NODE_LEAF:
 59 | +      return leaf_node_find(table, child_num, key);
 60 | +    case NODE_INTERNAL:
 61 | +      return internal_node_find(table, child_num, key);
 62 | +  }
 63 | +}
 64 | ```
 65 | 
 66 | # Tests
 67 | 
 68 | Now inserting a key into a multi-node btree no longer results in an error. And we can update our test:
 69 | 
 70 | ```diff
 71 |        "    - 12",
 72 |        "    - 13",
 73 |        "    - 14",
 74 | -      "db > Need to implement searching an internal node",
 75 | +      "db > Executed.",
 76 | +      "db > ",
 77 |      ])
 78 |    end
 79 | ```
 80 | 
 81 | I also think it's time we revisit another test. The one that tries inserting 1400 rows. It still errors, but the error message is new. Right now, our tests don't handle it very well when the program crashes. If that happens, let's just use the output we've gotten so far:
 82 | 
 83 | ```diff
 84 |      raw_output = nil
 85 |      IO.popen("./db test.db", "r+") do |pipe|
 86 |        commands.each do |command|
 87 | -        pipe.puts command
 88 | +        begin
 89 | +          pipe.puts command
 90 | +        rescue Errno::EPIPE
 91 | +          break
 92 | +        end
 93 |        end
 94 | 
 95 |        pipe.close_write
 96 | ```
 97 | 
 98 | And that reveals that our 1400-row test outputs this error:
 99 | 
100 | ```diff
101 |      end
102 |      script << ".exit"
103 |      result = run_script(script)
104 | -    expect(result[-2]).to eq('db > Error: Table full.')
105 | +    expect(result.last(2)).to match_array([
106 | +      "db > Executed.",
107 | +      "db > Need to implement updating parent after split",
108 | +    ])
109 |    end
110 | ```
111 | 
112 | Looks like that's next on our to-do list!
113 | 
114 | 


--------------------------------------------------------------------------------
/_parts/part12.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Part 12 - Scanning a Multi-Level B-Tree
  3 | date: 2017-11-11
  4 | ---
  5 | 
  6 | We now support constructing a multi-level btree, but we've broken `select` statements in the process. Here's a test case that inserts 15 rows and then tries to print them.
  7 | 
  8 | ```diff
  9 | +  it 'prints all rows in a multi-level tree' do
 10 | +    script = []
 11 | +    (1..15).each do |i|
 12 | +      script << "insert #{i} user#{i} person#{i}@example.com"
 13 | +    end
 14 | +    script << "select"
 15 | +    script << ".exit"
 16 | +    result = run_script(script)
 17 | +
 18 | +    expect(result[15...result.length]).to match_array([
 19 | +      "db > (1, user1, person1@example.com)",
 20 | +      "(2, user2, person2@example.com)",
 21 | +      "(3, user3, person3@example.com)",
 22 | +      "(4, user4, person4@example.com)",
 23 | +      "(5, user5, person5@example.com)",
 24 | +      "(6, user6, person6@example.com)",
 25 | +      "(7, user7, person7@example.com)",
 26 | +      "(8, user8, person8@example.com)",
 27 | +      "(9, user9, person9@example.com)",
 28 | +      "(10, user10, person10@example.com)",
 29 | +      "(11, user11, person11@example.com)",
 30 | +      "(12, user12, person12@example.com)",
 31 | +      "(13, user13, person13@example.com)",
 32 | +      "(14, user14, person14@example.com)",
 33 | +      "(15, user15, person15@example.com)",
 34 | +      "Executed.", "db > ",
 35 | +    ])
 36 | +  end
 37 | ```
 38 | 
 39 | But when we run that test case right now, what actually happens is:
 40 | 
 41 | ```
 42 | db > select
 43 | (2, user1, person1@example.com)
 44 | Executed.
 45 | ```
 46 | 
 47 | That's weird. It's only printing one row, and that row looks corrupted (notice the id doesn't match the username).
 48 | 
 49 | The weirdness is because `execute_select()` begins at the start of the table, and our current implementation of `table_start()` returns cell 0 of the root node. But the root of our tree is now an internal node which doesn't contain any rows. The data that was printed must have been left over from when the root node was a leaf. `execute_select()` should really return cell 0 of the leftmost leaf node.
 50 | 
 51 | So get rid of the old implementation:
 52 | 
 53 | ```diff
 54 | -Cursor* table_start(Table* table) {
 55 | -  Cursor* cursor = malloc(sizeof(Cursor));
 56 | -  cursor->table = table;
 57 | -  cursor->page_num = table->root_page_num;
 58 | -  cursor->cell_num = 0;
 59 | -
 60 | -  void* root_node = get_page(table->pager, table->root_page_num);
 61 | -  uint32_t num_cells = *leaf_node_num_cells(root_node);
 62 | -  cursor->end_of_table = (num_cells == 0);
 63 | -
 64 | -  return cursor;
 65 | -}
 66 | ```
 67 | 
 68 | And add a new implementation that searches for key 0 (the minimum possible key). Even if key 0 does not exist in the table, this method will return the position of the lowest id (the start of the left-most leaf node).
 69 | 
 70 | ```diff
 71 | +Cursor* table_start(Table* table) {
 72 | +  Cursor* cursor =  table_find(table, 0);
 73 | +
 74 | +  void* node = get_page(table->pager, cursor->page_num);
 75 | +  uint32_t num_cells = *leaf_node_num_cells(node);
 76 | +  cursor->end_of_table = (num_cells == 0);
 77 | +
 78 | +  return cursor;
 79 | +}
 80 | ```
 81 | 
 82 | With those changes, it still only prints out one node's worth of rows:
 83 | 
 84 | ```
 85 | db > select
 86 | (1, user1, person1@example.com)
 87 | (2, user2, person2@example.com)
 88 | (3, user3, person3@example.com)
 89 | (4, user4, person4@example.com)
 90 | (5, user5, person5@example.com)
 91 | (6, user6, person6@example.com)
 92 | (7, user7, person7@example.com)
 93 | Executed.
 94 | db >
 95 | ```
 96 | 
 97 | With 15 entries, our btree consists of one internal node and two leaf nodes, which looks something like this:
 98 | 
 99 | {% include image.html url="assets/images/btree3.png" description="structure of our btree" %}
100 | 
101 | To scan the entire table, we need to jump to the second leaf node after we reach the end of the first. To do that, we're going to save a new field in the leaf node header called "next_leaf", which will hold the page number of the leaf's sibling node on the right. The rightmost leaf node will have a `next_leaf` value of 0 to denote no sibling (page 0 is reserved for the root node of the table anyway).
102 | 
103 | Update the leaf node header format to include the new field:
104 | 
105 | ```diff
106 |  const uint32_t LEAF_NODE_NUM_CELLS_SIZE = sizeof(uint32_t);
107 |  const uint32_t LEAF_NODE_NUM_CELLS_OFFSET = COMMON_NODE_HEADER_SIZE;
108 | -const uint32_t LEAF_NODE_HEADER_SIZE =
109 | -    COMMON_NODE_HEADER_SIZE + LEAF_NODE_NUM_CELLS_SIZE;
110 | +const uint32_t LEAF_NODE_NEXT_LEAF_SIZE = sizeof(uint32_t);
111 | +const uint32_t LEAF_NODE_NEXT_LEAF_OFFSET =
112 | +    LEAF_NODE_NUM_CELLS_OFFSET + LEAF_NODE_NUM_CELLS_SIZE;
113 | +const uint32_t LEAF_NODE_HEADER_SIZE = COMMON_NODE_HEADER_SIZE +
114 | +                                       LEAF_NODE_NUM_CELLS_SIZE +
115 | +                                       LEAF_NODE_NEXT_LEAF_SIZE;
116 |  
117 |  ```
118 | 
119 | Add a method to access the new field:
120 | ```diff
121 | +uint32_t* leaf_node_next_leaf(void* node) {
122 | +  return node + LEAF_NODE_NEXT_LEAF_OFFSET;
123 | +}
124 | ```
125 | 
126 | Set `next_leaf` to 0 by default when initializing a new leaf node:
127 | 
128 | ```diff
129 | @@ -322,6 +330,7 @@ void initialize_leaf_node(void* node) {
130 |    set_node_type(node, NODE_LEAF);
131 |    set_node_root(node, false);
132 |    *leaf_node_num_cells(node) = 0;
133 | +  *leaf_node_next_leaf(node) = 0;  // 0 represents no sibling
134 |  }
135 | ```
136 | 
137 | Whenever we split a leaf node, update the sibling pointers. The old leaf's sibling becomes the new leaf, and the new leaf's sibling becomes whatever used to be the old leaf's sibling.
138 | 
139 | ```diff
140 | @@ -659,6 +671,8 @@ void leaf_node_split_and_insert(Cursor* cursor, uint32_t key, Row* value) {
141 |    uint32_t new_page_num = get_unused_page_num(cursor->table->pager);
142 |    void* new_node = get_page(cursor->table->pager, new_page_num);
143 |    initialize_leaf_node(new_node);
144 | +  *leaf_node_next_leaf(new_node) = *leaf_node_next_leaf(old_node);
145 | +  *leaf_node_next_leaf(old_node) = new_page_num;
146 | ```
147 | 
148 | Adding a new field changes a few constants:
149 | ```diff
150 |    it 'prints constants' do
151 |      script = [
152 |        ".constants",
153 | @@ -199,9 +228,9 @@ describe 'database' do
154 |        "db > Constants:",
155 |        "ROW_SIZE: 293",
156 |        "COMMON_NODE_HEADER_SIZE: 6",
157 | -      "LEAF_NODE_HEADER_SIZE: 10",
158 | +      "LEAF_NODE_HEADER_SIZE: 14",
159 |        "LEAF_NODE_CELL_SIZE: 297",
160 | -      "LEAF_NODE_SPACE_FOR_CELLS: 4086",
161 | +      "LEAF_NODE_SPACE_FOR_CELLS: 4082",
162 |        "LEAF_NODE_MAX_CELLS: 13",
163 |        "db > ",
164 |      ])
165 | ```
166 | 
167 | Now whenever we want to advance the cursor past the end of a leaf node, we can check if the leaf node has a sibling. If it does, jump to it. Otherwise, we're at the end of the table.
168 | 
169 | ```diff
170 | @@ -428,7 +432,15 @@ void cursor_advance(Cursor* cursor) {
171 |  
172 |    cursor->cell_num += 1;
173 |    if (cursor->cell_num >= (*leaf_node_num_cells(node))) {
174 | -    cursor->end_of_table = true;
175 | +    /* Advance to next leaf node */
176 | +    uint32_t next_page_num = *leaf_node_next_leaf(node);
177 | +    if (next_page_num == 0) {
178 | +      /* This was rightmost leaf */
179 | +      cursor->end_of_table = true;
180 | +    } else {
181 | +      cursor->page_num = next_page_num;
182 | +      cursor->cell_num = 0;
183 | +    }
184 |    }
185 |  }
186 | ```
187 | 
188 | After those changes, we actually print 15 rows...
189 | ```
190 | db > select
191 | (1, user1, person1@example.com)
192 | (2, user2, person2@example.com)
193 | (3, user3, person3@example.com)
194 | (4, user4, person4@example.com)
195 | (5, user5, person5@example.com)
196 | (6, user6, person6@example.com)
197 | (7, user7, person7@example.com)
198 | (8, user8, person8@example.com)
199 | (9, user9, person9@example.com)
200 | (10, user10, person10@example.com)
201 | (11, user11, person11@example.com)
202 | (12, user12, person12@example.com)
203 | (13, user13, person13@example.com)
204 | (1919251317, 14, on14@example.com)
205 | (15, user15, person15@example.com)
206 | Executed.
207 | db >
208 | ```
209 | 
210 | ...but one of them looks corrupted
211 | ```
212 | (1919251317, 14, on14@example.com)
213 | ```
214 | 
215 | After some debugging, I found out it's because of a bug in how we split leaf nodes:
216 | 
217 | ```diff
218 | @@ -676,7 +690,9 @@ void leaf_node_split_and_insert(Cursor* cursor, uint32_t key, Row* value) {
219 |      void* destination = leaf_node_cell(destination_node, index_within_node);
220 |  
221 |      if (i == cursor->cell_num) {
222 | -      serialize_row(value, destination);
223 | +      serialize_row(value,
224 | +                    leaf_node_value(destination_node, index_within_node));
225 | +      *leaf_node_key(destination_node, index_within_node) = key;
226 |      } else if (i > cursor->cell_num) {
227 |        memcpy(destination, leaf_node_cell(old_node, i - 1), LEAF_NODE_CELL_SIZE);
228 |      } else {
229 | ```
230 | 
231 | Remember that each cell in a leaf node consists of first a key then a value:
232 | 
233 | {% include image.html url="assets/images/leaf-node-format.png" description="Original leaf node format" %}
234 | 
235 | We were writing the new row (value) into the start of the cell, where the key should go. That means part of the username was going into the section for id (hence the crazy large id).
236 | 
237 | After fixing that bug, we finally print out the entire table as expected:
238 | 
239 | ```
240 | db > select
241 | (1, user1, person1@example.com)
242 | (2, user2, person2@example.com)
243 | (3, user3, person3@example.com)
244 | (4, user4, person4@example.com)
245 | (5, user5, person5@example.com)
246 | (6, user6, person6@example.com)
247 | (7, user7, person7@example.com)
248 | (8, user8, person8@example.com)
249 | (9, user9, person9@example.com)
250 | (10, user10, person10@example.com)
251 | (11, user11, person11@example.com)
252 | (12, user12, person12@example.com)
253 | (13, user13, person13@example.com)
254 | (14, user14, person14@example.com)
255 | (15, user15, person15@example.com)
256 | Executed.
257 | db >
258 | ```
259 | 
260 | Whew! One bug after another, but we're making progress.
261 | 
262 | Until next time.
263 | 


--------------------------------------------------------------------------------
/_parts/part13.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Part 13 - Updating Parent Node After a Split
  3 | date: 2017-11-26
  4 | ---
  5 | 
  6 | For the next step on our epic b-tree implementation journey, we're going to handle fixing up the parent node after splitting a leaf. I'm going to use the following example as a reference:
  7 | 
  8 | {% include image.html url="assets/images/updating-internal-node.png" description="Example of updating internal node" %}
  9 | 
 10 | In this example, we add the key "3" to the tree. That causes the left leaf node to split. After the split we fix up the tree by doing the following:
 11 | 
 12 | 1. Update the first key in the parent to be the maximum key in the left child ("3")
 13 | 2. Add a new child pointer / key pair after the updated key
 14 |   - The new pointer points to the new child node
 15 |   - The new key is the maximum key in the new child node ("5")
 16 | 
 17 | So first things first, replace our stub code with two new function calls: `update_internal_node_key()` for step 1 and `internal_node_insert()` for step 2
 18 | 
 19 | 
 20 | ```diff
 21 | @@ -670,9 +725,11 @@ void leaf_node_split_and_insert(Cursor* cursor, uint32_t key, Row* value) {
 22 |    */
 23 |  
 24 |    void* old_node = get_page(cursor->table->pager, cursor->page_num);
 25 | +  uint32_t old_max = get_node_max_key(old_node);
 26 |    uint32_t new_page_num = get_unused_page_num(cursor->table->pager);
 27 |    void* new_node = get_page(cursor->table->pager, new_page_num);
 28 |    initialize_leaf_node(new_node);
 29 | +  *node_parent(new_node) = *node_parent(old_node);
 30 |    *leaf_node_next_leaf(new_node) = *leaf_node_next_leaf(old_node);
 31 |    *leaf_node_next_leaf(old_node) = new_page_num;
 32 |  
 33 | @@ -709,8 +766,12 @@ void leaf_node_split_and_insert(Cursor* cursor, uint32_t key, Row* value) {
 34 |    if (is_node_root(old_node)) {
 35 |      return create_new_root(cursor->table, new_page_num);
 36 |    } else {
 37 | -    printf("Need to implement updating parent after split\n");
 38 | -    exit(EXIT_FAILURE);
 39 | +    uint32_t parent_page_num = *node_parent(old_node);
 40 | +    uint32_t new_max = get_node_max_key(old_node);
 41 | +    void* parent = get_page(cursor->table->pager, parent_page_num);
 42 | +
 43 | +    update_internal_node_key(parent, old_max, new_max);
 44 | +    internal_node_insert(cursor->table, parent_page_num, new_page_num);
 45 | +    return;
 46 |    }
 47 |  }
 48 | ```
 49 | 
 50 | In order to get a reference to the parent, we need to start recording in each node a pointer to its parent node.
 51 | 
 52 | ```diff
 53 | +uint32_t* node_parent(void* node) { return node + PARENT_POINTER_OFFSET; }
 54 | ```
 55 | ```diff
 56 | @@ -660,6 +675,48 @@ void create_new_root(Table* table, uint32_t right_child_page_num) {
 57 |    uint32_t left_child_max_key = get_node_max_key(left_child);
 58 |    *internal_node_key(root, 0) = left_child_max_key;
 59 |    *internal_node_right_child(root) = right_child_page_num;
 60 | +  *node_parent(left_child) = table->root_page_num;
 61 | +  *node_parent(right_child) = table->root_page_num;
 62 |  }
 63 | ```
 64 | 
 65 | Now we need to find the affected cell in the parent node. The child doesn't know its own page number, so we can't look for that. But it does know its own maximum key, so we can search the parent for that key.
 66 | 
 67 | ```diff
 68 | +void update_internal_node_key(void* node, uint32_t old_key, uint32_t new_key) {
 69 | +  uint32_t old_child_index = internal_node_find_child(node, old_key);
 70 | +  *internal_node_key(node, old_child_index) = new_key;
 71 |  }
 72 | ```
 73 | 
 74 | Inside `internal_node_find_child()` we'll reuse some code we already have for finding a key in an internal node. Refactor `internal_node_find()` to use the new helper method.
 75 | 
 76 | ```diff
 77 | -Cursor* internal_node_find(Table* table, uint32_t page_num, uint32_t key) {
 78 | -  void* node = get_page(table->pager, page_num);
 79 | +uint32_t internal_node_find_child(void* node, uint32_t key) {
 80 | +  /*
 81 | +  Return the index of the child which should contain
 82 | +  the given key.
 83 | +  */
 84 | +
 85 |    uint32_t num_keys = *internal_node_num_keys(node);
 86 |  
 87 | -  /* Binary search to find index of child to search */
 88 | +  /* Binary search */
 89 |    uint32_t min_index = 0;
 90 |    uint32_t max_index = num_keys; /* there is one more child than key */
 91 |  
 92 | @@ -386,7 +394,14 @@ Cursor* internal_node_find(Table* table, uint32_t page_num, uint32_t key) {
 93 |      }
 94 |    }
 95 |  
 96 | -  uint32_t child_num = *internal_node_child(node, min_index);
 97 | +  return min_index;
 98 | +}
 99 | +
100 | +Cursor* internal_node_find(Table* table, uint32_t page_num, uint32_t key) {
101 | +  void* node = get_page(table->pager, page_num);
102 | +
103 | +  uint32_t child_index = internal_node_find_child(node, key);
104 | +  uint32_t child_num = *internal_node_child(node, child_index);
105 |    void* child = get_page(table->pager, child_num);
106 |    switch (get_node_type(child)) {
107 |      case NODE_LEAF:
108 | ```
109 | 
110 | Now we get to the heart of this article, implementing `internal_node_insert()`. I'll explain it in pieces.
111 | 
112 | ```diff
113 | +void internal_node_insert(Table* table, uint32_t parent_page_num,
114 | +                          uint32_t child_page_num) {
115 | +  /*
116 | +  Add a new child/key pair to parent that corresponds to child
117 | +  */
118 | +
119 | +  void* parent = get_page(table->pager, parent_page_num);
120 | +  void* child = get_page(table->pager, child_page_num);
121 | +  uint32_t child_max_key = get_node_max_key(child);
122 | +  uint32_t index = internal_node_find_child(parent, child_max_key);
123 | +
124 | +  uint32_t original_num_keys = *internal_node_num_keys(parent);
125 | +  *internal_node_num_keys(parent) = original_num_keys + 1;
126 | +
127 | +  if (original_num_keys >= INTERNAL_NODE_MAX_CELLS) {
128 | +    printf("Need to implement splitting internal node\n");
129 | +    exit(EXIT_FAILURE);
130 | +  }
131 | ```
132 | 
133 | The index where the new cell (child/key pair) should be inserted depends on the maximum key in the new child. In the example we looked at, `child_max_key` would be 5 and `index` would be 1.
134 | 
135 | If there's no room in the internal node for another cell, throw an error. We'll implement that later.
136 | 
137 | Now let's look at the rest of the function:
138 | 
139 | ```diff
140 | +
141 | +  uint32_t right_child_page_num = *internal_node_right_child(parent);
142 | +  void* right_child = get_page(table->pager, right_child_page_num);
143 | +
144 | +  if (child_max_key > get_node_max_key(right_child)) {
145 | +    /* Replace right child */
146 | +    *internal_node_child(parent, original_num_keys) = right_child_page_num;
147 | +    *internal_node_key(parent, original_num_keys) =
148 | +        get_node_max_key(right_child);
149 | +    *internal_node_right_child(parent) = child_page_num;
150 | +  } else {
151 | +    /* Make room for the new cell */
152 | +    for (uint32_t i = original_num_keys; i > index; i--) {
153 | +      void* destination = internal_node_cell(parent, i);
154 | +      void* source = internal_node_cell(parent, i - 1);
155 | +      memcpy(destination, source, INTERNAL_NODE_CELL_SIZE);
156 | +    }
157 | +    *internal_node_child(parent, index) = child_page_num;
158 | +    *internal_node_key(parent, index) = child_max_key;
159 | +  }
160 | +}
161 | ```
162 | 
163 | Because we store the rightmost child pointer separately from the rest of the child/key pairs, we have to handle things differently if the new child is going to become the rightmost child.
164 | 
165 | In our example, we would get into the `else` block. First we make room for the new cell by shifting other cells one space to the right. (Although in our example there are 0 cells to shift)
166 | 
167 | Next, we write the new child pointer and key into the cell determined by `index`.
168 | 
169 | To reduce the size of testcases needed, I'm hardcoding `INTERNAL_NODE_MAX_CELLS` for now
170 | 
171 | ```diff
172 | @@ -126,6 +126,8 @@ const uint32_t INTERNAL_NODE_KEY_SIZE = sizeof(uint32_t);
173 |  const uint32_t INTERNAL_NODE_CHILD_SIZE = sizeof(uint32_t);
174 |  const uint32_t INTERNAL_NODE_CELL_SIZE =
175 |      INTERNAL_NODE_CHILD_SIZE + INTERNAL_NODE_KEY_SIZE;
176 | +/* Keep this small for testing */
177 | +const uint32_t INTERNAL_NODE_MAX_CELLS = 3;
178 | ```
179 | 
180 | Speaking of tests, our large-dataset test gets past our old stub and gets to our new one:
181 | 
182 | ```diff
183 | @@ -65,7 +65,7 @@ describe 'database' do
184 |      result = run_script(script)
185 |      expect(result.last(2)).to match_array([
186 |        "db > Executed.",
187 | -      "db > Need to implement updating parent after split",
188 | +      "db > Need to implement splitting internal node",
189 |      ])
190 | ```
191 | 
192 | Very satisfying, I know.
193 | 
194 | I'll add another test that prints a four-node tree. Just so we test more cases than sequential ids, this test will add records in a pseudorandom order.
195 | 
196 | ```diff
197 | +  it 'allows printing out the structure of a 4-leaf-node btree' do
198 | +    script = [
199 | +      "insert 18 user18 person18@example.com",
200 | +      "insert 7 user7 person7@example.com",
201 | +      "insert 10 user10 person10@example.com",
202 | +      "insert 29 user29 person29@example.com",
203 | +      "insert 23 user23 person23@example.com",
204 | +      "insert 4 user4 person4@example.com",
205 | +      "insert 14 user14 person14@example.com",
206 | +      "insert 30 user30 person30@example.com",
207 | +      "insert 15 user15 person15@example.com",
208 | +      "insert 26 user26 person26@example.com",
209 | +      "insert 22 user22 person22@example.com",
210 | +      "insert 19 user19 person19@example.com",
211 | +      "insert 2 user2 person2@example.com",
212 | +      "insert 1 user1 person1@example.com",
213 | +      "insert 21 user21 person21@example.com",
214 | +      "insert 11 user11 person11@example.com",
215 | +      "insert 6 user6 person6@example.com",
216 | +      "insert 20 user20 person20@example.com",
217 | +      "insert 5 user5 person5@example.com",
218 | +      "insert 8 user8 person8@example.com",
219 | +      "insert 9 user9 person9@example.com",
220 | +      "insert 3 user3 person3@example.com",
221 | +      "insert 12 user12 person12@example.com",
222 | +      "insert 27 user27 person27@example.com",
223 | +      "insert 17 user17 person17@example.com",
224 | +      "insert 16 user16 person16@example.com",
225 | +      "insert 13 user13 person13@example.com",
226 | +      "insert 24 user24 person24@example.com",
227 | +      "insert 25 user25 person25@example.com",
228 | +      "insert 28 user28 person28@example.com",
229 | +      ".btree",
230 | +      ".exit",
231 | +    ]
232 | +    result = run_script(script)
233 | ```
234 | 
235 | As-is, it will output this:
236 | 
237 | ```
238 | - internal (size 3)
239 |   - leaf (size 7)
240 |     - 1
241 |     - 2
242 |     - 3
243 |     - 4
244 |     - 5
245 |     - 6
246 |     - 7
247 |   - key 1
248 |   - leaf (size 8)
249 |     - 8
250 |     - 9
251 |     - 10
252 |     - 11
253 |     - 12
254 |     - 13
255 |     - 14
256 |     - 15
257 |   - key 15
258 |   - leaf (size 7)
259 |     - 16
260 |     - 17
261 |     - 18
262 |     - 19
263 |     - 20
264 |     - 21
265 |     - 22
266 |   - key 22
267 |   - leaf (size 8)
268 |     - 23
269 |     - 24
270 |     - 25
271 |     - 26
272 |     - 27
273 |     - 28
274 |     - 29
275 |     - 30
276 | db >
277 | ```
278 | 
279 | Look carefully and you'll spot a bug:
280 | ```
281 |     - 5
282 |     - 6
283 |     - 7
284 |   - key 1
285 | ```
286 | 
287 | The key there should be 7, not 1!
288 | 
289 | After a bunch of debugging, I discovered this was due to some bad pointer arithmetic.
290 | 
291 | ```diff
292 |  uint32_t* internal_node_key(void* node, uint32_t key_num) {
293 | -  return internal_node_cell(node, key_num) + INTERNAL_NODE_CHILD_SIZE;
294 | +  return (void*)internal_node_cell(node, key_num) + INTERNAL_NODE_CHILD_SIZE;
295 |  }
296 | ```
297 | 
298 | `INTERNAL_NODE_CHILD_SIZE` is 4. My intention here was to add 4 bytes to the result of `internal_node_cell()`, but since `internal_node_cell()` returns a `uint32_t*`, this it was actually adding `4 * sizeof(uint32_t)` bytes. I fixed it by casting to a `void*` before doing the arithmetic.
299 | 
300 | NOTE! [Pointer arithmetic on void pointers is not part of the C standard and may not work with your compiler](https://stackoverflow.com/questions/3523145/pointer-arithmetic-for-void-pointer-in-c/46238658#46238658). I may do an article in the future on portability, but I'm leaving my void pointer arithmetic for now.
301 | 
302 | Alright. One more step toward a fully-operational btree implementation. The next step should be splitting internal nodes. Until then!
303 | 


--------------------------------------------------------------------------------
/_parts/part14.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Part 14 - Splitting Internal Nodes
  3 | date: 2023-05-23
  4 | ---
  5 | 
  6 | The next leg of our journey will be splitting internal nodes which are unable to accommodate new keys. Consider the example below:
  7 | 
  8 | {% include image.html url="assets/images/splitting-internal-node.png" description="Example of splitting an internal" %}
  9 | 
 10 | In this example, we add the key "11" to the tree. This will cause our root to split. When splitting an internal node, we will have to do a few things in order to keep everything straight:
 11 | 
 12 | 1. Create a sibling node to store (n-1)/2 of the original node's keys
 13 | 2. Move these keys from the original node to the sibling node
 14 | 3. Update the original node's key in the parent to reflect its new max key after splitting
 15 | 4. Insert the sibling node into the parent (could result in the parent also being split)
 16 | 
 17 | We will begin by replacing our stub code with the call to `internal_node_split_and_insert`
 18 | 
 19 | ```diff
 20 | +void internal_node_split_and_insert(Table* table, uint32_t parent_page_num,
 21 | +                          uint32_t child_page_num);
 22 | +
 23 |  void internal_node_insert(Table* table, uint32_t parent_page_num,
 24 |                            uint32_t child_page_num) {
 25 |    /*
 26 | @@ -685,25 +714,39 @@ void internal_node_insert(Table* table, uint32_t parent_page_num,
 27 |  
 28 |    void* parent = get_page(table->pager, parent_page_num);
 29 |    void* child = get_page(table->pager, child_page_num);
 30 | -  uint32_t child_max_key = get_node_max_key(child);
 31 | +  uint32_t child_max_key = get_node_max_key(table->pager, child);
 32 |    uint32_t index = internal_node_find_child(parent, child_max_key);
 33 |  
 34 |    uint32_t original_num_keys = *internal_node_num_keys(parent);
 35 | -  *internal_node_num_keys(parent) = original_num_keys + 1;
 36 |  
 37 |    if (original_num_keys >= INTERNAL_NODE_MAX_CELLS) {
 38 | -    printf("Need to implement splitting internal node\n");
 39 | -    exit(EXIT_FAILURE);
 40 | +    internal_node_split_and_insert(table, parent_page_num, child_page_num);
 41 | +    return;
 42 |    }
 43 |  
 44 |    uint32_t right_child_page_num = *internal_node_right_child(parent);
 45 | +  /*
 46 | +  An internal node with a right child of INVALID_PAGE_NUM is empty
 47 | +  */
 48 | +  if (right_child_page_num == INVALID_PAGE_NUM) {
 49 | +    *internal_node_right_child(parent) = child_page_num;
 50 | +    return;
 51 | +  }
 52 | +
 53 |    void* right_child = get_page(table->pager, right_child_page_num);
 54 | +  /*
 55 | +  If we are already at the max number of cells for a node, we cannot increment
 56 | +  before splitting. Incrementing without inserting a new key/child pair
 57 | +  and immediately calling internal_node_split_and_insert has the effect
 58 | +  of creating a new key at (max_cells + 1) with an uninitialized value
 59 | +  */
 60 | +  *internal_node_num_keys(parent) = original_num_keys + 1;
 61 |  
 62 | -  if (child_max_key > get_node_max_key(right_child)) {
 63 | +  if (child_max_key > get_node_max_key(table->pager, right_child)) {
 64 |      /* Replace right child */
 65 |      *internal_node_child(parent, original_num_keys) = right_child_page_num;
 66 |      *internal_node_key(parent, original_num_keys) =
 67 | -        get_node_max_key(right_child);
 68 | +        get_node_max_key(table->pager, right_child);
 69 |      *internal_node_right_child(parent) = child_page_num;
 70 | ```
 71 | 
 72 | There are three important changes we are making here aside from replacing the stub:
 73 |  - First, `internal_node_split_and_insert` is forward-declared because we will be calling `internal_node_insert` in its definition to avoid code duplication.
 74 |  - In addition, we are moving the logic which increments the parent's number of keys further down in the function definition to ensure that this does not happen before the split.
 75 |  - Finally, we are ensuring that a child node inserted into an empty internal node will become that internal node's right child without any other operations being performed, since an empty internal node has no keys to manipulate. 
 76 | 
 77 | The changes above require that we be able to identify an empty node - to this end, we will first define a constant which represents an invalid page number that is the child of every empty node.
 78 | 
 79 | ```diff
 80 | +#define INVALID_PAGE_NUM UINT32_MAX
 81 | ```
 82 | Now, when an internal node is initialized, we initialize its right child with this invalid page number.
 83 | 
 84 | ```diff
 85 | @@ -330,6 +335,12 @@ void initialize_internal_node(void* node) {
 86 |    set_node_type(node, NODE_INTERNAL);
 87 |    set_node_root(node, false);
 88 |    *internal_node_num_keys(node) = 0;
 89 | +  /*
 90 | +  Necessary because the root page number is 0; by not initializing an internal 
 91 | +  node's right child to an invalid page number when initializing the node, we may
 92 | +  end up with 0 as the node's right child, which makes the node a parent of the root
 93 | +  */
 94 | +  *internal_node_right_child(node) = INVALID_PAGE_NUM;
 95 |  }
 96 | ```
 97 | 
 98 | This step was made necessary by a problem that the comment above attempts to summarize - when initializing an internal node without explicitly initializing the right child field, the value of that field at runtime could be 0 depending on the compiler or the architecture of the machine on which the program is being executed. Since we are using 0 as our root page number, this means that a newly allocated internal node will be a parent of the root.
 99 | 
100 | We have introduced some guards in our `internal_node_child` function to throw an error in the case of an attempt to access an invalid page.
101 | 
102 | ```diff
103 | @@ -186,9 +188,19 @@ uint32_t* internal_node_child(void* node, uint32_t child_num) {
104 |      printf("Tried to access child_num %d > num_keys %d\n", child_num, num_keys);
105 |      exit(EXIT_FAILURE);
106 |    } else if (child_num == num_keys) {
107 | -    return internal_node_right_child(node);
108 | +    uint32_t* right_child = internal_node_right_child(node);
109 | +    if (*right_child == INVALID_PAGE_NUM) {
110 | +      printf("Tried to access right child of node, but was invalid page\n");
111 | +      exit(EXIT_FAILURE);
112 | +    }
113 | +    return right_child;
114 |    } else {
115 | -    return internal_node_cell(node, child_num);
116 | +    uint32_t* child = internal_node_cell(node, child_num);
117 | +    if (*child == INVALID_PAGE_NUM) {
118 | +      printf("Tried to access child %d of node, but was invalid page\n", child_num);
119 | +      exit(EXIT_FAILURE);
120 | +    }
121 | +    return child;
122 |    }
123 |  }
124 | ```
125 | 
126 | One additional guard is needed in our `print_tree` function to ensure that we do not attempt to print an empty node, as that would involve trying to access an invalid page.
127 | 
128 | ```diff
129 | @@ -294,15 +305,17 @@ void print_tree(Pager* pager, uint32_t page_num, uint32_t indentation_level) {
130 |        num_keys = *internal_node_num_keys(node);
131 |        indent(indentation_level);
132 |        printf("- internal (size %d)\n", num_keys);
133 | -      for (uint32_t i = 0; i < num_keys; i++) {
134 | -        child = *internal_node_child(node, i);
135 | +      if (num_keys > 0) {
136 | +        for (uint32_t i = 0; i < num_keys; i++) {
137 | +          child = *internal_node_child(node, i);
138 | +          print_tree(pager, child, indentation_level + 1);
139 | +
140 | +          indent(indentation_level + 1);
141 | +          printf("- key %d\n", *internal_node_key(node, i));
142 | +        }
143 | +        child = *internal_node_right_child(node);
144 |          print_tree(pager, child, indentation_level + 1);
145 | -
146 | -        indent(indentation_level + 1);
147 | -        printf("- key %d\n", *internal_node_key(node, i));
148 |        }
149 | -      child = *internal_node_right_child(node);
150 | -      print_tree(pager, child, indentation_level + 1);
151 |        break;
152 |    }
153 |  }
154 | ```
155 | 
156 | Now for the headliner, `internal_node_split_and_insert`. We will first provide it in its entirety, and then break it down by steps.
157 | 
158 | ```diff
159 | +void internal_node_split_and_insert(Table* table, uint32_t parent_page_num,
160 | +                          uint32_t child_page_num) {
161 | +  uint32_t old_page_num = parent_page_num;
162 | +  void* old_node = get_page(table->pager,parent_page_num);
163 | +  uint32_t old_max = get_node_max_key(table->pager, old_node);
164 | +
165 | +  void* child = get_page(table->pager, child_page_num); 
166 | +  uint32_t child_max = get_node_max_key(table->pager, child);
167 | +
168 | +  uint32_t new_page_num = get_unused_page_num(table->pager);
169 | +
170 | +  /*
171 | +  Declaring a flag before updating pointers which
172 | +  records whether this operation involves splitting the root -
173 | +  if it does, we will insert our newly created node during
174 | +  the step where the table's new root is created. If it does
175 | +  not, we have to insert the newly created node into its parent
176 | +  after the old node's keys have been transferred over. We are not
177 | +  able to do this if the newly created node's parent is not a newly
178 | +  initialized root node, because in that case its parent may have existing
179 | +  keys aside from our old node which we are splitting. If that is true, we
180 | +  need to find a place for our newly created node in its parent, and we
181 | +  cannot insert it at the correct index if it does not yet have any keys
182 | +  */
183 | +  uint32_t splitting_root = is_node_root(old_node);
184 | +
185 | +  void* parent;
186 | +  void* new_node;
187 | +  if (splitting_root) {
188 | +    create_new_root(table, new_page_num);
189 | +    parent = get_page(table->pager,table->root_page_num);
190 | +    /*
191 | +    If we are splitting the root, we need to update old_node to point
192 | +    to the new root's left child, new_page_num will already point to
193 | +    the new root's right child
194 | +    */
195 | +    old_page_num = *internal_node_child(parent,0);
196 | +    old_node = get_page(table->pager, old_page_num);
197 | +  } else {
198 | +    parent = get_page(table->pager,*node_parent(old_node));
199 | +    new_node = get_page(table->pager, new_page_num);
200 | +    initialize_internal_node(new_node);
201 | +  }
202 | +  
203 | +  uint32_t* old_num_keys = internal_node_num_keys(old_node);
204 | +
205 | +  uint32_t cur_page_num = *internal_node_right_child(old_node);
206 | +  void* cur = get_page(table->pager, cur_page_num);
207 | +
208 | +  /*
209 | +  First put right child into new node and set right child of old node to invalid page number
210 | +  */
211 | +  internal_node_insert(table, new_page_num, cur_page_num);
212 | +  *node_parent(cur) = new_page_num;
213 | +  *internal_node_right_child(old_node) = INVALID_PAGE_NUM;
214 | +  /*
215 | +  For each key until you get to the middle key, move the key and the child to the new node
216 | +  */
217 | +  for (int i = INTERNAL_NODE_MAX_CELLS - 1; i > INTERNAL_NODE_MAX_CELLS / 2; i--) {
218 | +    cur_page_num = *internal_node_child(old_node, i);
219 | +    cur = get_page(table->pager, cur_page_num);
220 | +
221 | +    internal_node_insert(table, new_page_num, cur_page_num);
222 | +    *node_parent(cur) = new_page_num;
223 | +
224 | +    (*old_num_keys)--;
225 | +  }
226 | +
227 | +  /*
228 | +  Set child before middle key, which is now the highest key, to be node's right child,
229 | +  and decrement number of keys
230 | +  */
231 | +  *internal_node_right_child(old_node) = *internal_node_child(old_node,*old_num_keys - 1);
232 | +  (*old_num_keys)--;
233 | +
234 | +  /*
235 | +  Determine which of the two nodes after the split should contain the child to be inserted,
236 | +  and insert the child
237 | +  */
238 | +  uint32_t max_after_split = get_node_max_key(table->pager, old_node);
239 | +
240 | +  uint32_t destination_page_num = child_max < max_after_split ? old_page_num : new_page_num;
241 | +
242 | +  internal_node_insert(table, destination_page_num, child_page_num);
243 | +  *node_parent(child) = destination_page_num;
244 | +
245 | +  update_internal_node_key(parent, old_max, get_node_max_key(table->pager, old_node));
246 | +
247 | +  if (!splitting_root) {
248 | +    internal_node_insert(table,*node_parent(old_node),new_page_num);
249 | +    *node_parent(new_node) = *node_parent(old_node);
250 | +  }
251 | +}
252 | +
253 | ```
254 | 
255 | The first thing we need to do is create a variable to store the page number of the node we are splitting (the old node from here out). This is necessary because the page number of the old node will change if it happens to be the table's root node. We also need to remember what the node's current max is, because that value represents its key in the parent, and that key will need to be updated with the old node's new maximum after the split occurs.
256 | 
257 | ```diff
258 | +  uint32_t old_page_num = parent_page_num;
259 | +  void* old_node = get_page(table->pager,parent_page_num);
260 | +  uint32_t old_max = get_node_max_key(table->pager, old_node);
261 | ```
262 | 
263 | The next important step is the branching logic which depends on whether the old node is the table's root node. We will need to keep track of this value for later use; as the comment attempts to convey, we run into a problem if we do not store this information at the beginning of our function definition - if we are not splitting the root, we cannot insert our newly created sibling node into the old node's parent right away, because it does not yet contain any keys and therefore will not be placed at the right index among the other key/child pairs which may or may not already be present in the parent node.
264 | 
265 | ```diff
266 | +  uint32_t splitting_root = is_node_root(old_node);
267 | +
268 | +  void* parent;
269 | +  void* new_node;
270 | +  if (splitting_root) {
271 | +    create_new_root(table, new_page_num);
272 | +    parent = get_page(table->pager,table->root_page_num);
273 | +    /*
274 | +    If we are splitting the root, we need to update old_node to point
275 | +    to the new root's left child, new_page_num will already point to
276 | +    the new root's right child
277 | +    */
278 | +    old_page_num = *internal_node_child(parent,0);
279 | +    old_node = get_page(table->pager, old_page_num);
280 | +  } else {
281 | +    parent = get_page(table->pager,*node_parent(old_node));
282 | +    new_node = get_page(table->pager, new_page_num);
283 | +    initialize_internal_node(new_node);
284 | +  }
285 | ```
286 | 
287 | Once we have settled the question of splitting or not splitting the root, we begin moving keys from the old node to its sibling. We must first move the old node's right child and set its right child field to an invalid page to indicate that it is empty. Now, we loop over the old node's remaining keys, performing the following steps on each iteration:
288 |  1. Obtain a reference to the old node's key and child at the current index
289 |  2. Insert the child into the sibling node
290 |  3. Update the child's parent value to point to the sibling node
291 |  4. Decrement the old node's number of keys
292 | 
293 | ```diff
294 | +  uint32_t* old_num_keys = internal_node_num_keys(old_node);
295 | +
296 | +  uint32_t cur_page_num = *internal_node_right_child(old_node);
297 | +  void* cur = get_page(table->pager, cur_page_num);
298 | +
299 | +  /*
300 | +  First put right child into new node and set right child of old node to invalid page number
301 | +  */
302 | +  internal_node_insert(table, new_page_num, cur_page_num);
303 | +  *node_parent(cur) = new_page_num;
304 | +  *internal_node_right_child(old_node) = INVALID_PAGE_NUM;
305 | +  /*
306 | +  For each key until you get to the middle key, move the key and the child to the new node
307 | +  */
308 | +  for (int i = INTERNAL_NODE_MAX_CELLS - 1; i > INTERNAL_NODE_MAX_CELLS / 2; i--) {
309 | +    cur_page_num = *internal_node_child(old_node, i);
310 | +    cur = get_page(table->pager, cur_page_num);
311 | +
312 | +    internal_node_insert(table, new_page_num, cur_page_num);
313 | +    *node_parent(cur) = new_page_num;
314 | +
315 | +    (*old_num_keys)--;
316 | +  }
317 | ```
318 | 
319 | Step 4 is important, because it serves the purpose of "erasing" the key/child pair from the old node. Although we are not actually freeing the memory at that byte offset in the old node's page, by decrementing the old node's number of keys we are making that memory location inaccessible, and the bytes will be overwritten the next time a child is inserted into the old node.
320 | 
321 | Also note the behavior of our loop invariant - if our maximum number of internal node keys changes in the future, our logic ensures that both our old node and our sibling node will end up with (n-1)/2 keys after the split, with the 1 remaining node going to the parent. If an even number is chosen as the maximum number of nodes, n/2 nodes will remain with the old node while (n-1)/2 will be moved to the sibling node. This logic would be straightforward to revise as needed.
322 | 
323 | Once the keys to be moved have been, we set the old node's i'th child as its right child and decrement its number of keys.
324 | 
325 | ```diff
326 | +  /*
327 | +  Set child before middle key, which is now the highest key, to be node's right child,
328 | +  and decrement number of keys
329 | +  */
330 | +  *internal_node_right_child(old_node) = *internal_node_child(old_node,*old_num_keys - 1);
331 | +  (*old_num_keys)--;
332 | ```
333 | 
334 | We then insert the child node into either the old node or the sibling node depending on the value of its max key.
335 | 
336 | ```diff
337 | +  uint32_t max_after_split = get_node_max_key(table->pager, old_node);
338 | +
339 | +  uint32_t destination_page_num = child_max < max_after_split ? old_page_num : new_page_num;
340 | +
341 | +  internal_node_insert(table, destination_page_num, child_page_num);
342 | +  *node_parent(child) = destination_page_num;
343 | ```
344 | 
345 | Finally, we update the old node's key in its parent, and insert the sibling node and update the sibling node's parent pointer if necessary.
346 | 
347 | ```diff
348 | +  update_internal_node_key(parent, old_max, get_node_max_key(table->pager, old_node));
349 | +
350 | +  if (!splitting_root) {
351 | +    internal_node_insert(table,*node_parent(old_node),new_page_num);
352 | +    *node_parent(new_node) = *node_parent(old_node);
353 | +  }
354 | ```
355 | 
356 | One important change required to support this new logic is in our `create_new_root` function. Before, we were only taking into account situations where the new root's children would be leaf nodes. If the new root's children are instead internal nodes, we need to do two things:
357 |  1. Correctly initialize the root's new children to be internal nodes
358 |  2. In addition to the call to memcpy, we need to insert each of the root's keys into its new left child and update the parent pointer of each of those children
359 | 
360 | ```diff
361 | @@ -661,22 +680,40 @@ void create_new_root(Table* table, uint32_t right_child_page_num) {
362 |    uint32_t left_child_page_num = get_unused_page_num(table->pager);
363 |    void* left_child = get_page(table->pager, left_child_page_num);
364 |  
365 | +  if (get_node_type(root) == NODE_INTERNAL) {
366 | +    initialize_internal_node(right_child);
367 | +    initialize_internal_node(left_child);
368 | +  }
369 | +
370 |    /* Left child has data copied from old root */
371 |    memcpy(left_child, root, PAGE_SIZE);
372 |    set_node_root(left_child, false);
373 |  
374 | +  if (get_node_type(left_child) == NODE_INTERNAL) {
375 | +    void* child;
376 | +    for (int i = 0; i < *internal_node_num_keys(left_child); i++) {
377 | +      child = get_page(table->pager, *internal_node_child(left_child,i));
378 | +      *node_parent(child) = left_child_page_num;
379 | +    }
380 | +    child = get_page(table->pager, *internal_node_right_child(left_child));
381 | +    *node_parent(child) = left_child_page_num;
382 | +  }
383 | +
384 |    /* Root node is a new internal node with one key and two children */
385 |    initialize_internal_node(root);
386 |    set_node_root(root, true);
387 |    *internal_node_num_keys(root) = 1;
388 |    *internal_node_child(root, 0) = left_child_page_num;
389 | -  uint32_t left_child_max_key = get_node_max_key(left_child);
390 | +  uint32_t left_child_max_key = get_node_max_key(table->pager, left_child);
391 |    *internal_node_key(root, 0) = left_child_max_key;
392 |    *internal_node_right_child(root) = right_child_page_num;
393 |    *node_parent(left_child) = table->root_page_num;
394 |    *node_parent(right_child) = table->root_page_num;
395 |  }
396 | ```
397 | 
398 | Another important change has been made to `get_node_max_key`, as mentioned at the beginning of this article. Since an internal node's key represents the maximum of the tree pointed to by the child to its left, and that child can be a tree of arbitrary depth, we need to walk down the right children of that tree until we get to a leaf node, and then take the maximum key of that leaf node.
399 | 
400 | ```diff
401 | +uint32_t get_node_max_key(Pager* pager, void* node) {
402 | +  if (get_node_type(node) == NODE_LEAF) {
403 | +    return *leaf_node_key(node, *leaf_node_num_cells(node) - 1);
404 | +  }
405 | +  void* right_child = get_page(pager,*internal_node_right_child(node));
406 | +  return get_node_max_key(pager, right_child);
407 | +}
408 | ```
409 | 
410 | We have written a single test to demonstrate that our `print_tree` function still works after the introduction of internal node splitting. 
411 | 
412 | ```diff
413 | +  it 'allows printing out the structure of a 7-leaf-node btree' do
414 | +    script = [
415 | +      "insert 58 user58 person58@example.com",
416 | +      "insert 56 user56 person56@example.com",
417 | +      "insert 8 user8 person8@example.com",
418 | +      "insert 54 user54 person54@example.com",
419 | +      "insert 77 user77 person77@example.com",
420 | +      "insert 7 user7 person7@example.com",
421 | +      "insert 25 user25 person25@example.com",
422 | +      "insert 71 user71 person71@example.com",
423 | +      "insert 13 user13 person13@example.com",
424 | +      "insert 22 user22 person22@example.com",
425 | +      "insert 53 user53 person53@example.com",
426 | +      "insert 51 user51 person51@example.com",
427 | +      "insert 59 user59 person59@example.com",
428 | +      "insert 32 user32 person32@example.com",
429 | +      "insert 36 user36 person36@example.com",
430 | +      "insert 79 user79 person79@example.com",
431 | +      "insert 10 user10 person10@example.com",
432 | +      "insert 33 user33 person33@example.com",
433 | +      "insert 20 user20 person20@example.com",
434 | +      "insert 4 user4 person4@example.com",
435 | +      "insert 35 user35 person35@example.com",
436 | +      "insert 76 user76 person76@example.com",
437 | +      "insert 49 user49 person49@example.com",
438 | +      "insert 24 user24 person24@example.com",
439 | +      "insert 70 user70 person70@example.com",
440 | +      "insert 48 user48 person48@example.com",
441 | +      "insert 39 user39 person39@example.com",
442 | +      "insert 15 user15 person15@example.com",
443 | +      "insert 47 user47 person47@example.com",
444 | +      "insert 30 user30 person30@example.com",
445 | +      "insert 86 user86 person86@example.com",
446 | +      "insert 31 user31 person31@example.com",
447 | +      "insert 68 user68 person68@example.com",
448 | +      "insert 37 user37 person37@example.com",
449 | +      "insert 66 user66 person66@example.com",
450 | +      "insert 63 user63 person63@example.com",
451 | +      "insert 40 user40 person40@example.com",
452 | +      "insert 78 user78 person78@example.com",
453 | +      "insert 19 user19 person19@example.com",
454 | +      "insert 46 user46 person46@example.com",
455 | +      "insert 14 user14 person14@example.com",
456 | +      "insert 81 user81 person81@example.com",
457 | +      "insert 72 user72 person72@example.com",
458 | +      "insert 6 user6 person6@example.com",
459 | +      "insert 50 user50 person50@example.com",
460 | +      "insert 85 user85 person85@example.com",
461 | +      "insert 67 user67 person67@example.com",
462 | +      "insert 2 user2 person2@example.com",
463 | +      "insert 55 user55 person55@example.com",
464 | +      "insert 69 user69 person69@example.com",
465 | +      "insert 5 user5 person5@example.com",
466 | +      "insert 65 user65 person65@example.com",
467 | +      "insert 52 user52 person52@example.com",
468 | +      "insert 1 user1 person1@example.com",
469 | +      "insert 29 user29 person29@example.com",
470 | +      "insert 9 user9 person9@example.com",
471 | +      "insert 43 user43 person43@example.com",
472 | +      "insert 75 user75 person75@example.com",
473 | +      "insert 21 user21 person21@example.com",
474 | +      "insert 82 user82 person82@example.com",
475 | +      "insert 12 user12 person12@example.com",
476 | +      "insert 18 user18 person18@example.com",
477 | +      "insert 60 user60 person60@example.com",
478 | +      "insert 44 user44 person44@example.com",
479 | +      ".btree",
480 | +      ".exit",
481 | +    ]
482 | +    result = run_script(script)
483 | +
484 | +    expect(result[64...(result.length)]).to match_array([
485 | +      "db > Tree:",
486 | +      "- internal (size 1)",
487 | +      "  - internal (size 2)",
488 | +      "    - leaf (size 7)",
489 | +      "      - 1",
490 | +      "      - 2",
491 | +      "      - 4",
492 | +      "      - 5",
493 | +      "      - 6",
494 | +      "      - 7",
495 | +      "      - 8",
496 | +      "    - key 8",
497 | +      "    - leaf (size 11)",
498 | +      "      - 9",
499 | +      "      - 10",
500 | +      "      - 12",
501 | +      "      - 13",
502 | +      "      - 14",
503 | +      "      - 15",
504 | +      "      - 18",
505 | +      "      - 19",
506 | +      "      - 20",
507 | +      "      - 21",
508 | +      "      - 22",
509 | +      "    - key 22",
510 | +      "    - leaf (size 8)",
511 | +      "      - 24",
512 | +      "      - 25",
513 | +      "      - 29",
514 | +      "      - 30",
515 | +      "      - 31",
516 | +      "      - 32",
517 | +      "      - 33",
518 | +      "      - 35",
519 | +      "  - key 35",
520 | +      "  - internal (size 3)",
521 | +      "    - leaf (size 12)",
522 | +      "      - 36",
523 | +      "      - 37",
524 | +      "      - 39",
525 | +      "      - 40",
526 | +      "      - 43",
527 | +      "      - 44",
528 | +      "      - 46",
529 | +      "      - 47",
530 | +      "      - 48",
531 | +      "      - 49",
532 | +      "      - 50",
533 | +      "      - 51",
534 | +      "    - key 51",
535 | +      "    - leaf (size 11)",
536 | +      "      - 52",
537 | +      "      - 53",
538 | +      "      - 54",
539 | +      "      - 55",
540 | +      "      - 56",
541 | +      "      - 58",
542 | +      "      - 59",
543 | +      "      - 60",
544 | +      "      - 63",
545 | +      "      - 65",
546 | +      "      - 66",
547 | +      "    - key 66",
548 | +      "    - leaf (size 7)",
549 | +      "      - 67",
550 | +      "      - 68",
551 | +      "      - 69",
552 | +      "      - 70",
553 | +      "      - 71",
554 | +      "      - 72",
555 | +      "      - 75",
556 | +      "    - key 75",
557 | +      "    - leaf (size 8)",
558 | +      "      - 76",
559 | +      "      - 77",
560 | +      "      - 78",
561 | +      "      - 79",
562 | +      "      - 81",
563 | +      "      - 82",
564 | +      "      - 85",
565 | +      "      - 86",
566 | +      "db > ",
567 | +    ])
568 | +  end
569 | ```
570 | 


--------------------------------------------------------------------------------
/_parts/part15.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Part 15 - Where to go next
 3 | date: 2024-03-04
 4 | ---
 5 | 
 6 | This project is no longer under active development.
 7 | 
 8 | But if you'd like to keep learning how to make your own SQLite clone from scratch, or one of many other projects like Docker, Redis, Git or BitTorrent, try <a href="https://app.codecrafters.io/join?via=cstack"><b>CodeCrafters</b></a>.
 9 | 
10 | CodeCrafters maintains a <a href="https://github.com/codecrafters-io/build-your-own-x?tab=readme-ov-file#build-your-own-docker">pretty comprehensive list of "Build your own X" tutorials</a> including "Build your own Database".
11 | 
12 | Plus, if your company has a learning and development budget, you can use it to pay for CodeCrafter's paid service:
13 | 
14 | <a href="https://app.codecrafters.io/join?via=cstack"><img src="{{site.url}}{{site.baseurl}}/assets/images/code-crafters.jpeg" alt="{{ include.description }}"/></a>
15 | 
16 | If you use my referral link, I get a commision.


--------------------------------------------------------------------------------
/_parts/part2.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Part 2 - World's Simplest SQL Compiler and Virtual Machine
  3 | date: 2017-08-31
  4 | ---
  5 | 
  6 | We're making a clone of sqlite. The "front-end" of sqlite is a SQL compiler that parses a string and outputs an internal representation called bytecode.
  7 | 
  8 | This bytecode is passed to the virtual machine, which executes it.
  9 | 
 10 | {% include image.html url="assets/images/arch2.gif" description="SQLite Architecture (https://www.sqlite.org/arch.html)" %}
 11 | 
 12 | Breaking things into two steps like this has a couple advantages:
 13 | - Reduces the complexity of each part (e.g. virtual machine does not worry about syntax errors)
 14 | - Allows compiling common queries once and caching the bytecode for improved performance
 15 | 
 16 | With this in mind, let's refactor our `main` function and support two new keywords in the process:
 17 | 
 18 | ```diff
 19 |  int main(int argc, char* argv[]) {
 20 |    InputBuffer* input_buffer = new_input_buffer();
 21 |    while (true) {
 22 |      print_prompt();
 23 |      read_input(input_buffer);
 24 | 
 25 | -    if (strcmp(input_buffer->buffer, ".exit") == 0) {
 26 | -      exit(EXIT_SUCCESS);
 27 | -    } else {
 28 | -      printf("Unrecognized command '%s'.\n", input_buffer->buffer);
 29 | +    if (input_buffer->buffer[0] == '.') {
 30 | +      switch (do_meta_command(input_buffer)) {
 31 | +        case (META_COMMAND_SUCCESS):
 32 | +          continue;
 33 | +        case (META_COMMAND_UNRECOGNIZED_COMMAND):
 34 | +          printf("Unrecognized command '%s'\n", input_buffer->buffer);
 35 | +          continue;
 36 | +      }
 37 |      }
 38 | +
 39 | +    Statement statement;
 40 | +    switch (prepare_statement(input_buffer, &statement)) {
 41 | +      case (PREPARE_SUCCESS):
 42 | +        break;
 43 | +      case (PREPARE_UNRECOGNIZED_STATEMENT):
 44 | +        printf("Unrecognized keyword at start of '%s'.\n",
 45 | +               input_buffer->buffer);
 46 | +        continue;
 47 | +    }
 48 | +
 49 | +    execute_statement(&statement);
 50 | +    printf("Executed.\n");
 51 |    }
 52 |  }
 53 | ```
 54 | 
 55 | Non-SQL statements like `.exit` are called "meta-commands". They all start with a dot, so we check for them and handle them in a separate function.
 56 | 
 57 | Next, we add a step that converts the line of input into our internal representation of a statement. This is our hacky version of the sqlite front-end.
 58 | 
 59 | Lastly, we pass the prepared statement to `execute_statement`. This function will eventually become our virtual machine.
 60 | 
 61 | Notice that two of our new functions return enums indicating success or failure:
 62 | 
 63 | ```c
 64 | typedef enum {
 65 |   META_COMMAND_SUCCESS,
 66 |   META_COMMAND_UNRECOGNIZED_COMMAND
 67 | } MetaCommandResult;
 68 | 
 69 | typedef enum { PREPARE_SUCCESS, PREPARE_UNRECOGNIZED_STATEMENT } PrepareResult;
 70 | ```
 71 | 
 72 | "Unrecognized statement"? That seems a bit like an exception. I prefer not to use exceptions (and C doesn't even support them), so I'm using enum result codes wherever practical. The C compiler will complain if my switch statement doesn't handle a member of the enum, so we can feel a little more confident we handle every result of a function. Expect more result codes to be added in the future.
 73 | 
 74 | `do_meta_command` is just a wrapper for existing functionality that leaves room for more commands:
 75 | 
 76 | ```c
 77 | MetaCommandResult do_meta_command(InputBuffer* input_buffer) {
 78 |   if (strcmp(input_buffer->buffer, ".exit") == 0) {
 79 |     exit(EXIT_SUCCESS);
 80 |   } else {
 81 |     return META_COMMAND_UNRECOGNIZED_COMMAND;
 82 |   }
 83 | }
 84 | ```
 85 | 
 86 | Our "prepared statement" right now just contains an enum with two possible values. It will contain more data as we allow parameters in statements:
 87 | 
 88 | ```c
 89 | typedef enum { STATEMENT_INSERT, STATEMENT_SELECT } StatementType;
 90 | 
 91 | typedef struct {
 92 |   StatementType type;
 93 | } Statement;
 94 | ```
 95 | 
 96 | `prepare_statement` (our "SQL Compiler") does not understand SQL right now. In fact, it only understands two words:
 97 | ```c
 98 | PrepareResult prepare_statement(InputBuffer* input_buffer,
 99 |                                 Statement* statement) {
100 |   if (strncmp(input_buffer->buffer, "insert", 6) == 0) {
101 |     statement->type = STATEMENT_INSERT;
102 |     return PREPARE_SUCCESS;
103 |   }
104 |   if (strcmp(input_buffer->buffer, "select") == 0) {
105 |     statement->type = STATEMENT_SELECT;
106 |     return PREPARE_SUCCESS;
107 |   }
108 | 
109 |   return PREPARE_UNRECOGNIZED_STATEMENT;
110 | }
111 | ```
112 | 
113 | Note that we use `strncmp` for "insert" since the "insert" keyword will be followed by data. (e.g. `insert 1 cstack foo@bar.com`)
114 | 
115 | Lastly, `execute_statement` contains a few stubs:
116 | ```c
117 | void execute_statement(Statement* statement) {
118 |   switch (statement->type) {
119 |     case (STATEMENT_INSERT):
120 |       printf("This is where we would do an insert.\n");
121 |       break;
122 |     case (STATEMENT_SELECT):
123 |       printf("This is where we would do a select.\n");
124 |       break;
125 |   }
126 | }
127 | ```
128 | 
129 | Note that it doesn't return any error codes because there's nothing that could go wrong yet.
130 | 
131 | With these refactors, we now recognize two new keywords!
132 | ```command-line
133 | ~ ./db
134 | db > insert foo bar
135 | This is where we would do an insert.
136 | Executed.
137 | db > delete foo
138 | Unrecognized keyword at start of 'delete foo'.
139 | db > select
140 | This is where we would do a select.
141 | Executed.
142 | db > .tables
143 | Unrecognized command '.tables'
144 | db > .exit
145 | ~
146 | ```
147 | 
148 | The skeleton of our database is taking shape... wouldn't it be nice if it stored data? In the next part, we'll implement `insert` and `select`, creating the world's worst data store. In the mean time, here's the entire diff from this part:
149 | 
150 | ```diff
151 | @@ -10,6 +10,23 @@ struct InputBuffer_t {
152 |  } InputBuffer;
153 |  
154 | +typedef enum {
155 | +  META_COMMAND_SUCCESS,
156 | +  META_COMMAND_UNRECOGNIZED_COMMAND
157 | +} MetaCommandResult;
158 | +
159 | +typedef enum { PREPARE_SUCCESS, PREPARE_UNRECOGNIZED_STATEMENT } PrepareResult;
160 | +
161 | +typedef enum { STATEMENT_INSERT, STATEMENT_SELECT } StatementType;
162 | +
163 | +typedef struct {
164 | +  StatementType type;
165 | +} Statement;
166 | +
167 |  InputBuffer* new_input_buffer() {
168 |    InputBuffer* input_buffer = malloc(sizeof(InputBuffer));
169 |    input_buffer->buffer = NULL;
170 | @@ -40,17 +57,67 @@ void close_input_buffer(InputBuffer* input_buffer) {
171 |      free(input_buffer);
172 |  }
173 |  
174 | +MetaCommandResult do_meta_command(InputBuffer* input_buffer) {
175 | +  if (strcmp(input_buffer->buffer, ".exit") == 0) {
176 | +    close_input_buffer(input_buffer);
177 | +    exit(EXIT_SUCCESS);
178 | +  } else {
179 | +    return META_COMMAND_UNRECOGNIZED_COMMAND;
180 | +  }
181 | +}
182 | +
183 | +PrepareResult prepare_statement(InputBuffer* input_buffer,
184 | +                                Statement* statement) {
185 | +  if (strncmp(input_buffer->buffer, "insert", 6) == 0) {
186 | +    statement->type = STATEMENT_INSERT;
187 | +    return PREPARE_SUCCESS;
188 | +  }
189 | +  if (strcmp(input_buffer->buffer, "select") == 0) {
190 | +    statement->type = STATEMENT_SELECT;
191 | +    return PREPARE_SUCCESS;
192 | +  }
193 | +
194 | +  return PREPARE_UNRECOGNIZED_STATEMENT;
195 | +}
196 | +
197 | +void execute_statement(Statement* statement) {
198 | +  switch (statement->type) {
199 | +    case (STATEMENT_INSERT):
200 | +      printf("This is where we would do an insert.\n");
201 | +      break;
202 | +    case (STATEMENT_SELECT):
203 | +      printf("This is where we would do a select.\n");
204 | +      break;
205 | +  }
206 | +}
207 | +
208 |  int main(int argc, char* argv[]) {
209 |    InputBuffer* input_buffer = new_input_buffer();
210 |    while (true) {
211 |      print_prompt();
212 |      read_input(input_buffer);
213 |  
214 | -    if (strcmp(input_buffer->buffer, ".exit") == 0) {
215 | -      close_input_buffer(input_buffer);
216 | -      exit(EXIT_SUCCESS);
217 | -    } else {
218 | -      printf("Unrecognized command '%s'.\n", input_buffer->buffer);
219 | +    if (input_buffer->buffer[0] == '.') {
220 | +      switch (do_meta_command(input_buffer)) {
221 | +        case (META_COMMAND_SUCCESS):
222 | +          continue;
223 | +        case (META_COMMAND_UNRECOGNIZED_COMMAND):
224 | +          printf("Unrecognized command '%s'\n", input_buffer->buffer);
225 | +          continue;
226 | +      }
227 |      }
228 | +
229 | +    Statement statement;
230 | +    switch (prepare_statement(input_buffer, &statement)) {
231 | +      case (PREPARE_SUCCESS):
232 | +        break;
233 | +      case (PREPARE_UNRECOGNIZED_STATEMENT):
234 | +        printf("Unrecognized keyword at start of '%s'.\n",
235 | +               input_buffer->buffer);
236 | +        continue;
237 | +    }
238 | +
239 | +    execute_statement(&statement);
240 | +    printf("Executed.\n");
241 |    }
242 |  }
243 | ```
244 | 


--------------------------------------------------------------------------------
/_parts/part3.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Part 3 - An In-Memory, Append-Only, Single-Table Database
  3 | date: 2017-09-01
  4 | ---
  5 | 
  6 | We're going to start small by putting a lot of limitations on our database. For now, it will:
  7 | 
  8 | - support two operations: inserting a row and printing all rows
  9 | - reside only in memory (no persistence to disk)
 10 | - support a single, hard-coded table
 11 | 
 12 | Our hard-coded table is going to store users and look like this:
 13 | 
 14 | | column   | type         |
 15 | |----------|--------------|
 16 | | id       | integer      |
 17 | | username | varchar(32)  |
 18 | | email    | varchar(255) |
 19 | 
 20 | This is a simple schema, but it gets us to support multiple data types and multiple sizes of text data types.
 21 | 
 22 | `insert` statements are now going to look like this:
 23 | 
 24 | ```
 25 | insert 1 cstack foo@bar.com
 26 | ```
 27 | 
 28 | That means we need to upgrade our `prepare_statement` function to parse arguments
 29 | 
 30 | ```diff
 31 |    if (strncmp(input_buffer->buffer, "insert", 6) == 0) {
 32 |      statement->type = STATEMENT_INSERT;
 33 | +    int args_assigned = sscanf(
 34 | +        input_buffer->buffer, "insert %d %s %s", &(statement->row_to_insert.id),
 35 | +        statement->row_to_insert.username, statement->row_to_insert.email);
 36 | +    if (args_assigned < 3) {
 37 | +      return PREPARE_SYNTAX_ERROR;
 38 | +    }
 39 |      return PREPARE_SUCCESS;
 40 |    }
 41 |    if (strcmp(input_buffer->buffer, "select") == 0) {
 42 | ```
 43 | 
 44 | We store those parsed arguments into a new `Row` data structure inside the statement object:
 45 | 
 46 | ```diff
 47 | +#define COLUMN_USERNAME_SIZE 32
 48 | +#define COLUMN_EMAIL_SIZE 255
 49 | +typedef struct {
 50 | +  uint32_t id;
 51 | +  char username[COLUMN_USERNAME_SIZE];
 52 | +  char email[COLUMN_EMAIL_SIZE];
 53 | +} Row;
 54 | +
 55 |  typedef struct {
 56 |    StatementType type;
 57 | +  Row row_to_insert;  // only used by insert statement
 58 |  } Statement;
 59 | ```
 60 | 
 61 | Now we need to copy that data into some data structure representing the table. SQLite uses a B-tree for fast lookups, inserts and deletes. We'll start with something simpler. Like a B-tree, it will group rows into pages, but instead of arranging those pages as a tree it will arrange them as an array.
 62 | 
 63 | Here's my plan:
 64 | 
 65 | - Store rows in blocks of memory called pages
 66 | - Each page stores as many rows as it can fit
 67 | - Rows are serialized into a compact representation with each page
 68 | - Pages are only allocated as needed
 69 | - Keep a fixed-size array of pointers to pages
 70 | 
 71 | First we'll define the compact representation of a row:
 72 | ```diff
 73 | +#define size_of_attribute(Struct, Attribute) sizeof(((Struct*)0)->Attribute)
 74 | +
 75 | +const uint32_t ID_SIZE = size_of_attribute(Row, id);
 76 | +const uint32_t USERNAME_SIZE = size_of_attribute(Row, username);
 77 | +const uint32_t EMAIL_SIZE = size_of_attribute(Row, email);
 78 | +const uint32_t ID_OFFSET = 0;
 79 | +const uint32_t USERNAME_OFFSET = ID_OFFSET + ID_SIZE;
 80 | +const uint32_t EMAIL_OFFSET = USERNAME_OFFSET + USERNAME_SIZE;
 81 | +const uint32_t ROW_SIZE = ID_SIZE + USERNAME_SIZE + EMAIL_SIZE;
 82 | ```
 83 | 
 84 | This means the layout of a serialized row will look like this:
 85 | 
 86 | | column   | size (bytes) | offset       |
 87 | |----------|--------------|--------------|
 88 | | id       | 4            | 0            |
 89 | | username | 32           | 4            |
 90 | | email    | 255          | 36           |
 91 | | total    | 291          |              |
 92 | 
 93 | We also need code to convert to and from the compact representation.
 94 | ```diff
 95 | +void serialize_row(Row* source, void* destination) {
 96 | +  memcpy(destination + ID_OFFSET, &(source->id), ID_SIZE);
 97 | +  memcpy(destination + USERNAME_OFFSET, &(source->username), USERNAME_SIZE);
 98 | +  memcpy(destination + EMAIL_OFFSET, &(source->email), EMAIL_SIZE);
 99 | +}
100 | +
101 | +void deserialize_row(void* source, Row* destination) {
102 | +  memcpy(&(destination->id), source + ID_OFFSET, ID_SIZE);
103 | +  memcpy(&(destination->username), source + USERNAME_OFFSET, USERNAME_SIZE);
104 | +  memcpy(&(destination->email), source + EMAIL_OFFSET, EMAIL_SIZE);
105 | +}
106 | ```
107 | 
108 | Next, a `Table` structure that points to pages of rows and keeps track of how many rows there are:
109 | ```diff
110 | +const uint32_t PAGE_SIZE = 4096;
111 | +#define TABLE_MAX_PAGES 100
112 | +const uint32_t ROWS_PER_PAGE = PAGE_SIZE / ROW_SIZE;
113 | +const uint32_t TABLE_MAX_ROWS = ROWS_PER_PAGE * TABLE_MAX_PAGES;
114 | +
115 | +typedef struct {
116 | +  uint32_t num_rows;
117 | +  void* pages[TABLE_MAX_PAGES];
118 | +} Table;
119 | ```
120 | 
121 | I'm making our page size 4 kilobytes because it's the same size as a page used in the virtual memory systems of most computer architectures. This means one page in our database corresponds to one page used by the operating system. The operating system will move pages in and out of memory as whole units instead of breaking them up.
122 | 
123 | I'm setting an arbitrary limit of 100 pages that we will allocate. When we switch to a tree structure, our database's maximum size will only be limited by the maximum size of a file. (Although we'll still limit how many pages we keep in memory at once)
124 | 
125 | Rows should not cross page boundaries. Since pages probably won't exist next to each other in memory, this assumption makes it easier to read/write rows.
126 | 
127 | Speaking of which, here is how we figure out where to read/write in memory for a particular row:
128 | ```diff
129 | +void* row_slot(Table* table, uint32_t row_num) {
130 | +  uint32_t page_num = row_num / ROWS_PER_PAGE;
131 | +  void* page = table->pages[page_num];
132 | +  if (page == NULL) {
133 | +    // Allocate memory only when we try to access page
134 | +    page = table->pages[page_num] = malloc(PAGE_SIZE);
135 | +  }
136 | +  uint32_t row_offset = row_num % ROWS_PER_PAGE;
137 | +  uint32_t byte_offset = row_offset * ROW_SIZE;
138 | +  return page + byte_offset;
139 | +}
140 | ```
141 | 
142 | Now we can make `execute_statement` read/write from our table structure:
143 | ```diff
144 | -void execute_statement(Statement* statement) {
145 | +ExecuteResult execute_insert(Statement* statement, Table* table) {
146 | +  if (table->num_rows >= TABLE_MAX_ROWS) {
147 | +    return EXECUTE_TABLE_FULL;
148 | +  }
149 | +
150 | +  Row* row_to_insert = &(statement->row_to_insert);
151 | +
152 | +  serialize_row(row_to_insert, row_slot(table, table->num_rows));
153 | +  table->num_rows += 1;
154 | +
155 | +  return EXECUTE_SUCCESS;
156 | +}
157 | +
158 | +ExecuteResult execute_select(Statement* statement, Table* table) {
159 | +  Row row;
160 | +  for (uint32_t i = 0; i < table->num_rows; i++) {
161 | +    deserialize_row(row_slot(table, i), &row);
162 | +    print_row(&row);
163 | +  }
164 | +  return EXECUTE_SUCCESS;
165 | +}
166 | +
167 | +ExecuteResult execute_statement(Statement* statement, Table* table) {
168 |    switch (statement->type) {
169 |      case (STATEMENT_INSERT):
170 | -      printf("This is where we would do an insert.\n");
171 | -      break;
172 | +      return execute_insert(statement, table);
173 |      case (STATEMENT_SELECT):
174 | -      printf("This is where we would do a select.\n");
175 | -      break;
176 | +      return execute_select(statement, table);
177 |    }
178 |  }
179 | ```
180 | 
181 | Lastly, we need to initialize the table, create the respective
182 | memory release function and handle a few more error cases:
183 | 
184 | ```diff
185 | + Table* new_table() {
186 | +  Table* table = (Table*)malloc(sizeof(Table));
187 | +  table->num_rows = 0;
188 | +  for (uint32_t i = 0; i < TABLE_MAX_PAGES; i++) {
189 | +     table->pages[i] = NULL;
190 | +  }
191 | +  return table;
192 | +}
193 | +
194 | +void free_table(Table* table) {
195 | +    for (int i = 0; table->pages[i]; i++) {
196 | +	free(table->pages[i]);
197 | +    }
198 | +    free(table);
199 | +}
200 | ```
201 | ```diff
202 |  int main(int argc, char* argv[]) {
203 | +  Table* table = new_table();
204 |    InputBuffer* input_buffer = new_input_buffer();
205 |    while (true) {
206 |      print_prompt();
207 | @@ -105,13 +203,22 @@ int main(int argc, char* argv[]) {
208 |      switch (prepare_statement(input_buffer, &statement)) {
209 |        case (PREPARE_SUCCESS):
210 |          break;
211 | +      case (PREPARE_SYNTAX_ERROR):
212 | +        printf("Syntax error. Could not parse statement.\n");
213 | +        continue;
214 |        case (PREPARE_UNRECOGNIZED_STATEMENT):
215 |          printf("Unrecognized keyword at start of '%s'.\n",
216 |                 input_buffer->buffer);
217 |          continue;
218 |      }
219 | 
220 | -    execute_statement(&statement);
221 | -    printf("Executed.\n");
222 | +    switch (execute_statement(&statement, table)) {
223 | +      case (EXECUTE_SUCCESS):
224 | +        printf("Executed.\n");
225 | +        break;
226 | +      case (EXECUTE_TABLE_FULL):
227 | +        printf("Error: Table full.\n");
228 | +        break;
229 | +    }
230 |    }
231 |  }
232 |  ```
233 | 
234 |  With those changes we can actually save data in our database!
235 |  ```command-line
236 | ~ ./db
237 | db > insert 1 cstack foo@bar.com
238 | Executed.
239 | db > insert 2 bob bob@example.com
240 | Executed.
241 | db > select
242 | (1, cstack, foo@bar.com)
243 | (2, bob, bob@example.com)
244 | Executed.
245 | db > insert foo bar 1
246 | Syntax error. Could not parse statement.
247 | db > .exit
248 | ~
249 | ```
250 | 
251 | Now would be a great time to write some tests, for a couple reasons:
252 | - We're planning to dramatically change the data structure storing our table, and tests would catch regressions.
253 | - There are a couple edge cases we haven't tested manually (e.g. filling up the table)
254 | 
255 | We'll address those issues in the next part. For now, here's the complete diff from this part:
256 | ```diff
257 | @@ -2,6 +2,7 @@
258 |  #include <stdio.h>
259 |  #include <stdlib.h>
260 |  #include <string.h>
261 | +#include <stdint.h>
262 | 
263 |  typedef struct {
264 |    char* buffer;
265 | @@ -10,6 +11,105 @@ typedef struct {
266 |  } InputBuffer;
267 | 
268 | +typedef enum { EXECUTE_SUCCESS, EXECUTE_TABLE_FULL } ExecuteResult;
269 | +
270 | +typedef enum {
271 | +  META_COMMAND_SUCCESS,
272 | +  META_COMMAND_UNRECOGNIZED_COMMAND
273 | +} MetaCommandResult;
274 | +
275 | +typedef enum {
276 | +  PREPARE_SUCCESS,
277 | +  PREPARE_SYNTAX_ERROR,
278 | +  PREPARE_UNRECOGNIZED_STATEMENT
279 | + } PrepareResult;
280 | +
281 | +typedef enum { STATEMENT_INSERT, STATEMENT_SELECT } StatementType;
282 | +
283 | +#define COLUMN_USERNAME_SIZE 32
284 | +#define COLUMN_EMAIL_SIZE 255
285 | +typedef struct {
286 | +  uint32_t id;
287 | +  char username[COLUMN_USERNAME_SIZE];
288 | +  char email[COLUMN_EMAIL_SIZE];
289 | +} Row;
290 | +
291 | +typedef struct {
292 | +  StatementType type;
293 | +  Row row_to_insert; //only used by insert statement
294 | +} Statement;
295 | +
296 | +#define size_of_attribute(Struct, Attribute) sizeof(((Struct*)0)->Attribute)
297 | +
298 | +const uint32_t ID_SIZE = size_of_attribute(Row, id);
299 | +const uint32_t USERNAME_SIZE = size_of_attribute(Row, username);
300 | +const uint32_t EMAIL_SIZE = size_of_attribute(Row, email);
301 | +const uint32_t ID_OFFSET = 0;
302 | +const uint32_t USERNAME_OFFSET = ID_OFFSET + ID_SIZE;
303 | +const uint32_t EMAIL_OFFSET = USERNAME_OFFSET + USERNAME_SIZE;
304 | +const uint32_t ROW_SIZE = ID_SIZE + USERNAME_SIZE + EMAIL_SIZE;
305 | +
306 | +const uint32_t PAGE_SIZE = 4096;
307 | +#define TABLE_MAX_PAGES 100
308 | +const uint32_t ROWS_PER_PAGE = PAGE_SIZE / ROW_SIZE;
309 | +const uint32_t TABLE_MAX_ROWS = ROWS_PER_PAGE * TABLE_MAX_PAGES;
310 | +
311 | +typedef struct {
312 | +  uint32_t num_rows;
313 | +  void* pages[TABLE_MAX_PAGES];
314 | +} Table;
315 | +
316 | +void print_row(Row* row) {
317 | +  printf("(%d, %s, %s)\n", row->id, row->username, row->email);
318 | +}
319 | +
320 | +void serialize_row(Row* source, void* destination) {
321 | +  memcpy(destination + ID_OFFSET, &(source->id), ID_SIZE);
322 | +  memcpy(destination + USERNAME_OFFSET, &(source->username), USERNAME_SIZE);
323 | +  memcpy(destination + EMAIL_OFFSET, &(source->email), EMAIL_SIZE);
324 | +}
325 | +
326 | +void deserialize_row(void *source, Row* destination) {
327 | +  memcpy(&(destination->id), source + ID_OFFSET, ID_SIZE);
328 | +  memcpy(&(destination->username), source + USERNAME_OFFSET, USERNAME_SIZE);
329 | +  memcpy(&(destination->email), source + EMAIL_OFFSET, EMAIL_SIZE);
330 | +}
331 | +
332 | +void* row_slot(Table* table, uint32_t row_num) {
333 | +  uint32_t page_num = row_num / ROWS_PER_PAGE;
334 | +  void *page = table->pages[page_num];
335 | +  if (page == NULL) {
336 | +     // Allocate memory only when we try to access page
337 | +     page = table->pages[page_num] = malloc(PAGE_SIZE);
338 | +  }
339 | +  uint32_t row_offset = row_num % ROWS_PER_PAGE;
340 | +  uint32_t byte_offset = row_offset * ROW_SIZE;
341 | +  return page + byte_offset;
342 | +}
343 | +
344 | +Table* new_table() {
345 | +  Table* table = (Table*)malloc(sizeof(Table));
346 | +  table->num_rows = 0;
347 | +  for (uint32_t i = 0; i < TABLE_MAX_PAGES; i++) {
348 | +     table->pages[i] = NULL;
349 | +  }
350 | +  return table;
351 | +}
352 | +
353 | +void free_table(Table* table) {
354 | +  for (int i = 0; table->pages[i]; i++) {
355 | +     free(table->pages[i]);
356 | +  }
357 | +  free(table);
358 | +}
359 | +
360 |  InputBuffer* new_input_buffer() {
361 |    InputBuffer* input_buffer = (InputBuffer*)malloc(sizeof(InputBuffer));
362 |    input_buffer->buffer = NULL;
363 | @@ -40,17 +140,105 @@ void close_input_buffer(InputBuffer* input_buffer) {
364 |      free(input_buffer);
365 |  }
366 | 
367 | +MetaCommandResult do_meta_command(InputBuffer* input_buffer, Table *table) {
368 | +  if (strcmp(input_buffer->buffer, ".exit") == 0) {
369 | +    close_input_buffer(input_buffer);
370 | +    free_table(table);
371 | +    exit(EXIT_SUCCESS);
372 | +  } else {
373 | +    return META_COMMAND_UNRECOGNIZED_COMMAND;
374 | +  }
375 | +}
376 | +
377 | +PrepareResult prepare_statement(InputBuffer* input_buffer,
378 | +                                Statement* statement) {
379 | +  if (strncmp(input_buffer->buffer, "insert", 6) == 0) {
380 | +    statement->type = STATEMENT_INSERT;
381 | +    int args_assigned = sscanf(
382 | +	input_buffer->buffer, "insert %d %s %s", &(statement->row_to_insert.id),
383 | +	statement->row_to_insert.username, statement->row_to_insert.email
384 | +	);
385 | +    if (args_assigned < 3) {
386 | +	return PREPARE_SYNTAX_ERROR;
387 | +    }
388 | +    return PREPARE_SUCCESS;
389 | +  }
390 | +  if (strcmp(input_buffer->buffer, "select") == 0) {
391 | +    statement->type = STATEMENT_SELECT;
392 | +    return PREPARE_SUCCESS;
393 | +  }
394 | +
395 | +  return PREPARE_UNRECOGNIZED_STATEMENT;
396 | +}
397 | +
398 | +ExecuteResult execute_insert(Statement* statement, Table* table) {
399 | +  if (table->num_rows >= TABLE_MAX_ROWS) {
400 | +     return EXECUTE_TABLE_FULL;
401 | +  }
402 | +
403 | +  Row* row_to_insert = &(statement->row_to_insert);
404 | +
405 | +  serialize_row(row_to_insert, row_slot(table, table->num_rows));
406 | +  table->num_rows += 1;
407 | +
408 | +  return EXECUTE_SUCCESS;
409 | +}
410 | +
411 | +ExecuteResult execute_select(Statement* statement, Table* table) {
412 | +  Row row;
413 | +  for (uint32_t i = 0; i < table->num_rows; i++) {
414 | +     deserialize_row(row_slot(table, i), &row);
415 | +     print_row(&row);
416 | +  }
417 | +  return EXECUTE_SUCCESS;
418 | +}
419 | +
420 | +ExecuteResult execute_statement(Statement* statement, Table *table) {
421 | +  switch (statement->type) {
422 | +    case (STATEMENT_INSERT):
423 | +       	return execute_insert(statement, table);
424 | +    case (STATEMENT_SELECT):
425 | +	return execute_select(statement, table);
426 | +  }
427 | +}
428 | +
429 |  int main(int argc, char* argv[]) {
430 | +  Table* table = new_table();
431 |    InputBuffer* input_buffer = new_input_buffer();
432 |    while (true) {
433 |      print_prompt();
434 |      read_input(input_buffer);
435 | 
436 | -    if (strcmp(input_buffer->buffer, ".exit") == 0) {
437 | -      close_input_buffer(input_buffer);
438 | -      exit(EXIT_SUCCESS);
439 | -    } else {
440 | -      printf("Unrecognized command '%s'.\n", input_buffer->buffer);
441 | +    if (input_buffer->buffer[0] == '.') {
442 | +      switch (do_meta_command(input_buffer, table)) {
443 | +        case (META_COMMAND_SUCCESS):
444 | +          continue;
445 | +        case (META_COMMAND_UNRECOGNIZED_COMMAND):
446 | +          printf("Unrecognized command '%s'\n", input_buffer->buffer);
447 | +          continue;
448 | +      }
449 | +    }
450 | +
451 | +    Statement statement;
452 | +    switch (prepare_statement(input_buffer, &statement)) {
453 | +      case (PREPARE_SUCCESS):
454 | +        break;
455 | +      case (PREPARE_SYNTAX_ERROR):
456 | +	printf("Syntax error. Could not parse statement.\n");
457 | +	continue;
458 | +      case (PREPARE_UNRECOGNIZED_STATEMENT):
459 | +        printf("Unrecognized keyword at start of '%s'.\n",
460 | +               input_buffer->buffer);
461 | +        continue;
462 | +    }
463 | +
464 | +    switch (execute_statement(&statement, table)) {
465 | +	case (EXECUTE_SUCCESS):
466 | +	    printf("Executed.\n");
467 | +	    break;
468 | +	case (EXECUTE_TABLE_FULL):
469 | +	    printf("Error: Table full.\n");
470 | +	    break;
471 |      }
472 |    }
473 |  }
474 | ```
475 | 


--------------------------------------------------------------------------------
/_parts/part4.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Part 4 - Our First Tests (and Bugs)
  3 | date: 2017-09-03
  4 | ---
  5 | 
  6 | We've got the ability to insert rows into our database and to print out all rows. Let's take a moment to test what we've got so far.
  7 | 
  8 | I'm going to use [rspec](http://rspec.info/) to write my tests because I'm familiar with it, and the syntax is fairly readable.
  9 | 
 10 | I'll define a short helper to send a list of commands to our database program then make assertions about the output:
 11 | 
 12 | ```ruby
 13 | describe 'database' do
 14 |   def run_script(commands)
 15 |     raw_output = nil
 16 |     IO.popen("./db", "r+") do |pipe|
 17 |       commands.each do |command|
 18 |         pipe.puts command
 19 |       end
 20 | 
 21 |       pipe.close_write
 22 | 
 23 |       # Read entire output
 24 |       raw_output = pipe.gets(nil)
 25 |     end
 26 |     raw_output.split("\n")
 27 |   end
 28 | 
 29 |   it 'inserts and retrieves a row' do
 30 |     result = run_script([
 31 |       "insert 1 user1 person1@example.com",
 32 |       "select",
 33 |       ".exit",
 34 |     ])
 35 |     expect(result).to match_array([
 36 |       "db > Executed.",
 37 |       "db > (1, user1, person1@example.com)",
 38 |       "Executed.",
 39 |       "db > ",
 40 |     ])
 41 |   end
 42 | end
 43 | ```
 44 | 
 45 | This simple test makes sure we get back what we put in. And indeed it passes:
 46 | ```command-line
 47 | bundle exec rspec
 48 | .
 49 | 
 50 | Finished in 0.00871 seconds (files took 0.09506 seconds to load)
 51 | 1 example, 0 failures
 52 | ```
 53 | 
 54 | Now it's feasible to test inserting a large number of rows into the database:
 55 | ```ruby
 56 | it 'prints error message when table is full' do
 57 |   script = (1..1401).map do |i|
 58 |     "insert #{i} user#{i} person#{i}@example.com"
 59 |   end
 60 |   script << ".exit"
 61 |   result = run_script(script)
 62 |   expect(result[-2]).to eq('db > Error: Table full.')
 63 | end
 64 | ```
 65 | 
 66 | Running tests again...
 67 | ```command-line
 68 | bundle exec rspec
 69 | ..
 70 | 
 71 | Finished in 0.01553 seconds (files took 0.08156 seconds to load)
 72 | 2 examples, 0 failures
 73 | ```
 74 | 
 75 | Sweet, it works! Our db can hold 1400 rows right now because we set the maximum number of pages to 100, and 14 rows can fit in a page.
 76 | 
 77 | Reading through the code we have so far, I realized we might not handle storing text fields correctly. Easy to test with this example:
 78 | ```ruby
 79 | it 'allows inserting strings that are the maximum length' do
 80 |   long_username = "a"*32
 81 |   long_email = "a"*255
 82 |   script = [
 83 |     "insert 1 #{long_username} #{long_email}",
 84 |     "select",
 85 |     ".exit",
 86 |   ]
 87 |   result = run_script(script)
 88 |   expect(result).to match_array([
 89 |     "db > Executed.",
 90 |     "db > (1, #{long_username}, #{long_email})",
 91 |     "Executed.",
 92 |     "db > ",
 93 |   ])
 94 | end
 95 | ```
 96 | 
 97 | And the test fails!
 98 | ```ruby
 99 | Failures:
100 | 
101 |   1) database allows inserting strings that are the maximum length
102 |      Failure/Error: raw_output.split("\n")
103 | 
104 |      ArgumentError:
105 |        invalid byte sequence in UTF-8
106 |      # ./spec/main_spec.rb:14:in `split'
107 |      # ./spec/main_spec.rb:14:in `run_script'
108 |      # ./spec/main_spec.rb:48:in `block (2 levels) in <top (required)>'
109 | ```
110 | 
111 | If we try it ourselves, we'll see that there's some weird characters when we try to print out the row. (I'm abbreviating the long strings):
112 | ```command-line
113 | db > insert 1 aaaaa... aaaaa...
114 | Executed.
115 | db > select
116 | (1, aaaaa...aaa\�, aaaaa...aaa\�)
117 | Executed.
118 | db >
119 | ```
120 | 
121 | What's going on? If you take a look at our definition of a Row, we allocate exactly 32 bytes for username and exactly 255 bytes for email. But [C strings](http://www.cprogramming.com/tutorial/c/lesson9.html) are supposed to end with a null character, which we didn't allocate space for. The solution is to allocate one additional byte:
122 | ```diff
123 |  const uint32_t COLUMN_EMAIL_SIZE = 255;
124 |  typedef struct {
125 |    uint32_t id;
126 | -  char username[COLUMN_USERNAME_SIZE];
127 | -  char email[COLUMN_EMAIL_SIZE];
128 | +  char username[COLUMN_USERNAME_SIZE + 1];
129 | +  char email[COLUMN_EMAIL_SIZE + 1];
130 |  } Row;
131 |  ```
132 | 
133 |  And indeed that fixes it:
134 |  ```ruby
135 |  bundle exec rspec
136 | ...
137 | 
138 | Finished in 0.0188 seconds (files took 0.08516 seconds to load)
139 | 3 examples, 0 failures
140 | ```
141 | 
142 | We should not allow inserting usernames or emails that are longer than column size. The spec for that looks like this:
143 | ```ruby
144 | it 'prints error message if strings are too long' do
145 |   long_username = "a"*33
146 |   long_email = "a"*256
147 |   script = [
148 |     "insert 1 #{long_username} #{long_email}",
149 |     "select",
150 |     ".exit",
151 |   ]
152 |   result = run_script(script)
153 |   expect(result).to match_array([
154 |     "db > String is too long.",
155 |     "db > Executed.",
156 |     "db > ",
157 |   ])
158 | end
159 | ```
160 | 
161 | In order to do this we need to upgrade our parser. As a reminder, we're currently using [scanf()](https://linux.die.net/man/3/scanf):
162 | ```c
163 | if (strncmp(input_buffer->buffer, "insert", 6) == 0) {
164 |   statement->type = STATEMENT_INSERT;
165 |   int args_assigned = sscanf(
166 |       input_buffer->buffer, "insert %d %s %s", &(statement->row_to_insert.id),
167 |       statement->row_to_insert.username, statement->row_to_insert.email);
168 |   if (args_assigned < 3) {
169 |     return PREPARE_SYNTAX_ERROR;
170 |   }
171 |   return PREPARE_SUCCESS;
172 | }
173 | ```
174 | 
175 | But [scanf has some disadvantages](https://stackoverflow.com/questions/2430303/disadvantages-of-scanf). If the string it's reading is larger than the buffer it's reading into, it will cause a buffer overflow and start writing into unexpected places. We want to check the length of each string before we copy it into a `Row` structure. And to do that, we need to divide the input by spaces.
176 | 
177 | I'm going to use [strtok()](http://www.cplusplus.com/reference/cstring/strtok/) to do that. I think it's easiest to understand if you see it in action:
178 | 
179 | ```diff
180 | +PrepareResult prepare_insert(InputBuffer* input_buffer, Statement* statement) {
181 | +  statement->type = STATEMENT_INSERT;
182 | +
183 | +  char* keyword = strtok(input_buffer->buffer, " ");
184 | +  char* id_string = strtok(NULL, " ");
185 | +  char* username = strtok(NULL, " ");
186 | +  char* email = strtok(NULL, " ");
187 | +
188 | +  if (id_string == NULL || username == NULL || email == NULL) {
189 | +    return PREPARE_SYNTAX_ERROR;
190 | +  }
191 | +
192 | +  int id = atoi(id_string);
193 | +  if (strlen(username) > COLUMN_USERNAME_SIZE) {
194 | +    return PREPARE_STRING_TOO_LONG;
195 | +  }
196 | +  if (strlen(email) > COLUMN_EMAIL_SIZE) {
197 | +    return PREPARE_STRING_TOO_LONG;
198 | +  }
199 | +
200 | +  statement->row_to_insert.id = id;
201 | +  strcpy(statement->row_to_insert.username, username);
202 | +  strcpy(statement->row_to_insert.email, email);
203 | +
204 | +  return PREPARE_SUCCESS;
205 | +}
206 | +
207 |  PrepareResult prepare_statement(InputBuffer* input_buffer,
208 |                                  Statement* statement) {
209 |    if (strncmp(input_buffer->buffer, "insert", 6) == 0) {
210 | +    return prepare_insert(input_buffer, statement);
211 | -    statement->type = STATEMENT_INSERT;
212 | -    int args_assigned = sscanf(
213 | -        input_buffer->buffer, "insert %d %s %s", &(statement->row_to_insert.id),
214 | -        statement->row_to_insert.username, statement->row_to_insert.email);
215 | -    if (args_assigned < 3) {
216 | -      return PREPARE_SYNTAX_ERROR;
217 | -    }
218 | -    return PREPARE_SUCCESS;
219 |    }
220 | ```
221 | 
222 | Calling `strtok` successively on the input buffer breaks it into substrings by inserting a null character whenever it reaches a delimiter (space, in our case). It returns a pointer to the start of the substring.
223 | 
224 | We can call [strlen()](http://www.cplusplus.com/reference/cstring/strlen/) on each text value to see if it's too long.
225 | 
226 | We can handle the error like we do any other error code:
227 | ```diff
228 |  enum PrepareResult_t {
229 |    PREPARE_SUCCESS,
230 | +  PREPARE_STRING_TOO_LONG,
231 |    PREPARE_SYNTAX_ERROR,
232 |    PREPARE_UNRECOGNIZED_STATEMENT
233 |  };
234 | ```
235 | ```diff
236 |  switch (prepare_statement(input_buffer, &statement)) {
237 |    case (PREPARE_SUCCESS):
238 |      break;
239 | +  case (PREPARE_STRING_TOO_LONG):
240 | +    printf("String is too long.\n");
241 | +    continue;
242 |    case (PREPARE_SYNTAX_ERROR):
243 |      printf("Syntax error. Could not parse statement.\n");
244 |      continue;
245 | ```
246 | 
247 | Which makes our test pass
248 | ```command-line
249 | bundle exec rspec
250 | ....
251 | 
252 | Finished in 0.02284 seconds (files took 0.116 seconds to load)
253 | 4 examples, 0 failures
254 | ```
255 | 
256 | While we're here, we might as well handle one more error case:
257 | ```ruby
258 | it 'prints an error message if id is negative' do
259 |   script = [
260 |     "insert -1 cstack foo@bar.com",
261 |     "select",
262 |     ".exit",
263 |   ]
264 |   result = run_script(script)
265 |   expect(result).to match_array([
266 |     "db > ID must be positive.",
267 |     "db > Executed.",
268 |     "db > ",
269 |   ])
270 | end
271 | ```
272 | ```diff
273 |  enum PrepareResult_t {
274 |    PREPARE_SUCCESS,
275 | +  PREPARE_NEGATIVE_ID,
276 |    PREPARE_STRING_TOO_LONG,
277 |    PREPARE_SYNTAX_ERROR,
278 |    PREPARE_UNRECOGNIZED_STATEMENT
279 | @@ -148,9 +147,6 @@ PrepareResult prepare_insert(InputBuffer* input_buffer, Statement* statement) {
280 |    }
281 | 
282 |    int id = atoi(id_string);
283 | +  if (id < 0) {
284 | +    return PREPARE_NEGATIVE_ID;
285 | +  }
286 |    if (strlen(username) > COLUMN_USERNAME_SIZE) {
287 |      return PREPARE_STRING_TOO_LONG;
288 |    }
289 | @@ -230,9 +226,6 @@ int main(int argc, char* argv[]) {
290 |      switch (prepare_statement(input_buffer, &statement)) {
291 |        case (PREPARE_SUCCESS):
292 |          break;
293 | +      case (PREPARE_NEGATIVE_ID):
294 | +        printf("ID must be positive.\n");
295 | +        continue;
296 |        case (PREPARE_STRING_TOO_LONG):
297 |          printf("String is too long.\n");
298 |          continue;
299 | ```
300 | 
301 | Alright, that's enough testing for now. Next is a very important feature: persistence! We're going to save our database to a file and read it back out again.
302 | 
303 | It's gonna be great.
304 | 
305 | Here's the complete diff for this part:
306 | ```diff
307 | @@ -22,6 +22,8 @@
308 | 
309 |  enum PrepareResult_t {
310 |    PREPARE_SUCCESS,
311 | +  PREPARE_NEGATIVE_ID,
312 | +  PREPARE_STRING_TOO_LONG,
313 |    PREPARE_SYNTAX_ERROR,
314 |    PREPARE_UNRECOGNIZED_STATEMENT
315 |   };
316 | @@ -34,8 +36,8 @@
317 |  #define COLUMN_EMAIL_SIZE 255
318 |  typedef struct {
319 |    uint32_t id;
320 | -  char username[COLUMN_USERNAME_SIZE];
321 | -  char email[COLUMN_EMAIL_SIZE];
322 | +  char username[COLUMN_USERNAME_SIZE + 1];
323 | +  char email[COLUMN_EMAIL_SIZE + 1];
324 |  } Row;
325 | 
326 | @@ -150,18 +152,40 @@ MetaCommandResult do_meta_command(InputBuffer* input_buffer, Table *table) {
327 |    }
328 |  }
329 | 
330 | -PrepareResult prepare_statement(InputBuffer* input_buffer,
331 | -                                Statement* statement) {
332 | -  if (strncmp(input_buffer->buffer, "insert", 6) == 0) {
333 | +PrepareResult prepare_insert(InputBuffer* input_buffer, Statement* statement) {
334 |    statement->type = STATEMENT_INSERT;
335 | -  int args_assigned = sscanf(
336 | -     input_buffer->buffer, "insert %d %s %s", &(statement->row_to_insert.id),
337 | -     statement->row_to_insert.username, statement->row_to_insert.email
338 | -     );
339 | -  if (args_assigned < 3) {
340 | +
341 | +  char* keyword = strtok(input_buffer->buffer, " ");
342 | +  char* id_string = strtok(NULL, " ");
343 | +  char* username = strtok(NULL, " ");
344 | +  char* email = strtok(NULL, " ");
345 | +
346 | +  if (id_string == NULL || username == NULL || email == NULL) {
347 |       return PREPARE_SYNTAX_ERROR;
348 |    }
349 | +
350 | +  int id = atoi(id_string);
351 | +  if (id < 0) {
352 | +     return PREPARE_NEGATIVE_ID;
353 | +  }
354 | +  if (strlen(username) > COLUMN_USERNAME_SIZE) {
355 | +     return PREPARE_STRING_TOO_LONG;
356 | +  }
357 | +  if (strlen(email) > COLUMN_EMAIL_SIZE) {
358 | +     return PREPARE_STRING_TOO_LONG;
359 | +  }
360 | +
361 | +  statement->row_to_insert.id = id;
362 | +  strcpy(statement->row_to_insert.username, username);
363 | +  strcpy(statement->row_to_insert.email, email);
364 | +
365 |    return PREPARE_SUCCESS;
366 | +
367 | +}
368 | +PrepareResult prepare_statement(InputBuffer* input_buffer,
369 | +                                Statement* statement) {
370 | +  if (strncmp(input_buffer->buffer, "insert", 6) == 0) {
371 | +      return prepare_insert(input_buffer, statement);
372 |    }
373 |    if (strcmp(input_buffer->buffer, "select") == 0) {
374 |      statement->type = STATEMENT_SELECT;
375 | @@ -223,6 +247,12 @@ int main(int argc, char* argv[]) {
376 |      switch (prepare_statement(input_buffer, &statement)) {
377 |        case (PREPARE_SUCCESS):
378 |          break;
379 | +      case (PREPARE_NEGATIVE_ID):
380 | +	printf("ID must be positive.\n");
381 | +	continue;
382 | +      case (PREPARE_STRING_TOO_LONG):
383 | +	printf("String is too long.\n");
384 | +	continue;
385 |        case (PREPARE_SYNTAX_ERROR):
386 |  	printf("Syntax error. Could not parse statement.\n");
387 |  	continue;
388 | ```
389 | And we added tests:
390 | ```diff
391 | +describe 'database' do
392 | +  def run_script(commands)
393 | +    raw_output = nil
394 | +    IO.popen("./db", "r+") do |pipe|
395 | +      commands.each do |command|
396 | +        pipe.puts command
397 | +      end
398 | +
399 | +      pipe.close_write
400 | +
401 | +      # Read entire output
402 | +      raw_output = pipe.gets(nil)
403 | +    end
404 | +    raw_output.split("\n")
405 | +  end
406 | +
407 | +  it 'inserts and retrieves a row' do
408 | +    result = run_script([
409 | +      "insert 1 user1 person1@example.com",
410 | +      "select",
411 | +      ".exit",
412 | +    ])
413 | +    expect(result).to match_array([
414 | +      "db > Executed.",
415 | +      "db > (1, user1, person1@example.com)",
416 | +      "Executed.",
417 | +      "db > ",
418 | +    ])
419 | +  end
420 | +
421 | +  it 'prints error message when table is full' do
422 | +    script = (1..1401).map do |i|
423 | +      "insert #{i} user#{i} person#{i}@example.com"
424 | +    end
425 | +    script << ".exit"
426 | +    result = run_script(script)
427 | +    expect(result[-2]).to eq('db > Error: Table full.')
428 | +  end
429 | +
430 | +  it 'allows inserting strings that are the maximum length' do
431 | +    long_username = "a"*32
432 | +    long_email = "a"*255
433 | +    script = [
434 | +      "insert 1 #{long_username} #{long_email}",
435 | +      "select",
436 | +      ".exit",
437 | +    ]
438 | +    result = run_script(script)
439 | +    expect(result).to match_array([
440 | +      "db > Executed.",
441 | +      "db > (1, #{long_username}, #{long_email})",
442 | +      "Executed.",
443 | +      "db > ",
444 | +    ])
445 | +  end
446 | +
447 | +  it 'prints error message if strings are too long' do
448 | +    long_username = "a"*33
449 | +    long_email = "a"*256
450 | +    script = [
451 | +      "insert 1 #{long_username} #{long_email}",
452 | +      "select",
453 | +      ".exit",
454 | +    ]
455 | +    result = run_script(script)
456 | +    expect(result).to match_array([
457 | +      "db > String is too long.",
458 | +      "db > Executed.",
459 | +      "db > ",
460 | +    ])
461 | +  end
462 | +
463 | +  it 'prints an error message if id is negative' do
464 | +    script = [
465 | +      "insert -1 cstack foo@bar.com",
466 | +      "select",
467 | +      ".exit",
468 | +    ]
469 | +    result = run_script(script)
470 | +    expect(result).to match_array([
471 | +      "db > ID must be positive.",
472 | +      "db > Executed.",
473 | +      "db > ",
474 | +    ])
475 | +  end
476 | +end
477 | ```
478 | 


--------------------------------------------------------------------------------
/_parts/part5.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Part 5 - Persistence to Disk
  3 | date: 2017-09-08
  4 | ---
  5 | 
  6 | > "Nothing in the world can take the place of persistence." -- [Calvin Coolidge](https://en.wikiquote.org/wiki/Calvin_Coolidge)
  7 | 
  8 | Our database lets you insert records and read them back out, but only as long as you keep the program running. If you kill the program and start it back up, all your records are gone. Here's a spec for the behavior we want:
  9 | 
 10 | ```ruby
 11 | it 'keeps data after closing connection' do
 12 |   result1 = run_script([
 13 |     "insert 1 user1 person1@example.com",
 14 |     ".exit",
 15 |   ])
 16 |   expect(result1).to match_array([
 17 |     "db > Executed.",
 18 |     "db > ",
 19 |   ])
 20 |   result2 = run_script([
 21 |     "select",
 22 |     ".exit",
 23 |   ])
 24 |   expect(result2).to match_array([
 25 |     "db > (1, user1, person1@example.com)",
 26 |     "Executed.",
 27 |     "db > ",
 28 |   ])
 29 | end
 30 | ```
 31 | 
 32 | Like sqlite, we're going to persist records by saving the entire database to a file.
 33 | 
 34 | We already set ourselves up to do that by serializing rows into page-sized memory blocks. To add persistence, we can simply write those blocks of memory to a file, and read them back into memory the next time the program starts up.
 35 | 
 36 | To make this easier, we're going to make an abstraction called the pager. We ask the pager for page number `x`, and the pager gives us back a block of memory. It first looks in its cache. On a cache miss, it copies data from disk into memory (by reading the database file).
 37 | 
 38 | {% include image.html url="assets/images/arch-part5.gif" description="How our program matches up with SQLite architecture" %}
 39 | 
 40 | The Pager accesses the page cache and the file. The Table object makes requests for pages through the pager:
 41 | 
 42 | ```diff
 43 | +typedef struct {
 44 | +  int file_descriptor;
 45 | +  uint32_t file_length;
 46 | +  void* pages[TABLE_MAX_PAGES];
 47 | +} Pager;
 48 | +
 49 |  typedef struct {
 50 | -  void* pages[TABLE_MAX_PAGES];
 51 | +  Pager* pager;
 52 |    uint32_t num_rows;
 53 |  } Table;
 54 | ```
 55 | 
 56 | I'm renaming `new_table()` to `db_open()` because it now has the effect of opening a connection to the database. By opening a connection, I mean:
 57 | 
 58 | - opening the database file
 59 | - initializing a pager data structure
 60 | - initializing a table data structure
 61 | 
 62 | ```diff
 63 | -Table* new_table() {
 64 | +Table* db_open(const char* filename) {
 65 | +  Pager* pager = pager_open(filename);
 66 | +  uint32_t num_rows = pager->file_length / ROW_SIZE;
 67 | +
 68 |    Table* table = malloc(sizeof(Table));
 69 | -  table->num_rows = 0;
 70 | +  table->pager = pager;
 71 | +  table->num_rows = num_rows;
 72 | 
 73 |    return table;
 74 |  }
 75 | ```
 76 | 
 77 | `db_open()` in turn calls `pager_open()`, which opens the database file and keeps track of its size. It also initializes the page cache to all `NULL`s.
 78 | 
 79 | ```diff
 80 | +Pager* pager_open(const char* filename) {
 81 | +  int fd = open(filename,
 82 | +                O_RDWR |      // Read/Write mode
 83 | +                    O_CREAT,  // Create file if it does not exist
 84 | +                S_IWUSR |     // User write permission
 85 | +                    S_IRUSR   // User read permission
 86 | +                );
 87 | +
 88 | +  if (fd == -1) {
 89 | +    printf("Unable to open file\n");
 90 | +    exit(EXIT_FAILURE);
 91 | +  }
 92 | +
 93 | +  off_t file_length = lseek(fd, 0, SEEK_END);
 94 | +
 95 | +  Pager* pager = malloc(sizeof(Pager));
 96 | +  pager->file_descriptor = fd;
 97 | +  pager->file_length = file_length;
 98 | +
 99 | +  for (uint32_t i = 0; i < TABLE_MAX_PAGES; i++) {
100 | +    pager->pages[i] = NULL;
101 | +  }
102 | +
103 | +  return pager;
104 | +}
105 | ```
106 | 
107 | Following our new abstraction, we move the logic for fetching a page into its own method:
108 | 
109 | ```diff
110 |  void* row_slot(Table* table, uint32_t row_num) {
111 |    uint32_t page_num = row_num / ROWS_PER_PAGE;
112 | -  void* page = table->pages[page_num];
113 | -  if (page == NULL) {
114 | -    // Allocate memory only when we try to access page
115 | -    page = table->pages[page_num] = malloc(PAGE_SIZE);
116 | -  }
117 | +  void* page = get_page(table->pager, page_num);
118 |    uint32_t row_offset = row_num % ROWS_PER_PAGE;
119 |    uint32_t byte_offset = row_offset * ROW_SIZE;
120 |    return page + byte_offset;
121 |  }
122 | ```
123 | 
124 | The `get_page()` method has the logic for handling a cache miss. We assume pages are saved one after the other in the database file: Page 0 at offset 0, page 1 at offset 4096, page 2 at offset 8192, etc. If the requested page lies outside the bounds of the file, we know it should be blank, so we just allocate some memory and return it. The page will be added to the file when we flush the cache to disk later.
125 | 
126 | 
127 | ```diff
128 | +void* get_page(Pager* pager, uint32_t page_num) {
129 | +  if (page_num > TABLE_MAX_PAGES) {
130 | +    printf("Tried to fetch page number out of bounds. %d > %d\n", page_num,
131 | +           TABLE_MAX_PAGES);
132 | +    exit(EXIT_FAILURE);
133 | +  }
134 | +
135 | +  if (pager->pages[page_num] == NULL) {
136 | +    // Cache miss. Allocate memory and load from file.
137 | +    void* page = malloc(PAGE_SIZE);
138 | +    uint32_t num_pages = pager->file_length / PAGE_SIZE;
139 | +
140 | +    // We might save a partial page at the end of the file
141 | +    if (pager->file_length % PAGE_SIZE) {
142 | +      num_pages += 1;
143 | +    }
144 | +
145 | +    if (page_num <= num_pages) {
146 | +      lseek(pager->file_descriptor, page_num * PAGE_SIZE, SEEK_SET);
147 | +      ssize_t bytes_read = read(pager->file_descriptor, page, PAGE_SIZE);
148 | +      if (bytes_read == -1) {
149 | +        printf("Error reading file: %d\n", errno);
150 | +        exit(EXIT_FAILURE);
151 | +      }
152 | +    }
153 | +
154 | +    pager->pages[page_num] = page;
155 | +  }
156 | +
157 | +  return pager->pages[page_num];
158 | +}
159 | ```
160 | 
161 | For now, we'll wait to flush the cache to disk until the user closes the connection to the database. When the user exits, we'll call a new method called `db_close()`, which
162 | 
163 | - flushes the page cache to disk
164 | - closes the database file
165 | - frees the memory for the Pager and Table data structures
166 | 
167 | ```diff
168 | +void db_close(Table* table) {
169 | +  Pager* pager = table->pager;
170 | +  uint32_t num_full_pages = table->num_rows / ROWS_PER_PAGE;
171 | +
172 | +  for (uint32_t i = 0; i < num_full_pages; i++) {
173 | +    if (pager->pages[i] == NULL) {
174 | +      continue;
175 | +    }
176 | +    pager_flush(pager, i, PAGE_SIZE);
177 | +    free(pager->pages[i]);
178 | +    pager->pages[i] = NULL;
179 | +  }
180 | +
181 | +  // There may be a partial page to write to the end of the file
182 | +  // This should not be needed after we switch to a B-tree
183 | +  uint32_t num_additional_rows = table->num_rows % ROWS_PER_PAGE;
184 | +  if (num_additional_rows > 0) {
185 | +    uint32_t page_num = num_full_pages;
186 | +    if (pager->pages[page_num] != NULL) {
187 | +      pager_flush(pager, page_num, num_additional_rows * ROW_SIZE);
188 | +      free(pager->pages[page_num]);
189 | +      pager->pages[page_num] = NULL;
190 | +    }
191 | +  }
192 | +
193 | +  int result = close(pager->file_descriptor);
194 | +  if (result == -1) {
195 | +    printf("Error closing db file.\n");
196 | +    exit(EXIT_FAILURE);
197 | +  }
198 | +  for (uint32_t i = 0; i < TABLE_MAX_PAGES; i++) {
199 | +    void* page = pager->pages[i];
200 | +    if (page) {
201 | +      free(page);
202 | +      pager->pages[i] = NULL;
203 | +    }
204 | +  }
205 | +  free(pager);
206 | +  free(table);
207 | +}
208 | +
209 | -MetaCommandResult do_meta_command(InputBuffer* input_buffer) {
210 | +MetaCommandResult do_meta_command(InputBuffer* input_buffer, Table* table) {
211 |    if (strcmp(input_buffer->buffer, ".exit") == 0) {
212 | +    db_close(table);
213 |      exit(EXIT_SUCCESS);
214 |    } else {
215 |      return META_COMMAND_UNRECOGNIZED_COMMAND;
216 | ```
217 | 
218 | In our current design, the length of the file encodes how many rows are in the database, so we need to write a partial page at the end of the file. That's why `pager_flush()` takes both a page number and a size. It's not the greatest design, but it will go away pretty quickly when we start implementing the B-tree.
219 | 
220 | ```diff
221 | +void pager_flush(Pager* pager, uint32_t page_num, uint32_t size) {
222 | +  if (pager->pages[page_num] == NULL) {
223 | +    printf("Tried to flush null page\n");
224 | +    exit(EXIT_FAILURE);
225 | +  }
226 | +
227 | +  off_t offset = lseek(pager->file_descriptor, page_num * PAGE_SIZE, SEEK_SET);
228 | +
229 | +  if (offset == -1) {
230 | +    printf("Error seeking: %d\n", errno);
231 | +    exit(EXIT_FAILURE);
232 | +  }
233 | +
234 | +  ssize_t bytes_written =
235 | +      write(pager->file_descriptor, pager->pages[page_num], size);
236 | +
237 | +  if (bytes_written == -1) {
238 | +    printf("Error writing: %d\n", errno);
239 | +    exit(EXIT_FAILURE);
240 | +  }
241 | +}
242 | ```
243 | 
244 | Lastly, we need to accept the filename as a command-line argument. Don't forget to also add the extra argument to `do_meta_command`:
245 | 
246 | ```diff
247 |  int main(int argc, char* argv[]) {
248 | -  Table* table = new_table();
249 | +  if (argc < 2) {
250 | +    printf("Must supply a database filename.\n");
251 | +    exit(EXIT_FAILURE);
252 | +  }
253 | +
254 | +  char* filename = argv[1];
255 | +  Table* table = db_open(filename);
256 | +
257 |    InputBuffer* input_buffer = new_input_buffer();
258 |    while (true) {
259 |      print_prompt();
260 |      read_input(input_buffer);
261 | 
262 |      if (input_buffer->buffer[0] == '.') {
263 | -      switch (do_meta_command(input_buffer)) {
264 | +      switch (do_meta_command(input_buffer, table)) {
265 | ```
266 | With these changes, we're able to close then reopen the database, and our records are still there!
267 | 
268 | ```
269 | ~ ./db mydb.db
270 | db > insert 1 cstack foo@bar.com
271 | Executed.
272 | db > insert 2 voltorb volty@example.com
273 | Executed.
274 | db > .exit
275 | ~
276 | ~ ./db mydb.db
277 | db > select
278 | (1, cstack, foo@bar.com)
279 | (2, voltorb, volty@example.com)
280 | Executed.
281 | db > .exit
282 | ~
283 | ```
284 | 
285 | For extra fun, let's take a look at `mydb.db` to see how our data is being stored. I'll use vim as a hex editor to look at the memory layout of the file:
286 | 
287 | ```
288 | vim mydb.db
289 | :%!xxd
290 | ```
291 | {% include image.html url="assets/images/file-format.png" description="Current File Format" %}
292 | 
293 | The first four bytes are the id of the first row (4 bytes because we store a `uint32_t`). It's stored in little-endian byte order, so the least significant byte comes first (01), followed by the higher-order bytes (00 00 00). We used `memcpy()` to copy bytes from our `Row` struct into the page cache, so that means the struct was laid out in memory in little-endian byte order. That's an attribute of the machine I compiled the program for. If we wanted to write a database file on my machine, then read it on a big-endian machine, we'd have to change our `serialize_row()` and `deserialize_row()` methods to always store and read bytes in the same order.
294 | 
295 | The next 33 bytes store the username as a null-terminated string. Apparently "cstack" in ASCII hexadecimal is `63 73 74 61 63 6b`, followed by a null character (`00`). The rest of the 33 bytes are unused.
296 | 
297 | The next 256 bytes store the email in the same way. Here we can see some random junk after the terminating null character. This is most likely due to uninitialized memory in our `Row` struct. We copy the entire 256-byte email buffer into the file, including any bytes after the end of the string. Whatever was in memory when we allocated that struct is still there. But since we use a terminating null character, it has no effect on behavior.
298 | 
299 | **NOTE**: If we wanted to ensure that all bytes are initialized, it would
300 | suffice to use `strncpy` instead of `memcpy` while copying the `username`
301 | and `email` fields of rows in `serialize_row`, like so:
302 | 
303 | ```diff
304 |  void serialize_row(Row* source, void* destination) {
305 |      memcpy(destination + ID_OFFSET, &(source->id), ID_SIZE);
306 | -    memcpy(destination + USERNAME_OFFSET, &(source->username), USERNAME_SIZE);
307 | -    memcpy(destination + EMAIL_OFFSET, &(source->email), EMAIL_SIZE);
308 | +    strncpy(destination + USERNAME_OFFSET, source->username, USERNAME_SIZE);
309 | +    strncpy(destination + EMAIL_OFFSET, source->email, EMAIL_SIZE);
310 |  }
311 | ```
312 | 
313 | ## Conclusion
314 | 
315 | Alright! We've got persistence. It's not the greatest. For example if you kill the program without typing `.exit`, you lose your changes. Additionally, we're writing all pages back to disk, even pages that haven't changed since we read them from disk. These are issues we can address later.
316 | 
317 | Next time we'll introduce cursors, which should make it easier to implement the B-tree.
318 | 
319 | Until then!
320 | 
321 | ## Complete Diff
322 | ```diff
323 | +#include <errno.h>
324 | +#include <fcntl.h>
325 |  #include <stdbool.h>
326 |  #include <stdio.h>
327 |  #include <stdlib.h>
328 |  #include <string.h>
329 |  #include <stdint.h>
330 | +#include <unistd.h>
331 | 
332 |  struct InputBuffer_t {
333 |    char* buffer;
334 | @@ -62,9 +65,16 @@ const uint32_t PAGE_SIZE = 4096;
335 |  const uint32_t ROWS_PER_PAGE = PAGE_SIZE / ROW_SIZE;
336 |  const uint32_t TABLE_MAX_ROWS = ROWS_PER_PAGE * TABLE_MAX_PAGES;
337 | 
338 | +typedef struct {
339 | +  int file_descriptor;
340 | +  uint32_t file_length;
341 | +  void* pages[TABLE_MAX_PAGES];
342 | +} Pager;
343 | +
344 |  typedef struct {
345 |    uint32_t num_rows;
346 | -  void* pages[TABLE_MAX_PAGES];
347 | +  Pager* pager;
348 |  } Table;
349 | 
350 | @@ -84,32 +94,81 @@ void deserialize_row(void *source, Row* destination) {
351 |    memcpy(&(destination->email), source + EMAIL_OFFSET, EMAIL_SIZE);
352 |  }
353 | 
354 | +void* get_page(Pager* pager, uint32_t page_num) {
355 | +  if (page_num > TABLE_MAX_PAGES) {
356 | +     printf("Tried to fetch page number out of bounds. %d > %d\n", page_num,
357 | +     	TABLE_MAX_PAGES);
358 | +     exit(EXIT_FAILURE);
359 | +  }
360 | +
361 | +  if (pager->pages[page_num] == NULL) {
362 | +     // Cache miss. Allocate memory and load from file.
363 | +     void* page = malloc(PAGE_SIZE);
364 | +     uint32_t num_pages = pager->file_length / PAGE_SIZE;
365 | +
366 | +     // We might save a partial page at the end of the file
367 | +     if (pager->file_length % PAGE_SIZE) {
368 | +         num_pages += 1;
369 | +     }
370 | +
371 | +     if (page_num <= num_pages) {
372 | +         lseek(pager->file_descriptor, page_num * PAGE_SIZE, SEEK_SET);
373 | +         ssize_t bytes_read = read(pager->file_descriptor, page, PAGE_SIZE);
374 | +         if (bytes_read == -1) {
375 | +     	printf("Error reading file: %d\n", errno);
376 | +     	exit(EXIT_FAILURE);
377 | +         }
378 | +     }
379 | +
380 | +     pager->pages[page_num] = page;
381 | +  }
382 | +
383 | +  return pager->pages[page_num];
384 | +}
385 | +
386 |  void* row_slot(Table* table, uint32_t row_num) {
387 |    uint32_t page_num = row_num / ROWS_PER_PAGE;
388 | -  void *page = table->pages[page_num];
389 | -  if (page == NULL) {
390 | -     // Allocate memory only when we try to access page
391 | -     page = table->pages[page_num] = malloc(PAGE_SIZE);
392 | -  }
393 | +  void *page = get_page(table->pager, page_num);
394 |    uint32_t row_offset = row_num % ROWS_PER_PAGE;
395 |    uint32_t byte_offset = row_offset * ROW_SIZE;
396 |    return page + byte_offset;
397 |  }
398 | 
399 | -Table* new_table() {
400 | -  Table* table = malloc(sizeof(Table));
401 | -  table->num_rows = 0;
402 | +Pager* pager_open(const char* filename) {
403 | +  int fd = open(filename,
404 | +     	  O_RDWR | 	// Read/Write mode
405 | +     	      O_CREAT,	// Create file if it does not exist
406 | +     	  S_IWUSR |	// User write permission
407 | +     	      S_IRUSR	// User read permission
408 | +     	  );
409 | +
410 | +  if (fd == -1) {
411 | +     printf("Unable to open file\n");
412 | +     exit(EXIT_FAILURE);
413 | +  }
414 | +
415 | +  off_t file_length = lseek(fd, 0, SEEK_END);
416 | +
417 | +  Pager* pager = malloc(sizeof(Pager));
418 | +  pager->file_descriptor = fd;
419 | +  pager->file_length = file_length;
420 | +
421 |    for (uint32_t i = 0; i < TABLE_MAX_PAGES; i++) {
422 | -     table->pages[i] = NULL;
423 | +     pager->pages[i] = NULL;
424 |    }
425 | -  return table;
426 | +
427 | +  return pager;
428 |  }
429 | 
430 | -void free_table(Table* table) {
431 | -  for (int i = 0; table->pages[i]; i++) {
432 | -     free(table->pages[i]);
433 | -  }
434 | -  free(table);
435 | +Table* db_open(const char* filename) {
436 | +  Pager* pager = pager_open(filename);
437 | +  uint32_t num_rows = pager->file_length / ROW_SIZE;
438 | +
439 | +  Table* table = malloc(sizeof(Table));
440 | +  table->pager = pager;
441 | +  table->num_rows = num_rows;
442 | +
443 | +  return table;
444 |  }
445 | 
446 |  InputBuffer* new_input_buffer() {
447 | @@ -142,10 +201,76 @@ void close_input_buffer(InputBuffer* input_buffer) {
448 |    free(input_buffer);
449 |  }
450 | 
451 | +void pager_flush(Pager* pager, uint32_t page_num, uint32_t size) {
452 | +  if (pager->pages[page_num] == NULL) {
453 | +     printf("Tried to flush null page\n");
454 | +     exit(EXIT_FAILURE);
455 | +  }
456 | +
457 | +  off_t offset = lseek(pager->file_descriptor, page_num * PAGE_SIZE,
458 | +     		 SEEK_SET);
459 | +
460 | +  if (offset == -1) {
461 | +     printf("Error seeking: %d\n", errno);
462 | +     exit(EXIT_FAILURE);
463 | +  }
464 | +
465 | +  ssize_t bytes_written = write(
466 | +     pager->file_descriptor, pager->pages[page_num], size
467 | +     );
468 | +
469 | +  if (bytes_written == -1) {
470 | +     printf("Error writing: %d\n", errno);
471 | +     exit(EXIT_FAILURE);
472 | +  }
473 | +}
474 | +
475 | +void db_close(Table* table) {
476 | +  Pager* pager = table->pager;
477 | +  uint32_t num_full_pages = table->num_rows / ROWS_PER_PAGE;
478 | +
479 | +  for (uint32_t i = 0; i < num_full_pages; i++) {
480 | +     if (pager->pages[i] == NULL) {
481 | +         continue;
482 | +     }
483 | +     pager_flush(pager, i, PAGE_SIZE);
484 | +     free(pager->pages[i]);
485 | +     pager->pages[i] = NULL;
486 | +  }
487 | +
488 | +  // There may be a partial page to write to the end of the file
489 | +  // This should not be needed after we switch to a B-tree
490 | +  uint32_t num_additional_rows = table->num_rows % ROWS_PER_PAGE;
491 | +  if (num_additional_rows > 0) {
492 | +     uint32_t page_num = num_full_pages;
493 | +     if (pager->pages[page_num] != NULL) {
494 | +         pager_flush(pager, page_num, num_additional_rows * ROW_SIZE);
495 | +         free(pager->pages[page_num]);
496 | +         pager->pages[page_num] = NULL;
497 | +     }
498 | +  }
499 | +
500 | +  int result = close(pager->file_descriptor);
501 | +  if (result == -1) {
502 | +     printf("Error closing db file.\n");
503 | +     exit(EXIT_FAILURE);
504 | +  }
505 | +  for (uint32_t i = 0; i < TABLE_MAX_PAGES; i++) {
506 | +     void* page = pager->pages[i];
507 | +     if (page) {
508 | +         free(page);
509 | +         pager->pages[i] = NULL;
510 | +     }
511 | +  }
512 | +
513 | +  free(pager);
514 | +  free(table);
515 | +}
516 | +
517 |  MetaCommandResult do_meta_command(InputBuffer* input_buffer, Table *table) {
518 |    if (strcmp(input_buffer->buffer, ".exit") == 0) {
519 |      close_input_buffer(input_buffer);
520 | -    free_table(table);
521 | +    db_close(table);
522 |      exit(EXIT_SUCCESS);
523 |    } else {
524 |      return META_COMMAND_UNRECOGNIZED_COMMAND;
525 | @@ -182,6 +308,7 @@ PrepareResult prepare_insert(InputBuffer* input_buffer, Statement* statement) {
526 |      return PREPARE_SUCCESS;
527 | 
528 |  }
529 | +
530 |  PrepareResult prepare_statement(InputBuffer* input_buffer,
531 |                                  Statement* statement) {
532 |    if (strncmp(input_buffer->buffer, "insert", 6) == 0) {
533 | @@ -227,7 +354,14 @@ ExecuteResult execute_statement(Statement* statement, Table *table) {
534 |  }
535 | 
536 |  int main(int argc, char* argv[]) {
537 | -  Table* table = new_table();
538 | +  if (argc < 2) {
539 | +      printf("Must supply a database filename.\n");
540 | +      exit(EXIT_FAILURE);
541 | +  }
542 | +
543 | +  char* filename = argv[1];
544 | +  Table* table = db_open(filename);
545 | +
546 |    InputBuffer* input_buffer = new_input_buffer();
547 |    while (true) {
548 |      print_prompt();
549 | ```
550 | 
551 | And the diff to our tests:
552 | ```diff
553 |  describe 'database' do
554 | +  before do
555 | +    `rm -rf test.db`
556 | +  end
557 | +
558 |    def run_script(commands)
559 |      raw_output = nil
560 | -    IO.popen("./db", "r+") do |pipe|
561 | +    IO.popen("./db test.db", "r+") do |pipe|
562 |        commands.each do |command|
563 |          pipe.puts command
564 |        end
565 | @@ -28,6 +32,27 @@ describe 'database' do
566 |      ])
567 |    end
568 | 
569 | +  it 'keeps data after closing connection' do
570 | +    result1 = run_script([
571 | +      "insert 1 user1 person1@example.com",
572 | +      ".exit",
573 | +    ])
574 | +    expect(result1).to match_array([
575 | +      "db > Executed.",
576 | +      "db > ",
577 | +    ])
578 | +
579 | +    result2 = run_script([
580 | +      "select",
581 | +      ".exit",
582 | +    ])
583 | +    expect(result2).to match_array([
584 | +      "db > (1, user1, person1@example.com)",
585 | +      "Executed.",
586 | +      "db > ",
587 | +    ])
588 | +  end
589 | +
590 |    it 'prints error message when table is full' do
591 |      script = (1..1401).map do |i|
592 |        "insert #{i} user#{i} person#{i}@example.com"
593 | ```
594 | 


--------------------------------------------------------------------------------
/_parts/part6.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Part 6 - The Cursor Abstraction
  3 | date: 2017-09-10
  4 | ---
  5 | 
  6 | This should be a shorter part than the last one. We're just going to refactor a bit to make it easier to start the B-Tree implementation.
  7 | 
  8 | We're going to add a `Cursor` object which represents a location in the table. Things you might want to do with cursors:
  9 | 
 10 | - Create a cursor at the beginning of the table
 11 | - Create a cursor at the end of the table
 12 | - Access the row the cursor is pointing to
 13 | - Advance the cursor to the next row
 14 | 
 15 | Those are the behaviors we're going to implement now. Later, we will also want to:
 16 | 
 17 | - Delete the row pointed to by a cursor
 18 | - Modify the row pointed to by a cursor
 19 | - Search a table for a given ID, and create a cursor pointing to the row with that ID
 20 | 
 21 | Without further ado, here's the `Cursor` type:
 22 | 
 23 | ```diff
 24 | +typedef struct {
 25 | +  Table* table;
 26 | +  uint32_t row_num;
 27 | +  bool end_of_table;  // Indicates a position one past the last element
 28 | +} Cursor;
 29 | ```
 30 | 
 31 | Given our current table data structure, all you need to identify a location in a table is the row number.
 32 | 
 33 | A cursor also has a reference to the table it's part of (so our cursor functions can take just the cursor as a parameter).
 34 | 
 35 | Finally, it has a boolean called `end_of_table`. This is so we can represent a position past the end of the table (which is somewhere we may want to insert a row).
 36 | 
 37 | `table_start()` and `table_end()` create new cursors:
 38 | 
 39 | ```diff
 40 | +Cursor* table_start(Table* table) {
 41 | +  Cursor* cursor = malloc(sizeof(Cursor));
 42 | +  cursor->table = table;
 43 | +  cursor->row_num = 0;
 44 | +  cursor->end_of_table = (table->num_rows == 0);
 45 | +
 46 | +  return cursor;
 47 | +}
 48 | +
 49 | +Cursor* table_end(Table* table) {
 50 | +  Cursor* cursor = malloc(sizeof(Cursor));
 51 | +  cursor->table = table;
 52 | +  cursor->row_num = table->num_rows;
 53 | +  cursor->end_of_table = true;
 54 | +
 55 | +  return cursor;
 56 | +}
 57 | ```
 58 | 
 59 | Our `row_slot()` function will become `cursor_value()`, which returns a pointer to the position described by the cursor:
 60 | 
 61 | ```diff
 62 | -void* row_slot(Table* table, uint32_t row_num) {
 63 | +void* cursor_value(Cursor* cursor) {
 64 | +  uint32_t row_num = cursor->row_num;
 65 |    uint32_t page_num = row_num / ROWS_PER_PAGE;
 66 | -  void* page = get_page(table->pager, page_num);
 67 | +  void* page = get_page(cursor->table->pager, page_num);
 68 |    uint32_t row_offset = row_num % ROWS_PER_PAGE;
 69 |    uint32_t byte_offset = row_offset * ROW_SIZE;
 70 |    return page + byte_offset;
 71 |  }
 72 | ```
 73 | 
 74 | Advancing the cursor in our current table structure is as simple as incrementing the row number. This will be a bit more complicated in a B-tree.
 75 | 
 76 | ```diff
 77 | +void cursor_advance(Cursor* cursor) {
 78 | +  cursor->row_num += 1;
 79 | +  if (cursor->row_num >= cursor->table->num_rows) {
 80 | +    cursor->end_of_table = true;
 81 | +  }
 82 | +}
 83 | ```
 84 | 
 85 | Finally we can change our "virtual machine" methods to use the cursor abstraction. When inserting a row, we open a cursor at the end of table, write to that cursor location, then close the cursor.
 86 | 
 87 | ```diff
 88 |    Row* row_to_insert = &(statement->row_to_insert);
 89 | +  Cursor* cursor = table_end(table);
 90 | 
 91 | -  serialize_row(row_to_insert, row_slot(table, table->num_rows));
 92 | +  serialize_row(row_to_insert, cursor_value(cursor));
 93 |    table->num_rows += 1;
 94 | 
 95 | +  free(cursor);
 96 | +
 97 |    return EXECUTE_SUCCESS;
 98 |  }
 99 |  ```
100 | 
101 | When selecting all rows in the table, we open a cursor at the start of the table, print the row, then advance the cursor to the next row. Repeat until we've reached the end of the table.
102 | 
103 | ```diff
104 |  ExecuteResult execute_select(Statement* statement, Table* table) {
105 | +  Cursor* cursor = table_start(table);
106 | +
107 |    Row row;
108 | -  for (uint32_t i = 0; i < table->num_rows; i++) {
109 | -    deserialize_row(row_slot(table, i), &row);
110 | +  while (!(cursor->end_of_table)) {
111 | +    deserialize_row(cursor_value(cursor), &row);
112 |      print_row(&row);
113 | +    cursor_advance(cursor);
114 |    }
115 | +
116 | +  free(cursor);
117 | +
118 |    return EXECUTE_SUCCESS;
119 |  }
120 |  ```
121 | 
122 | Alright, that's it! Like I said, this was a shorter refactor that should help us as we rewrite our table data structure into a B-Tree. `execute_select()` and `execute_insert()` can interact with the table entirely through the cursor without assuming anything about how the table is stored.
123 | 
124 | Here's the complete diff to this part:
125 | ```diff
126 | @@ -78,6 +78,13 @@ struct {
127 |  } Table;
128 | 
129 | +typedef struct {
130 | +  Table* table;
131 | +  uint32_t row_num;
132 | +  bool end_of_table; // Indicates a position one past the last element
133 | +} Cursor;
134 | +
135 |  void print_row(Row* row) {
136 |      printf("(%d, %s, %s)\n", row->id, row->username, row->email);
137 |  }
138 | @@ -126,12 +133,38 @@ void* get_page(Pager* pager, uint32_t page_num) {
139 |      return pager->pages[page_num];
140 |  }
141 | 
142 | -void* row_slot(Table* table, uint32_t row_num) {
143 | -  uint32_t page_num = row_num / ROWS_PER_PAGE;
144 | -  void *page = get_page(table->pager, page_num);
145 | -  uint32_t row_offset = row_num % ROWS_PER_PAGE;
146 | -  uint32_t byte_offset = row_offset * ROW_SIZE;
147 | -  return page + byte_offset;
148 | +Cursor* table_start(Table* table) {
149 | +  Cursor* cursor = malloc(sizeof(Cursor));
150 | +  cursor->table = table;
151 | +  cursor->row_num = 0;
152 | +  cursor->end_of_table = (table->num_rows == 0);
153 | +
154 | +  return cursor;
155 | +}
156 | +
157 | +Cursor* table_end(Table* table) {
158 | +  Cursor* cursor = malloc(sizeof(Cursor));
159 | +  cursor->table = table;
160 | +  cursor->row_num = table->num_rows;
161 | +  cursor->end_of_table = true;
162 | +
163 | +  return cursor;
164 | +}
165 | +
166 | +void* cursor_value(Cursor* cursor) {
167 | +  uint32_t row_num = cursor->row_num;
168 | +  uint32_t page_num = row_num / ROWS_PER_PAGE;
169 | +  void *page = get_page(cursor->table->pager, page_num);
170 | +  uint32_t row_offset = row_num % ROWS_PER_PAGE;
171 | +  uint32_t byte_offset = row_offset * ROW_SIZE;
172 | +  return page + byte_offset;
173 | +}
174 | +
175 | +void cursor_advance(Cursor* cursor) {
176 | +  cursor->row_num += 1;
177 | +  if (cursor->row_num >= cursor->table->num_rows) {
178 | +    cursor->end_of_table = true;
179 | +  }
180 |  }
181 | 
182 |  Pager* pager_open(const char* filename) {
183 | @@ -327,19 +360,28 @@ ExecuteResult execute_insert(Statement* statement, Table* table) {
184 |      }
185 | 
186 |    Row* row_to_insert = &(statement->row_to_insert);
187 | +  Cursor* cursor = table_end(table);
188 | 
189 | -  serialize_row(row_to_insert, row_slot(table, table->num_rows));
190 | +  serialize_row(row_to_insert, cursor_value(cursor));
191 |    table->num_rows += 1;
192 | 
193 | +  free(cursor);
194 | +
195 |    return EXECUTE_SUCCESS;
196 |  }
197 | 
198 |  ExecuteResult execute_select(Statement* statement, Table* table) {
199 | +  Cursor* cursor = table_start(table);
200 | +
201 |    Row row;
202 | -  for (uint32_t i = 0; i < table->num_rows; i++) {
203 | -     deserialize_row(row_slot(table, i), &row);
204 | +  while (!(cursor->end_of_table)) {
205 | +     deserialize_row(cursor_value(cursor), &row);
206 |       print_row(&row);
207 | +     cursor_advance(cursor);
208 |    }
209 | +
210 | +  free(cursor);
211 | +
212 |    return EXECUTE_SUCCESS;
213 |  }
214 | ```
215 | 


--------------------------------------------------------------------------------
/_parts/part7.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Part 7 - Introduction to the B-Tree
 3 | date: 2017-09-23
 4 | ---
 5 | 
 6 | The B-Tree is the data structure SQLite uses to represent both tables and indexes, so it's a pretty central idea. This article will just introduce the data structure, so it won't have any code.
 7 | 
 8 | Why is a tree a good data structure for a database?
 9 | 
10 | - Searching for a particular value is fast (logarithmic time)
11 | - Inserting / deleting a value you've already found is fast (constant-ish time to rebalance)
12 | - Traversing a range of values is fast (unlike a hash map)
13 | 
14 | A B-Tree is different from a binary tree (the "B" probably stands for the inventor's name, but could also stand for "balanced"). Here's an example B-Tree:
15 | 
16 | {% include image.html url="assets/images/B-tree.png" description="example B-Tree (https://en.wikipedia.org/wiki/File:B-tree.svg)" %}
17 | 
18 | Unlike a binary tree, each node in a B-Tree can have more than 2 children. Each node can have up to m children, where m is called the tree's "order". To keep the tree mostly balanced, we also say nodes have to have at least m/2 children (rounded up).
19 | 
20 | Exceptions:
21 | - Leaf nodes have 0 children
22 | - The root node can have fewer than m children but must have at least 2
23 | - If the root node is a leaf node (the only node), it still has 0 children
24 | 
25 | The picture from above is a B-Tree, which SQLite uses to store indexes. To store tables, SQLites uses a variation called a B+ tree.
26 | 
27 | |                               | B-tree         | B+ tree             |
28 | |-------------------------------|----------------|---------------------|
29 | | Pronounced                    | "Bee Tree"     | "Bee Plus Tree"     |
30 | | Used to store                 | Indexes        | Tables              |
31 | | Internal nodes store keys     | Yes            | Yes                 |
32 | | Internal nodes store values   | Yes            | No                  |
33 | | Number of children per node   | Less           | More                |
34 | | Internal nodes vs. leaf nodes | Same structure | Different structure |
35 | 
36 | Until we get to implementing indexes, I'm going to talk solely about B+ trees, but I'll just refer to it as a B-tree or a btree.
37 | 
38 | Nodes with children are called "internal" nodes. Internal nodes and leaf nodes are structured differently:
39 | 
40 | | For an order-m tree... | Internal Node                 | Leaf Node           |
41 | |------------------------|-------------------------------|---------------------|
42 | | Stores                 | keys and pointers to children | keys and values     |
43 | | Number of keys         | up to m-1                     | as many as will fit |
44 | | Number of pointers     | number of keys + 1            | none                |
45 | | Number of values       | none                          | number of keys      |
46 | | Key purpose            | used for routing              | paired with value   |
47 | | Stores values?         | No                            | Yes                 |
48 | 
49 | Let's work through an example to see how a B-tree grows as you insert elements into it. To keep things simple, the tree will be order 3. That means:
50 | 
51 | - up to 3 children per internal node
52 | - up to 2 keys per internal node
53 | - at least 2 children per internal node
54 | - at least 1 key per internal node
55 | 
56 | An empty B-tree has a single node: the root node. The root node starts as a leaf node with zero key/value pairs:
57 | 
58 | {% include image.html url="assets/images/btree1.png" description="empty btree" %}
59 | 
60 | If we insert a couple key/value pairs, they are stored in the leaf node in sorted order.
61 | 
62 | {% include image.html url="assets/images/btree2.png" description="one-node btree" %}
63 | 
64 | Let's say that the capacity of a leaf node is two key/value pairs. When we insert another, we have to split the leaf node and put half the pairs in each node. Both nodes become children of a new internal node which will now be the root node.
65 | 
66 | {% include image.html url="assets/images/btree3.png" description="two-level btree" %}
67 | 
68 | The internal node has 1 key and 2 pointers to child nodes. If we want to look up a key that is less than or equal to 5, we look in the left child. If we want to look up a key greater than 5, we look in the right child.
69 | 
70 | Now let's insert the key "2". First we look up which leaf node it would be in if it was present, and we arrive at the left leaf node. The node is full, so we split the leaf node and create a new entry in the parent node.
71 | 
72 | {% include image.html url="assets/images/btree4.png" description="four-node btree" %}
73 | 
74 | Let's keep adding keys. 18 and 21. We get to the point where we have to split again, but there's no room in the parent node for another key/pointer pair.
75 | 
76 | {% include image.html url="assets/images/btree5.png" description="no room in internal node" %}
77 | 
78 | The solution is to split the root node into two internal nodes, then create new root node to be their parent.
79 | 
80 | {% include image.html url="assets/images/btree6.png" description="three-level btree" %}
81 | 
82 | The depth of the tree only increases when we split the root node. Every leaf node has the same depth and close to the same number of key/value pairs, so the tree remains balanced and quick to search.
83 | 
84 | I'm going to hold off on discussion of deleting keys from the tree until after we've implemented insertion.
85 | 
86 | When we implement this data structure, each node will correspond to one page. The root node will exist in page 0. Child pointers will simply be the page number that contains the child node.
87 | 
88 | Next time, we start implementing the btree!
89 | 


--------------------------------------------------------------------------------
/_parts/part9.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Part 9 - Binary Search and Duplicate Keys
  3 | date: 2017-10-01
  4 | ---
  5 | 
  6 | Last time we noted that we're still storing keys in unsorted order. We're going to fix that problem, plus detect and reject duplicate keys.
  7 | 
  8 | Right now, our `execute_insert()` function always chooses to insert at the end of the table. Instead, we should search the table for the correct place to insert, then insert there. If the key already exists there, return an error.
  9 | 
 10 | ```diff
 11 | ExecuteResult execute_insert(Statement* statement, Table* table) {
 12 |    void* node = get_page(table->pager, table->root_page_num);
 13 | -  if ((*leaf_node_num_cells(node) >= LEAF_NODE_MAX_CELLS)) {
 14 | +  uint32_t num_cells = (*leaf_node_num_cells(node));
 15 | +  if (num_cells >= LEAF_NODE_MAX_CELLS) {
 16 |      return EXECUTE_TABLE_FULL;
 17 |    }
 18 | 
 19 |    Row* row_to_insert = &(statement->row_to_insert);
 20 | -  Cursor* cursor = table_end(table);
 21 | +  uint32_t key_to_insert = row_to_insert->id;
 22 | +  Cursor* cursor = table_find(table, key_to_insert);
 23 | +
 24 | +  if (cursor->cell_num < num_cells) {
 25 | +    uint32_t key_at_index = *leaf_node_key(node, cursor->cell_num);
 26 | +    if (key_at_index == key_to_insert) {
 27 | +      return EXECUTE_DUPLICATE_KEY;
 28 | +    }
 29 | +  }
 30 | 
 31 |    leaf_node_insert(cursor, row_to_insert->id, row_to_insert);
 32 | ```
 33 | 
 34 | We don't need the `table_end()` function anymore.
 35 | 
 36 | ```diff
 37 | -Cursor* table_end(Table* table) {
 38 | -  Cursor* cursor = malloc(sizeof(Cursor));
 39 | -  cursor->table = table;
 40 | -  cursor->page_num = table->root_page_num;
 41 | -
 42 | -  void* root_node = get_page(table->pager, table->root_page_num);
 43 | -  uint32_t num_cells = *leaf_node_num_cells(root_node);
 44 | -  cursor->cell_num = num_cells;
 45 | -  cursor->end_of_table = true;
 46 | -
 47 | -  return cursor;
 48 | -}
 49 | ```
 50 | 
 51 | We'll replace it with a method that searches the tree for a given key.
 52 | 
 53 | ```diff
 54 | +/*
 55 | +Return the position of the given key.
 56 | +If the key is not present, return the position
 57 | +where it should be inserted
 58 | +*/
 59 | +Cursor* table_find(Table* table, uint32_t key) {
 60 | +  uint32_t root_page_num = table->root_page_num;
 61 | +  void* root_node = get_page(table->pager, root_page_num);
 62 | +
 63 | +  if (get_node_type(root_node) == NODE_LEAF) {
 64 | +    return leaf_node_find(table, root_page_num, key);
 65 | +  } else {
 66 | +    printf("Need to implement searching an internal node\n");
 67 | +    exit(EXIT_FAILURE);
 68 | +  }
 69 | +}
 70 | ```
 71 | 
 72 | I'm stubbing out the branch for internal nodes because we haven't implemented internal nodes yet. We can search the leaf node with binary search.
 73 | 
 74 | ```diff
 75 | +Cursor* leaf_node_find(Table* table, uint32_t page_num, uint32_t key) {
 76 | +  void* node = get_page(table->pager, page_num);
 77 | +  uint32_t num_cells = *leaf_node_num_cells(node);
 78 | +
 79 | +  Cursor* cursor = malloc(sizeof(Cursor));
 80 | +  cursor->table = table;
 81 | +  cursor->page_num = page_num;
 82 | +
 83 | +  // Binary search
 84 | +  uint32_t min_index = 0;
 85 | +  uint32_t one_past_max_index = num_cells;
 86 | +  while (one_past_max_index != min_index) {
 87 | +    uint32_t index = (min_index + one_past_max_index) / 2;
 88 | +    uint32_t key_at_index = *leaf_node_key(node, index);
 89 | +    if (key == key_at_index) {
 90 | +      cursor->cell_num = index;
 91 | +      return cursor;
 92 | +    }
 93 | +    if (key < key_at_index) {
 94 | +      one_past_max_index = index;
 95 | +    } else {
 96 | +      min_index = index + 1;
 97 | +    }
 98 | +  }
 99 | +
100 | +  cursor->cell_num = min_index;
101 | +  return cursor;
102 | +}
103 | ```
104 | 
105 | This will either return
106 | - the position of the key,
107 | - the position of another key that we'll need to move if we want to insert the new key, or
108 | - the position one past the last key
109 | 
110 | Since we're now checking node type, we need functions to get and set that value in a node.
111 | 
112 | ```diff
113 | +NodeType get_node_type(void* node) {
114 | +  uint8_t value = *((uint8_t*)(node + NODE_TYPE_OFFSET));
115 | +  return (NodeType)value;
116 | +}
117 | +
118 | +void set_node_type(void* node, NodeType type) {
119 | +  uint8_t value = type;
120 | +  *((uint8_t*)(node + NODE_TYPE_OFFSET)) = value;
121 | +}
122 | ```
123 | 
124 | We have to cast to `uint8_t` first to ensure it's serialized as a single byte.
125 | 
126 | We also need to initialize node type.
127 | 
128 | ```diff
129 | -void initialize_leaf_node(void* node) { *leaf_node_num_cells(node) = 0; }
130 | +void initialize_leaf_node(void* node) {
131 | +  set_node_type(node, NODE_LEAF);
132 | +  *leaf_node_num_cells(node) = 0;
133 | +}
134 | ```
135 | 
136 | Lastly, we need to make and handle a new error code.
137 | 
138 | ```diff
139 | -enum ExecuteResult_t { EXECUTE_SUCCESS, EXECUTE_TABLE_FULL };
140 | +enum ExecuteResult_t {
141 | +  EXECUTE_SUCCESS,
142 | +  EXECUTE_DUPLICATE_KEY,
143 | +  EXECUTE_TABLE_FULL
144 | +};
145 | ```
146 | 
147 | ```diff
148 |        case (EXECUTE_SUCCESS):
149 |          printf("Executed.\n");
150 |          break;
151 | +      case (EXECUTE_DUPLICATE_KEY):
152 | +        printf("Error: Duplicate key.\n");
153 | +        break;
154 |        case (EXECUTE_TABLE_FULL):
155 |          printf("Error: Table full.\n");
156 |          break;
157 | ```
158 | 
159 | With these changes, our test can change to check for sorted order:
160 | 
161 | ```diff
162 |        "db > Executed.",
163 |        "db > Tree:",
164 |        "leaf (size 3)",
165 | -      "  - 0 : 3",
166 | -      "  - 1 : 1",
167 | -      "  - 2 : 2",
168 | +      "  - 0 : 1",
169 | +      "  - 1 : 2",
170 | +      "  - 2 : 3",
171 |        "db > "
172 |      ])
173 |    end
174 | ```
175 | 
176 | And we can add a new test for duplicate keys:
177 | 
178 | ```diff
179 | +  it 'prints an error message if there is a duplicate id' do
180 | +    script = [
181 | +      "insert 1 user1 person1@example.com",
182 | +      "insert 1 user1 person1@example.com",
183 | +      "select",
184 | +      ".exit",
185 | +    ]
186 | +    result = run_script(script)
187 | +    expect(result).to match_array([
188 | +      "db > Executed.",
189 | +      "db > Error: Duplicate key.",
190 | +      "db > (1, user1, person1@example.com)",
191 | +      "Executed.",
192 | +      "db > ",
193 | +    ])
194 | +  end
195 | ```
196 | 
197 | That's it! Next up: implement splitting leaf nodes and creating internal nodes.
198 | 


--------------------------------------------------------------------------------
/assets/css/style.scss:
--------------------------------------------------------------------------------
 1 | ---
 2 | ---
 3 | 
 4 | @import "{{ site.theme }}";
 5 | 
 6 | // Everything below this line will override the default template styles
 7 | 
 8 | a:hover, a:focus { color: #069; font-weight: normal; text-decoration: underline; }
 9 | 
10 | table.image td {
11 |   text-align: center;
12 | }
13 | 
14 | body {
15 |   color: #323232;
16 | }
17 | 
18 | a.prev {
19 |   float: left;
20 | }
21 | 
22 | a.prev:before {
23 |   content: "< "
24 | }
25 | 
26 | a.next {
27 |   float: right;
28 | }
29 | 
30 | a.next:after {
31 |   content: " >"
32 | }


--------------------------------------------------------------------------------
/assets/images/B-tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/B-tree.png


--------------------------------------------------------------------------------
/assets/images/arch-part5.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/arch-part5.gif


--------------------------------------------------------------------------------
/assets/images/arch1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/arch1.gif


--------------------------------------------------------------------------------
/assets/images/arch2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/arch2.gif


--------------------------------------------------------------------------------
/assets/images/btree1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/btree1.png


--------------------------------------------------------------------------------
/assets/images/btree2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/btree2.png


--------------------------------------------------------------------------------
/assets/images/btree3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/btree3.png


--------------------------------------------------------------------------------
/assets/images/btree4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/btree4.png


--------------------------------------------------------------------------------
/assets/images/btree5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/btree5.png


--------------------------------------------------------------------------------
/assets/images/btree6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/btree6.png


--------------------------------------------------------------------------------
/assets/images/code-crafters.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/code-crafters.jpeg


--------------------------------------------------------------------------------
/assets/images/file-format.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/file-format.png


--------------------------------------------------------------------------------
/assets/images/internal-node-format.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/internal-node-format.png


--------------------------------------------------------------------------------
/assets/images/leaf-node-format.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/leaf-node-format.png


--------------------------------------------------------------------------------
/assets/images/splitting-internal-node.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/splitting-internal-node.png


--------------------------------------------------------------------------------
/assets/images/updating-internal-node.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstack/db_tutorial/60b50c5b7be787a4aaa1e50ab8a90c6cabb75159/assets/images/updating-internal-node.png


--------------------------------------------------------------------------------
/feed.xml:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: none
 3 | ---
 4 | <?xml version="1.0" encoding="UTF-8"?>
 5 | <rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
 6 |   <channel>
 7 |     <title>{{ site.title | xml_escape }}</title>
 8 |     <description>{{ site.description | xml_escape }}</description>
 9 |     <link>{{site.url}}{{site.baseurl}}</link>
10 |     <atom:link href="{{site.url}}{{site.baseurl}}/feed.xml" rel="self" type="application/rss+xml" />
11 |     {% assign limit = 10 %}
12 |     {% assign offset = (site.parts.size | minus: limit) %}
13 |     {% for part in site.parts offset:offset limit:limit %}
14 |       <item>
15 |         <title>{{ part.title | xml_escape }}</title>
16 |         <description>{{ part.content | xml_escape }}</description>
17 |         <pubDate>{{ part.date | date: "%a, %d %b %Y %H:%M:%S %z" }}</pubDate>
18 |         <link>{{site.url}}{{site.baseurl}}{{part.url}}</link>
19 |         <guid isPermaLink="true">{{site.url}}{{site.baseurl}}{{part.url}}</guid>
20 |       </item>
21 |     {% endfor %}
22 |   </channel>
23 | </rss>
24 | 


--------------------------------------------------------------------------------
/index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: How Does a Database Work?
 3 | ---
 4 | 
 5 | - What format is data saved in? (in memory and on disk)
 6 | - When does it move from memory to disk?
 7 | - Why can there only be one primary key per table?
 8 | - How does rolling back a transaction work?
 9 | - How are indexes formatted?
10 | - When and how does a full table scan happen?
11 | - What format is a prepared statement saved in?
12 | 
13 | In short, how does a database **work**?
14 | 
15 | I'm building a clone of [sqlite](https://www.sqlite.org/arch.html) from scratch in C in order to understand, and I'm going to document my process as I go.
16 | 
17 | # Table of Contents
18 | {% for part in site.parts %}- [{{part.title}}]({{site.baseurl}}{{part.url}})
19 | {% endfor %}
20 | 
21 | > "What I cannot create, I do not understand." -- [Richard Feynman](https://en.m.wikiquote.org/wiki/Richard_Feynman)
22 | 
23 | {% include image.html url="assets/images/arch2.gif" description="sqlite architecture (https://www.sqlite.org/arch.html)" %}


--------------------------------------------------------------------------------
/spec/main_spec.rb:
--------------------------------------------------------------------------------
  1 | describe 'database' do
  2 |   before do
  3 |     `rm -rf test.db`
  4 |   end
  5 | 
  6 |   def run_script(commands)
  7 |     raw_output = nil
  8 |     IO.popen("./db test.db", "r+") do |pipe|
  9 |       commands.each do |command|
 10 |         begin
 11 |           pipe.puts command
 12 |         rescue Errno::EPIPE
 13 |           break
 14 |         end
 15 |       end
 16 | 
 17 |       pipe.close_write
 18 | 
 19 |       # Read entire output
 20 |       raw_output = pipe.gets(nil)
 21 |     end
 22 |     raw_output.split("\n")
 23 |   end
 24 | 
 25 |   it 'inserts and retrieves a row' do
 26 |     result = run_script([
 27 |       "insert 1 user1 person1@example.com",
 28 |       "select",
 29 |       ".exit",
 30 |     ])
 31 |     expect(result).to match_array([
 32 |       "db > Executed.",
 33 |       "db > (1, user1, person1@example.com)",
 34 |       "Executed.",
 35 |       "db > ",
 36 |     ])
 37 |   end
 38 | 
 39 |   it 'keeps data after closing connection' do
 40 |     result1 = run_script([
 41 |       "insert 1 user1 person1@example.com",
 42 |       ".exit",
 43 |     ])
 44 |     expect(result1).to match_array([
 45 |       "db > Executed.",
 46 |       "db > ",
 47 |     ])
 48 | 
 49 |     result2 = run_script([
 50 |       "select",
 51 |       ".exit",
 52 |     ])
 53 |     expect(result2).to match_array([
 54 |       "db > (1, user1, person1@example.com)",
 55 |       "Executed.",
 56 |       "db > ",
 57 |     ])
 58 |   end
 59 | 
 60 |   it 'prints error message when table is full' do
 61 |     script = (1..1401).map do |i|
 62 |       "insert #{i} user#{i} person#{i}@example.com"
 63 |     end
 64 |     script << ".exit"
 65 |     result = run_script(script)
 66 |     expect(result.last(2)).to match_array([
 67 |       "db > Executed.",
 68 |       "db > ",
 69 |     ])
 70 |   end
 71 | 
 72 |   it 'allows inserting strings that are the maximum length' do
 73 |     long_username = "a"*32
 74 |     long_email = "a"*255
 75 |     script = [
 76 |       "insert 1 #{long_username} #{long_email}",
 77 |       "select",
 78 |       ".exit",
 79 |     ]
 80 |     result = run_script(script)
 81 |     expect(result).to match_array([
 82 |       "db > Executed.",
 83 |       "db > (1, #{long_username}, #{long_email})",
 84 |       "Executed.",
 85 |       "db > ",
 86 |     ])
 87 |   end
 88 | 
 89 |   it 'prints error message if strings are too long' do
 90 |     long_username = "a"*33
 91 |     long_email = "a"*256
 92 |     script = [
 93 |       "insert 1 #{long_username} #{long_email}",
 94 |       "select",
 95 |       ".exit",
 96 |     ]
 97 |     result = run_script(script)
 98 |     expect(result).to match_array([
 99 |       "db > String is too long.",
100 |       "db > Executed.",
101 |       "db > ",
102 |     ])
103 |   end
104 | 
105 |   it 'prints an error message if id is negative' do
106 |     script = [
107 |       "insert -1 cstack foo@bar.com",
108 |       "select",
109 |       ".exit",
110 |     ]
111 |     result = run_script(script)
112 |     expect(result).to match_array([
113 |       "db > ID must be positive.",
114 |       "db > Executed.",
115 |       "db > ",
116 |     ])
117 |   end
118 | 
119 |   it 'prints an error message if there is a duplicate id' do
120 |     script = [
121 |       "insert 1 user1 person1@example.com",
122 |       "insert 1 user1 person1@example.com",
123 |       "select",
124 |       ".exit",
125 |     ]
126 |     result = run_script(script)
127 |     expect(result).to match_array([
128 |       "db > Executed.",
129 |       "db > Error: Duplicate key.",
130 |       "db > (1, user1, person1@example.com)",
131 |       "Executed.",
132 |       "db > ",
133 |     ])
134 |   end
135 | 
136 |   it 'allows printing out the structure of a one-node btree' do
137 |     script = [3, 1, 2].map do |i|
138 |       "insert #{i} user#{i} person#{i}@example.com"
139 |     end
140 |     script << ".btree"
141 |     script << ".exit"
142 |     result = run_script(script)
143 | 
144 |     expect(result).to match_array([
145 |       "db > Executed.",
146 |       "db > Executed.",
147 |       "db > Executed.",
148 |       "db > Tree:",
149 |       "- leaf (size 3)",
150 |       "  - 1",
151 |       "  - 2",
152 |       "  - 3",
153 |       "db > "
154 |     ])
155 |   end
156 | 
157 |   it 'allows printing out the structure of a 3-leaf-node btree' do
158 |     script = (1..14).map do |i|
159 |       "insert #{i} user#{i} person#{i}@example.com"
160 |     end
161 |     script << ".btree"
162 |     script << "insert 15 user15 person15@example.com"
163 |     script << ".exit"
164 |     result = run_script(script)
165 | 
166 |     expect(result[14...(result.length)]).to match_array([
167 |       "db > Tree:",
168 |       "- internal (size 1)",
169 |       "  - leaf (size 7)",
170 |       "    - 1",
171 |       "    - 2",
172 |       "    - 3",
173 |       "    - 4",
174 |       "    - 5",
175 |       "    - 6",
176 |       "    - 7",
177 |       "  - key 7",
178 |       "  - leaf (size 7)",
179 |       "    - 8",
180 |       "    - 9",
181 |       "    - 10",
182 |       "    - 11",
183 |       "    - 12",
184 |       "    - 13",
185 |       "    - 14",
186 |       "db > Executed.",
187 |       "db > ",
188 |     ])
189 |   end
190 | 
191 |   it 'allows printing out the structure of a 4-leaf-node btree' do
192 |     script = [
193 |       "insert 18 user18 person18@example.com",
194 |       "insert 7 user7 person7@example.com",
195 |       "insert 10 user10 person10@example.com",
196 |       "insert 29 user29 person29@example.com",
197 |       "insert 23 user23 person23@example.com",
198 |       "insert 4 user4 person4@example.com",
199 |       "insert 14 user14 person14@example.com",
200 |       "insert 30 user30 person30@example.com",
201 |       "insert 15 user15 person15@example.com",
202 |       "insert 26 user26 person26@example.com",
203 |       "insert 22 user22 person22@example.com",
204 |       "insert 19 user19 person19@example.com",
205 |       "insert 2 user2 person2@example.com",
206 |       "insert 1 user1 person1@example.com",
207 |       "insert 21 user21 person21@example.com",
208 |       "insert 11 user11 person11@example.com",
209 |       "insert 6 user6 person6@example.com",
210 |       "insert 20 user20 person20@example.com",
211 |       "insert 5 user5 person5@example.com",
212 |       "insert 8 user8 person8@example.com",
213 |       "insert 9 user9 person9@example.com",
214 |       "insert 3 user3 person3@example.com",
215 |       "insert 12 user12 person12@example.com",
216 |       "insert 27 user27 person27@example.com",
217 |       "insert 17 user17 person17@example.com",
218 |       "insert 16 user16 person16@example.com",
219 |       "insert 13 user13 person13@example.com",
220 |       "insert 24 user24 person24@example.com",
221 |       "insert 25 user25 person25@example.com",
222 |       "insert 28 user28 person28@example.com",
223 |       ".btree",
224 |       ".exit",
225 |     ]
226 |     result = run_script(script)
227 | 
228 |     expect(result[30...(result.length)]).to match_array([
229 |       "db > Tree:",
230 |       "- internal (size 3)",
231 |       "  - leaf (size 7)",
232 |       "    - 1",
233 |       "    - 2",
234 |       "    - 3",
235 |       "    - 4",
236 |       "    - 5",
237 |       "    - 6",
238 |       "    - 7",
239 |       "  - key 7",
240 |       "  - leaf (size 8)",
241 |       "    - 8",
242 |       "    - 9",
243 |       "    - 10",
244 |       "    - 11",
245 |       "    - 12",
246 |       "    - 13",
247 |       "    - 14",
248 |       "    - 15",
249 |       "  - key 15",
250 |       "  - leaf (size 7)",
251 |       "    - 16",
252 |       "    - 17",
253 |       "    - 18",
254 |       "    - 19",
255 |       "    - 20",
256 |       "    - 21",
257 |       "    - 22",
258 |       "  - key 22",
259 |       "  - leaf (size 8)",
260 |       "    - 23",
261 |       "    - 24",
262 |       "    - 25",
263 |       "    - 26",
264 |       "    - 27",
265 |       "    - 28",
266 |       "    - 29",
267 |       "    - 30",
268 |       "db > ",
269 |     ])
270 |   end
271 | 
272 |   it 'allows printing out the structure of a 7-leaf-node btree' do
273 |     script = [
274 |       "insert 58 user58 person58@example.com",
275 |       "insert 56 user56 person56@example.com",
276 |       "insert 8 user8 person8@example.com",
277 |       "insert 54 user54 person54@example.com",
278 |       "insert 77 user77 person77@example.com",
279 |       "insert 7 user7 person7@example.com",
280 |       "insert 25 user25 person25@example.com",
281 |       "insert 71 user71 person71@example.com",
282 |       "insert 13 user13 person13@example.com",
283 |       "insert 22 user22 person22@example.com",
284 |       "insert 53 user53 person53@example.com",
285 |       "insert 51 user51 person51@example.com",
286 |       "insert 59 user59 person59@example.com",
287 |       "insert 32 user32 person32@example.com",
288 |       "insert 36 user36 person36@example.com",
289 |       "insert 79 user79 person79@example.com",
290 |       "insert 10 user10 person10@example.com",
291 |       "insert 33 user33 person33@example.com",
292 |       "insert 20 user20 person20@example.com",
293 |       "insert 4 user4 person4@example.com",
294 |       "insert 35 user35 person35@example.com",
295 |       "insert 76 user76 person76@example.com",
296 |       "insert 49 user49 person49@example.com",
297 |       "insert 24 user24 person24@example.com",
298 |       "insert 70 user70 person70@example.com",
299 |       "insert 48 user48 person48@example.com",
300 |       "insert 39 user39 person39@example.com",
301 |       "insert 15 user15 person15@example.com",
302 |       "insert 47 user47 person47@example.com",
303 |       "insert 30 user30 person30@example.com",
304 |       "insert 86 user86 person86@example.com",
305 |       "insert 31 user31 person31@example.com",
306 |       "insert 68 user68 person68@example.com",
307 |       "insert 37 user37 person37@example.com",
308 |       "insert 66 user66 person66@example.com",
309 |       "insert 63 user63 person63@example.com",
310 |       "insert 40 user40 person40@example.com",
311 |       "insert 78 user78 person78@example.com",
312 |       "insert 19 user19 person19@example.com",
313 |       "insert 46 user46 person46@example.com",
314 |       "insert 14 user14 person14@example.com",
315 |       "insert 81 user81 person81@example.com",
316 |       "insert 72 user72 person72@example.com",
317 |       "insert 6 user6 person6@example.com",
318 |       "insert 50 user50 person50@example.com",
319 |       "insert 85 user85 person85@example.com",
320 |       "insert 67 user67 person67@example.com",
321 |       "insert 2 user2 person2@example.com",
322 |       "insert 55 user55 person55@example.com",
323 |       "insert 69 user69 person69@example.com",
324 |       "insert 5 user5 person5@example.com",
325 |       "insert 65 user65 person65@example.com",
326 |       "insert 52 user52 person52@example.com",
327 |       "insert 1 user1 person1@example.com",
328 |       "insert 29 user29 person29@example.com",
329 |       "insert 9 user9 person9@example.com",
330 |       "insert 43 user43 person43@example.com",
331 |       "insert 75 user75 person75@example.com",
332 |       "insert 21 user21 person21@example.com",
333 |       "insert 82 user82 person82@example.com",
334 |       "insert 12 user12 person12@example.com",
335 |       "insert 18 user18 person18@example.com",
336 |       "insert 60 user60 person60@example.com",
337 |       "insert 44 user44 person44@example.com",
338 |       ".btree",
339 |       ".exit",
340 |     ]
341 |     result = run_script(script)
342 | 
343 |     expect(result[64...(result.length)]).to match_array([
344 |       "db > Tree:",
345 |       "- internal (size 1)",
346 |       "  - internal (size 2)",
347 |       "    - leaf (size 7)",
348 |       "      - 1",
349 |       "      - 2",
350 |       "      - 4",
351 |       "      - 5",
352 |       "      - 6",
353 |       "      - 7",
354 |       "      - 8",
355 |       "    - key 8",
356 |       "    - leaf (size 11)",
357 |       "      - 9",
358 |       "      - 10",
359 |       "      - 12",
360 |       "      - 13",
361 |       "      - 14",
362 |       "      - 15",
363 |       "      - 18",
364 |       "      - 19",
365 |       "      - 20",
366 |       "      - 21",
367 |       "      - 22",
368 |       "    - key 22",
369 |       "    - leaf (size 8)",
370 |       "      - 24",
371 |       "      - 25",
372 |       "      - 29",
373 |       "      - 30",
374 |       "      - 31",
375 |       "      - 32",
376 |       "      - 33",
377 |       "      - 35",
378 |       "  - key 35",
379 |       "  - internal (size 3)",
380 |       "    - leaf (size 12)",
381 |       "      - 36",
382 |       "      - 37",
383 |       "      - 39",
384 |       "      - 40",
385 |       "      - 43",
386 |       "      - 44",
387 |       "      - 46",
388 |       "      - 47",
389 |       "      - 48",
390 |       "      - 49",
391 |       "      - 50",
392 |       "      - 51",
393 |       "    - key 51",
394 |       "    - leaf (size 11)",
395 |       "      - 52",
396 |       "      - 53",
397 |       "      - 54",
398 |       "      - 55",
399 |       "      - 56",
400 |       "      - 58",
401 |       "      - 59",
402 |       "      - 60",
403 |       "      - 63",
404 |       "      - 65",
405 |       "      - 66",
406 |       "    - key 66",
407 |       "    - leaf (size 7)",
408 |       "      - 67",
409 |       "      - 68",
410 |       "      - 69",
411 |       "      - 70",
412 |       "      - 71",
413 |       "      - 72",
414 |       "      - 75",
415 |       "    - key 75",
416 |       "    - leaf (size 8)",
417 |       "      - 76",
418 |       "      - 77",
419 |       "      - 78",
420 |       "      - 79",
421 |       "      - 81",
422 |       "      - 82",
423 |       "      - 85",
424 |       "      - 86",
425 |       "db > ",
426 |     ])
427 |   end
428 | 
429 |   it 'prints constants' do
430 |     script = [
431 |       ".constants",
432 |       ".exit",
433 |     ]
434 |     result = run_script(script)
435 | 
436 |     expect(result).to match_array([
437 |       "db > Constants:",
438 |       "ROW_SIZE: 293",
439 |       "COMMON_NODE_HEADER_SIZE: 6",
440 |       "LEAF_NODE_HEADER_SIZE: 14",
441 |       "LEAF_NODE_CELL_SIZE: 297",
442 |       "LEAF_NODE_SPACE_FOR_CELLS: 4082",
443 |       "LEAF_NODE_MAX_CELLS: 13",
444 |       "db > ",
445 |     ])
446 |   end
447 | 
448 |   it 'prints all rows in a multi-level tree' do
449 |     script = []
450 |     (1..15).each do |i|
451 |       script << "insert #{i} user#{i} person#{i}@example.com"
452 |     end
453 |     script << "select"
454 |     script << ".exit"
455 |     result = run_script(script)
456 |     expect(result[15...result.length]).to match_array([
457 |       "db > (1, user1, person1@example.com)",
458 |       "(2, user2, person2@example.com)",
459 |       "(3, user3, person3@example.com)",
460 |       "(4, user4, person4@example.com)",
461 |       "(5, user5, person5@example.com)",
462 |       "(6, user6, person6@example.com)",
463 |       "(7, user7, person7@example.com)",
464 |       "(8, user8, person8@example.com)",
465 |       "(9, user9, person9@example.com)",
466 |       "(10, user10, person10@example.com)",
467 |       "(11, user11, person11@example.com)",
468 |       "(12, user12, person12@example.com)",
469 |       "(13, user13, person13@example.com)",
470 |       "(14, user14, person14@example.com)",
471 |       "(15, user15, person15@example.com)",
472 |       "Executed.", "db > ",
473 |     ])
474 |   end
475 | end
476 | 


--------------------------------------------------------------------------------