├── .gitignore ├── ACM-Reference-Format-Journals.bst ├── README.md ├── acmcopyright.sty ├── acmsmall.cls ├── apalike-refs.bst ├── appendix.tex ├── applications.tex ├── arxiv.tex ├── binary.tex ├── common.tex ├── conclusions.tex ├── constraints.tex ├── environment.tex ├── executors.tex ├── explosion.tex ├── hang.tex ├── images ├── blackbox.odg ├── blackbox.pdf ├── compiler.odg ├── compiler.odg.new ├── compiler.pdf ├── compiler.pdf.new ├── concolic-execution-2.odg ├── concolic-execution-2.pdf ├── concolic-execution.odg ├── concolic-execution.pdf ├── concolic-execution_old.odg ├── concolic-execution_old.pdf ├── concrete-abstract.eps ├── concrete-abstract.pdf ├── concrete-abstract.svg ├── concrete-execution.odg ├── concrete-execution.pdf ├── eager-evaluation.odg ├── eager-evaluation.pdf ├── example.odg ├── example.pdf ├── execution-tree-text.svg ├── execution-tree-text.tex ├── execution-tree.eps ├── execution-tree.pdf ├── execution-tree.svg ├── lazy-initialization-C.odg ├── lazy-initialization.odg ├── lazy-initialization.pdf ├── memory-fork.odg ├── memory-fork.pdf ├── memory-ite.odg ├── memory-ite.pdf ├── photo_tree.pdf ├── state-merging-2.odg ├── state-merging-2.pdf ├── state-merging.odg ├── state-merging.pdf ├── state-merging_old.png ├── whitebox.odg └── whitebox.pdf ├── intro.tex ├── main.tex ├── memory.tex ├── misc ├── glossary.tex ├── loops.tex └── sandbox.tex ├── overview.tex ├── submissions ├── fifth │ ├── appendix.pdf │ ├── main.pdf │ └── survey-with-appendix.pdf ├── first │ ├── cover_letter.docx │ ├── cover_letter.pdf │ ├── proof.pdf │ └── survey.pdf ├── fourth │ ├── ACM-CSUR-Revision.pdf │ ├── proof.pdf │ └── survey.pdf ├── second │ ├── proof.pdf │ ├── survey-similarities.pdf │ └── survey-symbolic-exec-v1.pdf └── third │ ├── proof.pdf │ └── survey.pdf ├── symbolic.bib └── tables.tex /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | *.toc 3 | /main.pdf 4 | /appendix.pdf 5 | *.out 6 | *.aux 7 | *.bbl 8 | 
*.blg 9 | *.fdb_latexmk 10 | *.fls 11 | *.synctex.gz 12 | .DS_Store 13 | Icon* 14 | *-eps-converted-to.pdf 15 | *-tree-eps-converted-to.pdf 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # README # 2 | 3 | This is a survey by the [SEASON lab](http://season-lab.github.io) on symbolic execution tools and techniques. 4 | 5 | If you are considering citing our work, we would be grateful if you could use the following BibTeX entry: 6 | ``` tex 7 | @article{SurveySymExec-CSUR18, 8 | author = {Baldoni, Roberto and Coppa, Emilio and D'Elia, Daniele Cono and Demetrescu, Camil and Finocchi, Irene}, 9 | title = {A Survey of Symbolic Execution Techniques}, 10 | journal = {ACM Comput. Surv.}, 11 | volume = {51}, 12 | number = {3}, 13 | articleno = {50}, 14 | publisher = {ACM}, 15 | address = {New York, NY, USA}, 16 | year = {2018} 17 | } 18 | ``` 19 | -------------------------------------------------------------------------------- /acmcopyright.sty: -------------------------------------------------------------------------------- 1 | %% 2 | %% This is file `acmcopyright.sty', 3 | %% generated with the docstrip utility. 4 | %% 5 | %% The original source files were: 6 | %% 7 | %% acmcopyright.dtx (with options: `style') 8 | %% 9 | %% IMPORTANT NOTICE: 10 | %% 11 | %% For the copyright see the source file. 12 | %% 13 | %% Any modified versions of this file must be renamed 14 | %% with new filenames distinct from acmcopyright.sty. 15 | %% 16 | %% For distribution of the original source see the terms 17 | %% for copying and modification in the file acmcopyright.dtx. 18 | %% 19 | %% This generated file may be distributed as long as the 20 | %% original source files, as listed above, are part of the 21 | %% same distribution. (The sources need not necessarily be 22 | %% in the same archive or directory.) 
23 | %% \CharacterTable 24 | %% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z 25 | %% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z 26 | %% Digits \0\1\2\3\4\5\6\7\8\9 27 | %% Exclamation \! Double quote \" Hash (number) \# 28 | %% Dollar \$ Percent \% Ampersand \& 29 | %% Acute accent \' Left paren \( Right paren \) 30 | %% Asterisk \* Plus \+ Comma \, 31 | %% Minus \- Point \. Solidus \/ 32 | %% Colon \: Semicolon \; Less than \< 33 | %% Equals \= Greater than \> Question mark \? 34 | %% Commercial at \@ Left bracket \[ Backslash \\ 35 | %% Right bracket \] Circumflex \^ Underscore \_ 36 | %% Grave accent \` Left brace \{ Vertical bar \| 37 | %% Right brace \} Tilde \~} 38 | \NeedsTeXFormat{LaTeX2e} 39 | \ProvidesPackage{acmcopyright} 40 | [2014/06/29 v1.2 Copyright statemens for ACM classes] 41 | \newif\if@printcopyright 42 | \@printcopyrighttrue 43 | \newif\if@printpermission 44 | \@printpermissiontrue 45 | \newif\if@acmowned 46 | \@acmownedtrue 47 | \RequirePackage{xkeyval} 48 | \define@choicekey*{ACM@}{acmcopyrightmode}[% 49 | \acm@copyrightinput\acm@copyrightmode]{none,acmcopyright,acmlicensed,% 50 | rightsretained,usgov,usgovmixed,cagov,cagovmixed,% 51 | licensedusgovmixed,licensedcagovmixed,othergov,licensedothergov}{% 52 | \@printpermissiontrue 53 | \@printcopyrighttrue 54 | \@acmownedtrue 55 | \ifnum\acm@copyrightmode=0\relax % none 56 | \@printpermissionfalse 57 | \@printcopyrightfalse 58 | \@acmownedfalse 59 | \fi 60 | \ifnum\acm@copyrightmode=2\relax % acmlicensed 61 | \@acmownedfalse 62 | \fi 63 | \ifnum\acm@copyrightmode=3\relax % rightsretained 64 | \@acmownedfalse 65 | \fi 66 | \ifnum\acm@copyrightmode=4\relax % usgov 67 | \@printpermissiontrue 68 | \@printcopyrightfalse 69 | \@acmownedfalse 70 | \fi 71 | \ifnum\acm@copyrightmode=6\relax % cagov 72 | \@acmownedfalse 73 | \fi 74 | \ifnum\acm@copyrightmode=8\relax % licensedusgovmixed 75 | \@acmownedfalse 76 | \fi 77 | \ifnum\acm@copyrightmode=9\relax % 
licensedcagovmixed 78 | \@acmownedfalse 79 | \fi 80 | \ifnum\acm@copyrightmode=10\relax % othergov 81 | \@acmownedtrue 82 | \fi 83 | \ifnum\acm@copyrightmode=11\relax % licensedothergov 84 | \@acmownedfalse 85 | \@printcopyrightfalse 86 | \fi} 87 | \def\setcopyright#1{\setkeys{ACM@}{acmcopyrightmode=#1}} 88 | \setcopyright{acmcopyright} 89 | \def\@copyrightowner{% 90 | \ifcase\acm@copyrightmode\relax % none 91 | \or % acmcopyright 92 | ACM. 93 | \or % acmlicensed 94 | Copyright held by the owner/author(s). Publication rights licensed to 95 | ACM. 96 | \or % rightsretained 97 | Copyright held by the owner/author(s). 98 | \or % usgov 99 | \or % usgovmixed 100 | ACM. 101 | \or % cagov 102 | Crown in Right of Canada. 103 | \or %cagovmixed 104 | ACM. 105 | \or %licensedusgovmixed 106 | Copyright held by the owner/author(s). Publication rights licensed to 107 | ACM. 108 | \or %licensedcagovmixed 109 | Copyright held by the owner/author(s). Publication rights licensed to 110 | ACM. 111 | \or % othergov 112 | ACM. 113 | \or % licensedothergov 114 | \fi} 115 | \def\@copyrightpermission{% 116 | \ifcase\acm@copyrightmode\relax % none 117 | \or % acmcopyright 118 | Permission to make digital or hard copies of all or part of this 119 | work for personal or classroom use is granted without fee provided 120 | that copies are not made or distributed for profit or commercial 121 | advantage and that copies bear this notice and the full citation on 122 | the first page. Copyrights for components of this work owned by 123 | others than ACM must be honored. Abstracting with credit is 124 | permitted. To copy otherwise, or republish, to post on servers or to 125 | redistribute to lists, requires prior specific permission 126 | and\hspace*{.5pt}/or a fee. Request permissions from 127 | permissions@acm.org. 
128 | \or % acmlicensed 129 | Permission to make digital or hard copies of all or part of this 130 | work for personal or classroom use is granted without fee provided 131 | that copies are not made or distributed for profit or commercial 132 | advantage and that copies bear this notice and the full citation on 133 | the first page. Copyrights for components of this work owned by 134 | others than the author(s) must be honored. Abstracting with credit 135 | is permitted. To copy otherwise, or republish, to post on servers 136 | or to redistribute to lists, requires prior specific permission 137 | and\hspace*{.5pt}/or a fee. Request permissions from 138 | permissions@acm.org. 139 | \or % rightsretained 140 | Permission to make digital or hard copies of part or all of this work 141 | for personal or classroom use is granted without fee provided that 142 | copies are not made or distributed for profit or commercial advantage 143 | and that copies bear this notice and the full citation on the first 144 | page. Copyrights for third-party components of this work must be 145 | honored. For all other uses, contact the 146 | owner\hspace*{.5pt}/author(s). 147 | \or % usgov 148 | This paper is authored by an employee(s) of the United States 149 | Government and is in the public domain. Non-exclusive copying or 150 | redistribution is allowed, provided that the article citation is 151 | given and the authors and agency are clearly identified as its 152 | source. 153 | \or % usgovmixed 154 | ACM acknowledges that this contribution was authored or co-authored 155 | by an employee, or contractor of the national government. As such, 156 | the Government retains a nonexclusive, royalty-free right to 157 | publish or reproduce this article, or to allow others to do so, for 158 | Government purposes only. Permission to make digital or hard copies 159 | for personal or classroom use is granted. Copies must bear this 160 | notice and the full citation on the first page. 
Copyrights for 161 | components of this work owned by others than ACM must be 162 | honored. To copy otherwise, distribute, republish, or post, 163 | requires prior specific permission and\hspace*{.5pt}/or a 164 | fee. Request permissions from permissions@acm.org. 165 | \or % cagov 166 | This article was authored by employees of the Government of Canada. 167 | As such, the Canadian government retains all interest in the 168 | copyright to this work and grants to ACM a nonexclusive, 169 | royalty-free right to publish or reproduce this article, or to allow 170 | others to do so, provided that clear attribution is given both to 171 | the authors and the Canadian government agency employing them. 172 | Permission to make digital or hard copies for personal or classroom 173 | use is granted. Copies must bear this notice and the full citation 174 | on the first page. Copyrights for components of this work owned by 175 | others than the Canadain Government must be honored. To copy 176 | otherwise, distribute, republish, or post, requires prior specific 177 | permission and\hspace*{.5pt}/or a fee. Request permissions from 178 | permissions@acm.org. 179 | \or % cagovmixed 180 | ACM acknowledges that this contribution was co-authored by an 181 | affiliate of the national government of Canada. As such, the Crown 182 | in Right of Canada retains an equal interest in the copyright. 183 | Reprints must include clear attribution to ACM and the author's 184 | government agency affiliation. Permission to make digital or hard 185 | copies for personal or classroom use is granted. Copies must bear 186 | this notice and the full citation on the first page. Copyrights for 187 | components of this work owned by others than ACM must be honored. 188 | To copy otherwise, distribute, republish, or post, requires prior 189 | specific permission and\hspace*{.5pt}/or a fee. Request permissions 190 | from permissions@acm.org. 
191 | \or % licensedusgovmixed 192 | Publication rights licensed to ACM. ACM acknowledges that this 193 | contribution was authored or co-authored by an employee, contractor 194 | or affiliate of the United States government. As such, the 195 | Government retains a nonexclusive, royalty-free right to publish or 196 | reproduce this article, or to allow others to do so, for Government 197 | purposes only. 198 | \or % licensedcagovmixed 199 | Publication rights licensed to ACM. ACM acknowledges that this 200 | contribution was authored or co-authored by an employee, contractor 201 | or affiliate of the national government of Canada. As such, the 202 | Government retains a nonexclusive, royalty-free right to publish or 203 | reproduce this article, or to allow others to do so, for Government 204 | purposes only. 205 | \or % othergov 206 | ACM acknowledges that this contribution was authored or co-authored 207 | by an employee, contractor or affiliate of a national government. As 208 | such, the Government retains a nonexclusive, royalty-free right to 209 | publish or reproduce this article, or to allow others to do so, for 210 | Government purposes only. 211 | \or % licensedothergov 212 | Publication rights licensed to ACM. ACM acknowledges that this 213 | contribution was authored or co-authored by an employee, contractor 214 | or affiliate of a national government. As such, the Government 215 | retains a nonexclusive, royalty-free right to publish or reproduce 216 | this article, or to allow others to do so, for Government purposes 217 | only. 218 | \fi} 219 | \endinput 220 | %% 221 | %% End of file `acmcopyright.sty'. 222 | -------------------------------------------------------------------------------- /apalike-refs.bst: -------------------------------------------------------------------------------- 1 | % BibTeX `apalike-refs' bibliography style which displays different IDs like DOI, ISBN, ISSN, but also the URL. 
2 | % It uses \href and \url, so be sure to use the package hyperref for it to work. 3 | % It is based on the `apalike-doi' of Jan Even Øie Nilsen: 4 | % http://web.nersc.no/~even/tex/apalike-doi.bst 5 | % 6 | % MODIFICATIONS: 7 | % - Add ISBN, ISSN, and URL functions (format.xxx) 8 | % - Centralise all the ID functions into format.refs 9 | % - Call format.refs in all types (book, article, etc.) 10 | % - Make a URL for DOI based on dx.doi.org 11 | % - Make a URL for ISBN based on openlibrary.org 12 | % - Manage multiple IDs for ISBN and ISSN (although the URL feature for ISSN was abandoned for lack of an open resource) 13 | % 14 | % Time-stamp: 15 | % 16 | % Was: 17 | % BibTeX `apalike-doi' bibliography style 18 | % an attempt to have apalike use doi and eventually eid: 19 | % MODIFICATIONS: 20 | % : 21 | % ENTRY : eid and doi put in 22 | % FUNCTION {format.eid} and 23 | % FUNCTION {format.doi} : put in before FUNCTION {format.title} 24 | % FUNCTION {article} : changes at end 25 | % 26 | % Time-stamp: 27 | % File: 28 | % 29 | % Was: 30 | % BibTeX `apalike' bibliography style (24-Jan-88 version) 31 | % Adapted from the `alpha' style, version 0.99a; for BibTeX version 0.99a. 32 | % Copyright (C) 1988, all rights reserved. 33 | % Copying of this file is allowed, provided that if you make any changes at all 34 | % you name it something other than `apalike.bst'. 35 | % This restriction helps ensure that all copies are identical. 36 | % Differences between this style and `alpha' are generally heralded by a `%'. 37 | % The file btxbst.doc has the documentation for alpha.bst. 38 | % 39 | % This style should be used with the `apalike' LaTeX style (apalike.sty). 40 | % \cite's come out like "(Jones, 1986)" in the text but there are no labels 41 | % in the bibliography, and something like "(1986)" comes out immediately 42 | % after the author. Author (and editor) names appear as last name, comma, 43 | % initials.
A `year' field is required for every entry, and so is either 44 | % an author (or in some cases, an editor) field or a key field. 45 | % 46 | % Editorial note: 47 | % Many journals require a style like `apalike', but I strongly, strongly, 48 | % strongly recommend that you not use it if you have a choice---use something 49 | % like `plain' instead. Mary-Claire van Leunen (A Handbook for Scholars, 50 | % Knopf, 1979) argues convincingly that a style like `plain' encourages better 51 | % writing than one like `apalike'. Furthermore the strongest arguments for 52 | % using an author-date style like `apalike'---that it's "the most practical" 53 | % (The Chicago Manual of Style, University of Chicago Press, thirteenth 54 | % edition, 1982, pages 400--401)---fall flat on their face with the new 55 | % computer-typesetting technology. For instance page 401 anachronistically 56 | % states "The chief disadvantage of [a style like `plain'] is that additions 57 | % or deletions cannot be made after the manuscript is typed without changing 58 | % numbers in both text references and list." LaTeX sidesteps the disadvantage. 59 | % 60 | % History: 61 | % 15-sep-86 (SK,OP) Original version, by Susan King and Oren Patashnik. 62 | % 10-nov-86 (OP) Truncated the sort.key$ string to the correct length 63 | % in bib.sort.order to eliminate error message. 64 | % 24-jan-88 (OP) Updated for BibTeX version 0.99a, from alpha.bst 0.99a; 65 | % apalike now sorts by author, then year, then title; 66 | % THIS `apalike' VERSION DOES NOT WORK WITH BIBTEX 0.98i. 
67 | 68 | ENTRY 69 | { address 70 | author 71 | booktitle 72 | chapter 73 | eid 74 | doi 75 | isbn 76 | issn 77 | url 78 | edition 79 | editor 80 | howpublished 81 | institution 82 | journal 83 | key 84 | % month not used in apalike 85 | note 86 | number 87 | organization 88 | pages 89 | publisher 90 | school 91 | series 92 | title 93 | type 94 | volume 95 | year 96 | } 97 | {} 98 | { label extra.label sort.label } 99 | 100 | INTEGERS { output.state before.all mid.sentence after.sentence after.block } 101 | 102 | FUNCTION {init.state.consts} 103 | { #0 'before.all := 104 | #1 'mid.sentence := 105 | #2 'after.sentence := 106 | #3 'after.block := 107 | } 108 | 109 | STRINGS { s t } 110 | 111 | FUNCTION {output.nonnull} 112 | { 's := 113 | output.state mid.sentence = 114 | { ", " * write$ } 115 | { output.state after.block = 116 | { add.period$ write$ 117 | newline$ 118 | "\newblock " write$ 119 | } 120 | { output.state before.all = 121 | 'write$ 122 | { add.period$ " " * write$ } 123 | if$ 124 | } 125 | if$ 126 | mid.sentence 'output.state := 127 | } 128 | if$ 129 | s 130 | } 131 | 132 | FUNCTION {output} 133 | { duplicate$ empty$ 134 | 'pop$ 135 | 'output.nonnull 136 | if$ 137 | } 138 | 139 | FUNCTION {output.check} 140 | { 't := 141 | duplicate$ empty$ 142 | { pop$ "empty " t * " in " * cite$ * warning$ } 143 | 'output.nonnull 144 | if$ 145 | } 146 | 147 | % apalike needs this function because 148 | % the year has special punctuation; 149 | % apalike ignores the month 150 | FUNCTION {output.year.check} 151 | { year empty$ 152 | { "empty year in " cite$ * warning$ } 153 | { write$ 154 | " (" year * extra.label * ")" * 155 | mid.sentence 'output.state := 156 | } 157 | if$ 158 | } 159 | 160 | FUNCTION {output.bibitem} 161 | { newline$ 162 | "\bibitem[" write$ 163 | label write$ 164 | "]{" write$ 165 | cite$ write$ 166 | "}" write$ 167 | newline$ 168 | "" 169 | before.all 'output.state := 170 | } 171 | 172 | FUNCTION {fin.entry} 173 | { add.period$ 174 | write$ 175 | 
newline$ 176 | } 177 | 178 | FUNCTION {new.block} 179 | { output.state before.all = 180 | 'skip$ 181 | { after.block 'output.state := } 182 | if$ 183 | } 184 | 185 | FUNCTION {new.sentence} 186 | { output.state after.block = 187 | 'skip$ 188 | { output.state before.all = 189 | 'skip$ 190 | { after.sentence 'output.state := } 191 | if$ 192 | } 193 | if$ 194 | } 195 | 196 | FUNCTION {not} 197 | { { #0 } 198 | { #1 } 199 | if$ 200 | } 201 | 202 | FUNCTION {and} 203 | { 'skip$ 204 | { pop$ #0 } 205 | if$ 206 | } 207 | 208 | FUNCTION {or} 209 | { { pop$ #1 } 210 | 'skip$ 211 | if$ 212 | } 213 | 214 | FUNCTION {new.block.checkb} 215 | { empty$ 216 | swap$ empty$ 217 | and 218 | 'skip$ 219 | 'new.block 220 | if$ 221 | } 222 | 223 | FUNCTION {field.or.null} 224 | { duplicate$ empty$ 225 | { pop$ "" } 226 | 'skip$ 227 | if$ 228 | } 229 | 230 | FUNCTION {emphasize} 231 | { duplicate$ empty$ 232 | { pop$ "" } 233 | { "{\em " swap$ * "}" * } 234 | if$ 235 | } 236 | 237 | INTEGERS { index length } 238 | 239 | STRINGS { fullString } 240 | 241 | FUNCTION {split.at.first.space} 242 | { 243 | duplicate$ 244 | text.length$ 245 | 'length := 246 | #1 247 | { 248 | 'index := 249 | duplicate$ 250 | index #1 substring$ 251 | " " = not index length #1 + < and 252 | } 253 | { 254 | index #1 + 255 | } 256 | while$ 257 | 'fullString := 258 | fullString #1 index #1 - substring$ 259 | fullString index #1 + fullString text.length$ index - substring$ 260 | } 261 | 262 | STRINGS { str1 str2 char } 263 | 264 | FUNCTION {escape.url.characters} 265 | { 266 | duplicate$ text.length$ 267 | 'length := 268 | "" 269 | { 270 | 'str1 := 271 | duplicate$ 272 | empty$ not 273 | } 274 | { 275 | 'str2 := 276 | str2 #1 #1 substring$ 277 | 'char := 278 | char "_" = 279 | { str1 "\" * char * } 280 | { str1 char * } 281 | if$ 282 | 'str1 := 283 | str2 #2 length #1 - substring$ 284 | str1 285 | } 286 | while$ 287 | pop$ 288 | str1 289 | } 290 | 291 | INTEGERS { val } 292 | 293 | FUNCTION {is.number.character} 294 | 
{ 295 | chr.to.int$ 296 | 'val := 297 | val #47 > 298 | val #58 < 299 | + #2 = 300 | } 301 | 302 | INTEGERS { nameptr namesleft numnames } 303 | 304 | FUNCTION {format.names} 305 | { 's := 306 | #1 'nameptr := 307 | s num.names$ 'numnames := 308 | numnames 'namesleft := 309 | { namesleft #0 > } 310 | { s nameptr "{vv~}{ll}{, jj}{, f.}" format.name$ 't := % last name first 311 | nameptr #1 > 312 | { namesleft #1 > 313 | { ", " * t * } 314 | { numnames #2 > 315 | { "," * } 316 | 'skip$ 317 | if$ 318 | t "others" = 319 | { " et~al." * } 320 | { " and " * t * } 321 | if$ 322 | } 323 | if$ 324 | } 325 | 't 326 | if$ 327 | nameptr #1 + 'nameptr := 328 | namesleft #1 - 'namesleft := 329 | } 330 | while$ 331 | } 332 | 333 | FUNCTION {format.authors} 334 | { author empty$ 335 | { "" } 336 | { author format.names } 337 | if$ 338 | } 339 | 340 | FUNCTION {format.key} % this function is just for apalike 341 | { empty$ 342 | { key field.or.null } 343 | { "" } 344 | if$ 345 | } 346 | 347 | FUNCTION {format.editors} 348 | { editor empty$ 349 | { "" } 350 | { editor format.names 351 | editor num.names$ #1 > 352 | { ", editors" * } 353 | { ", editor" * } 354 | if$ 355 | } 356 | if$ 357 | } 358 | 359 | FUNCTION {format.eid} 360 | { 361 | eid empty$ 362 | { "" } 363 | { "" eid * } 364 | if$ 365 | } 366 | 367 | FUNCTION {format.doi} 368 | { 369 | doi empty$ 370 | { "" } 371 | { "DOI: \href{http://dx.doi.org/" doi * "}{\tt " * doi escape.url.characters * "}" * } 372 | if$ 373 | } 374 | 375 | STRINGS { str rem } 376 | 377 | FUNCTION {format.isbn} 378 | { 379 | isbn empty$ 380 | { "" } 381 | { 382 | "ISBN:" 383 | isbn 384 | { 385 | duplicate$ empty$ not 386 | } 387 | { 388 | split.at.first.space 389 | 'rem := 390 | 'str := 391 | str #1 #1 substring$ 392 | is.number.character 393 | % DCD {" \href{https://openlibrary.org/search?isbn=" * str * "}{\tt " * str escape.url.characters * "}" *} 394 | {" {\tt " * str escape.url.characters * "}" *} 395 | {" " * str escape.url.characters *} 396 | 
if$ 397 | rem 398 | } 399 | while$ 400 | pop$ 401 | } 402 | if$ 403 | } 404 | 405 | FUNCTION {format.issn} 406 | { 407 | issn empty$ 408 | { "" } 409 | { 410 | "ISSN:" 411 | issn 412 | { 413 | duplicate$ empty$ not 414 | } 415 | { 416 | split.at.first.space 417 | 'rem := 418 | 'str := 419 | % If you find an open search engine for ISSN, please tell to the author. 420 | % To make it a URL, pay attention: there might have a coma, so purify before. 421 | " {\tt " * str escape.url.characters * "}" * 422 | rem 423 | } 424 | while$ 425 | pop$ 426 | } 427 | if$ 428 | } 429 | 430 | FUNCTION {format.url} 431 | { 432 | url empty$ 433 | { "" } 434 | { "\url{" url * "}" * } 435 | if$ 436 | } 437 | 438 | FUNCTION {format.refs} 439 | { 440 | format.eid output 441 | format.isbn output 442 | format.issn output 443 | format.doi output 444 | format.url output 445 | } 446 | 447 | FUNCTION {format.title} 448 | { title empty$ 449 | { "" } 450 | { title "t" change.case$ } 451 | if$ 452 | } 453 | 454 | FUNCTION {n.dashify} 455 | { 't := 456 | "" 457 | { t empty$ not } 458 | { t #1 #1 substring$ "-" = 459 | { t #1 #2 substring$ "--" = not 460 | { "--" * 461 | t #2 global.max$ substring$ 't := 462 | } 463 | { { t #1 #1 substring$ "-" = } 464 | { "-" * 465 | t #2 global.max$ substring$ 't := 466 | } 467 | while$ 468 | } 469 | if$ 470 | } 471 | { t #1 #1 substring$ * 472 | t #2 global.max$ substring$ 't := 473 | } 474 | if$ 475 | } 476 | while$ 477 | } 478 | 479 | FUNCTION {format.btitle} 480 | { title emphasize 481 | } 482 | 483 | FUNCTION {tie.or.space.connect} 484 | { duplicate$ text.length$ #3 < 485 | { "~" } 486 | { " " } 487 | if$ 488 | swap$ * * 489 | } 490 | 491 | FUNCTION {either.or.check} 492 | { empty$ 493 | 'pop$ 494 | { "can't use both " swap$ * " fields in " * cite$ * warning$ } 495 | if$ 496 | } 497 | 498 | FUNCTION {format.bvolume} 499 | { volume empty$ 500 | { "" } 501 | { "volume" volume tie.or.space.connect 502 | series empty$ 503 | 'skip$ 504 | { " of " * series emphasize 
* } 505 | if$ 506 | "volume and number" number either.or.check 507 | } 508 | if$ 509 | } 510 | 511 | FUNCTION {format.number.series} 512 | { volume empty$ 513 | { number empty$ 514 | { series field.or.null } 515 | { output.state mid.sentence = 516 | { "number" } 517 | { "Number" } 518 | if$ 519 | number tie.or.space.connect 520 | series empty$ 521 | { "there's a number but no series in " cite$ * warning$ } 522 | { " in " * series * } 523 | if$ 524 | } 525 | if$ 526 | } 527 | { "" } 528 | if$ 529 | } 530 | 531 | FUNCTION {format.edition} 532 | { edition empty$ 533 | { "" } 534 | { output.state mid.sentence = 535 | { edition "l" change.case$ " edition" * } 536 | { edition "t" change.case$ " edition" * } 537 | if$ 538 | } 539 | if$ 540 | } 541 | 542 | INTEGERS { multiresult } 543 | 544 | FUNCTION {multi.page.check} 545 | { 't := 546 | #0 'multiresult := 547 | { multiresult not 548 | t empty$ not 549 | and 550 | } 551 | { t #1 #1 substring$ 552 | duplicate$ "-" = 553 | swap$ duplicate$ "," = 554 | swap$ "+" = 555 | or or 556 | { #1 'multiresult := } 557 | { t #2 global.max$ substring$ 't := } 558 | if$ 559 | } 560 | while$ 561 | multiresult 562 | } 563 | 564 | FUNCTION {format.pages} 565 | { pages empty$ 566 | { "" } 567 | { pages multi.page.check 568 | { "pages" pages n.dashify tie.or.space.connect } 569 | { "page" pages tie.or.space.connect } 570 | if$ 571 | } 572 | if$ 573 | } 574 | 575 | FUNCTION {format.vol.num.pages} 576 | { volume field.or.null 577 | number empty$ 578 | 'skip$ 579 | { "(" number * ")" * * 580 | volume empty$ 581 | { "there's a number but no volume in " cite$ * warning$ } 582 | 'skip$ 583 | if$ 584 | } 585 | if$ 586 | pages empty$ 587 | 'skip$ 588 | { duplicate$ empty$ 589 | { pop$ format.pages } 590 | { ":" * pages n.dashify * } 591 | if$ 592 | } 593 | if$ 594 | } 595 | 596 | FUNCTION {format.chapter.pages} 597 | { chapter empty$ 598 | 'format.pages 599 | { type empty$ 600 | { "chapter" } 601 | { type "l" change.case$ } 602 | if$ 603 | chapter 
tie.or.space.connect 604 | pages empty$ 605 | 'skip$ 606 | { ", " * format.pages * } 607 | if$ 608 | } 609 | if$ 610 | } 611 | 612 | FUNCTION {format.in.ed.booktitle} 613 | { booktitle empty$ 614 | { "" } 615 | { editor empty$ 616 | { "In " booktitle emphasize * } 617 | { "In " format.editors * ", " * booktitle emphasize * } 618 | if$ 619 | } 620 | if$ 621 | } 622 | 623 | FUNCTION {format.thesis.type} 624 | { type empty$ 625 | 'skip$ 626 | { pop$ 627 | type "t" change.case$ 628 | } 629 | if$ 630 | } 631 | 632 | FUNCTION {format.tr.number} 633 | { type empty$ 634 | { "Technical Report" } 635 | 'type 636 | if$ 637 | number empty$ 638 | { "t" change.case$ } 639 | { number tie.or.space.connect } 640 | if$ 641 | } 642 | 643 | FUNCTION {format.article.crossref} 644 | { "In" % this is for apalike 645 | " \cite{" * crossref * "}" * 646 | } 647 | 648 | FUNCTION {format.book.crossref} 649 | { volume empty$ 650 | { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ 651 | "In " 652 | } 653 | { "Volume" volume tie.or.space.connect 654 | " of " * 655 | } 656 | if$ 657 | "\cite{" * crossref * "}" * % this is for apalike 658 | } 659 | 660 | FUNCTION {format.incoll.inproc.crossref} 661 | { "In" % this is for apalike 662 | " \cite{" * crossref * "}" * 663 | } 664 | 665 | FUNCTION {article} 666 | { output.bibitem 667 | format.authors "author" output.check 668 | author format.key output % special for 669 | output.year.check % apalike 670 | new.block 671 | format.title "title" output.check 672 | new.block 673 | crossref missing$ 674 | { journal emphasize "journal" output.check 675 | format.vol.num.pages output 676 | } 677 | { format.article.crossref output.nonnull 678 | format.pages output 679 | } 680 | if$ 681 | format.refs 682 | new.block 683 | note output 684 | fin.entry 685 | } 686 | 687 | FUNCTION {book} 688 | { output.bibitem 689 | author empty$ 690 | { format.editors "author and editor" output.check 691 | editor format.key output 692 | } 693 | { format.authors 
output.nonnull 694 | crossref missing$ 695 | { "author and editor" editor either.or.check } 696 | 'skip$ 697 | if$ 698 | } 699 | if$ 700 | output.year.check % special for apalike 701 | new.block 702 | format.btitle "title" output.check 703 | crossref missing$ 704 | { format.bvolume output 705 | new.block 706 | format.number.series output 707 | new.sentence 708 | publisher "publisher" output.check 709 | address output 710 | } 711 | { new.block 712 | format.book.crossref output.nonnull 713 | } 714 | if$ 715 | format.edition output 716 | format.refs 717 | new.block 718 | note output 719 | fin.entry 720 | } 721 | 722 | FUNCTION {booklet} 723 | { output.bibitem 724 | format.authors output 725 | author format.key output % special for 726 | output.year.check % apalike 727 | new.block 728 | format.title "title" output.check 729 | new.block 730 | howpublished output 731 | address output 732 | format.refs 733 | new.block 734 | note output 735 | fin.entry 736 | } 737 | 738 | FUNCTION {inbook} 739 | { output.bibitem 740 | author empty$ 741 | { format.editors "author and editor" output.check 742 | editor format.key output 743 | } 744 | { format.authors output.nonnull 745 | crossref missing$ 746 | { "author and editor" editor either.or.check } 747 | 'skip$ 748 | if$ 749 | } 750 | if$ 751 | output.year.check % special for apalike 752 | new.block 753 | format.btitle "title" output.check 754 | crossref missing$ 755 | { format.bvolume output 756 | format.chapter.pages "chapter and pages" output.check 757 | new.block 758 | format.number.series output 759 | new.sentence 760 | publisher "publisher" output.check 761 | address output 762 | } 763 | { format.chapter.pages "chapter and pages" output.check 764 | new.block 765 | format.book.crossref output.nonnull 766 | } 767 | if$ 768 | format.edition output 769 | format.refs 770 | new.block 771 | note output 772 | fin.entry 773 | } 774 | 775 | FUNCTION {incollection} 776 | { output.bibitem 777 | format.authors "author" output.check 778 | 
author format.key output % special for 779 | output.year.check % apalike 780 | new.block 781 | format.title "title" output.check 782 | new.block 783 | crossref missing$ 784 | { format.in.ed.booktitle "booktitle" output.check 785 | format.bvolume output 786 | format.number.series output 787 | format.chapter.pages output 788 | new.sentence 789 | publisher "publisher" output.check 790 | address output 791 | format.edition output 792 | } 793 | { format.incoll.inproc.crossref output.nonnull 794 | format.chapter.pages output 795 | } 796 | if$ 797 | format.refs 798 | new.block 799 | note output 800 | fin.entry 801 | } 802 | 803 | FUNCTION {inproceedings} 804 | { output.bibitem 805 | format.authors "author" output.check 806 | author format.key output % special for 807 | output.year.check % apalike 808 | new.block 809 | format.title "title" output.check 810 | new.block 811 | crossref missing$ 812 | { format.in.ed.booktitle "booktitle" output.check 813 | format.bvolume output 814 | format.number.series output 815 | format.pages output 816 | address output % for apalike 817 | new.sentence % there's no year 818 | organization output % here so things 819 | publisher output % are simpler 820 | } 821 | { format.incoll.inproc.crossref output.nonnull 822 | format.pages output 823 | } 824 | if$ 825 | format.refs 826 | new.block 827 | note output 828 | fin.entry 829 | } 830 | 831 | FUNCTION {conference} { inproceedings } 832 | 833 | FUNCTION {manual} 834 | { output.bibitem 835 | format.authors output 836 | author format.key output % special for 837 | output.year.check % apalike 838 | new.block 839 | format.btitle "title" output.check 840 | organization address new.block.checkb 841 | organization output 842 | address output 843 | format.edition output 844 | format.refs 845 | new.block 846 | note output 847 | fin.entry 848 | } 849 | 850 | FUNCTION {mastersthesis} 851 | { output.bibitem 852 | format.authors "author" output.check 853 | author format.key output % special for 854 | 
output.year.check % apalike 855 | new.block 856 | format.title "title" output.check 857 | new.block 858 | "Master's thesis" format.thesis.type output.nonnull 859 | school "school" output.check 860 | address output 861 | format.refs 862 | new.block 863 | note output 864 | fin.entry 865 | } 866 | 867 | FUNCTION {misc} 868 | { output.bibitem 869 | format.authors output 870 | author format.key output % special for 871 | output.year.check % apalike 872 | new.block 873 | format.title output 874 | new.block 875 | howpublished output 876 | format.refs 877 | new.block 878 | note output 879 | fin.entry 880 | } 881 | 882 | FUNCTION {phdthesis} 883 | { output.bibitem 884 | format.authors "author" output.check 885 | author format.key output % special for 886 | output.year.check % apalike 887 | new.block 888 | format.btitle "title" output.check 889 | new.block 890 | "PhD thesis" format.thesis.type output.nonnull 891 | school "school" output.check 892 | address output 893 | format.refs 894 | new.block 895 | note output 896 | fin.entry 897 | } 898 | 899 | FUNCTION {proceedings} 900 | { output.bibitem 901 | format.editors output 902 | editor format.key output % special for 903 | output.year.check % apalike 904 | new.block 905 | format.btitle "title" output.check 906 | format.bvolume output 907 | format.number.series output 908 | address output % for apalike 909 | new.sentence % we always output 910 | organization output % a nonempty organization 911 | publisher output % here 912 | format.refs 913 | new.block 914 | note output 915 | fin.entry 916 | } 917 | 918 | FUNCTION {techreport} 919 | { output.bibitem 920 | format.authors "author" output.check 921 | author format.key output % special for 922 | output.year.check % apalike 923 | new.block 924 | format.title "title" output.check 925 | new.block 926 | format.tr.number output.nonnull 927 | institution "institution" output.check 928 | address output 929 | format.refs 930 | new.block 931 | note output 932 | fin.entry 933 | } 934 | 935 
| FUNCTION {unpublished} 936 | { output.bibitem 937 | format.authors "author" output.check 938 | author format.key output % special for 939 | output.year.check % apalike 940 | new.block 941 | format.title "title" output.check 942 | format.refs 943 | new.block 944 | note "note" output.check 945 | fin.entry 946 | } 947 | 948 | FUNCTION {default.type} { misc } 949 | 950 | MACRO {jan} {"January"} 951 | 952 | MACRO {feb} {"February"} 953 | 954 | MACRO {mar} {"March"} 955 | 956 | MACRO {apr} {"April"} 957 | 958 | MACRO {may} {"May"} 959 | 960 | MACRO {jun} {"June"} 961 | 962 | MACRO {jul} {"July"} 963 | 964 | MACRO {aug} {"August"} 965 | 966 | MACRO {sep} {"September"} 967 | 968 | MACRO {oct} {"October"} 969 | 970 | MACRO {nov} {"November"} 971 | 972 | MACRO {dec} {"December"} 973 | 974 | MACRO {acmcs} {"ACM Computing Surveys"} 975 | 976 | MACRO {acta} {"Acta Informatica"} 977 | 978 | MACRO {cacm} {"Communications of the ACM"} 979 | 980 | MACRO {ibmjrd} {"IBM Journal of Research and Development"} 981 | 982 | MACRO {ibmsj} {"IBM Systems Journal"} 983 | 984 | MACRO {ieeese} {"IEEE Transactions on Software Engineering"} 985 | 986 | MACRO {ieeetc} {"IEEE Transactions on Computers"} 987 | 988 | MACRO {ieeetcad} 989 | {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"} 990 | 991 | MACRO {ipl} {"Information Processing Letters"} 992 | 993 | MACRO {jacm} {"Journal of the ACM"} 994 | 995 | MACRO {jcss} {"Journal of Computer and System Sciences"} 996 | 997 | MACRO {scp} {"Science of Computer Programming"} 998 | 999 | MACRO {sicomp} {"SIAM Journal on Computing"} 1000 | 1001 | MACRO {tocs} {"ACM Transactions on Computer Systems"} 1002 | 1003 | MACRO {tods} {"ACM Transactions on Database Systems"} 1004 | 1005 | MACRO {tog} {"ACM Transactions on Graphics"} 1006 | 1007 | MACRO {toms} {"ACM Transactions on Mathematical Software"} 1008 | 1009 | MACRO {toois} {"ACM Transactions on Office Information Systems"} 1010 | 1011 | MACRO {toplas} {"ACM Transactions on Programming 
Languages and Systems"} 1012 | 1013 | MACRO {tcs} {"Theoretical Computer Science"} 1014 | 1015 | READ 1016 | 1017 | FUNCTION {sortify} 1018 | { purify$ 1019 | "l" change.case$ 1020 | } 1021 | 1022 | INTEGERS { len } 1023 | 1024 | FUNCTION {chop.word} 1025 | { 's := 1026 | 'len := 1027 | s #1 len substring$ = 1028 | { s len #1 + global.max$ substring$ } 1029 | 's 1030 | if$ 1031 | } 1032 | 1033 | % There are three apalike cases: one person (Jones), 1034 | % two (Jones and de~Bruijn), and more (Jones et~al.). 1035 | % This function is much like format.crossref.editors. 1036 | % 1037 | FUNCTION {format.lab.names} 1038 | { 's := 1039 | s #1 "{vv~}{ll}" format.name$ 1040 | s num.names$ duplicate$ 1041 | #2 > 1042 | { pop$ " et~al." * } 1043 | { #2 < 1044 | 'skip$ 1045 | { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = 1046 | { " et~al." * } 1047 | { " and " * s #2 "{vv~}{ll}" format.name$ * } 1048 | if$ 1049 | } 1050 | if$ 1051 | } 1052 | if$ 1053 | } 1054 | 1055 | FUNCTION {author.key.label} 1056 | { author empty$ 1057 | { key empty$ 1058 | { cite$ #1 #3 substring$ } 1059 | 'key % apalike uses the whole key 1060 | if$ 1061 | } 1062 | { author format.lab.names } 1063 | if$ 1064 | } 1065 | 1066 | FUNCTION {author.editor.key.label} 1067 | { author empty$ 1068 | { editor empty$ 1069 | { key empty$ 1070 | { cite$ #1 #3 substring$ } 1071 | 'key % apalike uses the whole key 1072 | if$ 1073 | } 1074 | { editor format.lab.names } 1075 | if$ 1076 | } 1077 | { author format.lab.names } 1078 | if$ 1079 | } 1080 | 1081 | FUNCTION {editor.key.label} 1082 | { editor empty$ 1083 | { key empty$ 1084 | { cite$ #1 #3 substring$ } 1085 | 'key % apalike uses the whole key, no organization 1086 | if$ 1087 | } 1088 | { editor format.lab.names } 1089 | if$ 1090 | } 1091 | 1092 | FUNCTION {calc.label} 1093 | { type$ "book" = 1094 | type$ "inbook" = 1095 | or 1096 | 'author.editor.key.label 1097 | { type$ "proceedings" = 1098 | 'editor.key.label % apalike ignores organization 1099 | 
'author.key.label % for labeling and sorting 1100 | if$ 1101 | } 1102 | if$ 1103 | ", " % these three lines are 1104 | * % for apalike, which 1105 | year field.or.null purify$ #-1 #4 substring$ % uses all four digits 1106 | * 1107 | 'label := 1108 | } 1109 | 1110 | FUNCTION {sort.format.names} 1111 | { 's := 1112 | #1 'nameptr := 1113 | "" 1114 | s num.names$ 'numnames := 1115 | numnames 'namesleft := 1116 | { namesleft #0 > } 1117 | { nameptr #1 > 1118 | { " " * } 1119 | 'skip$ 1120 | if$ % apalike uses initials 1121 | s nameptr "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" format.name$ 't := % <= here 1122 | nameptr numnames = t "others" = and 1123 | { "et al" * } 1124 | { t sortify * } 1125 | if$ 1126 | nameptr #1 + 'nameptr := 1127 | namesleft #1 - 'namesleft := 1128 | } 1129 | while$ 1130 | } 1131 | 1132 | FUNCTION {sort.format.title} 1133 | { 't := 1134 | "A " #2 1135 | "An " #3 1136 | "The " #4 t chop.word 1137 | chop.word 1138 | chop.word 1139 | sortify 1140 | #1 global.max$ substring$ 1141 | } 1142 | 1143 | FUNCTION {author.sort} 1144 | { author empty$ 1145 | { key empty$ 1146 | { "to sort, need author or key in " cite$ * warning$ 1147 | "" 1148 | } 1149 | { key sortify } 1150 | if$ 1151 | } 1152 | { author sort.format.names } 1153 | if$ 1154 | } 1155 | 1156 | FUNCTION {author.editor.sort} 1157 | { author empty$ 1158 | { editor empty$ 1159 | { key empty$ 1160 | { "to sort, need author, editor, or key in " cite$ * warning$ 1161 | "" 1162 | } 1163 | { key sortify } 1164 | if$ 1165 | } 1166 | { editor sort.format.names } 1167 | if$ 1168 | } 1169 | { author sort.format.names } 1170 | if$ 1171 | } 1172 | 1173 | FUNCTION {editor.sort} 1174 | { editor empty$ 1175 | { key empty$ 1176 | { "to sort, need editor or key in " cite$ * warning$ 1177 | "" 1178 | } 1179 | { key sortify } 1180 | if$ 1181 | } 1182 | { editor sort.format.names } 1183 | if$ 1184 | } 1185 | 1186 | % apalike uses two sorting passes; the first one sets the 1187 | % labels so that the `a's, `b's, etc. 
can be computed; 1188 | % the second pass puts the references in "correct" order. 1189 | % The presort function is for the first pass. It computes 1190 | % label, sort.label, and title, and then concatenates. 1191 | FUNCTION {presort} 1192 | { calc.label 1193 | label sortify 1194 | " " 1195 | * 1196 | type$ "book" = 1197 | type$ "inbook" = 1198 | or 1199 | 'author.editor.sort 1200 | { type$ "proceedings" = 1201 | 'editor.sort 1202 | 'author.sort 1203 | if$ 1204 | } 1205 | if$ 1206 | #1 entry.max$ substring$ % for 1207 | 'sort.label := % apalike 1208 | sort.label % style 1209 | * 1210 | " " 1211 | * 1212 | title field.or.null 1213 | sort.format.title 1214 | * 1215 | #1 entry.max$ substring$ 1216 | 'sort.key$ := 1217 | } 1218 | 1219 | ITERATE {presort} 1220 | 1221 | SORT % by label, sort.label, title---for final label calculation 1222 | 1223 | STRINGS { last.label next.extra } % apalike labels are only for the text; 1224 | 1225 | INTEGERS { last.extra.num } % there are none in the bibliography 1226 | 1227 | FUNCTION {initialize.extra.label.stuff} % and hence there is no `longest.label' 1228 | { #0 int.to.chr$ 'last.label := 1229 | "" 'next.extra := 1230 | #0 'last.extra.num := 1231 | } 1232 | 1233 | FUNCTION {forward.pass} 1234 | { last.label label = 1235 | { last.extra.num #1 + 'last.extra.num := 1236 | last.extra.num int.to.chr$ 'extra.label := 1237 | } 1238 | { "a" chr.to.int$ 'last.extra.num := 1239 | "" 'extra.label := 1240 | label 'last.label := 1241 | } 1242 | if$ 1243 | } 1244 | 1245 | FUNCTION {reverse.pass} 1246 | { next.extra "b" = 1247 | { "a" 'extra.label := } 1248 | 'skip$ 1249 | if$ 1250 | label extra.label * 'label := 1251 | extra.label 'next.extra := 1252 | } 1253 | 1254 | EXECUTE {initialize.extra.label.stuff} 1255 | 1256 | ITERATE {forward.pass} 1257 | 1258 | REVERSE {reverse.pass} 1259 | 1260 | % Now that the label is right we sort for real, 1261 | % on sort.label then year then title. This is 1262 | % for the second sorting pass. 
1263 | FUNCTION {bib.sort.order} 1264 | { sort.label 1265 | " " 1266 | * 1267 | year field.or.null sortify 1268 | * 1269 | " " 1270 | * 1271 | title field.or.null 1272 | sort.format.title 1273 | * 1274 | #1 entry.max$ substring$ 1275 | 'sort.key$ := 1276 | } 1277 | 1278 | ITERATE {bib.sort.order} 1279 | 1280 | SORT % by sort.label, year, title---giving final bibliography order 1281 | 1282 | FUNCTION {begin.bib} 1283 | { preamble$ empty$ % no \etalchar in apalike 1284 | 'skip$ 1285 | { preamble$ write$ newline$ } 1286 | if$ 1287 | "\begin{thebibliography}{}" write$ newline$ % no labels in apalike 1288 | } 1289 | 1290 | EXECUTE {begin.bib} 1291 | 1292 | EXECUTE {init.state.consts} 1293 | 1294 | ITERATE {call.type$} 1295 | 1296 | FUNCTION {end.bib} 1297 | { newline$ 1298 | "\end{thebibliography}" write$ newline$ 1299 | } 1300 | 1301 | EXECUTE {end.bib} 1302 | -------------------------------------------------------------------------------- /appendix.tex: -------------------------------------------------------------------------------- 1 | % v2-acmsmall-sample.tex, dated March 6 2012 2 | % This is a sample file for ACM small trim journals 3 | % 4 | % Compilation using 'acmsmall.cls' - version 1.3 (March 2012), Aptara Inc. 5 | % (c) 2010 Association for Computing Machinery (ACM) 6 | % 7 | % Questions/Suggestions/Feedback should be addressed to => "acmtexsupport@aptaracorp.com". 8 | % Users can also go through the FAQs available on the journal's submission webpage. 
9 | % 10 | % Steps to compile: latex, bibtex, latex latex 11 | % 12 | % For tracking purposes => this is v1.3 - March 2012 13 | 14 | \documentclass[prodmode,acmcsur]{acmsmall} % Aptara syntax 15 | 16 | % Package to generate and customize Algorithm as per ACM style 17 | \usepackage[ruled]{algorithm2e} 18 | \renewcommand{\algorithmcfname}{ALGORITHM} 19 | \SetAlFnt{\small} 20 | \SetAlCapFnt{\small} 21 | \SetAlCapNameFnt{\small} 22 | \SetAlCapHSkip{0pt} 23 | \IncMargin{-\parindent} 24 | 25 | % Metadata Information 26 | \acmVolume{0} 27 | \acmNumber{0} 28 | \acmArticle{0} 29 | \acmYear{0000} 30 | \acmMonth{0} 31 | 32 | % Copyright 33 | %\setcopyright{acmcopyright} 34 | %\setcopyright{acmlicensed} 35 | %\setcopyright{rightsretained} 36 | %\setcopyright{usgov} 37 | %\setcopyright{usgovmixed} 38 | %\setcopyright{cagov} 39 | %\setcopyright{cagovmixed} 40 | 41 | \input{common} 42 | 43 | % DOI 44 | \doi{0000001.0000001} 45 | 46 | %ISSN 47 | \issn{1234-56789} 48 | 49 | % Document starts 50 | \begin{document} 51 | 52 | % Page heads 53 | \markboth{R. Baldoni, E. Coppa, D. C. D'Elia, C. Demetrescu, and I. Finocchi}{A Survey of Symbolic Execution Techniques} 54 | 55 | % Title portion 56 | \title{Online Appendix to:\\ A Survey of Symbolic Execution Techniques\\} 57 | \author{ROBERTO BALDONI 58 | \affil{\href{http://www.cis.uniroma1.it/}{Cyber Intelligence and Information Security Research Center}, Sapienza} 59 | EMILIO COPPA 60 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome} 61 | DANIELE CONO D'ELIA 62 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome} 63 | CAMIL DEMETRESCU 64 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome} 65 | IRENE FINOCCHI 66 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome} 67 | } 68 | % NOTE! Affiliations placed here should be for the institution where the 69 | % BULK of the research was done. 
If the author has gone to a new 70 | % institution, before publication, the (above) affiliation should NOT be changed. 71 | % The authors 'current' address may be given in the "Author's addresses:" block (below). 72 | % So for example, Mr. Abdelzaher, the bulk of the research was done at UIUC, and he is 73 | % currently affiliated with NASA. 74 | 75 | %\begin{abstract} 76 | %\end{abstract} 77 | 78 | \begin{comment} 79 | \begin{CCSXML} % http://dl.acm.org/ccs.cfm 80 | <ccs2012> 81 | <concept> 82 | <concept_id>10011007.10010940.10010992.10010998.10010999</concept_id> 83 | <concept_desc>Software and its engineering~Software verification</concept_desc> 84 | <concept_significance>500</concept_significance> 85 | </concept> 86 | <concept> 87 | <concept_id>10011007.10010940.10010992.10010998.10011001</concept_id> 88 | <concept_desc>Software and its engineering~Dynamic analysis</concept_desc> 89 | <concept_significance>300</concept_significance> 90 | </concept> 91 | <concept> 92 | <concept_id>10011007.10011074.10011099.10011102.10011103</concept_id> 93 | <concept_desc>Software and its engineering~Software testing and debugging</concept_desc> 94 | <concept_significance>300</concept_significance> 95 | </concept> 96 | <concept> 97 | <concept_id>10002978.10003022</concept_id> 98 | <concept_desc>Security and privacy~Software and application security</concept_desc> 99 | <concept_significance>100</concept_significance> 100 | </concept> 101 | </ccs2012> 102 | \end{CCSXML} 103 | 104 | \ccsdesc[500]{Software and its engineering~Software verification} 105 | \ccsdesc[300]{Software and its engineering~Dynamic analysis} 106 | \ccsdesc[300]{Software and its engineering~Software testing and debugging} 107 | \ccsdesc[100]{Security and privacy~Software and application security} 108 | \end{comment} 109 | 110 | % We no longer use \terms command 111 | %\terms{Design, Algorithms, Performance} 112 | 113 | %\keywords{Symbolic execution, static analysis, concolic execution, malware analysis} 114 | 115 | %\acmformat{Roberto Baldoni, Emilio Coppa, Daniele Cono D'Elia, Camil Demetrescu, and Irene Finocchi, 2016. A survey of symbolic execution techniques.} 116 | % At a minimum you need to supply the author names, year and a title. 117 | % IMPORTANT: 118 | % Full first names whenever they are known, surname last, followed by a period. 119 | % In the case of two authors, 'and' is placed between them. 
120 | % In the case of three or more authors, the serial comma is used, that is, all author names 121 | % except the last one but including the penultimate author's name are followed by a comma, 122 | % and then 'and' is placed before the final author's name. 123 | % If only first and middle initials are known, then each initial 124 | % is followed by a period and they are separated by a space. 125 | % The remaining information (journal title, volume, article number, date, etc.) is 'auto-generated'. 126 | 127 | 128 | %\begin{bottomstuff} 129 | \begin{comment} 130 | Author's addresses: R. Baldoni, E. Coppa, D.C. D'Elia, and C. Demetrescu, Department of Computer, Control, and Management Engineering, Sapienza University of Rome; I. Finocchi, Department of Computer Science, Sapienza University of Rome. 131 | This work is supported in part by a grant of the Italian Presidency of the Council of Ministers and by the CINI National Laboratory of Cyber Security. % (Consorzio Interuniversitario Nazionale Informatica) 132 | \end{comment} 133 | %\end{bottomstuff} 134 | 135 | \maketitle 136 | 137 | \renewcommand{\thesection}{\Alph{section}} 138 | %\renewcommand\thefigure{\thesection.\arabic{figure}} 139 | \setcounter{figure}{11} % we have 11 figures in the main article 140 | \setcounter{page}{38} 141 | \renewcommand\thepage{\arabic{page}} 142 | 143 | \input{tables} 144 | \input{binary} 145 | \input{applications} 146 | 147 | % Bibliography 148 | %\bibliographystyle{abstract} 149 | \bibliographystyle{ACM-Reference-Format-Journals} 150 | \bibliography{symbolic} 151 | 152 | % History dates 153 | %\received{--- 2016}{--- XXXX}{---- XXXX} 154 | 155 | \end{document} 156 | 157 | % End of v2-acmsmall-sample.tex (March 2012) - Gerry Murray, ACM 158 | 159 | 160 | -------------------------------------------------------------------------------- /applications.tex: -------------------------------------------------------------------------------- 1 | % !TEX root = appendix.tex 2 | 3 | 
\section{Applications of Symbolic Execution} 4 | \label{se:applications} 5 | 6 | \revedit{ 7 | \cite{CGK-ICSE11} observes how the recent explosion of research work in symbolic execution makes for an interesting story about the increasing impact of this program analysis since its introduction in the mid '70s. The availability of powerful off-the-shelf SMT solvers and hardware resources, along with advances in symbolic execution techniques to deal with the challenges identified in Section 1.2, facilitated the application of symbolic execution to increasingly large problem instances from many domains. 8 | 9 | %The last decade has witnessed an increasing adoption of symbolic execution techniques not only in the software testing domain, but also to address other compelling engineering problems such as automatic generation of exploits or authentication bypass. We now discuss prominent applications of symbolic execution techniques to these domains. Examples of extensions to other areas can be found, e.g., in~\cite{CGK-ICSE11}. 10 | 11 | In this section we do not aim at presenting a comprehensive overview of applications of symbolic execution. Our goal is instead to provide the reader with a selection of works that appeared in the last few years and that either incubated novel ideas that might be effective in other domains too (e.g., to deal with the path explosion problem), or significantly affected the state of the art of a specific field. 12 | 13 | The works we are about to discuss are drawn from four domains: software testing, program understanding, bug exploitation, and authentication bypass. Other fields that have seen uses of symbolic execution, such as automatic filter generation (e.g., \cite{BND-SP06,BOUNCER-SOSP07}) and code analysis (e.g., \cite{HMH-VSTTE12,BCP-USENIXSEC17}), are not covered here. 
Also, we do not address techniques tailored to programs with concurrent threads (e.g., \cite{BGC-OOPSLA14,GKW-ESEC15}) or floating-point arithmetic (e.g., \cite{RPW-SIGSOFT15,LSC-ASE17}).} 14 | 15 | %The last decade has witnessed an increasing adoption of symbolic execution techniques not only in the software testing domain, but also to address other compelling engineering problems such as automatic generation of exploits or authentication bypass. We now discuss \iffullver{three prominent}{prominent} applications of symbolic execution techniques to these domains. Examples of extensions to other areas can be found, e.g., in~\cite{CGK-ICSE11}. 16 | 17 | \subsection{Software Testing}%\mynote{Rendere piu' di ampio respiro il titolo di questa sezione? Keyword: software testing, program understanding} 18 | \label{ss:bug-detection} 19 | 20 | Software testing strategies typically attempt to execute a program with the intent of finding bugs. As manual test input generation is an error-prone and usually non-exhaustive process, automated testing techniques have drawn a lot of attention over the years. Random testing techniques such as fuzzing are cheap in terms of run-time overhead, but fail to obtain a wide exploration of a program state space. Symbolic and concolic execution techniques on the other hand achieve a more exhaustive exploration, but they become expensive as the length of the execution grows: for this reason, they usually reveal shallow bugs only. 21 | 22 | \cite{RK-ICSE07} proposes {\em hybrid concolic testing} for test input generation, which combines random search and concolic execution to achieve both deep program states and wide exploration. The two techniques are interleaved: in particular, when random testing saturates (i.e., it is unable to hit new code coverage points after a number of steps), concolic execution is used to mutate the current program state by performing a bounded depth-first search for an uncovered coverage point. 
For a fixed time budget, the technique outperforms both random and concolic testing in terms of branch coverage. The intuition behind this approach is that many programs show behaviors where a state can be easily reached through random testing, but then a precise sequence of events -- identifiable by a symbolic engine -- is required to hit a specific coverage point. 23 | 24 | % which uses preconstraining on the program states to ensure consistency 25 | % fuzzy \revedit 26 | \cite{DRILLER-NDSS16} refines this idea by devising Driller, a vulnerability excavation tool based on {\sc Angr}~\cite{ANGR-SSP16} that interleaves fuzzing and concolic execution to discover memory corruption vulnerabilities. The authors remark that user inputs can be categorized as {\em general} input, which has a wide range of valid values, and {\em specific} input; a check for particular values of a specific input splits an application into {\em compartments}. Driller offloads the majority of unique path discovery to a fuzzing engine, and relies on concolic execution to move across compartments. During the fuzzing phase, Driller marks a number of inputs as interesting (for instance, when an input was the first to trigger some state transition) and once it gets stuck in the exploration, it passes the set of such inputs to a concolic engine, which preconstrains the program states to ensure consistency with the results of the native execution. On the dataset used for the DARPA Cyber Grand Challenge qualifying event, Driller could identify crashing inputs in 77 applications, including both the 68 and 16 applications for which fuzzing and symbolic execution alone succeeded, respectively. For 6 applications, Driller was the only one to detect a vulnerability. 27 | 28 | % temporarily placed here 29 | % \cite{QRL-TOSEM12} \revedit 30 | \smallskip 31 | Maintenance of large and complex applications is a very hard task. 
Fixing bugs can sometimes introduce new and unexpected issues in the software, which in turn may require several hours or even weeks to be detected and properly addressed by the developers. \cite{QRL-TOSEM12} tackles the problem of identifying the root cause of failures during regression testing. Given a program $P$ and a newer revision of the program $P'$, if a testing input $t$ generates a failure in $P'$ but not in $P$, then symbolic execution is used to track the path constraints $\pi$ and $\pi'$ when executing $P$ and $P'$ on the failing input $t$, respectively. Using an SMT solver, a new input $t'$ is generated by solving the formula $\pi ~\wedge \neg\pi'$. If $t'$ exists (i.e., the formula is satisfiable), then $P'$ has one or more {\em deviations} in the control flow graph with respect to $P$ that can be the root cause of the failure. By carefully tracking branch conditions during symbolic execution, \cite{QRL-TOSEM12} are also able to pinpoint which branches are responsible for these deviations. If $\pi \wedge \neg\pi'$ is unsatisfiable, the symmetric formula $\neg\pi \wedge \pi'$ is evaluated and analogous actions are taken to detect possible branch conditions that may have led to the failure. If also $\neg\pi \wedge \pi'$ is unsatisfiable, the root cause of the problem cannot be determined. 32 | %\revedit{the technique} cannot determine the root cause of the problem. 33 | 34 | % over, to check \revedit 35 | Another interesting work that targets the problem of software regressions through the use of symbolic execution is~\cite{BOR-ICSE13}. The work introduces an approach called {\em partition-based regression verification} that combines the advantages of both regression verification (RV) and regression testing (RT). Indeed, RV is a very powerful technique for identifying regressions but hardly scales to large programs due to the difficulty in proving behavioral equivalence between the original and the modified program. 
On the other hand, RT allows for checking a modified program for regressions by testing selected concrete sample inputs, making it more scalable but providing limited verification guarantees. The main intuition behind partition-based regression verification is the identification of {\em differential partitions}. Each differential partition can be seen as a subset of the input space for which the two program versions -- given the same path constraints -- either expose the same output ({\em equivalence-revealing partition}) or produce different results ({\em difference-revealing partition}). For each partition, a test case is generated and added to the regression test suite, which can later be used by a developer for classical RT. Since differential partitions are derived by exploiting symbolic execution, this approach suffers from the common limitations that come with this technique. However, if the exploration is interrupted (e.g., due to excessive time or memory usage), partition-based regression verification can still provide guarantees over the subset of input space that has been covered so far by the detected partitions. 36 | 37 | \revedit{ 38 | Directed incremental symbolic execution (DiSE) is usually used for regression testing. As pointed out in the main article, its strength lies in applying static analyses in synergy with symbolic execution, directing the exploration to the sole code portions affected by changes. \cite{BPR-SPIN13} uses DiSE to generate summaries of behaviors affected by differences, and proves behavioral equivalence of two program versions by comparing the affected behaviors only. Their approach is sound and complete for sequential programs under a given depth bound for the symbolic exploration.} 39 | 40 | \smallskip 41 | Static data flow analysis tools can significantly help developers track malicious data leaks in software applications. 
Unfortunately, they often report several alleged bugs that only after a manual inspection can be regarded as false positives. To mitigate this issue,~\cite{ARH-SOAP15} proposes TASMAN, a system that, after performing data-flow analysis to track information leaks, uses symbolic backward execution to test each reported bug. Starting from a leaking statement, TASMAN explores the code backwards, pruning any path that can be proved infeasible. If all the paths starting at the leaking statement are discarded by TASMAN, the bug is deemed a false positive. 42 | 43 | % . Intuitively, a usage profile can be seen as the distribution over the input space. 44 | % other -> several \revedit 45 | \subsection{Program Understanding} 46 | While symbolic execution is largely employed in testing activities, over the last few years several works (e.g., \cite{GDV-ISSTA12,FPV-ICSE13,CLL-ICSE16}) have shown how it can be valuable also for program understanding activities. 47 | 48 | \cite{GDV-ISSTA12} introduces {\em probabilistic symbolic execution}, an approach that makes it possible to compute the probability of executing different code portions of a program. This is achieved by exploiting model counting techniques, such as the {\tt LattE}~\cite{LHT-JSC04} toolset, to determine the number of solutions for the different path constraints given by the alternative execution paths of a program. 49 | 50 | The work by~\cite{FPV-ICSE13} takes a step further by using probabilistic symbolic execution to perform software reliability analysis. Reliability is computed as the probability of executing paths that have been labeled as successful given a usage profile, which represents the input space of all the successfully accomplished external interactions (with the user and with external resources) of the program. Since in general the termination of symbolic execution cannot be guaranteed in the presence of loops, the proposed technique resorts to bounded exploration. 
Nonetheless, the authors define a metric for evaluating the confidence of their reliability estimation, allowing a developer to increase the bounds in order to improve the confidence value. 51 | 52 | Of a different flavor is the work by~\cite{CLL-ICSE16}, which uses probabilistic symbolic execution to conduct performance analysis. Based on usage profiles and on path execution probabilities, paths are classified into two types: {\em low-probability} and {\em high-probability}. Initially, high-probability paths are explored in a way that maximizes path diversity, generating a first set of test inputs. In a second phase, low-probability paths are analyzed using symbolic execution, generating a second set of test inputs that should expose executions characterized by the best and by the worst execution times. Finally, the program is executed using the test inputs generated during the two phases, and its running time is measured to generate performance distributions. 53 | 54 | Another interesting application of symbolic execution to program understanding is presented in~\cite{PPM-CSF18}. The technique exploits model counting and symbolic execution for computing quantitative bounds on the amount of information that can be leaked by a program through side-channel attacks. 55 | 56 | %As it is based on {\sc Angr}, Driller adopts an index-based memory model as in Section~\ref{ss:index-based-memory} where reads can be symbolic and writes are always concretized. % read/write addresses 57 | 58 | \subsection{Bug Exploitation} 59 | \label{ss:bug-exploitation} 60 | Bugs are a consequence of the nature of human factors in software development and are everywhere. Those that can be exploited by an attacker should normally be fixed first: systems for automatically and effectively identifying them are thus very valuable. 
61 | 62 | {\sc AEG}~\cite{AEG-NDSS11} employs preconditioned symbolic execution to analyze a potentially buggy program in source form and look for bugs amenable to stack smashing or return-into-libc exploits~\cite{PB-SSP04}, which are popular control hijack attack techniques. The tool augments path constraints with exploitability constraints and queries a constraint solver, generating a concrete exploit when the constraints are satisfiable. The authors devise the {\em buggy-path-first} and {\em loop-exhaustion} strategies (Table~\ref{tab:heuristics}) to prioritize paths in the search. On a suite of 14 Linux applications, {\sc AEG} discovered 16 vulnerabilities, 2 of which were previously unknown, and constructed control hijack exploits for them. 63 | 64 | {\sc Mayhem}~\cite{MAYHEM-SP12} takes another step forward by presenting the first system for binary programs that is able to identify end-to-end exploitable bugs. It adopts a hybrid execution model based on checkpoints and two components: a concrete executor that injects taint-analysis instrumentation in the code and a symbolic executor that takes over when a tainted branch or jump instruction is met. Exploitability constraints for symbolic instruction pointers and format strings are generated, targeting a wide range of exploits, e.g., SEH-based and jump-to-register ones. Three path selection heuristics help prioritize paths that are most likely to contain vulnerabilities (e.g., those containing symbolic memory accesses or instruction pointers). A virtualization layer intercepts and emulates all the system calls to the host OS, while preconditioned symbolic execution can be used to reduce the size of the search space. Also, restricting symbolic execution to tainted basic blocks only gives very good speedups in this setting, as in the reported experiments more than $95\%$ of the processed instructions were not tainted. 
{\sc Mayhem} was able to find exploitable vulnerabilities in the 29 Linux and Windows applications considered in the evaluation, 2 of which were previously undocumented. Although the goal in {\sc Mayhem} is to reveal exploitable bugs, the generated simple exploits can likely be transformed in an automated fashion to work in the presence of classical OS defenses such as data execution prevention and address space layout randomization~\cite{Q-SEC11}. 65 | 66 | \vspace{-1mm} % TODO 67 | \subsection{Authentication Bypass} 68 | \label{ss:auth-bypass} 69 | Software backdoors are a method of bypassing authentication in an algorithm, a software product, or even in a full computer system. Although sometimes these software flaws are injected by external attackers using subtle tricks such as compiler tampering~\cite{KRS-TR74}, there are reported cases of backdoors that have been surreptitiously installed by the hardware and/or software manufacturers~\cite{CZF-USEC14}, or even by governments~\cite{NSA-BACKDOOR}. 70 | 71 | Different works (e.g., \cite{DMR-USEC13,ZBF-NDSS14,FIRMALICE-NDSS15}) have exploited symbolic execution for analyzing the behavior of binary firmwares. Indeed, an advantage of this technique is that it can be used even in environments, such as embedded systems, where the documentation and the source code that are publicly released by the manufacturer are typically very limited or none at all. For instance,~\cite{FIRMALICE-NDSS15} proposes Firmalice, a binary analysis framework based on {\sc Angr}~\cite{ANGR-SSP16} that can be effectively used for identifying authentication bypass flaws inside firmwares running on devices such as routers and printers. Given a user-provided description of a privileged operation in the device, Firmalice identifies a set of program points that, if executed, forces the privileged operation to be performed. The program slice that involves the privileged program points is then symbolically analyzed using {\sc Angr}. 
If any such point can be reached by the engine, a set of concrete inputs is generated using an SMT solver. These values can then be used to effectively bypass authentication inside the device. On three commercially available devices, Firmalice could detect vulnerabilities in two of them, and determine that a backdoor in the third firmware is not remotely exploitable. -------------------------------------------------------------------------------- /arxiv.tex: -------------------------------------------------------------------------------- 1 | % v2-acmsmall-sample.tex, dated March 6 2012 2 | % This is a sample file for ACM small trim journals 3 | % 4 | % Compilation using 'acmsmall.cls' - version 1.3 (March 2012), Aptara Inc. 5 | % (c) 2010 Association for Computing Machinery (ACM) 6 | % 7 | % Questions/Suggestions/Feedback should be addressed to => "acmtexsupport@aptaracorp.com". 8 | % Users can also go through the FAQs available on the journal's submission webpage. 9 | % 10 | % Steps to compile: latex, bibtex, latex latex 11 | % 12 | % For tracking purposes => this is v1.3 - March 2012 13 | 14 | \documentclass[10pt,a4paper]{article} % Aptara syntax 15 | 16 | \usepackage{authblk} 17 | \usepackage{breakcites} 18 | 19 | % Package to generate and customize Algorithm as per ACM style 20 | \usepackage[ruled]{algorithm2e} 21 | \renewcommand{\algorithmcfname}{ALGORITHM} 22 | \SetAlFnt{\small} 23 | \SetAlCapFnt{\small} 24 | \SetAlCapNameFnt{\small} 25 | \SetAlCapHSkip{0pt} 26 | \IncMargin{-\parindent} 27 | 28 | \newcommand{\fullver}{} 29 | \newcommand{\arxivver}{} 30 | \input{common} 31 | 32 | % Document starts 33 | \begin{document} 34 | 35 | % Page heads 36 | \markboth{R. Baldoni, E. Coppa, D. C. D'Elia, C. Demetrescu, and I. 
Finocchi}{A Survey of Symbolic Execution Techniques} 37 | 38 | % Title portion 39 | \title{\fontsize{22}{12}\selectfont{A Survey of Symbolic Execution Techniques}} 40 | \author[1]{Roberto Baldoni} 41 | \author[2]{Emilio Coppa} 42 | \author[2]{Daniele Cono D'Elia} 43 | \author[2]{\authorcr Camil Demetrescu} 44 | \author[2]{Irene Finocchi} 45 | \affil[1]{\small\href{http://www.cis.uniroma1.it/}{Cyber Intelligence and Information Security Research Center}, Sapienza University of Rome} 46 | \affil[2]{\href{https://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome} 47 | \affil[ ]{{\vskip 1pt}\textit {\{baldoni,coppa,delia,demetres\}@dis.uniroma1.it\\ finocchi@di.uniroma1.it}} 48 | 49 | \date{\vspace{-4mm}} 50 | 51 | \maketitle 52 | 53 | \begin{abstract} 54 | Many security and software testing applications require checking whether certain properties of a program hold for any possible usage scenario. For instance, a tool for identifying software vulnerabilities may need to rule out the existence of any backdoor to bypass a program's authentication. One approach would be to test the program using different, possibly random inputs. As the backdoor may only be hit for very specific program workloads, automated exploration of the space of possible inputs is of the essence. Symbolic execution provides an elegant solution to the problem, by systematically exploring many possible execution paths at the same time without necessarily requiring concrete inputs. Rather than taking on fully specified input values, the technique abstractly represents them as symbols, resorting to constraint solvers to construct actual instances that would cause property violations. Symbolic execution has been incubated in dozens of tools developed over the last four decades, leading to major practical breakthroughs in a number of prominent software reliability applications. 
The goal of this survey is to provide an overview of the main ideas, challenges, and solutions developed in the area, distilling them for a broad audience. 55 | \end{abstract} 56 | 57 | % We no longer use \terms command 58 | %\terms{Design, Algorithms, Performance} 59 | 60 | %\keywords{Symbolic execution, static analysis, concolic execution, malware analysis} 61 | 62 | %\acmformat{Roberto Baldoni, Emilio Coppa, Daniele Cono D'Elia, Camil Demetrescu, and Irene Finocchi, 2016. A survey of symbolic execution techniques.} 63 | 64 | \iffalse 65 | \begin{bottomstuff} 66 | Author's addresses: R. Baldoni, E. Coppa, D.C. D'Elia, and C. Demetrescu, Department of Computer, Control, and Management Engineering, Sapienza University of Rome; I. Finocchi, Department of Computer Science, Sapienza University of Rome. 67 | \end{bottomstuff} 68 | \fi 69 | 70 | 71 | 72 | % \input{intro} 73 | % \myinput{executors} 74 | % \myinput{memory} 75 | % \myinput{environment} 76 | % \myinput{loops} 77 | % \myinput{explosion} 78 | % \myinput{constraints} 79 | % \myinput{binary} 80 | % \input{applications} 81 | % \input{conclusions} 82 | % \input{glossary} 83 | 84 | \input{intro} 85 | \myinput{executors} 86 | \myinput{memory} 87 | \myinput{environment} 88 | \myinput{explosion} 89 | \myinput{constraints} 90 | %\myinput{binary} 91 | %\input{applications} 92 | \input{hang} 93 | \input{conclusions} 94 | 95 | \myparagraph{Acknowledgements} 96 | %This work is partially supported by a grant of the Italian Presidency of Ministry Council and by the CINI (Consorzio Interuniversitario Nazionale Informatica) Cybersecurity National Laboratory. 97 | This work is supported in part by a grant of the Italian Presidency of the Council of Ministers and by the CINI (Consorzio Interuniversitario Nazionale Informatica) National Laboratory of Cyber Security. 
98 | 99 | 100 | \input{glossary} 101 | 102 | \appendix 103 | \input{tables} 104 | \myinput{binary} 105 | \input{applications} 106 | 107 | 108 | % Bibliography 109 | %\bibliographystyle{abstract} 110 | \bibliographystyle{apalike-refs} 111 | \bibliography{symbolic} 112 | 113 | % History dates 114 | %\received{--- 2016}{--- XXXX}{---- XXXX} 115 | 116 | \end{document} 117 | 118 | % End of v2-acmsmall-sample.tex (March 2012) - Gerry Murray, ACM 119 | 120 | 121 | -------------------------------------------------------------------------------- /binary.tex: -------------------------------------------------------------------------------- 1 | % !TEX root = appendix.tex 2 | 3 | \section{Symbolic execution of binary code} 4 | \label{se:symbolic-binary} 5 | 6 | The importance of performing symbolic analysis of program properties on binary code is on the rise for a number of reasons. Binary code analysis is attractive as it reasons on code that will actually execute: not requiring the source code significantly extends the applicability of such techniques (to, e.g., common off-the-shelf proprietary programs, firmwares for embedded systems, and malicious software), and it gives the ground truth important for security applications whereas source code analysis may yield misleading results due to compiler optimizations~\cite{BITBLAZE-ICISS08}. % compiler errors/defects too 7 | Binary analysis is relevant also for programs written in dynamic languages and executed in runtimes that deeply transform and optimize the code through just-in-time compilation. 8 | 9 | %Also, the recent advances in runtimes for programs written in dynamic languages brought just-in-time compilation to the masses, taking over on interpreters used when no efficient source-to-binary translation of code was statically possible. 
10 | 11 | 12 | % [D] In this paragraph perhaps it is not worth mentioning obfuscation, packing and encryption 13 | %Analyzing binary code is commonly seen as a challenging task 14 | \revedit{Working on binary code is often a challenging task for many program analyses} due to its complexity and lack of a high-level semantics. Modern architectures offer complex instruction sets: modeling each instruction can be difficult, especially in the presence of multiple side effects on processor flags to determine branch conditions. The second major challenge comes from the high-level semantics of the source code being lost in the lowering process (Figure~\ref{fig:lowering}), especially when debugging information is absent. Types are not explicitly encoded in binary code: even with register types, it is common to read values assuming a different type (e.g., 8-bit integer) from what was used to store them (e.g., 16-bit integer). Similar considerations can be made for array bounds as well. Also, control flow graph information is not explicitly available, as control flow is performed through jump instructions at both inter- and intra-procedural level. The function abstraction at the binary level does not exist as we intend it at source-code level: functions can be separated in non-contiguous pieces, and code may also call in the middle of a code block generated for a source-level function. 15 | 16 | In the remainder of this section we provide an overview of how symbolic executors can address some of the most significant challenges in the analysis of binary code. 17 | 18 | \subsection{Lifting to an Intermediate Representation} 19 | Motivated by the complexity in modeling native instructions and by the variety of architectures on which applications can be deployed (e.g., x86, x86-64, ARM, MIPS), symbolic executors for binary code typically rely on a {\em lifter} that transforms native instructions into an {\em intermediate representation} (IR), also known as {\em bytecode}. 
Modern compilers such as \iffullver{LLVM~\cite{LLVM-CGO04}}{LLVM} typically generate IR by {\em lowering} the user-provided source code during the first step of compilation, optimize it, and eventually lower it to native code for a specific platform. Source-code symbolic executors can resort to compiler-assisted lowering to reason on bytecode rather than source-language statements: for instance, {\sc KLEE}~\cite{KLEE-OSDI08} reasons on the IR generated by the LLVM compiler for static languages such as C and C++. Figure~\ref{fig:lowering} summarizes the relationships between source code, IR, and binary code. % \mynote{[D] Java?} 20 | 21 | % encoded as architecture-agnostic 22 | % for expressing 23 | Reasoning at the intermediate representation level allows for encoding program analyses in an architecture-agnostic fashion. Translated instructions will always expose all the side-effects of a native instruction, and support for additional platforms can be added over time. A number of symbolic executors use VEX, the IR of the Valgrind dynamic instrumentation framework~\cite{VALGRIND-PLDI07}. VEX is a RISC-like language designed for program analysis that offers a compact set of instructions to express programs in static single assignment form~\cite{SSA-TOPLAS91}. Lifters are available for both 32-bit and 64-bit ARM, MIPS, PPC, and x86 binaries. 24 | 25 | \begin{figure}[t!] 26 | \centering 27 | \includegraphics[width=.67\columnwidth]{images/compiler} % TODO was 0.7 28 | \vspace{-2mm} 29 | \caption{\label{fig:lowering} Lowering and lifting processes in native vs. source code processing.} 30 | \vspace{-1mm} % TODO 31 | \end{figure} 32 | 33 | %{\sc Angr}~\cite{ANGR-SSP16} performs analysis directly on the VEX IR 34 | %translating it to a custom language allowed them to simplify the development of their analysis framework 35 | {\sc Angr}~\cite{ANGR-SSP16} performs analysis directly on VEX IR. 
Authors chose VEX over other IR formats as at that time it was the only choice that offered a publicly available implementation with support for many architectures. Also, they mention that writing a binary lifter can be a daunting task, and a well-documented and program analysis-oriented solution can be a bonus. {\sc BitBlaze}~\cite{BITBLAZE-ICISS08} uses VEX too, although it translates it to a custom intermediate language. The reason for this is that VEX captures the side effects of some instructions only implicitly, such as the {\tt EFLAGS} bits set by instructions of the x86 ISA: translating it to a custom language simplified the development of {\sc BitBlaze}'s analysis framework. 36 | 37 | % guest operating systems 38 | The authors of {\sc \stwoe}~\cite{CKC-TOCS12} have implemented an x86-to-LLVM-IR lifter in order to use the {\sc KLEE}~\cite{KLEE-OSDI08} symbolic execution engine for whole-system symbolic analysis of binary code in a virtualized environment. The translation is transparent to both the guest operating system and KLEE, thus enabling the analysis of binaries using the full power of {\sc KLEE}. Another x86-to-LLVM-IR lifter that can be used to run {\sc KLEE} on binary code is {\tt mcsema}\footnote{\url{https://github.com/trailofbits/mcsema}.}. 39 | 40 | \subsection{Reconstructing the Control Flow Graph} 41 | 42 | A control flow graph (CFG) can provide valuable information for a symbolic executor as it captures the set of potential control flow transfers for all feasible execution paths. A fundamental issue that arises when reconstructing CFGs for binaries is that the possible targets of an indirect jump may not be identified correctly. Direct jumps are straightforward to process: as they encode their targets explicitly in the code, successor basic blocks can be identified and visited until no new edge is found. 
The target of an indirect jump is determined instead at run time: it might be computed by carrying out a calculation (e.g., a jump table) or depend on the current calling context (e.g., a function pointer is passed as argument, or a virtual C++ method is invoked). %We refer the interested reader to ~\cite{ANGR-SP16} for a detailed overview. 43 | 44 | % [D] we are focusing on CFG reconstruction here rather than on its applications 45 | % In general, not all the analyses based on CFGs require successor nodes to be accurately identified. This property can be exploited to perform further refinements on an initially less accurate CFG using techniques such as Value Set Analysis (VSA)~\cite{VSA-CC04}, which require an input CFG themselves. 46 | CFG recovery is typically an iterative refinement process based on a number of program analysis techniques. For instance, value-set analysis (VSA)~\cite{VSA-CC04} is a technique that can be used to identify a tight over-approximation of certain program state properties (e.g., the set of possible targets of an indirect jump or a memory write). In {\sc BitBlaze}~\cite{BITBLAZE-ICISS08} an initial CFG is generated by inserting special successor nodes for unresolved indirect jump targets. This choice is conceptually similar to widening a fact to the bottom of a lattice in a data-flow analysis. When an analysis requires more precise information, VSA is then applied on demand. %Indeed, not all the CFG-based analyses require successor nodes to be accurately identified. 47 | 48 | {\sc Angr}~\cite{ANGR-SSP16} implements two algorithms for CFG recovery. An iterative algorithm starts from the entry point of the program and interleaves a number of techniques to achieve speed and completeness, including VSA, inter-procedural backward program slicing, and symbolic execution of blocks. This algorithm is however rather slow and may miss code portions reachable only through unresolved jump targets. 
The authors thus devise a fast secondary algorithm that uses a number of heuristics to identify functions based on prologue signatures, and performs simple analyses (e.g., a lightweight alias analysis) to solve a number of indirect jumps. The algorithm is context-insensitive, so it can be used to quickly recover a CFG without a concern for understanding the reachability of functions from one another. 49 | 50 | \subsection{Code Obfuscation} 51 | In recent years, code obfuscation has received considerable attention as a cheap way to hinder the understanding of the inner workings of a proprietary program. Obfuscation is employed not only to thwart software piracy and improve software security, but also to avoid detection and resist analysis for malicious software~\cite{UDM-WCRE15,YJW-SSP15}. 52 | 53 | A significant motivation behind using symbolic/concolic execution in the analysis of malware is to deal with code obfuscations. However, current analysis techniques have trouble getting around some of those obfuscations, leading to imprecision and/or excessive resource usage~\cite{YD-CCS15}. For instance, obfuscation tools can transform conditional branches into indirect jumps that symbolic analyses find difficult to analyze, while run-time code self-modification might conceal conditional jumps on symbolic values so that they are missed by the analysis. 54 | 55 | A few works have described obfuscation techniques aiming at thwarting symbolic execution. \cite{SLG-NDSS08} uses one-way hash functions to devise a {\em conditional code obfuscation} scheme that makes it hard to identify the values of symbolic variables for which branch conditions are satisfied. They also present an encryption scheme for the code to execute based on a key derived from the value that satisfies a branch condition. 
%Although this approach has a few limitations (for instance, it can be applied to equality tests only, and is easy to detect), it represents the first work aiming at defeating symbolic execution-based malware analyzers. 56 | \cite{WMJ-ESORICS11} takes a step forward by proposing an obfuscation technique that is effective \iffullver{in spite of the fact that it uses}{despite using} linear operations only, for which symbolic execution usually works well. %In particular, the authors take advantage of the limitations of symbolic execution in analyzing loops: 57 | The obfuscation tool inserts a simple loop incorporating an unsolved mathematical conjecture that converges to a known value after a number of iterations, and the produced result is then combined with the original branch condition. %Conjectures are chosen in a way that a symbolic engine would not have to discard the generated constraints for their complexity (e.g., no floating-point or non-linear operations are performed). 58 | 59 | \cite{HOT-FPS15} presents BE-PUM, a tool to generate a precise CFG in the presence of obfuscation techniques that are common in the malware domain, including indirect jumps, structured exception handlers (SEHs), overlapping instructions, and self-modifying code. 
\iffullver{While engines such as {\sc BitBlaze}~\cite{BITBLAZE-ICISS08} typically rely on existing disassemblers like IDA Pro\footnote{\url{https://www.hex-rays.com/products/ida/}.} for obfuscated code, BE-PUM relies on concolic execution for deobfuscation, using a binary emulator for the user process and stubs for API calls.}{While engines such as {\sc BitBlaze}~\cite{BITBLAZE-ICISS08} typically rely on disassemblers like IDA Pro\footnote{\url{https://www.hex-rays.com/products/ida/}.}, BE-PUM relies on concolic execution to deobfuscate code, using a binary emulator for the user process and stubs for API calls.} % TODO check fullver "for deobfuscation" 60 | 61 | \cite{YD-CCS15} discusses the limitations of symbolic execution in the presence of three generic obfuscation techniques: (1) conditional-to-indirect jump transformation, also known as {\em symbolic jump problem}~\cite{SAB-SP10}; (2) conditional-to-conditional jump transformation, where the predicate is deeply changed; and (3) symbolic code, when code modification is carried out using an input-derived value. The authors show how resorting to bit-level taint analysis and architecture-aware constraint generation can allow symbolic execution to circumvent such obfuscations. 
-------------------------------------------------------------------------------- /common.tex: -------------------------------------------------------------------------------- 1 | % !TEX root = main.tex 2 | \usepackage{a4wide} 3 | \usepackage{listings} 4 | \usepackage{comment} 5 | \usepackage{amsmath} 6 | \usepackage{graphicx} 7 | \usepackage{amssymb} 8 | \usepackage{url} 9 | \usepackage{hyperref} 10 | \usepackage{float} 11 | \usepackage{lipsum} 12 | \usepackage{caption} 13 | \usepackage{subcaption} 14 | \usepackage{adjustbox} 15 | \usepackage{framed} 16 | \usepackage{multirow} 17 | \usepackage{framed} 18 | \usepackage{enumitem} 19 | \usepackage{epigraph} 20 | \usepackage{wasysym} % \brokenvert 21 | \usepackage{wrapfig} 22 | 23 | \usepackage[usenames, dvipsnames]{xcolor} 24 | 25 | % commands 26 | %\newcommand{\fullver}{} 27 | \ifdefined\fullver 28 | \newcommand{\iffullver}[2]{#1} 29 | \else 30 | \newcommand{\iffullver}[2]{#2} 31 | \fi 32 | 33 | \usepackage{tikz} 34 | \newcommand*\circled[1]{\tikz[baseline=(char.base)]{ 35 | \node[shape=circle,draw,inner sep=2pt] (char) {#1};}} 36 | 37 | %\usepackage{titlesec} 38 | %\titlespacing{\section}{0pt}{*1.2}{*1.2} 39 | %\titlespacing{\subsection}{0pt}{*1.1}{*1.1} 40 | %\titlespacing{\subsubsection}{0pt}{*.6}{*.6} 41 | %\titlespacing{\paragraph}{0pt}{*.6}{*.60} 42 | %\titleformat{\paragraph}[runin]{\normalsize\bfseries\scshape}{}{}{} 43 | %Get rid of some extra whitespace in the bibliography 44 | %\setlength{\bibsep}{0.75pt} 45 | %Get rid of some extra whitespace around float (containing figures) 46 | %\setlength{\textfloatsep}{4pt plus 0.25pt minus 1pt} 47 | %\setlength{\intextsep}{4.0pt plus 0.25pt minus .5pt} 48 | %\setlength{\floatsep}{2pt plus 2pt minus 1pt} 49 | %\setlength{\abovecaptionskip}{5pt plus 1pt minus 1pt} 50 | % \setlength{\belowcaptionskip}{10pt plus 1pt minus 1pt} 51 | %\setlength{\parskip}{0pt} 52 | 53 | \renewcommand{\epigraphsize}{\footnotesize} 54 | \setlength{\epigraphwidth}{10cm} 55 | 
%\renewcommand{\epigraphrule}{0pt} 56 | 57 | \definecolor{shadecolor}{rgb}{0.92,0.92,0.92} 58 | 59 | \hypersetup{ 60 | colorlinks = true, % colours links instead of ugly boxes 61 | urlcolor = blue, % colour for external hyperlinks 62 | linkcolor = black, % colour of internal links 63 | citecolor = black, % colour of citations 64 | pdftitle = {A Survey of Symbolic Execution Techniques}, 65 | pdfauthor= {Roberto Baldoni, Emilio Coppa, Daniele Cono D'Elia, Camil Demetrescu, Irene Finocchi} 66 | } 67 | 68 | %\usepackage{xcolor} 69 | %\newcommand{\myedit}[1]{{\leavevmode\color{red}#1}} 70 | %\newcommand{\mytempedit}[1]{{\leavevmode\color{blue}#1}} 71 | %\newcommand{\myedit}[1]{{\color{red}\underline{#1}}} 72 | %\newcommand{\mytempedit}[1]{{\color{black}#1}} 73 | \newcommand{\mytempedit}[1]{\ignorespaces#1} 74 | \newcommand{\revedit}[1]{{\color{blue}#1}} 75 | \newcommand{\lateredit}[1]{{\color{red}#1}} 76 | 77 | %\newcommand{\mytempedit}[1]{{\color{blue}#1}} 78 | 79 | %\newcommand{\mytempedit}[1]{{\color{blue}\fontfamily{lmdh}\selectfont #1}} 80 | 81 | %\setlength{\parindent}{0pt} 82 | \setlength{\FrameSep}{2pt} 83 | \newcommand{\myparagraph}[1]{\medskip\noindent{\bf\small #1.} } 84 | \newcommand{\myparagraphnoperiod}[1]{\medskip\noindent{\bf\small #1} } 85 | 86 | % EDIT TO ENABLE NOTES 87 | \newcommand{\mynote}[1]{\ignorespaces} % TODO 88 | %\newcommand{\mynote}[1]{\marginpar{\raggedleft{\fontfamily{pbk}\selectfont\scriptsize{\em #1}}}} 89 | 90 | \newcommand{\stwoe}{\text{S\textsuperscript{2}E}} 91 | \newcommand{\myinput}[1]{\ifdefined\internalrep \input{../#1} \else \input{#1} \fi} 92 | \newcommand{\missing}{\textbf{XXX}} 93 | %\newcommand{\boxedexample}[1]{\vspace{2mm}\noindent\fbox{\parbox{0.98\textwidth}{{\em Example.} #1}}} 94 | 95 | \ifdefined\arxivver 96 | \newcommand{\boxedexample}[1]{ 97 | \begin{shaded} 98 | \noindent{\bf\small Example.} #1 99 | \end{shaded} 100 | } 101 | \else 102 | \newcommand{\boxedexample}[1]{ 103 | %\vspace{-2mm} 104 | \begin{shaded*} 105 
| \noindent{\bf\small Example.} #1 106 | \end{shaded*} 107 | %\vspace{-2mm} 108 | } 109 | \fi 110 | 111 | 112 | -------------------------------------------------------------------------------- /conclusions.tex: -------------------------------------------------------------------------------- 1 | % !TEX root = main.tex 2 | 3 | \vspace{-2pt} % TODO 4 | \section{Conclusions} 5 | \label{se:conclusions} 6 | 7 | \revedit{ 8 | Symbolic execution techniques have evolved significantly in the last decade, with notable applications to compelling problems from several domains like software testing (e.g., test input generation, regression testing), security (e.g., exploit generation, authentication bypass), and code analysis (e.g., program deobfuscation, dynamic software updating). This trend has not only improved existing solutions, but also led to novel ideas and, in some cases, to major practical breakthroughs. For instance, the push for scalable automated program analyses in security has culminated in the 2016 DARPA Cyber Grand Challenge, which hosted systems for detecting and fixing vulnerabilities in unknown software with no human intervention, such as {\sc Angr}~\cite{ANGR-SSP16} and {\sc Mayhem}~\cite{MAYHEM-SP12}, that competed for nearly \$4M in prize money. 9 | 10 | %\noindent 11 | This survey has discussed some of the key aspects and challenges of symbolic execution, presenting for a broad audience the basic design principles of symbolic executors and the main optimization techniques. 
We hope it will help non-experts grasp the key inventions in this exciting line of research, inspiring further work and new ideas.} 12 | 13 | \specialcomment{online}{ 14 | \begingroup 15 | \subsection*{ELECTRONIC APPENDIX} 16 | \phantomsection\addcontentsline{toc}{subsection}{Electronic Appendix} 17 | }{% 18 | \endgroup 19 | } 20 | 21 | %\begin{online} 22 | \subsection*{ELECTRONIC APPENDIX} 23 | \revedit{ 24 | The online appendix of this manuscript discusses a selection of prominent applications of symbolic execution techniques, addresses further challenges that arise in the analysis of programs in binary form, and provides a list of popular symbolic engines. 25 | } 26 | %\end{online} 27 | 28 | \iffalse 29 | Techniques for symbolic execution have evolved significantly in the last decade, leading to major practical breakthroughs. In 2016, the DARPA Cyber Grand Challenge hosted systems that can detect and fix vulnerabilities in unknown software with no human intervention, such as {\sc Angr}~\cite{ANGR-SSP16} and {\sc Mayhem}~\cite{MAYHEM-SP12}, which won the \$2M first prize. {\sc Mayhem} was also the first autonomous software to play the Capture-The-Flag contest at the DEF CON 24 hacker convention\footnote{\url{https://www.defcon.org/html/defcon-24/dc-24-ctf.html}.}. The event demonstrated that tools for automatic exploit detection based on symbolic execution can be competitive with human experts, paving the road to unprecedented applications %and the rise of start-ups 30 | that have the potential to shape software %security and 31 | reliability in the next decades. 32 | 33 | This survey has discussed some of the key aspects and challenges of symbolic execution, presenting them for a broad audience. 34 | To explain the basic design principles of symbolic executors and the main optimization techniques, we have focused on single-threaded applications with integer arithmetic. 
Symbolic execution of multi-threaded programs is treated, e.g., \iffullver{in~\cite{KPV-TACAS03,SA-HVC06,CLOUD9-EUROSYS11,FHR-ESEC13,BGC-OOPSLA14,GKW-ESEC15}} 35 | {in~\cite{BGC-OOPSLA14,GKW-ESEC15}}, 36 | %{in~\cite{FHR-ESEC13,BGC-OOPSLA14,GKW-ESEC15}}, 37 | while techniques for programs that manipulate floating point data are addressed \iffullver{in, e.g., \cite{M-STVR01,BGM-STVR06,LTH-ICTSS10,CCK-EUROSYS11,BVL-POPL13,CCK-TSE14,RPW-SIGSOFT15}} 38 | {in, e.g., \cite{RPW-SIGSOFT15}}. 39 | %{in, e.g., \cite{BVL-POPL13,CCK-TSE14,RPW-SIGSOFT15}}. 40 | 41 | We hope that this survey will help non-experts grasp the key inventions in the exciting line of research of symbolic execution, inspiring further work and new ideas. 42 | \fi 43 | 44 | 45 | %\myparagraph{Acknowledgements} 46 | %This work is partially supported by a grant of the Italian Presidency of Ministry Council and by the CINI (Consorzio Interuniversitario Nazionale Informatica) Cybersecurity National Laboratory. 47 | %This work is supported in part by a grant of the Italian Presidency of the Council of Ministers and by the CINI (Consorzio Interuniversitario Nazionale Informatica) National Laboratory of Cyber Security. 48 | 49 | \ifdefined\arxivver 50 | \myparagraph{Live Version of this Article} 51 | We complement the traditional scholarly publication model by maintaining a live version of this article at {\href{https://github.com/season-lab/survey-symbolic-execution}{https://github.com/season-lab/survey-symbolic-execution/}}. The live version incorporates continuous feedback by the community, providing post-publication fixes, improvements, and extensions. 
52 | \fi 53 | -------------------------------------------------------------------------------- /constraints.tex: -------------------------------------------------------------------------------- 1 | % !TEX root = main.tex 2 | 3 | \section{Constraint solving} 4 | \label{se:constraint-solving} 5 | 6 | Constraint satisfaction problems arise in many domains, including analysis, testing, and verification of software programs. Constraint solvers are decision procedures for problems expressed in logical formulas: for instance, the boolean satisfiability problem (also known as SAT) aims at determining whether there exists an interpretation of the symbols of a formula that makes it true. Although SAT is a well-known NP-complete problem, recent advances have moved the boundaries for what is intractable when it comes to practical applications~\cite{SMT-CACM11}. 7 | 8 | % linear arithmetic inequalities 9 | Observe that some problems are more naturally described with languages that are more expressive than the one of boolean formulas with logical connectives. For this reason, satisfiability modulo theories (SMT) generalize the SAT problem with supporting theories to capture formulas involving, for instance, linear arithmetic and operations over \iffullver{arrays (see, e.g., Section~\ref{ss:fully-symbolic-memory}).}{arrays.} SMT solvers map the atoms in an SMT formula to fresh boolean variables: a SAT decision procedure checks the rewritten formula for satisfiability, and a theory solver checks the model generated by the SAT procedure. 10 | 11 | %\mytempedit{In particular, SMT-compliant theory solvers are required to be able to: (i) work incrementally when checking for consistency as novel constraints are added, (ii) support backtracking, i.e., constraint removal, and (iii) provide explanations for inconsistent constraints~\cite{Abraham15}.} 12 | 13 | SMT solvers show several distinctive strengths. 
Their core algorithms are generic, and can handle complex combinations of many individual constraints. They can work incrementally and backtrack as constraints are added or removed, and provide explanations for inconsistencies. Theories can be added and combined in arbitrary ways, e.g., to reason about arrays of strings. Decision procedures do not need to be carried out in isolation: often, they are profitably combined to reduce the amount of time spent in heavier procedures, e.g., by solving linear parts first in a non-linear arithmetic formula. Incomplete procedures are valuable too: complete but expensive procedures get called only when conclusive answers could not be produced. All these factors allow SMT solvers to tackle large problems that no single procedure can solve in isolation\footnote{We refer the interested reader to~\cite{BKM14} for an exhaustive introduction to SMT solving, and to~\cite{SC2} for a discussion of its distinctive strengths.}. 14 | % SHORTER VERSION 15 | % }%\footnote{\cite{BKM14,SC2} provide interesting discussions of the strengths of SMT solvers.}.} 16 | 17 | 18 | %\mytempedit{SMT solvers show a number of distinctive strengths. They can work incrementally as constraints are added to formulas, backtrack for constraint removal, and provide explanations for inconsistent constraints. Their core algorithms are generic and can handle complex combinations of many individual constraints. Theories can be added and, more importantly, combined in arbitrary ways, e.g., to reason about arrays of strings. Decision procedures are not required to be carried out in isolation: often, they can profitably be combined to reduce the amount of time spent in heavier procedures, e.g., by solving linear problem parts first for a non-linear arithmetic formula. Incomplete procedures are valuable too: complete but expensive procedures get called only when conclusive answers could not be produced. 
The combination of these factors allows SMT solvers to tackle large problems that no single procedure can solve in isolation\footnote{We refer the interested reader to~\cite{BKM14} for an exhaustive introduction to SMT solving, and to~\cite{SC2} for a discussion of its distinctive strengths.}.} 19 | 20 | % STP~\cite{STP-CAV07,STP-TR07} solver 21 | % {\sc MineSweeper}~\cite{MineSweeper-BOTNET08}, and {\sc AEG}~\cite{AEG-NDSS11} 22 | In a symbolic executor, constraint solving plays a crucial role in checking the feasibility of a path, generating assignments to symbolic variables, and verifying assertions. 23 | % 24 | Over the years, different solvers have been employed by symbolic executors, depending on the supported theories and the relative performance at the time. For instance, the STP~\cite{STP-CAV07} solver has been employed in, e.g., {\sc EXE}~\cite{EXE-CCS06}, {\sc KLEE}~\cite{KLEE-OSDI08}, and {\sc AEG}~\cite{AEG-NDSS11}, which all leverage its support for bit-vector and array theories. Other executors such as {\sc Java PathFinder}~\cite{PATHFINDER-ASE10} have complemented SMT solving with additional decision procedures (e.g., libraries for constraint programming~\cite{CHOCO}) and heuristics to handle complex non-linear mathematical constraints~\cite{CORAL-NFM11}. 25 | 26 | Recently, Z3~\cite{Z3-TACS08} has emerged as a leading solution for SMT solving. Developed at Microsoft Research, Z3 offers cutting-edge performance and supports a large number of theories, including bit-vectors, arrays, quantifiers, uninterpreted functions, linear integer and real arithmetic, and non-linear arithmetic. 27 | % 28 | %Effective support for strings has been recently offered by Z3-str~\cite{ZZG-FSE13}, an extension of Z3 that makes it possible to treat string as a primitive type, allowing the solver to reason on common string operations such as concatenation, substring, and replacement. 
29 | Its Z3-str~\cite{ZZG-FSE13} extension makes it possible to also treat strings as a primitive type, allowing the solver to reason on common string operations such as concatenation, substring, and replacement. 30 | % 31 | Z3 is employed in most recently appeared symbolic executors such as {\sc Mayhem}~\cite{MAYHEM-SP12}, {\sc SAGE}~\cite{SAGE-QUEUE12}, and {\sc Angr}~\cite{ANGR-SSP16}. Due to the extensive number of supported theories in Z3, such executors typically do not employ additional decision procedures. 32 | 33 | %The two most popular solvers used in symbolic executors are STP and Z3. STP~\cite{STP-CAV07,STP-TR07} is an SMT solver with bitvector and array theories initially developed at Stanford and employed in, e.g., {\sc EXE}~\cite{EXE-CCS06}, {\sc KLEE}~\cite{KLEE-OSDI08}, {\sc MineSweeper}~\cite{MineSweeper-BOTNET08}, and {\sc AEG}~\cite{AEG-NDSS11}. Z3~\cite{Z3-TACS08} is an SMT solver developed at Microsoft with support for nonlinear arithmetic, bitvector, and array theories, and is used in, e.g., {\sc Mayhem}~\cite{MAYHEM-SP12}, {\sc SAGE}~\cite{SAGE-QUEUE12}, and {\sc Angr}~\cite{ANGR-SSP16}. CVC3~\cite{CVC3-CAV07} is another SMT solver that supports theories for linear arithmetic, bitvectors, arrays, and quantifiers, and is employed in {\sc Java PathFinder}~\cite{PATHFINDER-ASE10} along with CHOCO~\cite{CHOCO} for integer/real constraints and CORAL~\cite{CORAL-NFM11} for complex mathematical constraints. Modern symbolic executors can typically choose between different underlying solvers through a common API, and also resort to a native interface to a specific solver for better performance. 34 | 35 | %only for efficiency reasons. 
36 | 37 | %For instance, many solvers have the development of ~\cite{PATHFINDER-ASE10} can use a large number of SMT solvers, including Yices, 38 | %~\cite{YICES-CAV06} is an incremental solver with support for rational and integer linear arithmetic, bitvectors, and arrays, and was originally used in 39 | %In Table~\ref{tab:solvers} we report a number of constraint solving tools used in popular symbolic execution engines. 40 | 41 | % feasibility or applicability? TODO 42 | However, despite the significant advances observed over the past few years -- which also made symbolic execution practical in the first place~\cite{CS-CACM13} -- constraint solving remains one of the main obstacles to the scalability of symbolic execution engines, and also hinders its feasibility in the face of constraints that involve expensive theories (e.g., non-linear arithmetic) or opaque library calls. 43 | 44 | %\subsection{Optimization Techniques} 45 | %\label{ss:constraint-opt} 46 | 47 | % handling or skipping over 48 | In the remainder of this section, we address different techniques to extend the range of programs \iffullver{that can be handled by}{amenable to} symbolic execution and to optimize the performance of constraint solving. Prominent approaches consist in: (i) reducing the size and complexity of the constraints to check, (ii) unburdening the solver by, e.g., resorting to constraint solution caching, deferring of \iffullver{constraint solver queries}{solver queries}, or concretization, and (iii) augmenting symbolic execution to handle constraints problematic for decision procedures. 49 | 50 | %We conclude by pointing out potential directions to improve support for non-linear arithmetic}. 51 | 52 | %\mytempedit{and (iii) augmenting symbolic execution with techniques aimed at handling constraints that are problematic for the underlying decision procedure. We conclude the section by pointing out potential research directions to improve support for non-linear arithmetic}. 
53 | 54 | %: (i) {\em constraint reduction} techniques aim at simplifying constraints fed to a solver by rewriting them into a shorter form: (ii) techniques for {\em reuse of constraint solutions} explore the space-time trade-off of retrieving previously computed query results rather than repeating expensive satisfiability checks. 55 | 56 | \myparagraph{Constraint Reduction} 57 | A common optimization approach followed by both solvers and symbolic executors is to reduce constraints into simpler forms. For example, the {\em expression rewriting} optimization can apply classical techniques from optimizing compilers such as constant folding, strength reduction, and simplification of linear expressions (see, e.g., {\sc KLEE}~\cite{KLEE-OSDI08}). 58 | 59 | {\sc EXE}~\cite{EXE-CCS06} introduces a {\em constraint independence} optimization that exploits the fact that a set of constraints can frequently be divided into multiple independent subsets of constraints. This optimization interacts well with query result caching strategies, and offers an additional advantage when an engine asks the solver about the satisfiability of a specific constraint, as it removes irrelevant constraints from the query. In fact, independent branches, which tend to be frequent in real programs, could lead to unnecessary constraints that would get quickly accumulated. 60 | 61 | Another fact that can be exploited by reduction techniques is that the natural structure of programs can lead to the introduction of more specific constraints for some variables as the execution proceeds. Since path conditions are generated by conjoining new terms to an existing sequence, it might become possible to rewrite and optimize existing constraints. 
For instance, adding an equality constraint of the form $x=5$ enables not only the simplification to true of other constraints over the value of the variable (e.g., $x>0$), but also the substitution of the symbol $x$ with the associated concrete value in the other subsequent constraints involving it. The latter optimization is also known as {\em implied value concretization} and, for instance, it is employed by {\sc KLEE}~\cite{KLEE-OSDI08}. 62 | 63 | In a similar spirit, {\sc \stwoe}~\cite{CKC-TOCS12} introduces a bitfield-theory expression simplifier to replace with concrete values parts of a symbolic variable that bit operations mask away. For instance, for any 8-bit symbolic value $v$, the most significant bit in the value of expression $v\,|\,10000000_2$ is always 1. The simplifier can propagate information across the tree representation of an expression, and if each bit in its value can be determined, the expression is replaced with the corresponding constant. 64 | 65 | %path conditions in a symbolic executor are typically generated by conjoining a new term to an existing (and possibly satisfiable) sequence of constraints. As the exploration proceeds, the natural structure of programs means that constraints might become more specific for some variables, and constraints can be rewritten accordingly. 66 | 67 | %\subsubsection{Reuse of Constraint Solutions} 68 | %\label{ss:constraint-reuse} 69 | 70 | %\subsection{Unburdening the Constraint Solver} 71 | %\label{ss:solver-unburdening} 72 | 73 | \myparagraph{Reuse of Constraint Solutions} 74 | The idea of reusing previously computed results to speed up constraint solving can be particularly effective in the setting of a symbolic executor, especially when combined with other techniques such as constraint independence optimization. Most reuse approaches for constraint solving are currently based on semantic or syntactic equivalence of the constraints. 
75 | 76 | {\sc EXE}~\cite{EXE-CCS06} caches the results of constraint solutions and satisfiability queries in order to reduce as much as possible the need for calling the solver. A cache is handled by a server process that can receive queries from multiple parallel instances of the execution engine, each exploring a different program state. 77 | 78 | {\sc KLEE}~\cite{KLEE-OSDI08} implements an incremental optimization strategy called {\em counterexample caching}. Using a cache, constraint sets are mapped to concrete variable assignments, or to a special null value when a constraint set is unsatisfiable. When an unsatisfiable set in the cache is a subset of a given constraint set $S$, $S$ is deemed unsatisfiable as well. Conversely, when the cache contains a solution for a superset of $S$, the solution trivially satisfies $S$ too. Finally, when the cache contains a solution for one or more subsets of $S$, the algorithm tries substituting in all the solutions to check whether a satisfying solution for $S$ can be found. 79 | 80 | {\em Memoized symbolic execution}~\cite{MEMO-ISSTA12} is motivated by the observation that symbolic execution often results in re-running largely similar sub-problems, e.g., finding a bug, fixing it, and then testing the program again to check if the fix was effective. The choices taken during path exploration are compactly encoded in a prefix tree, opening up the possibility to reuse previously computed results in successive runs. 81 | % in a trie-based data structure 82 | 83 | The Green framework~\cite{GREEN-FSE12} explores constraint solution reuse across runs of not only the same program, but also similar programs, different programs, and different analyses. Constraints are distilled into their essential parts through a {\em slicing} transformation and represented in a canonical form to achieve good reuse, even within a single analysis run. 
\cite{JGY-ISSTA15} presents an extension to the framework that exploits logical implication relations between constraints to support constraint reuse and faster execution times. 84 | 85 | %\subsection{Other Optimizations in Symbolic Executors} 86 | %\subsection{Reducing the Symbolic Executor's Pressure on Constraint Solvers} 87 | %\label{ss:reducing-constraint-solver-pressure} 88 | 89 | %In this section we present a number of other optimizations that become possible in the setting of a symbolic executor to reduce the time spent in the constraint solver. 90 | 91 | \myparagraph{Lazy Constraints} 92 | \cite{UCKLEE-USEC15} adopts a timeout approach for constraint solver queries. In their initial experiments, the authors traced most timeouts to symbolic division and remainder operations, with the worst cases occurring when an unsigned remainder operation had a symbolic value in the denominator. 93 | They thus implemented a solution that works as follows: when the executor encounters a branch statement involving an expensive symbolic operation, it will take both the true and false branches and add a {\em lazy} constraint on the result of the expensive operation to the path conditions. When the exploration reaches a state that satisfies some goal (e.g., an error is found), the algorithm will check for the feasibility of the path, and suppress it if deemed unreachable in a real execution. 94 | 95 | Compared to the {\em eager} approach of checking the feasibility of a branch as encountered (Section~\ref{ss:unrealizable-paths}), a lazy strategy may lead to a larger number of active states, and in turn to more solver queries. However, the authors report that the delayed queries are in many cases more efficient than their eager counterparts: the path constraints added after a lazy constraint can in fact narrow down the solution space for the solver. 
96 | 97 | \begin{figure}[t] 98 | \begin{center} 99 | \begin{subfigure}{.43\textwidth} 100 | \vspace{0mm} 101 | \begin{lstlisting}[basicstyle=\ttfamily\scriptsize] 102 | 1. void test(int x, int y) { 103 | 2. if (non_linear(y) == x) 104 | 3. if (x > y + 10) ERROR; } 105 | \end{lstlisting} 106 | %\vspace{8.5mm} 107 | %\caption{} 108 | \end{subfigure}% 109 | \begin{subfigure}{.43\textwidth} 110 | %\vspace{-5.2mm} 111 | \begin{lstlisting}[basicstyle=\ttfamily\scriptsize] 112 | 4. int non_linear(int v) { 113 | 5. return (v*v) % 50; 114 | 6. } 115 | \end{lstlisting} 116 | %\vspace{3.5mm} 117 | %\caption{} 118 | \end{subfigure}% 119 | \end{center} 120 | \vspace{-4.0mm} 121 | \caption{Example with non-linear constraints.} 122 | \label{fi:non-linear-constraints} 123 | \vspace{-2mm} 124 | \end{figure} 125 | 126 | 127 | \myparagraph{Concretization} 128 | \cite{CS-CACM13} discusses limitations of classical symbolic execution in the presence of formulas that constraint solvers cannot solve, at least not efficiently. A concolic executor generates some random input for the program and executes it both concretely and symbolically: a possible value from the concrete execution can be used for a symbolic operand involved in a formula that is inherently hard for the solver, albeit at the cost of possibly sacrificing soundness in the exploration. 129 | %For instance, in the presence of three nested branches with only one being non-linear, {\sc DART}~\cite{DART-PLDI05} starts from a random valid input for the function, and then alters it when symbolically exploring the two linear branches. The work resorts to concretization also to avoid performing expensive or imprecise alias analysis on pointers. % with only one of them being 130 | 131 | 132 | \boxedexample{In the code fragment of Figure~\ref{fi:non-linear-constraints}, the engine stores a non-linear constraint of the form $\alpha_x = (\alpha_y*\alpha_y)\,\%\,50$ for the $true$ branch at line 2. 
A solver that does not support non-linear arithmetic fails to generate any input for the program. However, a concolic engine can exploit concrete values to help the solver. For instance, if $x=3$ and $y=5$ are randomly chosen as initial input parameters, then the concrete execution does not take either of the two branches. Nonetheless, the engine can reuse the concrete value of $y$, simplifying the previous query as $\alpha_x = 25$ due to $\alpha_y = 5$. The straightforward solution to this query can now be used by the engine to explore both branches. Notice that if the value of $y$ is fixed to $5$, then there is no way of generating a new input that takes the first but not the second branch, inducing a false negative. In this case, a trivial solution could be to rerun the program choosing a different value for $y$ (e.g., if $y=2$ then $x=4$, which satisfies the first but not the second branch). 133 | } 134 | 135 | 136 | % suggests to 137 | To partially overcome the incompleteness due to concretization,~\cite{PRV-ISSTA11} suggests {\em mixed concrete-symbolic solving}, which considers {\em all} the path constraints collectable over a path before binding one or more symbols to specific concrete values. Indeed, {\sc DART}~\cite{DART-PLDI05} concretizes symbols based on the path constraints collected up to a target branch. In this manner, a constraint contained in a subsequent branch in the same path is not considered and it may not be satisfiable due to already concretized symbols. If this happens, {\sc DART} restarts the execution with different random concrete values, hoping to be able to satisfy the subsequent branch. The approach presented in~\cite{PRV-ISSTA11} instead requires detecting {\em solvable} constraints along a full path and delaying concretization as much as possible. 138 | 139 | \myparagraph{Handling Problematic Constraints} 140 | Strong SMT solvers allow executors to handle more path constraints directly, reducing the need to resort to concretization. 
This also results in a lower risk of incurring a {\em blind commitment} to concrete values~\cite{DA-FSE14}, which happens when the under-approximation of path conditions from a random choice of concrete values for some variables results in an arbitrary restriction of the search space. 141 | \revedit{However, the decision problem for certain classes of constraints is well known to be undecidable, e.g., for non-linear integer arithmetic, or the theory of reals with trigonometric functions often used to model real-world systems.} 142 | %\revedit{However, problems such as non-linear integer arithmetic or the theory of reals together with trigonometric functions are well known to be undecidable.} % SHORT VERSION 143 | %Unfortunately, some constraints remain prohibitive for SMT solvers: for instance, non-linear integer arithmetic is undecidable in general; also, a branch condition might contain calls to opaque library methods such as trigonometric functions that would require special extensions to the solver to reason about. 144 | 145 | \cite{DA-FSE14} proposes a {\em concolic walk} algorithm that can tackle control-flow dependencies involving non-linear arithmetic and library calls. The algorithm treats assignments of values to variables as a valuation space: the solutions of the linear constraints define a polytope that can be walked heuristically, while the remaining constraints are assigned with a fitness function measuring how close a valuation point is to matching the constraint. An adaptive search is performed on the polytope as points are picked on it and non-linear constraints evaluated on them. Compared to mixed concrete-symbolic solving~\cite{PRV-ISSTA11}, both techniques seek to avoid blind commitment. However, concolic walk does not rely on the solver for obtaining all the concrete inputs needed to evaluate complex constraints, and implements search heuristics that guide the walk on the polytope toward promising regions. 
146 | 147 | % Symcretic execution 148 | % , which determines how close the branch conditions are to being satisfied and alters the concrete inputs to move closer to a full solution 149 | %For instance, if an {\tt assert} statement is guarded by a branch condition that can be proven unsatisfiable, then there is no need to take into account all the other constraints along the path to the entry point to declare the target unreachable. A traditional concolic executor reasons instead about all the constraints along a path with a top-down approach, making it hard to detect the unreachability of a target statement because of constraints ``deep'' in the path. 150 | 151 | \cite{DA-ASE14} describes {\em symcretic} execution, a novel combination of symbolic backward execution (SBE) (Section~\ref{se:executors}) and forward symbolic execution. The main idea is to divide exploration into two phases. In the first phase, SBE is performed from a target point and a trace is collected for each followed path. If any problematic constraints are met during the backward exploration, the engine marks them as {\em potentially} satisfiable by adding a special event to the trace and continues its reversed traversal. Whenever an entry point of the program is reached along any of the followed paths, the second phase starts. The engine concretely evaluates the collected trace, trying to satisfy any constraint marked as problematic during the first phase. This is done using a heuristic search, such as the concolic walk described above. An advantage of symcretic over classical concolic execution is that it can prevent the exploration of some unfeasible paths. For instance, the backward phase may determine that a statement is guarded by an unsatisfiable branch regardless of how the statement is reached, while a traditional concolic executor would detect the unfeasibility on a per-path basis only when the statement is reached, which is unfavorable for statements ``deep'' in a path. 
152 | 153 | %\myparagraph{Memory Page Size} 154 | %In {\sc \stwoe}~\cite{CKC-TOCS12}, when a symbolic pointer is dereferenced, the engine determines which memory pages are referenced by it and passes their contents to the solver. As large page sizes can overwhelm the solver, {\sc \stwoe} uses small pages of configurable size rather than the default 4KB pages. The authors report significant performance benefits from using pages of smaller size. -------------------------------------------------------------------------------- /environment.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/environment.tex -------------------------------------------------------------------------------- /hang.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/hang.tex -------------------------------------------------------------------------------- /images/blackbox.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/blackbox.odg -------------------------------------------------------------------------------- /images/blackbox.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/blackbox.pdf -------------------------------------------------------------------------------- /images/compiler.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/compiler.odg 
-------------------------------------------------------------------------------- /images/compiler.odg.new: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/compiler.odg.new -------------------------------------------------------------------------------- /images/compiler.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/compiler.pdf -------------------------------------------------------------------------------- /images/compiler.pdf.new: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/compiler.pdf.new -------------------------------------------------------------------------------- /images/concolic-execution-2.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concolic-execution-2.odg -------------------------------------------------------------------------------- /images/concolic-execution-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concolic-execution-2.pdf -------------------------------------------------------------------------------- /images/concolic-execution.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concolic-execution.odg 
-------------------------------------------------------------------------------- /images/concolic-execution.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concolic-execution.pdf -------------------------------------------------------------------------------- /images/concolic-execution_old.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concolic-execution_old.odg -------------------------------------------------------------------------------- /images/concolic-execution_old.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concolic-execution_old.pdf -------------------------------------------------------------------------------- /images/concrete-abstract.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concrete-abstract.pdf -------------------------------------------------------------------------------- /images/concrete-abstract.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | image/svg+xmlconcrete 252 | 264 | symbolic 284 | abstract 297 | concolic 310 | -------------------------------------------------------------------------------- /images/concrete-execution.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concrete-execution.odg 
-------------------------------------------------------------------------------- /images/concrete-execution.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concrete-execution.pdf -------------------------------------------------------------------------------- /images/eager-evaluation.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/eager-evaluation.odg -------------------------------------------------------------------------------- /images/eager-evaluation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/eager-evaluation.pdf -------------------------------------------------------------------------------- /images/example.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/example.odg -------------------------------------------------------------------------------- /images/example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/example.pdf -------------------------------------------------------------------------------- /images/execution-tree-text.tex: -------------------------------------------------------------------------------- 1 | \documentclass[10pt]{article} 2 | \usepackage[usenames]{color} %used for font color 3 | \usepackage{amssymb} %maths 4 | \usepackage{amsmath} %maths 5 | 
\usepackage[utf8]{inputenc} %useful to type directly diacritic characters 6 | \begin{document} 7 | \begin{align*}\mbox{A} ~~~~ 2.~~\texttt{int x = 1, y = 0} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b \} ~~~~ \pi=true \\ 8 | \mbox{B} ~~~~ 3.~~\texttt{if (a != 0)} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b, x\mapsto 1, y\mapsto 0 \} ~~~~ \pi=true \\ 9 | \mbox{C} ~~~~ 4.~~\texttt{y = 3+x} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b, x\mapsto 1, y\mapsto 0 \} ~~~~ \pi=\alpha_a\neq 0 \\ 10 | \mbox{D} ~~~~ 8.~~\texttt{assert(x-y != 0)} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b, x\mapsto 1, y\mapsto 0 \} ~~~~ \pi=\alpha_a= 0 \\ 11 | 1-0 = 0 \wedge \alpha_a = 0\Longleftrightarrow false ~~~~ \mbox{OK} \\ 12 | \mbox{E} ~~~~ 5.~~\texttt{if (b == 0)} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b, x\mapsto 1, y\mapsto 4 \} ~~~~ \pi=\alpha_a\neq 0 \\ 13 | \mbox{F} ~~~~ 6.~~\texttt{x = 2*(a+b)} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b, x\mapsto 1, y\mapsto 4 \} ~~~~ \pi=\alpha_a\neq 0 \wedge \alpha_b = 0 \\ 14 | \mbox{G} ~~~~ 8.~~\texttt{assert(x-y != 0)} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b, x\mapsto 1, y\mapsto 4 \} ~~~~ \pi=\alpha_a \neq 0 \wedge \alpha_b \neq 0 \\ 15 | 1-4 = 0 \wedge \alpha_a \neq 0 \wedge \alpha_b \neq 0\Longleftrightarrow false ~~~~ \mbox{OK} \\ 16 | \mbox{H} ~~~~ 8.~~\texttt{assert(x-y != 0)} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b, x\mapsto 2(\alpha_a+\alpha_b), y\mapsto 4 \} ~~~~ \pi=\alpha_a \neq 0 \wedge \alpha_b = 0 \\ 17 | 2(\alpha_a+\alpha_b)-4 = 0 \wedge \alpha_a \neq 0 \wedge \alpha_b = 0~~\mbox{if}~~\alpha_a=2\wedge\alpha_b=0 ~~~~ \mbox{ERROR} \\ 18 | \end{align*} 19 | \end{document} -------------------------------------------------------------------------------- /images/execution-tree.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/execution-tree.pdf -------------------------------------------------------------------------------- /images/lazy-initialization-C.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/lazy-initialization-C.odg -------------------------------------------------------------------------------- /images/lazy-initialization.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/lazy-initialization.odg -------------------------------------------------------------------------------- /images/lazy-initialization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/lazy-initialization.pdf -------------------------------------------------------------------------------- /images/memory-fork.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/memory-fork.odg -------------------------------------------------------------------------------- /images/memory-fork.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/memory-fork.pdf -------------------------------------------------------------------------------- /images/memory-ite.odg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/memory-ite.odg -------------------------------------------------------------------------------- /images/memory-ite.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/memory-ite.pdf -------------------------------------------------------------------------------- /images/photo_tree.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/photo_tree.pdf -------------------------------------------------------------------------------- /images/state-merging-2.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/state-merging-2.odg -------------------------------------------------------------------------------- /images/state-merging-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/state-merging-2.pdf -------------------------------------------------------------------------------- /images/state-merging.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/state-merging.odg -------------------------------------------------------------------------------- /images/state-merging.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/state-merging.pdf -------------------------------------------------------------------------------- /images/state-merging_old.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/state-merging_old.png -------------------------------------------------------------------------------- /images/whitebox.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/whitebox.odg -------------------------------------------------------------------------------- /images/whitebox.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/whitebox.pdf -------------------------------------------------------------------------------- /intro.tex: -------------------------------------------------------------------------------- 1 | % !TEX root = main.tex 2 | 3 | \epigraph{\textit{``Sometimes you can't see how important something is in its moment, even if it seems kind of important. This is probably one of those times.''}}{(Cyber Grand Challenge highlights from DEF CON 24, August 6, 2016)} 4 | 5 | \vspace{-2.5mm} 6 | \section{Introduction} 7 | \label{se:intro} 8 | 9 | Symbolic execution is a popular program analysis technique introduced in the mid '70s to test whether certain properties can be violated by a piece of software~\cite{K-ICRS75,SELECT-ICRS75,K-CACM76,H-TSE77}. 
Aspects of interest could be that no division by zero is ever performed, no {\tt NULL} pointer is ever dereferenced, no backdoor exists that can bypass authentication, etc. While in general there is no automated way to decide some properties (e.g., the target of an indirect jump), heuristics and approximate analyses can prove useful in practice in a variety of settings, including mission-critical and security applications. 10 | 11 | %While in general there is no automated way to decide some properties (think, e.g., of the halting problem), decidable approximations often exist (e.g., ``does a program always terminate within a certain amount of time?''). Such approximations can prove useful in practice in a variety of settings, including mission-critical and security applications. 12 | 13 | In a concrete execution, a program is run on a specific input and a single control flow path is explored. Hence, in most cases concrete executions can only under-approximate the analysis of the property of interest. In contrast, symbolic execution can simultaneously explore multiple paths that a program could take under different inputs. This paves the road to sound analyses that can yield strong guarantees on the checked property. 14 | %\mynote{I: a cosa serve ridirlo? Abbiamo gia' fatto esempi di proprieta' che possono essere verificate}Symbolic execution may answer useful questions on concrete programs like: ``does function {\tt foo(x)} always return a positive value for any possible value of {\tt x}?'' 15 | The key idea is to allow a program to take on {\em symbolic} -- rather than concrete -- input values. Execution is performed by a {\em symbolic execution engine}, which maintains for each explored control flow path: (i) a first-order Boolean {\em formula} that describes the conditions satisfied by the branches taken along that path, and (ii) a {\em symbolic memory store} that maps variables to symbolic expressions or values. 
Branch execution updates the formula, while assignments update the symbolic store. A {\em model checker}, typically based on a {\em satisfiability modulo theories} (SMT) solver~\cite{BKM14}, is eventually used to verify whether there are any violations of the property along each explored path and if the path itself is realizable, i.e., if its formula can be satisfied by some assignment of concrete values to the program's symbolic arguments. 16 | %HandbookOfSAT2009 17 | 18 | %Variables and control flow paths are associated with expressions and constraints in terms of those symbols during a symbolic execution of the program, and constraints are eventually solved via SMT (satisfiability modulo theories) solvers. 19 | 20 | Symbolic execution techniques have been brought to the attention of a heterogeneous audience since DARPA announced in 2013 the Cyber Grand Challenge, a two-year competition seeking to create automatic systems for vulnerability detection, exploitation, and patching in near real-time~\cite{ANGR-SSP16}. 21 | 22 | % other static program 23 | % which were missed by other program analyses and blackbox testing techniques 24 | More remarkably, symbolic execution tools have been running 24/7 in the testing process of many Microsoft applications since 2008, revealing for instance nearly 30\% of all the bugs discovered by file fuzzing during the development of Windows 7, which other program analyses and blackbox testing techniques missed~\cite{SAGE-QUEUE12}. 25 | 26 | In this article, we survey the main aspects of symbolic execution and discuss the most prominent techniques employed for instance in software testing and computer security applications. Our discussion is mainly focused on {\em forward} symbolic execution, where a symbolic engine analyzes many paths simultaneously starting its exploration from the main entry point of a program. 
27 | %its extensive usage in software testing and computer security applications\mynote{[D] this should change}, where software vulnerabilities can be found by symbolically executing programs at the level of either source or binary code. 28 | %A different approach is symbolic {\em backward} execution, where exploration is started from a specific point of the program (e.g., an {\tt assert} statement) and the engine proceeds backward, trying to reconstruct a valid path from an entry point of the program. Since forward symbolic execution is the mainline technique in literature, throughout this article we will always refer to this approach when using the term symbolic execution. Nonetheless, some benefits offered by symbolic backward execution will be pointed out when relevant for the discussion. 29 | % 30 | We start with a simple example that highlights many of the fundamental issues addressed in the remainder of the article. 31 | 32 | % -------------------------------------------------------------------------------------------------------------------- 33 | \subsection{A Warm-Up Example} 34 | \label{symbolic-execution-example} 35 | 36 | \begin{figure}[t] 37 | \begin{center} 38 | \begin{tabular}{c} 39 | \begin{lstlisting}[basicstyle=\ttfamily\scriptsize] 40 | 1. void foobar(int a, int b) { 41 | 2. int x = 1, y = 0; 42 | 3. if (a != 0) { 43 | 4. y = 3+x; 44 | 5. if (b == 0) 45 | 6. x = 2*(a+b); 46 | 7. } 47 | 8. assert(x-y != 0); 48 | 9. } 49 | \end{lstlisting} 50 | \end{tabular} 51 | \end{center} 52 | \vspace{-2mm} 53 | \caption{Warm-up example: which values of \texttt{a} and \texttt{b} make the \texttt{assert} fail?} 54 | \label{fig:example-1} 55 | \vspace{-1.5mm} 56 | \end{figure} 57 | 58 | %\revedit{in the common 4-byte representation} 59 | Consider the C code of Figure~\ref{fig:example-1} and assume that our goal is to determine which inputs make the {\tt assert} at line 8 of function \texttt{foobar} fail. 
Since each \revedit{4-byte} input parameter can take as many as $2^{32}$ distinct integer values, the approach of concretely running function \texttt{foobar} on randomly generated inputs is unlikely to pick up exactly the assert-failing inputs. 60 | %Techniques such as random testing could generate bottomless input tests for this function. 61 | %However, it is unlikely that exactly the assert-failing inputs would be randomly picked up\mynote{Fuzzing?}. 62 | By evaluating the code using symbols for its inputs, instead of concrete values, symbolic execution overcomes this limitation and makes it possible to reason on {\em classes of inputs}, rather than single input values. 63 | 64 | In more detail, every value that cannot be determined by a static analysis of the code, such as an actual parameter of a function or the result of a system call that reads data from a stream, is represented by a symbol $\alpha_i$. At any time, the symbolic execution engine maintains a state $(stmt,~\sigma,~\pi)$ where: 65 | 66 | \begin{itemize}[itemsep=1pt] 67 | 68 | \item $stmt$ is the next statement to evaluate. For the time being, we assume that $stmt$ can be an assignment, a conditional branch, or a jump (more complex constructs such as function calls and loops will be discussed in Section~\ref{se:path-explosion}). 69 | 70 | %\item $\sigma$ is a {\em symbolic store} that associates program variables with expressions over \mynote{[D] $\alpha_i$ also concrete?} concrete and symbolic values $\alpha_i$. 71 | 72 | \item $\sigma$ is a {\em symbolic store} that associates program variables with either expressions over concrete values or symbolic values $\alpha_i$. 73 | 74 | \item $\pi$ denotes the {\em path constraints}, i.e., a formula that expresses a set of assumptions on the symbols $\alpha_i$ due to branches taken in the execution to reach $stmt$. At the beginning of the analysis, $\pi=true$. 
75 | 76 | \end{itemize} 77 | 78 | \noindent Depending on $stmt$, the symbolic engine changes the state as follows: 79 | 80 | \begin{itemize}[topsep=3pt,itemsep=1pt] % TODO 81 | \item The evaluation of an assignment $x=e$ updates the symbolic store $\sigma$ by associating $x$ with a new symbolic expression $e_s$. We denote this association with $x\mapsto e_s$, where $e_s$ is obtained by evaluating $e$ in the context of the current execution state and can be any expression involving unary or binary operators over symbols and concrete values. 82 | 83 | % $\alpha_i = e$: when an expression $e$ is assigned to a symbol $\alpha_i$, $pc$ is extended by adding a constraint on $\alpha_i$: 84 | % \[ pc \gets pc \wedge \alpha_i = e\] 85 | % where $e$ can be any expression, involving unary or binary operators, over symbols and constants. 86 | 87 | \item The evaluation of a conditional branch ${\tt if}~e~{\tt then}~s_{true}~{\tt else}~s_{false}$ affects the path constraints $\pi$. The symbolic execution is forked by creating two execution states with path constraints $\pi_{true}$ and $\pi_{false}$, respectively, which correspond to the two branches: $\pi_{true}=\pi \wedge e_s$ and $\pi_{false}=\pi \wedge \neg e_s$, where $e_s$ is a symbolic expression obtained by evaluating $e$. 88 | % \[ (s_{true}, pc_{true}) \text{ where } pc_{true} = pc \wedge e \] 89 | % \[ (s_{false}, pc_{false}) \text{ where } pc_{false} = pc \wedge \neg e \] 90 | Symbolic execution independently proceeds on both states. 91 | 92 | \item The evaluation of a jump {\tt goto} $s$ updates the execution state by advancing the symbolic execution to statement $s$. 93 | \end{itemize} 94 | 95 | %\subsection{Example} 96 | %\label{symbolic-execution-example} 97 | 98 | %\begin{figure}[t] 99 | % \centering 100 | % \includegraphics[width=1.0\columnwidth]{images/example} 101 | % \caption{Symbolic execution tree of the function {\tt foobar}. Each execution state is labeled with an alphabet letter. 
Side effects on execution states are highlighted in gray. Leaves are evaluated against division by zero error. For the sake of presentation the conjunction of constraints is shown as a list of constraints. } 102 | % \label{fig:example-symbolic-execution} 103 | %\end{figure} 104 | 105 | \begin{figure}[t] 106 | \centering 107 | \includegraphics[width=0.975\columnwidth]{images/execution-tree.eps} 108 | \caption{Symbolic execution tree of function {\tt foobar} given in Figure~\ref{fig:example-1}. Each execution state, labeled with an upper case letter, shows the statement to be executed, the symbolic store $\sigma$, and the path constraints $\pi$. Leaves are evaluated against the condition in the {\tt assert} statement. } 109 | %For the sake of presentation the conjunction of constraints is shown as a list of constraints. } 110 | \label{fig:example-symbolic-execution} 111 | \vspace{-1mm} 112 | \end{figure} 113 | 114 | \noindent A symbolic execution of function {\tt foobar}, which can be effectively represented as a tree, is shown in Figure~\ref{fig:example-symbolic-execution}. Initially (execution state $A$) the path constraints are {\tt true} and input arguments {\tt a} and {\tt b} are associated with symbolic values. 115 | After initializing local variables {\tt x} and {\tt y} at line 2, the symbolic store is updated by associating {\tt x} and {\tt y} with concrete values 1 and 0, respectively (execution state $B$). Line 3 contains a conditional branch and the execution is forked: depending on the branch taken, a different statement is evaluated next and different assumptions are made on symbol $\alpha_a$ (execution states $C$ and $D$, respectively). In the branch where $\alpha_a\neq 0$, variable {\tt y} is assigned {\tt 3+x}, obtaining $y\mapsto 4$ in state $E$ because $x\mapsto 1$ in state $C$. In general, arithmetic expression evaluation simply manipulates the symbolic values. 
116 | After expanding every execution state until the {\tt assert} at line 8 is reached on all branches, we can check which input values for parameters {\tt a} and {\tt b} can make the {\tt assert} fail. By analyzing execution states $\{D,G,H\}$, we can conclude that only $H$ can make {\tt x-y = 0} true. The path constraints for $H$ at this point implicitly define the set of inputs that are unsafe for {\tt foobar}. 117 | In particular, any input values such that: 118 | \[ 2(\alpha_a+\alpha_b)-4 = 0 \wedge \alpha_a \neq 0 \wedge \alpha_b = 0 \] 119 | will make {\tt assert} fail. An instance of unsafe input parameters can be eventually determined by invoking an {\em SMT solver}~\cite{BKM14} to solve the path constraints, which in this example would yield $a = 2$ and $b = 0$. % HandbookOfSAT2009 120 | 121 | %Notice\mynote{Say earlier?} that a constraint solver is also needed when evaluating the satisfiability of branch conditions. 122 | 123 | % -------------------------------------------------------------------------------------------------------------------- 124 | \subsection{Challenges in Symbolic Execution} 125 | \label{example-discussion} 126 | 127 | In the example discussed in Section~\ref{symbolic-execution-example} symbolic execution can identify {\em all} the possible unsafe inputs that make the {\tt assert} fail. This is achieved through an exhaustive exploration of the possible execution states. From a theoretical perspective, exhaustive symbolic execution provides a {\em sound} and {\em complete} methodology for any decidable analysis. Soundness prevents false negatives, i.e., all possible unsafe inputs are guaranteed to be found, while completeness prevents false positives, i.e., input values deemed unsafe are actually unsafe. As we will discuss later on, exhaustive symbolic execution is unlikely to scale beyond small applications. Hence, in practice we often settle for less ambitious goals, e.g., by trading soundness for performance. 
128 | 129 | Challenges that symbolic execution has to face when processing real-world code can be significantly more complex than those illustrated in our warm-up example. Several observations and questions naturally arise: 130 | 131 | \begin{itemize}[itemsep=1mm] 132 | %%% 133 | \item \noindent {\em Memory}: how does the symbolic engine handle pointers, arrays, or other complex objects? Code manipulating pointers and data structures may give rise not only to symbolic stored data, but also to addresses being described by symbolic expressions. 134 | %Any arbitrarily complex object can be regarded as an array of bytes and each byte associated with a distinct symbol. However, when possible, exploiting structural properties of the data may be more convenient: for instance, relational bounds on the class fields in object-oriented languages could be used for refining the search performed by symbolic execution. 135 | %%% 136 | \item {\em Environment}: how does the engine handle interactions across the software stack? Calls to library and system code can cause side-effects, e.g., the creation of a file \revedit{or a call back to user code}, that could later affect the execution and must be accounted for. However, evaluating any possible interaction outcome may be unfeasible. 137 | %: it would give rise to a large number of states, while only a fraction of them can \mynote{[D] likely?}actually happen in a non-symbolic scenario. 138 | %%\mytempedit{Also, third-party closed-source components and popular frameworks (e.g., Java Swing and Android) pose further challenges to an executor, for instance because of the control flows occurring within them through callbacks.}\mynote{CD: may be dropped if we run out of space} 139 | % Real-world applications constantly interact with the environment (e.g., the file system or the network) through libraries and system calls. 
These interactions may cause side-effects (such as the creation of a file) that could later affect the execution and must be therefore taken into account. Evaluating any possible interaction outcome is generally unfeasible: it could generate a large number of execution states, of which only a small number can actually happen in a non-symbolic scenario. %A typical strategy is to consider popular library and system routines and create models that can help the symbolic engine analyze only significant outcomes. 140 | %%% 141 | \item {\em State space explosion}: how does symbolic execution deal with path explosion? 142 | %\mynote{[D] I felt it was too long and loop-centric} 143 | Language constructs such as loops might exponentially increase the number of execution states. It is thus unlikely that a symbolic execution engine can exhaustively explore all the possible states within a reasonable amount of time. %In practice, heuristics are used to guide exploration and prioritize certain states first (e.g., to maximize code coverage). In addition, 144 | %\mytempedit{Efficient mechanisms can be implemented for preventing repeated exploration of the same piece of code 145 | %\mytempedit{for skipping over states subsumed by previously explored paths} 146 | %and for evaluating multiple states in parallel without running out of resources.} 147 | %%A loop\mynote{IF: rimuoverei la prima frase, perche' va detto?} can be encoded using conditional branches and {\tt goto} statements, which is typical when compiling high-level languages to an intermediate representation or native code. 148 | %Choosing the number of loop iterations to analyze is especially critical when this number cannot be determined in advance (e.g., depends on an input parameter). The naive approach of unrolling iterations for every valid bound would result in a prohibitively large number of states. 
Typical solutions are to compute an underapproximation of the analysis by limiting the number of iterations to some value $k$, thus trading speed for soundness. Other approaches infer loop invariants through static analysis and use them to merge equivalent states. % \mynote{i.e. or e.g.?} (e.g., when differences are not observable from outside the loop body). 149 | %In practice, several heuristics must be exploited to prioritize evaluation of some states, hoping to still be able to spot interesting things. Moreover, the symbolic execution engine should include efficient mechanism for efficiently evaluating in parallel different execution states without running out of computational resources. 150 | %%% 151 | \item {\em Constraint solving}: what can a constraint solver do in practice? 152 | %{\em What is a constraint solver in practice}? \\ 153 | SMT solvers can scale to complex combinations of constraints over hundreds of variables. However, constructs such as non-linear arithmetic pose a major obstacle to efficiency. 154 | %Constraint solvers suffer from a number of limitations. They can typically handle complex constraints in a reasonable amount of time only if they are made of linear expressions over their constituents. 155 | %Constraint solvers suffer from a number of limitations. They can typically handle complex constraints in a reasonable amount of time only if they are made of linear expressions over their constituents. %Symbolic execution engines normally implement a number of optimizations to make queries as much {\em solver-friendly} as possible, for instance by splitting queries into independent components to be processed separately or by performing algebraic simplifications. 156 | %%% 157 | \iffullver{ \item {\em Binary code}: what issues can arise when symbolically executing binary code? 158 | %what are the disadvantages of symbolically executing binary code? 
159 | While the warm-up example of Section~\ref{symbolic-execution-example} is written in C, in several scenarios binary code is the only available representation of a program. However, having the source code of an application can make symbolic execution significantly easier, as it can exploit high-level properties (e.g., object shapes) that can be inferred statically by analyzing the source code. 160 | }{} 161 | %(e.g., the maximum size of a buffer or the number of iterations for a loop). 162 | %%% 163 | \end{itemize} 164 | %Depending on the specific application context of symbolic execution 165 | 166 | \noindent Depending on the specific context in which symbolic execution is used, different choices and assumptions are made to address the questions highlighted above. Although these choices typically affect soundness or completeness, in several scenarios a partial exploration of the space of possible execution states may be sufficient to achieve the goal (e.g., identifying a crashing input for an application) within a limited time budget. 167 | 168 | %\mynote{Better example?} 169 | 170 | %different choices and assumptions are made to address the above questions. Although soundness and completeness of symbolic execution may be negatively affected by these choices, there are several application scenarios where a partial exploration of the possible execution states is sufficient for reaching the ultimate goal (e.g., identify a single input that crashes an application). 171 | 172 | % -------------------------------------------------------------------------------------------------------------------- 173 | \subsection{Related Work} 174 | \label{ss:related-surveys} 175 | 176 | Symbolic execution has been the focus of a vast body of literature. As of August 2017, Google Scholar reports 742 articles that include the exact phrase ``symbolic execution'' in the title. 
Prior to this survey, other authors have contributed technical overviews of the field, such as \cite{PV-JSTTT09} and \cite{CS-CACM13}. \cite{CHEN20131758} focuses on the more specific setting of automated test generation: it provides a comprehensive view of the literature, covering in depth a variety of techniques and complementing the technical discussions with a number of running examples. 177 | %Besides complementing the technical discussions with a number of running examples, it covers in depth recent techniques for key aspects such as memory modelling, environment interaction, path explosion, and constraint solving. 178 | 179 | % -------------------------------------------------------------------------------------------------------------------- 180 | \subsection{Organization of the Article} 181 | \label{ss:article-organization} 182 | 183 | %\iffullver{ 184 | %The remainder of this article is organized as follows. In Section~\ref{se:executors}, we discuss the overall principles and evaluation strategies of a symbolic execution engine. Section~\ref{memory-model} through Section~\ref{se:symbolic-binary} address the key challenges that we listed in Section~\ref{example-discussion}. Prominent applications based on symbolic execution techniques are discussed in Section~\ref{se:applications}, while concluding remarks are addressed in Section~\ref{se:conclusions}. %We provide a glossary of the main terms used in the article in Section~\ref{se:glossary}. 185 | %} 186 | 187 | The remainder of this article is organized as follows. In Section~\ref{se:executors} we discuss the overall principles and evaluation strategies of a symbolic execution engine. Section~\ref{memory-model} through Section~\ref{se:constraint-solving} address the key challenges that we listed in Section~\ref{example-discussion}, while Section~\ref{se:hang} discusses how recent advances in other areas could be applied to enhance symbolic execution techniques. 
Concluding remarks are addressed in Section~\ref{se:conclusions}. %We provide a glossary of the main terms used in the article in Section~\ref{se:glossary}. 188 | 189 | % removed as \revedit{} 190 | %The appendix addresses further challenges that arise when applying symbolic execution to binary code, discusses some prominent applications of symbolic execution, and includes tables listing some prominent tools and techniques. 191 | 192 | 193 | %\vspace{2cm} 194 | %\subsection{Removed stuff} 195 | % 196 | %\paragraph{Black-box approach versus white-box approach} 197 | % 198 | %Discussion\mynote{IF: do we really need this?} of black-box approach and white-box approach. Symbolic execution is a white-box technique. Black-box approaches can be very fast but not always effective. White-box approaches can be very effective but are typically slower than black-box techniques. An in-depth discussion of this aspect will be done when we will discuss~\cite{DRILLER-NDSS16}. 199 | % 200 | %\begin{figure}[H] 201 | % \vspace{-3mm} 202 | % \centering 203 | % \begin{subfigure}{.5\textwidth} 204 | % \centering 205 | % \includegraphics[width=0.9\linewidth]{images/blackbox} 206 | % \caption{Black-box approach} 207 | % %\label{fig:sub1} 208 | % \end{subfigure}% 209 | % \begin{subfigure}{.5\textwidth} 210 | % \centering 211 | % \includegraphics[width=0.9\linewidth]{images/whitebox} 212 | % \caption{White-box approach} 213 | % %\label{fig:sub2} 214 | % \end{subfigure} 215 | % %\label{fig:example-symbolic-execution} 216 | % \vspace{-3mm} 217 | %\end{figure} 218 | % 219 | %\paragraph{Taken from old Overview} 220 | % 221 | %Symbolic execution has been originally introduced in~\cite{K-CACM76} and~\cite{H-TSE77}. A good introduction to symbolic execution is presented in~\cite{KLEE-OSDI08}.\mynote{Extend this paragraph} 222 | %%(while~\cite{EXE-CCS06} is a previous effort of the same authors). 223 | %\cite{SAGE-NDSS08} is one successful story of symbolic execution. 
\cite{SAB-SP10} presents a neat formalization of symbolic execution and of taint analysis as well. 224 | % 225 | -------------------------------------------------------------------------------- /main.tex: -------------------------------------------------------------------------------- 1 | % v2-acmsmall-sample.tex, dated March 6 2012 2 | % This is a sample file for ACM small trim journals 3 | % 4 | % Compilation using 'acmsmall.cls' - version 1.3 (March 2012), Aptara Inc. 5 | % (c) 2010 Association for Computing Machinery (ACM) 6 | % 7 | % Questions/Suggestions/Feedback should be addressed to => "acmtexsupport@aptaracorp.com". 8 | % Users can also go through the FAQs available on the journal's submission webpage. 9 | % 10 | % Steps to compile: latex, bibtex, latex latex 11 | % 12 | % For tracking purposes => this is v1.3 - March 2012 13 | 14 | \documentclass[prodmode,acmcsur]{acmsmall} % Aptara syntax 15 | 16 | % Package to generate and customize Algorithm as per ACM style 17 | \usepackage[ruled]{algorithm2e} 18 | \renewcommand{\algorithmcfname}{ALGORITHM} 19 | \SetAlFnt{\small} 20 | \SetAlCapFnt{\small} 21 | \SetAlCapNameFnt{\small} 22 | \SetAlCapHSkip{0pt} 23 | \IncMargin{-\parindent} 24 | 25 | % Metadata Information 26 | \acmVolume{0} 27 | \acmNumber{0} 28 | \acmArticle{0} 29 | \acmYear{0000} 30 | \acmMonth{0} 31 | 32 | % Copyright 33 | %\setcopyright{acmcopyright} 34 | %\setcopyright{acmlicensed} 35 | %\setcopyright{rightsretained} 36 | %\setcopyright{usgov} 37 | %\setcopyright{usgovmixed} 38 | %\setcopyright{cagov} 39 | %\setcopyright{cagovmixed} 40 | 41 | \input{common} 42 | 43 | % DOI 44 | \doi{0000001.0000001} 45 | 46 | %ISSN 47 | \issn{1234-56789} 48 | 49 | % Document starts 50 | \begin{document} 51 | 52 | % Page heads 53 | \markboth{R. Baldoni, E. Coppa, D. C. D'Elia, C. Demetrescu, and I. 
Finocchi}{A Survey of Symbolic Execution Techniques} 54 | 55 | % Title portion 56 | \title{A Survey of Symbolic Execution Techniques\\} 57 | \author{ROBERTO BALDONI 58 | \affil{\href{http://www.cis.uniroma1.it/}{Cyber Intelligence and Information Security Research Center}, Sapienza} 59 | EMILIO COPPA 60 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome} 61 | DANIELE CONO D'ELIA 62 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome} 63 | CAMIL DEMETRESCU 64 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome} 65 | IRENE FINOCCHI 66 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome} 67 | } 68 | % NOTE! Affiliations placed here should be for the institution where the 69 | % BULK of the research was done. If the author has gone to a new 70 | % institution, before publication, the (above) affiliation should NOT be changed. 71 | % The authors 'current' address may be given in the "Author's addresses:" block (below). 72 | % So for example, Mr. Abdelzaher, the bulk of the research was done at UIUC, and he is 73 | % currently affiliated with NASA. 74 | 75 | \begin{abstract} 76 | Many security and software testing applications require checking whether certain properties of a program hold for any possible usage scenario. For instance, a tool for identifying software vulnerabilities may need to rule out the existence of any backdoor to bypass a program's authentication. One approach would be to test the program using different, possibly random inputs. As the backdoor may only be hit for very specific program workloads, automated exploration of the space of possible inputs is of the essence. Symbolic execution provides an elegant solution to the problem, by systematically exploring many possible execution paths at the same time without necessarily requiring concrete inputs. 
Rather than taking on fully specified input values, the technique abstractly represents them as symbols, resorting to constraint solvers to construct actual instances that would cause property violations. Symbolic execution has been incubated in dozens of tools developed over the last four decades, leading to major practical breakthroughs in a number of prominent software reliability applications. The goal of this survey is to provide an overview of the main ideas, challenges, and solutions developed in the area, distilling them for a broad audience. 77 | \end{abstract} 78 | 79 | %\begin{comment} 80 | \begin{CCSXML} % http://dl.acm.org/ccs.cfm 81 | 82 | 83 | 10011007.10010940.10010992.10010998.10010999 84 | Software and its engineering~Software verification 85 | 500 86 | 87 | 88 | 10011007.10010940.10010992.10010998.10011001 89 | Software and its engineering~Dynamic analysis 90 | 300 91 | 92 | 93 | 10011007.10011074.10011099.10011102.10011103 94 | Software and its engineering~Software testing and debugging 95 | 300 96 | 97 | 98 | 10002978.10003022 99 | Security and privacy~Software and application security 100 | 100 101 | 102 | 103 | \end{CCSXML} 104 | 105 | \ccsdesc[500]{Software and its engineering~Software verification} 106 | %\ccsdesc[300]{Software and its engineering~Dynamic analysis} 107 | \ccsdesc[300]{Software and its engineering~Software testing and debugging} 108 | \ccsdesc[100]{Security and privacy~Software and application security} 109 | %\end{comment} 110 | 111 | % We no longer use \terms command 112 | %\terms{Design, Algorithms, Performance} 113 | 114 | \keywords{Symbolic execution, static analysis, concolic execution, software testing} 115 | 116 | \acmformat{Roberto Baldoni, Emilio Coppa, Daniele Cono D'Elia, Camil Demetrescu, 117 | and Irene Finocchi, 2016. A survey of symbolic execution techniques.} 118 | % At a minimum you need to supply the author names, year and a title. 
119 | % IMPORTANT: 120 | % Full first names whenever they are known, surname last, followed by a period. 121 | % In the case of two authors, 'and' is placed between them. 122 | % In the case of three or more authors, the serial comma is used, that is, all author names 123 | % except the last one but including the penultimate author's name are followed by a comma, 124 | % and then 'and' is placed before the final author's name. 125 | % If only first and middle initials are known, then each initial 126 | % is followed by a period and they are separated by a space. 127 | % The remaining information (journal title, volume, article number, date, etc.) is 'auto-generated'. 128 | 129 | \begin{bottomstuff} 130 | %This work is supported by the National Science Foundation, under grant CNS-0435060, grant CCR-0325197 and grant EN-CS-0329609. 131 | 132 | Author's addresses: R. Baldoni, E. Coppa, D.C. D'Elia, and C. Demetrescu, Department of Computer, Control, and Management Engineering, Sapienza University of Rome; I. Finocchi, Department of Computer Science, Sapienza University of Rome. 133 | This work is supported in part by a grant of the Italian Presidency of the Council of Ministers and by the CINI National Laboratory of Cyber Security. 
% (Consorzio Interuniversitario Nazionale Informatica) 134 | \end{bottomstuff} 135 | 136 | \maketitle 137 | 138 | \input{intro} 139 | \myinput{executors} 140 | \myinput{memory} 141 | \myinput{environment} 142 | \myinput{explosion} 143 | \myinput{constraints} 144 | \input{hang} 145 | \input{conclusions} 146 | 147 | % Bibliography 148 | %\bibliographystyle{abstract} 149 | \bibliographystyle{ACM-Reference-Format-Journals} 150 | \bibliography{symbolic} 151 | 152 | % History dates 153 | %\received{--- 2016}{--- XXXX}{---- XXXX} 154 | 155 | \end{document} 156 | 157 | % End of v2-acmsmall-sample.tex (March 2012) - Gerry Murray, ACM 158 | 159 | 160 | -------------------------------------------------------------------------------- /memory.tex: -------------------------------------------------------------------------------- 1 | % !TEX root = main.tex 2 | 3 | 4 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 5 | \section{Memory model} 6 | \label{memory-model} 7 | 8 | Our warm-up example of Section~\ref{symbolic-execution-example} presented a simplified memory model where data are stored in scalar variables only, with no indirection. A crucial aspect of symbolic execution is how memory should be modeled to support programs with pointers and arrays. This requires extending our notion of memory store by mapping not only variables, but also memory addresses to symbolic expressions or concrete values. In general, a store $\sigma$ that explicitly models memory addresses can be thought of as a mapping that associates memory addresses (indexes) with either expressions over concrete values or symbolic values. We can still support variables by using their address rather than their name in the mapping. In the following, when we write $x\mapsto e$ for a variable $x$ and an expression $e$ we mean $\&x\mapsto e$, where $\&x$ is the concrete address of variable $x$. Also, if $v$ is an array and $c$ is an integer constant, by $v[c]\mapsto e$ we mean $\&v+c\mapsto e$. 
9 | 10 | %A memory model is an important design choice for a symbolic engine, as it can have a significant influence on the coverage achieved by symbolic execution, as well as on the scalability of constraint solving~\cite{CS-CACM13}. 11 | \mynote{[D] shorter}A memory model is an important design choice for a symbolic engine, as it can significantly affect the coverage achieved by the exploration and the scalability of constraint solving~\cite{CS-CACM13}. 12 | % 13 | The {\em symbolic memory address} problem~\cite{SAB-SP10} arises when the address referenced in the operation is a symbolic expression. In the remainder of this section, we discuss a number of popular solutions. 14 | 15 | \subsection{Fully Symbolic Memory} 16 | \label{ss:fully-symbolic-memory} 17 | 18 | \begin{figure}[t] 19 | \vspace{-1mm} 20 | \begin{center} 21 | \begin{tabular}{c} 22 | \begin{lstlisting}[basicstyle=\ttfamily\scriptsize] 23 | 1. void foobar(unsigned i, unsigned j) { 24 | 2. int a[2] = { 0 }; 25 | 3. if (i>1 || j>1) return; 26 | 4. a[i] = 5; 27 | 5. assert(a[j] != 5); 28 | 6. } 29 | \end{lstlisting} 30 | \end{tabular} 31 | \end{center} 32 | \vspace{-2.5mm} 33 | \caption{Memory modeling example: which values of \texttt{i} and \texttt{j} make the \texttt{assert} fail?} 34 | \label{fi:example-mem} 35 | \end{figure} 36 | 37 | \begin{figure}[t] 38 | \vspace{-3mm} 39 | \includegraphics[width=1\columnwidth]{images/memory-fork} 40 | \vspace{-4.5mm} 41 | \caption{Fully symbolic memory via state forking for the example of Figure~\ref{fi:example-mem}.} 42 | \label{fi:memory-fork} 43 | \vspace{-0.5mm} 44 | \end{figure} 45 | 46 | At the highest level of generality, an engine may treat memory addresses as fully symbolic. This is the approach taken by a number of works (e.g., {\sc BitBlaze}~\cite{BITBLAZE-ICISS08},~\cite{TLL-CAV10}, {\sc BAP}~\cite{BAP-CAV11}, and~\cite{TS-ATVA14}). 
Two fundamental approaches, pioneered by King in a seminal paper~\cite{K-CACM76}, are the following: 47 | 48 | \begin{itemize} 49 | 50 | \item {\em State forking.} If an operation reads from or writes to a symbolic address, the state is forked by considering all possible states that may result from the operation. The path constraints are updated accordingly for each forked state. 51 | \boxedexample{Consider the code shown in Figure~\ref{fi:example-mem}. The write operation at line 4 affects either $a[0]$ or $a[1]$, depending on the unknown value of array index $i$. State forking creates two states after executing the memory assignment to explicitly consider both possible scenarios (Figure~\ref{fi:memory-fork}). The path constraints for the forked states encode the assumption made on the value of $i$. Similarly, the memory read operation \texttt{a[j]} at line 5 may access either $a[0]$ or $a[1]$, depending on the unknown value of array index $j$. Therefore, for each of the two possible outcomes of the assignment \texttt{a[i]=5}, there are two possible outcomes of the \texttt{assert}, which are explicitly explored by forking the corresponding states. } 52 | 53 | \begin{figure}[t] 54 | \begin{center} 55 | \includegraphics[width=0.7\columnwidth]{images/memory-ite} 56 | \end{center} 57 | \vspace{-3mm} 58 | \caption{Fully symbolic memory via if-then-else formulas for the example of Figure~\ref{fi:example-mem}.} 59 | %\vspace{-1mm} % TODO 60 | \label{fi:memory-ite} 61 | \vspace{-1.5mm} 62 | \end{figure} 63 | 64 | % otherwise\footnote{In propositional logic, the $ite(\texttt{c}, \texttt{t}, \texttt{f})$ expression could be replaced with the formula $(\texttt{c} \wedge \texttt{t}) \vee (\neg\texttt{c} \wedge \texttt{f})$.}. 65 | \item {\em if-then-else formulas.} An alternative approach consists in encoding the uncertainty on the possible values of a symbolic pointer into the expressions kept in the symbolic store and in the path constraints, without forking any new states. 
The key idea is to exploit the capability of some solvers to reason on formulas that contain if-then-else expressions of the form $ite(\texttt{c}, \texttt{t}, \texttt{f})$, which yields \texttt{t} if \texttt{c} is true, and \texttt{f} otherwise. 66 | The approach works differently for memory read and write operations. Let $\alpha$ be a symbolic address that may assume the concrete values $a_1, a_2, \ldots$: 67 | \begin{itemize} 68 | \item reading from $\alpha$ yields the expression $ite(\alpha=a_1,\sigma(a_1), ite(\alpha=a_2,\sigma(a_2), \ldots))$; 69 | \item writing an expression $e$ at $\alpha$ updates the symbolic store for each $a_1, a_2, \ldots$ as $\sigma(a_i)\gets ite(\alpha=a_i,e,\sigma(a_i))$. 70 | \end{itemize} 71 | Notice that in both cases, a memory operation introduces in the store as many $ite$ expressions as the number of possible values the accessed symbolic address may assume. The $ite$ approach to symbolic memory is used, e.g., in {\sc Angr}~\cite{ANGR-SSP16} (Section~\ref{ss:index-based-memory}). 72 | \boxedexample{Consider again the example shown in Figure~\ref{fi:example-mem}. Rather than forking the state after the operation \texttt{a[i]=5} at line 4, the if-then-else approach updates the memory store by encoding both possible outcomes of the assignment, i.e., $a[0]\mapsto ite(\alpha_i=0,5,0)$ and $a[1]\mapsto ite(\alpha_i=1,5,0)$ (Figure~\ref{fi:memory-ite}). Similarly, rather than creating a new state for each possible distinct address of \texttt{a[j]} at line 5, the uncertainty on $j$ is encoded in the single expression $ite(\alpha_j=0,\sigma(a[0]),\sigma(a[1]))=ite(\alpha_j=0,ite(\alpha_i=0,5,0),ite(\alpha_i=1,5,0))$. 73 | %: if $\alpha_i=0$ then $a[0]\mapsto 5$ and $a[1]\mapsto 0$; conversely, if $\alpha_i=1$ then $a[0]\mapsto 0$ and $a[1]\mapsto 5$. 74 | %State forking creates two states after executing the memory assigment to explicitly consider both possible scenarios (Figure~\ref{fi:memory-fork}). 
The path constraints for the forked states encode the assumption made on the value of $i$. Similarly, the memory read operation \texttt{a[j]} at line 5 may access either $a[0]$ or $a[1]$, depending on the unknown value of array index $j$. Therefore, for each of the two possible outcomes of the assignment \texttt{a[i]=5}, there are two possible outcomes of the \texttt{assert}, which are explicitly explored by forking the corresponding states. 75 | } 76 | 77 | %Indeed, the $ite(\texttt{c}, \texttt{t}, \texttt{f})$ expression introduced in the symbolic store $\sigma$ is a short term for an {\tt if-then-else} expression and means that if the condition {\tt c} is verified then {\tt t} holds, otherwise {\tt f} must be assumed as true. Nonetheless, $ite$ expressions are often just syntactic sugar for disjunctive formulas and are commonly supported by most prominent constraint solvers. For instance, in the context of propositional logic the $ite(\texttt{c}, \texttt{t}, \texttt{f})$ expression could be replaced with the formula $(\texttt{c} \wedge \texttt{t}) \vee (\neg\texttt{c} \wedge \texttt{f})$ . 78 | 79 | \end{itemize} 80 | 81 | %\noindent To model fully symbolic pointers, an extensive line of research (e.g., {\sc EXE}~\cite{EXE-CCS06}, {\sc KLEE}~\cite{KLEE-OSDI08}, {\sc SAGE}~\cite{EGL-ISSTA09}) leverages the expressive power of SMT solvers to model array operations as first-class entities in constraint formulas using a {\em theory of arrays} in the decision procedure~\cite{STP-CAV07}. 82 | 83 | %\noindent % TODO trick if you need one more line 84 | An extensive line of research (e.g., {\sc EXE}~\cite{EXE-CCS06}, {\sc KLEE}~\cite{KLEE-OSDI08}, {\sc SAGE}~\cite{EGL-ISSTA09}) leverages the expressive power of some SMT solvers to model fully symbolic pointers. Using a {\em theory of arrays}~\cite{STP-CAV07}, array operations can in fact be expressed as first-class entities in constraint formulas. 
85 | 86 | Due to its generality, fully symbolic memory supports the most accurate description of the memory behavior of a program, accounting for all possible memory manipulations. In many practical scenarios, the set of possible addresses a memory operation may reference is small~\cite{BITBLAZE-ICISS08} as in the example shown in Figure~\ref{fi:example-mem} where indexes $i$ and $j$ range in a bounded interval, allowing accurate analyses using a reasonable amount of resources. In general, however, a symbolic address may reference any cell in memory, leading to an intractable explosion in the number of possible states. For this reason, a number of techniques have been designed to improve scalability, which elaborate along the following main lines: 87 | 88 | \begin{itemize} 89 | \item {\em Representing memory in a compact form.} This approach was taken in~\cite{MEMSIGHT-ASE17}, which maps symbolic -- rather than concrete -- address expressions to data, representing the possible alternative states resulting from referencing memory using symbolic addresses in a compact, implicit form. Queries are offloaded to efficient paged interval tree implementations to determine which stored data are possibly referenced by a memory read operation. 90 | 91 | \item {\em Trading soundness for performance.} The idea, discussed in the remainder of this section, consists in corseting symbolic exploration to a subset of the execution states by replacing symbolic pointers with concrete addresses. 92 | 93 | \item {\em Heap modeling.} An additional idea is to corset the exploration to states where pointers are restricted to be either null, or point to previously heap-allocated objects, rather than to any generic memory location (Section~\ref{ss:address-concretization} and Section~\ref{ss:complex-objects}). 94 | \end{itemize} 95 | 96 | %When obtained ranges are too large, {\sc BitBlaze}~\cite{BITBLAZE-ICISS08} adds a further constraint to the system to limit its size. 
However, the authors observe that most symbolic memory accesses are typically already constrained to small ranges in practice, making it unnecessary. 97 | 98 | %\vspace{-2pt} % TODO 99 | \subsection{Address Concretization} 100 | \label{ss:address-concretization} 101 | 102 | In all cases where the combinatorial complexity of the analysis explodes as pointer values cannot be bounded to sufficiently small ranges, {\em address concretization}, which consists in concretizing a pointer to a single specific address, is a popular alternative. This can reduce the number of states and the complexity of the formulas fed to the solver and thus improve running time, although it may cause the engine to miss paths that, for instance, depend on specific values for some pointers. 103 | 104 | 105 | %Systems such as {\sc CUTE}~\cite{CUTE-FSE05} and {\sc CREST}~\cite{CREST-ASE08} are capable of reasoning only about equality constraints for pointers, as they can be solved efficiently, and resort to concretization for general symbolic references. % equality and inequality 106 | 107 | 108 | 109 | %\mynote{DART is mentioned in CS-CACM13 as using theories of arrays} --> added to the list above. 110 | Concretization naturally arises in offline executors (Section~\ref{ss:principles}). Prominent examples are {\sc DART}~\cite{DART-PLDI05} and {\sc CUTE}~\cite{CUTE-FSE05}, 111 | %and early {\sc SAGE} releases~\cite{SAGE-NDSS08}. % that concretely execute one path at a time while collecting path constraints along executed paths. %\mynote{[D] was: equality and inequality} 112 | which handle memory initialization by concretizing a reference of type {\tt T*} either to {\tt NULL}, or to the address of a newly allocated object of {\tt sizeof(T)} bytes. DART makes the choice randomly, while CUTE first tries {\tt NULL}, and then, in a subsequent execution, a concrete address. If {\tt T} is a structure, the same concretization approach is recursively applied to all fields of a pointed object. 
Since memory addresses (e.g., returned by {\tt malloc}) may non-deterministically change at different concrete executions, CUTE uses {\em logical addresses} in symbolic formulas to maintain consistency across different runs. 113 | Another reason for concretization is due to efficiency in constraint solving: for instance, CUTE reasons only about pointer equality constraints using an equivalence graph, resorting to concretization for more general constraints that would need costly SMT theories. 114 | %Another reason for concretization is due to limitations in constraint handling: for instance, CUTE is capable of reasoning only about equality constraints for pointers, as they can be solved efficiently, and resort to concretization for general symbolic references. 115 | 116 | %we normally get or set a concrete value at a particular memory address. When executing symbolically, a design choice for a symbolic engine concerns what to do when a memory reference is an expression instead of a concrete address. 117 | 118 | %\subsection{Theory of Arrays} 119 | %\label{ss:theory-arrays} 120 | 121 | %A number of works (e.g., {\sc EXE}~\cite{EXE-CCS06}, {\sc KLEE}~\cite{KLEE-OSDI08}, and {\sc SAGE}~\cite{SAGE-NDSS08}) model pointers using the theory of arrays available from SMT decision procedures. 122 | 123 | %In this section we provide a description of its implementation in the popular STP solver~\cite{STP-CAV07}. 124 | 125 | %The design of STP has been mainly driven by the demands of research projects on software analysis. Its input language supports one-dimensional arrays that are indexed by bitvectors and contain bitvectors. Given an array $A$, a $read(A,i)$ operation returns the value $A[i]$ at the location expressed by the index $i$, while a $write(A,i,v)$ returns a new array with the same values as $A$ at all indexes except $i$, where it contains the value $v$. 
Array reads and writes typically appear as subexpressions of an $ite(c,a,b)$ expression, which is syntactic sugar for $(if\,c\;then\,a\;else\,b)$. 126 | 127 | %STP reduces formulas over arrays to an equisatisfiable form that contains no $read$ or $write$ operations by applying three standard transformations and introducing fresh bitvector variables. Generated formulas are then amenable to SAT solving. However, transformations can also introduce bottlenecks, for instance by destroying sharing of subterms, and thus are typically procrastinated using refinement algorithms. STP also attempts to eliminate variables through linear solving~\cite{STP-CAV07}. 128 | 129 | %\vspace{-2pt} % TODO 130 | \subsection{Partial Memory Modeling} 131 | \label{ss:index-based-memory} 132 | 133 | To mitigate the scalability problems of fully symbolic memory and the loss of soundness of memory concretization, 134 | %Motivated by the observation that concretizing all memory indexes might not work well in some scenarios, while fully symbolic memory does not scale, 135 | {\sc Mayhem}~\cite{MAYHEM-SP12} explores a middle point in the spectrum by introducing a {\em partial} memory model. The key idea is that written addresses are always concretized and read addresses are modeled symbolically if the contiguous interval of possible values they may assume is small enough. This model is based on a trade-off: it uses more expressive formulas than concretization, since it encodes multiple pointer values per state, but does not attempt to encode all of them as in fully symbolic memory~\cite{MAYHEM-THESIS}. A basic approach to bound the set of possible values that an address may assume consists in trying different concrete values and checking whether they satisfy the current path constraints, excluding large portions of the address space at each trial until a tight range is found. 
136 | %This choice is important to keep the analysis feasible: for instance, in a fully symbolic model a repeated read and write on the same symbolic index would result in quadratic increase in either the symbolic constraints or the complexity of the stored symbolic expressions~\cite{DRILLER-NDSS16}. 137 | %Global memory is defined as a map $\mu$ from 32-bit addresses ({\em indexes}) to expressions. When a symbolic index $i$ is used to read memory, the algorithm generates a memory object $M$ containing the projection of $\mu$ over all the valid values that $i$ can assume. The evaluation of a $load(\mu,i)$ operation is thus reduced to $M[i]$, where $M$ is typically orders of magnitude smaller than the entire memory $\mu$. 138 | %Instantiating a memory object still requires finding all the possible values for a symbolic index. A naive algorithm would employ the constraint solver to refine the range of an index using binary search under the current path constraints. 139 | This algorithm comes with a number of caveats: for instance, querying the solver on each symbolic dereference is expensive, the memory range may not be continuous, and the values within the memory region of a symbolic pointer might have structure. {\sc Mayhem} thus performs a number of optimizations such as {\em value-set analysis}~\cite{VSA-CC04} and forms of query caching (Section~\ref{se:constraint-solving}) to refine ranges efficiently. If at the end of the process the range size exceeds a given threshold (e.g., 1024), the address is concretized. {\sc Angr}~\cite{ANGR-SSP16} also adopts the partial memory model idea and extends it by optionally supporting write operations on symbolic pointers that range within small contiguous intervals (up to 128 addresses). 
% [D] ptr may also be redirected to symbolic data 140 | 141 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 142 | %\subsection{Complex Objects} 143 | % 144 | 145 | \subsection{Lazy Initialization} 146 | \label{ss:complex-objects} 147 | 148 | \cite{KPV-TACAS03} \revedit{proposes} symbolic execution techniques for advanced object-oriented language constructs, such as those offered by C++ and Java. The authors describe a framework for software verification that combines symbolic execution and model checking to handle linked data structures such as lists and trees. % [D] added dynamically allocated & discarded primitive data types, and concurrency. 149 | 150 | In particular, they generalize symbolic execution by introducing {\em lazy initialization} to effectively handle dynamically allocated objects. Compared to our warm-up example from Section~\ref{symbolic-execution-example}, the state representation is extended with a {\em heap configuration} used to maintain such objects. Symbolic execution of a method taking complex objects as inputs starts with uninitialized fields, and assigns values to them in a lazy fashion, i.e., they are initialized when first accessed during execution. 151 | 152 | When an uninitialized reference field is accessed, the algorithm forks the current state with three different heap configurations, in which the field is initialized with: (1) {\tt null}, (2) a reference to a new object with all symbolic attributes, and (3) a previously introduced concrete object of the desired type, respectively. \iffullver{This on-demand concretization enables symbolic execution of methods without the need for any previous knowledge on the number of objects given as input. 
Also, forking the state as in (3) results in a systematic treatment of aliasing, i.e., when an object can be accessed through multiple references.}{} 153 | 154 | \cite{KPV-TACAS03,SPF-ISSTA04} combine lazy initialization with user-provided {\em method preconditions}, i.e., conditions that are assumed to be true before the execution of a method. Preconditions are used to characterize those program input states in which the method is expected to behave as intended by the programmer. For instance, we expect a binary tree data structure to be acyclic and with every node -- except for the root -- having exactly one parent. Conservative preconditions are used to ensure that incorrect heap configurations are eliminated during initialization, speeding up the symbolic execution process. %\mytempedit{To better illustrate this technique, we now discuss an example in which lazy initialization is used to handle a {\tt struct} data type.} 155 | 156 | \begin{figure*}[t] 157 | %\vspace{-3mm} 158 | \centering 159 | \includegraphics[width=0.875\columnwidth]{images/lazy-initialization} % TODO was 0.9 160 | \vspace{-0.75mm} 161 | \caption{Example of lazy initialization} 162 | \label{fig:example-lazy-initialization} 163 | %\vspace{-3mm} 164 | \end{figure*} 165 | 166 | \boxedexample{ 167 | % For the sake of simplicity, we assume that fragment C does not actually evaluate {\tt l->next}, but leaves this task to fragment A. When expanding the [...] 168 | %the value of 169 | Figure~\ref{fig:example-lazy-initialization} shows a recursive Java method {\tt add}, which appends a node of type {\tt Node} to a linked list, and a minimal representation of its symbolic execution when applying lazy initialization. The tree nodes represent executions of straight-line fragments of {\tt add}. Initially, fragment A evaluates reference {\tt l}, which is symbolic and thus uninitialized. 
The symbolic engine considers three options: (1) {\tt l} is {\tt null}, (2) {\tt l} points to a new object, and (3) {\tt l} points to a previously allocated object. Since this is the first time that a reference of type {\tt Node} is met, option (3) is ruled out. The two remaining options are then expanded, executing the involved fragments. While the first path ends after executing fragment B, the second one implicitly creates a new object {\tt o$_\texttt{1}$} due to lazy initialization and then executes C, recursively invoking {\tt add}. When expanding the recursive call, fragment A is executed and the three options are again considered by the engine, which forks into three distinct paths. Option (3) is now taken into account since a {\tt Node} object has been previously allocated (i.e., {\tt o$_\texttt{1}$}). However, this path is soon aborted by the engine since it violates the acyclicity precondition (expressed as a comment in this example). The other forked paths are further expanded, repeating the same process. Since the linked list has an unknown maximum length, the exploration can proceed indefinitely. For this reason, it is common to assume an upper bound on the depth of the materialization (i.e., field instantiation) chain. 170 | } 171 | 172 | % \boxedexample{Consider the C function {\tt add} shown in Figure~\ref{fig:example-lazy-initialization}. This recursive function appends a node of type {\tt node\_t} to the tail of a linked list. A compact representation of the symbolic tree for {\tt add} when applying lazy initialization is given in Figure~\ref{fig:example-lazy-initialization}. Tree nodes A, B, C, and D represent execution of straight-line fragments of code in {\tt add}. Initially, fragment A evaluates the value of the pointer {\tt l}, which is symbolic and thus uninitialized. 
The symbolic engine considers the three possible options: (1) {\tt l} is {\tt NULL}, (2) {\tt l} points to a new object of type {\tt node\_t}, and (3) {\tt l} points to a previously allocated object. Since this is the first time that a pointer of type {\tt node\_t} is met, option (3) is not considered. The two remaining options are then expanded, executing the required fragments. While the first path ends after executing fragment B, the second path implicitly creates a new object {\tt o$_\texttt{1}$} due to lazy initialization and then executes C, recursively invoking the {\tt add} function. For the sake of simplicity, we assume that fragment C does not actually evaluate {\tt l->next}, but leaves this task to fragment A. Expanding the recursive call, fragment A is executed and the three options are again considered by the engine, forking into three distinct paths. In this case, option (3) is taken into account since an object of type {\tt node\_t} has been previously allocated (i.e., {\tt o$_\texttt{1}$}). However, this forked path is soon aborted by the engine since it violates the acyclic precondition (which is simply expressed as a comment in this example). The other forked paths are further expanded, repeating the same process. Since the linked list has an unknown maximum length, the exploration can proceed indefinitely. For this reason, it is common to assume an upper bound on the depth of the materialization chain.} 173 | 174 | Recent advances in the area have focused on improving efficiency in generating heap configurations. For instance, in~\cite{DLR-ASE12} the concretization of a reference variable is deferred until the object is actually accessed. The work also provides a formalization of lazy initialization. 
\cite{BLISS-TSE15} instead employs bound refinement to prune uninteresting heap configurations by using information from already concretized fields, while a SAT solver is used to check whether declarative -- rather than imperative as in the original algorithm -- preconditions hold for a given configuration. 175 | %For instance, in~\cite{DLR-ASE12} the concretization of a reference variable is deferred until the object is actually accessed. The work also provides a formalization of lazy initialization. \cite{BLISS-TSE15} instead employs bound refinement to prune uninteresting heap configurations by using information from already concretized fields, while a SAT solver is used to check whether declarative -- rather than imperative as in the original algorithm -- preconditions hold for a given configuration. 176 | %Further refinements to lazy initialization are described in a number of works, e.g.,~\cite{DLR-ASE12,BLI-NFM13,BLISS-TSE15}. \cite{DLR-ASE12} besides providing a formalization of this technique, extends lazy initialization by adding support for subtypes and by deferring even further concretization when possible (e.g., a check for nullity does not always imply immediate materialization for an object). \cite{BLI-NFM13} presents {\em bounded lazy initialization} (BLI), which exploits {\em tight field bounds}~\cite{GRP-ISSTA10} to prune unfeasible heap configurations. BLISS~\cite{BLISS-TSE15} extends BLI by integrating two techniques: {\em bound refinement} and {\em satisfiability checks}. The former prunes uninteresting heap configurations by leveraging information from already-concretized fields, while the latter queries a SAT solver to check declarative preconditions, discarding unrealistic heap configurations. 177 | 178 | 179 | %, which all share the goal of reducing the number of heap configurations to generate when forking the state. 
extends lazy initialization by handling subtypes and by making the approach even more lazier, provides a formal treatment of lazy initialization in Java. 180 | 181 | \iffullver{ 182 | \myparagraph{Verifying Client Code Only} 183 | Of a different flavor is the technique presented in~\cite{SHZ-TAIC07} for symbolic execution over objects instantiated from commonly used libraries. The authors argue that performing symbolic execution at the representation level might be redundant if the aim is to only check the client code, thus trusting the correctness of the library implementation. They discuss the idea of symbolically executing methods of the Java {\tt String} class using a finite-state automaton that abstracts away the implementation details. They present a case study of an application that dynamically generates SQL queries: symbolic execution is used to check whether the statements conform to the SQL grammar and possibly match injection patterns. \iffullver{The authors mention that their approach might be used to symbolically execute over standard container classes such as trees or maps. It is worth mentioning that symbolic execution is used to detect SQL injection vulnerabilities also in~\cite{FLP-COMPSAC07}.}{The authors mention that their approach might be used to symbolically execute over standard container classes such as trees or maps.} 184 | }{} 185 | 186 | %% citations for SL tools omitted 187 | % Several tools based on SL are available to date for automatically finding memory bugs in user~\cite{INFER} and system-level code~\cite{SLAYER-CAV11}, and for verifying annotated programs with respect to, e.g., memory safety properties~\cite{VERIFAST-APLAS10} and design patterns~\cite{JSTAR-OOPSLA08}. 
While tailor-made theorem provers are implemented in many extant tools, recent works~\cite{BPS-ENTCS09,PWZ-CAV13} 188 | 189 | % While some of them implement tailor-made theorem provers, it has been shown~\cite{BPS-ENTCS09,PWZ-CAV13} that provers for decidable fragments of SL can be integrated in an SMT solver, allowing for complete combinations with other theories relevant for program verification. This paves the way for interesting applications of SL in general-purpose verification tools. In particular, symbolic executors could use it to reason inductively over manipulations of data structures such as lists and trees in C and Java programs. To the best of our knowledge, while symbolic execution is at the core of SL, there have not been applications of SL in symbolic executors yet. We believe this might represent a promising research direction to follow. 190 | 191 | 192 | % Additional optimizations are presented in~\cite{DLR-ASE12}, which also provides a complete formalization of this approach for the Java language. 193 | 194 | % [D] this is related to input test generation 195 | %Also, generated heap configurations are pairwise non-isomorphic: eliminating symmetric structures can greatly reduce the number of heaps that a symbolic executor must explore, while guaranteeing that no relevant states are missed~\cite{BLISS-TSE15}. 196 | 197 | %~\cite{KPV-TACAS03,SPF-ISSTA04} combine lazy initialization with user-provided {\em method preconditions}, i.e., conditions which are assumed to be true before the execution of a method. Such conditions are used to characterize those input states in which the method is expected to behave as intended by the programmer. For instance, we expect a binary tree data structure to be acyclic and with every node - except for the root - having exactly one parent. Conservative method preconditions are used to ensure that incorrect structures are eliminated during initialization, speeding the symbolic execution process up. 
198 | 199 | %Further refinements to lazy initialization are described in a number of works. \cite{BLI-NFM13} introduces {\em bounded lazy initialization} (BLI) to reduce the number of alternatives to explore using available field bounds expressed in TACO, a tool for SAT-based bounded verification of JML-annotated Java code. ~\cite{BLISS-TSE15} presents two novel techniques that build upon BLI. The first technique refines field bounds by leveraging information from already-concretized fields; the technique is then extended by auxiliary satisfiability checks to determine the feasibility of partially symbolic structure. 200 | -------------------------------------------------------------------------------- /misc/glossary.tex: -------------------------------------------------------------------------------- 1 | % !TEX root = main.tex 2 | 3 | \iffalse 4 | \section{Glossary} 5 | \label{se:glossary} 6 | 7 | \noindent {\bf Complete analysis.} Analysis that guarantees no false positives, i.e., all reported property violations are true. 8 | 9 | \smallskip\noindent {\bf Concrete execution.} An execution of a program using concrete inputs in a real-world environment. 10 | 11 | \smallskip\noindent {\bf Concolic execution.} \ldots 12 | 13 | \smallskip\noindent {\bf Control flow graph (CFG).} Representation of a program that uses nodes to model instructions and edges to model the control flow between them. 14 | 15 | \smallskip\noindent {\bf Control flow path.} Path in the control flow graph of a program. Represents the sequence of instructions executed by the program for a given concrete input. 16 | 17 | \smallskip\noindent {\bf Decidable analysis} \ldots 18 | 19 | \smallskip\noindent {\bf Model checker.} Given a model of a system, a model checker exhaustively and automatically checks whether the model meets a given specification. 
 20 | 21 | \smallskip\noindent {\bf Path constraints.} \ldots 22 | 23 | \smallskip\noindent {\bf SMT solver.} A Satisfiability Modulo Theories (SMT) instance is a formula in first-order logic, where some function and predicate symbols have additional interpretations, and SMT is the problem of determining whether such a formula is satisfiable. An SMT solver is a tool able to reason over SMT formulas. 24 | 25 | \smallskip\noindent {\bf Sound analysis.} Analysis that guarantees no false negatives, i.e., if there is a property violation, then it is reported. 26 | 27 | \smallskip\noindent {\bf Symbolic execution.} \ldots 28 | 29 | \smallskip\noindent {\bf Symbolic store.} \ldots 30 | 31 | \smallskip\noindent {\bf Symbolic value.} \ldots 32 | \fi -------------------------------------------------------------------------------- /misc/loops.tex: -------------------------------------------------------------------------------- 1 | % !TEX root = main.tex 2 | 3 | \section{Loops} 4 | \label{se:loops} 5 | 6 | Loops are one of the main causes of path explosion: each iteration of a loop can be seen as an {\tt if-goto} statement, leading to a conditional branch in the execution tree. If the loop condition involves one or more symbolic values, the number of generated branches may be potentially infinite. 7 | 8 | \begin{figure}[t] 9 | \begin{center} 10 | \begin{tabular}{c} 11 | \begin{lstlisting}[basicstyle=\ttfamily\scriptsize] 12 | 1. int x = sym_input(); // e.g., read from file 13 | 2. while (x > 0) { 14 | 3. x = sym_input(); 15 | 4. 
} 16 | \end{lstlisting} 17 | \end{tabular} 18 | \end{center} 19 | \vspace{-2mm} 20 | \caption{Loop example with input read from the environment~\protect\cite{CS-CACM13}.} 21 | \label{fi:example-loop} 22 | \end{figure} 23 | 24 | \vspace{-2pt} % TODO 25 | \boxedexample{Consider the code fragment of Figure~\ref{fi:example-loop}~\cite{CS-CACM13}, where \texttt{sym\_input()} is an external routine that interacts with the environment (e.g., by reading input data from a network) and returns a fresh symbolic input. The path constraint set at any final state has the form: 26 | \[ \pi = \left ( \bigwedge_{i \in [1, k]} \alpha_i > 0 \right ) \wedge (\alpha_{k+1} \leq 0) \] 27 | where $k$ is the number of iterations and $\alpha_i$ is the symbol produced by \texttt{sym\_input()} at the $i$-th iteration.} 28 | 29 | \noindent The problem of path explosion due to symbolic execution of loops has been attacked from different sides. A first natural strategy adopted by many symbolic engines is to limit the loop exploration up to a certain number of iterations. Obviously, this may lead to missing interesting paths in the program. For this reason, some works (e.g., {\sc AEG}~\cite{AEG-NDSS11}) have also considered the opposite strategy, allowing the engine to fully explore some loops. To mitigate the path explosion problem, only a single instance of the symbolic executor is allowed to fully unroll a loop, while other instances conservatively explore other paths. This approach has been shown to be effective in some application contexts such as security (e.g., identification of buffer overflows) where interesting behavior may be observed at the loop boundaries. 30 | 31 | By using static or dynamic analysis techniques, it may be possible to derive properties over a loop that can be exploited by the symbolic engine to significantly prune branching paths. 
For instance, knowledge of the exact number of loop iterations -- or at least a constant upper bound on it -- can significantly help the engine. Section~\ref{precontioned-symbolic-execution} provides a more general discussion of how preconditions can help symbolic execution. Nevertheless, even symbolic execution can be used to derive loop invariants. Indeed, if a program contains an assertion after the loop, the approach presented in~\cite{PV-SPIN04} works backwards from the property to be checked and it iteratively applies approximation to derive loop invariants. The main idea is to pick the asserted property as the initial invariant candidate and then to exploit symbolic execution to check whether this property is inductive. If the invariant cannot be verified for some loop paths, it is replaced by a different invariant. The next candidate for the invariant is generated by exploiting the path constraints for the paths on which the verification has failed. Additional refinement steps are performed to guarantee termination. 32 | 33 | %this can be exploited by a symbolic engine for automatically discovering some invariants over the loop. In~\cite{PV-SPIN04}, this is achieved by iteratively using \mynote{[D] Define?} invariant strengthening and approximation techniques. 34 | 35 | \cite{GL-ISSTA11} presents a technique that automatically derives partial summarizations for loops. A loop summarization is similar to a function summary (Section~\ref{ss:caching}), using a set of preconditions and a set of postconditions. These are computed dynamically during the symbolic execution by reasoning on the dependencies among loop conditions and symbolic variables. As soon as a loop summary is computed, it is cached for possible subsequent reuse. 
This not only allows the symbolic engine to avoid redundant executions of the same loop under the same program state, but also makes it possible to generalize the loop summary to cover even different executions of the same loop that run under different conditions. A main limitation of this approach is that it can generate summaries only for loops that iteratively update symbolic variables across loop iterations by adding a constant, non-zero amount. 36 | 37 | \cite{SST-ATVA13} introduces a technique of a different flavor that analyzes cyclic paths in the control flow graph of a given program and produces {\em templates} that declaratively describe the program states generated by these portions of code into a symbolic execution tree. By exploiting templates, the symbolic execution engine needs to explore a significantly reduced number of program states. A drawback of this approach is that templates introduce quantifiers in the path constraints: in turn, this may significantly increase the burden on the constraint solver. 38 | 39 | % [D] I don't think mentioning trip counts adds value to the discussion, better keep things simple 40 | % By relating {\em trip counts} (i.e., number of iterations for loops) with features of the program input 41 | It has also been observed that loop executions may strictly depend on input features. {\em Loop-extended symbolic execution}~\cite{SPM-ISSTA09} is able to effectively explore a loop whenever a grammar describing the program input is available. Relating the number of iterations with features of the program input can guide the exploration of the program states generated by a loop. 
42 | -------------------------------------------------------------------------------- /overview.tex: -------------------------------------------------------------------------------- 1 | % !TEX root = main.tex 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /submissions/fifth/appendix.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/fifth/appendix.pdf -------------------------------------------------------------------------------- /submissions/fifth/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/fifth/main.pdf -------------------------------------------------------------------------------- /submissions/fifth/survey-with-appendix.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/fifth/survey-with-appendix.pdf -------------------------------------------------------------------------------- /submissions/first/cover_letter.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/first/cover_letter.docx -------------------------------------------------------------------------------- /submissions/first/cover_letter.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/first/cover_letter.pdf 
-------------------------------------------------------------------------------- /submissions/first/proof.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/first/proof.pdf -------------------------------------------------------------------------------- /submissions/first/survey.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/first/survey.pdf -------------------------------------------------------------------------------- /submissions/fourth/ACM-CSUR-Revision.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/fourth/ACM-CSUR-Revision.pdf -------------------------------------------------------------------------------- /submissions/fourth/proof.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/fourth/proof.pdf -------------------------------------------------------------------------------- /submissions/fourth/survey.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/fourth/survey.pdf -------------------------------------------------------------------------------- /submissions/second/proof.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/second/proof.pdf -------------------------------------------------------------------------------- /submissions/second/survey-similarities.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/second/survey-similarities.pdf -------------------------------------------------------------------------------- /submissions/second/survey-symbolic-exec-v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/second/survey-symbolic-exec-v1.pdf -------------------------------------------------------------------------------- /submissions/third/proof.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/third/proof.pdf -------------------------------------------------------------------------------- /submissions/third/survey.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/third/survey.pdf -------------------------------------------------------------------------------- /symbolic.bib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/symbolic.bib -------------------------------------------------------------------------------- /tables.tex: 
-------------------------------------------------------------------------------- 1 | % !TEX root = appendix.tex 2 | 3 | \section{Additional Tables} 4 | 5 | \begin{table}[b] 6 | \centering 7 | \begin{adjustbox}{width=\columnwidth} 8 | %\begin{small} 9 | \begin{tabular}{| l || c || l |} 10 | \hline 11 | {\bf Symbolic engine} & {\bf References} & {\bf Project URL} (last retrieved: December 2017) \\ \hline\hline 12 | 13 | % CNC is not a symbolic engine but it uses constrained solver 14 | %{\sc Check 'n' Crash} & \cite{CS-ICSE05} & \url{http://ranger.uta.edu/~csallner/cnc/}\\ 15 | 16 | {\sc CUTE} & \cite{CUTE-FSE05} & -- \\ 17 | {\sc DART} & \cite{DART-PLDI05} & -- \\ 18 | {\sc jCUTE} & \cite{SA-CAV06} & \url{https://github.com/osl/jcute} \\ % : Java Concolic Unit Testing Engine 19 | {\sc KLEE} & \cite{EXE-CCS06,KLEE-OSDI08} & \url{https://klee.github.io/} \\ % : a LLVM Execution Engine 20 | {\sc SAGE} & \cite{SAGE-NDSS08,EGL-ISSTA09} & -- \\ 21 | {\sc BitBlaze} & \cite{BITBLAZE-ICISS08} & \url{http://bitblaze.cs.berkeley.edu/} \\ % , BHK-TR07 22 | {\sc CREST} & \cite{CREST-ASE08} & \url{https://github.com/jburnim/crest} \\ % : a concolic test generation tool for C 23 | {\sc PEX} & \cite{PEX-TAP08} & \url{http://research.microsoft.com/en-us/projects/pex/} \\ 24 | {\sc Rubyx} & \cite{CF-CCS10} & -- \\ 25 | {\sc Java PathFinder} & \cite{PATHFINDER-ASE10} & \url{http://babelfish.arc.nasa.gov/trac/jpf}\\ 26 | {\sc Otter} & \cite{RSM-ICSE10} & \url{https://bitbucket.org/khooyp/otter/} \\ 27 | {\sc BAP} & \cite{BAP-CAV11} & \url{https://github.com/BinaryAnalysisPlatform/bap} \\ 28 | {\sc Cloud9} & \cite{CLOUD9-EUROSYS11} & \url{http://cloud9.epfl.ch/} \\ 29 | {\sc Mayhem} & \cite{MAYHEM-SP12} & -- \\ 30 | {\sc SymDroid} & \cite{JMF-TECH12} & -- \\ 31 | {\sc \stwoe} & \cite{CKC-TOCS12} & \url{http://s2e.systems/} \\ 32 | {\sc FuzzBALL} & \cite{MMP-ASPLOS12,FUZZBALL-ESORICS13} & \url{http://bitblaze.cs.berkeley.edu/fuzzball.html} \\ 33 | {\sc Jalangi} & \cite{SKB-FSE13} & 
\url{https://github.com/Samsung/jalangi2} \\ 34 | {\sc Pathgrind} & \cite{S-ICSE04} & \url{https://github.com/codelion/pathgrind} \\ 35 | {\sc Kite} & \cite{V-THESIS14} & \url{http://www.cs.ubc.ca/labs/isd/Projects/Kite} \\ 36 | {\sc SymJS} & \cite{LAG-FSE14} & -- \\ 37 | {\sc CIVL} & \cite{CIVL-SC15} & \url{http://vsl.cis.udel.edu/civl/}\\ % : The Concurrency Intermediate Verification Language 38 | {\sc KeY} & \cite{HBR-RV14} & \url{http://www.key-project.org/} \\ 39 | {\sc Angr} & \cite{FIRMALICE-NDSS15,ANGR-SSP16} & \url{http://angr.io/} \\ 40 | {\sc Triton} & \cite{TRITON-SSTIC15} & \url{http://triton.quarkslab.com/} \\ 41 | {\sc PyExZ3} & \cite{BD-TECH15} & \url{https://github.com/thomasjball/PyExZ3} \\ 42 | {\sc JDart} & \cite{JDART-TACAS16} & \url{https://github.com/psycopaths/jdart} \\ 43 | 44 | {\sc CATG} & -- & \url{https://github.com/ksen007/janala2} \\ 45 | {\sc PySymEmu} & -- & \url{https://github.com/feliam/pysymemu/} \\ 46 | {\sc Miasm} & -- & \url{https://github.com/cea-sec/miasm} \\ 47 | 48 | \hline 49 | \end{tabular} 50 | %\end{small} 51 | \end{adjustbox} 52 | \caption{Selection of symbolic execution engines, along with their reference article(s) and software project web site (if any).} 53 | \label{tab:symbolic-engines} 54 | \vspace{-3.2mm} % TODO 55 | \end{table} 56 | 57 | \vspace{-2pt} 58 | \myparagraph{Tools} 59 | Table~\ref{tab:symbolic-engines} lists a number of symbolic execution engines that have worked as incubators for several of the techniques surveyed in this article. The novel contributions introduced by tools that represented milestones in the area are described in the appropriate sections throughout the main article. 60 | 61 | \vspace{-1pt} 62 | \myparagraph{Path Selection Heuristics} 63 | Table~\ref{tab:heuristics} provides a categorization of the search heuristics that have been discussed in Section 2.3 of the main article. For each category, we list several works that have proposed interesting embodiments of the category. 
 64 | 65 | \begin{table}[t] 66 | \centering 67 | \begin{adjustbox}{width=0.99\columnwidth} % TODO was 1; with 0.88 the last paragraph will fit 68 | \begin{small} 69 | \begin{tabular}{| l || l |} 70 | \hline 71 | {\bf Heuristic} & {\bf Goal} \\ \hline\hline 72 | \multirow{2}*{BFS} & {\em Maximize coverage} \\ & \cite{CKC-TOCS12,PEX-TAP08} \\\hline 73 | \multirow{3}*{DFS} & {\em Exhaust paths, minimize memory usage} \\ & \cite{EXE-CCS06,CKC-TOCS12}\\ & \cite{PEX-TAP08,DART-PLDI05} \\\hline 74 | \multirow{2}*{Random path selection} & {\em Randomly pick a path with probability based on its length} \\ & \cite{KLEE-OSDI08} \\\hline 75 | %low-covered code & prioritize paths that execute low-covered code & \cite{EXE-CCS06} \\ 76 | \multirow{4}*{Code coverage search} & {\em Prioritize paths that may explore unexplored code or that may} \\ & {\em soon reach a particular target program point} \\ & \cite{EXE-CCS06,KLEE-OSDI08,MAYHEM-SP12}\\ & \cite{CKC-TOCS12,GV-ISSTA02,MPF-SAS11} \\\hline 77 | \multirow{2}*{Buggy-path-first} & {\em Prioritize bug-friendly paths} \\ & \cite{AEG-NDSS11} \\\hline 78 | \multirow{2}*{Loop exhaustion} & {\em Fully explore specific loops} \\ & \cite{AEG-NDSS11} \\\hline 79 | \multirow{2}*{Symbolic instruction pointers} & {\em Prioritize paths with symbolic instruction pointers} \\ & \cite{MAYHEM-SP12} \\\hline 80 | \multirow{2}*{Symbolic memory accesses} & {\em Prioritize paths with symbolic memory accesses} \\ & \cite{MAYHEM-SP12} \\ \hline 81 | \multirow{2}*{Fitness function} & {\em Prioritize paths based on a fitness function} \\ & \cite{XTD-DSN09,CS-CACM13} \\ \hline 82 | \multirow{3}*{Subpath-guided search} & {\em Use frequency distributions of explored subpaths to prioritize}\\ & {\em less covered parts of a program} \\ & \cite{LZL-OOPSLA13} \\ \hline 83 | \multirow{2}*{Property-guided search} & {\em Prioritize paths that are most likely to satisfy the target property} \\ & \cite{ZCWDL15} \\ 84 | %kill path & filter uninteresting path 
& \cite{CKC-TOCS12} \\ 85 | \hline 86 | \end{tabular} 87 | \end{small} 88 | \end{adjustbox} 89 | \caption{Common path selection heuristics discussed in Section 2.3.} % of the main article 90 | \label{tab:heuristics} 91 | \end{table} 92 | --------------------------------------------------------------------------------