├── .gitignore
├── ACM-Reference-Format-Journals.bst
├── README.md
├── acmcopyright.sty
├── acmsmall.cls
├── apalike-refs.bst
├── appendix.tex
├── applications.tex
├── arxiv.tex
├── binary.tex
├── common.tex
├── conclusions.tex
├── constraints.tex
├── environment.tex
├── executors.tex
├── explosion.tex
├── hang.tex
├── images
    ├── blackbox.odg
    ├── blackbox.pdf
    ├── compiler.odg
    ├── compiler.odg.new
    ├── compiler.pdf
    ├── compiler.pdf.new
    ├── concolic-execution-2.odg
    ├── concolic-execution-2.pdf
    ├── concolic-execution.odg
    ├── concolic-execution.pdf
    ├── concolic-execution_old.odg
    ├── concolic-execution_old.pdf
    ├── concrete-abstract.eps
    ├── concrete-abstract.pdf
    ├── concrete-abstract.svg
    ├── concrete-execution.odg
    ├── concrete-execution.pdf
    ├── eager-evaluation.odg
    ├── eager-evaluation.pdf
    ├── example.odg
    ├── example.pdf
    ├── execution-tree-text.svg
    ├── execution-tree-text.tex
    ├── execution-tree.eps
    ├── execution-tree.pdf
    ├── execution-tree.svg
    ├── lazy-initialization-C.odg
    ├── lazy-initialization.odg
    ├── lazy-initialization.pdf
    ├── memory-fork.odg
    ├── memory-fork.pdf
    ├── memory-ite.odg
    ├── memory-ite.pdf
    ├── photo_tree.pdf
    ├── state-merging-2.odg
    ├── state-merging-2.pdf
    ├── state-merging.odg
    ├── state-merging.pdf
    ├── state-merging_old.png
    ├── whitebox.odg
    └── whitebox.pdf
├── intro.tex
├── main.tex
├── memory.tex
├── misc
    ├── glossary.tex
    ├── loops.tex
    └── sandbox.tex
├── overview.tex
├── submissions
    ├── fifth
    │   ├── appendix.pdf
    │   ├── main.pdf
    │   └── survey-with-appendix.pdf
    ├── first
    │   ├── cover_letter.docx
    │   ├── cover_letter.pdf
    │   ├── proof.pdf
    │   └── survey.pdf
    ├── fourth
    │   ├── ACM-CSUR-Revision.pdf
    │   ├── proof.pdf
    │   └── survey.pdf
    ├── second
    │   ├── proof.pdf
    │   ├── survey-similarities.pdf
    │   └── survey-symbolic-exec-v1.pdf
    └── third
    │   ├── proof.pdf
    │   └── survey.pdf
├── symbolic.bib
└── tables.tex


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.log
 2 | *.toc
 3 | /main.pdf
 4 | /appendix.pdf
 5 | *.out
 6 | *.aux
 7 | *.bbl
 8 | *.blg
 9 | *.fdb_latexmk
10 | *.fls
11 | *.synctex.gz
12 | .DS_Store
13 | Icon*
14 | *-eps-converted-to.pdf
15 | *-tree-eps-converted-to.pdf
16 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # README #
 2 | 
 3 | This is a survey by the [SEASON lab](http://season-lab.github.io) on symbolic execution tools and techniques.
 4 | 
 5 | If you are considering citing our work, we would be grateful if you could use the following BibTeX entry:
 6 | ``` tex
 7 | @article{SurveySymExec-CSUR18,
 8 |   author    = {Baldoni, Roberto and Coppa, Emilio and D'Elia, Daniele Cono and Demetrescu, Camil and Finocchi, Irene},
 9 |   title     = {A Survey of Symbolic Execution Techniques},
10 |   journal   = {ACM Comput. Surv.},
11 |   volume    = {51},
12 |   number = {3},
13 |   articleno = {50},
14 |   publisher = {ACM},
15 |   address = {New York, NY, USA},
16 |   year = {2018}
17 | }
18 | ```
19 | 


--------------------------------------------------------------------------------
/acmcopyright.sty:
--------------------------------------------------------------------------------
  1 | %%
  2 | %% This is file `acmcopyright.sty',
  3 | %% generated with the docstrip utility.
  4 | %%
  5 | %% The original source files were:
  6 | %%
  7 | %% acmcopyright.dtx  (with options: `style')
  8 | %% 
  9 | %% IMPORTANT NOTICE:
 10 | %% 
 11 | %% For the copyright see the source file.
 12 | %% 
 13 | %% Any modified versions of this file must be renamed
 14 | %% with new filenames distinct from acmcopyright.sty.
 15 | %% 
 16 | %% For distribution of the original source see the terms
 17 | %% for copying and modification in the file acmcopyright.dtx.
 18 | %% 
 19 | %% This generated file may be distributed as long as the
 20 | %% original source files, as listed above, are part of the
 21 | %% same distribution. (The sources need not necessarily be
 22 | %% in the same archive or directory.)
 23 | %% \CharacterTable
 24 | %%  {Upper-case    \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z
 25 | %%   Lower-case    \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z
 26 | %%   Digits        \0\1\2\3\4\5\6\7\8\9
 27 | %%   Exclamation   \!     Double quote  \"     Hash (number) \#
 28 | %%   Dollar        \$     Percent       \%     Ampersand     \&
 29 | %%   Acute accent  \'     Left paren    \(     Right paren   \)
 30 | %%   Asterisk      \*     Plus          \+     Comma         \,
 31 | %%   Minus         \-     Point         \.     Solidus       \/
 32 | %%   Colon         \:     Semicolon     \;     Less than     \<
 33 | %%   Equals        \=     Greater than  \>     Question mark \?
 34 | %%   Commercial at \@     Left bracket  \[     Backslash     \\
 35 | %%   Right bracket \]     Circumflex    \^     Underscore    \_
 36 | %%   Grave accent  \`     Left brace    \{     Vertical bar  \|
 37 | %%   Right brace   \}     Tilde         \~}
 38 | \NeedsTeXFormat{LaTeX2e}
 39 | \ProvidesPackage{acmcopyright}
 40 | [2014/06/29 v1.2 Copyright statements for ACM classes]
 41 | \newif\if@printcopyright   % flag; set per mode by the acmcopyrightmode key below
 42 | \@printcopyrighttrue       % default: true
 43 | \newif\if@printpermission  % flag; set per mode by the acmcopyrightmode key below
 44 | \@printpermissiontrue      % default: true
 45 | \newif\if@acmowned         % flag; set per mode by the acmcopyrightmode key below
 46 | \@acmownedtrue             % default: true
 47 | \RequirePackage{xkeyval}   % provides \define@choicekey and \setkeys used below
 48 | \define@choicekey*{ACM@}{acmcopyrightmode}[% choice key `acmcopyrightmode' in key family ACM@
 49 |   \acm@copyrightinput\acm@copyrightmode]{none,acmcopyright,acmlicensed,% bins above: user input, then 0-based index of the choice in this list
 50 |   rightsretained,usgov,usgovmixed,cagov,cagovmixed,%
 51 |   licensedusgovmixed,licensedcagovmixed,othergov,licensedothergov}{%
 52 |   \@printpermissiontrue % reset all three flags to true first, so that a later
 53 |   \@printcopyrighttrue  % \setcopyright fully overrides an earlier one
 54 |   \@acmownedtrue        % modes 1, 5 and 7 (acmcopyright, usgovmixed, cagovmixed) keep these values
 55 |   \ifnum\acm@copyrightmode=0\relax % none: all three flags false
 56 |    \@printpermissionfalse
 57 |    \@printcopyrightfalse
 58 |    \@acmownedfalse
 59 |   \fi
 60 |   \ifnum\acm@copyrightmode=2\relax % acmlicensed
 61 |    \@acmownedfalse
 62 |   \fi
 63 |   \ifnum\acm@copyrightmode=3\relax % rightsretained
 64 |    \@acmownedfalse
 65 |   \fi
 66 |   \ifnum\acm@copyrightmode=4\relax % usgov
 67 |    \@printpermissiontrue % already true after the reset above; kept explicit
 68 |    \@printcopyrightfalse
 69 |    \@acmownedfalse
 70 |   \fi
 71 |   \ifnum\acm@copyrightmode=6\relax % cagov
 72 |    \@acmownedfalse
 73 |   \fi
 74 |   \ifnum\acm@copyrightmode=8\relax % licensedusgovmixed
 75 |    \@acmownedfalse
 76 |   \fi
 77 |   \ifnum\acm@copyrightmode=9\relax % licensedcagovmixed
 78 |    \@acmownedfalse
 79 |   \fi
 80 |   \ifnum\acm@copyrightmode=10\relax % othergov
 81 |    \@acmownedtrue % already true after the reset above; kept explicit
 82 |   \fi
 83 |   \ifnum\acm@copyrightmode=11\relax % licensedothergov
 84 |    \@acmownedfalse
 85 |    \@printcopyrightfalse
 86 |   \fi}
 87 | \def\setcopyright#1{\setkeys{ACM@}{acmcopyrightmode=#1}} % #1: one of the mode names accepted by the acmcopyrightmode key
 88 | \setcopyright{acmcopyright} % initial mode: acmcopyright (index 1)
 89 | \def\@copyrightowner{% copyright-owner text selected by the current \acm@copyrightmode
 90 |   \ifcase\acm@copyrightmode\relax % none (case 0): no text
 91 |   \or % acmcopyright
 92 |   ACM.
 93 |   \or % acmlicensed
 94 |   Copyright held by the owner/author(s). Publication rights licensed to
 95 |   ACM.
 96 |   \or % rightsretained
 97 |   Copyright held by the owner/author(s).
 98 |   \or % usgov: no text
 99 |   \or % usgovmixed
100 |   ACM.
101 |   \or % cagov
102 |   Crown in Right of Canada.
103 |   \or % cagovmixed
104 |   ACM.
105 |   \or % licensedusgovmixed
106 |   Copyright held by the owner/author(s). Publication rights licensed to
107 |   ACM.
108 |   \or % licensedcagovmixed
109 |   Copyright held by the owner/author(s). Publication rights licensed to
110 |   ACM.
111 |   \or % othergov
112 |   ACM.
113 |   \or % licensedothergov: no text
114 |   \fi}
115 | \def\@copyrightpermission{% permission-statement text selected by the current \acm@copyrightmode
116 |   \ifcase\acm@copyrightmode\relax % none (case 0): no text
117 |   \or % acmcopyright
118 |    Permission to make digital or hard copies of all or part of this
119 |    work for personal or classroom use is granted without fee provided
120 |    that copies are not made or distributed for profit or commercial
121 |    advantage and that copies bear this notice and the full citation on
122 |    the first page. Copyrights for components of this work owned by
123 |    others than ACM must be honored. Abstracting with credit is
124 |    permitted. To copy otherwise, or republish, to post on servers or to
125 |    redistribute to lists, requires prior specific permission
126 |    and\hspace*{.5pt}/or  a fee. Request permissions from
127 |    permissions@acm.org.
128 |   \or % acmlicensed
129 |    Permission to make digital or hard copies of all or part of this
130 |    work for personal or classroom use is granted without fee provided
131 |    that copies are not made or distributed for profit or commercial
132 |    advantage and that copies bear this notice and the full citation on
133 |    the first page. Copyrights for components of this work owned by
134 |    others than the author(s) must be honored. Abstracting with credit
135 |    is permitted.  To copy otherwise, or republish, to post on servers
136 |    or to  redistribute to lists, requires prior specific permission
137 |    and\hspace*{.5pt}/or  a fee. Request permissions from
138 |    permissions@acm.org.
139 |   \or % rightsretained
140 |    Permission to make digital or hard copies of part or all of this work
141 |    for personal or classroom use is granted without fee provided that
142 |    copies are not made or distributed for profit or commercial advantage
143 |    and that copies bear this notice and the full citation on the first
144 |    page. Copyrights for third-party components of this work must be
145 |    honored. For all other uses, contact the
146 |    owner\hspace*{.5pt}/author(s).
147 |   \or % usgov
148 |    This paper is authored by an employee(s) of the United States
149 |    Government and is in the public domain. Non-exclusive copying or
150 |    redistribution is allowed, provided that the article citation is
151 |    given and the authors and agency are clearly identified as its
152 |    source.
153 |   \or % usgovmixed
154 |    ACM acknowledges that this contribution was authored or co-authored
155 |    by an employee, or contractor of the national government. As such,
156 |    the Government retains a nonexclusive, royalty-free right to
157 |    publish or reproduce this article, or to allow others to do so, for
158 |    Government purposes only. Permission to make digital or hard copies
159 |    for personal or classroom use is granted. Copies must bear this
160 |    notice and the full citation on the first page. Copyrights for
161 |    components of this work owned by others than ACM must be
162 |    honored. To copy otherwise, distribute, republish, or post,
163 |    requires prior specific permission and\hspace*{.5pt}/or a
164 |    fee. Request permissions from permissions@acm.org.
165 |   \or % cagov
166 |    This article was authored by employees of the Government of Canada.
167 |    As such, the Canadian government retains all interest in the
168 |    copyright to this work and grants to ACM a nonexclusive,
169 |    royalty-free right to publish or reproduce this article, or to allow
170 |    others to do so, provided that clear attribution is given both to
171 |    the authors and the Canadian government agency employing them.
172 |    Permission to make digital or hard copies for personal or classroom
173 |    use is granted. Copies must bear this notice and the full citation
174 |    on the first page.  Copyrights for components of this work owned by
175 |    others than the Canadian Government must be honored. To copy
176 |    otherwise, distribute, republish, or post, requires prior specific
177 |    permission and\hspace*{.5pt}/or a fee. Request permissions from
178 |    permissions@acm.org.
179 |   \or % cagovmixed
180 |    ACM acknowledges that this contribution was co-authored by an
181 |    affiliate of the national government of Canada. As such, the Crown
182 |    in Right of Canada retains an equal interest in the copyright.
183 |    Reprints must include clear attribution to ACM and the author's
184 |    government agency affiliation.  Permission to make digital or hard
185 |    copies for personal or classroom use is granted.  Copies must bear
186 |    this notice and the full citation on the first page. Copyrights for
187 |    components of this work owned by others than ACM must be honored.
188 |    To copy otherwise, distribute, republish, or post, requires prior
189 |    specific permission and\hspace*{.5pt}/or a fee. Request permissions
190 |    from permissions@acm.org.
191 |   \or % licensedusgovmixed
192 |    Publication rights licensed to ACM. ACM acknowledges that this
193 |    contribution was authored or co-authored by an employee, contractor
194 |    or affiliate of the United States government. As such, the
195 |    Government retains a nonexclusive, royalty-free right to publish or
196 |    reproduce this article, or to allow others to do so, for Government
197 |    purposes only.
198 |   \or % licensedcagovmixed
199 |    Publication rights licensed to ACM. ACM acknowledges that this
200 |    contribution was authored or co-authored by an employee, contractor
201 |    or affiliate of the national government of Canada. As such, the
202 |    Government retains a nonexclusive, royalty-free right to publish or
203 |    reproduce this article, or to allow others to do so, for Government
204 |    purposes only.
205 |   \or % othergov
206 |    ACM acknowledges that this contribution was authored or co-authored
207 |    by an employee, contractor or affiliate of a national government. As
208 |    such, the Government retains a nonexclusive, royalty-free right to
209 |    publish or reproduce this article, or to allow others to do so, for
210 |    Government purposes only.
211 |   \or % licensedothergov
212 |    Publication rights licensed to ACM. ACM acknowledges that this
213 |    contribution was authored or co-authored by an employee, contractor
214 |    or affiliate of a national government. As such, the Government
215 |    retains a nonexclusive, royalty-free right to publish or reproduce
216 |    this article, or to allow others to do so, for Government purposes
217 |    only.
218 |   \fi}
219 | \endinput
220 | %%
221 | %% End of file `acmcopyright.sty'.
222 | 


--------------------------------------------------------------------------------
/apalike-refs.bst:
--------------------------------------------------------------------------------
   1 | % BibTeX `apalike-refs' bibliography style which displays different IDs like DOI, ISBN, ISSN, but also the URL.
   2 | % It uses \href and \url, so be sure to use the package hyperref for it to work.
   3 | % It is based on the `apalike-doi' of Jan Even Øie Nilsen:
   4 | % http://web.nersc.no/~even/tex/apalike-doi.bst
   5 | % 
   6 | % MODIFICATIONS:
   7 | % - Add ISBN, ISSN, and URL functions (format.xxx)
   8 | % - Centralise all the IDs functions into format.refs
   9 | % - Call format.refs in all types (book, article, etc.)
  10 | % - Make a URL for DOI based on dx.doi.org
  11 | % - Make a URL for ISBN based on openlibrary.org
  12 | % - Manage multiple IDs for ISBN and ISSN (although the URL feature for ISSN was abandoned for lack of an open resource)
  13 | %
  14 | % Time-stamp:<Last updated on 2016-03-16 at 05:57 by matthieu.vergne@gmail.com>
  15 | %
  16 | % Was:
  17 | % BibTeX `apalike-doi' bibliography style 
  18 | % an attempt to have apalike use doi and eventually eid:
  19 | % MODIFICATIONS:
  20 | % <search term> : <change>
  21 | % ENTRY : eid and doi put in 
  22 | % FUNCTION {format.eid} and 
  23 | % FUNCTION {format.doi} : put in before FUNCTION {format.title}
  24 | % FUNCTION {article} : changes at end 
  25 | %
  26 | % Time-stamp:<Last updated on 03/08/15 at 01:05:21 by even@gfi.uib.no>
  27 | % File:</home/janeven/tex/bibtex/apalike-doi.bst>
  28 | %
  29 | % Was:
  30 | % BibTeX `apalike' bibliography style (24-Jan-88 version)
  31 | % Adapted from the `alpha' style, version 0.99a; for BibTeX version 0.99a.
  32 | % Copyright (C) 1988, all rights reserved.
  33 | % Copying of this file is allowed, provided that if you make any changes at all
  34 | % you name it something other than `apalike.bst'.
  35 | % This restriction helps ensure that all copies are identical.
  36 | % Differences between this style and `alpha' are generally heralded by a `%'.
  37 | % The file btxbst.doc has the documentation for alpha.bst.
  38 | %
  39 | % This style should be used with the `apalike' LaTeX style (apalike.sty).
  40 | % \cite's come out like "(Jones, 1986)" in the text but there are no labels
  41 | % in the bibliography, and something like "(1986)" comes out immediately
  42 | % after the author.  Author (and editor) names appear as last name, comma,
  43 | % initials.  A `year' field is required for every entry, and so is either
  44 | % an author (or in some cases, an editor) field or a key field.
  45 | %
  46 | % Editorial note:
  47 | % Many journals require a style like `apalike', but I strongly, strongly,
  48 | % strongly recommend that you not use it if you have a choice---use something
  49 | % like `plain' instead.  Mary-Claire van Leunen (A Handbook for Scholars,
  50 | % Knopf, 1979) argues convincingly that a style like `plain' encourages better
  51 | % writing than one like `apalike'.  Furthermore the strongest arguments for
  52 | % using an author-date style like `apalike'---that it's "the most practical"
  53 | % (The Chicago Manual of Style, University of Chicago Press, thirteenth
  54 | % edition, 1982, pages 400--401)---fall flat on their face with the new
  55 | % computer-typesetting technology.  For instance page 401 anachronistically
  56 | % states "The chief disadvantage of [a style like `plain'] is that additions
  57 | % or deletions cannot be made after the manuscript is typed without changing
  58 | % numbers in both text references and list."  LaTeX sidesteps the disadvantage.
  59 | %
  60 | % History:
  61 | %   15-sep-86	(SK,OP)	Original version, by Susan King and Oren Patashnik.
  62 | %   10-nov-86	(OP)	Truncated the sort.key$ string to the correct length
  63 | %			in bib.sort.order to eliminate error message.
  64 | %   24-jan-88	(OP)	Updated for BibTeX version 0.99a, from alpha.bst 0.99a;
  65 | %			apalike now sorts by author, then year, then title;
  66 | %			THIS `apalike' VERSION DOES NOT WORK WITH BIBTEX 0.98i.
  67 | 
  68 | ENTRY				% three lists: fields, integer entry variables, string entry variables
  69 |   { address
  70 |     author
  71 |     booktitle
  72 |     chapter
  73 |     eid				% read by format.eid
  74 |     doi				% read by format.doi
  75 |     isbn			% read by format.isbn
  76 |     issn			% read by format.issn
  77 |     url				% read by format.url
  78 |     edition
  79 |     editor
  80 |     howpublished
  81 |     institution
  82 |     journal
  83 |     key
  84 | %    month		not used in apalike
  85 |     note
  86 |     number
  87 |     organization
  88 |     pages
  89 |     publisher
  90 |     school
  91 |     series
  92 |     title
  93 |     type
  94 |     volume
  95 |     year
  96 |   }
  97 |   {}				% no integer entry variables
  98 |   { label extra.label sort.label }	% string entry variables
  99 | 
 100 | INTEGERS { output.state before.all mid.sentence after.sentence after.block }
 101 | 
 102 | FUNCTION {init.state.consts}	% give each output.state code its numeric value
 103 | { #3 'after.block :=
 104 |   #2 'after.sentence :=
 105 |   #1 'mid.sentence :=
 106 |   #0 'before.all :=
 107 | }
 108 | 
 109 | STRINGS { s t }
 110 | 
 111 | FUNCTION {output.nonnull}	% stack: pending new -> new; writes `pending' followed by its separator
 112 | { 's :=				% stash the new item; the pending item is now on top
 113 |   output.state mid.sentence =
 114 |     { ", " * write$ }		% same sentence: pending item followed by ", "
 115 |     { output.state after.block =
 116 | 	{ add.period$ write$	% block boundary: close with a period ...
 117 | 	  newline$
 118 | 	  "\newblock " write$	% ... and start the next block on a fresh line
 119 | 	}
 120 | 	{ output.state before.all =
 121 | 	    'write$		% nothing output yet: the pending item is written unchanged
 122 | 	    { add.period$ " " * write$ }	% after.sentence: period plus a space
 123 | 	  if$
 124 | 	}
 125 |       if$
 126 |       mid.sentence 'output.state :=	% whatever follows continues the sentence
 127 |     }
 128 |   if$
 129 |   s				% the new item becomes the pending one
 130 | }
 131 | 
 132 | FUNCTION {output}		% pass the top string to output.nonnull unless it is empty
 133 | { duplicate$ empty$
 134 |     { pop$ }
 135 |     { output.nonnull }
 136 |   if$
 137 | }
 138 | 
 139 | FUNCTION {output.check}		% stack: string field-name -> ; like output, but warns when the string is empty
 140 | { 't :=
 141 |   duplicate$ empty$
 142 |     { pop$ "empty " t * " in " * cite$ * warning$ }
 143 |     { output.nonnull }
 144 |   if$
 145 | }
 146 | 
 147 | %					apalike needs this function because
 148 | %					the year has special punctuation;
 149 | %					apalike ignores the month
 150 | FUNCTION {output.year.check}	% append " (<year><extra.label>)" after the pending item
 151 | { year empty$
 152 |     { "empty year in " cite$ * warning$ }	% the pending item stays on the stack untouched
 153 |     { write$				% flush the pending item without any separator
 154 |       " (" year * extra.label * ")" *	% the parenthesized year becomes the new pending item
 155 |       mid.sentence 'output.state :=
 156 |     }
 157 |   if$
 158 | }
 159 | 
 160 | FUNCTION {output.bibitem}	% start an entry: writes "\bibitem[<label>]{<cite key>}" on its own line
 161 | { newline$
 162 |   "\bibitem[" write$
 163 |   label write$
 164 |   "]{" write$
 165 |   cite$ write$
 166 |   "}" write$
 167 |   newline$
 168 |   ""				% empty pending item for the first output call
 169 |   before.all 'output.state :=
 170 | }
 171 | 
 172 | FUNCTION {fin.entry}
 173 | {				% finish the entry: final period on the pending item, then end the line
 174 |   add.period$ write$
 175 |   newline$
 176 | }
 177 | 
 178 | FUNCTION {new.block}		% mark a block boundary, unless nothing has been output yet
 179 | { output.state before.all =
 180 |     { skip$ }
 181 |     { after.block 'output.state := }
 182 |   if$
 183 | }
 184 | 
 185 | FUNCTION {new.sentence}		% mark a sentence boundary; after.block and before.all are left alone
 186 | { output.state after.block =
 187 |     { skip$ }
 188 |     { output.state before.all =
 189 | 	{ skip$ }
 190 | 	{ after.sentence 'output.state := }
 191 |       if$
 192 |     }
 193 |   if$
 194 | }
 195 | 
 196 | FUNCTION {not}		% stack: int -> 0 if if$ treats the int as true, otherwise 1
 197 | {   { #0 }
 198 |     { #1 }
 199 |   if$
 200 | }
 201 | 
 202 | FUNCTION {and}		% stack: a b -> a if b is true, otherwise 0
 203 | {   'skip$
 204 |     { pop$ #0 }
 205 |   if$
 206 | }
 207 | 
 208 | FUNCTION {or}		% stack: a b -> 1 if b is true, otherwise a
 209 | {   { pop$ #1 }
 210 |     'skip$
 211 |   if$
 212 | }
 213 | 
 214 | FUNCTION {new.block.checkb}	% stack: s1 s2 -> ; calls new.block unless both strings are empty
 215 | { empty$
 216 |   swap$ empty$
 217 |   and
 218 |     { skip$ }
 219 |     { new.block }
 220 |   if$
 221 | }
 222 | 
 223 | FUNCTION {field.or.null}	% replace a value for which empty$ holds by ""
 224 | { duplicate$ empty$
 225 |     { pop$ "" }
 226 |     { skip$ }
 227 |   if$
 228 | }
 229 | 
 230 | FUNCTION {emphasize}		% wrap a non-empty string in {\em ...}; empty input gives ""
 231 | { duplicate$ empty$
 232 |     { pop$ "" }
 233 |     { "{\em " swap$ "}" * * }
 234 |   if$
 235 | }
 236 | 
 237 | INTEGERS { index length }
 238 | 
 239 | STRINGS { fullString }
 240 | 
 241 | FUNCTION {split.at.first.space}	% stack: string -> head rest (rest on top), split at the first " "
 242 | {
 243 |   duplicate$
 244 |   text.length$
 245 |   'length :=
 246 |   #1				% first candidate position; consumed by the loop test below
 247 |   {
 248 |     'index :=
 249 |     duplicate$
 250 |     index #1 substring$
 251 |     " " = not index length #1 + < and	% continue while not at a space and index <= length
 252 |   }
 253 |   {
 254 |     index #1 +			% next candidate position
 255 |   }
 256 |   while$
 257 |   'fullString :=
 258 |   fullString #1 index #1 - substring$	% head: the characters before position index
 259 |   fullString index #1 + fullString text.length$ index - substring$	% rest: the characters after position index
 260 | }
 261 | 
 262 | STRINGS { str1 str2 char }
 263 | 
 264 | FUNCTION {escape.url.characters}	% stack: string -> copy with "\" inserted before every "_"
 265 | {
 266 |   duplicate$ text.length$
 267 |   'length :=
 268 |   ""				% result accumulated so far
 269 |   {
 270 |     'str1 :=			% save the accumulated result; the unprocessed remainder is on top
 271 |     duplicate$
 272 |     empty$ not			% loop while the remainder is non-empty (empty$ also holds for an all-blank remainder)
 273 |   }
 274 |   {
 275 |     'str2 :=
 276 |     str2 #1 #1 substring$
 277 |     'char :=			% first character of the remainder
 278 |     char "_" =
 279 |       { str1 "\" * char * }
 280 |       { str1 char * }
 281 |     if$
 282 |     'str1 :=
 283 |     str2 #2 length #1 - substring$	% drop the first character; length is the length of the original input
 284 |     str1
 285 |   }
 286 |   while$
 287 |   pop$				% discard the exhausted remainder
 288 |   str1
 289 | }
 290 | 
 291 | INTEGERS { val }
 292 | 
 293 | FUNCTION {is.number.character}	% stack: one-character string -> 1 if it is a digit 0-9, else 0
 294 | {
 295 |   duplicate$ empty$		% "" (e.g. from a doubled space in an ISBN) or a blank is not a digit;
 296 |     { pop$ #0 }			% answering here keeps chr.to.int$ from complaining about ""
 297 |     { chr.to.int$ 'val :=
 298 |       val #47 > val #58 < and }	% ASCII codes 48..57
 299 |   if$
 300 | }
 301 | 
 302 | INTEGERS { nameptr namesleft numnames }
 303 | 
 304 | FUNCTION {format.names}		% stack: name list -> the names joined with ", " / " and " / " et~al."
 305 | { 's :=
 306 |   #1 'nameptr :=
 307 |   s num.names$ 'numnames :=
 308 |   numnames 'namesleft :=
 309 |     { namesleft #0 > }
 310 |     { s nameptr "{vv~}{ll}{, jj}{, f.}" format.name$ 't :=   % last name first
 311 |       nameptr #1 >
 312 | 	{ namesleft #1 >
 313 | 	    { ", " * t * }		% neither first nor last name
 314 | 	    { numnames #2 >
 315 | 		{ "," * }		% comma before the final connective only with 3+ names
 316 | 		'skip$
 317 | 	      if$
 318 | 	      t "others" =
 319 | 		{ " et~al." * }
 320 | 		{ " and " * t * }
 321 | 	      if$
 322 | 	    }
 323 | 	  if$
 324 | 	}
 325 | 	't				% the first name starts the result
 326 |       if$
 327 |       nameptr #1 + 'nameptr :=
 328 |       namesleft #1 - 'namesleft :=
 329 |     }
 330 |   while$
 331 | }
 332 | 
 333 | FUNCTION {format.authors}	% formatted author list, or "" when the author field is empty
 334 | { author empty$ not
 335 |     { author format.names }
 336 |     { "" }
 337 |   if$
 338 | }
 339 | 
 340 | FUNCTION {format.key}			% apalike only; stack: field -> key (or "") when the field is empty, else ""
 341 | { empty$
 342 |     { key duplicate$ empty$ { pop$ "" } { skip$ } if$ }
 343 |     { "" }
 344 |   if$
 345 | }
 346 | 
 347 | FUNCTION {format.editors}	% formatted editor list plus ", editor(s)", or "" when the editor field is empty
 348 | { editor empty$
 349 |     { "" }
 350 |     { editor format.names
 351 |       editor num.names$ #2 <
 352 | 	{ ", editor" * }
 353 | 	{ ", editors" * }
 354 |       if$
 355 |     }
 356 |   if$
 357 | }
 358 | 
 359 | FUNCTION {format.eid}		% the eid field unchanged, or "" when it is empty
 360 | {
 361 |   eid empty$
 362 |     { "" }
 363 |     'eid
 364 |   if$
 365 | }
 366 |   
 367 | FUNCTION {format.doi}		% "DOI: \href{https://doi.org/<doi>}{\tt <doi>}", or "" when the doi field is empty
 368 | {
 369 |   doi empty$
 370 |     { "" }
 371 |     { "DOI: \href{https://doi.org/" doi * "}{\tt " * doi escape.url.characters * "}" * }	% doi.org supersedes dx.doi.org; only the printed text is escaped
 372 |   if$
 373 | }
 374 | 
 375 | STRINGS { str rem }
 376 | 
 377 | FUNCTION {format.isbn}		% "ISBN:" followed by every space-separated token of the isbn field
 378 | {
 379 |   isbn empty$
 380 |     { "" }
 381 |     {
 382 |       "ISBN:"			% result accumulated so far
 383 |       isbn			% part of the field still to process
 384 |       {
 385 | 	duplicate$ empty$ not
 386 |       }
 387 |       {
 388 | 	split.at.first.space
 389 | 	'rem :=
 390 | 	'str :=			% current token
 391 | 	str #1 #1 substring$
 392 | 	is.number.character	% tokens starting with a digit are set in \tt, all others as plain text
 393 | 	  % DCD {" \href{https://openlibrary.org/search?isbn=" * str * "}{\tt " * str escape.url.characters * "}" *}
 394 | 	  {" {\tt " * str escape.url.characters * "}" *}
 395 | 	  {" " * str escape.url.characters *}
 396 | 	if$
 397 | 	rem
 398 |       }
 399 |       while$
 400 |       pop$			% discard the exhausted remainder
 401 |     }
 402 |   if$
 403 | }
 404 |   
 405 | FUNCTION {format.issn}		% "ISSN:" followed by every space-separated token of the issn field, each in \tt
 406 | {
 407 |   issn empty$
 408 |     { "" }
 409 |     {
 410 |       "ISSN:"			% result accumulated so far
 411 |       issn			% part of the field still to process
 412 |       {
 413 | 	duplicate$ empty$ not
 414 |       }
 415 |       {
 416 | 	split.at.first.space
 417 | 	'rem :=
 418 | 	'str :=
 419 | 	% If you find an open search engine for ISSNs, please tell the author.
 420 | 	% To turn a token into a URL, beware: it may contain a comma, so purify it first.
 421 | 	" {\tt " * str escape.url.characters * "}" *
 422 | 	rem
 423 |       }
 424 |       while$
 425 |       pop$			% discard the exhausted remainder
 426 |     }
 427 |   if$
 428 | }
 429 |   
 430 | FUNCTION {format.url}		% "\url{<url>}", or "" when the url field is empty
 431 | {
 432 |   url empty$
 433 |     { "" }
 434 |     { "\url{" url "}" * * }
 435 |   if$
 436 | }
 437 |   
 438 | FUNCTION {format.refs}
 439 | { % Output every identifier that is set, in this fixed order.
 440 |   format.eid  output
 441 |   format.isbn output
 442 |   format.issn output
 443 |   format.doi  output
 444 |   format.url  output
 445 | }
 446 | 
 447 | FUNCTION {format.title}		% title converted with change.case$ "t", or "" when the title is empty
 448 | { title empty$ not
 449 |     { title "t" change.case$ }
 450 |     { "" }
 451 |   if$
 452 | }
 453 | 
 454 | FUNCTION {n.dashify}		% stack: string -> copy in which every lone "-" is replaced by "--"
 455 | { 't :=
 456 |   ""
 457 |     { t empty$ not }
 458 |     { t #1 #1 substring$ "-" =
 459 | 	{ t #1 #2 substring$ "--" = not
 460 | 	    { "--" *			% a single hyphen is doubled
 461 | 	      t #2 global.max$ substring$ 't :=
 462 | 	    }
 463 | 	    {   { t #1 #1 substring$ "-" = }	% a run of two or more hyphens is copied unchanged
 464 | 		{ "-" *
 465 | 		  t #2 global.max$ substring$ 't :=
 466 | 		}
 467 | 	      while$
 468 | 	    }
 469 | 	  if$
 470 | 	}
 471 | 	{ t #1 #1 substring$ *		% any other character is copied
 472 | 	  t #2 global.max$ substring$ 't :=
 473 | 	}
 474 |       if$
 475 |     }
 476 |   while$
 477 | }
 478 | 
 479 | FUNCTION {format.btitle}	% title wrapped in {\em ...}, or "" when the title is empty
 480 | { title duplicate$ empty$ { pop$ "" } { "{\em " swap$ * "}" * } if$
 481 | }
 482 | 
 483 | FUNCTION {tie.or.space.connect}	% stack: a b -> "a~b" if text.length$ of b is below 3, else "a b"
 484 | { duplicate$ text.length$ #2 >
 485 |     { " " }
 486 |     { "~" }
 487 |   if$
 488 |   swap$ * *
 489 | }
 490 | 
 491 | FUNCTION {either.or.check}	% stack: names field -> ; warns when the field is non-empty
 492 | { empty$
 493 |     { pop$ }
 494 |     { "can't use both " swap$ * " fields in " * cite$ * warning$ }
 495 |   if$
 496 | }
 497 | 
 498 | FUNCTION {format.bvolume}	% "volume N[ of {\em series}]", or "" when the volume field is empty
 499 | { volume empty$
 500 |     { "" }
 501 |     { "volume" volume tie.or.space.connect
 502 |       series empty$
 503 | 	'skip$
 504 | 	{ " of " * series emphasize * }
 505 |       if$
 506 |       "volume and number" number either.or.check	% warn if number is set as well
 507 |     }
 508 |   if$
 509 | }
 510 | 
 511 | FUNCTION {format.number.series}	% "[Nn]umber N in series"; "" whenever a volume is set
 512 | { volume empty$
 513 |     { number empty$
 514 | 	{ series field.or.null }	% neither volume nor number: just the series (or "")
 515 | 	{ output.state mid.sentence =
 516 | 	    { "number" }
 517 | 	    { "Number" }		% capitalized unless in mid-sentence
 518 | 	  if$
 519 | 	  number tie.or.space.connect
 520 | 	  series empty$
 521 | 	    { "there's a number but no series in " cite$ * warning$ }
 522 | 	    { " in " * series * }
 523 | 	  if$
 524 | 	}
 525 |       if$
 526 |     }
 527 |     { "" }			% a volume is set: format.bvolume produces that text
 528 |   if$
 529 | }
 530 | 
 531 | FUNCTION {format.edition}	% "<edition> edition" ("l" case in mid-sentence, else "t" case); "" when empty
 532 | { edition empty$
 533 |     { "" }
 534 |     { edition
 535 |       output.state mid.sentence = { "l" } { "t" } if$
 536 |       change.case$
 537 |       " edition" *
 538 |     }
 539 |   if$
 540 | }
 541 | 
 542 | INTEGERS { multiresult }
 543 | 
 544 | FUNCTION {multi.page.check}	% stack: string -> 1 if it contains "-", "," or "+", else 0
 545 | { 't :=
 546 |   #0 'multiresult :=
 547 |     { multiresult not		% stop at the first match ...
 548 |       t empty$ not		% ... or when the string is used up
 549 |       and
 550 |     }
 551 |     { t #1 #1 substring$
 552 |       duplicate$ "-" =
 553 |       swap$ duplicate$ "," =
 554 |       swap$ "+" =
 555 |       or or
 556 | 	{ #1 'multiresult := }
 557 | 	{ t #2 global.max$ substring$ 't := }	% no match: move on to the next character
 558 |       if$
 559 |     }
 560 |   while$
 561 |   multiresult
 562 | }
 563 | 
 564 | FUNCTION {format.pages}		% "pages <n.dashified pages>" or "page <pages>"; "" when the pages field is empty
 565 | { pages empty$
 566 |     { "" }
 567 |     { pages multi.page.check not
 568 | 	{ "page" pages tie.or.space.connect }
 569 | 	{ "pages" pages n.dashify tie.or.space.connect }
 570 |       if$
 571 |     }
 572 |   if$
 573 | }
 574 | 
 575 | FUNCTION {format.vol.num.pages}	% "<volume>(<number>):<pages>", each part only if set
 576 | { volume field.or.null
 577 |   number empty$
 578 |     'skip$
 579 |     { "(" number * ")" * *
 580 |       volume empty$
 581 | 	{ "there's a number but no volume in " cite$ * warning$ }
 582 | 	'skip$
 583 |       if$
 584 |     }
 585 |   if$
 586 |   pages empty$
 587 |     'skip$
 588 |     { duplicate$ empty$
 589 | 	{ pop$ format.pages }		% neither volume nor number: use the "page(s) ..." form
 590 | 	{ ":" * pages n.dashify * }
 591 |       if$
 592 |     }
 593 |   if$
 594 | }
 595 | 
 596 | FUNCTION {format.chapter.pages}	% "chapter N[, page(s) ...]"; only format.pages when there is no chapter
 597 | { chapter empty$
 598 |     'format.pages
 599 |     { type empty$
 600 | 	{ "chapter" }
 601 | 	{ type "l" change.case$ }	% a type field replaces the word "chapter"
 602 |       if$
 603 |       chapter tie.or.space.connect
 604 |       pages empty$
 605 | 	'skip$
 606 | 	{ ", " * format.pages * }
 607 |       if$
 608 |     }
 609 |   if$
 610 | }
 611 | 
 612 | FUNCTION {format.in.ed.booktitle}
 613 | { booktitle empty$
 614 |     { "" }
 615 |     { "In "			% builds "In [<editors>, ]{\em <booktitle>}"
 616 |       editor empty$ 'skip$ { format.editors * ", " * } if$
 617 |       booktitle emphasize *
 618 |     }
 619 |   if$
 620 |   % the result is "" when the booktitle field is empty
 621 | }
 622 | 
 623 | FUNCTION {format.thesis.type}	% stack: default label -> the type field in "t" case if set, else the default
 624 | { type empty$
 625 |     'skip$
 626 |     { pop$
 627 |       type "t" change.case$
 628 |     }
 629 |   if$
 630 | }
 631 | 
 632 | FUNCTION {format.tr.number}	% "<type, or Technical Report> <number>"
 633 | { type empty$
 634 |     { "Technical Report" }
 635 |     'type
 636 |   if$
 637 |   number empty$
 638 |     { "t" change.case$ }		% no number: the label alone, in "t" case
 639 |     { number tie.or.space.connect }
 640 |   if$
 641 | }
 642 | 
 643 | FUNCTION {format.article.crossref}	% builds "In \cite{<crossref>}"
 644 | { "In"							% this is for apalike
 645 |   " \cite{" * crossref * "}" *
 646 | }
 647 | 
 648 | FUNCTION {format.book.crossref}	% builds "Volume N of \cite{<crossref>}", or "In \cite{...}" plus a warning without a volume
 649 | { volume empty$
 650 |     { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
 651 |       "In "
 652 |     }
 653 |     { "Volume" volume tie.or.space.connect
 654 |       " of " *
 655 |     }
 656 |   if$
 657 |   "\cite{" * crossref * "}" *				% this is for apalike
 658 | }
 659 | 
 660 | FUNCTION {format.incoll.inproc.crossref}	% builds "In \cite{<crossref>}" (same body as format.article.crossref)
 661 | { "In"							% this is for apalike
 662 |   " \cite{" * crossref * "}" *
 663 | }
 664 | 
 665 | FUNCTION {article}		% formats an `article' entry
 666 | { output.bibitem
 667 |   format.authors "author" output.check
 668 |   author format.key output				% special for
 669 |   output.year.check					% apalike
 670 |   new.block
 671 |   format.title "title" output.check
 672 |   new.block
 673 |   crossref missing$
 674 |     { journal emphasize "journal" output.check	% stand-alone article: journal, then volume(number):pages
 675 |       format.vol.num.pages output
 676 |     }
 677 |     { format.article.crossref output.nonnull	% cross-referenced: "In \cite{...}", then the pages
 678 |       format.pages output
 679 |     }
 680 |   if$
 681 |   format.refs					% eid, ISBN, ISSN, DOI and URL, those that are set
 682 |   new.block
 683 |   note output
 684 |   fin.entry
 685 | }
 686 | 
 687 | FUNCTION {book}	% entry-type function for book entries
 688 | { output.bibitem
 689 |   author empty$
 690 |     { format.editors "author and editor" output.check	% no author: editors take the author slot
 691 |       editor format.key output
 692 |     }
 693 |     { format.authors output.nonnull
 694 |       crossref missing$
 695 | 	{ "author and editor" editor either.or.check }	% helper defined earlier; presumably warns when both fields are set -- confirm
 696 | 	'skip$
 697 |       if$
 698 |     }
 699 |   if$
 700 |   output.year.check				% special for apalike
 701 |   new.block
 702 |   format.btitle "title" output.check
 703 |   crossref missing$
 704 |     { format.bvolume output	% standalone entry: volume, number/series, publisher, address
 705 |       new.block
 706 |       format.number.series output
 707 |       new.sentence
 708 |       publisher "publisher" output.check
 709 |       address output
 710 |     }
 711 |     { new.block	% cross-referenced entry: cite the parent book instead
 712 |       format.book.crossref output.nonnull
 713 |     }
 714 |   if$
 715 |   format.edition output
 716 |   format.refs	% helper defined earlier in this file; what it emits is not visible from here
 717 |   new.block
 718 |   note output
 719 |   fin.entry
 720 | }
 721 | 
 722 | FUNCTION {booklet}	% entry-type function for booklet entries; only the title is checked via output.check
 723 | { output.bibitem
 724 |   format.authors output
 725 |   author format.key output				% special for
 726 |   output.year.check					% apalike
 727 |   new.block
 728 |   format.title "title" output.check
 729 |   new.block
 730 |   howpublished output
 731 |   address output
 732 |   format.refs	% helper defined earlier in this file; what it emits is not visible from here
 733 |   new.block
 734 |   note output
 735 |   fin.entry
 736 | }
 737 | 
 738 | FUNCTION {inbook}	% entry-type function for inbook entries (a chapter or page range of a book)
 739 | { output.bibitem
 740 |   author empty$
 741 |     { format.editors "author and editor" output.check	% no author: editors take the author slot
 742 |       editor format.key output
 743 |     }
 744 |     { format.authors output.nonnull
 745 |       crossref missing$
 746 | 	{ "author and editor" editor either.or.check }	% helper defined earlier; presumably warns when both fields are set -- confirm
 747 | 	'skip$
 748 |       if$
 749 |     }
 750 |   if$
 751 |   output.year.check				% special for apalike
 752 |   new.block
 753 |   format.btitle "title" output.check
 754 |   crossref missing$
 755 |     { format.bvolume output	% standalone entry: volume, chapter/pages, number/series, publisher, address
 756 |       format.chapter.pages "chapter and pages" output.check
 757 |       new.block
 758 |       format.number.series output
 759 |       new.sentence
 760 |       publisher "publisher" output.check
 761 |       address output
 762 |     }
 763 |     { format.chapter.pages "chapter and pages" output.check	% cross-referenced entry: chapter/pages, then cite the parent book
 764 |       new.block
 765 |       format.book.crossref output.nonnull
 766 |     }
 767 |   if$
 768 |   format.edition output
 769 |   format.refs	% helper defined earlier in this file; what it emits is not visible from here
 770 |   new.block
 771 |   note output
 772 |   fin.entry
 773 | }
 774 | 
 775 | FUNCTION {incollection}	% entry-type function for incollection entries
 776 | { output.bibitem
 777 |   format.authors "author" output.check
 778 |   author format.key output				% special for
 779 |   output.year.check					% apalike
 780 |   new.block
 781 |   format.title "title" output.check
 782 |   new.block
 783 |   crossref missing$
 784 |     { format.in.ed.booktitle "booktitle" output.check	% standalone entry: full description of the containing book
 785 |       format.bvolume output
 786 |       format.number.series output
 787 |       format.chapter.pages output
 788 |       new.sentence
 789 |       publisher "publisher" output.check
 790 |       address output
 791 |       format.edition output
 792 |     }
 793 |     { format.incoll.inproc.crossref output.nonnull	% cross-referenced entry: "In \cite{...}", then chapter/pages
 794 |       format.chapter.pages output
 795 |     }
 796 |   if$
 797 |   format.refs	% helper defined earlier in this file; what it emits is not visible from here
 798 |   new.block
 799 |   note output
 800 |   fin.entry
 801 | }
 802 | 
 803 | FUNCTION {inproceedings}	% entry-type function for inproceedings entries (also reached through `conference')
 804 | { output.bibitem
 805 |   format.authors "author" output.check
 806 |   author format.key output				% special for
 807 |   output.year.check					% apalike
 808 |   new.block
 809 |   format.title "title" output.check
 810 |   new.block
 811 |   crossref missing$
 812 |     { format.in.ed.booktitle "booktitle" output.check	% standalone entry: full description of the proceedings
 813 |       format.bvolume output
 814 |       format.number.series output
 815 |       format.pages output
 816 |       address output					% for apalike
 817 |       new.sentence					% there's no year
 818 |       organization output				% here so things
 819 |       publisher output					% are simpler
 820 |     }
 821 |     { format.incoll.inproc.crossref output.nonnull	% cross-referenced entry: "In \cite{...}", then pages
 822 |       format.pages output
 823 |     }
 824 |   if$
 825 |   format.refs	% helper defined earlier in this file; what it emits is not visible from here
 826 |   new.block
 827 |   note output
 828 |   fin.entry
 829 | }
 830 | 
 831 | FUNCTION {conference} { inproceedings }	% `conference' entries are formatted exactly like `inproceedings'
 832 | 
 833 | FUNCTION {manual}	% entry-type function for manual entries; only the title is checked via output.check
 834 | { output.bibitem
 835 |   format.authors output
 836 |   author format.key output				% special for
 837 |   output.year.check					% apalike
 838 |   new.block
 839 |   format.btitle "title" output.check
 840 |   organization address new.block.checkb	% helper defined earlier; presumably opens a block only if one of the two is non-empty -- confirm
 841 |   organization output
 842 |   address output
 843 |   format.edition output
 844 |   format.refs	% helper defined earlier in this file; what it emits is not visible from here
 845 |   new.block
 846 |   note output
 847 |   fin.entry
 848 | }
 849 | 
 850 | FUNCTION {mastersthesis}	% entry-type function for mastersthesis entries
 851 | { output.bibitem
 852 |   format.authors "author" output.check
 853 |   author format.key output				% special for
 854 |   output.year.check					% apalike
 855 |   new.block
 856 |   format.title "title" output.check
 857 |   new.block
 858 |   "Master's thesis" format.thesis.type output.nonnull	% default label; format.thesis.type swaps in the type field if set
 859 |   school "school" output.check
 860 |   address output
 861 |   format.refs	% helper defined earlier in this file; what it emits is not visible from here
 862 |   new.block
 863 |   note output
 864 |   fin.entry
 865 | }
 866 | 
 867 | FUNCTION {misc}	% entry-type function for misc entries (and, via default.type, unknown types); no field goes through output.check
 868 | { output.bibitem
 869 |   format.authors output
 870 |   author format.key output				% special for
 871 |   output.year.check					% apalike
 872 |   new.block
 873 |   format.title output
 874 |   new.block
 875 |   howpublished output
 876 |   format.refs	% helper defined earlier in this file; what it emits is not visible from here
 877 |   new.block
 878 |   note output
 879 |   fin.entry
 880 | }
 881 | 
 882 | FUNCTION {phdthesis}	% entry-type function for phdthesis entries
 883 | { output.bibitem
 884 |   format.authors "author" output.check
 885 |   author format.key output				% special for
 886 |   output.year.check					% apalike
 887 |   new.block
 888 |   format.btitle "title" output.check	% uses format.btitle, whereas mastersthesis uses format.title
 889 |   new.block
 890 |   "PhD thesis" format.thesis.type output.nonnull	% default label; format.thesis.type swaps in the type field if set
 891 |   school "school" output.check
 892 |   address output
 893 |   format.refs	% helper defined earlier in this file; what it emits is not visible from here
 894 |   new.block
 895 |   note output
 896 |   fin.entry
 897 | }
 898 | 
 899 | FUNCTION {proceedings}	% entry-type function for proceedings entries; editors take the author slot
 900 | { output.bibitem
 901 |   format.editors output
 902 |   editor format.key output				% special for
 903 |   output.year.check					% apalike
 904 |   new.block
 905 |   format.btitle "title" output.check
 906 |   format.bvolume output
 907 |   format.number.series output
 908 |   address output				% for apalike
 909 |   new.sentence					% we always output
 910 |   organization output				% a nonempty organization
 911 |   publisher output				% here
 912 |   format.refs	% helper defined earlier in this file; what it emits is not visible from here
 913 |   new.block
 914 |   note output
 915 |   fin.entry
 916 | }
 917 | 
 918 | FUNCTION {techreport}	% entry-type function for techreport entries
 919 | { output.bibitem
 920 |   format.authors "author" output.check
 921 |   author format.key output				% special for
 922 |   output.year.check					% apalike
 923 |   new.block
 924 |   format.title "title" output.check
 925 |   new.block
 926 |   format.tr.number output.nonnull	% "<type> <number>", type defaulting to "Technical Report"
 927 |   institution "institution" output.check
 928 |   address output
 929 |   format.refs	% helper defined earlier in this file; what it emits is not visible from here
 930 |   new.block
 931 |   note output
 932 |   fin.entry
 933 | }
 934 | 
 935 | FUNCTION {unpublished}	% entry-type function for unpublished entries
 936 | { output.bibitem
 937 |   format.authors "author" output.check
 938 |   author format.key output				% special for
 939 |   output.year.check					% apalike
 940 |   new.block
 941 |   format.title "title" output.check
 942 |   format.refs	% helper defined earlier in this file; what it emits is not visible from here
 943 |   new.block
 944 |   note "note" output.check	% unlike the other entry types here, note goes through output.check
 945 |   fin.entry
 946 | }
 947 | 
 948 | FUNCTION {default.type} { misc }	% entries of an unrecognized type are formatted as misc
 949 | 
 950 | MACRO {jan} {"January"}	% month macros: three-letter names expanding to full English month names
 951 | 
 952 | MACRO {feb} {"February"}
 953 | 
 954 | MACRO {mar} {"March"}
 955 | 
 956 | MACRO {apr} {"April"}
 957 | 
 958 | MACRO {may} {"May"}
 959 | 
 960 | MACRO {jun} {"June"}
 961 | 
 962 | MACRO {jul} {"July"}
 963 | 
 964 | MACRO {aug} {"August"}
 965 | 
 966 | MACRO {sep} {"September"}
 967 | 
 968 | MACRO {oct} {"October"}
 969 | 
 970 | MACRO {nov} {"November"}
 971 | 
 972 | MACRO {dec} {"December"}
 973 | 
 974 | MACRO {acmcs} {"ACM Computing Surveys"}	% journal macros: short names expanding to full journal titles
 975 | 
 976 | MACRO {acta} {"Acta Informatica"}
 977 | 
 978 | MACRO {cacm} {"Communications of the ACM"}
 979 | 
 980 | MACRO {ibmjrd} {"IBM Journal of Research and Development"}
 981 | 
 982 | MACRO {ibmsj} {"IBM Systems Journal"}
 983 | 
 984 | MACRO {ieeese} {"IEEE Transactions on Software Engineering"}
 985 | 
 986 | MACRO {ieeetc} {"IEEE Transactions on Computers"}
 987 | 
 988 | MACRO {ieeetcad}
 989 |  {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"}
 990 | 
 991 | MACRO {ipl} {"Information Processing Letters"}
 992 | 
 993 | MACRO {jacm} {"Journal of the ACM"}
 994 | 
 995 | MACRO {jcss} {"Journal of Computer and System Sciences"}
 996 | 
 997 | MACRO {scp} {"Science of Computer Programming"}
 998 | 
 999 | MACRO {sicomp} {"SIAM Journal on Computing"}
1000 | 
1001 | MACRO {tocs} {"ACM Transactions on Computer Systems"}
1002 | 
1003 | MACRO {tods} {"ACM Transactions on Database Systems"}
1004 | 
1005 | MACRO {tog} {"ACM Transactions on Graphics"}
1006 | 
1007 | MACRO {toms} {"ACM Transactions on Mathematical Software"}
1008 | 
1009 | MACRO {toois} {"ACM Transactions on Office Information Systems"}
1010 | 
1011 | MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"}
1012 | 
1013 | MACRO {tcs} {"Theoretical Computer Science"}
1014 | 
1015 | READ	% load the database entries; the commands below operate on them
1016 | 
1017 | FUNCTION {sortify}	% normalize the string on top of the stack for use in a sort key
1018 | { purify$	% built-in purify$ first
1019 |   "l" change.case$	% then lowercase the result
1020 | }
1021 | 
1022 | INTEGERS { len }	% length of the prefix tested by chop.word
1023 | 
1024 | FUNCTION {chop.word}	% stack on entry: word, len, string; pushes string without the leading word if it starts with it
1025 | { 's :=
1026 |   'len :=
1027 |   s #1 len substring$ =	% compare the first len characters of s with the word still on the stack
1028 |     { s len #1 + global.max$ substring$ }	% match: keep everything after the prefix
1029 |     's	% no match: push s unchanged
1030 |   if$
1031 | }
1032 | 
1033 | %			There are three apalike cases: one person (Jones),
1034 | %			two (Jones and de~Bruijn), and more (Jones et~al.).
1035 | %			This function is much like format.crossref.editors.
1036 | %
1037 | FUNCTION {format.lab.names}	% turn the name list on the stack into the short form used in the label
1038 | { 's :=
1039 |   s #1 "{vv~}{ll}" format.name$	% first person: von part and last name only
1040 |   s num.names$ duplicate$
1041 |   #2 >
1042 |     { pop$ " et~al." * }	% three or more names
1043 |     { #2 <
1044 | 	'skip$	% exactly one name: nothing to append
1045 | 	{ s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =	% two names: is the second the literal "others"?
1046 | 	    { " et~al." * }
1047 | 	    { " and " * s #2 "{vv~}{ll}" format.name$ * }
1048 | 	  if$
1049 | 	}
1050 |       if$
1051 |     }
1052 |   if$
1053 | }
1054 | 
1055 | FUNCTION {author.key.label}	% push the name part of the label: author, else key, else start of the cite key
1056 | { author empty$
1057 |     { key empty$
1058 | 	{ cite$ #1 #3 substring$ }	% last resort: first three characters of the cite key
1059 | 	'key					% apalike uses the whole key
1060 |       if$
1061 |     }
1062 |     { author format.lab.names }
1063 |   if$
1064 | }
1065 | 
1066 | FUNCTION {author.editor.key.label}	% push the name part of the label: author, else editor, else key, else start of the cite key
1067 | { author empty$
1068 |     { editor empty$
1069 | 	{ key empty$
1070 | 	    { cite$ #1 #3 substring$ }	% last resort: first three characters of the cite key
1071 | 	    'key				% apalike uses the whole key
1072 | 	  if$
1073 | 	}
1074 | 	{ editor format.lab.names }
1075 |       if$
1076 |     }
1077 |     { author format.lab.names }
1078 |   if$
1079 | }
1080 | 
1081 | FUNCTION {editor.key.label}	% push the name part of the label: editor, else key, else start of the cite key
1082 | { editor empty$
1083 |     { key empty$
1084 | 	{ cite$ #1 #3 substring$ }	% last resort: first three characters of the cite key
1085 | 	'key			% apalike uses the whole key, no organization
1086 |       if$
1087 |     }
1088 |     { editor format.lab.names }
1089 |   if$
1090 | }
1091 | 
1092 | FUNCTION {calc.label}	% set label to "<names>, <year>", choosing the name source by entry type
1093 | { type$ "book" =
1094 |   type$ "inbook" =
1095 |   or
1096 |     'author.editor.key.label	% book/inbook: author first, then editor
1097 |     { type$ "proceedings" =
1098 | 	'editor.key.label			% apalike ignores organization
1099 | 	'author.key.label			% for labeling and sorting
1100 |       if$
1101 |     }
1102 |   if$
1103 |   ", "							% these three lines are
1104 |   *							% for apalike, which
1105 |   year field.or.null purify$ #-1 #4 substring$		% uses all four digits
1106 |   *
1107 |   'label :=
1108 | }
1109 | 
1110 | FUNCTION {sort.format.names}	% turn the name list on the stack into a sortable string, one chunk per person
1111 | { 's :=
1112 |   #1 'nameptr :=
1113 |   ""	% accumulator for the result
1114 |   s num.names$ 'numnames :=
1115 |   numnames 'namesleft :=
1116 |     { namesleft #0 > }
1117 |     { nameptr #1 >
1118 | 	{ "   " * }	% three spaces separate consecutive people
1119 | 	'skip$
1120 |       if$						% apalike uses initials
1121 |       s nameptr "{vv{ } }{ll{ }}{  f{ }}{  jj{ }}" format.name$ 't := % <= here
1122 |       nameptr numnames = t "others" = and	% a trailing "others" becomes "et al"
1123 | 	{ "et al" * }
1124 | 	{ t sortify * }
1125 |       if$
1126 |       nameptr #1 + 'nameptr :=
1127 |       namesleft #1 - 'namesleft :=
1128 |     }
1129 |   while$
1130 | }
1131 | 
1132 | FUNCTION {sort.format.title}	% turn the title on the stack into a sort key: drop a leading article, then sortify
1133 | { 't :=
1134 |   "A " #2	% arguments for the outermost (last executed) chop.word
1135 |     "An " #3
1136 |       "The " #4 t chop.word	% innermost call runs first: strips a leading "The "
1137 |     chop.word	% then "An "
1138 |   chop.word	% then "A "
1139 |   sortify
1140 |   #1 global.max$ substring$	% truncate to at most global.max$ characters
1141 | }
1142 | 
1143 | FUNCTION {author.sort}	% push the name part of the sort key: author, else key, else "" with a warning
1144 | { author empty$
1145 |     { key empty$
1146 | 	{ "to sort, need author or key in " cite$ * warning$
1147 | 	  ""	% nothing to sort on
1148 | 	}
1149 | 	{ key sortify }
1150 |       if$
1151 |     }
1152 |     { author sort.format.names }
1153 |   if$
1154 | }
1155 | 
1156 | FUNCTION {author.editor.sort}	% push the name part of the sort key: author, else editor, else key, else "" with a warning
1157 | { author empty$
1158 |     { editor empty$
1159 | 	{ key empty$
1160 | 	    { "to sort, need author, editor, or key in " cite$ * warning$
1161 | 	      ""	% nothing to sort on
1162 | 	    }
1163 | 	    { key sortify }
1164 | 	  if$
1165 | 	}
1166 | 	{ editor sort.format.names }
1167 |       if$
1168 |     }
1169 |     { author sort.format.names }
1170 |   if$
1171 | }
1172 | 
1173 | FUNCTION {editor.sort}	% push the name part of the sort key: editor, else key, else "" with a warning
1174 | { editor empty$
1175 |     { key empty$
1176 | 	{ "to sort, need editor or key in " cite$ * warning$
1177 | 	  ""	% nothing to sort on
1178 | 	}
1179 | 	{ key sortify }
1180 |       if$
1181 |     }
1182 |     { editor sort.format.names }
1183 |   if$
1184 | }
1185 | 
1186 | %			apalike uses two sorting passes; the first one sets the
1187 | %			labels so that the `a's, `b's, etc. can be computed;
1188 | %			the second pass puts the references in "correct" order.
1189 | %			The presort function is for the first pass. It computes
1190 | %			label, sort.label, and title, and then concatenates.
1191 | FUNCTION {presort}	% first-pass sort key: sortified label, 4 spaces, sort.label, 4 spaces, title key
1192 | { calc.label
1193 |   label sortify
1194 |   "    "
1195 |   *
1196 |   type$ "book" =
1197 |   type$ "inbook" =
1198 |   or
1199 |     'author.editor.sort	% same type-based choice of name source as in calc.label
1200 |     { type$ "proceedings" =
1201 | 	'editor.sort
1202 | 	'author.sort
1203 |       if$
1204 |     }
1205 |   if$
1206 |   #1 entry.max$ substring$	% for
1207 |   'sort.label :=		% apalike
1208 |   sort.label			% style
1209 |   *
1210 |   "    "
1211 |   *
1212 |   title field.or.null
1213 |   sort.format.title
1214 |   *
1215 |   #1 entry.max$ substring$	% clip the whole key to entry.max$ characters
1216 |   'sort.key$ :=
1217 | }
1218 | 
1219 | ITERATE {presort}	% compute label, sort.label and sort.key$ for every entry
1220 | 
1221 | SORT		% by label, sort.label, title---for final label calculation
1222 | 
1223 | STRINGS { last.label next.extra }	% apalike labels are only for the text;
1224 | 
1225 | INTEGERS { last.extra.num }		% there are none in the bibliography
1226 | 
1227 | FUNCTION {initialize.extra.label.stuff}	% and hence there is no `longest.label'
1228 | { #0 int.to.chr$ 'last.label :=	% start value: the character with code 0; forward.pass compares it against each label
1229 |   "" 'next.extra :=
1230 |   #0 'last.extra.num :=
1231 | }
1232 | 
1233 | FUNCTION {forward.pass}	% in sorted order, give repeated labels the suffix letters "b", "c", ...
1234 | { last.label label =
1235 |     { last.extra.num #1 + 'last.extra.num :=	% same label as the previous entry: advance to the next letter
1236 |       last.extra.num int.to.chr$ 'extra.label :=
1237 |     }
1238 |     { "a" chr.to.int$ 'last.extra.num :=	% new label: restart at "a" but leave this entry's suffix empty for now
1239 |       "" 'extra.label :=
1240 |       label 'last.label :=
1241 |     }
1242 |   if$
1243 | }
1244 | 
1245 | FUNCTION {reverse.pass}	% in reverse order, add the missing "a" suffix and append suffixes to the labels
1246 | { next.extra "b" =	% the entry visited just before this one got "b", so this one is the "a"
1247 |     { "a" 'extra.label := }
1248 |     'skip$
1249 |   if$
1250 |   label extra.label * 'label :=	% final label = label followed by its suffix (possibly empty)
1251 |   extra.label 'next.extra :=	% remembered for the next entry visited
1252 | }
1253 | 
1254 | EXECUTE {initialize.extra.label.stuff}	% reset the suffix bookkeeping once
1255 | 
1256 | ITERATE {forward.pass}	% visit entries in sorted order
1257 | 
1258 | REVERSE {reverse.pass}	% visit entries in the opposite order
1259 | 
1260 | %				Now that the label is right we sort for real,
1261 | %				on sort.label then year then title.  This is
1262 | %				for the second sorting pass.
1263 | FUNCTION {bib.sort.order}	% second-pass sort key: sort.label, 4 spaces, sortified year, 4 spaces, title key
1264 | { sort.label
1265 |   "    "
1266 |   *
1267 |   year field.or.null sortify
1268 |   *
1269 |   "    "
1270 |   *
1271 |   title field.or.null
1272 |   sort.format.title
1273 |   *
1274 |   #1 entry.max$ substring$	% clip the whole key to entry.max$ characters
1275 |   'sort.key$ :=
1276 | }
1277 | 
1278 | ITERATE {bib.sort.order}	% recompute sort.key$ for every entry
1279 | 
1280 | SORT		% by sort.label, year, title---giving final bibliography order
1281 | 
1282 | FUNCTION {begin.bib}	% write the database preamble (if any) and open the thebibliography environment
1283 | { preamble$ empty$				% no \etalchar in apalike
1284 |     'skip$
1285 |     { preamble$ write$ newline$ }
1286 |   if$
1287 |   "\begin{thebibliography}{}" write$ newline$		% no labels in apalike
1288 | }
1289 | 
1290 | EXECUTE {begin.bib}	% emit the bibliography header
1291 | 
1292 | EXECUTE {init.state.consts}	% function defined earlier in this file
1293 | 
1294 | ITERATE {call.type$}	% for each entry, run the function named after its entry type
1295 | 
1296 | FUNCTION {end.bib}	% close the thebibliography environment
1297 | { newline$
1298 |   "\end{thebibliography}" write$ newline$
1299 | }
1300 | 
1301 | EXECUTE {end.bib}	% emit the bibliography footer
1302 | 


--------------------------------------------------------------------------------
/appendix.tex:
--------------------------------------------------------------------------------
  1 | % v2-acmsmall-sample.tex, dated March 6 2012
  2 | % This is a sample file for ACM small trim journals
  3 | %
  4 | % Compilation using 'acmsmall.cls' - version 1.3 (March 2012), Aptara Inc.
  5 | % (c) 2010 Association for Computing Machinery (ACM)
  6 | %
  7 | % Questions/Suggestions/Feedback should be addressed to => "acmtexsupport@aptaracorp.com".
  8 | % Users can also go through the FAQs available on the journal's submission webpage.
  9 | %
 10 | % Steps to compile: latex, bibtex, latex latex
 11 | %
 12 | % For tracking purposes => this is v1.3 - March 2012
 13 | 
 14 | \documentclass[prodmode,acmcsur]{acmsmall} % Aptara syntax
 15 | 
 16 | % Package to generate and customize Algorithm as per ACM style
 17 | \usepackage[ruled]{algorithm2e} 
 18 | \renewcommand{\algorithmcfname}{ALGORITHM}
 19 | \SetAlFnt{\small}
 20 | \SetAlCapFnt{\small}
 21 | \SetAlCapNameFnt{\small}
 22 | \SetAlCapHSkip{0pt}
 23 | \IncMargin{-\parindent}
 24 | 
 25 | % Metadata Information
 26 | \acmVolume{0}
 27 | \acmNumber{0}
 28 | \acmArticle{0}
 29 | \acmYear{0000}
 30 | \acmMonth{0}
 31 | 
 32 | % Copyright
 33 | %\setcopyright{acmcopyright}
 34 | %\setcopyright{acmlicensed}
 35 | %\setcopyright{rightsretained}
 36 | %\setcopyright{usgov}
 37 | %\setcopyright{usgovmixed}
 38 | %\setcopyright{cagov}
 39 | %\setcopyright{cagovmixed}
 40 | 
 41 | \input{common}
 42 | 
 43 | % DOI
 44 | \doi{0000001.0000001}
 45 | 
 46 | %ISSN
 47 | \issn{1234-56789}
 48 | 
 49 | % Document starts
 50 | \begin{document}
 51 | 
 52 | % Page heads
 53 | \markboth{R. Baldoni, E. Coppa, D. C. D'Elia, C. Demetrescu, and I. Finocchi}{A Survey of Symbolic Execution Techniques}
 54 | 
 55 | % Title portion
 56 | \title{Online Appendix to:\\ A Survey of Symbolic Execution Techniques\\}
 57 | \author{ROBERTO BALDONI
 58 | \affil{\href{http://www.cis.uniroma1.it/}{Cyber Intelligence and Information Security Research Center}, Sapienza}
 59 | EMILIO COPPA
 60 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome}
 61 | DANIELE CONO D'ELIA
 62 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome}
 63 | CAMIL DEMETRESCU
 64 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome}
 65 | IRENE FINOCCHI
 66 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome}
 67 | }
 68 | % NOTE! Affiliations placed here should be for the institution where the
 69 | %       BULK of the research was done. If the author has gone to a new
 70 | %       institution, before publication, the (above) affiliation should NOT be changed.
 71 | %       The authors 'current' address may be given in the "Author's addresses:" block (below).
 72 | %       So for example, Mr. Abdelzaher, the bulk of the research was done at UIUC, and he is
 73 | %       currently affiliated with NASA.
 74 | 
 75 | %\begin{abstract}
 76 | %\end{abstract}
 77 | 
 78 | \begin{comment}
 79 | \begin{CCSXML} % http://dl.acm.org/ccs.cfm
 80 | <ccs2012>
 81 | <concept>
 82 | <concept_id>10011007.10010940.10010992.10010998.10010999</concept_id>
 83 | <concept_desc>Software and its engineering~Software verification</concept_desc>
 84 | <concept_significance>500</concept_significance>
 85 | </concept>
 86 | <concept>
 87 | <concept_id>10011007.10010940.10010992.10010998.10011001</concept_id>
 88 | <concept_desc>Software and its engineering~Dynamic analysis</concept_desc>
 89 | <concept_significance>300</concept_significance>
 90 | </concept>
 91 | <concept>
 92 | <concept_id>10011007.10011074.10011099.10011102.10011103</concept_id>
 93 | <concept_desc>Software and its engineering~Software testing and debugging</concept_desc>
 94 | <concept_significance>300</concept_significance>
 95 | </concept>
 96 | <concept>
 97 | <concept_id>10002978.10003022</concept_id>
 98 | <concept_desc>Security and privacy~Software and application security</concept_desc>
 99 | <concept_significance>100</concept_significance>
100 | </concept>
101 | </ccs2012>
102 | \end{CCSXML}
103 | 
104 | \ccsdesc[500]{Software and its engineering~Software verification}
105 | \ccsdesc[300]{Software and its engineering~Dynamic analysis}
106 | \ccsdesc[300]{Software and its engineering~Software testing and debugging}
107 | \ccsdesc[100]{Security and privacy~Software and application security}
108 | \end{comment}
109 | 
110 | % We no longer use \terms command
111 | %\terms{Design, Algorithms, Performance}
112 | 
113 | %\keywords{Symbolic execution, static analysis, concolic execution, malware analysis}
114 | 
115 | %\acmformat{Roberto Baldoni, Emilio Coppa, Daniele Cono D'Elia, Camil Demetrescu, and Irene Finocchi, 2016. A survey of symbolic execution techniques.}
116 | % At a minimum you need to supply the author names, year and a title.
117 | % IMPORTANT:
118 | % Full first names whenever they are known, surname last, followed by a period.
119 | % In the case of two authors, 'and' is placed between them.
120 | % In the case of three or more authors, the serial comma is used, that is, all author names
121 | % except the last one but including the penultimate author's name are followed by a comma,
122 | % and then 'and' is placed before the final author's name.
123 | % If only first and middle initials are known, then each initial
124 | % is followed by a period and they are separated by a space.
125 | % The remaining information (journal title, volume, article number, date, etc.) is 'auto-generated'.
126 | 
127 | 
128 | %\begin{bottomstuff}
129 | \begin{comment}
130 | Author's addresses: R. Baldoni, E. Coppa, D.C. D'Elia, and C. Demetrescu, Department of Computer, Control, and Management Engineering, Sapienza University of Rome; I. Finocchi, Department of Computer Science, Sapienza University of Rome. 
131 | This work is supported in part by a grant of the Italian Presidency of the Council of Ministers and by the CINI National Laboratory of Cyber Security. % (Consorzio Interuniversitario Nazionale Informatica) 
132 | \end{comment}
133 | %\end{bottomstuff}
134 | 
135 | \maketitle
136 | 
137 | \renewcommand{\thesection}{\Alph{section}}
138 | %\renewcommand\thefigure{\thesection.\arabic{figure}} 
139 | \setcounter{figure}{11} % we have 11 figures in the main article
140 | \setcounter{page}{38}
141 | \renewcommand\thepage{\arabic{page}} 
142 | 
143 | \input{tables}
144 | \input{binary}
145 | \input{applications}
146 | 
147 | % Bibliography
148 | %\bibliographystyle{abstract} 
149 | \bibliographystyle{ACM-Reference-Format-Journals}
150 | \bibliography{symbolic}
151 | 
152 | % History dates
153 | %\received{--- 2016}{--- XXXX}{---- XXXX}
154 | 
155 | \end{document}
156 | 
157 | % End of v2-acmsmall-sample.tex (March 2012) - Gerry Murray, ACM
158 | 
159 | 
160 | 


--------------------------------------------------------------------------------
/applications.tex:
--------------------------------------------------------------------------------
 1 | % !TEX root = appendix.tex
 2 | 
 3 | \section{Applications of Symbolic Execution}
 4 | \label{se:applications}
 5 | 
 6 | \revedit{
 7 | \cite{CGK-ICSE11} observes how the recent explosion of research work in symbolic execution makes for an interesting story about the increasing impact of this program analysis technique since its introduction in the mid '70s. The availability of powerful off-the-shelf SMT solvers and hardware resources, along with advances in symbolic execution techniques to deal with the challenges identified in Section 1.2, facilitated the application of symbolic execution to increasingly large problem instances from many domains.
 8 | 
 9 | %The last decade has witnessed an increasing adoption of symbolic execution techniques not only in the software testing domain, but also to address other compelling engineering problems such as automatic generation of exploits or authentication bypass. We now discuss prominent applications of symbolic execution techniques to these domains. Examples of extensions to other areas can be found, e.g., in~\cite{CGK-ICSE11}.
10 | 
11 | In this section we do not aim at presenting a comprehensive overview of applications of symbolic execution. Our goal is instead to provide the reader with a selection of works that appeared in the last few years and that either incubated novel ideas that might be effective in other domains too (e.g., to deal with the path explosion problem), or significantly affected the state of the art of a specific field.
12 | 
13 | The works we are about to discuss are drawn from four domains: software testing, program understanding, bug exploitation, and authentication bypass. Other fields that have seen uses of symbolic execution, such as automatic filter generation (e.g., \cite{BND-SP06,BOUNCER-SOSP07}) and code analysis (e.g., \cite{HMH-VSTTE12,BCP-USENIXSEC17}), are not covered here. Also, we do not address techniques tailored to programs with concurrent threads (e.g., \cite{BGC-OOPSLA14,GKW-ESEC15}) or floating-point arithmetic (e.g., \cite{RPW-SIGSOFT15,LSC-ASE17}).}
14 | 
15 | %The last decade has witnessed an increasing adoption of symbolic execution techniques not only in the software testing domain, but also to address other compelling engineering problems such as automatic generation of exploits or authentication bypass. We now discuss \iffullver{three prominent}{prominent} applications of symbolic execution techniques to these domains. Examples of extensions to other areas can be found, e.g., in~\cite{CGK-ICSE11}.
16 | 
17 | \subsection{Software Testing}%\mynote{Rendere piu' di ampio respiro il titolo di questa sezione? Keyword: software testing, program understanding}
18 | \label{ss:bug-detection}
19 | 
20 | Software testing strategies typically attempt to execute a program with the intent of finding bugs. As manual test input generation is an error-prone and usually non-exhaustive process, automated testing techniques have drawn a lot of attention over the years. Random testing techniques such as fuzzing are cheap in terms of run-time overhead, but fail to obtain a wide exploration of a program's state space. Symbolic and concolic execution techniques on the other hand achieve a more exhaustive exploration, but they become expensive as the length of the execution grows: for this reason, they usually reveal shallow bugs only.
21 | 
22 | \cite{RK-ICSE07} proposes {\em hybrid concolic testing} for test input generation, which combines random search and concolic execution to achieve both deep program states and wide exploration. The two techniques are interleaved: in particular, when random testing saturates (i.e., it is unable to hit new code coverage points after a number of steps), concolic execution is used to mutate the current program state by performing a bounded depth-first search for an uncovered coverage point. For a fixed time budget, the technique outperforms both random and concolic testing in terms of branch coverage. The intuition behind this approach is that many programs show behaviors where a state can be easily reached through random testing, but then a precise sequence of events -- identifiable by a symbolic engine -- is required to hit a specific coverage point.
23 | 
24 | % which uses preconstraining on the program states to ensure consistency
25 | % fuzzy \revedit
26 | \cite{DRILLER-NDSS16} refines this idea by devising Driller, a vulnerability excavation tool based on {\sc Angr}~\cite{ANGR-SSP16} that interleaves fuzzing and concolic execution to discover memory corruption vulnerabilities. The authors remark that user inputs can be categorized as {\em general} input, which has a wide range of valid values, and {\em specific} input; a check for particular values of a specific input splits an application into {\em compartments}. Driller offloads the majority of unique path discovery to a fuzzing engine, and relies on concolic execution to move across compartments. During the fuzzing phase, Driller marks a number of inputs as interesting (for instance, when an input was the first to trigger some state transition) and once it gets stuck in the exploration, it passes the set of such inputs to a concolic engine, which preconstrains the program states to ensure consistency with the results of the native execution. On the dataset used for the DARPA Cyber Grand Challenge qualifying event, Driller could identify crashing inputs in 77 applications, including both the 68 and 16 applications for which fuzzing and symbolic execution alone succeeded, respectively. For 6 applications, Driller was the only one to detect a vulnerability.
27 | 
28 | % temporaneamente messo qui
29 | %  \cite{QRL-TOSEM12} \revedit
30 | \smallskip
31 | Maintenance of large and complex applications is a very hard task. Fixing bugs can sometimes introduce new and unexpected issues in the software, which in turn may require several hours or even weeks to be detected and properly addressed by the developers. \cite{QRL-TOSEM12} tackles the problem of identifying the root cause of failures during regression testing. Given a program $P$ and a newer revision of the program $P'$, if a testing input $t$ generates a failure in $P'$ but not in $P$, then symbolic execution is used to track the path constraints $\pi$ and $\pi'$ when executing $P$ and $P'$ on the failing input $t$, respectively. Using an SMT solver, a new input $t'$ is generated by solving the formula $\pi \wedge \neg\pi'$. If $t'$ exists (i.e., the formula is satisfiable), then $P'$ has one or more {\em deviations} in the control flow graph with respect to $P$ that can be the root cause of the failure. By carefully tracking branch conditions during symbolic execution, \cite{QRL-TOSEM12} is also able to pinpoint which branches are responsible for these deviations. If $\pi \wedge \neg\pi'$ is unsatisfiable, the symmetric formula $\neg\pi \wedge \pi'$ is evaluated and analogous actions are taken to detect possible branch conditions that may have led to the failure. If $\neg\pi \wedge \pi'$ is also unsatisfiable, the root cause of the problem cannot be determined.
32 | %\revedit{the technique} cannot determine the root cause of the problem.
33 | 
34 | % over, to check \revedit
35 | Another interesting work that targets the problem of software regressions through the use of symbolic execution is~\cite{BOR-ICSE13}. The work introduces an approach called {\em partition-based regression verification} that combines the advantages of both regression verification (RV) and regression testing (RT). Indeed, RV is a very powerful technique for identifying regressions but hardly scales to large programs due to the difficulty in proving behavioral equivalence between the original and the modified program. On the other hand, RT allows for checking a modified program for regressions by testing selected concrete sample inputs, making it more scalable but providing limited verification guarantees. The main intuition behind partition-based regression verification is the identification of {\em differential partitions}. Each differential partition can be seen as a subset of the input space for which the two program versions -- given the same path constraints -- either expose the same output ({\em equivalence-revealing partition}) or produce different results ({\em difference-revealing partition}). For each partition, a test case is generated and added to the regression test suite, which can later be used by a developer for classical RT. Since differential partitions are derived by exploiting symbolic execution, this approach suffers from the common limitations that come with this technique. However, if the exploration is interrupted (e.g., due to excessive time or memory usage), partition-based regression verification can still provide guarantees over the subset of input space that has been covered so far by the detected partitions.
36 | 
37 | \revedit{
38 | Directed incremental symbolic execution (DiSE) is usually used for regression testing. As pointed out in the main article, its strength lies in applying static analyses in synergy with symbolic execution, directing the exploration to the sole code portions affected by changes. \cite{BPR-SPIN13} uses DiSE to generate summaries of behaviors affected by differences, and proves behavioral equivalence of two program versions by comparing the affected behaviors only. Their approach is sound and complete for sequential programs under a given depth bound for the symbolic exploration.}
39 | 
40 | \smallskip
41 | Static data flow analysis tools can significantly help developers track malicious data leaks in software applications. Unfortunately, they often report several alleged bugs that only after a manual inspection can be regarded as false positives. To mitigate this issue,~\cite{ARH-SOAP15} proposes TASMAN, a system that, after performing data-flow analysis to track information leaks, uses symbolic backward execution to test each reported bug. Starting from a leaking statement, TASMAN explores the code backwards, pruning any path that can be proved infeasible. If all the paths starting at the leaking statement are discarded by TASMAN, the bug is deemed a false positive.
42 | 
43 | % . Intuitively, a usage profile can be seen as the distribution over the input space.
44 | % other -> several \revedit
45 | \subsection{Program Understanding}
46 | While symbolic execution is largely employed in testing activities, over the last few years several works (e.g., \cite{GDV-ISSTA12,FPV-ICSE13,CLL-ICSE16}) have shown how it can be valuable also for program understanding activities.
47 | 
48 | \cite{GDV-ISSTA12} introduces {\em probabilistic symbolic execution}, an approach that makes it possible to compute the probability of executing different code portions of a program. This is achieved by exploiting model counting techniques, such as the {\tt LattE}~\cite{LHT-JSC04} toolset, to determine the number of solutions for the different path constraints given by the alternative execution paths of a program.
49 | 
50 | The work by~\cite{FPV-ICSE13} takes a step further by using probabilistic symbolic execution to perform software reliability analysis. Reliability is computed as the probability of executing paths that have been labeled as successful given a usage profile, which represents the input space of all the successfully accomplished external interactions (with the user and with external resources) of the program. Since in general the termination of symbolic execution cannot be guaranteed in the presence of loops, the proposed technique resorts to bounded exploration. Nonetheless, the authors define a metric for evaluating the confidence of their reliability estimation, allowing a developer to increase the bounds in order to improve the confidence value.
51 | 
52 | Of a different flavor is the work by~\cite{CLL-ICSE16}, which uses probabilistic symbolic execution to conduct performance analysis. Based on usage profiles and on path execution probabilities, paths are classified into two types: {\em low-probability} and {\em high-probability}. Initially, high-probability paths are explored in a way that maximizes path diversity, generating a first set of test inputs. In a second phase, low-probability paths are analyzed using symbolic execution, generating a second set of test inputs that should expose executions characterized by the best and by the worst execution times. Finally, the program is executed using the test inputs generated during the two phases, and its running time is measured to generate performance distributions. 
53 | 
54 | Another interesting application of symbolic execution to program understanding is presented in~\cite{PPM-CSF18}. The technique exploits model counting and symbolic execution for computing quantitative bounds on the amount of information that can be leaked by a program through side-channel attacks. 
55 | 	
56 | %As it is based on {\sc Angr}, Driller adopts an index-based memory model as in Section~\ref{ss:index-based-memory} where reads can be symbolic and writes are always concretized. % read/write addresses
57 | 
58 | \subsection{Bug Exploitation}
59 | \label{ss:bug-exploitation}
60 | Bugs are a consequence of the nature of human factors in software development and are everywhere. Those that can be exploited by an attacker should normally be fixed first: systems for automatically and effectively identifying them are thus very valuable.
61 | 
62 | {\sc AEG}~\cite{AEG-NDSS11} employs preconditioned symbolic execution to analyze a potentially buggy program in source form and look for bugs amenable to stack smashing or return-into-libc exploits~\cite{PB-SSP04}, which are popular control hijack attack techniques. The tool augments path constraints with exploitability constraints and queries a constraint solver, generating a concrete exploit when the constraints are satisfiable. The authors devise the {\em buggy-path-first} and {\em loop-exhaustion} strategies (Table~\ref{tab:heuristics}) to prioritize paths in the search. On a suite of 14 Linux applications, {\sc AEG} discovered 16 vulnerabilities, 2 of which were previously unknown, and constructed control hijack exploits for them.
63 | 
64 | {\sc Mayhem}~\cite{MAYHEM-SP12} takes another step forward by presenting the first system for binary programs that is able to identify end-to-end exploitable bugs. It adopts a hybrid execution model based on checkpoints and two components: a concrete executor that injects taint-analysis instrumentation in the code and a symbolic executor that takes over when a tainted branch or jump instruction is met. Exploitability constraints for symbolic instruction pointers and format strings are generated, targeting a wide range of exploits, e.g., SEH-based and jump-to-register ones. Three path selection heuristics help prioritize paths that are most likely to contain vulnerabilities (e.g., those containing symbolic memory accesses or instruction pointers). A virtualization layer intercepts and emulates all the system calls to the host OS, while preconditioned symbolic execution can be used to reduce the size of the search space. Also, restricting symbolic execution to tainted basic blocks only gives very good speedups in this setting, as in the reported experiments more than $95\%$ of the processed instructions were not tainted. {\sc Mayhem} was able to find exploitable vulnerabilities in the 29 Linux and Windows applications considered in the evaluation, 2 of which were previously undocumented. Although the goal in {\sc Mayhem} is to reveal exploitable bugs, the generated simple exploits can likely be transformed in an automated fashion to work in the presence of classical OS defenses such as data execution prevention and address space layout randomization~\cite{Q-SEC11}. 
65 | 
66 | \vspace{-1mm} % TODO
67 | \subsection{Authentication Bypass}
68 | \label{ss:auth-bypass}
69 | Software backdoors are a method of bypassing authentication in an algorithm, a software product, or even in a full computer system. Although sometimes these software flaws are injected by external attackers using subtle tricks such as compiler tampering~\cite{KRS-TR74}, there are reported cases of backdoors that have been surreptitiously installed by the hardware and/or software manufacturers~\cite{CZF-USEC14}, or even by governments~\cite{NSA-BACKDOOR}. 
70 | 
71 | Different works (e.g., \cite{DMR-USEC13,ZBF-NDSS14,FIRMALICE-NDSS15}) have exploited symbolic execution for analyzing the behavior of binary firmwares. Indeed, an advantage of this technique is that it can be used even in environments, such as embedded systems, where the documentation and the source code that are publicly released by the manufacturer are typically very limited or not available at all. For instance,~\cite{FIRMALICE-NDSS15} proposes Firmalice, a binary analysis framework based on {\sc Angr}~\cite{ANGR-SSP16} that can be effectively used for identifying authentication bypass flaws inside firmwares running on devices such as routers and printers. Given a user-provided description of a privileged operation in the device, Firmalice identifies a set of program points that, if executed, force the privileged operation to be performed. The program slice that involves the privileged program points is then symbolically analyzed using {\sc Angr}. If any such point can be reached by the engine, a set of concrete inputs is generated using an SMT solver. These values can then be used to effectively bypass authentication inside the device. On three commercially available devices, Firmalice could detect vulnerabilities in two of them, and determine that a backdoor in the third firmware is not remotely exploitable.


--------------------------------------------------------------------------------
/arxiv.tex:
--------------------------------------------------------------------------------
  1 | % v2-acmsmall-sample.tex, dated March 6 2012
  2 | % This is a sample file for ACM small trim journals
  3 | %
  4 | % Compilation using 'acmsmall.cls' - version 1.3 (March 2012), Aptara Inc.
  5 | % (c) 2010 Association for Computing Machinery (ACM)
  6 | %
  7 | % Questions/Suggestions/Feedback should be addressed to => "acmtexsupport@aptaracorp.com".
  8 | % Users can also go through the FAQs available on the journal's submission webpage.
  9 | %
 10 | % Steps to compile: latex, bibtex, latex latex
 11 | %
 12 | % For tracking purposes => this is v1.3 - March 2012
 13 | 
 14 | \documentclass[10pt,a4paper]{article} % Aptara syntax
 15 | 
 16 | \usepackage{authblk}
 17 | \usepackage{breakcites}
 18 | 
 19 | % Package to generate and customize Algorithm as per ACM style
 20 | \usepackage[ruled]{algorithm2e} 
 21 | \renewcommand{\algorithmcfname}{ALGORITHM}
 22 | \SetAlFnt{\small}
 23 | \SetAlCapFnt{\small}
 24 | \SetAlCapNameFnt{\small}
 25 | \SetAlCapHSkip{0pt}
 26 | \IncMargin{-\parindent}
 27 | 
 28 | \newcommand{\fullver}{}
 29 | \newcommand{\arxivver}{}
 30 | \input{common}
 31 | 
 32 | % Document starts
 33 | \begin{document}
 34 | 
 35 | % Page heads
 36 | \markboth{R. Baldoni, E. Coppa, D. C. D'Elia, C. Demetrescu, and I. Finocchi}{A Survey of Symbolic Execution Techniques}
 37 | 
 38 | % Title portion
 39 | \title{\fontsize{22}{12}\selectfont{A Survey of Symbolic Execution Techniques}}
 40 | \author[1]{Roberto Baldoni}
 41 | \author[2]{Emilio Coppa}
 42 | \author[2]{Daniele Cono D'Elia}
 43 | \author[2]{\authorcr Camil Demetrescu}
 44 | \author[2]{Irene Finocchi}
 45 | \affil[1]{\small\href{http://www.cis.uniroma1.it/}{Cyber Intelligence and Information Security Research Center}, Sapienza University of Rome}
 46 | \affil[2]{\href{https://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome}
 47 | \affil[ ]{{\vskip 1pt}\textit {\{baldoni,coppa,delia,demetres\}@dis.uniroma1.it\\ finocchi@di.uniroma1.it}}
 48 | 
 49 | \date{\vspace{-4mm}}
 50 | 
 51 | \maketitle
 52 | 
 53 | \begin{abstract}
 54 | Many security and software testing applications require checking whether certain properties of a program hold for any possible usage scenario. For instance, a tool for identifying software vulnerabilities may need to rule out the existence of any backdoor to bypass a program's authentication. One approach would be to test the program using different, possibly random inputs. As the backdoor may only be hit for very specific program workloads, automated exploration of the space of possible inputs is of the essence. Symbolic execution provides an elegant solution to the problem, by systematically exploring many possible execution paths at the same time without necessarily requiring concrete inputs. Rather than taking on fully specified input values, the technique abstractly represents them as symbols, resorting to constraint solvers to construct actual instances that would cause property violations. Symbolic execution has been incubated in dozens of tools developed over the last four decades, leading to major practical breakthroughs in a number of prominent software reliability applications. The goal of this survey is to provide an overview of the main ideas, challenges, and solutions developed in the area, distilling them for a broad audience.
 55 | \end{abstract}
 56 | 
 57 | % We no longer use \terms command
 58 | %\terms{Design, Algorithms, Performance}
 59 | 
 60 | %\keywords{Symbolic execution, static analysis, concolic execution, malware analysis}
 61 | 
 62 | %\acmformat{Roberto Baldoni, Emilio Coppa, Daniele Cono D'Elia, Camil Demetrescu, and Irene Finocchi, 2016. A survey of symbolic execution techniques.}
 63 | 
 64 | \iffalse
 65 | \begin{bottomstuff}
 66 | Author's addresses: R. Baldoni, E. Coppa, D.C. D'Elia, and C. Demetrescu, Department of Computer, Control, and Management Engineering, Sapienza University of Rome; I. Finocchi, Department of Computer Science, Sapienza University of Rome. 
 67 | \end{bottomstuff}
 68 | \fi
 69 | 
 70 | 
 71 | 
 72 | % \input{intro}
 73 | % \myinput{executors}
 74 | % \myinput{memory}
 75 | % \myinput{environment}
 76 | % \myinput{loops}
 77 | % \myinput{explosion}
 78 | % \myinput{constraints}
 79 | % \myinput{binary}
 80 | % \input{applications}
 81 | % \input{conclusions}
 82 | % \input{glossary}
 83 | 
 84 | \input{intro}
 85 | \myinput{executors}
 86 | \myinput{memory}
 87 | \myinput{environment}
 88 | \myinput{explosion}
 89 | \myinput{constraints}
 90 | %\myinput{binary}
 91 | %\input{applications}
 92 | \input{hang}
 93 | \input{conclusions}
 94 | 
 95 | \myparagraph{Acknowledgements}
 96 | %This work is partially supported by a grant of the Italian Presidency of Ministry Council and by the CINI  (Consorzio Interuniversitario Nazionale Informatica) Cybersecurity National Laboratory.
 97 | This work is supported in part by a grant of the Italian Presidency of the Council of Ministers and by the CINI (Consorzio Interuniversitario Nazionale Informatica) National Laboratory of Cyber Security.
 98 | 
 99 | 
100 | \input{glossary}
101 | 
102 | \appendix
103 | \input{tables}
104 | \myinput{binary}
105 | \input{applications}
106 | 
107 | 
108 | % Bibliography
109 | %\bibliographystyle{abstract} 
110 | \bibliographystyle{apalike-refs}
111 | \bibliography{symbolic}
112 | 
113 | % History dates
114 | %\received{--- 2016}{--- XXXX}{---- XXXX}
115 | 
116 | \end{document}
117 | 
118 | % End of v2-acmsmall-sample.tex (March 2012) - Gerry Murray, ACM
119 | 
120 | 
121 | 


--------------------------------------------------------------------------------
/binary.tex:
--------------------------------------------------------------------------------
 1 | % !TEX root = appendix.tex
 2 | 
 3 | \section{Symbolic execution of binary code}
 4 | \label{se:symbolic-binary}
 5 | 
 6 | The importance of performing symbolic analysis of program properties on binary code is on the rise for a number of reasons. Binary code analysis is attractive as it reasons on code that will actually execute: not requiring the source code significantly extends the applicability of such techniques (to, e.g., common off-the-shelf proprietary programs, firmwares for embedded systems, and malicious software), and it gives the ground truth important for security applications whereas source code analysis may yield misleading results due to compiler optimizations~\cite{BITBLAZE-ICISS08}. % compiler errors/defects too
 7 | Binary analysis is relevant also for programs written in dynamic languages and executed in runtimes that deeply transform and optimize the code through just-in-time compilation.
 8 | 
 9 | %Also, the recent advances in runtimes for programs written in dynamic languages brought just-in-time compilation to the masses, taking over on interpreters used when no efficient source-to-binary translation of code was statically possible. 
10 | 
11 | 
12 | % [D] In this paragraph perhaps it is not worth mentioning obfuscation, packing and encryption
13 | %Analyzing binary code is commonly seen as a challenging task
14 | \revedit{Working on binary code is often a challenging task for many program analyses} due to its complexity and lack of a high-level semantics. Modern architectures offer complex instruction sets: modeling each instruction can be difficult, especially in the presence of multiple side effects on processor flags to determine branch conditions. The second major challenge comes from the high-level semantics of the source code being lost in the lowering process (Figure~\ref{fig:lowering}), especially when debugging information is absent. Types are not explicitly encoded in binary code: even with register types, it is common to read values assuming a different type (e.g., 8-bit integer) from what was used to store them (e.g., 16-bit integer). Similar considerations can be made for array bounds as well. Also, control flow graph information is not explicitly available, as control flow is performed through jump instructions at both inter- and intra-procedural level. The function abstraction at the binary level does not exist as we intend it at source-code level: functions can be separated in non-contiguous pieces, and code may also call in the middle of a code block generated for a source-level function.
15 | 
16 | In the remainder of this section we provide an overview of how symbolic executors can address some of the most significant challenges in the analysis of binary code.
17 | 
18 | \subsection{Lifting to an Intermediate Representation}
19 | Motivated by the complexity in modeling native instructions and by the variety of architectures on which applications can be deployed (e.g., x86, x86-64, ARM, MIPS), symbolic executors for binary code typically rely on a {\em lifter} that transforms native instructions into an {\em intermediate representation} (IR), also known as {\em bytecode}. Modern compilers such as \iffullver{LLVM~\cite{LLVM-CGO04}}{LLVM} typically generate IR by {\em lowering} the user-provided source code during the first step of compilation, optimize it, and eventually lower it to native code for a specific platform. Source-code symbolic executors can resort to compiler-assisted lowering to reason on bytecode rather than source-language statements: for instance, {\sc KLEE}~\cite{KLEE-OSDI08} reasons on the IR generated by the LLVM compiler for static languages such as C and C++. Figure~\ref{fig:lowering} summarizes the relationships between source code, IR, and binary code. % \mynote{[D] Java?}
20 | 
21 | % encoded as architecture-agnostic
22 | % for expressing
23 | Reasoning at the intermediate representation level allows for encoding program analyses in an architecture-agnostic fashion. Translated instructions will always expose all the side-effects of a native instruction, and support for additional platforms can be added over time. A number of symbolic executors use VEX, the IR of the Valgrind dynamic instrumentation framework~\cite{VALGRIND-PLDI07}. VEX is a RISC-like language designed for program analysis that offers a compact set of instructions to express programs in static single assignment form~\cite{SSA-TOPLAS91}. Lifters are available for both 32-bit and 64-bit ARM, MIPS, PPC, and x86 binaries.
24 | 
25 | \begin{figure}[t!]
26 |   \centering
27 |   \includegraphics[width=.67\columnwidth]{images/compiler} % TODO was 0.7
28 |   \vspace{-2mm}
29 |   \caption{\label{fig:lowering} Lowering and lifting processes in native vs. source code processing.}
30 |   \vspace{-1mm} % TODO
31 | \end{figure}
32 | 
33 | %{\sc Angr}~\cite{ANGR-SSP16} performs analysis directly on the VEX IR
34 | %translating it to a custom language allowed them to simplify the development of their analysis framework
35 | {\sc Angr}~\cite{ANGR-SSP16} performs analysis directly on VEX IR. The authors chose VEX over other IR formats as at that time it was the only choice that offered a publicly available implementation with support for many architectures. Also, they mention that writing a binary lifter can be a daunting task, and a well-documented and program analysis-oriented solution can be a bonus. {\sc BitBlaze}~\cite{BITBLAZE-ICISS08} uses VEX too, although it translates it to a custom intermediate language. The reason for this is that VEX captures the side effects of some instructions only implicitly, such as the {\tt EFLAGS} bits set by instructions of the x86 ISA: translating it to a custom language simplified the development of {\sc BitBlaze}'s analysis framework.
36 | 
37 | % guest operating systems
38 | The authors of {\sc \stwoe}~\cite{CKC-TOCS12} have implemented an x86-to-LLVM-IR lifter in order to use the {\sc KLEE}~\cite{KLEE-OSDI08} symbolic execution engine for whole-system symbolic analysis of binary code in a virtualized environment. The translation is transparent to both the guest operating system and KLEE, thus enabling the analysis of binaries using the full power of {\sc KLEE}. Another x86-to-LLVM-IR lifter that can be used to run {\sc KLEE} on binary code is {\tt mcsema}\footnote{\url{https://github.com/trailofbits/mcsema}.}.
39 | 
40 | \subsection{Reconstructing the Control Flow Graph}
41 | 
42 | A control flow graph (CFG) can provide valuable information for a symbolic executor as it captures the set of potential control flow transfers for all feasible execution paths. A fundamental issue that arises when reconstructing CFGs for binaries is that the possible targets of an indirect jump may not be identified correctly. Direct jumps are straightforward to process: as they encode their targets explicitly in the code, successor basic blocks can be identified and visited until no new edge is found. The target of an indirect jump is determined instead at run time: it might be computed by carrying out a calculation (e.g., a jump table) or depend on the current calling context (e.g., a function pointer is passed as argument, or a virtual C++ method is invoked). %We refer the interested reader to ~\cite{ANGR-SP16} for a detailed overview.
43 | 
44 | % [D] we are focusing on CFG reconstruction here rather than on its applications
45 | % In general, not all the analyses based on CFGs require successor nodes to be accurately identified. This property can be exploited to perform further refinements on an initially less accurate CFG using techniques such as Value Set Analysis (VSA)~\cite{VSA-CC04}, which require an input CFG themselves. 
46 | CFG recovery is typically an iterative refinement process based on a number of program analysis techniques. For instance, value-set analysis (VSA)~\cite{VSA-CC04} is a technique that can be used to identify a tight over-approximation of certain program state properties (e.g., the set of possible targets of an indirect jump or a memory write). In {\sc BitBlaze}~\cite{BITBLAZE-ICISS08} an initial CFG is generated by inserting special successor nodes for unresolved indirect jump targets. This choice is conceptually similar to widening a fact to the bottom of a lattice in a data-flow analysis. When an analysis requires more precise information, VSA is then applied on demand. %Indeed, not all the CFG-based analyses require successor nodes to be accurately identified.
47 | 
48 | {\sc Angr}~\cite{ANGR-SSP16} implements two algorithms for CFG recovery. An iterative algorithm starts from the entry point of the program and interleaves a number of techniques to achieve speed and completeness, including VSA, inter-procedural backward program slicing, and symbolic execution of blocks. This algorithm is however rather slow and may miss code portions reachable only through unresolved jump targets. The authors thus devise a fast secondary algorithm that uses a number of heuristics to identify functions based on prologue signatures, and performs simple analyses (e.g., a lightweight alias analysis) to solve a number of indirect jumps. The algorithm is context-insensitive, so it can be used to quickly recover a CFG without a concern for understanding the reachability of functions from one another. 
49 | 
50 | \subsection{Code Obfuscation}
51 | In recent years, code obfuscation has received considerable attention as a cheap way to hinder the understanding of the inner workings of a proprietary program. Obfuscation is employed not only to thwart software piracy and improve software security, but also to avoid detection and resist analysis for malicious software~\cite{UDM-WCRE15,YJW-SSP15}.
52 | 
53 | A significant motivation behind using symbolic/concolic execution in the analysis of malware is to deal with code obfuscations. However, current analysis techniques have trouble getting around some of those obfuscations, leading to imprecision and/or excessive resource usage~\cite{YD-CCS15}. For instance, obfuscation tools can transform conditional branches into indirect jumps that symbolic analyses find difficult to analyze, while run-time code self-modification might conceal conditional jumps on symbolic values so that they are missed by the analysis.
54 | 
55 | A few works have described obfuscation techniques aiming at thwarting symbolic execution. \cite{SLG-NDSS08} uses one-way hash functions to devise a {\em conditional code obfuscation} scheme that makes it hard to identify the values of symbolic variables for which branch conditions are satisfied. They also present an encryption scheme for the code to execute based on a key derived from the value that satisfies a branch condition. %Although this approach has a few limitations (for instance, it can be applied to equality tests only, and is easy to detect), it represents the first work aiming at defeating symbolic execution-based malware analyzers.
56 | \cite{WMJ-ESORICS11} takes a step forward by proposing an obfuscation technique that is effective \iffullver{in spite of the fact that it uses}{despite using} linear operations only, for which symbolic execution usually works well. %In particular, the authors take advantage of the limitations of symbolic execution in analyzing loops:
57 | The obfuscation tool inserts a simple loop incorporating an unsolved mathematical conjecture that converges to a known value after a number of iterations, and the produced result is then combined with the original branch condition. %Conjectures are chosen in a way that a symbolic engine would not have to discard the generated constraints for their complexity (e.g., no floating-point or non-linear operations are performed).
58 | 
59 | \cite{HOT-FPS15} presents BE-PUM, a tool to generate a precise CFG in the presence of obfuscation techniques that are common in the malware domain, including indirect jumps, structured exception handlers (SEHs), overlapping instructions, and self-modifying code. \iffullver{While engines such as {\sc BitBlaze}~\cite{BITBLAZE-ICISS08} typically rely on existing disassemblers like IDA Pro\footnote{\url{https://www.hex-rays.com/products/ida/}.} for obfuscated code, BE-PUM relies on concolic execution for deobfuscation, using a binary emulator for the user process and stubs for API calls.}{While engines such as {\sc BitBlaze}~\cite{BITBLAZE-ICISS08} typically rely on disassemblers like IDA Pro\footnote{\url{https://www.hex-rays.com/products/ida/}.}, BE-PUM relies on concolic execution to deobfuscate code, using a binary emulator for the user process and stubs for API calls.} % TODO check fullver "for deobfuscation"
60 | 
61 | \cite{YD-CCS15} discusses the limitations of symbolic execution in the presence of three generic obfuscation techniques: (1) conditional-to-indirect jump transformation, also known as {\em symbolic jump problem}~\cite{SAB-SP10}; (2) conditional-to-conditional jump transformation, where the predicate is deeply changed; and (3) symbolic code, when code modification is carried out using an input-derived value. The authors show how resorting to bit-level taint analysis and architecture-aware constraint generation can allow symbolic execution to circumvent such obfuscations.


--------------------------------------------------------------------------------
/common.tex:
--------------------------------------------------------------------------------
  1 | % !TEX root = main.tex
  2 | \usepackage{a4wide}
  3 | \usepackage{listings}
  4 | \usepackage{comment}
  5 | \usepackage{amsmath}
  6 | \usepackage{graphicx}
  7 | \usepackage{amssymb}
  8 | \usepackage{url}
  9 | \usepackage{hyperref}
 10 | \usepackage{float}
 11 | \usepackage{lipsum}
 12 | \usepackage{caption}
 13 | \usepackage{subcaption}
 14 | \usepackage{adjustbox}
 15 | \usepackage{framed}
 16 | \usepackage{multirow}
 17 | \usepackage{framed}
 18 | \usepackage{enumitem}
 19 | \usepackage{epigraph}
 20 | \usepackage{wasysym} % \brokenvert
 21 | \usepackage{wrapfig}
 22 | 
 23 | \usepackage[usenames, dvipsnames]{xcolor}
 24 | 
 25 | % commands
 26 | %\newcommand{\fullver}{}
 27 | \ifdefined\fullver
 28 | \newcommand{\iffullver}[2]{#1}
 29 | \else
 30 | \newcommand{\iffullver}[2]{#2}
 31 | \fi
 32 | 
 33 | \usepackage{tikz}
 34 | \newcommand*\circled[1]{\tikz[baseline=(char.base)]{
 35 |             \node[shape=circle,draw,inner sep=2pt] (char) {#1};}}
 36 | 
 37 | %\usepackage{titlesec}
 38 |  %\titlespacing{\section}{0pt}{*1.2}{*1.2}
 39 |  %\titlespacing{\subsection}{0pt}{*1.1}{*1.1}
 40 |  %\titlespacing{\subsubsection}{0pt}{*.6}{*.6}
 41 | %\titlespacing{\paragraph}{0pt}{*.6}{*.60}
 42 | %\titleformat{\paragraph}[runin]{\normalsize\bfseries\scshape}{}{}{}
 43 | %Get rid of some extra whitespace in the bibliography
 44 |  %\setlength{\bibsep}{0.75pt}
 45 | %Get rid of some extra whitespace around float (containing figures)
 46 |  %\setlength{\textfloatsep}{4pt plus 0.25pt minus 1pt}
 47 |  %\setlength{\intextsep}{4.0pt plus 0.25pt minus .5pt}
 48 |  %\setlength{\floatsep}{2pt plus 2pt minus 1pt}
 49 |  %\setlength{\abovecaptionskip}{5pt plus 1pt minus 1pt}
 50 | % \setlength{\belowcaptionskip}{10pt plus 1pt minus 1pt}
 51 |  %\setlength{\parskip}{0pt}
 52 | 
 53 | \renewcommand{\epigraphsize}{\footnotesize}
 54 | \setlength{\epigraphwidth}{10cm}
 55 | %\renewcommand{\epigraphrule}{0pt}
 56 | 
 57 | \definecolor{shadecolor}{rgb}{0.92,0.92,0.92}
 58 | 
 59 | \hypersetup{
 60 |   colorlinks = true, % colours links instead of ugly boxes
 61 |   urlcolor = blue, %  colour for external hyperlinks
 62 |   linkcolor = black, % colour of internal links
 63 |   citecolor = black, % colour of citations
 64 |   pdftitle = {A Survey of Symbolic Execution Techniques},
 65 |   pdfauthor= {Roberto Baldoni, Emilio Coppa, Daniele Cono D'Elia, Camil Demetrescu, Irene Finocchi}
 66 | }	
 67 | 
 68 | %\usepackage{xcolor}
 69 | %\newcommand{\myedit}[1]{{\leavevmode\color{red}#1}}
 70 | %\newcommand{\mytempedit}[1]{{\leavevmode\color{blue}#1}}
 71 | %\newcommand{\myedit}[1]{{\color{red}\underline{#1}}}
 72 | %\newcommand{\mytempedit}[1]{{\color{black}#1}}
 73 | \newcommand{\mytempedit}[1]{\ignorespaces#1}
 74 | \newcommand{\revedit}[1]{{\color{blue}#1}}
 75 | \newcommand{\lateredit}[1]{{\color{red}#1}}
 76 | 
 77 | %\newcommand{\mytempedit}[1]{{\color{blue}#1}}
 78 | 
 79 | %\newcommand{\mytempedit}[1]{{\color{blue}\fontfamily{lmdh}\selectfont #1}}
 80 | 
 81 | %\setlength{\parindent}{0pt}
 82 | \setlength{\FrameSep}{2pt}
 83 | \newcommand{\myparagraph}[1]{\medskip\noindent{\bf\small #1.} }
 84 | \newcommand{\myparagraphnoperiod}[1]{\medskip\noindent{\bf\small #1} }
 85 | 
 86 | % EDIT TO ENABLE NOTES
 87 | \newcommand{\mynote}[1]{\ignorespaces} % TODO
 88 | %\newcommand{\mynote}[1]{\marginpar{\raggedleft{\fontfamily{pbk}\selectfont\scriptsize{\em #1}}}}
 89 | 
 90 | \newcommand{\stwoe}{\text{S\textsuperscript{2}E}}
 91 | \newcommand{\myinput}[1]{\ifdefined\internalrep \input{../#1} \else \input{#1} \fi}
 92 | \newcommand{\missing}{\textbf{XXX}}
 93 | %\newcommand{\boxedexample}[1]{\vspace{2mm}\noindent\fbox{\parbox{0.98\textwidth}{{\em Example.} #1}}}
 94 | 
 95 | \ifdefined\arxivver
 96 | \newcommand{\boxedexample}[1]{
 97 | \begin{shaded}
 98 | \noindent{\bf\small Example.} #1
 99 | \end{shaded}
100 | }
101 | \else
102 | \newcommand{\boxedexample}[1]{
103 | %\vspace{-2mm}
104 | \begin{shaded*}
105 | \noindent{\bf\small Example.} #1
106 | \end{shaded*}
107 | %\vspace{-2mm}
108 | }
109 | \fi
110 | 
111 | 
112 | 


--------------------------------------------------------------------------------
/conclusions.tex:
--------------------------------------------------------------------------------
 1 | % !TEX root = main.tex
 2 | 
 3 | \vspace{-2pt} % TODO
 4 | \section{Conclusions}
 5 | \label{se:conclusions}
 6 | 
 7 | \revedit{
 8 | Symbolic execution techniques have evolved significantly in the last decade, with notable applications to compelling problems from several domains like software testing (e.g., test input generation, regression testing), security (e.g., exploit generation, authentication bypass), and code analysis (e.g., program deobfuscation, dynamic software updating). This trend has not only improved existing solutions, but also led to novel ideas and, in some cases, to major practical breakthroughs. For instance, the push for scalable automated program analyses in security has culminated in the 2016 DARPA Cyber Grand Challenge, which hosted systems for detecting and fixing vulnerabilities in unknown software with no human intervention, such as {\sc Angr}~\cite{ANGR-SSP16} and {\sc Mayhem}~\cite{MAYHEM-SP12}, that competed for nearly \$4M in prize money.
 9 | 
10 | %\noindent
11 | This survey has discussed some of the key aspects and challenges of symbolic execution, presenting for a broad audience the basic design principles of symbolic executors and the main optimization techniques. We hope it will help non-experts grasp the key inventions in this exciting line of research, inspiring further work and new ideas.}
12 | 
13 | \specialcomment{online}{
14 | \begingroup
15 | \subsection*{ELECTRONIC APPENDIX}
16 | \phantomsection\addcontentsline{toc}{subsection}{Electronic Appendix}
17 | }{%
18 | \endgroup
19 | }
20 | 
21 | %\begin{online}
22 | \subsection*{ELECTRONIC APPENDIX}
23 | \revedit{
24 | The online appendix of this manuscript discusses a selection of prominent applications of symbolic execution techniques, addresses further challenges that arise in the analysis of programs in binary form, and provides a list of popular symbolic engines.
25 | }
26 | %\end{online}
27 | 
28 | \iffalse
29 | Techniques for symbolic execution have evolved significantly in the last decade, leading to major practical breakthroughs. In 2016, the DARPA Cyber Grand Challenge hosted systems that can detect and fix vulnerabilities in unknown software with no human intervention, such as {\sc Angr}~\cite{ANGR-SSP16} and {\sc Mayhem}~\cite{MAYHEM-SP12}, which won the \$2M first prize. {\sc Mayhem} was also the first autonomous software to play the Capture-The-Flag contest at the DEF CON 24 hacker convention\footnote{\url{https://www.defcon.org/html/defcon-24/dc-24-ctf.html}.}. The event demonstrated that tools for automatic exploit detection based on symbolic execution can be competitive with human experts, paving the road to unprecedented applications %and the rise of start-ups 
30 | that have the potential to shape software %security and 
31 | reliability in the next decades. 
32 | 
33 | This survey has discussed some of the key aspects and challenges of symbolic execution, presenting them for a broad audience. 
34 | To explain the basic design principles of symbolic executors and the main optimization techniques, we have focused on single-threaded applications with integer arithmetic. Symbolic execution of multi-threaded programs is treated, e.g., \iffullver{in~\cite{KPV-TACAS03,SA-HVC06,CLOUD9-EUROSYS11,FHR-ESEC13,BGC-OOPSLA14,GKW-ESEC15}}
35 | {in~\cite{BGC-OOPSLA14,GKW-ESEC15}}, 
36 | %{in~\cite{FHR-ESEC13,BGC-OOPSLA14,GKW-ESEC15}}, 
37 | while techniques for programs that manipulate floating point data are addressed \iffullver{in, e.g., \cite{M-STVR01,BGM-STVR06,LTH-ICTSS10,CCK-EUROSYS11,BVL-POPL13,CCK-TSE14,RPW-SIGSOFT15}}
38 | {in, e.g., \cite{RPW-SIGSOFT15}}.
39 | %{in, e.g., \cite{BVL-POPL13,CCK-TSE14,RPW-SIGSOFT15}}.
40 | 
41 | We hope that this survey will help non-experts grasp the key inventions in the exciting line of research of symbolic execution, inspiring further work and new ideas.
42 | \fi
43 | 
44 | 
45 | %\myparagraph{Acknowledgements}
46 | %This work is partially supported by a grant of the Italian Presidency of Ministry Council and by the CINI  (Consorzio Interuniversitario Nazionale Informatica) Cybersecurity National Laboratory.
47 | %This work is supported in part by a grant of the Italian Presidency of the Council of Ministers and by the CINI (Consorzio Interuniversitario Nazionale Informatica) National Laboratory of Cyber Security.
48 | 
49 | \ifdefined\arxivver
50 | \myparagraph{Live Version of this Article}
51 | We complement the traditional scholarly publication model by maintaining a live version of this article at {\href{https://github.com/season-lab/survey-symbolic-execution}{https://github.com/season-lab/survey-symbolic-execution/}}. The live version incorporates continuous feedback by the community, providing post-publication fixes, improvements, and extensions.
52 | \fi
53 | 


--------------------------------------------------------------------------------
/constraints.tex:
--------------------------------------------------------------------------------
  1 | % !TEX root = main.tex
  2 | 
  3 | \section{Constraint solving}
  4 | \label{se:constraint-solving}
  5 | 
  6 | Constraint satisfaction problems arise in many domains, including analysis, testing, and verification of software programs. Constraint solvers are decision procedures for problems expressed in logical formulas: for instance, the boolean satisfiability problem (also known as SAT) aims at determining whether there exists an interpretation of the symbols of a formula that makes it true. Although SAT is a well-known NP-complete problem, recent advances have moved the boundaries for what is intractable when it comes to practical applications~\cite{SMT-CACM11}. 
  7 | 
  8 | % linear arithmetic inequalities
  9 | Observe that some problems are more naturally described with languages that are more expressive than the one of boolean formulas with logical connectives. For this reason, satisfiability modulo theories (SMT) generalizes the SAT problem with supporting theories to capture formulas involving, for instance, linear arithmetic and operations over \iffullver{arrays (see, e.g., Section~\ref{ss:fully-symbolic-memory}).}{arrays.} SMT solvers map the atoms in an SMT formula to fresh boolean variables: a SAT decision procedure checks the rewritten formula for satisfiability, and a theory solver checks the model generated by the SAT procedure.
 10 | 
 11 | %\mytempedit{In particular, SMT-compliant theory solvers are required to be able to: (i) work incrementally when checking for consistency as novel constraints are added, (ii) support backtracking, i.e., constraint removal, and (iii) provide explanations for inconsistent constraints~\cite{Abraham15}.}
 12 | 
 13 | SMT solvers show several distinctive strengths. Their core algorithms are generic, and can handle complex combinations of many individual constraints. They can work incrementally and backtrack as constraints are added or removed, and provide explanations for inconsistencies. Theories can be added and combined in arbitrary ways, e.g., to reason about arrays of strings. Decision procedures do not need to be carried out in isolation: often, they are profitably combined to reduce the amount of time spent in heavier procedures, e.g., by solving linear parts first in a non-linear arithmetic formula. Incomplete procedures are valuable too: complete but expensive procedures get called only when conclusive answers could not be produced. All these factors allow SMT solvers to tackle large problems that no single procedure can solve in isolation\footnote{We refer the interested reader to~\cite{BKM14} for an exhaustive introduction to SMT solving, and to~\cite{SC2} for a discussion of its distinctive strengths.}.
 14 | % SHORTER VERSION
 15 | % }%\footnote{\cite{BKM14,SC2} provide interesting discussions of the strengths of SMT solvers.}.}
 16 | 
 17 | 
 18 | %\mytempedit{SMT solvers show a number of distinctive strengths. They can work incrementally as constraints are added to formulas, backtrack for constraint removal, and provide explanations for inconsistent constraints. Their core algorithms are generic and can handle complex combinations of many individual constraints. Theories can be added and, more importantly, combined in arbitrary ways, e.g., to reason about arrays of strings. Decision procedures are not required to be carried out in isolation: often, they can profitably be combined to reduce the amount of time spent in heavier procedures, e.g., by solving linear problem parts first for a non-linear arithmetic formula. Incomplete procedures are valuable too: complete but expensive procedures get called only when conclusive answers could not be produced. The combination of these factors allows SMT solvers to tackle large problems that no single procedure can solve in isolation\footnote{We refer the interested reader to~\cite{BKM14} for an exhaustive introduction to SMT solving, and to~\cite{SC2} for a discussion of its distinctive strengths.}.}
 19 | 
 20 | % STP~\cite{STP-CAV07,STP-TR07} solver
 21 | % {\sc MineSweeper}~\cite{MineSweeper-BOTNET08}, and {\sc AEG}~\cite{AEG-NDSS11}
 22 | In a symbolic executor, constraint solving plays a crucial role in checking the feasibility of a path, generating assignments to symbolic variables, and verifying assertions.
 23 | %
 24 | Over the years, different solvers have been employed by symbolic executors, depending on the supported theories and the relative performance at the time. For instance, the STP~\cite{STP-CAV07} solver has been employed in, e.g., {\sc EXE}~\cite{EXE-CCS06}, {\sc KLEE}~\cite{KLEE-OSDI08}, and {\sc AEG}~\cite{AEG-NDSS11}, which all leverage its support for bit-vector and array theories. Other executors such as {\sc Java PathFinder}~\cite{PATHFINDER-ASE10} have complemented SMT solving with additional decision procedures (e.g., libraries for constraint programming~\cite{CHOCO}) and heuristics to handle complex non-linear mathematical constraints~\cite{CORAL-NFM11}.
 25 | 
 26 | Recently, Z3~\cite{Z3-TACS08} has emerged as a leading solution for SMT solving. Developed at Microsoft Research, Z3 offers cutting-edge performance and supports a large number of theories, including bit-vectors, arrays, quantifiers, uninterpreted functions, linear integer and real arithmetic, and non-linear arithmetic. 
 27 | %
 28 | %Effective support for strings has been recently offered by Z3-str~\cite{ZZG-FSE13}, an extension of Z3 that makes it possible to treat string as a primitive type, allowing the solver to reason on common string operations such as concatenation, substring, and replacement.
 29 | Its Z3-str~\cite{ZZG-FSE13} extension makes it possible to treat also strings as a primitive type, allowing the solver to reason on common string operations such as concatenation, substring, and replacement.
 30 | %
 31 | Z3 is employed in most recently appeared symbolic executors such as {\sc Mayhem}~\cite{MAYHEM-SP12}, {\sc SAGE}~\cite{SAGE-QUEUE12}, and {\sc Angr}~\cite{ANGR-SSP16}. Due to the extensive number of supported theories in Z3, such executors typically do not employ additional decision procedures.
 32 | 
 33 | %The two most popular solvers used in symbolic executors are STP and Z3. STP~\cite{STP-CAV07,STP-TR07} is an SMT solver with bitvector and array theories initially developed at Stanford and employed in, e.g., {\sc EXE}~\cite{EXE-CCS06}, {\sc KLEE}~\cite{KLEE-OSDI08}, {\sc MineSweeper}~\cite{MineSweeper-BOTNET08}, and {\sc AEG}~\cite{AEG-NDSS11}. Z3~\cite{Z3-TACS08} is an SMT solver developed at Microsoft with support for nonlinear arithmetic, bitvector, and array theories, and is used in, e.g., {\sc Mayhem}~\cite{MAYHEM-SP12}, {\sc SAGE}~\cite{SAGE-QUEUE12}, and {\sc Angr}~\cite{ANGR-SSP16}. CVC3~\cite{CVC3-CAV07} is another SMT solver that supports theories for linear arithmetic, bitvectors, arrays, and quantifiers, and is employed in {\sc Java PathFinder}~\cite{PATHFINDER-ASE10} along with CHOCO~\cite{CHOCO} for integer/real constraints and CORAL~\cite{CORAL-NFM11} for complex mathematical constraints. Modern symbolic executors can typically choose between different underlying solvers through a common API, and also resort to a native interface to a specific solver for better performance.
 34 | 
 35 | %only for efficiency reasons.
 36 | 
 37 | %For instance, many solvers have the development of ~\cite{PATHFINDER-ASE10} can use a large number of SMT solvers, including Yices, 
 38 | %~\cite{YICES-CAV06} is an incremental solver with support for rational and integer linear arithmetic, bitvectors, and arrays, and was originally used in 
 39 | %In Table~\ref{tab:solvers} we report a number of constraint solving tools used in popular symbolic execution engines.
 40 | 
 41 | % feasibility or applicability? TODO
 42 | However, despite the significant advances observed over the past few years -- which also made symbolic execution practical in the first place~\cite{CS-CACM13} -- constraint solving remains one of the main obstacles to the scalability of symbolic execution engines, and also hinders its feasibility in the face of constraints that involve expensive theories (e.g., non-linear arithmetic) or opaque library calls.
 43 | 
 44 | %\subsection{Optimization Techniques}
 45 | %\label{ss:constraint-opt}
 46 | 
 47 | % handling or skipping over
 48 | In the remainder of this section, we address different techniques to extend the range of programs \iffullver{that can be handled by}{amenable to} symbolic execution and to optimize the performance of constraint solving. Prominent approaches consist in: (i) reducing the size and complexity of the constraints to check, (ii) unburdening the solver by, e.g., resorting to constraint solution caching, deferring of \iffullver{constraint solver queries}{solver queries}, or concretization, and (iii) augmenting symbolic execution to handle constraints problematic for decision procedures.
 49 | 
 50 | %We conclude by pointing out potential directions to improve support for non-linear arithmetic}.
 51 | 
 52 | %\mytempedit{and (iii) augmenting symbolic execution with techniques aimed at handling constraints that are problematic for the underlying decision procedure. We conclude the section by pointing out potential research directions to improve support for non-linear arithmetic}.
 53 | 
 54 | %: (i) {\em constraint reduction} techniques aim at simplifying constraints fed to a solver by rewriting them into a shorter form: (ii) techniques for {\em reuse of constraint solutions} explore the space-time trade-off of retrieving previously computed query results rather than repeating expensive satisfiability checks.
 55 | 
 56 | \myparagraph{Constraint Reduction} 
 57 | A common optimization approach followed by both solvers and symbolic executors is to reduce constraints into simpler forms. For example, the {\em expression rewriting} optimization can apply classical techniques from optimizing compilers such as constant folding, strength reduction, and simplification of linear expressions (see, e.g., {\sc KLEE}~\cite{KLEE-OSDI08}).
 58 | 
 59 | {\sc EXE}~\cite{EXE-CCS06} introduces a {\em constraint independence} optimization that exploits the fact that a set of constraints can frequently be divided into multiple independent subsets of constraints. This optimization interacts well with query result caching strategies, and offers an additional advantage when an engine asks the solver about the satisfiability of a specific constraint, as it removes irrelevant constraints from the query. In fact, independent branches, which tend to be frequent in real programs, could lead to unnecessary constraints that would get quickly accumulated.
 60 | 
 61 | Another fact that can be exploited by reduction techniques is that the natural structure of programs can lead to the introduction of more specific constraints for some variables as the execution proceeds. Since path conditions are generated by conjoining new terms to an existing sequence, it might become possible to rewrite and optimize existing constraints. For instance, adding an equality constraint of the form $x=5$ enables not only the simplification to true of other constraints over the value of the variable (e.g., $x>0$), but also the substitution of the symbol $x$ with the associated concrete value in the other subsequent constraints involving it. The latter optimization is also known as {\em implied value concretization} and, for instance, it is employed by {\sc KLEE}~\cite{KLEE-OSDI08}.
 62 | 
 63 | In a similar spirit, {\sc \stwoe}~\cite{CKC-TOCS12} introduces a bitfield-theory expression simplifier to replace with concrete values parts of a symbolic variable that bit operations mask away. For instance, for any 8-bit symbolic value $v$, the most significant bit in the value of expression $v\,|\,10000000_2$ is always 1. The simplifier can propagate information across the tree representation of an expression, and if each bit in its value can be determined, the expression is replaced with the corresponding constant.
 64 |  
 65 | %path conditions in a symbolic executor are typically generated by conjoining a new term to an existing (and possibly satisfiable) sequence of constraints. As the exploration proceeds, the natural structure of programs means that constraints might become more specific for some variables, and constraints can be rewritten accordingly. 
 66 | 
 67 | %\subsubsection{Reuse of Constraint Solutions}
 68 | %\label{ss:constraint-reuse}
 69 | 
 70 | %\subsection{Unburdening the Constraint Solver} 
 71 | %\label{ss:solver-unburdening}
 72 | 
 73 | \myparagraph{Reuse of Constraint Solutions} 
 74 | The idea of reusing previously computed results to speed up constraint solving can be particularly effective in the setting of a symbolic executor, especially when combined with other techniques such as constraint independence optimization. Most reuse approaches for constraint solving are currently based on semantic or syntactic equivalence of the constraints.
 75 | 
 76 | {\sc EXE}~\cite{EXE-CCS06} caches the results of constraint solutions and satisfiability queries in order to reduce as much as possible the need for calling the solver. A cache is handled by a server process that can receive queries from multiple parallel instances of the execution engine, each exploring a different program state.
 77 | 
 78 | {\sc KLEE}~\cite{KLEE-OSDI08} implements an incremental optimization strategy called {\em counterexample caching}. Using a cache, constraint sets are mapped to concrete variable assignments, or to a special null value when a constraint set is unsatisfiable. When an unsatisfiable set in the cache is a subset of a given constraint set $S$, $S$ is deemed unsatisfiable as well. Conversely, when the cache contains a solution for a superset of $S$, the solution trivially satisfies $S$ too. Finally, when the cache contains a solution for one or more subsets of $S$, the algorithm tries substituting in all the solutions to check whether a satisfying solution for $S$ can be found.
 79 | 
 80 | {\em Memoized symbolic execution}~\cite{MEMO-ISSTA12} is motivated by the observation that symbolic execution often results in re-running largely similar sub-problems, e.g., finding a bug, fixing it, and then testing the program again to check if the fix was effective. The choices taken during path exploration are compactly encoded in a prefix tree, opening up the possibility to reuse previously computed results in successive runs.
 81 | %  in a trie-based data structure
 82 | 
 83 | The Green framework~\cite{GREEN-FSE12} explores constraint solution reuse across runs of not only the same program, but also similar programs, different programs, and different analyses. Constraints are distilled into their essential parts through a {\em slicing} transformation and represented in a canonical form to achieve good reuse, even within a single analysis run. \cite{JGY-ISSTA15} presents an extension to the framework that exploits logical implication relations between constraints to support constraint reuse and faster execution times.
 84 | 
 85 | %\subsection{Other Optimizations in Symbolic Executors}
 86 | %\subsection{Reducing the Symbolic Executor's Pressure on Constraint Solvers}
 87 | %\label{ss:reducing-constraint-solver-pressure}
 88 | 
 89 | %In this section we present a number of other optimizations that become possible in the setting of a symbolic executor to reduce the time spent in the constraint solver.
 90 | 
 91 | \myparagraph{Lazy Constraints}
 92 | \cite{UCKLEE-USEC15} adopts a timeout approach for constraint solver queries. In their initial experiments, the authors traced most timeouts to symbolic division and remainder operations, with the worst cases occurring when an unsigned remainder operation had a symbolic value in the denominator.
 93 | They thus implemented a solution that works as follows: when the executor encounters a branch statement involving an expensive symbolic operation, it will take both the true and false branches and add a {\em lazy} constraint on the result of the expensive operation to the path conditions. When the exploration reaches a state that satisfies some goal (e.g., an error is found), the algorithm will check for the feasibility of the path, and suppress it if deemed unreachable in a real execution.
 94 | 
 95 | Compared to the {\em eager} approach of checking the feasibility of a branch as encountered (Section~\ref{ss:unrealizable-paths}), a lazy strategy may lead to a larger number of active states, and in turn to more solver queries. However, the authors report that the delayed queries are in many cases more efficient than their eager counterparts: the path constraints added after a lazy constraint can in fact narrow down the solution space for the solver.
 96 | 
 97 | \begin{figure}[t]
 98 |   \begin{center}
 99 |   \begin{subfigure}{.43\textwidth}
100 |     \vspace{0mm}
101 |     \begin{lstlisting}[basicstyle=\ttfamily\scriptsize]
102 |     1. void test(int x, int y) {
103 |     2.    if (non_linear(y) == x) 
104 |     3.      if (x > y + 10) ERROR; }
105 |     \end{lstlisting}
106 |     %\vspace{8.5mm}
107 |     %\caption{}
108 |   \end{subfigure}%
109 |     \begin{subfigure}{.43\textwidth}
110 |     %\vspace{-5.2mm}
111 |     \begin{lstlisting}[basicstyle=\ttfamily\scriptsize]
112 |       4. int non_linear(int v) {
113 |       5.    return (v*v) % 50;
114 |       6. }
115 |     \end{lstlisting}
116 |     %\vspace{3.5mm}
117 |     %\caption{}
118 |   \end{subfigure}%
119 |   \end{center}
120 |   \vspace{-4.0mm}
121 |   \caption{Example with non-linear constraints.}
122 |   \label{fi:non-linear-constraints}
123 |   \vspace{-2mm}
124 | \end{figure}
125 | 
126 | 
127 | \myparagraph{Concretization}
128 | \cite{CS-CACM13} discusses limitations of classical symbolic execution in the presence of formulas that constraint solvers cannot solve, at least not efficiently. A concolic executor generates some random input for the program and executes it both concretely and symbolically: a possible value from the concrete execution can be used for a symbolic operand involved in a formula that is inherently hard for the solver, albeit at the cost of possibly sacrificing soundness in the exploration. 
129 | %For instance, in the presence of three nested branches with only one being non-linear, {\sc DART}~\cite{DART-PLDI05} starts from a random valid input for the function, and then alters it when symbolically exploring the two linear branches. The work resorts to concretization also to avoid performing expensive or imprecise alias analysis on pointers. % with only one of them being
130 | 
131 | 
132 | \boxedexample{In the code fragment of Figure~\ref{fi:non-linear-constraints}, the engine stores a non-linear constraint of the form $\alpha_x = (\alpha_y*\alpha_y)\,\%\,50$ for the $true$ branch at line 2. A solver that does not support non-linear arithmetic fails to generate any input for the program. However, a concolic engine can exploit concrete values to help the solver. For instance, if $x=3$ and $y=5$ are randomly chosen as initial input parameters, then the concrete execution does not take any of the two branches. Nonetheless, the engine can reuse the concrete value of $y$, simplifying the previous query as $\alpha_x = 25$ due to $\alpha_y = 5$. The straightforward solution to this query can now be used by the engine to explore both branches. Notice that if the value of $y$ is fixed to $5$, then there is no way of generating a new input that takes the first but not the second branch, inducing a false negative. In this case, a trivial solution could be to rerun the program choosing a different value for $y$ (e.g., if $y=2$ then $x=4$, which satisfies the first but not the second branch).   
133 | }
134 | 
135 | 
136 | % suggests to
137 | To partially overcome the incompleteness due to concretization,~\cite{PRV-ISSTA11} suggests {\em mixed concrete-symbolic solving}, which considers {\em all} the path constraints collectable over a path before binding one or more symbols to specific concrete values. Indeed, {\sc DART}~\cite{DART-PLDI05} concretizes symbols based on the path constraints collected up to a target branch. In this manner, a constraint contained in a subsequent branch in the same path is not considered and it may not be satisfiable due to already concretized symbols. If this happens, {\sc DART} restarts the execution with different random concrete values, hoping to be able to satisfy the subsequent branch. The approach presented in~\cite{PRV-ISSTA11} instead requires detecting {\em solvable} constraints along a full path and delaying concretization as much as possible.
138 | 
139 | \myparagraph{Handling Problematic Constraints}
140 | Strong SMT solvers allow executors to handle more path constraints directly, reducing the need to resort to concretization. This also results in a lower risk of incurring a {\em blind commitment} to concrete values~\cite{DA-FSE14}, which happens when the under-approximation of path conditions from a random choice of concrete values for some variables results in an arbitrary restriction of the search space.
141 | \revedit{However, the decision problem for certain classes of constraints is well known to be undecidable, e.g., for non-linear integer arithmetic or for the theory of reals with trigonometric functions, which is often used to model real-world systems.}
142 | %\revedit{However, problems such as non-linear integer arithmetic or the theory of reals together with trigonometric functions are well known to be undecidable.} % SHORT VERSION
143 | %Unfortunately, some constraints remain prohibitive for SMT solvers: for instance, non-linear integer arithmetic is undecidable in general; also, a branch condition might contain calls to opaque library methods such as trigonometric functions that would require special extensions to the solver to reason about.
144 | 
145 | \cite{DA-FSE14} proposes a {\em concolic walk} algorithm that can tackle control-flow dependencies involving non-linear arithmetic and library calls. The algorithm treats assignments of values to variables as a valuation space: the solutions of the linear constraints define a polytope that can be walked heuristically, while the remaining constraints are assigned with a fitness function measuring how close a valuation point is to matching the constraint. An adaptive search is performed on the polytope as points are picked on it and non-linear constraints evaluated on them. Compared to mixed concrete-symbolic solving~\cite{PRV-ISSTA11}, both techniques seek to avoid blind commitment. However, concolic walk does not rely on the solver for obtaining all the concrete inputs needed to evaluate complex constraints, and implements search heuristics that guide the walk on the polytope toward promising regions.
146 | 
147 | % Symcretic execution
148 | % , which determines how close the branch conditions are to being satisfied and alters the concrete inputs to move closer to a full solution
149 | %For instance, if an {\tt assert} statement is guarded by a branch condition that can be proven unsatisfiable, then there is no need to take into account all the other constraints along the path to the entry point to declare the target unreachable. A traditional concolic executor reasons instead about all the constraints along a path with a top-down approach, making it hard to detect the unreachability of a target statement because of constraints ``deep'' in the path.
150 | 
151 | \cite{DA-ASE14} describes {\em symcretic} execution, a novel combination of symbolic backward execution (SBE) (Section~\ref{se:executors}) and forward symbolic execution. The main idea is to divide exploration into two phases. In the first phase, SBE is performed from a target point and a trace is collected for each followed path. If any problematic constraints are met during the backward exploration, the engine marks them as {\em potentially} satisfiable by adding a special event to the trace and continues its reversed traversal. Whenever an entry point of the program is reached along any of the followed paths, the second phase starts. The engine concretely evaluates the collected trace, trying to satisfy any constraint marked as problematic during the first phase. This is done using a heuristic search, such as the concolic walk described above. An advantage of symcretic over classical concolic execution is that it can prevent the exploration of some unfeasible paths. For instance, the backward phase may determine that a statement is guarded by an unsatisfiable branch regardless of how the statement is reached, while a traditional concolic executor would detect the unfeasibility on a per-path basis only when the statement is reached, which is unfavorable for statements ``deep'' in a path.
152 | 
153 | %\myparagraph{Memory Page Size}
154 | %In {\sc \stwoe}~\cite{CKC-TOCS12}, when a symbolic pointer is dereferenced, the engine determines which memory pages are referenced by it and passes their contents to the solver. As large page sizes can overwhelm the solver, {\sc \stwoe} uses small pages of configurable size rather than the default 4KB pages. The authors report significant performance benefits from using pages of smaller size.


--------------------------------------------------------------------------------
/environment.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/environment.tex


--------------------------------------------------------------------------------
/hang.tex:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/hang.tex


--------------------------------------------------------------------------------
/images/blackbox.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/blackbox.odg


--------------------------------------------------------------------------------
/images/blackbox.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/blackbox.pdf


--------------------------------------------------------------------------------
/images/compiler.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/compiler.odg


--------------------------------------------------------------------------------
/images/compiler.odg.new:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/compiler.odg.new


--------------------------------------------------------------------------------
/images/compiler.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/compiler.pdf


--------------------------------------------------------------------------------
/images/compiler.pdf.new:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/compiler.pdf.new


--------------------------------------------------------------------------------
/images/concolic-execution-2.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concolic-execution-2.odg


--------------------------------------------------------------------------------
/images/concolic-execution-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concolic-execution-2.pdf


--------------------------------------------------------------------------------
/images/concolic-execution.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concolic-execution.odg


--------------------------------------------------------------------------------
/images/concolic-execution.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concolic-execution.pdf


--------------------------------------------------------------------------------
/images/concolic-execution_old.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concolic-execution_old.odg


--------------------------------------------------------------------------------
/images/concolic-execution_old.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concolic-execution_old.pdf


--------------------------------------------------------------------------------
/images/concrete-abstract.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concrete-abstract.pdf


--------------------------------------------------------------------------------
/images/concrete-abstract.svg:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 | <!-- Created with Inkscape (http://www.inkscape.org/) -->
  3 | 
  4 | <svg
  5 |    xmlns:dc="http://purl.org/dc/elements/1.1/"
  6 |    xmlns:cc="http://creativecommons.org/ns#"
  7 |    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  8 |    xmlns:svg="http://www.w3.org/2000/svg"
  9 |    xmlns="http://www.w3.org/2000/svg"
 10 |    xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
 11 |    xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
 12 |    id="svg5607"
 13 |    version="1.1"
 14 |    inkscape:version="0.91 r13725"
 15 |    width="1052.5"
 16 |    height="743.75"
 17 |    xml:space="preserve"
 18 |    sodipodi:docname="concrete-abstract.svg"><metadata
 19 |      id="metadata5613"><rdf:RDF><cc:Work
 20 |          rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
 21 |            rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title /></cc:Work></rdf:RDF></metadata><defs
 22 |      id="defs5611"><marker
 23 |        inkscape:isstock="true"
 24 |        style="overflow:visible;"
 25 |        id="marker4861"
 26 |        refX="0.0"
 27 |        refY="0.0"
 28 |        orient="auto"
 29 |        inkscape:stockid="Arrow2Mend"><path
 30 |          transform="scale(0.6) rotate(180) translate(0,0)"
 31 |          d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
 32 |          style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round;stroke:#000000;stroke-opacity:1;fill:#000000;fill-opacity:1"
 33 |          id="path4863" /></marker><marker
 34 |        inkscape:isstock="true"
 35 |        style="overflow:visible;"
 36 |        id="marker6583"
 37 |        refX="0.0"
 38 |        refY="0.0"
 39 |        orient="auto"
 40 |        inkscape:stockid="Arrow2Mend"><path
 41 |          transform="scale(0.6) rotate(180) translate(0,0)"
 42 |          d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
 43 |          style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round;stroke:#000000;stroke-opacity:1;fill:#000000;fill-opacity:1"
 44 |          id="path6585" /></marker><marker
 45 |        inkscape:stockid="Arrow1Mend"
 46 |        orient="auto"
 47 |        refY="0.0"
 48 |        refX="0.0"
 49 |        id="Arrow1Mend"
 50 |        style="overflow:visible;"
 51 |        inkscape:isstock="true"><path
 52 |          id="path4548"
 53 |          d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
 54 |          style="fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1;fill:#000000;fill-opacity:1"
 55 |          transform="scale(0.4) rotate(180) translate(10,0)" /></marker><marker
 56 |        inkscape:stockid="Arrow1Lstart"
 57 |        orient="auto"
 58 |        refY="0.0"
 59 |        refX="0.0"
 60 |        id="Arrow1Lstart"
 61 |        style="overflow:visible"><path
 62 |          id="path4648"
 63 |          d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
 64 |          style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;marker-start:none"
 65 |          transform="scale(0.8) translate(12.5,0)" /></marker><marker
 66 |        inkscape:stockid="Arrow2Lend"
 67 |        orient="auto"
 68 |        refY="0.0"
 69 |        refX="0.0"
 70 |        id="Arrow2Lend"
 71 |        style="overflow:visible;"><path
 72 |          id="path3952"
 73 |          style="font-size:12.0;fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
 74 |          d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
 75 |          transform="scale(1.1) rotate(180) translate(1,0)" /></marker><marker
 76 |        inkscape:stockid="Arrow1Lend"
 77 |        orient="auto"
 78 |        refY="0.0"
 79 |        refX="0.0"
 80 |        id="Arrow1Lend"
 81 |        style="overflow:visible;"><path
 82 |          id="path3934"
 83 |          d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
 84 |          style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;marker-start:none;"
 85 |          transform="scale(0.8) rotate(180) translate(12.5,0)" /></marker><marker
 86 |        inkscape:stockid="Arrow2Lstart"
 87 |        orient="auto"
 88 |        refY="0.0"
 89 |        refX="0.0"
 90 |        id="Arrow2Lstart"
 91 |        style="overflow:visible"><path
 92 |          id="path3949"
 93 |          style="font-size:12.0;fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round"
 94 |          d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
 95 |          transform="scale(1.1) translate(1,0)" /></marker><inkscape:perspective
 96 |        sodipodi:type="inkscape:persp3d"
 97 |        inkscape:vp_x="0 : 0.5 : 1"
 98 |        inkscape:vp_y="0 : 1000 : 0"
 99 |        inkscape:vp_z="1 : 0.5 : 1"
100 |        inkscape:persp3d-origin="0.5 : 0.33333333 : 1"
101 |        id="perspective5615" /></defs><sodipodi:namedview
102 |      pagecolor="#ffffff"
103 |      bordercolor="#666666"
104 |      borderopacity="1"
105 |      objecttolerance="10"
106 |      gridtolerance="10"
107 |      guidetolerance="10"
108 |      inkscape:pageopacity="0"
109 |      inkscape:pageshadow="2"
110 |      inkscape:window-width="1440"
111 |      inkscape:window-height="851"
112 |      id="namedview5609"
113 |      showgrid="true"
114 |      inkscape:zoom="2.46"
115 |      inkscape:cx="186.32506"
116 |      inkscape:cy="520.84305"
117 |      inkscape:window-x="0"
118 |      inkscape:window-y="1"
119 |      inkscape:window-maximized="1"
120 |      inkscape:current-layer="g5617"
121 |      inkscape:snap-center="false"
122 |      inkscape:snap-object-midpoints="false"
123 |      showguides="true"
124 |      inkscape:guide-bbox="true"
125 |      inkscape:snap-bbox="true"
126 |      inkscape:snap-text-baseline="true"
127 |      inkscape:snap-others="true"
128 |      inkscape:bbox-nodes="true"
129 |      inkscape:object-nodes="true"><inkscape:grid
130 |        type="xygrid"
131 |        id="grid3029"
132 |        empspacing="5"
133 |        visible="true"
134 |        enabled="true"
135 |        snapvisiblegridlinesonly="true" /><sodipodi:guide
136 |        orientation="1,0"
137 |        position="-150,500"
138 |        id="guide3732" /><sodipodi:guide
139 |        orientation="1,0"
140 |        position="-150,500"
141 |        id="guide3734" /></sodipodi:namedview><g
142 |      id="g5617"
143 |      inkscape:groupmode="layer"
144 |      inkscape:label="automa"
145 |      transform="matrix(1.25,0,0,-1.25,0,743.75)"><ellipse
146 |        style="opacity:1;fill:#cfcfcf;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:0.31999999;stroke-linecap:square;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
147 |        id="ellipse5778"
148 |        cx="133.84"
149 |        cy="-412"
150 |        transform="scale(1,-1)"
151 |        rx="42.000004"
152 |        ry="11.84" /><flowRoot
153 |        transform="matrix(0.8,0,0,-0.8,0,595)"
154 |        style="font-size:40px;font-style:normal;font-weight:normal;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
155 |        id="flowRoot2925"
156 |        xml:space="preserve"><flowRegion
157 |          id="flowRegion2927"><rect
158 |            y="382.10107"
159 |            x="207.50348"
160 |            height="59.286709"
161 |            width="118.57342"
162 |            id="rect2929" /></flowRegion><flowPara
163 |          id="flowPara2931" /></flowRoot><flowRoot
164 |        transform="matrix(0.8,0,0,-0.8,0,595)"
165 |        style="font-size:40px;font-style:normal;font-weight:normal;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans"
166 |        id="flowRoot2933"
167 |        xml:space="preserve"><flowRegion
168 |          id="flowRegion2935"><rect
169 |            y="363.32693"
170 |            x="187.74124"
171 |            height="99.799294"
172 |            width="182.80069"
173 |            id="rect2937" /></flowRegion><flowPara
174 |          id="flowPara2939" /></flowRoot><flowRoot
175 |        style="font-size:30px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Times New Roman;-inkscape-font-specification:'Times New Roman,'"
176 |        id="flowRoot3789"
177 |        xml:space="preserve"><flowRegion
178 |          id="flowRegion3791"><rect
179 |            y="334.67169"
180 |            x="468.36502"
181 |            height="51.381817"
182 |            width="50.393703"
183 |            id="rect3793" /></flowRegion><flowPara
184 |          id="flowPara3795" /></flowRoot><flowRoot
185 |        xml:space="preserve"
186 |        id="flowRoot3224"
187 |        style="fill:black;stroke:none;stroke-opacity:1;stroke-width:1px;stroke-linejoin:miter;stroke-linecap:butt;fill-opacity:1;font-family:Courier;font-style:normal;font-weight:normal;font-size:12px;-inkscape-font-specification:Courier;font-stretch:normal;font-variant:normal;text-anchor:middle;text-align:center;writing-mode:lr;line-height:100%;letter-spacing:0px;word-spacing:0px;"><flowRegion
188 |          id="flowRegion3226"><rect
189 |            id="rect3228"
190 |            width="107.42156"
191 |            height="181.08206"
192 |            x="386.71762"
193 |            y="17.580261"
194 |            style="-inkscape-font-specification:Courier;font-family:Courier;font-weight:normal;font-style:normal;font-stretch:normal;font-variant:normal;" /></flowRegion><flowPara
195 |          id="flowPara3230" /></flowRoot><flowRoot
196 |        xml:space="preserve"
197 |        id="flowRoot3975"
198 |        style="fill:black;stroke:none;stroke-opacity:1;stroke-width:1px;stroke-linejoin:miter;stroke-linecap:butt;fill-opacity:1;font-family:Courier;font-style:normal;font-weight:normal;font-size:12px;-inkscape-font-specification:Courier;font-stretch:normal;font-variant:normal;text-anchor:middle;text-align:center;writing-mode:lr;line-height:100%;letter-spacing:0px;word-spacing:0px;"><flowRegion
199 |          id="flowRegion3977"><rect
200 |            id="rect3979"
201 |            width="135.60606"
202 |            height="221.9697"
203 |            x="862.87878"
204 |            y="159.65909"
205 |            style="-inkscape-font-specification:Courier;font-family:Courier;font-weight:normal;font-style:normal;font-stretch:normal;font-variant:normal;" /></flowRegion><flowPara
206 |          id="flowPara3981" /></flowRoot><flowRoot
207 |        xml:space="preserve"
208 |        id="flowRoot3637"
209 |        style="fill:black;stroke:none;stroke-opacity:1;stroke-width:1px;stroke-linejoin:miter;stroke-linecap:butt;fill-opacity:1;font-family:Courier;font-style:normal;font-weight:normal;font-size:15px;line-height:125%;letter-spacing:0px;word-spacing:0px;-inkscape-font-specification:Courier;font-stretch:normal;font-variant:normal;"><flowRegion
210 |          id="flowRegion3639"><rect
211 |            id="rect3641"
212 |            width="145"
213 |            height="175"
214 |            x="65"
215 |            y="93.75"
216 |            style="-inkscape-font-specification:Courier;font-family:Courier;font-weight:normal;font-style:normal;font-stretch:normal;font-variant:normal;" /></flowRegion><flowPara
217 |          id="flowPara3643" /></flowRoot><flowRoot
218 |        transform="matrix(0.8,0,0,-0.8,0,494.30859)"
219 |        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:12.5px;line-height:125%;font-family:serif;-inkscape-font-specification:'serif, Normal';text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
220 |        id="flowRoot4383"
221 |        xml:space="preserve"><flowRegion
222 |          id="flowRegion4385"><rect
223 |            y="103.75"
224 |            x="45"
225 |            height="185"
226 |            width="165"
227 |            id="rect4387"
228 |            style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:12.5px;line-height:125%;font-family:serif;-inkscape-font-specification:'serif, Normal';text-align:start;writing-mode:lr-tb;text-anchor:start" /></flowRegion><flowPara
229 |          id="flowPara4441" /></flowRoot><flowRoot
230 |        xml:space="preserve"
231 |        id="flowRoot4423"
232 |        style="fill:black;stroke:none;stroke-opacity:1;stroke-width:1px;stroke-linejoin:miter;stroke-linecap:butt;fill-opacity:1;font-family:sans-serif;font-style:normal;font-weight:normal;font-size:15px;line-height:125%;letter-spacing:0px;word-spacing:0px"><flowRegion
233 |          id="flowRegion4425"><rect
234 |            id="rect4427"
235 |            width="160"
236 |            height="106.67638"
237 |            x="45"
238 |            y="242.07362" /></flowRegion><flowPara
239 |          id="flowPara4429" /></flowRoot><text
240 |        xml:space="preserve"
241 |        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8px;line-height:125%;font-family:'Times New Roman';-inkscape-font-specification:'Times New Roman, Normal';text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
242 |        x="58.926567"
243 |        y="-410.08777"
244 |        id="text4521"
245 |        sodipodi:linespacing="125%"
246 |        transform="scale(1,-1)"><tspan
247 |          sodipodi:role="line"
248 |          id="tspan4523"
249 |          x="58.926567"
250 |          y="-410.08777"
251 |          style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8px;line-height:125%;font-family:'Times New Roman';-inkscape-font-specification:'Times New Roman, Normal';text-align:start;writing-mode:lr-tb;text-anchor:start">concrete</tspan></text>
252 | <text
253 |        xml:space="preserve"
254 |        style="font-style:normal;font-weight:normal;font-size:12px;line-height:125%;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
255 |        x="208"
256 |        y="-476"
257 |        id="text4274"
258 |        sodipodi:linespacing="125%"
259 |        transform="scale(1,-1)"><tspan
260 |          sodipodi:role="line"
261 |          id="tspan4276"
262 |          x="208"
263 |          y="-476" /></text>
264 | <ellipse
265 |        ry="7.8399992"
266 |        rx="26.000002"
267 |        transform="scale(1,-1)"
268 |        cy="-412"
269 |        cx="117.84"
270 |        id="ellipse5776"
271 |        style="opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:0.31999999;stroke-linecap:square;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1" /><text
272 |        transform="scale(1,-1)"
273 |        sodipodi:linespacing="125%"
274 |        id="text5780"
275 |        y="-410.24738"
276 |        x="110.94531"
277 |        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8px;line-height:125%;font-family:'Times New Roman';-inkscape-font-specification:'Times New Roman, Normal';text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
278 |        xml:space="preserve"><tspan
279 |          style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8px;line-height:125%;font-family:'Times New Roman';-inkscape-font-specification:'Times New Roman, Normal';text-align:start;writing-mode:lr-tb;text-anchor:start"
280 |          y="-410.24738"
281 |          x="110.94531"
282 |          id="tspan5782"
283 |          sodipodi:role="line">symbolic</tspan></text>
284 | <text
285 |        xml:space="preserve"
286 |        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8px;line-height:125%;font-family:'Times New Roman';-inkscape-font-specification:'Times New Roman, Normal';text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
287 |        x="146.00937"
288 |        y="-410.24738"
289 |        id="text5784"
290 |        sodipodi:linespacing="125%"
291 |        transform="scale(1,-1)"><tspan
292 |          sodipodi:role="line"
293 |          id="tspan5786"
294 |          x="146.00937"
295 |          y="-410.24738"
296 |          style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8px;line-height:125%;font-family:'Times New Roman';-inkscape-font-specification:'Times New Roman, Normal';text-align:start;writing-mode:lr-tb;text-anchor:start">abstract</tspan></text>
297 | <text
298 |        transform="scale(1,-1)"
299 |        sodipodi:linespacing="125%"
300 |        id="text5788"
301 |        y="-426.44531"
302 |        x="82.926567"
303 |        style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8px;line-height:125%;font-family:'Times New Roman';-inkscape-font-specification:'Times New Roman, Normal';text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
304 |        xml:space="preserve"><tspan
305 |          style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:8px;line-height:125%;font-family:'Times New Roman';-inkscape-font-specification:'Times New Roman, Normal';text-align:start;writing-mode:lr-tb;text-anchor:start"
306 |          y="-426.44531"
307 |          x="82.926567"
308 |          id="tspan5790"
309 |          sodipodi:role="line">concolic</tspan></text>
310 | <path
311 |        style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.80000001px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
312 |        d="m 96,424 c 4,-12 4,-12 4,-12"
313 |        id="path5834"
314 |        inkscape:connector-curvature="0" /><ellipse
315 |        style="opacity:1;fill:none;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:0.31999999;stroke-linecap:square;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
316 |        id="path3422"
317 |        cx="80.080002"
318 |        cy="-412"
319 |        transform="scale(1,-1)"
320 |        rx="27.92"
321 |        ry="7.8400068" /></g></svg>


--------------------------------------------------------------------------------
/images/concrete-execution.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concrete-execution.odg


--------------------------------------------------------------------------------
/images/concrete-execution.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/concrete-execution.pdf


--------------------------------------------------------------------------------
/images/eager-evaluation.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/eager-evaluation.odg


--------------------------------------------------------------------------------
/images/eager-evaluation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/eager-evaluation.pdf


--------------------------------------------------------------------------------
/images/example.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/example.odg


--------------------------------------------------------------------------------
/images/example.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/example.pdf


--------------------------------------------------------------------------------
/images/execution-tree-text.tex:
--------------------------------------------------------------------------------
 1 | \documentclass[10pt]{article}
 2 | \usepackage[usenames]{color} %used for font color
 3 | \usepackage{amssymb} %maths
 4 | \usepackage{amsmath} %maths
 5 | \usepackage[utf8]{inputenc} %useful to type directly diacritic characters
 6 | \begin{document}
 7 | \begin{align*}\mbox{A} ~~~~ 2.~~\texttt{int x = 1, y = 0} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b \} ~~~~ \pi=true \\
 8 | \mbox{B} ~~~~ 3.~~\texttt{if (a != 0)} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b, x\mapsto 1, y\mapsto 0 \} ~~~~ \pi=true \\
 9 | \mbox{C} ~~~~ 4.~~\texttt{y = 3+x} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b, x\mapsto 1, y\mapsto 0 \} ~~~~ \pi=\alpha_a\neq 0 \\
10 | \mbox{D} ~~~~ 8.~~\texttt{assert(x-y != 0)} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b, x\mapsto 1, y\mapsto 0 \} ~~~~ \pi=\alpha_a= 0 \\
11 | 1-0 = 0 \wedge \alpha_a = 0\Longleftrightarrow false ~~~~ \mbox{OK} \\
12 | \mbox{E} ~~~~ 5.~~\texttt{if (b == 0)} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b, x\mapsto 1, y\mapsto 4 \} ~~~~ \pi=\alpha_a\neq 0 \\
13 | \mbox{F} ~~~~ 6.~~\texttt{x = 2*(a+b)} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b, x\mapsto 1, y\mapsto 4 \} ~~~~ \pi=\alpha_a\neq 0 \wedge \alpha_b = 0 \\
14 | \mbox{G} ~~~~ 8.~~\texttt{assert(x-y != 0)} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b, x\mapsto 1, y\mapsto 4 \} ~~~~ \pi=\alpha_a \neq 0 \wedge \alpha_b \neq 0 \\
15 | 1-4 = 0 \wedge \alpha_a \neq 0 \wedge \alpha_b \neq 0\Longleftrightarrow false ~~~~ \mbox{OK} \\
16 | \mbox{H} ~~~~ 8.~~\texttt{assert(x-y != 0)} ~~~~ \sigma=\{ a\mapsto \alpha_a, b\mapsto \alpha_b, x\mapsto 2(\alpha_a+\alpha_b), y\mapsto 4 \} ~~~~ \pi=\alpha_a \neq 0 \wedge \alpha_b = 0 \\
17 | 2(\alpha_a+\alpha_b)-4 = 0 \wedge \alpha_a \neq 0 \wedge \alpha_b = 0~~\mbox{if}~~\alpha_a=2\wedge\alpha_b=0 ~~~~ \mbox{ERROR}
18 | \end{align*}
19 | \end{document}


--------------------------------------------------------------------------------
/images/execution-tree.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/execution-tree.pdf


--------------------------------------------------------------------------------
/images/lazy-initialization-C.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/lazy-initialization-C.odg


--------------------------------------------------------------------------------
/images/lazy-initialization.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/lazy-initialization.odg


--------------------------------------------------------------------------------
/images/lazy-initialization.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/lazy-initialization.pdf


--------------------------------------------------------------------------------
/images/memory-fork.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/memory-fork.odg


--------------------------------------------------------------------------------
/images/memory-fork.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/memory-fork.pdf


--------------------------------------------------------------------------------
/images/memory-ite.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/memory-ite.odg


--------------------------------------------------------------------------------
/images/memory-ite.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/memory-ite.pdf


--------------------------------------------------------------------------------
/images/photo_tree.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/photo_tree.pdf


--------------------------------------------------------------------------------
/images/state-merging-2.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/state-merging-2.odg


--------------------------------------------------------------------------------
/images/state-merging-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/state-merging-2.pdf


--------------------------------------------------------------------------------
/images/state-merging.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/state-merging.odg


--------------------------------------------------------------------------------
/images/state-merging.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/state-merging.pdf


--------------------------------------------------------------------------------
/images/state-merging_old.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/state-merging_old.png


--------------------------------------------------------------------------------
/images/whitebox.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/whitebox.odg


--------------------------------------------------------------------------------
/images/whitebox.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/images/whitebox.pdf


--------------------------------------------------------------------------------
/intro.tex:
--------------------------------------------------------------------------------
  1 | % !TEX root = main.tex
  2 | 
  3 | \epigraph{\textit{``Sometimes you can't see how important something is in its moment, even if it seems kind of important. This is probably one of those times.''}}{(Cyber Grand Challenge highlights from DEF CON 24, August 6, 2016)}
  4 | 
  5 | \vspace{-2.5mm}
  6 | \section{Introduction}
  7 | \label{se:intro}
  8 | 
  9 | Symbolic execution is a popular program analysis technique introduced in the mid '70s to test whether certain properties can be violated by a piece of software~\cite{K-ICRS75,SELECT-ICRS75,K-CACM76,H-TSE77}. Aspects of interest could be that no division by zero is ever performed, no {\tt NULL} pointer is ever dereferenced, no backdoor exists that can bypass authentication, etc. While in general there is no automated way to decide some properties (e.g., the target of an indirect jump), heuristics and approximate analyses can prove useful in practice in a variety of settings, including mission-critical and security applications.
 10 | 
 11 | %While in general there is no automated way to decide some properties (think, e.g., of the halting problem), decidable approximations often exist (e.g., ``does a program always terminate within a certain amount of time?''). Such approximations can prove useful in practice in a variety of settings, including mission-critical and security applications.
 12 | 
 13 | In a concrete execution, a program is run on a specific input and a single control flow path is explored. Hence, in most cases concrete executions can only under-approximate the analysis of the property of interest. In contrast, symbolic execution can simultaneously explore multiple paths that a program could take under different inputs. This paves the way to sound analyses that can yield strong guarantees on the checked property. 
 14 | %\mynote{I: a cosa serve ridirlo? Abbiamo gia' fatto esempi di proprieta' che possono essere verificate}Symbolic execution may answer useful questions on concrete programs like: ``does function {\tt foo(x)} always return a positive value for any possible value of {\tt x}?'' 
 15 | The key idea is to allow a program to take on {\em symbolic} -- rather than concrete -- input values. Execution is performed by a {\em symbolic execution engine}, which maintains for each explored control flow path: (i) a first-order Boolean {\em formula} that describes the conditions satisfied by the branches taken along that path, and (ii) a {\em symbolic memory store} that maps variables to symbolic expressions or values. Branch execution updates the formula, while assignments update the symbolic store. A {\em model checker}, typically based on a {\em satisfiability modulo theories} (SMT) solver~\cite{BKM14}, is eventually used to verify whether there are any violations of the property along each explored path and if the path itself is realizable, i.e., if its formula can be satisfied by some assignment of concrete values to the program's symbolic arguments.
 16 | %HandbookOfSAT2009
 17 | 
 18 | %Variables and control flow paths are associated with expressions and constraints in terms of those symbols during a symbolic execution of the program, and constraints are eventually solved via SMT (satisfiability modulo theories) solvers.
 19 | 
 20 | Symbolic execution techniques have been brought to the attention of a heterogeneous audience since DARPA announced in 2013 the Cyber Grand Challenge, a two-year competition seeking to create automatic systems for vulnerability detection, exploitation, and patching in near real-time~\cite{ANGR-SSP16}.
 21 | 
 22 | % other static program
 23 | % which were missed by other program analyses and blackbox testing techniques
 24 | More remarkably, symbolic execution tools have been running 24/7 in the testing process of many Microsoft applications since 2008, revealing for instance nearly 30\% of all the bugs discovered by file fuzzing during the development of Windows 7, which other program analyses and blackbox testing techniques missed~\cite{SAGE-QUEUE12}.
 25 | 
 26 | In this article, we survey the main aspects of symbolic execution and discuss the most prominent techniques employed for instance in software testing and computer security applications. Our discussion is mainly focused on {\em forward} symbolic execution, where a symbolic engine analyzes many paths simultaneously starting its exploration from the main entry point of a program.
 27 | %its extensive usage in software testing and computer security applications\mynote{[D] this should change}, where software vulnerabilities can be found by symbolically executing programs at the level of either source or binary code. 
 28 | %A different approach is symbolic {\em backward} execution, where exploration is started from a specific point of the program (e.g., an {\tt assert} statement) and the engine proceeds backward, trying to reconstruct a valid path from an entry point of the program. Since forward symbolic execution is the mainline technique in literature, throughout this article we will always refer to this approach when using the term symbolic execution. Nonetheless, some benefits offered by symbolic backward execution will be pointed out when relevant for the discussion.
 29 | %
 30 | We start with a simple example that highlights many of the fundamental issues addressed in the remainder of the article.
 31 | 
 32 | % --------------------------------------------------------------------------------------------------------------------
 33 | \subsection{A Warm-Up Example}
 34 | \label{symbolic-execution-example}
 35 | 
 36 | \begin{figure}[t]
 37 | \begin{center}
 38 | \begin{tabular}{c}
 39 | \begin{lstlisting}[basicstyle=\ttfamily\scriptsize]
 40 | 1.  void foobar(int a, int b) {
 41 | 2.     int x = 1, y = 0;
 42 | 3.     if (a != 0) {
 43 | 4.        y = 3+x;
 44 | 5.        if (b == 0)
 45 | 6.           x = 2*(a+b);
 46 | 7.     }
 47 | 8.     assert(x-y != 0);
 48 | 9.  }
 49 | \end{lstlisting}
 50 | \end{tabular}
 51 | \end{center}
 52 | \vspace{-2mm}
 53 | \caption{Warm-up example: which values of \texttt{a} and \texttt{b} make the \texttt{assert} fail?}
 54 | \label{fig:example-1}
 55 | \vspace{-1.5mm}
 56 | \end{figure}
 57 | 
 58 | %\revedit{in the common 4-byte representation}
 59 | Consider the C code of Figure~\ref{fig:example-1} and assume that our goal is to determine which inputs make the {\tt assert} at line 8 of function \texttt{foobar} fail. Since each \revedit{4-byte} input parameter can take as many as $2^{32}$ distinct integer values, the approach of concretely running function \texttt{foobar} on randomly generated inputs is unlikely to pick up exactly the assert-failing inputs.
 60 | %Techniques such as random testing could generate bottomless input tests for this function. 
 61 | %However, it is unlikely that exactly the assert-failing inputs would be randomly picked up\mynote{Fuzzing?}. 
 62 | By evaluating the code using symbols for its inputs, instead of concrete values, symbolic execution overcomes this limitation and makes it possible to reason on {\em classes of inputs}, rather than single input values. 
 63 | 
 64 | In more detail, every value that cannot be determined by a static analysis of the code, such as an actual parameter of a function or the result of a system call that reads data from a stream, is represented by a symbol $\alpha_i$. At any time, the symbolic execution engine maintains a state $(stmt,~\sigma,~\pi)$ where:
 65 | 
 66 | \begin{itemize}[itemsep=1pt]
 67 | 
 68 | \item $stmt$ is the next statement to evaluate. For the time being, we assume that $stmt$ can be an assignment, a conditional branch, or a jump (more complex constructs such as function calls and loops will be discussed in  Section~\ref{se:path-explosion}).
 69 | 
 70 | %\item $\sigma$ is a {\em symbolic store} that associates program variables with expressions over \mynote{[D] $\alpha_i$ also concrete?} concrete and symbolic values $\alpha_i$.
 71 | 
 72 | \item $\sigma$ is a {\em symbolic store} that associates program variables with either expressions over concrete values or symbolic values $\alpha_i$.
 73 | 
 74 | \item $\pi$ denotes the {\em path constraints}, i.e., a formula that expresses a set of assumptions on the symbols $\alpha_i$ due to branches taken in the execution to reach $stmt$. At the beginning of the analysis, $\pi=true$.
 75 | 
 76 | \end{itemize}
 77 | 
 78 | \noindent Depending on $stmt$, the symbolic engine changes the state as follows:
 79 | 
 80 | \begin{itemize}[topsep=3pt,itemsep=1pt] % TODO
 81 |   \item The evaluation of an assignment $x=e$ updates the symbolic store $\sigma$ by associating $x$ with a new symbolic expression $e_s$. We denote this association with $x\mapsto e_s$, where $e_s$ is obtained by evaluating $e$ in the context of the current execution state and  can be any expression involving unary or binary operators over symbols and concrete values.
 82 |   
 83 | %   $\alpha_i = e$: when an expression $e$ is assigned to a symbol $\alpha_i$, $pc$ is extended by adding a constraint on $\alpha_i$:
 84 | %    \[ pc \gets pc \wedge \alpha_i = e\]
 85 | %  where $e$ can be any expression, involving unary or binary operators, over symbols and constants.
 86 | 
 87 |   \item The evaluation of a conditional branch ${\tt if}~e~{\tt then}~s_{true}~{\tt else}~s_{false}$ affects the path constraints $\pi$. The symbolic execution is forked by creating two execution states with path constraints $\pi_{true}$ and $\pi_{false}$, respectively, which correspond to the two branches: $\pi_{true}=\pi \wedge e_s$ and $\pi_{false}=\pi \wedge \neg e_s$, where $e_s$ is a symbolic expression obtained by evaluating $e$. 
 88 | %        \[ (s_{true}, pc_{true}) \text{ where } pc_{true} = pc \wedge e \]
 89 | %        \[ (s_{false}, pc_{false}) \text{ where } pc_{false} = pc \wedge \neg e \]
 90 |     Symbolic execution independently proceeds on both states.
 91 | 
 92 |   \item The evaluation of a jump {\tt goto} $s$ updates the execution state by advancing the symbolic execution to statement $s$. 
 93 | \end{itemize}
 94 | 
 95 | %\subsection{Example}
 96 | %\label{symbolic-execution-example}
 97 | 
 98 | %\begin{figure}[t]
 99 | %  \centering
100 | %  \includegraphics[width=1.0\columnwidth]{images/example} 
101 | %  \caption{Symbolic execution tree of the function {\tt foobar}. Each execution state is labeled with an alphabet letter. Side effects on execution states are highlighted in gray. Leaves are evaluated against division by zero error. For the sake of presentation the conjunction of constraints is shown as a list of constraints. }
102 | %  \label{fig:example-symbolic-execution}
103 | %\end{figure}
104 | 
105 | \begin{figure}[t]
106 |   \centering
107 |   \includegraphics[width=0.975\columnwidth]{images/execution-tree.eps} 
108 |   \caption{Symbolic execution tree of function {\tt foobar} given in Figure~\ref{fig:example-1}. Each execution state, labeled with an upper case letter, shows the statement to be executed, the symbolic store $\sigma$, and the path constraints $\pi$. Leaves are evaluated against the condition in the {\tt assert} statement. }
109 | %For the sake of presentation the conjunction of constraints is shown as a list of constraints. }
110 |   \label{fig:example-symbolic-execution}
111 |   \vspace{-1mm}
112 | \end{figure}
113 | 
114 | \noindent A symbolic execution of function {\tt foobar}, which can be effectively represented as a tree, is shown in Figure~\ref{fig:example-symbolic-execution}. Initially (execution state $A$) the path constraints are {\tt true} and input arguments {\tt a} and {\tt b} are associated with symbolic values. 
115 | After initializing local variables {\tt x} and {\tt y} at line 2, the symbolic store is updated by associating {\tt x} and {\tt y} with concrete values 1 and 0, respectively (execution state $B$). Line 3 contains a conditional branch and the execution is forked: depending on the branch taken, a different statement is evaluated next and different assumptions are made on symbol $\alpha_a$ (execution states $C$ and $D$, respectively). In the branch where $\alpha_a\neq 0$, variable {\tt y} is assigned {\tt 3+x}, obtaining $y\mapsto 4$ in state $E$ because $x\mapsto 1$ in state $C$. In general, arithmetic expression evaluation simply manipulates the symbolic values.
116 | After expanding every execution state until the {\tt assert} at line 8 is reached on all branches, we can check which input values for parameters {\tt a} and {\tt b} can make the {\tt assert} fail. By analyzing execution states $\{D,G,H\}$, we can conclude that only $H$ can make {\tt x-y = 0} true. The path constraints for $H$ at this point implicitly define the set of inputs that are unsafe for {\tt foobar}. 
117 | In particular, any input values such that:
118 |  \[ 2(\alpha_a+\alpha_b)-4 = 0 \wedge \alpha_a \neq 0 \wedge \alpha_b = 0 \]
119 | will make {\tt assert} fail. An instance of unsafe input parameters can be eventually determined by invoking an {\em SMT solver}~\cite{BKM14} to solve the path constraints, which in this example would yield $a = 2$ and $b = 0$. % HandbookOfSAT2009
120 | 
121 | %Notice\mynote{Say earlier?} that a constraint solver is also needed when evaluating the satisfiability of branch conditions.
122 | 
123 | % --------------------------------------------------------------------------------------------------------------------
124 | \subsection{Challenges in Symbolic Execution}
125 | \label{example-discussion}
126 | 
127 | In the example discussed in Section~\ref{symbolic-execution-example} symbolic execution can identify {\em all} the possible unsafe inputs that make the {\tt assert} fail. This is achieved through an exhaustive exploration of the possible execution states. From a theoretical perspective, exhaustive symbolic execution provides a {\em sound} and {\em complete} methodology for any decidable analysis. Soundness prevents false negatives, i.e., all possible unsafe inputs are guaranteed to be found, while completeness prevents false positives, i.e.,  input values deemed unsafe are actually unsafe. As we will discuss later on, exhaustive symbolic execution is unlikely to scale beyond small applications. Hence, in practice we often settle for less ambitious goals, e.g., by trading soundness for performance.
128 | 
129 | Challenges that symbolic execution has to face when processing real-world code can be significantly more complex than those illustrated in our warm-up example. Several observations and questions naturally arise:
130 | 
131 | \begin{itemize}[itemsep=1mm]
132 | %%%
133 | \item \noindent {\em Memory}: how does the symbolic engine handle pointers, arrays, or other complex objects? Code manipulating pointers and data structures may give rise not only to symbolic stored data, but also to addresses being described by symbolic expressions.
134 | %Any arbitrarily complex object can be regarded as an array of bytes and each byte associated with a distinct symbol. However, when possible, exploiting structural properties of the data may be more convenient: for instance, relational bounds on the class fields in object-oriented languages could be used for refining the search performed by symbolic execution.
135 | %%%
136 | \item {\em Environment}: how does the engine handle interactions across the software stack? Calls to library and system code can cause side-effects, e.g., the creation of a file \revedit{or a call back to user code}, that could later affect the execution and must be accounted for. However, evaluating any possible interaction outcome may be unfeasible.
137 | %: it would give rise to a large number of states, while only a fraction of them can \mynote{[D] likely?}actually happen in a non-symbolic scenario.
138 | %%\mytempedit{Also, third-party closed-source components and popular frameworks (e.g., Java Swing and Android) pose further challenges to an executor, for instance because of the control flows occurring within them through callbacks.}\mynote{CD: may be dropped if we run out of space}
139 | % Real-world applications constantly interact with the environment (e.g., the file system or the network) through libraries and system calls. These interactions may cause side-effects (such as the creation of a file) that could later affect the execution and must be therefore taken into account. Evaluating any possible interaction outcome is generally unfeasible: it could generate a large number of execution states, of which only a small number can actually happen in a non-symbolic scenario. %A typical strategy is to consider popular library and system routines and create models that can help the symbolic engine analyze only significant outcomes.
140 | %%%
141 |   \item {\em State space explosion}: how does symbolic execution deal with path explosion?
142 | %\mynote{[D] I felt it was too long and loop-centric} 
143 | Language constructs such as loops might exponentially increase the number of execution states. It is thus unlikely that a symbolic execution engine can exhaustively explore all the possible states within a reasonable amount of time. %In practice, heuristics are used to guide exploration and prioritize certain states first (e.g., to maximize code coverage). In addition, 
144 | %\mytempedit{Efficient mechanisms can be implemented for preventing repeated exploration of the same piece of code
145 | %\mytempedit{for skipping over states subsumed by previously explored paths} 
146 | %and for evaluating multiple states in parallel without running out of resources.}
147 | %%A loop\mynote{IF: rimuoverei la prima frase, perche' va detto?} can be encoded using conditional branches and {\tt goto} statements, which is typical  when compiling high-level languages to an intermediate representation or native code. 
148 | %Choosing the number of loop iterations to analyze is especially critical when this number cannot be determined in advance (e.g., depends on an input parameter). The naive approach of unrolling iterations for every valid bound would result in a prohibitively large number of states. Typical solutions are to compute an underapproximation of the analysis by limiting the number of iterations to some value $k$, thus trading speed for soundness. Other approaches infer loop invariants through static analysis  and use them to merge equivalent states. % \mynote{i.e. or e.g.?}  (e.g., when differences are not observable from outside the loop body). 
149 |   %In practice, several heuristics must be exploited to prioritize evaluation of some states, hoping to still be able to spot interesting things. Moreover, the symbolic execution engine should include efficient mechanism for efficiently evaluating in parallel different execution states without running out of computational resources.
150 | %%%
151 |   \item {\em Constraint solving}: what can a constraint solver do in practice?
152 |   %{\em What is a constraint solver in practice}? \\
153 | SMT solvers can scale to complex combinations of constraints over hundreds of variables. However, constructs such as non-linear arithmetic pose a major obstacle to efficiency.
154 | %Constraint solvers suffer from a number of limitations. They can typically handle complex constraints in a reasonable amount of time only if they are made of linear expressions over their constituents.
155 | %Constraint solvers suffer from a number of limitations. They can typically handle complex constraints in a reasonable amount of time only if they are made of linear expressions over their constituents. %Symbolic execution engines normally implement a number of optimizations to make queries as much {\em solver-friendly} as possible, for instance by splitting queries into independent components to be processed separately or by performing algebraic simplifications.
156 | %%%
157 | \iffullver{  \item {\em Binary code}: what issues can arise when symbolically executing binary code?
158 |   %what are the disadvantages of symbolically executing binary code?
159 |  While the warm-up example of Section~\ref{symbolic-execution-example} is written in C, in several scenarios binary code is the only available representation of a program. However, having the source code of an application can make symbolic execution significantly easier, as it can exploit high-level properties (e.g., object shapes) that can be inferred statically by analyzing the source code.
160 |  }{}
161 | %(e.g., the maximum size of a buffer or the number of iterations for a loop).
162 | %%%   
163 | \end{itemize}
164 | %Depending on the specific application context of symbolic execution
165 | 
166 | \noindent Depending on the specific context in which symbolic execution is used, different choices and assumptions are made to address the questions highlighted above. Although these choices typically affect soundness or completeness, in several scenarios a partial exploration of the space of possible execution states may be sufficient to achieve the goal (e.g., identifying a crashing input for an application) within a limited time budget.
167 | 
168 | %\mynote{Better example?}
169 | 
170 | %different choices and assumptions are made to address the above questions. Although soundness and completeness of symbolic execution may be negatively affected by these choices, there are several application scenarios where a partial exploration of the possible execution states is sufficient for reaching the ultimate goal (e.g., identify a single input that crashes an application).
171 | 
172 | % --------------------------------------------------------------------------------------------------------------------
173 | \subsection{Related Work}
174 | \label{ss:related-surveys}
175 | 
176 | Symbolic execution has been the focus of a vast body of literature. As of August 2017, Google Scholar reports 742 articles that include the exact phrase ``symbolic execution'' in the title. Prior to this survey, other authors have contributed technical overviews of the field, such as \cite{PV-JSTTT09} and \cite{CS-CACM13}. \cite{CHEN20131758} focuses on the more specific setting of automated test generation: it provides a comprehensive view of the literature, covering in depth a variety of techniques and complementing the technical discussions with a number of running examples.
177 | %Besides  complementing the technical discussions with a number of running examples, it covers in depth recent techniques for key aspects such as memory modelling, environment interaction, path explosion, and constraint solving.
178 | 
179 | % --------------------------------------------------------------------------------------------------------------------
180 | \subsection{Organization of the Article}
181 | \label{ss:article-organization}
182 | 
183 | %\iffullver{
184 | %The remainder of this article is organized as follows. In Section~\ref{se:executors}, we discuss the overall principles and evaluation strategies of a symbolic execution engine. Section~\ref{memory-model} through Section~\ref{se:symbolic-binary} address the key challenges that we listed in Section~\ref{example-discussion}. Prominent applications based on symbolic execution techniques are discussed in Section~\ref{se:applications}, while concluding remarks are addressed in Section~\ref{se:conclusions}. %We provide a glossary of the main terms used in the article in Section~\ref{se:glossary}.
185 | %}
186 | 
187 | The remainder of this article is organized as follows. In Section~\ref{se:executors} we discuss the overall principles and evaluation strategies of a symbolic execution engine. Section~\ref{memory-model} through Section~\ref{se:constraint-solving} address the key challenges that we listed in Section~\ref{example-discussion}, while Section~\ref{se:hang} discusses how recent advances in other areas could be applied to enhance symbolic execution techniques. Concluding remarks are given in Section~\ref{se:conclusions}. %We provide a glossary of the main terms used in the article in Section~\ref{se:glossary}.
188 | 
189 | % removed as \revedit{}
190 | %The appendix addresses further challenges that arise when applying symbolic execution to binary code, discusses some prominent applications of symbolic execution, and includes tables listing some prominent tools and techniques.
191 | 
192 | 
193 | %\vspace{2cm}
194 | %\subsection{Removed stuff}
195 | %
196 | %\paragraph{Black-box approach versus white-box approach}
197 | %
198 | %Discussion\mynote{IF: do we really need this?} of black-box approach and white-box approach. Symbolic execution is a white-box technique. Black-box approaches can be very fast but not always effective. White-box approaches can be very effective but are typically slower than black-box techniques. An in-depth discussion of this aspect will be done when we will discuss~\cite{DRILLER-NDSS16}.
199 | %
200 | %\begin{figure}[H]
201 | %  \vspace{-3mm}
202 | %  \centering
203 | %  \begin{subfigure}{.5\textwidth}
204 | %    \centering
205 | %    \includegraphics[width=0.9\linewidth]{images/blackbox} 
206 | %    \caption{Black-box approach}
207 | %    %\label{fig:sub1}
208 | %  \end{subfigure}%
209 | %  \begin{subfigure}{.5\textwidth}
210 | %    \centering
211 | %    \includegraphics[width=0.9\linewidth]{images/whitebox} 
212 | %    \caption{White-box approach}
213 | %    %\label{fig:sub2}
214 | %  \end{subfigure}
215 | %  %\label{fig:example-symbolic-execution}
216 | %  \vspace{-3mm}
217 | %\end{figure}
218 | %
219 | %\paragraph{Taken from old Overview}
220 | %
221 | %Symbolic execution has been originally introduced in~\cite{K-CACM76} and~\cite{H-TSE77}. A good introduction to symbolic execution is presented in~\cite{KLEE-OSDI08}.\mynote{Extend this paragraph}
222 | %%(while~\cite{EXE-CCS06} is a previous effort of the same authors).
223 | %\cite{SAGE-NDSS08} is one successful story of symbolic execution. \cite{SAB-SP10} presents a neat formalization of symbolic execution and of taint analysis as well.
224 | %
225 | 


--------------------------------------------------------------------------------
/main.tex:
--------------------------------------------------------------------------------
  1 | % v2-acmsmall-sample.tex, dated March 6 2012
  2 | % This is a sample file for ACM small trim journals
  3 | %
  4 | % Compilation using 'acmsmall.cls' - version 1.3 (March 2012), Aptara Inc.
  5 | % (c) 2010 Association for Computing Machinery (ACM)
  6 | %
  7 | % Questions/Suggestions/Feedback should be addressed to => "acmtexsupport@aptaracorp.com".
  8 | % Users can also go through the FAQs available on the journal's submission webpage.
  9 | %
 10 | % Steps to compile: latex, bibtex, latex latex
 11 | %
 12 | % For tracking purposes => this is v1.3 - March 2012
 13 | 
 14 | \documentclass[prodmode,acmcsur]{acmsmall} % Aptara syntax
 15 | 
 16 | % Package to generate and customize Algorithm as per ACM style
 17 | \usepackage[ruled]{algorithm2e} 
 18 | \renewcommand{\algorithmcfname}{ALGORITHM}
 19 | \SetAlFnt{\small}
 20 | \SetAlCapFnt{\small}
 21 | \SetAlCapNameFnt{\small}
 22 | \SetAlCapHSkip{0pt}
 23 | \IncMargin{-\parindent}
 24 | 
 25 | % Metadata Information
 26 | \acmVolume{0}
 27 | \acmNumber{0}
 28 | \acmArticle{0}
 29 | \acmYear{0000}
 30 | \acmMonth{0}
 31 | 
 32 | % Copyright
 33 | %\setcopyright{acmcopyright}
 34 | %\setcopyright{acmlicensed}
 35 | %\setcopyright{rightsretained}
 36 | %\setcopyright{usgov}
 37 | %\setcopyright{usgovmixed}
 38 | %\setcopyright{cagov}
 39 | %\setcopyright{cagovmixed}
 40 | 
 41 | \input{common}
 42 | 
 43 | % DOI
 44 | \doi{0000001.0000001}
 45 | 
 46 | %ISSN
 47 | \issn{1234-56789}
 48 | 
 49 | % Document starts
 50 | \begin{document}
 51 | 
 52 | % Page heads
 53 | \markboth{R. Baldoni, E. Coppa, D. C. D'Elia, C. Demetrescu, and I. Finocchi}{A Survey of Symbolic Execution Techniques}
 54 | 
 55 | % Title portion
 56 | \title{A Survey of Symbolic Execution Techniques\\}
 57 | \author{ROBERTO BALDONI
 58 | \affil{\href{http://www.cis.uniroma1.it/}{Cyber Intelligence and Information Security Research Center}, Sapienza}
 59 | EMILIO COPPA
 60 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome}
 61 | DANIELE CONO D'ELIA
 62 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome}
 63 | CAMIL DEMETRESCU
 64 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome}
 65 | IRENE FINOCCHI
 66 | \affil{\href{http://season-lab.github.io}{SEASON Lab}, Sapienza University of Rome}
 67 | }
 68 | % NOTE! Affiliations placed here should be for the institution where the
 69 | %       BULK of the research was done. If the author has gone to a new
 70 | %       institution, before publication, the (above) affiliation should NOT be changed.
 71 | %       The authors 'current' address may be given in the "Author's addresses:" block (below).
 72 | %       So for example, Mr. Abdelzaher, the bulk of the research was done at UIUC, and he is
 73 | %       currently affiliated with NASA.
 74 | 
 75 | \begin{abstract}
 76 | Many security and software testing applications require checking whether certain properties of a program hold for any possible usage scenario. For instance, a tool for identifying software vulnerabilities may need to rule out the existence of any backdoor to bypass a program's authentication. One approach would be to test the program using different, possibly random inputs. As the backdoor may only be hit for very specific program workloads, automated exploration of the space of possible inputs is of the essence. Symbolic execution provides an elegant solution to the problem, by systematically exploring many possible execution paths at the same time without necessarily requiring concrete inputs. Rather than taking on fully specified input values, the technique abstractly represents them as symbols, resorting to constraint solvers to construct actual instances that would cause property violations. Symbolic execution has been incubated in dozens of tools developed over the last four decades, leading to major practical breakthroughs in a number of prominent software reliability applications. The goal of this survey is to provide an overview of the main ideas, challenges, and solutions developed in the area, distilling them for a broad audience.
 77 | \end{abstract}
 78 | 
 79 | %\begin{comment}
 80 | \begin{CCSXML} % http://dl.acm.org/ccs.cfm
 81 | <ccs2012>
 82 | <concept>
 83 | <concept_id>10011007.10010940.10010992.10010998.10010999</concept_id>
 84 | <concept_desc>Software and its engineering~Software verification</concept_desc>
 85 | <concept_significance>500</concept_significance>
 86 | </concept>
 87 | <concept>
 88 | <concept_id>10011007.10010940.10010992.10010998.10011001</concept_id>
 89 | <concept_desc>Software and its engineering~Dynamic analysis</concept_desc>
 90 | <concept_significance>300</concept_significance>
 91 | </concept>
 92 | <concept>
 93 | <concept_id>10011007.10011074.10011099.10011102.10011103</concept_id>
 94 | <concept_desc>Software and its engineering~Software testing and debugging</concept_desc>
 95 | <concept_significance>300</concept_significance>
 96 | </concept>
 97 | <concept>
 98 | <concept_id>10002978.10003022</concept_id>
 99 | <concept_desc>Security and privacy~Software and application security</concept_desc>
100 | <concept_significance>100</concept_significance>
101 | </concept>
102 | </ccs2012>
103 | \end{CCSXML}
104 | 
105 | \ccsdesc[500]{Software and its engineering~Software verification}
106 | %\ccsdesc[300]{Software and its engineering~Dynamic analysis}
107 | \ccsdesc[300]{Software and its engineering~Software testing and debugging}
108 | \ccsdesc[100]{Security and privacy~Software and application security}
109 | %\end{comment}
110 | 
111 | % We no longer use \terms command
112 | %\terms{Design, Algorithms, Performance}
113 | 
114 | \keywords{Symbolic execution, static analysis, concolic execution, software testing}
115 | 
116 | \acmformat{Roberto Baldoni, Emilio Coppa, Daniele Cono D'Elia, Camil Demetrescu,
117 | and Irene Finocchi, 2016. A survey of symbolic execution techniques.}
118 | % At a minimum you need to supply the author names, year and a title.
119 | % IMPORTANT:
120 | % Full first names whenever they are known, surname last, followed by a period.
121 | % In the case of two authors, 'and' is placed between them.
122 | % In the case of three or more authors, the serial comma is used, that is, all author names
123 | % except the last one but including the penultimate author's name are followed by a comma,
124 | % and then 'and' is placed before the final author's name.
125 | % If only first and middle initials are known, then each initial
126 | % is followed by a period and they are separated by a space.
127 | % The remaining information (journal title, volume, article number, date, etc.) is 'auto-generated'.
128 | 
129 | \begin{bottomstuff}
130 | %This work is supported by the National Science Foundation, under grant CNS-0435060, grant CCR-0325197 and grant EN-CS-0329609.
131 | 
132 | Authors' addresses: R. Baldoni, E. Coppa, D.C. D'Elia, and C. Demetrescu, Department of Computer, Control, and Management Engineering, Sapienza University of Rome; I. Finocchi, Department of Computer Science, Sapienza University of Rome.
133 | This work is supported in part by a grant of the Italian Presidency of the Council of Ministers and by the CINI National Laboratory of Cyber Security. % (Consorzio Interuniversitario Nazionale Informatica) 
134 | \end{bottomstuff}
135 | 
136 | \maketitle
137 | 
138 | \input{intro}
139 | \myinput{executors}
140 | \myinput{memory}
141 | \myinput{environment}
142 | \myinput{explosion}
143 | \myinput{constraints}
144 | \input{hang}
145 | \input{conclusions}
146 | 
147 | % Bibliography
148 | %\bibliographystyle{abstract} 
149 | \bibliographystyle{ACM-Reference-Format-Journals}
150 | \bibliography{symbolic}
151 | 
152 | % History dates
153 | %\received{--- 2016}{--- XXXX}{---- XXXX}
154 | 
155 | \end{document}
156 | 
157 | % End of v2-acmsmall-sample.tex (March 2012) - Gerry Murray, ACM
158 | 
159 | 
160 | 


--------------------------------------------------------------------------------
/memory.tex:
--------------------------------------------------------------------------------
  1 | % !TEX root = main.tex
  2 | 
  3 | 
  4 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  5 | \section{Memory model}
  6 | \label{memory-model}
  7 | 
  8 | Our warm-up example of Section~\ref{symbolic-execution-example} presented a simplified memory model where data are stored in scalar variables only, with no indirection. A crucial aspect of symbolic execution is how memory should be modeled to support programs with pointers and arrays. This requires extending our notion of memory store by mapping not only variables, but also memory addresses to symbolic expressions or concrete values. In general, a store $\sigma$ that explicitly models memory addresses can be thought of as a mapping that associates memory addresses (indexes) with either expressions over concrete values or symbolic values. We can still support variables by using their address rather than their name in the mapping. In the following, when we write $x\mapsto e$ for a variable $x$ and an expression $e$ we mean $\&x\mapsto e$, where $\&x$ is the concrete address of variable $x$. Also, if $v$ is an array and $c$ is an integer constant, by $v[c]\mapsto e$ we mean $\&v+c\mapsto e$.
  9 | 
 10 | %A memory model is an important design choice for a symbolic engine, as it can have a significant influence on the coverage achieved by symbolic execution, as well as on the scalability of constraint solving~\cite{CS-CACM13}.
 11 | \mynote{[D] shorter}A memory model is an important design choice for a symbolic engine, as it can significantly affect the coverage achieved by the exploration and the scalability of constraint solving~\cite{CS-CACM13}.
 12 | %
 13 | The {\em symbolic memory address} problem~\cite{SAB-SP10} arises when the address referenced in a memory operation is a symbolic expression. In the remainder of this section, we discuss a number of popular solutions.
 14 | 
 15 | \subsection{Fully Symbolic Memory}
 16 | \label{ss:fully-symbolic-memory}
 17 | 
 18 | \begin{figure}[t]
 19 | \vspace{-1mm}
 20 | \begin{center}
 21 | \begin{tabular}{c}
 22 | \begin{lstlisting}[basicstyle=\ttfamily\scriptsize]
 23 | 1.  void foobar(unsigned i, unsigned j) {
 24 | 2.     int a[2] = { 0 };
 25 | 3.     if (i>1 || j>1) return;
 26 | 4.     a[i] = 5;
 27 | 5.     assert(a[j] != 5);
 28 | 6.  }
 29 | \end{lstlisting}
 30 | \end{tabular}
 31 | \end{center}
 32 | \vspace{-2.5mm}
 33 | \caption{Memory modeling example: which values of \texttt{i} and \texttt{j} make the \texttt{assert} fail?}
 34 | \label{fi:example-mem}
 35 | \end{figure}
 36 | 
 37 | \begin{figure}[t]
 38 | \vspace{-3mm}
 39 | \includegraphics[width=1\columnwidth]{images/memory-fork} 
 40 | \vspace{-4.5mm}
 41 | \caption{Fully symbolic memory via state forking for the example of Figure~\ref{fi:example-mem}.}
 42 | \label{fi:memory-fork}
 43 | \vspace{-0.5mm}
 44 | \end{figure}
 45 | 
 46 | At the highest level of generality, an engine may treat memory addresses as fully symbolic. This is the approach taken by a number of works (e.g., {\sc BitBlaze}~\cite{BITBLAZE-ICISS08},~\cite{TLL-CAV10}, {\sc BAP}~\cite{BAP-CAV11}, and~\cite{TS-ATVA14}). Two fundamental approaches, pioneered by King in a seminal paper~\cite{K-CACM76}, are the following:
 47 | 
 48 | \begin{itemize}
 49 | 
 50 | \item {\em State forking.} If an operation reads from or writes to a symbolic address, the state is forked by considering all possible states that may result from the operation. The path constraints are updated accordingly for each forked state.
 51 | \boxedexample{Consider the code shown in Figure~\ref{fi:example-mem}. The write operation at line 4 affects either $a[0]$ or $a[1]$, depending on the unknown value of array index $i$. State forking creates two states after executing the memory assignment to explicitly consider both possible scenarios (Figure~\ref{fi:memory-fork}). The path constraints for the forked states encode the assumption made on the value of $i$. Similarly, the memory read operation \texttt{a[j]} at line 5 may access either $a[0]$ or $a[1]$, depending on the unknown value of array index $j$. Therefore, for each of the two possible outcomes of the assignment \texttt{a[i]=5}, there are two possible outcomes of the \texttt{assert}, which are explicitly explored by forking the corresponding states. }
 52 | 
 53 | \begin{figure}[t]
 54 | \begin{center}
 55 | \includegraphics[width=0.7\columnwidth]{images/memory-ite}
 56 | \end{center}
 57 | \vspace{-3mm}
 58 | \caption{Fully symbolic memory via if-then-else formulas for the example of Figure~\ref{fi:example-mem}.}
 59 | %\vspace{-1mm} % TODO
 60 | \label{fi:memory-ite}
 61 | \vspace{-1.5mm}
 62 | \end{figure}
 63 | 
 64 | % otherwise\footnote{In propositional logic, the $ite(\texttt{c}, \texttt{t}, \texttt{f})$ expression could be replaced with the formula $(\texttt{c} \wedge \texttt{t}) \vee (\neg\texttt{c} \wedge \texttt{f})$.}.
 65 | \item {\em If-then-else formulas.} An alternative approach consists in encoding the uncertainty on the possible values of a symbolic pointer into the expressions kept in the symbolic store and in the path constraints, without forking any new states. The key idea is to exploit the capability of some solvers to reason on formulas that contain if-then-else expressions of the form $ite(\texttt{c}, \texttt{t}, \texttt{f})$, which yields \texttt{t} if \texttt{c} is true, and \texttt{f} otherwise.
 66 | The approach works differently for memory read and write operations. Let $\alpha$ be a symbolic address that may assume the concrete values $a_1, a_2, \ldots$:
 67 | \begin{itemize}
 68 | \item reading from $\alpha$ yields the expression $ite(\alpha=a_1,\sigma(a_1), ite(\alpha=a_2,\sigma(a_2), \ldots))$;
 69 | \item writing an expression $e$ at $\alpha$ updates the symbolic store for each $a_1, a_2, \ldots$ as $\sigma(a_i)\gets ite(\alpha=a_i,e,\sigma(a_i))$.
 70 | \end{itemize}
 71 | Notice that in both cases, a memory operation introduces in the store as many $ite$ expressions as the number of possible values the accessed symbolic address may assume. The $ite$ approach to symbolic memory is used, e.g., in {\sc Angr}~\cite{ANGR-SSP16} (Section~\ref{ss:index-based-memory}).
 72 | \boxedexample{Consider again the example shown in Figure~\ref{fi:example-mem}. Rather than forking the state after the operation \texttt{a[i]=5} at line 4, the if-then-else approach updates the memory store by encoding both possible outcomes of the assignment, i.e., $a[0]\mapsto ite(\alpha_i=0,5,0)$ and $a[1]\mapsto ite(\alpha_i=1,5,0)$ (Figure~\ref{fi:memory-ite}). Similarly, rather than creating a new state for each possible distinct address of \texttt{a[j]} at line 5, the uncertainty on $j$ is encoded in the single expression $ite(\alpha_j=0,\sigma(a[0]),\sigma(a[1]))=ite(\alpha_j=0,ite(\alpha_i=0,5,0),ite(\alpha_i=1,5,0))$.
 73 | %: if $\alpha_i=0$ then $a[0]\mapsto 5$ and $a[1]\mapsto 0$; conversely, if $\alpha_i=1$ then $a[0]\mapsto 0$ and $a[1]\mapsto 5$.
 74 | %State forking creates two states after executing the memory assigment to explicitly consider both possible scenarios (Figure~\ref{fi:memory-fork}). The path constraints for the forked states encode the assumption made on the value of $i$. Similarly, the memory read operation \texttt{a[j]} at line 5 may access either $a[0]$ or $a[1]$, depending on the unknown value of array index $j$. Therefore, for each of the two possible outcomes of the assignment \texttt{a[i]=5}, there are two possible outcomes of the \texttt{assert}, which are explicitly explored by forking the corresponding states. 
 75 | }
 76 | 
 77 | %Indeed, the $ite(\texttt{c}, \texttt{t}, \texttt{f})$ expression introduced in the symbolic store $\sigma$ is a short term for an {\tt if-then-else} expression and means that if the condition {\tt c} is verified then {\tt t} holds, otherwise {\tt f} must be assumed as true. Nonetheless, $ite$ expressions are often just syntactic sugar for disjunctive formulas and are commonly supported by most prominent constraint solvers. For instance, in the context of propositional logic the $ite(\texttt{c}, \texttt{t}, \texttt{f})$  expression could be replaced with the formula $(\texttt{c} \wedge \texttt{t}) \vee (\neg\texttt{c} \wedge \texttt{f})$ . 
 78 | 
 79 | \end{itemize}
 80 | 
 81 | %\noindent To model fully symbolic pointers, an extensive line of research (e.g., {\sc EXE}~\cite{EXE-CCS06}, {\sc KLEE}~\cite{KLEE-OSDI08}, {\sc SAGE}~\cite{EGL-ISSTA09}) leverages the expressive power of SMT solvers to model array operations as first-class entities in constraint formulas using a {\em theory of arrays} in the decision procedure~\cite{STP-CAV07}.
 82 | 
 83 | %\noindent % TODO trick if you need one more line
 84 | An extensive line of research (e.g., {\sc EXE}~\cite{EXE-CCS06}, {\sc KLEE}~\cite{KLEE-OSDI08}, {\sc SAGE}~\cite{EGL-ISSTA09}) leverages the expressive power of some SMT solvers to model fully symbolic pointers. Using a {\em theory of arrays}~\cite{STP-CAV07}, array operations can in fact be expressed as first-class entities in constraint formulas.
 85 | 
 86 | Due to its generality, fully symbolic memory supports the most accurate description of the memory behavior of a program, accounting for all possible memory manipulations. In many practical scenarios, the set of possible addresses a memory operation may reference is small~\cite{BITBLAZE-ICISS08}, as in the example shown in Figure~\ref{fi:example-mem}, where indexes $i$ and $j$ range in a bounded interval, allowing accurate analyses using a reasonable amount of resources. In general, however, a symbolic address may reference any cell in memory, leading to an intractable explosion in the number of possible states. For this reason, a number of techniques have been designed to improve scalability, which elaborate along the following main lines:
 87 | 
 88 | \begin{itemize}
 89 | \item {\em Representing memory in a compact form.} This approach was taken in~\cite{MEMSIGHT-ASE17}, which maps symbolic -- rather than concrete -- address expressions to data, representing the possible alternative states resulting from referencing memory using symbolic addresses in a compact, implicit form. Queries are offloaded to efficient paged interval tree implementations to determine which stored data are possibly referenced by a memory read operation.
 90 | 
 91 | \item {\em Trading soundness for performance.} The idea, discussed in the remainder of this section, consists in corseting symbolic exploration to a subset of the execution states by replacing symbolic pointers with concrete addresses.
 92 | 
 93 | \item {\em Heap modeling.} An additional idea is to corset the exploration to states where pointers are restricted to be either null, or point to previously heap-allocated objects, rather than to any generic memory location (Section~\ref{ss:address-concretization} and Section~\ref{ss:complex-objects}).
 94 | \end{itemize}
 95 | 
 96 | %When obtained ranges are too large, {\sc BitBlaze}~\cite{BITBLAZE-ICISS08} adds a further constraint to the system to limit its size. However, the authors observe that most symbolic memory accesses are typically already constrained to small ranges in practice, making it unnecessary.
 97 | 
 98 | %\vspace{-2pt} % TODO
 99 | \subsection{Address Concretization}
100 | \label{ss:address-concretization}
101 | 
102 | In all cases where the combinatorial complexity of the analysis explodes as pointer values cannot be bounded to sufficiently small ranges, {\em address concretization}, which consists in concretizing a pointer to a single specific address, is a popular alternative. This can reduce the number of states and the complexity of the formulas fed to the solver and thus improve running time, although it may cause the engine to miss paths that, for instance, depend on specific values for some pointers.
103 | 
104 | 
105 | %Systems such as {\sc CUTE}~\cite{CUTE-FSE05} and {\sc CREST}~\cite{CREST-ASE08} are capable of reasoning only about equality constraints for pointers, as they can be solved efficiently, and resort to concretization for general symbolic references. % equality and inequality
106 | 
107 | 
108 | 
109 | %\mynote{DART is mentioned in CS-CACM13 as  using theories of arrays} --> added to the list above.
110 | Concretization naturally arises in offline executors (Section~\ref{ss:principles}). Prominent examples are {\sc DART}~\cite{DART-PLDI05} and {\sc CUTE}~\cite{CUTE-FSE05},
111 | %and early {\sc SAGE} releases~\cite{SAGE-NDSS08}. % that concretely execute one path at a time while collecting path constraints along executed paths. %\mynote{[D] was: equality and inequality}  
112 | which handle memory initialization by concretizing a reference of type {\tt T*} either to {\tt NULL}, or to the address of a newly allocated object of {\tt sizeof(T)} bytes. DART makes the choice randomly, while CUTE first tries {\tt NULL}, and then, in a subsequent execution, a concrete address. If {\tt T} is a structure, the same concretization approach is recursively applied to all fields of a pointed object. Since memory addresses (e.g., returned by {\tt malloc}) may non-deterministically change at different concrete executions, CUTE uses {\em logical addresses} in symbolic formulas to maintain consistency across different runs.
113 | Another reason for concretization is efficiency in constraint solving: for instance, CUTE reasons only about pointer equality constraints using an equivalence graph, resorting to concretization for more general constraints that would need costly SMT theories.
114 | %Another reason for concretization is due to limitations in constraint handling: for instance, CUTE is capable of reasoning only about equality constraints for pointers, as they can be solved efficiently, and resort to concretization for general symbolic references.
115 | 
116 | %we normally get or set a concrete value at a particular memory address. When executing symbolically, a design choice for a symbolic engine concerns what to do when a memory reference is an expression instead of a concrete address.
117 | 
118 | %\subsection{Theory of Arrays}
119 | %\label{ss:theory-arrays}
120 | 
121 | %A number of works (e.g., {\sc EXE}~\cite{EXE-CCS06}, {\sc KLEE}~\cite{KLEE-OSDI08}, and {\sc SAGE}~\cite{SAGE-NDSS08}) model pointers using the theory of arrays available from SMT decision procedures. 
122 | 
123 | %In this section we provide a description of its implementation in the popular STP solver~\cite{STP-CAV07}.
124 | 
125 | %The design of STP has been mainly driven by the demands of research projects on software analysis. Its input language supports one-dimensional arrays that are indexed by bitvectors and contain bitvectors. Given an array $A$, a $read(A,i)$ operation returns the value $A[i]$ at the location expressed by the index $i$, while a $write(A,i,v)$ returns a new array with the same values as $A$ at all indexes except $i$, where it contains the value $v$. Array reads and write typically appear as subexpressions of an $ite(c,a,b)$ expression, which is syntactic sugar for $(if\,c\;then\,b\;else\,a)$.
126 | 
127 | %STP reduces formulas over array to an equisatisfiable form that contains no $read$ or $write$ operations by applying three standard transformations and introducing fresh bitvector variables. Generated formulas are then amenable to SAT solving. However, transformations can also introduce bottlenecks, for instance by destroying sharing of subterms, and thus are typically procrastinated using refinement algorithms. SMT attempts also to eliminate variables through linear solving~\cite{STP-CAV07}.
128 | 
129 | %\vspace{-2pt} % TODO
130 | \subsection{Partial Memory Modeling}
131 | \label{ss:index-based-memory}
132 | 
133 | To mitigate the scalability problems of fully symbolic memory and the loss of soundness of memory concretization,
134 | %Motivated by the observation that concretizing all memory indexes might not work well in some scenarios, while fully symbolic memory does not scale, 
135 | {\sc Mayhem}~\cite{MAYHEM-SP12} explores a middle point in the spectrum by introducing a {\em partial} memory model. The key idea is that written addresses are always concretized and read addresses are modeled symbolically if the contiguous interval of possible values they may assume is small enough. This model is based on a trade-off: it uses more expressive formulas than concretization, since it encodes multiple pointer values per state, but does not attempt to encode all of them as in fully symbolic memory~\cite{MAYHEM-THESIS}. A basic approach to bound the set of possible values that an address may assume consists in trying different concrete values and checking whether they satisfy the current path constraints, excluding large portions of the address space at each trial until a tight range is found.
136 | %This choice is important to keep the analysis feasible: for instance, in a fully symbolic model a repeated read and write on the same symbolic index would result in quadratic increase in either the symbolic constraints or the complexity of the stored symbolic expressions~\cite{DRILLER-NDSS16}.
137 | %Global memory is defined as a map $\mu$ from 32-bit addresses ({\em indexes}) to expressions. When a symbolic index $i$ is used to read memory, the algorithm generates a memory object $M$ containing the projection of $\mu$ over all the valid values that $i$ can assume. The evaluation of a $load(\mu,i)$ operation is thus reduced to $M[i]$, where $M$ is typically orders of magnitude smaller than the entire memory $\mu$.
138 | %Instantiating a memory object still requires finding all the possible values for a symbolic index. A naive algorithm would employ the constraint solver to refine the range of an index using binary search under the current path constraints. 
139 | This algorithm comes with a number of caveats: for instance, querying the solver on each symbolic dereference is expensive, the memory range may not be contiguous, and the values within the memory region of a symbolic pointer might have structure. {\sc Mayhem} thus performs a number of optimizations such as {\em value-set analysis}~\cite{VSA-CC04} and forms of query caching (Section~\ref{se:constraint-solving}) to refine ranges efficiently. If at the end of the process the range size exceeds a given threshold (e.g., 1024), the address is concretized. {\sc Angr}~\cite{ANGR-SSP16} also adopts the partial memory model idea and extends it by optionally supporting write operations on symbolic pointers that range within small contiguous intervals (up to 128 addresses). % [D] ptr may also be redirected to symbolic data
140 | 
141 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
142 | %\subsection{Complex Objects}
143 | %
144 | 
145 | \subsection{Lazy Initialization}
146 | \label{ss:complex-objects}
147 | 
148 | \cite{KPV-TACAS03} \revedit{proposes} symbolic execution techniques for advanced object-oriented language constructs, such as those offered by C++ and Java. The authors describe a framework for software verification that combines symbolic execution and model checking to handle linked data structures such as lists and trees. % [D] added dynamically allocated & discarded primitive data types, and concurrency.
149 | 
150 | In particular, they generalize symbolic execution by introducing {\em lazy initialization} to effectively handle dynamically allocated objects. Compared to our warm-up example from Section~\ref{symbolic-execution-example}, the state representation is extended with a {\em heap configuration} used to maintain such objects. Symbolic execution of a method taking complex objects as inputs starts with uninitialized fields, and assigns values to them in a lazy fashion, i.e., they are initialized when first accessed during execution.
151 | 
152 | When an uninitialized reference field is accessed, the algorithm forks the current state with three different heap configurations, in which the field is initialized with: (1) {\tt null}, (2) a reference to a new object with all symbolic attributes, and (3) a previously introduced concrete object of the desired type, respectively. \iffullver{This on-demand concretization enables symbolic execution of methods without the need for any previous knowledge on the number of objects given as input. Also, forking the state as in (3) results in a systematic treatment of aliasing, i.e., when an object can be accessed through multiple references.}{}
153 | 
154 | \cite{KPV-TACAS03,SPF-ISSTA04} combine lazy initialization with user-provided {\em method preconditions}, i.e., conditions that are assumed to be true before the execution of a method. Preconditions are used to characterize those program input states in which the method is expected to behave as intended by the programmer. For instance, we expect a binary tree data structure to be acyclic and with every node -- except for the root -- having exactly one parent. Conservative preconditions are used to ensure that incorrect heap configurations are eliminated during initialization, speeding up the symbolic execution process. %\mytempedit{To better illustrate this technique, we now discuss an example in which lazy initialization is used to handle a {\tt struct} data type.}
155 | 
156 | \begin{figure*}[t]
157 |   %\vspace{-3mm}
158 |   \centering
159 |   \includegraphics[width=0.875\columnwidth]{images/lazy-initialization} % TODO was 0.9
160 |   \vspace{-0.75mm}
161 |   \caption{Example of lazy initialization.}
162 |   \label{fig:example-lazy-initialization}
163 |   %\vspace{-3mm}
164 | \end{figure*}
165 | 
166 | \boxedexample{
167 | % For the sake of simplicity, we assume that fragment C does not actually evaluate {\tt l->next}, but leaves this task to fragment A. When expanding the [...] 
168 | %the value of
169 |  Figure~\ref{fig:example-lazy-initialization} shows a recursive Java method {\tt add}, which appends a node of type {\tt Node} to a linked list, and a minimal representation of its symbolic execution when applying lazy initialization. The tree nodes represent executions of straight-line fragments of {\tt add}. Initially, fragment A evaluates reference {\tt l}, which is symbolic and thus uninitialized. The symbolic engine considers three  options: (1) {\tt l} is {\tt null}, (2) {\tt l} points to a new object, and (3) {\tt l} points to a previously allocated object. Since this is the first time that a reference of type {\tt Node} is met, option (3) is ruled out. The two remaining options are then expanded, executing the involved fragments. While the first path ends after executing fragment B, the second one implicitly creates a new object {\tt o$_\texttt{1}$} due to lazy initialization and then executes C, recursively invoking {\tt add}. When expanding the recursive call, fragment A is executed and the three options are again considered by the engine, which forks into three distinct paths. Option (3) is now taken into account since a {\tt Node} object has been previously allocated (i.e., {\tt o$_\texttt{1}$}). However, this path is soon aborted by the engine since it violates the acyclicity precondition (expressed as a comment in this example). The other forked paths are further expanded, repeating the same process. Since the linked list has an unknown maximum length, the exploration can proceed indefinitely. For this reason, it is common to assume an upper bound on the depth of the materialization (i.e., field instantiation) chain.
170 | }
171 | 
172 | % \boxedexample{Consider the C function {\tt add} shown in Figure~\ref{fig:example-lazy-initialization}. This recursive function appends a node of type {\tt node\_t} to the tail of a linked list. A compact representation of the symbolic tree for {\tt add} when applying lazy initialization is given in Figure~\ref{fig:example-lazy-initialization}. Tree nodes A, B, C, and D represent execution of straight-line fragments of code in {\tt add}. Initially, fragment A evaluates the value of the pointer {\tt l}, which is symbolic and thus uninitialized. The symbolic engine considers the three possible options: (1) {\tt l} is {\tt NULL}, (2) {\tt l} points to a new object of type {\tt node\_t}, and (3) {\tt l} points to a previously allocated object. Since this is the first time that a pointer of type {\tt node\_t} is met, option (3) is not considered. The two remaining options are then expanded, executing the required fragments. While the first path ends after executing fragment B, the second path implicitly creates a new object {\tt o$_\texttt{1}$} due to lazy initialization and then executes C, recursively invoking the {\tt add} function. For the sake of simplicity, we assume that fragment C does not actually evaluate {\tt l->next}, but leaves this task to fragment A. Expanding the recursive call, fragment A is executed and the three options are again considered by the engine, forking into three distinct paths. In this case, option (3) is taken into account since an object of type {\tt node\_t} has been previously allocated (i.e., {\tt o$_\texttt{1}$}). However, this forked path is soon aborted by the engine since it violates the acyclic precondition (which is simply expressed as a comment in this example). The other forked paths are further expanded, repeating the same process. Since the linked list has an unknown maximum length, the exploration can proceed indefinitely. 
For this reason, it is common to assume an upper bound on the depth of the materialization chain.}
173 | 
174 | Recent advances in the area have focused on improving efficiency in generating heap configurations. For instance, in~\cite{DLR-ASE12} the concretization of a reference variable is deferred until the object is actually accessed. The work also provides a formalization of lazy initialization. \cite{BLISS-TSE15} instead employs bound refinement to prune uninteresting heap configurations by using information from already concretized fields, while a SAT solver is used to check whether declarative -- rather than imperative as in the original algorithm -- preconditions hold for a given configuration.
175 | %For instance, in~\cite{DLR-ASE12} the concretization of a reference variable is deferred until the object is actually accessed. The work also provides a formalization of lazy initialization. \cite{BLISS-TSE15} instead employs bound refinement to prune uninteresting heap configurations by using information from already concretized fields, while a SAT solver is used to check whether declarative -- rather than imperative as in the original algorithm -- preconditions hold for a given configuration.
176 | %Further refinements to lazy initialization are described in a number of works, e.g.,~\cite{DLR-ASE12,BLI-NFM13,BLISS-TSE15}. \cite{DLR-ASE12} besides providing a formalization of this technique, extends lazy initialization by adding support for subtypes and by deferring even further concretization when possible (e.g., a check for nullity does not always imply immediate materialization for an object). \cite{BLI-NFM13} presents {\em bounded lazy initialization} (BLI), which exploits {\em tight field bounds}~\cite{GRP-ISSTA10} to prune unfeasible heap configurations. BLISS~\cite{BLISS-TSE15} extends BLI by integrating two techniques: {\em bound refinement} and {\em satisfiability checks}. The former prunes uninteresting heap configurations by leveraging information from already-concretized fields, while the latter queries a SAT solver to check declarative preconditions, discarding unrealistic heap configurations.
177 | 
178 | 
179 | %, which all share the goal of reducing the number of heap configurations to generate when forking the state. extends lazy initialization by handling subtypes and by making the approach even more lazier, provides a formal treatment of lazy initialization in Java.
180 | 
181 | \iffullver{
182 | \myparagraph{Verifying Client Code Only}
183 | Of a different flavor is the technique presented in~\cite{SHZ-TAIC07} for symbolic execution over objects instantiated from commonly used libraries. The authors argue that performing symbolic execution at the representation level might be redundant if the aim is to only check the client code, thus trusting the correctness of the library implementation. They discuss the idea of symbolically executing methods of the Java {\tt String} class using a finite-state automaton that abstracts away the implementation details. They present a case study of an application that dynamically generates SQL queries: symbolic execution is used to check whether the statements conform to the SQL grammar and possibly match injection patterns. \iffullver{The authors mention that their approach might be used to symbolically execute over standard container classes such as trees or maps. It is worth mentioning that symbolic execution is used to detect SQL injection vulnerabilities also in~\cite{FLP-COMPSAC07}.}{The authors mention that their approach might be used to symbolically execute over standard container classes such as trees or maps.}
184 | }{}
185 | 
186 | %% citations for SL tools omitted
187 | % Several tools based on SL are available to date for automatically finding memory bugs in user~\cite{INFER} and system-level code~\cite{SLAYER-CAV11}, and for verifying annotated programs with respect to, e.g., memory safety properties~\cite{VERIFAST-APLAS10} and design patterns~\cite{JSTAR-OOPSLA08}. While tailor-made theorem provers are implemented in many extant tools, recent works~\cite{BPS-ENTCS09,PWZ-CAV13}
188 | 
189 | % While some of them implement tailor-made theorem provers, it has been shown~\cite{BPS-ENTCS09,PWZ-CAV13} that provers for decidable fragments of SL can be integrated in an SMT solver, allowing for complete combinations with other theories relevant for program verification. This paves the way for interesting applications of SL in general-purpose verification tools. In particular, symbolic executors could use it to reason inductively over manipulations of data structures such as lists and trees in C and Java programs. To the best of our knowledge, while symbolic execution is at the core of SL, there have not been applications of SL in symbolic executors yet. We believe this might represent a promising research direction to follow.
190 | 
191 | 
192 | % Additional optimizations are presented in~\cite{DLR-ASE12}, which also provides a complete formalization of this approach for the Java language.
193 | 
194 | % [D] this is related to input test generation
195 | %Also, generated heap configurations are pairwise non-isomorphic: eliminating symmetric structures can greatly reduce the number of heaps that a symbolic executor must explore, while guaranteeing that no relevant states are missed~\cite{BLISS-TSE15}. 
196 | 
197 | %~\cite{KPV-TACAS03,SPF-ISSTA04} combine lazy initialization with user-provided {\em method preconditions}, i.e., conditions which are assumed to be true before the execution of a method. Such conditions are used to characterize those input states in which the method is expected to behave as intended by the programmer. For instance, we expect a binary tree data structure to be acyclic and with every node - except for the root - having exactly one parent. Conservative method preconditions are used to ensure that incorrect structures are eliminated during initialization, speeding the symbolic execution process up.
198 | 
199 | %Further refinements to lazy initialization are described in a number of works. \cite{BLI-NFM13} introduces {\em bounded lazy initialization} (BLI) to reduce the number of alternatives to explore using available field bounds expressed in TACO, a tool for SAT-based bounded verification of JML-annotated Java code. ~\cite{BLISS-TSE15} presents two novel techniques that build upon BLI. The first technique refines field bounds by leveraging information from already-concretized fields; the technique is then extended by  auxiliary satisfiability checks to determine the feasibility of partially symbolic structure.
200 | 


--------------------------------------------------------------------------------
/misc/glossary.tex:
--------------------------------------------------------------------------------
 1 | % !TEX root = main.tex
 2 | 
 3 | \iffalse
 4 | \section{Glossary}
 5 | \label{se:glossary}
 6 | 
 7 | \noindent {\bf Complete analysis.} Analysis that guarantees no false positives, i.e., all reported property violations are true.
 8 | 
 9 | \smallskip\noindent {\bf Concrete execution.} An execution of a program using concrete inputs in a real-world environment.
10 | 
11 | \smallskip\noindent {\bf Concolic execution.} \ldots
12 | 
13 | \smallskip\noindent {\bf Control flow graph (CFG).} Representation of a program that uses nodes to model instructions and edges to model the control flow between them.
14 | 
15 | \smallskip\noindent {\bf Control flow path.} Path in the control flow graph of a program. Represents the sequence of instructions executed by the program for a given concrete input.
16 | 
17 | \smallskip\noindent {\bf Decidable analysis} \ldots
18 | 
19 | \smallskip\noindent {\bf Model checker.} Given a model of a system, a model checker exhaustively and automatically checks whether the model meets a given specification.
20 | 
21 | \smallskip\noindent {\bf Path constraints.} \ldots
22 | 
23 | \smallskip\noindent {\bf SMT solver.} A Satisfiability Modulo Theories (SMT) instance is a formula in first-order logic, where some function and predicate symbols have additional interpretations, and SMT is the problem of determining whether such a formula is satisfiable. An SMT solver is a tool able to reason over SMT formulas.
24 | 
25 | \smallskip\noindent {\bf Sound analysis.} Analysis that guarantees no false negatives, i.e., if there is a property violation, then it is reported.
26 | 
27 | \smallskip\noindent {\bf Symbolic execution.} \ldots
28 | 
29 | \smallskip\noindent {\bf Symbolic store.} \ldots
30 | 
31 | \smallskip\noindent {\bf Symbolic value.} \ldots
32 | \fi


--------------------------------------------------------------------------------
/misc/loops.tex:
--------------------------------------------------------------------------------
 1 | % !TEX root = main.tex
 2 | 
 3 | \section{Loops}
 4 | \label{se:loops}
 5 | 
 6 | Loops are one of the main causes of path explosion: each iteration of a loop can be seen as an {\tt if-goto} statement, leading to a conditional branch in the execution tree. If the loop condition involves one or more symbolic values, the number of generated branches is potentially infinite.
 7 | 
 8 | \begin{figure}[t]
 9 | \begin{center}
10 | \begin{tabular}{c}
11 | \begin{lstlisting}[basicstyle=\ttfamily\scriptsize]
12 | 1.  int x = sym_input(); // e.g., read from file
13 | 2.  while (x > 0) {
14 | 3.     x = sym_input();  
15 | 4.  }
16 | \end{lstlisting}
17 | \end{tabular}
18 | \end{center}
19 | \vspace{-2mm}
20 | \caption{Loop example with input read from the environment~\protect\cite{CS-CACM13}.}
21 | \label{fi:example-loop}
22 | \end{figure}
23 | 
24 | \vspace{-2pt} % TODO
25 | \boxedexample{Consider the code fragment of Figure~\ref{fi:example-loop}~\cite{CS-CACM13}, where \texttt{sym\_input()} is an external routine that interacts with the environment (e.g., by reading input data from a network) and returns a fresh symbolic input. The path constraint set at any final state has the form: 
26 | \[ \pi = \left ( \bigwedge_{i \in [1, k]} \alpha_i > 0 \right ) \wedge (\alpha_{k+1} \leq 0) \] 
27 | where $k$ is the number of iterations and $\alpha_i$ is the symbol produced by \texttt{sym\_input()} at the $i$-th iteration.}
28 | 
29 | \noindent The problem of path explosion due to symbolic execution of loops has been attacked from different sides. A first natural strategy adopted by many symbolic engines is to limit the loop exploration up to a certain number of iterations. Obviously, this may lead to missing interesting paths in the program. For this reason, some works (e.g., {\sc AEG}~\cite{AEG-NDSS11}) have also considered the opposite strategy, allowing the engine to fully explore some loops. To mitigate the path explosion problem, only a single instance of the symbolic executor is allowed to fully unroll a loop, while other instances conservatively explore other paths. This approach has been shown to be effective in some application contexts such as security (e.g., identification of buffer overflows) where interesting behavior may be observed at the loop boundaries.
30 | 
31 | By using static or dynamic analysis techniques, it may be possible to derive properties over a loop that can be exploited by the symbolic engine to significantly prune branching paths. For instance, knowledge of the exact number of loop iterations---or at least a constant upper bound on it---can significantly help the engine. Section~\ref{precontioned-symbolic-execution} provides a more general discussion of how preconditions can help symbolic execution. Conversely, symbolic execution itself can be used to derive loop invariants. Indeed, if a program contains an assertion after the loop, the approach presented in~\cite{PV-SPIN04} works backwards from the property to be checked and iteratively applies approximation to derive loop invariants. The main idea is to pick the asserted property as the initial invariant candidate and then to exploit symbolic execution to check whether this property is inductive. If the invariant cannot be verified for some loop paths, it is replaced by a different invariant. The next candidate for the invariant is generated by exploiting the path constraints for the paths on which the verification has failed. Additional refinement steps are performed to guarantee termination.
32 | 
33 | %this can be exploited by a symbolic engine for automatically discovering some invariants over the loop. In~\cite{PV-SPIN04}, this is achieved by iteratively using \mynote{[D] Define?} invariant strengthening and approximation techniques. 
34 | 
35 | \cite{GL-ISSTA11} presents a technique that automatically derives partial summarizations for loops. A loop summarization is similar to a function summary (Section~\ref{ss:caching}), using a set of preconditions and a set of postconditions. These are computed dynamically during the symbolic execution by reasoning on the dependencies among loop conditions and symbolic variables. As soon as a loop summary is computed, it is cached for possible subsequent reuse. This not only allows the symbolic engine to avoid redundant executions of the same loop under the same program state, but also makes it possible to generalize the loop summary to cover even different executions of the same loop that run under different conditions. A main limitation of this approach is that it can generate summaries only for loops that iteratively update symbolic variables across loop iterations by adding a constant, non-zero amount.
36 | 
37 | \cite{SST-ATVA13} introduces a technique of a different flavor that analyzes cyclic paths in the control flow graph of a given program and produces {\em templates} that declaratively describe the program states generated by these portions of code into a symbolic execution tree. By exploiting templates, the symbolic execution engine needs to explore a significantly reduced number of program states. A drawback of this approach is that templates introduce quantifiers in the path constraints: in turn, this may significantly increase the burden on the constraint solver.
38 | 
39 | % [D] I don't think mentioning trip counts adds value to the discussion, better keep things simple
40 | % By relating {\em trip counts} (i.e., number of iterations for loops) with features of the program input
41 | It has also been observed that loop executions may strictly depend on input features. {\em Loop-extended symbolic execution}~\cite{SPM-ISSTA09} is able to effectively explore a loop whenever a grammar describing the program input is available. Relating the number of iterations with features of the program input can guide the exploration of the program states generated by a loop.
42 | 


--------------------------------------------------------------------------------
/overview.tex:
--------------------------------------------------------------------------------
1 | % !TEX root = main.tex
2 | 
3 | 
4 | 
5 | 


--------------------------------------------------------------------------------
/submissions/fifth/appendix.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/fifth/appendix.pdf


--------------------------------------------------------------------------------
/submissions/fifth/main.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/fifth/main.pdf


--------------------------------------------------------------------------------
/submissions/fifth/survey-with-appendix.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/fifth/survey-with-appendix.pdf


--------------------------------------------------------------------------------
/submissions/first/cover_letter.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/first/cover_letter.docx


--------------------------------------------------------------------------------
/submissions/first/cover_letter.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/first/cover_letter.pdf


--------------------------------------------------------------------------------
/submissions/first/proof.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/first/proof.pdf


--------------------------------------------------------------------------------
/submissions/first/survey.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/first/survey.pdf


--------------------------------------------------------------------------------
/submissions/fourth/ACM-CSUR-Revision.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/fourth/ACM-CSUR-Revision.pdf


--------------------------------------------------------------------------------
/submissions/fourth/proof.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/fourth/proof.pdf


--------------------------------------------------------------------------------
/submissions/fourth/survey.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/fourth/survey.pdf


--------------------------------------------------------------------------------
/submissions/second/proof.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/second/proof.pdf


--------------------------------------------------------------------------------
/submissions/second/survey-similarities.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/second/survey-similarities.pdf


--------------------------------------------------------------------------------
/submissions/second/survey-symbolic-exec-v1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/second/survey-symbolic-exec-v1.pdf


--------------------------------------------------------------------------------
/submissions/third/proof.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/third/proof.pdf


--------------------------------------------------------------------------------
/submissions/third/survey.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/submissions/third/survey.pdf


--------------------------------------------------------------------------------
/symbolic.bib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/season-lab/survey-symbolic-execution/b7bc4026dcab38f5f1ed048b4ebc879d3a77cf0f/symbolic.bib


--------------------------------------------------------------------------------
/tables.tex:
--------------------------------------------------------------------------------
 1 | % !TEX root = appendix.tex
 2 | 
 3 | \section{Additional Tables}
 4 | 
 5 | \begin{table}[b]
 6 |   \centering
 7 |   \begin{adjustbox}{width=\columnwidth}
 8 |   %\begin{small}
 9 |   \begin{tabular}{| l || c || l |}
10 |     \hline      
11 |     {\bf Symbolic engine} & {\bf References} & {\bf Project URL} (last retrieved: December 2017)  \\ \hline\hline
12 |     
13 |     % CNC is not a symbolic engine but it uses constrained solver
14 |     %{\sc Check 'n' Crash} & \cite{CS-ICSE05} & \url{http://ranger.uta.edu/~csallner/cnc/}\\
15 |     
16 |     {\sc CUTE} & \cite{CUTE-FSE05} & -- \\
17 |     {\sc DART} & \cite{DART-PLDI05} & -- \\
18 |     {\sc jCUTE} & \cite{SA-CAV06} & \url{https://github.com/osl/jcute} \\ % : Java Concolic Unit Testing Engine
19 |     {\sc KLEE} & \cite{EXE-CCS06,KLEE-OSDI08} & \url{https://klee.github.io/} \\ % : a LLVM Execution Engine
20 |     {\sc SAGE} & \cite{SAGE-NDSS08,EGL-ISSTA09} & -- \\
21 |     {\sc BitBlaze} & \cite{BITBLAZE-ICISS08} & \url{http://bitblaze.cs.berkeley.edu/} \\ % , BHK-TR07
22 |     {\sc CREST} & \cite{CREST-ASE08} & \url{https://github.com/jburnim/crest} \\ % : a concolic test generation tool for C
23 |     {\sc PEX} & \cite{PEX-TAP08} & \url{http://research.microsoft.com/en-us/projects/pex/} \\
24 |     {\sc Rubyx} & \cite{CF-CCS10} & -- \\
25 |     {\sc Java PathFinder} & \cite{PATHFINDER-ASE10} & \url{http://babelfish.arc.nasa.gov/trac/jpf}\\
26 |     {\sc Otter} & \cite{RSM-ICSE10} & \url{https://bitbucket.org/khooyp/otter/} \\
27 |     {\sc BAP} & \cite{BAP-CAV11} & \url{https://github.com/BinaryAnalysisPlatform/bap} \\
28 |     {\sc Cloud9} & \cite{CLOUD9-EUROSYS11} & \url{http://cloud9.epfl.ch/} \\
29 |     {\sc Mayhem} & \cite{MAYHEM-SP12} & -- \\
30 |     {\sc SymDroid} & \cite{JMF-TECH12} & -- \\
31 |     {\sc \stwoe} & \cite{CKC-TOCS12} & \url{http://s2e.systems/} \\
32 |     {\sc FuzzBALL} & \cite{MMP-ASPLOS12,FUZZBALL-ESORICS13} & \url{http://bitblaze.cs.berkeley.edu/fuzzball.html} \\
33 |     {\sc Jalangi} & \cite{SKB-FSE13} & \url{https://github.com/Samsung/jalangi2} \\
34 |     {\sc Pathgrind} & \cite{S-ICSE04} & \url{https://github.com/codelion/pathgrind} \\
35 |     {\sc Kite} & \cite{V-THESIS14} & \url{http://www.cs.ubc.ca/labs/isd/Projects/Kite} \\
36 |     {\sc SymJS} & \cite{LAG-FSE14} & -- \\
37 |     {\sc CIVL} & \cite{CIVL-SC15} & \url{http://vsl.cis.udel.edu/civl/}\\ % : The Concurrency Intermediate Verification Language 
38 |     {\sc KeY} & \cite{HBR-RV14} & \url{http://www.key-project.org/} \\
39 |     {\sc Angr} & \cite{FIRMALICE-NDSS15,ANGR-SSP16} & \url{http://angr.io/} \\
40 |     {\sc Triton} & \cite{TRITON-SSTIC15} & \url{http://triton.quarkslab.com/} \\
41 |     {\sc PyExZ3} & \cite{BD-TECH15} & \url{https://github.com/thomasjball/PyExZ3} \\
42 |     {\sc JDart} & \cite{JDART-TACAS16} & \url{https://github.com/psycopaths/jdart} \\
43 | 
44 |     {\sc CATG} & -- & \url{https://github.com/ksen007/janala2} \\
45 |     {\sc PySymEmu} & -- & \url{https://github.com/feliam/pysymemu/} \\
46 |     {\sc Miasm} & -- & \url{https://github.com/cea-sec/miasm} \\
47 |     
48 |     \hline  
49 |   \end{tabular}
50 |   %\end{small}
51 |   \end{adjustbox}
52 |   \caption{Selection of symbolic execution engines, along with their reference article(s) and software project web site (if any).}
53 |   \label{tab:symbolic-engines}
54 |   \vspace{-3.2mm} % TODO
55 | \end{table}
56 | 
57 | \vspace{-2pt}
58 | \myparagraph{Tools}
59 | Table~\ref{tab:symbolic-engines} lists a number of symbolic execution engines that have worked as incubators for several of the techniques surveyed in this article. The novel contributions introduced by tools that represented milestones in the area are described in the appropriate sections throughout the main article.
60 | 
61 | \vspace{-1pt}
62 | \myparagraph{Path Selection Heuristics}
63 | Table~\ref{tab:heuristics} provides a categorization of the search heuristics that have been discussed in Section 2.3 of the main article. For each category, we list several works that have proposed interesting embodiments of the category.
64 | 
65 | \begin{table}[t]
66 |   \centering
67 |   \begin{adjustbox}{width=0.99\columnwidth} % TODO was 1; with 0.88 the last paragraph will fit
68 |   \begin{small}
69 |   \begin{tabular}{| l || l |}
70 |     \hline      
71 |     {\bf Heuristic} & {\bf Goal} \\ \hline\hline
72 |     \multirow{2}*{BFS} & {\em Maximize coverage} \\ & \cite{CKC-TOCS12,PEX-TAP08} \\\hline
73 |     \multirow{3}*{DFS} & {\em Exhaust paths, minimize memory usage} \\ & \cite{EXE-CCS06,CKC-TOCS12}\\ & \cite{PEX-TAP08,DART-PLDI05} \\\hline
74 |     \multirow{2}*{Random path selection} & {\em Randomly pick a path with probability based on its length} \\ & \cite{KLEE-OSDI08} \\\hline
75 |     %low-covered code & prioritize paths that execute low-covered code  & \cite{EXE-CCS06} \\
76 |     \multirow{4}*{Code coverage search} & {\em Prioritize paths that may explore unexplored code or that may} \\ & {\em soon reach a particular target program point}  \\ & \cite{EXE-CCS06,KLEE-OSDI08,MAYHEM-SP12}\\ & \cite{CKC-TOCS12,GV-ISSTA02,MPF-SAS11} \\\hline
77 |     \multirow{2}*{Buggy-path-first} & {\em Prioritize bug-friendly paths} \\ & \cite{AEG-NDSS11} \\\hline
78 |     \multirow{2}*{Loop exhaustion} & {\em Fully explore specific loops} \\ & \cite{AEG-NDSS11} \\\hline
79 |     \multirow{2}*{Symbolic instruction pointers} & {\em Prioritize paths with symbolic instruction pointers} \\ & \cite{MAYHEM-SP12} \\\hline
80 |     \multirow{2}*{Symbolic memory accesses} & {\em Prioritize paths with symbolic memory accesses} \\ & \cite{MAYHEM-SP12} \\ \hline
81 |     \multirow{2}*{Fitness function} & {\em Prioritize paths based on a fitness function} \\ & \cite{XTD-DSN09,CS-CACM13} \\ \hline
82 |     \multirow{3}*{Subpath-guided search} & {\em Use frequency distributions of explored subpaths to prioritize}\\ & {\em less covered parts of a program} \\ & \cite{LZL-OOPSLA13} \\ \hline
83 |     \multirow{2}*{Property-guided search} & {\em Prioritize paths that are most likely to satisfy the target property} \\ & \cite{ZCWDL15} \\ 
84 |     %kill path & filter uninteresting path & \cite{CKC-TOCS12} \\
85 |     \hline  
86 |   \end{tabular}
87 |   \end{small}
88 |   \end{adjustbox}
89 |   \caption{Common path selection heuristics discussed in Section 2.3.} % of the main article
90 |   \label{tab:heuristics}
91 | \end{table}
92 | 


--------------------------------------------------------------------------------