├── .gitignore ├── GPL ├── LGPL ├── LICENSE ├── Makefile ├── README ├── bench ├── .gitignore ├── bpramcount ├── bpramcount.cpp ├── makefile-pin ├── makefile-postmark ├── microbench.py ├── parse_bpramcount ├── postmark-1_5.c ├── postmark.large.config └── postmark.small.config ├── bpfs.c ├── bpfs.h ├── bpfs_structs.h ├── crawler.c ├── crawler.h ├── dcache.c ├── dcache.h ├── fuse_limits ├── hash_map.c ├── hash_map.h ├── indirect_cow.c ├── indirect_cow.h ├── mkbpfs.c ├── mkbpfs.h ├── mkfs.bpfs.c ├── pool.h ├── pwrite.c ├── todo ├── util.h ├── vector.c └── vector.h /.gitignore: -------------------------------------------------------------------------------- 1 | bpfs 2 | mkfs.bpfs 3 | pwrite 4 | *.o 5 | tags 6 | TAGS 7 | mnt 8 | bench/postmark-1_5 9 | bench/linux-2.6.15.tar 10 | bench/httpd-2.0.63.tar.gz 11 | -------------------------------------------------------------------------------- /GPL: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. 
If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 
102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. 
However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. 
If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /LGPL: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 2.1, February 1999 3 | 4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc. 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | [This is the first released version of the Lesser GPL. It also counts 10 | as the successor of the GNU Library Public License, version 2, hence 11 | the version number 2.1.] 12 | 13 | Preamble 14 | 15 | The licenses for most software are designed to take away your 16 | freedom to share and change it. By contrast, the GNU General Public 17 | Licenses are intended to guarantee your freedom to share and change 18 | free software--to make sure the software is free for all its users. 19 | 20 | This license, the Lesser General Public License, applies to some 21 | specially designated software packages--typically libraries--of the 22 | Free Software Foundation and other authors who decide to use it. You 23 | can use it too, but we suggest you first think carefully about whether 24 | this license or the ordinary General Public License is the better 25 | strategy to use in any particular case, based on the explanations below. 26 | 27 | When we speak of free software, we are referring to freedom of use, 28 | not price. Our General Public Licenses are designed to make sure that 29 | you have the freedom to distribute copies of free software (and charge 30 | for this service if you wish); that you receive source code or can get 31 | it if you want it; that you can change the software and use pieces of 32 | it in new free programs; and that you are informed that you can do 33 | these things. 34 | 35 | To protect your rights, we need to make restrictions that forbid 36 | distributors to deny you these rights or to ask you to surrender these 37 | rights. These restrictions translate to certain responsibilities for 38 | you if you distribute copies of the library or if you modify it. 39 | 40 | For example, if you distribute copies of the library, whether gratis 41 | or for a fee, you must give the recipients all the rights that we gave 42 | you. You must make sure that they, too, receive or can get the source 43 | code. If you link other code with the library, you must provide 44 | complete object files to the recipients, so that they can relink them 45 | with the library after making changes to the library and recompiling 46 | it. And you must show them these terms so they know their rights. 47 | 48 | We protect your rights with a two-step method: (1) we copyright the 49 | library, and (2) we offer you this license, which gives you legal 50 | permission to copy, distribute and/or modify the library. 51 | 52 | To protect each distributor, we want to make it very clear that 53 | there is no warranty for the free library. 
Also, if the library is 54 | modified by someone else and passed on, the recipients should know 55 | that what they have is not the original version, so that the original 56 | author's reputation will not be affected by problems that might be 57 | introduced by others. 58 | 59 | Finally, software patents pose a constant threat to the existence of 60 | any free program. We wish to make sure that a company cannot 61 | effectively restrict the users of a free program by obtaining a 62 | restrictive license from a patent holder. Therefore, we insist that 63 | any patent license obtained for a version of the library must be 64 | consistent with the full freedom of use specified in this license. 65 | 66 | Most GNU software, including some libraries, is covered by the 67 | ordinary GNU General Public License. This license, the GNU Lesser 68 | General Public License, applies to certain designated libraries, and 69 | is quite different from the ordinary General Public License. We use 70 | this license for certain libraries in order to permit linking those 71 | libraries into non-free programs. 72 | 73 | When a program is linked with a library, whether statically or using 74 | a shared library, the combination of the two is legally speaking a 75 | combined work, a derivative of the original library. The ordinary 76 | General Public License therefore permits such linking only if the 77 | entire combination fits its criteria of freedom. The Lesser General 78 | Public License permits more lax criteria for linking other code with 79 | the library. 80 | 81 | We call this license the "Lesser" General Public License because it 82 | does Less to protect the user's freedom than the ordinary General 83 | Public License. It also provides other free software developers Less 84 | of an advantage over competing non-free programs. These disadvantages 85 | are the reason we use the ordinary General Public License for many 86 | libraries. However, the Lesser license provides advantages in certain 87 | special circumstances. 88 | 89 | For example, on rare occasions, there may be a special need to 90 | encourage the widest possible use of a certain library, so that it becomes 91 | a de-facto standard. To achieve this, non-free programs must be 92 | allowed to use the library. A more frequent case is that a free 93 | library does the same job as widely used non-free libraries. In this 94 | case, there is little to gain by limiting the free library to free 95 | software only, so we use the Lesser General Public License. 96 | 97 | In other cases, permission to use a particular library in non-free 98 | programs enables a greater number of people to use a large body of 99 | free software. For example, permission to use the GNU C Library in 100 | non-free programs enables many more people to use the whole GNU 101 | operating system, as well as its variant, the GNU/Linux operating 102 | system. 103 | 104 | Although the Lesser General Public License is Less protective of the 105 | users' freedom, it does ensure that the user of a program that is 106 | linked with the Library has the freedom and the wherewithal to run 107 | that program using a modified version of the Library. 108 | 109 | The precise terms and conditions for copying, distribution and 110 | modification follow. Pay close attention to the difference between a 111 | "work based on the library" and a "work that uses the library". The 112 | former contains code derived from the library, whereas the latter must 113 | be combined with the library in order to run. 
114 | 115 | GNU LESSER GENERAL PUBLIC LICENSE 116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 117 | 118 | 0. This License Agreement applies to any software library or other 119 | program which contains a notice placed by the copyright holder or 120 | other authorized party saying it may be distributed under the terms of 121 | this Lesser General Public License (also called "this License"). 122 | Each licensee is addressed as "you". 123 | 124 | A "library" means a collection of software functions and/or data 125 | prepared so as to be conveniently linked with application programs 126 | (which use some of those functions and data) to form executables. 127 | 128 | The "Library", below, refers to any such software library or work 129 | which has been distributed under these terms. A "work based on the 130 | Library" means either the Library or any derivative work under 131 | copyright law: that is to say, a work containing the Library or a 132 | portion of it, either verbatim or with modifications and/or translated 133 | straightforwardly into another language. (Hereinafter, translation is 134 | included without limitation in the term "modification".) 135 | 136 | "Source code" for a work means the preferred form of the work for 137 | making modifications to it. For a library, complete source code means 138 | all the source code for all modules it contains, plus any associated 139 | interface definition files, plus the scripts used to control compilation 140 | and installation of the library. 141 | 142 | Activities other than copying, distribution and modification are not 143 | covered by this License; they are outside its scope. The act of 144 | running a program using the Library is not restricted, and output from 145 | such a program is covered only if its contents constitute a work based 146 | on the Library (independent of the use of the Library in a tool for 147 | writing it). Whether that is true depends on what the Library does 148 | and what the program that uses the Library does. 149 | 150 | 1. You may copy and distribute verbatim copies of the Library's 151 | complete source code as you receive it, in any medium, provided that 152 | you conspicuously and appropriately publish on each copy an 153 | appropriate copyright notice and disclaimer of warranty; keep intact 154 | all the notices that refer to this License and to the absence of any 155 | warranty; and distribute a copy of this License along with the 156 | Library. 157 | 158 | You may charge a fee for the physical act of transferring a copy, 159 | and you may at your option offer warranty protection in exchange for a 160 | fee. 161 | 162 | 2. You may modify your copy or copies of the Library or any portion 163 | of it, thus forming a work based on the Library, and copy and 164 | distribute such modifications or work under the terms of Section 1 165 | above, provided that you also meet all of these conditions: 166 | 167 | a) The modified work must itself be a software library. 168 | 169 | b) You must cause the files modified to carry prominent notices 170 | stating that you changed the files and the date of any change. 171 | 172 | c) You must cause the whole of the work to be licensed at no 173 | charge to all third parties under the terms of this License. 
174 | 175 | d) If a facility in the modified Library refers to a function or a 176 | table of data to be supplied by an application program that uses 177 | the facility, other than as an argument passed when the facility 178 | is invoked, then you must make a good faith effort to ensure that, 179 | in the event an application does not supply such function or 180 | table, the facility still operates, and performs whatever part of 181 | its purpose remains meaningful. 182 | 183 | (For example, a function in a library to compute square roots has 184 | a purpose that is entirely well-defined independent of the 185 | application. Therefore, Subsection 2d requires that any 186 | application-supplied function or table used by this function must 187 | be optional: if the application does not supply it, the square 188 | root function must still compute square roots.) 189 | 190 | These requirements apply to the modified work as a whole. If 191 | identifiable sections of that work are not derived from the Library, 192 | and can be reasonably considered independent and separate works in 193 | themselves, then this License, and its terms, do not apply to those 194 | sections when you distribute them as separate works. But when you 195 | distribute the same sections as part of a whole which is a work based 196 | on the Library, the distribution of the whole must be on the terms of 197 | this License, whose permissions for other licensees extend to the 198 | entire whole, and thus to each and every part regardless of who wrote 199 | it. 200 | 201 | Thus, it is not the intent of this section to claim rights or contest 202 | your rights to work written entirely by you; rather, the intent is to 203 | exercise the right to control the distribution of derivative or 204 | collective works based on the Library. 205 | 206 | In addition, mere aggregation of another work not based on the Library 207 | with the Library (or with a work based on the Library) on a volume of 208 | a storage or distribution medium does not bring the other work under 209 | the scope of this License. 210 | 211 | 3. You may opt to apply the terms of the ordinary GNU General Public 212 | License instead of this License to a given copy of the Library. To do 213 | this, you must alter all the notices that refer to this License, so 214 | that they refer to the ordinary GNU General Public License, version 2, 215 | instead of to this License. (If a newer version than version 2 of the 216 | ordinary GNU General Public License has appeared, then you can specify 217 | that version instead if you wish.) Do not make any other change in 218 | these notices. 219 | 220 | Once this change is made in a given copy, it is irreversible for 221 | that copy, so the ordinary GNU General Public License applies to all 222 | subsequent copies and derivative works made from that copy. 223 | 224 | This option is useful when you wish to copy part of the code of 225 | the Library into a program that is not a library. 226 | 227 | 4. You may copy and distribute the Library (or a portion or 228 | derivative of it, under Section 2) in object code or executable form 229 | under the terms of Sections 1 and 2 above provided that you accompany 230 | it with the complete corresponding machine-readable source code, which 231 | must be distributed under the terms of Sections 1 and 2 above on a 232 | medium customarily used for software interchange. 
233 | 234 | If distribution of object code is made by offering access to copy 235 | from a designated place, then offering equivalent access to copy the 236 | source code from the same place satisfies the requirement to 237 | distribute the source code, even though third parties are not 238 | compelled to copy the source along with the object code. 239 | 240 | 5. A program that contains no derivative of any portion of the 241 | Library, but is designed to work with the Library by being compiled or 242 | linked with it, is called a "work that uses the Library". Such a 243 | work, in isolation, is not a derivative work of the Library, and 244 | therefore falls outside the scope of this License. 245 | 246 | However, linking a "work that uses the Library" with the Library 247 | creates an executable that is a derivative of the Library (because it 248 | contains portions of the Library), rather than a "work that uses the 249 | library". The executable is therefore covered by this License. 250 | Section 6 states terms for distribution of such executables. 251 | 252 | When a "work that uses the Library" uses material from a header file 253 | that is part of the Library, the object code for the work may be a 254 | derivative work of the Library even though the source code is not. 255 | Whether this is true is especially significant if the work can be 256 | linked without the Library, or if the work is itself a library. The 257 | threshold for this to be true is not precisely defined by law. 258 | 259 | If such an object file uses only numerical parameters, data 260 | structure layouts and accessors, and small macros and small inline 261 | functions (ten lines or less in length), then the use of the object 262 | file is unrestricted, regardless of whether it is legally a derivative 263 | work. (Executables containing this object code plus portions of the 264 | Library will still fall under Section 6.) 265 | 266 | Otherwise, if the work is a derivative of the Library, you may 267 | distribute the object code for the work under the terms of Section 6. 268 | Any executables containing that work also fall under Section 6, 269 | whether or not they are linked directly with the Library itself. 270 | 271 | 6. As an exception to the Sections above, you may also combine or 272 | link a "work that uses the Library" with the Library to produce a 273 | work containing portions of the Library, and distribute that work 274 | under terms of your choice, provided that the terms permit 275 | modification of the work for the customer's own use and reverse 276 | engineering for debugging such modifications. 277 | 278 | You must give prominent notice with each copy of the work that the 279 | Library is used in it and that the Library and its use are covered by 280 | this License. You must supply a copy of this License. If the work 281 | during execution displays copyright notices, you must include the 282 | copyright notice for the Library among them, as well as a reference 283 | directing the user to the copy of this License. 
Also, you must do one 284 | of these things: 285 | 286 | a) Accompany the work with the complete corresponding 287 | machine-readable source code for the Library including whatever 288 | changes were used in the work (which must be distributed under 289 | Sections 1 and 2 above); and, if the work is an executable linked 290 | with the Library, with the complete machine-readable "work that 291 | uses the Library", as object code and/or source code, so that the 292 | user can modify the Library and then relink to produce a modified 293 | executable containing the modified Library. (It is understood 294 | that the user who changes the contents of definitions files in the 295 | Library will not necessarily be able to recompile the application 296 | to use the modified definitions.) 297 | 298 | b) Use a suitable shared library mechanism for linking with the 299 | Library. A suitable mechanism is one that (1) uses at run time a 300 | copy of the library already present on the user's computer system, 301 | rather than copying library functions into the executable, and (2) 302 | will operate properly with a modified version of the library, if 303 | the user installs one, as long as the modified version is 304 | interface-compatible with the version that the work was made with. 305 | 306 | c) Accompany the work with a written offer, valid for at 307 | least three years, to give the same user the materials 308 | specified in Subsection 6a, above, for a charge no more 309 | than the cost of performing this distribution. 310 | 311 | d) If distribution of the work is made by offering access to copy 312 | from a designated place, offer equivalent access to copy the above 313 | specified materials from the same place. 314 | 315 | e) Verify that the user has already received a copy of these 316 | materials or that you have already sent this user a copy. 317 | 318 | For an executable, the required form of the "work that uses the 319 | Library" must include any data and utility programs needed for 320 | reproducing the executable from it. However, as a special exception, 321 | the materials to be distributed need not include anything that is 322 | normally distributed (in either source or binary form) with the major 323 | components (compiler, kernel, and so on) of the operating system on 324 | which the executable runs, unless that component itself accompanies 325 | the executable. 326 | 327 | It may happen that this requirement contradicts the license 328 | restrictions of other proprietary libraries that do not normally 329 | accompany the operating system. Such a contradiction means you cannot 330 | use both them and the Library together in an executable that you 331 | distribute. 332 | 333 | 7. You may place library facilities that are a work based on the 334 | Library side-by-side in a single library together with other library 335 | facilities not covered by this License, and distribute such a combined 336 | library, provided that the separate distribution of the work based on 337 | the Library and of the other library facilities is otherwise 338 | permitted, and provided that you do these two things: 339 | 340 | a) Accompany the combined library with a copy of the same work 341 | based on the Library, uncombined with any other library 342 | facilities. This must be distributed under the terms of the 343 | Sections above. 
344 | 345 | b) Give prominent notice with the combined library of the fact 346 | that part of it is a work based on the Library, and explaining 347 | where to find the accompanying uncombined form of the same work. 348 | 349 | 8. You may not copy, modify, sublicense, link with, or distribute 350 | the Library except as expressly provided under this License. Any 351 | attempt otherwise to copy, modify, sublicense, link with, or 352 | distribute the Library is void, and will automatically terminate your 353 | rights under this License. However, parties who have received copies, 354 | or rights, from you under this License will not have their licenses 355 | terminated so long as such parties remain in full compliance. 356 | 357 | 9. You are not required to accept this License, since you have not 358 | signed it. However, nothing else grants you permission to modify or 359 | distribute the Library or its derivative works. These actions are 360 | prohibited by law if you do not accept this License. Therefore, by 361 | modifying or distributing the Library (or any work based on the 362 | Library), you indicate your acceptance of this License to do so, and 363 | all its terms and conditions for copying, distributing or modifying 364 | the Library or works based on it. 365 | 366 | 10. Each time you redistribute the Library (or any work based on the 367 | Library), the recipient automatically receives a license from the 368 | original licensor to copy, distribute, link with or modify the Library 369 | subject to these terms and conditions. You may not impose any further 370 | restrictions on the recipients' exercise of the rights granted herein. 371 | You are not responsible for enforcing compliance by third parties with 372 | this License. 373 | 374 | 11. If, as a consequence of a court judgment or allegation of patent 375 | infringement or for any other reason (not limited to patent issues), 376 | conditions are imposed on you (whether by court order, agreement or 377 | otherwise) that contradict the conditions of this License, they do not 378 | excuse you from the conditions of this License. If you cannot 379 | distribute so as to satisfy simultaneously your obligations under this 380 | License and any other pertinent obligations, then as a consequence you 381 | may not distribute the Library at all. For example, if a patent 382 | license would not permit royalty-free redistribution of the Library by 383 | all those who receive copies directly or indirectly through you, then 384 | the only way you could satisfy both it and this License would be to 385 | refrain entirely from distribution of the Library. 386 | 387 | If any portion of this section is held invalid or unenforceable under any 388 | particular circumstance, the balance of the section is intended to apply, 389 | and the section as a whole is intended to apply in other circumstances. 390 | 391 | It is not the purpose of this section to induce you to infringe any 392 | patents or other property right claims or to contest validity of any 393 | such claims; this section has the sole purpose of protecting the 394 | integrity of the free software distribution system which is 395 | implemented by public license practices. 
Many people have made 396 | generous contributions to the wide range of software distributed 397 | through that system in reliance on consistent application of that 398 | system; it is up to the author/donor to decide if he or she is willing 399 | to distribute software through any other system and a licensee cannot 400 | impose that choice. 401 | 402 | This section is intended to make thoroughly clear what is believed to 403 | be a consequence of the rest of this License. 404 | 405 | 12. If the distribution and/or use of the Library is restricted in 406 | certain countries either by patents or by copyrighted interfaces, the 407 | original copyright holder who places the Library under this License may add 408 | an explicit geographical distribution limitation excluding those countries, 409 | so that distribution is permitted only in or among countries not thus 410 | excluded. In such case, this License incorporates the limitation as if 411 | written in the body of this License. 412 | 413 | 13. The Free Software Foundation may publish revised and/or new 414 | versions of the Lesser General Public License from time to time. 415 | Such new versions will be similar in spirit to the present version, 416 | but may differ in detail to address new problems or concerns. 417 | 418 | Each version is given a distinguishing version number. If the Library 419 | specifies a version number of this License which applies to it and 420 | "any later version", you have the option of following the terms and 421 | conditions either of that version or of any later version published by 422 | the Free Software Foundation. If the Library does not specify a 423 | license version number, you may choose any version ever published by 424 | the Free Software Foundation. 425 | 426 | 14. If you wish to incorporate parts of the Library into other free 427 | programs whose distribution conditions are incompatible with these, 428 | write to the author to ask for permission. For software which is 429 | copyrighted by the Free Software Foundation, write to the Free 430 | Software Foundation; we sometimes make exceptions for this. Our 431 | decision will be guided by the two goals of preserving the free status 432 | of all derivatives of our free software and of promoting the sharing 433 | and reuse of software generally. 434 | 435 | NO WARRANTY 436 | 437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 446 | 447 | 16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 456 | DAMAGES. 457 | 458 | END OF TERMS AND CONDITIONS 459 | 460 | How to Apply These Terms to Your New Libraries 461 | 462 | If you develop a new library, and you want it to be of the greatest 463 | possible use to the public, we recommend making it free software that 464 | everyone can redistribute and change. You can do so by permitting 465 | redistribution under these terms (or, alternatively, under the terms of the 466 | ordinary General Public License). 467 | 468 | To apply these terms, attach the following notices to the library. It is 469 | safest to attach them to the start of each source file to most effectively 470 | convey the exclusion of warranty; and each file should have at least the 471 | "copyright" line and a pointer to where the full notice is found. 472 | 473 | 474 | Copyright (C) 475 | 476 | This library is free software; you can redistribute it and/or 477 | modify it under the terms of the GNU Lesser General Public 478 | License as published by the Free Software Foundation; either 479 | version 2.1 of the License, or (at your option) any later version. 480 | 481 | This library is distributed in the hope that it will be useful, 482 | but WITHOUT ANY WARRANTY; without even the implied warranty of 483 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 484 | Lesser General Public License for more details. 485 | 486 | You should have received a copy of the GNU Lesser General Public 487 | License along with this library; if not, write to the Free Software 488 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 489 | 490 | Also add information on how to contact you by electronic and paper mail. 491 | 492 | You should also get your employer (if you work as a programmer) or your 493 | school, if any, to sign a "copyright disclaimer" for the library, if 494 | necessary. Here is a sample; alter the names: 495 | 496 | Yoyodyne, Inc., hereby disclaims all copyright interest in the 497 | library `Frob' (a library for tweaking knobs) written by James Random Hacker. 498 | 499 | , 1 April 1990 500 | Ty Coon, President of Vice 501 | 502 | That's all there is to it! 503 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BPFS is copyright 2009-2010 The Regents of the University of California. 2 | 3 | BPFS is released under the terms of version 2 of the GNU General Public 4 | License as published by the Free Software Foundation. A copy of this 5 | license is included with Featherstitch in the file "GPL". 6 | 7 | The BPFS Pintool bpramcount is released under the terms of version 2.1 of 8 | the GNU Lesser General Public License as published by the Free Software 9 | Foundation. A copy of this license is included with Featherstitch in the 10 | file "LGPL". 
11 | 12 | The Microsoft Corporation may hold rights to portions of the algorithms 13 | employed by BPFS. 14 | 15 | Some portions of BPFS may also be available under different licenses; 16 | the applicable licenses are listed near the top of each file as appropriate. 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CFLAGS = -Wall -g 3 | # Remove debug checks: 4 | #CFLAGS += -DNDEBUG 5 | # Enable optimizations: 6 | #CFLAGS += -O3 -march=native # GCC >4.4: -flto 7 | # Enable Nehalem optimizations (GCC 4.4 -march only knows up to Core2): 8 | #CFLAGS += -msahf -msse4 -msse4.1 -msse4.2 9 | # Enable gprof: 10 | #CFLAGS += -pg 11 | 12 | .PHONY: all clean 13 | 14 | BIN = bpfs mkfs.bpfs pwrite 15 | OBJS = bpfs.o crawler.o indirect_cow.o mkfs.bpfs.o mkbpfs.o dcache.o \ 16 | hash_map.o vector.o 17 | TAGS = tags TAGS 18 | SRCS = bpfs_structs.h bpfs.h bpfs.c crawler.h crawler.c dcache.h dcache.c \ 19 | indirect_cow.h indirect_cow.c mkbpfs.h mkbpfs.c mkfs.bpfs.c \ 20 | util.h hash_map.h hash_map.c vector.h vector.c pool.h pwrite.c 21 | # Non-compile sources (at least, for this Makefile): 22 | NCSRCS = bench/bpramcount.cpp bench/microbench.py 23 | 24 | all: $(BIN) $(TAGS) 25 | 26 | clean: 27 | rm -f $(BIN) $(OBJS) $(TAGS) 28 | 29 | tags: $(SRCS) $(NCSRCS) 30 | @echo + ctags tags 31 | @if ctags --version | grep -q Exuberant; then ctags $(SRCS) $(NCSRCS); else touch $@; fi 32 | TAGS: $(SRCS) $(NCSRCS) 33 | @echo + ctags TAGS 34 | @if ctags --version | grep -q Exuberant; then ctags -e $(SRCS) $(NCSRCS); else touch $@; fi 35 | 36 | bpfs.o: bpfs.c bpfs_structs.h bpfs.h crawler.h indirect_cow.h \ 37 | mkbpfs.h dcache.h util.h hash_map.h 38 | $(CC) $(CFLAGS) `pkg-config --cflags fuse` -c -o $@ $< 39 | 40 | mkfs.bpfs.o: mkfs.bpfs.c mkbpfs.h util.h 41 | $(CC) $(CFLAGS) -c -o $@ $< 42 | 43 | indirect_cow.o: indirect_cow.c indirect_cow.h bpfs.h bpfs_structs.h util.h \ 44 | hash_map.h 45 | $(CC) $(CFLAGS) -c -o $@ $< 46 | 47 | crawler.o: crawler.c crawler.h bpfs.h bpfs_structs.h util.h 48 | $(CC) $(CFLAGS) -c -o $@ $< 49 | 50 | mkbpfs.o: mkbpfs.c mkbpfs.h bpfs.h bpfs_structs.h util.h 51 | $(CC) $(CFLAGS) -c -o $@ $< 52 | 53 | dcache.o: dcache.c dcache.h hash_map.h util.h 54 | $(CC) $(CFLAGS) -c -o $@ $< 55 | 56 | vector.o: vector.c vector.h 57 | $(CC) $(CFLAGS) -c -o $@ $< 58 | 59 | hash_map.o: hash_map.c hash_map.h vector.h pool.h 60 | $(CC) $(CFLAGS) -c -o $@ $< 61 | 62 | bpfs: bpfs.o crawler.o indirect_cow.o mkbpfs.o dcache.o hash_map.o vector.o 63 | $(CC) $(CFLAGS) `pkg-config --libs fuse` -luuid -o $@ $^ 64 | 65 | mkfs.bpfs: mkfs.bpfs.o mkbpfs.o 66 | $(CC) $(CFLAGS) -luuid -o $@ $^ 67 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | BPFS - byte-addressable file system 2 | 3 | * Requirements 4 | 5 | For a correct system: 6 | - 64-bit Linux (2.6 tested) 7 | - FUSE >= 2.8 8 | 9 | For development: 10 | - 32/64-bit Linux (2.6 tested) 11 | - FUSE >= 2.5 12 | 13 | * Installation 14 | 15 | $ make 16 | 17 | * How to Use 18 | 19 | BPFS can use a memory-mapped file/device or run in DRAM: 20 | - File/device: 21 | 1. (File) Create the file. E.g., dd if=/dev/zero of=bpram.img bs=1M count=$N 22 | 2. Format the file system: ./mkfs.bpfs bpram.img 23 | 3. 
Mount the file system: ./bpfs -f bpram.img $MNT 24 | - DRAM (no need to create a file and contents are lost at exit): 25 | 1. ./bpfs -s $((N * 1024 * 1024)) $MNT 26 | 27 | There are several configuration macros at the top of bpfs.h and bpfs.c. 28 | 29 | You can also profile BPFS's memory write traffic using the Pintool 30 | bench/bpramcount.cpp. bench/bpramcount runs BPFS inside Pin and 31 | contains setup directions. 32 | -------------------------------------------------------------------------------- /bench/.gitignore: -------------------------------------------------------------------------------- 1 | pin 2 | obj-ia32 3 | obj-intel64 4 | -------------------------------------------------------------------------------- /bench/bpramcount: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 4 | # University of California. It is distributed under the terms of version 2 5 | # of the GNU GPL. See the file LICENSE for details. 6 | 7 | # Run bpfs inside Pin to collect stats on bpfs's writes to BPRAM 8 | 9 | DIR="`dirname "$0"`" 10 | 11 | PIN=$DIR/pin/pin 12 | 13 | PINOPTS=${PINOPTS:-} 14 | 15 | if [ ! -d $DIR/pin ]; then 16 | echo "Pin not found at $DIR/pin/." 1>&2 17 | echo "Pin is available from http://www.pintool.org/." 1>&2 18 | echo "bpramcount was developed against Pin 27887 gcc4 ia32/intel64." 1>&2 19 | exit 1 20 | fi 21 | 22 | if [ ! -x $PIN ]; then 23 | echo "Could not find $PIN" 1>&2 24 | exit 1 25 | fi 26 | 27 | # Assume there is just one obj-* so that we don't have to detect the 28 | # appropriate directory 29 | TOOL="`ls $DIR/obj-*/bpramcount.so`" 30 | 31 | if [ ! -f "$TOOL" ]; then 32 | echo "Could not find $DIR/obj-*/bpramcount.so." 1>&2 33 | echo "Did you compile bpramcount.so? 'make -f makefile-pin'." 1>&2 34 | exit 1 35 | fi 36 | 37 | exec $PIN -t $TOOL $PINOPTS -- $DIR/../bpfs "$@" 38 | -------------------------------------------------------------------------------- /bench/bpramcount.cpp: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2.1 3 | * of the GNU LGPL. See the file LICENSE for details. */ 4 | 5 | // This file contains an ISA-portable PIN tool for tracing BPFS writes to BPRAM.
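// Background for the BPRAM_INFO hook used below: this tool discovers the
// BPRAM region by instrumenting calls to a bpfs function named
// "inform_pin_of_bpram" (see Image() near the end of this file). As a
// rough, illustrative sketch only -- the signature and call site shown
// here are assumptions, not copied from bpfs.c -- the bpfs side might
// look like:
//
//   /* bpfs.c (hypothetical sketch) */
//   void inform_pin_of_bpram(const char *addr, size_t size)
//   {
//       /* Intentionally empty: it exists only so that a Pin tool can
//        * attach an analysis routine (InformPinBpramBefore, below) to
//        * calls of this function and read addr/size from the arguments.
//        * The real hook may need to keep the compiler from inlining or
//        * eliding the call. */
//       (void) addr;
//       (void) size;
//   }
//
//   /* ...once BPRAM is mapped and its size is known... */
//   inform_pin_of_bpram(bpram, bpram_size);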
6 | 7 | // TODO: see source/tools/ManualExamples/proccount.cpp for obtaining fn names 8 | 9 | #define __STDC_FORMAT_MACROS 10 | 11 | #include <inttypes.h> 12 | #include <stdio.h> 13 | #include <string.h> 14 | #include "pin.H" 15 | 16 | #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4) 17 | # include <unordered_map> 18 | using std::unordered_map; 19 | #else 20 | # include <ext/hash_map> 21 | # define unordered_map __gnu_cxx::hash_map 22 | #endif 23 | 24 | #define BPRAM_INFO "inform_pin_of_bpram" 25 | 26 | // Max backtrace depth 27 | #define NBSTEPS 20 28 | 29 | // Whether to log each write 30 | #define LOG_WRITES 0 31 | 32 | 33 | const void *bpram_start; 34 | const void *bpram_end; 35 | 36 | UINT64 nbytes; 37 | 38 | FILE *trace; 39 | 40 | KNOB<string> KnobOutputFile(KNOB_MODE_WRITEONCE, "pintool", 41 | "o", "bpramcount.out", "specify output file name"); 42 | 43 | KNOB<BOOL> KnobBacktrace(KNOB_MODE_WRITEONCE, "pintool", 44 | "b", "false", "specify whether to log write backtraces: true/false"); 45 | 46 | 47 | // 48 | // Log the number of bytes written to BPRAM 49 | 50 | VOID RecordMemWrite(VOID *addr, ADDRINT size) 51 | { 52 | if (bpram_start <= addr && addr < bpram_end) 53 | { 54 | nbytes += size; 55 | #if LOG_WRITES 56 | // TODO: Log to memory instead of file; output to file at exit. 57 | // Perhaps structure the stats per IP, rather than as a log. 58 | fprintf(trace, "%zu B to %p\n", size, addr); 59 | #endif 60 | } 61 | } 62 | 63 | VOID Instruction(INS ins, VOID *v) 64 | { 65 | // Checking !INS_IsIpRelWrite() does not seem to affect performance 66 | if (INS_IsMemoryWrite(ins) && !INS_IsStackWrite(ins)) 67 | { 68 | // The Pin manual suggests dividing this into If and Then pieces 69 | // to permit inlining of the If case, but I've found that If-Then 70 | // is slower. Maybe if RecordMemWrite() becomes more expensive 71 | // this tradeoff will change?
72 | INS_InsertPredicatedCall( 73 | ins, IPOINT_BEFORE, (AFUNPTR) RecordMemWrite, 74 | IARG_MEMORYWRITE_EA, 75 | IARG_MEMORYWRITE_SIZE, 76 | IARG_END); 77 | } 78 | } 79 | 80 | 81 | // 82 | // Log the number of bytes written to BPRAM and the backtrace for each write 83 | 84 | struct backtrace 85 | { 86 | backtrace() { memset(ips, 0, sizeof(ips)); } 87 | void *ips[NBSTEPS]; 88 | }; 89 | 90 | bool operator==(const backtrace &bt1, const backtrace &bt2) 91 | { 92 | return !memcmp(bt1.ips, bt2.ips, sizeof(bt1.ips)); 93 | } 94 | 95 | struct backtrace_hash : public std::unary_function<backtrace, size_t> 96 | { 97 | size_t operator()(const backtrace &b) const 98 | { 99 | // uses FNV hash taken from Anvil from std::tr1::hash 100 | size_t r = 2166136261u; 101 | for (size_t i = 0; i < NBSTEPS; i++) 102 | { 103 | r ^= reinterpret_cast<size_t>(b.ips[i]); 104 | r *= 16777619; 105 | } 106 | return r; 107 | } 108 | }; 109 | 110 | typedef unordered_map<backtrace, UINT64, backtrace_hash> backtrace_writes; 111 | 112 | backtrace_writes bt_writes; 113 | 114 | 115 | ADDRINT BpramWriteIf(VOID *addr) 116 | { 117 | return (bpram_start <= addr && addr < bpram_end); 118 | } 119 | 120 | #ifdef __i386__ 121 | # define REG_BP_ARCH REG_EBP 122 | #elif defined(__x86_64__) 123 | # define REG_BP_ARCH REG_RBP 124 | #endif 125 | 126 | struct stack_frame 127 | { 128 | struct stack_frame *next; 129 | void *ret; 130 | }; 131 | 132 | VOID RecordMemWriteBacktrace(CONTEXT *ctxt, VOID *rip, ADDRINT size) 133 | { 134 | const char *btopt = "(Might this be because you are trying to backtrace optimized code?)"; 135 | struct stack_frame *fp = reinterpret_cast<struct stack_frame*>(PIN_GetContextReg(ctxt, REG_BP_ARCH)); 136 | struct stack_frame *last_fp = NULL; 137 | backtrace bt; 138 | int i = 0; 139 | 140 | nbytes += size; 141 | 142 | bt.ips[0] = reinterpret_cast<void*>(PIN_GetContextReg(ctxt, REG_INST_PTR)); 143 | 144 | // Normally rip contains numbers that are small and not in a function. 145 | // But sometimes REG_INST_PTR (aka EIP) is bogus and rip is not.
146 | if (rip) 147 | bt.ips[++i] = rip; 148 | 149 | while (fp >= last_fp && i < NBSTEPS) 150 | { 151 | void *ret; 152 | size_t n; 153 | EXCEPTION_INFO ei; 154 | 155 | n = PIN_SafeCopyEx(&ret, &fp->ret, sizeof(ret), &ei); 156 | if (!n) 157 | { 158 | printf("pin: stack trace failed at depth %d (read ret)\n", i); 159 | printf("%s\n", btopt); 160 | printf("EI: \"%s\"\n", PIN_ExceptionToString(&ei).c_str()); 161 | break; 162 | } 163 | if (!ret) 164 | break; 165 | bt.ips[++i] = ret; 166 | last_fp = fp; 167 | 168 | n = PIN_SafeCopyEx(&fp, &last_fp->next, sizeof(fp), &ei); 169 | if (!n) 170 | { 171 | printf("pin: stack trace failed at depth %d (read next)\n", i); 172 | printf("%s\n", btopt); 173 | printf("EI: \"%s\"\n", PIN_ExceptionToString(&ei).c_str()); 174 | break; 175 | } 176 | } 177 | 178 | backtrace_writes::iterator it = bt_writes.find(bt); 179 | if (it != bt_writes.end()) 180 | it->second += size; 181 | else 182 | bt_writes[bt] = size; 183 | } 184 | 185 | VOID InstructionWithBacktrace(INS ins, VOID *v) 186 | { 187 | // Checking !INS_IsIpRelWrite() does not seem to affect performance 188 | if (INS_IsMemoryWrite(ins) && !INS_IsStackWrite(ins)) 189 | { 190 | INS_InsertIfPredicatedCall( 191 | ins, IPOINT_BEFORE, (AFUNPTR) BpramWriteIf, 192 | IARG_MEMORYWRITE_EA, 193 | IARG_END); 194 | INS_InsertThenPredicatedCall( 195 | ins, IPOINT_BEFORE, (AFUNPTR) RecordMemWriteBacktrace, 196 | IARG_CONTEXT, 197 | IARG_RETURN_IP, 198 | IARG_MEMORYWRITE_SIZE, 199 | IARG_END); 200 | } 201 | } 202 | 203 | 204 | // 205 | // General 206 | 207 | VOID Fini(INT32 code, VOID *v) 208 | { 209 | printf("pin: %" PRIu64 " bytes written to BPRAM\n", nbytes); 210 | 211 | if (trace) 212 | { 213 | fprintf(trace, "total number of bytes written: %" PRIu64 "\n", nbytes); 214 | 215 | fprintf(trace, "write backtraces start:\n"); 216 | for (backtrace_writes::const_iterator it = bt_writes.begin(); 217 | it != bt_writes.end(); 218 | ++it) 219 | { 220 | fprintf(trace, "%" PRIu64, it->second); 221 | for (int i = 0; i < NBSTEPS && it->first.ips[i]; i++) 222 | fprintf(trace, " %p", it->first.ips[i]); 223 | fprintf(trace, "\n"); 224 | } 225 | fprintf(trace, "write backtraces end\n"); 226 | 227 | fclose(trace); 228 | } 229 | } 230 | 231 | VOID InformPinBpramBefore(ADDRINT addr, ADDRINT size) 232 | { 233 | printf("pin: detected %zu MiB (%zu bytes) of BPRAM\n", 234 | size / (1024 * 1024), size); 235 | if (trace) 236 | fprintf(trace, "detected %zu MiB (%zu bytes) of BPRAM @ %p\n", 237 | size / (1024 * 1024), size, (void*) addr); 238 | bpram_start = (void*) addr; 239 | bpram_end = (void*) (addr + size); 240 | } 241 | 242 | VOID Image(IMG img, VOID *v) 243 | { 244 | // Detect the address and size of BPRAM by inspecting a call to 245 | // BPRAM_INFO(). 246 | // Alternatively, we could require debug symbols and lookup 'bpram' and 247 | // 'bpram_size' and either detect when their contents change, to get 248 | // their post-init values, or watch for a known function call made 249 | // after bpram is inited but before fuse starts (eg fuse_mount()). 
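// (Sketch, not taken from this repository's sources: on the BPFS side the
// hook is presumably just a function whose name matches BPRAM_INFO, e.g.
//   void inform_pin_of_bpram(const char *bpram, size_t size) { /* no-op */ }
// called once after BPRAM is set up, so that the instrumentation below can
// read its two entry-point arguments.)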
250 | RTN rtn = RTN_FindByName(img, BPRAM_INFO); 251 | if (RTN_Valid(rtn)) 252 | { 253 | RTN_Open(rtn); 254 | RTN_InsertCall(rtn, IPOINT_BEFORE, (AFUNPTR) InformPinBpramBefore, 255 | IARG_FUNCARG_ENTRYPOINT_VALUE, 0, 256 | IARG_FUNCARG_ENTRYPOINT_VALUE, 1, 257 | IARG_END); 258 | RTN_Close(rtn); 259 | } 260 | } 261 | 262 | int main(int argc, char **argv) 263 | { 264 | PIN_InitSymbols(); 265 | PIN_Init(argc, argv); 266 | 267 | IMG_AddInstrumentFunction(Image, 0); 268 | if (KnobBacktrace.Value() == "true") 269 | { 270 | trace = fopen(KnobOutputFile.Value().c_str(), "w"); 271 | if (!trace) 272 | fprintf(stderr, "pin: unable to open trace file\n"); 273 | INS_AddInstrumentFunction(InstructionWithBacktrace, 0); 274 | } 275 | else 276 | INS_AddInstrumentFunction(Instruction, 0); 277 | #if LOG_WRITES 278 | if (!trace) 279 | { 280 | trace = fopen(KnobOutputFile.Value().c_str(), "w"); 281 | if (!trace) 282 | fprintf(stderr, "pin: unable to open trace file\n"); 283 | } 284 | #endif 285 | PIN_AddFiniFunction(Fini, 0); 286 | if (trace) 287 | { 288 | printf("pin: logging to %s\n", KnobOutputFile.Value().c_str()); 289 | fflush(stdout); 290 | } 291 | 292 | PIN_StartProgram(); // does not return 293 | return 0; 294 | } 295 | -------------------------------------------------------------------------------- /bench/makefile-pin: -------------------------------------------------------------------------------- 1 | ## 2 | ## This is a sample makefile for building Pin tools outside 3 | ## of the Pin environment. This makefile is suitable for 4 | ## building with the Pin kit, not a Pin source development tree. 5 | ## 6 | ## To build the tool, execute the make command: 7 | ## 8 | ## make 9 | ## or 10 | ## make PIN_HOME= 11 | ## 12 | ## After building your tool, you would invoke Pin like this: 13 | ## 14 | ## $PIN_HOME/pin -t MyPinTool -- /bin/ls 15 | ## 16 | ############################################################## 17 | # 18 | # User-specific configuration 19 | # 20 | ############################################################## 21 | 22 | # 23 | # 1. Change PIN_HOME to point to the top-level directory where 24 | # Pin was installed. This can also be set on the command line, 25 | # or as an environment variable. 
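#    For example (paths illustrative; the bench/bpramcount script expects the
#    Pin kit to be unpacked as bench/pin, which matches the default below):
#        make -f makefile-pin
#        make -f makefile-pin PIN_HOME=/opt/pin-kit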
26 | # 27 | PIN_HOME ?= pin 28 | 29 | 30 | ############################################################## 31 | # 32 | # set up and include *.config files 33 | # 34 | ############################################################## 35 | 36 | PIN_KIT=$(PIN_HOME) 37 | KIT=1 38 | TESTAPP=$(OBJDIR)cp-pin.exe 39 | 40 | TARGET_COMPILER?=gnu 41 | ifdef OS 42 | ifeq (${OS},Windows_NT) 43 | TARGET_COMPILER=ms 44 | endif 45 | endif 46 | 47 | ifeq ($(TARGET_COMPILER),gnu) 48 | include $(PIN_HOME)/source/tools/makefile.gnu.config 49 | CXXFLAGS ?= -Wall -Werror -Wno-unknown-pragmas $(DBG) $(OPT) 50 | CXXFLAGS += -std=c++0x # Disable for GCC <4.4 51 | PIN=$(PIN_HOME)/pin 52 | endif 53 | 54 | ifeq ($(TARGET_COMPILER),ms) 55 | include $(PIN_HOME)/source/tools/makefile.ms.config 56 | DBG?= 57 | PIN=$(PIN_HOME)/pin.bat 58 | endif 59 | 60 | 61 | ############################################################## 62 | # 63 | # Tools - you may wish to add your tool name to TOOL_ROOTS 64 | # 65 | ############################################################## 66 | 67 | 68 | TOOL_ROOTS = bpramcount 69 | 70 | TOOLS = $(TOOL_ROOTS:%=$(OBJDIR)%$(PINTOOL_SUFFIX)) 71 | 72 | 73 | ############################################################## 74 | # 75 | # build rules 76 | # 77 | ############################################################## 78 | 79 | all: tools 80 | tools: $(OBJDIR) $(TOOLS) $(OBJDIR)cp-pin.exe 81 | test: $(OBJDIR) $(TOOL_ROOTS:%=%.test) 82 | 83 | MyPinTool.test: $(OBJDIR)cp-pin.exe 84 | $(MAKE) -k -C MyPinTool PIN_HOME=$(PIN_HOME) 85 | 86 | $(OBJDIR)cp-pin.exe: 87 | $(CXX) $(PIN_HOME)/source/tools/Tests/cp-pin.cpp $(APP_CXXFLAGS) -o $(OBJDIR)cp-pin.exe 88 | 89 | $(OBJDIR): 90 | mkdir -p $(OBJDIR) 91 | 92 | $(OBJDIR)%.o : %.cpp 93 | $(CXX) -c $(CXXFLAGS) $(PIN_CXXFLAGS) ${OUTOPT}$@ $< 94 | 95 | $(TOOLS): $(PIN_LIBNAMES) 96 | 97 | $(TOOLS): %$(PINTOOL_SUFFIX) : %.o 98 | ${PIN_LD} $(PIN_LDFLAGS) $(LINK_DEBUG) ${LINK_OUT}$@ $< ${PIN_LPATHS} $(PIN_LIBS) $(DBG) 99 | 100 | 101 | ## cleaning 102 | clean: 103 | -rm -rf $(OBJDIR) *.out *.tested *.failed makefile.copy 104 | -------------------------------------------------------------------------------- /bench/makefile-postmark: -------------------------------------------------------------------------------- 1 | .PHONY: all clean 2 | 3 | postmark-1_5: postmark-1_5.c 4 | $(CC) -O2 $(CFLAGS) -o $@ $^ 5 | 6 | all: postmark-1_5 7 | 8 | clean: 9 | rm -f postmark-1_5 10 | -------------------------------------------------------------------------------- /bench/microbench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 4 | # University of California. It is distributed under the terms of version 2 5 | # of the GNU GPL. See the file LICENSE for details. 6 | 7 | # TODO: 8 | # - try with different types of file systems 9 | # (eg many inodes or dirents or large file) 10 | # - create dir/file cases that are not SCSP optimal (that must CoW some)? 
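# Example invocations (illustrative; run from the repository root with bpfs
# and mkfs.bpfs already built -- see usage() below for the option list):
#   bench/microbench.py                      # all microbenchmarks on BPFS
#   bench/microbench.py -p append_0B_8B      # one benchmark, profiled with Pin
#   bench/microbench.py -t ext4 -d /dev/sdb1 macro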
11 | 12 | import inspect 13 | import getopt 14 | import os 15 | import subprocess 16 | import stat 17 | import sys 18 | import tempfile 19 | import time 20 | 21 | def benchmacro(bench_class): 22 | bench_class.benchmacro = True 23 | return bench_class 24 | 25 | class postmark: 26 | free_space = 6 * 1024 27 | def run(self): 28 | config = tempfile.NamedTemporaryFile() 29 | config.write('set location ' + self.mnt + '\n') 30 | defaults = open(self.config) 31 | for line in defaults: 32 | config.write(line) 33 | defaults.close() 34 | config.flush() 35 | config.seek(0) 36 | subprocess.check_call(['bench/postmark-1_5'], 37 | stdin=config, close_fds=True) 38 | 39 | class benchmarks: 40 | @staticmethod 41 | def all(): 42 | for name, obj in inspect.getmembers(benchmarks): 43 | if inspect.isclass(obj): 44 | yield (name, obj) 45 | 46 | @staticmethod 47 | def micro(): 48 | for (name, obj) in benchmarks.all(): 49 | if not hasattr(obj, 'benchmacro'): 50 | yield (name, obj) 51 | 52 | @staticmethod 53 | def macro(): 54 | for (name, obj) in benchmarks.all(): 55 | if hasattr(obj, 'benchmacro'): 56 | yield (name, obj) 57 | 58 | class empty: 59 | opt = 0 60 | def run(self): 61 | pass 62 | 63 | class create: 64 | # dirent + ino + cmtime + d.ft + d.ino 65 | opt = 4+1+2 + 8+4+4+4+4+8+8+8+3*4 + 4+4 + 1 + 8 66 | def run(self): 67 | open(os.path.join(self.mnt, 'a'), 'w').close() 68 | 69 | class mkdir: 70 | # dirent + ino + cmtime + d.ft + d.nlinks + d.ino + dirent.rec_len 71 | opt = 4+1+2 + 8+4+4+4+4+8+8+8+3*4 + 4+4 + 1 + 4 + 8 + 2 72 | def run(self): 73 | os.mkdir(os.path.join(self.mnt, 'a')) 74 | 75 | class symlink: 76 | # dirent + ino + cmtime + d.ft + d.ino + filename 77 | opt = 4+1+2 + 8+4+4+4+4+8+8+8+3*4 + 4+4 + 1 + 8 + 2 78 | def run(self): 79 | os.symlink('a', os.path.join(self.mnt, 'b')) 80 | 81 | class unlink_0B: 82 | # dirent.ino + cmtime 83 | opt = 8 + 8 84 | def prepare(self): 85 | open(os.path.join(self.mnt, 'a'), 'w').close() 86 | def run(self): 87 | os.unlink(os.path.join(self.mnt, 'a')) 88 | 89 | class unlink_4k: 90 | # dirent.ino + cmtime 91 | opt = 8 + 8 92 | def prepare(self): 93 | file = open(os.path.join(self.mnt, 'a'), 'w') 94 | file.write('1' * 4096) 95 | file.close() 96 | def run(self): 97 | os.unlink(os.path.join(self.mnt, 'a')) 98 | 99 | class unlink_1M: 100 | # dirent.ino + cmtime 101 | opt = 8 + 8 102 | def prepare(self): 103 | file = open(os.path.join(self.mnt, 'a'), 'w') 104 | for i in range(1 * 64): 105 | file.write('1' * (16 * 1024)) 106 | file.close() 107 | def run(self): 108 | os.unlink(os.path.join(self.mnt, 'a')) 109 | 110 | class unlink_16M: 111 | # dirent.ino + cmtime 112 | opt = 8 + 8 113 | def prepare(self): 114 | file = open(os.path.join(self.mnt, 'a'), 'w') 115 | for i in range(16 * 64): 116 | file.write('1' * (16 * 1024)) 117 | file.close() 118 | def run(self): 119 | os.unlink(os.path.join(self.mnt, 'a')) 120 | 121 | class rmdir: 122 | # nlinks + dirent.ino + cmtime 123 | opt = 4 + 8 + 8 124 | def prepare(self): 125 | os.mkdir(os.path.join(self.mnt, 'a')) 126 | def run(self): 127 | os.rmdir(os.path.join(self.mnt, 'a')) 128 | 129 | class unlink_symlink: 130 | # dirent.ino + cmtime 131 | opt = 8 + 8 132 | def prepare(self): 133 | os.symlink('a', os.path.join(self.mnt, 'b')) 134 | def run(self): 135 | os.unlink(os.path.join(self.mnt, 'b')) 136 | 137 | class rename_file_intra: 138 | # TODO: could reduce dirent block by 2*8 and by unused 139 | # inos + dirents + ino_root + cmtime + rec_len + dirent 140 | opt = 2*8 + 4096 + 8 + 2*4 + 2 + 2+1+2+1 141 | def prepare(self): 142 | 
open(os.path.join(self.mnt, 'a'), 'w').close() 143 | def run(self): 144 | os.rename(os.path.join(self.mnt, 'a'), os.path.join(self.mnt, 'b')) 145 | 146 | class rename_file_inter: 147 | # TODO: could reduce dirent blocks by 2*8 and by unused 148 | # TODO: could reduce ino_roots by 2*8 and by unused 149 | # inos + dirents + ino_roots+ ira + cmtime + rec_len + dirent 150 | # (ira = root_inode addr) 151 | opt = 2*8 + 2*4096 + 4096+2*8 + 8 + 4*4 + 2 + 2+1+2+1 152 | def prepare(self): 153 | os.mkdir(os.path.join(self.mnt, 'a')) 154 | os.mkdir(os.path.join(self.mnt, 'b')) 155 | open(os.path.join(self.mnt, 'a', 'c'), 'w').close() 156 | def run(self): 157 | os.rename(os.path.join(self.mnt, 'a', 'c'), 158 | os.path.join(self.mnt, 'b', 'c')) 159 | 160 | class rename_file_clobber: 161 | # TODO: could reduce dirent blocks by 2*8 and by unused 162 | # TODO: could reduce ino_roots by 2*8 and by unused 163 | # inos + dirents + ino_root + cmtime 164 | opt = 2*8 + 4096 + 8 + 2*4 165 | def prepare(self): 166 | open(os.path.join(self.mnt, 'a'), 'w').close() 167 | open(os.path.join(self.mnt, 'b'), 'w').close() 168 | def run(self): 169 | os.rename(os.path.join(self.mnt, 'a'), os.path.join(self.mnt, 'b')) 170 | 171 | class rename_dir_intra: 172 | # TODO: could reduce dirent block by 2*8 and by unused 173 | # inos + dirents + ino_root + cmtime + rec_len + dirent + child ctime 174 | opt = 2*8 + 4096 + 8 + 2*4 + 2 + 2+1+2+1 + 4 175 | # over file, has 3x4B callback_crawl_inode calls: nlinks [on]p, ctime 176 | def prepare(self): 177 | os.mkdir(os.path.join(self.mnt, 'a')) 178 | def run(self): 179 | os.rename(os.path.join(self.mnt, 'a'), os.path.join(self.mnt, 'b')) 180 | 181 | class rename_dir_inter: 182 | # TODO: could reduce dirent blocks by 2*8 and by unused 183 | # TODO: could reduce ino_roots by 2*8 and by unused 184 | # inos + dirents + ino_roots+ ira + cmtime + rec_len + dirent + old and new parent nlinks + child ctime 185 | # (ira = root_inode addr) 186 | opt = 2*8 + 2*4096 + 4096+2*8 + 8 + 4*4 + 2 + 2+1+2+1 + 2*4 + 4 187 | def prepare(self): 188 | os.mkdir(os.path.join(self.mnt, 'a')) 189 | os.mkdir(os.path.join(self.mnt, 'b')) 190 | os.mkdir(os.path.join(self.mnt, 'a', 'c')) 191 | def run(self): 192 | os.rename(os.path.join(self.mnt, 'a', 'c'), 193 | os.path.join(self.mnt, 'b', 'c')) 194 | 195 | class rename_dir_clobber: 196 | # TODO: could reduce dirent blocks by 2*8 and by unused 197 | # TODO: could reduce ino_roots by 2*8 and by unused 198 | # inos + dirents + ino_root + cmtime + old parent nlinks + child ctime 199 | opt = 2*8 + 4096 + 8 + 2*4 + 4 + 4 200 | def prepare(self): 201 | os.mkdir(os.path.join(self.mnt, 'a')) 202 | os.mkdir(os.path.join(self.mnt, 'b')) 203 | def run(self): 204 | os.rename(os.path.join(self.mnt, 'a'), os.path.join(self.mnt, 'b')) 205 | 206 | class link: 207 | # dirent + cmtime + nlinks + ctime + d.ft + d.ino 208 | opt = 4+1+2 + 8 + 4 + 4 + 1 + 8 209 | def prepare(self): 210 | open(os.path.join(self.mnt, 'a'), 'w').close() 211 | def run(self): 212 | os.link(os.path.join(self.mnt, 'a'), os.path.join(self.mnt, 'b')) 213 | 214 | class unlink_hardlink: 215 | # dirent.ino + cmtime + nlinks + ctime 216 | opt = 8 + 8 + 4 + 4 217 | def prepare(self): 218 | open(os.path.join(self.mnt, 'a'), 'w').close() 219 | os.link(os.path.join(self.mnt, 'a'), os.path.join(self.mnt, 'b')) 220 | def run(self): 221 | os.unlink(os.path.join(self.mnt, 'a')) 222 | 223 | class chmod: 224 | # mode + ctime 225 | opt = 4 + 4 226 | def prepare(self): 227 | open(os.path.join(self.mnt, 'a'), 'w').close() 228 | def 
run(self): 229 | os.chmod(os.path.join(self.mnt, 'a'), stat.S_IWUSR | stat.S_IRUSR) 230 | 231 | class chown: 232 | # uid + gid + ctime 233 | opt = 4 + 4 + 4 234 | def prepare(self): 235 | open(os.path.join(self.mnt, 'a'), 'w').close() 236 | def run(self): 237 | os.chown(os.path.join(self.mnt, 'a'), 0, 0) 238 | 239 | class append_0B_8B: 240 | # data + root + size + mtime 241 | opt = 8 + 8 + 8 + 4 242 | def prepare(self): 243 | open(os.path.join(self.mnt, 'a'), 'w').close() 244 | def run(self): 245 | file = open(os.path.join(self.mnt, 'a'), 'a') 246 | file.write('2' * 8) 247 | file.close() 248 | 249 | class append_8B_8B: 250 | # data + size + mtime 251 | opt = 8 + 8 + 4 252 | def prepare(self): 253 | file = open(os.path.join(self.mnt, 'a'), 'w') 254 | file.write('1' * 8) 255 | file.close() 256 | def run(self): 257 | file = open(os.path.join(self.mnt, 'a'), 'a') 258 | file.write('2' * 8) 259 | file.close() 260 | 261 | class append_0B_4k: 262 | # data + root + size + mtime 263 | opt = 4096 + 8 + 8 + 4 264 | def prepare(self): 265 | open(os.path.join(self.mnt, 'a'), 'w').close() 266 | def run(self): 267 | file = open(os.path.join(self.mnt, 'a'), 'a') 268 | file.write('2' * 4096) 269 | file.close() 270 | 271 | class append_8k_4k: 272 | # data + root + size + mtime 273 | opt = 4096 + 8 + 8 + 4 274 | def prepare(self): 275 | file = open(os.path.join(self.mnt, 'a'), 'w') 276 | file.write('1' * (8 * 1024)) 277 | file.close() 278 | def run(self): 279 | file = open(os.path.join(self.mnt, 'a'), 'a') 280 | file.write('2' * 4096) 281 | file.close() 282 | 283 | # 128kiB is the largest that FUSE will atomically write 284 | class append_0B_128k: 285 | # TODO: changing height separately from root is needless 286 | # data + indir + height + root + size + mtime 287 | opt = 128*1024 + 128/4*8 + 8 + 8 + 8 + 4 288 | def prepare(self): 289 | open(os.path.join(self.mnt, 'a'), 'w').close() 290 | def run(self): 291 | file = open(os.path.join(self.mnt, 'a'), 'a') 292 | file.write('2' * (128 * 1024)) 293 | file.close() 294 | 295 | class append_2M_4k: 296 | # data + nr + or + in0 + in1 + size + mtime 297 | opt = 4096 + 8 + 8 + 8 + 8 + 8 + 4 298 | def prepare(self): 299 | file = open(os.path.join(self.mnt, 'a'), 'w') 300 | for i in range(2 * 64): 301 | file.write('1' * (16 * 1024)) 302 | file.close() 303 | def run(self): 304 | file = open(os.path.join(self.mnt, 'a'), 'a') 305 | file.write('2' * 4096) 306 | file.close() 307 | 308 | # 128kiB is the largest that FUSE will atomically write 309 | class append_2M_128k: 310 | # data + indir1 + indir0 + root addr/height + size + mtime 311 | opt = 128*1024 + 128/4*8 + 2*8 + 8 + 8 + 4 312 | def prepare(self): 313 | file = open(os.path.join(self.mnt, 'a'), 'w') 314 | for i in range(2 * 64): 315 | file.write('1' * (16 * 1024)) 316 | file.close() 317 | def run(self): 318 | file = open(os.path.join(self.mnt, 'a'), 'a') 319 | file.write('2' * (128 * 1024)) 320 | file.close() 321 | 322 | class write_1M_8B: 323 | # data + mtime 324 | opt = 8 + 4 325 | def prepare(self): 326 | file = open(os.path.join(self.mnt, 'a'), 'w') 327 | for i in range(64): 328 | file.write('1' * (16 * 1024)) 329 | file.close() 330 | def run(self): 331 | file = open(os.path.join(self.mnt, 'a'), 'r+', 0) 332 | file.write('2' * 8) 333 | file.close() 334 | 335 | class write_1M_8B_4092: 336 | # dCoW + data + iCoW + indir + mtime 337 | opt = 2*4096-8 + 8 + 4096 + 2*8+8 + 4 338 | # extra: iCoW+16 339 | def prepare(self): 340 | file = open(os.path.join(self.mnt, 'a'), 'w') 341 | for i in range(64): 342 | file.write('1' * 
(16 * 1024)) 343 | file.close() 344 | def run(self): 345 | file = open(os.path.join(self.mnt, 'a'), 'r+', 0) 346 | file.seek(4096 - 4) 347 | file.write('2' * 8) 348 | file.close() 349 | 350 | class write_1M_16B: 351 | # CoW + indir + data + mtime 352 | opt = 4096-16 + 8 + 16 + 4 353 | def prepare(self): 354 | file = open(os.path.join(self.mnt, 'a'), 'w') 355 | for i in range(64): 356 | file.write('1' * (16 * 1024)) 357 | file.close() 358 | def run(self): 359 | file = open(os.path.join(self.mnt, 'a'), 'r+', 0) 360 | file.write('2' * 16) 361 | file.close() 362 | 363 | class write_1M_4k: 364 | # data + indir + mtime 365 | opt = 4096 + 8 + 4 366 | def prepare(self): 367 | file = open(os.path.join(self.mnt, 'a'), 'w') 368 | for i in range(64): 369 | file.write('1' * (16 * 1024)) 370 | file.close() 371 | def run(self): 372 | file = open(os.path.join(self.mnt, 'a'), 'r+', 0) 373 | file.write('2' * 4096) 374 | file.close() 375 | 376 | class write_1M_4k_1: 377 | # CoW data + data + indir + mtime 378 | opt = 2*4096-4096 + 4096 + 4096+2*8+8 + 4 379 | def prepare(self): 380 | file = open(os.path.join(self.mnt, 'a'), 'w') 381 | for i in range(64): 382 | file.write('1' * (16 * 1024)) 383 | file.close() 384 | def run(self): 385 | file = open(os.path.join(self.mnt, 'a'), 'r+', 0) 386 | file.seek(1) 387 | file.write('2' * 4096) 388 | file.close() 389 | 390 | # 128kiB is the largest that FUSE will atomically write 391 | class write_1M_128k: 392 | # TODO: avoid CoWing indir slots that will be overwritten 393 | # data + indir + iCoW + root + mtime 394 | opt = 128*1024 + 128/4*8 + 4096 + 8 + 4 395 | def prepare(self): 396 | file = open(os.path.join(self.mnt, 'a'), 'w') 397 | for i in range(64): 398 | file.write('1' * (16 * 1024)) 399 | file.close() 400 | def run(self): 401 | file = open(os.path.join(self.mnt, 'a'), 'r+', 0) 402 | file.write('2' * (128 * 1024)) 403 | file.close() 404 | 405 | # 128kiB is the largest that FUSE will atomically write 406 | class write_1M_124k_1: 407 | # TODO: avoid CoWing indir slots that will be overwritten 408 | # dCoW + data + indir + iCoW + root + mtime 409 | opt = 1+4095 + 124*1024 + 128/4*8 + 4096 + 8 + 4 410 | def prepare(self): 411 | file = open(os.path.join(self.mnt, 'a'), 'w') 412 | for i in range(64): 413 | file.write('1' * (16 * 1024)) 414 | file.close() 415 | def run(self): 416 | file = open(os.path.join(self.mnt, 'a'), 'r+', 0) 417 | file.seek(1) 418 | file.write('2' * (124 * 1024)) 419 | file.close() 420 | 421 | class read: 422 | # atime 423 | opt = 4 424 | def prepare(self): 425 | open(os.path.join(self.mnt, 'a'), 'w').close() 426 | def run(self): 427 | file = open(os.path.join(self.mnt, 'a'), 'r') 428 | file.read(1) 429 | file.close() 430 | 431 | class readdir: 432 | # atime + atime 433 | opt = 4 + 4 434 | def run(self): 435 | os.listdir(self.mnt) 436 | 437 | @benchmacro 438 | class postmark_small(postmark): 439 | config = 'bench/postmark.small.config' 440 | 441 | @benchmacro 442 | class postmark_large(postmark): 443 | config = 'bench/postmark.large.config' 444 | 445 | @benchmacro 446 | class tarx: 447 | free_space = 512 448 | def run(self): 449 | tar_file = 'bench/linux-2.6.15.tar' 450 | subprocess.check_call(['tar', '-xf', tar_file, '-C', self.mnt], 451 | close_fds=True) 452 | 453 | @benchmacro 454 | class delete: 455 | free_space = 512 456 | def prepare(self): 457 | tar_file = 'bench/linux-2.6.15.tar' 458 | subprocess.check_call(['tar', '-xf', tar_file, '-C', self.mnt], 459 | close_fds=True) 460 | def run(self): 461 | subprocess.check_call(['rm', '-rf', 462 | 
os.path.join(self.mnt, 'linux-2.6.15')], 463 | close_fds=True) 464 | 465 | @benchmacro 466 | class build_apache: 467 | free_space = 6 * 1024 468 | def run(self): 469 | tar_file = 'bench/httpd-2.0.63.tar.gz' 470 | path = os.path.join(self.mnt, 'httpd-2.0.63') 471 | devnull = open('/dev/null', 'rw') 472 | subprocess.check_call(['tar', '-xf', tar_file, '-C', self.mnt], 473 | close_fds=True) 474 | subprocess.check_call([os.path.join(path, 'configure')], 475 | stdout=devnull, stderr=devnull, 476 | cwd=path, close_fds=True) 477 | # TODO: why does make exit with an error with devnull? 478 | # 'make -j8 &>/dev/null' does not. 479 | subprocess.check_call(['make', '-j8'], 480 | # stdout=devnull, stderr=devnull, 481 | cwd=path, close_fds=True) 482 | devnull.close() 483 | 484 | @benchmacro 485 | class bonnie: 486 | free_space = 6 * 1024 487 | def run(self): 488 | cmd = ['bonnie++', '-d', self.mnt, '-r', '1024'] 489 | devnull = open('/dev/null', 'rw') 490 | subprocess.check_call(cmd, 491 | stdout=devnull, stderr=devnull, 492 | close_fds=True) 493 | devnull.close() 494 | 495 | @benchmacro 496 | class bonnie_sync: 497 | free_space = 6 * 1024 498 | def run(self): 499 | cmd = ['bonnie++', '-d', self.mnt, '-r', '1024', '-b'] 500 | devnull = open('/dev/null', 'rw') 501 | subprocess.check_call(cmd, 502 | stdout=devnull, stderr=devnull, 503 | close_fds=True) 504 | devnull.close() 505 | 506 | 507 | class filesystem_bpfs: 508 | _mount_overheads = { 'BPFS': 1 } # the valid field 509 | def __init__(self, megabytes): 510 | self.img = tempfile.NamedTemporaryFile() 511 | # NOTE: self.mnt should not be in ~/ so that gvfs does not readdir it 512 | self.mnt = tempfile.mkdtemp() 513 | self.proc = None 514 | for i in range(megabytes * 64): 515 | self.img.write('0' * (16 * 1024)) 516 | def __del__(self): 517 | if self.proc: 518 | self.unmount() 519 | os.rmdir(self.mnt) 520 | def format(self): 521 | subprocess.check_call(['./mkfs.bpfs', self.img.name], close_fds=True) 522 | def mount(self, pinfile=None, count=False): 523 | env = None 524 | if pinfile: 525 | env = os.environ 526 | env['PINOPTS'] = '-b true -o ' + pinfile 527 | bin = './bpfs' 528 | if count: 529 | bin = './bench/bpramcount' 530 | self._count = count 531 | self.proc = subprocess.Popen([bin, '-f', self.img.name, self.mnt], 532 | stdout=subprocess.PIPE, 533 | stderr=subprocess.STDOUT, 534 | close_fds=True, 535 | env=env) 536 | while self.proc.stdout: 537 | line = self.proc.stdout.readline() 538 | if line.startswith('BPFS running'): 539 | self._commit_mode = line.split()[3] 540 | return 541 | raise NameError('Unable to start BPFS') 542 | def unmount(self): 543 | cowed = (-1, -1) 544 | # 'fusermount -u' rather than self.proc.terminate() because the 545 | # second does not always get its signal into the process. 546 | # (In particular for benchmarks.rename_clober when running 547 | # all benchmarks. This behavior seems to come and go.) 
548 | subprocess.check_call(['fusermount', '-u', self.mnt], close_fds=True) 549 | output = self.proc.communicate()[0] 550 | self.proc = None 551 | if not self._count: 552 | return 0 553 | for line in output.splitlines(): 554 | if line.startswith('CoW: ') and line.endswith(' blocks'): 555 | linea = line.split() 556 | cowed = (int(linea[1]), int(linea[4])) 557 | if line.startswith('pin: ') and line.endswith(' bytes written to BPRAM'): 558 | bytes_written = int(line.split()[1]) 559 | if self._commit_mode in self._mount_overheads: 560 | bytes_written -= self._mount_overheads[self._commit_mode] 561 | return (bytes_written, cowed) 562 | raise NameError('BPFS failed to exit correctly') 563 | 564 | class filesystem_kernel: 565 | def __init__(self, fs_name, img): 566 | fs_full = fs_name.split('-') 567 | if len(fs_full) > 1: 568 | self.fs_name = fs_full[0] 569 | self.fs_mode = fs_full[1] 570 | else: 571 | self.fs_name = fs_name 572 | self.fs_mode = None 573 | self.img = img 574 | # NOTE: self.mnt should not be in ~/ so that gvfs does not readdir it 575 | self.mnt = tempfile.mkdtemp() 576 | self.mounted = False 577 | def __del__(self): 578 | if self.mounted: 579 | self.unmount() 580 | os.rmdir(self.mnt) 581 | def format(self): 582 | cmd = ['sudo', 'mkfs.' + self.fs_name, self.img] 583 | if self.fs_name in ['ext2', 'ext3', 'ext4']: 584 | cmd.append('-q') 585 | subprocess.check_call(cmd, close_fds=True) 586 | def _get_dev_writes(self): 587 | dev_name = os.path.basename(self.img) 588 | file = open('/proc/diskstats', 'r') 589 | for line in file: 590 | fields = line.split() 591 | if fields[2] == dev_name: 592 | return int(fields[9]) * 512 593 | raise NameError('Device ' + dev_name + ' not found in /proc/diskstats') 594 | def mount(self, pinfile=None, count=False): 595 | cmd = ['sudo', 'mount', self.img, self.mnt] 596 | if self.fs_mode and self.fs_name in ['ext3', 'ext4']: 597 | cmd.append('-o') 598 | cmd.append('data=' + self.fs_mode) 599 | subprocess.check_call(cmd, close_fds=True) 600 | self.mounted = True 601 | subprocess.check_call(['sudo', 'chmod', '777', self.mnt], 602 | close_fds=True) 603 | # Try to ignore the format and mount in write stats: 604 | subprocess.check_call(['sync'], close_fds=True) 605 | self.start_bytes = self._get_dev_writes() 606 | def unmount(self): 607 | # Catch all fs activity in write stats: 608 | subprocess.check_call(['sync'], close_fds=True) 609 | # Get write number before unmount to avoid including its activity 610 | stop_bytes = self._get_dev_writes() 611 | subprocess.check_call(['sudo', 'umount', self.mnt], 612 | close_fds=True) 613 | self.mounted = False 614 | return (stop_bytes - self.start_bytes, (-1, -1)) 615 | 616 | def run(fs, benches, profile): 617 | for name, clz in benches: 618 | pinfile = None 619 | if profile: 620 | pinfile = 'pin-' + name + '.log' 621 | sys.stdout.write('Benchmark ' + name + ': ') 622 | b = clz() 623 | b.mnt = fs.mnt 624 | fs.format() 625 | 626 | if hasattr(b, 'prepare'): 627 | fs.mount() 628 | b.prepare() 629 | fs.unmount() 630 | 631 | fs.mount(pinfile=pinfile, count=True) 632 | b.run() 633 | (bytes, (cow_bytes, cow_blocks)) = fs.unmount() 634 | 635 | sys.stdout.write(str(bytes) + ' bytes') 636 | if hasattr(b, 'opt'): 637 | delta = bytes - b.opt 638 | delta = '%+d' % delta 639 | sys.stdout.write(' (' + delta + ' bytes') 640 | if b.opt: 641 | factor = float(delta) / float(b.opt) 642 | factor = '%+.2f' % factor 643 | sys.stdout.write(' = ' + factor + 'x') 644 | sys.stdout.write(')') 645 | if cow_bytes != -1: 646 | sys.stdout.write(' (cow: ' + 
str(cow_bytes) + ' bytes in ' + str(cow_blocks) + ' blocks)') 647 | print '' 648 | if profile: 649 | #subprocess.check_call(['cat'], stdin=open(pinfile)) 650 | subprocess.check_call(['./bench/parse_bpramcount'], 651 | stdin=open(pinfile)) 652 | os.unlink(pinfile) 653 | sys.stdout.flush() 654 | 655 | def usage(): 656 | print 'Usage: ' + sys.argv[0] + ' [-h|--help] [-t FS [-d DEV]] [-p] [BENCHMARK ...]' 657 | print '\t-t FS: use file system FS (e.g., bpfs or ext4)' 658 | print '\t-d DEV: use DEV for (non-bpfs) file system backing' 659 | print '\t-p: profile each run (bpfs only)' 660 | print '\tThree meta benchmark names exist: all, micro, and macro' 661 | print '\tSpecifying no benchmarks runs all micro benchmarks' 662 | 663 | def main(): 664 | try: 665 | opts, bench_names = getopt.getopt(sys.argv[1:], 'hpt:d:', ['help']) 666 | except getopt.GetoptError, err: 667 | print str(err) 668 | sys.exit(1) 669 | profile = False 670 | benches = [] 671 | fs_name = 'bpfs' 672 | dev = None 673 | fs = None 674 | for o, a in opts: 675 | if o == '-t': 676 | fs_name = a 677 | elif o == '-d': 678 | dev = a 679 | elif o == '-p': 680 | profile = True 681 | elif o in ('-h', '--help'): 682 | usage() 683 | sys.exit() 684 | else: 685 | assert False, 'unhandled option' 686 | 687 | if not bench_names: 688 | benches = list(benchmarks.micro()) 689 | else: 690 | all_benches = dict(benchmarks.all()) 691 | for name in bench_names: 692 | if name in all_benches: 693 | benches.append((name, all_benches[name])) 694 | elif name == 'all': 695 | benches.extend(benchmarks.all()) 696 | elif name == 'micro': 697 | benches.extend(benchmarks.micro()) 698 | elif name == 'macro': 699 | benches.extend(benchmarks.macro()) 700 | else: 701 | print '"%s" is not a benchmark' % name 702 | 703 | if fs_name == 'bpfs': 704 | bpfs_size = 32 705 | for (name, obj) in benches: 706 | if hasattr(obj, 'free_space'): 707 | bpfs_size = max(bpfs_size, obj.free_space) 708 | fs = filesystem_bpfs(bpfs_size) 709 | else: 710 | if dev == None: 711 | raise NameError('Must provide a backing device for ' + fs_name) 712 | fs = filesystem_kernel(fs_name, dev) 713 | 714 | run(fs, benches, profile) 715 | 716 | 717 | if __name__ == '__main__': 718 | main() 719 | -------------------------------------------------------------------------------- /bench/parse_bpramcount: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 4 | # University of California. It is distributed under the terms of version 2 5 | # of the GNU GPL. See the file LICENSE for details. 
6 | 7 | # Print a bpramcount log with function names in place of instruction pointers 8 | # NOTE: addr2line only seems useful when bpfs is compiled with -O0 9 | 10 | import subprocess 11 | import sys 12 | import os 13 | 14 | bpfs = os.path.dirname(sys.argv[0]) + '/../bpfs' 15 | 16 | ap = subprocess.Popen(['addr2line', '-f', '-e', bpfs], 17 | stdin=subprocess.PIPE, 18 | stdout=subprocess.PIPE) 19 | 20 | in_backtraces = False 21 | 22 | backtraces = dict() 23 | 24 | for line in sys.stdin: 25 | line = line[:-1] 26 | if (not in_backtraces): 27 | if (line == 'write backtraces start:'): 28 | in_backtraces = True 29 | if (line.startswith('total number of bytes written: ')): 30 | sys.stdout.write('# ' + line + '\n'); 31 | continue 32 | if (in_backtraces and line == 'write backtraces end'): 33 | in_backtraces = False 34 | break 35 | 36 | backtrace = [] 37 | 38 | linelist = line.split() 39 | nbytes = int(linelist[0]) 40 | ips = linelist[1:] 41 | 42 | for ip in ips: 43 | ap.stdin.write("%s\n" % ip) 44 | ap.stdin.flush() 45 | function = ap.stdout.readline()[:-1] 46 | fileline = ap.stdout.readline()[:-1].split(':') 47 | filename = fileline[0] 48 | filename = filename.split('/')[-1] # just the file, not dirs 49 | lineno = fileline[1] 50 | if function == '??' and filename == '??' and lineno == '0': 51 | backtrace.append(ip) 52 | else: 53 | backtrace.append("%s:%s" % (function, lineno)) 54 | 55 | backtrace = tuple(backtrace) 56 | if backtrace in backtraces: 57 | backtraces[backtrace] += nbytes 58 | else: 59 | backtraces[backtrace] = nbytes 60 | 61 | def backtrace_sort(x, y): 62 | if (x[0] > y[0]): 63 | return 1 64 | elif x[0] == y[0]: 65 | return 0 66 | else: 67 | return -1 68 | 69 | backtraces_list = map(lambda rec: (rec[1], rec[0]), backtraces.iteritems()) 70 | backtraces_list.sort(backtrace_sort) 71 | backtraces_list.reverse() 72 | 73 | for nbytes, backtrace in backtraces_list: 74 | line = str(nbytes) 75 | for elt in backtrace: 76 | line += ' ' + elt 77 | print line 78 | -------------------------------------------------------------------------------- /bench/postmark.large.config: -------------------------------------------------------------------------------- 1 | #set location /mnt 2 | set size 512 16777216 3 | set write 4096 4 | set read 4096 5 | run 6 | -------------------------------------------------------------------------------- /bench/postmark.small.config: -------------------------------------------------------------------------------- 1 | #set location /mnt 2 | set number 100 3 | set size 512 1048576 4 | set write 4096 5 | set read 4096 6 | run 7 | -------------------------------------------------------------------------------- /bpfs.h: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. */ 4 | 5 | #ifndef BPFS_H 6 | #define BPFS_H 7 | 8 | #include "bpfs_structs.h" 9 | 10 | #include 11 | #include 12 | 13 | #define MODE_SP 1 14 | #define MODE_SCSP 2 15 | #define MODE_BPFS 3 16 | 17 | #define COMMIT_MODE MODE_BPFS 18 | 19 | // Allow in-place append writes 20 | #define SCSP_OPT_APPEND (1 && COMMIT_MODE == MODE_SCSP) 21 | // Write [acm]time independently of the commit 22 | #define SCSP_OPT_TIME (1 && COMMIT_MODE == MODE_SCSP) 23 | 24 | #define APPEASE_VALGRIND 0 25 | // Detect when an inode is used that should no longer be linked into any dir. 
26 | // NOTE: This causes additional writes. 27 | #define DETECT_ZEROLINKS_WITH_LINKS (0 && !defined(NDEBUG)) 28 | 29 | #define SCSP_OPT_DIRECT (SCSP_OPT_APPEND || SCSP_OPT_TIME) 30 | #define INDIRECT_COW (COMMIT_MODE == MODE_SCSP) 31 | 32 | // TODO: rephrase this as you-see-everything-p? 33 | // NOTE: this doesn't describe situations where the top block is already COWed 34 | // but child blocks are refed by the original top block. 35 | enum commit { 36 | COMMIT_NONE, // no writes allowed 37 | COMMIT_COPY, // writes only to copies 38 | #if COMMIT_MODE == MODE_BPFS 39 | COMMIT_ATOMIC, // write in-place if write is atomic; otherwise, copy 40 | #else 41 | COMMIT_ATOMIC = COMMIT_COPY, 42 | #endif 43 | COMMIT_FREE, // no restrictions on writes (e.g., region is not yet refed) 44 | }; 45 | 46 | // Max size that can be written atomically (hardcoded for unsafe 32b testing) 47 | #define ATOMIC_SIZE 8 48 | 49 | #define BPFS_EOF UINT64_MAX 50 | 51 | uint64_t cow_block(uint64_t old_blockno, 52 | unsigned off, unsigned size, unsigned valid); 53 | uint64_t cow_block_hole(unsigned off, unsigned size, unsigned valid); 54 | uint64_t cow_block_entire(uint64_t old_blockno); 55 | 56 | #if COMMIT_MODE != MODE_BPFS 57 | bool block_freshly_alloced(uint64_t blockno); 58 | #endif 59 | 60 | uint64_t tree_max_nblocks(uint64_t height); 61 | uint64_t tree_height(uint64_t nblocks); 62 | int tree_change_height(struct bpfs_tree_root *root, 63 | unsigned new_height, 64 | enum commit commit, uint64_t *blockno); 65 | 66 | void set_super(struct bpfs_super *super); 67 | struct bpfs_super* get_bpram_super(void); 68 | struct bpfs_super* get_super(void); 69 | #if COMMIT_MODE == MODE_SCSP 70 | uint64_t get_super_blockno(void); 71 | #endif 72 | 73 | char* get_block(uint64_t blockno); 74 | static __inline 75 | unsigned block_offset(const void *x) __attribute__((always_inline)); 76 | void unfree_block(uint64_t blockno); 77 | void unalloc_block(uint64_t blockno); 78 | 79 | struct bpfs_tree_root* get_inode_root(void); 80 | int get_inode_offset(uint64_t ino, uint64_t *poffset); 81 | 82 | void ha_set_addr(struct height_addr *pha, uint64_t addr); 83 | void ha_set(struct height_addr *pha, uint64_t height, uint64_t addr); 84 | 85 | uint64_t tree_root_height(const struct bpfs_tree_root *root); 86 | uint64_t tree_root_addr(const struct bpfs_tree_root *root); 87 | 88 | int truncate_block_zero(struct bpfs_tree_root *root, 89 | uint64_t begin, uint64_t end, uint64_t valid, 90 | uint64_t *blockno); 91 | 92 | 93 | static __inline 94 | unsigned block_offset(const void *x) 95 | { 96 | return ((uintptr_t) x) % BPFS_BLOCK_SIZE; 97 | } 98 | 99 | #endif 100 | -------------------------------------------------------------------------------- /bpfs_structs.h: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. 
*/ 4 | 5 | #ifndef BPFS_STRUCTS_H 6 | #define BPFS_STRUCTS_H 7 | 8 | #include "util.h" 9 | 10 | #include 11 | 12 | #define BPFS_FS_MAGIC 0xB9F5 13 | 14 | #define BPFS_STRUCT_VERSION 7 15 | 16 | #define BPFS_BLOCK_SIZE 4096 17 | 18 | #define BPFS_BLOCKNO_INVALID 0 19 | #define BPFS_BLOCKNO_SUPER 1 20 | #define BPFS_BLOCKNO_SUPER_2 2 21 | #define BPFS_BLOCKNO_FIRST_ALLOC 3 22 | 23 | #define BPFS_INO_INVALID 0 24 | #define BPFS_INO_ROOT 1 25 | 26 | #define BPFS_S_IFMT 0xF000 27 | #define BPFS_S_IFSOCK 0xC000 28 | #define BPFS_S_IFLNK 0xA000 29 | #define BPFS_S_IFREG 0x8000 30 | #define BPFS_S_IFBLK 0x6000 31 | #define BPFS_S_IFDIR 0x4000 32 | #define BPFS_S_IFCHR 0x2000 33 | #define BPFS_S_IFIFO 0x1000 34 | 35 | #define __BPFS_S_ISTYPE(mode, mask) (((mode) & BPFS_S_IFMT) == (mask)) 36 | #define BPFS_S_ISSOCK(mode) __BPFS_S_ISTYPE((mode), BPFS_S_IFSOCK) 37 | #define BPFS_S_ISLNK(mode) __BPFS_S_ISTYPE((mode), BPFS_S_IFLNK) 38 | #define BPFS_S_ISREG(mode) __BPFS_S_ISTYPE((mode), BPFS_S_IFREG) 39 | #define BPFS_S_ISBLK(mode) __BPFS_S_ISTYPE((mode), BPFS_S_IFBLK) 40 | #define BPFS_S_ISDIR(mode) __BPFS_S_ISTYPE((mode), BPFS_S_IFDIR) 41 | #define BPFS_S_ISCHR(mode) __BPFS_S_ISTYPE((mode), BPFS_S_IFCHR) 42 | #define BPFS_S_ISFIFO(mode) __BPFS_S_ISTYPE((mode), BPFS_S_IFIFO) 43 | 44 | #define BPFS_S_IPERM 0x0FFF 45 | #define BPFS_S_ISUID 0x0800 // SUID 46 | #define BPFS_S_ISGID 0x0400 // SGID 47 | #define BPFS_S_ISVTX 0x0200 // sticky bit 48 | #define BPFS_S_IRWXU 0x01C0 // user access rights mask 49 | #define BPFS_S_IRUSR 0x0100 // read 50 | #define BPFS_S_IWUSR 0x0080 // write 51 | #define BPFS_S_IXUSR 0x0040 // execute 52 | #define BPFS_S_IRWXG 0x0038 // group access rights mask 53 | #define BPFS_S_IRGRP 0x0020 // read 54 | #define BPFS_S_IWGRP 0x0010 // write 55 | #define BPFS_S_IXGRP 0x0008 // execute 56 | #define BPFS_S_IRWXO 0x0007 // others access rights mask 57 | #define BPFS_S_IROTH 0x0004 // read 58 | #define BPFS_S_IWOTH 0x0002 // write 59 | #define BPFS_S_IXOTH 0x0001 // execute 60 | 61 | #define BPFS_TYPE_UNKNOWN 0 62 | #define BPFS_TYPE_FILE 1 63 | #define BPFS_TYPE_DIR 2 64 | #define BPFS_TYPE_CHRDEV 3 65 | #define BPFS_TYPE_BLKDEV 4 66 | #define BPFS_TYPE_FIFO 5 67 | #define BPFS_TYPE_SOCK 6 68 | #define BPFS_TYPE_SYMLINK 7 69 | 70 | #define BPFS_TREE_LOG_MAX_HEIGHT 3 71 | #define BPFS_TREE_MAX_HEIGHT ((((uint64_t) 1) << BPFS_TREE_LOG_MAX_HEIGHT) - 1) 72 | #define BPFS_TREE_LOG_ROOT_MAX_ADDR (sizeof(uint64_t) * 8 - BPFS_TREE_LOG_MAX_HEIGHT) 73 | #define BPFS_TREE_ROOT_MAX_ADDR ((((uint64_t) 1) << BPFS_TREE_LOG_ROOT_MAX_ADDR) - 1) 74 | 75 | struct height_addr 76 | { 77 | uint64_t height : BPFS_TREE_LOG_MAX_HEIGHT; // #levels of indir blocks 78 | uint64_t addr : BPFS_TREE_LOG_ROOT_MAX_ADDR; 79 | }; 80 | 81 | struct bpfs_tree_root 82 | { 83 | struct height_addr ha; // valid iff !!nbytes 84 | uint64_t nbytes; 85 | }; 86 | 87 | // bpfs_super.commit_mode options: 88 | #define BPFS_COMMIT_SP 0 89 | #define BPFS_COMMIT_SCSP 1 90 | 91 | struct bpfs_super 92 | { 93 | uint32_t magic; 94 | uint32_t version; 95 | uint8_t uuid[16]; 96 | uint64_t nblocks; 97 | uint64_t inode_root_addr; // block number containing the inode tree root 98 | uint64_t inode_root_addr_2; // only used with SP; for commit consistency 99 | uint8_t commit_mode; 100 | uint8_t ephemeral_valid; // for SCSP, inode link count validity 101 | uint8_t pad[4046]; // pad to full block 102 | }; 103 | 104 | 105 | #define BPFS_BLOCKNOS_PER_INDIR (BPFS_BLOCK_SIZE / sizeof(uint64_t)) 106 | 107 | struct bpfs_indir_block 108 | { 109 | uint64_t 
addr[BPFS_BLOCKNOS_PER_INDIR]; 110 | }; 111 | 112 | 113 | struct bpfs_time 114 | { 115 | uint32_t sec; 116 | // uint32_t ns; 117 | }; 118 | 119 | struct bpfs_inode 120 | { 121 | uint64_t generation; 122 | uint32_t uid; 123 | uint32_t gid; 124 | uint32_t mode; 125 | uint32_t nlinks; // valid at mount iff bpfs_super.ephemeral_valid 126 | uint64_t flags; 127 | struct bpfs_tree_root root; 128 | struct bpfs_time atime; 129 | struct bpfs_time ctime; 130 | struct bpfs_time mtime; 131 | uint8_t pad[68]; // pad to evenly fill a block 132 | }; 133 | 134 | #define BPFS_INODES_PER_BLOCK (BPFS_BLOCK_SIZE / sizeof(struct bpfs_inode)) 135 | 136 | 137 | struct bpfs_dirent 138 | { 139 | uint64_t ino; 140 | uint16_t rec_len; 141 | uint8_t file_type; 142 | uint8_t name_len; 143 | char name[]; 144 | } __attribute__((packed)); // pack rather than manually pad for char name[] 145 | 146 | #define BPFS_DIRENT_ALIGN 8 147 | #define BPFS_DIRENT_MAX_NAME_LEN \ 148 | MIN(BPFS_BLOCK_SIZE - sizeof(struct bpfs_dirent), \ 149 | 1 << (sizeof(((struct bpfs_dirent*) NULL)->name_len) * 8)) 150 | #define BPFS_DIRENT_LEN(name_len) \ 151 | ROUNDUP64(sizeof(struct bpfs_dirent) + (name_len), BPFS_DIRENT_ALIGN) 152 | #define BPFS_DIRENT_MIN_LEN BPFS_DIRENT_LEN(0) 153 | 154 | 155 | // static_assert() must be used in a function, so declare one solely for this 156 | // purpose. It returns its own address to avoid an unused function warning. 157 | static inline void* __bpfs_structs_static_asserts(void) 158 | { 159 | static_assert(sizeof(struct height_addr) == 8); // need to set atomically 160 | static_assert(!(sizeof(struct bpfs_tree_root) % 8)); 161 | static_assert(sizeof(struct bpfs_super) == BPFS_BLOCK_SIZE); 162 | static_assert(sizeof(struct bpfs_indir_block) == BPFS_BLOCK_SIZE); 163 | static_assert(sizeof(struct bpfs_time) == 4); 164 | static_assert(sizeof(struct bpfs_inode) == 128); // fit evenly in a block 165 | // struct bpfs_dirent itself does not have alignment restrictions 166 | static_assert(sizeof(struct bpfs_dirent) == 12); 167 | static_assert(!(BPFS_DIRENT_MIN_LEN % 8)); 168 | return __bpfs_structs_static_asserts; 169 | } 170 | 171 | #endif 172 | -------------------------------------------------------------------------------- /crawler.c: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. 
*/ 4 | 5 | #include "crawler.h" 6 | #include "bpfs.h" 7 | #include "indirect_cow.h" 8 | #include "util.h" 9 | 10 | #include 11 | #include 12 | 13 | 14 | // 15 | // Core crawler 16 | 17 | static char zero_block[BPFS_BLOCK_SIZE] 18 | __attribute__((aligned(BPFS_BLOCK_SIZE))); 19 | 20 | static int crawl_leaf(uint64_t prev_blockno, uint64_t blockoff, 21 | unsigned off, unsigned size, unsigned valid, 22 | uint64_t crawl_start, enum commit commit, 23 | crawl_callback callback, void *user, 24 | crawl_blockno_callback bcallback, 25 | uint64_t *new_blockno) 26 | { 27 | uint64_t blockno = prev_blockno; 28 | bool is_hole = blockno == BPFS_BLOCKNO_INVALID && commit == COMMIT_NONE; 29 | uint64_t child_blockno; 30 | int r; 31 | 32 | assert(crawl_start / BPFS_BLOCK_SIZE <= blockoff); 33 | assert(off < BPFS_BLOCK_SIZE); 34 | assert(off + size <= BPFS_BLOCK_SIZE); 35 | assert(valid <= BPFS_BLOCK_SIZE); 36 | 37 | if (commit != COMMIT_NONE && blockno == BPFS_BLOCKNO_INVALID) 38 | { 39 | blockno = cow_block_hole(off, size, valid); 40 | if (blockno == BPFS_BLOCKNO_INVALID) 41 | return -ENOSPC; 42 | } 43 | child_blockno = blockno; 44 | 45 | if (callback) 46 | { 47 | enum commit child_commit = (child_blockno == prev_blockno) 48 | ? commit : COMMIT_FREE; 49 | char *child_block; 50 | if (is_hole) 51 | child_block = zero_block; 52 | else 53 | child_block = get_block(child_blockno); 54 | 55 | r = callback(blockoff, child_block, off, size, valid, 56 | crawl_start, child_commit, user, &child_blockno); 57 | if (r >= 0 && prev_blockno != child_blockno) 58 | *new_blockno = child_blockno; 59 | } 60 | else 61 | { 62 | if (!is_hole) 63 | { 64 | assert(blockno == prev_blockno); 65 | assert(bcallback); 66 | bcallback(child_blockno, true); 67 | } 68 | r = 0; 69 | } 70 | return r; 71 | } 72 | 73 | static int crawl_hole(uint64_t blockoff, 74 | uint64_t off, uint64_t size, uint64_t valid, 75 | uint64_t crawl_start, 76 | crawl_callback callback, void *user) 77 | { 78 | uint64_t off_block = ROUNDDOWN64(off, BPFS_BLOCK_SIZE); 79 | uint64_t end = off + size; 80 | 81 | assert(crawl_start / BPFS_BLOCK_SIZE <= blockoff); 82 | assert(off + size <= valid); 83 | 84 | while (off < end) 85 | { 86 | unsigned child_off = off % BPFS_BLOCK_SIZE; 87 | unsigned child_size = MIN(end - off, BPFS_BLOCK_SIZE); 88 | unsigned child_valid = MIN(valid - off_block, BPFS_BLOCK_SIZE); 89 | uint64_t child_blockno = BPFS_BLOCKNO_INVALID; 90 | int r; 91 | 92 | r = callback(blockoff, zero_block, child_off, child_size, child_valid, 93 | crawl_start, COMMIT_NONE, user, &child_blockno); 94 | assert(child_blockno == BPFS_BLOCKNO_INVALID); 95 | if (r != 0) 96 | return r; 97 | 98 | blockoff++; 99 | off_block += BPFS_BLOCK_SIZE; 100 | off = off_block; 101 | } 102 | 103 | return 0; 104 | } 105 | 106 | static int crawl_indir(uint64_t prev_blockno, uint64_t blockoff, 107 | uint64_t off, uint64_t size, uint64_t valid, 108 | uint64_t crawl_start, enum commit commit, 109 | unsigned height, uint64_t max_nblocks, 110 | crawl_callback callback, void *user, 111 | crawl_blockno_callback bcallback, 112 | uint64_t *new_blockno) 113 | { 114 | uint64_t blockno = prev_blockno; 115 | struct bpfs_indir_block *indir; 116 | uint64_t child_max_nblocks = max_nblocks / BPFS_BLOCKNOS_PER_INDIR; 117 | uint64_t child_max_nbytes = child_max_nblocks * BPFS_BLOCK_SIZE; 118 | uint64_t firstno = off / child_max_nbytes; 119 | uint64_t lastno = (off + size - 1) / child_max_nbytes; 120 | uint64_t validno = (valid + child_max_nbytes - 1) / child_max_nbytes; 121 | uint64_t in_hole = false; 122 | bool 
only_invalid = off >= valid; 123 | enum commit child_commit; 124 | uint64_t no; 125 | int ret = 0; 126 | 127 | switch (commit) { 128 | #if COMMIT_MODE == MODE_BPFS 129 | case COMMIT_ATOMIC: 130 | child_commit = (firstno == lastno || only_invalid) 131 | ? commit : COMMIT_COPY; 132 | break; 133 | #endif 134 | case COMMIT_FREE: 135 | case COMMIT_COPY: 136 | case COMMIT_NONE: 137 | child_commit = commit; 138 | break; 139 | } 140 | 141 | if (blockno == BPFS_BLOCKNO_INVALID) 142 | { 143 | if (commit == COMMIT_NONE) 144 | return crawl_hole(blockoff, off, size, valid, crawl_start, 145 | callback, user); 146 | 147 | static_assert(BPFS_BLOCKNO_INVALID == 0); 148 | blockno = cow_block_hole(firstno * sizeof(indir->addr[0]), 149 | (lastno + 1 - firstno) * sizeof(indir->addr[0]), 150 | validno * sizeof(indir->addr[0])); 151 | // indirect_cow_block_required(blockno) not required 152 | if (blockno == BPFS_BLOCKNO_INVALID) 153 | return -ENOSPC; 154 | in_hole = true; 155 | } 156 | indir = (struct bpfs_indir_block*) get_block(blockno); 157 | 158 | for (no = firstno; no <= lastno; no++) 159 | { 160 | uint64_t child_off, child_size, child_valid; 161 | uint64_t child_blockno, child_new_blockno; 162 | uint64_t child_blockoff; 163 | int r; 164 | 165 | if (no == firstno) 166 | { 167 | child_off = off % child_max_nbytes; 168 | child_blockoff = blockoff; 169 | } 170 | else 171 | { 172 | child_off = 0; 173 | child_blockoff = blockoff + (no - firstno) * child_max_nblocks 174 | - ((off % child_max_nbytes) / BPFS_BLOCK_SIZE); 175 | } 176 | assert(blockoff <= child_blockoff); 177 | 178 | if (no == lastno) 179 | child_size = off + size - (no * child_max_nbytes + child_off); 180 | else 181 | child_size = child_max_nbytes - child_off; 182 | assert(child_size <= size); 183 | assert(child_size <= child_max_nbytes); 184 | 185 | if (no < validno) 186 | { 187 | if ((no + 1) * child_max_nbytes <= valid) 188 | child_valid = child_max_nbytes; 189 | else 190 | child_valid = valid % child_max_nbytes; 191 | } 192 | else 193 | { 194 | child_valid = 0; 195 | } 196 | 197 | if (!child_valid || in_hole) 198 | child_blockno = child_new_blockno = BPFS_BLOCKNO_INVALID; 199 | else 200 | child_blockno = child_new_blockno = indir->addr[no]; 201 | 202 | if (commit != COMMIT_NONE) 203 | xcall(indirect_cow_parent_push(blockno)); 204 | if (height == 1) 205 | r = crawl_leaf(child_blockno, child_blockoff, 206 | child_off, child_size, child_valid, 207 | crawl_start, child_commit, callback, user, 208 | bcallback, &child_new_blockno); 209 | else 210 | r = crawl_indir(child_blockno, child_blockoff, 211 | child_off, child_size, child_valid, 212 | crawl_start, child_commit, 213 | height - 1, child_max_nblocks, 214 | callback, user, bcallback, 215 | &child_new_blockno); 216 | if (commit != COMMIT_NONE) 217 | indirect_cow_parent_pop(blockno); 218 | if (r < 0) 219 | return r; 220 | if (child_blockno != child_new_blockno || in_hole) 221 | { 222 | bool single = firstno == lastno || r == 1; 223 | assert(commit != COMMIT_NONE); 224 | if (!(prev_blockno != blockno 225 | || (SCSP_OPT_APPEND && only_invalid) 226 | || (COMMIT_MODE == MODE_BPFS 227 | && ((commit == COMMIT_ATOMIC && single) 228 | || !child_valid)))) 229 | { 230 | #if COMMIT_MODE == MODE_BPFS 231 | // Could avoid the CoW in this case, but it should not occur: 232 | assert(!(commit == COMMIT_ATOMIC && only_invalid)); 233 | #endif 234 | // TODO: avoid copying data that will be overwritten? 
235 | if ((blockno = cow_block_entire(blockno)) 236 | == BPFS_BLOCKNO_INVALID) 237 | return -ENOSPC; 238 | // indirect_cow_block_required(blockno) not required 239 | indir = (struct bpfs_indir_block*) get_block(blockno); 240 | } 241 | indir->addr[no] = child_new_blockno; 242 | #if INDIRECT_COW 243 | // Neccessary for plugging a file hole. 244 | // There may be broader related problems, e.g, when increasing 245 | // a tree's height?, but so far I've not noticed any breakage. 246 | // Might alternatively or additionally consider supporting 247 | // in cow_is_atomically_writable() (!block->orig_blkno -> false) 248 | // and adding a required() call in/after cow_block_hole(). 249 | if (child_blockno == BPFS_INO_INVALID && block_freshly_alloced(blockno)) 250 | indirect_cow_block_required(blockno); 251 | #endif 252 | if (SCSP_OPT_APPEND && only_invalid) 253 | indirect_cow_block_direct(blockno, no * sizeof(*indir->addr), 254 | sizeof(*indir->addr)); 255 | } 256 | if (r == 1) 257 | { 258 | assert(!in_hole); // TODO: set the remaining entries to invalid 259 | ret = 1; 260 | break; 261 | } 262 | } 263 | 264 | if (bcallback && !off) 265 | { 266 | assert(commit == COMMIT_NONE); 267 | assert(prev_blockno == blockno); 268 | bcallback(blockno, false); 269 | } 270 | 271 | if (prev_blockno != blockno) 272 | *new_blockno = blockno; 273 | return ret; 274 | } 275 | 276 | // 277 | // crawl_blocknos() 278 | // Read-only crawl over the indirect and data blocks in root 279 | 280 | void crawl_blocknos(const struct bpfs_tree_root *root, 281 | uint64_t off, uint64_t size, 282 | crawl_blockno_callback callback) 283 | { 284 | uint64_t max_nblocks = tree_max_nblocks(tree_root_height(root)); 285 | uint64_t max_nbytes = max_nblocks * BPFS_BLOCK_SIZE; 286 | uint64_t valid; 287 | 288 | /* convenience */ 289 | if (off == BPFS_EOF) 290 | off = root->nbytes; 291 | assert(!off || off < root->nbytes); 292 | if (size == BPFS_EOF) 293 | size = root->nbytes - off; 294 | else 295 | assert(off + size <= root->nbytes); 296 | assert(size <= root->nbytes); 297 | assert(off + size <= root->nbytes); 298 | 299 | if (!(off + size)) 300 | return; 301 | 302 | size = MIN(size, max_nbytes - off); 303 | valid = MIN(root->nbytes, max_nbytes); 304 | 305 | 306 | if (!tree_root_height(root)) 307 | { 308 | if (!off) 309 | crawl_leaf(tree_root_addr(root), 0, off, size, valid, off, 310 | COMMIT_NONE, NULL, NULL, callback, NULL); 311 | } 312 | else 313 | { 314 | crawl_indir(tree_root_addr(root), off / BPFS_BLOCK_SIZE, 315 | off, size, valid, off, COMMIT_NONE, 316 | tree_root_height(root), max_nblocks, 317 | NULL, NULL, callback, NULL); 318 | } 319 | } 320 | 321 | // 322 | // crawl_tree() 323 | 324 | static int crawl_tree_ref(struct bpfs_tree_root *root, uint64_t off, 325 | uint64_t size, enum commit commit, 326 | crawl_callback callback, void *user, 327 | uint64_t *prev_blockno, bool blockno_refed) 328 | { 329 | uint64_t new_blockno = *prev_blockno; 330 | unsigned root_off = block_offset(root); 331 | uint64_t end; 332 | uint64_t max_nblocks; 333 | uint64_t child_new_blockno; 334 | uint64_t child_size; 335 | uint64_t child_valid; 336 | enum commit child_commit; 337 | bool change_height_holes = false; 338 | int r; 339 | 340 | /* convenience to help callers avoid get_inode() calls */ 341 | if (off == BPFS_EOF) 342 | off = root->nbytes; 343 | if (size == BPFS_EOF) 344 | { 345 | assert(root->nbytes >= off); 346 | size = root->nbytes - off; 347 | } 348 | end = off + size; 349 | 350 | assert(commit != COMMIT_NONE || end <= root->nbytes); 351 | 352 | if (commit 
!= COMMIT_NONE) 353 | { 354 | uint64_t prev_height = tree_root_height(root); 355 | uint64_t requested_height = tree_height(NBLOCKS_FOR_NBYTES(end)); 356 | uint64_t new_height = MAXU64(prev_height, requested_height); 357 | #ifndef NDEBUG 358 | uint64_t new_max_nblocks = tree_max_nblocks(new_height); 359 | #endif 360 | uint64_t int_valid = MIN(root->nbytes, 361 | BPFS_BLOCK_SIZE 362 | * tree_max_nblocks(new_height)); 363 | #ifndef NDEBUG 364 | uint64_t new_valid = MIN(MAX(root->nbytes, end), 365 | BPFS_BLOCK_SIZE 366 | * tree_max_nblocks(new_height)); 367 | #endif 368 | 369 | // FYI: 370 | assert(end <= new_valid); 371 | assert(root->nbytes >= new_valid || (root->nbytes < end && end == new_valid)); 372 | assert(root->nbytes <= new_valid || root->nbytes > end); 373 | assert(root->nbytes != new_valid || root->nbytes >= end); 374 | assert(new_valid <= BPFS_BLOCK_SIZE * new_max_nblocks); 375 | 376 | if (prev_height < new_height) 377 | { 378 | r = tree_change_height(root, new_height, COMMIT_ATOMIC, &new_blockno); 379 | if (r < 0) 380 | return r; 381 | if (*prev_blockno != new_blockno) 382 | { 383 | root = (struct bpfs_tree_root*) 384 | (get_block(new_blockno) + root_off); 385 | change_height_holes = true; 386 | } 387 | } 388 | 389 | if (int_valid < off) 390 | { 391 | r = truncate_block_zero(root, int_valid, off, int_valid, 392 | &new_blockno); 393 | if (r < 0) 394 | return r; 395 | if (*prev_blockno != new_blockno) 396 | { 397 | root = (struct bpfs_tree_root*) 398 | (get_block(new_blockno) + root_off); 399 | change_height_holes = true; 400 | } 401 | } 402 | } 403 | 404 | child_new_blockno = tree_root_addr(root); 405 | max_nblocks = tree_max_nblocks(root->ha.height); 406 | if (commit != COMMIT_NONE) 407 | { 408 | child_size = size; 409 | } 410 | else 411 | { 412 | assert(end <= root->nbytes); 413 | child_size = MIN(size, max_nblocks * BPFS_BLOCK_SIZE - off); 414 | } 415 | child_valid = MIN(root->nbytes, max_nblocks * BPFS_BLOCK_SIZE); 416 | 417 | if (commit == COMMIT_NONE || commit == COMMIT_FREE || commit == COMMIT_COPY) 418 | child_commit = commit; 419 | else if (off < root->nbytes && root->nbytes < end) 420 | child_commit = COMMIT_COPY; // data needs atomic commit with nbytes 421 | else 422 | child_commit = commit; 423 | 424 | if (commit != COMMIT_NONE) 425 | xcall(indirect_cow_parent_push(new_blockno)); 426 | if (!root->ha.height) 427 | { 428 | if (child_size) 429 | r = crawl_leaf(child_new_blockno, 0, off, child_size, 430 | child_valid, off, 431 | child_commit, callback, user, NULL, 432 | &child_new_blockno); 433 | else 434 | r = 0; 435 | } 436 | else 437 | { 438 | r = crawl_indir(child_new_blockno, off / BPFS_BLOCK_SIZE, 439 | off, child_size, child_valid, 440 | off, child_commit, root->ha.height, max_nblocks, 441 | callback, user, NULL, &child_new_blockno); 442 | } 443 | if (commit != COMMIT_NONE) 444 | indirect_cow_parent_pop(new_blockno); 445 | 446 | if (r >= 0) 447 | { 448 | bool change_addr = tree_root_addr(root) != child_new_blockno; 449 | bool change_size = end > root->nbytes; 450 | 451 | if (commit == COMMIT_NONE) 452 | { 453 | assert(!change_addr && !change_size); 454 | assert(*prev_blockno == new_blockno); 455 | if (r == 0) 456 | r = crawl_hole((off + child_size) / BPFS_BLOCK_SIZE, 457 | child_size, size - child_size, root->nbytes, 458 | off, callback, user); 459 | } 460 | else if (change_addr || change_size || change_height_holes) 461 | { 462 | bool overwrite = off < root->nbytes; 463 | bool inplace; 464 | 465 | // FYI: 466 | assert(!(!change_addr && overwrite && change_size)); 
467 | 468 | #if COMMIT_MODE != MODE_BPFS 469 | assert(blockno_refed || block_freshly_alloced(*prev_blockno)); 470 | #endif 471 | 472 | if (*prev_blockno != new_blockno || !blockno_refed) 473 | inplace = true; 474 | else if (change_addr && overwrite && change_size) 475 | inplace = commit == COMMIT_FREE; 476 | else 477 | { 478 | inplace = commit == COMMIT_FREE; 479 | #if COMMIT_MODE == MODE_BPFS 480 | static_assert(COMMIT_ATOMIC != COMMIT_COPY); 481 | inplace = inplace || commit == COMMIT_ATOMIC; 482 | #endif 483 | } 484 | 485 | if (!inplace) 486 | { 487 | new_blockno = cow_block_entire(new_blockno); 488 | if (new_blockno == BPFS_BLOCKNO_INVALID) 489 | return -ENOSPC; 490 | if (change_size) 491 | indirect_cow_block_required(new_blockno); 492 | // else indirect_cow_block_required(new_blockno) not required 493 | root = (struct bpfs_tree_root*) 494 | (get_block(new_blockno) + root_off); 495 | } 496 | 497 | if (change_addr) 498 | { 499 | ha_set_addr(&root->ha, child_new_blockno); 500 | #if SCSP_OPT_APPEND 501 | if (!root->nbytes) 502 | indirect_cow_block_direct(new_blockno, 503 | block_offset(&root->ha), 504 | sizeof(root->ha)); 505 | #endif 506 | } 507 | if (change_size) 508 | root->nbytes = end; 509 | 510 | *prev_blockno = new_blockno; 511 | } 512 | else 513 | { 514 | assert(*prev_blockno == new_blockno); 515 | } 516 | } 517 | 518 | return r; 519 | } 520 | 521 | int crawl_tree(struct bpfs_tree_root *root, uint64_t off, 522 | uint64_t size, enum commit commit, 523 | crawl_callback callback, void *user, 524 | uint64_t *prev_blockno) 525 | { 526 | return crawl_tree_ref(root, off, size, commit, callback, user, prev_blockno, 527 | true); 528 | } 529 | 530 | // 531 | // crawl_inodes() 532 | 533 | int crawl_inodes(uint64_t off, uint64_t size, enum commit commit, 534 | crawl_callback callback, void *user) 535 | { 536 | struct bpfs_tree_root *root = get_inode_root(); 537 | struct bpfs_super *super = get_super(); 538 | uint64_t super_blockno = get_super_blockno(); 539 | uint64_t child_blockno = super->inode_root_addr; 540 | int r; 541 | 542 | if (commit != COMMIT_NONE) 543 | xcall(indirect_cow_parent_push(super_blockno)); 544 | r = crawl_tree(root, off, size, commit, callback, user, 545 | &child_blockno); 546 | if (commit != COMMIT_NONE) 547 | indirect_cow_parent_pop(super_blockno); 548 | 549 | if (r >= 0 && child_blockno != super->inode_root_addr) 550 | { 551 | #if COMMIT_MODE == BPFS 552 | assert(commit == COMMIT_ATOMIC); 553 | #else 554 | // COPY is ok because super points at a non-persistent block 555 | assert(commit == COMMIT_COPY || commit == COMMIT_ATOMIC); 556 | #endif 557 | #if COMMIT_MODE == MODE_SCSP 558 | assert(super_blockno != BPFS_BLOCKNO_SUPER); 559 | #endif 560 | super->inode_root_addr = child_blockno; 561 | } 562 | 563 | return r; 564 | } 565 | 566 | // 567 | // crawl_inode() 568 | 569 | struct callback_crawl_inode_data { 570 | crawl_callback_inode callback; 571 | void *user; 572 | }; 573 | 574 | static int callback_crawl_inode(uint64_t blockoff, char *block, 575 | unsigned off, unsigned size, unsigned valid, 576 | uint64_t crawl_start, enum commit commit, 577 | void *ccid_void, uint64_t *blockno) 578 | { 579 | struct callback_crawl_inode_data *ccid = (struct callback_crawl_inode_data*) ccid_void; 580 | struct bpfs_inode *inode = (struct bpfs_inode*) (block + off); 581 | 582 | assert(size == sizeof(struct bpfs_inode)); 583 | 584 | return ccid->callback(block, off, inode, commit, ccid->user, blockno); 585 | } 586 | 587 | int crawl_inode(uint64_t ino, enum commit commit, 588 | 
crawl_callback_inode callback, void *user) 589 | { 590 | struct callback_crawl_inode_data ccid = {callback, user}; 591 | uint64_t ino_off; 592 | 593 | xcall(get_inode_offset(ino, &ino_off)); 594 | 595 | return crawl_inodes(ino_off, sizeof(struct bpfs_inode), commit, 596 | callback_crawl_inode, &ccid); 597 | } 598 | 599 | // 600 | // crawl_data() 601 | 602 | struct callback_crawl_data_data { 603 | uint64_t off; 604 | uint64_t size; 605 | crawl_callback callback; 606 | void *user; 607 | }; 608 | 609 | static int callback_crawl_data(char *block, unsigned off, 610 | struct bpfs_inode *inode, enum commit commit, 611 | void *ccdd_void, uint64_t *blockno) 612 | { 613 | struct callback_crawl_data_data *ccdd = (struct callback_crawl_data_data*) ccdd_void; 614 | 615 | return crawl_tree(&inode->root, ccdd->off, ccdd->size, commit, 616 | ccdd->callback, ccdd->user, blockno); 617 | } 618 | 619 | int crawl_data(uint64_t ino, uint64_t off, uint64_t size, 620 | enum commit commit, 621 | crawl_callback callback, void *user) 622 | { 623 | struct callback_crawl_data_data ccdd = {off, size, callback, user}; 624 | 625 | return crawl_inode(ino, commit, callback_crawl_data, &ccdd); 626 | } 627 | 628 | // 629 | // crawl_data_2() 630 | // Crawl 2: atomically commit two non-contiguous writes 631 | 632 | struct callback_crawl_data_2_data { 633 | struct ccd2dd { 634 | uint64_t ino; 635 | uint64_t ino_off; 636 | uint64_t off; 637 | uint64_t size; 638 | crawl_callback callback; 639 | void *user; 640 | } d[2]; 641 | }; 642 | 643 | static void ccd2dd_fill(struct ccd2dd *d, 644 | uint64_t ino, uint64_t off, uint64_t size, 645 | crawl_callback callback, void *user) 646 | { 647 | d->ino = ino; 648 | xcall(get_inode_offset(ino, &d->ino_off)); 649 | d->off = off; 650 | d->size = size; 651 | d->callback = callback; 652 | d->user = user; 653 | } 654 | 655 | static int callback_crawl_data_2_tree(uint64_t blockoff, char *block, 656 | unsigned off, unsigned size, 657 | unsigned valid, uint64_t crawl_start, 658 | enum commit commit, void *ccd2d_void, 659 | uint64_t *blockno) 660 | { 661 | struct callback_crawl_data_2_data *ccd2d = (struct callback_crawl_data_2_data*) ccd2d_void; 662 | uint64_t prev_blockno = *blockno; 663 | uint64_t first_offset = blockoff * BPFS_BLOCK_SIZE + off; 664 | uint64_t last_offset = first_offset + size; 665 | unsigned mask, i; 666 | 667 | mask = first_offset == ccd2d->d[0].off; 668 | mask |= (last_offset == ccd2d->d[1].off + ccd2d->d[1].size) << 1; 669 | 670 | for (i = 1; i <= 2; i++) 671 | { 672 | if (i & mask) 673 | { 674 | struct ccd2dd *d = &ccd2d->d[i >> 1]; 675 | bool new = *blockno != prev_blockno; 676 | enum commit c = new ? 
COMMIT_FREE : COMMIT_COPY; 677 | block = get_block(*blockno); 678 | int r = d->callback(blockoff, block, d->off % BPFS_BLOCK_SIZE, 679 | d->size, valid, crawl_start, c, 680 | d->user, blockno); 681 | if (r < 0) 682 | { 683 | assert(i == 1); // Need cleanup for i==2, but shouldn't happen 684 | return r; 685 | } 686 | } 687 | } 688 | 689 | return 0; 690 | } 691 | 692 | static int callback_crawl_data_2(uint64_t blockoff, char *block, 693 | unsigned off, unsigned size, unsigned valid, 694 | uint64_t crawl_start, enum commit commit, 695 | void *ccd2d_void, uint64_t *blockno) 696 | { 697 | struct callback_crawl_data_2_data *ccd2d = (struct callback_crawl_data_2_data*) ccd2d_void; 698 | uint64_t first_offset = blockoff * BPFS_BLOCK_SIZE + off; 699 | uint64_t last_offset = first_offset + size - sizeof(struct bpfs_inode); 700 | unsigned mask; 701 | 702 | mask = first_offset == ccd2d->d[0].ino_off; 703 | mask |= (last_offset == ccd2d->d[1].ino_off) << 1; 704 | 705 | if (mask == 3) 706 | { 707 | struct bpfs_inode *inode = (struct bpfs_inode*) (block + off); 708 | if (ccd2d->d[0].ino == ccd2d->d[1].ino) 709 | { 710 | assert(ccd2d->d[0].off < ccd2d->d[1].off); 711 | return crawl_tree(&inode->root, ccd2d->d[0].off, 712 | ccd2d->d[1].off - ccd2d->d[0].off 713 | + ccd2d->d[1].size, commit, 714 | callback_crawl_data_2_tree, ccd2d, blockno); 715 | } 716 | else 717 | { 718 | #if COMMIT_MODE == MODE_BPFS && !defined(NDEBUG) 719 | uint64_t prev_blockno = *blockno; 720 | #endif 721 | struct bpfs_inode *inode_1; 722 | int r; 723 | 724 | r = crawl_tree(&inode->root, ccd2d->d[0].off, ccd2d->d[0].size, 725 | COMMIT_COPY, 726 | ccd2d->d[0].callback, ccd2d->d[0].user, 727 | blockno); 728 | if (r < 0) 729 | return r; 730 | block = get_block(*blockno); 731 | inode_1 = (struct bpfs_inode*) 732 | (block + off + size - sizeof(struct bpfs_inode)); 733 | 734 | #if COMMIT_MODE == MODE_BPFS 735 | assert(prev_blockno != *blockno); // Required for !blockno_refed 736 | #endif 737 | r = crawl_tree_ref(&inode_1->root, ccd2d->d[1].off, 738 | ccd2d->d[1].size, COMMIT_COPY, 739 | ccd2d->d[1].callback, ccd2d->d[1].user, 740 | blockno, false); 741 | assert(r >= 0); // FIXME: recover first crawl_tree() changes 742 | return r; 743 | } 744 | } 745 | else if (mask) 746 | { 747 | struct bpfs_inode *inode; 748 | struct ccd2dd *d = &ccd2d->d[mask >> 1]; 749 | if (mask == 1) 750 | inode = (struct bpfs_inode*) (block + off); 751 | else 752 | { 753 | assert(mask == 2); 754 | inode = (struct bpfs_inode*) 755 | (block + off + size - sizeof(struct bpfs_inode)); 756 | } 757 | assert(commit == COMMIT_COPY); 758 | return crawl_tree(&inode->root, d->off, d->size, commit, 759 | d->callback, d->user, blockno); 760 | } 761 | 762 | return 0; 763 | } 764 | 765 | #ifndef NDEBUG 766 | static bool region_in_one_block(uint64_t off, uint64_t size) 767 | { 768 | return (off % BPFS_BLOCK_SIZE) + size <= BPFS_BLOCK_SIZE; 769 | } 770 | #endif 771 | 772 | int crawl_data_2(uint64_t ino_0, uint64_t off_0, uint64_t size_0, 773 | crawl_callback callback_0, void *user_0, 774 | uint64_t ino_1, uint64_t off_1, uint64_t size_1, 775 | crawl_callback callback_1, void *user_1, 776 | enum commit commit) 777 | { 778 | struct callback_crawl_data_2_data ccd2d; 779 | uint64_t ino_start, ino_end, ino_size; 780 | unsigned idx_0, idx_1; 781 | 782 | // Overlap not allowed 783 | assert(!(ino_0 == ino_1 784 | && ((off_0 <= off_1 && off_1 < off_0 + size_0) 785 | || (off_1 <= off_0 && off_0 < off_1 + size_1)))); 786 | // callback_crawl_data_2_tree() simplification: 787 | 
assert(region_in_one_block(off_0, size_0)); 788 | assert(region_in_one_block(off_1, size_1)); 789 | 790 | if (ino_0 < ino_1 || (ino_0 == ino_1 && off_0 <= off_1)) 791 | { 792 | idx_0 = 0; 793 | idx_1 = 1; 794 | } 795 | else 796 | { 797 | idx_0 = 1; 798 | idx_1 = 0; 799 | } 800 | ccd2dd_fill(&ccd2d.d[idx_0], ino_0, off_0, size_0, callback_0, user_0); 801 | ccd2dd_fill(&ccd2d.d[idx_1], ino_1, off_1, size_1, callback_1, user_1); 802 | 803 | ino_start = ccd2d.d[0].ino_off; 804 | ino_end = ccd2d.d[1].ino_off + sizeof(struct bpfs_inode); 805 | ino_size = ino_end - ino_start; 806 | 807 | return crawl_inodes(ino_start, ino_size, commit, 808 | callback_crawl_data_2, &ccd2d); 809 | } 810 | 811 | // 812 | // crawler_init() 813 | 814 | void crawler_init(void) 815 | { 816 | // linkers have maximum alignments: 817 | assert(!(((uintptr_t) zero_block) % sysconf(_SC_PAGE_SIZE))); 818 | // make sure mprotect() doesn't mark other data as read-only: 819 | assert(!(BPFS_BLOCK_SIZE % sysconf(_SC_PAGE_SIZE))); 820 | // make sure code does not write into the block of zeros: 821 | xsyscall(mprotect(zero_block, BPFS_BLOCK_SIZE, PROT_READ)); 822 | } 823 | -------------------------------------------------------------------------------- /crawler.h: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. */ 4 | 5 | #ifndef CRAWLER_H 6 | #define CRAWLER_H 7 | 8 | #include "bpfs.h" 9 | #include "bpfs_structs.h" 10 | 11 | #include 12 | #include 13 | 14 | 15 | void crawler_init(void); 16 | 17 | 18 | // @param blockoff block no in the file (blockoff * BPFS_BLOCK_SIZE is byte off) 19 | // @param block pointer to the block 20 | // @param off offset into the block 21 | // @param valid number of valid bytes in the block 22 | // @param crawl_start byte offset into the file at which the crawl started 23 | // @param commit allowed commit type 24 | // @param user user data 25 | // @param blockno *blockno is the block number (in/out) 26 | // Return <0 for error, 0 for success, 1 for success and stop crawl 27 | typedef int (*crawl_callback)(uint64_t blockoff, char *block, 28 | unsigned off, unsigned size, unsigned valid, 29 | uint64_t crawl_start, enum commit commit, 30 | void *user, uint64_t *blockno); 31 | 32 | typedef void (*crawl_blockno_callback)(uint64_t blockno, bool leaf); 33 | 34 | // Return <0 for error, 0 for success, 1 for success and stop crawl 35 | typedef int (*crawl_callback_inode)(char *block, unsigned off, 36 | struct bpfs_inode *inode, 37 | enum commit commit, void *user, 38 | uint64_t *blockno); 39 | 40 | 41 | int crawl_tree(struct bpfs_tree_root *root, uint64_t off, 42 | uint64_t size, enum commit commit, 43 | crawl_callback callback, void *user, 44 | uint64_t *prev_blockno); 45 | 46 | void crawl_blocknos(const struct bpfs_tree_root *root, 47 | uint64_t off, uint64_t size, 48 | crawl_blockno_callback callback); 49 | 50 | int crawl_inodes(uint64_t off, uint64_t size, enum commit commit, 51 | crawl_callback callback, void *user); 52 | 53 | int crawl_inode(uint64_t ino, enum commit commit, 54 | crawl_callback_inode callback, void *user); 55 | 56 | int crawl_data(uint64_t ino, uint64_t off, uint64_t size, 57 | enum commit commit, 58 | crawl_callback callback, void *user); 59 | 60 | int crawl_data_2(uint64_t ino_0, uint64_t off_0, uint64_t size_0, 61 | crawl_callback callback_0, 
void *user_0, 62 | uint64_t ino_1, uint64_t off_1, uint64_t size_1, 63 | crawl_callback callback_1, void *user_1, 64 | enum commit commit); 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /dcache.c: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. */ 4 | 5 | #include "dcache.h" 6 | #include "util.h" 7 | #include "hash_map.h" 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | // NOTE: It may be faster to use pools for these data-structures. 15 | 16 | 17 | // Fixed-size cache for now. Must be at least 2, for rename. 1024? Why not. 18 | #define NMDIRS_MAX 1024 19 | 20 | struct mdirent_free 21 | { 22 | uint64_t off; 23 | struct mdirent_free *next; 24 | uint16_t rec_len; 25 | }; 26 | 27 | struct mdirectory 28 | { 29 | hash_map_t *dirents; // name -> mdirent 30 | struct mdirent_free *free_dirents; 31 | uint64_t ino; // inode number of this directory 32 | struct mdirectory **lru_polder, *lru_newer; 33 | }; 34 | 35 | struct dcache { 36 | hash_map_t *directories; // directory ino -> mdirectory 37 | struct mdirectory *lru_oldest, *lru_newest; 38 | }; 39 | 40 | 41 | static struct dcache dcache; 42 | 43 | 44 | // mdirent 45 | 46 | static void mdirent_free(struct mdirent *md) 47 | { 48 | free((char*) md->name); 49 | free(md); 50 | } 51 | 52 | static struct mdirent* mdirent_dup(const struct mdirent *md) 53 | { 54 | struct mdirent *dup = malloc(sizeof(*dup)); 55 | if (!dup) 56 | return NULL; 57 | memcpy(dup, md, sizeof(*dup)); 58 | 59 | dup->name = strdup(md->name); 60 | if (!dup->name) 61 | { 62 | free(dup); 63 | return NULL; 64 | } 65 | 66 | return dup; 67 | } 68 | 69 | 70 | // mdirectory 71 | 72 | static void mdirectory_touch(struct mdirectory *mdir) 73 | { 74 | assert(dcache.lru_newest && dcache.lru_oldest); 75 | 76 | if (!mdir->lru_newer) 77 | return; // mdir is already the head 78 | 79 | // Remove mdir from the LRU 80 | mdir->lru_newer->lru_polder = mdir->lru_polder; 81 | *mdir->lru_polder = mdir->lru_newer; 82 | 83 | // Add mdir to the head of the LRU 84 | mdir->lru_polder = &dcache.lru_newest->lru_newer; 85 | *mdir->lru_polder = mdir; 86 | mdir->lru_newer = NULL; 87 | dcache.lru_newest = mdir; 88 | } 89 | 90 | static void mdirectory_rem(struct mdirectory *mdir) 91 | { 92 | hash_map_it2_t it = hash_map_it2_create(mdir->dirents); 93 | 94 | while (hash_map_it2_next(&it)) 95 | mdirent_free(it.val); 96 | hash_map_destroy(mdir->dirents); 97 | 98 | hash_map_erase(dcache.directories, u64_ptr(mdir->ino)); 99 | 100 | // Remove mdir from the LRU 101 | if (mdir->lru_newer) 102 | mdir->lru_newer->lru_polder = mdir->lru_polder; 103 | else if (mdir->lru_polder != &dcache.lru_oldest) 104 | dcache.lru_newest = container_of(mdir->lru_polder, struct mdirectory, 105 | lru_newer); 106 | else 107 | dcache.lru_newest = NULL; 108 | *mdir->lru_polder = mdir->lru_newer; 109 | 110 | while (mdir->free_dirents) 111 | { 112 | struct mdirent_free *next = mdir->free_dirents->next; 113 | free(mdir->free_dirents); 114 | mdir->free_dirents = next; 115 | } 116 | 117 | free(mdir); 118 | } 119 | 120 | static struct mdirectory* mdirectory_add(uint64_t ino) 121 | { 122 | struct mdirectory *mdir; 123 | int r; 124 | 125 | if (hash_map_size(dcache.directories) == NMDIRS_MAX) 126 | mdirectory_rem(dcache.lru_oldest); 127 | 
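	// Allocate the new mdirectory, insert it into the directory-ino map,
	// and link it in at the newest end of the LRU list.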
128 | mdir = malloc(sizeof(*mdir)); 129 | if (!mdir) 130 | return NULL; 131 | 132 | mdir->dirents = hash_map_create_str(); 133 | if (!mdir->dirents) 134 | goto oom_mdir; 135 | 136 | mdir->free_dirents = NULL; 137 | 138 | mdir->ino = ino; 139 | 140 | r = hash_map_insert(dcache.directories, u64_ptr(ino), mdir); 141 | if (r < 0) 142 | goto oom_dirents; 143 | 144 | // Add mdir to the head of the LRU 145 | if (dcache.lru_newest) 146 | mdir->lru_polder = &dcache.lru_newest->lru_newer; 147 | else 148 | mdir->lru_polder = &dcache.lru_oldest; 149 | *mdir->lru_polder = mdir; 150 | mdir->lru_newer = NULL; 151 | dcache.lru_newest = mdir; 152 | 153 | return mdir; 154 | 155 | oom_dirents: 156 | hash_map_destroy(mdir->dirents); 157 | oom_mdir: 158 | free(mdir); 159 | return NULL; 160 | } 161 | 162 | 163 | // external API 164 | 165 | int dcache_init(void) 166 | { 167 | assert(!dcache.directories); 168 | 169 | dcache.directories = hash_map_create_size_ptr(NMDIRS_MAX, 0); 170 | if (!dcache.directories) 171 | return -ENOMEM; 172 | 173 | dcache.lru_newest = dcache.lru_oldest = NULL; 174 | 175 | return 0; 176 | } 177 | 178 | void dcache_destroy(void) 179 | { 180 | hash_map_it2_t it = hash_map_it2_create(dcache.directories); 181 | while (hash_map_it2_next(&it)) 182 | mdirectory_rem(it.val); 183 | 184 | hash_map_destroy(dcache.directories); 185 | dcache.directories = NULL; 186 | dcache.lru_newest = dcache.lru_oldest = NULL; 187 | } 188 | 189 | 190 | bool dcache_has_dir(uint64_t ino) 191 | { 192 | return !!hash_map_find_val(dcache.directories, u64_ptr(ino)); 193 | } 194 | 195 | int dcache_add_dir(uint64_t ino) 196 | { 197 | struct mdirectory *mdir; 198 | assert(!hash_map_find_val(dcache.directories, u64_ptr(ino))); 199 | mdir = mdirectory_add(ino); 200 | if (!mdir) 201 | return -ENOMEM; 202 | return 0; 203 | } 204 | 205 | void dcache_rem_dir(uint64_t ino) 206 | { 207 | struct mdirectory *mdir = hash_map_find_val(dcache.directories, 208 | u64_ptr(ino)); 209 | assert(mdir); 210 | mdirectory_rem(mdir); 211 | } 212 | 213 | int dcache_add_dirent(uint64_t parent_ino, const char *name, 214 | const struct mdirent *mdo) 215 | { 216 | struct mdirectory *mdir = hash_map_find_val(dcache.directories, 217 | u64_ptr(parent_ino)); 218 | struct mdirent *mdc; 219 | int r; 220 | 221 | assert(mdir); 222 | mdirectory_touch(mdir); 223 | 224 | mdc = mdirent_dup(mdo); 225 | if (!mdc) 226 | return -ENOMEM; 227 | 228 | r = hash_map_insert(mdir->dirents, (void*) mdc->name, mdc); 229 | if (r < 0) 230 | { 231 | mdirent_free(mdc); 232 | return r; 233 | } 234 | assert(!r); 235 | 236 | return 0; 237 | } 238 | 239 | const struct mdirent* dcache_get_dirent(uint64_t parent_ino, const char *name) 240 | { 241 | struct mdirectory *mdir = hash_map_find_val(dcache.directories, 242 | u64_ptr(parent_ino)); 243 | assert(mdir); 244 | mdirectory_touch(mdir); 245 | return hash_map_find_val(mdir->dirents, name); 246 | } 247 | 248 | int dcache_rem_dirent(uint64_t parent_ino, const char *name) 249 | { 250 | struct mdirectory *mdir = hash_map_find_val(dcache.directories, 251 | u64_ptr(parent_ino)); 252 | struct mdirent *md; 253 | 254 | assert(mdir); 255 | mdirectory_touch(mdir); 256 | 257 | md = hash_map_erase(mdir->dirents, name); 258 | if (!md) 259 | return -EINVAL; 260 | 261 | mdirent_free(md); 262 | 263 | return 0; 264 | } 265 | 266 | 267 | int dcache_add_free(uint64_t parent_ino, uint64_t off, uint16_t rec_len) 268 | { 269 | struct mdirectory *mdir = hash_map_find_val(dcache.directories, 270 | u64_ptr(parent_ino)); 271 | struct mdirent_free *mdf; 272 | 273 
| assert(mdir); 274 | assert(off != DCACHE_FREE_NONE); 275 | 276 | mdf = malloc(sizeof(*mdf)); 277 | if (!mdf) 278 | return -ENOMEM; 279 | mdf->off = off; 280 | mdf->rec_len = rec_len; 281 | mdf->next = mdir->free_dirents; 282 | mdir->free_dirents = mdf; 283 | 284 | return 0; 285 | } 286 | 287 | uint64_t dcache_take_free(uint64_t parent_ino, uint16_t min_rec_len) 288 | { 289 | struct mdirectory *mdir = hash_map_find_val(dcache.directories, 290 | u64_ptr(parent_ino)); 291 | struct mdirent_free *prev_mdf, *mdf; 292 | 293 | assert(mdir); 294 | 295 | for (prev_mdf = NULL, mdf = mdir->free_dirents; mdf; 296 | prev_mdf = mdf, mdf = mdf->next) 297 | { 298 | if (mdf->rec_len >= min_rec_len) 299 | { 300 | uint64_t off = mdf->off; 301 | if (prev_mdf) 302 | prev_mdf->next = mdf->next; 303 | else 304 | mdir->free_dirents = mdf->next; 305 | free(mdf); 306 | return off; 307 | } 308 | } 309 | 310 | return DCACHE_FREE_NONE; 311 | } 312 | -------------------------------------------------------------------------------- /dcache.h: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. */ 4 | 5 | #ifndef DCACHE_H 6 | #define DCACHE_H 7 | 8 | #include 9 | #include 10 | 11 | struct mdirent 12 | { 13 | const char *name; 14 | uint64_t off; 15 | uint64_t ino; 16 | uint64_t ino_generation; 17 | uint16_t rec_len; 18 | uint8_t file_type; 19 | }; 20 | 21 | static __inline 22 | void mdirent_init(struct mdirent *md, 23 | const char *name, uint64_t off, uint64_t ino, 24 | uint64_t ino_gen, uint16_t rec_len, uint8_t ft) 25 | __attribute__((always_inline)); 26 | 27 | // 28 | // The directory entry cache 29 | 30 | int dcache_init(void); 31 | void dcache_destroy(void); 32 | 33 | // 34 | // Directories 35 | 36 | // Return whether the directory ino is currently in the dcache. 37 | bool dcache_has_dir(uint64_t ino); 38 | 39 | // Add the ino directory. 40 | int dcache_add_dir(uint64_t ino); 41 | 42 | // Remove the ino directory and its contents. 43 | void dcache_rem_dir(uint64_t ino); 44 | 45 | // 46 | // Directory entries 47 | 48 | // Add to the parent_ino directory. 49 | int dcache_add_dirent(uint64_t parent_ino, const char *name, 50 | const struct mdirent *md); 51 | 52 | // Get the dirent for . 53 | // parent_ino must be in the dcache. 54 | const struct mdirent* dcache_get_dirent(uint64_t parent_ino, const char *name); 55 | 56 | // Remove the dirent for name from the parent_ino directory. 57 | // parent_ino must be in the dcache. 58 | int dcache_rem_dirent(uint64_t parent_ino, const char *name); 59 | 60 | // 61 | // Free directory entries 62 | 63 | // Add a free dirent. 64 | int dcache_add_free(uint64_t parent_ino, uint64_t off, uint16_t rec_len); 65 | 66 | #define DCACHE_FREE_NONE UINT64_MAX 67 | 68 | // Find a free dirent with a rec_len at least min_rec_len, 69 | // remove it from the set of free dirents, and return its offset. 70 | // Returns DCACHE_FREE_NONE if none is found. 
71 | uint64_t dcache_take_free(uint64_t parent_ino, uint16_t min_rec_len); 72 | 73 | // 74 | // Inline implementation 75 | 76 | static __inline 77 | void mdirent_init(struct mdirent *md, 78 | const char *name, uint64_t off, uint64_t ino, 79 | uint64_t ino_gen, uint16_t rec_len, uint8_t ft) 80 | { 81 | md->name = name; 82 | md->off = off; 83 | md->ino = ino; 84 | md->ino_generation = ino_gen; 85 | md->rec_len = rec_len; 86 | md->file_type = ft; 87 | } 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /fuse_limits: -------------------------------------------------------------------------------- 1 | - fuse_reply_buf() can only be called once per read/readdir. 2 | fuse_reply_iov() exists, but requires allocating an array to track the ptrs. 3 | - file creation is done in two calls (create and setattr), so not atomic 4 | - write calls are limited to 128kB 5 | -------------------------------------------------------------------------------- /hash_map.c: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. */ 4 | 5 | #include "vector.h" 6 | #include "hash_map.h" 7 | #include "pool.h" 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #define HASH_MAP_DEBUG 0 14 | 15 | #if HASH_MAP_DEBUG 16 | #include 17 | #define Dprintf(x...) printf(x) 18 | #else 19 | #define Dprintf(x...) 20 | #endif 21 | 22 | 23 | // 24 | // Implement hash_map.h using a chaining hash table. 25 | 26 | // Since we are storing only a pointer in each entry it might make more 27 | // sense to use open addressing with the same amount of memory used than 28 | // chaining, since each chain entry needs two ptrs for the chain and each 29 | // bucket uses one pointer to point to the chain. TAOCP page 545 lightly 30 | // discusses this. 31 | 32 | 33 | struct chain_elt { 34 | hash_map_elt_t elt; 35 | struct chain_elt * next; 36 | struct chain_elt * prev; 37 | }; 38 | 39 | struct hash_map { 40 | size_t size; 41 | bool auto_resize; 42 | vector_t * tbl; 43 | enum {PTR, STR} type; 44 | #if HASH_MAP_TRACK_BUCKET_SIZES 45 | vector_t * tbl_size; 46 | vector_t * tbl_max_size; 47 | #endif 48 | #if HASH_MAP_IT_MOD_DEBUG 49 | size_t version; // Incremented for every change 50 | size_t loose_version; // Incremented for inserts and resizes (not removes) 51 | #endif 52 | }; 53 | 54 | 55 | // 56 | // The hashing function. 57 | // Essentially, GNU C++ STL 3.4's hash_fun and hashtable. 58 | 59 | // Note: assumes long is at least 32 bits. 60 | enum { num_primes = 28 }; 61 | 62 | static const unsigned long prime_list[num_primes] = 63 | { 64 | 53ul, 97ul, 193ul, 389ul, 769ul, 65 | 1543ul, 3079ul, 6151ul, 12289ul, 24593ul, 66 | 49157ul, 98317ul, 196613ul, 393241ul, 786433ul, 67 | 1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul, 68 | 50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul, 69 | 1610612741ul, 3221225473ul, 4294967291ul 70 | }; 71 | 72 | static inline unsigned long next_size(size_t n) 73 | { 74 | const unsigned long * first = prime_list; 75 | const unsigned long * last = prime_list + (int) num_primes; 76 | const unsigned long * pos = first; 77 | for (pos = first; *pos < n && pos != last; pos++) ; 78 | return pos == last ? 
*(last - 1) : *pos; 79 | } 80 | 81 | inline static size_t hash_ptr(const void * k, size_t tbl_size) __attribute__((always_inline)); 82 | inline static size_t hash_ptr(const void * k, size_t tbl_size) 83 | { 84 | return ((size_t) k) % tbl_size; 85 | } 86 | 87 | inline static size_t hash_str(const char * s, size_t tbl_size) __attribute__((always_inline)); 88 | inline static size_t hash_str(const char * s, size_t tbl_size) 89 | { 90 | unsigned long h = 0; 91 | for ( ; *s; ++s) 92 | h = 5*h + *s; 93 | return h % tbl_size; 94 | } 95 | 96 | static __inline size_t hash(const hash_map_t * hm, const void * k) __attribute__((always_inline)); 97 | static __inline size_t hash(const hash_map_t * hm, const void * k) 98 | { 99 | switch(hm->type) 100 | { 101 | case PTR: return hash_ptr(k, vector_size(hm->tbl)); 102 | case STR: return hash_str(k, vector_size(hm->tbl)); 103 | default: assert(0); return -1; 104 | } 105 | } 106 | 107 | 108 | // 109 | // Chains 110 | 111 | DECLARE_POOL(chain_elt, chain_elt_t); 112 | 113 | static chain_elt_t * chain_elt_create(const hash_map_t * hm, void * k, void * v) 114 | { 115 | chain_elt_t * elt = chain_elt_alloc(); 116 | elt->elt.key = k; 117 | elt->elt.val = v; 118 | elt->next = NULL; 119 | elt->prev = NULL; 120 | return elt; 121 | } 122 | 123 | static void chain_elt_destroy(chain_elt_t * elt) 124 | { 125 | chain_elt_free(elt); 126 | } 127 | 128 | static __inline chain_elt_t * chain_search_ptr_key(const chain_elt_t * head, const void * k) __attribute__((always_inline)); 129 | static __inline chain_elt_t * chain_search_ptr_key(const chain_elt_t * head, const void * k) 130 | { 131 | while (head) 132 | { 133 | if (head->elt.key == k) 134 | return (chain_elt_t *) head; 135 | head = head->next; 136 | } 137 | 138 | return NULL; 139 | } 140 | 141 | static __inline chain_elt_t * chain_search_str_key(const chain_elt_t * head, const char * k) __attribute__((always_inline)); 142 | static __inline chain_elt_t * chain_search_str_key(const chain_elt_t * head, const char * k) 143 | { 144 | while (head) 145 | { 146 | // Cache key lengths? 
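		// (Until then, every probe pays for a full strcmp() against the
		// stored key.)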
147 | if (!strcmp((const char *) head->elt.key, k)) 148 | return (chain_elt_t *) head; 149 | head = head->next; 150 | } 151 | 152 | return NULL; 153 | } 154 | 155 | static __inline chain_elt_t * chain_search_key(const hash_map_t * hm, const chain_elt_t * head, const void * k) 156 | { 157 | switch(hm->type) 158 | { 159 | case PTR: return chain_search_ptr_key(head, k); 160 | case STR: return chain_search_str_key(head, k); 161 | default: assert(0); return NULL; 162 | } 163 | } 164 | 165 | 166 | // 167 | // Construction/destruction 168 | 169 | static hash_map_t * hash_map_create_size_type(size_t n, bool auto_resize, int type) 170 | { 171 | hash_map_t * hm; 172 | if (!n) 173 | return NULL; 174 | 175 | hm = malloc(sizeof(*hm)); 176 | if (!hm) 177 | return NULL; 178 | 179 | hm->size = 0; 180 | hm->auto_resize = auto_resize; 181 | hm->tbl = vector_create_size(next_size(n)); 182 | if (!hm->tbl) 183 | { 184 | free(hm); 185 | return NULL; 186 | } 187 | hm->type = type; 188 | 189 | #if HASH_MAP_TRACK_BUCKET_SIZES 190 | hm->tbl_size = vector_create_size(vector_size(hm->tbl)); 191 | assert(hm->tbl_size); 192 | hm->tbl_max_size = vector_create_size(vector_size(hm->tbl)); 193 | assert(hm->tbl_max_size); 194 | size_t i; 195 | for(i = 0; i < vector_size(hm->tbl); i++) 196 | { 197 | vector_elt_set(hm->tbl_size, i, 0); 198 | vector_elt_set(hm->tbl_max_size, i, 0); 199 | } 200 | #endif 201 | 202 | #if HASH_MAP_IT_MOD_DEBUG 203 | hm->version = 0; 204 | hm->loose_version = 0; 205 | #endif 206 | 207 | return hm; 208 | } 209 | 210 | hash_map_t * hash_map_create_ptr(void) 211 | { 212 | return hash_map_create_size_type(1, 1, PTR); 213 | } 214 | 215 | hash_map_t * hash_map_create_str(void) 216 | { 217 | return hash_map_create_size_type(1, 1, STR); 218 | } 219 | 220 | hash_map_t * hash_map_create_size_ptr(size_t n, bool auto_resize) 221 | { 222 | return hash_map_create_size_type(1, 1, PTR); 223 | } 224 | 225 | hash_map_t * hash_map_create_size_str(size_t n, bool auto_resize) 226 | { 227 | return hash_map_create_size_type(1, 1, STR); 228 | } 229 | 230 | hash_map_t * hash_map_copy(const hash_map_t * hm) 231 | { 232 | hash_map_t * hm_copy; 233 | size_t i; 234 | chain_elt_t * elt; 235 | int r; 236 | 237 | // Create new hash table 238 | hm_copy = hash_map_create_size_type(hm->size, hm->auto_resize, hm->type); 239 | if (!hm_copy) 240 | return NULL; 241 | 242 | // Copy elements (rehashing them; we could do this more quickly) 243 | for (i=0; i < vector_size(hm->tbl); i++) 244 | { 245 | elt = vector_elt(hm->tbl, i); 246 | while (elt) 247 | { 248 | if ((r = hash_map_insert(hm_copy, elt->elt.key, elt->elt.val)) < 0) 249 | { 250 | hash_map_destroy(hm_copy); 251 | return NULL; 252 | } 253 | elt = elt->next; 254 | } 255 | } 256 | 257 | return hm_copy; 258 | } 259 | 260 | void hash_map_destroy(hash_map_t * hm) 261 | { 262 | hash_map_clear(hm); 263 | vector_destroy(hm->tbl); 264 | hm->tbl = NULL; 265 | #if HASH_MAP_TRACK_BUCKET_SIZES 266 | vector_destroy(hm->tbl_size); 267 | hm->tbl_size = NULL; 268 | vector_destroy(hm->tbl_max_size); 269 | hm->tbl_max_size = NULL; 270 | #endif 271 | free(hm); 272 | } 273 | 274 | 275 | // 276 | // General 277 | 278 | size_t hash_map_size(const hash_map_t * hm) 279 | { 280 | return hm->size; 281 | } 282 | 283 | bool hash_map_empty(const hash_map_t * hm) 284 | { 285 | return (hm->size == 0); 286 | } 287 | 288 | int hash_map_insert(hash_map_t * hm, void * k, void * v) 289 | { 290 | Dprintf("%s(%p, %p, %p)\n", __FUNCTION__, hm, k, v); 291 | const size_t elt_num = hash(hm, k); 292 | chain_elt_t * head = 
vector_elt(hm->tbl, elt_num); 293 | 294 | if (!head) 295 | { 296 | head = chain_elt_create(hm, k, v); 297 | if (!head) 298 | return -ENOMEM; 299 | } 300 | else 301 | { 302 | // See if k is already in the chain, simply update its value if so. 303 | chain_elt_t * existing_elt; 304 | chain_elt_t * new_head; 305 | if ((existing_elt = chain_search_key(hm, head, k))) 306 | { 307 | existing_elt->elt.val = v; 308 | #if HASH_MAP_IT_MOD_DEBUG 309 | hm->version++; 310 | hm->loose_version++; 311 | #endif 312 | return 1; 313 | } 314 | 315 | // k isn't already in the chain, add it. 316 | 317 | new_head = chain_elt_create(hm, k, v); 318 | if (!new_head) 319 | return -ENOMEM; 320 | 321 | new_head->next = head; 322 | head->prev = new_head; 323 | head = new_head; 324 | } 325 | 326 | vector_elt_set(hm->tbl, elt_num, head); 327 | #if HASH_MAP_TRACK_BUCKET_SIZES 328 | vector_elt_set(hm->tbl_size, elt_num, vector_elt(hm->tbl_size, elt_num) + 1); 329 | if(vector_elt(hm->tbl_size, elt_num) > vector_elt(hm->tbl_max_size, elt_num)) 330 | vector_elt_set(hm->tbl_max_size, elt_num, vector_elt(hm->tbl_size, elt_num)); 331 | #endif 332 | hm->size++; 333 | #if HASH_MAP_IT_MOD_DEBUG 334 | hm->version++; 335 | hm->loose_version++; 336 | #endif 337 | 338 | if (hm->auto_resize && next_size(hash_map_size(hm)) > hash_map_bucket_count(hm)) 339 | { 340 | // (safe to ignore failure) 341 | (void) hash_map_resize(hm, hash_map_size(hm)); 342 | } 343 | 344 | return 0; 345 | } 346 | 347 | // Insert an elt into hm. elt must not already exist in hm. 348 | // This allows movement of an elt from one hm to another; 349 | // thus no malloc()/free() overhead and the elt maintains its memory location. 350 | static void insert_chain_elt(hash_map_t * hm, chain_elt_t * elt) 351 | { 352 | Dprintf("%s(%p, %p)\n", __FUNCTION__, hm, elt); 353 | const size_t elt_num = hash(hm, elt->elt.key); 354 | chain_elt_t * head = vector_elt(hm->tbl, elt_num); 355 | 356 | if (head) 357 | { 358 | // Assume !chain_search_key(hm, head, elt->elt.key) 359 | elt->next = head; 360 | head->prev = elt; 361 | } 362 | 363 | vector_elt_set(hm->tbl, elt_num, elt); 364 | #if HASH_MAP_TRACK_BUCKET_SIZES 365 | vector_elt_set(hm->tbl_size, elt_num, vector_elt(hm->tbl_size, elt_num) + 1); 366 | if(vector_elt(hm->tbl_size, elt_num) > vector_elt(hm->tbl_max_size, elt_num)) 367 | vector_elt_set(hm->tbl_max_size, elt_num, vector_elt(hm->tbl_size, elt_num)); 368 | #endif 369 | hm->size++; 370 | #if HASH_MAP_IT_MOD_DEBUG 371 | hm->version++; 372 | hm->loose_version++; 373 | #endif 374 | } 375 | 376 | // Erase the key-value pair for k from hm, return the element. 
377 | static chain_elt_t * erase_chain_elt(hash_map_t * hm, const void * k) 378 | { 379 | Dprintf("%s(%p, %p)\n", __FUNCTION__, hm, k); 380 | const size_t elt_num = hash(hm, k); 381 | chain_elt_t * head = vector_elt(hm->tbl, elt_num); 382 | chain_elt_t * k_chain; 383 | 384 | if (!head) 385 | return NULL; 386 | 387 | k_chain = chain_search_key(hm, head, k); 388 | if (!k_chain) 389 | return NULL; 390 | 391 | if (k_chain->prev) 392 | k_chain->prev->next = k_chain->next; 393 | else 394 | vector_elt_set(hm->tbl, elt_num, k_chain->next); 395 | if (k_chain->next) 396 | k_chain->next->prev = k_chain->prev; 397 | 398 | k_chain->next = NULL; 399 | k_chain->prev = NULL; 400 | 401 | #if HASH_MAP_TRACK_BUCKET_SIZES 402 | vector_elt_set(hm->tbl_size, elt_num, vector_elt(hm->tbl_size, elt_num) - 1); 403 | #endif 404 | hm->size--; 405 | #if HASH_MAP_IT_MOD_DEBUG 406 | hm->version++; 407 | /* do not update hm->loose_version */ 408 | #endif 409 | 410 | return k_chain; 411 | } 412 | 413 | void * hash_map_erase(hash_map_t * hm, const void * k) 414 | { 415 | Dprintf("%s(%p, %p)\n", __FUNCTION__, hm, k); 416 | chain_elt_t * k_chain; 417 | void * v; 418 | 419 | k_chain = erase_chain_elt(hm, k); 420 | if (!k_chain) 421 | return NULL; 422 | 423 | v = k_chain->elt.val; 424 | chain_elt_destroy(k_chain); 425 | 426 | #if 0 427 | // Auto-shrink support is untested; we might enable this later should 428 | // we find it may be helpful. This is not enabled because code that 429 | // calls hash_map_erase() on every element to destroy the map 430 | // would pay a time and max space penalty. 431 | size_t ns = next_size(hash_map_size(hm)); 432 | if (hm->auto_resize && (next_size(ns + 1) < hash_map_bucket_count(hm))) 433 | { 434 | // (safe to ignore failure) 435 | (void) hash_map_resize(hm, ns); 436 | } 437 | #endif 438 | 439 | return v; 440 | } 441 | 442 | int hash_map_change_key(hash_map_t * hm, void * oldk, void * newk) 443 | { 444 | Dprintf("%s(%p, %p, %p)\n", __FUNCTION__, hm, oldk, newk); 445 | chain_elt_t * head; 446 | chain_elt_t * elt; 447 | 448 | // Check that newk isn't already in use 449 | 450 | const size_t newk_elt_num = hash(hm, newk); 451 | head = vector_elt(hm->tbl, newk_elt_num); 452 | if (head && chain_search_key(hm, head, newk)) 453 | return -EEXIST; 454 | 455 | // Find oldk 456 | 457 | const size_t oldk_elt_num = hash(hm, oldk); 458 | head = vector_elt(hm->tbl, oldk_elt_num); 459 | if (!head) 460 | return -ENOENT; 461 | 462 | head = chain_search_key(hm, head, oldk); 463 | if (!head) 464 | return -ENOENT; 465 | 466 | // The hashmap has oldk, move elt to its new home 467 | 468 | elt = head; 469 | if (elt->prev) 470 | elt->prev->next = elt->next; 471 | else 472 | vector_elt_set(hm->tbl, oldk_elt_num, elt->next); 473 | if (elt->next) 474 | elt->next->prev = elt->prev; 475 | 476 | elt->elt.key = newk; 477 | elt->prev = NULL; 478 | elt->next = NULL; 479 | 480 | head = vector_elt(hm->tbl, newk_elt_num); 481 | if (head) 482 | { 483 | elt->next = head; 484 | head->prev = elt; 485 | } 486 | vector_elt_set(hm->tbl, newk_elt_num, elt); 487 | #if HASH_MAP_IT_MOD_DEBUG 488 | hm->version++; 489 | hm->loose_version++; 490 | #endif 491 | 492 | return 0; 493 | } 494 | 495 | void hash_map_clear(hash_map_t * hm) 496 | { 497 | Dprintf("%s(%p)\n", __FUNCTION__, hm); 498 | size_t i; 499 | 500 | for (i=0; i < vector_size(hm->tbl); i++) 501 | { 502 | chain_elt_t * head = vector_elt(hm->tbl, i); 503 | chain_elt_t * next; 504 | while (head) 505 | { 506 | next = head->next; 507 | chain_elt_destroy(head); 508 | head = next; 509 | } 
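		// The chain has been freed; reset the bucket's head pointer.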
510 | vector_elt_set(hm->tbl, i, NULL); 511 | #if HASH_MAP_TRACK_BUCKET_SIZES 512 | vector_elt_set(hm->tbl_size, i, 0); 513 | #endif 514 | } 515 | 516 | hm->size = 0; 517 | #if HASH_MAP_IT_MOD_DEBUG 518 | hm->version++; 519 | hm->loose_version++; 520 | #endif 521 | } 522 | 523 | static __inline hash_map_elt_t * hash_map_find_internal(const hash_map_t * hm, const void * k) __attribute__((always_inline)); 524 | static __inline hash_map_elt_t * hash_map_find_internal(const hash_map_t * hm, const void * k) 525 | { 526 | const size_t elt_num = hash(hm, k); 527 | chain_elt_t * head = vector_elt(hm->tbl, elt_num); 528 | chain_elt_t * k_chain; 529 | 530 | if (!head) 531 | return NULL; 532 | 533 | k_chain = chain_search_key(hm, head, k); 534 | if (!k_chain) 535 | return NULL; 536 | 537 | return &k_chain->elt; 538 | } 539 | 540 | void * hash_map_find_val(const hash_map_t * hm, const void * k) 541 | { 542 | hash_map_elt_t * hme = hash_map_find_internal(hm, k); 543 | if (!hme) 544 | { 545 | return NULL; 546 | } 547 | return hme->val; 548 | } 549 | 550 | hash_map_elt_t * hash_map_find_eltp(const hash_map_t * hm, const void * k) 551 | { 552 | return hash_map_find_internal(hm, k); 553 | } 554 | 555 | hash_map_elt_t hash_map_find_elt(const hash_map_t * hm, const void * k) 556 | { 557 | hash_map_elt_t * hme = hash_map_find_internal(hm, k); 558 | if (!hme) 559 | { 560 | hash_map_elt_t not_found = { .key = NULL, .val = NULL }; 561 | return not_found; 562 | } 563 | return *hme; 564 | } 565 | 566 | 567 | // 568 | // Resizing 569 | 570 | size_t hash_map_bucket_count(const hash_map_t * hm) 571 | { 572 | return vector_size(hm->tbl); 573 | } 574 | 575 | int hash_map_resize(hash_map_t * hm, size_t n) 576 | { 577 | hash_map_t * new_hm; 578 | size_t i; 579 | n = next_size(n); 580 | 581 | // Avoid unnecessary work when there is no change in the number of buckets 582 | // and avoid making the hash table smaller than this implementation desires 583 | if (n == hash_map_bucket_count(hm)) 584 | return 1; 585 | 586 | // Possible speedup if we could use one: 587 | // http://sources.redhat.com/ml/guile/1998-10/msg00864.html 588 | 589 | // Create new hash table 590 | new_hm = hash_map_create_size_type(n, hm->auto_resize, hm->type); 591 | if (!new_hm) 592 | return -ENOMEM; 593 | 594 | // Rehash elements 595 | for (i=0; i < vector_size(hm->tbl); i++) 596 | { 597 | chain_elt_t * elt = vector_elt(hm->tbl, i); 598 | while (elt) 599 | { 600 | chain_elt_t * next_elt = elt->next; 601 | chain_elt_t * found = erase_chain_elt(hm, elt->elt.key); 602 | assert(found); (void) found; // we are rehashing; elt.key is in the source map 603 | insert_chain_elt(new_hm, elt); 604 | elt = next_elt; 605 | } 606 | } 607 | 608 | // Expire the old hash table and move in the new 609 | hash_map_clear(hm); 610 | vector_destroy(hm->tbl); 611 | hm->size = new_hm->size; 612 | hm->tbl = new_hm->tbl; 613 | #if HASH_MAP_TRACK_BUCKET_SIZES 614 | vector_destroy(hm->tbl_size); 615 | vector_destroy(hm->tbl_max_size); 616 | hm->tbl_size = new_hm->tbl_size; 617 | hm->tbl_max_size = new_hm->tbl_max_size; 618 | #endif 619 | free(new_hm); 620 | #if HASH_MAP_IT_MOD_DEBUG 621 | hm->version++; 622 | hm->loose_version++; 623 | #endif 624 | 625 | return 0; 626 | } 627 | 628 | #if HASH_MAP_TRACK_BUCKET_SIZES 629 | const vector_t * hash_map_max_sizes(const hash_map_t * hm) 630 | { 631 | return hm->tbl_max_size; 632 | } 633 | #endif 634 | 635 | // 636 | // Iteration (current) 637 | 638 | hash_map_it2_t hash_map_it2_create(hash_map_t * hm) 639 | { 640 | hash_map_it2_t it; 641 | 
size_t i; 642 | 643 | it.key = NULL; 644 | it.val = NULL; 645 | it.internal.hm = hm; 646 | it.internal.next_bucket = 0; 647 | it.internal.next_elt = NULL; 648 | #if HASH_MAP_IT_MOD_DEBUG 649 | it.internal.loose_version = hm->loose_version; 650 | #endif 651 | 652 | if (!hm) 653 | return it; 654 | 655 | // Find the first entry and store it as next 656 | for (i = 0; i < vector_size(hm->tbl); i++) 657 | { 658 | chain_elt_t * head = vector_elt(hm->tbl, i); 659 | if (head) 660 | { 661 | it.internal.next_bucket = i; 662 | it.internal.next_elt = head; 663 | break; 664 | } 665 | } 666 | 667 | return it; 668 | } 669 | 670 | bool hash_map_it2_next(hash_map_it2_t * it) 671 | { 672 | size_t i; 673 | 674 | #if HASH_MAP_IT_MOD_DEBUG 675 | assert(!it->internal.hm || it->internal.loose_version == it->internal.hm->loose_version); 676 | #endif 677 | 678 | if (!it->internal.next_elt) 679 | return 0; 680 | 681 | it->key = it->internal.next_elt->elt.key; 682 | it->val = it->internal.next_elt->elt.val; 683 | 684 | // If there are more elts in this chain, use the next elt 685 | if (it->internal.next_elt->next) 686 | { 687 | it->internal.next_elt = it->internal.next_elt->next; 688 | return 1; 689 | } 690 | 691 | // Find the next bucket with an elt 692 | for (i = it->internal.next_bucket + 1; i < vector_size(it->internal.hm->tbl); i++) 693 | { 694 | chain_elt_t * head = vector_elt(it->internal.hm->tbl, i); 695 | if (head) 696 | { 697 | it->internal.next_bucket = i; 698 | it->internal.next_elt = head; 699 | return 1; 700 | } 701 | } 702 | 703 | // The current entry is the last 704 | it->internal.next_elt = NULL; 705 | return 1; 706 | } 707 | 708 | 709 | // 710 | // Iteration (deprecated) 711 | 712 | void hash_map_it_init(hash_map_it_t * it, hash_map_t * hm) 713 | { 714 | it->hm = hm; 715 | it->bucket = 0; 716 | it->elt = NULL; 717 | #if HASH_MAP_IT_MOD_DEBUG 718 | it->version = hm->version; 719 | #endif 720 | } 721 | 722 | hash_map_elt_t hash_map_elt_next(hash_map_it_t * it) 723 | { 724 | hash_map_elt_t no_elt = { .key = NULL, .val = NULL }; 725 | chain_elt_t * head; 726 | size_t i; 727 | 728 | #if HASH_MAP_IT_MOD_DEBUG 729 | assert(it->version == it->hm->version); 730 | #endif 731 | 732 | if (!it->bucket && !it->elt) 733 | { 734 | // New iterator 735 | 736 | if (!it->hm) 737 | return no_elt; 738 | 739 | // Set it to the first elt 740 | for (i=0; i < vector_size(it->hm->tbl); i++) 741 | { 742 | head = vector_elt(it->hm->tbl, i); 743 | if (head) 744 | { 745 | it->bucket = i; 746 | it->elt = head; 747 | break; 748 | } 749 | } 750 | 751 | if (!it->elt) 752 | return no_elt; // no elts in the hash map 753 | return it->elt->elt; 754 | } 755 | 756 | // If there are more elts in this chain, return the next 757 | if (it->elt->next) 758 | { 759 | it->elt = it->elt->next; 760 | return it->elt->elt; 761 | } 762 | 763 | // Find the next bucket with an elt 764 | for (i=it->bucket+1; i < vector_size(it->hm->tbl); i++) 765 | { 766 | head = vector_elt(it->hm->tbl, i); 767 | if (head) 768 | { 769 | it->bucket = i; 770 | it->elt = head; 771 | return it->elt->elt; 772 | } 773 | } 774 | 775 | return no_elt; 776 | } 777 | 778 | void * hash_map_val_next(hash_map_it_t * it) 779 | { 780 | return hash_map_elt_next(it).val; 781 | } 782 | 783 | 784 | int hash_map_init(void) 785 | { 786 | return atexit(chain_elt_free_all); 787 | } 788 | -------------------------------------------------------------------------------- /hash_map.h: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. 
BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. */ 4 | 5 | #ifndef FSTITCH_INC_HASH_MAP_H 6 | #define FSTITCH_INC_HASH_MAP_H 7 | 8 | #include 9 | #include 10 | 11 | /* Set to check for illegal hash map modifications during iteration. 12 | * If hash map iteration code tries to deref bad pointers, try this. */ 13 | /* values: 0 (normal), 1 (debug) */ 14 | #ifndef NDEBUG 15 | # define HASH_MAP_IT_MOD_DEBUG 1 16 | #else 17 | # define HASH_MAP_IT_MOD_DEBUG 0 18 | #endif 19 | 20 | /* Set to track the number of elements in hash map buckets. */ 21 | /* values: 0 (normal), 1 (track) */ 22 | #define HASH_MAP_TRACK_BUCKET_SIZES 0 23 | 24 | typedef struct hash_map_elt hash_map_elt_t; 25 | typedef struct chain_elt chain_elt_t; 26 | typedef struct hash_map hash_map_t; 27 | 28 | struct hash_map_elt { 29 | void * key; 30 | void * val; 31 | }; 32 | 33 | struct chain_elt; 34 | 35 | struct hash_map; 36 | 37 | int hash_map_init(void); 38 | 39 | // Create a hash_map. 40 | hash_map_t * hash_map_create_ptr(void); 41 | hash_map_t * hash_map_create_str(void); 42 | // Create a hash_map, reserve space for n entries, allow/don't auto resizing. 43 | hash_map_t * hash_map_create_size_ptr(size_t n, bool auto_resize); 44 | hash_map_t * hash_map_create_size_str(size_t n, bool auto_resize); 45 | // Create a hash map that contains the same elements as hm 46 | hash_map_t * hash_map_copy(const hash_map_t * hm); 47 | // Destroy a hash_map, does not destroy keys or vals. 48 | void hash_map_destroy(hash_map_t * hm); 49 | 50 | // Return number of items in the hash_map. 51 | size_t hash_map_size(const hash_map_t * hm); 52 | // Return whether hash_map is empty. 53 | bool hash_map_empty(const hash_map_t * hm); 54 | // Insert the given key-val pair, updating k's v if k exists. 55 | // Returns 0 or 1 on success, or -ENOMEM. 56 | int hash_map_insert(hash_map_t * hm, void * k, void * v); 57 | // Remove the given key-val pair, does not destory key or val. 58 | // Returns k's value on success, NULL if k is not in the hash_map. 59 | void * hash_map_erase(hash_map_t * hm, const void * k); 60 | // Change the mapping from oldk->val to be newk->val. 61 | // Returns 0 on success, -EEXIST if newk exists, or -ENOENT if oldk does not exist. 62 | int hash_map_change_key(hash_map_t * hm, void * oldk, void * newk); 63 | // Remove all key-val pairs, does not destroy keys or vals. 64 | void hash_map_clear(hash_map_t * hm); 65 | // Return the val associated with k. 66 | void * hash_map_find_val(const hash_map_t * hm, const void * k); 67 | // Return the key and val associated with k. 68 | hash_map_elt_t hash_map_find_elt(const hash_map_t * hm, const void * k); 69 | // Return a pointer to the internal key and val associated with k. 70 | // Useful to expose the address of the internal hash_map_elt_t->val. 71 | // The value of key must not be changed through this pointer. 72 | // The returned pointer will become invalid upon erasure of this element. 73 | hash_map_elt_t * hash_map_find_eltp(const hash_map_t * hm, const void * k); 74 | 75 | // Return the number of buckets currently allocated. 76 | size_t hash_map_bucket_count(const hash_map_t * hm); 77 | // Resize the number of buckets to n. 78 | // Returns 0 on success, 1 on no resize needed, or -ENOMEM. 
79 | int hash_map_resize(hash_map_t * hm, size_t n); 80 | 81 | #if HASH_MAP_TRACK_BUCKET_SIZES 82 | struct vector; 83 | // Return a vector of each bucket's maximum size (vector elts are size_ts) 84 | const struct vector * hash_map_max_sizes(const hash_map_t * hm); 85 | #endif 86 | 87 | // Iteration (current) 88 | 89 | struct hash_map_it2 { 90 | void * key; // key of the current map entry 91 | void * val; // value of the current map entry 92 | struct { 93 | hash_map_t * hm; 94 | size_t next_bucket; 95 | chain_elt_t * next_elt; 96 | #if HASH_MAP_IT_MOD_DEBUG 97 | size_t loose_version; 98 | #endif 99 | } internal; 100 | }; 101 | typedef struct hash_map_it2 hash_map_it2_t; 102 | 103 | hash_map_it2_t hash_map_it2_create(hash_map_t * hm); 104 | // Iterate through the hash map values using it. 105 | // - Returns false once the end of the hash map is reached. 106 | // - Behavior is undefined if you begin iterating, then insert an element, 107 | // resize the map, or delete the next element, and then continue iterating 108 | // using the old iterator. (Define HASH_MAP_IT_MOD_DEBUG to detect some 109 | // cases.) 110 | bool hash_map_it2_next(hash_map_it2_t * it); 111 | 112 | 113 | // Iteration (deprecated) 114 | 115 | struct hash_map_it { 116 | hash_map_t * hm; 117 | size_t bucket; 118 | chain_elt_t * elt; 119 | #if HASH_MAP_IT_MOD_DEBUG 120 | size_t version; 121 | #endif 122 | }; 123 | typedef struct hash_map_it hash_map_it_t; 124 | 125 | void hash_map_it_init(hash_map_it_t * it, hash_map_t * hm); 126 | // Iterate through the hash map values using hm_it. 127 | // - Returns NULL when the end of the hash map is reached. 128 | // - Behavior is undefined if you begin iterating, modify hm, and then continue 129 | // iterating using the old hm_it. (Define HASH_MAP_IT_MOD_DEBUG to detect.) 130 | void * hash_map_val_next(hash_map_it_t * it); 131 | // Iterate through the hash map values using hm_it. 132 | // - key is NULL when the end of the hash map is reached. 133 | // - Behavior is undefined if you begin iterating, modify hm, and then continue 134 | // iterating using the old hm_it. (Define HASH_MAP_IT_MOD_DEBUG to detect.) 135 | hash_map_elt_t hash_map_elt_next(hash_map_it_t * it); 136 | 137 | #endif /* !FSTITCH_INC_HASH_MAP_H */ 138 | -------------------------------------------------------------------------------- /indirect_cow.c: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. */ 4 | 5 | #include "indirect_cow.h" 6 | #include "bpfs.h" 7 | #include "hash_map.h" 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #if INDIRECT_COW 16 | 17 | #define DEBUG 0 18 | #if DEBUG 19 | # define Dprintf(x...) fprintf(stderr, x) 20 | #else 21 | # define Dprintf(x...) do {} while(0) 22 | #endif 23 | 24 | 25 | struct block { 26 | uint64_t orig_blkno; // the block number of the block this block replaces 27 | uint64_t cow_blkno; // this block's block number 28 | char *dram; // new contents (in DRAM) 29 | bool required; // whether this block must be commited to DRAM 30 | struct block *parent; // block's parent. for integrity assertions. 
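	// The next four fields thread this block into its parent's child lists:
	// children_all/child_all_next link every child block visited under the
	// parent, while children_cow/child_cow_next link only the children that
	// have been CoWed.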
31 | struct block *children_all; // all children of this block 32 | struct block *child_all_next; // this block's entry in the child all list 33 | struct block *children_cow; // children of this block that are cowed 34 | struct block *child_cow_next; // this block's entry in the child cow list 35 | }; 36 | 37 | // super + max inode tree height + max file tree height (is +2 correct?): 38 | #define PARENT_STACK_SIZE (2 * BPFS_TREE_MAX_HEIGHT + 2) 39 | 40 | struct parent_stack { 41 | struct block *stack[PARENT_STACK_SIZE]; 42 | unsigned height; 43 | }; 44 | 45 | 46 | static bool indirect_cow_inited; 47 | static hash_map_t *blkno_map_orig; // orig block number -> struct block* 48 | static hash_map_t *blkno_map_cow; // cow block number -> struct block* 49 | static struct parent_stack parent_stack; 50 | 51 | 52 | static struct block* parent_get(void) 53 | { 54 | if (!parent_stack.height) 55 | return NULL; 56 | return parent_stack.stack[parent_stack.height - 1]; 57 | } 58 | 59 | static struct block* block_get_either(uint64_t blkno) 60 | { 61 | void *key = u64_ptr(blkno); 62 | struct block *block = hash_map_find_val(blkno_map_cow, key); 63 | if (block) 64 | return block; 65 | block = hash_map_find_val(blkno_map_orig, key); 66 | assert(!block || block->cow_blkno == BPFS_BLOCKNO_INVALID); 67 | return block; 68 | } 69 | 70 | static struct block* block_create(uint64_t orig_blkno, uint64_t cow_blkno) 71 | { 72 | struct block *parent = parent_get(); 73 | struct block *block = malloc(sizeof(*block)); 74 | assert(!block_get_either(orig_blkno) && !block_get_either(cow_blkno)); 75 | if (!block) 76 | return NULL; 77 | block->orig_blkno = orig_blkno; 78 | block->cow_blkno = cow_blkno; 79 | block->dram = NULL; 80 | block->required = false; 81 | block->parent = parent; 82 | block->children_all = NULL; 83 | block->children_cow = NULL; 84 | if (!parent) 85 | { 86 | assert(orig_blkno == BPFS_BLOCKNO_SUPER); 87 | block->child_all_next = NULL; 88 | block->child_cow_next = NULL; 89 | } 90 | else 91 | { 92 | assert(orig_blkno != BPFS_BLOCKNO_SUPER); 93 | assert(parent->orig_blkno != BPFS_BLOCKNO_SUPER 94 | || !parent->children_all); 95 | block->child_all_next = parent->children_all; 96 | parent->children_all = block; 97 | if (cow_blkno == BPFS_BLOCKNO_INVALID) 98 | block->child_cow_next = NULL; 99 | else 100 | { 101 | block->child_cow_next = parent->children_cow; 102 | parent->children_cow = block; 103 | } 104 | } 105 | return block; 106 | } 107 | 108 | 109 | int indirect_cow_init(void) 110 | { 111 | assert(!indirect_cow_inited); 112 | blkno_map_orig = hash_map_create_ptr(); 113 | if (!blkno_map_orig) 114 | return -ENOMEM; 115 | blkno_map_cow = hash_map_create_ptr(); 116 | if (!blkno_map_cow) 117 | { 118 | hash_map_destroy(blkno_map_orig); 119 | return -ENOMEM; 120 | } 121 | parent_stack.height = 0; 122 | indirect_cow_inited = true; 123 | return 0; 124 | } 125 | 126 | void indirect_cow_destroy(void) 127 | { 128 | struct block *super; 129 | assert(indirect_cow_inited); 130 | assert(hash_map_size(blkno_map_orig) == 1); 131 | assert(hash_map_size(blkno_map_cow) == 1); 132 | 133 | indirect_cow_inited = false; 134 | 135 | super = hash_map_find_val(blkno_map_orig, u64_ptr(BPFS_BLOCKNO_SUPER)); 136 | assert(super); 137 | (void) hash_map_erase(blkno_map_orig, u64_ptr(BPFS_BLOCKNO_SUPER)); 138 | (void) hash_map_erase(blkno_map_cow, u64_ptr(super->cow_blkno)); 139 | free(super->dram); 140 | free(super); 141 | 142 | hash_map_destroy(blkno_map_orig); 143 | hash_map_destroy(blkno_map_cow); 144 | } 145 | 146 | 147 | int 
indirect_cow_parent_push(uint64_t blkno) 148 | { 149 | struct block *block = block_get_either(blkno); 150 | Dprintf("%s(blkno = %" PRIu64 ")\n", __FUNCTION__, blkno); 151 | 152 | if (!block) 153 | { 154 | int r; 155 | block = block_create(blkno, BPFS_BLOCKNO_INVALID); 156 | if (!block) 157 | return -ENOMEM; 158 | r = hash_map_insert(blkno_map_orig, u64_ptr(blkno), block); 159 | xcall(r); // TODO: destroy block 160 | assert(!r); 161 | } 162 | else 163 | { 164 | #ifndef NDEBUG 165 | struct block *parent = parent_get(); 166 | if (!parent) 167 | { 168 | assert(block->orig_blkno == BPFS_BLOCKNO_SUPER); 169 | assert(!block->child_all_next); 170 | } 171 | else if (block->orig_blkno == BPFS_BLOCKNO_SUPER) 172 | { 173 | assert(!block->child_all_next); 174 | } 175 | else 176 | { 177 | struct block *sibling = parent->children_all; 178 | for (; sibling && block != sibling; 179 | sibling = sibling->child_all_next) ; 180 | assert(sibling); 181 | assert(block->parent == parent); 182 | } 183 | #endif 184 | } 185 | 186 | parent_stack.height++; 187 | xassert(parent_stack.height <= PARENT_STACK_SIZE); 188 | parent_stack.stack[parent_stack.height - 1] = block; 189 | return 0; 190 | } 191 | 192 | void indirect_cow_parent_pop(uint64_t blkno) 193 | { 194 | struct block *block; 195 | Dprintf("%s(blkno = %" PRIu64 ")\n", __FUNCTION__, blkno); 196 | assert(parent_stack.height); 197 | block = parent_stack.stack[parent_stack.height - 1]; 198 | assert(blkno == block->orig_blkno || blkno == block->cow_blkno); 199 | parent_stack.height--; 200 | } 201 | 202 | 203 | int indirect_cow_block_cow(uint64_t orig_blkno, uint64_t cow_blkno) 204 | { 205 | struct block *parent = parent_get(); 206 | struct block *block = hash_map_find_val(blkno_map_orig, 207 | u64_ptr(orig_blkno)); 208 | bool new_block = !block; 209 | void *dram_void; 210 | int r; 211 | Dprintf("%s(orig_blkno = %" PRIu64 ", cow_blkno = %" PRIu64 ")\n", 212 | __FUNCTION__, orig_blkno, cow_blkno); 213 | 214 | assert(orig_blkno != BPFS_BLOCKNO_INVALID); 215 | assert(cow_blkno != BPFS_BLOCKNO_INVALID); 216 | assert(cow_blkno != BPFS_BLOCKNO_SUPER); 217 | assert(orig_blkno != BPFS_BLOCKNO_SUPER_2); 218 | assert(cow_blkno != BPFS_BLOCKNO_SUPER_2); 219 | 220 | assert(!parent || parent->orig_blkno != BPFS_BLOCKNO_SUPER 221 | || !parent->children_cow); 222 | 223 | if (new_block) 224 | { 225 | block = block_create(orig_blkno, cow_blkno); 226 | if (!block) 227 | { 228 | r = -ENOMEM; 229 | goto abort; 230 | } 231 | } 232 | else 233 | { 234 | assert(block->cow_blkno == BPFS_BLOCKNO_INVALID); 235 | assert(!block->child_cow_next); 236 | block->cow_blkno = cow_blkno; 237 | assert(block->parent == parent); 238 | if (parent) 239 | { 240 | assert(block->orig_blkno != BPFS_BLOCKNO_SUPER); 241 | block->child_cow_next = parent->children_cow; 242 | parent->children_cow = block; 243 | } 244 | } 245 | 246 | r = posix_memalign(&dram_void, BPFS_BLOCK_SIZE, BPFS_BLOCK_SIZE); 247 | if (r) 248 | { 249 | assert(r == ENOMEM); 250 | r = -r; 251 | goto abort; 252 | } 253 | block->dram = dram_void; 254 | 255 | if (new_block) 256 | { 257 | r = hash_map_insert(blkno_map_orig, u64_ptr(orig_blkno), block); 258 | if (r < 0) 259 | goto abort; 260 | assert(!r); 261 | } 262 | 263 | r = hash_map_insert(blkno_map_cow, u64_ptr(cow_blkno), block); 264 | if (r < 0) 265 | goto abort; 266 | assert(!r); 267 | 268 | return 0; 269 | 270 | abort: 271 | free(block->dram); 272 | block->cow_blkno = BPFS_BLOCKNO_INVALID; 273 | (void) hash_map_erase(blkno_map_cow, u64_ptr(cow_blkno)); 274 | if (new_block) 275 | { 276 | 
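			// This call created the block; remove it from the orig map and
			// free it rather than leave a half-initialized entry behind.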
(void) hash_map_erase(blkno_map_orig, u64_ptr(orig_blkno)); 277 | free(block); 278 | } 279 | return r; 280 | } 281 | 282 | char* indirect_cow_block_get(uint64_t blkno) 283 | { 284 | struct block *block = hash_map_find_val(blkno_map_cow, u64_ptr(blkno)); 285 | if (!block) 286 | return NULL; 287 | return block->dram; 288 | } 289 | 290 | void indirect_cow_block_required(uint64_t blkno) 291 | { 292 | struct block *block = hash_map_find_val(blkno_map_cow, u64_ptr(blkno)); 293 | Dprintf("%s(blkno = %" PRIu64 ")\n", __FUNCTION__, blkno); 294 | if (block) 295 | block->required = true; 296 | else 297 | assert(block_freshly_alloced(blkno)); 298 | } 299 | 300 | void indirect_cow_block_direct(uint64_t blkno, unsigned off, unsigned size) 301 | { 302 | struct block *block = hash_map_find_val(blkno_map_cow, u64_ptr(blkno)); 303 | Dprintf("%s(blkno = %" PRIu64 ", off = %u, size = %u)\n", 304 | __FUNCTION__, blkno, off, size); 305 | 306 | assert(blkno != BPFS_BLOCKNO_INVALID); 307 | assert(off < BPFS_BLOCK_SIZE && size <= BPFS_BLOCK_SIZE); 308 | assert(off + size <= BPFS_BLOCK_SIZE); 309 | 310 | if (!block 311 | || block->orig_blkno == BPFS_BLOCKNO_INVALID 312 | || block->cow_blkno == BPFS_BLOCKNO_INVALID) 313 | return; 314 | 315 | memcpy(get_block(block->orig_blkno) + off, block->dram + off, size); 316 | } 317 | 318 | 319 | static bool cow_is_atomically_writable(const struct block *block, 320 | uint64_t *atomic_new, 321 | unsigned *atomic_off) 322 | { 323 | char *block_0 = get_block(block->orig_blkno); 324 | char *block_1 = block->dram; 325 | bool diff = false; 326 | unsigned off; 327 | 328 | assert(!!atomic_new == !!atomic_off); 329 | assert(block_0); 330 | assert(block_1); 331 | 332 | // BPFS_BLOCK_SIZE will indicate no difference 333 | if (atomic_off) 334 | *atomic_off = BPFS_BLOCK_SIZE; 335 | 336 | static_assert(ATOMIC_SIZE == 8); 337 | for (off = 0 ; off < BPFS_BLOCK_SIZE; off += ATOMIC_SIZE) 338 | { 339 | if (*(uint64_t*) (block_0 + off) != *(uint64_t*) (block_1 + off)) 340 | { 341 | if (diff) 342 | return false; 343 | 344 | diff = true; 345 | if (atomic_new) 346 | { 347 | *atomic_new = *(uint64_t*) (block_1 + off); 348 | *atomic_off = off; 349 | } 350 | } 351 | } 352 | 353 | return true; 354 | } 355 | 356 | void indirect_cow_commit(void) 357 | { 358 | struct block *super_block; 359 | struct block *notatomic_block; 360 | struct block *block; 361 | uint64_t atomic_blkno; 362 | uint64_t atomic_new; 363 | unsigned atomic_off; 364 | hash_map_it2_t it; 365 | char *block_bpram; 366 | 367 | Dprintf("%s()\n", __FUNCTION__); 368 | 369 | assert(!parent_stack.height); 370 | 371 | // Should contain at least the super block: 372 | assert(!hash_map_empty(blkno_map_cow)); 373 | 374 | if (hash_map_size(blkno_map_cow) == 1) 375 | { 376 | block = hash_map_find_val(blkno_map_orig, u64_ptr(BPFS_BLOCKNO_SUPER)); 377 | assert(block && block->cow_blkno != BPFS_BLOCKNO_INVALID); 378 | set_super(get_bpram_super()); 379 | (void) hash_map_erase(blkno_map_cow, u64_ptr(block->cow_blkno)); 380 | unfree_block(BPFS_BLOCKNO_SUPER); 381 | unalloc_block(block->cow_blkno); 382 | free(block->dram); 383 | block->dram = NULL; 384 | block->cow_blkno = BPFS_BLOCKNO_INVALID; 385 | 386 | it = hash_map_it2_create(blkno_map_orig); 387 | while (hash_map_it2_next(&it)) 388 | { 389 | block = it.val; 390 | assert(!block->dram && block->cow_blkno == BPFS_BLOCKNO_INVALID); 391 | free(block); 392 | } 393 | hash_map_clear(blkno_map_orig); 394 | return; 395 | } 396 | 397 | super_block = hash_map_find_val(blkno_map_orig, 398 | 
u64_ptr(BPFS_BLOCKNO_SUPER)); 399 | assert(super_block); 400 | assert(!super_block->required); 401 | assert(super_block->children_cow); 402 | assert(cow_is_atomically_writable(super_block, NULL, NULL)); 403 | 404 | // Find the highest block that is atomically writable 405 | block = super_block; 406 | while (1) 407 | { 408 | struct block *child = block->children_cow; 409 | assert(child || block->required); 410 | if (child) 411 | { 412 | assert(!child->child_cow_next); 413 | if (!cow_is_atomically_writable(child, NULL, NULL)) 414 | break; 415 | } 416 | if (block->required) 417 | break; 418 | block = child; 419 | } 420 | atomic_blkno = block->orig_blkno; 421 | xassert(cow_is_atomically_writable(block, &atomic_new, &atomic_off)); 422 | notatomic_block = block->children_cow; 423 | 424 | // Revert the parents of notatomic_block to their original state 425 | block = super_block; 426 | assert(block != notatomic_block); 427 | set_super(get_bpram_super()); 428 | while (block != notatomic_block) 429 | { 430 | struct block *cur = block; 431 | 432 | unalloc_block(block->cow_blkno); 433 | unfree_block(block->orig_blkno); 434 | 435 | (void) hash_map_erase(blkno_map_orig, u64_ptr(block->orig_blkno)); 436 | (void) hash_map_erase(blkno_map_cow, u64_ptr(block->cow_blkno)); 437 | free(block->dram); 438 | block = block->children_cow; 439 | free(cur); 440 | } 441 | 442 | // Copy CoW blocks to BPRAM 443 | it = hash_map_it2_create(blkno_map_cow); 444 | while (hash_map_it2_next(&it)) 445 | { 446 | block = it.val; 447 | 448 | (void) hash_map_erase(blkno_map_orig, u64_ptr(block->orig_blkno)); 449 | // Before the get_block() call so that get_block() gets bpram: 450 | (void) hash_map_erase(blkno_map_cow, u64_ptr(block->cow_blkno)); 451 | 452 | block_bpram = get_block(block->cow_blkno); 453 | memcpy(block_bpram, block->dram, BPFS_BLOCK_SIZE); 454 | 455 | free(block->dram); 456 | free(block); 457 | } 458 | 459 | // Free the blocks that were not CoWed 460 | it = hash_map_it2_create(blkno_map_orig); 461 | while (hash_map_it2_next(&it)) 462 | { 463 | block = it.val; 464 | 465 | assert(block->cow_blkno == BPFS_BLOCKNO_INVALID); 466 | assert(!block->dram); 467 | assert(!block->required); 468 | 469 | (void) hash_map_erase(blkno_map_orig, u64_ptr(block->orig_blkno)); 470 | free(block); 471 | } 472 | 473 | // Atomically commit 474 | // (There can be nothing to commit when all CoWs were unnecessary 475 | // or direct.) 
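// (Editor's note, inferred from the code above: the single 8-byte store
// below -- to the word in atomic_blkno that cow_is_atomically_writable()
// located -- is what publishes the CoWed subtree just copied into BPRAM,
// so readers switch from the old blocks to the new ones in one atomic write.)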
476 | if (atomic_off != BPFS_BLOCK_SIZE) 477 | { 478 | block_bpram = get_block(atomic_blkno); 479 | *(uint64_t*) (block_bpram + atomic_off) = atomic_new; 480 | } 481 | } 482 | 483 | void indirect_cow_abort(void) 484 | { 485 | hash_map_it2_t it; 486 | 487 | Dprintf("%s()\n", __FUNCTION__); 488 | 489 | assert(!parent_stack.height); 490 | 491 | set_super(get_bpram_super()); 492 | 493 | it = hash_map_it2_create(blkno_map_orig); 494 | while (hash_map_it2_next(&it)) 495 | { 496 | struct block *block = it.val; 497 | if (block->cow_blkno) 498 | { 499 | unalloc_block(block->cow_blkno); 500 | unfree_block(block->orig_blkno); 501 | 502 | (void) hash_map_erase(blkno_map_cow, u64_ptr(block->cow_blkno)); 503 | free(block->dram); 504 | } 505 | 506 | (void) hash_map_erase(blkno_map_orig, u64_ptr(block->orig_blkno)); 507 | free(block); 508 | } 509 | } 510 | 511 | 512 | uint64_t get_super_blockno(void) 513 | { 514 | struct block *super = hash_map_find_val(blkno_map_orig, 515 | u64_ptr(BPFS_BLOCKNO_SUPER)); 516 | if (!super || super->cow_blkno == BPFS_BLOCKNO_INVALID) 517 | return BPFS_BLOCKNO_SUPER; 518 | return super->cow_blkno; 519 | } 520 | 521 | #else 522 | 523 | int indirect_cow_init(void) 524 | { 525 | return 0; 526 | } 527 | void indirect_cow_destroy(void) 528 | { 529 | } 530 | 531 | int indirect_cow_parent_push(uint64_t blkno) 532 | { 533 | return 0; 534 | } 535 | void indirect_cow_parent_pop(uint64_t blkno) 536 | { 537 | } 538 | 539 | int indirect_cow_block_cow(uint64_t orig_blkno, uint64_t cow_blkno) 540 | { 541 | return 0; 542 | } 543 | char* indirect_cow_block_get(uint64_t blkno) 544 | { 545 | return NULL; 546 | } 547 | void indirect_cow_block_required(uint64_t blkno) 548 | { 549 | } 550 | void indirect_cow_block_direct(uint64_t blkno, unsigned off, unsigned size) 551 | { 552 | } 553 | 554 | void indirect_cow_commit(void) 555 | { 556 | } 557 | void indirect_cow_abort(void) 558 | { 559 | } 560 | 561 | uint64_t get_super_blockno(void) 562 | { 563 | return BPFS_BLOCKNO_SUPER; 564 | } 565 | 566 | #endif 567 | -------------------------------------------------------------------------------- /indirect_cow.h: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. */ 4 | 5 | #ifndef INDIRECT_COW_H 6 | #define INDIRECT_COW_H 7 | 8 | #include 9 | 10 | int indirect_cow_init(void); 11 | void indirect_cow_destroy(void); 12 | 13 | // Push when a block pointer is followed. Pop on the return. 14 | int indirect_cow_parent_push(uint64_t blkno); 15 | void indirect_cow_parent_pop(uint64_t blkno); 16 | 17 | int indirect_cow_block_cow(uint64_t orig_blkno, uint64_t cow_blkno); 18 | char* indirect_cow_block_get(uint64_t blkno); 19 | void indirect_cow_block_required(uint64_t blkno); 20 | // Write the changes in this region immediately if blkno is CoWed 21 | void indirect_cow_block_direct(uint64_t blkno, unsigned off, unsigned size); 22 | 23 | void indirect_cow_commit(void); 24 | void indirect_cow_abort(void); 25 | 26 | uint64_t get_super_blockno(void); 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /mkbpfs.c: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. 
It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. */ 4 | 5 | #include "mkbpfs.h" 6 | #include "bpfs.h" 7 | #include "bpfs_structs.h" 8 | #include "util.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #define BPFS_MIN_NBLOCKS 7 16 | 17 | // Appease users of bitmap_scan_t: 18 | // The number of blocks must be a multiple of this number: 19 | #define NBLOCKS_MODULUS (sizeof(bitmap_scan_t) * 8) 20 | // The initial number of inode blocks: 21 | #define INODES_NBLOCKS \ 22 | CMAX(1, ROUNDUP64(sizeof(bitmap_scan_t) * 8, BPFS_INODES_PER_BLOCK) \ 23 | / BPFS_INODES_PER_BLOCK) 24 | 25 | /* As of commit max(commits of this comment), mkbpfs() allocates these blocks: 26 | * 1: super 27 | * 2: super2 28 | * 3: inode root 29 | * 4: ir.indirect 30 | * 5: ir.data[0] 31 | * 6: ir.data[1] 32 | * 7: "/".data[0] 33 | */ 34 | 35 | static char* mk_get_block(char *bpram, struct bpfs_super *super, uint64_t no) 36 | { 37 | assert(no != BPFS_BLOCKNO_INVALID); 38 | assert(no <= super->nblocks); 39 | assert(no <= BPFS_MIN_NBLOCKS); 40 | static_assert(BPFS_BLOCKNO_INVALID == 0); 41 | return bpram + (no - 1) * BPFS_BLOCK_SIZE; 42 | } 43 | 44 | static uint64_t mk_alloc_block(struct bpfs_super *super) 45 | { 46 | static uint64_t next_blockno = BPFS_BLOCKNO_FIRST_ALLOC - 1; 47 | static_assert(BPFS_BLOCKNO_INVALID == 0); 48 | assert(next_blockno < super->nblocks); 49 | assert(next_blockno < BPFS_MIN_NBLOCKS); 50 | return (next_blockno++) + 1; 51 | } 52 | 53 | #define MK_GET_BLOCK(blockno) mk_get_block(bpram, super, blockno) 54 | 55 | int mkbpfs(char *bpram, size_t bpram_size) 56 | { 57 | struct bpfs_super *super; 58 | struct bpfs_super *super_2; 59 | struct bpfs_tree_root *inodes_root; 60 | struct bpfs_indir_block *inodes_indir; 61 | struct bpfs_inode *inodes; 62 | struct bpfs_inode *root_inode; 63 | struct bpfs_dirent *root_dirent; 64 | int i; 65 | 66 | if (bpram_size < BPFS_MIN_NBLOCKS * BPFS_BLOCK_SIZE) 67 | return -ENOSPC; 68 | if (bpram_size < NBLOCKS_MODULUS * BPFS_BLOCK_SIZE) 69 | return -ENOSPC; 70 | 71 | super = (struct bpfs_super*) bpram; 72 | super->version = BPFS_STRUCT_VERSION; 73 | static_assert(sizeof(uuid_t) == sizeof(super->uuid)); 74 | uuid_generate(super->uuid); 75 | super->nblocks = ROUNDDOWN64(bpram_size / BPFS_BLOCK_SIZE, NBLOCKS_MODULUS); 76 | super->inode_root_addr = mk_alloc_block(super); 77 | super->inode_root_addr_2 = super->inode_root_addr; // not required for SCSP 78 | super->commit_mode = BPFS_COMMIT_SCSP; 79 | super->ephemeral_valid = 1; 80 | memset(super->pad, 0, sizeof(super->pad)); 81 | 82 | if (super->nblocks > BPFS_TREE_ROOT_MAX_ADDR + 1) 83 | { 84 | // This simplifies block allocation: limiting nblocks to 85 | // BPFS_TREE_ROOT_MAX_ADDR means allocation code doesn't have 86 | // to ensure that tree root block numbers do not exceed this limit. 
87 | fprintf(stderr, "%s: Limiting file system to %" PRIu64 " blocks (%" 88 | PRIu64 " are available)\n", __FUNCTION__, 89 | BPFS_TREE_ROOT_MAX_ADDR, super->nblocks); 90 | super->nblocks = BPFS_TREE_ROOT_MAX_ADDR + 1; 91 | } 92 | 93 | super_2 = super + 1; 94 | *super_2 = *super; // not required for SCSP 95 | 96 | inodes_root = (struct bpfs_tree_root*) MK_GET_BLOCK(super->inode_root_addr); 97 | static_assert(INODES_NBLOCKS <= BPFS_BLOCKNOS_PER_INDIR); 98 | inodes_root->ha.height = 1; 99 | inodes_root->ha.addr = mk_alloc_block(super); 100 | inodes_root->nbytes = INODES_NBLOCKS * BPFS_BLOCK_SIZE; 101 | 102 | inodes_indir = (struct bpfs_indir_block*) MK_GET_BLOCK(inodes_root->ha.addr); 103 | 104 | for (i = 0; i < INODES_NBLOCKS; i++) 105 | { 106 | #if APPEASE_VALGRIND || DETECT_ZEROLINKS_WITH_LINKS 107 | int j; 108 | #endif 109 | inodes_indir->addr[i] = mk_alloc_block(super); 110 | inodes = (struct bpfs_inode*) MK_GET_BLOCK(inodes_indir->addr[i]); 111 | 112 | #if APPEASE_VALGRIND || DETECT_ZEROLINKS_WITH_LINKS 113 | for (j = 0; j + sizeof(struct bpfs_inode) <= BPFS_BLOCK_SIZE; j += sizeof(struct bpfs_inode)) 114 | { 115 | # if APPEASE_VALGRIND 116 | // init the generation field. not required, but appeases valgrind. 117 | inodes[j].generation = 0; 118 | # endif 119 | # if DETECT_ZEROLINKS_WITH_LINKS 120 | inodes[j].nlinks = 0; 121 | # endif 122 | } 123 | #endif 124 | } 125 | 126 | inodes = (struct bpfs_inode*) MK_GET_BLOCK(inodes_indir->addr[0]); 127 | 128 | root_inode = &inodes[0]; 129 | root_inode->generation = 1; 130 | root_inode->mode = BPFS_S_IFDIR; 131 | root_inode->mode |= BPFS_S_IRUSR | BPFS_S_IWUSR | BPFS_S_IXUSR | BPFS_S_IRGRP | BPFS_S_IWGRP | BPFS_S_IXGRP | BPFS_S_IROTH | BPFS_S_IXOTH; 132 | root_inode->uid = 0; 133 | root_inode->gid = 0; 134 | root_inode->nlinks = 2; 135 | root_inode->flags = 0; 136 | root_inode->root.ha.height = 0; 137 | root_inode->root.ha.addr = mk_alloc_block(super); 138 | root_inode->root.nbytes = BPFS_BLOCK_SIZE; 139 | root_inode->mtime = root_inode->ctime = root_inode->atime = BPFS_TIME_NOW(); 140 | memset(root_inode->pad, 0, sizeof(root_inode->pad)); 141 | 142 | root_dirent = (struct bpfs_dirent*) MK_GET_BLOCK(root_inode->root.ha.addr); 143 | root_dirent->rec_len = 0; 144 | 145 | super->magic = BPFS_FS_MAGIC; 146 | super_2->magic = BPFS_FS_MAGIC; 147 | 148 | return 0; 149 | } 150 | -------------------------------------------------------------------------------- /mkbpfs.h: -------------------------------------------------------------------------------- 1 | #ifndef MKBPFS_H 2 | #define MKBPFS_H 3 | 4 | #include 5 | 6 | int mkbpfs(char *bpram, size_t bpram_size); 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /mkfs.bpfs.c: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. 
*/ 4 | 5 | #include "mkbpfs.h" 6 | #include "util.h" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | int main(int argc, char **argv) 19 | { 20 | char *bpram_name; 21 | int bpram_fd; 22 | struct stat stbuf; 23 | char *bpram; 24 | size_t bpram_size; 25 | 26 | if (argc != 2) 27 | { 28 | fprintf(stderr, "Usage: %s \n", argv[0]); 29 | exit(1); 30 | } 31 | 32 | bpram_name = argv[1]; 33 | 34 | bpram_fd = xsyscall(open(bpram_name, O_RDWR)); 35 | 36 | xsyscall(fstat(bpram_fd, &stbuf)); 37 | bpram_size = stbuf.st_size; 38 | xassert(bpram_size == stbuf.st_size); 39 | 40 | bpram = mmap(NULL, bpram_size, PROT_READ | PROT_WRITE, MAP_SHARED, bpram_fd, 0); 41 | xassert(bpram); 42 | 43 | xcall(mkbpfs(bpram, bpram_size)); 44 | 45 | xsyscall(msync(bpram, bpram_size, MS_SYNC)); 46 | xsyscall(munmap(bpram, bpram_size)); 47 | xsyscall(close(bpram_fd)); 48 | 49 | return 0; 50 | } 51 | -------------------------------------------------------------------------------- /pool.h: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. */ 4 | 5 | #ifndef FSTITCH_LIB_POOL_H 6 | #define FSTITCH_LIB_POOL_H 7 | 8 | #include 9 | 10 | // Set to 1 to use malloc() and free() instead of pools. Useful for debugging. 11 | #define POOL_MALLOC 0 12 | 13 | #if !POOL_MALLOC 14 | 15 | #define PAGE_SIZE 4096 16 | 17 | #define POOLSIZE(type) ((int) ((PAGE_SIZE - sizeof(void*)) / sizeof(type))) 18 | 19 | #define unlikely(x) __builtin_expect(!!(x), 1) 20 | 21 | // Create a pool, allocator, and deallocators for 'type'. 22 | // API: type* name_alloc(), name_free(type*), name_free_all(). 
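// For example, DECLARE_POOL(foo, struct foo) (with 'foo' a placeholder name)
// defines foo_alloc(), foo_free(), and foo_free_all(); freed elements are
// threaded onto a per-type free list and reused by later foo_alloc() calls.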
23 | #define DECLARE_POOL(name, type) \ 24 | struct name##_pool { \ 25 | struct name##_pool * next; \ 26 | type elts[POOLSIZE(type)]; \ 27 | }; \ 28 | static type * name##_free_list; \ 29 | static struct name##_pool * name##_free_pool; \ 30 | \ 31 | static type * alloc_##name##_pool(void) \ 32 | { \ 33 | struct name##_pool * pool; \ 34 | int i; \ 35 | if(!(pool = malloc(sizeof(*pool)))) \ 36 | return NULL; \ 37 | pool->next = name##_free_pool; \ 38 | name##_free_pool = pool; \ 39 | for(i = 1; i < POOLSIZE(type); i++) \ 40 | * ((type **) &pool->elts[i]) = &pool->elts[i-1]; \ 41 | * ((type **) &pool->elts[0]) = name##_free_list; \ 42 | name##_free_list = &pool->elts[POOLSIZE(type) - 1]; \ 43 | return name##_free_list; \ 44 | } \ 45 | static __inline type * name##_alloc(void) __attribute__((always_inline)); \ 46 | static __inline type * name##_alloc(void) \ 47 | { \ 48 | type * p; \ 49 | if(unlikely(!name##_free_list)) \ 50 | if(unlikely(!alloc_##name##_pool())) \ 51 | return NULL; \ 52 | p = name##_free_list; \ 53 | name##_free_list = * ((type **) p); \ 54 | return p; \ 55 | } \ 56 | static __inline void name##_free(type * p) __attribute__((always_inline)); \ 57 | static __inline void name##_free(type * p) \ 58 | { \ 59 | * ((type **) p) = name##_free_list; \ 60 | name##_free_list = p; \ 61 | } \ 62 | static void name##_free_all(void) \ 63 | { \ 64 | struct name##_pool * pool; \ 65 | while((pool = name##_free_pool)) \ 66 | { \ 67 | name##_free_pool = pool->next; \ 68 | free(pool); \ 69 | } \ 70 | } 71 | 72 | #else 73 | 74 | # define DECLARE_POOL(name, type) \ 75 | static type * name##_alloc(void) { return malloc(sizeof(type)); } \ 76 | static void name##_free(type * p) { free(p); } \ 77 | static void name##_free_all(void) { } 78 | 79 | #endif 80 | 81 | #endif 82 | -------------------------------------------------------------------------------- /pwrite.c: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. */ 4 | 5 | #define _XOPEN_SOURCE 500 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | int main(int argc, char **argv) 17 | { 18 | char *filename = argv[1], *str = argv[2], *offset = argv[3]; 19 | int fd, r; 20 | 21 | if (argc != 4) 22 | { 23 | fprintf(stderr, "Overwrite a range of bytes in a file.\n"); 24 | fprintf(stderr, "Usage: %s \n", argv[0]); 25 | return 1; 26 | } 27 | 28 | fd = open(filename, O_RDWR); 29 | assert(fd >= 0); 30 | 31 | r = pwrite(fd, str, strlen(str), atoi(offset)); 32 | assert(r == strlen(str)); 33 | 34 | r = close(fd); 35 | assert(r >= 0); 36 | 37 | return 0; 38 | } 39 | -------------------------------------------------------------------------------- /todo: -------------------------------------------------------------------------------- 1 | - could make truncate(expand) write fewer bytes by adding an inode field 2 | 'valid', where valid <= nbytes 3 | - introduce offset typedefs: blockno_t, pgno_t, byteno_t, nbytes_t, ? 
4 | typedef uint64_t blkno_t; // BPRAM block number 5 | typedef uint64_t blkidx_t; // block index into a file 6 | typedef uint64_t byteno_t; // byte number 7 | - valgrind: http://valgrind.org/docs/manual/manual-core-adv.html 8 | 9 | * benchmarks 10 | - postmark 11 | - untar 12 | - rm 13 | - something involving rename 14 | - small, large appends, overwrites 15 | - create large file 16 | - well-respected benchmark (IOzone, FileBench (FSL port?)) 17 | 18 | * measurements 19 | - compare SCSP to SP. and minimal? and ext2,ext3,ext4,btrfs? 20 | - bytes written 21 | - measure actual writes 22 | - what code contributes how much to this measurement? 23 | - must the controller write an entire cache line? if so, measure this? 24 | - measure #bytes that change in file system image 25 | - best, worst, and "expected" results 26 | - number of epoch barriers? 27 | - correctness 28 | - detect consistency 29 | - detect syscall atomicity 30 | - detect if file system does what it is supposed to (eg renames the file) 31 | 32 | * limits 33 | - SCSP write() can be atomic, but is it guaranteed? eg failed allocation part 34 | way through a large write? in general, how does abort work? 35 | perhaps there are two types of failures: 36 | - one write makes all the preceeding writes visible 37 | - easy to solve: log frees/allocs and reverse on failure 38 | - more than one live write (eg cmtime with write or rename) 39 | - is this ok for most apps? can we make it better? eg guarantee 40 | that if data is modified, the mtime was updated? or vice-versa. 41 | - can I turn these into single commits without much overhead? 42 | 43 | * long term notes 44 | - code seems too complicated. maybe how to commit is tied too closely to other? 45 | - when SCSP has to COW, would it ever be helpful to wait on committing the change for a later write that would have to re-COW a shared set of blocks? 46 | - SP mode would do this 47 | - can SCSP work with one crawl down and then back up? 48 | 49 | * unimplemented write optimizations 50 | - make truncate(expand) write fewer bytes by adding an inode field 'valid' 51 | where valid <= nbytes 52 | - could not CoW unused inodes and dirent regions 53 | - do not CoW the entire dirent block(s) for rename (skip the ino field(s)) 54 | - do not CoW unused and to-be-overwritten dirent entries for rename 55 | - do not CoW unused and to-be-overwritten inodes for rename 56 | - only CoW indir block portions that will not be overwritten 57 | - changing the height separately from the root addr is needless for append 58 | 59 | * near term notes 60 | - convince self that current code is correct. 61 | - correctness test: snapshot ram before op, during op, and after: during should "match" before or after 62 | - want to snapshot during not during a machine instruction? 63 | - maybe track which bytes/pages change and only compare them for speed? 64 | - work with large file systems, too? (or, woozle has 8GB ram) 65 | - issue: syscalls are not "atomic". inode file grows in size, freed entries are modified, timestamp updates. 66 | - perhaps track in pin 67 | - implement readdir() that works when called multiple times and contents change 68 | - 64bit bpfs can create inos larger than 32b fuse can store. probably don't fix, just keep this in mind. 69 | - fixme: commit_abort() does not abort entire syscall in SCSP 70 | - do not allocate the first directory block when creating a directory? 71 | - consider kernel_cache and *_timeout 72 | - replace dcache bits for find_dirent() with persistent hashes in dirs? 
73 | 74 | * current work 75 | - WC vs. WT for memory throughput benchmark 76 | - explain bytes written differences 77 | - move time updates to before modifications? 78 | - optimize cows that are made (ie cow and then overwrite some blocks) 79 | - add benchmarks and measurement tools 80 | - measure layout choices? e.g., to not store "..", no resource bitmaps, ... 81 | - postmark: run with smaller size of ram to bytes written ratio? 82 | - postmark: make runs deterministic 83 | - bonnie++: not deterministic, but variance seems <0.1% 84 | - build_apache: which app(s) make the small overwrites? could they be improved? 85 | - more macro/real-world benchmarks that involve durability 86 | - how should we evaluate disk sync costs? #bytes written. 87 | - mail delivery? (or, postfix suffices?) 88 | - dbt2 89 | - microbenchmark(s) for each syscall? 90 | - add macrobenchmarks: a good/respected one 91 | - truncate separate from unlink? write into a hole? 92 | - expand microbenchmark repetition to ~fill ext[34] journals? 93 | - use impressions? 94 | - sort of feel that it is too much of a toy/not under realistic workloads. 95 | real workloads/hardware may make the performance benefits inconsequential 96 | or negative. 97 | - is it ok to optimize only writes? (not reads or other aspects) 98 | - optimize runtime to make testing easier? to know it won't be a bottleneck? 99 | crawl_indir, crawl_tree_ref, crawl_inode 100 | - should I prototype ENOSPC recovery? 101 | - consider journaling 102 | - could expand atomic writes to 16B: gcc -mcx16 and CMPXCHG16B 103 | 104 | * possibly useful 105 | - add more code documentation? (function definitions?) and/or clean up. 106 | - make code fast in time, too, for ucsd? 107 | -------------------------------------------------------------------------------- /util.h: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. */ 4 | 5 | #ifndef UTIL_H 6 | #define UTIL_H 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | // if cond is false, display message and then exit 17 | #define xassert(cond) \ 18 | do { \ 19 | if (!(cond)) \ 20 | { \ 21 | fprintf(stderr, "Not true, but should be: %s\n", # cond); \ 22 | assert(0); \ 23 | exit(1); \ 24 | } \ 25 | } while (0) 26 | 27 | // if syscall exp call fails, display message and errno and then exit 28 | #define xsyscall(call, format...) \ 29 | ({ \ 30 | int err = call; \ 31 | if (err < 0) \ 32 | { \ 33 | int err = errno; \ 34 | fprintf(stderr, "%s: %s\n", # call, strerror(err)); \ 35 | assert(0); \ 36 | exit(1); \ 37 | } \ 38 | err; \ 39 | }) 40 | 41 | // if function call exp call returns < 0, display message and value and 42 | // then exit 43 | #define xcall(call, format...) \ 44 | ({ \ 45 | int err = call; \ 46 | if (err < 0) \ 47 | { \ 48 | fprintf(stderr, "%s: %s\n", # call, strerror(-err)); \ 49 | assert(0); \ 50 | exit(1); \ 51 | } \ 52 | err; \ 53 | }) 54 | 55 | #define UNUSED(x) do { (void) x; } while(0) 56 | 57 | // static_assert(x) will generate a compile-time error if 'x' is false. 58 | #define static_assert(x) switch (x) case 0: case (x): 59 | 60 | // Efficient min and max operations 61 | #define MIN(_a, _b) \ 62 | ({ \ 63 | typeof(_a) __a = (_a); \ 64 | typeof(_b) __b = (_b); \ 65 | __a <= __b ? 
__a : __b; \ 66 | }) 67 | #define MAX(_a, _b) \ 68 | ({ \ 69 | typeof(_a) __a = (_a); \ 70 | typeof(_b) __b = (_b); \ 71 | __a >= __b ? __a : __b; \ 72 | }) 73 | #define MAXU64(_a, _b) \ 74 | ({ \ 75 | uint64_t __a = (_a); \ 76 | uint64_t __b = (_b); \ 77 | __a >= __b ? __a : __b; \ 78 | }) 79 | 80 | // Max operation that propagates constant expressions as constants 81 | #define CMAX(_a, _b) ((_a) >= (_b) ? (_a) : (_b)) 82 | 83 | // 64-bit integer rounding; only works for n = power of two 84 | // NOTE: ROUNDUP64() may eval n twice. This macro does not create a variable 85 | // on the stack to avoid this because it prevents gcc from being able 86 | // to evaluate the resulting expression at compile time. 87 | #define ROUNDUP64(a, n) (((uint64_t) (a) + n - 1) & ~(n - 1)) 88 | #define ROUNDDOWN64(a, n) (((uint64_t) (a)) & ~((n) - 1)) 89 | 90 | #define container_of(ptr, type, member) \ 91 | ({ \ 92 | typeof(((type *) 0)->member) * __mptr = (ptr); \ 93 | (type *) (uintptr_t) ((const char *) __mptr - offsetof(type, member)); \ 94 | }) 95 | 96 | // Reinterpret a uint64_t as a void* and perform an equality check if 97 | // these types are of different sizes 98 | static __inline 99 | void* u64_ptr(uint64_t x) __attribute__((always_inline)); 100 | 101 | static __inline 102 | void* u64_ptr(uint64_t u) 103 | { 104 | void *p = (void*) u; 105 | if (sizeof(p) < sizeof(u)) 106 | xassert(((uint64_t) p) == u); 107 | return p; 108 | } 109 | 110 | #define NBLOCKS_FOR_NBYTES(nbytes) \ 111 | (((nbytes) + BPFS_BLOCK_SIZE - 1) / BPFS_BLOCK_SIZE) 112 | 113 | #define BPFS_TIME_NOW() \ 114 | ({ struct bpfs_time btime = {time(NULL)}; btime; }) 115 | 116 | typedef uint64_t bitmap_scan_t; 117 | 118 | #endif 119 | -------------------------------------------------------------------------------- /vector.c: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. */ 4 | 5 | #include "vector.h" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | static void ** vector_create_elts(size_t n); 13 | static void vector_destroy_elts(vector_t * v); 14 | static int vector_grow(vector_t * v); 15 | 16 | # define INIT_CAPACITY 10 17 | 18 | #define smalloc(n) malloc(n) 19 | #define sfree(v, n) free(v) 20 | 21 | // 22 | // Construction/destruction 23 | 24 | vector_t * vector_create(void) 25 | { 26 | // Create a vector with no elements, but with a capacity. 
27 | 28 | vector_t * v = vector_create_size(INIT_CAPACITY); 29 | if (!v) 30 | return NULL; 31 | 32 | v->size = 0; 33 | return v; 34 | } 35 | 36 | vector_t * vector_create_size(size_t n) 37 | { 38 | vector_t * v = malloc(sizeof(*v)); 39 | if (!v) 40 | return NULL; 41 | 42 | v->size = n; 43 | v->elts = vector_create_elts(n); 44 | if (!v->elts) 45 | { 46 | free(v); 47 | return NULL; 48 | } 49 | memset(v->elts, 0, n*sizeof(v->elts)); 50 | v->capacity = n; 51 | 52 | return v; 53 | } 54 | 55 | void vector_destroy(vector_t * v) 56 | { 57 | vector_destroy_elts(v); 58 | free(v); 59 | } 60 | 61 | static void ** vector_create_elts(size_t n) 62 | { 63 | void ** elts = smalloc(n*sizeof(*elts)); 64 | return elts; 65 | } 66 | 67 | static void vector_destroy_elts(vector_t * v) 68 | { 69 | sfree(v->elts, v->capacity*sizeof(*v->elts)); 70 | v->elts = NULL; 71 | v->size = 0; 72 | v->capacity = 0; 73 | } 74 | 75 | 76 | // 77 | // General 78 | 79 | // vector_size() inlined 80 | 81 | // vector_empty() inlined 82 | 83 | int vector_push_back(vector_t * v, void * elt) 84 | { 85 | int r; 86 | if (v->size == v->capacity) 87 | { 88 | if ((r = vector_grow(v)) < 0) 89 | return r; 90 | } 91 | 92 | v->elts[v->size++] = elt; 93 | return 0; 94 | } 95 | 96 | int vector_push_back_vector(vector_t * v, const vector_t * v2) 97 | { 98 | size_t v2_size = vector_size(v2); 99 | size_t i; 100 | int r; 101 | 102 | r = vector_reserve(v, vector_size(v) + v2_size); 103 | if (r < 0) 104 | return r; 105 | 106 | for (i=0; i < v2_size; i++) 107 | { 108 | r = vector_push_back(v, vector_elt((vector_t *) v2, i)); 109 | assert(r >= 0); // no error since space is pre-allocated 110 | } 111 | 112 | return 0; 113 | } 114 | 115 | // vector_pop_back() inlined 116 | 117 | void vector_erase(vector_t * v, size_t i) 118 | { 119 | for (; i+1 < v->size; i++) 120 | v->elts[i] = v->elts[i+1]; 121 | v->size--; 122 | } 123 | 124 | void vector_clear(vector_t * v) 125 | { 126 | v->size = 0; 127 | } 128 | 129 | #ifndef __KERNEL__ 130 | void vector_sort(vector_t *v, int (*compar)(const void *a, const void *b)) 131 | { 132 | if (v->size < 2) return; 133 | qsort(v->elts, v->size, sizeof(void*), compar); 134 | } 135 | #endif 136 | 137 | 138 | // 139 | // Element access 140 | 141 | // vector_elt() inlined 142 | 143 | // vector_elt_front() inlined 144 | 145 | // vector_elt_end() inlined 146 | 147 | bool vector_contains(vector_t * v, void * elt) 148 | { 149 | size_t i; 150 | for (i = 0; i < v->size; i++) 151 | if(v->elts[i] == elt) 152 | return 1; 153 | return 0; 154 | } 155 | 156 | // 157 | // Growing/shrinking 158 | 159 | size_t vector_capacity(const vector_t * v) 160 | { 161 | return v->capacity; 162 | } 163 | 164 | int vector_reserve(vector_t * v, size_t n) 165 | { 166 | size_t i; 167 | const size_t n_elts = v->size; 168 | void ** elts; 169 | 170 | if (n <= v->capacity) 171 | return 1; 172 | 173 | elts = vector_create_elts(n); 174 | if (!elts) 175 | return -ENOMEM; 176 | 177 | for (i=0; i < n_elts; i++) 178 | elts[i] = v->elts[i]; 179 | 180 | vector_destroy_elts(v); 181 | v->elts = elts; 182 | v->size = n_elts; 183 | v->capacity = n; 184 | 185 | return 0; 186 | } 187 | 188 | static int vector_grow(vector_t * v) 189 | { 190 | return vector_reserve(v, 2*v->capacity); 191 | } 192 | -------------------------------------------------------------------------------- /vector.h: -------------------------------------------------------------------------------- 1 | /* This file is part of BPFS. BPFS is copyright 2009-2010 The Regents of the 2 | * University of California. 
It is distributed under the terms of version 2 3 | * of the GNU GPL. See the file LICENSE for details. */ 4 | 5 | #ifndef FSTITCH_INC_VECTOR_H 6 | #define FSTITCH_INC_VECTOR_H 7 | 8 | #include 9 | #include 10 | 11 | struct vector { 12 | size_t size; 13 | size_t capacity; 14 | void ** elts; 15 | }; 16 | typedef struct vector vector_t; 17 | 18 | // Create a vector. 19 | vector_t * vector_create(void); 20 | // Create a vector of size n. 21 | vector_t * vector_create_size(size_t n); 22 | // Destroy the vector, does not destroy elts. 23 | void vector_destroy(vector_t * v); 24 | 25 | // Returns number of elts in the vector. 26 | static __inline 27 | size_t vector_size(const vector_t * v) __attribute__((always_inline)); 28 | // Returns whether the vector is empty. 29 | static __inline 30 | bool vector_empty(const vector_t * v) __attribute__((always_inline)); 31 | // Push elt onto the back of the vector, growing if necessary. 32 | // Returns 0 on success, or -ENOMEM. 33 | int vector_push_back(vector_t * v, void * elt); 34 | // Push vector onto the back, growing if necessary. 35 | // Returns 0 on success, or -ENOMEM. 36 | int vector_push_back_vector(vector_t * v, const vector_t * v2); 37 | // Remove the last elt in the vector, does not destroy elt. 38 | static __inline 39 | void vector_pop_back(vector_t * v) __attribute__((always_inline)); 40 | // Remove the given elt at position i, does not destroy elt. 41 | void vector_erase(vector_t * v, size_t i); 42 | // Remove all elts in the vector, does not destroy elts. 43 | void vector_clear(vector_t * v); 44 | 45 | #ifndef __KERNEL__ 46 | // Sort the vector in ascending order. compar should return a value 47 | // less than, equal to, or greater than zero if 'a' is less than, 48 | // equal to, or greater than 'b', respectively. 49 | void vector_sort(vector_t *v, int (*compar)(const void *a, const void *b)); 50 | #endif 51 | 52 | // Return the elt at position i. 53 | static __inline 54 | void * vector_elt(const vector_t * v, size_t i) __attribute__((always_inline)); 55 | // Set the elt at position i. 56 | static __inline 57 | void vector_elt_set(vector_t * v, size_t i, void * elt) __attribute__((always_inline)); 58 | // Return the first elt. 59 | static __inline 60 | void * vector_elt_front(vector_t * v) __attribute__((always_inline)); 61 | // Return the last elt. 62 | static __inline 63 | void * vector_elt_end(vector_t * v) __attribute__((always_inline)); 64 | // Returns whether the vector contains the specified element. 65 | bool vector_contains(vector_t * v, void * elt); 66 | 67 | // Return the current capacity of the vector. 68 | size_t vector_capacity(const vector_t * v); 69 | // Ensure room for n elts is reserved in the vector. 70 | // Returns 0 on success, or -ENOMEM. 
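// (Note: in the vector.c implementation above, vector_reserve() returns 1
// rather than 0 when the existing capacity already satisfies the request.)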
71 | int vector_reserve(vector_t * v, size_t n); 72 | 73 | 74 | // 75 | // Implementations of inline functions 76 | 77 | static __inline 78 | size_t vector_size(const vector_t * v) 79 | { 80 | return v->size; 81 | } 82 | 83 | static __inline 84 | bool vector_empty(const vector_t * v) 85 | { 86 | return (v->size == 0); 87 | } 88 | 89 | static __inline 90 | void vector_pop_back(vector_t * v) 91 | { 92 | if (v->size == 0) 93 | return; 94 | v->size--; 95 | } 96 | 97 | static __inline 98 | void * vector_elt(const vector_t * v, size_t i) 99 | { 100 | return v->elts[i]; 101 | } 102 | 103 | static __inline 104 | void vector_elt_set(vector_t * v, size_t i, void * elt) 105 | { 106 | v->elts[i] = elt; 107 | } 108 | 109 | static __inline 110 | void * vector_elt_front(vector_t * v) 111 | { 112 | return v->elts[0]; 113 | } 114 | 115 | static __inline 116 | void * vector_elt_end(vector_t * v) 117 | { 118 | return v->elts[v->size - 1]; 119 | } 120 | 121 | #endif /* !FSTITCH_INC_VECTOR_H */ 122 | --------------------------------------------------------------------------------
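A minimal usage sketch for the vector_t API above, compiled together with vector.c; the file name and the integer-as-pointer payloads are illustrative only, not part of the repository:

#include "vector.h"

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Push three small integers (stored directly in the void* slots),
// read them back, and release the vector.
int main(void)
{
	vector_t *v = vector_create();
	uintptr_t i;
	assert(v);
	for (i = 1; i <= 3; i++)
		assert(vector_push_back(v, (void*) i) == 0);
	for (i = 0; i < vector_size(v); i++)
		printf("elt %zu = %zu\n", (size_t) i, (size_t) (uintptr_t) vector_elt(v, i));
	vector_destroy(v);
	return 0;
}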