├── .gitignore ├── Jamfile ├── Jamroot.jam ├── LICENSE ├── README.md ├── benchmarks ├── Jamfile ├── block_device_test.cpp └── run_benchmark.py ├── docs ├── bittorrent_filesystem.dia ├── bittorrent_filesystem.png ├── btfs.rst ├── file_access.png ├── inode_allocation.dia ├── inode_allocation.png ├── makefile ├── ordered_allocation.png ├── partial_cache_stripe.dia ├── partial_cache_stripe.png ├── piece_size_cdf.png ├── piece_sizes.dat ├── render.gnuplot ├── sequential_allocation.png ├── simulate_sparse.py ├── sizes.dat ├── sizes.gnuplot ├── sizes_cdf.dat ├── sparse_access.png ├── stylesheet └── torrent_size_cdf.png ├── src ├── block_affinity.cpp ├── block_affinity.hpp ├── block_allocator.cpp ├── block_allocator.hpp ├── block_device.cpp ├── block_device.hpp ├── btfs.cpp ├── pool_allocator.cpp └── pool_allocator.hpp └── test ├── Jamfile ├── render.gnuplot └── test_block_allocator.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | 19 | # Compiled Static libraries 20 | *.lai 21 | *.la 22 | *.a 23 | *.lib 24 | 25 | # Executables 26 | *.exe 27 | *.out 28 | *.app 29 | -------------------------------------------------------------------------------- /Jamfile: -------------------------------------------------------------------------------- 1 | import modules ; 2 | import feature : feature ; 3 | 4 | BOOST_ROOT = [ modules.peek : BOOST_ROOT ] ; 5 | 6 | use-project /torrent : ../libtorrent ; 7 | 8 | if $(BOOST_ROOT) 9 | { 10 | use-project /boost : $(BOOST_ROOT) ; 11 | } 12 | 13 | feature disk-log : off on : composite propagated ; 14 | feature.compose <disk-log>on : <define>DISK_ACCESS_LOG ; 15 | 16 | lib dl : : <link>shared <name>dl ; 17 | 18 | SOURCES = 19 | block_device 20 | block_allocator 21 | block_affinity 22 | pool_allocator
23 | ; 24 | 25 | lib torrentfs 26 | 27 | : # sources 28 | src/$(SOURCES).cpp 29 | 30 | : # requirements 31 | <target-os>linux:<library>dl 32 | <threading>multi 33 | <library>/torrent//torrent/<link>static/<boost-link>static 34 | <include>$(BOOST_ROOT) 35 | <link>shared:<define>TORRENT_BUILDING_SHARED 36 | 37 | : # default build 38 | <link>static 39 | 40 | : # usage requirements 41 | <include>src 42 | <library>/torrent//torrent/<link>static/<boost-link>static 43 | <include>$(BOOST_ROOT) 44 | <link>shared:<define>TORRENT_LINKING_SHARED 45 | 46 | ; 47 | 48 | exe btfs : src/btfs.cpp : <library>torrentfs/<link>static : <link>static ; 49 | 50 | explicit stage ; 51 | install stage : btfs : <location>. ; 52 | 53 | -------------------------------------------------------------------------------- /Jamroot.jam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvidn/btfs/75aefe7c9603abd327fb23cd35548542163cdfa0/Jamroot.jam -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too.
21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. 
The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 
91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 
122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 
155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 
186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. 
This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | {one line to give the program's name and a brief idea of what it does.} 635 | Copyright (C) {year} {name of author} 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <http://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | {project} Copyright (C) {year} {fullname} 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <http://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <http://www.gnu.org/philosophy/why-not-lgpl.html>. 675 | 676 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | btfs 2 | ==== 3 | 4 | a user space file system for efficiently downloading and seeding torrents 5 | -------------------------------------------------------------------------------- /benchmarks/Jamfile: -------------------------------------------------------------------------------- 1 | use-project /torrentfs : .. ; 2 | 3 | exe block_device_test : block_device_test.cpp : /torrentfs//torrentfs/on/static ; 4 | 5 | explicit stage ; 6 | install stage : block_device_test : . 
; 7 | 8 | -------------------------------------------------------------------------------- /benchmarks/block_device_test.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (c) 2014, Arvid Norberg 4 | All rights reserved. 5 | 6 | btfs Source Code is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | For details, see LICENSE 12 | 13 | */ 14 | 15 | #include "libtorrent/session.hpp" 16 | #include "libtorrent/torrent_handle.hpp" 17 | #include "libtorrent/file.hpp" 18 | #include "libtorrent/alert_types.hpp" 19 | 20 | #include "block_device.hpp" 21 | #include "block_affinity.hpp" 22 | 23 | #include 24 | #include 25 | 26 | using namespace libtorrent; 27 | 28 | int main(int argc, char *const argv[]) 29 | { 30 | if (argc > 3 || argc < 2) 31 | { 32 | fprintf(stderr, "usage: block_device_test listen-port [device-path]\n"); 33 | return 1; 34 | } 35 | 36 | // we want this to destruct after the session, so the 37 | // pointer must be declared before ses 38 | boost::shared_ptr dev; 39 | 40 | int port = atoi(argv[1]); 41 | session ses(fingerprint("LT", 0, 1, 0, 0) 42 | , std::make_pair(port, port + 1)); 43 | ses.set_alert_mask(~alert::progress_notification); 44 | settings_pack s; 45 | high_performance_seed(s); 46 | s.set_int(settings_pack::cache_size, 131072); // 2 GB 47 | s.set_int(settings_pack::listen_queue_size, 500); 48 | s.set_int(settings_pack::alert_queue_size, 50000); 49 | // s.set_bool(settings_pack::contiguous_recv_buffer, false); 50 | s.set_bool(settings_pack::contiguous_recv_buffer, true); 51 | s.set_bool(settings_pack::allow_multiple_connections_per_ip, true); 52 | 53 | if (argc > 2) 54 | { 55 | char const* device_path = argv[2]; 56 | 57 | error_code ec; 58 | dev.reset(new block_device(device_path, ec)); 59 | if (ec) 
60 | { 61 | fprintf(stderr, "FATAL: failed to open device \"%s\": (%d) %s\n" 62 | , device_path, ec.value(), ec.message().c_str()); 63 | return 1; 64 | } 65 | dev->read_root_block(ec); 66 | if (ec) 67 | { 68 | fprintf(stderr, "FATAL: failed read device \"%s\": (%d) %s\n" 69 | , device_path, ec.value(), ec.message().c_str()); 70 | return 1; 71 | } 72 | 73 | // with too many threads, we risk getting our writes out of order 74 | // breaking sequentiality 75 | s.set_int(settings_pack::aio_threads, 4); 76 | } 77 | 78 | ses.add_extension(boost::bind(&block_affinity, _1, 4 * 1024 * 1024)); 79 | 80 | // these two settings have a significant impact on performance 81 | // it would be interesting to run multiple tests on a regular 82 | // filesystem with varying cache line sizes 83 | s.set_bool(settings_pack::allow_partial_disk_writes, false); 84 | 85 | // try to flush 4 MB at a time to the disk 86 | s.set_int(settings_pack::write_cache_line_size, 256); 87 | 88 | ses.apply_settings(s); 89 | 90 | // (filename, torrent_info) 91 | std::vector > > test_torrents; 92 | // (handle, cmd line for connection_tester) 93 | std::map handles; 94 | 95 | error_code ec; 96 | std::string path("test_torrents"); 97 | for (directory dir(path, ec); !ec && !dir.done(); dir.next(ec)) 98 | { 99 | if (extension(dir.file()) != ".torrent") continue; 100 | std::string file_path = combine_path(path, dir.file()); 101 | error_code tec; 102 | boost::shared_ptr ti(new torrent_info(file_path, tec)); 103 | 104 | // assume the file isn't fully written yet. 105 | if (tec) 106 | { 107 | fprintf(stderr, "error loading \"%s\": %s\n", file_path.c_str(), tec.message().c_str()); 108 | continue; 109 | } 110 | 111 | test_torrents.push_back(std::make_pair(file_path, ti)); 112 | } 113 | 114 | std::deque alert_queue; 115 | time_t last_added = 0; 116 | do 117 | { 118 | // space out adding new torrents by 2 second 119 | // TODO: it should really be spaced out by number of bytes downloaded... 
120 | if (!test_torrents.empty() && handles.size() < 10 && time(NULL) - 1 > last_added) 121 | { 122 | add_torrent_params p; 123 | p.flags = add_torrent_params::flag_update_subscribe | add_torrent_params::flag_pinned; 124 | p.save_path = "torrent_storage"; 125 | p.ti = test_torrents.back().second; 126 | if (dev) 127 | p.storage = boost::bind(&block_device_storage_constructor, dev, _1); 128 | std::string path = test_torrents.back().first; 129 | printf("adding \"%s\"\n", path.c_str()); 130 | test_torrents.pop_back(); 131 | torrent_handle h = ses.add_torrent(p); 132 | char cmd_buf[200]; 133 | snprintf(cmd_buf, sizeof(cmd_buf), "connection_tester upload -c 10 -d 127.0.0.1 -p %d -t %s >logs/tester_%s.log 2>1 &" 134 | , port, path.c_str(), filename(path).c_str()); 135 | handles.insert(std::make_pair(h, std::string(cmd_buf))); 136 | last_added = time(NULL); 137 | } 138 | 139 | usleep(100000); 140 | 141 | ses.pop_alerts(&alert_queue); 142 | for (std::deque::iterator i = alert_queue.begin() 143 | , end(alert_queue.end()); i != end; ++i) 144 | { 145 | std::auto_ptr a(*i); 146 | // printf(" %s\n", a->message().c_str()); 147 | 148 | torrent_deleted_alert* td = alert_cast(a.get()); 149 | torrent_delete_failed_alert* tdf = alert_cast(a.get()); 150 | 151 | if (td || tdf) 152 | { 153 | torrent_alert* tf = (torrent_alert*)td; 154 | if (tf == NULL) tf = (torrent_alert*)tdf; 155 | 156 | std::map::iterator hi = handles.find(tf->handle); 157 | if (hi == handles.end()) 158 | { 159 | // delete the first invalid handle we can find instead 160 | for (std::map::iterator i = handles.begin() 161 | , end(handles.end()); i != end; ++i) 162 | { 163 | if (i->first.is_valid()) continue; 164 | hi = i; 165 | break; 166 | } 167 | } 168 | 169 | handles.erase(hi); 170 | 171 | printf("still running: "); 172 | for (std::map::iterator i = handles.begin() 173 | , end(handles.end()); i != end; ++i) 174 | { 175 | int str_start = i->second.find("test_torrents/"); 176 | if (str_start == std::string::npos) 
str_start = 0; 177 | else str_start += 14; 178 | 179 | int str_end = i->second.find(" ", str_start); 180 | 181 | printf("\"%s\" ", i->second.substr(str_start, str_end - str_start).c_str()); 182 | } 183 | printf("\n"); 184 | } 185 | else if (torrent_finished_alert* tf = alert_cast(a.get())) 186 | { 187 | std::map::iterator hi = handles.find(tf->handle); 188 | if (hi == handles.end()) continue; 189 | printf("completed: \"%s\"\n", tf->handle.name().c_str()); 190 | ses.remove_torrent(tf->handle, session::delete_files); 191 | } 192 | else if (state_changed_alert* sc = alert_cast(a.get())) 193 | { 194 | if (sc->prev_state == torrent_status::checking_resume_data) 195 | { 196 | std::map::iterator hi = handles.find(sc->handle); 197 | printf("running: \"%s\"\n", hi->second.c_str()); 198 | system(hi->second.c_str()); 199 | } 200 | } 201 | else if (torrent_error_alert* ea = alert_cast(a.get())) 202 | { 203 | printf("ERROR: \"%s\": %s (%s)\n", ea->handle.name().c_str(), ea->error.message().c_str(), ea->error_file.c_str()); 204 | } 205 | } 206 | alert_queue.clear(); 207 | // printf("running: %d (%d)\n", int(handles.size()), handles.empty()); 208 | } 209 | while (!handles.empty() || !test_torrents.empty()); 210 | } 211 | 212 | -------------------------------------------------------------------------------- /benchmarks/run_benchmark.py: -------------------------------------------------------------------------------- 1 | # this test requires connection_tester (from libtorrent/examples) to be installed 2 | # and available in $PATH, as well as parse_access_log (from libtorrent/tools). 3 | 4 | # btfs from the parent directory needs to be built and available at ../btfs 5 | # (this script attempts to do that) 6 | # the test binary itself, block_device_test, should be copied from the build 7 | # directory to the same directory this script lives in, benchmarks. 
8 | 9 | # right now the path to the block device where the custom filesystem is 10 | # initialized and tested is hard coded in this script. 11 | 12 | import os 13 | import time 14 | import shutil 15 | import sys 16 | 17 | num_torrents = 25 18 | 19 | port = (int(time.time()) % 50000) + 2000 20 | 21 | try: os.mkdir('test_torrents') 22 | except: pass 23 | 24 | # build the test 25 | ret = os.system('b2 boost=source link=static debug-symbols=on release -j4 stage') 26 | if ret != 0: 27 | print 'ERROR: building block_device_test failed: %d' % ret 28 | sys.exit(1) 29 | 30 | # build the btfs tool 31 | ret = os.system('(cd .. && b2 boost=source link=static -j4 stage)') 32 | if ret != 0: 33 | print 'ERROR: building block_device_test failed: %d' % ret 34 | sys.exit(1) 35 | 36 | if not os.path.exists('torrent_storage.img'): 37 | ret = os.system('dd if=/dev/zero count=20971520 of=torrent_storage.img') 38 | if ret != 0: 39 | print 'ERROR: dd failed: %d' % ret 40 | sys.exit(1) 41 | 42 | ret = os.system('../btfs initialize 1048576 torrent_storage.img') 43 | if ret != 0: 44 | print 'ERROR: btfs failed: %d' % ret 45 | sys.exit(1) 46 | 47 | try: os.mkdir('logs') 48 | except: pass 49 | 50 | for i in range(num_torrents): 51 | if os.path.exists('test_torrents/%d.torrent' % i): continue 52 | ret = os.system('connection_tester gen-torrent -s 1000 -n %d -t test_torrents/%d.torrent' % (i+1, i)) 53 | if ret != 0: 54 | print 'ERROR: connection_tester failed: %d' % ret 55 | sys.exit(1) 56 | 57 | try: shutil.rmtree('torrent_storage') 58 | except: pass 59 | 60 | try: shutil.rmtree('session_stats') 61 | except: pass 62 | try: shutil.rmtree('session_stats_btfs') 63 | except: pass 64 | try: shutil.rmtree('session_stats_ext4') 65 | except: pass 66 | 67 | start = time.time(); 68 | #cmd = 'gdb --args ./block_device_test %d torrent_storage.img' % port 69 | cmd = './block_device_test %d torrent_storage.img' % port 70 | print cmd 71 | ret = os.system(cmd) 72 | 73 | if ret != 0: 74 | print 'ERROR: 
./block_device_test failed: %d' % ret 75 | sys.exit(1) 76 | 77 | end = time.time(); 78 | 79 | print 'runtime (custom filesystem): %d seconds' % (end - start) 80 | 81 | os.rename('session_stats', 'session_stats_btfs') 82 | os.system('python ../../libtorrent/tools/parse_session_stats.py session_stats_btfs/*.log') 83 | try: os.rename('session_stats_report', 'session_stats_report_btfs') 84 | except: pass 85 | ret = os.system('parse_access_log block_device_access.log') 86 | if ret != 0: 87 | print 'ERROR: parse_access_log failed: %d' % ret 88 | sys.exit(1) 89 | 90 | start = time.time(); 91 | #cmd = 'gdb --args ./block_device_test %d' % port 92 | cmd = './block_device_test %d' % port 93 | print cmd 94 | ret = os.system(cmd) 95 | if ret != 0: 96 | print 'ERROR: ./block_device_test failed: %d' % ret 97 | sys.exit(1) 98 | 99 | end = time.time(); 100 | 101 | print 'runtime (regular filesystem): %d seconds' % (end - start) 102 | 103 | os.rename('session_stats', 'session_stats_ext4') 104 | os.system('python ../../libtorrent/tools/parse_session_stats.py session_stats_ext4/*.log') 105 | try: os.rename('session_stats_report', 'session_stats_report_ext4') 106 | except: pass 107 | 108 | -------------------------------------------------------------------------------- /docs/bittorrent_filesystem.dia: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvidn/btfs/75aefe7c9603abd327fb23cd35548542163cdfa0/docs/bittorrent_filesystem.dia -------------------------------------------------------------------------------- /docs/bittorrent_filesystem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvidn/btfs/75aefe7c9603abd327fb23cd35548542163cdfa0/docs/bittorrent_filesystem.png -------------------------------------------------------------------------------- /docs/btfs.rst: -------------------------------------------------------------------------------- 1 | 
================================================================ 2 | DRAFT: Improving BitTorrent performance with a custom filesystem 3 | ================================================================ 4 | 5 | :author: Arvid Norberg, arvid@rasterbar.com 6 | :date: August 2012 7 | :abstract: One of the main bottlenecks when downloading and seeding 8 | content over bittorrent is accessing the disk. This paper explores 9 | the option to bypass traditional filesystems and use a block device 10 | as storage for torrents, in order to improve download performance. 11 | 12 | .. raw:: pdf 13 | 14 | PageBreak twoColumn 15 | SetPageCounter 1 16 | 17 | .. header:: 18 | 19 | .. class:: center 20 | 21 | ###Page### 22 | 23 | .. section-numbering:: 24 | 25 | 26 | background 27 | ---------- 28 | 29 | BitTorrent downloads conceptually divide up content into *pieces*, 30 | which are downloaded in *rarest-first* order. 31 | 32 | The size of a piece is determined by the creator of a torrent, but 33 | is typically a power of 2 in the range 16 kiB - 4 MiB. 34 | 35 | The availability of pieces is kept as part of the downloader state, 36 | that is, the number of peers that have each piece. This availability 37 | is used to prefer downloading pieces that are (locally) rare. Among 38 | pieces with equal availability, pieces are downloaded at random. 39 | 40 | From the disk's point of view, pieces are written in random order, 41 | as well as read in random order (when other peers request pieces). 42 | 43 | disk access characteristics 44 | ........................... 45 | 46 | The typical mode of disk access for bittorrent is random access at 47 | piece-level and essentially sequential access within pieces. The size 48 | of pieces varies with each torrent. However, typical piece sizes 49 | are 4 MiB or smaller. 
50 | 51 | A sample of about 700k torrents yields the following cumulative piece 52 | size distribution:: 53 | 54 | 16 kiB: 0.4 % 55 | 32 kiB: 2.6 % 56 | 64 kiB: 13.1 % 57 | 128 kiB: 25.8 % 58 | 256 kiB: 49.7 % 59 | 512 kiB: 79.4 % 60 | 1024 kiB: 91.3 % 61 | 2048 kiB: 95.9 % 62 | 4096 kiB: 99.7 % 63 | 8192 kiB: 99.8 % 64 | 16384 kiB: 100.0 % 65 | 66 | .. figure:: piece_size_cdf.png 67 | :alt: typical piece sizes 68 | :scale: 150% 69 | 70 | piece size distribution 71 | 72 | A reasonable upper limit of an expected piece size in the wild is 73 | clearly 4 MiB. 74 | 75 | Both downloading and uploading with bittorrent cause random disk 76 | access at the piece level. For this reason, there is no advantage 77 | for bittorrent to keep files contiguous on disk; reading will be 78 | just as random anyway. 79 | 80 | block alignment 81 | ............... 82 | 83 | When dealing with *multi-file torrents*, torrents that download to multiple 84 | files rather than a single one, chances are that a significant number 85 | of blocks on disk are not aligned with pieces. 86 | 87 | For example, if a torrent contains one small text file as the first 88 | file, and then a multi gigabyte video file, all blocks in the video 89 | file are shifted and the bittorrent pieces won't line up with the blocks 90 | on disk. That is unless the size of the text file is exactly divisible by 512 bytes 91 | (the typical block size of hard drives). Writing unaligned blocks to disk 92 | is not optimal, since the operating system or filesystem cache needs to 93 | read the edge blocks from disk, in order to overlay the beginning and 94 | the end of the block. Typically the whole memory buffer would have to 95 | be copied into an aligned buffer as well, before the driver can DMA it 96 | over to the disk. 97 | 98 | There are a few solutions to this problem. The simplest one is to craft 99 | torrent files in such a way that the majority of pieces are aligned with 100 | disk blocks. 
This can be done by moving the largest file to the 101 | front of the file list, and also by inserting dummy pad-files in between, 102 | to make large files start on a block or piece boundary. 103 | 104 | The client downloading or seeding the torrent can also simply store all 105 | files concatenated together in a single large file on disk. 106 | 107 | torrent size 108 | ............ 109 | 110 | Looking at the torrent size distribution on the same sample of torrents: 111 | 112 | A few observations from this data: half of all torrents are smaller 113 | than 300 MiB, and 98.36 % of all torrents are smaller than 5 GiB. 114 | 115 | .. figure:: torrent_size_cdf.png 116 | :alt: torrent sizes CDF 117 | :scale: 150% 118 | 119 | torrent size CDF 120 | 121 | The maximum torrent size in the sample set is 730.6 GiB. One of the largest torrents 122 | spotted in the wild was 746.70 GiB [#tf_records]_. 123 | 124 | .. [#tf_records] http://torrentfreak.com/5-torrent-files-that-broke-mind-boggling-records-101107/ 125 | 126 | .. raw: pdf 127 | 128 | FrameBreak 50 129 | 130 | filesystem 131 | ---------- 132 | 133 | This section describes the design and rationale for the new filesystem used 134 | to store torrent files. 135 | 136 | properties 137 | .......... 138 | 139 | Based on the characteristics of the disk access of torrent files, a 140 | bittorrent centric filesystem would likely perform well with the following 141 | properties: 142 | 143 | * files are always assumed to be fully fragmented in chunks of 4 MiB. 144 | There is no gain in allocating more than 4 MiB data blocks contiguously, 145 | since they're read out-of-order anyway. 146 | * instead of splitting multi-file torrents into actual files, treat 147 | an entire torrent as a single file on disk in order to keep all bittorrent 148 | pieces aligned to disk blocks. 149 | * torrents of at least 1 PiB must be supported, and probably more than that 150 | for head room. 151 | 152 | 153 | overview 154 | ........ 
155 | 156 | Hard drives and other storage mediums that filesystems run on typically have a 157 | *block device* abstraction layer in common. This layer makes the raw storage 158 | of a device available as an array of blocks. A block is the minimum transfer 159 | unit to and from the device. The block size for typical drives is 512 Bytes. 160 | 161 | On top of the block device, filesystems typically impose their own, larger, 162 | block size. The filesystem block size must be a multiple of the device block 163 | size and ends up being in the order of a few kilobytes. 164 | 165 | Locations on the disk are typically addressed by the filesystem block number. 166 | 167 | When the filesystem needs to find a filesystem block to store data in, it uses 168 | a block allocator. The block allocator is a bitfield and a cursor to the last 169 | allocated block. To allocate a block, the bitfield is scanned forward, from 170 | the cursor, until a free block is found. 171 | 172 | When a block is freed, its bit is simply cleared in the bitfield. 173 | 174 | The same allocation strategy is used for data blocks as for metadata blocks. 175 | This makes the metadata scattered across the whole disk. 176 | 177 | Filesystem metadata is the supporting structure around the user data, to 178 | associate it with the correct file and location. Filesystem metadata is 179 | described in more detail below. 180 | 181 | .. figure:: bittorrent_filesystem.png 182 | :alt: filesystem overview 183 | 184 | Overview of filesystem structure. 185 | 186 | root block 187 | .......... 188 | 189 | The root block of the filesystem is the entry point of everything stored on 190 | the disk. Therefore, the root block is stored in a well defined location, 191 | which is block 0. That is, the first 4 MiB of the device is occupied by 192 | the root of the filesystem. 193 | 194 | Contrary to typical filesystems, there is no need for a directory structure. 
195 | Since the filesystem only stores torrents, every file already has an identifier 196 | guaranteed to be unique: its info-hash. 197 | 198 | The main portion of the root block therefore contains a flat list of all files 199 | stored in the filesystem. Apart from that, it contains an identifying header, 200 | to be able to verify that the drive actually contains the expected filesystem, 201 | and the filesystem block size. The block size would normally be 4 MiB, but 202 | could be other powers of 2. 203 | 204 | The file list is not ordered in any way on disk; it is expected to be 205 | organized in an efficient data structure in memory. This implementation 206 | stores files in-memory in a hash table, with the info-hash as the key. 207 | 208 | block map 209 | ......... 210 | 211 | Typical filesystems map file blocks to filesystem blocks by arrays of *extents*, 212 | or block runs. An extent is essentially a start and an end block reference. This 213 | approach makes sense when block sizes are small and there is an affinity to 214 | keep files as contiguous as possible (i.e. less fragmented). 215 | 216 | Files are expected to be fully fragmented at 4 MiB boundaries (because of the 217 | random access nature of BitTorrent), which makes a simple block map more space 218 | efficient than extents. The first entry maps the first 4 MiB of the file to the 219 | filesystem block number holding that data, and so on. 220 | 221 | Files are also expected to be sparse. That means block entries must be able to 222 | indicate that the block has not been allocated on disk yet. 223 | 224 | Each entry in the block map is 4 bytes. The size of the block map determines 225 | the maximum size of the file it can hold. A map for a 1 GiB file requires 226 | 1024 Bytes worth of block map (with a 4 MiB block size). A map for a 1 PiB 227 | file requires 1 MiB worth of block map. 228 | 229 | i-nodes 230 | ....... 
231 | 232 | Information nodes, commonly called *i-nodes*, are a filesystem's file 233 | entries. The i-nodes in this filesystem can be very simple. An i-node essentially 234 | just contains: 235 | 236 | * an i-node identifier (for consistency checking). 237 | * the info-hash of the torrent it holds. 238 | * a reference to where it is stored on disk (for convenience and consistency 239 | checking). 240 | * a block map, mapping blocks in the file address space to blocks in the 241 | filesystem address space. 242 | 243 | The i-nodes need to support torrents that are more than 1 PiB in size. Assuming 244 | a 4 MiB filesystem block size, the block map needs to fit 268,435,456 block 245 | mappings. A mapping is simply a ``uint32_t`` referring to the filesystem block 246 | at that index. Since each slot is 4 bytes, that means the i-node would have to 247 | be 1 MiB. 248 | 249 | Since most torrents are 4.5 GiB or smaller, it would be wasteful to allocate a 250 | full Megabyte for every i-node. Apart from wasting disk space, storing i-node 251 | headers 1 MiB apart on disk makes reading them all less efficient, since the 252 | read head needs to move farther and spends less time reading. 253 | 254 | The typical solution to this problem in proper filesystems is to have indirect 255 | blocks. That is, the i-node has an extra block reference to another filesystem 256 | block, which only contains a block map. 257 | 258 | For the purpose of this filesystem, that solution has two drawbacks. 259 | 260 | 1. It complicates the layout on disk. 261 | 2. It makes it more expensive to read the block-map, since it would 262 | be disconnected from the i-node block. 263 | 264 | For these reasons, it seems like a good idea to allow i-nodes to have different 265 | sizes. 266 | 267 | The i-nodes are allocated using a slab allocator, with 13 slabs. The sizes are 268 | 1 kiB, 2 kiB, 4 kiB, 8 kiB, 16 kiB, 32 kiB, 64 kiB, 128 kiB, 256 kiB, 512 kiB, 269 | 1 MiB, 2 MiB and 4 MiB. 
270 | 271 | Each slab, in turn, allocates normal filesystem blocks used as storage for the 272 | i-nodes. The free list in each slab is kept sorted, in order to create a strong 273 | bias towards earlier slots, and concentrate allocations into as few filesystem 274 | blocks as possible. Whenever the last i-node is freed from a filesystem block, 275 | that block is returned to the global block free-list. 276 | 277 | This setup supports efficient storage of regular torrents, the common case, as 278 | well as supporting the rare case of a very large torrent, several PiB in size. 279 | 280 | .. figure:: inode_allocation.png 281 | :alt: allocation of inodes 282 | 283 | Allocation of i-nodes of varying sizes on top of 284 | the filesystem block partitioning. 285 | 286 | booting 287 | ....... 288 | 289 | There is no bitmap describing which blocks are free; instead the free 290 | block list is built at boot-time. This is possible because of the lack 291 | of directory structure and i-nodes being well packed at the beginning 292 | of the device. 293 | 294 | To build the free-list of filesystem blocks, the filesystem: 295 | 296 | 1. reads the root block (block 0) and marks it as in-use. 297 | 2. collects all i-node references. 298 | 3. sorts the i-node references in ascending order. 299 | 4. reads all the i-node blocks, marking them, 300 | and all the blocks in their block maps, as in-use. 301 | 302 | At boot time, the root block is also turned into a more efficient in-memory 303 | representation. All i-nodes are put in a hash table with their info-hash 304 | (file name) as the key. All i-nodes are kept in memory as well. 305 | 306 | journaling 307 | .......... 308 | 309 | Journaling is the technique where data on disk is guaranteed to be 310 | in a consistent state at any given time. It removes the requirement 311 | to run tools like checkdisk if the computer is turned off by a power 312 | outage or the whole system crashes. 
313 | 314 | Journaling typically only applies to the filesystem structure itself. 315 | The content of files does not have the consistency guarantee, which 316 | probably is fine for the most part. 317 | 318 | Journals can be implemented by simply (when flushing filesystem metadata 319 | updates) first flushing all the updated blocks into the journal, then starting 320 | to update the actual blocks. Flushing to the journal is often fast because 321 | it is a contiguous chunk of the disk. If the system crashes at any time 322 | during the metadata update, the filesystem can simply pick up the changes 323 | from the journal and replay them to get into a consistent state. 324 | 325 | The reason why a filesystem would need to do this is because the metadata 326 | updates are mutually dependent on each other. If each individual block 327 | of metadata that was updated was entirely self-contained, and would not 328 | cause any inconsistencies at any given point (assuming writing one 329 | block is atomic), there would be no need for a journal. 330 | 331 | The bittorrent filesystem is so simple that making sure the filesystem 332 | is consistent at all times is just a matter of making sure blocks are 333 | flushed in the right order. 334 | 335 | For any of the consistency guarantees to hold, the filesystem must 336 | operate on a raw device with no buffering. 337 | 338 | disk usage optimization 339 | ....................... 340 | 341 | With a filesystem block size of 4 MiB, most torrents will incur a fairly large amount 342 | of wasted disk space. For torrents with piece sizes less than 4 MiB, each downloaded 343 | piece will still allocate 4 MiB on disk, and waste the remaining space until the 344 | adjacent pieces are downloaded. 345 | 346 | In order to reduce wasted space, and improve locality of disk writes, an affinity 347 | for downloading the adjacent pieces is created. The adjacent pieces are picked such 348 | that they all fall into the same 4 MiB filesystem block.
349 | 350 | The advantage, apart from more efficiently utilizing the disk space, is that while 351 | filling up the disk, writes may tend to happen physically closer to each other. 352 | 353 | This is a crucial optimization. There is a steep performance gain in going from 354 | being almost sequential, to actually being sequential. For this reason, libtorrent 355 | was made to force cache any adjacent pieces of 4 MiB, until the whole 4 MiB is 356 | downloaded, before flushing it. This way, all flushes are always (or at least 357 | essentially always) 4 MiB at a time, aligned with the filesystem blocks. 358 | 359 | zero-fill 360 | ......... 361 | 362 | When typical filesystems allocate blocks for a file, the blocks sometimes 363 | have to be filled with zeros, in order to not leak potentially sensitive 364 | information that may have been stored in that block previously. This is presumably 365 | mostly the case when performing small writes that don't completely fill a block. 366 | This case is fairly common for multi file torrents, for the pieces that are 367 | not aligned with filesystem blocks. 368 | 369 | This is another opportunity for optimizations. Filling reused blocks with zeros 370 | is not strictly necessary, since the bittorrent layer restricts access to any 371 | portions of the file that have not been written. 372 | 373 | metadata updates 374 | ................ 375 | 376 | Typical filesystems, such as ext4, support *access timestamps*. These are time- 377 | stamps updated every time a file is accessed, for instance read from. These updates 378 | to file metadata can potentially be costly. For this reason, Linux has introduced 379 | an ``O_NOATIME`` flag which opens a file in a mode that doesn't update the file's 380 | access time. This has the restriction that it only works if the process opening 381 | the file runs as the same user as the owner of the file.
382 | 383 | With a custom filesystem almost no metadata needs to be kept per file, and more 384 | importantly, almost no metadata needs to be kept up to date on disk. This is a 385 | potential performance improvement. 386 | 387 | extents 388 | ....... 389 | 390 | It turns out that a significant performance gain comes from aligning pieces, 391 | and ranges of pieces, to be downloaded and flushed to disk with the filesystem's 392 | default extent allocation size and alignment. 393 | 394 | .. figure:: partial_cache_stripe.png 395 | :alt: cache stripe only partially filled by a write 396 | 397 | When writing a small piece to a filesystem with large default extents, 398 | the extent is not fully filled in by a write, causing the drive to seek 399 | back later when those parts of the file are downloaded. 400 | 401 | In the case where the bittorrent cache's stripe size is not aligned with the file- 402 | system's, the allocation strategy is not optimal. Assuming a random uniform 403 | distribution of the download order of pieces, early pieces are more likely 404 | to allocate a new extent in the file, which is fast because it is done sequentially 405 | from the current disk head position. However, later pieces are more likely to have to 406 | seek back into the previously allocated extents to fill them in. 407 | 408 | .. figure:: sparse_access.png 409 | :alt: write disk access pattern for sparse files 410 | 411 | Write pattern when filling a sparse file at random (where the extents 412 | by which the file grows fit 4 blocks). 413 | 414 | By aligning the cache stripes, the bittorrent engine will attempt to download 415 | ranges of pieces that fill an entire extent allocation. This results in allocating 416 | a new extent in the file and also completely filling it with data. There will not be any 417 | need to seek back to fill in any other pieces. This yields near sequential write 418 | performance. 419 | 420 | block allocation 421 | ................
422 | 423 | Two different block allocation strategies were tested. 424 | 425 | sequential block allocator 426 | Treats the disk as a ring buffer, allocating the next free block and 427 | wrapping back to 0. 428 | 429 | ordered block allocator 430 | Allocates the lowest indexed free block for data and the highest 431 | indexed free block for i-nodes. 432 | 433 | Comparing the two with identical, artificial, payloads yields the following access 434 | graphs: 435 | 436 | .. figure:: sequential_allocation.png 437 | :alt: disk access for sequential allocator 438 | 439 | disk access pattern for sequential block allocator (writes) 440 | 441 | .. figure:: ordered_allocation.png 442 | :alt: disk access for ordered allocator 443 | 444 | disk access pattern for ordered block allocator (writes) 445 | 446 | Rotating disks perform significantly better under the sequential allocation scheme. 447 | 448 | benchmarks 449 | ---------- 450 | 451 | The benchmark was run on a Mac with the following specifications:: 452 | 453 | hw.machine = x86_64 454 | hw.model = MacPro4,1 455 | hw.ncpu = 16 456 | hw.physmem = 2147483648 457 | hw.pagesize = 4096 458 | hw.busfrequency = 4294967295 459 | hw.cpufrequency = 2260000000 460 | 461 | Downloading 25 torrents, 1 GB each, 10 torrents in parallel, 1 GB cache: 462 | 463 | btfs file on top of hfs+: 281 seconds (avg: 91.10 MB/s) 464 | directly to hfs+: 897 seconds (avg: 28.54 MB/s) 465 | 466 | .. figure:: file_access.png 467 | :alt: write disk access pattern for the btfs test 468 | 469 | Write pattern at the block device level in the btfs benchmark. 470 | 471 | The conclusion is that BitTorrent downloads whose download rate is limited 472 | by spinning disks could potentially be made 3 times faster with a custom 473 | file system where writes are made more sequential and block aligned, and there is 474 | less metadata overhead.
475 | 476 | -------------------------------------------------------------------------------- /docs/file_access.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvidn/btfs/75aefe7c9603abd327fb23cd35548542163cdfa0/docs/file_access.png -------------------------------------------------------------------------------- /docs/inode_allocation.dia: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvidn/btfs/75aefe7c9603abd327fb23cd35548542163cdfa0/docs/inode_allocation.dia -------------------------------------------------------------------------------- /docs/inode_allocation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvidn/btfs/75aefe7c9603abd327fb23cd35548542163cdfa0/docs/inode_allocation.png -------------------------------------------------------------------------------- /docs/makefile: -------------------------------------------------------------------------------- 1 | TARGETS = btfs 2 | 3 | pdf: $(TARGETS:=.pdf) 4 | 5 | all: pdf 6 | 7 | %.pdf:%.rst 8 | rst2pdf $? 
-o $@ --stylesheets stylesheet 9 | 10 | -------------------------------------------------------------------------------- /docs/ordered_allocation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvidn/btfs/75aefe7c9603abd327fb23cd35548542163cdfa0/docs/ordered_allocation.png -------------------------------------------------------------------------------- /docs/partial_cache_stripe.dia: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvidn/btfs/75aefe7c9603abd327fb23cd35548542163cdfa0/docs/partial_cache_stripe.dia -------------------------------------------------------------------------------- /docs/partial_cache_stripe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvidn/btfs/75aefe7c9603abd327fb23cd35548542163cdfa0/docs/partial_cache_stripe.png -------------------------------------------------------------------------------- /docs/piece_size_cdf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvidn/btfs/75aefe7c9603abd327fb23cd35548542163cdfa0/docs/piece_size_cdf.png -------------------------------------------------------------------------------- /docs/piece_sizes.dat: -------------------------------------------------------------------------------- 1 | 16 0.1 % 2 | 32 1.4 % 3 | 64 6.5 % 4 | 128 6.7 % 5 | 256 21.0 % 6 | 512 19.5 % 7 | 1024 19.4 % 8 | 2048 12.3 % 9 | 4096 12.8 % 10 | 8192 0.1 % 11 | 16384 0.1 % 12 | -------------------------------------------------------------------------------- /docs/render.gnuplot: -------------------------------------------------------------------------------- 1 | set term png size 800,400 giant 2 | set output "sparse_access.png" 3 | set title "block writes (sparse files, 4 pieces per extent)" 4 | set ylabel "block" 5 | set xlabel "write" 6 | set key 
off 7 | plot "sparse_access.dat" using 1:2 title "disk write" with dots 8 | 9 | set terminal postscript 10 | set output "sparse_access.ps" 11 | replot 12 | 13 | 14 | -------------------------------------------------------------------------------- /docs/sequential_allocation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvidn/btfs/75aefe7c9603abd327fb23cd35548542163cdfa0/docs/sequential_allocation.png -------------------------------------------------------------------------------- /docs/simulate_sparse.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | allocated_extents = {} 5 | 6 | pieces = range(10000) 7 | random.shuffle(pieces) 8 | 9 | pieces_per_extent = 4 10 | cursor = 0 11 | 12 | f = open('sparse_access.dat', 'w+') 13 | 14 | for i in xrange(10000): 15 | piece = pieces[0] 16 | pieces = pieces[1:] 17 | ext = piece / pieces_per_extent 18 | if ext in allocated_extents: 19 | pos = allocated_extents[ext] + piece % pieces_per_extent 20 | else: 21 | pos = cursor + piece % pieces_per_extent 22 | allocated_extents[ext] = cursor 23 | cursor += pieces_per_extent 24 | 25 | print >>f, '%d\t%d' % (i, pos) 26 | 27 | f.close() 28 | 29 | -------------------------------------------------------------------------------- /docs/sizes.dat: -------------------------------------------------------------------------------- 1 | 100 19.36 % 2 | 300 12.00 % 3 | 500 6.17 % 4 | 700 16.45 % 5 | 900 5.05 % 6 | 1100 3.23 % 7 | 1300 6.80 % 8 | 1500 5.30 % 9 | 1700 1.58 % 10 | 1900 1.30 % 11 | 2100 1.23 % 12 | 2300 0.98 % 13 | 2500 0.77 % 14 | 2700 0.67 % 15 | 2900 0.91 % 16 | 3100 0.46 % 17 | 3300 0.84 % 18 | 3500 0.77 % 19 | 3700 0.84 % 20 | 3900 0.98 % 21 | 4100 1.09 % 22 | 4300 1.79 % 23 | 4500 2.88 % 24 | 4700 0.21 % 25 | 4900 0.21 % 26 | 5100 0.28 % 27 | 5300 0.25 % 28 | 5500 0.32 % 29 | 5700 0.32 % 30 | 5900 0.28 % 31 | 6100 0.28 % 32 | 6300 
0.28 % 33 | 6500 0.25 % 34 | 6700 0.46 % 35 | 6900 0.21 % 36 | 7100 0.25 % 37 | 7300 0.21 % 38 | 7500 0.25 % 39 | 7700 0.28 % 40 | 7900 0.21 % 41 | 8100 0.25 % 42 | 8300 0.21 % 43 | 8500 0.07 % 44 | 8700 0.14 % 45 | 8900 0.14 % 46 | -------------------------------------------------------------------------------- /docs/sizes.gnuplot: -------------------------------------------------------------------------------- 1 | set term png size 800,400 giant 2 | set output "torrent_size_distribution.png" 3 | set title "torrent sizes in random sample" 4 | set ylabel "fraction of torrents" 5 | set xlabel "torrent size in MiB" 6 | set style fill solid border -1 pattern 2 7 | plot "sizes.dat" using 1:2 title "torrent size" with boxes 8 | 9 | set terminal postscript 10 | set output "torrent_size_distribution.ps" 11 | replot 12 | 13 | set term png size 800,400 giant 14 | set output "piece_size_distribution.png" 15 | set title "piece sizes in random sample" 16 | set ylabel "fraction of torrents" 17 | set xlabel "piece size in kiB" 18 | set logscale x 19 | set style fill solid border -1 pattern 2 20 | plot "piece_sizes.dat" using 1:2 title "piece size" with boxes 21 | 22 | set terminal postscript 23 | set output "piece_size_distribution.ps" 24 | replot 25 | 26 | set term png size 800,400 giant 27 | set nologscale x 28 | set output "torrent_size_cdf.png" 29 | set title "torrent sizes CDF in random sample" 30 | set ylabel "torrents (%)" 31 | set xlabel "torrent size in MiB" 32 | set style fill solid border -1 pattern 2 33 | plot "sizes_cdf.dat" using 1:2 title "torrent size" with lines 34 | 35 | set terminal postscript 36 | set output "torrent_size_cdf.ps" 37 | replot 38 | 39 | -------------------------------------------------------------------------------- /docs/sizes_cdf.dat: -------------------------------------------------------------------------------- 1 | 100 19.36 % 2 | 300 31.36 % 3 | 500 37.53 % 4 | 700 53.98 % 5 | 900 59.03 % 6 | 1100 62.26 % 7 | 1300 69.06 % 8 | 1500 74.36 % 
9 | 1700 75.94 % 10 | 1900 77.24 % 11 | 2100 78.46 % 12 | 2300 79.45 % 13 | 2500 80.22 % 14 | 2700 80.88 % 15 | 2900 81.80 % 16 | 3100 82.25 % 17 | 3300 83.09 % 18 | 3500 83.87 % 19 | 3700 84.71 % 20 | 3900 85.69 % 21 | 4100 86.78 % 22 | 4300 88.57 % 23 | 4500 91.44 % 24 | 4700 91.65 % 25 | 4900 91.86 % 26 | 5100 92.14 % 27 | 5300 92.39 % 28 | 5500 92.70 % 29 | 5700 93.02 % 30 | 5900 93.30 % 31 | 6100 93.58 % 32 | 6300 93.86 % 33 | 6500 94.11 % 34 | 6700 94.56 % 35 | 6900 94.77 % 36 | 7100 95.02 % 37 | 7300 95.23 % 38 | 7500 95.48 % 39 | 7700 95.76 % 40 | 7900 95.97 % 41 | 8100 96.21 % 42 | 8300 96.42 % 43 | 8500 96.49 % 44 | 8700 96.63 % 45 | 8900 96.77 % 46 | -------------------------------------------------------------------------------- /docs/sparse_access.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvidn/btfs/75aefe7c9603abd327fb23cd35548542163cdfa0/docs/sparse_access.png -------------------------------------------------------------------------------- /docs/stylesheet: -------------------------------------------------------------------------------- 1 | { 2 | "embeddedFonts" : [ ], 3 | "pageSetup" : { 4 | "size": "A4", 5 | "width": null, 6 | "height": null, 7 | "margin-top": "2cm", 8 | "margin-bottom": "2cm", 9 | "margin-left": "2cm", 10 | "margin-right": "2cm", 11 | "margin-gutter": "0cm", 12 | "firstTemplate": "oneColumn" 13 | }, 14 | "pageTemplates" : { 15 | "coverPage": { 16 | "frames": [ 17 | ["0cm", "0cm", "100%", "100%"] 18 | ], 19 | "showHeader" : false, 20 | "showFooter" : false 21 | }, 22 | "oneColumn": { 23 | "frames": [ 24 | ["0cm", "0cm", "100%", "100%"] 25 | ] 26 | }, 27 | "twoColumn": { 28 | "frames": [ 29 | ["0cm", "0cm", "49%", "100%"], 30 | ["51%", "0cm", "49%", "100%"] 31 | ] 32 | } 33 | }, 34 | "fontsAlias" : { 35 | "stdFont": "Times-Roman", 36 | "stdBold": "Times-Bold", 37 | "stdItalic": "Times-Italic", 38 | "stdBoldItalic": "Times-BoldItalic", 39 | "stdMono": 
"Courier", 40 | "stdMonoItalic": "Courier-Oblique", 41 | "stdMonoBold": "Courier-Bold", 42 | "stdMonoBoldItalic": "Courier-BoldOblique", 43 | "stdSerif": "Times-Roman" 44 | }, 45 | "linkColor" : "black", 46 | "styles" : [ 47 | [ "base" , { 48 | "parent": null, 49 | "fontName": "stdFont", 50 | "fontSize":10, 51 | "leading":12, 52 | "leftIndent":0, 53 | "rightIndent":0, 54 | "firstLineIndent":0, 55 | "alignment":"TA_LEFT", 56 | "spaceBefore":0, 57 | "spaceAfter":0, 58 | "bulletFontName":"stdFont", 59 | "bulletFontSize":10, 60 | "bulletIndent":0, 61 | "textColor": "black", 62 | "backColor": null, 63 | "wordWrap": null, 64 | "borderWidth": 0, 65 | "borderPadding": 0, 66 | "borderColor": null, 67 | "borderRadius": null, 68 | "allowWidows": false, 69 | "allowOrphans": false, 70 | "hyphenation": false 71 | }] , 72 | ["normal" , { 73 | "parent": "base" 74 | }], 75 | ["title-reference" , { 76 | "parent": "normal", 77 | "fontName": "stdItalic" 78 | }], 79 | ["bodytext" , { 80 | "parent": "normal", 81 | "spaceBefore":6, 82 | "alignment": "TA_JUSTIFY", 83 | "hyphenation": true 84 | }], 85 | ["footer" , { 86 | "parent": "normal", 87 | "alignment": "TA_CENTER" 88 | }], 89 | ["header" , { 90 | "parent": "normal", 91 | "alignment": "TA_CENTER" 92 | }], 93 | ["attribution" , { 94 | "parent": "bodytext", 95 | "alignment": "TA_RIGHT" 96 | }], 97 | ["figure" , { 98 | "parent": "bodytext", 99 | "alignment": "TA_CENTER" 100 | }], 101 | ["definition-list-term" , { 102 | "parent": "normal", 103 | "fontName": "stdBold", 104 | "spaceBefore": 4, 105 | "spaceAfter": 0, 106 | "keepWithNext": true 107 | }], 108 | ["definition-list-classifier" , { 109 | "parent": "normal", 110 | "fontName": "stdItalic" 111 | }], 112 | ["definition" , { 113 | "parent": "bodytext", 114 | "firstLineIndent": 0, 115 | "bulletIndent": 0, 116 | "spaceBefore": 0 117 | }], 118 | ["fieldname" , { 119 | "parent": "bodytext", 120 | "alignment": "TA_RIGHT", 121 | "fontName": "stdBold" 122 | }], 123 | ["rubric" , { 124 | 
"parent": "bodytext", 125 | "textColor": "darkred", 126 | "alignment": "TA_CENTER" 127 | }], 128 | ["italic" , { 129 | "parent": "bodytext", 130 | "fontName": "stdItalic" 131 | }], 132 | ["title" , { 133 | "parent": "normal", 134 | "fontName": "stdBold", 135 | "fontSize": "200%", 136 | "alignment": "TA_CENTER", 137 | "spaceBefore": 12, 138 | "spaceAfter": 10 139 | }], 140 | ["subtitle" , { 141 | "parent": "title", 142 | "spaceBefore": 9, 143 | "spaceAfter": 6, 144 | "fontSize": "75%" 145 | }], 146 | ["heading1" , { 147 | "parent": "normal", 148 | "fontName": "stdBold", 149 | "fontSize": "150%", 150 | "keepWithNext": true, 151 | "spaceBefore": 9, 152 | "spaceAfter": 3 153 | }], 154 | ["heading2" , { 155 | "parent": "normal", 156 | "fontName": "stdBold", 157 | "fontSize": "125%", 158 | "keepWithNext": true, 159 | "spaceBefore": 9, 160 | "spaceAfter": 3 161 | }], 162 | ["heading3" , { 163 | "parent": "normal", 164 | "fontName": "stdBold", 165 | "keepWithNext": true, 166 | "spaceBefore": 4, 167 | "spaceAfter": 2 168 | }], 169 | ["heading4" , { 170 | "parent": "normal", 171 | "fontName": "stdBold", 172 | "keepWithNext": true 173 | }], 174 | ["sidebar-title" , { 175 | "parent": "heading3" 176 | }], 177 | ["sidebar-subtitle" , { 178 | "parent": "heading4" 179 | }], 180 | ["sidebar" , { 181 | "float": "left", 182 | "width": "30%", 183 | "parent": "normal", 184 | "backColor": "beige", 185 | "borderColor": "darkgray", 186 | "borderPadding": 8, 187 | "borderWidth": 0.5 188 | }], 189 | ["literal" , { 190 | "parent": "normal", 191 | "fontName": "stdMono", 192 | "firstLineIndent": 0 193 | }], 194 | ["table" , { 195 | "rowBackgrounds" : ["f0f0d8","#ffffe8"], 196 | "borderColor": "white" 197 | }], 198 | ["table-title" , { 199 | "parent" : "heading4", 200 | "backColor" : "#e0e0c8", 201 | "alignment" : "TA_CENTER" 202 | }], 203 | ["table-heading" , { 204 | "parent" : "heading4", 205 | "backColor" : "#e0e0c0", 206 | "alignment" : "TA_CENTER", 207 | "valign" : "BOTTOM" 208 | }], 209 | 
["code" , { 210 | "parent": "literal", 211 | "fontSize": "75%", 212 | "leftIndent": 0, 213 | "spaceBefore": 5, 214 | "spaceAfter": 5, 215 | "backColor": "#e7e7e7", 216 | "borderColor": "#808080", 217 | "borderRadius": 3, 218 | "borderWidth": 0.5, 219 | "borderPadding": 4 220 | }], 221 | ["pygments-n" , {"parent": "code"}], 222 | ["pygments-nx" , {"parent": "code"}], 223 | ["pygments-p" , {"parent": "code"}], 224 | ["pygments-hll", {"parent": "code", "backColor": "#ffffcc"}], 225 | ["pygments-c", {"textColor": "#008800", "parent": "code"}], 226 | ["pygments-err", {"parent": "code"}], 227 | ["pygments-k", {"textColor": "#AA22FF", "parent": "code"}], 228 | ["pygments-o", {"textColor": "#666666", "parent": "code"}], 229 | ["pygments-cm", {"textColor": "#008800", "parent": "code"}], 230 | ["pygments-cp", {"textColor": "#008800", "parent": "code"}], 231 | ["pygments-c1", {"textColor": "#008800", "parent": "code"}], 232 | ["pygments-cs", {"textColor": "#008800", "parent": "code"}], 233 | ["pygments-gd", {"textColor": "#A00000", "parent": "code"}], 234 | ["pygments-ge", {"parent": "code"}], 235 | ["pygments-gr", {"textColor": "#FF0000", "parent": "code"}], 236 | ["pygments-gh", {"textColor": "#000080", "parent": "code"}], 237 | ["pygments-gi", {"textColor": "#00A000", "parent": "code"}], 238 | ["pygments-go", {"textColor": "#808080", "parent": "code"}], 239 | ["pygments-gp", {"textColor": "#000080", "parent": "code"}], 240 | ["pygments-gs", {"parent": "code"}], 241 | ["pygments-gu", {"textColor": "#800080", "parent": "code"}], 242 | ["pygments-gt", {"textColor": "#0040D0", "parent": "code"}], 243 | ["pygments-kc", {"textColor": "#AA22FF", "parent": "code"}], 244 | ["pygments-kd", {"textColor": "#AA22FF", "parent": "code"}], 245 | ["pygments-kn", {"textColor": "#AA22FF", "parent": "code"}], 246 | ["pygments-kp", {"textColor": "#AA22FF", "parent": "code"}], 247 | ["pygments-kr", {"textColor": "#AA22FF", "parent": "code"}], 248 | ["pygments-kt", {"textColor": "#00BB00", 
"parent": "code"}], 249 | ["pygments-m", {"textColor": "#666666", "parent": "code"}], 250 | ["pygments-s", {"textColor": "#BB4444", "parent": "code"}], 251 | ["pygments-na", {"textColor": "#BB4444", "parent": "code"}], 252 | ["pygments-nb", {"textColor": "#AA22FF", "parent": "code"}], 253 | ["pygments-nc", {"textColor": "#0000FF", "parent": "code"}], 254 | ["pygments-no", {"textColor": "#880000", "parent": "code"}], 255 | ["pygments-nd", {"textColor": "#AA22FF", "parent": "code"}], 256 | ["pygments-ni", {"textColor": "#999999", "parent": "code"}], 257 | ["pygments-ne", {"textColor": "#D2413A", "parent": "code"}], 258 | ["pygments-nf", {"textColor": "#00A000", "parent": "code"}], 259 | ["pygments-nl", {"textColor": "#A0A000", "parent": "code"}], 260 | ["pygments-nn", {"textColor": "#0000FF", "parent": "code"}], 261 | ["pygments-nt", {"textColor": "#008000", "parent": "code"}], 262 | ["pygments-nv", {"textColor": "#B8860B", "parent": "code"}], 263 | ["pygments-ow", {"textColor": "#AA22FF", "parent": "code"}], 264 | ["pygments-w", {"textColor": "#bbbbbb", "parent": "code"}], 265 | ["pygments-mf", {"textColor": "#666666", "parent": "code"}], 266 | ["pygments-mh", {"textColor": "#666666", "parent": "code"}], 267 | ["pygments-mi", {"textColor": "#666666", "parent": "code"}], 268 | ["pygments-mo", {"textColor": "#666666", "parent": "code"}], 269 | ["pygments-sb", {"textColor": "#BB4444", "parent": "code"}], 270 | ["pygments-sc", {"textColor": "#BB4444", "parent": "code"}], 271 | ["pygments-sd", {"textColor": "#BB4444", "parent": "code"}], 272 | ["pygments-s2", {"textColor": "#BB4444", "parent": "code"}], 273 | ["pygments-se", {"textColor": "#BB6622", "parent": "code"}], 274 | ["pygments-sh", {"textColor": "#BB4444", "parent": "code"}], 275 | ["pygments-si", {"textColor": "#BB6688", "parent": "code"}], 276 | ["pygments-sx", {"textColor": "#008000", "parent": "code"}], 277 | ["pygments-sr", {"textColor": "#BB6688", "parent": "code"}], 278 | ["pygments-s1", {"textColor": 
"#BB4444", "parent": "code"}], 279 | ["pygments-ss", {"textColor": "#B8860B", "parent": "code"}], 280 | ["pygments-bp", {"textColor": "#AA22FF", "parent": "code"}], 281 | ["pygments-vc", {"textColor": "#B8860B", "parent": "code"}], 282 | ["pygments-vg", {"textColor": "#B8860B", "parent": "code"}], 283 | ["pygments-vi", {"textColor": "#B8860B", "parent": "code"}], 284 | ["pygments-il", {"textColor": "#666666", "parent": "code"}] 285 | ] 286 | } 287 | 288 | -------------------------------------------------------------------------------- /docs/torrent_size_cdf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvidn/btfs/75aefe7c9603abd327fb23cd35548542163cdfa0/docs/torrent_size_cdf.png -------------------------------------------------------------------------------- /src/block_affinity.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (c) 2014, Arvid Norberg 4 | All rights reserved. 5 | 6 | btfs Source Code is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 
10 | 11 | For details, see LICENSE 12 | 13 | */ 14 | 15 | #include "block_affinity.hpp" 16 | #include "libtorrent/alert.hpp" 17 | #include "libtorrent/alert_types.hpp" 18 | #include "libtorrent/peer_connection.hpp" 19 | #include 20 | 21 | using libtorrent::alert; 22 | using libtorrent::block_downloading_alert; 23 | using libtorrent::alert_cast; 24 | using libtorrent::piece_picker; 25 | 26 | // this is an alert observer that creates an affinity 27 | // to download pieces adjacent to some other pieces 28 | // in order to more efficiently fill up the blocks 29 | // on the block device filesystem 30 | block_affinity_plugin::block_affinity_plugin(torrent& t, int block_size) 31 | : m_torrent(t) 32 | , m_block_size(block_size) 33 | , m_peer_plugin(new peer_block_affinity(*this)) 34 | {} 35 | 36 | boost::shared_ptr block_affinity_plugin::new_connection(peer_connection* p) 37 | { 38 | // this will make peers request whole aligned piece ranges 39 | // to match the block size on disk 40 | p->prefer_whole_pieces(m_block_size / m_torrent.torrent_file().piece_length()); 41 | p->picker_options(piece_picker::align_expanded_pieces); 42 | return m_peer_plugin; 43 | } 44 | 45 | void block_affinity_plugin::sending_request(int piece) 46 | { 47 | int piece_size = m_torrent.torrent_file().piece_length(); 48 | 49 | // there's no point in doing this optimization if a piece 50 | // is as big as a block, or bigger 51 | if (piece_size == 0 || piece_size >= m_block_size) return; 52 | 53 | std::vector > pieces; 54 | 55 | int range_start = ((boost::uint64_t(piece) * piece_size) & ~(m_block_size-1)) / piece_size; 56 | int range_end = (std::min)(range_start + m_block_size / piece_size, m_torrent.torrent_file().num_pieces()); 57 | 58 | for (; range_start < range_end; ++range_start) 59 | pieces.push_back(std::make_pair(range_start, 7)); 60 | 61 | // increase the priority of all pieces that would end up 62 | // in this block on the device, in order to minimize wasted 63 | // space by unused block 
area 64 | m_torrent.prioritize_piece_list(pieces); 65 | } 66 | 67 | boost::shared_ptr block_affinity(torrent* t, int block_size) 68 | { 69 | return boost::shared_ptr(new block_affinity_plugin(*t, block_size)); 70 | } 71 | 72 | -------------------------------------------------------------------------------- /src/block_affinity.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (c) 2014, Arvid Norberg 4 | All rights reserved. 5 | 6 | btfs Source Code is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | For details, see LICENSE 12 | 13 | */ 14 | 15 | #ifndef TORRENT_BLOCK_AFFINITY_HPP_INCLUDED 16 | #define TORRENT_BLOCK_AFFINITY_HPP_INCLUDED 17 | 18 | #include "libtorrent/extensions.hpp" 19 | #include "libtorrent/torrent.hpp" 20 | #include 21 | 22 | using libtorrent::torrent_plugin; 23 | using libtorrent::peer_plugin; 24 | using libtorrent::torrent; 25 | using libtorrent::peer_connection; 26 | using libtorrent::peer_request; 27 | 28 | // this is an plugin that creates an affinity 29 | // to download pieces adjacent to some other pieces 30 | // in order to more efficiently fill up the blocks 31 | // on the block device filesystem 32 | struct block_affinity_plugin : torrent_plugin 33 | { 34 | block_affinity_plugin(torrent& t, int block_size); 35 | boost::shared_ptr new_connection(peer_connection*); 36 | void sending_request(int piece); 37 | 38 | private: 39 | torrent& m_torrent; 40 | int m_block_size; 41 | boost::shared_ptr m_peer_plugin; 42 | }; 43 | 44 | struct peer_block_affinity : peer_plugin 45 | { 46 | peer_block_affinity(block_affinity_plugin& ba) : m_ba(ba) {} 47 | bool write_request(peer_request const& req) 48 | { 49 | m_ba.sending_request(req.piece); 50 | return false; 51 | } 52 | 53 | block_affinity_plugin& 
m_ba; 54 | }; 55 | 56 | boost::shared_ptr TORRENT_EXPORT 57 | block_affinity(torrent* t, int block_size); 58 | 59 | #endif // TORRENT_BLOCK_AFFINITY_HPP_INCLUDED 60 | 61 | -------------------------------------------------------------------------------- /src/block_allocator.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (c) 2014, Arvid Norberg 4 | All rights reserved. 5 | 6 | btfs Source Code is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | For details, see LICENSE 12 | 13 | */ 14 | 15 | #include "block_allocator.hpp" 16 | #include "libtorrent/assert.hpp" 17 | #include 18 | 19 | // used when initializing an empty device 20 | void ordered_block_allocator::init(boost::uint32_t total_blocks) 21 | { 22 | m_max_blocks = total_blocks; 23 | m_free_blocks.resize(m_max_blocks-1); 24 | for (int i = 0; i < m_max_blocks - 1; ++i) 25 | m_free_blocks[i] = i + 1; 26 | } 27 | 28 | // used when opening a device 29 | void ordered_block_allocator::init(boost::uint32_t total_blocks, bitfield const& used_blocks) 30 | { 31 | m_max_blocks = total_blocks; 32 | 33 | // build the free-list for filesystem blocks 34 | for (boost::uint32_t i = 0; i < m_max_blocks; ++i) 35 | { 36 | if (used_blocks[i]) continue; 37 | m_free_blocks.push_back(i); 38 | } 39 | } 40 | 41 | boost::uint32_t ordered_block_allocator::num_free() const 42 | { return m_free_blocks.size(); } 43 | 44 | void ordered_block_allocator::free_block(boost::uint32_t block) 45 | { 46 | mutex::scoped_lock l(m_mutex); 47 | // insert the block ordered 48 | std::deque::iterator i = std::lower_bound( 49 | m_free_blocks.begin(), m_free_blocks.end(), block); 50 | m_free_blocks.insert(i, block); 51 | } 52 | 53 | void ordered_block_allocator::free_blocks(boost::uint32_t* blocks, int 
num) 54 | { 55 | TORRENT_ASSERT(num > 0); 56 | if (num <= 0) return; 57 | 58 | // this will speed up in-order insertion 59 | std::sort(blocks, blocks + num); 60 | 61 | mutex::scoped_lock l(m_mutex); 62 | 63 | std::deque::iterator i = m_free_blocks.begin(); 64 | while (num > 0) 65 | { 66 | i = std::lower_bound(i, m_free_blocks.end(), *blocks); 67 | // insert the block ordered 68 | m_free_blocks.insert(i, *blocks); 69 | ++blocks; 70 | --num; 71 | } 72 | } 73 | 74 | boost::uint32_t ordered_block_allocator::allocate_block(bool inode) 75 | { 76 | mutex::scoped_lock l(m_mutex); 77 | 78 | // no free blocks left! 79 | if (m_free_blocks.empty()) return unallocated_block; 80 | 81 | boost::uint32_t blk; 82 | if (inode) 83 | { 84 | blk = m_free_blocks.back(); 85 | m_free_blocks.pop_back(); 86 | } 87 | else 88 | { 89 | blk = m_free_blocks.front(); 90 | m_free_blocks.pop_front(); 91 | } 92 | return blk; 93 | } 94 | 95 | 96 | 97 | // ========== sequential block allocator ============ 98 | 99 | 100 | 101 | // used when initializing an empty device 102 | void sequential_block_allocator::init(boost::uint32_t total_blocks) 103 | { 104 | m_max_blocks = total_blocks; 105 | m_used_blocks = total_blocks; 106 | m_cursor = 0; 107 | m_free_blocks = total_blocks; 108 | 109 | TORRENT_ASSERT(m_free_blocks == m_used_blocks.size() - m_used_blocks.count()); 110 | } 111 | 112 | // used when opening a device 113 | void sequential_block_allocator::init(boost::uint32_t total_blocks, bitfield const& used_blocks) 114 | { 115 | m_max_blocks = total_blocks; 116 | m_used_blocks = used_blocks; 117 | m_cursor = 0; 118 | m_free_blocks = total_blocks; 119 | for (int i = 0; i < total_blocks; ++i) 120 | { 121 | if (m_used_blocks[i]) --m_free_blocks; 122 | } 123 | 124 | TORRENT_ASSERT(m_free_blocks == m_used_blocks.size() - m_used_blocks.count()); 125 | } 126 | 127 | boost::uint32_t sequential_block_allocator::num_free() const 128 | { return m_free_blocks; } 129 | 130 | void 
sequential_block_allocator::free_block(boost::uint32_t block) 131 | { 132 | mutex::scoped_lock l(m_mutex); 133 | 134 | TORRENT_ASSERT(block < m_used_blocks.size()); 135 | TORRENT_ASSERT(m_used_blocks[block]); 136 | m_used_blocks.clear_bit(block); 137 | ++m_free_blocks; 138 | 139 | TORRENT_ASSERT(m_free_blocks == m_used_blocks.size() - m_used_blocks.count()); 140 | } 141 | 142 | void sequential_block_allocator::free_blocks(boost::uint32_t* blocks, int num) 143 | { 144 | TORRENT_ASSERT(num > 0); 145 | if (num <= 0) return; 146 | 147 | // this will speed up in-order insertion 148 | std::sort(blocks, blocks + num); 149 | 150 | mutex::scoped_lock l(m_mutex); 151 | 152 | int i = 0; 153 | while (num > 0) 154 | { 155 | TORRENT_ASSERT(*blocks < m_used_blocks.size()); 156 | TORRENT_ASSERT(m_used_blocks[*blocks]); 157 | m_used_blocks.clear_bit(*blocks); 158 | ++blocks; 159 | ++i; 160 | --num; 161 | ++m_free_blocks; 162 | } 163 | TORRENT_ASSERT(m_free_blocks == m_used_blocks.size() - m_used_blocks.count()); 164 | } 165 | 166 | boost::uint32_t sequential_block_allocator::allocate_block(bool inode) 167 | { 168 | mutex::scoped_lock l(m_mutex); 169 | 170 | if (m_free_blocks == 0) 171 | return unallocated_block; 172 | 173 | while (m_used_blocks[m_cursor]) 174 | { 175 | ++m_cursor; 176 | if (m_cursor >= m_max_blocks) m_cursor = 0; 177 | } 178 | 179 | m_used_blocks.set_bit(m_cursor); 180 | --m_free_blocks; 181 | TORRENT_ASSERT(m_free_blocks == m_used_blocks.size() - m_used_blocks.count()); 182 | return m_cursor; 183 | } 184 | 185 | -------------------------------------------------------------------------------- /src/block_allocator.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (c) 2014, Arvid Norberg 4 | All rights reserved. 
5 | 6 | btfs Source Code is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | For details, see LICENSE 12 | 13 | */ 14 | 15 | #ifndef BLOCK_ALLOCATOR_HPP 16 | #define BLOCK_ALLOCATOR_HPP 17 | 18 | #include 19 | #include 20 | #include "libtorrent/thread.hpp" 21 | #include "libtorrent/bitfield.hpp" 22 | 23 | using libtorrent::bitfield; 24 | using libtorrent::mutex; 25 | 26 | // the logic manipulating the free-list of 27 | // filesystem blocks 28 | 29 | // this allocator always allocates the lowest index block number 30 | struct ordered_block_allocator 31 | { 32 | // used when initializing an empty device 33 | void init(boost::uint32_t total_blocks); 34 | 35 | // used when opening a device 36 | void init(boost::uint32_t total_blocks, bitfield const& used_blocks); 37 | 38 | boost::uint32_t num_free() const; 39 | 40 | boost::uint32_t allocate_block(bool inode); 41 | void free_block(boost::uint32_t block); 42 | void free_blocks(boost::uint32_t* blocks, int num); 43 | 44 | enum constants_t 45 | { 46 | // this is what is put in the block_map 47 | // for an block that's not allocated. block 0 48 | // is the filesystem root block, it can never 49 | // be allocated for data or anything else 50 | unallocated_block = 0, 51 | }; 52 | 53 | private: 54 | 55 | // the number of blocks available on media 56 | boost::uint32_t m_max_blocks; 57 | 58 | // this mutex must be held when reading or writing 59 | // m_free_blocks. 60 | mutable mutex m_mutex; 61 | 62 | // free blocks are pushed and popped at the end 63 | // of this vector. It's essentially used as a stack. 64 | // this means blocks are likely to be reused immediately 65 | // after they are freed. 
This may improve cache hits 66 | // when using a cached device or a flat file 67 | // inode blocks are allocated from the front and data 68 | // blocks are allocated from the back 69 | std::deque m_free_blocks; 70 | }; 71 | 72 | // this allocator always allocates the next available 73 | // block from the cursor of the last block allocated 74 | struct sequential_block_allocator 75 | { 76 | // used when initializing an empty device 77 | void init(boost::uint32_t total_blocks); 78 | 79 | // used when opening a device 80 | void init(boost::uint32_t total_blocks, bitfield const& used_blocks); 81 | 82 | boost::uint32_t num_free() const; 83 | 84 | boost::uint32_t allocate_block(bool inode); 85 | void free_block(boost::uint32_t block); 86 | void free_blocks(boost::uint32_t* blocks, int num); 87 | 88 | enum constants_t 89 | { 90 | // this is what is put in the block_map 91 | // for an block that's not allocated. block 0 92 | // is the filesystem root block, it can never 93 | // be allocated for data or anything else 94 | unallocated_block = 0, 95 | }; 96 | 97 | private: 98 | 99 | // the number of blocks available on media 100 | boost::uint32_t m_max_blocks; 101 | 102 | // number of blocks not in use 103 | int m_free_blocks; 104 | 105 | // this mutex must be held when reading or writing 106 | // m_free_blocks. 107 | mutable mutex m_mutex; 108 | 109 | bitfield m_used_blocks; 110 | 111 | // the next block to allocate 112 | int m_cursor; 113 | }; 114 | 115 | //typedef ordered_block_allocator block_allocator; 116 | typedef sequential_block_allocator block_allocator; 117 | 118 | #endif 119 | 120 | -------------------------------------------------------------------------------- /src/block_device.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (c) 2014, Arvid Norberg 4 | All rights reserved. 
5 | 6 | btfs Source Code is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | For details, see LICENSE 12 | 13 | */ 14 | 15 | #include "block_device.hpp" 16 | #include "pool_allocator.hpp" 17 | 18 | #include "libtorrent/io.hpp" 19 | #include "libtorrent/entry.hpp" 20 | #include "libtorrent/lazy_entry.hpp" 21 | #include "libtorrent/file_storage.hpp" 22 | #include "libtorrent/allocator.hpp" // for page_aligned_allocator 23 | #include "libtorrent/storage_defs.hpp" // for storage_params 24 | #include "libtorrent/torrent_info.hpp" // for torrent_info 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | using boost::system::generic_category; 35 | using libtorrent::bufs_size; 36 | using libtorrent::aligned_holder; 37 | 38 | #if defined TORRENT_WINDOWS 39 | #include "windows.h" 40 | #include "winioctl.h" 41 | #endif 42 | 43 | #ifdef TORRENT_BSD 44 | #include 45 | #include 46 | #endif 47 | 48 | #ifdef TORRENT_LINUX 49 | #include 50 | #include 51 | #endif 52 | 53 | #define DISK_ACCESS_LOG 1 54 | 55 | #if DISK_ACCESS_LOG 56 | #include "libtorrent/time.hpp" 57 | #include "libtorrent/thread.hpp" 58 | #include 59 | using libtorrent::ptime; 60 | using libtorrent::min_time; 61 | using libtorrent::time_now_hires; 62 | using libtorrent::total_microseconds; 63 | using boost::atomic; 64 | #endif 65 | 66 | using libtorrent::file; 67 | using libtorrent::file_storage; 68 | using libtorrent::lazy_entry; 69 | using libtorrent::entry; 70 | 71 | 72 | // TODO: write unit test for block device 73 | 74 | /* 75 | the block device implement an extremely simple (and bittorrent 76 | taylored) filesystem, with the following layout: 77 | 78 | The first 4 MiB block is the root block. 
79 | The root block primarily consists of an array of 80 | torrent entries. The header looks like this: 81 | */ 82 | 83 | // this is in block 0 84 | struct fs_root_block_t 85 | { 86 | // the string 'BitTorrent filesystem\0\0\0' 87 | boost::uint8_t fs_identifier[24]; 88 | 89 | // the block size for the filesystem. Should 90 | // be in the range of 1 MiB to 8 MiB. inode 91 | // extents cannot be chained, so smaller 92 | // blocks means smaller max file size 93 | // default is 4 MiB. The block size MUST 94 | // be an even multiple of the underlying device's 95 | // block size (which typically is 512 bytes) 96 | boost::uint32_t block_size; 97 | 98 | // the number of files, i.e. the number of 99 | // items in the files array 100 | boost::uint32_t num_files; 101 | 102 | // reserved for future extensions and to 103 | // pad the header to an even 512 byte block 104 | boost::uint8_t reserved[480]; 105 | 106 | // this is an array of inode pointers. 107 | sub_block_ref files[]; 108 | }; 109 | 110 | /* 111 | inode indices are sequence numbers of 4 MiB blocks on the device. 112 | They refer to a block which contains the mapping of offsets in 113 | the file. Each inode has an inode header and a block map. The 114 | inode header is: 115 | */ 116 | 117 | // on-disk representation of an inode 118 | struct inode_block_t 119 | { 120 | // the string 'inod' 121 | boost::uint8_t inode_identifier[4]; 122 | 123 | // info-hash for this file 124 | sha1_hash info_hash; 125 | 126 | // the size of the block_map 127 | boost::uint32_t num_blocks; 128 | 129 | // self reference. Contains size 130 | sub_block_ref inode_ref; 131 | 132 | // reserved for future use 133 | boost::uint8_t reserved[28]; 134 | 135 | // array of num_blocks. Mapping file blocks 136 | // to device blocks. an entry with 'unallocated_block' 137 | // (i.e. -1) is not allocated on the device. 
Reading 138 | // such block should result in zeroes, writing to 139 | // it requires allocating a new block 140 | boost::uint32_t block_map[]; 141 | }; 142 | 143 | /* 144 | immediately following the inode header, is the block allocation 145 | map. This is an array of 32 bit unsigned block indices. The length 146 | of the array is `num_blocks` as specified in the inode header. 147 | 148 | Each entry represents 4 MiB of the space in the file and the index 149 | refers to a block where this payload is stored. If this range does 150 | not have any data, the block index is 0, which means unallocated. 151 | */ 152 | 153 | // this is the in-memory representation of an inode 154 | struct inode_block 155 | { 156 | inode_block(sha1_hash const& ih, sub_block_ref in) 157 | : info_hash(ih) 158 | , block_index(in) 159 | , dirty(0) 160 | , marked_for_deletion(0) 161 | , references(0) 162 | , blocks_in_use(0) 163 | , max_size((in.node_size() - sizeof(inode_block_t)) / 4) 164 | {} 165 | 166 | sha1_hash info_hash; 167 | 168 | // this mutex must be held when manipulating the inode_block 169 | mutex inode_mutex; 170 | 171 | // the block this inode is stored in 172 | // inode blocks are special. The low bits 173 | // indicate a sub block of the filesystem block 174 | sub_block_ref block_index; 175 | 176 | // set to true if the block map holds information 177 | // that has not been flushed to disk 178 | boost::uint32_t dirty:1; 179 | 180 | // if this is set, the inode and all its blocks are returned 181 | // to the global freelist once the reference count reach 0 182 | boost::uint32_t marked_for_deletion:1; 183 | 184 | // the number of times this inode has been opened 185 | // but not yet closed 186 | boost::uint32_t references:30; 187 | 188 | // the number of allocated blocks in this file. 189 | // this determines how much disk space it uses 190 | boost::uint32_t blocks_in_use; 191 | 192 | // the max number of blocks this inode can hold. 
193 | // this is initialized when the file is created 194 | boost::uint32_t max_size; 195 | 196 | // the block map, mapping file blocks to device blocks 197 | std::vector block_map; 198 | }; 199 | 200 | // ============= block_device ================== 201 | 202 | #if DISK_ACCESS_LOG 203 | enum access_log_flags 204 | { 205 | start_read, start_write, complete_read, complete_write 206 | }; 207 | 208 | int write_disk_log(FILE* f, boost::uint64_t offset, int id, int event, ptime timestamp) 209 | { 210 | static atomic global_id(0); 211 | 212 | if (event == start_read || event == start_write) 213 | id = ++global_id; 214 | 215 | // the event format in the log is: 216 | // uint64_t timestamp (microseconds) 217 | // uint64_t file offset 218 | // uint32_t event-id 219 | // uint8_t event (0: start read, 1: start write, 2: complete read, 4: complete write) 220 | char buf[21]; 221 | char* ptr = buf; 222 | 223 | using namespace libtorrent::detail; 224 | 225 | write_uint64(total_microseconds((timestamp - min_time())), ptr); 226 | write_uint64(offset, ptr); 227 | write_uint32(id, ptr); 228 | write_uint8(event, ptr); 229 | 230 | int ret = fwrite(buf, 1, sizeof(buf), f); 231 | if (ret != sizeof(buf)) 232 | { 233 | fprintf(stderr, "ERROR writing to disk access log: (%d) %s\n" 234 | , errno, strerror(errno)); 235 | } 236 | return id; 237 | } 238 | #endif 239 | 240 | block_device::block_device(std::string const& device_path, error_code& ec) 241 | : m_dirty_root_block(false) 242 | { 243 | #if defined TORRENT_DEBUG || TORRENT_RELEASE_ASSERTS 244 | m_destructed = false; 245 | #endif 246 | #if DISK_ACCESS_LOG 247 | m_access_log = NULL; 248 | #endif 249 | m_file.open(device_path, file::read_write | file::no_cache | file::direct_io, ec); 250 | if (ec) return; 251 | 252 | // TODO: this should be properly queried 253 | // assume 512 for windows for now 254 | m_media_block_size = 512; 255 | 256 | #if defined TORRENT_WINDOWS 257 | 258 | PARTITION_INFORMATION pi; 259 | DISK_GEOMETRY gi; 260 | 
DWORD retbytes; 261 | LARGE_INTEGER size; 262 | 263 | if (DeviceIoControl(m_file.native_handle() 264 | , IOCTL_DISK_GET_PARTITION_INFO 265 | , &pi, sizeof(PARTITION_INFORMATION) 266 | , &pi, sizeof(PARTITION_INFORMATION) 267 | , &size, NULL)) 268 | { 269 | m_max_size = pi.PartitionLength.QuadPart; 270 | } 271 | else if (DeviceIoControl(m_file.native_handle(), IOCTL_DISK_GET_DRIVE_GEOMETRY 272 | , &gi, sizeof(DISK_GEOMETRY) 273 | , &gi, sizeof(DISK_GEOMETRY) 274 | , &size, NULL)) 275 | { 276 | m_max_size = gi.BytesPerSector * 277 | gi.SectorsPerTrack * 278 | gi.TracksPerCylinder * 279 | gi.Cylinders.QuadPart; 280 | } 281 | else if (GetFileSizeEx(m_file.native_handle(), &size)) 282 | { 283 | m_max_size = size.QuadPart / blocksize; 284 | } 285 | else 286 | { 287 | ec.assign(GetLastError(), generic_category()); 288 | return; 289 | } 290 | 291 | #else // TORRENT_WINDOWS 292 | 293 | #if defined DKIOCGETBLOCKCOUNT 294 | 295 | boost::uint64_t block_count = 0; 296 | int block_size = 0; 297 | if (ioctl(m_file.native_handle(), DKIOCGETBLOCKCOUNT, (char*)&block_count) >= 0) 298 | { 299 | if (ioctl(m_file.native_handle(), DKIOCGETBLOCKSIZE, (char*)&block_size) < 0) 300 | { 301 | ec.assign(errno, generic_category()); 302 | return; 303 | } 304 | 305 | m_max_size = block_count * block_size; 306 | m_media_block_size = block_size; 307 | } 308 | else if (errno == ENOTTY) 309 | { 310 | // it appears to be a regular file 311 | struct stat st; 312 | if (fstat(m_file.native_handle(), &st) < 0) 313 | { 314 | ec.assign(errno, generic_category()); 315 | return; 316 | } 317 | m_max_size = st.st_size; 318 | } 319 | else 320 | { 321 | ec.assign(errno, generic_category()); 322 | return; 323 | } 324 | 325 | #elif defined BLKGETSIZE64 326 | 327 | if (ioctl(m_file.native_handle(), BLKGETSIZE64, &m_max_size) < 0) 328 | { 329 | if (errno == ENOTTY) 330 | { 331 | // it appears to be a regular file 332 | struct stat st; 333 | if (fstat(m_file.native_handle(), &st) < 0) 334 | { 335 | ec.assign(errno, 
generic_category()); 336 | return; 337 | } 338 | m_max_size = st.st_size; 339 | } 340 | else 341 | { 342 | ec.assign(errno, generic_category()); 343 | return; 344 | } 345 | } 346 | 347 | #elif defined BLKGETSIZE 348 | 349 | long num_blocks; 350 | if (ioctl(m_file.native_handle(), BLKGETSIZE, &num_blocks) < 0) 351 | { 352 | if (errno == ENOTTY) 353 | { 354 | // it appears to be a regular file 355 | struct stat st; 356 | if (fstat(m_file.native_handle(), &st) < 0) 357 | { 358 | ec.assign(errno, generic_category()); 359 | return; 360 | } 361 | m_max_size = st.st_size; 362 | } 363 | else 364 | { 365 | ec.assign(errno, generic_category()); 366 | return; 367 | } 368 | } 369 | 370 | #else 371 | // TODO: in this case, just support flat files 372 | #error do not know how to query the size of a block device! 373 | #endif 374 | 375 | #endif // TORRENT_WINDOWS 376 | 377 | #if DISK_ACCESS_LOG 378 | m_start_time = time_now_hires(); 379 | m_access_log = fopen("block_device_access.log", "w+"); 380 | #endif 381 | 382 | } 383 | 384 | block_device::~block_device() 385 | { 386 | #if defined TORRENT_DEBUG || TORRENT_RELEASE_ASSERTS 387 | TORRENT_ASSERT(!m_destructed); 388 | m_destructed = true; 389 | #endif 390 | #if DISK_ACCESS_LOG 391 | if (m_access_log) fclose(m_access_log); 392 | #endif 393 | flush_inodes(); 394 | flush_root_block(); 395 | } 396 | 397 | void block_device::read_root_block(error_code& ec) 398 | { 399 | TORRENT_ASSERT(!m_destructed); 400 | aligned_holder root_block((std::max)(m_media_block_size, boost::uint32_t(4096))); 401 | 402 | file::iovec_t b = { root_block.get(), m_media_block_size }; 403 | int read = m_file.readv(0, &b, 1, ec); 404 | 405 | if (ec) return; 406 | 407 | if (read != m_media_block_size) 408 | { 409 | ec.assign(boost::system::errc::io_error, generic_category()); 410 | return; 411 | } 412 | 413 | fs_root_block_t* root = (fs_root_block_t*)root_block.get(); 414 | 415 | if (memcmp(root->fs_identifier, "BitTorrent filesystem\0\0\0", 24) != 0) 416 | { 417 
| // this is not a BitTorrent filesystem 418 | ec.assign(boost::system::errc::no_such_file_or_directory, generic_category()); 419 | return; 420 | } 421 | 422 | m_block_size = root->block_size; 423 | // verify that the block size is reasonable 424 | if ((m_block_size % m_media_block_size) != 0 || m_block_size < 4096) 425 | { 426 | // invalid block size 427 | ec.assign(boost::system::errc::no_such_file_or_directory, generic_category()); 428 | return; 429 | } 430 | 431 | for (int i = 0; i < sizeof(m_node_allocator)/sizeof(m_node_allocator[0]); ++i) 432 | m_node_allocator[i].init(m_block_size / (1024 << i), i); 433 | 434 | boost::uint32_t max_blocks = m_max_size / m_block_size; 435 | 436 | int num_files = root->num_files; 437 | 438 | if (num_files > (m_block_size - sizeof(fs_root_block_t)) / sizeof(sub_block_ref)) 439 | { 440 | ec.assign(boost::system::errc::no_such_file_or_directory, generic_category()); 441 | return; 442 | } 443 | 444 | int block_size = num_files * sizeof(sub_block_ref); 445 | // round up to an even multiple of media block size 446 | block_size = (block_size + m_media_block_size-1) & ~(m_media_block_size-1); 447 | TORRENT_ASSERT(block_size % m_media_block_size == 0); 448 | std::vector file_list(block_size / sizeof(sub_block_ref)); 449 | 450 | TORRENT_ASSERT((file_list.size()*sizeof(sub_block_ref)) % m_media_block_size == 0); 451 | 452 | b.iov_base = &file_list[0]; 453 | b.iov_len = file_list.size() * sizeof(sub_block_ref); 454 | read = m_file.readv(sizeof(fs_root_block_t), &b, 1, ec); 455 | if (ec) return; 456 | if (read != file_list.size() * sizeof(sub_block_ref)) 457 | { 458 | ec.assign(boost::system::errc::io_error, generic_category()); 459 | return; 460 | } 461 | 462 | file_list.resize(num_files); 463 | 464 | // read inodes in increasing order to minimize seeking 465 | std::sort(file_list.begin(), file_list.end()); 466 | 467 | // when reading the directory, also collect all blocks 468 | // used by files, in order to build the freelist 469 | 
bitfield used_blocks(max_blocks, false); 470 | 471 | // the root block is in use 472 | used_blocks.set_bit(0); 473 | 474 | for (std::vector::iterator i = file_list.begin() 475 | , end(file_list.end()); i != end; ++i) 476 | { 477 | read_inode(used_blocks, *i, ec); 478 | if (ec) return; 479 | } 480 | 481 | m_free_blocks.init(max_blocks, used_blocks); 482 | } 483 | 484 | void block_device::read_inode(bitfield& used_blocks, sub_block_ref iblock, error_code& ec) 485 | { 486 | std::vector inode_buffer; 487 | int inode_size = iblock.node_size(); 488 | inode_buffer.resize(inode_size / 8); 489 | file::iovec_t b = { &inode_buffer[0], inode_buffer.size() * 8}; 490 | int read = m_file.readv(iblock.device_offset(m_block_size), &b, 1, ec); 491 | 492 | if (ec) return; 493 | 494 | if (read != inode_buffer.size() * 8) 495 | { 496 | ec.assign(boost::system::errc::io_error, generic_category()); 497 | return; 498 | } 499 | 500 | inode_block_t* inode = (inode_block_t*)&inode_buffer[0]; 501 | 502 | // if this is not an inode, just ignore it 503 | TORRENT_ASSERT(memcmp(inode->inode_identifier, "inod", 4) == 0); 504 | if (memcmp(inode->inode_identifier, "inod", 4) != 0) 505 | return; 506 | 507 | if (iblock.subblock_size >= sizeof(m_node_allocator)/sizeof(m_node_allocator[0])) 508 | { 509 | // invalid inode size 510 | TORRENT_ASSERT(false); 511 | return; 512 | } 513 | 514 | if ((m_block_size >> iblock.subblock_size) < 1024) 515 | { 516 | // invalid inode size 517 | TORRENT_ASSERT(false); 518 | return; 519 | } 520 | 521 | // this inode block is in use 522 | used_blocks.set_bit(iblock.block); 523 | 524 | m_node_allocator[iblock.subblock_size].mark_in_use(iblock); 525 | 526 | inode_block* blk = new (std::nothrow) inode_block(inode->info_hash, iblock); 527 | if (blk == 0) 528 | { 529 | ec.assign(boost::system::errc::not_enough_memory, generic_category()); 530 | return; 531 | } 532 | m_inodes.insert(std::make_pair(inode->info_hash, blk)); 533 | m_dirty_root_block = true; 534 | 535 | 
blk->block_map.resize(inode->num_blocks, block_allocator::unallocated_block); 536 | 537 | // first, copy all the blocks we read. We may have to read more from disk 538 | // in case this is a big file 539 | int min_blocks = (std::min)(inode->num_blocks 540 | , boost::uint32_t((inode_buffer.size()*8 - sizeof(inode_block_t)) / 4)); 541 | std::copy(inode->block_map, inode->block_map + min_blocks, &blk->block_map[0]); 542 | int blocks_in_use = 0; 543 | for (int i = 0; i < min_blocks; ++i) 544 | { 545 | boost::uint32_t data_block = inode->block_map[i]; 546 | if (data_block == block_allocator::unallocated_block) continue; 547 | TORRENT_ASSERT(!used_blocks[data_block]); 548 | used_blocks.set_bit(data_block); 549 | ++blocks_in_use; 550 | } 551 | 552 | blk->blocks_in_use = blocks_in_use; 553 | 554 | if (inode->num_blocks > min_blocks) 555 | { 556 | // ok, the first 64 kiB that we read did not cover the whole 557 | // block list. This file is pretty big (> 64 GB) 558 | // read the rest of the block list 559 | TORRENT_ASSERT(false && "not implemented"); 560 | } 561 | } 562 | 563 | void block_device::flush_inodes() 564 | { 565 | mutex::scoped_lock l(m_dirty_blocks_mutex); 566 | std::vector dirty_blocks; 567 | m_dirty_blocks.swap(dirty_blocks); 568 | l.unlock(); 569 | 570 | std::sort(dirty_blocks.begin(), dirty_blocks.end() 571 | , boost::bind(&inode_block::block_index, _1) 572 | < boost::bind(&inode_block::block_index, _2)); 573 | 574 | std::vector buffer; 575 | 576 | for (std::vector::iterator i = dirty_blocks.begin() 577 | , end(dirty_blocks.end()); i != end; ++i) 578 | { 579 | inode_block* blk = *i; 580 | mutex::scoped_lock l2(blk->inode_mutex); 581 | blk->dirty = false; 582 | int block_size = sizeof(inode_block_t) + blk->block_map.size() * 4; 583 | // round up to even media_block_size 584 | block_size = (block_size + m_media_block_size - 1) & ~(m_media_block_size-1); 585 | buffer.resize(block_size / 4); 586 | inode_block_t* block = (inode_block_t*)&buffer[0]; 587 | 588 | 
memcpy(block->inode_identifier, "inod", 4); 589 | block->info_hash = blk->info_hash; 590 | block->num_blocks = blk->block_map.size(); 591 | block->inode_ref = blk->block_index; 592 | 593 | std::copy(blk->block_map.begin(), blk->block_map.end(), block->block_map); 594 | l2.unlock(); 595 | 596 | boost::int64_t dev_offset = blk->block_index.device_offset(m_block_size); 597 | file::iovec_t b = { &buffer[0], buffer.size()*4 }; 598 | error_code ec; 599 | m_file.writev(dev_offset, &b, 1, ec); 600 | 601 | // decrement the references, to allow the file 602 | // to be closed. Whenever the dirty flag was 603 | // set, the reference count was incremented. 604 | // calling close here primarily decrements that back 605 | // down, but also, if we reached zero, checks to 606 | // see if this node was marked for deletion 607 | close_impl(blk, l2); 608 | } 609 | } 610 | 611 | void block_device::flush_root_block() 612 | { 613 | mutex::scoped_lock l(m_inode_mutex); 614 | if (!m_dirty_root_block) return; 615 | 616 | int num_files = m_inodes.size(); 617 | 618 | std::vector buffer; 619 | int block_size = sizeof(fs_root_block_t) + num_files * 4; 620 | 621 | // round up to m_media_block_size 622 | block_size = (block_size + m_media_block_size-1) & ~(m_media_block_size-1); 623 | 624 | buffer.resize(block_size / 4); 625 | fs_root_block_t* root = (fs_root_block_t*)&buffer[0]; 626 | memcpy(root->fs_identifier, "BitTorrent filesystem\0\0\0", 24); 627 | root->block_size = m_block_size; 628 | root->num_files = num_files; 629 | 630 | int file_index = 0; 631 | for (boost::unordered_map::iterator i = m_inodes.begin() 632 | , end(m_inodes.end()); i != end; ++i, ++file_index) 633 | { 634 | root->files[file_index] = i->second->block_index; 635 | } 636 | m_dirty_root_block = false; 637 | l.unlock(); 638 | 639 | error_code ec; 640 | file::iovec_t b = { &buffer[0], buffer.size()*4 }; 641 | m_file.writev(0, &b, 1, ec); 642 | } 643 | 644 | void block_device::format(error_code& ec, int block_size) 645 | { 
646 | if (block_size < 4096) 647 | { 648 | ec.assign(boost::system::errc::invalid_argument, generic_category()); 649 | return; 650 | } 651 | 652 | if ((block_size % m_media_block_size) != 0) 653 | { 654 | ec.assign(boost::system::errc::invalid_argument, generic_category()); 655 | return; 656 | } 657 | 658 | aligned_holder root_block((std::max)(boost::uint32_t(4096), m_media_block_size)); 659 | 660 | using namespace libtorrent::detail; 661 | 662 | fs_root_block_t* root = (fs_root_block_t*)root_block.get(); 663 | memcpy(root->fs_identifier, "BitTorrent filesystem\0\0\0", 24); 664 | 665 | root->block_size = block_size; 666 | root->num_files = 0; 667 | memset(root->reserved, 0, sizeof(root->reserved)); 668 | 669 | file::iovec_t b = { root_block.get(), m_media_block_size }; 670 | int written = m_file.writev(0, &b, 1, ec); 671 | 672 | if (ec) return; 673 | 674 | if (written != 512) 675 | { 676 | ec.assign(boost::system::errc::io_error, generic_category()); 677 | return; 678 | } 679 | m_block_size = block_size; 680 | boost::uint32_t max_blocks = m_max_size / m_block_size; 681 | m_free_blocks.init(max_blocks); 682 | 683 | for (int i = 0; i < sizeof(m_node_allocator)/sizeof(m_node_allocator[0]); ++i) 684 | m_node_allocator[i].init(m_block_size / (1024 << i), i); 685 | 686 | for (boost::unordered_map::iterator i = m_inodes.begin() 687 | , end(m_inodes.end()); i != end; ++i) 688 | { 689 | delete i->second; 690 | } 691 | m_inodes.clear(); 692 | } 693 | 694 | // returns true if there is an entry for this info-hash 695 | bool block_device::exists(sha1_hash const& info_hash) const 696 | { 697 | mutex::scoped_lock l(m_inode_mutex); 698 | return m_inodes.find(info_hash) != m_inodes.end(); 699 | } 700 | 701 | // returns a file handle 702 | void* block_device::open(sha1_hash const& info_hash, boost::uint64_t max_size, error_code& ec) 703 | { 704 | mutex::scoped_lock l(m_inode_mutex); 705 | boost::unordered_map::iterator i = m_inodes.find(info_hash); 706 | 707 | if (i == 
m_inodes.end()) 708 | { 709 | // create a new file 710 | sub_block_ref inode_block_index = allocate_inode(max_size, ec); 711 | if (inode_block_index == sub_block_ref::invalid) 712 | return NULL; 713 | 714 | inode_block* blk = new (std::nothrow) inode_block(info_hash, inode_block_index); 715 | if (blk == NULL) 716 | { 717 | // if this allocation fails, we need to return the block we just 718 | // allocated for it as well 719 | free_inode(inode_block_index); 720 | ec.assign(boost::system::errc::not_enough_memory, generic_category()); 721 | return NULL; 722 | } 723 | bool inserted; 724 | boost::tie(i, inserted) = m_inodes.insert(std::make_pair(info_hash, blk)); 725 | TORRENT_ASSERT(inserted); 726 | TORRENT_ASSERT(i->second == blk); 727 | 728 | m_dirty_root_block = true; 729 | 730 | // schedule this inode for flushing 731 | mutex::scoped_lock l2(m_dirty_blocks_mutex); 732 | m_dirty_blocks.push_back(blk); 733 | blk->dirty = true; 734 | // keep the inode block alive until we've had a chance 735 | // to flush it. When it's flushed it will be decremented 736 | ++blk->references; 737 | } 738 | 739 | inode_block* blk = i->second; 740 | mutex::scoped_lock l2(blk->inode_mutex); 741 | TORRENT_ASSERT(!blk->marked_for_deletion); 742 | if (blk->references == INT_MAX) 743 | { 744 | ec.assign(boost::system::errc::too_many_files_open, generic_category()); 745 | return NULL; 746 | } 747 | ++blk->references; 748 | return blk; 749 | } 750 | 751 | // close this file and make sure it's flushed 752 | void block_device::close(void* inode) 753 | { 754 | inode_block* blk = (inode_block*)inode; 755 | mutex::scoped_lock l(blk->inode_mutex); 756 | 757 | close_impl(blk, l); 758 | } 759 | 760 | void block_device::close_impl(inode_block* blk, mutex::scoped_lock& l) 761 | { 762 | TORRENT_ASSERT(blk->references > 0); 763 | --blk->references; 764 | 765 | if (blk->references > 0) return; 766 | 767 | // this was the last reference to this file. is it marked 768 | // for deletion? 
769 | if (!blk->marked_for_deletion) return; 770 | 771 | std::vector used_blocks; 772 | 773 | // return the inode block itself 774 | free_inode(blk->block_index); 775 | 776 | // return all the allocated data blocks to the free-list 777 | for (std::vector::iterator i = blk->block_map.begin() 778 | , end(blk->block_map.end()); i != end; ++i) 779 | { 780 | if (*i == block_allocator::unallocated_block) continue; 781 | used_blocks.push_back(*i); 782 | } 783 | 784 | trim_blocks(&used_blocks[0], used_blocks.size()); 785 | 786 | m_free_blocks.free_blocks(&used_blocks[0], used_blocks.size()); 787 | 788 | // we have to unlock before deleting the block 789 | // since the mutex is a member 790 | l.unlock(); 791 | delete blk; 792 | } 793 | 794 | #ifdef __linux__ 795 | #ifndef FITRIM 796 | struct fstrim_range { 797 | uint64_t start; 798 | uint64_t len; 799 | uint64_t minlen; 800 | }; 801 | #define FITRIM _IOWR('X', 121, struct fstrim_range) 802 | #endif 803 | #endif // __linux__ 804 | 805 | void block_device::trim_blocks(boost::uint32_t* b, int num_blocks) 806 | { 807 | #ifdef FITRIM 808 | for (int i = 0; i < num_blocks; ++i) 809 | { 810 | fstrim_range rng = { boost::uint64_t(b[i]) * m_block_size, m_block_size, 0}; 811 | ioctl(m_file.native_handle(), FITRIM, &rng); 812 | } 813 | #endif 814 | #ifdef FSCTL_FILE_LEVEL_TRIM 815 | FILE_LEVEL_TRIM* fstrim = TORRENT_ALLOCA(char, sizeof(FILE_LEVEL_TRIM) 816 | + sizeof(EXTENT_PAIR) * num_blocks); 817 | fstrim.PairCount = num_blocks; 818 | for (int i = 0; i < num_blocks; ++i) 819 | { 820 | fstrim.Pairs[i].Offset = boost:uint64_t(b[i]) * m_block_size; 821 | fstrim.Pairs[i].Length = m_block_size; 822 | } 823 | 824 | DeviceIoControl(m_file.native_handle(), FSCTL_FILE_LEVEL_TRIM 825 | , fstrim, sizeof(FILE_LEVEL_TRIM) + sizeof(EXTENT_PAIR) * num_blocks 826 | , NULL, 0, NULL, NULL); 827 | #endif 828 | } 829 | 830 | // return all blocks belonging to this file back to the free-list 831 | // and remove this entry from the file allocation table. 
832 | void block_device::unlink(void* inode) 833 | { 834 | TORRENT_ASSERT(!m_destructed); 835 | inode_block* blk = (inode_block*)inode; 836 | 837 | mutex::scoped_lock l(m_inode_mutex); 838 | mutex::scoped_lock l2(blk->inode_mutex); 839 | 840 | // we'll return the blocks once all handles 841 | // are closed 842 | TORRENT_ASSERT(blk->references > 0); 843 | 844 | if (blk->marked_for_deletion) return; 845 | 846 | blk->marked_for_deletion = true; 847 | 848 | boost::unordered_map::iterator i 849 | = m_inodes.find(blk->info_hash); 850 | 851 | // this file has not been unlinked, since marked_for_deletion 852 | // was false. that means this file must exist in the map 853 | TORRENT_ASSERT(i != m_inodes.end()); 854 | 855 | // in production, it's probably better to just abort if 856 | // this happens though 857 | if (i == m_inodes.end()) return; 858 | 859 | m_inodes.erase(i); 860 | m_dirty_root_block = true; 861 | 862 | if (!blk->dirty) return; 863 | 864 | mutex::scoped_lock dl(m_dirty_blocks_mutex); 865 | std::vector::iterator dirty 866 | = std::find(m_dirty_blocks.begin(), m_dirty_blocks.end(), blk); 867 | if (dirty != m_dirty_blocks.end()) 868 | { 869 | m_dirty_blocks.erase(dirty); 870 | blk->dirty = false; 871 | close_impl(blk, l2); 872 | } 873 | } 874 | 875 | // returns a rough estimate of how much free space there is on 876 | // the device 877 | boost::int64_t block_device::free_space() const 878 | { 879 | return boost::int64_t(m_free_blocks.num_free()) * m_block_size; 880 | } 881 | 882 | void block_device::stat(void* inode, fstatus* st) const 883 | { 884 | inode_block* blk = (inode_block*)inode; 885 | 886 | mutex::scoped_lock l(blk->inode_mutex); 887 | 888 | TORRENT_ASSERT(blk->references > 0); 889 | 890 | st->file_size = boost::int64_t(blk->block_map.size()) * m_block_size; 891 | st->allocated_size = boost::int64_t(blk->blocks_in_use) * m_block_size; 892 | st->info_hash = blk->info_hash; 893 | } 894 | 895 | void block_device::readdir(std::vector* dir) const 896 | { 897 
| mutex::scoped_lock l(m_inode_mutex); 898 | dir->resize(m_inodes.size()); 899 | 900 | int index = 0; 901 | for (boost::unordered_map::const_iterator i = 902 | m_inodes.begin(), end(m_inodes.end()); i != end; ++i, ++index) 903 | { 904 | fstatus& fs = (*dir)[index]; 905 | inode_block* blk = i->second; 906 | 907 | fs.info_hash = blk->info_hash; 908 | fs.file_size = boost::int64_t(blk->block_map.size()) * m_block_size; 909 | fs.allocated_size = boost::int64_t(blk->blocks_in_use) * m_block_size; 910 | } 911 | } 912 | 913 | void block_device::extent_map(void* inode, std::vector* map) const 914 | { 915 | inode_block* blk = (inode_block*)inode; 916 | mutex::scoped_lock l(blk->inode_mutex); 917 | TORRENT_ASSERT(blk->references > 0); 918 | *map = blk->block_map; 919 | } 920 | 921 | void block_device::allocator_stats(std::vector >* st) const 922 | { 923 | const int num_allocators = sizeof(m_node_allocator)/sizeof(m_node_allocator[0]); 924 | st->resize(num_allocators); 925 | for (int i = 0; i < num_allocators; ++i) 926 | (*st)[i] = m_node_allocator[i].usage(); 927 | } 928 | 929 | sub_block_ref block_device::allocate_inode(boost::uint64_t size, error_code& ec) 930 | { 931 | int target_blocks = (size + m_block_size - 1) / m_block_size; 932 | int target_inode_size = 0; 933 | while (target_blocks > ((1024 << target_inode_size) - sizeof(inode_block_t)) / 4) 934 | ++target_inode_size; 935 | 936 | if (1024 > (m_block_size >> target_inode_size)) 937 | { 938 | ec.assign(boost::system::errc::file_too_large, generic_category()); 939 | return sub_block_ref::invalid; 940 | } 941 | 942 | sub_block_ref ret = m_node_allocator[target_inode_size].allocate_node(m_free_blocks); 943 | if (ret == sub_block_ref::invalid) 944 | ec.assign(boost::system::errc::no_space_on_device, generic_category()); 945 | 946 | return ret; 947 | } 948 | 949 | void block_device::free_inode(sub_block_ref iblock) 950 | { 951 | if (m_node_allocator[iblock.subblock_size].free_node(iblock, m_free_blocks)) 952 | 
trim_blocks(&iblock.block, 1); 953 | } 954 | 955 | bool block_device::check_iop(inode_block* inode, file::iovec_t const* iov, int nvec 956 | , boost::int64_t offset, error_code& ec) const 957 | { 958 | // negative number of iovecs is clearly invalid 959 | if (nvec < 0) 960 | { 961 | ec.assign(boost::system::errc::invalid_argument, generic_category()); 962 | return true; 963 | } 964 | 965 | // negative offsets are invalid 966 | if (offset < 0) 967 | { 968 | ec.assign(boost::system::errc::invalid_argument, generic_category()); 969 | return true; 970 | } 971 | 972 | int iop_size = bufs_size(iov, nvec); 973 | 974 | // files cannot be larger than block_size * block_map_size 975 | // where block_map_size depends on how many 32 bit words 976 | // fit in the inode_block, which is block_size - inode_header_size 977 | if (offset + iop_size > boost::uint64_t(inode->max_size) * m_block_size) 978 | { 979 | ec.assign(boost::system::errc::file_too_large, generic_category()); 980 | return true; 981 | } 982 | 983 | return false; 984 | } 985 | 986 | int block_device::preadv(void* inode, file::iovec_t const* iov, int nvec 987 | , boost::int64_t offset, error_code& ec) 988 | { 989 | TORRENT_ASSERT(!m_destructed); 990 | inode_block* blk = (inode_block*)inode; 991 | 992 | // whoever is making this call must hold a reference to the inode 993 | TORRENT_ASSERT(blk->references > 0); 994 | 995 | if (check_iop(blk, iov, nvec, offset, ec)) return -1; 996 | 997 | // the block within the file address space 998 | int file_block = offset / m_block_size; 999 | int block_offset = offset % m_block_size; 1000 | 1001 | int left_to_read= bufs_size(iov, nvec); 1002 | int read = 0; 1003 | 1004 | while (left_to_read> 0) 1005 | { 1006 | const int to_read = (std::min)(int(m_block_size - block_offset) 1007 | , left_to_read); 1008 | 1009 | int num_vecs = 0; 1010 | int c = to_read; 1011 | while (c > 0) 1012 | { 1013 | c -= iov[num_vecs].iov_len; 1014 | ++num_vecs; 1015 | } 1016 | 1017 | // the iovecs _must_ be 
divided up in a way that no individual buffer 1018 | // spans a block boundary. This should be the case in bittorrent 1019 | TORRENT_ASSERT(c == 0); 1020 | if (c != 0) 1021 | { 1022 | ec.assign(boost::system::errc::invalid_argument, generic_category()); 1023 | return -1; 1024 | } 1025 | 1026 | int ret = preadv_impl(blk, iov, num_vecs, file_block, block_offset, ec); 1027 | if (ret < 0) return -1; 1028 | 1029 | block_offset = 0; 1030 | left_to_read -= to_read; 1031 | iov += num_vecs; 1032 | nvec -= num_vecs; 1033 | ++file_block; 1034 | read += ret; 1035 | } 1036 | 1037 | return read; 1038 | } 1039 | 1040 | int block_device::preadv_impl(inode_block* blk, file::iovec_t const* iov, int nvec 1041 | , int file_block, int block_offset, error_code& ec) 1042 | { 1043 | // the block index in the device address space 1044 | int device_block = block_allocator::unallocated_block; 1045 | 1046 | // look up the device block to read from 1047 | mutex::scoped_lock l(blk->inode_mutex); 1048 | if (blk->block_map.size() > file_block) 1049 | device_block = blk->block_map[file_block]; 1050 | l.unlock(); 1051 | 1052 | if (device_block == block_allocator::unallocated_block) 1053 | { 1054 | // we're reading unallocated space. 
return zeroes 1055 | int ret = 0; 1056 | for (int i = 0; i < nvec; ++i) 1057 | { 1058 | memset(iov[i].iov_base, 0, iov[i].iov_len); 1059 | ret += iov[i].iov_len; 1060 | } 1061 | return ret; 1062 | } 1063 | 1064 | boost::int64_t dev_offset = boost::int64_t(device_block) * m_block_size 1065 | + (block_offset); 1066 | 1067 | #if DISK_ACCESS_LOG 1068 | int id = write_disk_log(m_access_log, dev_offset, 0, start_read, time_now_hires()); 1069 | #endif 1070 | int ret = m_file.readv(dev_offset, iov, nvec, ec); 1071 | #if DISK_ACCESS_LOG 1072 | write_disk_log(m_access_log, dev_offset + bufs_size(iov, nvec), id 1073 | , complete_read, time_now_hires()); 1074 | #endif 1075 | return ret; 1076 | } 1077 | 1078 | int block_device::pwritev(void* inode, file::iovec_t const* iov, int nvec 1079 | , boost::int64_t offset, error_code& ec) 1080 | { 1081 | TORRENT_ASSERT(!m_destructed); 1082 | inode_block* blk = (inode_block*)inode; 1083 | 1084 | // whoever is making this call must hold a reference to the inode 1085 | TORRENT_ASSERT(blk->references > 0); 1086 | 1087 | if (check_iop(blk, iov, nvec, offset, ec)) return -1; 1088 | 1089 | // the block within the file address space 1090 | int file_block = offset / m_block_size; 1091 | int block_offset = offset % m_block_size; 1092 | 1093 | int left_to_write = bufs_size(iov, nvec); 1094 | int written = 0; 1095 | 1096 | while (left_to_write > 0) 1097 | { 1098 | const int to_write = (std::min)(int(m_block_size - block_offset) 1099 | , left_to_write); 1100 | 1101 | int num_vecs = 0; 1102 | int c = to_write; 1103 | while (c > 0) 1104 | { 1105 | c -= iov[num_vecs].iov_len; 1106 | ++num_vecs; 1107 | } 1108 | 1109 | // the iovecs _must_ be divided up in a way that no individual buffer 1110 | // spans a block boundary. 
This should be the case in bittorrent 1111 | TORRENT_ASSERT(c == 0); 1112 | if (c != 0) 1113 | { 1114 | ec.assign(boost::system::errc::invalid_argument, generic_category()); 1115 | return -1; 1116 | } 1117 | 1118 | int ret = pwritev_impl(blk, iov, num_vecs, file_block, block_offset, ec); 1119 | if (ret < 0) return -1; 1120 | 1121 | block_offset = 0; 1122 | left_to_write -= to_write; 1123 | iov += num_vecs; 1124 | nvec -= num_vecs; 1125 | ++file_block; 1126 | 1127 | written += ret; 1128 | } 1129 | 1130 | return written; 1131 | } 1132 | 1133 | int block_device::pwritev_impl(inode_block* blk, file::iovec_t const* iov 1134 | , int nvec, int file_block, int block_offset, error_code& ec) 1135 | { 1136 | // a single write is not allowed to span multiple blocks. 1137 | TORRENT_ASSERT(block_offset + bufs_size(iov, nvec) <= m_block_size); 1138 | 1139 | // the block index in the device address space 1140 | int device_block = block_allocator::unallocated_block; 1141 | 1142 | // look up the device block to write to 1143 | mutex::scoped_lock l(blk->inode_mutex); 1144 | if (blk->block_map.size() <= file_block) 1145 | { 1146 | if (file_block >= blk->max_size) 1147 | { 1148 | ec.assign(boost::system::errc::file_too_large, generic_category()); 1149 | return -1; 1150 | } 1151 | // no allocated space for this block, extend the 1152 | // block map and fill the new entries with unallocated_block 1153 | blk->block_map.resize(file_block + 1, block_allocator::unallocated_block); 1154 | } 1155 | else 1156 | { 1157 | // this slot exists in the block map, now 1158 | // let's see if there's already a block allocated 1159 | // on the device for this file block 1160 | device_block = blk->block_map[file_block]; 1161 | } 1162 | 1163 | boost::optional seq; 1164 | 1165 | if (device_block == block_allocator::unallocated_block) 1166 | { 1167 | // we're allocating a block. 
Make sure any other thread that's also 1168 | // allocating a new block to write to is serialized with this thread, 1169 | // to force full sequenctial writes to the disk 1170 | seq = boost::in_place(std::ref(m_sequential_write_mutex)); 1171 | 1172 | // we need to allocae a new block on the device 1173 | // this is a cheap O(1) operation that doesn't 1174 | // have to touch the disk, it's OK to do this 1175 | // while still holding the lock. In fact, we need 1176 | // to hold the lock to avoid another thread allocating 1177 | // the same block 1178 | device_block = m_free_blocks.allocate_block(false); 1179 | if (device_block == block_allocator::unallocated_block) 1180 | { 1181 | // we failed to allocate a new block 1182 | // we're out of space on the device! 1183 | ec.assign(boost::system::errc::no_space_on_device, generic_category()); 1184 | return -1; 1185 | } 1186 | 1187 | // allright, let's add our newly allocated block 1188 | // to the block map 1189 | blk->block_map[file_block] = device_block; 1190 | ++blk->blocks_in_use; 1191 | if (!blk->dirty) 1192 | { 1193 | mutex::scoped_lock l2(m_dirty_blocks_mutex); 1194 | m_dirty_blocks.push_back(blk); 1195 | blk->dirty = true; 1196 | // keep the inode block alive until we've had a chance 1197 | // to flush it. 
When it's flushed it will be decremented 1198 | ++blk->references; 1199 | } 1200 | if (bufs_size(iov, nvec) != m_block_size) 1201 | { 1202 | printf("performance warning: writing less than block size: %d B " 1203 | "( block-size: %d B) block: %d\n" 1204 | , bufs_size(iov, nvec), m_block_size, device_block); 1205 | } 1206 | } 1207 | else 1208 | { 1209 | printf("performance warning: writing to existing block %d\n", device_block); 1210 | } 1211 | 1212 | l.unlock(); 1213 | 1214 | boost::int64_t dev_offset = boost::int64_t(device_block) * m_block_size 1215 | + block_offset; 1216 | 1217 | #if DISK_ACCESS_LOG 1218 | int id = write_disk_log(m_access_log, dev_offset, 0, start_write, time_now_hires()); 1219 | #endif 1220 | int ret = m_file.writev(dev_offset, iov, nvec, ec); 1221 | #if DISK_ACCESS_LOG 1222 | write_disk_log(m_access_log, dev_offset + bufs_size(iov, nvec), id, complete_write, time_now_hires()); 1223 | #endif 1224 | return ret; 1225 | } 1226 | 1227 | // this implements the libtorrent storage interface, to store torrents in 1228 | // the block_device 1229 | struct block_device_storage : libtorrent::storage_interface 1230 | { 1231 | block_device_storage(boost::shared_ptr dev, file_storage const& fs, sha1_hash const& ih) 1232 | : m_device(dev) 1233 | , m_info_hash(ih) 1234 | , m_inode(0) 1235 | , m_piece_size(fs.piece_length()) 1236 | , m_total_size(fs.total_size()) 1237 | {} 1238 | 1239 | virtual bool tick() 1240 | { 1241 | fprintf(stderr, "flushing inodes and root block\n"); 1242 | m_device->flush_inodes(); 1243 | m_device->flush_root_block(); 1244 | return false; 1245 | } 1246 | 1247 | // create directories and set file sizes 1248 | virtual void initialize(storage_error& ec) 1249 | { 1250 | } 1251 | 1252 | virtual int readv(file::iovec_t const* bufs, int num_bufs 1253 | , int piece, int offset, int flags, storage_error& ec) 1254 | { 1255 | // TODO: m_inode needs mutex protection 1256 | if (m_inode == NULL) 1257 | { 1258 | m_inode = m_device->open(m_info_hash, 
m_total_size, ec.ec); 1259 | if (ec) return -1; 1260 | } 1261 | 1262 | boost::int64_t toffset = boost::int64_t(piece) * m_piece_size + offset; 1263 | return m_device->preadv(m_inode, bufs, num_bufs, toffset, ec.ec); 1264 | } 1265 | 1266 | virtual int writev(file::iovec_t const* bufs, int num_bufs 1267 | , int piece, int offset, int flags, storage_error& ec) 1268 | { 1269 | // TODO: m_inode needs mutex protection 1270 | if (m_inode == NULL) 1271 | { 1272 | m_inode = m_device->open(m_info_hash, m_total_size, ec.ec); 1273 | if (ec) return -1; 1274 | } 1275 | 1276 | boost::int64_t toffset = boost::int64_t(piece) * m_piece_size + offset; 1277 | return m_device->pwritev(m_inode, bufs, num_bufs, toffset, ec.ec); 1278 | } 1279 | 1280 | virtual bool has_any_file(storage_error& ec) 1281 | { 1282 | return m_device->exists(m_info_hash); 1283 | } 1284 | 1285 | // change the priorities of files. This is a fenced job and is 1286 | // guaranteed to be the only running function on this storage 1287 | virtual void set_file_priority(std::vector const& prio, storage_error& ec) {} 1288 | 1289 | // non-zero return value indicates an error 1290 | virtual int move_storage(std::string const& save_path, int flags 1291 | , storage_error& ec) { return libtorrent::piece_manager::no_error; } 1292 | 1293 | // verify storage dependent fast resume entries 1294 | virtual bool verify_resume_data(lazy_entry const& rd, storage_error& ec) { return true; } 1295 | 1296 | // write storage dependent fast resume entries 1297 | virtual void write_resume_data(entry& rd, storage_error& ec) const {} 1298 | 1299 | // this will close all open files that are opened for 1300 | // writing. This is called when a torrent has finished 1301 | // downloading. 1302 | // non-zero return value indicates an error 1303 | virtual void release_files(storage_error& ec) {} 1304 | 1305 | // this will rename the file specified by index. 
1306 | virtual void rename_file(int index, std::string const& new_filenamem, storage_error& ec) {} 1307 | 1308 | // this will close all open files and delete them 1309 | // non-zero return value indicates an error 1310 | virtual void delete_files(storage_error& ec) 1311 | { 1312 | if (m_inode == NULL) return; 1313 | m_device->unlink(m_inode); 1314 | m_device->close(m_inode); 1315 | m_inode = NULL; 1316 | } 1317 | 1318 | virtual ~block_device_storage() 1319 | { 1320 | if (m_inode) 1321 | m_device->close(m_inode); 1322 | } 1323 | 1324 | private: 1325 | boost::shared_ptr m_device; 1326 | sha1_hash m_info_hash; 1327 | 1328 | // refers to the storage for this specific torrent file 1329 | void* m_inode; 1330 | 1331 | int m_piece_size; 1332 | boost::uint64_t m_total_size; 1333 | }; 1334 | 1335 | libtorrent::storage_interface* block_device_storage_constructor( 1336 | boost::shared_ptr dev, storage_params const& params) 1337 | { 1338 | return new block_device_storage(dev, *params.files, params.info->info_hash()); 1339 | } 1340 | 1341 | -------------------------------------------------------------------------------- /src/block_device.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (c) 2014, Arvid Norberg 4 | All rights reserved. 5 | 6 | btfs Source Code is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 
10 | 11 | For details, see LICENSE 12 | 13 | */ 14 | 15 | #ifndef TORRENT_STORAGE_DEVICE 16 | #define TORRENT_STORAGE_DEVICE 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "libtorrent/config.hpp" 24 | #include "libtorrent/file.hpp" 25 | #include "libtorrent/error_code.hpp" 26 | #include "libtorrent/peer_id.hpp" // for sha1_hash 27 | #include "libtorrent/bitfield.hpp" 28 | #include "libtorrent/thread.hpp" 29 | #include "libtorrent/storage.hpp" 30 | #include "block_allocator.hpp" 31 | #include "pool_allocator.hpp" 32 | 33 | namespace libtorrent 34 | { 35 | struct storage_params; 36 | } 37 | 38 | using libtorrent::error_code; 39 | using libtorrent::file; 40 | using libtorrent::sha1_hash; 41 | using libtorrent::bitfield; 42 | using libtorrent::mutex; 43 | using libtorrent::storage_error; 44 | using libtorrent::storage_params; 45 | 46 | struct inode_block; 47 | struct block_device; 48 | 49 | #define DISK_ACCESS_LOG 1 50 | 51 | #if DISK_ACCESS_LOG 52 | #include "libtorrent/time.hpp" 53 | using libtorrent::ptime; 54 | #endif 55 | 56 | TORRENT_EXPORT libtorrent::storage_interface* block_device_storage_constructor( 57 | boost::shared_ptr dev, storage_params const& params); 58 | 59 | // this is the singleton all torrent storages uses to 60 | // actually write to to the device 61 | struct TORRENT_EXPORT block_device 62 | { 63 | // the blocks that are allocated on the 64 | // raw block device are 4 MB. 
65 | enum { default_block_size = 4 * 1024 * 1024 }; 66 | 67 | block_device(std::string const& device_path, error_code& ec); 68 | ~block_device(); 69 | 70 | int preadv(void* inode, file::iovec_t const* iov, int nvec, boost::int64_t offset, error_code& ec); 71 | int pwritev(void* inode, file::iovec_t const* iov, int nvec, boost::int64_t offset, error_code& ec); 72 | 73 | // returns true if there is an entry for this info-hash 74 | bool exists(sha1_hash const& info_hash) const; 75 | 76 | // returns a file handle 77 | // max_size is required if the file doesn't exist 78 | void* open(sha1_hash const& info_hash, boost::uint64_t max_size, error_code& ec); 79 | 80 | // close this file and make sure it's flushed 81 | void close(void* inode); 82 | 83 | // return all blocks belonging to this file back to the free-list 84 | // and remove this entry from the file allocation table. 85 | void unlink(void* inode); 86 | 87 | // returns a rough estimate of how much free space there is on 88 | // the device 89 | boost::int64_t free_space() const; 90 | 91 | struct fstatus 92 | { 93 | sha1_hash info_hash; 94 | boost::int64_t file_size; 95 | boost::int64_t allocated_size; 96 | }; 97 | 98 | void stat(void* inode, fstatus* st) const; 99 | 100 | void readdir(std::vector* dir) const; 101 | 102 | void extent_map(void* inode, std::vector* map) const; 103 | 104 | void allocator_stats(std::vector >* st) const; 105 | 106 | // -------- initialization ------------ 107 | // you either need to call read_root_block() to initialize 108 | // the block device, assuming there is a formatted file 109 | // system on there already. If not, and you wish to initialize 110 | // the device, call format(). 
111 | 112 | // initialize device 113 | void format(error_code& ec, int block_size = default_block_size); 114 | 115 | // read an existing filesystem from drive 116 | void read_root_block(error_code& ec); 117 | 118 | int block_size() const { return m_block_size; } 119 | 120 | void flush_inodes(); 121 | void flush_root_block(); 122 | 123 | private: 124 | 125 | int preadv_impl(inode_block* blk, file::iovec_t const* iov, int nvec 126 | , int file_block, int block_offset, error_code& ec); 127 | int pwritev_impl(inode_block* blk, file::iovec_t const* iov, int nvec 128 | , int file_block, int block_offset, error_code& ec); 129 | 130 | void close_impl(inode_block* blk, mutex::scoped_lock& l); 131 | void trim_blocks(boost::uint32_t* b, int num_blocks); 132 | 133 | bool check_iop(inode_block* inode, file::iovec_t const* iov, int nvec 134 | , boost::int64_t offset, error_code& ec) const; 135 | void read_inode(bitfield& used_blocks, sub_block_ref iblock, error_code& ec); 136 | 137 | sub_block_ref allocate_inode(boost::uint64_t max_size, error_code& ec); 138 | void free_inode(sub_block_ref iblock); 139 | 140 | // this is the max size (in bytes) of the device 141 | boost::uint64_t m_max_size; 142 | 143 | // this is the filesystem block size (typically 4 MiB) 144 | boost::uint32_t m_block_size; 145 | 146 | // this is the underlying media block size (typically 512 B) 147 | boost::uint32_t m_media_block_size; 148 | 149 | // the file referring to the block device or flat file 150 | file m_file; 151 | 152 | mutable mutex m_dirty_blocks_mutex; 153 | std::vector m_dirty_blocks; 154 | 155 | // this is the file list 156 | mutable mutex m_inode_mutex; 157 | boost::unordered_map m_inodes; 158 | bool m_dirty_root_block; 159 | 160 | // this mutex is used to serialize all writes to newly allocated blocks. 
161 | // this forces accurate sequential writes 162 | mutable mutex m_sequential_write_mutex; 163 | 164 | // keeps track of free filesystem blocks 165 | block_allocator m_free_blocks; 166 | 167 | // keeps track of allocated but unused inode 168 | // slots of varying sizes 169 | pool_allocator m_node_allocator[13]; 170 | 171 | #if defined TORRENT_DEBUG || TORRENT_RELEASE_ASSERTS 172 | bool m_destructed; 173 | #endif 174 | 175 | #if DISK_ACCESS_LOG 176 | FILE* m_access_log; 177 | ptime m_start_time; 178 | #endif 179 | 180 | }; 181 | 182 | #endif 183 | 184 | -------------------------------------------------------------------------------- /src/btfs.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (c) 2014, Arvid Norberg 4 | All rights reserved. 5 | 6 | btfs Source Code is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | For details, see LICENSE 12 | 13 | */ 14 | 15 | #include "block_device.hpp" 16 | #include "libtorrent/escape_string.hpp" 17 | 18 | #include 19 | 20 | using libtorrent::from_hex; 21 | using libtorrent::to_hex; 22 | using libtorrent::file; 23 | 24 | void print_usage() 25 | { 26 | char const* usage_string = "usage: btfs command [arg] device\n\n" 27 | "command must be one of:\n" 28 | " list - list file contents on device\n" 29 | " initialize - format device as a bittorrent filesystem\n" 30 | " block size is specified in bytes.\n" 31 | " recommended setting is 4 MB, 4194304 bytes\n" 32 | " unlink - unlink (remove) the specifed file. This\n" 33 | " command takes an additional argument.\n" 34 | " the file is specified as a 40 digit hex string.\n" 35 | " read - reads the entire file specified and prints it\n" 36 | " to stdout. 
This command takes an additional\n" 37 | " argument which is the file to print.\n" 38 | " write - reads from stdin and writes to the specified\n" 39 | " file. This command takes an additional\n" 40 | " argument which is the file to write to.\n" 41 | " visualize prints a graphical representation of the\n" 42 | " filesystem to stdout.\n" 43 | " stats prints i-node allocation statistics\n" 44 | "\n" 45 | "filenames are always 40 hex digit sha-1 digests.\n" 46 | "\n" 47 | "the device may be a file, which must have been pre-allocated\n" 48 | "to the desired size\n"; 49 | 50 | fputs(usage_string, stderr); 51 | exit(1); 52 | } 53 | 54 | enum command_t 55 | { 56 | cmd_list, cmd_initialize, cmd_unlink, cmd_read, cmd_write, cmd_visualize, cmd_stats 57 | }; 58 | 59 | int parse_command(int argc, char* argv[], char const* argument[], char const*& device) 60 | { 61 | // skip executable filename 62 | ++argv; 63 | --argc; 64 | int ret; 65 | int num_args = 0; 66 | argument[0] = NULL; 67 | argument[1] = NULL; 68 | if (strcmp(argv[0], "list") == 0) 69 | { 70 | ret = cmd_list; 71 | num_args = 0; 72 | } 73 | else if (strcmp(argv[0], "visualize") == 0) 74 | { 75 | ret = cmd_visualize; 76 | num_args = 0; 77 | } 78 | else if (strcmp(argv[0], "stats") == 0) 79 | { 80 | ret = cmd_stats; 81 | num_args = 0; 82 | } 83 | else if (strcmp(argv[0], "initialize") == 0) 84 | { 85 | ret = cmd_initialize; 86 | num_args = 1; 87 | } 88 | else if (strcmp(argv[0], "unlink") == 0) 89 | { 90 | ret = cmd_unlink; 91 | num_args = 1; 92 | } 93 | else if (strcmp(argv[0], "read") == 0) 94 | { 95 | ret = cmd_read; 96 | num_args = 1; 97 | } 98 | else if (strcmp(argv[0], "write") == 0) 99 | { 100 | ret = cmd_write; 101 | num_args = 2; 102 | } 103 | else 104 | { 105 | fprintf(stderr, "unknown command: \"%s\"\n", argv[0]); 106 | print_usage(); 107 | return 0; 108 | } 109 | ++argv; 110 | --argc; 111 | 112 | if (argc <= 0) 113 | { 114 | fputs("too few arguments\n", stderr); 115 | print_usage(); 116 | return 0; 117 | } 
118 | 119 | for (int i = 0; i < num_args; ++i) 120 | { 121 | argument[i] = argv[0]; 122 | ++argv; 123 | --argc; 124 | 125 | if (argc <= 0) 126 | { 127 | fputs("too few arguments\n", stderr); 128 | print_usage(); 129 | return 0; 130 | } 131 | } 132 | 133 | if (argc > 1) 134 | { 135 | fputs("too many arguments\n", stderr); 136 | print_usage(); 137 | return 0; 138 | } 139 | 140 | device = argv[0]; 141 | return ret; 142 | } 143 | 144 | int main(int argc, char* argv[]) 145 | { 146 | if (argc < 3) print_usage(); 147 | 148 | char const* argument[5]; 149 | char const* device = NULL; 150 | int command = parse_command(argc, argv, argument, device); 151 | 152 | error_code ec; 153 | block_device dev(device, ec); 154 | if (ec) 155 | { 156 | fprintf(stderr, "Error opening device or file: %s\n", ec.message().c_str()); 157 | return 1; 158 | } 159 | 160 | sha1_hash info_hash; 161 | if (command != cmd_initialize) 162 | { 163 | // the first argument is always a sha1-hash 164 | if (argument[0]) 165 | { 166 | if (strlen(argument[0]) != 40) 167 | { 168 | fprintf(stderr, "invalid filename argument; \"%s\". Expected 40 hex digits" 169 | " len=%d\n" 170 | , argument[0], int(strlen(argument[0]))); 171 | return 1; 172 | } 173 | if (!from_hex(argument[0], 40, (char*)&info_hash[0])) 174 | { 175 | fprintf(stderr, "invalid filename argument; \"%s\". 
Expected 40 hex digits\n", argument[0]); 176 | return 1; 177 | } 178 | } 179 | 180 | dev.read_root_block(ec); 181 | if (ec) 182 | { 183 | fprintf(stderr, "failed to read filesystem: %s\n", ec.message().c_str()); 184 | return 1; 185 | } 186 | } 187 | switch (command) 188 | { 189 | case cmd_list: 190 | { 191 | std::vector dir; 192 | dev.readdir(&dir); 193 | for (std::vector::iterator i = dir.begin() 194 | , end(dir.end()); i != end; ++i) 195 | { 196 | fprintf(stderr, "%s s: %10" PRId64 " a: %10" PRId64 "\n" 197 | , to_hex(i->info_hash.to_string()).c_str(), i->file_size, i->allocated_size); 198 | } 199 | fprintf(stderr, "%10" PRId64 " bytes free\n", dev.free_space()); 200 | break; 201 | } 202 | case cmd_initialize: 203 | { 204 | int block_size = atoi(argument[0]); 205 | dev.format(ec, block_size); 206 | if (ec) 207 | { 208 | fprintf(stderr, "failed to initialize device '%s': %s\n", device, ec.message().c_str()); 209 | return 1; 210 | } 211 | fprintf(stderr, "device '%s' successfully initialized. 
%" PRId64 " bytes free\n" 212 | , device, dev.free_space()); 213 | break; 214 | } 215 | case cmd_unlink: 216 | { 217 | if (!dev.exists(info_hash)) 218 | { 219 | fprintf(stderr, "file \"%s\" does not exist\n", argument[0]); 220 | return 1; 221 | } 222 | void* inode = dev.open(info_hash, 0, ec); 223 | if (ec) 224 | { 225 | fprintf(stderr, "failed to open file \"%s\": %s\n", argument[0], ec.message().c_str()); 226 | return 1; 227 | } 228 | dev.unlink(inode); 229 | dev.close(inode); 230 | break; 231 | } 232 | case cmd_read: 233 | { 234 | if (!dev.exists(info_hash)) 235 | { 236 | fprintf(stderr, "file \"%s\" does not exist\n", argument[0]); 237 | return 1; 238 | } 239 | void* inode = dev.open(info_hash, 0, ec); 240 | if (ec) 241 | { 242 | fprintf(stderr, "failed to open file \"%s\": %s\n", argument[0], ec.message().c_str()); 243 | return 1; 244 | } 245 | char filebuf[4096]; 246 | memset(filebuf, 0, sizeof(filebuf)); 247 | file::iovec_t b = { filebuf, sizeof(filebuf) }; 248 | boost::int64_t offset = 0; 249 | block_device::fstatus st; 250 | dev.stat(inode, &st); 251 | while (offset < st.file_size) 252 | { 253 | dev.preadv(inode, &b, 1, offset, ec); 254 | if (ec) 255 | { 256 | fprintf(stderr, "error reading from file: \"%s\": %s\n" 257 | , argument[0], ec.message().c_str()); 258 | break; 259 | } 260 | offset += sizeof(filebuf); 261 | fwrite(filebuf, 1, sizeof(filebuf), stdout); 262 | } 263 | fprintf(stderr, "read %" PRId64 " bytes from \"%s\"\n", offset, argument[0]); 264 | dev.close(inode); 265 | break; 266 | } 267 | case cmd_write: 268 | { 269 | if (!dev.exists(info_hash)) 270 | fprintf(stderr, "creating new file \"%s\"\n", argument[0]); 271 | FILE* input = fopen(argument[1], "rb"); 272 | if (input == NULL) 273 | { 274 | fprintf(stderr, "failed to open input file \"%s\": (%d) %s\n" 275 | , argument[1], errno, strerror(errno)); 276 | return 1; 277 | } 278 | struct stat st; 279 | if (stat(argument[1], &st) < 0) 280 | { 281 | fprintf(stderr, "failed to stat input file: 
\"%s\": (%d) %s\n" 282 | , argument[1], errno, strerror(errno)); 283 | } 284 | void* inode = dev.open(info_hash, st.st_size, ec); 285 | if (ec) 286 | { 287 | fprintf(stderr, "failed to open file \"%s\": %s\n" 288 | , argument[0], ec.message().c_str()); 289 | return 1; 290 | } 291 | std::vector filebuf(4 * 1024 * 1024); 292 | int len = 0; 293 | memset(&filebuf[0], 0, filebuf.size()); 294 | file::iovec_t b[2] = { { &filebuf[0], filebuf.size()/2} 295 | , { &filebuf[filebuf.size()/2], filebuf.size()/2 } }; 296 | 297 | boost::int64_t offset = 0; 298 | while ((len = fread(&filebuf[0], 1, filebuf.size(), input)) > 0) 299 | { 300 | if (len < filebuf.size()/2) 301 | { 302 | b[0].iov_len = len; 303 | b[1].iov_len = 0; 304 | } 305 | else 306 | { 307 | b[1].iov_len = len - filebuf.size() / 2; 308 | } 309 | 310 | dev.pwritev(inode, b, 2, offset, ec); 311 | if (ec) 312 | { 313 | fprintf(stderr, "error writing to file: \"%s\": %s\n" 314 | , argument[0], ec.message().c_str()); 315 | break; 316 | } 317 | offset += filebuf.size(); 318 | } 319 | fclose(input); 320 | fprintf(stderr, "wrote %" PRId64 " bytes to \"%s\"\n", offset, argument[0]); 321 | dev.close(inode); 322 | break; 323 | } 324 | case cmd_visualize: 325 | { 326 | std::vector dir; 327 | dev.readdir(&dir); 328 | std::vector blocks; 329 | for (std::vector::iterator i = dir.begin() 330 | , end(dir.end()); i != end; ++i) 331 | { 332 | printf("%s s: %10" PRId64 " a: %10" PRId64 "\n" 333 | , to_hex(i->info_hash.to_string()).c_str(), i->file_size, i->allocated_size); 334 | error_code ec; 335 | void* inode = dev.open(i->info_hash, 0, ec); 336 | if (ec) 337 | { 338 | fprintf(stderr, "error opening file \"%s\": %s\n" 339 | , to_hex(i->info_hash.to_string()).c_str(), ec.message().c_str()); 340 | break; 341 | } 342 | dev.extent_map(inode, &blocks); 343 | for (int i = 0; i < int(blocks.size()); ++i) 344 | { 345 | if (blocks[i] == 0xffffffff) printf("."); 346 | else printf("%u ", blocks[i]); 347 | } 348 | printf("\n"); 349 | 
dev.close(inode); 350 | } 351 | fprintf(stderr, "%10" PRId64 " bytes free\n", dev.free_space()); 352 | break; 353 | } 354 | case cmd_stats: 355 | { 356 | std::vector > st; 357 | dev.allocator_stats(&st); 358 | int k = 0; 359 | fprintf(stderr, "i-node allocator stats:\n"); 360 | for (std::vector >::iterator i = st.begin() 361 | , end(st.end()); i != end; ++i, ++k) 362 | { 363 | fprintf(stderr, "%4d kiB [ in-use: %-5d allocated: %-5d ]\n" 364 | , 1 << k, i->first, i->second); 365 | } 366 | } 367 | break; 368 | } 369 | }; 370 | -------------------------------------------------------------------------------- /src/pool_allocator.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (c) 2014, Arvid Norberg 4 | All rights reserved. 5 | 6 | btfs Source Code is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 
10 | 11 | For details, see LICENSE 12 | 13 | */ 14 | 15 | #include "pool_allocator.hpp" 16 | #include "block_allocator.hpp" 17 | 18 | pool_allocator::pool_allocator() 19 | : m_nodes_per_block(0) 20 | , m_node_size(0) 21 | , m_in_use(0) 22 | {} 23 | 24 | void pool_allocator::init(int nodes_per_block, int size) 25 | { 26 | m_nodes_per_block = nodes_per_block; 27 | m_node_size = size; 28 | } 29 | 30 | void pool_allocator::mark_in_use(sub_block_ref blk) 31 | { 32 | TORRENT_ASSERT(blk.subblock_size == m_node_size); 33 | 34 | ++m_in_use; 35 | 36 | // we're not locking here because this is only used 37 | // when initializing the allocator, which is single- 38 | // threaded 39 | std::set::iterator i = m_free_nodes.find(blk); 40 | if (i != m_free_nodes.end()) 41 | { 42 | m_free_nodes.erase(i); 43 | return; 44 | } 45 | 46 | for (int i = 0; i < m_nodes_per_block; ++i) 47 | { 48 | if (i == blk.subblock) continue; 49 | sub_block_ref r = {blk.block, boost::uint16_t(i), boost::uint8_t(m_node_size)}; 50 | m_free_nodes.insert(r); 51 | } 52 | } 53 | 54 | sub_block_ref pool_allocator::allocate_node(block_allocator& alloc) 55 | { 56 | mutex::scoped_lock l(m_mutex); 57 | if (!m_free_nodes.empty()) 58 | { 59 | sub_block_ref ret = *m_free_nodes.begin(); 60 | m_free_nodes.erase(m_free_nodes.begin()); 61 | ++m_in_use; 62 | return ret; 63 | } 64 | 65 | boost::uint32_t block = alloc.allocate_block(true); 66 | if (block == block_allocator::unallocated_block) return sub_block_ref::invalid; 67 | sub_block_ref ret = { block, 0, boost::uint8_t(m_node_size) }; 68 | for (int i = 1; i < m_nodes_per_block; ++i) 69 | { 70 | sub_block_ref r = {block, boost::uint16_t(i), boost::uint8_t(m_node_size)}; 71 | m_free_nodes.insert(r); 72 | } 73 | ++m_in_use; 74 | return ret; 75 | } 76 | 77 | bool pool_allocator::free_node(sub_block_ref blk, block_allocator& alloc) 78 | { 79 | TORRENT_ASSERT(blk.subblock_size == m_node_size); 80 | 81 | mutex::scoped_lock l(m_mutex); 82 | 83 | m_free_nodes.insert(blk); 84 | 85 
| // now, see if we have freed an entire filesystem block 86 | // if so, we need to return it to the block allocator 87 | typedef std::set::iterator iter; 88 | sub_block_ref r = {blk.block, 0, blk.subblock_size}; 89 | m_free_nodes.insert(r); 90 | iter first = std::lower_bound(m_free_nodes.begin(), m_free_nodes.end(), r); 91 | iter end = m_free_nodes.end(); 92 | int num_free_in_block = 0; 93 | iter i; 94 | for (i = first; i != end && i->block == blk.block; ++i) 95 | ++num_free_in_block; 96 | 97 | if (num_free_in_block < m_nodes_per_block) return false; 98 | 99 | alloc.free_block(blk.block); 100 | m_free_nodes.erase(first, i); 101 | return true; 102 | } 103 | 104 | std::pair pool_allocator::usage() const 105 | { 106 | mutex::scoped_lock l(m_mutex); 107 | return std::pair(m_in_use, m_free_nodes.size()); 108 | } 109 | 110 | const sub_block_ref sub_block_ref::invalid = {0,0,0}; 111 | 112 | -------------------------------------------------------------------------------- /src/pool_allocator.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (c) 2014, Arvid Norberg 4 | All rights reserved. 5 | 6 | btfs Source Code is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | For details, see LICENSE 12 | 13 | */ 14 | 15 | #ifndef TORRENT_POOL_ALLOCATOR_HPP_INCLUDED 16 | #define TORRENT_POOL_ALLOCATOR_HPP_INCLUDED 17 | 18 | // this is the pool allocator used to allocate inodes 19 | // each instantiation of a pool allocator allocates 20 | // inodes of a certain size. Typical sizes are: 21 | // 1 kiB, 2 kiB, 4 kiB, 8 kiB, 16 kiB, 32kiB, 64 kiB 22 | // 128 kiB, 256 kiB, 512 kiB, 1 MiB, 2 MiB, 4 MiB. 
23 | 24 | // the maximum file size each of those inodes can hold 25 | // are: 26 | // 960 MiB, 1984 MiB, 4032 MiB, 8128 MiB, 16320 MiB, 32704 MiB 27 | // and so on. i.e. (size of inode - 64) MiB 28 | 29 | #include 30 | #include 31 | 32 | #include "libtorrent/thread.hpp" 33 | #include "block_allocator.hpp" 34 | 35 | using libtorrent::mutex; 36 | 37 | struct sub_block_ref 38 | { 39 | // this is the filesystem block 40 | boost::uint32_t block; 41 | // this is the sub-block within the 42 | // filesystem block 43 | boost::uint16_t subblock; 44 | // this is the size expressed in how many 45 | // times to shift 1024 to the left. 46 | boost::uint8_t subblock_size; 47 | 48 | // unused 49 | boost::uint8_t padding; 50 | 51 | boost::uint64_t device_offset(boost::uint32_t block_size) const 52 | { return boost::uint64_t(block_size) * block + subblock * (1024 << subblock_size); } 53 | 54 | int node_size() const { return 1024 << subblock_size; } 55 | 56 | bool operator!=(sub_block_ref rhs) const 57 | { 58 | return block != rhs.block || subblock != rhs.subblock; 59 | } 60 | 61 | bool operator==(sub_block_ref rhs) const 62 | { 63 | return block == rhs.block && subblock == rhs.subblock; 64 | } 65 | 66 | bool operator<(sub_block_ref rhs) const 67 | { 68 | if (block < rhs.block) return true; 69 | if (block > rhs.block) return false; 70 | return subblock < rhs.subblock; 71 | } 72 | 73 | const static sub_block_ref invalid; 74 | }; 75 | 76 | struct pool_allocator 77 | { 78 | pool_allocator(); 79 | void init(int nodes_per_block, int size); 80 | void mark_in_use(sub_block_ref blk); 81 | sub_block_ref allocate_node(block_allocator& alloc); 82 | bool free_node(sub_block_ref, block_allocator& alloc); 83 | std::pair usage() const; 84 | private: 85 | mutable mutex m_mutex; 86 | int m_nodes_per_block; 87 | int m_node_size; 88 | 89 | // the number of nodes that are in use 90 | int m_in_use; 91 | std::set m_free_nodes; 92 | }; 93 | 94 | #endif 95 | 96 | 
-------------------------------------------------------------------------------- /test/Jamfile: -------------------------------------------------------------------------------- 1 | use-project /torrentfs : .. ; 2 | 3 | exe test_block_allocator : test_block_allocator.cpp : /torrentfs//torrentfs/static ; 4 | 5 | -------------------------------------------------------------------------------- /test/render.gnuplot: -------------------------------------------------------------------------------- 1 | set term png size 800,400 giant 2 | set output "access.png" 3 | set title "block writes" 4 | set ylabel "block" 5 | set xlabel "write" 6 | set key off 7 | plot "stats.dat" using 1:3 title "disk write" with dots 8 | 9 | set terminal postscript 10 | set output "access.ps" 11 | replot 12 | 13 | set term png size 800,400 giant 14 | set output "free_blocks.png" 15 | 16 | set title "free blocks" 17 | set ylabel "free blocks" 18 | set xlabel "write" 19 | plot "stats.dat" using 1:2 title "blocks free" with steps 20 | 21 | set terminal postscript 22 | set output "free_blocks.ps" 23 | replot 24 | 25 | -------------------------------------------------------------------------------- /test/test_block_allocator.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (c) 2014, Arvid Norberg 4 | All rights reserved. 5 | 6 | btfs Source Code is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 
10 | 11 | For details, see LICENSE 12 | 13 | */ 14 | 15 | #include "block_allocator.hpp" 16 | #include 17 | 18 | using namespace libtorrent; 19 | 20 | const int num_blocks = 45000; 21 | const int num_torrents = 500; 22 | 23 | struct test_torrent 24 | { 25 | int size; 26 | std::vector blocks; 27 | }; 28 | 29 | // there are 100 torrents, each with its own 30 | // list of blocks that it has allocated 31 | test_torrent torrents[num_torrents]; 32 | 33 | int get_torrent_size() 34 | { 35 | // average torrent size is 300 MiB 36 | // the return value is the size in 4 MiB blocks 37 | // 300 MiB = 75 blocks. 38 | // randomize the torrent sizes +- 200 MiB / 50 blocks 39 | 40 | int size1 = (rand() % 100) + 75; 41 | 42 | // create a slight bias towards the center (75 blocks) 43 | int size2 = (rand() % 100) + 75; 44 | 45 | return size1 * 2 / 3 + size2 / 3; 46 | } 47 | 48 | int main() 49 | { 50 | int ret = 0; 51 | 52 | // the random numbers should be predictable 53 | srand(0x1337); 54 | 55 | for (int i = 0; i < num_torrents; ++i) 56 | torrents[i].size = get_torrent_size(); 57 | 58 | block_allocator blk; 59 | bitfield bf; 60 | bf.resize(num_blocks, false); 61 | bf.set_bit(0); 62 | blk.init(num_blocks, bf); 63 | 64 | int last_write = 0; 65 | 66 | FILE* f = fopen("stats.dat", "w+"); 67 | fprintf(f, "#%-14s %-15s %-15s\n", "index", "free blocks", "write block"); 68 | 69 | for (int loops = 0; loops < num_blocks * 4; ++loops) 70 | { 71 | int t = rand() % num_torrents; 72 | test_torrent& tor = torrents[t]; 73 | boost::uint32_t b = blk.allocate_block(false); 74 | if (b == 0) 75 | { 76 | fprintf(stderr, "allocation failed\n"); 77 | return 1; 78 | } 79 | tor.blocks.push_back(b); 80 | fprintf(f, "%-15d %-15d %-15d\n", loops, blk.num_free(), b); 81 | last_write = b; 82 | if (tor.blocks.size() >= tor.size) 83 | { 84 | // this torrent is complete, delete all its 85 | // blocks and replace it with a new torrent 86 | blk.free_blocks(&tor.blocks[0], tor.blocks.size()); 87 | tor.blocks.clear(); 88 
| tor.size = get_torrent_size(); 89 | } 90 | } 91 | 92 | fclose(f); 93 | 94 | return ret; 95 | } 96 | 97 | --------------------------------------------------------------------------------