├── .gitignore ├── LICENSE ├── README.md ├── abs └── abs.cpp ├── access_patterns └── access_patterns.cpp ├── biased_branches └── random.cpp ├── clamp └── clamp_bench.cpp ├── cmake_example ├── CMakeLists.txt ├── PowersConfig.h.in ├── local_functions │ ├── CMakeLists.txt │ ├── exponent.cpp │ └── exponent.h └── powers.cpp ├── code_scheduling ├── base │ ├── base.cpp │ └── base_random.cpp ├── fast │ ├── fast.cpp │ └── fast_random.cpp └── hint │ └── hint.cpp ├── conditions ├── bool │ ├── non_power.cpp │ ├── power2.cpp │ └── runtime_value.cpp ├── branch │ ├── false.cpp │ ├── random.cpp │ └── true.cpp ├── char │ ├── non_power.cpp │ ├── power2.cpp │ └── runtime_value.cpp ├── int │ ├── non_power.cpp │ ├── power2.cpp │ └── runtime_value.cpp └── sizes.cpp ├── dod └── dod.cpp ├── dot_product ├── base │ └── base.cpp ├── modern │ └── modern.cpp ├── modern_double │ └── modern_double.cpp └── tuned │ └── tuned.cpp ├── duplicate_removal └── duplicate_removal.cpp ├── false_sharing ├── aligned_type.cpp ├── atomic_int.cpp ├── false_sharing.cpp └── vary_thread.cpp ├── hw_barrier ├── hw_barrier.cpp └── sw_barrier.cpp ├── inc_bench ├── bad_inc.cpp └── inc_bench.cpp ├── java_sll └── LinkedList.java ├── peterson ├── peterson.cpp └── peterson_hw_barrier.cpp ├── simple_bench └── my_bench.cpp ├── sorting └── sorting.cpp ├── strength_reduction └── mod_bench.cpp ├── sum_reduction ├── generalized.cu └── sum_reduction.cu ├── task_group └── task_group.cpp ├── thread_affinity └── thread_affinity.cpp └── vector_add └── vectorAdd.cu /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 
33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. 
To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 
102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 
133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. 
You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Misc. 
Code 2 | 3 | This repository holds various examples that don't have a home (yet) 4 | 5 | ## Contact 6 | 7 | Suggestions for specific content can be sent to: CoffeeBeforeArch@gmail.com 8 | 9 | ## Environment 10 | 11 | Operating System: Ubuntu 20.04 12 | 13 | Text Editor: vim 14 | 15 | Compiler: g++-11 16 | 17 | ## Examples 18 | 19 | This repository contains the following examples: 20 | 21 | - [Access pattern benchmarks](access_patterns) 22 | - [Biased branches benchmarks](biased_branches) 23 | - [Clamp benchmarks](clamp) 24 | - [CMake Example](cmake_example) 25 | - [Code Scheduling](code_scheduling) 26 | - [Data oriented design](dod) 27 | - [Branchless programming](conditions) 28 | - [Dot product benchmarks](dot_product) 29 | - [False sharing benchmarks](false_sharing) 30 | - [Hardware memory barriers](hw_barrier) 31 | - [Increment benchmarks](inc_bench) 32 | - [Java singly linked list](java_sll) 33 | - [Simple Google Benchmark](simple_bench) 34 | - [Strength reduction benchmark](strength_reduction) 35 | - [Short string optimization](strings) 36 | - [CUDA sum reduction](sum_reduction) 37 | - [TBB task group](task_group) 38 | - [Thread affinity benchmark](thread_affinity) 39 | - [CUDA vector addition](vector_add) 40 | - [Duplicate Removal](duplicate_removal) 41 | - [Sorting Alternatives](sorting) 42 | - [Peterson's Algorithm](peterson) 43 | -------------------------------------------------------------------------------- /abs/abs.cpp: -------------------------------------------------------------------------------- 1 | // A simple benchmark for absolute value 2 | // By Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // A simple benchmark for absolute value 10 | static void abs_bench_base(benchmark::State &s) { 11 | // Number of elements 12 | auto num_elements = 1 << s.range(0); 13 | 14 | // Create our random number generators 15 | std::mt19937 rng; 16 | rng.seed(std::random_device()()); 17 | std::uniform_real_distribution 
dist(-1, 1); 18 | 19 | // Fill our input vector with random numbers drawn from dist 20 | std::vector v_in(num_elements); 21 | std::generate(begin(v_in), end(v_in), [&]() { return dist(rng); }); 22 | 23 | // Create a vector for results 24 | std::vector v_out(num_elements); 25 | 26 | // Do our absolute value with a ternary expression 27 | for (auto _ : s) { 28 | for (int i = 0; i < v_in.size(); i++) { 29 | v_out[i] = v_in[i] > 0 ? v_in[i] : - v_in[i]; 30 | } 31 | } 32 | } 33 | BENCHMARK(abs_bench_base)->DenseRange(6, 12); 34 | 35 | // Absolute value benchmark written as an explicit if/else branch 36 | static void abs_bench_branch(benchmark::State &s) { 37 | // Number of elements (2^range(0)) 38 | auto num_elements = 1 << s.range(0); 39 | 40 | // Create a randomly seeded generator and a uniform distribution between -1 and 1 41 | std::mt19937 rng; 42 | rng.seed(std::random_device()()); 43 | std::uniform_real_distribution dist(-1, 1); 44 | 45 | // Fill our input vector with random numbers drawn from dist 46 | std::vector v_in(num_elements); 47 | std::generate(begin(v_in), end(v_in), [&]() { return dist(rng); }); 48 | 49 | // Create a vector for results 50 | std::vector v_out(num_elements); 51 | 52 | // Do our absolute value with an if/else branch 53 | for (auto _ : s) { 54 | for (int i = 0; i < v_in.size(); i++) { 55 | if (v_in[i] > 0) v_out[i] = v_in[i]; 56 | else v_out[i] = - v_in[i]; 57 | } 58 | } 59 | } 60 | BENCHMARK(abs_bench_branch)->DenseRange(6, 12); 61 | 62 | 63 | // Absolute value benchmark that calls std::abs 64 | static void abs_bench_std(benchmark::State &s) { 65 | // Number of elements (2^range(0)) 66 | auto num_elements = 1 << s.range(0); 67 | 68 | // Create a randomly seeded generator and a uniform distribution between -1 and 1 69 | std::mt19937 rng; 70 | rng.seed(std::random_device()()); 71 | std::uniform_real_distribution dist(-1, 1); 72 | 73 | // Fill our input vector with random numbers drawn from dist 74 | std::vector v_in(num_elements); 75 | std::generate(begin(v_in), end(v_in), [&]() { return dist(rng); }); 76 | 77 | // Create a vector for results 78 | std::vector v_out(num_elements); 79 | 80 | // Do our absolute value with std::abs 81 | for (auto _ : s) { 82 | for (int i = 0; i < v_in.size(); i++) { 83 | v_out[i] 
= std::abs(v_in[i]); 84 | } 85 | } 86 | } 87 | BENCHMARK(abs_bench_std)->DenseRange(6, 12); 88 | 89 | 90 | BENCHMARK_MAIN(); 91 | -------------------------------------------------------------------------------- /access_patterns/access_patterns.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks of different access patterns in C++ 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | // Accesses an array sequentially in row-major fashion 13 | static void rowMajor(benchmark::State &s) { 14 | // Dimension N = 2^range(0); both vectors hold N * N elements 15 | int N = 1 << s.range(0); 16 | 17 | // Create our input indices (0, 1, 2, ... in increasing order) 18 | std::vector v_in(N * N); 19 | std::iota(begin(v_in), end(v_in), 0); 20 | 21 | // Create an output vector 22 | std::vector v_out(N * N); 23 | 24 | // Profile a simple traversal with simple additions 25 | while (s.KeepRunning()) { 26 | for (int i = 0; i < N * N; i++) { 27 | v_out[v_in[i]]++; 28 | } 29 | } 30 | } 31 | // Register the benchmark 32 | BENCHMARK(rowMajor)->DenseRange(10, 12)->Unit(benchmark::kMillisecond); 33 | 34 | // Accesses an array sequentially in reverse row-major 35 | static void reverse(benchmark::State &s) { 36 | // Dimension N = 2^range(0); both vectors hold N * N elements 37 | int N = 1 << s.range(0); 38 | 39 | // Create our input indices in decreasing order (iota followed by reverse) 40 | std::vector v_in(N * N); 41 | std::iota(begin(v_in), end(v_in), 0); 42 | std::reverse(begin(v_in), end(v_in)); 43 | 44 | // Create an output vector 45 | std::vector v_out(N * N); 46 | 47 | // Profile a simple traversal with simple additions 48 | while (s.KeepRunning()) { 49 | for (int i = 0; i < N * N; i++) { 50 | // The indices are reversed, so the output is walked from back to front 51 | v_out[v_in[i]]++; 52 | } 53 | } 54 | } 55 | // Register the benchmark 56 | BENCHMARK(reverse)->DenseRange(10, 12)->Unit(benchmark::kMillisecond); 57 | 58 | // Accesses one element from every cache line, then repeats for the next offset within the line 59 | static void cacheLine(benchmark::State &s) { 60 | // 
Input/output vector size 61 | int N = 1 << s.range(0); 62 | 63 | // Number of ints that fit in a 64-byte cache line (used as the access stride) 64 | const int stride = 64 / sizeof(int); 65 | 66 | // Create our input indices 67 | std::vector v_in(N * N); 68 | 69 | // For each element offset within a cache line 70 | int index = 0; 71 | for (int i = 0; i < stride; i++) { 72 | // For each cache line in the array, pick the element at that offset 73 | for (int j = 0; j < (N * N / stride); j++) { 74 | v_in[index] = j * stride + i; 75 | index++; 76 | } 77 | } 78 | 79 | // Create an output vector 80 | std::vector v_out(N * N); 81 | 82 | // Profile a simple traversal with simple additions 83 | while (s.KeepRunning()) { 84 | for (int i = 0; i < N * N; i++) { 85 | v_out[v_in[i]]++; 86 | } 87 | } 88 | } 89 | // Register the benchmark 90 | BENCHMARK(cacheLine)->DenseRange(10, 12)->Unit(benchmark::kMillisecond); 91 | 92 | // Same cache-line-strided access pattern as cacheLine, but traversed in reverse order 93 | static void cacheLineReverse(benchmark::State &s) { 94 | // Dimension N = 2^range(0); both vectors hold N * N elements 95 | int N = 1 << s.range(0); 96 | 97 | // Number of ints that fit in a 64-byte cache line (used as the access stride) 98 | const int stride = 64 / sizeof(int); 99 | 100 | // Create our input indices 101 | std::vector v_in(N * N); 102 | 103 | // For each element offset within a cache line 104 | int index = 0; 105 | for (int i = 0; i < stride; i++) { 106 | // For each cache line in the array, pick the element at that offset 107 | for (int j = 0; j < (N * N / stride); j++) { 108 | v_in[index] = j * stride + i; 109 | index++; 110 | } 111 | } 112 | 113 | // Reverse the indices 114 | std::reverse(begin(v_in), end(v_in)); 115 | 116 | // Create an output vector 117 | std::vector v_out(N * N); 118 | 119 | // Profile a simple traversal with simple additions 120 | while (s.KeepRunning()) { 121 | for (int i = 0; i < N * N; i++) { 122 | v_out[v_in[i]]++; 123 | } 124 | } 125 | } 126 | // Register the benchmark 127 | BENCHMARK(cacheLineReverse)->DenseRange(10, 12)->Unit(benchmark::kMillisecond); 128 | 129 | // Accesses an array in column-major order 130 | static void columnMajor(benchmark::State &s) { 131 | // Input/output vector size 
132 | int N = 1 << s.range(0); 133 | 134 | // Create our input indices: position (i, j) holds the index of element (j, i) 135 | std::vector v_in(N * N); 136 | for (int i = 0; i < N; i++) { 137 | for (int j = 0; j < N; j++) { 138 | v_in[i * N + j] = j * N + i; 139 | } 140 | } 141 | 142 | // Create an output vector 143 | std::vector v_out(N * N); 144 | 145 | // Profile a simple traversal with simple additions 146 | while (s.KeepRunning()) { 147 | for (int i = 0; i < N * N; i++) { 148 | v_out[v_in[i]]++; 149 | } 150 | } 151 | } 152 | // Register the benchmark 153 | BENCHMARK(columnMajor)->DenseRange(10, 12)->Unit(benchmark::kMillisecond); 154 | 155 | // Accesses an array in randomized order 156 | static void random(benchmark::State &s) { 157 | // Dimension N = 2^range(0); both vectors hold N * N elements 158 | int N = 1 << s.range(0); 159 | 160 | // Create our input indices 161 | std::vector v_in(N * N); 162 | std::iota(begin(v_in), end(v_in), 0); 163 | 164 | // Now shuffle the indices using a randomly seeded generator 165 | std::random_device rng; 166 | std::mt19937 urng(rng()); 167 | std::shuffle(begin(v_in), end(v_in), urng); 168 | 169 | // Create an output vector 170 | std::vector v_out(N * N); 171 | 172 | // Profile a simple traversal with simple additions 173 | while (s.KeepRunning()) { 174 | for (int i = 0; i < N * N; i++) { 175 | v_out[v_in[i]]++; 176 | } 177 | } 178 | } 179 | // Register the benchmark 180 | BENCHMARK(random)->DenseRange(10, 12)->Unit(benchmark::kMillisecond); 181 | 182 | // Accesses an array in randomized order while pre-fetching an upcoming element 183 | static void randomPrefetch(benchmark::State &s) { 184 | // Dimension N = 2^range(0); both vectors hold N * N elements 185 | int N = 1 << s.range(0); 186 | 187 | // Create our input indices 188 | std::vector v_in(N * N); 189 | std::iota(begin(v_in), end(v_in), 0); 190 | 191 | // Now shuffle the indices using a randomly seeded generator 192 | std::random_device rng; 193 | std::mt19937 urng(rng()); 194 | std::shuffle(begin(v_in), end(v_in), urng); 195 | 196 | // Create an output vector 197 | std::vector v_out(N * N); 198 | 199 | // Profile a simple traversal with simple additions 200 | while 
(s.KeepRunning()) { 201 | for (int i = 0; i < N * N; i++) { 202 | // Pre-fetch the item needed 5 iterations from now; guarded so v_in[i + 5] is never read past the end of v_in 203 | if (i + 5 < N * N) __builtin_prefetch(&v_out[v_in[i + 5]]); 204 | v_out[v_in[i]]++; 205 | } 206 | } 207 | } 208 | // Register the benchmark 209 | BENCHMARK(randomPrefetch)->DenseRange(10, 12)->Unit(benchmark::kMillisecond); 210 | 211 | // Benchmark main functions 212 | BENCHMARK_MAIN(); 213 | -------------------------------------------------------------------------------- /biased_branches/random.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks using branches for conditionally adding a value 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | // Function for generating argument pairs: (vector size exponent, percent of true values) 11 | static void custom_args(benchmark::internal::Benchmark *b) { 12 | for (auto i : {14}) { 13 | for (auto j : {0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100}) { 14 | b = b->ArgPair(i, j); 15 | } 16 | } 17 | } 18 | 19 | // Benchmark for using branches whose taken-probability is set by range(1) 20 | static void branchBenchRandom(benchmark::State &s) { 21 | // Get the input vector size 22 | auto N = 1 << s.range(0); 23 | 24 | // Get the probability of a true value (range(1) is a percentage) 25 | double probability = s.range(1) / 100.0; 26 | 27 | // Create random number generator 28 | // Bernoulli distribution gives T/F outcomes 29 | std::random_device rd; 30 | std::mt19937 gen(rd()); 31 | std::bernoulli_distribution d(probability); 32 | 33 | // Create a vector of random booleans 34 | std::vector v_in(N); 35 | std::generate(begin(v_in), end(v_in), [&]() { return d(gen); }); 36 | 37 | // Output element 38 | int sink = 0; 39 | 40 | // Benchmark main loop: conditionally add to the sink for every true element 41 | for (auto _ : s) { 42 | for (auto b : v_in) 43 | if (b) benchmark::DoNotOptimize(sink += s.range(0)); 44 | } 45 | } 46 | BENCHMARK(branchBenchRandom)->Apply(custom_args)->Unit(benchmark::kMicrosecond); 47 | 48 | BENCHMARK_MAIN(); 49 | -------------------------------------------------------------------------------- 
/clamp/clamp_bench.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks for compiler optimizations of a clamp function 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "benchmark/benchmark.h" 9 | 10 | // Benchmark for a clamp function 11 | // std::vector + for-loop 12 | static void clamp_bench(benchmark::State &s) { 13 | // Number of elements in the vector (2^range(0)) 14 | auto N = 1 << s.range(0); 15 | 16 | // Create a randomly seeded generator and a uniform distribution over 0..1024 17 | std::mt19937 rng; 18 | rng.seed(std::random_device()()); 19 | std::uniform_int_distribution dist(0, 1024); 20 | 21 | // Create a vector of random integers and a vector for the results 22 | std::vector v_in(N); 23 | std::vector v_out(N); 24 | std::generate(begin(v_in), end(v_in), [&]() { return dist(rng); }); 25 | 26 | // Main benchmark loop: cap every element at 512 27 | for (auto _ : s) { 28 | for (int i = 0; i < N; i++) { 29 | v_out[i] = (v_in[i] > 512) ? 512 : v_in[i]; 30 | } 31 | } 32 | } 33 | BENCHMARK(clamp_bench)->DenseRange(8, 10); 34 | 35 | // Benchmark for a clamp function 36 | // Raw pointers + for-loop 37 | static void clamp_bench_raw_ptr(benchmark::State &s) { 38 | // Number of elements in each array (2^range(0)) 39 | auto N = 1 << s.range(0); 40 | 41 | // Create a randomly seeded generator and a uniform distribution over 0..1024 42 | std::mt19937 rng; 43 | rng.seed(std::random_device()()); 44 | std::uniform_int_distribution dist(0, 1024); 45 | 46 | // Allocate zero-initialized raw arrays and fill the input with random integers 47 | int *v_in = new int[N](); 48 | int *v_out = new int[N](); 49 | std::generate(v_in, v_in + N, [&]() { return dist(rng); }); 50 | 51 | // Main benchmark loop: cap every element at 512 52 | for (auto _ : s) { 53 | for (int i = 0; i < N; i++) { 54 | v_out[i] = (v_in[i] > 512) ? 
512 : v_in[i]; 55 | } 56 | } 57 | 58 | delete[] v_in; 59 | delete[] v_out; 60 | } 61 | BENCHMARK(clamp_bench_raw_ptr)->DenseRange(8, 10); 62 | 63 | // Benchmark for a clamp function 64 | // std::vector + std::transform 65 | static void clamp_bench_lambda(benchmark::State &s) { 66 | // Number of elements in the vector (2^range(0)) 67 | auto N = 1 << s.range(0); 68 | 69 | // Create a randomly seeded generator and a uniform distribution over 0..1024 70 | std::mt19937 rng; 71 | rng.seed(std::random_device()()); 72 | std::uniform_int_distribution dist(0, 1024); 73 | 74 | // Create a vector of random integers and a vector for the results 75 | std::vector v_in(N); 76 | std::vector v_out(N); 77 | std::generate(begin(v_in), end(v_in), [&]() { return dist(rng); }); 78 | 79 | // Our clamp function (caps a value at 512) 80 | auto clamp = [](int in) { return (in > 512) ? 512 : in; }; 81 | 82 | // Main benchmark loop: apply the clamp to every element via std::transform 83 | for (auto _ : s) { 84 | std::transform(begin(v_in), end(v_in), begin(v_out), clamp); 85 | } 86 | } 87 | BENCHMARK(clamp_bench_lambda)->DenseRange(8, 10); 88 | 89 | // Benchmark for a clamp function 90 | // Raw pointers + std::transform 91 | static void clamp_bench_raw_ptr_lambda(benchmark::State &s) { 92 | // Number of elements in each array (2^range(0)) 93 | auto N = 1 << s.range(0); 94 | 95 | // Create a randomly seeded generator and a uniform distribution over 0..1024 96 | std::mt19937 rng; 97 | rng.seed(std::random_device()()); 98 | std::uniform_int_distribution dist(0, 1024); 99 | 100 | // Allocate zero-initialized raw arrays and fill the input with random integers 101 | int *v_in = new int[N](); 102 | int *v_out = new int[N](); 103 | std::generate(v_in, v_in + N, [&]() { return dist(rng); }); 104 | 105 | // Our clamp function (caps a value at 512) 106 | auto clamp = [](int in) { return (in > 512) ? 
512 : in; }; 107 | 108 | // Main benchmark loop 109 | for (auto _ : s) { 110 | std::transform(v_in, v_in + N, v_out, clamp); 111 | } 112 | 113 | delete[] v_in; 114 | delete[] v_out; 115 | } 116 | BENCHMARK(clamp_bench_raw_ptr_lambda)->DenseRange(8, 10); 117 | 118 | BENCHMARK_MAIN(); 119 | -------------------------------------------------------------------------------- /cmake_example/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Set the minimum CMake version 2 | cmake_minimum_required (VERSION 3.5) 3 | # Name the project (sets PROJECT_NAME variable) 4 | project (Powers) 5 | 6 | # Add a version number 7 | set (Powers_VERSION_MAJOR 1) 8 | set (Powers_VERSION_MINOR 0) 9 | 10 | # Configure a header file to pass CMake settings to the source 11 | configure_file ( 12 | "${PROJECT_SOURCE_DIR}/PowersConfig.h.in" 13 | "${PROJECT_BINARY_DIR}/PowersConfig.h" 14 | ) 15 | 16 | # Add the binary tree to the search path for include files 17 | # This is necessary to find our generated header file 18 | include_directories("${PROJECT_BINARY_DIR}") 19 | 20 | # See if we should use our "myPow" function 21 | option (USE_MYMATH "Use our own exponent function" ON) 22 | 23 | # Act conditionally based on this option 24 | if(USE_MYMATH) 25 | # Add directories for our "myPow" prototype 26 | include_directories ("${PROJECT_SOURCE_DIR}/local_functions") 27 | # Add subdirectory so that the "myPow" function will be built 28 | add_subdirectory (local_functions) 29 | set (EXTRA_LIBS ${EXTRA_LIBS} Exponent) 30 | endif(USE_MYMATH) 31 | 32 | # Builds an executable "Powers" from source "powers.cpp" 33 | add_executable(Powers powers.cpp) 34 | # Link the libraries collected in EXTRA_LIBS (empty when USE_MYMATH is OFF) 35 | target_link_libraries(Powers ${EXTRA_LIBS}) 36 | 37 | -------------------------------------------------------------------------------- /cmake_example/PowersConfig.h.in: -------------------------------------------------------------------------------- 1 | // This file 
contains the configured options for our CMake example 2 | #define Powers_VERSION_MAJOR @Powers_VERSION_MAJOR@ 3 | #define Powers_VERSION_MINOR @Powers_VERSION_MINOR@ 4 | #cmakedefine USE_MYMATH 5 | -------------------------------------------------------------------------------- /cmake_example/local_functions/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Add the library target to be built 2 | add_library(Exponent exponent.cpp) 3 | -------------------------------------------------------------------------------- /cmake_example/local_functions/exponent.cpp: -------------------------------------------------------------------------------- 1 | // This file contains the definition of an exponent function 2 | // Returns base raised to exponent; a negative exponent yields 1 / base^|exponent| truncated toward zero (0 is returned for a base of 0) 3 | int myPow(int base, int exponent){ 4 | // A negative exponent only leaves a non-zero integer when |base| is 1 5 | if (exponent < 0){ 6 | if (base == 1) return 1; 7 | if (base == -1) return (exponent % 2 == 0) ? 1 : -1; 8 | return 0; 9 | } 10 | 11 | // Start from 1 so that an exponent of 0 returns 1 12 | int result = 1; 13 | 14 | // Accumulate exponent through multiplication 15 | for(int i = 0; i < exponent; i++){ 16 | result *= base; 17 | } 18 | 19 | return result; 20 | } 21 | -------------------------------------------------------------------------------- /cmake_example/local_functions/exponent.h: -------------------------------------------------------------------------------- 1 | // This file contains the function prototype for the myPow function 2 | // By: Nick from CoffeeBeforeArch 3 | int myPow(int base, int exponent); 4 | -------------------------------------------------------------------------------- /cmake_example/powers.cpp: -------------------------------------------------------------------------------- 1 | // This program takes the power of different numbers and is compiled 2 | // using CMake 3 | // By: Nick from CoffeeBeforeArch 4 | 5 | #include 6 | #include 7 | 8 | // Add header generated by CMake 9 | #include "PowersConfig.h" 10 | 11 | // Conditionally add our myPow function 12 | #ifdef USE_MYMATH 13 | #include "exponent.h" 14 | #endif 15 | 16 | using namespace std; 17 | 18 | int 
main(){ 19 | // Print the version numbers that we generate 20 | cout << "Major Version: " << Powers_VERSION_MAJOR << endl; 21 | cout << "Minor Version: " << Powers_VERSION_MINOR << endl; 22 | 23 | // Set up a power calculation 24 | int base = 4; 25 | int exponent = 3; 26 | 27 | // Use our myPow function when USE_MYMATH is defined, otherwise fall back to pow 28 | #ifdef USE_MYMATH 29 | int result = myPow(base, exponent); 30 | #else 31 | int result = pow(base, exponent); 32 | #endif 33 | 34 | // Print the result 35 | cout << base << "^" << exponent << " = " << result << endl; 36 | return 0; 37 | } 38 | -------------------------------------------------------------------------------- /code_scheduling/base/base.cpp: -------------------------------------------------------------------------------- 1 | // This program shows off a neat optimization for a faster 2 | // modulo operation in C++ 3 | // By: Nick from CoffeeBeforeArch 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | // Function for generating argument pairs: (number of elements, mod ceiling) 10 | static void custom_args(benchmark::internal::Benchmark *b) { 11 | for (int i = 1 << 4; i <= 1 << 10; i <<= 2) { 12 | // Collect stats with the ceiling at 1/8, 1/2, and 7/8 of the 0-255 input range 13 | for (int j : {32, 128, 224}) { 14 | b = b->ArgPair(i, j); 15 | } 16 | } 17 | } 18 | 19 | // Baseline benchmark: always performs the modulo 20 | static void baseMod(benchmark::State &s) { 21 | // Number of elements in the vectors 22 | int N = s.range(0); 23 | 24 | // Max for mod operator 25 | int ceil = s.range(1); 26 | 27 | // Vector for input and output of modulo 28 | std::vector input; 29 | std::vector output; 30 | input.resize(N); 31 | output.resize(N); 32 | 33 | // Generate random inputs (uniform random dist. 
between 0 & 255) 34 | std::mt19937 rng; 35 | rng.seed(std::random_device()()); 36 | std::uniform_int_distribution dist(0, 255); 37 | for (int &i : input) { 38 | i = dist(rng); 39 | } 40 | 41 | // Main benchmark loop 42 | while (s.KeepRunning()) { 43 | // Compute the modulo for each element 44 | for (int i = 0; i < N; i++) { 45 | output[i] = input[i] % ceil; 46 | } 47 | } 48 | } 49 | // Register the benchmark 50 | BENCHMARK(baseMod)->Apply(custom_args); 51 | 52 | // Benchmark main function 53 | BENCHMARK_MAIN(); 54 | -------------------------------------------------------------------------------- /code_scheduling/base/base_random.cpp: -------------------------------------------------------------------------------- 1 | // This program shows off a neat optimization for a faster 2 | // modulo operation in C++ 3 | // By: Nick from CoffeeBeforeArch 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | // Function for generating argument pairs: (number of elements, mod ceiling) 10 | static void custom_args(benchmark::internal::Benchmark *b) { 11 | for (int i = 1 << 4; i <= 1 << 10; i <<= 2) { 12 | // Collect stats with the ceiling at 1/8, 1/2, and 7/8 of the 0-255 input range 13 | for (int j : {32, 128, 224}) { 14 | b = b->ArgPair(i, j); 15 | } 16 | } 17 | } 18 | 19 | // Baseline benchmark with the inputs regenerated on every iteration 20 | static void baseModRandom(benchmark::State &s) { 21 | // Number of elements in the vectors 22 | int N = s.range(0); 23 | 24 | // Max for mod operator 25 | int ceil = s.range(1); 26 | 27 | // Vector for input and output of modulo 28 | std::vector input; 29 | std::vector output; 30 | input.resize(N); 31 | output.resize(N); 32 | 33 | // Generate random inputs (uniform random dist. 
between 0 & 255) 34 | std::mt19937 rng; 35 | rng.seed(std::random_device()()); 36 | std::uniform_int_distribution dist(0, 255); 37 | for (int &i : input) { 38 | i = dist(rng); 39 | } 40 | 41 | // Main benchmark loop (regenerating the inputs happens while timing is paused) 42 | while (s.KeepRunning()) { 43 | s.PauseTiming(); 44 | for (int &i : input) { 45 | i = dist(rng); 46 | } 47 | s.ResumeTiming(); 48 | 49 | // Compute the modulo for each element 50 | for (int i = 0; i < N; i++) { 51 | output[i] = input[i] % ceil; 52 | } 53 | } 54 | } 55 | // Register the benchmark 56 | BENCHMARK(baseModRandom)->Apply(custom_args); 57 | 58 | // Benchmark main function 59 | BENCHMARK_MAIN(); 60 | -------------------------------------------------------------------------------- /code_scheduling/fast/fast.cpp: -------------------------------------------------------------------------------- 1 | // This program shows off a neat optimization for a faster 2 | // modulo operation in C++ 3 | // By: Nick from CoffeeBeforeArch 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | // Function for generating argument pairs: (number of elements, mod ceiling) 10 | static void custom_args(benchmark::internal::Benchmark *b) { 11 | for (int i = 1 << 4; i <= 1 << 10; i <<= 2) { 12 | // Collect stats with the ceiling at 1/8, 1/2, and 7/8 of the 0-255 input range 13 | for (int j : {32, 128, 224}) { 14 | b = b->ArgPair(i, j); 15 | } 16 | } 17 | } 18 | 19 | // Our fast modulo operation: only performs the modulo when the input reaches the ceiling 20 | static void fastMod(benchmark::State &s) { 21 | // Number of elements 22 | int N = s.range(0); 23 | 24 | // Max for mod operator 25 | int ceil = s.range(1); 26 | 27 | // Vector for input and output 28 | std::vector input; 29 | std::vector output; 30 | input.resize(N); 31 | output.resize(N); 32 | 33 | // Generate random inputs (uniform random dist. between 0 & 255) 34 | std::mt19937 rng; 35 | rng.seed(std::random_device()()); 36 | std::uniform_int_distribution dist(0, 255); 37 | for (int &i : input) { 38 | i = dist(rng); 39 | } 40 | 41 | for (auto _ : s) { 42 | // DON'T compute the mod for each element 43 | // Skip the expensive operation using a simple compare 44 | for (int i = 0; i < 
N; i++) { 45 | output[i] = (input[i] >= ceil) ? input[i] % ceil : input[i]; 46 | } 47 | } 48 | } 49 | // Register the benchmark 50 | BENCHMARK(fastMod)->Apply(custom_args); 51 | 52 | // Benchmark main function 53 | BENCHMARK_MAIN(); 54 | -------------------------------------------------------------------------------- /code_scheduling/fast/fast_random.cpp: -------------------------------------------------------------------------------- 1 | // This program shows off a neat optimization for a faster 2 | // modulo operation in C++ 3 | // By: Nick from CoffeeBeforeArch 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | // Function for generating argument pairs: (number of elements, mod ceiling) 10 | static void custom_args(benchmark::internal::Benchmark *b) { 11 | for (int i = 1 << 4; i <= 1 << 10; i <<= 2) { 12 | // Collect stats with the ceiling at 1/8, 1/2, and 7/8 of the 0-255 input range 13 | for (int j : {32, 128, 224}) { 14 | b = b->ArgPair(i, j); 15 | } 16 | } 17 | } 18 | 19 | // Our fast modulo operation with the inputs regenerated on every iteration 20 | static void fastMod(benchmark::State &s) { 21 | // Number of elements 22 | int N = s.range(0); 23 | 24 | // Max for mod operator 25 | int ceil = s.range(1); 26 | 27 | // Vector for input and output 28 | std::vector input; 29 | std::vector output; 30 | input.resize(N); 31 | output.resize(N); 32 | 33 | // Set up the random input generator (the inputs are drawn inside the benchmark loop) 34 | std::mt19937 rng; 35 | rng.seed(std::random_device()()); 36 | std::uniform_int_distribution dist(0, 255); 37 | 38 | for (auto _ : s) { 39 | // Generate random numbers but don't profile it 40 | s.PauseTiming(); 41 | for (int &i : input) { 42 | i = dist(rng); 43 | } 44 | s.ResumeTiming(); 45 | 46 | // DON'T compute the mod for each element 47 | // Skip the expensive operation using a simple compare 48 | for (int i = 0; i < N; i++) { 49 | output[i] = (input[i] >= ceil) ? 
input[i] % ceil : input[i]; 50 | } 51 | } 52 | } 53 | // Register the benchmark 54 | BENCHMARK(fastMod)->Apply(custom_args); 55 | 56 | // Benchmark main function 57 | BENCHMARK_MAIN(); 58 | -------------------------------------------------------------------------------- /code_scheduling/hint/hint.cpp: -------------------------------------------------------------------------------- 1 | // This program shows off a neat optimization for a faster 2 | // modulo operation in C++ 3 | // By: Nick from CoffeeBeforeArch 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | // Function for generating argument pairs (number of elements, mod ceiling) 10 | static void custom_args(benchmark::internal::Benchmark *b) { 11 | for (int i = 1 << 4; i <= 1 << 10; i <<= 2) { 12 | // Collect stats with the ceiling at 1/8, 1/2, and 7/8 of the 0-255 input range 13 | for (int j : {32, 128, 224}) { 14 | b = b->ArgPair(i, j); 15 | } 16 | } 17 | } 18 | 19 | // Fast modulo operation with a branch hint for the compiler 20 | static void fastModHint(benchmark::State &s) { 21 | // Number of elements 22 | int N = s.range(0); 23 | 24 | // Max for mod operator 25 | int ceil = s.range(1); 26 | 27 | // Vector for input and output of modulo 28 | std::vector input; 29 | std::vector output; 30 | input.resize(N); 31 | output.resize(N); 32 | 33 | // Generate random inputs (values between 0 and 255) 34 | std::mt19937 rng; 35 | rng.seed(std::random_device()()); 36 | std::uniform_int_distribution dist(0, 255); 37 | for (int &i : input) { 38 | i = dist(rng); 39 | } 40 | 41 | for (auto _ : s) { 42 | // DON'T compute the mod for each element 43 | // Skip the expensive operation using a simple compare 44 | for (int i = 0; i < N; i++) { 45 | // Hint to the compiler that we usually skip the mod 46 | output[i] = 47 | __builtin_expect(input[i] >= ceil, 0) ?
input[i] % ceil : input[i]; 48 | } 49 | } 50 | } 51 | // Register the benchmark 52 | BENCHMARK(fastModHint)->Apply(custom_args); 53 | 54 | // Benchmark main function 55 | BENCHMARK_MAIN(); 56 | -------------------------------------------------------------------------------- /conditions/bool/non_power.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks for using booleans in arithmetic 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // Benchmark for integrating boolean into multiply and add 10 | // Uses constant that is not a power of 2 11 | static void boolBenchNonPower(benchmark::State &s) { 12 | // Get the input vector size 13 | auto N = 1 << s.range(0); 14 | 15 | // Create random number generator 16 | std::random_device rd; 17 | std::mt19937 gen(rd()); 18 | std::bernoulli_distribution d(0.5); 19 | 20 | // Create a vector of random booleans 21 | std::vector v_in(N); 22 | std::generate(begin(v_in), end(v_in), [&]() { return d(gen); }); 23 | 24 | // Output element 25 | // Dynamically allocated int isn't optimized away 26 | int *sink = new int; 27 | *sink = 0; 28 | 29 | // Benchmark main loop 30 | for (auto _ : s) { 31 | for (auto b : v_in) *sink += 41 * b; 32 | } 33 | 34 | // Free our memory 35 | delete sink; 36 | } 37 | BENCHMARK(boolBenchNonPower)->DenseRange(12, 14)->Unit(benchmark::kMicrosecond); 38 | 39 | BENCHMARK_MAIN(); 40 | -------------------------------------------------------------------------------- /conditions/bool/power2.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks for using booleans in arithmetic 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // Benchmark for integrating boolean into multiply and add 10 | // Uses a constant that is a power of 2 11 | static void boolBenchPower(benchmark::State &s) { 12 | // Get the input vector size 13 
| auto N = 1 << s.range(0); 14 | 15 | // Create random number generator 16 | std::random_device rd; 17 | std::mt19937 gen(rd()); 18 | std::bernoulli_distribution d(0.5); 19 | 20 | // Create a vector of random booleans 21 | std::vector v_in(N); 22 | std::generate(begin(v_in), end(v_in), [&]() { return d(gen); }); 23 | 24 | // Output element 25 | // Dynamically allocated int isn't optimized away 26 | int *sink = new int; 27 | *sink = 0; 28 | 29 | // Benchmark main loop 30 | for (auto _ : s) { 31 | for (auto b : v_in) *sink += 32 * b; 32 | } 33 | 34 | // Free our memory 35 | delete sink; 36 | } 37 | BENCHMARK(boolBenchPower)->DenseRange(12, 14)->Unit(benchmark::kMicrosecond); 38 | 39 | BENCHMARK_MAIN(); 40 | -------------------------------------------------------------------------------- /conditions/bool/runtime_value.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks for using booleans in arithmetic 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // Benchmark for integrating boolean into multiply and add 10 | // Uses a value known only at runtime 11 | static void boolBenchInput(benchmark::State &s) { 12 | // Get the input vector size 13 | auto N = 1 << s.range(0); 14 | 15 | // Create random number generator 16 | std::random_device rd; 17 | std::mt19937 gen(rd()); 18 | std::bernoulli_distribution d(0.5); 19 | 20 | // Create a vector of random booleans 21 | std::vector v_in(N); 22 | std::generate(begin(v_in), end(v_in), [&]() { return d(gen); }); 23 | 24 | // Output element 25 | // Dynamically allocated int isn't optimized away 26 | int *sink = new int; 27 | *sink = 0; 28 | 29 | // Benchmark main loop 30 | for (auto _ : s) { 31 | for (auto b : v_in) *sink += s.range(0) * b; 32 | } 33 | 34 | // Free our memory 35 | delete sink; 36 | } 37 | BENCHMARK(boolBenchInput)->DenseRange(12, 14)->Unit(benchmark::kMicrosecond); 38 | 39 | BENCHMARK_MAIN(); 40 | 
-------------------------------------------------------------------------------- /conditions/branch/false.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks using branches for conditionally adding a value 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | // Benchmark for using branches when the condition is always false 9 | static void branchBenchFalse(benchmark::State &s) { 10 | // Get the input vector size 11 | auto N = 1 << s.range(0); 12 | 13 | // Create a vector where every element is false 14 | std::vector v_in(N); 15 | std::generate(begin(v_in), end(v_in), []() { return false; }); 16 | 17 | // Output element 18 | // Dynamically allocated int isn't optimized away 19 | int *sink = new int; 20 | *sink = 0; 21 | 22 | // Benchmark main loop 23 | for (auto _ : s) { 24 | for (auto b : v_in) 25 | if (b) *sink += 41; 26 | } 27 | 28 | // Free our memory 29 | delete sink; 30 | } 31 | BENCHMARK(branchBenchFalse)->DenseRange(12, 14)->Unit(benchmark::kMicrosecond); 32 | 33 | BENCHMARK_MAIN(); 34 | -------------------------------------------------------------------------------- /conditions/branch/random.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks using branches for conditionally adding a value 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // Benchmark for using branches when the condition is random (true with probability 0.5) 10 | static void branchBenchRandom(benchmark::State &s) { 11 | // Get the input vector size 12 | auto N = 1 << s.range(0); 13 | 14 | // Create random number generator 15 | std::random_device rd; 16 | std::mt19937 gen(rd()); 17 | std::bernoulli_distribution d(0.5); 18 | 19 | // Create a vector of random booleans 20 | std::vector v_in(N); 21 | std::generate(begin(v_in), end(v_in), [&]() { return d(gen); }); 22 | 23 | // Output element 24 | // Dynamically allocated int isn't optimized away 25 | int *sink = new int; 26 | *sink = 0; 27 | 28 | //
Benchmark main loop 29 | for (auto _ : s) { 30 | for (auto b : v_in) 31 | if (b) *sink += 41; 32 | } 33 | 34 | // Free our memory 35 | delete sink; 36 | } 37 | BENCHMARK(branchBenchRandom)->DenseRange(12, 14)->Unit(benchmark::kMicrosecond); 38 | 39 | BENCHMARK_MAIN(); 40 | -------------------------------------------------------------------------------- /conditions/branch/true.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks using branches for conditionally adding a value 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | // Benchmark for using branches when the condition is always true 9 | static void branchBenchTrue(benchmark::State &s) { 10 | // Get the input vector size 11 | auto N = 1 << s.range(0); 12 | 13 | // Create a vector where every element is true 14 | std::vector v_in(N); 15 | std::generate(begin(v_in), end(v_in), []() { return true; }); 16 | 17 | // Output element 18 | // Dynamically allocated int isn't optimized away 19 | int *sink = new int; 20 | *sink = 0; 21 | 22 | // Benchmark main loop 23 | for (auto _ : s) { 24 | for (auto b : v_in) 25 | if (b) *sink += 41; 26 | } 27 | 28 | // Free our memory 29 | delete sink; 30 | } 31 | BENCHMARK(branchBenchTrue)->DenseRange(12, 14)->Unit(benchmark::kMicrosecond); 32 | 33 | BENCHMARK_MAIN(); 34 | -------------------------------------------------------------------------------- /conditions/char/non_power.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks for using chars to store boolean values 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // Benchmark for integrating character into multiply and add 10 | // Uses a constant that is not a power of two 11 | static void charBenchNonPower(benchmark::State &s) { 12 | // Get the input vector size 13 | auto N = 1 << s.range(0); 14 | 15 | // Create random number generator 16 | std::random_device rd; 17 |
std::mt19937 gen(rd()); 18 | std::bernoulli_distribution d(0.5); 19 | 20 | // Create a vector of random booleans 21 | std::vector v_in(N); 22 | std::generate(begin(v_in), end(v_in), [&]() { return d(gen); }); 23 | 24 | // Output element 25 | // Dynamically allocated int isn't optimized away 26 | int *sink = new int; 27 | *sink = 0; 28 | 29 | // Benchmark main loop 30 | for (auto _ : s) { 31 | for (auto b : v_in) *sink += 41 * b; 32 | } 33 | 34 | // Free our memory 35 | delete sink; 36 | } 37 | BENCHMARK(charBenchNonPower)->DenseRange(12, 14)->Unit(benchmark::kMicrosecond); 38 | 39 | BENCHMARK_MAIN(); 40 | -------------------------------------------------------------------------------- /conditions/char/power2.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks for using chars to store boolean values 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // Benchmark for integrating character into multiply and add 10 | // Uses a constant that is a power of two 11 | static void charBenchPower(benchmark::State &s) { 12 | // Get the input vector size 13 | auto N = 1 << s.range(0); 14 | 15 | // Create random number generator 16 | std::random_device rd; 17 | std::mt19937 gen(rd()); 18 | std::bernoulli_distribution d(0.5); 19 | 20 | // Create a vector of random booleans 21 | std::vector v_in(N); 22 | std::generate(begin(v_in), end(v_in), [&]() { return d(gen); }); 23 | 24 | // Output element 25 | // Dynamically allocated int isn't optimized away 26 | int *sink = new int; 27 | *sink = 0; 28 | 29 | // Benchmark main loop 30 | for (auto _ : s) { 31 | for (auto b : v_in) *sink += 32 * b; 32 | } 33 | 34 | // Free our memory 35 | delete sink; 36 | } 37 | BENCHMARK(charBenchPower)->DenseRange(12, 14)->Unit(benchmark::kMicrosecond); 38 | 39 | BENCHMARK_MAIN(); 40 | -------------------------------------------------------------------------------- /conditions/char/runtime_value.cpp: 
-------------------------------------------------------------------------------- 1 | // Benchmarks for using chars to store boolean values 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // Benchmark for integrating character into multiply and add 10 | // Uses value known only at runtime 11 | static void charBenchInput(benchmark::State &s) { 12 | // Get the input vector size 13 | auto N = 1 << s.range(0); 14 | 15 | // Create random number generator 16 | std::random_device rd; 17 | std::mt19937 gen(rd()); 18 | std::bernoulli_distribution d(0.5); 19 | 20 | // Create a vector of random booleans 21 | std::vector v_in(N); 22 | std::generate(begin(v_in), end(v_in), [&]() { return d(gen); }); 23 | 24 | // Output element 25 | // Dynamically allocated int isn't optimized away 26 | int *sink = new int; 27 | *sink = 0; 28 | 29 | // Benchmark main loop 30 | for (auto _ : s) { 31 | for (auto b : v_in) *sink += s.range(0) * b; 32 | } 33 | 34 | // Free our memory 35 | delete sink; 36 | } 37 | BENCHMARK(charBenchInput)->DenseRange(12, 14)->Unit(benchmark::kMicrosecond); 38 | 39 | BENCHMARK_MAIN(); 40 | -------------------------------------------------------------------------------- /conditions/int/non_power.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks for using integers to store boolean values 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // Benchmark for integrating integer into multiply and add 10 | // Uses a constant that is not a power of two 11 | static void intBenchNonPower(benchmark::State &s) { 12 | // Get the input vector size 13 | auto N = 1 << s.range(0); 14 | 15 | // Create random number generator 16 | std::random_device rd; 17 | std::mt19937 gen(rd()); 18 | std::bernoulli_distribution d(0.5); 19 | 20 | // Create a vector of random booleans 21 | std::vector v_in(N); 22 | std::generate(begin(v_in), 
end(v_in), [&]() { return d(gen); }); 23 | 24 | // Output element 25 | // Dynamically allocated int isn't optimized away 26 | int *sink = new int; 27 | *sink = 0; 28 | 29 | // Benchmark main loop 30 | for (auto _ : s) { 31 | for (auto b : v_in) *sink += 41 * b; 32 | } 33 | 34 | // Free our memory 35 | delete sink; 36 | } 37 | BENCHMARK(intBenchNonPower)->DenseRange(12, 14)->Unit(benchmark::kMicrosecond); 38 | 39 | BENCHMARK_MAIN(); 40 | -------------------------------------------------------------------------------- /conditions/int/power2.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks for using integers to store boolean values 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // Benchmark for integrating integer into multiply and add 10 | // Uses a constant that is a power of two 11 | static void intBenchPower(benchmark::State &s) { 12 | // Get the input vector size 13 | auto N = 1 << s.range(0); 14 | 15 | // Create random number generator 16 | std::random_device rd; 17 | std::mt19937 gen(rd()); 18 | std::bernoulli_distribution d(0.5); 19 | 20 | // Create a vector of random booleans 21 | std::vector v_in(N); 22 | std::generate(begin(v_in), end(v_in), [&]() { return d(gen); }); 23 | 24 | // Output element 25 | // Dynamically allocated int isn't optimized away 26 | int *sink = new int; 27 | *sink = 0; 28 | 29 | // Benchmark main loop 30 | for (auto _ : s) { 31 | for (auto b : v_in) *sink += 32 * b; 32 | } 33 | 34 | // Free our memory 35 | delete sink; 36 | } 37 | BENCHMARK(intBenchPower)->DenseRange(12, 14)->Unit(benchmark::kMicrosecond); 38 | 39 | BENCHMARK_MAIN(); 40 | -------------------------------------------------------------------------------- /conditions/int/runtime_value.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks for using integers to store boolean values 2 | // By: Nick from 
CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // Benchmark for integrating integer into multiply and add 10 | // Uses a value that is only known at runtime (the benchmark argument s.range(0)) 11 | static void intBenchInput(benchmark::State &s) { 12 | // Get the input vector size 13 | auto N = 1 << s.range(0); 14 | 15 | // Create random number generator 16 | std::random_device rd; 17 | std::mt19937 gen(rd()); 18 | std::bernoulli_distribution d(0.5); 19 | 20 | // Create a vector of random booleans 21 | std::vector v_in(N); 22 | std::generate(begin(v_in), end(v_in), [&]() { return d(gen); }); 23 | 24 | // Output element 25 | // Dynamically allocated int isn't optimized away 26 | int *sink = new int; 27 | *sink = 0; 28 | 29 | // Benchmark main loop 30 | for (auto _ : s) { 31 | for (auto b : v_in) *sink += s.range(0) * b; 32 | } 33 | 34 | // Free our memory 35 | delete sink; 36 | } 37 | BENCHMARK(intBenchInput)->DenseRange(12, 14)->Unit(benchmark::kMicrosecond); 38 | 39 | BENCHMARK_MAIN(); 40 | -------------------------------------------------------------------------------- /conditions/sizes.cpp: -------------------------------------------------------------------------------- 1 | // Short program for printing out sizing information 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | // Replacement global operator new that logs the size of each allocation. NOTE(review): the malloc result is returned unchecked 9 | void* operator new(size_t N) { 10 | std::cout << "Allocating " << N << " bytes of memory!\n"; 11 | return malloc(N); 12 | } 13 | 14 | int main() { 15 | // Measure for 1 << 12 elements, the smallest size the benchmarks run (DenseRange(12, 14)) 16 | const size_t N = 1 << 12; 17 | 18 | // Create the vectors used in the benchmarks 19 | std::vector v_bool(N); 20 | std::vector v_char(N); 21 | std::vector v_int(N); 22 | 23 | return 0; 24 | } 25 | -------------------------------------------------------------------------------- /dod/dod.cpp: -------------------------------------------------------------------------------- 1 | // This program shows off the basics of
data-oriented-design in C++ 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using std::back_inserter; 10 | using std::fill_n; 11 | using std::vector; 12 | 13 | // A simple struct of 16 ints (64 bytes when int is 4 bytes, matching a 64-byte 14 | // cache line). NOTE(review): no alignas is used, so 64-byte alignment is not enforced 15 | struct SimpleStruct { 16 | // 16 integer fields 17 | int v0 = 0; 18 | int v1 = 0; 19 | int v2 = 0; 20 | int v3 = 0; 21 | int v4 = 0; 22 | int v5 = 0; 23 | int v6 = 0; 24 | int v7 = 0; 25 | int v8 = 0; 26 | int v9 = 0; 27 | int v10 = 0; 28 | int v11 = 0; 29 | int v12 = 0; 30 | int v13 = 0; 31 | int v14 = 0; 32 | int v15 = 0; 33 | 34 | // Method to increment the field (only 1 here for brevity) 35 | void inc_v0() { v0++; } 36 | }; 37 | 38 | // A simple struct that contains one array per field stored in the 39 | // other object 40 | struct SoA { 41 | // Simple constructor that resizes each vector to store N values 42 | SoA(int N) { 43 | // Zero-initialized by default 44 | v0s.resize(N); 45 | v1s.resize(N); 46 | v2s.resize(N); 47 | v3s.resize(N); 48 | v4s.resize(N); 49 | v5s.resize(N); 50 | v6s.resize(N); 51 | v7s.resize(N); 52 | v8s.resize(N); 53 | v9s.resize(N); 54 | v10s.resize(N); 55 | v11s.resize(N); 56 | v12s.resize(N); 57 | v13s.resize(N); 58 | v14s.resize(N); 59 | v15s.resize(N); 60 | } 61 | 62 | // Update method that increments each value 63 | // Only for v0 for the sake of brevity 64 | void update_v0() { 65 | for (auto &i : v0s) { 66 | i++; 67 | } 68 | } 69 | 70 | // One vector of values per field 71 | vector v0s; 72 | vector v1s; 73 | vector v2s; 74 | vector v3s; 75 | vector v4s; 76 | vector v5s; 77 | vector v6s; 78 | vector v7s; 79 | vector v8s; 80 | vector v9s; 81 | vector v10s; 82 | vector v11s; 83 | vector v12s; 84 | vector v13s; 85 | vector v14s; 86 | vector v15s; 87 | }; 88 | 89 | // Benchmark for classic OO approach 90 | static void ArrayOfStructs_Bench(benchmark::State &s) { 91 | // Extract the number
of objects we want 92 | int N = 1 << s.range(0); 93 | 94 | // Create a vector of N SimpleStruct objects 95 | vector v; 96 | fill_n(back_inserter(v), N, SimpleStruct()); 97 | 98 | // Profile the update of the v0 field 99 | while (s.KeepRunning()) { 100 | // Increment the field for each struct 101 | for (auto &i : v) { 102 | i.inc_v0(); 103 | } 104 | } 105 | } 106 | // Register the AoS benchmark 107 | BENCHMARK(ArrayOfStructs_Bench)->DenseRange(8, 16); 108 | 109 | // Benchmark for DoD approach 110 | static void StructOfArrays_Bench(benchmark::State &s) { 111 | // Extract the number of objects we want 112 | int N = 1 << s.range(0); 113 | 114 | // Create a Struct of Arrays 115 | SoA struct_of_arrays(N); 116 | 117 | // Profile the update of the v0 array 118 | while (s.KeepRunning()) { 119 | struct_of_arrays.update_v0(); 120 | } 121 | } 122 | // Register the SoA benchmark 123 | BENCHMARK(StructOfArrays_Bench)->DenseRange(8, 16); 124 | 125 | // Main function for the benchmarks 126 | BENCHMARK_MAIN(); 127 | -------------------------------------------------------------------------------- /dot_product/base/base.cpp: -------------------------------------------------------------------------------- 1 | // This program implements a baseline dot product in C++ 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | // Classic C-style dot product 11 | float dot_product(std::vector &__restrict v1, 12 | std::vector &__restrict v2) { 13 | float tmp = 0.0f; 14 | for (size_t i = 0; i < v1.size(); i++) { 15 | tmp += v1[i] * v2[i]; 16 | } 17 | return tmp; 18 | } 19 | 20 | // Benchmark the baseline C-style dot product 21 | static void baseDP(benchmark::State &s) { 22 | // Get the size of the vector 23 | size_t N = 1 << s.range(0); 24 | 25 | // Initialize the vectors (each is filled with N copies of a single random value) 26 | std::vector v1; 27 | std::fill_n(std::back_inserter(v1), N, rand() % 100); 28 | std::vector v2; 29 | std::fill_n(std::back_inserter(v2), N, rand() % 100); 30 | 31 | // Keep the
result from being optimized away 32 | volatile float result = 0.0f; 33 | 34 | // Our benchmark loop 35 | while (s.KeepRunning()) { 36 | result = dot_product(v1, v2); 37 | } 38 | } 39 | BENCHMARK(baseDP)->DenseRange(8, 10); 40 | 41 | // Our benchmark main function 42 | BENCHMARK_MAIN(); 43 | -------------------------------------------------------------------------------- /dot_product/modern/modern.cpp: -------------------------------------------------------------------------------- 1 | // This program implements a modern C++ dot product 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | // Modern C++ dot product 12 | float dot_product(std::vector &__restrict v1, 13 | std::vector &__restrict v2) { 14 | return std::transform_reduce(std::execution::unseq, begin(v1), end(v1), 15 | begin(v2), 0.0f); 16 | } 17 | 18 | // Benchmark for a modern C++ dot product 19 | static void modernDP(benchmark::State &s) { 20 | // Get the size of the vector 21 | size_t N = 1 << s.range(0); 22 | 23 | // Initialize the vectors 24 | std::vector v1; 25 | std::fill_n(std::back_inserter(v1), N, rand() % 100); 26 | std::vector v2; 27 | std::fill_n(std::back_inserter(v2), N, rand() % 100); 28 | 29 | // Keep our result from being optimized away 30 | volatile float result = 0; 31 | 32 | // Our benchmark loop 33 | while (s.KeepRunning()) { 34 | result = dot_product(v1, v2); 35 | } 36 | } 37 | BENCHMARK(modernDP)->DenseRange(8, 10); 38 | 39 | // Our benchmark main function 40 | BENCHMARK_MAIN(); 41 | -------------------------------------------------------------------------------- /dot_product/modern_double/modern_double.cpp: -------------------------------------------------------------------------------- 1 | // This program implements two dot product implementations in C++ 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | // Modern C++ dot product 11 | float 
dot_product(std::vector &__restrict v1, 12 | std::vector &__restrict v2) { 13 | return std::transform_reduce(std::execution::unseq, begin(v1), end(v1), 14 | begin(v2), 0.0); 15 | } 16 | 17 | // Benchmark the modern C++ dot product 18 | static void modernDP_double(benchmark::State &s) { 19 | // Get the size of the vector 20 | size_t N = 1 << s.range(0); 21 | 22 | // Initialize the vectors 23 | std::vector v1; 24 | std::fill_n(std::back_inserter(v1), N, rand() % 100); 25 | std::vector v2; 26 | std::fill_n(std::back_inserter(v2), N, rand() % 100); 27 | 28 | // Keep our result from being optimized away 29 | volatile float result = 0; 30 | 31 | // Our benchmark loop 32 | while (s.KeepRunning()) { 33 | result = dot_product(v1, v2); 34 | } 35 | } 36 | BENCHMARK(modernDP_double)->DenseRange(8, 10); 37 | 38 | // Our benchmark main function 39 | BENCHMARK_MAIN(); 40 | -------------------------------------------------------------------------------- /dot_product/tuned/tuned.cpp: -------------------------------------------------------------------------------- 1 | // This program implements two dot product implementations in C++ 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | // Hand-vectorized dot product 12 | float dot_product(const float *__restrict v1, const float *v2, const size_t N) { 13 | auto tmp = 0.0f; 14 | for (size_t i = 0; i < N; i += 8) { 15 | // Temporary variables to help with intrinsic 16 | float r[8]; 17 | __m256 rv; 18 | 19 | // Our dot product intrinsic 20 | rv = _mm256_dp_ps(_mm256_load_ps(v1 + i), _mm256_load_ps(v2 + i), 0xf1); 21 | 22 | // Avoid type punning using memcpy 23 | std::memcpy(r, &rv, sizeof(float) * 8); 24 | 25 | tmp += r[0] + r[4]; 26 | } 27 | return tmp; 28 | } 29 | 30 | // Benchmark our hand-tuned dot product 31 | static void handTunedDP(benchmark::State &s) { 32 | // Get the size of the vector 33 | size_t N = 1 << s.range(0); 34 | 35 | // Initialize the vectors 36 | // 
Align memory to 32 bytes for the vector instruction 37 | float *v1 = (float *)aligned_alloc(32, N * sizeof(float)); 38 | float *v2 = (float *)aligned_alloc(32, N * sizeof(float)); 39 | for (size_t i = 0; i < N; i++) { 40 | v1[i] = rand() % 100; 41 | v2[i] = rand() % 100; 42 | } 43 | 44 | // Keep our result from being optimized away 45 | volatile float result = 0; 46 | 47 | // Our benchmark loop 48 | while (s.KeepRunning()) { 49 | result = dot_product(v1, v2, N); 50 | } 51 | 52 | // Release the aligned buffers so repeated benchmark runs don't leak them 53 | free(v1); 54 | free(v2); 55 | } 56 | BENCHMARK(handTunedDP)->DenseRange(8, 10); 57 | 58 | // Our benchmark main function 59 | BENCHMARK_MAIN(); 60 | -------------------------------------------------------------------------------- /duplicate_removal/duplicate_removal.cpp: -------------------------------------------------------------------------------- 1 | // Benchmark for removing duplicates from a vector 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | // Function for generating argument pairs (log2 of the input size, max random value) 12 | static void custom_args(benchmark::internal::Benchmark *b) { 13 | for (auto i : {10, 11, 12}) { 14 | for (auto j : {10, 100, 1000, 10000}) { 15 | b = b->ArgPair(i, j); 16 | } 17 | } 18 | } 19 | 20 | // Baseline benchmark that uses a linear search of the output vector 21 | static void baseline(benchmark::State &s) { 22 | // Create input and output vectors 23 | int N = 1 << s.range(0); 24 | std::vector v_in(N); 25 | std::vector v_out; 26 | 27 | // Create our random number generators 28 | std::mt19937 rng; 29 | rng.seed(std::random_device()()); 30 | std::uniform_int_distribution dist(0, s.range(1)); 31 | 32 | // Fill the input vector with random numbers 33 | std::generate(begin(v_in), end(v_in), [&] { return dist(rng); }); 34 | 35 | // Benchmark loop 36 | for (auto _ : s) { 37 | // For every value in the input vector 38 | for (auto i : v_in) { 39 | // Check if it is not in the output vector 40 | if (std::find(begin(v_out), end(v_out), i) == v_out.end()) 41 | // And put it in if it's
not 42 | v_out.push_back(i); 43 | } 44 | 45 | // Clear each iteration 46 | v_out.clear(); 47 | } 48 | } 49 | BENCHMARK(baseline)->Apply(custom_args)->Unit(benchmark::kMicrosecond); 50 | 51 | // Benchmark that filters the values in a hash set 52 | static void unordered_set(benchmark::State &s) { 53 | // Create the input vector and the hash set used as a filter 54 | int N = 1 << s.range(0); 55 | std::vector v_in(N); 56 | std::unordered_set filter; 57 | 58 | // Create our random number generators 59 | std::mt19937 rng; 60 | rng.seed(std::random_device()()); 61 | std::uniform_int_distribution dist(0, s.range(1)); 62 | 63 | // Fill the input vector with random numbers 64 | std::generate(begin(v_in), end(v_in), [&] { return dist(rng); }); 65 | 66 | // Benchmark loop 67 | for (auto _ : s) { 68 | // Insert each element into the unordered set 69 | // Duplicates are rejected by the set (the existing element is kept) 70 | for (auto i : v_in) filter.insert(i); 71 | 72 | // Clear each iteration 73 | filter.clear(); 74 | } 75 | } 76 | BENCHMARK(unordered_set)->Apply(custom_args)->Unit(benchmark::kMicrosecond); 77 | 78 | // Benchmark that filters with a hash set then copies into a vector 79 | static void unordered_set_copy(benchmark::State &s) { 80 | // Create input and output vectors 81 | int N = 1 << s.range(0); 82 | std::vector v_in(N); 83 | std::vector v_out; 84 | std::unordered_set filter; 85 | 86 | // Create our random number generators 87 | std::mt19937 rng; 88 | rng.seed(std::random_device()()); 89 | std::uniform_int_distribution dist(0, s.range(1)); 90 | 91 | // Fill the input vector with random numbers 92 | std::generate(begin(v_in), end(v_in), [&] { return dist(rng); }); 93 | 94 | // Benchmark loop 95 | for (auto _ : s) { 96 | // The output vector and filter live outside the loop and are cleared 97 | // at the end of each iteration 98 | 99 | // Insert each element into the unordered set 100 | // Duplicates are rejected by the set (the existing element is kept) 101 | for (auto i : v_in) filter.insert(i); 102 | for (auto i : filter) v_out.push_back(i); 103 | 104 | // Clear each
iteration 105 | v_out.clear(); 106 | filter.clear(); 107 | } 108 | } 109 | BENCHMARK(unordered_set_copy) 110 | ->Apply(custom_args) 111 | ->Unit(benchmark::kMicrosecond); 112 | 113 | // Benchmark that sorts the data then removes adjacent duplicates 114 | static void sort_unique(benchmark::State &s) { 115 | // Create input and output vectors 116 | int N = 1 << s.range(0); 117 | std::vector v_in(N); 118 | std::vector v_out; 119 | 120 | // Create our random number generators 121 | std::mt19937 rng; 122 | rng.seed(std::random_device()()); 123 | std::uniform_int_distribution dist(0, s.range(1)); 124 | 125 | // Fill the input vector with random numbers 126 | std::generate(begin(v_in), end(v_in), [&] { return dist(rng); }); 127 | 128 | // Benchmark loop 129 | for (auto _ : s) { 130 | // Copy in the random numbers 131 | v_out = v_in; 132 | 133 | // Sort the vector 134 | std::ranges::sort(v_out); 135 | 136 | // std::ranges::unique only shifts the unique values to the front, so erase the leftover tail 137 | v_out.erase(std::ranges::unique(v_out).begin(), v_out.end()); 138 | } 139 | } 140 | BENCHMARK(sort_unique)->Apply(custom_args)->Unit(benchmark::kMicrosecond); 141 | 142 | BENCHMARK_MAIN(); 143 | -------------------------------------------------------------------------------- /false_sharing/aligned_type.cpp: -------------------------------------------------------------------------------- 1 | // This program shows how atomic integers may be allocated in C++ 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | 7 | // Our aligned atomic 8 | struct alignas(64) AlignedType { 9 | AlignedType() { val = 0; } 10 | std::atomic val; 11 | }; 12 | 13 | int main() { 14 | // Now we're guaranteed that our atomics will be at least 64 bytes apart!
15 | AlignedType a{}; 16 | AlignedType b{}; 17 | AlignedType c{}; 18 | AlignedType d{}; 19 | 20 | // Print out the addresses 21 | std::cout << "Address of AlignedType a - " << &a << '\n'; 22 | std::cout << "Address of AlignedType b - " << &b << '\n'; 23 | std::cout << "Address of AlignedType c - " << &c << '\n'; 24 | std::cout << "Address of AlignedType d - " << &d << '\n'; 25 | 26 | return 0; 27 | } 28 | -------------------------------------------------------------------------------- /false_sharing/atomic_int.cpp: -------------------------------------------------------------------------------- 1 | // This program shows how atomic integers may be allocated in C++ 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | 7 | int main() { 8 | // If we create four atomic integers like this, there's a high probability 9 | // they'll wind up next to each other in memory 10 | std::atomic a; 11 | std::atomic b; 12 | std::atomic c; 13 | std::atomic d; 14 | 15 | // Print out the addresses 16 | std::cout << "Address of atomic a - " << &a << '\n'; 17 | std::cout << "Address of atomic b - " << &b << '\n'; 18 | std::cout << "Address of atomic c - " << &c << '\n'; 19 | std::cout << "Address of atomic d - " << &d << '\n'; 20 | 21 | return 0; 22 | } 23 | -------------------------------------------------------------------------------- /false_sharing/false_sharing.cpp: -------------------------------------------------------------------------------- 1 | // This program shows off the severe implications of false sharing in 2 | // C++ using std::atomic and std::thread 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | // Simple function for incrementing an atomic int 100000 times 9 | void work(std::atomic& a) { 10 | for (int i = 0; i < 100000; i++) { 11 | a++; 12 | } 13 | } 14 | 15 | // Simple single-threaded function that calls work 4 times 16 | void single_thread() { 17 | std::atomic a; 18 | a = 0; 19 | 20 | work(a); 21 | work(a); 22 | work(a); 23 | work(a); 24 | } 25 | 26 | // A
simple benchmark that runs our single-threaded implementation 27 | static void singleThread(benchmark::State& s) { 28 | while (s.KeepRunning()) { 29 | single_thread(); 30 | } 31 | } 32 | BENCHMARK(singleThread)->Unit(benchmark::kMillisecond); 33 | 34 | // Tries to parallelize the work across multiple threads 35 | // However, each core invalidates the other cores copies on a write 36 | // This is an EXTREME example of poorly thought out code 37 | void same_var() { 38 | std::atomic a; 39 | a = 0; 40 | 41 | // Create four threads and use a lambda to launch work 42 | std::thread t1([&]() { work(a); }); 43 | std::thread t2([&]() { work(a); }); 44 | std::thread t3([&]() { work(a); }); 45 | std::thread t4([&]() { work(a); }); 46 | 47 | // Join the threads 48 | t1.join(); 49 | t2.join(); 50 | t3.join(); 51 | t4.join(); 52 | } 53 | 54 | // A simple benchmark that runs our single-threaded implementation 55 | static void directSharing(benchmark::State& s) { 56 | while (s.KeepRunning()) { 57 | same_var(); 58 | } 59 | } 60 | BENCHMARK(directSharing)->UseRealTime()->Unit(benchmark::kMillisecond); 61 | 62 | // How well does it work if we use different atomic ints? 63 | // Not that well! But look at the addresses! They all reside on the 64 | // same cache line. That means we have false sharing! 
65 | // (We invalidate variables that aren't actually being accessed 66 | // because they happen to be on the same cache line) 67 | void diff_var() { 68 | std::atomic a{0}; 69 | std::atomic b{0}; 70 | std::atomic c{0}; 71 | std::atomic d{0}; 72 | 73 | // Creat four threads and use lambda to launch work 74 | std::thread t1([&]() { work(a); }); 75 | std::thread t2([&]() { work(b); }); 76 | std::thread t3([&]() { work(c); }); 77 | std::thread t4([&]() { work(d); }); 78 | 79 | // Join the threads 80 | t1.join(); 81 | t2.join(); 82 | t3.join(); 83 | t4.join(); 84 | } 85 | 86 | // A simple benchmark that runs our single-threaded implementation 87 | static void falseSharing(benchmark::State& s) { 88 | while (s.KeepRunning()) { 89 | diff_var(); 90 | } 91 | } 92 | BENCHMARK(falseSharing)->UseRealTime()->Unit(benchmark::kMillisecond); 93 | 94 | // We can align the struct to 64 bytes 95 | // Now each struct will be on a different cache line 96 | struct alignas(64) AlignedType { 97 | AlignedType() { val = 0; } 98 | std::atomic val; 99 | }; 100 | 101 | // No more invalidations, so our code should be roughly the same as the 102 | void diff_line() { 103 | AlignedType a{}; 104 | AlignedType b{}; 105 | AlignedType c{}; 106 | AlignedType d{}; 107 | 108 | // Launch the four threads now using our aligned data 109 | std::thread t1([&]() { work(a.val); }); 110 | std::thread t2([&]() { work(b.val); }); 111 | std::thread t3([&]() { work(c.val); }); 112 | std::thread t4([&]() { work(d.val); }); 113 | 114 | // Join the threads 115 | t1.join(); 116 | t2.join(); 117 | t3.join(); 118 | t4.join(); 119 | } 120 | 121 | // A simple benchmark that runs our single-threaded implementation 122 | static void noSharing(benchmark::State& s) { 123 | while (s.KeepRunning()) { 124 | diff_line(); 125 | } 126 | } 127 | BENCHMARK(noSharing)->UseRealTime()->Unit(benchmark::kMillisecond); 128 | 129 | BENCHMARK_MAIN(); 130 | -------------------------------------------------------------------------------- 
/false_sharing/vary_thread.cpp: -------------------------------------------------------------------------------- 1 | // This benchmark scales the number of threads in our false sharing benchmark 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // Simple function for incrementing an atomic int 10 | void work(std::atomic& a, int n) { 11 | for (int i = 0; i < (400000 / n); i++) { 12 | a++; 13 | } 14 | } 15 | 16 | // Benchmark 2 threads 17 | void bench2() { 18 | std::atomic a{0}; 19 | std::atomic b{0}; 20 | 21 | // Creat four threads and use lambda to launch work 22 | std::thread t1([&]() { work(a, 2); }); 23 | std::thread t2([&]() { work(b, 2); }); 24 | 25 | // Join the threads 26 | t1.join(); 27 | t2.join(); 28 | } 29 | 30 | // A simple benchmark that runs our single-threaded implementation 31 | static void twoThreads(benchmark::State& s) { 32 | while (s.KeepRunning()) { 33 | bench2(); 34 | } 35 | } 36 | BENCHMARK(twoThreads)->UseRealTime()->Unit(benchmark::kMillisecond); 37 | 38 | // Benchmark 4 threads 39 | void bench4() { 40 | std::atomic a{0}; 41 | std::atomic b{0}; 42 | std::atomic c{0}; 43 | std::atomic d{0}; 44 | 45 | // Creat four threads and use lambda to launch work 46 | std::thread t1([&]() { work(a, 4); }); 47 | std::thread t2([&]() { work(b, 4); }); 48 | std::thread t3([&]() { work(c, 4); }); 49 | std::thread t4([&]() { work(d, 4); }); 50 | 51 | // Join the threads 52 | t1.join(); 53 | t2.join(); 54 | t3.join(); 55 | t4.join(); 56 | } 57 | 58 | // A simple benchmark that runs our single-threaded implementation 59 | static void fourThreads(benchmark::State& s) { 60 | while (s.KeepRunning()) { 61 | bench4(); 62 | } 63 | } 64 | BENCHMARK(fourThreads)->UseRealTime()->Unit(benchmark::kMillisecond); 65 | 66 | // Benchmark 8 threads 67 | void bench8() { 68 | std::atomic a{0}; 69 | std::atomic b{0}; 70 | std::atomic c{0}; 71 | std::atomic d{0}; 72 | std::atomic e{0}; 73 | std::atomic f{0}; 74 | std::atomic 
g{0}; 75 | std::atomic h{0}; 76 | 77 | // Creat four threads and use lambda to launch work 78 | std::thread t1([&]() { work(a, 8); }); 79 | std::thread t2([&]() { work(b, 8); }); 80 | std::thread t3([&]() { work(c, 8); }); 81 | std::thread t4([&]() { work(d, 8); }); 82 | std::thread t5([&]() { work(e, 8); }); 83 | std::thread t6([&]() { work(f, 8); }); 84 | std::thread t7([&]() { work(g, 8); }); 85 | std::thread t8([&]() { work(h, 8); }); 86 | 87 | // Join the threads 88 | t1.join(); 89 | t2.join(); 90 | t3.join(); 91 | t4.join(); 92 | t5.join(); 93 | t6.join(); 94 | t7.join(); 95 | t8.join(); 96 | } 97 | 98 | // A simple benchmark that runs our single-threaded implementation 99 | static void eightThreads(benchmark::State& s) { 100 | while (s.KeepRunning()) { 101 | bench8(); 102 | } 103 | } 104 | BENCHMARK(eightThreads)->UseRealTime()->Unit(benchmark::kMillisecond); 105 | 106 | BENCHMARK_MAIN(); 107 | -------------------------------------------------------------------------------- /hw_barrier/hw_barrier.cpp: -------------------------------------------------------------------------------- 1 | // This example shows off a memory problem on x86 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | void reorder(std::binary_semaphore &start, std::counting_semaphore<2> &end, 12 | int &v1, int &v2, int &rec) { 13 | // Keep going forever 14 | while (true) { 15 | // Wait for the signal to start 16 | start.acquire(); 17 | 18 | // Write to v2 19 | v1 = 1; 20 | 21 | // Barrier to prevent re-ordering in the hardware! 
22 | _mm_mfence(); 23 | 24 | // Read v1 25 | rec = v2; 26 | 27 | // Say we're done for this iteration 28 | end.release(); 29 | } 30 | } 31 | 32 | int main() { 33 | // Semaphores for signaling threads 34 | std::binary_semaphore s1(0); 35 | std::binary_semaphore s2(0); 36 | std::counting_semaphore<2> e(0); 37 | 38 | // Variable for memory re-ordering 39 | int v1 = 0; 40 | int v2 = 0; 41 | int r1 = 0; 42 | int r2 = 0; 43 | 44 | // Start threads 45 | std::thread t1([&] { reorder(s1, e, v1, v2, r1); }); 46 | std::thread t2([&] { reorder(s2, e, v2, v1, r2); }); 47 | 48 | for (int i = 0;; i++) { 49 | // Re-initialize the shared variables 50 | v1 = 0; 51 | v2 = 0; 52 | 53 | // Signal the threads to start 54 | s1.release(); 55 | s2.release(); 56 | 57 | // Wait for them to finish 58 | e.acquire(); 59 | e.acquire(); 60 | 61 | // Check of both read values bypassed the loads 62 | auto cond = (r1 == 0) && (r2 == 0); 63 | if (cond) { 64 | printf("ERROR! R1 = %d, R2 = %d, ITER %d\n", r1, r2, i); 65 | assert(false); 66 | } else 67 | printf("ALL GOOD! 
R1 = %d, R2 = %d\n", r1, r2); 68 | } 69 | 70 | // Should not get here (infinite loop) 71 | return 0; 72 | } 73 | -------------------------------------------------------------------------------- /hw_barrier/sw_barrier.cpp: -------------------------------------------------------------------------------- 1 | // This example shows off a memory problem on x86 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | void reorder(std::binary_semaphore &start, std::counting_semaphore<2> &end, 10 | int &v1, int &v2, int &rec) { 11 | // Keep going forever 12 | while (true) { 13 | // Wait for the signal to start 14 | start.acquire(); 15 | 16 | // Write to v2 17 | v1 = 1; 18 | 19 | // Barrier to prevent re-ordering of read and write by compiler 20 | asm volatile("" : : : "memory"); 21 | 22 | // Read v1 23 | rec = v2; 24 | 25 | // Say we're done for this iteration 26 | end.release(); 27 | } 28 | } 29 | 30 | int main() { 31 | // Semaphores for signaling threads 32 | std::binary_semaphore s1(0); 33 | std::binary_semaphore s2(0); 34 | std::counting_semaphore<2> e(0); 35 | 36 | // Variable for memory re-ordering 37 | int v1 = 0; 38 | int v2 = 0; 39 | int r1 = 0; 40 | int r2 = 0; 41 | 42 | // Start threads 43 | std::thread t1([&] { reorder(s1, e, v1, v2, r1); }); 44 | std::thread t2([&] { reorder(s2, e, v2, v1, r2); }); 45 | 46 | for (int i = 0;; i++) { 47 | // Re-initialize the shared variables 48 | v1 = 0; 49 | v2 = 0; 50 | 51 | // Signal the threads to start 52 | s1.release(); 53 | s2.release(); 54 | 55 | // Wait for them to finish 56 | e.acquire(); 57 | e.acquire(); 58 | 59 | // Check of both read values bypassed the loads 60 | auto cond = (r1 == 0) && (r2 == 0); 61 | if (cond) { 62 | printf("ERROR! R1 = %d, R2 = %d, ITER %d\n", r1, r2, i); 63 | assert(false); 64 | } else 65 | printf("ALL GOOD! 
R1 = %d, R2 = %d\n", r1, r2); 66 | } 67 | 68 | // Should not get here (infinite loop) 69 | return 0; 70 | } 71 | -------------------------------------------------------------------------------- /inc_bench/bad_inc.cpp: -------------------------------------------------------------------------------- 1 | // A bad way for threads to write to the same memory location 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | 7 | int main() { 8 | // Shared value for our threads 9 | int shared_val = 0; 10 | 11 | // Number of iterations (65536) 12 | int N = 1 << 16; 13 | 14 | // Lambda that performs an increment 15 | auto inc_func = [&]() { 16 | for (auto i = 0; i < N; i++) shared_val++; 17 | }; 18 | 19 | // Create two threads 20 | std::thread t1(inc_func); 21 | std::thread t2(inc_func); 22 | 23 | // Join the threads 24 | t1.join(); 25 | t2.join(); 26 | 27 | // Print the result 28 | std::cout << "FINAL VALUE IS: " << shared_val << '\n'; 29 | 30 | return 0; 31 | } 32 | -------------------------------------------------------------------------------- /inc_bench/inc_bench.cpp: -------------------------------------------------------------------------------- 1 | // Benchmarks for incrementing an integer using different synchronization 2 | // techniques 3 | // By: Nick from CoffeeBeforeArch 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | // Function to incrememnt atomic int 13 | void inc_atomic(std::atomic_int64_t &val) { 14 | for (int i = 0; i < 100000; i++) val++; 15 | } 16 | 17 | // Function to increment using a lock 18 | void inc_mutex(std::mutex &m, long long int &val) { 19 | for (int i = 0; i < 100000; i++) { 20 | std::lock_guard g(m); 21 | val++; 22 | } 23 | } 24 | 25 | // Atomic bench 26 | static void atomic_bench(benchmark::State &s) { 27 | // Number of thread 28 | auto num_threads = s.range(0) - 1; 29 | 30 | // Create an atomic integer (for increments) 31 | std::atomic_int64_t val{0}; 32 | 33 | // Reserve space for 
threads 34 | std::vector threads; 35 | threads.reserve(num_threads); 36 | 37 | // Timing loop 38 | for (auto _ : s) { 39 | // Spawn threads 40 | for (auto i = 0u; i < num_threads; i++) 41 | threads.emplace_back([&] { inc_atomic(val); }); 42 | inc_atomic(val); 43 | 44 | // Join threads 45 | for (auto &thread : threads) thread.join(); 46 | 47 | // Clear to prevent joining twice 48 | threads.clear(); 49 | } 50 | } 51 | BENCHMARK(atomic_bench) 52 | ->DenseRange(1, std::thread::hardware_concurrency()) 53 | ->Unit(benchmark::kMillisecond) 54 | ->UseRealTime(); 55 | 56 | // Lock guard bench 57 | static void lock_guard_bench(benchmark::State &s) { 58 | // Number of thread 59 | auto num_threads = s.range(0) - 1; 60 | 61 | // Create an atomic integer (for increments) 62 | long long int val{0}; 63 | 64 | // Reserve space for threads 65 | std::vector threads; 66 | threads.reserve(num_threads); 67 | 68 | // Create a mutex 69 | std::mutex m; 70 | 71 | // Timing loop 72 | for (auto _ : s) { 73 | // Spawn threads 74 | for (auto i = 0u; i < num_threads; i++) 75 | threads.emplace_back([&] { inc_mutex(m, val); }); 76 | inc_mutex(m, val); 77 | 78 | // Join threads 79 | for (auto &thread : threads) thread.join(); 80 | 81 | // Clear to prevent joining twice 82 | threads.clear(); 83 | } 84 | } 85 | BENCHMARK(lock_guard_bench) 86 | ->DenseRange(1, std::thread::hardware_concurrency()) 87 | ->Unit(benchmark::kMillisecond) 88 | ->UseRealTime(); 89 | 90 | BENCHMARK_MAIN(); 91 | -------------------------------------------------------------------------------- /java_sll/LinkedList.java: -------------------------------------------------------------------------------- 1 | // This is a simple singly linked list implemented in Java 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | class LinkedList { 5 | // Node class for linked list 6 | static class Node { 7 | int data; 8 | Node next; 9 | 10 | // Constructor 11 | Node(int d){ 12 | data = d; 13 | next = null; 14 | } 15 | } 16 | 17 | // Keep track of the 
head of the list 18 | Node head; 19 | 20 | // Method for inserting into the list 21 | public static LinkedList insert(LinkedList list, int data){ 22 | // Create a new node based on the data 23 | Node new_node = new Node(data); 24 | 25 | // Handle case that the list is empty 26 | // Otherwise normal insertion 27 | if(list.head == null){ 28 | list.head = new_node; 29 | } else{ 30 | // Start traversal at head 31 | Node temp = list.head; 32 | 33 | // Walk until you find the last node that points to null 34 | while(temp.next != null){ 35 | temp = temp.next; 36 | } 37 | 38 | // Insert the new node at the end of the list 39 | temp.next = new_node; 40 | } 41 | 42 | // Return the updated list 43 | return list; 44 | } 45 | 46 | // Method for deleting from the list 47 | public static LinkedList delete(LinkedList list){ 48 | // Handle case where we delete from an empty list 49 | // Otherwise normal deletion 50 | if(list.head == null){ 51 | return list; 52 | } else{ 53 | // Track previous and current node 54 | Node prev = null; 55 | Node temp = list.head; 56 | 57 | // Iterate over list until we find the last item 58 | while(temp.next != null){ 59 | prev = temp; 60 | temp = temp.next; 61 | } 62 | 63 | // Handle case where we are deleting the last list item 64 | if(prev == null){ 65 | list.head = null; 66 | }else{ 67 | // Just set the previous item to point to null 68 | prev.next = null; 69 | } 70 | return list; 71 | } 72 | } 73 | 74 | // Method for printing the list 75 | public static void printList(LinkedList list){ 76 | // Print a dividing line 77 | for(int i = 0; i < 72; i++){ 78 | System.out.print("-"); 79 | } 80 | System.out.println(); 81 | 82 | // Print out the list 83 | System.out.print("List:\t"); 84 | Node temp = list.head; 85 | while(temp != null){ 86 | System.out.print(temp.data + "\t"); 87 | temp = temp.next; 88 | } 89 | System.out.println(); 90 | 91 | // Print a dividing line 92 | for(int i = 0; i < 72; i++){ 93 | System.out.print("-"); 94 | } 95 | 
System.out.println(); 96 | } 97 | 98 | public static void main(String[] args){ 99 | // Create a new linked list 100 | LinkedList ll = new LinkedList(); 101 | 102 | // Add some nodes 103 | for(int i = 0; i < 5; i++){ 104 | ll = insert(ll, i * i); 105 | printList(ll); 106 | } 107 | 108 | // Now delete some nodes 109 | for(int i = 0; i < 5; i++){ 110 | ll = delete(ll); 111 | printList(ll); 112 | } 113 | } 114 | } 115 | 116 | -------------------------------------------------------------------------------- /peterson/peterson.cpp: -------------------------------------------------------------------------------- 1 | // Peterson's algorithm 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | 7 | // Simple class implementing Peterson's algorithm for mutual exclusion 8 | class Peterson { 9 | private: 10 | // Is this thread interested in the critical section 11 | // Needs to be volatile to prevent caching in registers 12 | volatile int interested[2] = {0, 0}; 13 | 14 | // Who's turn is it? 
15 | // Needs to be volatile to prevent caching in registers 16 | volatile int turn = 0; 17 | 18 | public: 19 | // Method for locking w/ Peterson's algorithm 20 | void lock(int tid) { 21 | // Mark that this thread wants to enter the critical section 22 | interested[tid] = 1; 23 | 24 | // Assume the other thread has priority 25 | int other = 1 - tid; 26 | turn = other; 27 | 28 | // Wait until the other thread finishes or is not interested 29 | while (turn == other && interested[other]) 30 | ; 31 | } 32 | 33 | // Method for unlocking w/ Peterson's algorithm 34 | void unlock(int tid) { 35 | // Mark that this thread is no longer interested 36 | interested[tid] = 0; 37 | } 38 | }; 39 | 40 | // Work function 41 | void work(Peterson &p, int &val, int tid) { 42 | for (int i = 0; i < 100000000; i++) { 43 | // Lock using Peterson's algorithm 44 | p.lock(tid); 45 | // Critical section 46 | val++; 47 | // Unlock using Peterson's algorithm 48 | p.unlock(tid); 49 | } 50 | } 51 | 52 | int main() { 53 | // Shared value 54 | int val = 0; 55 | Peterson p; 56 | 57 | // Create threads 58 | std::thread t0([&] { work(p, val, 0); }); 59 | std::thread t1([&] { work(p, val, 1); }); 60 | 61 | // Wait for the threads to finish 62 | t0.join(); 63 | t1.join(); 64 | 65 | // Print the result 66 | std::cout << "FINAL VALUE IS: " << val << '\n'; 67 | 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /peterson/peterson_hw_barrier.cpp: -------------------------------------------------------------------------------- 1 | // A practical example of hw memory barriers with Peterson's algorithm 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | // Simple class implementing Peterson's algorithm for mutual exclusion 10 | class Peterson { 11 | private: 12 | // Is this thread interested in the critical section 13 | // Needs to be volatile to prevent caching in registers 14 | volatile int interested[2] = {0, 0}; 
15 | 16 | // Who's turn is it? 17 | // Needs to be volatile to prevent caching in registers 18 | volatile int turn = 0; 19 | 20 | public: 21 | // Method for locking w/ Peterson's algorithm 22 | void lock(int tid) { 23 | // Mark that this thread wants to enter the critical section 24 | interested[tid] = 1; 25 | 26 | // Assume the other thread has priority 27 | int other = 1 - tid; 28 | turn = other; 29 | 30 | // Add memory fence to prevent reading interested early! 31 | // This ensures all previous writes have become visable, and no reads 32 | // have been re-ordered before this barrier 33 | _mm_mfence(); 34 | 35 | // Wait until the other thread finishes or is not interested 36 | while (turn == other && interested[other]) 37 | ; 38 | } 39 | 40 | // Method for unlocking w/ Peterson's algorithm 41 | void unlock(int tid) { 42 | // Mark that this thread is no longer interested 43 | interested[tid] = 0; 44 | } 45 | }; 46 | 47 | // Work function 48 | void work(Peterson &p, int &val, int tid) { 49 | for (int i = 0; i < 100000000; i++) { 50 | // Lock using Peterson's algorithm 51 | p.lock(tid); 52 | // Critical section 53 | val++; 54 | // Unlock using Peterson's algorithm 55 | p.unlock(tid); 56 | } 57 | } 58 | 59 | int main() { 60 | // Shared value 61 | int val = 0; 62 | Peterson p; 63 | 64 | // Create threads 65 | std::thread t0([&] { work(p, val, 0); }); 66 | std::thread t1([&] { work(p, val, 1); }); 67 | 68 | // Wait for the threads to finish 69 | t0.join(); 70 | t1.join(); 71 | 72 | // Print the result 73 | std::cout << "FINAL VALUE IS: " << val << '\n'; 74 | 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /simple_bench/my_bench.cpp: -------------------------------------------------------------------------------- 1 | // Benchmark std::accumulate 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "benchmark/benchmark.h" 9 | 10 | static void accumulate_bench(benchmark::State &s) { 11 | // 
Number of elements (2^10) 12 | auto N = 1 << s.range(0); 13 | 14 | // Create a vector of random numbers 15 | std::vector v(N); 16 | std::generate(begin(v), end(v), [] { return rand() % 100; }); 17 | 18 | // Variable for our results 19 | int result = 0; 20 | 21 | // Main timing loop 22 | for (auto _ : s) { 23 | benchmark::DoNotOptimize(result = std::accumulate(begin(v), end(v), 0)); 24 | } 25 | } 26 | BENCHMARK(accumulate_bench)->DenseRange(20, 22)->Unit(benchmark::kMicrosecond); 27 | 28 | BENCHMARK_MAIN(); 29 | -------------------------------------------------------------------------------- /sorting/sorting.cpp: -------------------------------------------------------------------------------- 1 | // Benchmark for sorting w/o duplicates 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "absl/container/flat_hash_map.h" 12 | 13 | // Function for generating argument pairs 14 | static void custom_args(benchmark::internal::Benchmark *b) { 15 | for (auto i : {14, 15, 16}) { 16 | for (auto j : {10, 100, 1000, 10000}) { 17 | b = b->ArgPair(i, j); 18 | } 19 | } 20 | } 21 | 22 | // Baseline benchmark used by sorting vectors 23 | static void baseline(benchmark::State &s) { 24 | // Create input and output vectors 25 | int N = 1 << s.range(0); 26 | std::vector v_in(N); 27 | std::vector v_out; 28 | 29 | // Create our random number generators 30 | std::mt19937 rng; 31 | rng.seed(std::random_device()()); 32 | std::uniform_int_distribution dist(0, s.range(1)); 33 | 34 | // Fill the input vector with random numbers 35 | std::generate(begin(v_in), end(v_in), [&] { return dist(rng); }); 36 | 37 | // Benchmark loop 38 | for (auto _ : s) { 39 | // Copy the random number to a new vector 40 | v_out = v_in; 41 | 42 | // Sort the numbers in the new vector 43 | std::ranges::sort(v_out); 44 | } 45 | } 46 | BENCHMARK(baseline)->Apply(custom_args)->Unit(benchmark::kMicrosecond); 47 | 48 | // Benchmark that filters the 
values in a hash set 49 | static void unordered_map(benchmark::State &s) { 50 | // Create input and output vectors 51 | int N = 1 << s.range(0); 52 | std::vector v_in(N); 53 | std::vector v_out; 54 | v_out.reserve(N); 55 | std::unordered_map filter; 56 | 57 | // Create our random number generators 58 | std::mt19937 rng; 59 | rng.seed(std::random_device()()); 60 | std::uniform_int_distribution dist(0, s.range(1)); 61 | 62 | // Fill the input vector with random numbers 63 | std::generate(begin(v_in), end(v_in), [&] { return dist(rng); }); 64 | 65 | // A vector for our sorted non-duplicates 66 | std::vector tmp; 67 | 68 | // Benchmark loop 69 | for (auto _ : s) { 70 | // Go through each element 71 | for (auto i : v_in) { 72 | // If it is in the filter, increment the number of instances 73 | if (filter.contains(i)) 74 | filter[i]++; 75 | else { 76 | // Otherwise, we found the first one 77 | filter[i] = 1; 78 | // Save one unique value to the vector 79 | tmp.push_back(i); 80 | } 81 | } 82 | 83 | // Sort the non-duplicates 84 | std::ranges::sort(tmp); 85 | 86 | // Recreate the sorted vector 87 | for (auto i : tmp) { 88 | for (int j = 0; j < filter[i]; j++) v_out.push_back(i); 89 | } 90 | 91 | // Clear each iteration 92 | filter.clear(); 93 | v_out.clear(); 94 | tmp.clear(); 95 | } 96 | } 97 | BENCHMARK(unordered_map)->Apply(custom_args)->Unit(benchmark::kMicrosecond); 98 | 99 | // Benchmark that filters the values in a hash set 100 | static void flat_hash_map(benchmark::State &s) { 101 | // Create input and output vectors 102 | int N = 1 << s.range(0); 103 | std::vector v_in(N); 104 | std::vector v_out; 105 | v_out.reserve(N); 106 | absl::flat_hash_map filter; 107 | 108 | // Create our random number generators 109 | std::mt19937 rng; 110 | rng.seed(std::random_device()()); 111 | std::uniform_int_distribution dist(0, s.range(1)); 112 | 113 | // Fill the input vector with random numbers 114 | std::generate(begin(v_in), end(v_in), [&] { return dist(rng); }); 115 | 116 | // 
A vector for our sorted non-duplicates 117 | std::vector tmp; 118 | 119 | // Benchmark loop 120 | for (auto _ : s) { 121 | // Go through each element 122 | for (auto i : v_in) { 123 | // If it is in the filter, increment the number of instances 124 | if (filter.contains(i)) 125 | filter[i]++; 126 | else { 127 | // Otherwise, we found the first one 128 | filter[i] = 1; 129 | // Save one unique value to the vector 130 | tmp.push_back(i); 131 | } 132 | } 133 | 134 | // Sort the non-duplicates 135 | std::ranges::sort(tmp); 136 | 137 | // Recreate the sorted vector 138 | for (auto i : tmp) { 139 | for (int j = 0; j < filter[i]; j++) v_out.push_back(i); 140 | } 141 | 142 | // Clear each iteration 143 | filter.clear(); 144 | v_out.clear(); 145 | tmp.clear(); 146 | } 147 | } 148 | BENCHMARK(flat_hash_map)->Apply(custom_args)->Unit(benchmark::kMicrosecond); 149 | 150 | BENCHMARK_MAIN(); 151 | -------------------------------------------------------------------------------- /strength_reduction/mod_bench.cpp: -------------------------------------------------------------------------------- 1 | // Short benchmark for strength reduction 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | 6 | #include 7 | 8 | // Benchmark idiv instruction 9 | static void baseMod(benchmark::State &s) { 10 | std::vector v_in(4096); 11 | std::vector v_out(4096); 12 | 13 | for (auto _ : s) { 14 | for (size_t i = 0; i < v_in.size(); i++) v_out[i] = v_in[i] % s.range(0); 15 | } 16 | } 17 | BENCHMARK(baseMod)->Arg(1245)->Unit(benchmark::kMicrosecond); 18 | 19 | // Benchmark compiler strength reduction 20 | static void srMod(benchmark::State &s) { 21 | std::vector v_in(4096); 22 | std::vector v_out(4096); 23 | 24 | for (auto _ : s) { 25 | for (size_t i = 0; i < v_in.size(); i++) v_out[i] = v_in[i] % 1245; 26 | } 27 | } 28 | BENCHMARK(srMod)->Unit(benchmark::kMicrosecond); 29 | 30 | BENCHMARK_MAIN(); 31 | -------------------------------------------------------------------------------- 
/sum_reduction/generalized.cu: -------------------------------------------------------------------------------- 1 | // This program performs sum reduction with an optimization 2 | // removing warp bank conflicts 3 | // By: Nick from CoffeeBeforeArch 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define SIZE 256 13 | 14 | __global__ void sum_reduction(int *v, int *v_r, int n) { 15 | // Calculate thread ID 16 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 17 | 18 | // Boundary check 19 | if (tid < n) { 20 | // Allocate shared memory 21 | __shared__ int partial_sum[SIZE]; 22 | 23 | // Calculate the number of elements this block reduces 24 | // Only the last block may have stragglers 25 | int reduce_elements; 26 | if (blockIdx.x == gridDim.x - 1) { 27 | reduce_elements = n - blockIdx.x * SIZE; 28 | } else { 29 | reduce_elements = SIZE; 30 | } 31 | 32 | // Find the next power of two 33 | // __clz finds the leading number of zeros in an int 34 | int next_power = 1 << (32 - __clz(reduce_elements) + 1); 35 | int init = next_power > SIZE ? 
SIZE : next_power; 36 | 37 | // Load elements into shared memory 38 | partial_sum[threadIdx.x] = v[tid]; 39 | __syncthreads(); 40 | 41 | // Start with a padded number of reduce_elements 42 | for (int s = init; s > 0; s >>= 1) { 43 | // Only threads < stride compute partial sums 44 | // Only threads accessing elements < reduce_elements need to be active 45 | // This handles the case where reduce_elements is an odd number 46 | if (threadIdx.x < s && threadIdx.x + s < reduce_elements) { 47 | partial_sum[threadIdx.x] += partial_sum[threadIdx.x + s]; 48 | } 49 | __syncthreads(); 50 | } 51 | 52 | // Let the thread 0 for this block write it's result to main memory 53 | // Result is inexed by this block 54 | if (threadIdx.x == 0) { 55 | v_r[blockIdx.x] = partial_sum[0]; 56 | } 57 | } 58 | } 59 | 60 | int main() { 61 | // Vector size 62 | int n = 1 << 20; 63 | size_t bytes = n * sizeof(int); 64 | 65 | // Host-side input vector 66 | std::vector h_v(n); 67 | std::generate(begin(h_v), end(h_v), []() { return rand() % 10; }); 68 | 69 | // Single result element 70 | int h_v_r = 0; 71 | 72 | // Allocate device memory 73 | int *d_v, *d_v_r; 74 | cudaMalloc(&d_v, bytes); 75 | cudaMalloc(&d_v_r, bytes); 76 | 77 | // Copy to device 78 | cudaMemcpy(d_v, h_v.data(), bytes, cudaMemcpyHostToDevice); 79 | 80 | // TB Size 81 | int TB_SIZE = SIZE; 82 | 83 | // Grid Size 84 | int GRID_SIZE = (n + TB_SIZE - 1) / TB_SIZE; 85 | 86 | // Number of elements reduced in the next iteration 87 | int num_elements = n; 88 | 89 | // Launch kernels until we've performed the complete reduction 90 | while (1) { 91 | // Call kernel 92 | sum_reduction<<>>(d_v, d_v_r, num_elements); 93 | 94 | // No more reductions left! 
95 | if (GRID_SIZE == 1) break; 96 | 97 | // Swap the pointers each iteration 98 | // Output from last iteration is the input to the next 99 | std::swap(d_v, d_v_r); 100 | 101 | // Calculate the number of elements next iteration 102 | // Number of input elements next iteration is the number of output 103 | // elements from last iteration 104 | num_elements = GRID_SIZE; 105 | 106 | // Calculate padded grid size 107 | GRID_SIZE = (GRID_SIZE + TB_SIZE - 1) / TB_SIZE; 108 | } 109 | 110 | // Copy the result back to the host 111 | cudaMemcpy(&h_v_r, d_v_r, sizeof(int), cudaMemcpyDeviceToHost); 112 | 113 | // Host sum 114 | int res = std::accumulate(begin(h_v), end(h_v), 0); 115 | 116 | // Check the result 117 | assert(h_v_r == res); 118 | 119 | std::cout << "COMPLETED SUCCESSFULLY!\n"; 120 | 121 | return 0; 122 | } 123 | -------------------------------------------------------------------------------- /sum_reduction/sum_reduction.cu: -------------------------------------------------------------------------------- 1 | // This program implements a simple, but flexible sum reduction 2 | // implementation in CUDA 3 | // By: Nick from CoffeeBeforeArch 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | // Each TB needs some shared memory 10 | // Allocate for 256 ints 11 | #define SHMEM_SIZE 256 * 4 12 | 13 | using namespace std; 14 | 15 | // Sum reduction kernel taken from previous CUDA video 16 | // Slightly modified to handle inputs of not powers of 256 17 | __global__ void sum_reduction(int *v_in, int *v_out, int N) { 18 | // Allocate shared memory statically 19 | __shared__ int partial_sum[SHMEM_SIZE]; 20 | 21 | // Calculate thread ID 22 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 23 | 24 | // Mask off inactive threads in a TB 25 | if (tid < N) { 26 | // Load elements into shared memory 27 | partial_sum[threadIdx.x] = v_in[tid]; 28 | __syncthreads(); 29 | 30 | // How many elements we sum in this TB depends on how many remaining 31 | // elements there are. 
If 256, use blockDim, else, use N 32 | int max = (N < blockDim.x) ? N : blockDim.x; 33 | 34 | // Iterate of log base 2 the block dimension 35 | for (int s = 1; s < max; s *= 2) { 36 | // Reduce the threads performing work by half previous the previous 37 | // iteration each cycle 38 | if (threadIdx.x % (2 * s) == 0) { 39 | partial_sum[threadIdx.x] += partial_sum[threadIdx.x + s]; 40 | } 41 | __syncthreads(); 42 | } 43 | 44 | // Let the thread 0 for this block write it's result to main memory 45 | // Result is inexed by this block 46 | if (threadIdx.x == 0) { 47 | v_out[blockIdx.x] = partial_sum[0]; 48 | } 49 | } 50 | } 51 | 52 | int main() { 53 | // Number of elements 54 | int N = 1 << 20; 55 | int CPU_SUM = N; 56 | size_t bytes = N * sizeof(int); 57 | 58 | // Host arrays 59 | int *h_v_in = new int[N]; 60 | int *h_v_out = new int[1]; 61 | 62 | // Device arrays 63 | int *d_v_in, *d_v_out; 64 | cudaMalloc(&d_v_in, bytes); 65 | cudaMalloc(&d_v_out, bytes); 66 | 67 | // Init input array 68 | for (int i = 0; i < N; i++) { 69 | h_v_in[i] = 1; 70 | } 71 | 72 | // Copy the array over 73 | cudaMemcpy(d_v_in, h_v_in, bytes, cudaMemcpyHostToDevice); 74 | 75 | // TB size; 76 | int THREADS = 256; 77 | 78 | // Track the number of iterations it takes 79 | int iter = 0; 80 | 81 | // Grid size may change each loop iteration 82 | int GRID; 83 | 84 | // Simple loop to keep launching kernels until we're done 85 | // N == 1 means we only have 1 elements left, aka, we're done 86 | // If it's not a power of 256, N will converge to 0 instead 87 | while (N > 1) { 88 | // Calculate the grid size 89 | GRID = (N + THREADS - 1) / THREADS; 90 | 91 | // Alternate kernel inputs 92 | // This is so we don't need to constantly re-allocate new 93 | // output arrays 94 | if (iter % 2) { 95 | sum_reduction<<>>(d_v_out, d_v_in, N); 96 | } else { 97 | sum_reduction<<>>(d_v_in, d_v_out, N); 98 | } 99 | 100 | // LOG_256(N) iterations of the loop 101 | iter++; 102 | N /= 256; 103 | } 104 | 105 | // Which 
// Bumps the shared atomic counter by one, 100000 times in total.
// Each bump is a separate atomic read-modify-write on `a`.
void work(std::atomic<int>& a) {
  constexpr int kIncrements = 100000;
  int remaining = kIncrements;
  while (remaining-- > 0) {
    a.fetch_add(1);
  }
}
}; 24 | 25 | void os_scheduler() { 26 | AlignedAtomic a{0}; 27 | AlignedAtomic b{0}; 28 | 29 | // Create four threads and use lambda to launch work 30 | std::thread t1([&]() { work(a.val); }); 31 | std::thread t2([&]() { work(a.val); }); 32 | std::thread t3([&]() { work(b.val); }); 33 | std::thread t4([&]() { work(b.val); }); 34 | 35 | // Join the threads 36 | t1.join(); 37 | t2.join(); 38 | t3.join(); 39 | t4.join(); 40 | } 41 | 42 | // Data sharing benchmark w/ OS scheduling 43 | static void osScheduling(benchmark::State& s) { 44 | while (s.KeepRunning()) { 45 | os_scheduler(); 46 | } 47 | } 48 | BENCHMARK(osScheduling)->UseRealTime()->Unit(benchmark::kMillisecond); 49 | 50 | void thread_affinity() { 51 | AlignedAtomic a{0}; 52 | AlignedAtomic b{0}; 53 | 54 | // Create cpu sets for threads 0,1 and 2,3 55 | cpu_set_t cpu_set_1; 56 | cpu_set_t cpu_set_2; 57 | 58 | // Zero them out 59 | CPU_ZERO(&cpu_set_1); 60 | CPU_ZERO(&cpu_set_2); 61 | 62 | // And set the CPU cores we want to pin the threads too 63 | CPU_SET(0, &cpu_set_1); 64 | CPU_SET(1, &cpu_set_2); 65 | 66 | // Create thread 0 and 1, and pin them to core 0 67 | std::thread t0([&]() { work(a.val); }); 68 | assert(pthread_setaffinity_np(t0.native_handle(), sizeof(cpu_set_t), 69 | &cpu_set_1) == 0); 70 | std::thread t1([&]() { work(a.val); }); 71 | assert(pthread_setaffinity_np(t1.native_handle(), sizeof(cpu_set_t), 72 | &cpu_set_1) == 0); 73 | 74 | // Create thread 1 and 2, and pin them to core 1 75 | std::thread t2([&]() { work(b.val); }); 76 | assert(pthread_setaffinity_np(t2.native_handle(), sizeof(cpu_set_t), 77 | &cpu_set_2) == 0); 78 | std::thread t3([&]() { work(b.val); }); 79 | assert(pthread_setaffinity_np(t3.native_handle(), sizeof(cpu_set_t), 80 | &cpu_set_2) == 0); 81 | 82 | // Join the threads 83 | t0.join(); 84 | t1.join(); 85 | t2.join(); 86 | t3.join(); 87 | } 88 | 89 | // Data sharing benchmark w/ manual scheduling 90 | static void threadAffinity(benchmark::State& s) { 91 | while 
(s.KeepRunning()) { 92 | thread_affinity(); 93 | } 94 | } 95 | BENCHMARK(threadAffinity)->UseRealTime()->Unit(benchmark::kMillisecond); 96 | 97 | BENCHMARK_MAIN(); 98 | -------------------------------------------------------------------------------- /vector_add/vectorAdd.cu: -------------------------------------------------------------------------------- 1 | // This program computes the sum of two vectors of length N 2 | // By: Nick from CoffeeBeforeArch 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using std::begin; 12 | using std::copy; 13 | using std::cout; 14 | using std::end; 15 | using std::endl; 16 | using std::generate; 17 | using std::vector; 18 | 19 | // Function that can be call from the CPU or GPU 20 | // Called by the GPU kernel vectorAdd 21 | // Called by the host-side function verify_result 22 | __host__ __device__ int add(int a, int b){ 23 | return a + b; 24 | } 25 | 26 | // CUDA kernel for vector addition 27 | // __global__ means this is called from the CPU, and runs on the GPU 28 | __global__ void vectorAdd(int* a, int* b, int* c, int N) { 29 | // Calculate global thread ID 30 | int tid = (blockIdx.x * blockDim.x) + threadIdx.x; 31 | 32 | // Boundary check 33 | if (tid < N) { 34 | // Each thread adds a single element 35 | c[tid] = add(a[tid], b[tid]); 36 | } 37 | } 38 | 39 | // Check vector add result 40 | void verify_result(vector a, vector b, vector c) { 41 | for (int i = 0; i < a.size(); i++) { 42 | assert(c[i] == add(a[i], b[i])); 43 | } 44 | } 45 | 46 | int main() { 47 | // Array size of 2^16 (65536 elements) 48 | constexpr int N = 1 << 16; 49 | size_t bytes = sizeof(int) * N; 50 | 51 | // Vectors for holding the host-side (CPU-side) data 52 | vector a(N); 53 | vector b(N); 54 | vector c(N); 55 | 56 | // Initialize random numbers in each array 57 | generate(begin(a), end(a), []() { return rand() % 100; }); 58 | generate(begin(b), end(b), []() { return rand() % 100; }); 59 | 60 | // Allocate memory 
on the device 61 | int *d_a, *d_b, *d_c; 62 | cudaMalloc(&d_a, bytes); 63 | cudaMalloc(&d_b, bytes); 64 | cudaMalloc(&d_c, bytes); 65 | 66 | // Copy data from the host to the device (CPU -> GPU) 67 | cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); 68 | cudaMemcpy(d_b, b.data(), bytes, cudaMemcpyHostToDevice); 69 | 70 | // Threads per CTA (1024 threads per CTA) 71 | int NUM_THREADS = 1 << 10; 72 | 73 | // CTAs per Grid 74 | // We need to launch at LEAST as many threads as we have elements 75 | // This equation pads an extra CTA to the grid if N cannot evenly be divided 76 | // by NUM_THREADS (e.g. N = 1025, NUM_THREADS = 1024) 77 | int NUM_BLOCKS = (N + NUM_THREADS - 1) / NUM_THREADS; 78 | 79 | // Launch the kernel on the GPU 80 | // Kernel calls are asynchronous (the CPU program continues execution after 81 | // call, but no necessarily before the kernel finishes) 82 | vectorAdd<<>>(d_a, d_b, d_c, N); 83 | 84 | // Copy sum vector from device to host 85 | // cudaMemcpy is a synchronous operation, and waits for the prior kernel 86 | // launch to complete (both go to the default stream in this case). 87 | // Therefore, this cudaMemcpy acts as both a memcpy and synchronization 88 | // barrier. 89 | cudaMemcpy(c.data(), d_c, bytes, cudaMemcpyDeviceToHost); 90 | 91 | // Check result for errors 92 | verify_result(a, b, c); 93 | 94 | // Free memory on device 95 | cudaFree(d_a); 96 | cudaFree(d_b); 97 | cudaFree(d_c); 98 | 99 | cout << "COMPLETED SUCCESSFULLY" << endl; 100 | 101 | return 0; 102 | } 103 | --------------------------------------------------------------------------------