├── .gitignore ├── COPYING ├── Makefile ├── README.asciidoc ├── bench ├── Makefile └── bench.c ├── internal.h ├── lrtypes.h ├── misc.c ├── opti.c ├── opti_init.c ├── rated.c ├── simple.c ├── test ├── Makefile ├── allocfree.c ├── bin.c ├── count.c ├── opti.c ├── rated.c ├── run-tests.sh ├── shortrule.c ├── simple.c ├── str.c └── test.h └── urlmatch.h /.gitignore: -------------------------------------------------------------------------------- 1 | *.[oa] 2 | test/* 3 | bench/bench 4 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 
26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 
64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 
101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. 
For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 
162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 
197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. 
Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. 
If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 
297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 
324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 
348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. 
If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 
416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. 
If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 
474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 
500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. 
If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. 
Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 
599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 
631 | 632 | <one line to give the program's name and a brief idea of what it does.> 633 | Copyright (C) <year> <name of author> 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published by 637 | the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see <http://www.gnu.org/licenses/>. 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | <http://www.gnu.org/licenses/>.
662 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all clean test 2 | 3 | CFLAGS += -Wall -Wextra 4 | 5 | PREFIX ?= /usr 6 | 7 | SRC = $(wildcard *.c) 8 | HDR = $(wildcard *.h) 9 | OBJ = $(SRC:.c=.o) 10 | NAME = liburlmatch.a 11 | 12 | all: $(NAME) 13 | 14 | $(NAME): $(OBJ) 15 | rm -f $(NAME) 16 | ar cru $(NAME) $(OBJ) 17 | ranlib $(NAME) 18 | 19 | $(OBJ): $(HDR) 20 | 21 | clean: 22 | $(MAKE) -C test clean 23 | rm -f $(OBJ) $(NAME) 24 | 25 | test: all 26 | $(MAKE) -C test 27 | 28 | install: all 29 | mkdir -p -m 755 $(DESTDIR)$(PREFIX)/include 30 | install -m 644 urlmatch.h $(DESTDIR)$(PREFIX)/include 31 | mkdir -p -m 755 $(DESTDIR)$(PREFIX)/lib 32 | install -m 644 $(NAME) $(DESTDIR)$(PREFIX)/lib 33 | -------------------------------------------------------------------------------- /README.asciidoc: -------------------------------------------------------------------------------- 1 | URL matcher lib 2 | =============== 3 | 4 | This is a small and fast C library duplicating the URL matching 5 | functionality of Opera. You might use it to implement ad blocking 6 | or similar. 7 | 8 | Given a list of patterns, such as 9 | 10 | ---- 11 | *facebook.com/* 12 | http*google-analytics.* 13 | http://foo.bat/this-annoying-image.jpeg 14 | ---- 15 | 16 | you can then match any connection attempt against the whole list, 17 | getting a yes/no answer back. 18 | 19 | Motivation 20 | ---------- 21 | 22 | One of the main components of Opera, the filtering system, supported 23 | white- and blacklists with wildcards. It was usable for more than just 24 | blocking ads, though it handled those well too. 25 | 26 | This is one such function that should never be relegated to Javascript 27 | (like Adblock browser extensions do). 
The average page makes close to a 28 | hundred connections, the list is traversed on each connection attempt, 29 | and common lists reach a few thousand entries. 30 | 31 | Turns out there wasn't any existing standalone pattern matching 32 | library. Regex is too slow (or in glibc's case, taking gigabytes of RAM), 33 | and wildcard functionality is essential. 34 | 35 | A simple function (like those you can find dozens of in the web) is 36 | included for benchmark comparison purposes. Currently this library is 37 | ~5x faster vs the simple function. This is not quite fast enough, so 38 | future optimizations will be coming. 39 | -------------------------------------------------------------------------------- /bench/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all clean 2 | 3 | CFLAGS += -Wall -Wextra 4 | CPPFLAGS += -I .. 5 | LDFLAGS += -lz 6 | 7 | SRC = $(wildcard *.c) 8 | OBJ = $(SRC:.c=.o) 9 | NAME = bench 10 | 11 | all: $(NAME) 12 | 13 | $(NAME): $(OBJ) ../liburlmatch.a 14 | $(CC) -o $(NAME) $(OBJ) $(CFLAGS) $(LDFLAGS) ../liburlmatch.a 15 | 16 | clean: 17 | rm -f $(OBJ) $(NAME) 18 | -------------------------------------------------------------------------------- /bench/bench.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "urlmatch.h" 10 | #include "lrtypes.h" 11 | 12 | static u32 urls = 1000 * 1000; 13 | static const u32 rules = 1500; 14 | 15 | static const char **ruling; 16 | static const char **urling; 17 | 18 | static regex_t regex; 19 | 20 | static char *genurl() { 21 | 22 | const u32 len = (rand() % 80) + 10; 23 | char *buf = calloc(len + 1, 1); 24 | 25 | u32 i; 26 | 27 | switch (rand() % 10) { 28 | case 0 ... 6: 29 | strcpy(buf, "http://"); 30 | break; 31 | case 7 ... 
/*
 * Generate one random benchmark rule.
 *
 * The rule is 5..14 characters long and consists of lowercase letters,
 * digits and '*' wildcards; two wildcards are never adjacent.
 *
 * Returns a freshly allocated, NUL-terminated string owned by the caller.
 * Exits the process if the allocation fails, instead of writing through
 * a NULL pointer.
 */
static char *genrule(void) {

	const size_t len = (size_t) (rand() % 10) + 5;
	char * const rule = calloc(len + 1, 1);
	if (!rule) {
		fprintf(stderr, "Out of memory\n");
		exit(1);
	}

	size_t i;
	for (i = 0; i < len; i++) {
		unsigned type = (unsigned) (rand() % 3);

		// Never emit "**": demote a second wildcard in a row to a digit.
		if (i && type == 2 && rule[i - 1] == '*')
			type = 1;

		switch (type) {
			case 0:
				rule[i] = 'a' + rand() % 26;
			break;
			case 1:
				rule[i] = '0' + rand() % 10;
			break;
			default:
				rule[i] = '*';
			break;
		}
	}

	// calloc() zeroed the buffer, so rule[len] is already the terminator.
	return rule;
}
132 | free(tmp); 133 | 134 | return ctx; 135 | } 136 | 137 | static void opti(const urlctx * const ctx) { 138 | 139 | u32 i, sum = 0; 140 | for (i = 0; i < urls; i++) { 141 | if (url_match(ctx, urling[i])) { 142 | sum++; 143 | } 144 | if (i % 10000 == 0) {printf("."); fflush(stdout);} 145 | } 146 | printf("\nTotal %u matches\n", sum); 147 | } 148 | 149 | static void reg_init() { 150 | 151 | u32 totlen = 1, i; 152 | for (i = 0; i < rules; i++) { 153 | totlen += strlen(ruling[i]) * 2 + 2; 154 | } 155 | 156 | char *tmp = calloc(totlen, 1); 157 | 158 | for (i = 0; i < rules; i++) { 159 | u32 j; 160 | 161 | for (j = 0; ruling[i][j]; j++) { 162 | if (ruling[i][j] != '*') { 163 | const char buffy[2] = { ruling[i][j], '\0' }; 164 | strcat(tmp, buffy); 165 | } else { 166 | const char buffy[3] = ".*"; 167 | strcat(tmp, buffy); 168 | } 169 | } 170 | 171 | if (i != rules - 1) strcat(tmp, "|"); 172 | } 173 | 174 | for (i = 1; i < totlen; i++) { 175 | if (tmp[i] == '*') 176 | tmp[i-1] = '.'; 177 | } 178 | 179 | int ret = regcomp(®ex, tmp, REG_EXTENDED | REG_NOSUB); 180 | if (ret) puts("Failed to compile regex"); 181 | 182 | free(tmp); 183 | } 184 | 185 | static void reg() { 186 | 187 | u32 i; 188 | for (i = 0; i < urls; i++) { 189 | regexec(®ex, urling[i], 0, NULL, 0); 190 | 191 | if (i % 10000 == 0) {printf("."); fflush(stdout);} 192 | } 193 | } 194 | 195 | int main(int argc, char **argv) { 196 | 197 | if (argc > 1) { 198 | urls = 1000 * atoi(argv[1]); 199 | } 200 | 201 | srand(42); 202 | 203 | printf("Generating %uk urls and %u rules.\n", urls / 1000, rules); 204 | gen(); 205 | 206 | struct timeval start, end; 207 | u32 ms, us; 208 | 209 | printf("Starting testing.\n\n"); 210 | gettimeofday(&start, NULL); 211 | simple(); 212 | gettimeofday(&end, NULL); 213 | 214 | ms = (end.tv_sec - start.tv_sec) * 1000; 215 | ms += (end.tv_usec - start.tv_usec) / 1000; 216 | if (!ms) ms = 1; 217 | printf("Simple backend took %u ms, or %.2f checks per millisecond.\n\n", 218 | ms, (float) urls 
/ ms); 219 | 220 | 221 | 222 | gettimeofday(&start, NULL); 223 | urlctx * ctx = opti_init(); 224 | gettimeofday(&end, NULL); 225 | 226 | ms = (end.tv_sec - start.tv_sec) * 1000; 227 | ms += (end.tv_usec - start.tv_usec) / 1000; 228 | if (!ms) ms = 1; 229 | printf("Optimized init took %u ms.\n", 230 | ms); 231 | 232 | // Yes yes, insecure mktemp. This is a bench. 233 | char name[] = "/tmp/urlmatch_benchXXXXXX"; 234 | mktemp(name); 235 | gettimeofday(&start, NULL); 236 | if (url_save_optimized(ctx, name)) puts("save failed"); 237 | gettimeofday(&end, NULL); 238 | url_free(ctx); 239 | 240 | ms = (end.tv_sec - start.tv_sec) * 1000; 241 | ms += (end.tv_usec - start.tv_usec) / 1000; 242 | if (!ms) ms = 1; 243 | printf("Optimized init, saving to binary file took %u ms.\n", 244 | ms); 245 | 246 | gettimeofday(&start, NULL); 247 | ctx = url_init_file(name); 248 | gettimeofday(&end, NULL); 249 | 250 | ms = (end.tv_sec - start.tv_sec) * 1000; 251 | ms += (end.tv_usec - start.tv_usec) / 1000; 252 | us = (end.tv_sec - start.tv_sec) * 1000000; 253 | us += (end.tv_usec - start.tv_usec); 254 | if (!ms) ms = 1; 255 | printf("Optimized init, read from binary file took %u ms (%u us).\n", 256 | ms, us); 257 | unlink(name); 258 | 259 | 260 | 261 | gettimeofday(&start, NULL); 262 | opti(ctx); 263 | gettimeofday(&end, NULL); 264 | url_free(ctx); 265 | 266 | ms = (end.tv_sec - start.tv_sec) * 1000; 267 | ms += (end.tv_usec - start.tv_usec) / 1000; 268 | if (!ms) ms = 1; 269 | printf("Optimized backend took %u ms, or %.2f checks per millisecond.\n\n", 270 | ms, (float) urls / ms); 271 | 272 | /* 273 | glibc regex uses a fuckton of RAM, causing this to die in just some dozen 274 | iterations. Only enable if you have a sane libc where calling regexec does 275 | not allocate memory. 
276 | */ 277 | #if 0 278 | gettimeofday(&start, NULL); 279 | reg_init(); 280 | gettimeofday(&end, NULL); 281 | 282 | ms = (end.tv_sec - start.tv_sec) * 1000; 283 | ms += (end.tv_usec - start.tv_usec) / 1000; 284 | if (!ms) ms = 1; 285 | printf("Regex init took %u ms.\n", 286 | ms); 287 | 288 | 289 | 290 | gettimeofday(&start, NULL); 291 | reg(); 292 | gettimeofday(&end, NULL); 293 | 294 | ms = end.tv_sec - start.tv_sec; 295 | ms += (end.tv_usec - start.tv_usec) / 1000; 296 | if (!ms) ms = 1; 297 | printf("Regex took %u ms, or %.2f checks per millisecond.\n", 298 | ms, (float) urls / ms); 299 | #endif 300 | return 0; 301 | } 302 | -------------------------------------------------------------------------------- /internal.h: -------------------------------------------------------------------------------- 1 | /* 2 | liburlmatch - a fast URL matcher 3 | Copyright (C) 2013 Lauri Kasanen 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU Affero General Public License as published by 7 | the Free Software Foundation, version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU Affero General Public License for more details. 13 | 14 | You should have received a copy of the GNU Affero General Public License 15 | along with this program. If not, see . 
16 | */ 17 | 18 | #ifndef INTERNAL_H 19 | #define INTERNAL_H 20 | 21 | #define _GNU_SOURCE 22 | 23 | #include "lrtypes.h" 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | // Let's help the compiler 32 | #if __GNUC__ >= 4 33 | 34 | #define PURE_FUNC __attribute__ ((pure)) 35 | #define NORETURN_FUNC __attribute__ ((noreturn)) 36 | #define CONST_FUNC __attribute__ ((const)) 37 | #define WUR_FUNC __attribute__ ((warn_unused_result)) 38 | #else // GNUC 39 | 40 | #define PURE_FUNC 41 | #define NORETURN_FUNC 42 | #define CONST_FUNC 43 | #define WUR_FUNC 44 | 45 | #endif // GNUC 46 | 47 | 48 | #pragma GCC visibility push(hidden) 49 | 50 | u32 countwilds(const char str[]) WUR_FUNC PURE_FUNC; 51 | const char *strrstr(const char hay[], const char needle[]) WUR_FUNC PURE_FUNC; 52 | static inline int suffixcmp(const char one[], const char two[]) WUR_FUNC PURE_FUNC; 53 | 54 | void *xcalloc(size_t nmemb, size_t size); 55 | void *xmalloc(size_t size); 56 | void die(const char s[]) NORETURN_FUNC; 57 | void swrite(const void *ptr, const size_t size, FILE *stream); 58 | void sread(void *ptr, const size_t size, FILE *stream); 59 | void getsuffix(const char str[], char suf[3]); 60 | static inline int wildprefix(const char str[]) WUR_FUNC PURE_FUNC; 61 | 62 | 63 | struct urlctx { 64 | struct prefix *pref; 65 | u16 count; 66 | 67 | char *storage; 68 | u32 storagelen; 69 | u32 used; 70 | }; 71 | 72 | struct prefix { 73 | struct suffix *suf; 74 | u16 count; 75 | 76 | char prefix[6]; 77 | u8 len; 78 | }; 79 | 80 | struct suffix { 81 | struct needle *need; 82 | u16 count; 83 | 84 | char suffix[3]; 85 | }; 86 | 87 | struct needle { 88 | const char *needle; 89 | u16 len; 90 | u16 wilds; 91 | u16 longest; 92 | u16 longlen; 93 | }; 94 | 95 | #define MAGIC "um1" 96 | 97 | void printctx(const struct urlctx *); 98 | int ctxcmp(const struct urlctx *, const struct urlctx *); 99 | void *poolalloc(struct urlctx *, u32 bytes) WUR_FUNC; 100 | 101 | // Inlines 
/*
 * Return non-zero when a '*' wildcard occurs within the first five bytes
 * of str (or before its terminator, if str is shorter), zero otherwise.
 *
 * Only the leading bytes are inspected. The length is never narrowed to
 * 16 bits, so very long strings are handled correctly: a length that is
 * a multiple of 65536 used to truncate to zero and hide the wildcard.
 */
static inline int wildprefix(const char str[]) {

	size_t i;
	for (i = 0; i < 5 && str[i]; i++) {
		if (str[i] == '*')
			return 1;
	}

	return 0;
}

/*
 * Compare two suffix keys, each one or two characters long.
 *
 * Returns 0 when they are considered equal and non-zero otherwise:
 *  - equal lengths: every character must be identical;
 *  - different lengths: the single-character key must equal the LAST
 *    character of the two-character key.
 *
 * Both arguments must hold at least one character, because the second
 * byte of each is read to determine its length.
 */
static inline int suffixcmp(const char one[], const char two[]) {

	const int len1 = one[1] ? 2 : 1;
	const int len2 = two[1] ? 2 : 1;

	if (len1 == len2) {
		if (len1 == 1)
			return one[0] != two[0];
		return one[1] != two[1] || one[0] != two[0];
	}

	if (len1 < len2) {
		// one is a single byte long: match it against two's last byte
		return one[0] != two[1];
	}

	// two is a single byte long: match it against one's last byte
	return one[1] != two[0];
}
/* Print s followed by a newline to stderr and exit with status 1. */
void die(const char s[]) {

	fprintf(stderr, "%s\n", s);
	exit(1);
}

/* calloc() wrapper: dies with "Out of memory" instead of returning NULL. */
void *xcalloc(size_t nmemb, size_t size) {

	void * const mem = calloc(nmemb, size);
	if (!mem) die("Out of memory");

	return mem;
}

/* malloc() wrapper: dies with "Out of memory" instead of returning NULL. */
void *xmalloc(size_t size) {

	void * const mem = malloc(size);
	if (!mem) die("Out of memory");

	return mem;
}

/* Write exactly size bytes from ptr to stream; dies on a short write. */
void swrite(const void * const ptr, const size_t size, FILE * const stream) {

	if (fwrite(ptr, size, 1, stream) != 1)
		die("Failed writing");
}

/* Read exactly size bytes from stream into ptr; dies on a short read. */
void sread(void * const ptr, const size_t size, FILE * const stream) {

	if (fread(ptr, size, 1, stream) != 1)
		die("Failed reading");
}

/*
 * Find the LAST occurrence of needle in hay.
 *
 * Returns a pointer into hay, or NULL when needle does not occur.
 * An empty needle matches at the end of hay, mirroring strstr(), which
 * matches an empty needle at the start. It used to make the search step
 * past hay's terminator.
 */
const char *strrstr(const char hay[], const char needle[]) {

	if (!*needle)
		return hay + strlen(hay);

	const char *last = NULL;
	const char *hit;

	// needle is non-empty, so every hit has at least one byte before the
	// terminator and hit + 1 stays inside the string.
	for (hit = strstr(hay, needle); hit; hit = strstr(hit + 1, needle))
		last = hit;

	return last;
}

/*
 * Derive the suffix key of a rule string.
 *
 * suf receives a NUL-terminated key of at most two characters:
 *  - ""   for an empty string (previously suf was left unwritten);
 *  - the string itself when it is one character long;
 *  - the last two characters when neither is '*';
 *  - the last character alone when only the second-to-last is '*';
 *  - "*"  when the last character is '*'.
 */
void getsuffix(const char str[], char suf[3]) {

	const size_t len = strlen(str);
	if (len == 0) {
		suf[0] = '\0';
		return;
	}

	if (len == 1) {
		suf[0] = str[0];
		suf[1] = '\0';
		return;
	}

	suf[0] = str[len - 2];
	suf[1] = str[len - 1];
	suf[2] = '\0';

	if (suf[0] == '*' && suf[1] != '*') {
		// "*x": only the final literal is usable as a key
		suf[0] = suf[1];
		suf[1] = '\0';
	} else if (suf[0] == '*' || suf[1] == '*') {
		// ends in a wildcard: the suffix can be anything
		suf[0] = '*';
		suf[1] = '\0';
	}
}
= &cursuf->need[n]; 169 | const struct needle * const curbneed = &curbsuf->need[n]; 170 | 171 | if (curneed->len != curbneed->len) 172 | cmperr("needle len"); 173 | if (curneed->wilds != curbneed->wilds) 174 | cmperr("needle wilds"); 175 | if (curneed->longest != curbneed->longest) 176 | cmperr("needle longest"); 177 | if (curneed->longlen != curbneed->longlen) 178 | cmperr("needle longlen"); 179 | if (strcmp(curneed->needle, curbneed->needle)) 180 | cmperr("needle"); 181 | } 182 | } 183 | } 184 | 185 | #undef cmperr 186 | 187 | return 0; 188 | } 189 | 190 | void *poolalloc(struct urlctx * const ctx, u32 bytes) { 191 | 192 | /* Everything we return is 64-bit aligned. 193 | 194 | This is guaranteed by relying on our base 195 | pointer being ok, and only giving out 196 | multiples of 8. */ 197 | 198 | while (bytes % 8 != 0) 199 | bytes++; 200 | 201 | if (ctx->used + bytes > ctx->storagelen) 202 | die("Storage OOM"); 203 | 204 | const u32 cur = ctx->used; 205 | ctx->used += bytes; 206 | 207 | return ctx->storage + cur; 208 | } 209 | -------------------------------------------------------------------------------- /opti.c: -------------------------------------------------------------------------------- 1 | /* 2 | liburlmatch - a fast URL matcher 3 | Copyright (C) 2013 Lauri Kasanen 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU Affero General Public License as published by 7 | the Free Software Foundation, version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU Affero General Public License for more details. 13 | 14 | You should have received a copy of the GNU Affero General Public License 15 | along with this program. If not, see . 
16 | */ 17 | 18 | #include "internal.h" 19 | #include "urlmatch.h" 20 | #include 21 | 22 | int url_save_optimized(const urlctx *ctx, const char file[]) { 23 | 24 | const int fd = open(file, O_WRONLY | O_CREAT, 0644); 25 | if (fd < 0) 26 | return 1; 27 | 28 | return url_save_optimized2(ctx, fd); 29 | } 30 | 31 | int url_save_optimized2(const urlctx *ctx, const int fd) { 32 | 33 | char *buf; 34 | size_t len; 35 | 36 | FILE *f = open_memstream(&buf, &len); 37 | if (!f) return 1; 38 | 39 | swrite(&ctx->count, 2, f); 40 | swrite(&ctx->storagelen, 4, f); 41 | 42 | u32 p, s, n; 43 | for (p = 0; p < ctx->count; p++) { 44 | const struct prefix * const curpref = &ctx->pref[p]; 45 | swrite(&curpref->count, 2, f); 46 | swrite(curpref->prefix, 5, f); 47 | swrite(&curpref->len, 1, f); 48 | 49 | for (s = 0; s < curpref->count; s++) { 50 | const struct suffix * const cursuf = &curpref->suf[s]; 51 | swrite(&cursuf->count, 2, f); 52 | swrite(cursuf->suffix, 2, f); 53 | 54 | for (n = 0; n < cursuf->count; n++) { 55 | const struct needle * const curneed = &cursuf->need[n]; 56 | swrite(&curneed->len, 2, f); 57 | swrite(&curneed->wilds, 2, f); 58 | swrite(&curneed->longest, 2, f); 59 | swrite(&curneed->longlen, 2, f); 60 | swrite(curneed->needle, curneed->len + 1, f); 61 | } 62 | } 63 | } 64 | 65 | fclose(f); 66 | 67 | // Cool, a buffer. Let's compress it. 
68 | u64 bound = compressBound(len); 69 | u8 *dest = xcalloc(bound, 1); 70 | if (compress2(dest, &bound, (u8 *) buf, len, 9) != Z_OK) return 2; 71 | 72 | free(buf); 73 | 74 | f = fdopen(fd, "w"); 75 | if (!f) return 1; 76 | 77 | swrite(MAGIC, 3, f); 78 | swrite(&len, sizeof(size_t), f); 79 | swrite(dest, bound, f); 80 | 81 | free(dest); 82 | fclose(f); 83 | return 0; 84 | } 85 | 86 | static int finalcheck(const char find[], const u32 len, 87 | const char hay[], const u32 haylen) { 88 | 89 | // This is the core of the simple check 90 | 91 | u32 i, h = 0; 92 | 93 | for (i = 0; i < len; i++) { 94 | if (find[i] != '*') { 95 | if (find[i] != hay[h]) 96 | return 0; 97 | h++; 98 | } else { 99 | // If multiple wildcards in a row, skip to the last 100 | while (find[i+1] == '*') i++; 101 | 102 | if (i == len - 1) 103 | return 1; 104 | 105 | // Wildcard, not last 106 | const char * const ender = strchrnul(&find[i + 1], '*'); 107 | const u32 dist = ender - &find[i + 1]; 108 | 109 | char piece[dist + 1]; 110 | memcpy(piece, &find[i + 1], dist); 111 | piece[dist] = '\0'; 112 | 113 | const char * const lastmatch = strrstr(&hay[h], piece); 114 | if (!lastmatch) 115 | return 0; 116 | 117 | // Is backtracking required? 
118 | const char * const firstmatch = strstr(&hay[h], piece); 119 | 120 | // The dist check is to make sure this is not a suffix search 121 | if (firstmatch != lastmatch && dist != len - i - 1) { 122 | const u32 move = firstmatch - &hay[h]; 123 | h += move; 124 | } else { 125 | const u32 move = lastmatch - &hay[h]; 126 | h += move; 127 | } 128 | } 129 | } 130 | 131 | // We ran out of needle but not hay 132 | if (h != haylen) return 0; 133 | 134 | return 1; 135 | 136 | } 137 | 138 | static void getsuffixlen(const char str[], char suf[3], const u32 len) { 139 | 140 | if (len == 1) { 141 | suf[0] = str[0]; 142 | suf[1] = '\0'; 143 | return; 144 | } 145 | 146 | suf[0] = str[len - 2]; 147 | suf[1] = str[len - 1]; 148 | suf[2] = '\0'; 149 | } 150 | 151 | int url_match(const urlctx * const ctx, const char haystack[]) { 152 | 153 | const u32 len = strlen(haystack); 154 | char suf[3], pref[6]; 155 | 156 | if (len < 1) return 0; 157 | getsuffixlen(haystack, suf, len); 158 | 159 | strncpy(pref, haystack, 5); 160 | pref[5] = '\0'; 161 | 162 | u32 p, s; 163 | 164 | // Find all applicable prefixes 165 | const u32 pmax = ctx->count; 166 | for (p = 0; p < pmax; p++) { 167 | const struct prefix * const curpref = &ctx->pref[p]; 168 | 169 | // Does this prefix match? 170 | if (curpref->prefix[0] != '*') { 171 | int ret = strncmp(pref, curpref->prefix, curpref->len); 172 | if (ret > 0) 173 | continue; 174 | if (ret < 0) 175 | break; 176 | } 177 | 178 | const u32 smax = curpref->count; 179 | for (s = 0; s < smax; s++) { 180 | const struct suffix * const cursuf = &curpref->suf[s]; 181 | 182 | // Does this suffix match? 183 | if (cursuf->suffix[0] != '*' && 184 | suffixcmp(suf, cursuf->suffix)) 185 | continue; 186 | 187 | // OK, we have to test all needles in this suffix. 
188 | u32 n; 189 | const u32 nmax = cursuf->count; 190 | for (n = 0; n < nmax; n++) { 191 | const struct needle * const curneed = &cursuf->need[n]; 192 | 193 | // First: no wildcards 194 | if (!curneed->wilds) { 195 | // Do the lengths match? 196 | if (len != curneed->len) 197 | continue; 198 | if (!strcmp(haystack, curneed->needle)) 199 | return 1; 200 | } else { 201 | // Is the longest streak in it? 202 | if (curneed->longlen) { 203 | if (curneed->longlen >= 4) { 204 | if (!memmem(haystack, len, 205 | curneed->needle + curneed->longest, 206 | curneed->longlen)) 207 | continue; 208 | } else { 209 | if (!memchr(haystack, 210 | curneed->needle[curneed->longest], 211 | len)) 212 | continue; 213 | } 214 | } 215 | 216 | // The prefix and suffix match, and it contains 217 | // the longest streak. Do the actual comparison. 218 | if (finalcheck(curneed->needle, curneed->len, 219 | haystack, len)) 220 | return 1; 221 | } 222 | } 223 | } 224 | } 225 | 226 | return 0; 227 | } 228 | 229 | void url_free(urlctx *ctx) { 230 | 231 | free(ctx->storage); 232 | free(ctx); 233 | } 234 | -------------------------------------------------------------------------------- /opti_init.c: -------------------------------------------------------------------------------- 1 | /* 2 | liburlmatch - a fast URL matcher 3 | Copyright (C) 2013 Lauri Kasanen 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU Affero General Public License as published by 7 | the Free Software Foundation, version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU Affero General Public License for more details. 13 | 14 | You should have received a copy of the GNU Affero General Public License 15 | along with this program. If not, see . 
16 | */ 17 | 18 | #include "internal.h" 19 | #include "urlmatch.h" 20 | #include 21 | 22 | static urlctx *initbin(FILE * const f, const u32 inlen) { 23 | 24 | size_t len; 25 | sread(&len, sizeof(size_t), f); 26 | 27 | u8 * const src = xcalloc(inlen, 1); 28 | u8 *buf = xcalloc(len, 1); 29 | u8 * const origbuf = buf; 30 | 31 | sread(src, inlen, f); 32 | if (uncompress(buf, &len, src, inlen) != Z_OK) return NULL; 33 | free(src); 34 | 35 | // Cool, unpacked. Read it. 36 | urlctx * const out = xcalloc(sizeof(urlctx), 1); 37 | 38 | memcpy(&out->count, buf, 2); 39 | buf += 2; 40 | 41 | memcpy(&out->storagelen, buf, 4); 42 | buf += 4; 43 | 44 | out->storage = xcalloc(out->storagelen, 1); 45 | out->pref = poolalloc(out, sizeof(struct prefix) * out->count); 46 | u32 p, s, n; 47 | 48 | for (p = 0; p < out->count; p++) { 49 | struct prefix * const curpref = &out->pref[p]; 50 | 51 | memcpy(&curpref->count, buf, 2); 52 | buf += 2; 53 | memcpy(curpref->prefix, buf, 5); 54 | buf += 5; 55 | curpref->len = *buf; 56 | buf++; 57 | 58 | curpref->suf = poolalloc(out, sizeof(struct suffix) * curpref->count); 59 | 60 | for (s = 0; s < curpref->count; s++) { 61 | struct suffix * const cursuf = &curpref->suf[s]; 62 | 63 | memcpy(&cursuf->count, buf, 2); 64 | buf += 2; 65 | memcpy(cursuf->suffix, buf, 2); 66 | buf += 2; 67 | 68 | cursuf->need = poolalloc(out, sizeof(struct needle) * cursuf->count); 69 | 70 | for (n = 0; n < cursuf->count; n++) { 71 | struct needle * const curneed = &cursuf->need[n]; 72 | 73 | memcpy(&curneed->len, buf, 2); 74 | buf += 2; 75 | memcpy(&curneed->wilds, buf, 2); 76 | buf += 2; 77 | memcpy(&curneed->longest, buf, 2); 78 | buf += 2; 79 | memcpy(&curneed->longlen, buf, 2); 80 | buf += 2; 81 | 82 | curneed->needle = poolalloc(out, curneed->len + 1); 83 | 84 | memcpy((char *) curneed->needle, buf, curneed->len + 1); 85 | buf += curneed->len + 1; 86 | } 87 | } 88 | } 89 | 90 | free(origbuf); 91 | return out; 92 | } 93 | 94 | urlctx *url_init_file(const char file[]) { 
95 | 96 | const int fd = open(file, O_RDONLY); 97 | if (fd < 0) 98 | return NULL; 99 | 100 | return url_init_file2(fd); 101 | } 102 | 103 | urlctx *url_init_file2(const int fd) { 104 | 105 | FILE * const f = fdopen(fd, "r"); 106 | if (!f) return NULL; 107 | 108 | fseek(f, 0, SEEK_END); 109 | const long len = ftell(f); 110 | rewind(f); 111 | 112 | char buf[4] = { 0 }; 113 | fread(buf, 3, 1, f); 114 | 115 | urlctx *out = NULL; 116 | 117 | // Binary format 118 | if (!strcmp(buf, MAGIC)) { 119 | out = initbin(f, len - 3 - sizeof(size_t)); 120 | } else { // Text format 121 | rewind(f); 122 | 123 | char *tmp = xcalloc(len, 1); 124 | if (fread(tmp, len, 1, f) != 1) die("Failed reading"); 125 | out = url_init(tmp); 126 | free(tmp); 127 | } 128 | 129 | fclose(f); 130 | return out; 131 | } 132 | 133 | static int wildpfxcmp(const char a[], const char b[]) { 134 | 135 | const int awild = wildprefix(a); 136 | const int bwild = wildprefix(b); 137 | 138 | if (!awild && !bwild) { 139 | return strncmp(a, b, 5); 140 | } else if (awild && !bwild) { 141 | return strncmp("*", b, 5); 142 | } else if (!awild && bwild) { 143 | return strncmp(a, "*", 5); 144 | } 145 | 146 | return 0; 147 | } 148 | 149 | static void preparepfx(char str[]) { 150 | 151 | if (!wildprefix(str)) 152 | return; 153 | 154 | memset(str + 1, '\0', 4); 155 | str[0] = '*'; 156 | } 157 | 158 | static int cstrcmp(const void * const p1, const void * const p2) { 159 | 160 | const char * const a = * (char * const *) p1; 161 | const char * const b = * (char * const *) p2; 162 | 163 | int ret = wildpfxcmp(a, b); 164 | if (ret) return ret; 165 | 166 | // Secondary sort by the suffix 167 | char sufa[3] = { 0 }; 168 | char sufb[3] = { 0 }; 169 | 170 | getsuffix(a, sufa); 171 | getsuffix(b, sufb); 172 | 173 | return strcmp(sufa, sufb); 174 | } 175 | 176 | static void calclongest(const char needle[], const u16 len, const u16 wilds, 177 | u16 * const longest, u16 * const longlen) { 178 | 179 | // Easy path 180 | if (wilds == 1) { 
181 | const char *ptr = strchr(needle, '*'); 182 | const u16 pos = ptr - needle; 183 | const u16 half = len / 2; 184 | 185 | if (pos < half) { 186 | *longlen = len - pos - 1; 187 | *longest = pos + 1; 188 | } else { 189 | *longlen = pos; 190 | *longest = 0; 191 | } 192 | } else { 193 | u16 max = 0; 194 | u16 maxlen = 0; 195 | 196 | const char *ptr = needle; 197 | while (*ptr) { 198 | const char * const next = strchrnul(ptr, '*'); 199 | const u16 thislen = next - ptr; 200 | 201 | if (maxlen < thislen) { 202 | maxlen = thislen; 203 | max = ptr - needle; 204 | } 205 | 206 | if (!*next) break; 207 | ptr = next + 1; 208 | } 209 | 210 | *longest = max; 211 | *longlen = maxlen; 212 | } 213 | } 214 | 215 | static void addneedle(urlctx * const ctx, struct needle * const to, const char from[]) { 216 | 217 | const u32 len = strlen(from); 218 | to->needle = poolalloc(ctx, len + 1); 219 | memcpy((char *) to->needle, from, len + 1); 220 | 221 | to->len = len; 222 | to->wilds = countwilds(from); 223 | 224 | if (to->wilds) 225 | calclongest(from, to->len, to->wilds, &to->longest, &to->longlen); 226 | } 227 | 228 | urlctx *url_init(const char contents[]) { 229 | 230 | u32 lines = 1; 231 | const char *ptr = contents; 232 | const u32 contentlen = strlen(contents); 233 | const char * const endbyte = ptr + contentlen; 234 | for (; *ptr; ptr++) { 235 | if (*ptr == '\n') lines++; 236 | } 237 | 238 | char **outlines = xcalloc(lines, sizeof(char *)); 239 | const u32 origlines = lines; 240 | 241 | // Copy each pattern line to its own space, and optimize on the way 242 | ptr = contents; 243 | u32 i = 0, j; 244 | while (1) { 245 | const char * const end = strchrnul(ptr, '\n'); 246 | const u32 len = end - ptr; 247 | 248 | if (len < 1) { 249 | ptr = end + 1; 250 | if (ptr >= endbyte) { i--; break; } 251 | continue; 252 | } 253 | 254 | char tmp[len + 1]; 255 | tmp[len] = '\0'; 256 | memcpy(tmp, ptr, len); 257 | 258 | outlines[i] = xcalloc(len + 1, 1); 259 | 260 | u32 p, o; 261 | outlines[i][0] = 
tmp[0]; 262 | for (p = 1, o = 1; p < len; p++) { 263 | if (tmp[p - 1] == '*' && tmp[p] == '*') { 264 | continue; 265 | } 266 | outlines[i][o] = tmp[p]; 267 | 268 | o++; 269 | } 270 | 271 | if (!*end) break; 272 | ptr = end + 1; 273 | i++; 274 | } 275 | 276 | lines = i + 1; 277 | qsort(outlines, lines, sizeof(char *), cstrcmp); 278 | 279 | urlctx * const out = xcalloc(sizeof(urlctx), 1); 280 | // The theoretical maximum amount needed 281 | out->storagelen = contentlen + 1 + 282 | lines * (sizeof(struct suffix) + 283 | sizeof(struct needle) + 284 | sizeof(struct prefix) + 8); 285 | out->storage = xcalloc(out->storagelen, 1); 286 | 287 | // How many prefixes do we have? 288 | u32 prefixes = 1; 289 | for (i = 1; i < lines; i++) { 290 | if (strncmp(outlines[i - 1], outlines[i], 5) && 291 | (!wildprefix(outlines[i - 1]) || !wildprefix(outlines[i]))) 292 | prefixes++; 293 | } 294 | 295 | out->count = prefixes; 296 | out->pref = poolalloc(out, sizeof(struct prefix) * prefixes); 297 | 298 | // Add each prefix 299 | prefixes = 1; 300 | strncpy(out->pref[0].prefix, outlines[0], 5); 301 | preparepfx(out->pref[0].prefix); 302 | out->pref[0].len = strlen(out->pref[0].prefix); 303 | for (i = 1; i < lines; i++) { 304 | if (wildpfxcmp(outlines[i - 1], outlines[i])) { 305 | strncpy(out->pref[prefixes].prefix, outlines[i], 5); 306 | 307 | preparepfx(out->pref[prefixes].prefix); 308 | out->pref[prefixes].len = strlen(out->pref[prefixes].prefix); 309 | 310 | prefixes++; 311 | } 312 | } 313 | 314 | // For each prefix, how many suffixes are there? 
315 | for (i = 0; i < out->count; i++) { 316 | 317 | struct prefix * const curpref = &out->pref[i]; 318 | 319 | u32 suffixes = 0; 320 | char prevsuf[3] = { 0 }; 321 | for (j = 0; j < lines; j++) { 322 | const int ret = wildpfxcmp(curpref->prefix, outlines[j]); 323 | 324 | if (ret > 0) continue; 325 | if (ret < 0) break; 326 | 327 | char suf[3]; 328 | getsuffix(outlines[j], suf); 329 | if (strcmp(prevsuf, suf)) suffixes++; 330 | memcpy(prevsuf, suf, 3); 331 | } 332 | 333 | curpref->suf = poolalloc(out, sizeof(struct suffix) * suffixes); 334 | curpref->count = suffixes; 335 | 336 | // For each suffix, how many needles do we have? 337 | suffixes = 0; 338 | prevsuf[0] = prevsuf[1] = 0; 339 | for (j = 0; j < lines; j++) { 340 | const int ret = wildpfxcmp(curpref->prefix, outlines[j]); 341 | 342 | if (ret > 0) continue; 343 | if (ret < 0) break; 344 | 345 | char suf[3]; 346 | getsuffix(outlines[j], suf); 347 | if (strcmp(prevsuf, suf)) { 348 | curpref->suf[suffixes].count = 1; 349 | memcpy(curpref->suf[suffixes].suffix, suf, 3); 350 | 351 | suffixes++; 352 | } else { 353 | curpref->suf[suffixes - 1].count++; 354 | } 355 | memcpy(prevsuf, suf, 3); 356 | } 357 | 358 | // Allocate the needle counts 359 | for (j = 0; j < curpref->count; j++) { 360 | curpref->suf[j].need = poolalloc(out, sizeof(struct needle) * 361 | curpref->suf[j].count); 362 | } 363 | 364 | // For each suffix, save the needles 365 | suffixes = 0; 366 | prevsuf[0] = prevsuf[1] = 0; 367 | for (j = 0; j < lines; j++) { 368 | const int ret = wildpfxcmp(curpref->prefix, outlines[j]); 369 | 370 | if (ret > 0) continue; 371 | if (ret < 0) break; 372 | 373 | char suf[3]; 374 | getsuffix(outlines[j], suf); 375 | if (strcmp(prevsuf, suf)) { 376 | struct suffix * const cursuf = &curpref->suf[suffixes]; 377 | 378 | cursuf->count = 1; 379 | memcpy(cursuf->suffix, suf, 3); 380 | addneedle(out, &cursuf->need[0], outlines[j]); 381 | suffixes++; 382 | } else { 383 | struct suffix * const cursuf = &curpref->suf[suffixes - 
1]; 384 | 385 | addneedle(out, &cursuf->need[cursuf->count], 386 | outlines[j]); 387 | cursuf->count++; 388 | } 389 | memcpy(prevsuf, suf, 3); 390 | } 391 | } 392 | 393 | for (i = 0; i < origlines; i++) free(outlines[i]); 394 | free(outlines); 395 | 396 | // Refresh storage size, so that binary save + load doesn't waste space 397 | out->storagelen = out->used; 398 | 399 | return out; 400 | } 401 | -------------------------------------------------------------------------------- /rated.c: -------------------------------------------------------------------------------- 1 | /* 2 | liburlmatch - a fast URL matcher 3 | Copyright (C) 2013 Lauri Kasanen 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU Affero General Public License as published by 7 | the Free Software Foundation, version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU Affero General Public License for more details. 13 | 14 | You should have received a copy of the GNU Affero General Public License 15 | along with this program. If not, see . 16 | */ 17 | 18 | #include "internal.h" 19 | #include "urlmatch.h" 20 | 21 | #include 22 | 23 | static u32 wordlen(const char *start) { 24 | 25 | const char * const orig = start; 26 | 27 | while (!isspace(*start) && *start) start++; 28 | 29 | return start - orig; 30 | } 31 | 32 | static const char *nextword(const char *ptr) { 33 | 34 | while (isspace(*ptr) && *ptr) ptr++; 35 | 36 | return ptr; 37 | } 38 | 39 | int ratedsearch(const char needle[], const char haystack[]) { 40 | 41 | // For each source word, if it's present in haystack, increment score. 42 | // IOW, a simple google-like search. 
43 | const u32 tmplen = 320; 44 | char tmp[tmplen]; 45 | 46 | const char *cur = nextword(needle); 47 | u32 wlen = wordlen(cur); 48 | u32 score = 0; 49 | 50 | while (*cur) { 51 | if (wlen >= tmplen) 52 | return -1; 53 | memcpy(tmp, cur, wlen); 54 | tmp[wlen] = '\0'; 55 | 56 | if (strcasestr(haystack, tmp)) 57 | score++; 58 | 59 | cur += wlen; 60 | cur = nextword(cur); 61 | wlen = wordlen(cur); 62 | } 63 | 64 | return score; 65 | } 66 | -------------------------------------------------------------------------------- /simple.c: -------------------------------------------------------------------------------- 1 | /* 2 | liburlmatch - a fast URL matcher 3 | Copyright (C) 2013 Lauri Kasanen 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU Affero General Public License as published by 7 | the Free Software Foundation, version 3 of the License. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU Affero General Public License for more details. 13 | 14 | You should have received a copy of the GNU Affero General Public License 15 | along with this program. If not, see . 
16 | */ 17 | 18 | #include "urlmatch.h" 19 | #include "internal.h" 20 | 21 | int url_simplematch(const char find[], const char hay[]) { 22 | 23 | const u32 wilds = countwilds(find); 24 | 25 | // Easiest path: no wildcards 26 | if (!wilds) { 27 | return strcmp(find, hay) == 0; 28 | } 29 | 30 | const u32 len = strlen(find); 31 | u32 i, h = 0; 32 | 33 | for (i = 0; i < len; i++) { 34 | if (find[i] != '*') { 35 | if (find[i] != hay[h]) 36 | return 0; 37 | h++; 38 | } else { 39 | // If multiple wildcards in a row, skip to the last 40 | while (find[i+1] == '*') i++; 41 | 42 | if (i >= len - 1) 43 | return 1; 44 | 45 | // Wildcard, not last 46 | const char * const ender = strchrnul(&find[i + 1], '*'); 47 | const u32 dist = ender - &find[i + 1]; 48 | 49 | char piece[dist + 1]; 50 | memcpy(piece, &find[i + 1], dist); 51 | piece[dist] = '\0'; 52 | 53 | const char * const lastmatch = strrstr(&hay[h], piece); 54 | if (!lastmatch) 55 | return 0; 56 | 57 | // Is backtracking required? 58 | const char * const firstmatch = strstr(&hay[h], piece); 59 | 60 | // The dist check is to make sure this is not a suffix search 61 | if (firstmatch != lastmatch && dist != len - i - 1) { 62 | const u32 move = firstmatch - &hay[h]; 63 | h += move; 64 | } else { 65 | const u32 move = lastmatch - &hay[h]; 66 | h += move; 67 | } 68 | } 69 | } 70 | 71 | // We ran out of needle but not hay 72 | if (h != strlen(hay)) return 0; 73 | 74 | return 1; 75 | } 76 | -------------------------------------------------------------------------------- /test/Makefile: -------------------------------------------------------------------------------- 1 | SRC = $(wildcard *.c) 2 | TARGETS = $(SRC:.c=) 3 | 4 | CFLAGS += -Wall -Wextra -g 5 | CPPFLAGS += -I .. 
6 | LDFLAGS += -lz 7 | 8 | .PHONY: all clean 9 | 10 | all: $(TARGETS) 11 | @./run-tests.sh $(TARGETS) 12 | 13 | $(TARGETS): ../liburlmatch.a 14 | 15 | clean: 16 | rm -f *.o $(TARGETS) 17 | -------------------------------------------------------------------------------- /test/allocfree.c: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include "internal.h" 3 | 4 | int main() { 5 | 6 | const char pat[] = 7 | "http://moi\n" 8 | "https://katti\n" 9 | "moido\n" 10 | "http://hoi\n" 11 | "http://google*"; 12 | 13 | urlctx *ctx = url_init(pat); 14 | url_free(ctx); 15 | 16 | const char pat2[] = 17 | "http://moi\n" 18 | "https://katti\n" 19 | "moido\n" 20 | "http://hoi\n" 21 | "http://google*\n\n\n"; 22 | 23 | ctx = url_init(pat2); 24 | url_free(ctx); 25 | 26 | const char pat3[] = 27 | "http://moi\n" 28 | "https://katti\n\n\n" 29 | "moido\n\n" 30 | "http://hoi\n" 31 | "http://google*\n\n"; 32 | 33 | ctx = url_init(pat3); 34 | printctx(ctx); 35 | url_free(ctx); 36 | 37 | return 0; 38 | } 39 | -------------------------------------------------------------------------------- /test/bin.c: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include "internal.h" 3 | #include 4 | 5 | int main() { 6 | 7 | /* Repeatedly save and load the context, then compare it to the first one. */ 8 | 9 | urlctx *ctx = url_init( 10 | "http://*gooogle*\n" 11 | "ftp://fooo\n" 12 | "*adwords\n" 13 | "http*//*.php"); 14 | 15 | // Yes yes, insecure mktemp. This is a unit test. 
16 | char name[] = "/tmp/bintestXXXXXX"; 17 | mktemp(name); 18 | 19 | if (url_save_optimized(ctx, name)) fail("save failed\n"); 20 | 21 | u32 i; 22 | urlctx *tmp; 23 | for (i = 0; i < 20; i++) { 24 | tmp = url_init_file(name); 25 | if (!tmp) fail("load failed\n"); 26 | if (url_save_optimized(tmp, name)) fail("save failed\n"); 27 | url_free(tmp); 28 | } 29 | tmp = url_init_file(name); 30 | 31 | if (ctxcmp(ctx, tmp)) fail("compare failed\n"); 32 | 33 | url_free(ctx); 34 | url_free(tmp); 35 | unlink(name); 36 | return 0; 37 | } 38 | -------------------------------------------------------------------------------- /test/count.c: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include "internal.h" 3 | 4 | void test(const char str[], const u32 result) { 5 | 6 | u32 foo = countwilds(str); 7 | 8 | if (foo != result) 9 | fail("Got %u expected %u for %s\n", foo, result, str); 10 | } 11 | 12 | int main() { 13 | 14 | test("", 0); 15 | test("ddd", 0); 16 | test("dgfdsfsdgd", 0); 17 | 18 | test("*", 1); 19 | test("**", 2); 20 | test("***", 3); 21 | test("****", 4); 22 | 23 | test("*foo*bar", 2); 24 | test("*foobar*", 2); 25 | 26 | return 0; 27 | } 28 | -------------------------------------------------------------------------------- /test/opti.c: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | void test(const char needle[], const char hay[], const int res) { 4 | 5 | urlctx *ctx = url_init(needle); 6 | 7 | if (url_match(ctx, hay) != res) 8 | fail("%s in %s, expected %u\n", needle, hay, res); 9 | 10 | url_free(ctx); 11 | } 12 | 13 | int main() { 14 | 15 | test("foo", "bar", 0); 16 | test("foo", "fo", 0); 17 | 18 | test("foo", "foo", 1); 19 | test("foo", "foofoo", 0); 20 | test("foo", "barfoo", 0); 21 | test("foo", "foofoobar", 0); 22 | 23 | test("*", "ff", 1); 24 | test("*", "gdfgfd", 1); 25 | test("*", "*g****", 1); 26 | test("*", "*", 1); 27 | 28 | 
/*
 * Check that ratedsearch(needle, hay) returns exactly res; exits through
 * fail() otherwise.
 */
static void test(const char needle[], const char hay[], const int res) {

	const int ret = ratedsearch(needle, hay);

	// Both values are ints (ratedsearch may return -1), so use %d for each
	if (ret != res)
		fail("'%s' in '%s', expected %d got %d\n", needle, hay, res, ret);
}
#!/bin/bash
# Runs every test binary given on the command line and prints a summary.
# Tests starting with x are expected to fail (exit status 1).
# Exits non-zero if any test did not behave as expected.


export LANG=C

GREEN="$(echo -e '\033[1;32m')"
YELLOW="$(echo -e '\033[0;33m')"
RED="$(echo -e '\033[1;31m')"
NORMAL="$(echo -e '\033[0;39m')"

success=0
fail=0

[ "$#" -lt 1 ] && exit

for bin in "$@"; do
	[ ! -f "$bin" ] && continue
	[ ! -x "$bin" ] && continue

	test=$bin
	log=${test}.log

	ret=0
	case $test in x*) ret=1 ;; esac

	echo -n "Running test $test... "
	"./$test" > "$log"
	if [ $? -ne $ret ]; then
		fail=$((fail + 1))
		echo "${RED}Failed $NORMAL"
	else
		success=$((success + 1))
		echo
		rm -f "$log"
	fi

	# If empty, remove
	[ ! -s "$log" ] && rm -f "$log"
done

echo

total=$((fail + success))

# Nothing runnable was passed in; avoid dividing by zero below
if [ "$total" -eq 0 ]; then
	echo "${YELLOW} No tests were run${NORMAL}"
	exit 0
fi

percentage=$(awk "BEGIN{print $success/$total * 100}")
percentage=$(printf '%.2f' "$percentage")

num=${percentage//.*/}

if [ "$fail" -eq 0 ]; then
	echo "$GREEN All tests passed!"
elif [ "$num" -ge 60 ]; then
	echo "$YELLOW $percentage% passed, $fail/$total fails"
else
	echo "$RED $percentage% passed, $fail/$total fails"
fi

echo "$NORMAL"

# Let make (and CI) see failures
[ "$fail" -eq 0 ]
"fff kkk foo", 1); 48 | 49 | test("f*", "foo", 1); 50 | test("f*f", "foof", 1); 51 | test("f*f", "ffffooffff", 1); 52 | test("f*f", "foo", 0); 53 | test("fkilla*", "foo", 0); 54 | test("fkilla*", "fkillyyy", 0); 55 | test("fkilla*", "fkilla", 1); 56 | test("fkilla*", "fkillamogfgf", 1); 57 | 58 | 59 | return 0; 60 | } 61 | -------------------------------------------------------------------------------- /test/str.c: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | #include "internal.h" 3 | 4 | static void test(const char hay[], const char needle[], const u32 pos) { 5 | 6 | const char * const ptr = strrstr(hay, needle); 7 | const char * const exists = strstr(hay, needle); 8 | 9 | if (!exists) { 10 | if (ptr) 11 | fail("False positive\n"); 12 | return; 13 | } 14 | 15 | const u32 tmp = ptr - hay; 16 | if (tmp != pos) 17 | fail("%s in %s, wanted %u got %u\n", 18 | needle, hay, pos, tmp); 19 | } 20 | 21 | static void suf(const char one[], const char two[], const u32 nomatch) { 22 | const u32 res = suffixcmp(one, two); 23 | 24 | if (res != nomatch) 25 | fail("suffixcmp %s %s got %u expected %u\n", 26 | one, two, res, nomatch); 27 | } 28 | 29 | int main() { 30 | 31 | test("foo", "bar", 0); 32 | test("foo", "fo", 0); 33 | 34 | test("foo", "foo", 0); 35 | test("foo", "foofoo", 3); 36 | test("foo", "barfoo", 3); 37 | test("foo", "foofoobar", 3); 38 | 39 | suf("aa", "ab", 1); 40 | suf("bb", "ab", 1); 41 | suf("a", "ab", 1); 42 | suf("aa", "b", 1); 43 | 44 | suf("aa", "aa", 0); 45 | suf("bb", "bb", 0); 46 | suf("aa", "a", 0); 47 | suf("a", "aa", 0); 48 | 49 | return 0; 50 | } 51 | -------------------------------------------------------------------------------- /test/test.h: -------------------------------------------------------------------------------- 1 | #ifndef TEST_H 2 | #define TEST_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "lrtypes.h" 9 | #include "urlmatch.h" 10 | 11 | static inline void 
#ifndef URLMATCH_H
#define URLMATCH_H

#ifdef __cplusplus
extern "C" {
#endif

// Let's help the compiler
#if defined(__GNUC__) && __GNUC__ >= 4

#define PURE_FUNC __attribute__ ((pure))
#define NORETURN_FUNC __attribute__ ((noreturn))
#define CONST_FUNC __attribute__ ((const))
#define WUR_FUNC __attribute__ ((warn_unused_result))
#define NONNULL(A) __attribute__ ((nonnull (A)))

#else // GNUC

#define PURE_FUNC
#define NORETURN_FUNC
#define CONST_FUNC
#define WUR_FUNC
// Must stay function-like: it is used as NONNULL(1) below
#define NONNULL(A)

#endif // GNUC

// Returns 1 if haystack matches pattern, 0 otherwise.
int url_simplematch(const char pattern[], const char haystack[]) WUR_FUNC PURE_FUNC;

/* These functions initialize the optimized pattern matcher.
 * _init takes a char array of patterns, one per line.
 * _init_file takes a filename, either a text file containing one pattern per line,
 * or an optimized binary file as saved by _save_optimized.
 * _init_file2 is the same, but takes an open file descriptor.
 *
 * On error they return NULL. */
typedef struct urlctx urlctx;
urlctx *url_init_file(const char file[]) WUR_FUNC;
urlctx *url_init_file2(const int fd) WUR_FUNC;
urlctx *url_init(const char contents[]) WUR_FUNC;

// Save an optimized binary file for faster loading later. Returns 0 on success.
int url_save_optimized(const urlctx *ctx, const char file[]) WUR_FUNC NONNULL(1);
int url_save_optimized2(const urlctx *ctx, const int fd) WUR_FUNC NONNULL(1);

/* Returns 1 if haystack matches the optimized pattern, 0 otherwise.
 *
 * It's safe to call from multiple threads at once, with the same context. */
int url_match(const urlctx *ctx, const char haystack[]) WUR_FUNC PURE_FUNC NONNULL(1);

// Frees this context.
void url_free(urlctx *ctx) NONNULL(1);

/* Auxiliary function for e.g. searching bookmarks
 *
 * Returns the match score, higher the better. -1 is returned on error. */
int ratedsearch(const char needle[], const char haystack[]) WUR_FUNC PURE_FUNC;

#undef PURE_FUNC
#undef NORETURN_FUNC
#undef CONST_FUNC
#undef WUR_FUNC
#undef NONNULL

#ifdef __cplusplus
} // extern C
#endif

#endif