├── .gitignore
├── COPYING
├── Makefile
├── README.asciidoc
├── bench
│   ├── Makefile
│   └── bench.c
├── internal.h
├── lrtypes.h
├── misc.c
├── opti.c
├── opti_init.c
├── rated.c
├── simple.c
├── test
│   ├── Makefile
│   ├── allocfree.c
│   ├── bin.c
│   ├── count.c
│   ├── opti.c
│   ├── rated.c
│   ├── run-tests.sh
│   ├── shortrule.c
│   ├── simple.c
│   ├── str.c
│   └── test.h
└── urlmatch.h
/.gitignore:
--------------------------------------------------------------------------------
1 | *.[oa]
2 | test/*
3 | bench/bench
4 |
--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
1 | GNU AFFERO GENERAL PUBLIC LICENSE
2 | Version 3, 19 November 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU Affero General Public License is a free, copyleft license for
11 | software and other kinds of works, specifically designed to ensure
12 | cooperation with the community in the case of network server software.
13 |
14 | The licenses for most software and other practical works are designed
15 | to take away your freedom to share and change the works. By contrast,
16 | our General Public Licenses are intended to guarantee your freedom to
17 | share and change all versions of a program--to make sure it remains free
18 | software for all its users.
19 |
20 | When we speak of free software, we are referring to freedom, not
21 | price. Our General Public Licenses are designed to make sure that you
22 | have the freedom to distribute copies of free software (and charge for
23 | them if you wish), that you receive source code or can get it if you
24 | want it, that you can change the software or use pieces of it in new
25 | free programs, and that you know you can do these things.
26 |
27 | Developers that use our General Public Licenses protect your rights
28 | with two steps: (1) assert copyright on the software, and (2) offer
29 | you this License which gives you legal permission to copy, distribute
30 | and/or modify the software.
31 |
32 | A secondary benefit of defending all users' freedom is that
33 | improvements made in alternate versions of the program, if they
34 | receive widespread use, become available for other developers to
35 | incorporate. Many developers of free software are heartened and
36 | encouraged by the resulting cooperation. However, in the case of
37 | software used on network servers, this result may fail to come about.
38 | The GNU General Public License permits making a modified version and
39 | letting the public access it on a server without ever releasing its
40 | source code to the public.
41 |
42 | The GNU Affero General Public License is designed specifically to
43 | ensure that, in such cases, the modified source code becomes available
44 | to the community. It requires the operator of a network server to
45 | provide the source code of the modified version running there to the
46 | users of that server. Therefore, public use of a modified version, on
47 | a publicly accessible server, gives the public access to the source
48 | code of the modified version.
49 |
50 | An older license, called the Affero General Public License and
51 | published by Affero, was designed to accomplish similar goals. This is
52 | a different license, not a version of the Affero GPL, but Affero has
53 | released a new version of the Affero GPL which permits relicensing under
54 | this license.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | TERMS AND CONDITIONS
60 |
61 | 0. Definitions.
62 |
63 | "This License" refers to version 3 of the GNU Affero General Public License.
64 |
65 | "Copyright" also means copyright-like laws that apply to other kinds of
66 | works, such as semiconductor masks.
67 |
68 | "The Program" refers to any copyrightable work licensed under this
69 | License. Each licensee is addressed as "you". "Licensees" and
70 | "recipients" may be individuals or organizations.
71 |
72 | To "modify" a work means to copy from or adapt all or part of the work
73 | in a fashion requiring copyright permission, other than the making of an
74 | exact copy. The resulting work is called a "modified version" of the
75 | earlier work or a work "based on" the earlier work.
76 |
77 | A "covered work" means either the unmodified Program or a work based
78 | on the Program.
79 |
80 | To "propagate" a work means to do anything with it that, without
81 | permission, would make you directly or secondarily liable for
82 | infringement under applicable copyright law, except executing it on a
83 | computer or modifying a private copy. Propagation includes copying,
84 | distribution (with or without modification), making available to the
85 | public, and in some countries other activities as well.
86 |
87 | To "convey" a work means any kind of propagation that enables other
88 | parties to make or receive copies. Mere interaction with a user through
89 | a computer network, with no transfer of a copy, is not conveying.
90 |
91 | An interactive user interface displays "Appropriate Legal Notices"
92 | to the extent that it includes a convenient and prominently visible
93 | feature that (1) displays an appropriate copyright notice, and (2)
94 | tells the user that there is no warranty for the work (except to the
95 | extent that warranties are provided), that licensees may convey the
96 | work under this License, and how to view a copy of this License. If
97 | the interface presents a list of user commands or options, such as a
98 | menu, a prominent item in the list meets this criterion.
99 |
100 | 1. Source Code.
101 |
102 | The "source code" for a work means the preferred form of the work
103 | for making modifications to it. "Object code" means any non-source
104 | form of a work.
105 |
106 | A "Standard Interface" means an interface that either is an official
107 | standard defined by a recognized standards body, or, in the case of
108 | interfaces specified for a particular programming language, one that
109 | is widely used among developers working in that language.
110 |
111 | The "System Libraries" of an executable work include anything, other
112 | than the work as a whole, that (a) is included in the normal form of
113 | packaging a Major Component, but which is not part of that Major
114 | Component, and (b) serves only to enable use of the work with that
115 | Major Component, or to implement a Standard Interface for which an
116 | implementation is available to the public in source code form. A
117 | "Major Component", in this context, means a major essential component
118 | (kernel, window system, and so on) of the specific operating system
119 | (if any) on which the executable work runs, or a compiler used to
120 | produce the work, or an object code interpreter used to run it.
121 |
122 | The "Corresponding Source" for a work in object code form means all
123 | the source code needed to generate, install, and (for an executable
124 | work) run the object code and to modify the work, including scripts to
125 | control those activities. However, it does not include the work's
126 | System Libraries, or general-purpose tools or generally available free
127 | programs which are used unmodified in performing those activities but
128 | which are not part of the work. For example, Corresponding Source
129 | includes interface definition files associated with source files for
130 | the work, and the source code for shared libraries and dynamically
131 | linked subprograms that the work is specifically designed to require,
132 | such as by intimate data communication or control flow between those
133 | subprograms and other parts of the work.
134 |
135 | The Corresponding Source need not include anything that users
136 | can regenerate automatically from other parts of the Corresponding
137 | Source.
138 |
139 | The Corresponding Source for a work in source code form is that
140 | same work.
141 |
142 | 2. Basic Permissions.
143 |
144 | All rights granted under this License are granted for the term of
145 | copyright on the Program, and are irrevocable provided the stated
146 | conditions are met. This License explicitly affirms your unlimited
147 | permission to run the unmodified Program. The output from running a
148 | covered work is covered by this License only if the output, given its
149 | content, constitutes a covered work. This License acknowledges your
150 | rights of fair use or other equivalent, as provided by copyright law.
151 |
152 | You may make, run and propagate covered works that you do not
153 | convey, without conditions so long as your license otherwise remains
154 | in force. You may convey covered works to others for the sole purpose
155 | of having them make modifications exclusively for you, or provide you
156 | with facilities for running those works, provided that you comply with
157 | the terms of this License in conveying all material for which you do
158 | not control copyright. Those thus making or running the covered works
159 | for you must do so exclusively on your behalf, under your direction
160 | and control, on terms that prohibit them from making any copies of
161 | your copyrighted material outside their relationship with you.
162 |
163 | Conveying under any other circumstances is permitted solely under
164 | the conditions stated below. Sublicensing is not allowed; section 10
165 | makes it unnecessary.
166 |
167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168 |
169 | No covered work shall be deemed part of an effective technological
170 | measure under any applicable law fulfilling obligations under article
171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172 | similar laws prohibiting or restricting circumvention of such
173 | measures.
174 |
175 | When you convey a covered work, you waive any legal power to forbid
176 | circumvention of technological measures to the extent such circumvention
177 | is effected by exercising rights under this License with respect to
178 | the covered work, and you disclaim any intention to limit operation or
179 | modification of the work as a means of enforcing, against the work's
180 | users, your or third parties' legal rights to forbid circumvention of
181 | technological measures.
182 |
183 | 4. Conveying Verbatim Copies.
184 |
185 | You may convey verbatim copies of the Program's source code as you
186 | receive it, in any medium, provided that you conspicuously and
187 | appropriately publish on each copy an appropriate copyright notice;
188 | keep intact all notices stating that this License and any
189 | non-permissive terms added in accord with section 7 apply to the code;
190 | keep intact all notices of the absence of any warranty; and give all
191 | recipients a copy of this License along with the Program.
192 |
193 | You may charge any price or no price for each copy that you convey,
194 | and you may offer support or warranty protection for a fee.
195 |
196 | 5. Conveying Modified Source Versions.
197 |
198 | You may convey a work based on the Program, or the modifications to
199 | produce it from the Program, in the form of source code under the
200 | terms of section 4, provided that you also meet all of these conditions:
201 |
202 | a) The work must carry prominent notices stating that you modified
203 | it, and giving a relevant date.
204 |
205 | b) The work must carry prominent notices stating that it is
206 | released under this License and any conditions added under section
207 | 7. This requirement modifies the requirement in section 4 to
208 | "keep intact all notices".
209 |
210 | c) You must license the entire work, as a whole, under this
211 | License to anyone who comes into possession of a copy. This
212 | License will therefore apply, along with any applicable section 7
213 | additional terms, to the whole of the work, and all its parts,
214 | regardless of how they are packaged. This License gives no
215 | permission to license the work in any other way, but it does not
216 | invalidate such permission if you have separately received it.
217 |
218 | d) If the work has interactive user interfaces, each must display
219 | Appropriate Legal Notices; however, if the Program has interactive
220 | interfaces that do not display Appropriate Legal Notices, your
221 | work need not make them do so.
222 |
223 | A compilation of a covered work with other separate and independent
224 | works, which are not by their nature extensions of the covered work,
225 | and which are not combined with it such as to form a larger program,
226 | in or on a volume of a storage or distribution medium, is called an
227 | "aggregate" if the compilation and its resulting copyright are not
228 | used to limit the access or legal rights of the compilation's users
229 | beyond what the individual works permit. Inclusion of a covered work
230 | in an aggregate does not cause this License to apply to the other
231 | parts of the aggregate.
232 |
233 | 6. Conveying Non-Source Forms.
234 |
235 | You may convey a covered work in object code form under the terms
236 | of sections 4 and 5, provided that you also convey the
237 | machine-readable Corresponding Source under the terms of this License,
238 | in one of these ways:
239 |
240 | a) Convey the object code in, or embodied in, a physical product
241 | (including a physical distribution medium), accompanied by the
242 | Corresponding Source fixed on a durable physical medium
243 | customarily used for software interchange.
244 |
245 | b) Convey the object code in, or embodied in, a physical product
246 | (including a physical distribution medium), accompanied by a
247 | written offer, valid for at least three years and valid for as
248 | long as you offer spare parts or customer support for that product
249 | model, to give anyone who possesses the object code either (1) a
250 | copy of the Corresponding Source for all the software in the
251 | product that is covered by this License, on a durable physical
252 | medium customarily used for software interchange, for a price no
253 | more than your reasonable cost of physically performing this
254 | conveying of source, or (2) access to copy the
255 | Corresponding Source from a network server at no charge.
256 |
257 | c) Convey individual copies of the object code with a copy of the
258 | written offer to provide the Corresponding Source. This
259 | alternative is allowed only occasionally and noncommercially, and
260 | only if you received the object code with such an offer, in accord
261 | with subsection 6b.
262 |
263 | d) Convey the object code by offering access from a designated
264 | place (gratis or for a charge), and offer equivalent access to the
265 | Corresponding Source in the same way through the same place at no
266 | further charge. You need not require recipients to copy the
267 | Corresponding Source along with the object code. If the place to
268 | copy the object code is a network server, the Corresponding Source
269 | may be on a different server (operated by you or a third party)
270 | that supports equivalent copying facilities, provided you maintain
271 | clear directions next to the object code saying where to find the
272 | Corresponding Source. Regardless of what server hosts the
273 | Corresponding Source, you remain obligated to ensure that it is
274 | available for as long as needed to satisfy these requirements.
275 |
276 | e) Convey the object code using peer-to-peer transmission, provided
277 | you inform other peers where the object code and Corresponding
278 | Source of the work are being offered to the general public at no
279 | charge under subsection 6d.
280 |
281 | A separable portion of the object code, whose source code is excluded
282 | from the Corresponding Source as a System Library, need not be
283 | included in conveying the object code work.
284 |
285 | A "User Product" is either (1) a "consumer product", which means any
286 | tangible personal property which is normally used for personal, family,
287 | or household purposes, or (2) anything designed or sold for incorporation
288 | into a dwelling. In determining whether a product is a consumer product,
289 | doubtful cases shall be resolved in favor of coverage. For a particular
290 | product received by a particular user, "normally used" refers to a
291 | typical or common use of that class of product, regardless of the status
292 | of the particular user or of the way in which the particular user
293 | actually uses, or expects or is expected to use, the product. A product
294 | is a consumer product regardless of whether the product has substantial
295 | commercial, industrial or non-consumer uses, unless such uses represent
296 | the only significant mode of use of the product.
297 |
298 | "Installation Information" for a User Product means any methods,
299 | procedures, authorization keys, or other information required to install
300 | and execute modified versions of a covered work in that User Product from
301 | a modified version of its Corresponding Source. The information must
302 | suffice to ensure that the continued functioning of the modified object
303 | code is in no case prevented or interfered with solely because
304 | modification has been made.
305 |
306 | If you convey an object code work under this section in, or with, or
307 | specifically for use in, a User Product, and the conveying occurs as
308 | part of a transaction in which the right of possession and use of the
309 | User Product is transferred to the recipient in perpetuity or for a
310 | fixed term (regardless of how the transaction is characterized), the
311 | Corresponding Source conveyed under this section must be accompanied
312 | by the Installation Information. But this requirement does not apply
313 | if neither you nor any third party retains the ability to install
314 | modified object code on the User Product (for example, the work has
315 | been installed in ROM).
316 |
317 | The requirement to provide Installation Information does not include a
318 | requirement to continue to provide support service, warranty, or updates
319 | for a work that has been modified or installed by the recipient, or for
320 | the User Product in which it has been modified or installed. Access to a
321 | network may be denied when the modification itself materially and
322 | adversely affects the operation of the network or violates the rules and
323 | protocols for communication across the network.
324 |
325 | Corresponding Source conveyed, and Installation Information provided,
326 | in accord with this section must be in a format that is publicly
327 | documented (and with an implementation available to the public in
328 | source code form), and must require no special password or key for
329 | unpacking, reading or copying.
330 |
331 | 7. Additional Terms.
332 |
333 | "Additional permissions" are terms that supplement the terms of this
334 | License by making exceptions from one or more of its conditions.
335 | Additional permissions that are applicable to the entire Program shall
336 | be treated as though they were included in this License, to the extent
337 | that they are valid under applicable law. If additional permissions
338 | apply only to part of the Program, that part may be used separately
339 | under those permissions, but the entire Program remains governed by
340 | this License without regard to the additional permissions.
341 |
342 | When you convey a copy of a covered work, you may at your option
343 | remove any additional permissions from that copy, or from any part of
344 | it. (Additional permissions may be written to require their own
345 | removal in certain cases when you modify the work.) You may place
346 | additional permissions on material, added by you to a covered work,
347 | for which you have or can give appropriate copyright permission.
348 |
349 | Notwithstanding any other provision of this License, for material you
350 | add to a covered work, you may (if authorized by the copyright holders of
351 | that material) supplement the terms of this License with terms:
352 |
353 | a) Disclaiming warranty or limiting liability differently from the
354 | terms of sections 15 and 16 of this License; or
355 |
356 | b) Requiring preservation of specified reasonable legal notices or
357 | author attributions in that material or in the Appropriate Legal
358 | Notices displayed by works containing it; or
359 |
360 | c) Prohibiting misrepresentation of the origin of that material, or
361 | requiring that modified versions of such material be marked in
362 | reasonable ways as different from the original version; or
363 |
364 | d) Limiting the use for publicity purposes of names of licensors or
365 | authors of the material; or
366 |
367 | e) Declining to grant rights under trademark law for use of some
368 | trade names, trademarks, or service marks; or
369 |
370 | f) Requiring indemnification of licensors and authors of that
371 | material by anyone who conveys the material (or modified versions of
372 | it) with contractual assumptions of liability to the recipient, for
373 | any liability that these contractual assumptions directly impose on
374 | those licensors and authors.
375 |
376 | All other non-permissive additional terms are considered "further
377 | restrictions" within the meaning of section 10. If the Program as you
378 | received it, or any part of it, contains a notice stating that it is
379 | governed by this License along with a term that is a further
380 | restriction, you may remove that term. If a license document contains
381 | a further restriction but permits relicensing or conveying under this
382 | License, you may add to a covered work material governed by the terms
383 | of that license document, provided that the further restriction does
384 | not survive such relicensing or conveying.
385 |
386 | If you add terms to a covered work in accord with this section, you
387 | must place, in the relevant source files, a statement of the
388 | additional terms that apply to those files, or a notice indicating
389 | where to find the applicable terms.
390 |
391 | Additional terms, permissive or non-permissive, may be stated in the
392 | form of a separately written license, or stated as exceptions;
393 | the above requirements apply either way.
394 |
395 | 8. Termination.
396 |
397 | You may not propagate or modify a covered work except as expressly
398 | provided under this License. Any attempt otherwise to propagate or
399 | modify it is void, and will automatically terminate your rights under
400 | this License (including any patent licenses granted under the third
401 | paragraph of section 11).
402 |
403 | However, if you cease all violation of this License, then your
404 | license from a particular copyright holder is reinstated (a)
405 | provisionally, unless and until the copyright holder explicitly and
406 | finally terminates your license, and (b) permanently, if the copyright
407 | holder fails to notify you of the violation by some reasonable means
408 | prior to 60 days after the cessation.
409 |
410 | Moreover, your license from a particular copyright holder is
411 | reinstated permanently if the copyright holder notifies you of the
412 | violation by some reasonable means, this is the first time you have
413 | received notice of violation of this License (for any work) from that
414 | copyright holder, and you cure the violation prior to 30 days after
415 | your receipt of the notice.
416 |
417 | Termination of your rights under this section does not terminate the
418 | licenses of parties who have received copies or rights from you under
419 | this License. If your rights have been terminated and not permanently
420 | reinstated, you do not qualify to receive new licenses for the same
421 | material under section 10.
422 |
423 | 9. Acceptance Not Required for Having Copies.
424 |
425 | You are not required to accept this License in order to receive or
426 | run a copy of the Program. Ancillary propagation of a covered work
427 | occurring solely as a consequence of using peer-to-peer transmission
428 | to receive a copy likewise does not require acceptance. However,
429 | nothing other than this License grants you permission to propagate or
430 | modify any covered work. These actions infringe copyright if you do
431 | not accept this License. Therefore, by modifying or propagating a
432 | covered work, you indicate your acceptance of this License to do so.
433 |
434 | 10. Automatic Licensing of Downstream Recipients.
435 |
436 | Each time you convey a covered work, the recipient automatically
437 | receives a license from the original licensors, to run, modify and
438 | propagate that work, subject to this License. You are not responsible
439 | for enforcing compliance by third parties with this License.
440 |
441 | An "entity transaction" is a transaction transferring control of an
442 | organization, or substantially all assets of one, or subdividing an
443 | organization, or merging organizations. If propagation of a covered
444 | work results from an entity transaction, each party to that
445 | transaction who receives a copy of the work also receives whatever
446 | licenses to the work the party's predecessor in interest had or could
447 | give under the previous paragraph, plus a right to possession of the
448 | Corresponding Source of the work from the predecessor in interest, if
449 | the predecessor has it or can get it with reasonable efforts.
450 |
451 | You may not impose any further restrictions on the exercise of the
452 | rights granted or affirmed under this License. For example, you may
453 | not impose a license fee, royalty, or other charge for exercise of
454 | rights granted under this License, and you may not initiate litigation
455 | (including a cross-claim or counterclaim in a lawsuit) alleging that
456 | any patent claim is infringed by making, using, selling, offering for
457 | sale, or importing the Program or any portion of it.
458 |
459 | 11. Patents.
460 |
461 | A "contributor" is a copyright holder who authorizes use under this
462 | License of the Program or a work on which the Program is based. The
463 | work thus licensed is called the contributor's "contributor version".
464 |
465 | A contributor's "essential patent claims" are all patent claims
466 | owned or controlled by the contributor, whether already acquired or
467 | hereafter acquired, that would be infringed by some manner, permitted
468 | by this License, of making, using, or selling its contributor version,
469 | but do not include claims that would be infringed only as a
470 | consequence of further modification of the contributor version. For
471 | purposes of this definition, "control" includes the right to grant
472 | patent sublicenses in a manner consistent with the requirements of
473 | this License.
474 |
475 | Each contributor grants you a non-exclusive, worldwide, royalty-free
476 | patent license under the contributor's essential patent claims, to
477 | make, use, sell, offer for sale, import and otherwise run, modify and
478 | propagate the contents of its contributor version.
479 |
480 | In the following three paragraphs, a "patent license" is any express
481 | agreement or commitment, however denominated, not to enforce a patent
482 | (such as an express permission to practice a patent or covenant not to
483 | sue for patent infringement). To "grant" such a patent license to a
484 | party means to make such an agreement or commitment not to enforce a
485 | patent against the party.
486 |
487 | If you convey a covered work, knowingly relying on a patent license,
488 | and the Corresponding Source of the work is not available for anyone
489 | to copy, free of charge and under the terms of this License, through a
490 | publicly available network server or other readily accessible means,
491 | then you must either (1) cause the Corresponding Source to be so
492 | available, or (2) arrange to deprive yourself of the benefit of the
493 | patent license for this particular work, or (3) arrange, in a manner
494 | consistent with the requirements of this License, to extend the patent
495 | license to downstream recipients. "Knowingly relying" means you have
496 | actual knowledge that, but for the patent license, your conveying the
497 | covered work in a country, or your recipient's use of the covered work
498 | in a country, would infringe one or more identifiable patents in that
499 | country that you have reason to believe are valid.
500 |
501 | If, pursuant to or in connection with a single transaction or
502 | arrangement, you convey, or propagate by procuring conveyance of, a
503 | covered work, and grant a patent license to some of the parties
504 | receiving the covered work authorizing them to use, propagate, modify
505 | or convey a specific copy of the covered work, then the patent license
506 | you grant is automatically extended to all recipients of the covered
507 | work and works based on it.
508 |
509 | A patent license is "discriminatory" if it does not include within
510 | the scope of its coverage, prohibits the exercise of, or is
511 | conditioned on the non-exercise of one or more of the rights that are
512 | specifically granted under this License. You may not convey a covered
513 | work if you are a party to an arrangement with a third party that is
514 | in the business of distributing software, under which you make payment
515 | to the third party based on the extent of your activity of conveying
516 | the work, and under which the third party grants, to any of the
517 | parties who would receive the covered work from you, a discriminatory
518 | patent license (a) in connection with copies of the covered work
519 | conveyed by you (or copies made from those copies), or (b) primarily
520 | for and in connection with specific products or compilations that
521 | contain the covered work, unless you entered into that arrangement,
522 | or that patent license was granted, prior to 28 March 2007.
523 |
524 | Nothing in this License shall be construed as excluding or limiting
525 | any implied license or other defenses to infringement that may
526 | otherwise be available to you under applicable patent law.
527 |
528 | 12. No Surrender of Others' Freedom.
529 |
530 | If conditions are imposed on you (whether by court order, agreement or
531 | otherwise) that contradict the conditions of this License, they do not
532 | excuse you from the conditions of this License. If you cannot convey a
533 | covered work so as to satisfy simultaneously your obligations under this
534 | License and any other pertinent obligations, then as a consequence you may
535 | not convey it at all. For example, if you agree to terms that obligate you
536 | to collect a royalty for further conveying from those to whom you convey
537 | the Program, the only way you could satisfy both those terms and this
538 | License would be to refrain entirely from conveying the Program.
539 |
540 | 13. Remote Network Interaction; Use with the GNU General Public License.
541 |
542 | Notwithstanding any other provision of this License, if you modify the
543 | Program, your modified version must prominently offer all users
544 | interacting with it remotely through a computer network (if your version
545 | supports such interaction) an opportunity to receive the Corresponding
546 | Source of your version by providing access to the Corresponding Source
547 | from a network server at no charge, through some standard or customary
548 | means of facilitating copying of software. This Corresponding Source
549 | shall include the Corresponding Source for any work covered by version 3
550 | of the GNU General Public License that is incorporated pursuant to the
551 | following paragraph.
552 |
553 | Notwithstanding any other provision of this License, you have
554 | permission to link or combine any covered work with a work licensed
555 | under version 3 of the GNU General Public License into a single
556 | combined work, and to convey the resulting work. The terms of this
557 | License will continue to apply to the part which is the covered work,
558 | but the work with which it is combined will remain governed by version
559 | 3 of the GNU General Public License.
560 |
561 | 14. Revised Versions of this License.
562 |
563 | The Free Software Foundation may publish revised and/or new versions of
564 | the GNU Affero General Public License from time to time. Such new versions
565 | will be similar in spirit to the present version, but may differ in detail to
566 | address new problems or concerns.
567 |
568 | Each version is given a distinguishing version number. If the
569 | Program specifies that a certain numbered version of the GNU Affero General
570 | Public License "or any later version" applies to it, you have the
571 | option of following the terms and conditions either of that numbered
572 | version or of any later version published by the Free Software
573 | Foundation. If the Program does not specify a version number of the
574 | GNU Affero General Public License, you may choose any version ever published
575 | by the Free Software Foundation.
576 |
577 | If the Program specifies that a proxy can decide which future
578 | versions of the GNU Affero General Public License can be used, that proxy's
579 | public statement of acceptance of a version permanently authorizes you
580 | to choose that version for the Program.
581 |
582 | Later license versions may give you additional or different
583 | permissions. However, no additional obligations are imposed on any
584 | author or copyright holder as a result of your choosing to follow a
585 | later version.
586 |
587 | 15. Disclaimer of Warranty.
588 |
589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597 |
598 | 16. Limitation of Liability.
599 |
600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608 | SUCH DAMAGES.
609 |
610 | 17. Interpretation of Sections 15 and 16.
611 |
612 | If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 |
619 | END OF TERMS AND CONDITIONS
620 |
621 | How to Apply These Terms to Your New Programs
622 |
623 | If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these terms.
626 |
627 | To do so, attach the following notices to the program. It is safest
628 | to attach them to the start of each source file to most effectively
629 | state the exclusion of warranty; and each file should have at least
630 | the "copyright" line and a pointer to where the full notice is found.
631 |
632 | <one line to give the program's name and a brief idea of what it does.>
633 | Copyright (C) <year> <name of author>
634 |
635 | This program is free software: you can redistribute it and/or modify
636 | it under the terms of the GNU Affero General Public License as published by
637 | the Free Software Foundation, either version 3 of the License, or
638 | (at your option) any later version.
639 |
640 | This program is distributed in the hope that it will be useful,
641 | but WITHOUT ANY WARRANTY; without even the implied warranty of
642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643 | GNU Affero General Public License for more details.
644 |
645 | You should have received a copy of the GNU Affero General Public License
646 | along with this program. If not, see <http://www.gnu.org/licenses/>.
647 |
648 | Also add information on how to contact you by electronic and paper mail.
649 |
650 | If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source. For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code. There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 |
658 | You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
661 | <http://www.gnu.org/licenses/>.
662 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Builds the static library liburlmatch.a from every .c file in this
# directory. "test" delegates to test/Makefile, "install" copies the public
# header and the archive below $(DESTDIR)$(PREFIX).

# install never produces a file called "install", so it is phony as well.
.PHONY: all clean test install

CFLAGS += -Wall -Wextra

PREFIX ?= /usr

SRC = $(wildcard *.c)
HDR = $(wildcard *.h)
OBJ = $(SRC:.c=.o)
NAME = liburlmatch.a

all: $(NAME)

# The archive is recreated from scratch so removed objects do not linger.
$(NAME): $(OBJ)
	rm -f $(NAME)
	ar cru $(NAME) $(OBJ)
	ranlib $(NAME)

# Every object is rebuilt when any header changes.
$(OBJ): $(HDR)

clean:
	$(MAKE) -C test clean
	rm -f $(OBJ) $(NAME)

test: all
	$(MAKE) -C test

install: all
	mkdir -p -m 755 $(DESTDIR)$(PREFIX)/include
	install -m 644 urlmatch.h $(DESTDIR)$(PREFIX)/include
	mkdir -p -m 755 $(DESTDIR)$(PREFIX)/lib
	install -m 644 $(NAME) $(DESTDIR)$(PREFIX)/lib
33 |
--------------------------------------------------------------------------------
/README.asciidoc:
--------------------------------------------------------------------------------
1 | URL matcher lib
2 | ===============
3 |
4 | This is a small and fast C library duplicating the URL matching
5 | functionality of Opera. You might use it to implement ad blocking
6 | or similar.
7 |
8 | Given a list of patterns, such as
9 |
10 | ----
11 | *facebook.com/*
12 | http*google-analytics.*
13 | http://foo.bat/this-annoying-image.jpeg
14 | ----
15 |
16 | you can then match any connection attempt against the whole list,
17 | getting a yes/no answer back.
18 |
19 | Motivation
20 | ----------
21 |
22 | One of the main components of Opera, the filtering system, supported
23 | white- and blacklists with wildcards. It was usable for more than just
24 | blocking ads, though it handled those well too.
25 |
26 | This is one function that should never be relegated to JavaScript
27 | (as Adblock browser extensions do): the average page makes close to a
28 | hundred connections, the list is traversed on each connection attempt,
29 | and common lists reach a few thousand entries.
30 |
31 | Turns out there wasn't any existing standalone pattern matching
32 | library. Regex is too slow (or in glibc's case, taking gigabytes of RAM),
33 | and wildcard functionality is essential.
34 |
35 | A simple function (like those you can find by the dozen on the web) is
36 | included for benchmark comparison purposes. Currently this library is
37 | ~5x faster than the simple function. This is not quite fast enough, so
38 | further optimizations are planned.
39 |
--------------------------------------------------------------------------------
/bench/Makefile:
--------------------------------------------------------------------------------
# Builds the "bench" benchmark binary against the static library one
# directory up.

.PHONY: all clean

CFLAGS += -Wall -Wextra
CPPFLAGS += -I ..
# Libraries go in LDLIBS, not LDFLAGS: the linker resolves symbols left to
# right, so -lz has to come AFTER liburlmatch.a, which is what needs zlib.
LDLIBS += -lz

SRC = $(wildcard *.c)
OBJ = $(SRC:.c=.o)
NAME = bench

all: $(NAME)

$(NAME): $(OBJ) ../liburlmatch.a
	$(CC) $(CFLAGS) $(LDFLAGS) -o $(NAME) $(OBJ) ../liburlmatch.a $(LDLIBS)

clean:
	rm -f $(OBJ) $(NAME)
18 |
--------------------------------------------------------------------------------
/bench/bench.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include "urlmatch.h"
10 | #include "lrtypes.h"
11 |
12 | static u32 urls = 1000 * 1000;
13 | static const u32 rules = 1500;
14 |
15 | static const char **ruling;
16 | static const char **urling;
17 |
18 | static regex_t regex;
19 |
20 | static char *genurl() {
21 |
22 | const u32 len = (rand() % 80) + 10;
23 | char *buf = calloc(len + 1, 1);
24 |
25 | u32 i;
26 |
27 | switch (rand() % 10) {
28 | case 0 ... 6:
29 | strcpy(buf, "http://");
30 | break;
31 | case 7 ... 8:
32 | strcpy(buf, "https://");
33 | break;
34 | case 9:
35 | strcpy(buf, "ftp://");
36 | break;
37 | }
38 |
39 | for (i = strlen(buf); i < len; i++) {
40 | const u32 type = rand() % 10;
41 |
42 | switch (type) {
43 | case 0 ... 6:
44 | buf[i] = 'a' + rand() % 26;
45 | break;
46 | case 7:
47 | buf[i] = '0' + rand() % 10;
48 | break;
49 | case 8 ... 9:
50 | buf[i] = ':' + rand() % 7;
51 | break;
52 | }
53 | }
54 |
55 | return buf;
56 | }
57 |
58 | static char *genrule() {
59 |
60 | const u32 len = (rand() % 10) + 5;
61 | char *buf = calloc(len + 1, 1);
62 |
63 | u32 i;
64 | for (i = 0; i < len; i++) {
65 | u32 type = rand() % 3;
66 |
67 | if (i && type == 2 && buf[i - 1] == '*')
68 | type = 1;
69 |
70 | switch (type) {
71 | case 0:
72 | buf[i] = 'a' + rand() % 26;
73 | break;
74 | case 1:
75 | buf[i] = '0' + rand() % 10;
76 | break;
77 | case 2:
78 | buf[i] = '*';
79 | break;
80 | }
81 | }
82 |
83 | return buf;
84 | }
85 |
86 | static void gen() {
87 | ruling = calloc(rules, sizeof(char *));
88 | urling = calloc(urls, sizeof(char *));
89 |
90 | u32 i;
91 | for (i = 0; i < rules; i++) {
92 | ruling[i] = genrule();
93 | //printf("Rule %u: %s\n", i, ruling[i]);
94 | }
95 |
96 | for (i = 0; i < urls; i++) {
97 | urling[i] = genurl();
98 | //printf("URL %u: %s\n", i, urling[i]);
99 | }
100 | }
101 |
102 | static void simple() {
103 |
104 | u32 i, j, sum = 0;
105 | for (i = 0; i < urls; i++) {
106 | for (j = 0; j < rules; j++) {
107 | if (url_simplematch(ruling[j], urling[i])) {
108 | sum++;
109 | break;
110 | }
111 | }
112 | if (i % 10000 == 0) {printf("."); fflush(stdout);}
113 | }
114 | printf("\nTotal %u matches\n", sum);
115 | }
116 |
117 | static urlctx *opti_init() {
118 |
119 | u32 totlen = 1, i;
120 | for (i = 0; i < rules; i++) {
121 | totlen += strlen(ruling[i]) + 2;
122 | }
123 |
124 | char *tmp = calloc(totlen, 1);
125 |
126 | for (i = 0; i < rules; i++) {
127 | strcat(tmp, ruling[i]);
128 | strcat(tmp, "\n");
129 | }
130 |
131 | urlctx *ctx = url_init(tmp);
132 | free(tmp);
133 |
134 | return ctx;
135 | }
136 |
137 | static void opti(const urlctx * const ctx) {
138 |
139 | u32 i, sum = 0;
140 | for (i = 0; i < urls; i++) {
141 | if (url_match(ctx, urling[i])) {
142 | sum++;
143 | }
144 | if (i % 10000 == 0) {printf("."); fflush(stdout);}
145 | }
146 | printf("\nTotal %u matches\n", sum);
147 | }
148 |
149 | static void reg_init() {
150 |
151 | u32 totlen = 1, i;
152 | for (i = 0; i < rules; i++) {
153 | totlen += strlen(ruling[i]) * 2 + 2;
154 | }
155 |
156 | char *tmp = calloc(totlen, 1);
157 |
158 | for (i = 0; i < rules; i++) {
159 | u32 j;
160 |
161 | for (j = 0; ruling[i][j]; j++) {
162 | if (ruling[i][j] != '*') {
163 | const char buffy[2] = { ruling[i][j], '\0' };
164 | strcat(tmp, buffy);
165 | } else {
166 | const char buffy[3] = ".*";
167 | strcat(tmp, buffy);
168 | }
169 | }
170 |
171 | if (i != rules - 1) strcat(tmp, "|");
172 | }
173 |
174 | for (i = 1; i < totlen; i++) {
175 | if (tmp[i] == '*')
176 | tmp[i-1] = '.';
177 | }
178 |
179 | int ret = regcomp(®ex, tmp, REG_EXTENDED | REG_NOSUB);
180 | if (ret) puts("Failed to compile regex");
181 |
182 | free(tmp);
183 | }
184 |
185 | static void reg() {
186 |
187 | u32 i;
188 | for (i = 0; i < urls; i++) {
189 | regexec(®ex, urling[i], 0, NULL, 0);
190 |
191 | if (i % 10000 == 0) {printf("."); fflush(stdout);}
192 | }
193 | }
194 |
195 | int main(int argc, char **argv) {
196 |
197 | if (argc > 1) {
198 | urls = 1000 * atoi(argv[1]);
199 | }
200 |
201 | srand(42);
202 |
203 | printf("Generating %uk urls and %u rules.\n", urls / 1000, rules);
204 | gen();
205 |
206 | struct timeval start, end;
207 | u32 ms, us;
208 |
209 | printf("Starting testing.\n\n");
210 | gettimeofday(&start, NULL);
211 | simple();
212 | gettimeofday(&end, NULL);
213 |
214 | ms = (end.tv_sec - start.tv_sec) * 1000;
215 | ms += (end.tv_usec - start.tv_usec) / 1000;
216 | if (!ms) ms = 1;
217 | printf("Simple backend took %u ms, or %.2f checks per millisecond.\n\n",
218 | ms, (float) urls / ms);
219 |
220 |
221 |
222 | gettimeofday(&start, NULL);
223 | urlctx * ctx = opti_init();
224 | gettimeofday(&end, NULL);
225 |
226 | ms = (end.tv_sec - start.tv_sec) * 1000;
227 | ms += (end.tv_usec - start.tv_usec) / 1000;
228 | if (!ms) ms = 1;
229 | printf("Optimized init took %u ms.\n",
230 | ms);
231 |
232 | // Yes yes, insecure mktemp. This is a bench.
233 | char name[] = "/tmp/urlmatch_benchXXXXXX";
234 | mktemp(name);
235 | gettimeofday(&start, NULL);
236 | if (url_save_optimized(ctx, name)) puts("save failed");
237 | gettimeofday(&end, NULL);
238 | url_free(ctx);
239 |
240 | ms = (end.tv_sec - start.tv_sec) * 1000;
241 | ms += (end.tv_usec - start.tv_usec) / 1000;
242 | if (!ms) ms = 1;
243 | printf("Optimized init, saving to binary file took %u ms.\n",
244 | ms);
245 |
246 | gettimeofday(&start, NULL);
247 | ctx = url_init_file(name);
248 | gettimeofday(&end, NULL);
249 |
250 | ms = (end.tv_sec - start.tv_sec) * 1000;
251 | ms += (end.tv_usec - start.tv_usec) / 1000;
252 | us = (end.tv_sec - start.tv_sec) * 1000000;
253 | us += (end.tv_usec - start.tv_usec);
254 | if (!ms) ms = 1;
255 | printf("Optimized init, read from binary file took %u ms (%u us).\n",
256 | ms, us);
257 | unlink(name);
258 |
259 |
260 |
261 | gettimeofday(&start, NULL);
262 | opti(ctx);
263 | gettimeofday(&end, NULL);
264 | url_free(ctx);
265 |
266 | ms = (end.tv_sec - start.tv_sec) * 1000;
267 | ms += (end.tv_usec - start.tv_usec) / 1000;
268 | if (!ms) ms = 1;
269 | printf("Optimized backend took %u ms, or %.2f checks per millisecond.\n\n",
270 | ms, (float) urls / ms);
271 |
272 | /*
273 | glibc regex uses a fuckton of RAM, causing this to die in just some dozen
274 | iterations. Only enable if you have a sane libc where calling regexec does
275 | not allocate memory.
276 | */
277 | #if 0
278 | gettimeofday(&start, NULL);
279 | reg_init();
280 | gettimeofday(&end, NULL);
281 |
282 | ms = (end.tv_sec - start.tv_sec) * 1000;
283 | ms += (end.tv_usec - start.tv_usec) / 1000;
284 | if (!ms) ms = 1;
285 | printf("Regex init took %u ms.\n",
286 | ms);
287 |
288 |
289 |
290 | gettimeofday(&start, NULL);
291 | reg();
292 | gettimeofday(&end, NULL);
293 |
294 | ms = end.tv_sec - start.tv_sec;
295 | ms += (end.tv_usec - start.tv_usec) / 1000;
296 | if (!ms) ms = 1;
297 | printf("Regex took %u ms, or %.2f checks per millisecond.\n",
298 | ms, (float) urls / ms);
299 | #endif
300 | return 0;
301 | }
302 |
--------------------------------------------------------------------------------
/internal.h:
--------------------------------------------------------------------------------
1 | /*
2 | liburlmatch - a fast URL matcher
3 | Copyright (C) 2013 Lauri Kasanen
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU Affero General Public License as published by
7 | the Free Software Foundation, version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | */
17 |
18 | #ifndef INTERNAL_H
19 | #define INTERNAL_H
20 |
21 | #define _GNU_SOURCE
22 |
23 | #include "lrtypes.h"
24 | #include
25 | #include
26 | #include
27 | #include
28 | #include
29 | #include
30 |
31 | // Let's help the compiler
32 | #if __GNUC__ >= 4
33 |
34 | #define PURE_FUNC __attribute__ ((pure))
35 | #define NORETURN_FUNC __attribute__ ((noreturn))
36 | #define CONST_FUNC __attribute__ ((const))
37 | #define WUR_FUNC __attribute__ ((warn_unused_result))
38 | #else // GNUC
39 |
40 | #define PURE_FUNC
41 | #define NORETURN_FUNC
42 | #define CONST_FUNC
43 | #define WUR_FUNC
44 |
45 | #endif // GNUC
46 |
47 |
48 | #pragma GCC visibility push(hidden)
49 |
50 | u32 countwilds(const char str[]) WUR_FUNC PURE_FUNC;
51 | const char *strrstr(const char hay[], const char needle[]) WUR_FUNC PURE_FUNC;
52 | static inline int suffixcmp(const char one[], const char two[]) WUR_FUNC PURE_FUNC;
53 |
54 | void *xcalloc(size_t nmemb, size_t size);
55 | void *xmalloc(size_t size);
56 | void die(const char s[]) NORETURN_FUNC;
57 | void swrite(const void *ptr, const size_t size, FILE *stream);
58 | void sread(void *ptr, const size_t size, FILE *stream);
59 | void getsuffix(const char str[], char suf[3]);
60 | static inline int wildprefix(const char str[]) WUR_FUNC PURE_FUNC;
61 |
62 |
63 | struct urlctx {
64 | struct prefix *pref;
65 | u16 count;
66 |
67 | char *storage;
68 | u32 storagelen;
69 | u32 used;
70 | };
71 |
72 | struct prefix {
73 | struct suffix *suf;
74 | u16 count;
75 |
76 | char prefix[6];
77 | u8 len;
78 | };
79 |
80 | struct suffix {
81 | struct needle *need;
82 | u16 count;
83 |
84 | char suffix[3];
85 | };
86 |
87 | struct needle {
88 | const char *needle;
89 | u16 len;
90 | u16 wilds;
91 | u16 longest;
92 | u16 longlen;
93 | };
94 |
95 | #define MAGIC "um1"
96 |
97 | void printctx(const struct urlctx *);
98 | int ctxcmp(const struct urlctx *, const struct urlctx *);
99 | void *poolalloc(struct urlctx *, u32 bytes) WUR_FUNC;
100 |
101 | // Inlines
102 |
/*
 * Tell whether a '*' wildcard occurs within the first five characters of
 * str (or within the whole string when it is shorter).
 *
 * Returns 1 if so, 0 otherwise.
 *
 * Only the bytes that matter are inspected, so the answer no longer depends
 * on the total string length fitting into 16 bits.
 */
static inline int wildprefix(const char str[]) {

	unsigned i;

	for (i = 0; i < 5 && str[i] != '\0'; i++) {
		if (str[i] == '*')
			return 1;
	}

	return 0;
}
111 |
/*
 * Compare two suffixes that are each one or two characters long.
 *
 * Returns 0 when they are compatible, 1 when they are not:
 *  - equal lengths: every character must be identical;
 *  - one character against two: the single character must equal the LAST
 *    character of the two-character suffix.
 */
static inline int suffixcmp(const char one[], const char two[]) {

	const int one_is_pair = one[1] != '\0';
	const int two_is_pair = two[1] != '\0';

	if (one_is_pair && two_is_pair)
		return one[0] != two[0] || one[1] != two[1];

	if (!one_is_pair && !two_is_pair)
		return one[0] != two[0];

	// Mixed lengths: line the single character up with the pair's tail
	if (two_is_pair)
		return one[0] != two[1];

	return one[1] != two[0];
}
135 |
136 | #pragma GCC visibility pop
137 |
138 | #endif
139 |
--------------------------------------------------------------------------------
/lrtypes.h:
--------------------------------------------------------------------------------
1 | #ifndef LRT_TYPES_H
2 | #define LRT_TYPES_H
3 |
4 | #include
5 |
6 | typedef uint64_t u64;
7 | typedef uint32_t u32;
8 | typedef uint16_t u16;
9 | typedef uint8_t u8;
10 |
11 | typedef int64_t s64;
12 | typedef int32_t s32;
13 | typedef int16_t s16;
14 | typedef int8_t s8;
15 |
16 | #endif
17 |
--------------------------------------------------------------------------------
/misc.c:
--------------------------------------------------------------------------------
1 | /*
2 | liburlmatch - a fast URL matcher
3 | Copyright (C) 2013 Lauri Kasanen
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU Affero General Public License as published by
7 | the Free Software Foundation, version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | */
17 |
18 | #include "internal.h"
19 |
20 | u32 countwilds(const char str[]) {
21 |
22 | u32 sum = 0;
23 |
24 | const char *ptr = str;
25 | for (; *ptr; ptr++) {
26 | if (*ptr == '*') sum++;
27 | }
28 |
29 | return sum;
30 | }
31 |
/*
 * Find the LAST occurrence of needle in hay.
 *
 * Returns a pointer to the start of that occurrence, or NULL when needle
 * does not occur. Overlapping occurrences are considered. An empty needle
 * matches at the very end of hay, so the terminating NUL is returned.
 */
const char *strrstr(const char hay[], const char needle[]) {

	// Without this, the search below would step past the terminating NUL
	// of hay, because "" is found at every position.
	if (needle[0] == '\0')
		return hay + strlen(hay);

	const char *last = NULL;
	const char *cur = strstr(hay, needle);

	while (cur) {
		last = cur;
		// cur points at a non-NUL byte here, so cur + 1 is still in bounds
		cur = strstr(cur + 1, needle);
	}

	return last;
}
45 |
/*
 * calloc() that never returns NULL: the process is terminated through
 * die() when memory runs out.
 *
 * A zero-sized request is rounded up to one byte, since calloc() is
 * allowed to answer those with NULL and that is not an out-of-memory
 * condition.
 */
void *xcalloc(size_t nmemb, size_t size) {

	void *tmp = calloc(nmemb ? nmemb : 1, size ? size : 1);
	if (!tmp) die("Out of memory");

	return tmp;
}
53 |
/*
 * malloc() that never returns NULL: the process is terminated through
 * die() when memory runs out.
 *
 * A zero-sized request is rounded up to one byte, since malloc() is
 * allowed to answer those with NULL and that is not an out-of-memory
 * condition.
 */
void *xmalloc(size_t size) {

	void *tmp = malloc(size ? size : 1);
	if (!tmp) die("Out of memory");

	return tmp;
}
61 |
/*
 * Print the message and a newline to stderr, then terminate the process
 * with exit status 1. Does not return.
 */
void die(const char s[]) {

	fputs(s, stderr);
	fputc('\n', stderr);

	exit(1);
}
67 |
/*
 * Write one object of 'size' bytes to the stream; any failure is fatal
 * and ends the process through die().
 */
void swrite(const void * const ptr, const size_t size, FILE * const stream) {

	if (fwrite(ptr, size, 1, stream) == 1)
		return;

	die("Failed writing");
}
74 |
/*
 * Read one object of 'size' bytes from the stream; a short read or an
 * error is fatal and ends the process through die().
 */
void sread(void * const ptr, const size_t size, FILE * const stream) {

	if (fread(ptr, size, 1, stream) == 1)
		return;

	die("Failed reading");
}
81 |
/*
 * Extract the suffix key of a rule into suf (room for 3 bytes).
 *
 *  - empty string:      suf becomes "" (it used to be left untouched,
 *                       handing stale or uninitialized bytes to the caller)
 *  - one character:     suf is that character
 *  - otherwise:         suf is the last two characters, except that a
 *                       wildcard among them collapses the key:
 *                         "*x" -> "x"   (only the final character is fixed)
 *                         "x*", "**" -> "*"
 */
void getsuffix(const char str[], char suf[3]) {

	// size_t: the length is used as an index and must not be truncated
	const size_t len = strlen(str);

	if (len == 0) {
		suf[0] = '\0';
		return;
	}

	if (len == 1) {
		suf[0] = str[0];
		suf[1] = '\0';
		return;
	}

	const char before = str[len - 2];
	const char last = str[len - 1];

	suf[2] = '\0';

	if (before != '*' && last != '*') {
		suf[0] = before;
		suf[1] = last;
		return;
	}

	// A wildcard is involved: keep the final character if it is literal,
	// otherwise the key is just "*".
	suf[0] = (last != '*') ? last : '*';
	suf[1] = '\0';
}
106 |
107 | void printctx(const struct urlctx * const ctx) {
108 |
109 | u16 p, s, n;
110 | u16 pmax, smax, nmax;
111 |
112 | pmax = ctx->count;
113 |
114 | printf("URL context has %u prefixes\n", pmax);
115 |
116 | for (p = 0; p < pmax; p++) {
117 | const struct prefix * const curpref = &ctx->pref[p];
118 |
119 | smax = curpref->count;
120 | printf("\tPrefix %u '%s' has %u suffixes\n", p, curpref->prefix, smax);
121 |
122 | for (s = 0; s < smax; s++) {
123 | const struct suffix * const cursuf = &curpref->suf[s];
124 |
125 | nmax = cursuf->count;
126 | printf("\t\tSuffix %u '%s' has %u needles\n", s, cursuf->suffix,
127 | nmax);
128 |
129 | for (n = 0; n < cursuf->count; n++) {
130 | const struct needle * const curneed = &cursuf->need[n];
131 |
132 | printf("\t\t\tNeedle %u: %s\n", n, curneed->needle);
133 | }
134 | }
135 | }
136 | }
137 |
138 | int ctxcmp(const struct urlctx * const a, const struct urlctx * const b) {
139 |
140 | u16 p, s, n;
141 | u16 pmax, smax, nmax;
142 |
143 | pmax = a->count;
144 |
145 |
146 | #define cmperr(ack) do { fprintf(stderr, ack "\n"); return 1; } while (0)
147 |
148 | if (a->count != b->count) cmperr("prefix count");
149 |
150 | for (p = 0; p < pmax; p++) {
151 | const struct prefix * const curpref = &a->pref[p];
152 | const struct prefix * const curbpref = &b->pref[p];
153 |
154 | smax = curpref->count;
155 | if (curpref->count != curbpref->count) cmperr("suffix count");
156 | if (strcmp(curpref->prefix, curbpref->prefix)) cmperr("prefix");
157 | if (curpref->len != curbpref->len) cmperr("prefix length");
158 |
159 | for (s = 0; s < smax; s++) {
160 | const struct suffix * const cursuf = &curpref->suf[s];
161 | const struct suffix * const curbsuf = &curbpref->suf[s];
162 |
163 | nmax = cursuf->count;
164 | if (cursuf->count != curbsuf->count) cmperr("needle count");
165 | if (strcmp(cursuf->suffix, curbsuf->suffix)) cmperr("suffix");
166 |
167 | for (n = 0; n < nmax; n++) {
168 | const struct needle * const curneed = &cursuf->need[n];
169 | const struct needle * const curbneed = &curbsuf->need[n];
170 |
171 | if (curneed->len != curbneed->len)
172 | cmperr("needle len");
173 | if (curneed->wilds != curbneed->wilds)
174 | cmperr("needle wilds");
175 | if (curneed->longest != curbneed->longest)
176 | cmperr("needle longest");
177 | if (curneed->longlen != curbneed->longlen)
178 | cmperr("needle longlen");
179 | if (strcmp(curneed->needle, curbneed->needle))
180 | cmperr("needle");
181 | }
182 | }
183 | }
184 |
185 | #undef cmperr
186 |
187 | return 0;
188 | }
189 |
190 | void *poolalloc(struct urlctx * const ctx, u32 bytes) {
191 |
192 | /* Everything we return is 64-bit aligned.
193 |
194 | This is guaranteed by relying on our base
195 | pointer being ok, and only giving out
196 | multiples of 8. */
197 |
198 | while (bytes % 8 != 0)
199 | bytes++;
200 |
201 | if (ctx->used + bytes > ctx->storagelen)
202 | die("Storage OOM");
203 |
204 | const u32 cur = ctx->used;
205 | ctx->used += bytes;
206 |
207 | return ctx->storage + cur;
208 | }
209 |
--------------------------------------------------------------------------------
/opti.c:
--------------------------------------------------------------------------------
1 | /*
2 | liburlmatch - a fast URL matcher
3 | Copyright (C) 2013 Lauri Kasanen
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU Affero General Public License as published by
7 | the Free Software Foundation, version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | */
17 |
18 | #include "internal.h"
19 | #include "urlmatch.h"
20 | #include
21 |
22 | int url_save_optimized(const urlctx *ctx, const char file[]) {
23 |
24 | const int fd = open(file, O_WRONLY | O_CREAT, 0644);
25 | if (fd < 0)
26 | return 1;
27 |
28 | return url_save_optimized2(ctx, fd);
29 | }
30 |
31 | int url_save_optimized2(const urlctx *ctx, const int fd) {
32 |
33 | char *buf;
34 | size_t len;
35 |
36 | FILE *f = open_memstream(&buf, &len);
37 | if (!f) return 1;
38 |
39 | swrite(&ctx->count, 2, f);
40 | swrite(&ctx->storagelen, 4, f);
41 |
42 | u32 p, s, n;
43 | for (p = 0; p < ctx->count; p++) {
44 | const struct prefix * const curpref = &ctx->pref[p];
45 | swrite(&curpref->count, 2, f);
46 | swrite(curpref->prefix, 5, f);
47 | swrite(&curpref->len, 1, f);
48 |
49 | for (s = 0; s < curpref->count; s++) {
50 | const struct suffix * const cursuf = &curpref->suf[s];
51 | swrite(&cursuf->count, 2, f);
52 | swrite(cursuf->suffix, 2, f);
53 |
54 | for (n = 0; n < cursuf->count; n++) {
55 | const struct needle * const curneed = &cursuf->need[n];
56 | swrite(&curneed->len, 2, f);
57 | swrite(&curneed->wilds, 2, f);
58 | swrite(&curneed->longest, 2, f);
59 | swrite(&curneed->longlen, 2, f);
60 | swrite(curneed->needle, curneed->len + 1, f);
61 | }
62 | }
63 | }
64 |
65 | fclose(f);
66 |
67 | // Cool, a buffer. Let's compress it.
68 | u64 bound = compressBound(len);
69 | u8 *dest = xcalloc(bound, 1);
70 | if (compress2(dest, &bound, (u8 *) buf, len, 9) != Z_OK) return 2;
71 |
72 | free(buf);
73 |
74 | f = fdopen(fd, "w");
75 | if (!f) return 1;
76 |
77 | swrite(MAGIC, 3, f);
78 | swrite(&len, sizeof(size_t), f);
79 | swrite(dest, bound, f);
80 |
81 | free(dest);
82 | fclose(f);
83 | return 0;
84 | }
85 |
86 | static int finalcheck(const char find[], const u32 len,
87 | const char hay[], const u32 haylen) {
88 |
89 | // This is the core of the simple check
90 |
91 | u32 i, h = 0;
92 |
93 | for (i = 0; i < len; i++) {
94 | if (find[i] != '*') {
95 | if (find[i] != hay[h])
96 | return 0;
97 | h++;
98 | } else {
99 | // If multiple wildcards in a row, skip to the last
100 | while (find[i+1] == '*') i++;
101 |
102 | if (i == len - 1)
103 | return 1;
104 |
105 | // Wildcard, not last
106 | const char * const ender = strchrnul(&find[i + 1], '*');
107 | const u32 dist = ender - &find[i + 1];
108 |
109 | char piece[dist + 1];
110 | memcpy(piece, &find[i + 1], dist);
111 | piece[dist] = '\0';
112 |
113 | const char * const lastmatch = strrstr(&hay[h], piece);
114 | if (!lastmatch)
115 | return 0;
116 |
117 | // Is backtracking required?
118 | const char * const firstmatch = strstr(&hay[h], piece);
119 |
120 | // The dist check is to make sure this is not a suffix search
121 | if (firstmatch != lastmatch && dist != len - i - 1) {
122 | const u32 move = firstmatch - &hay[h];
123 | h += move;
124 | } else {
125 | const u32 move = lastmatch - &hay[h];
126 | h += move;
127 | }
128 | }
129 | }
130 |
131 | // We ran out of needle but not hay
132 | if (h != haylen) return 0;
133 |
134 | return 1;
135 |
136 | }
137 |
138 | static void getsuffixlen(const char str[], char suf[3], const u32 len) {
139 |
140 | if (len == 1) {
141 | suf[0] = str[0];
142 | suf[1] = '\0';
143 | return;
144 | }
145 |
146 | suf[0] = str[len - 2];
147 | suf[1] = str[len - 1];
148 | suf[2] = '\0';
149 | }
150 |
151 | int url_match(const urlctx * const ctx, const char haystack[]) {
152 |
153 | const u32 len = strlen(haystack);
154 | char suf[3], pref[6];
155 |
156 | if (len < 1) return 0;
157 | getsuffixlen(haystack, suf, len);
158 |
159 | strncpy(pref, haystack, 5);
160 | pref[5] = '\0';
161 |
162 | u32 p, s;
163 |
164 | // Find all applicable prefixes
165 | const u32 pmax = ctx->count;
166 | for (p = 0; p < pmax; p++) {
167 | const struct prefix * const curpref = &ctx->pref[p];
168 |
169 | // Does this prefix match?
170 | if (curpref->prefix[0] != '*') {
171 | int ret = strncmp(pref, curpref->prefix, curpref->len);
172 | if (ret > 0)
173 | continue;
174 | if (ret < 0)
175 | break;
176 | }
177 |
178 | const u32 smax = curpref->count;
179 | for (s = 0; s < smax; s++) {
180 | const struct suffix * const cursuf = &curpref->suf[s];
181 |
182 | // Does this suffix match?
183 | if (cursuf->suffix[0] != '*' &&
184 | suffixcmp(suf, cursuf->suffix))
185 | continue;
186 |
187 | // OK, we have to test all needles in this suffix.
188 | u32 n;
189 | const u32 nmax = cursuf->count;
190 | for (n = 0; n < nmax; n++) {
191 | const struct needle * const curneed = &cursuf->need[n];
192 |
193 | // First: no wildcards
194 | if (!curneed->wilds) {
195 | // Do the lengths match?
196 | if (len != curneed->len)
197 | continue;
198 | if (!strcmp(haystack, curneed->needle))
199 | return 1;
200 | } else {
201 | // Is the longest streak in it?
202 | if (curneed->longlen) {
203 | if (curneed->longlen >= 4) {
204 | if (!memmem(haystack, len,
205 | curneed->needle + curneed->longest,
206 | curneed->longlen))
207 | continue;
208 | } else {
209 | if (!memchr(haystack,
210 | curneed->needle[curneed->longest],
211 | len))
212 | continue;
213 | }
214 | }
215 |
216 | // The prefix and suffix match, and it contains
217 | // the longest streak. Do the actual comparison.
218 | if (finalcheck(curneed->needle, curneed->len,
219 | haystack, len))
220 | return 1;
221 | }
222 | }
223 | }
224 | }
225 |
226 | return 0;
227 | }
228 |
229 | void url_free(urlctx *ctx) {
230 |
231 | free(ctx->storage);
232 | free(ctx);
233 | }
234 |
--------------------------------------------------------------------------------
/opti_init.c:
--------------------------------------------------------------------------------
1 | /*
2 | liburlmatch - a fast URL matcher
3 | Copyright (C) 2013 Lauri Kasanen
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU Affero General Public License as published by
7 | the Free Software Foundation, version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | */
17 |
18 | #include "internal.h"
19 | #include "urlmatch.h"
20 | #include
21 |
22 | static urlctx *initbin(FILE * const f, const u32 inlen) {
23 |
24 | size_t len;
25 | sread(&len, sizeof(size_t), f);
26 |
27 | u8 * const src = xcalloc(inlen, 1);
28 | u8 *buf = xcalloc(len, 1);
29 | u8 * const origbuf = buf;
30 |
31 | sread(src, inlen, f);
32 | if (uncompress(buf, &len, src, inlen) != Z_OK) return NULL;
33 | free(src);
34 |
35 | // Cool, unpacked. Read it.
36 | urlctx * const out = xcalloc(sizeof(urlctx), 1);
37 |
38 | memcpy(&out->count, buf, 2);
39 | buf += 2;
40 |
41 | memcpy(&out->storagelen, buf, 4);
42 | buf += 4;
43 |
44 | out->storage = xcalloc(out->storagelen, 1);
45 | out->pref = poolalloc(out, sizeof(struct prefix) * out->count);
46 | u32 p, s, n;
47 |
48 | for (p = 0; p < out->count; p++) {
49 | struct prefix * const curpref = &out->pref[p];
50 |
51 | memcpy(&curpref->count, buf, 2);
52 | buf += 2;
53 | memcpy(curpref->prefix, buf, 5);
54 | buf += 5;
55 | curpref->len = *buf;
56 | buf++;
57 |
58 | curpref->suf = poolalloc(out, sizeof(struct suffix) * curpref->count);
59 |
60 | for (s = 0; s < curpref->count; s++) {
61 | struct suffix * const cursuf = &curpref->suf[s];
62 |
63 | memcpy(&cursuf->count, buf, 2);
64 | buf += 2;
65 | memcpy(cursuf->suffix, buf, 2);
66 | buf += 2;
67 |
68 | cursuf->need = poolalloc(out, sizeof(struct needle) * cursuf->count);
69 |
70 | for (n = 0; n < cursuf->count; n++) {
71 | struct needle * const curneed = &cursuf->need[n];
72 |
73 | memcpy(&curneed->len, buf, 2);
74 | buf += 2;
75 | memcpy(&curneed->wilds, buf, 2);
76 | buf += 2;
77 | memcpy(&curneed->longest, buf, 2);
78 | buf += 2;
79 | memcpy(&curneed->longlen, buf, 2);
80 | buf += 2;
81 |
82 | curneed->needle = poolalloc(out, curneed->len + 1);
83 |
84 | memcpy((char *) curneed->needle, buf, curneed->len + 1);
85 | buf += curneed->len + 1;
86 | }
87 | }
88 | }
89 |
90 | free(origbuf);
91 | return out;
92 | }
93 |
94 | urlctx *url_init_file(const char file[]) {
95 |
96 | const int fd = open(file, O_RDONLY);
97 | if (fd < 0)
98 | return NULL;
99 |
100 | return url_init_file2(fd);
101 | }
102 |
103 | urlctx *url_init_file2(const int fd) {
104 |
105 | FILE * const f = fdopen(fd, "r");
106 | if (!f) return NULL;
107 |
108 | fseek(f, 0, SEEK_END);
109 | const long len = ftell(f);
110 | rewind(f);
111 |
112 | char buf[4] = { 0 };
113 | fread(buf, 3, 1, f);
114 |
115 | urlctx *out = NULL;
116 |
117 | // Binary format
118 | if (!strcmp(buf, MAGIC)) {
119 | out = initbin(f, len - 3 - sizeof(size_t));
120 | } else { // Text format
121 | rewind(f);
122 |
123 | char *tmp = xcalloc(len, 1);
124 | if (fread(tmp, len, 1, f) != 1) die("Failed reading");
125 | out = url_init(tmp);
126 | free(tmp);
127 | }
128 |
129 | fclose(f);
130 | return out;
131 | }
132 |
/* Compares the first five characters of two patterns, strncmp-style.
 * A pattern for which wildprefix() is true is compared as if it were the
 * string "*", so two such patterns always compare equal. */
static int wildpfxcmp(const char a[], const char b[]) {

	const char * const lhs = wildprefix(a) ? "*" : a;
	const char * const rhs = wildprefix(b) ? "*" : b;

	return strncmp(lhs, rhs, 5);
}
148 |
/* Collapses a stored prefix to "*" followed by four NUL bytes when
 * wildprefix() reports it as a wildcard prefix; otherwise leaves it alone. */
static void preparepfx(char str[]) {

	if (wildprefix(str)) {
		str[0] = '*';
		memset(&str[1], 0, 4);
	}
}
157 |
/* qsort comparator over an array of pattern strings.
 * Primary key: the prefix, via wildpfxcmp. Secondary key: the suffix. */
static int cstrcmp(const void * const p1, const void * const p2) {

	const char * const lhs = *(const char * const *) p1;
	const char * const rhs = *(const char * const *) p2;

	const int byprefix = wildpfxcmp(lhs, rhs);
	if (byprefix != 0)
		return byprefix;

	// Same prefix bucket: order by suffix
	char lsuf[3] = "";
	char rsuf[3] = "";

	getsuffix(lhs, lsuf);
	getsuffix(rhs, rsuf);

	return strcmp(lsuf, rsuf);
}
175 |
176 | static void calclongest(const char needle[], const u16 len, const u16 wilds,
177 | u16 * const longest, u16 * const longlen) {
178 |
179 | // Easy path
180 | if (wilds == 1) {
181 | const char *ptr = strchr(needle, '*');
182 | const u16 pos = ptr - needle;
183 | const u16 half = len / 2;
184 |
185 | if (pos < half) {
186 | *longlen = len - pos - 1;
187 | *longest = pos + 1;
188 | } else {
189 | *longlen = pos;
190 | *longest = 0;
191 | }
192 | } else {
193 | u16 max = 0;
194 | u16 maxlen = 0;
195 |
196 | const char *ptr = needle;
197 | while (*ptr) {
198 | const char * const next = strchrnul(ptr, '*');
199 | const u16 thislen = next - ptr;
200 |
201 | if (maxlen < thislen) {
202 | maxlen = thislen;
203 | max = ptr - needle;
204 | }
205 |
206 | if (!*next) break;
207 | ptr = next + 1;
208 | }
209 |
210 | *longest = max;
211 | *longlen = maxlen;
212 | }
213 | }
214 |
215 | static void addneedle(urlctx * const ctx, struct needle * const to, const char from[]) {
216 |
217 | const u32 len = strlen(from);
218 | to->needle = poolalloc(ctx, len + 1);
219 | memcpy((char *) to->needle, from, len + 1);
220 |
221 | to->len = len;
222 | to->wilds = countwilds(from);
223 |
224 | if (to->wilds)
225 | calclongest(from, to->len, to->wilds, &to->longest, &to->longlen);
226 | }
227 |
228 | urlctx *url_init(const char contents[]) {
229 |
230 | u32 lines = 1;
231 | const char *ptr = contents;
232 | const u32 contentlen = strlen(contents);
233 | const char * const endbyte = ptr + contentlen;
234 | for (; *ptr; ptr++) {
235 | if (*ptr == '\n') lines++;
236 | }
237 |
238 | char **outlines = xcalloc(lines, sizeof(char *));
239 | const u32 origlines = lines;
240 |
241 | // Copy each pattern line to its own space, and optimize on the way
242 | ptr = contents;
243 | u32 i = 0, j;
244 | while (1) {
245 | const char * const end = strchrnul(ptr, '\n');
246 | const u32 len = end - ptr;
247 |
248 | if (len < 1) {
249 | ptr = end + 1;
250 | if (ptr >= endbyte) { i--; break; }
251 | continue;
252 | }
253 |
254 | char tmp[len + 1];
255 | tmp[len] = '\0';
256 | memcpy(tmp, ptr, len);
257 |
258 | outlines[i] = xcalloc(len + 1, 1);
259 |
260 | u32 p, o;
261 | outlines[i][0] = tmp[0];
262 | for (p = 1, o = 1; p < len; p++) {
263 | if (tmp[p - 1] == '*' && tmp[p] == '*') {
264 | continue;
265 | }
266 | outlines[i][o] = tmp[p];
267 |
268 | o++;
269 | }
270 |
271 | if (!*end) break;
272 | ptr = end + 1;
273 | i++;
274 | }
275 |
276 | lines = i + 1;
277 | qsort(outlines, lines, sizeof(char *), cstrcmp);
278 |
279 | urlctx * const out = xcalloc(sizeof(urlctx), 1);
280 | // The theoretical maximum amount needed
281 | out->storagelen = contentlen + 1 +
282 | lines * (sizeof(struct suffix) +
283 | sizeof(struct needle) +
284 | sizeof(struct prefix) + 8);
285 | out->storage = xcalloc(out->storagelen, 1);
286 |
287 | // How many prefixes do we have?
288 | u32 prefixes = 1;
289 | for (i = 1; i < lines; i++) {
290 | if (strncmp(outlines[i - 1], outlines[i], 5) &&
291 | (!wildprefix(outlines[i - 1]) || !wildprefix(outlines[i])))
292 | prefixes++;
293 | }
294 |
295 | out->count = prefixes;
296 | out->pref = poolalloc(out, sizeof(struct prefix) * prefixes);
297 |
298 | // Add each prefix
299 | prefixes = 1;
300 | strncpy(out->pref[0].prefix, outlines[0], 5);
301 | preparepfx(out->pref[0].prefix);
302 | out->pref[0].len = strlen(out->pref[0].prefix);
303 | for (i = 1; i < lines; i++) {
304 | if (wildpfxcmp(outlines[i - 1], outlines[i])) {
305 | strncpy(out->pref[prefixes].prefix, outlines[i], 5);
306 |
307 | preparepfx(out->pref[prefixes].prefix);
308 | out->pref[prefixes].len = strlen(out->pref[prefixes].prefix);
309 |
310 | prefixes++;
311 | }
312 | }
313 |
314 | // For each prefix, how many suffixes are there?
315 | for (i = 0; i < out->count; i++) {
316 |
317 | struct prefix * const curpref = &out->pref[i];
318 |
319 | u32 suffixes = 0;
320 | char prevsuf[3] = { 0 };
321 | for (j = 0; j < lines; j++) {
322 | const int ret = wildpfxcmp(curpref->prefix, outlines[j]);
323 |
324 | if (ret > 0) continue;
325 | if (ret < 0) break;
326 |
327 | char suf[3];
328 | getsuffix(outlines[j], suf);
329 | if (strcmp(prevsuf, suf)) suffixes++;
330 | memcpy(prevsuf, suf, 3);
331 | }
332 |
333 | curpref->suf = poolalloc(out, sizeof(struct suffix) * suffixes);
334 | curpref->count = suffixes;
335 |
336 | // For each suffix, how many needles do we have?
337 | suffixes = 0;
338 | prevsuf[0] = prevsuf[1] = 0;
339 | for (j = 0; j < lines; j++) {
340 | const int ret = wildpfxcmp(curpref->prefix, outlines[j]);
341 |
342 | if (ret > 0) continue;
343 | if (ret < 0) break;
344 |
345 | char suf[3];
346 | getsuffix(outlines[j], suf);
347 | if (strcmp(prevsuf, suf)) {
348 | curpref->suf[suffixes].count = 1;
349 | memcpy(curpref->suf[suffixes].suffix, suf, 3);
350 |
351 | suffixes++;
352 | } else {
353 | curpref->suf[suffixes - 1].count++;
354 | }
355 | memcpy(prevsuf, suf, 3);
356 | }
357 |
358 | // Allocate the needle counts
359 | for (j = 0; j < curpref->count; j++) {
360 | curpref->suf[j].need = poolalloc(out, sizeof(struct needle) *
361 | curpref->suf[j].count);
362 | }
363 |
364 | // For each suffix, save the needles
365 | suffixes = 0;
366 | prevsuf[0] = prevsuf[1] = 0;
367 | for (j = 0; j < lines; j++) {
368 | const int ret = wildpfxcmp(curpref->prefix, outlines[j]);
369 |
370 | if (ret > 0) continue;
371 | if (ret < 0) break;
372 |
373 | char suf[3];
374 | getsuffix(outlines[j], suf);
375 | if (strcmp(prevsuf, suf)) {
376 | struct suffix * const cursuf = &curpref->suf[suffixes];
377 |
378 | cursuf->count = 1;
379 | memcpy(cursuf->suffix, suf, 3);
380 | addneedle(out, &cursuf->need[0], outlines[j]);
381 | suffixes++;
382 | } else {
383 | struct suffix * const cursuf = &curpref->suf[suffixes - 1];
384 |
385 | addneedle(out, &cursuf->need[cursuf->count],
386 | outlines[j]);
387 | cursuf->count++;
388 | }
389 | memcpy(prevsuf, suf, 3);
390 | }
391 | }
392 |
393 | for (i = 0; i < origlines; i++) free(outlines[i]);
394 | free(outlines);
395 |
396 | // Refresh storage size, so that binary save + load doesn't waste space
397 | out->storagelen = out->used;
398 |
399 | return out;
400 | }
401 |
--------------------------------------------------------------------------------
/rated.c:
--------------------------------------------------------------------------------
1 | /*
2 | liburlmatch - a fast URL matcher
3 | Copyright (C) 2013 Lauri Kasanen
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU Affero General Public License as published by
7 | the Free Software Foundation, version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | */
17 |
18 | #include "internal.h"
19 | #include "urlmatch.h"
20 |
21 | #include
22 |
23 | static u32 wordlen(const char *start) {
24 |
25 | const char * const orig = start;
26 |
27 | while (!isspace(*start) && *start) start++;
28 |
29 | return start - orig;
30 | }
31 |
/* Skips leading whitespace and returns a pointer to the first
 * non-whitespace character, or to the terminating NUL if there is none. */
static const char *nextword(const char *ptr) {

	// The ctype functions require a value representable as unsigned char;
	// a plain char may be negative for bytes above 0x7f.
	// isspace('\0') is false, so the loop also stops at the end of string.
	while (isspace((unsigned char) *ptr)) ptr++;

	return ptr;
}
38 |
39 | int ratedsearch(const char needle[], const char haystack[]) {
40 |
41 | // For each source word, if it's present in haystack, increment score.
42 | // IOW, a simple google-like search.
43 | const u32 tmplen = 320;
44 | char tmp[tmplen];
45 |
46 | const char *cur = nextword(needle);
47 | u32 wlen = wordlen(cur);
48 | u32 score = 0;
49 |
50 | while (*cur) {
51 | if (wlen >= tmplen)
52 | return -1;
53 | memcpy(tmp, cur, wlen);
54 | tmp[wlen] = '\0';
55 |
56 | if (strcasestr(haystack, tmp))
57 | score++;
58 |
59 | cur += wlen;
60 | cur = nextword(cur);
61 | wlen = wordlen(cur);
62 | }
63 |
64 | return score;
65 | }
66 |
--------------------------------------------------------------------------------
/simple.c:
--------------------------------------------------------------------------------
1 | /*
2 | liburlmatch - a fast URL matcher
3 | Copyright (C) 2013 Lauri Kasanen
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU Affero General Public License as published by
7 | the Free Software Foundation, version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | */
17 |
18 | #include "urlmatch.h"
19 | #include "internal.h"
20 |
21 | int url_simplematch(const char find[], const char hay[]) {
22 |
23 | const u32 wilds = countwilds(find);
24 |
25 | // Easiest path: no wildcards
26 | if (!wilds) {
27 | return strcmp(find, hay) == 0;
28 | }
29 |
30 | const u32 len = strlen(find);
31 | u32 i, h = 0;
32 |
33 | for (i = 0; i < len; i++) {
34 | if (find[i] != '*') {
35 | if (find[i] != hay[h])
36 | return 0;
37 | h++;
38 | } else {
39 | // If multiple wildcards in a row, skip to the last
40 | while (find[i+1] == '*') i++;
41 |
42 | if (i >= len - 1)
43 | return 1;
44 |
45 | // Wildcard, not last
46 | const char * const ender = strchrnul(&find[i + 1], '*');
47 | const u32 dist = ender - &find[i + 1];
48 |
49 | char piece[dist + 1];
50 | memcpy(piece, &find[i + 1], dist);
51 | piece[dist] = '\0';
52 |
53 | const char * const lastmatch = strrstr(&hay[h], piece);
54 | if (!lastmatch)
55 | return 0;
56 |
57 | // Is backtracking required?
58 | const char * const firstmatch = strstr(&hay[h], piece);
59 |
60 | // The dist check is to make sure this is not a suffix search
61 | if (firstmatch != lastmatch && dist != len - i - 1) {
62 | const u32 move = firstmatch - &hay[h];
63 | h += move;
64 | } else {
65 | const u32 move = lastmatch - &hay[h];
66 | h += move;
67 | }
68 | }
69 | }
70 |
71 | // We ran out of needle but not hay
72 | if (h != strlen(hay)) return 0;
73 |
74 | return 1;
75 | }
76 |
--------------------------------------------------------------------------------
/test/Makefile:
--------------------------------------------------------------------------------
# Builds every *.c in this directory into its own test binary, linked
# against ../liburlmatch.a, then runs them all through run-tests.sh.
SRC = $(wildcard *.c)
TARGETS = $(SRC:.c=)

CFLAGS += -Wall -Wextra -g
CPPFLAGS += -I ..
# Libraries go in LDLIBS: the implicit link rule places LDFLAGS before the
# objects, where -lz may be discarded before the archive that needs it is seen.
LDLIBS += -lz

.PHONY: all clean

all: $(TARGETS)
	@./run-tests.sh $(TARGETS)

$(TARGETS): ../liburlmatch.a

clean:
	rm -f *.o $(TARGETS)
17 |
--------------------------------------------------------------------------------
/test/allocfree.c:
--------------------------------------------------------------------------------
1 | #include "test.h"
2 | #include "internal.h"
3 |
4 | int main() {
5 |
6 | const char pat[] =
7 | "http://moi\n"
8 | "https://katti\n"
9 | "moido\n"
10 | "http://hoi\n"
11 | "http://google*";
12 |
13 | urlctx *ctx = url_init(pat);
14 | url_free(ctx);
15 |
16 | const char pat2[] =
17 | "http://moi\n"
18 | "https://katti\n"
19 | "moido\n"
20 | "http://hoi\n"
21 | "http://google*\n\n\n";
22 |
23 | ctx = url_init(pat2);
24 | url_free(ctx);
25 |
26 | const char pat3[] =
27 | "http://moi\n"
28 | "https://katti\n\n\n"
29 | "moido\n\n"
30 | "http://hoi\n"
31 | "http://google*\n\n";
32 |
33 | ctx = url_init(pat3);
34 | printctx(ctx);
35 | url_free(ctx);
36 |
37 | return 0;
38 | }
39 |
--------------------------------------------------------------------------------
/test/bin.c:
--------------------------------------------------------------------------------
1 | #include "test.h"
2 | #include "internal.h"
3 | #include
4 |
5 | int main() {
6 |
7 | /* Repeatedly save and load the context, then compare it to the first one. */
8 |
9 | urlctx *ctx = url_init(
10 | "http://*gooogle*\n"
11 | "ftp://fooo\n"
12 | "*adwords\n"
13 | "http*//*.php");
14 |
15 | // Yes yes, insecure mktemp. This is a unit test.
16 | char name[] = "/tmp/bintestXXXXXX";
17 | mktemp(name);
18 |
19 | if (url_save_optimized(ctx, name)) fail("save failed\n");
20 |
21 | u32 i;
22 | urlctx *tmp;
23 | for (i = 0; i < 20; i++) {
24 | tmp = url_init_file(name);
25 | if (!tmp) fail("load failed\n");
26 | if (url_save_optimized(tmp, name)) fail("save failed\n");
27 | url_free(tmp);
28 | }
29 | tmp = url_init_file(name);
30 |
31 | if (ctxcmp(ctx, tmp)) fail("compare failed\n");
32 |
33 | url_free(ctx);
34 | url_free(tmp);
35 | unlink(name);
36 | return 0;
37 | }
38 |
--------------------------------------------------------------------------------
/test/count.c:
--------------------------------------------------------------------------------
1 | #include "test.h"
2 | #include "internal.h"
3 |
4 | void test(const char str[], const u32 result) {
5 |
6 | u32 foo = countwilds(str);
7 |
8 | if (foo != result)
9 | fail("Got %u expected %u for %s\n", foo, result, str);
10 | }
11 |
12 | int main() {
13 |
14 | test("", 0);
15 | test("ddd", 0);
16 | test("dgfdsfsdgd", 0);
17 |
18 | test("*", 1);
19 | test("**", 2);
20 | test("***", 3);
21 | test("****", 4);
22 |
23 | test("*foo*bar", 2);
24 | test("*foobar*", 2);
25 |
26 | return 0;
27 | }
28 |
--------------------------------------------------------------------------------
/test/opti.c:
--------------------------------------------------------------------------------
1 | #include "test.h"
2 |
3 | void test(const char needle[], const char hay[], const int res) {
4 |
5 | urlctx *ctx = url_init(needle);
6 |
7 | if (url_match(ctx, hay) != res)
8 | fail("%s in %s, expected %u\n", needle, hay, res);
9 |
10 | url_free(ctx);
11 | }
12 |
/* Runs the optimized matcher over a table of pattern/haystack pairs. */
int main() {

	static const struct {
		const char *needle;
		const char *hay;
		int res;
	} cases[] = {
		{ "foo", "bar", 0 },
		{ "foo", "fo", 0 },

		{ "foo", "foo", 1 },
		{ "foo", "foofoo", 0 },
		{ "foo", "barfoo", 0 },
		{ "foo", "foofoobar", 0 },

		{ "*", "ff", 1 },
		{ "*", "gdfgfd", 1 },
		{ "*", "*g****", 1 },
		{ "*", "*", 1 },

		{ "*foo", "foo", 1 },
		{ "*foo", "foofoo", 1 },
		{ "*foo", "ofoo", 1 },
		{ "*foo", "fo", 0 },
		{ "*foo", "oof", 0 },
		{ "f****f", "fof", 1 },
		{ "f****f", "ff", 1 },
		{ "**f****f", "ff", 1 },
		{ "f****f**", "ff", 1 },
		{ "**f****f**", "ff", 1 },

		{ "*foo*", "foo", 1 },
		{ "*foo*", "bfoob", 1 },
		{ "*foo*", "*foo*", 1 },
		{ "*foo*", "foishbar", 0 },
		{ "*foo*", "foko", 0 },
		{ "*foo*", "fokooooooooooooofoo", 1 },

		{ "*f*o*o*", "foo", 1 },
		{ "*f*o*o*", "fffffoffoff", 1 },
		{ "*f*oo", "foo", 1 },
		{ "*f*oo", "foof", 0 },
		{ "*f*oo", "fff kkk foo", 1 },

		{ "f*", "foo", 1 },
		{ "f*f", "foof", 1 },
		{ "f*f", "ffffooffff", 1 },
		{ "f*f", "foo", 0 },
		{ "fkilla*", "foo", 0 },
		{ "fkilla*", "fkillyyy", 0 },
		{ "fkilla*", "fkilla", 1 },
		{ "fkilla*", "fkillamogfgf", 1 },
	};

	unsigned i;
	for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		test(cases[i].needle, cases[i].hay, cases[i].res);

	return 0;
}
64 |
--------------------------------------------------------------------------------
/test/rated.c:
--------------------------------------------------------------------------------
1 | #include "test.h"
2 |
/* Fails the test program unless ratedsearch(needle, hay) returns res. */
static void test(const char needle[], const char hay[], const int res) {

	const int ret = ratedsearch(needle, hay);

	// Both values are ints, so both are printed with %d
	if (ret != res)
		fail("'%s' in '%s', expected %d got %d\n", needle, hay, res, ret);
}
9 |
/* Runs ratedsearch over a table of queries, haystacks and expected scores. */
int main() {

	static const struct {
		const char *needle;
		const char *hay;
		int score;
	} cases[] = {
		{ "masa", "doigfhdfoighfdoignvoifd hoidfvhiofdhvoifd hoifhgifdo hofdi", 0 },
		{ "masa", "masa", 1 },
		{ " masa ", "masa", 1 },
		{ "nightwish lyric ", "The best of Nightwish: Sleeping Sun LYRICS", 2 },
		{ "NIGHtwish lyric ", "The best of Nightwish: Sleeping Sun LYRICS", 2 },
		{ "johnson core coil", "http://www.google.com?search=kalle+masa+kaupassa", 0 },
		{ "johnson core coil", "http://www.google.com?search=kalle+masa+kaupassa+johnso+coil", 1 },
		{ "johnson core coil", "http://www.google.com?search=core+coil+johnson", 3 },
		{ "", "", 0 },
	};

	unsigned i;
	for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		test(cases[i].needle, cases[i].hay, cases[i].score);

	return 0;
}
24 |
--------------------------------------------------------------------------------
/test/run-tests.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Runs every test binary named on the command line and prints a summary.
# A test's stdout goes to <test>.log, which is kept only if the test
# failed and the log is non-empty.
# Tests starting with x are expected to fail


export LANG=C

GREEN="$(echo -e '\033[1;32m')"
YELLOW="$(echo -e '\033[0;33m')"
RED="$(echo -e '\033[1;31m')"
NORMAL="$(echo -e '\033[0;39m')"

success=0
fail=0

[ "$#" -lt 1 ] && exit

for bin in "$@"; do
	[ ! -f "$bin" ] && continue
	[ ! -x "$bin" ] && continue

	test=$bin
	log=${test}.log

	ret=0
	case $test in x*) ret=1 ;; esac

	echo -n "Running test $test... "
	"./$test" > "$log"
	if [ $? -ne $ret ]; then
		fail=$((fail + 1))
		echo "${RED}Failed $NORMAL"
	else
		success=$((success + 1))
		echo
		rm -f "$log"
	fi

	# If empty, remove
	[ ! -s "$log" ] && rm -f "$log"
done

echo

total=$((fail + success))

# None of the arguments was a runnable test: avoid dividing by zero below.
if [ "$total" -eq 0 ]; then
	echo "No tests were run"
	exit
fi

percentage=$(awk "BEGIN{print $success/$total * 100}")
percentage=$(printf '%.2f' "$percentage")

num=${percentage//.*/}

[ "$fail" -eq 0 ] && echo "$GREEN All tests passed!"
[ "$fail" -ne 0 ] && [ "$num" -ge 60 ] && echo "$YELLOW $percentage% passed, $fail/$total fails"
[ "$num" -lt 60 ] && echo "$RED $percentage% passed, $fail/$total fails"

echo "$NORMAL"
55 |
--------------------------------------------------------------------------------
/test/shortrule.c:
--------------------------------------------------------------------------------
1 | #include "test.h"
2 | #include "internal.h"
3 |
4 | int main() {
5 |
6 | const char pat[] = "moi";
7 |
8 | urlctx *ctx = url_init(pat);
9 | printctx(ctx);
10 | url_free(ctx);
11 |
12 | return 0;
13 | }
14 |
--------------------------------------------------------------------------------
/test/simple.c:
--------------------------------------------------------------------------------
1 | #include "test.h"
2 |
/* Fails the test program unless url_simplematch(needle, hay) returns res. */
void test(const char needle[], const char hay[], const int res) {

	// res is an int, so print it with %d
	if (url_simplematch(needle, hay) != res)
		fail("%s in %s, expected %d\n", needle, hay, res);
}
8 |
/* Runs url_simplematch over a table of pattern/haystack pairs. */
int main() {

	static const struct {
		const char *needle;
		const char *hay;
		int res;
	} cases[] = {
		{ "foo", "bar", 0 },
		{ "foo", "fo", 0 },

		{ "foo", "foo", 1 },
		{ "foo", "foofoo", 0 },
		{ "foo", "barfoo", 0 },
		{ "foo", "foofoobar", 0 },

		{ "*", "", 1 },
		{ "*", "ff", 1 },
		{ "*", "gdfgfd", 1 },
		{ "*", "*g****", 1 },
		{ "*", "*", 1 },

		{ "*foo", "foo", 1 },
		{ "*foo", "foofoo", 1 },
		{ "*foo", "ofoo", 1 },
		{ "*foo", "fo", 0 },
		{ "*foo", "oof", 0 },
		{ "f****f", "fof", 1 },
		{ "f****f", "ff", 1 },
		{ "**f****f", "ff", 1 },
		{ "f****f**", "ff", 1 },
		{ "**f****f**", "ff", 1 },

		{ "*foo*", "foo", 1 },
		{ "*foo*", "bfoob", 1 },
		{ "*foo*", "*foo*", 1 },
		{ "*foo*", "foishbar", 0 },
		{ "*foo*", "foko", 0 },
		{ "*foo*", "fokooooooooooooofoo", 1 },

		{ "*f*o*o*", "foo", 1 },
		{ "*f*o*o*", "fffffoffoff", 1 },
		{ "*f*oo", "foo", 1 },
		{ "*f*oo", "foof", 0 },
		{ "*f*oo", "fff kkk foo", 1 },

		{ "f*", "foo", 1 },
		{ "f*f", "foof", 1 },
		{ "f*f", "ffffooffff", 1 },
		{ "f*f", "foo", 0 },
		{ "fkilla*", "foo", 0 },
		{ "fkilla*", "fkillyyy", 0 },
		{ "fkilla*", "fkilla", 1 },
		{ "fkilla*", "fkillamogfgf", 1 },
	};

	unsigned i;
	for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		test(cases[i].needle, cases[i].hay, cases[i].res);

	return 0;
}
61 |
--------------------------------------------------------------------------------
/test/str.c:
--------------------------------------------------------------------------------
1 | #include "test.h"
2 | #include "internal.h"
3 |
4 | static void test(const char hay[], const char needle[], const u32 pos) {
5 |
6 | const char * const ptr = strrstr(hay, needle);
7 | const char * const exists = strstr(hay, needle);
8 |
9 | if (!exists) {
10 | if (ptr)
11 | fail("False positive\n");
12 | return;
13 | }
14 |
15 | const u32 tmp = ptr - hay;
16 | if (tmp != pos)
17 | fail("%s in %s, wanted %u got %u\n",
18 | needle, hay, pos, tmp);
19 | }
20 |
21 | static void suf(const char one[], const char two[], const u32 nomatch) {
22 | const u32 res = suffixcmp(one, two);
23 |
24 | if (res != nomatch)
25 | fail("suffixcmp %s %s got %u expected %u\n",
26 | one, two, res, nomatch);
27 | }
28 |
29 | int main() {
30 |
31 | test("foo", "bar", 0);
32 | test("foo", "fo", 0);
33 |
34 | test("foo", "foo", 0);
35 | test("foo", "foofoo", 3);
36 | test("foo", "barfoo", 3);
37 | test("foo", "foofoobar", 3);
38 |
39 | suf("aa", "ab", 1);
40 | suf("bb", "ab", 1);
41 | suf("a", "ab", 1);
42 | suf("aa", "b", 1);
43 |
44 | suf("aa", "aa", 0);
45 | suf("bb", "bb", 0);
46 | suf("aa", "a", 0);
47 | suf("a", "aa", 0);
48 |
49 | return 0;
50 | }
51 |
--------------------------------------------------------------------------------
/test/test.h:
--------------------------------------------------------------------------------
1 | #ifndef TEST_H
2 | #define TEST_H
3 |
4 | #include
5 | #include
6 | #include
7 |
8 | #include "lrtypes.h"
9 | #include "urlmatch.h"
10 |
/* Prints a printf-style message to stdout and exits with status 1. */
static inline void fail(const char fmt[], ...) {

	va_list args;

	va_start(args, fmt);
	vfprintf(stdout, fmt, args);
	va_end(args);

	exit(1);
}
22 |
23 | #endif
24 |
--------------------------------------------------------------------------------
/urlmatch.h:
--------------------------------------------------------------------------------
1 | /*
2 | liburlmatch - a fast URL matcher
3 | Copyright (C) 2013 Lauri Kasanen
4 |
5 | This program is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU Affero General Public License as published by
7 | the Free Software Foundation, version 3 of the License.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU Affero General Public License for more details.
13 |
14 | You should have received a copy of the GNU Affero General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | */
17 |
18 | #ifndef URLMATCH_H
19 | #define URLMATCH_H
20 |
21 | #ifdef __cplusplus
22 | extern "C" {
23 | #endif
24 |
// Let's help the compiler: function attributes where GCC-compatible
// compilers support them, empty expansions everywhere else.
#if __GNUC__ >= 4

#define PURE_FUNC __attribute__ ((pure))
#define NORETURN_FUNC __attribute__ ((noreturn))
#define CONST_FUNC __attribute__ ((const))
#define WUR_FUNC __attribute__ ((warn_unused_result))
#define NONNULL(A) __attribute__ ((nonnull (A)))
#else // GNUC

#define PURE_FUNC
#define NORETURN_FUNC
#define CONST_FUNC
#define WUR_FUNC
// Must take the argument too, or NONNULL(1) would expand to a stray "(1)".
#define NONNULL(A)

#endif // GNUC
42 |
43 | // Returns 1 if haystack matches pattern, 0 otherwise.
44 | int url_simplematch(const char pattern[], const char haystack[]) WUR_FUNC PURE_FUNC;
45 |
46 | /* These two functions initialize the optimized pattern matcher.
47 | * _init takes a char array of patterns, one per line.
48 | * _init_file takes a filename, either a text file containing one pattern per line,
49 | * or an optimized binary file as saved by _save_optimized.
50 | *
51 | * On error they return NULL. */
52 | typedef struct urlctx urlctx;
53 | urlctx *url_init_file(const char file[]) WUR_FUNC;
54 | urlctx *url_init_file2(const int fd) WUR_FUNC;
55 | urlctx *url_init(const char contents[]) WUR_FUNC;
56 |
57 | // Save an optimized binary file for faster loading later. Returns 0 on success.
58 | int url_save_optimized(const urlctx *ctx, const char file[]) WUR_FUNC NONNULL(1);
59 | int url_save_optimized2(const urlctx *ctx, const int fd) WUR_FUNC NONNULL(1);
60 |
61 | /* Returns 1 if haystack matches the optimized pattern, 0 otherwise.
62 | *
63 | * It's safe to call from multiple threads at once, with the same context. */
64 | int url_match(const urlctx *ctx, const char haystack[]) WUR_FUNC PURE_FUNC NONNULL(1);
65 |
66 | // Frees this context.
67 | void url_free(urlctx *ctx) NONNULL(1);
68 |
69 | /* Auxiliary function for e.g. searching bookmarks
70 | *
71 | * Returns the match score, higher the better. -1 is returned on error. */
72 | int ratedsearch(const char needle[], const char haystack[]) WUR_FUNC PURE_FUNC;
73 |
74 | #undef PURE_FUNC
75 | #undef NORETURN_FUNC
76 | #undef CONST_FUNC
77 | #undef WUR_FUNC
78 | #undef NONNULL
79 |
80 | #ifdef __cplusplus
81 | } // extern C
82 | #endif
83 |
84 | #endif
85 |
--------------------------------------------------------------------------------