├── .gitignore ├── AUTHORS ├── COPYING ├── ChangeLog ├── Makefile.am ├── NEWS ├── README ├── autogen.sh ├── config └── .gitignore ├── configure.ac ├── m4 └── .gitignore ├── src ├── Makefile.am ├── ac_heap.c ├── ac_list.c ├── aho_corasick.c └── libesm │ ├── ac_heap.h │ ├── ac_list.h │ └── aho_corasick.h └── test ├── Makefile.am └── run_tests.c /.gitignore: -------------------------------------------------------------------------------- 1 | .libs 2 | *.lo 3 | *.la 4 | *.o 5 | aclocal.m4 6 | autom4te.cache 7 | config.guess 8 | config.log 9 | config.status 10 | config.sub 11 | configure 12 | depcomp 13 | INSTALL 14 | install-sh 15 | libesm.a 16 | Makefile 17 | Makefile.in 18 | missing 19 | .deps 20 | libtool 21 | libesm-1.0.tar.gz 22 | build 23 | test/run_tests 24 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Will Harris 2 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 2.1, February 1999 3 | 4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc. 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | [This is the first released version of the Lesser GPL. It also counts 10 | as the successor of the GNU Library Public License, version 2, hence 11 | the version number 2.1.] 12 | 13 | Preamble 14 | 15 | The licenses for most software are designed to take away your 16 | freedom to share and change it. By contrast, the GNU General Public 17 | Licenses are intended to guarantee your freedom to share and change 18 | free software--to make sure the software is free for all its users. 
19 | 20 | This license, the Lesser General Public License, applies to some 21 | specially designated software packages--typically libraries--of the 22 | Free Software Foundation and other authors who decide to use it. You 23 | can use it too, but we suggest you first think carefully about whether 24 | this license or the ordinary General Public License is the better 25 | strategy to use in any particular case, based on the explanations below. 26 | 27 | When we speak of free software, we are referring to freedom of use, 28 | not price. Our General Public Licenses are designed to make sure that 29 | you have the freedom to distribute copies of free software (and charge 30 | for this service if you wish); that you receive source code or can get 31 | it if you want it; that you can change the software and use pieces of 32 | it in new free programs; and that you are informed that you can do 33 | these things. 34 | 35 | To protect your rights, we need to make restrictions that forbid 36 | distributors to deny you these rights or to ask you to surrender these 37 | rights. These restrictions translate to certain responsibilities for 38 | you if you distribute copies of the library or if you modify it. 39 | 40 | For example, if you distribute copies of the library, whether gratis 41 | or for a fee, you must give the recipients all the rights that we gave 42 | you. You must make sure that they, too, receive or can get the source 43 | code. If you link other code with the library, you must provide 44 | complete object files to the recipients, so that they can relink them 45 | with the library after making changes to the library and recompiling 46 | it. And you must show them these terms so they know their rights. 47 | 48 | We protect your rights with a two-step method: (1) we copyright the 49 | library, and (2) we offer you this license, which gives you legal 50 | permission to copy, distribute and/or modify the library. 
51 | 52 | To protect each distributor, we want to make it very clear that 53 | there is no warranty for the free library. Also, if the library is 54 | modified by someone else and passed on, the recipients should know 55 | that what they have is not the original version, so that the original 56 | author's reputation will not be affected by problems that might be 57 | introduced by others. 58 | 59 | Finally, software patents pose a constant threat to the existence of 60 | any free program. We wish to make sure that a company cannot 61 | effectively restrict the users of a free program by obtaining a 62 | restrictive license from a patent holder. Therefore, we insist that 63 | any patent license obtained for a version of the library must be 64 | consistent with the full freedom of use specified in this license. 65 | 66 | Most GNU software, including some libraries, is covered by the 67 | ordinary GNU General Public License. This license, the GNU Lesser 68 | General Public License, applies to certain designated libraries, and 69 | is quite different from the ordinary General Public License. We use 70 | this license for certain libraries in order to permit linking those 71 | libraries into non-free programs. 72 | 73 | When a program is linked with a library, whether statically or using 74 | a shared library, the combination of the two is legally speaking a 75 | combined work, a derivative of the original library. The ordinary 76 | General Public License therefore permits such linking only if the 77 | entire combination fits its criteria of freedom. The Lesser General 78 | Public License permits more lax criteria for linking other code with 79 | the library. 80 | 81 | We call this license the "Lesser" General Public License because it 82 | does Less to protect the user's freedom than the ordinary General 83 | Public License. It also provides other free software developers Less 84 | of an advantage over competing non-free programs. 
These disadvantages 85 | are the reason we use the ordinary General Public License for many 86 | libraries. However, the Lesser license provides advantages in certain 87 | special circumstances. 88 | 89 | For example, on rare occasions, there may be a special need to 90 | encourage the widest possible use of a certain library, so that it becomes 91 | a de-facto standard. To achieve this, non-free programs must be 92 | allowed to use the library. A more frequent case is that a free 93 | library does the same job as widely used non-free libraries. In this 94 | case, there is little to gain by limiting the free library to free 95 | software only, so we use the Lesser General Public License. 96 | 97 | In other cases, permission to use a particular library in non-free 98 | programs enables a greater number of people to use a large body of 99 | free software. For example, permission to use the GNU C Library in 100 | non-free programs enables many more people to use the whole GNU 101 | operating system, as well as its variant, the GNU/Linux operating 102 | system. 103 | 104 | Although the Lesser General Public License is Less protective of the 105 | users' freedom, it does ensure that the user of a program that is 106 | linked with the Library has the freedom and the wherewithal to run 107 | that program using a modified version of the Library. 108 | 109 | The precise terms and conditions for copying, distribution and 110 | modification follow. Pay close attention to the difference between a 111 | "work based on the library" and a "work that uses the library". The 112 | former contains code derived from the library, whereas the latter must 113 | be combined with the library in order to run. 114 | 115 | GNU LESSER GENERAL PUBLIC LICENSE 116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 117 | 118 | 0. 
This License Agreement applies to any software library or other 119 | program which contains a notice placed by the copyright holder or 120 | other authorized party saying it may be distributed under the terms of 121 | this Lesser General Public License (also called "this License"). 122 | Each licensee is addressed as "you". 123 | 124 | A "library" means a collection of software functions and/or data 125 | prepared so as to be conveniently linked with application programs 126 | (which use some of those functions and data) to form executables. 127 | 128 | The "Library", below, refers to any such software library or work 129 | which has been distributed under these terms. A "work based on the 130 | Library" means either the Library or any derivative work under 131 | copyright law: that is to say, a work containing the Library or a 132 | portion of it, either verbatim or with modifications and/or translated 133 | straightforwardly into another language. (Hereinafter, translation is 134 | included without limitation in the term "modification".) 135 | 136 | "Source code" for a work means the preferred form of the work for 137 | making modifications to it. For a library, complete source code means 138 | all the source code for all modules it contains, plus any associated 139 | interface definition files, plus the scripts used to control compilation 140 | and installation of the library. 141 | 142 | Activities other than copying, distribution and modification are not 143 | covered by this License; they are outside its scope. The act of 144 | running a program using the Library is not restricted, and output from 145 | such a program is covered only if its contents constitute a work based 146 | on the Library (independent of the use of the Library in a tool for 147 | writing it). Whether that is true depends on what the Library does 148 | and what the program that uses the Library does. 149 | 150 | 1. 
You may copy and distribute verbatim copies of the Library's 151 | complete source code as you receive it, in any medium, provided that 152 | you conspicuously and appropriately publish on each copy an 153 | appropriate copyright notice and disclaimer of warranty; keep intact 154 | all the notices that refer to this License and to the absence of any 155 | warranty; and distribute a copy of this License along with the 156 | Library. 157 | 158 | You may charge a fee for the physical act of transferring a copy, 159 | and you may at your option offer warranty protection in exchange for a 160 | fee. 161 | 162 | 2. You may modify your copy or copies of the Library or any portion 163 | of it, thus forming a work based on the Library, and copy and 164 | distribute such modifications or work under the terms of Section 1 165 | above, provided that you also meet all of these conditions: 166 | 167 | a) The modified work must itself be a software library. 168 | 169 | b) You must cause the files modified to carry prominent notices 170 | stating that you changed the files and the date of any change. 171 | 172 | c) You must cause the whole of the work to be licensed at no 173 | charge to all third parties under the terms of this License. 174 | 175 | d) If a facility in the modified Library refers to a function or a 176 | table of data to be supplied by an application program that uses 177 | the facility, other than as an argument passed when the facility 178 | is invoked, then you must make a good faith effort to ensure that, 179 | in the event an application does not supply such function or 180 | table, the facility still operates, and performs whatever part of 181 | its purpose remains meaningful. 182 | 183 | (For example, a function in a library to compute square roots has 184 | a purpose that is entirely well-defined independent of the 185 | application. 
Therefore, Subsection 2d requires that any 186 | application-supplied function or table used by this function must 187 | be optional: if the application does not supply it, the square 188 | root function must still compute square roots.) 189 | 190 | These requirements apply to the modified work as a whole. If 191 | identifiable sections of that work are not derived from the Library, 192 | and can be reasonably considered independent and separate works in 193 | themselves, then this License, and its terms, do not apply to those 194 | sections when you distribute them as separate works. But when you 195 | distribute the same sections as part of a whole which is a work based 196 | on the Library, the distribution of the whole must be on the terms of 197 | this License, whose permissions for other licensees extend to the 198 | entire whole, and thus to each and every part regardless of who wrote 199 | it. 200 | 201 | Thus, it is not the intent of this section to claim rights or contest 202 | your rights to work written entirely by you; rather, the intent is to 203 | exercise the right to control the distribution of derivative or 204 | collective works based on the Library. 205 | 206 | In addition, mere aggregation of another work not based on the Library 207 | with the Library (or with a work based on the Library) on a volume of 208 | a storage or distribution medium does not bring the other work under 209 | the scope of this License. 210 | 211 | 3. You may opt to apply the terms of the ordinary GNU General Public 212 | License instead of this License to a given copy of the Library. To do 213 | this, you must alter all the notices that refer to this License, so 214 | that they refer to the ordinary GNU General Public License, version 2, 215 | instead of to this License. (If a newer version than version 2 of the 216 | ordinary GNU General Public License has appeared, then you can specify 217 | that version instead if you wish.) 
Do not make any other change in 218 | these notices. 219 | 220 | Once this change is made in a given copy, it is irreversible for 221 | that copy, so the ordinary GNU General Public License applies to all 222 | subsequent copies and derivative works made from that copy. 223 | 224 | This option is useful when you wish to copy part of the code of 225 | the Library into a program that is not a library. 226 | 227 | 4. You may copy and distribute the Library (or a portion or 228 | derivative of it, under Section 2) in object code or executable form 229 | under the terms of Sections 1 and 2 above provided that you accompany 230 | it with the complete corresponding machine-readable source code, which 231 | must be distributed under the terms of Sections 1 and 2 above on a 232 | medium customarily used for software interchange. 233 | 234 | If distribution of object code is made by offering access to copy 235 | from a designated place, then offering equivalent access to copy the 236 | source code from the same place satisfies the requirement to 237 | distribute the source code, even though third parties are not 238 | compelled to copy the source along with the object code. 239 | 240 | 5. A program that contains no derivative of any portion of the 241 | Library, but is designed to work with the Library by being compiled or 242 | linked with it, is called a "work that uses the Library". Such a 243 | work, in isolation, is not a derivative work of the Library, and 244 | therefore falls outside the scope of this License. 245 | 246 | However, linking a "work that uses the Library" with the Library 247 | creates an executable that is a derivative of the Library (because it 248 | contains portions of the Library), rather than a "work that uses the 249 | library". The executable is therefore covered by this License. 250 | Section 6 states terms for distribution of such executables. 
251 | 252 | When a "work that uses the Library" uses material from a header file 253 | that is part of the Library, the object code for the work may be a 254 | derivative work of the Library even though the source code is not. 255 | Whether this is true is especially significant if the work can be 256 | linked without the Library, or if the work is itself a library. The 257 | threshold for this to be true is not precisely defined by law. 258 | 259 | If such an object file uses only numerical parameters, data 260 | structure layouts and accessors, and small macros and small inline 261 | functions (ten lines or less in length), then the use of the object 262 | file is unrestricted, regardless of whether it is legally a derivative 263 | work. (Executables containing this object code plus portions of the 264 | Library will still fall under Section 6.) 265 | 266 | Otherwise, if the work is a derivative of the Library, you may 267 | distribute the object code for the work under the terms of Section 6. 268 | Any executables containing that work also fall under Section 6, 269 | whether or not they are linked directly with the Library itself. 270 | 271 | 6. As an exception to the Sections above, you may also combine or 272 | link a "work that uses the Library" with the Library to produce a 273 | work containing portions of the Library, and distribute that work 274 | under terms of your choice, provided that the terms permit 275 | modification of the work for the customer's own use and reverse 276 | engineering for debugging such modifications. 277 | 278 | You must give prominent notice with each copy of the work that the 279 | Library is used in it and that the Library and its use are covered by 280 | this License. You must supply a copy of this License. If the work 281 | during execution displays copyright notices, you must include the 282 | copyright notice for the Library among them, as well as a reference 283 | directing the user to the copy of this License. 
Also, you must do one 284 | of these things: 285 | 286 | a) Accompany the work with the complete corresponding 287 | machine-readable source code for the Library including whatever 288 | changes were used in the work (which must be distributed under 289 | Sections 1 and 2 above); and, if the work is an executable linked 290 | with the Library, with the complete machine-readable "work that 291 | uses the Library", as object code and/or source code, so that the 292 | user can modify the Library and then relink to produce a modified 293 | executable containing the modified Library. (It is understood 294 | that the user who changes the contents of definitions files in the 295 | Library will not necessarily be able to recompile the application 296 | to use the modified definitions.) 297 | 298 | b) Use a suitable shared library mechanism for linking with the 299 | Library. A suitable mechanism is one that (1) uses at run time a 300 | copy of the library already present on the user's computer system, 301 | rather than copying library functions into the executable, and (2) 302 | will operate properly with a modified version of the library, if 303 | the user installs one, as long as the modified version is 304 | interface-compatible with the version that the work was made with. 305 | 306 | c) Accompany the work with a written offer, valid for at 307 | least three years, to give the same user the materials 308 | specified in Subsection 6a, above, for a charge no more 309 | than the cost of performing this distribution. 310 | 311 | d) If distribution of the work is made by offering access to copy 312 | from a designated place, offer equivalent access to copy the above 313 | specified materials from the same place. 314 | 315 | e) Verify that the user has already received a copy of these 316 | materials or that you have already sent this user a copy. 
317 | 318 | For an executable, the required form of the "work that uses the 319 | Library" must include any data and utility programs needed for 320 | reproducing the executable from it. However, as a special exception, 321 | the materials to be distributed need not include anything that is 322 | normally distributed (in either source or binary form) with the major 323 | components (compiler, kernel, and so on) of the operating system on 324 | which the executable runs, unless that component itself accompanies 325 | the executable. 326 | 327 | It may happen that this requirement contradicts the license 328 | restrictions of other proprietary libraries that do not normally 329 | accompany the operating system. Such a contradiction means you cannot 330 | use both them and the Library together in an executable that you 331 | distribute. 332 | 333 | 7. You may place library facilities that are a work based on the 334 | Library side-by-side in a single library together with other library 335 | facilities not covered by this License, and distribute such a combined 336 | library, provided that the separate distribution of the work based on 337 | the Library and of the other library facilities is otherwise 338 | permitted, and provided that you do these two things: 339 | 340 | a) Accompany the combined library with a copy of the same work 341 | based on the Library, uncombined with any other library 342 | facilities. This must be distributed under the terms of the 343 | Sections above. 344 | 345 | b) Give prominent notice with the combined library of the fact 346 | that part of it is a work based on the Library, and explaining 347 | where to find the accompanying uncombined form of the same work. 348 | 349 | 8. You may not copy, modify, sublicense, link with, or distribute 350 | the Library except as expressly provided under this License. 
Any 351 | attempt otherwise to copy, modify, sublicense, link with, or 352 | distribute the Library is void, and will automatically terminate your 353 | rights under this License. However, parties who have received copies, 354 | or rights, from you under this License will not have their licenses 355 | terminated so long as such parties remain in full compliance. 356 | 357 | 9. You are not required to accept this License, since you have not 358 | signed it. However, nothing else grants you permission to modify or 359 | distribute the Library or its derivative works. These actions are 360 | prohibited by law if you do not accept this License. Therefore, by 361 | modifying or distributing the Library (or any work based on the 362 | Library), you indicate your acceptance of this License to do so, and 363 | all its terms and conditions for copying, distributing or modifying 364 | the Library or works based on it. 365 | 366 | 10. Each time you redistribute the Library (or any work based on the 367 | Library), the recipient automatically receives a license from the 368 | original licensor to copy, distribute, link with or modify the Library 369 | subject to these terms and conditions. You may not impose any further 370 | restrictions on the recipients' exercise of the rights granted herein. 371 | You are not responsible for enforcing compliance by third parties with 372 | this License. 373 | 374 | 11. If, as a consequence of a court judgment or allegation of patent 375 | infringement or for any other reason (not limited to patent issues), 376 | conditions are imposed on you (whether by court order, agreement or 377 | otherwise) that contradict the conditions of this License, they do not 378 | excuse you from the conditions of this License. If you cannot 379 | distribute so as to satisfy simultaneously your obligations under this 380 | License and any other pertinent obligations, then as a consequence you 381 | may not distribute the Library at all. 
For example, if a patent 382 | license would not permit royalty-free redistribution of the Library by 383 | all those who receive copies directly or indirectly through you, then 384 | the only way you could satisfy both it and this License would be to 385 | refrain entirely from distribution of the Library. 386 | 387 | If any portion of this section is held invalid or unenforceable under any 388 | particular circumstance, the balance of the section is intended to apply, 389 | and the section as a whole is intended to apply in other circumstances. 390 | 391 | It is not the purpose of this section to induce you to infringe any 392 | patents or other property right claims or to contest validity of any 393 | such claims; this section has the sole purpose of protecting the 394 | integrity of the free software distribution system which is 395 | implemented by public license practices. Many people have made 396 | generous contributions to the wide range of software distributed 397 | through that system in reliance on consistent application of that 398 | system; it is up to the author/donor to decide if he or she is willing 399 | to distribute software through any other system and a licensee cannot 400 | impose that choice. 401 | 402 | This section is intended to make thoroughly clear what is believed to 403 | be a consequence of the rest of this License. 404 | 405 | 12. If the distribution and/or use of the Library is restricted in 406 | certain countries either by patents or by copyrighted interfaces, the 407 | original copyright holder who places the Library under this License may add 408 | an explicit geographical distribution limitation excluding those countries, 409 | so that distribution is permitted only in or among countries not thus 410 | excluded. In such case, this License incorporates the limitation as if 411 | written in the body of this License. 412 | 413 | 13. 
The Free Software Foundation may publish revised and/or new 414 | versions of the Lesser General Public License from time to time. 415 | Such new versions will be similar in spirit to the present version, 416 | but may differ in detail to address new problems or concerns. 417 | 418 | Each version is given a distinguishing version number. If the Library 419 | specifies a version number of this License which applies to it and 420 | "any later version", you have the option of following the terms and 421 | conditions either of that version or of any later version published by 422 | the Free Software Foundation. If the Library does not specify a 423 | license version number, you may choose any version ever published by 424 | the Free Software Foundation. 425 | 426 | 14. If you wish to incorporate parts of the Library into other free 427 | programs whose distribution conditions are incompatible with these, 428 | write to the author to ask for permission. For software which is 429 | copyrighted by the Free Software Foundation, write to the Free 430 | Software Foundation; we sometimes make exceptions for this. Our 431 | decision will be guided by the two goals of preserving the free status 432 | of all derivatives of our free software and of promoting the sharing 433 | and reuse of software generally. 434 | 435 | NO WARRANTY 436 | 437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 
446 | 447 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 456 | DAMAGES. 457 | 458 | END OF TERMS AND CONDITIONS 459 | 460 | How to Apply These Terms to Your New Libraries 461 | 462 | If you develop a new library, and you want it to be of the greatest 463 | possible use to the public, we recommend making it free software that 464 | everyone can redistribute and change. You can do so by permitting 465 | redistribution under these terms (or, alternatively, under the terms of the 466 | ordinary General Public License). 467 | 468 | To apply these terms, attach the following notices to the library. It is 469 | safest to attach them to the start of each source file to most effectively 470 | convey the exclusion of warranty; and each file should have at least the 471 | "copyright" line and a pointer to where the full notice is found. 472 | 473 | 474 | Copyright (C) 475 | 476 | This library is free software; you can redistribute it and/or 477 | modify it under the terms of the GNU Lesser General Public 478 | License as published by the Free Software Foundation; either 479 | version 2.1 of the License, or (at your option) any later version. 480 | 481 | This library is distributed in the hope that it will be useful, 482 | but WITHOUT ANY WARRANTY; without even the implied warranty of 483 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 484 | Lesser General Public License for more details. 485 | 486 | You should have received a copy of the GNU Lesser General Public 487 | License along with this library; if not, write to the Free Software 488 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 489 | 490 | Also add information on how to contact you by electronic and paper mail. 491 | 492 | You should also get your employer (if you work as a programmer) or your 493 | school, if any, to sign a "copyright disclaimer" for the library, if 494 | necessary. Here is a sample; alter the names: 495 | 496 | Yoyodyne, Inc., hereby disclaims all copyright interest in the 497 | library `Frob' (a library for tweaking knobs) written by James Random Hacker. 498 | 499 | , 1 April 1990 500 | Ty Coon, President of Vice 501 | 502 | That's all there is to it! 503 | 504 | 505 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wharris/libesm/7007854f2df378118adcb5b1b9b768b7acb74f86/ChangeLog -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | ACLOCAL_AMFLAGS = -I m4 2 | SUBDIRS = src test 3 | EXTRA_DIST = autogen.sh 4 | ALL = src 5 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wharris/libesm/7007854f2df378118adcb5b1b9b768b7acb74f86/NEWS -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wharris/libesm/7007854f2df378118adcb5b1b9b768b7acb74f86/README 
-------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | autoreconf --force --install -I config -I m4 3 | -------------------------------------------------------------------------------- /config/.gitignore: -------------------------------------------------------------------------------- 1 | config.guess 2 | config.sub 3 | depcomp 4 | install-sh 5 | ltmain.sh 6 | missing 7 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_INIT([libesm], [1.0], [will@greatlibrary.net]) 2 | AC_CONFIG_AUX_DIR([config]) 3 | AC_CONFIG_SRCDIR([src/aho_corasick.c]) 4 | AM_INIT_AUTOMAKE([libesm], [1.0]) 5 | 6 | AC_PROG_CC 7 | AC_PROG_LIBTOOL 8 | 9 | AC_OUTPUT([Makefile src/Makefile test/Makefile]) 10 | -------------------------------------------------------------------------------- /m4/.gitignore: -------------------------------------------------------------------------------- 1 | *.m4 2 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | lib_LTLIBRARIES = libesm.la 2 | libesm_la_SOURCES = ac_heap.c \ 3 | ac_list.c \ 4 | aho_corasick.c \ 5 | libesm/ac_list.h \ 6 | libesm/aho_corasick.h 7 | libesm_la_includedir=$(includedir)/libesm 8 | libesm_la_include_HEADERS = libesm/aho_corasick.h 9 | -------------------------------------------------------------------------------- /src/ac_heap.c: -------------------------------------------------------------------------------- 1 | /* ac_heap.c - functions used to debug memory allocation 2 | Copyright (C) 2007 Tideway Systems Limited. 
#include <stdio.h>
#include <stdlib.h>

/** Running total of the bytes successfully allocated through ac_malloc. */
static size_t total = 0;

/**
 * Debugging replacement for malloc. Allocates size bytes, logs the resulting
 * pointer together with the calling file and line, then logs the running
 * total of bytes allocated. A failed allocation is logged as "malloc NULL"
 * and does not count towards the total.
 *
 * Returns the pointer from malloc, which is NULL if the allocation failed.
 */
void*
ac_malloc(size_t size, char* file, int line) {
    void* result = malloc(size);

    if (result) {
        // Only memory that was really handed out is added to the total.
        total += size;
        printf("malloc %p at %s:%d\n", result, file, line);
    } else {
        printf("malloc NULL at %s:%d\n", file, line);
    }

    // %zu prints the whole size_t; the old cast to int truncated big totals.
    printf("t %zu\n", total);
    return result;
}

/**
 * Debugging replacement for free. Logs the pointer being released together
 * with the calling file and line, then frees it. The running total kept by
 * ac_malloc is not reduced because the size of the block is not known here.
 */
void
ac_free(void* p, char* file, int line) {
    printf("free %p at %s:%d\n", p, file, line);
    free(p);
}
8 | 9 | This library is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | Lesser General Public License for more details. 13 | 14 | You should have received a copy of the GNU Lesser General Public 15 | License along with this library; if not, write to the Free Software 16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17 | */ 18 | 19 | #include "libesm/ac_list.h" 20 | #include "libesm/ac_heap.h" 21 | 22 | /** 23 | * Make a new, empty linked list. Returns a pointer to a list or NULL if an 24 | * error was encountered allocating heap space for the structure. 25 | */ 26 | ac_list* 27 | ac_list_new(void) { 28 | ac_list* self = NULL; 29 | 30 | if ( (self = MALLOC(sizeof(ac_list)))) { 31 | self->first = NULL; 32 | self->last = NULL; 33 | } 34 | 35 | return self; 36 | } 37 | 38 | /** 39 | * Free a list item. Pass a pointer to this function to ac_list_free when you 40 | * want only the heap space for the list items themselves will be freed. 41 | * Always returns AC_SUCCESS. 42 | */ 43 | ac_error_code 44 | ac_list_free_simple_item(void* item, void* data) { 45 | FREE(item); 46 | return AC_SUCCESS; 47 | } 48 | 49 | /** 50 | * NOP list item free function. Pass a pointer to this function to 51 | * ac_list_free when you don't want the heap space for the list items to be 52 | * freed. 53 | */ 54 | ac_error_code 55 | ac_list_free_keep_item(void* item, void* data) { 56 | return AC_SUCCESS; 57 | } 58 | 59 | /** 60 | * Free the ac_list at self. Heap space allocated for the list will be freed. 61 | * A function pointer should be provided in free_item for freeing resources 62 | * allocated for the items themselves. The function at free_item is called 63 | * once for each item added to the list. It is passed a pointer to the item 64 | * and workspace data pointer (given to ac_list_free in free_data). 
65 | * passed to ac_list_free in free_data. The free_item function should return 66 | * AC_SUCCESS if it succeeds or AC_FAILURE if it fails. 67 | * 68 | * Returns AC_SUCCESS if all the free_item calls succeed of AC_FAILURE if self 69 | * is NULL, or if any of the free_item calls fail. 70 | */ 71 | ac_error_code 72 | ac_list_free(ac_list* self, 73 | ac_free_function free_item, 74 | void* free_data) { 75 | 76 | ac_list_item* list_item = NULL; 77 | ac_list_item* tmp = NULL; 78 | ac_error_code result = AC_SUCCESS; 79 | 80 | if ( ! self) { 81 | return AC_FAILURE; 82 | } 83 | 84 | list_item = self->first; 85 | 86 | while (list_item) { 87 | tmp = list_item->next; 88 | 89 | if (free_item(list_item->item, free_data) != AC_SUCCESS) { 90 | result = AC_FAILURE; 91 | } 92 | 93 | FREE(list_item); 94 | list_item = tmp; 95 | } 96 | 97 | FREE(self); 98 | 99 | return result; 100 | } 101 | 102 | /** 103 | * Add an item to a list. The item at item is appended to the list at self. 104 | * Returns AC_SUCCESS if successful or AC_FAILURE if an error is encountered 105 | * allocating heap space for the internal list item structure. 106 | */ 107 | ac_error_code 108 | ac_list_add(ac_list* self, void* item) { 109 | ac_list_item* new_list_item; 110 | 111 | if ( ! (new_list_item = MALLOC(sizeof(ac_list_item)))) { 112 | return AC_FAILURE; 113 | } 114 | 115 | new_list_item->item = item; 116 | new_list_item->next = NULL; 117 | 118 | if ( ! self->first) { 119 | self->first = new_list_item; 120 | } 121 | 122 | if (self->last) { 123 | self->last->next = new_list_item; 124 | } 125 | 126 | self->last = new_list_item; 127 | 128 | return AC_SUCCESS; 129 | } 130 | 131 | -------------------------------------------------------------------------------- /src/aho_corasick.c: -------------------------------------------------------------------------------- 1 | /* aho_corasick.c - Aho Corasick implementations 2 | Copyright (C) 2007 Tideway Systems Limited. 
3 | 4 | This library is free software; you can redistribute it and/or 5 | modify it under the terms of the GNU Lesser General Public 6 | License as published by the Free Software Foundation; either 7 | version 2.1 of the License, or (at your option) any later version. 8 | 9 | This library is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | Lesser General Public License for more details. 13 | 14 | You should have received a copy of the GNU Lesser General Public 15 | License along with this library; if not, write to the Free Software 16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17 | */ 18 | 19 | /* This file contains implementations of some of the algorithms for efficient 20 | * string matching described in the paper, 'Aho, A.V, and Corasick, M. J. 21 | * Efficient String Matching: An Aid to Bibliographic Search. Comm. ACM 18:6 22 | * (June 1975), 333-340', hereafter referred to as 'the paper.' 23 | * 24 | * The paper describes algorithms (Algorithm 2 and Algorithm 3) for 25 | * constructing the functions of a finite state machine from a set of keywords 26 | * and an algorithm (Algorithm 1) for running that machine against an input 27 | * string to output the occurrences of the keywords in the input string. 28 | * 29 | * The paper also describes an algorithm (Algorithm 4) for eliminating the 30 | * failure function to create a deterministic finite automaton that makes 31 | * exactly one state transition per input symbol, but I have chosen not to 32 | * implement that algorithm. 33 | * 34 | * I have moved the code that implements the last part of Algorithm 2 (the 35 | * part that completes the goto function for the root state) into the function 36 | * that implements Algorithm 3.
I have also made the loop over the set of 37 | * keywords the responsibilty of the user, leaving only Algorithm 2's 'enter' 38 | * procedure. These changes allow for a simple API where the user can create 39 | * an index with ac_index_new, add keywords with associated object to the 40 | * index with ac_index_enter (Algorithm 2's 'enter' procedure), then fix the 41 | * index with ac_index_fix (Algorithm 3 prefixed with the end of Algorithm 2) 42 | * before querying it with ac_index_query (Algorithm 1) and freeing it with 43 | * ac_index_free (not covered in the paper). 44 | */ 45 | 46 | #include "libesm/aho_corasick.h" 47 | #include "libesm/ac_heap.h" 48 | #include "libesm/ac_list.h" 49 | #include 50 | 51 | /** 52 | * Structure for a state in an Aho-Corasick pattern matching machine. The 53 | * structure holds the data for the the goto, failure and output functions for 54 | * a single state. In the paper, states are identified by a sequential integer 55 | * but this implementation uses the pointer to an ac_state structure. 56 | * 57 | * The data for the output function is split into two lists. One (outputs) is 58 | * built when keywords are entered into the index. The other (extra_outputs) 59 | * is built when the index is fixed. 60 | */ 61 | typedef struct ac_state { 62 | /** 63 | * List of ac_goto structures used to evaluate the goto function for this 64 | * state. 65 | */ 66 | ac_list* gotos; 67 | 68 | /** 69 | * List of ac_output structures that are part of the result of the output 70 | * function for this state. Items are added to these lists when keywords 71 | * are entered. 72 | */ 73 | ac_list* outputs; 74 | 75 | /** 76 | * List of ac_output structures that are part of the result of the output 77 | * function for this state. Items are added to this list when the index is 78 | * fixed. 79 | */ 80 | ac_list* extra_outputs; 81 | 82 | /** 83 | * Result of failure function for this state. 
84 | */ 85 | struct ac_state* failure; 86 | } ac_state; 87 | 88 | // -------------------------------------------------------------------------- 89 | // Goto list 90 | 91 | /** 92 | * Structure mapping a symbol to a state, forming part of the goto function 93 | * data for a state. 94 | */ 95 | typedef struct ac_goto { 96 | ac_symbol symbol; 97 | ac_state* state; 98 | } ac_goto; 99 | 100 | /** 101 | * Shared workspace used by ac_goto_list_free_item. 102 | */ 103 | typedef struct ac_goto_list_free_data { 104 | /** 105 | * List of states to free. When the index is freed, as we iterate over the 106 | * goto lists to free them, we add to a list of of state objects that will 107 | * need to be freed later. 108 | */ 109 | ac_list* states; 110 | 111 | /** 112 | * State that should not be added to the list above. When a goto item is 113 | * encountered that points to this state it is not added to the list. This 114 | * lets us cover the special case goto function of the root state, which 115 | * can goto itself. 116 | */ 117 | ac_state* state; 118 | } ac_goto_list_free_data; 119 | 120 | /** 121 | * List free function for goto list items. This will be called once per 122 | * ac_goto in the list. The data should be an ac_goto_list_free_data as 123 | * explained above. The states in the list are added to the list in data 124 | * except where the state matches the state in data. 125 | * 126 | * Returns AC_SUCCESS if successful or AC_FAILURE if a failure is encountered 127 | * when adding a state to the list. 128 | */ 129 | ac_error_code 130 | ac_goto_list_free_item(void* item, void* data) { 131 | ac_goto* goto_item = (ac_goto*) item; 132 | ac_goto_list_free_data* free_data = (ac_goto_list_free_data*) data; 133 | 134 | /* Add the state from the goto item to the list unless it is excluded. 
*/ 135 | if (goto_item->state != free_data->state && 136 | ac_list_add(free_data->states, goto_item->state) != AC_SUCCESS) { 137 | return AC_FAILURE; 138 | } 139 | 140 | FREE(item); 141 | return AC_SUCCESS; 142 | } 143 | 144 | /** 145 | * Free a goto list, appending the states that do not match the state argument 146 | * to the list at states. Returns AC_SUCCESS if successful or AC_FAILURE 147 | * otherwise. 148 | */ 149 | ac_error_code 150 | ac_goto_list_free(ac_list* self, 151 | ac_list* states, 152 | ac_state* state) { 153 | 154 | ac_goto_list_free_data free_data = { 155 | states, 156 | state 157 | }; 158 | 159 | return ac_list_free(self, ac_goto_list_free_item, &free_data); 160 | } 161 | 162 | /** 163 | * Search a goto list for the state associated with the given symbol. Return 164 | * a pointer to the state if it is found or NULL if not. 165 | */ 166 | ac_state* 167 | ac_goto_list_get(ac_list* self, ac_symbol symbol) { 168 | ac_list_item* list_item = self->first; 169 | ac_goto* item = NULL; 170 | 171 | while (list_item) { 172 | item = (ac_goto*) list_item->item; 173 | 174 | if (item->symbol == symbol) { 175 | return item->state; 176 | } 177 | list_item = list_item->next; 178 | } 179 | 180 | return NULL; 181 | } 182 | 183 | /** 184 | * Determine whether a goto list has an association for the given symbol. 185 | */ 186 | bool 187 | ac_goto_list_has(ac_list* self, ac_symbol symbol) { 188 | return ac_goto_list_get(self, symbol) != NULL; 189 | } 190 | 191 | /** 192 | * Associates the given symbol with the given state in a goto list. Returns 193 | * AC_SUCCESS if successful or AC_FAILURE if an error was encountered. 194 | */ 195 | ac_error_code 196 | ac_goto_list_add(ac_list* self, ac_symbol symbol, ac_state* state) { 197 | ac_goto* new_item; 198 | 199 | if ( ! 
(new_item = MALLOC(sizeof(ac_goto)))) { 200 | return AC_FAILURE; 201 | } 202 | new_item->symbol = symbol; 203 | new_item->state = state; 204 | 205 | if (ac_list_add(self, new_item) != AC_SUCCESS) { 206 | FREE(new_item); 207 | return AC_FAILURE; 208 | } 209 | 210 | return AC_SUCCESS; 211 | } 212 | 213 | // -------------------------------------------------------------------------- 214 | // Output list 215 | 216 | /** 217 | * Structure holding the data for one item in the output function for a state. 218 | */ 219 | typedef struct ac_output { 220 | 221 | /** 222 | * The length of the keyword that matches at the state. We use this to 223 | * generate the index span of the query string that contains the keyword. 224 | */ 225 | ac_offset offset; 226 | 227 | /** An object assocated with the state. */ 228 | void* object; 229 | 230 | } ac_output; 231 | 232 | /** 233 | * List item free function for ac_output items. This funciton will be called 234 | * once for each item in each output list. The item argument points to the 235 | * ac_output being freed. The data argument points to a function for freeing 236 | * the associated object. 237 | * 238 | * Returns the result of the function for freeing the associated object. 239 | */ 240 | ac_error_code 241 | ac_output_list_free_item(void* item, void* data) { 242 | ac_output* output = (ac_output*) item; 243 | ac_error_code result; 244 | 245 | result = ((ac_free_function) data)(output->object, NULL); 246 | // TODO: let user pass data to free function 247 | FREE(output); 248 | return result; 249 | } 250 | 251 | /** 252 | * Free the output list, calling object_free on the associated object of 253 | * each output item. 254 | */ 255 | void 256 | ac_output_list_free(ac_list* self, ac_free_function object_free) { 257 | (void) ac_list_free(self, ac_output_list_free_item, object_free); 258 | } 259 | 260 | /** 261 | * Add an offset and an associated object to the output list. 
Returns 262 | * AC_SUCCESS if successful, or AC_FAILURE if the there was an error was 263 | * encountered. 264 | */ 265 | ac_error_code 266 | ac_output_list_add(ac_list* self, ac_offset offset, void* object) { 267 | ac_output* new_item; 268 | 269 | if ( ! (new_item = MALLOC(sizeof(ac_output)))) { 270 | return AC_FAILURE; 271 | } 272 | 273 | new_item->offset = offset; 274 | new_item->object = object; 275 | 276 | if (ac_list_add(self, new_item) != AC_SUCCESS) { 277 | FREE(new_item); 278 | return AC_FAILURE; 279 | } 280 | 281 | return AC_SUCCESS; 282 | } 283 | 284 | /** 285 | * Add the contents of the output list in other to the output list in self. 286 | * Returns AC_SUCCESS if successful or AC_FAILURE if an error was encountered. 287 | */ 288 | ac_error_code 289 | ac_output_list_add_list(ac_list* self, ac_list* other) { 290 | ac_list_item* list_item = other->first; 291 | ac_output* item = NULL; 292 | 293 | while (list_item) { 294 | item = (ac_output*) list_item->item; 295 | 296 | if (ac_output_list_add(self, 297 | item->offset, 298 | item->object) != AC_SUCCESS) { 299 | return AC_FAILURE; 300 | } 301 | 302 | list_item = list_item->next; 303 | } 304 | 305 | return AC_SUCCESS; 306 | } 307 | 308 | // -------------------------------------------------------------------------- 309 | // Callbacks 310 | 311 | /** 312 | * Call the callback with an single result. Returns AC_SUCCESS if successful, 313 | * or AC_FAILURE if the there was an error was encountered. 314 | */ 315 | ac_error_code 316 | ac_cb_output(ac_result_callback result_cb, 317 | void* result_cb_data, 318 | ac_offset start, 319 | ac_offset end, 320 | void* object) { 321 | 322 | ac_result* new_item; 323 | 324 | if ( ! (new_item = MALLOC(sizeof(ac_result)))) { 325 | return AC_FAILURE; 326 | } 327 | 328 | new_item->start = start; 329 | new_item->end = end; 330 | new_item->object = object; 331 | 332 | // TODO: should we return the callback's error code? 
333 | if (result_cb(result_cb_data, new_item) != AC_SUCCESS) { 334 | FREE(new_item); 335 | return AC_FAILURE; 336 | } 337 | 338 | return AC_SUCCESS; 339 | } 340 | 341 | /** 342 | * Send each item in the output list to the callback. 343 | * Returns AC_SUCCESS if successful or AC_FAILURE if an error was encountered. 344 | */ 345 | ac_error_code 346 | ac_cb_outputs(ac_result_callback result_cb, 347 | void* result_cb_data, 348 | ac_list* outputs, 349 | ac_offset end) 350 | { 351 | ac_list_item* list_item = NULL; 352 | ac_output* item = NULL; 353 | 354 | list_item = outputs->first; 355 | 356 | while (list_item) { 357 | item = (ac_output*) list_item->item; 358 | if (ac_cb_output(result_cb, 359 | result_cb_data, 360 | end - item->offset + 1, 361 | end + 1, 362 | item->object) != AC_SUCCESS) { 363 | return AC_FAILURE; 364 | } 365 | 366 | list_item = list_item->next; 367 | } 368 | 369 | return AC_SUCCESS; 370 | } 371 | 372 | // -------------------------------------------------------------------------- 373 | // State object 374 | 375 | /** 376 | * Construct a new state object. The goto list and the both output lists are 377 | * initially empty, and the failure state is initially NULL. Returns a pointer 378 | * to the new state object if successful or NULL if an error was encountered 379 | * while trying to allocate heap space for the object or while constructing 380 | * one of its lists. 381 | */ 382 | ac_state* 383 | ac_state_new(void) { 384 | ac_state* self; 385 | 386 | if ( ! (self = MALLOC(sizeof(ac_state))) ) { 387 | return NULL; 388 | } 389 | 390 | if ( ! (self->gotos = ac_list_new())) { 391 | return NULL; 392 | } 393 | 394 | if ( ! (self->outputs = ac_list_new())) { 395 | return NULL; 396 | } 397 | 398 | if ( ! (self->extra_outputs = ac_list_new())) { 399 | return NULL; 400 | } 401 | 402 | self->failure = NULL; 403 | return self; 404 | } 405 | 406 | /** 407 | * Free the state object. 
Any states in the goto list will be added to the 408 | * state queue in the children argument. Relavent associated objects from the 409 | * output list will be freed by calling the object_free argument. Returns 410 | * AC_SUCCESS if sucessful, or AC_FAILURE if an error was encountered. 411 | */ 412 | ac_error_code 413 | ac_state_free(ac_state* self, 414 | ac_list* children, 415 | ac_free_function object_free) { 416 | 417 | if ( ! self) { 418 | return AC_FAILURE; 419 | } 420 | 421 | if (ac_goto_list_free(self->gotos, children, self) != AC_SUCCESS) { 422 | return AC_FAILURE; 423 | } 424 | 425 | // We need to call object_free for the associated objects in outputs, but 426 | // must not call object_free for the extra_outputs. 427 | ac_output_list_free(self->outputs, object_free); 428 | ac_output_list_free(self->extra_outputs, ac_list_free_keep_item); 429 | 430 | FREE(self); 431 | 432 | return AC_SUCCESS; 433 | } 434 | 435 | // -------------------------------------------------------------------------- 436 | // State queue 437 | 438 | /** 439 | * Free the state queue. 440 | */ 441 | void 442 | ac_state_queue_free(ac_list* self) { 443 | (void) ac_list_free(self, ac_list_free_keep_item, NULL); 444 | } 445 | 446 | /** 447 | * Pop the next item from the state queue. Returns a pointer to the popped 448 | * item if successful or NULL if the queue was empty. 449 | */ 450 | ac_state* 451 | ac_state_queue_get(ac_list* self) { 452 | ac_state* result = NULL; 453 | ac_list_item* next = NULL; 454 | 455 | if (self && self->first) { 456 | result = (ac_state*) self->first->item; 457 | next = self->first->next; 458 | FREE(self->first); 459 | self->first = next; 460 | } 461 | 462 | if (self->first == NULL) { 463 | self->last = NULL; 464 | } 465 | 466 | return result; 467 | } 468 | 469 | // -------------------------------------------------------------------------- 470 | // Index object 471 | 472 | ac_index* 473 | ac_index_new(void) { 474 | ac_index* self; 475 | 476 | if ( ! 
(self = MALLOC(sizeof(ac_index)))) { 477 | return NULL; 478 | } 479 | 480 | if ( ! (self->state_0 = ac_state_new())) { 481 | return NULL; 482 | } 483 | 484 | self->index_state = AC_INDEX_UNFIXED; 485 | 486 | return self; 487 | } 488 | 489 | ac_error_code 490 | ac_index_free(ac_index* self, ac_free_function object_free) { 491 | 492 | ac_list* queue = NULL; 493 | ac_state* state = NULL; 494 | ac_error_code result = AC_SUCCESS; 495 | 496 | if ( ! self) { 497 | return AC_FAILURE; 498 | } 499 | 500 | if ( ! (queue = ac_list_new())) { 501 | return AC_FAILURE; 502 | } 503 | 504 | // Free all the state nodes by following the goto function tree breadth 505 | // first, starting with state_0. 506 | state = self->state_0; 507 | 508 | while (state) { 509 | // Free the state and enqueue the states from the goto list. 510 | if (ac_state_free(state, queue, object_free) != AC_SUCCESS) { 511 | result = AC_FAILURE; 512 | } 513 | 514 | state = ac_state_queue_get(queue); 515 | } 516 | 517 | ac_state_queue_free(queue); // The queue should be empty. 518 | 519 | self->state_0 = NULL; 520 | FREE(self); 521 | 522 | return result; 523 | } 524 | 525 | ac_error_code 526 | ac_index_enter(ac_index* self, 527 | ac_symbol* keyword, 528 | ac_offset size, 529 | void* object) { 530 | 531 | // This is an implementation of the enter procedure of 'Algorithm 2. 532 | // Construction of the goto function.' from the paper. 533 | 534 | ac_state* state = self->state_0; 535 | ac_offset j = 0; 536 | ac_state* new_state = NULL; 537 | 538 | // You can't enter strings into a fixed index. 539 | if (self->index_state != AC_INDEX_UNFIXED) { 540 | return AC_FAILURE; 541 | } 542 | 543 | // Make sure that the goto tree has a path that spells out the keyword. 544 | // First skip the front of the the keyword if a matching path already 545 | // exists... 546 | while (j < size && 547 | (new_state = ac_goto_list_get(state->gotos, keyword[j]))) { 548 | 549 | state = new_state; 550 | ++j; 551 | } 552 | 553 | // ... 
then build the nodes for the rest of the keyword, if any. 554 | while (j < size) { 555 | if (! (new_state = ac_state_new())) { 556 | return AC_FAILURE; 557 | } 558 | 559 | if (ac_goto_list_add(state->gotos, 560 | keyword[j], 561 | new_state) != AC_SUCCESS) { 562 | return AC_FAILURE; 563 | } 564 | 565 | state = new_state; 566 | ++j; 567 | } 568 | 569 | // Now add an output for the keyword and associated object. 570 | if (ac_output_list_add(state->outputs, size, object) != AC_SUCCESS) { 571 | return AC_FAILURE; 572 | } 573 | 574 | return AC_SUCCESS; 575 | } 576 | 577 | /** 578 | * Fix the index, making it ready to be queried. This is an implementation of 579 | * the last part of Algorithm 2 from the paper combined with an implementation 580 | * of 'Algorithm 3. Construction of the failure function.' from the paper. 581 | * Returns AC_SUCCESS if the index was successfully fixed or AC_FAILURE if an 582 | * error was encountered or if the index was not 'unfixed'. 583 | */ 584 | ac_error_code 585 | ac_index_fix(ac_index* self) { 586 | // This is an implementation of the last part of Algorithm 2 from the paper 587 | // combined with an implementation of 'Algorithm 3. Construction of the 588 | // failure function.' from the paper. 589 | 590 | int symbol; 591 | ac_state* state = NULL; 592 | ac_state* r = NULL; 593 | ac_list* queue = NULL; 594 | ac_list_item* list_item = NULL; 595 | ac_goto* item = NULL; 596 | 597 | // You can't fix an index that is already fixed. 598 | if (self->index_state != AC_INDEX_UNFIXED) { 599 | return AC_FAILURE; 600 | } 601 | 602 | // Mark the index as being fixed. 603 | self->index_state = AC_INDEX_FIXED; 604 | 605 | // Make a temporary queue of states. 606 | if ( ! (queue = ac_list_new())) { 607 | return AC_FAILURE; 608 | } 609 | 610 | // Look at all the symbols. If state_0 has a goto for a symbol, add the 611 | // state to the queue and point the failure state back to state_0 - the 612 | // first part of Algorithm 3. 
Otherwise make state_0 goto itself for that 613 | // symbol - the last part of Algorithm 2. 614 | // TODO: Improve efficiency of state_0 to state_0 gotos. 615 | for (symbol = AC_MIN_SYMBOL; symbol <= AC_MAX_SYMBOL; symbol++) { 616 | if ((state = ac_goto_list_get(self->state_0->gotos, symbol))) { 617 | if (ac_list_add(queue, state) != AC_SUCCESS) { 618 | return AC_FAILURE; 619 | } 620 | state->failure = self->state_0; 621 | } 622 | else { 623 | if (ac_goto_list_add(self->state_0->gotos, 624 | symbol, 625 | self->state_0) != AC_SUCCESS) { 626 | return AC_FAILURE; 627 | } 628 | } 629 | } 630 | 631 | // Do the second part of Algorithm 3. Burn through the queue, enqueing 632 | // states from goto list in order to traverse the goto tree breadth-first. 633 | // ... 634 | while ((r = ac_state_queue_get(queue))) { 635 | list_item = r->gotos->first; 636 | 637 | while (list_item) { 638 | item = (ac_goto*) list_item->item; 639 | symbol = item->symbol; 640 | 641 | if (ac_list_add(queue, item->state) != AC_SUCCESS) { 642 | return AC_FAILURE; 643 | } 644 | 645 | // ... For each goto state, find the failure function by following 646 | // the failure function back until there is a goto defined. We 647 | // will always find a defined goto because state_0 has a goto 648 | // defined for every symbol (by now). ... 649 | state = r->failure; 650 | 651 | while ( ! ac_goto_list_has(state->gotos, symbol)) { 652 | state = state->failure; 653 | } 654 | 655 | item->state->failure = ac_goto_list_get(state->gotos, 656 | symbol); 657 | 658 | // ... Add the outputs for the failure state to the outputs. We 659 | // use the extra_outputs list because the outputs are already 660 | // referenced. 
661 | if (ac_output_list_add_list(item->state->extra_outputs, 662 | item->state->failure->outputs)) { 663 | return AC_FAILURE; 664 | }; 665 | 666 | if (ac_output_list_add_list(item->state->extra_outputs, 667 | item->state->failure->extra_outputs)) { 668 | return AC_FAILURE; 669 | }; 670 | 671 | list_item = list_item->next; 672 | } 673 | 674 | } 675 | 676 | // Free the temporary queue. 677 | ac_state_queue_free(queue); 678 | 679 | return AC_SUCCESS; 680 | } 681 | 682 | ac_error_code ac_index_query_cb(ac_index* self, 683 | ac_symbol* phrase, 684 | ac_offset size, 685 | ac_result_callback result_cb, 686 | void* result_cb_data) 687 | { 688 | // This function is an implementation of 'Algorithm 1. Pattern matching 689 | // machine.' from the paper. 690 | ac_state* state = self->state_0; 691 | ac_state* next = NULL; 692 | ac_offset j = 0; 693 | 694 | // You can't query an index that isn't fixed. 695 | if (self->index_state != AC_INDEX_FIXED) { 696 | return AC_FAILURE; 697 | } 698 | 699 | // You must not provide a NULL callback. 700 | if ( ! result_cb) { 701 | return AC_FAILURE; 702 | } 703 | 704 | // Run the pattern matching machine. Iterate over the symbols in the 705 | // phrase. ... 706 | for (; j < size; ++j) { 707 | 708 | // ... If there is no goto for the symbol. Follow the failure 709 | // functions until there is. We will always find our way to a state 710 | // with a goto defined for the symbol because, once the index is 711 | // fixed, state_0 has gotos defined for every symbol. ... 712 | while ( ! (next = ac_goto_list_get(state->gotos, phrase[j]))) { 713 | state = state->failure; 714 | } 715 | 716 | state = next; 717 | 718 | // ... Add the outputs for the state. If there is no match, the state 719 | // will be state_0 which always has no outputs. ... 
720 | if (ac_cb_outputs( 721 | result_cb, result_cb_data, state->outputs, j) != AC_SUCCESS) { 722 | return AC_FAILURE; 723 | }; 724 | 725 | if (ac_cb_outputs( 726 | result_cb, result_cb_data, state->extra_outputs, j) != AC_SUCCESS) { 727 | return AC_FAILURE; 728 | }; 729 | } 730 | 731 | return AC_SUCCESS; 732 | } 733 | 734 | // TODO: Add ac_index_unfix method to unfix the index. 735 | -------------------------------------------------------------------------------- /src/libesm/ac_heap.h: -------------------------------------------------------------------------------- 1 | /* ac_heap.h - macros used to debug memory allocation 2 | Copyright (C) 2007 Tideway Systems Limited. 3 | 4 | This library is free software; you can redistribute it and/or 5 | modify it under the terms of the GNU Lesser General Public 6 | License as published by the Free Software Foundation; either 7 | version 2.1 of the License, or (at your option) any later version. 8 | 9 | This library is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | Lesser General Public License for more details. 
13 | 14 | You should have received a copy of the GNU Lesser General Public 15 | License along with this library; if not, write to the Free Software 16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17 | */ 18 | 19 | #include 20 | #ifdef HEAP_CHECK 21 | /* With HEAP_CHECK defined, MALLOC/FREE forward to ac_malloc/ac_free together with the caller's __FILE__ and __LINE__; otherwise they expand to plain malloc/free. */ 22 | #define MALLOC(s) (ac_malloc(s, __FILE__, __LINE__)) 23 | #define FREE(p) (ac_free(p, __FILE__, __LINE__)) 24 | 25 | void* ac_malloc(size_t, char*, int); 26 | void ac_free(void*, char*, int); 27 | 28 | #else 29 | 30 | #define MALLOC(s) (malloc(s)) 31 | #define FREE(s) (free(s)) 32 | 33 | #endif 34 | 35 | -------------------------------------------------------------------------------- /src/libesm/ac_list.h: -------------------------------------------------------------------------------- 1 | /* ac_list.h - declarations for linked list functions 2 | Copyright (C) 2007 Tideway Systems Limited. 3 | 4 | This library is free software; you can redistribute it and/or 5 | modify it under the terms of the GNU Lesser General Public 6 | License as published by the Free Software Foundation; either 7 | version 2.1 of the License, or (at your option) any later version. 8 | 9 | This library is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | Lesser General Public License for more details. 13 | 14 | You should have received a copy of the GNU Lesser General Public 15 | License along with this library; if not, write to the Free Software 16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17 | */ 18 | 19 | /* 20 | * Header for linked list implementation. 21 | */ 22 | #include "libesm/aho_corasick.h" 23 | 24 | #ifndef AC_LIST_H 25 | #define AC_LIST_H 26 | 27 | /** Structure for internal list items. */ 28 | typedef struct ac_list_item { 29 | /** Pointer to the item itself. 
*/ 30 | void* item; 31 | 32 | /** Pointer to the next ac_list_item or NULL if this is the last item. */ 33 | struct ac_list_item* next; 34 | } ac_list_item; 35 | 36 | /** Structure for linked list. */ 37 | typedef struct { 38 | /** Pointer to first list item. */ 39 | ac_list_item* first; 40 | 41 | /** Pointer to last list item. */ 42 | ac_list_item* last; 43 | } ac_list; 44 | 45 | ac_list* ac_list_new(void); 46 | ac_error_code ac_list_free(ac_list*, ac_free_function, void*); 47 | ac_error_code ac_list_add(ac_list*, void*); 48 | 49 | /* Simple item freeing methods. */ 50 | ac_error_code ac_list_free_simple_item(void*, void*); 51 | ac_error_code ac_list_free_keep_item(void*, void*); 52 | 53 | #endif /* AC_LIST_H */ 54 | 55 | -------------------------------------------------------------------------------- /src/libesm/aho_corasick.h: -------------------------------------------------------------------------------- 1 | /* aho_corasick.h - declarations for Aho Corasick implementations 2 | Copyright (C) 2007 Tideway Systems Limited. 3 | 4 | This library is free software; you can redistribute it and/or 5 | modify it under the terms of the GNU Lesser General Public 6 | License as published by the Free Software Foundation; either 7 | version 2.1 of the License, or (at your option) any later version. 8 | 9 | This library is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | Lesser General Public License for more details. 
13 | 14 | You should have received a copy of the GNU Lesser General Public 15 | License along with this library; if not, write to the Free Software 16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17 | */ 18 | 19 | #ifndef AHO_CORASICK_H 20 | #define AHO_CORASICK_H 21 | 22 | typedef enum { 23 | AC_SUCCESS = 0, 24 | AC_FAILURE = 1 25 | } ac_error_code; 26 | 27 | /** 28 | * Type for function pointers used for freeing complex item types. 29 | */ 30 | typedef ac_error_code (*ac_free_function)(void *item, void *data); 31 | 32 | /** 33 | * Type for symbols used to build strings such as keywords and query phrases. 34 | * All symbols satisfy AC_MIN_SYMBOL <= symbol <= AC_MAX_SYMBOL. NOTE(review): ac_symbol is plain char, whose signedness is implementation-defined, so ordinals above 127 may not hold where char is signed; confirm. 35 | */ 36 | typedef char ac_symbol; 37 | 38 | #define AC_MIN_SYMBOL 0 // Smallest ordinal for an ac_symbol. 39 | #define AC_MAX_SYMBOL 255 // Greatest ordinal for an ac_symbol. 40 | 41 | 42 | /** 43 | * Type for the index into a string of symbols or the length of a string of 44 | * symbols. We use 0-based indexing. 45 | */ 46 | typedef int ac_offset; 47 | 48 | 49 | /** 50 | * Structure for a single query match, containing the span of the query phrase 51 | * that matched the keyword and a pointer associated with the keyword. 52 | */ 53 | typedef struct ac_result { 54 | /** 55 | * The offset of the first symbol in the matching substring of the query 56 | * phrase. 57 | */ 58 | ac_offset start; 59 | 60 | /** The offset of the symbol after the last symbol in the matching 61 | * substring of the query phrase. 62 | */ 63 | ac_offset end; 64 | 65 | /** Pointer associated with the keyword. */ 66 | void *object; 67 | } ac_result; 68 | 69 | /** 70 | * Interface states for index objects. 71 | */ 72 | typedef enum ac_index_state { 73 | 74 | /** 75 | * The index is 'unfixed'. 
New keywords can be entered but the index 76 | * cannot be queried. 77 | */ 78 | AC_INDEX_UNFIXED, 79 | 80 | /** 81 | * The index is 'fixed'. 
The index can be queried but new keywords cannot 82 | * be entered. 83 | */ 84 | AC_INDEX_FIXED 85 | 86 | } ac_index_state; 87 | 88 | 89 | /** 90 | * Structure for index objects. 91 | */ 92 | typedef struct ac_index { 93 | 94 | /** The interface state of the index. */ 95 | ac_index_state index_state; 96 | 97 | /** The top state node of the index's pattern matching machine. */ 98 | struct ac_state *state_0; 99 | 100 | } ac_index; 101 | 102 | /** 103 | * Type for result callback functions. 104 | */ 105 | typedef ac_error_code (*ac_result_callback)(void *result_cb_data, 106 | ac_result *result); 107 | 108 | // Operations for index objects. 109 | 110 | /** 111 | * Construct a new index. The state is initially 'unfixed', meaning that the 112 | * user can enter new keywords but can not query the index. 113 | * 114 | * @return a pointer to the ac_index or NULL if an error was encountered. 115 | */ 116 | ac_index * 117 | ac_index_new(void); 118 | 119 | /** 120 | * Free the index and all its subordinate objects. The provided object_free 121 | * function should do whatever necessary to free each associated object. It is 122 | * called once for each associated object that was given to ac_index_enter. 123 | * 124 | * @param self the index 125 | * @param object_free function that frees associated objects. 126 | * @return AC_SUCCESS if successful or AC_FAILURE if an error was encountered. 127 | */ 128 | ac_error_code 129 | ac_index_free(ac_index *self, ac_free_function object_free); 130 | 131 | /** 132 | * Add a keyword and an associated object to the index. 133 | * 134 | * @param self the index 135 | * @param keyword pointer to an array of symbols that comprise the keyword 136 | * @param size the number of symbols in the keyword 137 | * @param object pointer to an associated object that is passed to the result 138 | * callback when the keyword is matched. 
139 | * @return AC_SUCCESS if the keyword and object were added successfully or 140 | * AC_FAILURE if an error was encountered or if the index has already 141 | * been fixed. 142 | */ 143 | ac_error_code 144 | ac_index_enter(ac_index *self, 145 | ac_symbol *keyword, 146 | ac_offset size, 147 | void *object); 148 | 149 | /** 150 | * Fix the index, making it ready to be queried. 151 | * 152 | * @param self the index 153 | * @return AC_SUCCESS if the index was successfully fixed or AC_FAILURE if an 154 | * error was encountered or if the index has already been fixed. 155 | */ 156 | ac_error_code 157 | ac_index_fix(ac_index *self); 158 | 159 | /** 160 | * Query the index with the given phrase of the given size. Matching keyword 161 | * spans and associated objects are sent with result_cb_data to result_cb. 162 | * 163 | * @param self the index 164 | * @param phrase pointer to an array of symbols that will be searched 165 | * @param size the number of symbols in the array 166 | * @param result_cb function that will be called whenever a match is found 167 | * @param result_cb_data pointer to a context object that will be passed to the 168 | * callback whenever a keyword is matched 169 | * @return AC_SUCCESS if the query was successful (even if there were no 170 | * matches) or AC_FAILURE if an error was encountered 171 | */ 172 | ac_error_code 173 | ac_index_query_cb(ac_index *self, 174 | ac_symbol *phrase, 175 | ac_offset size, 176 | ac_result_callback result_cb, 177 | void *result_cb_data); 178 | 179 | #endif // AHO_CORASICK_H 180 | 181 | -------------------------------------------------------------------------------- /test/Makefile.am: -------------------------------------------------------------------------------- 1 | TESTS = run_tests 2 | noinst_PROGRAMS = $(TESTS) 3 | run_tests_SOURCES = run_tests.c \ 4 | ../src/aho_corasick.c \ 5 | ../src/ac_list.c 6 | run_tests_CFLAGS = -std=c99 7 | 8 | run_tests_includedir=$(includedir)/libesm 9 | 
-------------------------------------------------------------------------------- /test/run_tests.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | ac_error_code 7 | free_string(void *item, void *data) 8 | { 9 | // Do nnothing 10 | } 11 | 12 | char * 13 | test_init(void) 14 | { 15 | if ( ! ac_index_new()) { 16 | return "ac_index_new() should not be NULL"; 17 | } 18 | 19 | return NULL; 20 | } 21 | 22 | ac_error_code 23 | collect_results(void *context, ac_result *result) 24 | { 25 | char **ctx = (char **) context; 26 | char *input = *ctx; 27 | char *object = (char *) result->object; 28 | 29 | if (input) { 30 | asprintf(ctx, "%s, (%d %d %s)", 31 | input, result->start, result->end, object); 32 | } else { 33 | asprintf(ctx, "(%d %d %s)", 34 | result->start, result->end, object); 35 | } 36 | 37 | return AC_SUCCESS; 38 | } 39 | 40 | char * 41 | test_query(void) 42 | { 43 | ac_index *index = ac_index_new(); 44 | 45 | if ( ! index) { 46 | return "Error constructing index."; 47 | } 48 | 49 | if (ac_index_enter(index, "he", 2, "HE") != AC_SUCCESS) { 50 | return "Error adding to index."; 51 | } 52 | 53 | if (ac_index_enter(index, "she", 3, "SHE") != AC_SUCCESS) { 54 | return "Error adding to index."; 55 | } 56 | 57 | if (ac_index_enter(index, "his", 3, "HIS") != AC_SUCCESS) { 58 | return "Error adding to index."; 59 | } 60 | 61 | if (ac_index_enter(index, "hers", 4, "HERS") != AC_SUCCESS) { 62 | return "Error adding to index."; 63 | } 64 | 65 | if (ac_index_fix(index) != AC_SUCCESS) { 66 | return "Error fixing index."; 67 | } 68 | 69 | char *phrase = "this here is history"; 70 | // .123456789.123456789 71 | // --- -- --- 72 | int phrase_length = strlen(phrase); 73 | 74 | char *results = NULL; 75 | 76 | if (ac_index_query_cb(index, 77 | phrase, 78 | phrase_length, 79 | collect_results, 80 | &results) != AC_SUCCESS) { 81 | return "Error running query."; 82 | } 83 | 84 | if ( ! 
results) { 85 | return "No results returned."; 86 | } 87 | 88 | const char *target = "(1 4 HIS), (5 7 HE), (13 16 HIS)"; 89 | if (strncmp(results, target, strlen(target)) != 0) { 90 | char *message = NULL; 91 | asprintf(&message, "Expected %s but got %s", target, results); 92 | return message; 93 | } 94 | 95 | 96 | if (ac_index_free(index, free_string) != AC_SUCCESS) { 97 | return "Error freeing index."; 98 | } 99 | 100 | 101 | return NULL; 102 | } 103 | 104 | char * 105 | test_cannot_fix_when_already_fixed(void) 106 | { 107 | ac_index *index = ac_index_new(); 108 | ac_index_fix(index); 109 | 110 | if (ac_index_fix(index) != AC_FAILURE) { 111 | return "Expected fix to fail when already fixed."; 112 | } 113 | 114 | return NULL; 115 | } 116 | 117 | char * 118 | test_cannot_enter_when_already_fixed(void) 119 | { 120 | ac_index *index = ac_index_new(); 121 | ac_index_fix(index); 122 | 123 | char *results = NULL; 124 | if (ac_index_enter(index, "foo", 3, "FOO") != AC_FAILURE) { 125 | return "Expected enter to fail after fix."; 126 | } 127 | 128 | return NULL; 129 | } 130 | 131 | char * 132 | test_cannot_query_until_fixed(void) 133 | { 134 | ac_index *index = ac_index_new(); 135 | ac_index_enter(index, "hers", 4, "HERS"); 136 | 137 | char *phrase = "this here is history"; 138 | // .123456789.123456789 139 | // --- -- --- 140 | int phrase_length = strlen(phrase); 141 | 142 | char *results = NULL; 143 | 144 | if (ac_index_query_cb(index, 145 | phrase, 146 | phrase_length, 147 | collect_results, 148 | &results) != AC_FAILURE) { 149 | return "Expected query to fail without fix."; 150 | } 151 | 152 | return NULL; 153 | } 154 | 155 | 156 | ac_error_code 157 | ignore_results(void *context, ac_result *result) 158 | { 159 | return AC_SUCCESS; 160 | } 161 | 162 | ac_error_code 163 | decref(void *item, void *data) 164 | { 165 | int *refcount = (int *) item; 166 | (*refcount)--; 167 | return AC_SUCCESS; 168 | } 169 | 170 | char * 171 | 
test_objects_for_common_endings_are_freed_correctly(void) 172 | { 173 | ac_index *index = ac_index_new(); 174 | 175 | int refcount = 0; 176 | ac_index_enter(index, "food", 4, &refcount); 177 | ++refcount; 178 | 179 | ac_index_enter(index, "ood", 3, &refcount); 180 | ++refcount; 181 | 182 | ac_index_fix(index); 183 | 184 | char *phrase = "blah"; 185 | void *results = NULL; 186 | ac_index_query_cb(index, 187 | phrase, 188 | strlen(phrase), 189 | collect_results, 190 | &results); 191 | 192 | ac_index_free(index, decref); 193 | 194 | if (refcount != 0) { 195 | char *message = NULL; 196 | asprintf(&message, "Expected refcount to be 0 but was %d", refcount); 197 | return message; 198 | } 199 | 200 | return NULL; 201 | } 202 | 203 | typedef char * (*test_case)(void); 204 | 205 | int 206 | main(void) 207 | { 208 | test_case tests[] = { 209 | test_init, 210 | test_query, 211 | test_cannot_fix_when_already_fixed, 212 | test_cannot_enter_when_already_fixed, 213 | test_cannot_query_until_fixed, 214 | test_objects_for_common_endings_are_freed_correctly, 215 | NULL 216 | }; 217 | 218 | for (test_case *testp = tests; *testp; ++testp) { 219 | test_case test = *testp; 220 | char *result = test(); 221 | if (result) { 222 | printf("\nFAILURE: %s\n", result); 223 | return 1; 224 | } else { 225 | printf("."); 226 | } 227 | } 228 | 229 | printf("\n"); 230 | return 0; 231 | } 232 | --------------------------------------------------------------------------------