├── .dockerignore ├── .gitignore ├── .travis.yml ├── CREDITS ├── Dockerfile ├── LICENSE ├── LICENSE-PECL ├── PECL.md ├── README.md ├── build.sh ├── config.m4 ├── docker ├── clean.sh ├── install_php5.6.sh ├── install_php7.2.sh ├── install_php7.3.sh ├── install_pyenv.sh ├── test1.sh └── test4.sh ├── examples └── benchmark.php ├── package.xml ├── src ├── multifast │ ├── acmem.h │ ├── actypes.h │ ├── ahocorasick.c │ ├── ahocorasick.h │ ├── mpool.c │ ├── mpool.h │ ├── node.c │ ├── node.h │ ├── replace.c │ └── replace.h ├── php_ahocorasick.c ├── php_ahocorasick.h ├── php_ahocorasick.stub.php ├── php_ahocorasick_arginfo.h └── php_ahocorasick_legacy_arginfo.h └── tests ├── test1.phpt ├── test2.phpt ├── test3.phpt ├── test4.phpt ├── test5.phpt └── test6.phpt /.dockerignore: -------------------------------------------------------------------------------- 1 | archive/ 2 | .idea/ 3 | .git/ 4 | modules/ 5 | tmp/ 6 | autom4* 7 | cmake-* 8 | cov/ 9 | cov-int/ 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | 4 | # Libraries 5 | *.lib 6 | *.a 7 | *.la 8 | *.lo 9 | 10 | # Shared objects (inc. 
Windows DLLs) 11 | *.dll 12 | *.so 13 | *.so.* 14 | *.dylib 15 | 16 | # Executables 17 | *.exe 18 | *.out 19 | *.app 20 | 21 | # Build 22 | build/ 23 | tmp/ 24 | autom4te.cache/ 25 | config.nice 26 | config.log 27 | 28 | # Project files 29 | .idea 30 | nbproject 31 | 32 | cmake-build* 33 | modules/ 34 | .libs 35 | .deps 36 | cov/ 37 | cov-int/ 38 | 39 | aclocal.m4 40 | acinclude.m4 41 | CMakeLists.txt 42 | config.guess 43 | config.h 44 | config.h.in 45 | config.h.in~ 46 | config.status 47 | config.sub 48 | configure 49 | configure.ac 50 | install-sh 51 | libtool 52 | ltmain.sh 53 | ltmain.sh.backup 54 | Makefile* 55 | missing 56 | mkinstalldirs 57 | run-tests.php 58 | 59 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # see http://about.travis-ci.org/docs/user/languages/php/ for more hints 2 | language: php 3 | 4 | # Use container based infrastructure for Travis 5 | sudo: false 6 | dist: xenial 7 | 8 | # list any PHP version you want to test against 9 | php: 10 | # using major version aliases 11 | # aliased to a recent 5.6.x version 12 | - 5.6 13 | # aliased to a recent 7.0.x version 14 | - 7.0 15 | # aliased to a recent 7.1.x version 16 | - 7.1 17 | # aliased to a recent 7.2.x version 18 | - 7.2 19 | # aliased to a recent 7.3.x version 20 | - 7.3 21 | # aliased to a recent 7.4.x version 22 | - "7.4snapshot" 23 | # aliased the most recent 8.x branch 24 | - nightly 25 | # aliased to a recent hhvm version 26 | - hhvm 27 | 28 | #Coverity 29 | env: 30 | global: 31 | # The next declaration is the encrypted COVERITY_SCAN_TOKEN, created 32 | # via the "travis encrypt" command using the project repo's public key 33 | - secure: 
"vMCan2hMWKZOmYDRximgj8Z+yQqVNsaA0FJ83eOjVmjjy2xxNVc53u0Qisq/FJf+1aGTz1FIz/xoSeV5tC7Gbpk6J23D98NUMX/6IfrbXl5mvLFyg4kKd8MPo4tonDMalXtbbzaaipYSRDgav/POK0A/NhrT3dmAw8t43jYu47to/uUyQULZDjPQKg/3hcDaEXQ0bXW6JMGnL73Tkx9FbYNHkAbLq8apEUpKDmfe1tyij3J5t29EjJMzFAkDTJ4fDFvFTpoM28WjKagwMQgjfNIaYYKQKtwGMcbua/lrDTxbRf3LSBwRE0oVLWW1g+9gfwEmQN0JsTnQQvzn5iPeKgdu1/ENgVKY6bz66wlMQXPL4QXUPFmZD43uzxjbr28PL9yjcmoQa+S3pgRi5hjoBl0vfKb1xBOkSsMCkxakjdYIVbQ0r6+UbNDsFq21CRsRFFXAkUvnQUpH285uo8IPYoyBM2xIi2Z3YlUjugdnee/m+th2sDRIDRUXZUC7a+BpjabvvlrFpTRH5WqaebtB04D4sH0qwIIR5s24w6MlZyVKBigamboTMg52tFshlE7w1AkWBqa0ZzrIeJvTzvoR6QEoU3r89vo3KKn4TX52ko/gG0HGq49zd2sKvTsJnzPb0Z1+AbSDbI145M7uU6hi5JQcIW9pCPmEgG6T97p3tUg=" 34 | 35 | # https://docs.travis-ci.com/user/migrating-from-legacy 36 | addons: 37 | apt: 38 | packages: 39 | - uuid-dev 40 | - build-essential 41 | - autoconf 42 | - cmake 43 | - time 44 | coverity_scan: 45 | project: 46 | name: "ph4r05/php_aho_corasick" 47 | description: "PHP extension for fast pattern matching using AhoCorasick algorithm" 48 | notification_email: ph4r05@gmail.com 49 | build_command: ./build.sh 50 | branch_pattern: coverity_scan 51 | 52 | script: 53 | - ./build.sh 54 | - NO_INTERACTION=1 REPORT_EXIT_STATUS=1 make test TESTS="--show-diff" 55 | 56 | # optionally set up exclutions and allowed failures in the matrix 57 | matrix: 58 | allow_failures: 59 | - php: nightly 60 | - php: hhvm 61 | -------------------------------------------------------------------------------- /CREDITS: -------------------------------------------------------------------------------- 1 | ;; ahocorasick 2 | Dusan Klinec (ph4r05sk) (lead) 3 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Classpoly Docker file 2 | # ----------------------------------------------------------------------------- 3 | # Creates compilation image, i.e. you can modify the built sources. 
4 | # However, binaries are also compiled during image creation 5 | # thus available in the image. 6 | 7 | # Base image is an argument - debian or ubuntu. 8 | ARG BASE_IMAGE=ubuntu:latest 9 | 10 | # Builder stage 11 | # Multistage docker build, requires docker 17.05 12 | # https://docs.docker.com/develop/develop-images/dockerfile_best-practices/ 13 | # https://docs.docker.com/engine/reference/builder/ 14 | # https://medium.com/@tonistiigi/advanced-multi-stage-build-patterns-6f741b852fae 15 | FROM ${BASE_IMAGE} AS base 16 | 17 | ARG DEVEL_TOOLS=0 18 | ENV DEBIAN_FRONTEND=noninteractive 19 | ENV TZ=Europe/Prague 20 | 21 | RUN set -ex && \ 22 | apt-get update && \ 23 | apt-get --no-install-recommends --yes install \ 24 | autoconf \ 25 | automake \ 26 | bzip2 \ 27 | ca-certificates \ 28 | curl \ 29 | g++ \ 30 | git \ 31 | libtool-bin \ 32 | make \ 33 | php7.2-dev \ 34 | pkg-config \ 35 | python \ 36 | rsync \ 37 | unzip \ 38 | wget \ 39 | && if [ "${DEVEL_TOOLS}" -eq 1 ] ; then \ 40 | apt-get --no-install-recommends --yes install \ 41 | gdb \ 42 | gdbserver \ 43 | software-properties-common \ 44 | valgrind \ 45 | vim; \ 46 | fi \ 47 | && rm -rf /var/lib/apt/lists/* 48 | 49 | 50 | # Building class poly 51 | FROM base AS builder 52 | WORKDIR /usr/local/src 53 | 54 | ENV PROJECT_DIR /usr/local/src/php_aho_corasick 55 | 56 | # Build either from current source or github repo (no local files needed then) 57 | ARG DIR_BUSTER=0 58 | 59 | # php_aho_corasick 60 | COPY config* $PROJECT_DIR/ 61 | COPY build.sh $PROJECT_DIR/ 62 | COPY src/ $PROJECT_DIR/src/ 63 | 64 | COPY docker/*.sh /usr/local/bin/ 65 | RUN set -ex \ 66 | && chmod +x /usr/local/bin/*.sh 67 | 68 | WORKDIR $PROJECT_DIR 69 | RUN set -ex \ 70 | && phpize --clean \ 71 | && phpize \ 72 | && ./configure --enable-ahocorasick \ 73 | && make clean \ 74 | && make 75 | 76 | COPY examples/ $PROJECT_DIR/examples/ 77 | 78 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 
36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 
75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. 
If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. -------------------------------------------------------------------------------- /LICENSE-PECL: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------- 2 | The PHP License, version 3.01 3 | Copyright (c) 1999 - 2018 The PHP Group. All rights reserved. 4 | -------------------------------------------------------------------- 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, is permitted provided that the following conditions 8 | are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in 15 | the documentation and/or other materials provided with the 16 | distribution. 17 | 18 | 3. The name "PHP" must not be used to endorse or promote products 19 | derived from this software without prior written permission. 
For 20 | written permission, please contact group@php.net. 21 | 22 | 4. Products derived from this software may not be called "PHP", nor 23 | may "PHP" appear in their name, without prior written permission 24 | from group@php.net. You may indicate that your software works in 25 | conjunction with PHP by saying "Foo for PHP" instead of calling 26 | it "PHP Foo" or "phpfoo" 27 | 28 | 5. The PHP Group may publish revised and/or new versions of the 29 | license from time to time. Each version will be given a 30 | distinguishing version number. 31 | Once covered code has been published under a particular version 32 | of the license, you may always continue to use it under the terms 33 | of that version. You may also choose to use such covered code 34 | under the terms of any subsequent version of the license 35 | published by the PHP Group. No one other than the PHP Group has 36 | the right to modify the terms applicable to covered code created 37 | under this License. 38 | 39 | 6. Redistributions of any form whatsoever must retain the following 40 | acknowledgment: 41 | "This product includes PHP software, freely available from 42 | <http://www.php.net/software/>". 43 | 44 | THIS SOFTWARE IS PROVIDED BY THE PHP DEVELOPMENT TEAM ``AS IS'' AND 45 | ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 46 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 47 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE PHP 48 | DEVELOPMENT TEAM OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 49 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 50 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 51 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 52 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 53 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 54 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 55 | OF THE POSSIBILITY OF SUCH DAMAGE. 
56 | 57 | -------------------------------------------------------------------- 58 | 59 | This software consists of voluntary contributions made by many 60 | individuals on behalf of the PHP Group. 61 | 62 | The PHP Group can be contacted via Email at group@php.net. 63 | 64 | For more information on the PHP Group and the PHP project, 65 | please see <http://www.php.net>. 66 | 67 | PHP includes the Zend Engine, freely available at 68 | <http://www.zend.com>. -------------------------------------------------------------------------------- /PECL.md: -------------------------------------------------------------------------------- 1 | ## PECL publishing 2 | 3 | ```bash 4 | docker build -t="ahoc" --build-arg DEVEL_TOOLS=1 . 5 | docker run -i --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --mount type=bind,src=`pwd`,dst=/aho -t ahoc 6 | 7 | cd /aho 8 | apt-get update 9 | apt-get install php7.2-xml 10 | 11 | phpize --clean 12 | phpize 13 | ./configure --enable-ahocorasick 14 | make clean 15 | make 16 | 17 | wget http://pear.php.net/go-pear.phar 18 | php go-pear.phar 19 | 20 | pear package 21 | 22 | # Upload here: https://pecl.php.net/release-upload.php 23 | ``` 24 | 25 | 26 | ## Debugging 27 | 28 | Installing phpenv in the Docker container 29 | 30 | ``` 31 | . docker/install_pyenv.sh 32 | 33 | NO_INTERACTION=1 REPORT_EXIT_STATUS=1 make test TESTS="--show-diff" 34 | ``` 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # php_aho_corasick 2 | [![Build Status](https://travis-ci.org/ph4r05/php_aho_corasick.svg?branch=master)](https://travis-ci.org/ph4r05/php_aho_corasick) 3 | [![Coverity Status](https://scan.coverity.com/projects/7177/badge.svg)](https://scan.coverity.com/projects/ph4r05-php_aho_corasick) 4 | 5 | PHP extension implementing Aho-Corasick pattern matching algorithm (more on [wiki]). 
6 | 7 | It is especially effective if there is a large database of needles (=strings to be searched, for example virus signatures). 8 | Another advantage is that the search structure is built in a separate call before searching, so it can be reused 9 | many times with different haystacks, saving time. 10 | 11 | Computing Aho-Corasick in native code (PHP extension) rather than in pure PHP gives this implementation 12 | a significant performance boost. 13 | 14 | ## Dependencies 15 | This project is a simple PHP wrapper of (or interface to) another project: [MultiFast]. Sources include MultiFast library v 2.0. 16 | No extra dependencies are required. The [MultiFast] library is wrapped as a PHP extension loadable into PHP. 17 | 18 | The source of inspiration for this project was a great [tutorial]. 19 | 20 | Compatible with PHP 5.3+ and PHP 7.0+. 21 | 22 | ## PECL & Licensing 23 | The original project [MultiFast] is licensed under LGPLv3 so this PHP wrapper is also licensed under LGPLv3. 24 | Thanks to the [author] of [MultiFast], Kamiar Kanani, who gave me [permission] to license the code under PHP License 3.01 for the purpose 25 | of adding this extension to the PECL repository. 26 | 27 | https://pecl.php.net/package/ahocorasick 28 | 29 | PECL installation: 30 | 31 | ```bash 32 | pecl install channel://pecl.php.net/ahocorasick-0.0.7 33 | ``` 34 | 35 | Note that `php-dev` (or `php-devel`, depending on your distribution) is required for the PECL package to compile. 36 | 37 | ## Build 38 | ```bash 39 | phpize 40 | ./configure --enable-ahocorasick 41 | make 42 | ``` 43 | 44 | ## Docker build 45 | 46 | ```bash 47 | $ docker build -t="ahoc" . 48 | $ docker run -i -t ahoc 49 | $ php -d extension=modules/ahocorasick.so -f examples/test.php 50 | ``` 51 | 52 | Install debugging tools, remote debugging 53 | 54 | ```bash 55 | $ docker build -t="ahoc" --build-arg DEVEL_TOOLS=1 . 
56 | $ docker run -i --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t ahoc 57 | $ docker run -i --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --mount type=bind,src=`pwd`,dst=/aho -t ahoc 58 | ``` 59 | 60 | Recompiling, or cache busting: 61 | 62 | ```bash 63 | $ docker build -t="ahoc" --build-arg DIR_BUSTER="$(date)" . 64 | ``` 65 | 66 | ## Usage 67 | This extension is case sensitive, so if you want case-insensitive matching, convert every string passed to this algorithm to 68 | lowercase (use mb_strtolower() for example). 69 | 70 | For more usage examples, see the provided testing examples. 71 | 72 | `examples/test.php`: 73 | ```php 74 | $data = array( 75 | array('key'=>'ab', 'value'=>'alfa'), 76 | array('key'=>'ac', 'value'=>'beta'), 77 | array('key'=>'ad', 'value'=>'gamma', 'aux'=>array(1)), 78 | array('key'=>'ae', 'value'=>'delta'), 79 | array('id'=>0, 'value'=>'zeta'), 80 | array('key'=>'ag', 'value'=>'omega'), 81 | array('value'=>'lfa') 82 | ); 83 | 84 | // initialize search , returns resourceID for search structure 85 | $c = ahocorasick_init($data); 86 | 87 | // perform search 88 | $d1 = ahocorasick_match("alFABETA gamma zetaomegaalfa!", $c); 89 | 90 | // deinitialize search structure (will free memory) 91 | ahocorasick_deinit($c); 92 | 93 | var_dump($d1); 94 | ``` 95 | 96 | Call with: 97 | ```bash 98 | php -d extension=modules/ahocorasick.so -f examples/test.php 99 | ``` 100 | 101 | Results with: 102 | ``` 103 | array(5) { 104 | [0]=> 105 | array(5) { 106 | ["pos"]=> 107 | int(14) 108 | ["key"]=> 109 | string(2) "ad" 110 | ["aux"]=> 111 | array(1) { 112 | [0]=> 113 | int(1) 114 | } 115 | ["start_postion"]=> 116 | int(9) 117 | ["value"]=> 118 | string(5) "gamma" 119 | } 120 | [1]=> 121 | array(4) { 122 | ["pos"]=> 123 | int(19) 124 | ["keyIdx"]=> 125 | int(0) 126 | ["start_postion"]=> 127 | int(15) 128 | ["value"]=> 129 | string(4) "zeta" 130 | } 131 | [2]=> 132 | array(4) { 133 | ["pos"]=> 134 | int(24) 135 | ["key"]=> 136 | string(2) "ag" 137 | 
["start_postion"]=> 138 | int(19) 139 | ["value"]=> 140 | string(5) "omega" 141 | } 142 | [3]=> 143 | array(4) { 144 | ["pos"]=> 145 | int(28) 146 | ["key"]=> 147 | string(2) "ab" 148 | ["start_postion"]=> 149 | int(24) 150 | ["value"]=> 151 | string(4) "alfa" 152 | } 153 | [4]=> 154 | array(3) { 155 | ["pos"]=> 156 | int(28) 157 | ["start_postion"]=> 158 | int(25) 159 | ["value"]=> 160 | string(3) "lfa" 161 | } 162 | } 163 | ``` 164 | 165 | ## Benchmark 166 | In this repo you can find the `examples/benchmark.php` file; with it you can perform your own benchmark and measure the speedup. 167 | 168 | My setup generates random haystacks and needles from alphabet="abcdef". 5 measurements of the time spent searching are performed and the average is computed. 169 | Search structure construction is counted in the time measurements. 170 | 171 | Script generates: 172 | * 256 random haystacks of size 8192 characters 173 | * 2048 needles with 16 characters. 174 | 175 | Principle: 176 | * Naive approach simply iterates over haystacks and needles; the search is performed with strpos(). 177 | * Aho-Corasick approach constructs the search structure, then all haystacks are searched for needles. 178 | 179 | Results: 180 | ``` 181 | $> php -d extension=modules/ahocorasick.so -f examples/benchmark.php 182 | Classic search; sampleCount: 10; keySize: 2048; timeAvg: 13.060877 183 | AhoCorasick search; sampleCount: 10; keySize: 2048; timeAvg: 0.174326 s, totalTime: 1.743264 s, memory increase: 272 B 184 | AhoCorasick pattern matching is 74.921962 times faster than naive approach 185 | ``` 186 | 187 | Speedup: 74x compared to the naive approach. 188 | 189 | ## API 190 | Documentation writing is in progress. 191 | 192 | Basic ideas of the API: 193 | * AhoCorasick pattern matching engine has to be initialized (`ahocorasick_init()`) before use and deinitialized (`ahocorasick_deinit()`) 194 | after use so memory is handled properly. 
195 | * Engine has to be fed with pattern matching rules, given as array of rules, either to initialization function (`ahocorasick_init()`) 196 | or later (`ahocorasick_add_patterns()`). 197 | * After engine is finalized (`ahocorasick_finalize()`) or a first matching is performed (`ahocorasick_match()`) no further patterns are 198 | allowed, as underlying searching trie is finalized. 199 | * When matching finishes, it returns array of matched results. Each entry determines position of the found occurrence and pattern 200 | that was matched. 201 | * Modifications made during the php 7 migration: 202 | ** 'value' is the default key when adding patterns. 203 | ** 'start_postion' field added to the results. The original algorithm returns the end position of the matched patterns. 204 | 205 | Rules: 206 | * Simplest pattern looks like: 207 | ```php 208 | array('lorem') 209 | ``` 210 | * Pattern can be identified, so it is easier to process result from match call. Either by string 211 | ```php 212 | array('key'=>'ae', 'value'=>'delta') 213 | ``` 214 | or integer 215 | ```php 216 | array('id'=>0, 'value'=>'zeta') 217 | ``` 218 | * Pattern can carry an arbitrary object 219 | ```php 220 | array('key'=>'ad', 'value'=>'gamma', 'aux'=>array(1)) 221 | ``` 222 | 223 | ## Development 224 | 225 | ``` 226 | # Create package for distribution 227 | pear package 228 | ``` 229 | 230 | ### OSX Mojave 231 | 232 | Since Mojave the PHP module needs to be codesigned in order to be loaded to the process. 233 | 234 | https://developer.apple.com/library/archive/technotes/tn2206/_index.html 235 | 236 | 237 | Donating 238 | ======== 239 | 240 | This implementation is an open source. If you like the code or you do find it useful please feel free to donate to the 241 | author whatever amount you would like by clicking on the paypal button below. 242 | And if you don't feel like donating, that's OK too. 
243 | 244 | [![](https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=XK6RLD768RGGJ&lc=SK&item_name=ph4r05&item_number=php_aho_corasick%2egit&currency_code=EUR&bn=PP%2dDonationsBF%3abtn_donateCC_LG%2egif%3aNonHosted) 245 | 246 | Bitcoin: 247 | 248 | ![1Gh3TC55L4FjCyS2y5WKc4EGMYBYa6qvDw](https://deadcode.me/btc-aho.png)
`1Gh3TC55L4FjCyS2y5WKc4EGMYBYa6qvDw` 249 | 250 | Monero: 251 | ``` 252 | 89bF7TFrhdyczkz6JmUzXx57yQa3fb28tbyT8nXLpj3bVFfQEE6cpjxec1gAJVSWHBBG7ex2XBj7u6BLrgKBaEmuSzWgdcn 253 | ``` 254 | 255 | [wiki]: http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm 256 | [MultiFast]: http://sourceforge.net/projects/multifast/?source=dlp 257 | [tutorial]: http://devzone.zend.com/446/extension-writing-part-iii-resources/ 258 | [permission]: https://sourceforge.net/p/multifast/discussion/1317362/thread/dc5b4a1e/#a0a2 259 | [author]: https://sourceforge.net/u/kamiark/ 260 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | phpize --clean 3 | phpize 4 | ./configure --enable-ahocorasick 5 | make clean 6 | make 7 | -------------------------------------------------------------------------------- /config.m4: -------------------------------------------------------------------------------- 1 | PHP_ARG_ENABLE(ahocorasick, whether to enable AhoCorasick support, [ --enable-ahocorasick Enable Aho Corasick support]) 2 | if test "$PHP_AHOCORASICK" = "yes"; then 3 | AC_DEFINE(HAVE_AHOCORASICK, 1, [Whether you have Aho Corasick]) 4 | PHP_NEW_EXTENSION(ahocorasick, \ 5 | src/php_ahocorasick.c \ 6 | src/multifast/node.c \ 7 | src/multifast/ahocorasick.c \ 8 | src/multifast/mpool.c \ 9 | src/multifast/replace.c, $ext_shared) 10 | fi 11 | -------------------------------------------------------------------------------- /docker/clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | find . -name \*.gcno -o -name \*.gcda | xargs rm -f 3 | find . -name \*.lo -o -name \*.o | xargs rm -f 4 | find . -name \*.la -o -name \*.a | xargs rm -f 5 | find . -name \*.so | xargs rm -f 6 | find . 
-name .libs -a -type d|xargs rm -rf 7 | rm -f libphp.la modules/* libs/* 8 | -------------------------------------------------------------------------------- /docker/install_php5.6.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export DEBIAN_FRONTEND=noninteractive 3 | 4 | apt-get remove --yes php7.2-dev 5 | apt-get remove --yes php7.3-dev 6 | 7 | apt-get update 8 | apt-get install --yes software-properties-common 9 | add-apt-repository ppa:ondrej/php 10 | apt-get update 11 | 12 | apt-get install --no-install-recommends --yes php5.6-dev 13 | -------------------------------------------------------------------------------- /docker/install_php7.2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export DEBIAN_FRONTEND=noninteractive 3 | 4 | apt-get remove --yes php5.6-dev 5 | apt-get remove --yes php7.3-dev 6 | apt-get install --no-install-recommends php7.2-dev 7 | -------------------------------------------------------------------------------- /docker/install_php7.3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export DEBIAN_FRONTEND=noninteractive 3 | 4 | apt-get remove --yes php5.6-dev 5 | apt-get remove --yes php7.2-dev 6 | 7 | apt-get update 8 | apt-get install --yes software-properties-common 9 | add-apt-repository ppa:ondrej/php 10 | apt-get update 11 | 12 | apt-get install --no-install-recommends php7.3-dev 13 | -------------------------------------------------------------------------------- /docker/install_pyenv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # https://github.com/phpenv/phpenv 3 | 4 | apt-get install libcurl4-openssl-dev libmcrypt-dev libreadline-dev \ 5 | libxslt1-dev libxml2-dev libbz2-dev libjpeg-dev libpng-dev \ 6 | libtidy-dev 7 | 8 | # Installer for phpenv. 
9 | curl -L http://git.io/phpenv-installer | bash 10 | 11 | export PHPENV_ROOT="/root/.phpenv" 12 | if [ -d "${PHPENV_ROOT}" ]; then 13 | export PATH="${PHPENV_ROOT}/bin:${PATH}" 14 | eval "$(phpenv init -)" 15 | fi 16 | 17 | echo 'export PATH="$HOME/.phpenv/bin:$PATH"' >> ~/.bash_profile 18 | 19 | echo 'eval "$(phpenv init -)"' >> ~/.bash_profile 20 | # Do not 'exec $SHELL -l' here: exec replaces this script's process, so none of the steps below would run. 21 | # phpenv is already initialised for the current shell by the block above; the login shell is started last. 22 | 23 | # workaround for new ubuntu 24 | cd /usr/include 25 | ln -s x86_64-linux-gnu/curl 26 | cd - 27 | 28 | # phpenv 29 | phpenv install --list 30 | phpenv install 5.5.38 31 | phpenv install 5.6.40 32 | phpenv install 7.2.16 33 | 34 | for PHP_PATH in $HOME/.phpenv/versions/[0-9].[0-9].[0-9]*; do 35 | PHP_VERSION=${PHP_PATH##*/}; 36 | unlink "${HOME}/.phpenv/versions/${PHP_VERSION%.*}" 2>/dev/null 37 | ln -s "${PHP_PATH}" "${HOME}/.phpenv/versions/${PHP_VERSION%.*}" 2>/dev/null 38 | done 39 | 40 | phpenv rehash 41 | 42 | phpenv global 5.6.40 2>/dev/null 43 | 44 | exec $SHELL -l 45 | -------------------------------------------------------------------------------- /docker/test1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | php -d extension=/usr/local/src/php_aho_corasick/modules/ahocorasick.so -f /usr/local/src/php_aho_corasick/examples/test.php 3 | -------------------------------------------------------------------------------- /docker/test4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | php -d extension=/usr/local/src/php_aho_corasick/modules/ahocorasick.so -f /usr/local/src/php_aho_corasick/examples/test4.php 3 | -------------------------------------------------------------------------------- /examples/benchmark.php: -------------------------------------------------------------------------------- 1 | $i, 'value'=>$randomKeys[$i], 'aux' => $randomBuffers); 62 | } 63 | 64 | $c = ahocorasick_init($data); 65 | foreach($randomBuffers as $randomBuffer){ 66 | $d = ahocorasick_match($randomBuffer,
$c); 67 | } 68 | 69 | ahocorasick_deinit($c); 70 | 71 | $curTime = microtime(true) - $curTime; 72 | $sum += $curTime; 73 | 74 | unset($data); 75 | unset($d); 76 | } 77 | 78 | $memStop = memory_get_usage(); 79 | $avgAho = $sum/((float)$sampleCount); 80 | 81 | printf("AhoCorasick search; sampleCount: %d; keySize: %d; timeAvg: %f s, totalTime: %f s, memory increase: %d B\n\n", 82 | $sampleCount, $keySize, $avgAho, $sum, $memStop-$memStart); 83 | 84 | printf("AhoCorasick pattern matching is %f times faster than naive approach\n", $avgNaive/$avgAho); 85 | 86 | -------------------------------------------------------------------------------- /package.xml: -------------------------------------------------------------------------------- 1 | 2 | 10 | ahocorasick 11 | pecl.php.net 12 | Effective Aho-Corasick string pattern matching algorithm 13 | PHP extension implementing Aho-Corasick pattern matching algorithm (more on wiki). 14 | 15 | It is especially effective if there is a large database of needles (=strings to be searched, for example virus 16 | signatures). Another advantage is that built search structure is initialized before search in separate call thus 17 | it can be called more times with different haystack, saving time. 18 | 19 | Computing Aho-Corasick in the native code (PHP extension) rather than in a pure PHP manner gives this 20 | implementation a significant performance boost. 21 | 22 | 23 | Dusan Klinec 24 | ph4r05sk 25 | ph4r05@gmail.com 26 | yes 27 | 28 | 2019-07-08 29 | 30 | 0.0.7 31 | 0.0.1 32 | 33 | 34 | beta 35 | alpha 36 | 37 | PHP License 38 | 39 | Issue #21 fixed. Problem with buffering all previous searches in the trie.
40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 5.2.0 79 | 7.3.99 80 | 81 | 82 | 1.4.0b1 83 | 84 | 85 | 86 | ahocorasick 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /src/multifast/acmem.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Dusan Klinec on 01.12.15. 3 | // 4 | 5 | #ifndef PHP_AHO_CORASICK_ACMEM_H 6 | #define PHP_AHO_CORASICK_ACMEM_H 7 | 8 | #ifdef ZTS 9 | #include "TSRM.h" 10 | #endif 11 | 12 | #include "php.h" 13 | #include "php_ini.h" 14 | 15 | // Override memory allocator for Aho-Corasick library 16 | #define AC_MALLOC emalloc 17 | #define AC_MFREE efree 18 | 19 | #endif //PHP_AHO_CORASICK_ACMEM_H 20 | -------------------------------------------------------------------------------- /src/multifast/actypes.h: -------------------------------------------------------------------------------- 1 | /* 2 | * actypes.h: Defines basic data types of the trie 3 | * This file is part of multifast. 4 | * 5 | Copyright 2010-2015 Kamiar Kanani 6 | 7 | multifast is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Lesser General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | multifast is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Lesser General Public License for more details. 16 | 17 | You should have received a copy of the GNU Lesser General Public License 18 | along with multifast. If not, see . 
19 | */ 20 | 21 | #ifndef _AC_TYPES_H_ 22 | #define _AC_TYPES_H_ 23 | 24 | #include 25 | 26 | #ifdef __cplusplus 27 | extern "C" { 28 | #endif 29 | 30 | /** 31 | * @brief The alphabet type 32 | * 33 | * Actually defining AC_ALPHABET_t as a char works for many usage case, but 34 | * sometimes we deal with streams of other basic types e.g. integers or 35 | * enumerators. Although they consists of string of bytes (chars), but using 36 | * their specific types as AC_ALPHABET_t will lead to a better performance. 37 | * So instead of working with strings of chars, we assume that we are working 38 | * with strings of AC_ALPHABET_t and leave it optional for users to define 39 | * their own alphabets. 40 | */ 41 | typedef char AC_ALPHABET_t; 42 | 43 | /** 44 | * The text (strings of alphabets) type that is used for input/output when 45 | * dealing with the A.C. Trie. The text can contain zero value alphabets. 46 | */ 47 | typedef struct ac_text 48 | { 49 | const AC_ALPHABET_t *astring; /**< String of alphabets */ 50 | size_t length; /**< String length */ 51 | } AC_TEXT_t; 52 | 53 | /** 54 | * Pattern ID type 55 | * @see struct ac_pattid 56 | */ 57 | enum ac_pattid_type 58 | { 59 | AC_PATTID_TYPE_DEFAULT = 0, 60 | AC_PATTID_TYPE_NUMBER, 61 | AC_PATTID_TYPE_STRING 62 | }; 63 | 64 | /** 65 | * Provides a more readable representative for the pattern. Because patterns 66 | * themselves are not always suitable for displaying (e.g. patterns containing 67 | * special characters), we offer this type to improve intelligibility of the 68 | * output. Sometimes it can be also useful, when you are retrieving patterns 69 | * from a database, to maintain their identifiers in the trie for further 70 | * reference. We provisioned two possible types as a union. you can add your 71 | * type here. 
72 | */ 73 | typedef struct ac_pattid 74 | { 75 | union 76 | { 77 | const char *stringy; /**< Null-terminated string */ 78 | long number; /**< Item indicator */ 79 | } u; 80 | 81 | enum ac_pattid_type type; /**< Shows the type of id */ 82 | 83 | } AC_PATTID_t; 84 | 85 | /** 86 | * This is the pattern type that the trie must be fed by. 87 | */ 88 | typedef struct ac_pattern 89 | { 90 | AC_TEXT_t ptext; /**< The search string */ 91 | AC_TEXT_t rtext; /**< The replace string */ 92 | AC_PATTID_t id; /**< Pattern identifier */ 93 | void * aux; /**< User defined object */ 94 | } AC_PATTERN_t; 95 | 96 | /** 97 | * @brief Provides the structure for reporting a match in the text. 98 | * 99 | * A match occurs when the trie reaches a final node. Any final 100 | * node can match one or more patterns at a position in the input text. 101 | * the 'patterns' field holds these matched patterns. Obviously these 102 | * matched patterns have same end-position in the text. There is a relationship 103 | * between matched patterns: the shorter one is a factor (tail) of the longer 104 | * one. The 'position' maintains the end position of matched patterns. 105 | */ 106 | typedef struct ac_match 107 | { 108 | AC_PATTERN_t *patterns; /**< Array of matched pattern(s) */ 109 | size_t size; /**< Number of matched pattern(s) */ 110 | 111 | size_t position; /**< The end position of the matching pattern(s) in 112 | * the input text */ 113 | } AC_MATCH_t; 114 | 115 | /** 116 | * The return status of various A.C. Trie functions 117 | */ 118 | typedef enum ac_status 119 | { 120 | ACERR_SUCCESS = 0, /**< No error occurred */ 121 | ACERR_DUPLICATE_PATTERN, /**< Duplicate patterns */ 122 | ACERR_LONG_PATTERN, /**< Pattern length is too long */ 123 | ACERR_ZERO_PATTERN, /**< Empty pattern (zero length) */ 124 | ACERR_TRIE_CLOSED /**< Trie is closed. */ 125 | } AC_STATUS_t; 126 | 127 | /** 128 | * @ brief The call-back function to report the matched patterns back to the 129 | * caller. 
130 | * 131 | * When a match is found, the trie will reach the caller using this 132 | * function. You can send parameters to the call-back function when you call 133 | * _search() or _replace() functions. The call-back function receives those 134 | * parameters as the second parameter determined by void * in bellow. If you 135 | * return 0 from call-back function, it will tell trie to continue 136 | * searching, otherwise it will return from the trie function. 137 | */ 138 | typedef int (*AC_MATCH_CALBACK_f)(AC_MATCH_t *, void *); 139 | 140 | /** 141 | * @brief Call-back function to receive the replacement text (chunk by chunk). 142 | */ 143 | typedef void (*MF_REPLACE_CALBACK_f)(AC_TEXT_t *, void *); 144 | 145 | /** 146 | * Maximum accepted length of search/replace pattern 147 | */ 148 | #define AC_PATTRN_MAX_LENGTH 1024 149 | 150 | /** 151 | * Replacement buffer size 152 | */ 153 | #define MF_REPLACEMENT_BUFFER_SIZE 2048 154 | 155 | #if (MF_REPLACEMENT_BUFFER_SIZE <= AC_PATTRN_MAX_LENGTH) 156 | #error "REPLACEMENT_BUFFER_SIZE must be bigger than AC_PATTRN_MAX_LENGTH" 157 | #endif 158 | 159 | typedef enum act_working_mode 160 | { 161 | AC_WORKING_MODE_SEARCH = 0, /* Default */ 162 | AC_WORKING_MODE_FINDNEXT, 163 | AC_WORKING_MODE_REPLACE /* Not used */ 164 | } ACT_WORKING_MODE_t; 165 | 166 | 167 | #ifdef __cplusplus 168 | } 169 | #endif 170 | 171 | #endif 172 | -------------------------------------------------------------------------------- /src/multifast/ahocorasick.c: -------------------------------------------------------------------------------- 1 | /* 2 | * ahocorasick.c: Implements the A. C. Trie functionalities 3 | * This file is part of multifast. 4 | * 5 | Copyright 2010-2015 Kamiar Kanani 6 | 7 | multifast is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Lesser General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 
11 | 12 | multifast is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Lesser General Public License for more details. 16 | 17 | You should have received a copy of the GNU Lesser General Public License 18 | along with multifast. If not, see . 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include "node.h" 26 | #include "ahocorasick.h" 27 | #include "mpool.h" 28 | 29 | /* Privates */ 30 | 31 | static void ac_trie_set_failure 32 | (ACT_NODE_t *node, AC_ALPHABET_t *alphas); 33 | 34 | static void ac_trie_traverse_setfailure 35 | (ACT_NODE_t *node, AC_ALPHABET_t *prefix); 36 | 37 | static void ac_trie_traverse_action 38 | (ACT_NODE_t *node, void(*func)(ACT_NODE_t *), int top_down); 39 | 40 | static void ac_trie_reset 41 | (AC_TRIE_t *thiz); 42 | 43 | static int ac_trie_match_handler 44 | (AC_MATCH_t * matchp, void * param); 45 | 46 | /* Friends */ 47 | 48 | extern void mf_repdata_init (AC_TRIE_t *thiz); 49 | extern void mf_repdata_reset (MF_REPLACEMENT_DATA_t *rd); 50 | extern void mf_repdata_release (MF_REPLACEMENT_DATA_t *rd); 51 | extern void mf_repdata_allocbuf (MF_REPLACEMENT_DATA_t *rd); 52 | 53 | 54 | /** 55 | * @brief Initializes the trie; allocates memories and sets initial values 56 | * 57 | * @return 58 | *****************************************************************************/ 59 | AC_TRIE_t *ac_trie_create (void) 60 | { 61 | AC_TRIE_t *thiz = (AC_TRIE_t *) malloc (sizeof(AC_TRIE_t)); 62 | thiz->mp = mpool_create(0); 63 | 64 | thiz->root = node_create (thiz); 65 | 66 | thiz->patterns_count = 0; 67 | 68 | mf_repdata_init (thiz); 69 | ac_trie_reset (thiz); 70 | thiz->text = NULL; 71 | thiz->position = 0; 72 | 73 | thiz->wm = AC_WORKING_MODE_SEARCH; 74 | thiz->trie_open = 1; 75 | 76 | return thiz; 77 | } 78 | 79 | /** 80 | * @brief Adds pattern to the trie. 
81 | * 82 | * @param thiz pointer to the trie 83 | * @param patt pointer to the pattern 84 | * @param copy should trie make a copy of pattern strings or not, if not, 85 | * then user must keep the strings valid for the life-time of the trie. If 86 | * the patterns are available in the user program then call the function with 87 | * copy = 0 and do not waste memory. 88 | * 89 | * @return The return value indicates the success or failure of adding action 90 | *****************************************************************************/ 91 | AC_STATUS_t ac_trie_add (AC_TRIE_t *thiz, AC_PATTERN_t *patt, int copy) 92 | { 93 | size_t i; 94 | ACT_NODE_t *n = thiz->root; 95 | ACT_NODE_t *next; 96 | AC_ALPHABET_t alpha; 97 | 98 | if(!thiz->trie_open) 99 | return ACERR_TRIE_CLOSED; 100 | 101 | if (!patt->ptext.length) 102 | return ACERR_ZERO_PATTERN; 103 | 104 | if (patt->ptext.length > AC_PATTRN_MAX_LENGTH) 105 | return ACERR_LONG_PATTERN; 106 | 107 | for (i = 0; i < patt->ptext.length; i++) 108 | { 109 | alpha = patt->ptext.astring[i]; 110 | if ((next = node_find_next (n, alpha))) 111 | { 112 | n = next; 113 | continue; 114 | } 115 | else 116 | { 117 | next = node_create_next (n, alpha); 118 | next->depth = n->depth + 1; 119 | n = next; 120 | } 121 | } 122 | 123 | if(n->final) 124 | return ACERR_DUPLICATE_PATTERN; 125 | 126 | n->final = 1; 127 | node_accept_pattern (n, patt, copy); 128 | thiz->patterns_count++; 129 | 130 | return ACERR_SUCCESS; 131 | } 132 | 133 | /** 134 | * @brief Finalizes the preprocessing stage and gets the trie ready 135 | * 136 | * Locates the failure node for all nodes and collects all matched 137 | * pattern for each node. It also sorts outgoing edges of node, so binary 138 | * search could be performed on them. After calling this function the automaton 139 | * will be finalized and you can not add new patterns to the automaton.
140 | * 141 | * @param thiz pointer to the trie 142 | *****************************************************************************/ 143 | void ac_trie_finalize (AC_TRIE_t *thiz) 144 | { 145 | AC_ALPHABET_t prefix[AC_PATTRN_MAX_LENGTH]; 146 | 147 | /* 'prefix' defined here, because ac_trie_traverse_setfailure() calls 148 | * itself recursively */ 149 | ac_trie_traverse_setfailure (thiz->root, prefix); 150 | 151 | ac_trie_traverse_action (thiz->root, node_collect_matches, 1); 152 | mf_repdata_allocbuf (&thiz->repdata); 153 | 154 | thiz->trie_open = 0; /* Do not accept patterns any more */ 155 | } 156 | 157 | /** 158 | * @brief Search in the input text using the given trie. 159 | * 160 | * @param thiz pointer to the trie 161 | * @param text input text to be searched 162 | * @param keep indicated that if the input text the successive chunk of the 163 | * previous given text or not 164 | * @param callback when a match occurs this function will be called. The 165 | * call-back function in turn after doing its job, will return an integer 166 | * value, 0 means continue search, and non-0 value means stop search and return 167 | * to the caller. 168 | * @param user this parameter will be send to the call-back function 169 | * 170 | * @return 171 | * -1: failed; trie is not finalized 172 | * 0: success; input text was searched to the end 173 | * 1: success; input text was searched partially. (callback broke the loop) 174 | *****************************************************************************/ 175 | int ac_trie_search (AC_TRIE_t *thiz, AC_TEXT_t *text, int keep, 176 | AC_MATCH_CALBACK_f callback, void *user) 177 | { 178 | size_t position; 179 | ACT_NODE_t *current; 180 | ACT_NODE_t *next; 181 | AC_MATCH_t match; 182 | 183 | if (thiz->trie_open) 184 | return -1; /* Trie must be finalized first. 
*/ 185 | 186 | if (thiz->wm == AC_WORKING_MODE_FINDNEXT) 187 | position = thiz->position; 188 | else 189 | position = 0; 190 | 191 | if (!keep) 192 | ac_trie_reset (thiz); 193 | 194 | current = thiz->last_node; 195 | 196 | /* This is the main search loop. 197 | * It must be kept as lightweight as possible. 198 | */ 199 | while (position < text->length) 200 | { 201 | if (!(next = node_find_next_bs (current, text->astring[position]))) 202 | { 203 | if(current->failure_node /* We are not in the root node */) 204 | current = current->failure_node; 205 | else 206 | position++; 207 | } 208 | else 209 | { 210 | current = next; 211 | position++; 212 | } 213 | 214 | if (current->final && next) 215 | /* We check 'next' to find out if we have come here after a alphabet 216 | * transition or due to a fail transition. in second case we should not 217 | * report match, because it has already been reported */ 218 | { 219 | /* Found a match! */ 220 | match.position = position + thiz->base_position; 221 | match.size = current->matched_size; 222 | match.patterns = current->matched; 223 | 224 | /* Do call-back */ 225 | if (callback(&match, user)) 226 | { 227 | if (thiz->wm == AC_WORKING_MODE_FINDNEXT) { 228 | thiz->position = position; 229 | thiz->last_node = current; 230 | } 231 | return 1; 232 | } 233 | } 234 | } 235 | 236 | /* Save status variables */ 237 | thiz->last_node = current; 238 | thiz->base_position += position; 239 | 240 | return 0; 241 | } 242 | 243 | /** 244 | * @brief sets the input text to be searched by a function call to _findnext() 245 | * 246 | * @param thiz The pointer to the trie 247 | * @param text The text to be searched. The owner of the text is the 248 | * calling program and no local copy is made, so it must be valid until you 249 | * have done with it. 
250 | * @param keep Indicates that if the given text is the sequel of the previous 251 | * one or not; 1: it is, 0: it is not 252 | *****************************************************************************/ 253 | void ac_trie_settext (AC_TRIE_t *thiz, AC_TEXT_t *text, int keep) 254 | { 255 | if (!keep) 256 | ac_trie_reset (thiz); 257 | 258 | thiz->text = text; 259 | thiz->position = 0; 260 | } 261 | 262 | /** 263 | * @brief finds the next match in the input text which is set by _settext() 264 | * 265 | * @param thiz The pointer to the trie 266 | * @return The match structure, returned by value; size == 0 means that no further match was found or that no text has been set 267 | *****************************************************************************/ 268 | AC_MATCH_t ac_trie_findnext (AC_TRIE_t *thiz) 269 | { 270 | AC_MATCH_t match; 271 | 272 | /* Initialise every field so that a "no match" result never exposes indeterminate values */ match.size = 0; match.position = 0; match.patterns = NULL; 273 | if (!thiz->text) return match; /* _settext() has not been called: there is nothing to search */ 274 | thiz->wm = AC_WORKING_MODE_FINDNEXT; 275 | ac_trie_search (thiz, thiz->text, 1, 276 | ac_trie_match_handler, (void *)&match); 277 | 278 | thiz->wm = AC_WORKING_MODE_SEARCH; 279 | 280 | return match; 281 | } 282 | 283 | /** 284 | * @brief Release all allocated memories to the trie 285 | * 286 | * @param thiz pointer to the trie 287 | *****************************************************************************/ 288 | void ac_trie_release (AC_TRIE_t *thiz) 289 | { 290 | /* It must be called with a 0 top-down parameter */ 291 | ac_trie_traverse_action (thiz->root, node_release_vectors, 0); 292 | 293 | mf_repdata_release (&thiz->repdata); 294 | mpool_free(thiz->mp); 295 | free(thiz); 296 | } 297 | 298 | /** 299 | * @brief Prints the trie to output in human readable form. It is useful 300 | * for debugging purpose.
301 | * 302 | * @param thiz pointer to the trie 303 | *****************************************************************************/ 304 | void ac_trie_display (AC_TRIE_t *thiz) 305 | { 306 | ac_trie_traverse_action (thiz->root, node_display, 1); 307 | } 308 | 309 | /** 310 | * @brief the match handler function used in _findnext function 311 | * 312 | * @param matchp 313 | * @param param 314 | * @return 315 | *****************************************************************************/ 316 | static int ac_trie_match_handler (AC_MATCH_t * matchp, void * param) 317 | { 318 | AC_MATCH_t * mp = (AC_MATCH_t *)param; 319 | mp->position = matchp->position; 320 | mp->patterns = matchp->patterns; 321 | mp->size = matchp->size; 322 | return 1; 323 | } 324 | 325 | /** 326 | * @brief reset the trie and make it ready for doing new search 327 | * 328 | * @param thiz pointer to the trie 329 | *****************************************************************************/ 330 | static void ac_trie_reset (AC_TRIE_t *thiz) 331 | { 332 | thiz->last_node = thiz->root; 333 | thiz->base_position = 0; 334 | mf_repdata_reset (&thiz->repdata); 335 | } 336 | 337 | /** 338 | * @brief Finds and bookmarks the failure transition for the given node. 339 | * 340 | * @param node the node pointer 341 | * @param prefix The array that contain the prefix that leads the path from 342 | * root the the node. 
343 | *****************************************************************************/ 344 | static void ac_trie_set_failure 345 | (ACT_NODE_t *node, AC_ALPHABET_t *prefix) 346 | { 347 | size_t i, j; 348 | ACT_NODE_t *n; 349 | ACT_NODE_t *root = node->trie->root; 350 | 351 | if (node == root) 352 | return; /* Failure transition is not defined for the root */ 353 | 354 | for (i = 1; i < node->depth; i++) 355 | { 356 | n = root; 357 | for (j = i; j < node->depth && n; j++) 358 | n = node_find_next (n, prefix[j]); 359 | if (n) 360 | { 361 | node->failure_node = n; 362 | break; 363 | } 364 | } 365 | 366 | if (!node->failure_node) 367 | node->failure_node = root; 368 | } 369 | 370 | /** 371 | * @brief Sets the failure transition node for all nodes 372 | * 373 | * Traverses all trie nodes using DFS (Depth First Search), setting 374 | * the failure node for every node it passes through. This function is called 375 | * after adding the last pattern to the trie. 376 | * 377 | * @param node The pointer to the root node 378 | * @param prefix The array that contains the prefix that leads the path from 379 | * root to the node 380 | *****************************************************************************/ 381 | static void ac_trie_traverse_setfailure 382 | (ACT_NODE_t *node, AC_ALPHABET_t *prefix) 383 | { 384 | size_t i; 385 | 386 | /* In each node, look for its failure node */ 387 | ac_trie_set_failure (node, prefix); 388 | 389 | for (i = 0; i < node->outgoing_size; i++) 390 | { 391 | prefix[node->depth] = node->outgoing[i].alpha; /* Make the prefix */ 392 | 393 | /* Recursively call itself to traverse all nodes */ 394 | ac_trie_traverse_setfailure (node->outgoing[i].next, prefix); 395 | } 396 | } 397 | 398 | /** 399 | * @brief Traverses the trie using DFS method and applies the 400 | * given @param func on all nodes. At top level it should be called by 401 | * sending the root node.
402 | * 403 | * @param node Pointer to trie root node 404 | * @param func The function that must be applied to all nodes 405 | * @param top_down Indicates whether the action should be applied to the node 406 | * itself and then to its children (non-zero), or vice versa (zero). 407 | *****************************************************************************/ 408 | static void ac_trie_traverse_action 409 | (ACT_NODE_t *node, void(*func)(ACT_NODE_t *), int top_down) 410 | { 411 | size_t i; 412 | 413 | if (top_down) 414 | func (node); 415 | 416 | for (i = 0; i < node->outgoing_size; i++) 417 | /* Recursively call itself to traverse all nodes */ 418 | ac_trie_traverse_action (node->outgoing[i].next, func, top_down); 419 | 420 | if (!top_down) 421 | func (node); 422 | } 423 | -------------------------------------------------------------------------------- /src/multifast/ahocorasick.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ahocorasick.h: The main ahocorasick header file. 3 | * This file is part of multifast. 4 | * 5 | Copyright 2010-2015 Kamiar Kanani 6 | 7 | multifast is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Lesser General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | multifast is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Lesser General Public License for more details. 16 | 17 | You should have received a copy of the GNU Lesser General Public License 18 | along with multifast. If not, see .
19 | */ 20 | 21 | #ifndef _AHOCORASICK_H_ 22 | #define _AHOCORASICK_H_ 23 | 24 | #include "replace.h" 25 | 26 | #ifdef __cplusplus 27 | extern "C" { 28 | #endif 29 | 30 | /* Forward declaration */ 31 | struct act_node; 32 | struct mpool; 33 | 34 | /* 35 | * The A.C. Trie data structure 36 | */ 37 | typedef struct ac_trie 38 | { 39 | struct act_node *root; /**< The root node of the trie */ 40 | 41 | size_t patterns_count; /**< Total patterns in the trie */ 42 | 43 | short trie_open; /**< This flag indicates that if trie is finalized 44 | * or not. After finalizing the trie you can not 45 | * add pattern to trie anymore. */ 46 | 47 | struct mpool *mp; /**< Memory pool */ 48 | 49 | /* ******************* Thread specific part ******************** */ 50 | 51 | /* It is possible to search a long input chunk by chunk. In order to 52 | * connect these chunks and make a continuous view of the input, we need 53 | * the following variables. 54 | */ 55 | struct act_node *last_node; /**< Last node we stopped at */ 56 | size_t base_position; /**< Represents the position of the current chunk, 57 | * related to whole input text */ 58 | 59 | AC_TEXT_t *text; /**< A helper variable to hold the input chunk */ 60 | size_t position; /**< A helper variable to hold the relative current 61 | * position in the given text */ 62 | 63 | MF_REPLACEMENT_DATA_t repdata; /**< Replacement data structure */ 64 | 65 | ACT_WORKING_MODE_t wm; /**< Working mode */ 66 | 67 | } AC_TRIE_t; 68 | 69 | /* 70 | * The API functions 71 | */ 72 | 73 | AC_TRIE_t *ac_trie_create (void); 74 | AC_STATUS_t ac_trie_add (AC_TRIE_t *thiz, AC_PATTERN_t *patt, int copy); 75 | void ac_trie_finalize (AC_TRIE_t *thiz); 76 | void ac_trie_release (AC_TRIE_t *thiz); 77 | void ac_trie_display (AC_TRIE_t *thiz); 78 | 79 | int ac_trie_search (AC_TRIE_t *thiz, AC_TEXT_t *text, int keep, 80 | AC_MATCH_CALBACK_f callback, void *param); 81 | 82 | void ac_trie_settext (AC_TRIE_t *thiz, AC_TEXT_t *text, int keep); 83 | AC_MATCH_t 
ac_trie_findnext (AC_TRIE_t *thiz); 84 | 85 | int multifast_replace (AC_TRIE_t *thiz, AC_TEXT_t *text, 86 | MF_REPLACE_MODE_t mode, MF_REPLACE_CALBACK_f callback, void *param); 87 | void multifast_rep_flush (AC_TRIE_t *thiz, int keep); 88 | 89 | 90 | #ifdef __cplusplus 91 | } 92 | #endif 93 | 94 | #endif 95 | -------------------------------------------------------------------------------- /src/multifast/mpool.c: -------------------------------------------------------------------------------- 1 | /* 2 | * mpool.c memory pool management 3 | * This file is part of multifast. 4 | * 5 | Copyright 2010-2015 Kamiar Kanani 6 | 7 | multifast is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Lesser General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | multifast is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Lesser General Public License for more details. 16 | 17 | You should have received a copy of the GNU Lesser General Public License 18 | along with multifast. If not, see . 
19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include "mpool.h" 25 | 26 | 27 | #define MPOOL_BLOCK_SIZE (24*1024) 28 | 29 | #if (MPOOL_BLOCK_SIZE % 16 > 0) 30 | #error "MPOOL_BLOCK_SIZE must be multiple 16" 31 | #endif 32 | 33 | #if (MPOOL_BLOCK_SIZE <= AC_PATTRN_MAX_LENGTH) 34 | #error "MPOOL_BLOCK_SIZE must be bigger than AC_PATTRN_MAX_LENGTH" 35 | #endif 36 | 37 | struct mpool_block 38 | { 39 | size_t size; 40 | unsigned char *bp; /* Block pointer */ 41 | unsigned char *free; /* Free area; End of allocated section */ 42 | 43 | struct mpool_block *next; /* Next block */ 44 | }; 45 | 46 | struct mpool 47 | { 48 | struct mpool_block *block; 49 | }; 50 | 51 | 52 | /** 53 | * @brief Allocate a new block to the pool 54 | * 55 | * @param size 56 | * @return 57 | ******************************************************************************/ 58 | static struct mpool_block *mpool_new_block (size_t size) 59 | { 60 | struct mpool_block *block; 61 | 62 | if (!size) 63 | size = MPOOL_BLOCK_SIZE; 64 | 65 | block = (struct mpool_block *) AC_MALLOC (sizeof(struct mpool_block)); 66 | 67 | block->bp = block->free = AC_MALLOC(size); 68 | block->size = size; 69 | block->next = NULL; 70 | 71 | return block; 72 | } 73 | 74 | /** 75 | * @brief Creates a new pool 76 | * 77 | * @param size 78 | * @return 79 | ******************************************************************************/ 80 | struct mpool *mpool_create (size_t size) 81 | { 82 | struct mpool *ret; 83 | 84 | ret = AC_MALLOC (sizeof(struct mpool)); 85 | ret->block = mpool_new_block(size); 86 | 87 | return ret; 88 | } 89 | 90 | /** 91 | * @brief Free a pool 92 | * 93 | * @param pool 94 | ******************************************************************************/ 95 | void mpool_free (struct mpool *pool) 96 | { 97 | struct mpool_block *p, *p_next; 98 | 99 | if (!pool) 100 | return; 101 | 102 | if (!pool->block) { 103 | AC_MFREE(pool); 104 | return; 105 | } 106 | 107 | p = pool->block; 108 | 109 | while (p) 
{ 110 | p_next = p->next; 111 | AC_MFREE(p->bp); 112 | AC_MFREE(p); 113 | p = p_next; 114 | } 115 | 116 | AC_MFREE(pool); 117 | } 118 | 119 | /** 120 | * @brief Allocate from a pool 121 | * 122 | * @param pool 123 | * @param size 124 | * @return 125 | ******************************************************************************/ 126 | void *mpool_malloc (struct mpool *pool, size_t size) 127 | { 128 | void *ret = NULL; 129 | struct mpool_block *block, *new_block; 130 | size_t remain, block_size; 131 | 132 | if(!pool || !pool->block || !size) 133 | return NULL; 134 | 135 | size = (size + 15) & ~0xF; /* This is to align memory allocation on 136 | * multiple 16 boundary */ 137 | 138 | block = pool->block; 139 | remain = block->size - ((size_t)block->free - (size_t)block->bp); 140 | 141 | if (remain < size) 142 | { 143 | /* Allocate a new block */ 144 | block_size = ((size > block->size) ? size : block->size); 145 | new_block = mpool_new_block (block_size); 146 | new_block->next = block; 147 | block = pool->block = new_block; 148 | } 149 | 150 | ret = block->free; 151 | 152 | block->free = block->bp + (block->free - block->bp + size); 153 | 154 | return ret; 155 | } 156 | 157 | /** 158 | * @brief Makes a zero-terminated copy of a byte string with known size 159 | * 160 | * @param pool Pool that the copy is allocated from 161 | * @param str Source bytes; at least n bytes must be readable 162 | * @param n Number of bytes to copy; embedded zero bytes are copied verbatim 163 | * @return Pointer to the copy, or NULL if str is NULL or the pool allocation returned NULL 164 | *****************************************************************************/ 165 | void *mpool_strndup (struct mpool *pool, const char *str, size_t n) 166 | { 167 | void *ret; 168 | 169 | if (!str) 170 | return NULL; 171 | 172 | if ((ret = mpool_malloc(pool, n+1))) 173 | { 174 | memcpy(ret, str, n); /* not strncpy: it stops at the first zero byte and would truncate texts that contain zero-valued alphabets */ 175 | ((char *)ret)[n] = '\0'; 176 | } 177 | 178 | return ret; 179 | } 180 | 181 | /** 182 | * @brief Makes a copy of zero terminated string 183 | * 184 | * @param pool 185 | * @param str 186 | * @return 187 | ******************************************************************************/ 188 | void *mpool_strdup (struct mpool *pool, const char
*str) 189 | { 190 | size_t len; 191 | 192 | if (!str) 193 | return NULL; 194 | len = strlen(str); 195 | 196 | return mpool_strndup (pool, str, len); 197 | } 198 | -------------------------------------------------------------------------------- /src/multifast/mpool.h: -------------------------------------------------------------------------------- 1 | /* 2 | * mpool.c memory pool management 3 | * This file is part of multifast. 4 | * 5 | Copyright 2010-2015 Kamiar Kanani 6 | 7 | multifast is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Lesser General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | multifast is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU Lesser General Public License for more details. 16 | 17 | You should have received a copy of the GNU Lesser General Public License 18 | along with multifast. If not, see . 
*/

#ifndef _MPOOL_H_
#define _MPOOL_H_

#include "acmem.h"

/* Fall back to the C library allocator when acmem.h did not select one */
#ifndef AC_MALLOC
#define AC_MALLOC malloc
#endif

#ifndef AC_MFREE
#define AC_MFREE free
#endif

#ifdef __cplusplus
extern "C" {
#endif

/* Forward declaration */
struct mpool;


/* Creates a pool whose first block holds `size` bytes; passing 0 selects the
 * pool's default block size. */
struct mpool *mpool_create (size_t size);

/* Releases all blocks of the pool and the pool itself; NULL is ignored. */
void mpool_free (struct mpool *pool);

/* Returns memory taken from the pool; the size is rounded up to a multiple of
 * 16. Returns NULL when `pool` is NULL or `size` is 0. This interface offers
 * no way to release a single allocation. */
void *mpool_malloc (struct mpool *pool, size_t size);

/* Copies a zero terminated string into pool memory; NULL if `str` is NULL. */
void *mpool_strdup (struct mpool *pool, const char *str);

/* Copies at most `n` characters of `str` into pool memory and zero terminates
 * the copy; NULL if `str` is NULL. */
void *mpool_strndup (struct mpool *pool, const char *str, size_t n);


#ifdef __cplusplus
}
#endif

#endif /* _MPOOL_H_ */

--------------------------------------------------------------------------------
/src/multifast/node.c:
--------------------------------------------------------------------------------
/*
 * node.c: Implements the A.C. Trie node
 * This file is part of multifast.
 *
    Copyright 2010-2015 Kamiar Kanani

    multifast is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    multifast is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with multifast.  If not, see <http://www.gnu.org/licenses/>.
19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include "node.h" 26 | #include "mpool.h" 27 | #include "ahocorasick.h" 28 | 29 | /* Privates */ 30 | static void node_init (ACT_NODE_t *thiz); 31 | static int node_edge_compare (const void *l, const void *r); 32 | static int node_has_pattern (ACT_NODE_t *thiz, AC_PATTERN_t *patt); 33 | static void node_grow_outgoing_vector (ACT_NODE_t *thiz); 34 | static void node_grow_matched_vector (ACT_NODE_t *thiz); 35 | static void node_copy_pattern (ACT_NODE_t *thiz, 36 | AC_PATTERN_t *to, AC_PATTERN_t *from); 37 | 38 | /** 39 | * @brief Creates the node 40 | * 41 | * @return 42 | ******************************************************************************/ 43 | struct act_node * node_create (struct ac_trie *trie) 44 | { 45 | ACT_NODE_t *node; 46 | 47 | node = (ACT_NODE_t *) mpool_malloc (trie->mp, sizeof(ACT_NODE_t)); 48 | node_init (node); 49 | node->trie = trie; 50 | 51 | return node; 52 | } 53 | 54 | /** 55 | * @brief Initializes the node 56 | * 57 | * @param thiz 58 | *****************************************************************************/ 59 | static void node_init (ACT_NODE_t *thiz) 60 | { 61 | node_assign_id (thiz); 62 | 63 | thiz->final = 0; 64 | thiz->failure_node = NULL; 65 | thiz->depth = 0; 66 | 67 | thiz->matched = NULL; 68 | thiz->matched_capacity = 0; 69 | thiz->matched_size = 0; 70 | 71 | thiz->outgoing = NULL; 72 | thiz->outgoing_capacity = 0; 73 | thiz->outgoing_size = 0; 74 | 75 | thiz->to_be_replaced = NULL; 76 | } 77 | 78 | /** 79 | * @brief Releases the node memories 80 | * 81 | * @param thiz 82 | *****************************************************************************/ 83 | void node_release_vectors(ACT_NODE_t *nod) 84 | { 85 | free(nod->matched); 86 | free(nod->outgoing); 87 | } 88 | 89 | /** 90 | * @brief Finds out the next node for a given alpha. this function is used in 91 | * the pre-processing stage in which edge array is not sorted. 
so it uses 92 | * linear search. 93 | * 94 | * @param thiz 95 | * @param alpha 96 | * @return 97 | *****************************************************************************/ 98 | ACT_NODE_t * node_find_next(ACT_NODE_t *nod, AC_ALPHABET_t alpha) 99 | { 100 | size_t i; 101 | 102 | for (i=0; i < nod->outgoing_size; i++) 103 | { 104 | if(nod->outgoing[i].alpha == alpha) 105 | return (nod->outgoing[i].next); 106 | } 107 | return NULL; 108 | } 109 | 110 | /** 111 | * @brief Finds out the next node for a given alpha. this function is used 112 | * after the pre-processing stage in which we sort edges. so it uses Binary 113 | * Search. 114 | * 115 | * @param thiz 116 | * @param alpha 117 | * @return 118 | *****************************************************************************/ 119 | ACT_NODE_t *node_find_next_bs (ACT_NODE_t *nod, AC_ALPHABET_t alpha) 120 | { 121 | size_t mid; 122 | int min, max; 123 | AC_ALPHABET_t amid; 124 | 125 | min = 0; 126 | max = nod->outgoing_size - 1; 127 | 128 | while (min <= max) 129 | { 130 | mid = (min + max) >> 1; 131 | amid = nod->outgoing[mid].alpha; 132 | if (alpha > amid) 133 | min = mid + 1; 134 | else if (alpha < amid) 135 | max = mid - 1; 136 | else 137 | return (nod->outgoing[mid].next); 138 | } 139 | return NULL; 140 | } 141 | 142 | /** 143 | * @brief Determines if a final node contains a pattern in its accepted pattern 144 | * list or not. 145 | * 146 | * @param thiz 147 | * @param newstr 148 | * @return 1: has the pattern, 0: doesn't have it 149 | *****************************************************************************/ 150 | static int node_has_pattern (ACT_NODE_t *thiz, AC_PATTERN_t *patt) 151 | { 152 | size_t i, j; 153 | AC_TEXT_t *txt; 154 | AC_TEXT_t *new_txt = &patt->ptext; 155 | 156 | for (i = 0; i < thiz->matched_size; i++) 157 | { 158 | txt = &thiz->matched[i].ptext; 159 | 160 | if (txt->length != new_txt->length) 161 | continue; 162 | 163 | /* The following loop is futile! 
Because the input pattern always come 164 | * from a failure node, and if they have the same length, then they are 165 | * equal. But for the sake of functional integrity we leave it here. */ 166 | 167 | for (j = 0; j < txt->length; j++) 168 | if (txt->astring[j] != new_txt->astring[j]) 169 | break; 170 | 171 | if (j == txt->length) 172 | return 1; 173 | } 174 | return 0; 175 | } 176 | 177 | /** 178 | * @brief Create the next node for the given alpha. 179 | * 180 | * @param thiz 181 | * @param alpha 182 | * @return 183 | *****************************************************************************/ 184 | ACT_NODE_t *node_create_next (ACT_NODE_t *nod, AC_ALPHABET_t alpha) 185 | { 186 | ACT_NODE_t *next; 187 | 188 | if (node_find_next (nod, alpha) != NULL) 189 | /* The edge already exists */ 190 | return NULL; 191 | 192 | next = node_create (nod->trie); 193 | node_add_edge (nod, next, alpha); 194 | 195 | return next; 196 | } 197 | 198 | /** 199 | * @brief Adds the pattern to the list of accepted pattern. 
200 | * 201 | * @param thiz 202 | * @param str 203 | * @param copy 204 | *****************************************************************************/ 205 | void node_accept_pattern (ACT_NODE_t *nod, AC_PATTERN_t *new_patt, int copy) 206 | { 207 | AC_PATTERN_t *patt; 208 | 209 | /* Check if the new pattern already exists in the node list */ 210 | if (node_has_pattern(nod, new_patt)) 211 | return; 212 | 213 | /* Manage memory */ 214 | if (nod->matched_size == nod->matched_capacity) 215 | node_grow_matched_vector (nod); 216 | 217 | patt = &nod->matched[nod->matched_size++]; 218 | 219 | if (copy) 220 | { 221 | /* Deep copy */ 222 | node_copy_pattern (nod, patt, new_patt); 223 | } 224 | else 225 | { 226 | /* Shallow copy */ 227 | *patt = *new_patt; 228 | } 229 | } 230 | 231 | /** 232 | * @brief Makes a deep copy of the pattern 233 | * 234 | * @param thiz pointer to the owner node 235 | * @param from 236 | * @param to 237 | *****************************************************************************/ 238 | static void node_copy_pattern 239 | (ACT_NODE_t *thiz, AC_PATTERN_t *to, AC_PATTERN_t *from) 240 | { 241 | struct mpool *mp = thiz->trie->mp; 242 | 243 | to->ptext.astring = (AC_ALPHABET_t *) mpool_strndup (mp, 244 | (const char *) from->ptext.astring, 245 | from->ptext.length * sizeof(AC_ALPHABET_t)); 246 | to->ptext.length = from->ptext.length; 247 | 248 | to->rtext.astring = (AC_ALPHABET_t *) mpool_strndup (mp, 249 | (const char *) from->rtext.astring, 250 | from->rtext.length * sizeof(AC_ALPHABET_t)); 251 | to->rtext.length = from->rtext.length; 252 | 253 | if (from->id.type == AC_PATTID_TYPE_STRING) 254 | to->id.u.stringy = (const char *) mpool_strdup (mp, 255 | (const char *) from->id.u.stringy); 256 | else 257 | to->id.u.number = from->id.u.number; 258 | 259 | to->id.type = from->id.type; 260 | to->aux = from->aux; 261 | } 262 | 263 | /** 264 | * @brief Establish an edge between two nodes 265 | * 266 | * @param thiz 267 | * @param next 268 | * @param alpha 
269 | *****************************************************************************/ 270 | void node_add_edge (ACT_NODE_t *nod, ACT_NODE_t *next, AC_ALPHABET_t alpha) 271 | { 272 | struct act_edge *oe; /* Outgoing edge */ 273 | 274 | if(nod->outgoing_size == nod->outgoing_capacity) 275 | node_grow_outgoing_vector (nod); 276 | 277 | oe = &nod->outgoing[nod->outgoing_size]; 278 | oe->alpha = alpha; 279 | oe->next = next; 280 | nod->outgoing_size++; 281 | } 282 | 283 | /** 284 | * @brief Assigns a unique ID to the node (used for debugging purpose) 285 | * 286 | * @param thiz 287 | *****************************************************************************/ 288 | void node_assign_id (ACT_NODE_t *nod) 289 | { 290 | static int unique_id = 1; 291 | nod->id = unique_id++; 292 | } 293 | 294 | /** 295 | * @brief Comparison function for qsort. see man qsort. 296 | * 297 | * @param l left side 298 | * @param r right side 299 | * @return According to the man page: The comparison function must return an 300 | * integer less than, equal to, or greater than zero if the first argument is 301 | * considered to be respectively less than, equal to, or greater than the 302 | * second. if two members compare as equal, their order in the sorted array is 303 | * undefined. 304 | *****************************************************************************/ 305 | static int node_edge_compare (const void *l, const void *r) 306 | { 307 | /* 308 | * NOTE: Because edge alphabets are unique in every node we ignore 309 | * equivalence case. 310 | */ 311 | if (((struct act_edge *)l)->alpha >= ((struct act_edge *)r)->alpha) 312 | return 1; 313 | else 314 | return -1; 315 | } 316 | 317 | /** 318 | * @brief Sorts edges alphabets. 
319 | * 320 | * @param thiz 321 | *****************************************************************************/ 322 | void node_sort_edges (ACT_NODE_t *nod) 323 | { 324 | qsort ((void *)nod->outgoing, nod->outgoing_size, 325 | sizeof(struct act_edge), node_edge_compare); 326 | } 327 | 328 | /** 329 | * @brief Bookmarks the to-be-replaced patterns 330 | * 331 | * If there was more than one pattern accepted in a node then only one of them 332 | * must be replaced: The longest pattern that has a requested replacement. 333 | * 334 | * @param node 335 | * @return 1 if there was any replacement, 0 otherwise 336 | *****************************************************************************/ 337 | int node_book_replacement (ACT_NODE_t *nod) 338 | { 339 | size_t j; 340 | AC_PATTERN_t *pattern; 341 | AC_PATTERN_t *longest = NULL; 342 | 343 | if(!nod->final) 344 | return 0; 345 | 346 | for (j=0; j < nod->matched_size; j++) 347 | { 348 | pattern = &nod->matched[j]; 349 | 350 | if (pattern->rtext.astring != NULL) 351 | { 352 | if (!longest) 353 | longest = pattern; 354 | else if (pattern->ptext.length > longest->ptext.length) 355 | longest = pattern; 356 | } 357 | } 358 | 359 | nod->to_be_replaced = longest; 360 | 361 | return longest ? 1 : 0; 362 | } 363 | 364 | /** 365 | * @brief Grows the size of outgoing edges vector 366 | * 367 | * @param thiz 368 | *****************************************************************************/ 369 | static void node_grow_outgoing_vector (ACT_NODE_t *thiz) 370 | { 371 | const size_t grow_factor = (8 / (thiz->depth + 1)) + 1; 372 | 373 | /* The outgoing edges of nodes grow with different pace in different 374 | * depths; the shallower nodes the bigger outgoing number of nodes. 375 | * So for efficiency (speed & memory usage), we apply a measure to 376 | * manage different growth rate. 
377 | */ 378 | 379 | if (thiz->outgoing_capacity == 0) 380 | { 381 | thiz->outgoing_capacity = grow_factor; 382 | thiz->outgoing = (struct act_edge *) malloc 383 | (thiz->outgoing_capacity * sizeof(struct act_edge)); 384 | } 385 | else 386 | { 387 | thiz->outgoing_capacity += grow_factor; 388 | thiz->outgoing = (struct act_edge *) realloc ( 389 | thiz->outgoing, 390 | thiz->outgoing_capacity * sizeof(struct act_edge)); 391 | } 392 | } 393 | 394 | /** 395 | * @brief Grows the size of matched patterns vector 396 | * 397 | * @param thiz 398 | *****************************************************************************/ 399 | static void node_grow_matched_vector (ACT_NODE_t *thiz) 400 | { 401 | if (thiz->matched_capacity == 0) 402 | { 403 | thiz->matched_capacity = 1; 404 | thiz->matched = (AC_PATTERN_t *) malloc 405 | (thiz->matched_capacity * sizeof(AC_PATTERN_t)); 406 | } 407 | else 408 | { 409 | thiz->matched_capacity += 2; 410 | thiz->matched = (AC_PATTERN_t *) realloc ( 411 | thiz->matched, 412 | thiz->matched_capacity * sizeof(AC_PATTERN_t)); 413 | } 414 | } 415 | 416 | /** 417 | * @brief Collect accepted patterns of the node. 418 | * 419 | * The accepted patterns consist of the node's own accepted pattern plus 420 | * accepted patterns of its failure node. 421 | * 422 | * @param node 423 | *****************************************************************************/ 424 | void node_collect_matches (ACT_NODE_t *nod) 425 | { 426 | size_t i; 427 | ACT_NODE_t *n = nod; 428 | 429 | while ((n = n->failure_node)) 430 | { 431 | for (i = 0; i < n->matched_size; i++) 432 | /* Always call with copy parameter 0 */ 433 | node_accept_pattern (nod, &(n->matched[i]), 0); 434 | 435 | if (n->final) 436 | nod->final = 1; 437 | } 438 | 439 | node_sort_edges (nod); 440 | /* Sort matched patterns? Is that necessary? I don't think so. 
*/ 441 | } 442 | 443 | /** 444 | * @brief Displays all nodes recursively 445 | * 446 | * @param n 447 | * @param repcast 448 | *****************************************************************************/ 449 | void node_display (ACT_NODE_t *nod) 450 | { 451 | size_t j; 452 | struct act_edge *e; 453 | AC_PATTERN_t patt; 454 | 455 | printf("NODE(%3d)/....fail....> ", nod->id); 456 | if (nod->failure_node) 457 | printf("NODE(%3d)\n", nod->failure_node->id); 458 | else 459 | printf ("N.A.\n"); 460 | 461 | for (j = 0; j < nod->outgoing_size; j++) 462 | { 463 | e = &nod->outgoing[j]; 464 | printf(" |----("); 465 | if(isgraph(e->alpha)) 466 | printf("%c)---", e->alpha); 467 | else 468 | printf("0x%x)", e->alpha); 469 | printf("--> NODE(%3d)\n", e->next->id); 470 | } 471 | 472 | if (nod->matched_size) 473 | { 474 | printf("Accepts: {"); 475 | for (j = 0; j < nod->matched_size; j++) 476 | { 477 | patt = nod->matched[j]; 478 | if(j) 479 | printf(", "); 480 | switch (patt.id.type) 481 | { 482 | case AC_PATTID_TYPE_DEFAULT: 483 | case AC_PATTID_TYPE_NUMBER: 484 | printf("%ld", patt.id.u.number); 485 | break; 486 | case AC_PATTID_TYPE_STRING: 487 | printf("%s", patt.id.u.stringy); 488 | break; 489 | } 490 | printf(": %.*s", (int)patt.ptext.length, patt.ptext.astring); 491 | } 492 | printf("}\n"); 493 | } 494 | printf("\n"); 495 | } 496 | -------------------------------------------------------------------------------- /src/multifast/node.h: -------------------------------------------------------------------------------- 1 | /* 2 | * node.h: Defines the trie node and interface functions 3 | * This file is part of multifast. 4 | * 5 | Copyright 2010-2015 Kamiar Kanani 6 | 7 | multifast is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU Lesser General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 
multifast is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with multifast.  If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef _NODE_H_
#define _NODE_H_

#include "actypes.h"

#ifdef __cplusplus
extern "C" {
#endif

/* Forward Declaration */
struct act_edge;
struct ac_trie;

/**
 * Aho-Corasick Trie node
 */
typedef struct act_node
{
    int id;     /**< Node identifier: used for debugging purpose */

    int final;      /**< A final node accepts pattern; 0: not, 1: is final */
    size_t depth;   /**< Distance between this node and the root */
    struct act_node *failure_node;  /**< The failure transition node */

    struct act_edge *outgoing;  /**< Outgoing edges array */
    size_t outgoing_capacity;   /**< Max capacity of outgoing edges */
    size_t outgoing_size;       /**< Number of outgoing edges */

    AC_PATTERN_t *matched;      /**< Matched patterns array */
    size_t matched_capacity;    /**< Max capacity of the matched patterns */
    size_t matched_size;        /**< Number of matched patterns in this node */

    AC_PATTERN_t *to_be_replaced;   /**< Pointer to the pattern that must be
                                     * replaced */

    struct ac_trie *trie;    /**< The trie that this node belongs to */

} ACT_NODE_t;

/**
 * Edge of the node
 */
struct act_edge
{
    AC_ALPHABET_t alpha;    /**< Transition alpha */
    ACT_NODE_t *next;       /**< Target of the edge */
};

/*
 * Node interface functions
 */

/* Allocates a node from the trie's memory pool and initializes it. */
ACT_NODE_t *node_create (struct ac_trie *trie);
/* Creates a child reached through `alpha`; returns NULL if that edge already
 * exists. */
ACT_NODE_t *node_create_next (ACT_NODE_t *nod, AC_ALPHABET_t alpha);
/* Linear search for the edge labelled `alpha` (edges need not be sorted);
 * NULL if there is none. */
ACT_NODE_t *node_find_next (ACT_NODE_t *nod, AC_ALPHABET_t alpha);
/* Binary search for the edge labelled `alpha`; the edges must have been
 * sorted first (see node_sort_edges). NULL if there is none. */
ACT_NODE_t *node_find_next_bs (ACT_NODE_t *nod, AC_ALPHABET_t alpha);

/* Gives the node the next value of an internal counter as its id. */
void node_assign_id (ACT_NODE_t *nod);
/* Appends an edge nod --alpha--> next, growing the edge array when full. It
 * does not check whether an edge for `alpha` already exists. */
void node_add_edge (ACT_NODE_t *nod, ACT_NODE_t *next, AC_ALPHABET_t alpha);
/* Sorts the outgoing edges by their alpha. */
void node_sort_edges (ACT_NODE_t *nod);
/* Adds `new_patt` to the matched patterns unless a pattern with the same text
 * is already there. `copy` non-zero stores a deep copy, zero a shallow one. */
void node_accept_pattern (ACT_NODE_t *nod, AC_PATTERN_t *new_patt, int copy);
/* Adds (as shallow copies) the matched patterns of every node on the failure
 * chain, marks the node final if one of them is final, then sorts the
 * edges. */
void node_collect_matches (ACT_NODE_t *nod);
/* Frees the `matched` and `outgoing` arrays of the node. */
void node_release_vectors (ACT_NODE_t *nod);
/* Stores in `to_be_replaced` the longest matched pattern that has a
 * replacement text; returns 1 if one was found, 0 otherwise (always 0 for a
 * non-final node). */
int  node_book_replacement (ACT_NODE_t *nod);
/* Prints the node to the standard output. */
void node_display (ACT_NODE_t *nod);

#ifdef __cplusplus
}
#endif

#endif

--------------------------------------------------------------------------------
/src/multifast/replace.c:
--------------------------------------------------------------------------------
/*
 * replace.c: Implements the replacement functionality
 *
 * This file is part of multifast.
 *
    Copyright 2010-2015 Kamiar Kanani

    multifast is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    multifast is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with multifast.  If not, see <http://www.gnu.org/licenses/>.
20 | */ 21 | 22 | #include 23 | 24 | #include "node.h" 25 | #include "ahocorasick.h" 26 | 27 | 28 | /* Privates */ 29 | 30 | static void mf_repdata_do_replace 31 | (MF_REPLACEMENT_DATA_t *rd, size_t to_position); 32 | 33 | static void mf_repdata_booknominee 34 | (MF_REPLACEMENT_DATA_t *rd, struct mf_replacement_nominee *new_nom); 35 | 36 | static void mf_repdata_push_nominee 37 | (MF_REPLACEMENT_DATA_t *rd, struct mf_replacement_nominee *new_nom); 38 | 39 | static void mf_repdata_grow_noms_array 40 | (MF_REPLACEMENT_DATA_t *rd); 41 | 42 | static void mf_repdata_appendtext 43 | (MF_REPLACEMENT_DATA_t *rd, AC_TEXT_t *text); 44 | 45 | static void mf_repdata_appendfactor 46 | (MF_REPLACEMENT_DATA_t *rd, size_t from, size_t to); 47 | 48 | static void mf_repdata_savetobacklog 49 | (MF_REPLACEMENT_DATA_t *rd, size_t to_position_r); 50 | 51 | static void mf_repdata_flush 52 | (MF_REPLACEMENT_DATA_t *rd); 53 | 54 | static unsigned int mf_repdata_bookreplacements 55 | (ACT_NODE_t *node); 56 | 57 | /* Publics */ 58 | 59 | void mf_repdata_init (AC_TRIE_t *trie); 60 | void mf_repdata_reset (MF_REPLACEMENT_DATA_t *rd); 61 | void mf_repdata_release (MF_REPLACEMENT_DATA_t *rd); 62 | void mf_repdata_allocbuf (MF_REPLACEMENT_DATA_t *rd); 63 | 64 | 65 | /** 66 | * @brief Initializes the replacement data part of the trie 67 | * 68 | * @param trie 69 | *****************************************************************************/ 70 | void mf_repdata_init (AC_TRIE_t *trie) 71 | { 72 | MF_REPLACEMENT_DATA_t *rd = &trie->repdata; 73 | 74 | rd->buffer.astring = NULL; 75 | rd->buffer.length = 0; 76 | rd->backlog.astring = NULL; 77 | rd->backlog.length = 0; 78 | rd->has_replacement = 0; 79 | rd->curser = 0; 80 | 81 | rd->noms = NULL; 82 | rd->noms_capacity = 0; 83 | rd->noms_size = 0; 84 | 85 | rd->replace_mode = MF_REPLACE_MODE_DEFAULT; 86 | rd->trie = trie; 87 | } 88 | 89 | /** 90 | * @brief Performs finalization tasks on replacement data. 
91 | * Must be called when finalizing the trie itself 92 | * 93 | * @param rd 94 | *****************************************************************************/ 95 | void mf_repdata_allocbuf (MF_REPLACEMENT_DATA_t *rd) 96 | { 97 | /* Bookmark replacement pattern for faster retrieval */ 98 | rd->has_replacement = mf_repdata_bookreplacements (rd->trie->root); 99 | 100 | if (rd->has_replacement) 101 | { 102 | rd->buffer.astring = (AC_ALPHABET_t *) 103 | malloc (MF_REPLACEMENT_BUFFER_SIZE * sizeof(AC_ALPHABET_t)); 104 | 105 | rd->backlog.astring = (AC_ALPHABET_t *) 106 | malloc (AC_PATTRN_MAX_LENGTH * sizeof(AC_ALPHABET_t)); 107 | 108 | /* Backlog length is not bigger than the max pattern length */ 109 | } 110 | } 111 | 112 | /** 113 | * @brief Bookmarks the to-be-replaced patterns for all nodes 114 | * 115 | * @param node 116 | * @return 117 | *****************************************************************************/ 118 | static unsigned int mf_repdata_bookreplacements (ACT_NODE_t *node) 119 | { 120 | size_t i; 121 | unsigned int ret; 122 | 123 | ret = node_book_replacement (node); 124 | 125 | for (i = 0; i < node->outgoing_size; i++) 126 | { 127 | /* Recursively call itself to traverse all nodes */ 128 | ret += mf_repdata_bookreplacements (node->outgoing[i].next); 129 | } 130 | 131 | return ret; 132 | } 133 | 134 | /** 135 | * @brief Resets the replacement data and prepares it for a new operation 136 | * 137 | * @param rd 138 | *****************************************************************************/ 139 | void mf_repdata_reset (MF_REPLACEMENT_DATA_t *rd) 140 | { 141 | rd->buffer.length = 0; 142 | rd->backlog.length = 0; 143 | rd->curser = 0; 144 | rd->noms_size = 0; 145 | } 146 | 147 | /** 148 | * @brief Release the allocated resources to the replacement data 149 | * 150 | * @param rd 151 | *****************************************************************************/ 152 | void mf_repdata_release (MF_REPLACEMENT_DATA_t *rd) 153 | { 154 | 
free((AC_ALPHABET_t *)rd->buffer.astring); 155 | free((AC_ALPHABET_t *)rd->backlog.astring); 156 | free(rd->noms); 157 | } 158 | 159 | /** 160 | * @brief Flushes out all the available stuff in the buffer to the user 161 | * 162 | * @param rd 163 | *****************************************************************************/ 164 | static void mf_repdata_flush (MF_REPLACEMENT_DATA_t *rd) 165 | { 166 | rd->cbf(&rd->buffer, rd->user); 167 | rd->buffer.length = 0; 168 | } 169 | 170 | /** 171 | * @brief Extends the nominees array 172 | * 173 | * @param rd 174 | *****************************************************************************/ 175 | static void mf_repdata_grow_noms_array (MF_REPLACEMENT_DATA_t *rd) 176 | { 177 | const size_t grow_factor = 128; 178 | 179 | if (rd->noms_capacity == 0) 180 | { 181 | rd->noms_capacity = grow_factor; 182 | rd->noms = (struct mf_replacement_nominee *) malloc 183 | (rd->noms_capacity * sizeof(struct mf_replacement_nominee)); 184 | rd->noms_size = 0; 185 | } 186 | else 187 | { 188 | rd->noms_capacity += grow_factor; 189 | rd->noms = (struct mf_replacement_nominee *) realloc (rd->noms, 190 | rd->noms_capacity * sizeof(struct mf_replacement_nominee)); 191 | } 192 | } 193 | 194 | /** 195 | * @brief Adds the nominee to the end of the nominee list 196 | * 197 | * @param rd 198 | * @param new_nom 199 | *****************************************************************************/ 200 | static void mf_repdata_push_nominee 201 | (MF_REPLACEMENT_DATA_t *rd, struct mf_replacement_nominee *new_nom) 202 | { 203 | struct mf_replacement_nominee *nomp; 204 | 205 | /* Extend the vector if needed */ 206 | if (rd->noms_size == rd->noms_capacity) 207 | mf_repdata_grow_noms_array (rd); 208 | 209 | /* Add the new nominee to the end */ 210 | nomp = &rd->noms[rd->noms_size]; 211 | nomp->pattern = new_nom->pattern; 212 | nomp->position = new_nom->position; 213 | rd->noms_size ++; 214 | } 215 | 216 | /** 217 | * @brief Tries to add the nominee to the end of 
the nominee list 218 | * 219 | * @param rd 220 | * @param new_nom 221 | *****************************************************************************/ 222 | static void mf_repdata_booknominee (MF_REPLACEMENT_DATA_t *rd, 223 | struct mf_replacement_nominee *new_nom) 224 | { 225 | struct mf_replacement_nominee *prev_nom; 226 | size_t prev_start_pos, prev_end_pos, new_start_pos; 227 | 228 | if (new_nom->pattern == NULL) 229 | return; /* This is not a to-be-replaced pattern; ignore it. */ 230 | 231 | new_start_pos = new_nom->position - new_nom->pattern->ptext.length; 232 | 233 | switch (rd->replace_mode) 234 | { 235 | case MF_REPLACE_MODE_LAZY: 236 | 237 | if (new_start_pos < rd->curser) 238 | return; /* Ignore the new nominee, because it overlaps with the 239 | * previous replacement */ 240 | 241 | if (rd->noms_size > 0) 242 | { 243 | prev_nom = &rd->noms[rd->noms_size - 1]; 244 | prev_end_pos = prev_nom->position; 245 | 246 | if (new_start_pos < prev_end_pos) 247 | return; 248 | } 249 | break; 250 | 251 | case MF_REPLACE_MODE_DEFAULT: 252 | case MF_REPLACE_MODE_NORMAL: 253 | default: 254 | 255 | while (rd->noms_size > 0) 256 | { 257 | prev_nom = &rd->noms[rd->noms_size - 1]; 258 | prev_start_pos = 259 | prev_nom->position - prev_nom->pattern->ptext.length; 260 | prev_end_pos = prev_nom->position; 261 | 262 | if (new_start_pos <= prev_start_pos) 263 | rd->noms_size--; /* Remove that nominee, because it is a 264 | * factor of the new nominee */ 265 | else 266 | break; /* Get out the loop and add the new nominee */ 267 | } 268 | break; 269 | } 270 | 271 | mf_repdata_push_nominee(rd, new_nom); 272 | } 273 | 274 | /** 275 | * @brief Append the given text to the output buffer 276 | * 277 | * @param rd 278 | * @param text 279 | *****************************************************************************/ 280 | static void mf_repdata_appendtext (MF_REPLACEMENT_DATA_t *rd, AC_TEXT_t *text) 281 | { 282 | size_t remaining_bufspace = 0; 283 | size_t remaining_text = 0; 284 | 
size_t copy_len = 0; 285 | size_t copy_index = 0; 286 | 287 | while (copy_index < text->length) 288 | { 289 | remaining_bufspace = MF_REPLACEMENT_BUFFER_SIZE - rd->buffer.length; 290 | remaining_text = text->length - copy_index; 291 | 292 | copy_len = (remaining_bufspace >= remaining_text)? 293 | remaining_text : remaining_bufspace; 294 | 295 | memcpy((void *)&rd->buffer.astring[rd->buffer.length], 296 | (void *)&text->astring[copy_index], 297 | copy_len * sizeof(AC_ALPHABET_t)); 298 | 299 | rd->buffer.length += copy_len; 300 | copy_index += copy_len; 301 | 302 | if (rd->buffer.length == MF_REPLACEMENT_BUFFER_SIZE) 303 | mf_repdata_flush(rd); 304 | } 305 | } 306 | 307 | /** 308 | * @brief Append a factor of the current text to the output buffer 309 | * 310 | * @param rd 311 | * @param from 312 | * @param to 313 | *****************************************************************************/ 314 | static void mf_repdata_appendfactor 315 | (MF_REPLACEMENT_DATA_t *rd, size_t from, size_t to) 316 | { 317 | AC_TEXT_t *instr = rd->trie->text; 318 | AC_TEXT_t factor; 319 | size_t backlog_base_pos; 320 | size_t base_position = rd->trie->base_position; 321 | 322 | if (to < from) 323 | return; 324 | 325 | if (base_position <= from) 326 | { 327 | /* The backlog located in the input text part */ 328 | factor.astring = &instr->astring[from - base_position]; 329 | factor.length = to - from; 330 | mf_repdata_appendtext(rd, &factor); 331 | } 332 | else 333 | { 334 | backlog_base_pos = base_position - rd->backlog.length; 335 | if (from < backlog_base_pos) 336 | return; /* shouldn't come here */ 337 | 338 | if (to < base_position) 339 | { 340 | /* The backlog located in the backlog part */ 341 | factor.astring = &rd->backlog.astring[from - backlog_base_pos]; 342 | factor.length = to - from; 343 | mf_repdata_appendtext (rd, &factor); 344 | } 345 | else 346 | { 347 | /* The factor is divided between backlog and input text */ 348 | 349 | /* The backlog part */ 350 | factor.astring = 
&rd->backlog.astring[from - backlog_base_pos]; 351 | factor.length = rd->backlog.length - from + backlog_base_pos; 352 | mf_repdata_appendtext (rd, &factor); 353 | 354 | /* The input text part */ 355 | factor.astring = instr->astring; 356 | factor.length = to - base_position; 357 | mf_repdata_appendtext (rd, &factor); 358 | } 359 | } 360 | } 361 | 362 | /** 363 | * @brief Saves the backlog part of the current text to the backlog buffer. The 364 | * backlog part is the part after @p bg_pos 365 | * 366 | * @param rd 367 | * @param bg_pos backlog position 368 | *****************************************************************************/ 369 | static void mf_repdata_savetobacklog (MF_REPLACEMENT_DATA_t *rd, size_t bg_pos) 370 | { 371 | size_t bg_pos_r; /* relative backlog position */ 372 | AC_TEXT_t *instr = rd->trie->text; 373 | size_t base_position = rd->trie->base_position; 374 | 375 | if (base_position < bg_pos) 376 | bg_pos_r = bg_pos - base_position; 377 | else 378 | bg_pos_r = 0; /* the whole input text must go to backlog */ 379 | 380 | if (instr->length == bg_pos_r) 381 | return; /* Nothing left for the backlog */ 382 | 383 | if (instr->length < bg_pos_r) 384 | return; /* unexpected : assert (instr->length >= bg_pos_r) */ 385 | 386 | /* Copy the part after bg_pos_r to the backlog buffer */ 387 | memcpy( (AC_ALPHABET_t *) 388 | &rd->backlog.astring[rd->backlog.length], 389 | &instr->astring[bg_pos_r], 390 | instr->length - bg_pos_r ); 391 | 392 | rd->backlog.length += instr->length - bg_pos_r; 393 | } 394 | 395 | /** 396 | * @brief Perform replacement operations on the non-backlog part of the current 397 | * text. In-range nominees will be replaced the original pattern and the result 398 | * will be pushed to the output buffer. 
399 | * 400 | * @param rd 401 | * @param to_position 402 | *****************************************************************************/ 403 | static void mf_repdata_do_replace 404 | (MF_REPLACEMENT_DATA_t *rd, size_t to_position) 405 | { 406 | unsigned int index; 407 | struct mf_replacement_nominee *nom; 408 | size_t base_position = rd->trie->base_position; 409 | 410 | if (to_position < base_position) 411 | return; 412 | 413 | /* Replace the candidate patterns */ 414 | if (rd->noms_size > 0) 415 | { 416 | for (index = 0; index < rd->noms_size; index++) 417 | { 418 | nom = &rd->noms[index]; 419 | 420 | if (to_position <= (nom->position - nom->pattern->ptext.length)) 421 | break; 422 | 423 | /* Append the space before pattern */ 424 | mf_repdata_appendfactor (rd, rd->curser, /* from */ 425 | nom->position - nom->pattern->ptext.length /* to */); 426 | 427 | /* Append the replacement instead of the pattern */ 428 | mf_repdata_appendtext(rd, &nom->pattern->rtext); 429 | 430 | rd->curser = nom->position; 431 | } 432 | rd->noms_size -= index; 433 | 434 | /* Shift the array to the left to eliminate the consumed nominees */ 435 | if (rd->noms_size && index) 436 | { 437 | memcpy (&rd->noms[0], &rd->noms[index], 438 | rd->noms_size * sizeof(struct mf_replacement_nominee)); 439 | /* TODO: implement a circular queue */ 440 | } 441 | } 442 | 443 | /* Append the chunk between the last pattern and to_position */ 444 | if (to_position > rd->curser) 445 | { 446 | mf_repdata_appendfactor (rd, rd->curser, to_position); 447 | 448 | rd->curser = to_position; 449 | } 450 | 451 | if (base_position <= rd->curser) 452 | { 453 | /* we consume the whole backlog or none of it */ 454 | rd->backlog.length = 0; 455 | } 456 | } 457 | 458 | /** 459 | * @brief Replaces the patterns in the given text with their correspondence 460 | * replacement in the A.C. 
Trie 461 | * 462 | * @param thiz 463 | * @param instr 464 | * @param mode 465 | * @param callback 466 | * @param param 467 | * @return 468 | *****************************************************************************/ 469 | int multifast_replace (AC_TRIE_t *thiz, AC_TEXT_t *instr, 470 | MF_REPLACE_MODE_t mode, MF_REPLACE_CALBACK_f callback, void *param) 471 | { 472 | ACT_NODE_t *current; 473 | ACT_NODE_t *next; 474 | struct mf_replacement_nominee nom; 475 | MF_REPLACEMENT_DATA_t *rd = &thiz->repdata; 476 | 477 | size_t position_r = 0; /* Relative current position in the input string */ 478 | size_t backlog_pos = 0; /* Relative backlog position in the input string */ 479 | 480 | if (thiz->trie_open) 481 | return -1; /* _finalize() must be called first */ 482 | 483 | if (!rd->has_replacement) 484 | return -2; /* Trie doesn't have any to-be-replaced pattern */ 485 | 486 | rd->cbf = callback; 487 | rd->user = param; 488 | rd->replace_mode = mode; 489 | 490 | thiz->text = instr; /* Save the input string in a helper variable 491 | * for convenience */ 492 | 493 | current = thiz->last_node; 494 | 495 | /* Main replace loop: 496 | * Find patterns and bookmark them 497 | */ 498 | while (position_r < instr->length) 499 | { 500 | if (!(next = node_find_next_bs(current, instr->astring[position_r]))) 501 | { 502 | /* Failed to follow a pattern */ 503 | if(current->failure_node) 504 | current = current->failure_node; 505 | else 506 | position_r++; 507 | } 508 | else 509 | { 510 | current = next; 511 | position_r++; 512 | } 513 | 514 | if (current->final && next) 515 | { 516 | /* Bookmark nominee patterns for replacement */ 517 | nom.pattern = current->to_be_replaced; 518 | nom.position = thiz->base_position + position_r; 519 | 520 | mf_repdata_booknominee (rd, &nom); 521 | } 522 | } 523 | 524 | /* 525 | * At the end of input chunk, if the tail of the chunk is a prefix of a 526 | * pattern, then we must keep it in the backlog buffer and wait for the 527 | * next chunk to decide 
about it. */ 528 | 529 | backlog_pos = thiz->base_position + instr->length - current->depth; 530 | 531 | /* Now replace the patterns up to the backlog_pos point */ 532 | mf_repdata_do_replace (rd, backlog_pos); 533 | 534 | /* Save the remaining to the backlog buffer */ 535 | mf_repdata_savetobacklog (rd, backlog_pos); 536 | 537 | /* Save status variables */ 538 | thiz->last_node = current; 539 | thiz->base_position += position_r; 540 | 541 | return 0; 542 | } 543 | 544 | /** 545 | * @brief Flushes the remaining data back to the user and ends the replacement 546 | * operation. 547 | * 548 | * @param thiz 549 | * @param keep Indicates the continuity of the chunks. 0 means that the last 550 | * chunk has been fed in, and we want to end the replacement and receive the 551 | * final result. 552 | *****************************************************************************/ 553 | void multifast_rep_flush (AC_TRIE_t *thiz, int keep) 554 | { 555 | if (!keep) 556 | { 557 | mf_repdata_do_replace (&thiz->repdata, thiz->base_position); 558 | } 559 | 560 | mf_repdata_flush (&thiz->repdata); 561 | 562 | if (!keep) 563 | { 564 | mf_repdata_reset (&thiz->repdata); 565 | thiz->last_node = thiz->root; 566 | thiz->base_position = 0; 567 | } 568 | } 569 | -------------------------------------------------------------------------------- /src/multifast/replace.h: -------------------------------------------------------------------------------- 1 | /* 2 | * replace.h: Defines replacement related data structures 3 | * 4 | * This file is part of multifast. 5 | * 6 | Copyright 2010-2015 Kamiar Kanani 7 | 8 | multifast is free software: you can redistribute it and/or modify 9 | it under the terms of the GNU Lesser General Public License as published by 10 | the Free Software Foundation, either version 3 of the License, or 11 | (at your option) any later version. 
12 | 13 | multifast is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU Lesser General Public License for more details. 17 | 18 | You should have received a copy of the GNU Lesser General Public License 19 | along with multifast. If not, see . 20 | */ 21 | 22 | #ifndef _MF_REPLACE_H_ 23 | #define _MF_REPLACE_H_ 24 | 25 | #include "actypes.h" 26 | 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | /** 32 | * Different replace modes 33 | */ 34 | typedef enum mf_replace_mode 35 | { 36 | MF_REPLACE_MODE_DEFAULT = 0, 37 | MF_REPLACE_MODE_NORMAL, /**< Normal replace mode: Short factors are swallowed 38 | * by the big one; All other patterns are replaced 39 | * even if they have overlap. 40 | */ 41 | MF_REPLACE_MODE_LAZY /**< Lazy replace mode: every pattern which comes 42 | * first is replaced; the overlapping patterns are 43 | * nullified by the previous patterns; consequently, 44 | * factor patterns nullify the big patterns. 45 | */ 46 | } MF_REPLACE_MODE_t; 47 | 48 | 49 | /** 50 | * Before we replace any pattern we encounter, we should be patient 51 | * because it may be a factor of another longer pattern. So we maintain a record 52 | * of each recognized pattern until we make sure that it is not a sub-pattern 53 | * and can be replaced by its substitute. To keep a record of packets we use 54 | * the following structure.
55 | */ 56 | struct mf_replacement_nominee 57 | { 58 | AC_PATTERN_t *pattern; /**< the recognized pattern; its rtext is what gets written instead of its ptext */ 59 | size_t position; /**< absolute position in the text just past the end of the match */ 60 | }; 61 | 62 | 63 | /** 64 | * Contains replacement related data 65 | */ 66 | typedef struct mf_replacement_date 67 | { 68 | AC_TEXT_t buffer; /**< replacement buffer: maintains the result 69 | * of replacement */ 70 | 71 | AC_TEXT_t backlog; /**< replacement backlog: if a pattern is divided 72 | * between two or more different chunks, then at the 73 | * end of the first chunk we need to keep it here until 74 | * the next chunk comes and we decide if it is a 75 | * pattern or just a pattern prefix. */ 76 | 77 | unsigned int has_replacement; /**< total number of to-be-replaced patterns 78 | */ 79 | 80 | struct mf_replacement_nominee *noms; /**< Replacement nominee array */ 81 | size_t noms_capacity; /**< Max capacity of the array */ 82 | size_t noms_size; /**< Number of nominees in the array */ 83 | 84 | size_t curser; /**< the position in the input text before which all 85 | * patterns are replaced and the result is saved to the 86 | * buffer.
*/ 87 | 88 | MF_REPLACE_MODE_t replace_mode; /**< Replace mode */ 89 | 90 | MF_REPLACE_CALBACK_f cbf; /**< Callback function */ 91 | void *user; /**< User parameters sent to the callback function */ 92 | 93 | struct ac_trie *trie; /**< Pointer to the trie */ 94 | 95 | } MF_REPLACEMENT_DATA_t; 96 | 97 | 98 | #ifdef __cplusplus 99 | } 100 | #endif 101 | 102 | #endif /* _MF_REPLACE_H_ */ 103 | -------------------------------------------------------------------------------- /src/php_ahocorasick.c: -------------------------------------------------------------------------------- 1 | /* 2 | * php_ahocorasick.c: PHP Aho Corasick extension file 3 | * 4 | Copyright 2010-2013 Ph4r05 5 | 6 | This software is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU Lesser General Public License as published by 8 | the Free Software Foundation, either version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | This software is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU Lesser General Public License for more details. 15 | 16 | You should have received a copy of the GNU Lesser General Public License 17 | along with this software. If not, see .
18 | 19 | This code uses thirdparty code: 20 | MultiFast (http://sourceforge.net/projects/multifast/?source=dlp) 21 | */ 22 | 23 | /** 24 | * Sources: 25 | * http://www.phpinternalsbook.com/zvals/memory_management.html 26 | * http://docstore.mik.ua/orelly/webprog/php/ch14_06.htm 27 | * 28 | * https://wiki.php.net/phpng-upgrading 29 | * https://phpinternals.net/docs/zval_copy 30 | * https://nikic.github.io/2015/05/05/Internal-value-representation-in-PHP-7-part-1.html 31 | * https://nikic.github.io/2015/06/19/Internal-value-representation-in-PHP-7-part-2.html 32 | * https://github.com/php/php-src/blob/c8efaea1e3f93b5b836a38c6985b67983b1dc95a/Zend/zend_types.h#L364 33 | * https://github.com/php/php-src/blob/master/Zend/zend_types.h 34 | * https://github.com/copentop/php-7.2.5/blob/924c4cd50b01efebbb93438392c9e1916a568f02/php-7.2.5/ext/sodium/libsodium.c 35 | */ 36 | 37 | #ifdef HAVE_CONFIG_H 38 | #include "config.h" 39 | #endif 40 | #include "php.h" 41 | #include "php_ini.h" 42 | #include "php_ahocorasick.h" 43 | #include "ext/standard/php_string.h" 44 | #include "php_variables.h" 45 | #include "php_globals.h" 46 | #include "TSRM.h" 47 | #include "zend_exceptions.h" 48 | #include 49 | #include 50 | 51 | /* For PHP 8 compatibility */ 52 | #ifndef TSRMLS_CC 53 | #define TSRMLS_CC 54 | #endif 55 | #ifndef TSRMLS_DC 56 | #define TSRMLS_DC 57 | #endif 58 | 59 | // resource type id of the aho master resource (assigned in MINIT) 60 | int le_ahocorasick_master; 61 | #if PHP7 62 | static zend_class_entry *aho_exception_ce; 63 | #endif 64 | static char exception_buffer[8192]; 65 | 66 | 67 | ZEND_DECLARE_MODULE_GLOBALS(ahocorasick) 68 | 69 | #if PHP_VERSION_ID < 80000 70 | #include "php_ahocorasick_legacy_arginfo.h" 71 | #else 72 | #include "php_ahocorasick_arginfo.h" 73 | #endif 74 | 75 | zend_module_entry ahocorasick_module_entry = { 76 | STANDARD_MODULE_HEADER, 77 | PHP_AHOCORASICK_EXTNAME, 78 | ext_functions, 79 | PHP_MINIT(ahocorasick), 80 | PHP_MSHUTDOWN(ahocorasick), 81 | PHP_RINIT(ahocorasick), 82 |
NULL, 83 | NULL, 84 | PHP_AHOCORASICK_VERSION, 85 | STANDARD_MODULE_PROPERTIES 86 | }; 87 | 88 | #ifdef COMPILE_DL_AHOCORASICK 89 | ZEND_GET_MODULE(ahocorasick) 90 | #endif 91 | 92 | //PHP_INI_BEGIN() 93 | //PHP_INI_ENTRY("ahocorasick.greeting", "Hello World", PHP_INI_ALL, NULL) 94 | //STD_PHP_INI_ENTRY("helloahocorasick.direction", "1", PHP_INI_ALL, OnUpdateBool, direction, zend_ahocorasick_globals, ahocorasick_globals) 95 | //PHP_INI_END() 96 | 97 | /** Maps a zval type code to a printable name for error messages. Codes without a named case are formatted as a decimal number into a static buffer, so that result is only valid until the next call. */ 98 | static char * php_aho_type_str(int tp){ 99 | static char typebuff[128]; 100 | switch(tp){ 101 | #if PHP7 102 | case IS_UNDEF: return "undef"; 103 | case IS_NULL: return "null"; 104 | case IS_FALSE: return "false"; 105 | case IS_TRUE: return "true"; 106 | case IS_LONG: return "long"; 107 | case IS_DOUBLE: return "double"; 108 | case IS_STRING: return "string"; 109 | case IS_ARRAY: return "array"; 110 | case IS_OBJECT: return "object"; 111 | case IS_RESOURCE: return "resource"; 112 | case IS_REFERENCE: return "reference"; 113 | #endif 114 | default: sprintf(typebuff, "%d", tp); return typebuff; 115 | } 116 | } 117 | 118 | /** 119 | * register some global variables here 120 | * @param ahocorasick_globals 121 | static void php_ahocorasick_init_globals(zend_ahocorasick_globals *ahocorasick_globals) 122 | { 123 | return; 124 | } 125 | */ 126 | 127 | /** 128 | * Finalizes searching trie if it was not finalized. Returns 1 when this call finalized the trie, 0 when ahoMaster is NULL, not initialized (init_ok != 1) or already finalized. 129 | */ 130 | static inline int php_ahocorasick_finalize(ahocorasick_master_t * ahoMaster){ 131 | if (ahoMaster == NULL 132 | || ahoMaster->init_ok != 1 133 | || ahoMaster->ac_finalized == 1) 134 | { 135 | return 0; 136 | } 137 | 138 | ahoMaster->ac_finalized = 1; 139 | //*** 5. Finalize automata (no more patterns will be added). 140 | ac_trie_finalize (ahoMaster->acap); 141 | return 1; 142 | } 143 | 144 | /** 145 | * Resets all pattern fields. Does not perform any deallocation.
146 | */ 147 | static inline int php_ahocorasick_reset_pattern(ahocorasick_pattern_t * tmpStruct){ 148 | if (tmpStruct == NULL){ 149 | return -1; 150 | } 151 | 152 | tmpStruct->ignoreCase=0; 153 | tmpStruct->key=NULL; 154 | COMPAT_ZVAL_UNDEF(tmpStruct->zKey); 155 | tmpStruct->keyId=0; 156 | tmpStruct->keyType=AC_PATTID_TYPE_DEFAULT; 157 | tmpStruct->value=NULL; 158 | COMPAT_ZVAL_UNDEF(tmpStruct->zVal); 159 | COMPAT_ZVAL_UNDEF(tmpStruct->auxObj); 160 | return 0; 161 | } 162 | 163 | /** 164 | * Deallocates all memory related to the given pattern. 165 | */ 166 | static inline int php_ahocorasick_dealloc_pattern(ahocorasick_pattern_t * tmpStruct){ 167 | if (tmpStruct == NULL){ 168 | return -1; 169 | } 170 | 171 | if (!COMPAT_Z_ISUNDEF(tmpStruct->auxObj)){ 172 | zval_ptr_dtor(&(tmpStruct->auxObj)); 173 | COMPAT_ZVAL_UNDEF(tmpStruct->auxObj); 174 | } 175 | 176 | if (tmpStruct->key != NULL && !COMPAT_Z_ISUNDEF(tmpStruct->zKey)) { 177 | zval_ptr_dtor(&(tmpStruct->zKey)); 178 | tmpStruct->key = NULL; 179 | COMPAT_ZVAL_UNDEF(tmpStruct->zKey); 180 | } 181 | 182 | if (tmpStruct->value != NULL && !COMPAT_Z_ISUNDEF(tmpStruct->zVal)) { 183 | zval_ptr_dtor(&(tmpStruct->zVal)); 184 | tmpStruct->value = NULL; 185 | COMPAT_ZVAL_UNDEF(tmpStruct->zVal); 186 | } 187 | 188 | php_ahocorasick_reset_pattern(tmpStruct); 189 | return 0; 190 | } 191 | 192 | /** 193 | * Reads single pattern definition, construct ahocorasick_pattern_t representation of pattern. 
194 | */ 195 | static inline int php_ahocorasick_process_pattern(zend_long pidx, ahocorasick_pattern_t * tmpStruct, HashTable * arr_hash_sub TSRMLS_DC) { 196 | php_ahocorasick_reset_pattern(tmpStruct); 197 | 198 | // iterate over sub array 199 | int returnCode = 0; 200 | unsigned long allKeys = 0; 201 | COMPAT_ZVAL *data_sub; 202 | zend_long num_key; 203 | zend_string *key; 204 | zend_bool has_exception = 0; 205 | 206 | #define PATTERN_EXCEPTION() do{ \ 207 | has_exception=1; \ 208 | returnCode=-5; \ 209 | } while(0); break 210 | 211 | COMPAT_ZEND_HASH_FOREACH_KEY_VAL(arr_hash_sub, num_key, key, data_sub) 212 | { 213 | (void)num_key; 214 | unsigned long keyFound = 0; 215 | if (returnCode != 0 || has_exception){ 216 | break; 217 | } 218 | #if !PHP7 219 | unsigned int key_len; 220 | unsigned long index; 221 | if (zend_hash_get_current_key_ex(arr_hash_sub, &key, &key_len, &index, 0, &pointer) != HASH_KEY_IS_STRING) { 222 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid structure (bad sub-array key)! Cannot initialize." 223 | "Pattern idx: %ld", (long)pidx); 224 | returnCode = -1; 225 | break; 226 | } 227 | #endif 228 | 229 | // determine known keys 230 | if (!key){ 231 | keyFound|=2; 232 | }else if (COMPAT_STR_EQUALS_CI("key", key)){ 233 | keyFound|=1; 234 | } else if (COMPAT_STR_EQUALS_CI("value", key)){ 235 | keyFound|=2; 236 | } else if (COMPAT_STR_EQUALS_CI("ignoreCase", key)){ 237 | keyFound|=4; 238 | } else if (COMPAT_STR_EQUALS_CI("id", key)){ 239 | keyFound|=8; 240 | } else if (COMPAT_STR_EQUALS_CI("aux", key)){ 241 | keyFound|=0x10; 242 | } else { 243 | php_error_docref(NULL TSRMLS_CC, E_WARNING, 244 | "Invalid structure (unrecognized sub-array key)! " 245 | "Only allowed are: {key, id, value, aux, ignoreCase}. Cannot initialize. 
" 246 | "Pattern index: %ld", (long)pidx); 247 | returnCode = -2; 248 | break; 249 | } 250 | allKeys |= keyFound; 251 | 252 | // Numeric identifier 253 | if ((keyFound & 0x8) > 0){ 254 | if (COMPAT_Z_TYPE_P(*data_sub) != IS_LONG){ 255 | sprintf(exception_buffer, "Invalid type of pattern ID given (long required), type: %s, pattern index: %ld", php_aho_type_str(COMPAT_Z_TYPE_P(*data_sub)), (long)pidx); 256 | PATTERN_EXCEPTION(); 257 | } 258 | 259 | long keyId = COMPAT_Z_LVAL(*data_sub); 260 | tmpStruct->keyId = keyId; 261 | tmpStruct->keyType = AC_PATTID_TYPE_NUMBER; 262 | } 263 | 264 | // Aux object 265 | if ((keyFound & 0x10) > 0){ 266 | // No copying using same reference. 267 | tmpStruct->auxObj = *data_sub; 268 | COMPAT_ZVAL_COPY(&(tmpStruct->auxObj), data_sub); // soft-copy 269 | } 270 | 271 | // ignoreCase - deprecated. 272 | if ((keyFound & 0x4) > 0){ 273 | tmpStruct->ignoreCase = 0; 274 | } 275 | 276 | // key/value present -> process 277 | if ((keyFound & 0x3) > 0){ 278 | char * stmp = NULL; 279 | 280 | if (COMPAT_Z_TYPE_P(*data_sub) != IS_STRING){ 281 | sprintf(exception_buffer, "Pattern %s has to be a string, type: %s, pattern index: %ld", 282 | keyFound == 0x1 ? "key" : "value", 283 | php_aho_type_str(COMPAT_Z_TYPE_P(*data_sub)), 284 | (long)pidx); 285 | PATTERN_EXCEPTION(); 286 | } 287 | 288 | // Avoid string copy, use reference counting. 
289 | stmp = COMPAT_Z_STRVAL(*data_sub); 290 | if (keyFound == 0x1){ 291 | COMPAT_ZVAL_COPY(&(tmpStruct->zKey), data_sub); // soft-copy 292 | tmpStruct->key = stmp; 293 | tmpStruct->keyType = AC_PATTID_TYPE_STRING; 294 | } else if (keyFound == 0x2){ 295 | // value 296 | COMPAT_ZVAL_COPY(&(tmpStruct->zVal), data_sub); // soft-copy 297 | tmpStruct->value = stmp; 298 | tmpStruct->valueLen = COMPAT_Z_STRLEN(*data_sub); 299 | } 300 | } 301 | } COMPAT_ZEND_HASH_FOREACH_END(); 302 | 303 | // sanity check, if failed, return false 304 | if (returnCode == 0 && tmpStruct->value==NULL){ 305 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "No value was specified for pattern index: %ld", (long)pidx); 306 | returnCode = -2; 307 | } 308 | 309 | // numeric key and string identifier are mutually exclusive 310 | if (returnCode == 0 && (allKeys & 0x1) && (allKeys & 0x8)){ 311 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "Pattern can have either numeric or string identifier, not both! Pattern index: %ld", (long)pidx); 312 | returnCode = -3; 313 | } 314 | 315 | // Deprecate ignoreCase option 316 | if (allKeys & 0x4){ 317 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "ignoreCase attribute is deprecated and is ignored. Pattern index: %ld", (long)pidx); 318 | } 319 | 320 | // If everything went well, we can return successfully. 321 | if (returnCode == 0){ 322 | return 0; 323 | } 324 | 325 | // Otherwise deallocate this entry. 
326 | php_ahocorasick_dealloc_pattern(tmpStruct); 327 | if (has_exception) { 328 | #if PHP7 329 | zend_throw_exception(aho_exception_ce, exception_buffer, 0 TSRMLS_CC); 330 | #else 331 | php_error_docref(NULL TSRMLS_CC, E_ERROR, "%s", exception_buffer); 332 | #endif 333 | } 334 | return returnCode; 335 | #undef PATTERN_EXCEPTION 336 | } 337 | 338 | /** 339 | * Adds the given list to the pattern list 340 | */ 341 | static inline int php_ahocorasick_add_patterns(ahocorasick_master_t * master, ahocorasick_pattern_t * tmpStruct, ahocorasick_pattern_t * tmpStructLast, long sublistSize){ 342 | if (master == NULL || tmpStruct == NULL){ 343 | return -1; 344 | } 345 | 346 | tmpStruct->prev = NULL; 347 | tmpStructLast->next = master->patterns; 348 | 349 | if (master->patterns){ 350 | master->patterns->prev = tmpStructLast; 351 | } 352 | master->patterns = tmpStruct; 353 | master->pattern_count += sublistSize; 354 | return 0; 355 | } 356 | 357 | /** 358 | * Adds given pattern to the doubly linked list. Does not copy memory, embbeds given structure directly to the list. 359 | */ 360 | static inline int php_ahocorasick_add_pattern(ahocorasick_master_t * master, ahocorasick_pattern_t * tmpStruct){ 361 | return php_ahocorasick_add_patterns(master, tmpStruct, tmpStruct, 1); 362 | } 363 | 364 | /** 365 | * Releases all associated memory in linked list of patterns 366 | */ 367 | static inline int php_ahocorasick_release_patterns(ahocorasick_master_t * master){ 368 | if (master == NULL){ 369 | return -1; 370 | } 371 | 372 | ahocorasick_pattern_t * p0 = master->patterns; 373 | while(p0){ 374 | ahocorasick_pattern_t * next = p0->next; 375 | php_ahocorasick_dealloc_pattern(p0); 376 | efree(p0); 377 | p0 = next; 378 | } 379 | 380 | master->patterns = NULL; 381 | master->pattern_count = 0; 382 | 383 | return 0; 384 | } 385 | 386 | /** 387 | * Reads array of patterns, adds them to the search trie. 
388 | */ 389 | static inline int php_ahocorasick_process_patterns(ahocorasick_master_t * master, HashTable * arr_hash TSRMLS_DC){ 390 | int pattern_processing_status = 0; 391 | COMPAT_ZVAL *data; 392 | zend_long curIdx = 0; 393 | zend_string *key; 394 | ahocorasick_pattern_t * p0 = NULL; 395 | ahocorasick_pattern_t * p1 = NULL; 396 | ahocorasick_pattern_t * prevPattern = NULL; 397 | ahocorasick_pattern_t * lastPattern = NULL; 398 | 399 | // iterate input initialized array 400 | COMPAT_ZEND_HASH_FOREACH_KEY_VAL(arr_hash, curIdx, key, data) { 401 | (void)key; 402 | // check structure 403 | if (COMPAT_Z_TYPE_PP(data) != IS_ARRAY) { 404 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid pattern structure! Cannot initialize."); 405 | pattern_processing_status = -4; 406 | break; 407 | } 408 | 409 | // now we know that element is another array - iterate over it again and gain needed info 410 | ahocorasick_pattern_t * tmpStruct = (ahocorasick_pattern_t *) emalloc(sizeof(ahocorasick_pattern_t)); 411 | if (curIdx == 0){ 412 | lastPattern = tmpStruct; 413 | } 414 | 415 | // Construct as a doubly linked list. 416 | tmpStruct->prev = NULL; 417 | tmpStruct->next = prevPattern; 418 | if (prevPattern){ 419 | prevPattern->prev = tmpStruct; 420 | } 421 | prevPattern = tmpStruct; 422 | 423 | // iterate over sub array 424 | HashTable *arr_hash_sub = COMPAT_Z_ARRVAL_P(data); 425 | int status_code = php_ahocorasick_process_pattern(curIdx, tmpStruct, arr_hash_sub TSRMLS_CC); 426 | if (status_code != 0){ 427 | pattern_processing_status = -1; 428 | break; 429 | } 430 | } COMPAT_ZEND_HASH_FOREACH_END(); 431 | 432 | // if processing failed, free memory. 433 | if (pattern_processing_status != 0){ 434 | p0 = prevPattern; 435 | while(p0){ 436 | p1 = p0->next; 437 | php_ahocorasick_dealloc_pattern(p0); 438 | efree(p0); 439 | p0 = p1; 440 | } 441 | 442 | return pattern_processing_status; 443 | } 444 | 445 | // Nothing to process. 
446 | if (prevPattern == NULL){ 447 | return 0; 448 | } 449 | 450 | // 451 | // now is everything OK (input data parsed properly) -> initialize AHO automata 452 | // 453 | 454 | // Add all patterns at once to the internal data structures 455 | php_ahocorasick_add_patterns(master, prevPattern, lastPattern, curIdx); 456 | 457 | p0 = prevPattern; 458 | while(p0){ 459 | AC_PATTERN_t tmp_patt; 460 | p1 = p0->next; 461 | 462 | // Construct search pattern for AhoCorasick library. 463 | // search string 464 | tmp_patt.ptext.astring = p0->value; 465 | tmp_patt.ptext.length = p0->valueLen; 466 | 467 | //The replacement pattern is not applicable in this program, so better 468 | //to initialize it with 0/ 469 | tmp_patt.rtext.astring = NULL; 470 | tmp_patt.rtext.length = 0; 471 | 472 | // search value key 473 | tmp_patt.id.type = p0->keyType; 474 | if (p0->keyType == AC_PATTID_TYPE_NUMBER){ 475 | tmp_patt.id.u.number = p0->keyId; 476 | } else if (p0->keyType == AC_PATTID_TYPE_STRING) { 477 | tmp_patt.id.u.stringy = p0->key; 478 | } 479 | 480 | // Aux object holds the whole pattern in our representation. 481 | tmp_patt.aux = (void*)p0; 482 | 483 | // add this pattern to trie. copy pattern to internal memory. 
484 | ac_trie_add(master->acap, &tmp_patt, 1); 485 | p0 = p1; 486 | } 487 | 488 | return pattern_processing_status; 489 | } 490 | 491 | /** 492 | * Destructor of ahocorasick_pattern_t resource 493 | */ 494 | static void php_ahocorasick_pattern_t_master_dtor(COMPAT_RESOURCE_PARAM(rsrc)) 495 | { 496 | ahocorasick_master_t *aho = (ahocorasick_master_t*)rsrc->ptr; 497 | if (aho == NULL){ 498 | return; 499 | } 500 | 501 | if (aho->patterns != NULL) { 502 | // release automata here 503 | if (aho->acap != NULL) { 504 | ac_trie_release(aho->acap); 505 | } 506 | 507 | php_ahocorasick_release_patterns(aho); 508 | } 509 | 510 | // release holder structure 511 | efree(aho); 512 | } 513 | 514 | #ifdef AHOCORASICK_USE_LOWER 515 | /** 516 | * Invokes PHP function that converts string to lower case. 517 | * Calling mb_strtolower didn't work properly because of unavailability 518 | * of php_unicode.h, thus we are using call_user_function. 519 | * 520 | * Calls PHP function mb_strtolower from user space 521 | */ 522 | static char * php_ahocorasick_mb_strtolower(char * input TSRMLS_DC){ 523 | zval ret, function_name, *params[1]; 524 | 525 | // construct function to call 526 | COMPAT_ZVAL_STRING(&function_name, "mb_strtolower"); 527 | // construct parameter to pass to target function 528 | COMPAT_MAKE_STD_ZVAL(params[0]); 529 | COMPAT_ZVAL_STRING(params[0], input); // TODO, 1? 
530 | // invoke target function 531 | if (call_user_function(EG(function_table), NULL, &function_name, &ret, 1, params TSRMLS_CC) == SUCCESS){ 532 | return Z_STRVAL(ret); 533 | } else { 534 | return NULL; 535 | } 536 | } 537 | #endif 538 | 539 | /** 540 | * AhoCorasick callback handler - MATCH_CALBACK_t type 541 | */ 542 | static int php_ahocorasick_match_handler(AC_MATCH_t * m, void * param) 543 | { 544 | // variable to hold sub array - one found result 545 | COMPAT_ZVAL mysubarray; 546 | unsigned int j; 547 | 548 | /* example of sending parameter to call-back function */ 549 | struct ahocorasick_callback_payload_t * myp = (struct ahocorasick_callback_payload_t *) param; 550 | if (COMPAT_Z_ISUNDEF(myp->resultArray)) { 551 | // invalid condition - result array not initialized 552 | return 0; 553 | } 554 | 555 | for (j = 0; j < m->size; j++) { 556 | // dump found matches to result array 557 | ahocorasick_pattern_t * curPattern = (ahocorasick_pattern_t *) m->patterns[j].aux; 558 | if (curPattern == NULL){ 559 | continue; 560 | } 561 | 562 | COMPAT_ALLOC_INIT_ZVAL(mysubarray); 563 | array_init(COMPAT_Z_ARREF(mysubarray)); 564 | add_assoc_long(COMPAT_Z_ARREF(mysubarray), "pos", m->position); 565 | 566 | if (m->patterns[j].id.type == AC_PATTID_TYPE_STRING){ 567 | COMPAT_ADD_ASSOC_ZVAL(mysubarray, "key", curPattern->zKey); 568 | 569 | } else if (m->patterns[j].id.type == AC_PATTID_TYPE_NUMBER){ 570 | add_assoc_long(COMPAT_Z_ARREF(mysubarray), "keyIdx", m->patterns[j].id.u.number); 571 | 572 | } 573 | 574 | if (!COMPAT_Z_ISUNDEF(curPattern->auxObj)){ 575 | COMPAT_ADD_ASSOC_ZVAL(mysubarray, "aux", curPattern->auxObj); 576 | } 577 | 578 | add_assoc_long(COMPAT_Z_ARREF(mysubarray), "start_postion", (m->position - COMPAT_Z_STRLEN_PP(COMPAT_Z_ARREF(curPattern->zVal)))); 579 | 580 | COMPAT_ADD_ASSOC_ZVAL(mysubarray, "value", curPattern->zVal); 581 | 582 | // add to aggregate array 583 | add_next_index_zval(COMPAT_Z_ARREF(myp->resultArray), COMPAT_Z_ARREF(mysubarray)); 584 | } 
585 | 586 | // return 1 if we want to find just first 587 | // to find all return 0 588 | return myp->retVal == 0 ? 0 : 1; 589 | } 590 | 591 | PHP_RINIT_FUNCTION(ahocorasick) 592 | { 593 | return SUCCESS; 594 | } 595 | 596 | PHP_MINIT_FUNCTION(ahocorasick) 597 | { 598 | // destruction of ahocorasick_pattern_t master 599 | le_ahocorasick_master = zend_register_list_destructors_ex(php_ahocorasick_pattern_t_master_dtor, NULL, PHP_AHOSTRUCT_MASTER_RES_NAME, module_number); 600 | 601 | #if PHP7 602 | zend_class_entry ce; 603 | INIT_CLASS_ENTRY(ce, "AhoException", NULL); 604 | aho_exception_ce = zend_register_internal_class_ex(&ce, zend_ce_exception); 605 | #endif 606 | 607 | //ZEND_INIT_MODULE_GLOBALS(ahocorasick, php_ahocorasick_init_globals, NULL); 608 | //REGISTER_INI_ENTRIES(); 609 | return SUCCESS; 610 | } 611 | 612 | PHP_MSHUTDOWN_FUNCTION(ahocorasick) 613 | { 614 | //UNREGISTER_INI_ENTRIES(); 615 | return SUCCESS; 616 | } 617 | 618 | /** 619 | * Returns whether current AhoCorasick resource is valid 620 | * @param 621 | * @return 622 | */ 623 | PHP_FUNCTION(ahocorasick_isValid) 624 | { 625 | ahocorasick_master_t * ahoMaster = NULL; 626 | 627 | #if PHP7 628 | zval *zid; 629 | ZEND_PARSE_PARAMETERS_START(1,1) 630 | Z_PARAM_RESOURCE(zid) 631 | ZEND_PARSE_PARAMETERS_END(); 632 | 633 | if (Z_RES_TYPE_P(zid) != le_ahocorasick_master){ 634 | RETURN_FALSE; 635 | } 636 | 637 | // fetch resource passed as parameter 638 | ahoMaster = (ahocorasick_master_t*) zend_fetch_resource(Z_RES_P(zid), PHP_AHOSTRUCT_MASTER_RES_NAME, le_ahocorasick_master); 639 | 640 | #else 641 | zval *zval_aho_master = NULL; 642 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z", &zval_aho_master) == FAILURE) { 643 | RETURN_NULL(); 644 | } 645 | 646 | // fetch resource passed as parameter 647 | ahoMaster = (ahocorasick_master_t*) zend_fetch_resource(&zval_aho_master TSRMLS_CC, -1, NULL, NULL, 1, le_ahocorasick_master); 648 | #endif 649 | 650 | if (ahoMaster==NULL || ahoMaster->init_ok != 1){ 
651 | RETURN_FALSE; 652 | } else { 653 | RETURN_TRUE; 654 | } 655 | } 656 | 657 | /** 658 | * Basic strtolower matcher. 659 | * Returns array of keys matched (strings). 660 | * 661 | * @param 662 | * @return 663 | */ 664 | PHP_FUNCTION(ahocorasick_match) 665 | { 666 | #ifdef AHOCORASICK_USE_LOWER 667 | char *lowered; 668 | #endif 669 | char *normal; 670 | zend_bool findAll = 1; 671 | ahocorasick_master_t * ahoMaster = NULL; 672 | AC_TEXT_t tmp_text; 673 | 674 | #if PHP7 675 | zend_string *uservar; 676 | zval *zid; 677 | ZEND_PARSE_PARAMETERS_START(2,3) 678 | Z_PARAM_STR(uservar) 679 | Z_PARAM_RESOURCE(zid) 680 | Z_PARAM_OPTIONAL 681 | Z_PARAM_BOOL(findAll) 682 | ZEND_PARSE_PARAMETERS_END(); 683 | // fetch resource passed as parameter 684 | ahoMaster = (ahocorasick_master_t*) zend_fetch_resource(Z_RES_P(zid) , PHP_AHOSTRUCT_MASTER_RES_NAME, le_ahocorasick_master); 685 | #else 686 | zval *uservar=NULL; 687 | zval *zval_aho_master = NULL; 688 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "zz|b", &uservar, &zval_aho_master, &findAll) == FAILURE) { 689 | RETURN_NULL(); 690 | } 691 | 692 | // fetch resource passed as parameter 693 | ahoMaster = (ahocorasick_master_t*) zend_fetch_resource(&zval_aho_master TSRMLS_CC, -1, NULL, NULL, 1, le_ahocorasick_master); 694 | #endif 695 | 696 | if (ahoMaster==NULL){ 697 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid resource."); 698 | RETURN_FALSE; 699 | } 700 | 701 | if (ahoMaster->init_ok != 1){ 702 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "Not initialized."); 703 | RETURN_FALSE; 704 | } 705 | 706 | // finalize trie if not finalized already 707 | php_ahocorasick_finalize(ahoMaster); 708 | 709 | normal = COMPAT_Z_STRVAL_P(uservar); 710 | 711 | // prints to stdout 712 | // PHPWRITE(ZSTR_VAL(uservar), ZSTR_LEN(uservar)); 713 | 714 | #ifdef AHOCORASICK_USE_LOWER 715 | // at first, obtain also lower case variant 716 | // strtolower is disabled now, exact match is required 717 | #if PHP7 718 | lowered = 
php_ahocorasick_mb_strtolower(Z_STR_P(uservar) TSRMLS_CC); 719 | #else 720 | lowered = php_ahocorasick_mb_strtolower(Z_STRVAL_P(uservar) TSRMLS_CC); 721 | #endif 722 | tmp_text.astring = lowered; 723 | #else 724 | //*** 6. Set input text 725 | tmp_text.astring = normal; 726 | #endif 727 | tmp_text.length = COMPAT_Z_STRLEN_P(uservar); 728 | 729 | /* Sending parameter to call-back function */ 730 | // initialize return array 731 | array_init(return_value); 732 | struct ahocorasick_callback_payload_t my_param; 733 | my_param.retVal = 0; 734 | 735 | #if PHP7 736 | my_param.resultArray = *return_value; 737 | #else 738 | my_param.resultArray = return_value; 739 | #endif 740 | 741 | // find all defined 742 | my_param.retVal = findAll ? 0:1; 743 | 744 | //*** 7. Do search 745 | ac_trie_search(ahoMaster->acap, &tmp_text, 0, php_ahocorasick_match_handler, (void *)(&my_param)); 746 | } 747 | 748 | /** 749 | * De-initializes AhoCorasick master resource 750 | * 751 | * @param 752 | * @return 753 | */ 754 | PHP_FUNCTION(ahocorasick_deinit) 755 | { 756 | ahocorasick_master_t * ahoMaster = NULL; 757 | 758 | #if PHP7 759 | zval *zid; 760 | ZEND_PARSE_PARAMETERS_START(1,1) 761 | Z_PARAM_RESOURCE(zid) 762 | ZEND_PARSE_PARAMETERS_END(); 763 | 764 | if (Z_RES_TYPE_P(zid) != le_ahocorasick_master){ 765 | RETURN_FALSE; 766 | } 767 | 768 | ahoMaster = (ahocorasick_master_t*) zend_fetch_resource(Z_RES_P(zid), PHP_AHOSTRUCT_MASTER_RES_NAME, le_ahocorasick_master); 769 | #else 770 | zval *zval_aho_master; 771 | 772 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z", &zval_aho_master) == FAILURE) { 773 | RETURN_NULL(); 774 | } 775 | ahoMaster = (ahocorasick_master_t*) zend_fetch_resource(&zval_aho_master TSRMLS_CC, -1, NULL, NULL, 1, le_ahocorasick_master); 776 | #endif 777 | 778 | if(ahoMaster==NULL){ 779 | RETURN_FALSE; 780 | } 781 | 782 | php_ahocorasick_finalize(ahoMaster); 783 | ahoMaster->init_ok = 0; 784 | 785 | #if PHP7 786 | zend_list_close(Z_RES_P(zid)); 787 | #else 788 | 
zend_list_delete(Z_LVAL_P(zval_aho_master)); 789 | #endif 790 | RETURN_TRUE; 791 | } 792 | 793 | /** 794 | * Initializes AhoCorasick search structure with passed data 795 | * @param 796 | * @return 797 | */ 798 | PHP_FUNCTION(ahocorasick_init) 799 | { 800 | zval *arr; 801 | HashTable *arr_hash; 802 | 803 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "a", &arr) == FAILURE) { 804 | RETURN_NULL(); 805 | } 806 | 807 | // Get matching patterns 808 | arr_hash = Z_ARRVAL_P(arr); 809 | 810 | // create resource in holder structure, fill with data, return 811 | ahocorasick_master_t * ahomaster = emalloc(sizeof(ahocorasick_master_t)); 812 | ahomaster->acap = ac_trie_create(); 813 | ahomaster->ac_finalized = 0; 814 | ahomaster->init_ok = 0; 815 | ahomaster->patterns = NULL; 816 | ahomaster->pattern_count = 0; 817 | 818 | int pattern_processing_status = php_ahocorasick_process_patterns(ahomaster, arr_hash TSRMLS_CC); 819 | if (pattern_processing_status != 0){ 820 | php_ahocorasick_release_patterns(ahomaster); 821 | ac_trie_release(ahomaster->acap); 822 | efree(ahomaster); 823 | RETURN_FALSE; 824 | } 825 | 826 | // pass ACAP object - holding aho automaton 827 | ahomaster->init_ok = 1; 828 | // register this resource for ZEND engine 829 | #if PHP7 830 | ZVAL_RES(return_value, zend_register_resource(ahomaster, le_ahocorasick_master)); 831 | #else 832 | ZEND_REGISTER_RESOURCE(return_value, ahomaster, le_ahocorasick_master); 833 | #endif 834 | // ahocorasick_pattern_t build OK. 835 | // Keep in mind that we are not freeing strings allocated in memory, it is 836 | // still used internally in aho structure, this free is postponed to releasing 837 | // aho structure. 
838 | } 839 | 840 | /** 841 | * Finalizes aho corasick search structure 842 | * @param 843 | * @return 844 | */ 845 | PHP_FUNCTION(ahocorasick_finalize) 846 | { 847 | ahocorasick_master_t * ahoMaster = NULL; 848 | 849 | #if PHP7 850 | zval *zid; 851 | ZEND_PARSE_PARAMETERS_START(1,1) 852 | Z_PARAM_RESOURCE(zid) 853 | ZEND_PARSE_PARAMETERS_END(); 854 | ahoMaster = (ahocorasick_master_t*) zend_fetch_resource(Z_RES_P(zid) , PHP_AHOSTRUCT_MASTER_RES_NAME, le_ahocorasick_master); 855 | 856 | #else 857 | zval *zval_aho_master; 858 | 859 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "z", &zval_aho_master) == FAILURE) { 860 | RETURN_NULL(); 861 | } 862 | ahoMaster = (ahocorasick_master_t*) zend_fetch_resource(&zval_aho_master TSRMLS_CC, -1, NULL, NULL, 1, le_ahocorasick_master); 863 | #endif 864 | 865 | // fetch resource passed as parameter 866 | if (ahoMaster==NULL){ 867 | RETURN_FALSE; 868 | } else { 869 | if (php_ahocorasick_finalize(ahoMaster)) { 870 | RETURN_TRUE; 871 | } else { 872 | RETURN_FALSE; 873 | } 874 | } 875 | } 876 | 877 | /** 878 | * Adds search patterns to the non-finalized search trie. 
879 | * @param 880 | * @return 881 | */ 882 | PHP_FUNCTION(ahocorasick_add_patterns) 883 | { 884 | zval *arr; 885 | ahocorasick_master_t *ahoMaster = NULL; 886 | HashTable *arr_hash = NULL; 887 | 888 | #if PHP7 889 | zval *zid; 890 | ZEND_PARSE_PARAMETERS_START(2,2) 891 | Z_PARAM_RESOURCE(zid) 892 | Z_PARAM_ARRAY(arr) 893 | ZEND_PARSE_PARAMETERS_END(); 894 | 895 | ahoMaster = (ahocorasick_master_t*) zend_fetch_resource(Z_RES_P(zid) , PHP_AHOSTRUCT_MASTER_RES_NAME, le_ahocorasick_master); 896 | 897 | #else 898 | zval *zval_aho_master; 899 | 900 | if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "za", &zval_aho_master, &arr) == FAILURE) { 901 | RETURN_NULL(); 902 | } 903 | 904 | // fetch resource passed as parameter 905 | ahoMaster = (ahocorasick_master_t*) zend_fetch_resource(&zval_aho_master TSRMLS_CC, -1, NULL, NULL, 1, le_ahocorasick_master); 906 | #endif 907 | 908 | arr_hash = Z_ARRVAL_P(arr); 909 | if (ahoMaster == NULL || ahoMaster->init_ok != 1){ 910 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "Cannot add a new pattern, not initialized"); 911 | RETURN_FALSE; 912 | } 913 | 914 | if (ahoMaster->ac_finalized){ 915 | php_error_docref(NULL TSRMLS_CC, E_WARNING, "Cannot add a new pattern to finalized search structure"); 916 | RETURN_FALSE; 917 | } 918 | 919 | int pattern_processing_status = php_ahocorasick_process_patterns(ahoMaster, arr_hash TSRMLS_CC); 920 | if (pattern_processing_status != 0){ 921 | RETURN_FALSE; 922 | } else { 923 | RETURN_TRUE; 924 | } 925 | } 926 | -------------------------------------------------------------------------------- /src/php_ahocorasick.h: -------------------------------------------------------------------------------- 1 | /* 2 | * php_ahocorasick.h: PHP Aho Corasick extension header file 3 | * 4 | Copyright 2010-2013 Ph4r05 5 | 6 | This software is free software: you can redistribute it and/or modify 7 | it under the terms of the GNU Lesser General Public License as published by 8 | the Free Software Foundation, either 
version 3 of the License, or 9 | (at your option) any later version. 10 | 11 | This software is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU Lesser General Public License for more details. 15 | 16 | You should have received a copy of the GNU Lesser General Public License 17 | along with this software. If not, see . 18 | 19 | This code uses thirdparty code: 20 | MultiFast (http://sourceforge.net/projects/multifast/?source=dlp) 21 | */ 22 | 23 | #ifndef PHP_AHOCORASICK_H 24 | #define PHP_AHOCORASICK_H 1 25 | #ifdef ZTS 26 | #include "TSRM.h" 27 | #endif 28 | 29 | #define PHP_AHOCORASICK_VERSION "0.0.7" 30 | #define PHP_AHOCORASICK_EXTNAME "ahocorasick" 31 | 32 | #ifndef PHP_MAJOR_VERSION 33 | #define PHP_MAJOR_VERSION 7 34 | #endif 35 | 36 | // Compatibility 37 | #ifndef TSRMLS_CC 38 | #define TSRMLS_CC 39 | #define TSRMLS_DC 40 | #endif 41 | 42 | #if PHP_MAJOR_VERSION < 7 43 | #define PHP7 0 44 | typedef long zend_long; 45 | typedef char zend_string; 46 | typedef int strsize_t; 47 | #define COMPAT_ZVAL zval * 48 | #define COMPAT_ZVAL_UNDEF(x) x = NULL 49 | #define COMPAT_Z_ISUNDEF(x) (x == NULL) 50 | #define COMPAT_Z_RESOURCE zval 51 | #define COMPAT_ZEND_HASH_FOREACH_KEY_VAL(arr_hash, idx, key, data) \ 52 | { HashPosition pointer; \ 53 | for(zend_hash_internal_pointer_reset_ex(arr_hash, &pointer); \ 54 | zend_hash_get_current_data_ex(arr_hash, (void**) &(data), &pointer) == SUCCESS; \ 55 | zend_hash_move_forward_ex(arr_hash, &pointer), (idx)+=1) 56 | #define COMPAT_ZEND_HASH_FOREACH_END() } 57 | 58 | #define COMPAT_STR_EQUALS_CI(s, x) strcasecmp(s, x) == 0 59 | #define COMPAT_RESOURCE_PARAM(x) zend_rsrc_list_entry *(x) TSRMLS_DC 60 | #define COMPAT_ZVAL_STRING(str, len) ZVAL_STRING(str, len, 0) 61 | #define COMPAT_RETURN_STRING(str) RETURN_STRING(str, 0) 62 | #define COMPAT_Z_TYPE_PP(x) Z_TYPE_PP(x) 63 | #define 
COMPAT_Z_TYPE_P(x) Z_TYPE_P(x) 64 | #define COMPAT_Z_LVAL(x) Z_LVAL(*(x)) 65 | #define COMPAT_Z_STRVAL(x) Z_STRVAL(*(x)) 66 | #define COMPAT_Z_ARRVAL_P(x) Z_ARRVAL_P(*(x)) 67 | #define COMPAT_Z_ADDREF_P(x) Z_ADDREF_P(x) 68 | #define COMPAT_Z_STRLEN(x) Z_STRLEN(*(x)) 69 | #define COMPAT_Z_STRLEN_P(x) Z_STRLEN_P(x) 70 | #define COMPAT_Z_STRLEN_PP(x) Z_STRLEN_P(x) 71 | #define COMPAT_Z_STRVAL_P(x) Z_STRVAL_P(x) 72 | #define COMPAT_Z_ARREF(x) (x) 73 | #define COMPAT_Z_REFCOUNTED(zval) 1 74 | #define COMPAT_TRY_ADDREF_P(SRC) COMPAT_Z_ADDREF_P(SRC) 75 | 76 | #define COMPAT_MAKE_STD_ZVAL(x) MAKE_STD_ZVAL(x) 77 | #define COMPAT_DECLARE_ZVAL(name) zval *name 78 | #define COMPAT_ALLOC_INIT_ZVAL(name) ALLOC_INIT_ZVAL(name) 79 | #define COMPAT_ALLOC_INIT_ZVAL2(name) ALLOC_INIT_ZVAL(name) 80 | #define COMPAT_ZVAL_COPY(z, v) do { \ 81 | Z_ADDREF_P(*(v)); \ 82 | *z = *v; \ 83 | } while(0) 84 | #define hp_ptr_dtor(val) zval_ptr_dtor(&(val)) 85 | 86 | #define COMPAT_ADD_ASSOC_ZVAL(ARR, KEY, SRC) do { \ 87 | COMPAT_TRY_ADDREF_P(SRC); \ 88 | add_assoc_zval(COMPAT_Z_ARREF(ARR), KEY, COMPAT_Z_ARREF(SRC)); \ 89 | } while(0) 90 | 91 | #else 92 | #define PHP7 1 93 | typedef size_t strsize_t; 94 | #define COMPAT_ZVAL zval 95 | #define COMPAT_ZVAL_UNDEF(x) ZVAL_UNDEF(&(x)) 96 | #define COMPAT_Z_ISUNDEF(x) Z_ISUNDEF(x) 97 | #define COMPAT_Z_RESOURCE zend_resource 98 | #define COMPAT_ZEND_HASH_FOREACH_KEY_VAL(arr_hash, idx, key, data) ZEND_HASH_FOREACH_KEY_VAL(arr_hash, idx, key, data) 99 | #define COMPAT_ZEND_HASH_FOREACH_END() ZEND_HASH_FOREACH_END() 100 | #define COMPAT_STR_EQUALS_CI(s, x) zend_string_equals_ci(zend_string_init(ZEND_STRL(s), 0), x) 101 | #define COMPAT_RESOURCE_PARAM(x) zend_resource *(x) 102 | #define COMPAT_ZVAL_STRING(str, len) ZVAL_STRING(str, len) 103 | #define COMPAT_RETURN_STRING(str) RETURN_STRING(str) 104 | #define COMPAT_Z_TYPE_PP(x) Z_TYPE_P(x) 105 | #define COMPAT_Z_TYPE_P(x) Z_TYPE(x) 106 | #define COMPAT_Z_LVAL(x) Z_LVAL(x) 107 | #define COMPAT_Z_STRVAL(x) 
Z_STRVAL(x) 108 | #define COMPAT_Z_ARRVAL_P(x) Z_ARRVAL_P(x) 109 | #define COMPAT_Z_ADDREF_P(x) Z_ADDREF(x) 110 | #define COMPAT_Z_STRLEN(x) Z_STRLEN(x) 111 | #define COMPAT_Z_STRLEN_P(x) ZSTR_LEN(x) 112 | #define COMPAT_Z_STRLEN_PP(x) Z_STRLEN_P(x) 113 | #define COMPAT_Z_STRVAL_P(x) ZSTR_VAL(x) 114 | #define COMPAT_Z_ARREF(x) &(x) 115 | #define COMPAT_Z_REFCOUNTED(zval) ((Z_TYPE_FLAGS(zval) & IS_TYPE_REFCOUNTED) != 0) 116 | #define COMPAT_TRY_ADDREF_P(SRC) Z_TRY_ADDREF_P(SRC) 117 | 118 | #define COMPAT_MAKE_STD_ZVAL(x) 119 | #define COMPAT_DECLARE_ZVAL(name) zval name ## _v; zval * name = &name ## _v 120 | #define COMPAT_ALLOC_INIT_ZVAL(name) 121 | #define COMPAT_ALLOC_INIT_ZVAL2(name) ZVAL_NULL(name) 122 | #define COMPAT_ZVAL_COPY(z, v) ZVAL_COPY(z, v) 123 | #define hp_ptr_dtor(val) zval_ptr_dtor(val) 124 | 125 | #define COMPAT_ADD_ASSOC_ZVAL(ARR, KEY, SRC) do { \ 126 | COMPAT_TRY_ADDREF_P(&SRC); \ 127 | add_assoc_zval(COMPAT_Z_ARREF(ARR), KEY, &SRC); \ 128 | } while(0) 129 | 130 | #endif 131 | 132 | // Aho-Corasick import 133 | #include "multifast/ahocorasick.h" 134 | #include "multifast/actypes.h" 135 | 136 | ZEND_BEGIN_MODULE_GLOBALS(ahocorasick) 137 | long counter; 138 | zend_bool direction; 139 | ZEND_END_MODULE_GLOBALS(ahocorasick) 140 | 141 | #ifdef ZTS 142 | #define AHOCORASICK_G(v) TSRMG(ahocorasick_globals_id, zend_ahocorasick_globals *, v) 143 | #else 144 | #define AHOCORASICK_G(v) (ahocorasick_globals.v) 145 | #endif 146 | 147 | /** 148 | * Sample param to pass to callback handler for AhoCorasick search algorithm 149 | */ 150 | struct ahocorasick_callback_payload_t { 151 | // found match will be added here 152 | int retVal; 153 | COMPAT_ZVAL resultArray; 154 | }; 155 | 156 | /** 157 | * AhoCorasick table initialization 158 | */ 159 | typedef struct ahocorasick_pattern_t { 160 | char *key; 161 | long keyId; 162 | enum ac_pattid_type keyType; 163 | COMPAT_ZVAL zKey; 164 | 165 | char *value; 166 | int valueLen; 167 | COMPAT_ZVAL zVal; 168 | 169 | int 
ignoreCase; 170 | COMPAT_ZVAL auxObj; 171 | 172 | struct ahocorasick_pattern_t * next; 173 | struct ahocorasick_pattern_t * prev; 174 | } ahocorasick_pattern_t; 175 | 176 | /** 177 | * Initialized Aho Corasick search structure - resource 178 | */ 179 | typedef struct ahocorasick_master_t { 180 | // aho corasick main search tree 181 | AC_TRIE_t * acap; 182 | // if structure was finalized, is set to 1 183 | unsigned char ac_finalized; 184 | // if initialization was ok, is set to 1. 185 | unsigned char init_ok; 186 | // root of the doubly linked list for search patterns 187 | ahocorasick_pattern_t * patterns; 188 | // size of the search pattern list 189 | long pattern_count; 190 | } ahocorasick_master_t; 191 | 192 | #define PHP_AHOSTRUCT_MASTER_RES_NAME "AhoCorasick search" 193 | #define PHP_AHOSTRUCT_RES_NAME "Ahostruct element data" 194 | 195 | PHP_MINIT_FUNCTION(ahocorasick); 196 | PHP_MSHUTDOWN_FUNCTION(ahocorasick); 197 | PHP_RINIT_FUNCTION(ahocorasick); 198 | PHP_FUNCTION(ahocorasick_match); 199 | PHP_FUNCTION(ahocorasick_init); 200 | PHP_FUNCTION(ahocorasick_deinit); 201 | PHP_FUNCTION(ahocorasick_isValid); 202 | PHP_FUNCTION(ahocorasick_finalize); 203 | PHP_FUNCTION(ahocorasick_add_patterns); 204 | extern zend_module_entry ahocorasick_module_entry; 205 | #define phpext_hello_ptr &ahocorasick_module_entry 206 | 207 | #endif 208 | -------------------------------------------------------------------------------- /src/php_ahocorasick.stub.php: -------------------------------------------------------------------------------- 1 | 5 | --FILE-- 6 | 'ab', 'value'=>'alfa'), 13 | array('key'=>'ac', 'value'=>'beta'), 14 | array('key'=>'ad', 'value'=>'gamma', 'aux'=>array(1)), 15 | array('key'=>'ae', 'value'=>'delta'), 16 | array('id'=>0, 'value'=>'zeta'), 17 | array('key'=>'ag', 'value'=>'omega'), 18 | array('value'=>'lfa') 19 | ); 20 | // initialize search , returns resourceID for search structure 21 | $c = ahocorasick_init($data); 22 | unset($data); 23 | 24 | // perform 
search 1 25 | $d1 = ahocorasick_match("alFABETA gamma zetaomegaalfa!", $c); 26 | // deinitialize search structure (will free memory) 27 | ahocorasick_deinit($c); 28 | 29 | var_dump($d1); 30 | 31 | if (count($d1) != 5){ 32 | throw new Exception("Expected 5 results"); 33 | } 34 | 35 | $ex = ["pos"=>28, "start_postion"=>25, "value"=>"lfa"]; 36 | if ($d1[4] != $ex){ 37 | throw new Exception("Expected"); 38 | } 39 | 40 | // UTF8 check 41 | $check_word=[ 42 | ['value'=>'你好'], 43 | ['value'=>'hi'], 44 | ['value'=>'谢谢'], 45 | ['value'=>'thanks'] 46 | ]; 47 | 48 | $text = "你好,hi,谢谢,thanks"; 49 | 50 | $c = ahocorasick_init($check_word); 51 | 52 | $res = ahocorasick_match($text, $c); 53 | var_dump($res); 54 | 55 | ahocorasick_deinit($c); 56 | 57 | echo "OK\n"; 58 | ?> 59 | --EXPECT-- 60 | array(5) { 61 | [0]=> 62 | array(5) { 63 | ["pos"]=> 64 | int(14) 65 | ["key"]=> 66 | string(2) "ad" 67 | ["aux"]=> 68 | array(1) { 69 | [0]=> 70 | int(1) 71 | } 72 | ["start_postion"]=> 73 | int(9) 74 | ["value"]=> 75 | string(5) "gamma" 76 | } 77 | [1]=> 78 | array(4) { 79 | ["pos"]=> 80 | int(19) 81 | ["keyIdx"]=> 82 | int(0) 83 | ["start_postion"]=> 84 | int(15) 85 | ["value"]=> 86 | string(4) "zeta" 87 | } 88 | [2]=> 89 | array(4) { 90 | ["pos"]=> 91 | int(24) 92 | ["key"]=> 93 | string(2) "ag" 94 | ["start_postion"]=> 95 | int(19) 96 | ["value"]=> 97 | string(5) "omega" 98 | } 99 | [3]=> 100 | array(4) { 101 | ["pos"]=> 102 | int(28) 103 | ["key"]=> 104 | string(2) "ab" 105 | ["start_postion"]=> 106 | int(24) 107 | ["value"]=> 108 | string(4) "alfa" 109 | } 110 | [4]=> 111 | array(3) { 112 | ["pos"]=> 113 | int(28) 114 | ["start_postion"]=> 115 | int(25) 116 | ["value"]=> 117 | string(3) "lfa" 118 | } 119 | } 120 | array(4) { 121 | [0]=> 122 | array(3) { 123 | ["pos"]=> 124 | int(6) 125 | ["start_postion"]=> 126 | int(0) 127 | ["value"]=> 128 | string(6) "你好" 129 | } 130 | [1]=> 131 | array(3) { 132 | ["pos"]=> 133 | int(11) 134 | ["start_postion"]=> 135 | int(9) 136 | ["value"]=> 137 
| string(2) "hi" 138 | } 139 | [2]=> 140 | array(3) { 141 | ["pos"]=> 142 | int(20) 143 | ["start_postion"]=> 144 | int(14) 145 | ["value"]=> 146 | string(6) "谢谢" 147 | } 148 | [3]=> 149 | array(3) { 150 | ["pos"]=> 151 | int(29) 152 | ["start_postion"]=> 153 | int(23) 154 | ["value"]=> 155 | string(6) "thanks" 156 | } 157 | } 158 | OK 159 | -------------------------------------------------------------------------------- /tests/test2.phpt: -------------------------------------------------------------------------------- 1 | --TEST-- 2 | Test 2 3 | --SKIPIF-- 4 | 5 | --FILE-- 6 | 'ab', 'value'=>'alfa'), 17 | array('key'=>'ac', 'value'=>'beta'), 18 | array('key'=>'ad', 'value'=>'gamma', 'aux'=>$aux2), 19 | array('key'=>'ae', 'value'=>'delta', 'aux'=>$aux3), 20 | array('key'=>'af', 'value'=>'zeta'), 21 | array('key'=>'ag', 'value'=>'omega'), 22 | array('key'=>'ah', 'value'=>'lfa'), 23 | array('id'=>42, 'value'=>'pie'), 24 | array('value'=>'simple'), 25 | array('value'=>'aux', 'aux'=>$aux1), 26 | array('value'=>'aux2', 'aux'=>$aux2), 27 | array('value'=>'aux3', 'aux'=>$aux1), 28 | array('value'=>'ščř+éé'), 29 | array('value'=>'éé'), 30 | ); 31 | 32 | $c = ahocorasick_init($data); 33 | $data = array(); // Memoty test. 34 | 35 | $str = "alFABETA gammadelta delta delta simple pie! 
aux ssščř+ééžž ččř é é-é éeéee éé aux2 aux3 aux2"; 36 | $d = ahocorasick_match($str, $c); 37 | echo "AhoCorasick match for ahocorasick_match(\"$str\", c): "; 38 | var_dump($d); 39 | 40 | $str = "alFABETAABECEDAAAA!"; 41 | $d = ahocorasick_match($str, $c); 42 | echo "AhoCorasick match for ahocorasick_match(\"$str\", c): "; 43 | var_dump($d); 44 | 45 | $str = "alFABETAABECEDAAAA!"; 46 | $d = ahocorasick_match($str, $c, false); 47 | echo "AhoCorasick match for ahocorasick_match(\"$str\", c, false): "; 48 | var_dump($d); 49 | 50 | $str = "alFABETAABECEDAAAA!"; 51 | $d = ahocorasick_match($str, $c, true); 52 | echo "AhoCorasick match for ahocorasick_match(\"$str\", c, true): "; 53 | var_dump($d); 54 | 55 | echo "AhoCorasick isValid(c): "; 56 | var_dump(ahocorasick_isValid($c)); 57 | 58 | echo "AhoCorasick deinit(c): "; 59 | var_dump(ahocorasick_deinit($c)); 60 | 61 | if ($c){ 62 | echo "AhoCorasick isValid(c): "; 63 | var_dump(ahocorasick_isValid($c)); 64 | 65 | echo "AhoCorasick deinit(c): "; 66 | var_dump(ahocorasick_deinit($c)); 67 | } 68 | ?> 69 | --EXPECT-- 70 | 71 | AhoCorasick match for ahocorasick_match("alFABETA gammadelta delta delta simple pie! 
aux ssščř+ééžž ččř é é-é éeéee éé aux2 aux3 aux2", c): array(16) { 72 | [0]=> 73 | array(5) { 74 | ["pos"]=> 75 | int(14) 76 | ["key"]=> 77 | string(2) "ad" 78 | ["aux"]=> 79 | int(66) 80 | ["start_postion"]=> 81 | int(9) 82 | ["value"]=> 83 | string(5) "gamma" 84 | } 85 | [1]=> 86 | array(5) { 87 | ["pos"]=> 88 | int(19) 89 | ["key"]=> 90 | string(2) "ae" 91 | ["aux"]=> 92 | string(10) "simple-aux" 93 | ["start_postion"]=> 94 | int(14) 95 | ["value"]=> 96 | string(5) "delta" 97 | } 98 | [2]=> 99 | array(5) { 100 | ["pos"]=> 101 | int(25) 102 | ["key"]=> 103 | string(2) "ae" 104 | ["aux"]=> 105 | string(10) "simple-aux" 106 | ["start_postion"]=> 107 | int(20) 108 | ["value"]=> 109 | string(5) "delta" 110 | } 111 | [3]=> 112 | array(5) { 113 | ["pos"]=> 114 | int(31) 115 | ["key"]=> 116 | string(2) "ae" 117 | ["aux"]=> 118 | string(10) "simple-aux" 119 | ["start_postion"]=> 120 | int(26) 121 | ["value"]=> 122 | string(5) "delta" 123 | } 124 | [4]=> 125 | array(3) { 126 | ["pos"]=> 127 | int(38) 128 | ["start_postion"]=> 129 | int(32) 130 | ["value"]=> 131 | string(6) "simple" 132 | } 133 | [5]=> 134 | array(4) { 135 | ["pos"]=> 136 | int(42) 137 | ["keyIdx"]=> 138 | int(42) 139 | ["start_postion"]=> 140 | int(39) 141 | ["value"]=> 142 | string(3) "pie" 143 | } 144 | [6]=> 145 | array(4) { 146 | ["pos"]=> 147 | int(47) 148 | ["aux"]=> 149 | array(1) { 150 | [0]=> 151 | array(2) { 152 | [0]=> 153 | string(14) "helloAuxObject" 154 | [1]=> 155 | int(41) 156 | } 157 | } 158 | ["start_postion"]=> 159 | int(44) 160 | ["value"]=> 161 | string(3) "aux" 162 | } 163 | [7]=> 164 | array(3) { 165 | ["pos"]=> 166 | int(61) 167 | ["start_postion"]=> 168 | int(50) 169 | ["value"]=> 170 | string(11) "ščř+éé" 171 | } 172 | [8]=> 173 | array(3) { 174 | ["pos"]=> 175 | int(61) 176 | ["start_postion"]=> 177 | int(57) 178 | ["value"]=> 179 | string(4) "éé" 180 | } 181 | [9]=> 182 | array(3) { 183 | ["pos"]=> 184 | int(94) 185 | ["start_postion"]=> 186 | int(90) 187 | ["value"]=> 188 | 
string(4) "éé" 189 | } 190 | [10]=> 191 | array(4) { 192 | ["pos"]=> 193 | int(98) 194 | ["aux"]=> 195 | array(1) { 196 | [0]=> 197 | array(2) { 198 | [0]=> 199 | string(14) "helloAuxObject" 200 | [1]=> 201 | int(41) 202 | } 203 | } 204 | ["start_postion"]=> 205 | int(95) 206 | ["value"]=> 207 | string(3) "aux" 208 | } 209 | [11]=> 210 | array(4) { 211 | ["pos"]=> 212 | int(99) 213 | ["aux"]=> 214 | int(66) 215 | ["start_postion"]=> 216 | int(95) 217 | ["value"]=> 218 | string(4) "aux2" 219 | } 220 | [12]=> 221 | array(4) { 222 | ["pos"]=> 223 | int(103) 224 | ["aux"]=> 225 | array(1) { 226 | [0]=> 227 | array(2) { 228 | [0]=> 229 | string(14) "helloAuxObject" 230 | [1]=> 231 | int(41) 232 | } 233 | } 234 | ["start_postion"]=> 235 | int(100) 236 | ["value"]=> 237 | string(3) "aux" 238 | } 239 | [13]=> 240 | array(4) { 241 | ["pos"]=> 242 | int(104) 243 | ["aux"]=> 244 | array(1) { 245 | [0]=> 246 | array(2) { 247 | [0]=> 248 | string(14) "helloAuxObject" 249 | [1]=> 250 | int(41) 251 | } 252 | } 253 | ["start_postion"]=> 254 | int(100) 255 | ["value"]=> 256 | string(4) "aux3" 257 | } 258 | [14]=> 259 | array(4) { 260 | ["pos"]=> 261 | int(108) 262 | ["aux"]=> 263 | array(1) { 264 | [0]=> 265 | array(2) { 266 | [0]=> 267 | string(14) "helloAuxObject" 268 | [1]=> 269 | int(41) 270 | } 271 | } 272 | ["start_postion"]=> 273 | int(105) 274 | ["value"]=> 275 | string(3) "aux" 276 | } 277 | [15]=> 278 | array(4) { 279 | ["pos"]=> 280 | int(109) 281 | ["aux"]=> 282 | int(66) 283 | ["start_postion"]=> 284 | int(105) 285 | ["value"]=> 286 | string(4) "aux2" 287 | } 288 | } 289 | AhoCorasick match for ahocorasick_match("alFABETAABECEDAAAA!", c): array(0) { 290 | } 291 | AhoCorasick match for ahocorasick_match("alFABETAABECEDAAAA!", c, false): array(0) { 292 | } 293 | AhoCorasick match for ahocorasick_match("alFABETAABECEDAAAA!", c, true): array(0) { 294 | } 295 | AhoCorasick isValid(c): bool(true) 296 | AhoCorasick deinit(c): bool(true) 297 | AhoCorasick isValid(c): 
bool(false) 298 | AhoCorasick deinit(c): bool(false) 299 | -------------------------------------------------------------------------------- /tests/test3.phpt: -------------------------------------------------------------------------------- 1 | --TEST-- 2 | Test 3 3 | --SKIPIF-- 4 | 5 | --FILE-- 6 | 'ab', 'value'=>'alfa'))); 15 | ahocorasick_add_patterns($c, array(array('key'=>'ac', 'value'=>'beta'))); 16 | ahocorasick_add_patterns($c, array(array('key'=>'ad', 'value'=>'gamma', 'aux'=>array(1)))); 17 | ahocorasick_add_patterns($c, array(array('key'=>'ae', 'value'=>'delta'))); 18 | ahocorasick_add_patterns($c, array(array('id'=>0, 'value'=>'zeta'), 19 | array('key'=>'ag', 'value'=>'omega'), 20 | array('value'=>'lfa'))); 21 | 22 | 23 | // perform search 1 24 | $d1 = ahocorasick_match("alFABETA gamma zetaomegaalfa!", $c); 25 | //unset($d1); 26 | 27 | // deinitialize search structure (will free memory) 28 | ahocorasick_deinit($c); 29 | 30 | var_dump($d1); 31 | ?> 32 | --EXPECT-- 33 | 34 | array(5) { 35 | [0]=> 36 | array(5) { 37 | ["pos"]=> 38 | int(14) 39 | ["key"]=> 40 | string(2) "ad" 41 | ["aux"]=> 42 | array(1) { 43 | [0]=> 44 | int(1) 45 | } 46 | ["start_postion"]=> 47 | int(9) 48 | ["value"]=> 49 | string(5) "gamma" 50 | } 51 | [1]=> 52 | array(4) { 53 | ["pos"]=> 54 | int(19) 55 | ["keyIdx"]=> 56 | int(0) 57 | ["start_postion"]=> 58 | int(15) 59 | ["value"]=> 60 | string(4) "zeta" 61 | } 62 | [2]=> 63 | array(4) { 64 | ["pos"]=> 65 | int(24) 66 | ["key"]=> 67 | string(2) "ag" 68 | ["start_postion"]=> 69 | int(19) 70 | ["value"]=> 71 | string(5) "omega" 72 | } 73 | [3]=> 74 | array(4) { 75 | ["pos"]=> 76 | int(28) 77 | ["key"]=> 78 | string(2) "ab" 79 | ["start_postion"]=> 80 | int(24) 81 | ["value"]=> 82 | string(4) "alfa" 83 | } 84 | [4]=> 85 | array(3) { 86 | ["pos"]=> 87 | int(28) 88 | ["start_postion"]=> 89 | int(25) 90 | ["value"]=> 91 | string(3) "lfa" 92 | } 93 | } 94 | -------------------------------------------------------------------------------- 
/tests/test4.phpt: -------------------------------------------------------------------------------- 1 | --TEST-- 2 | Test 4 3 | --SKIPIF-- 4 | 5 | --FILE-- 6 | 'a5')); 13 | 14 | 15 | for($out = 0; $out < 20; ++$out){ 16 | printf("\nT $out\n"); 17 | $c = ahocorasick_init($data); 18 | 19 | for ($i = 0; $i < 1000; ++$i) { 20 | $d = ahocorasick_match($s, $c); 21 | if (!$d || sizeof($d) != 4) 22 | throw new Exception('Unexpected result!'); 23 | printf('.'); 24 | } 25 | ahocorasick_deinit($c); 26 | $c = 0; 27 | } 28 | printf("\n"); 29 | ?> 30 | --EXPECT-- 31 | 32 | T 0 33 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
34 | T 1 35 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
36 | T 2 37 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
38 | T 3 39 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
40 | T 4 41 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
42 | T 5 43 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
44 | T 6 45 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
46 | T 7 47 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
48 | T 8 49 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
50 | T 9 51 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
52 | T 10 53 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
54 | T 11 55 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
56 | T 12 57 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
58 | T 13 59 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
60 | T 14 61 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
62 | T 15 63 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
64 | T 16 65 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
66 | T 17 67 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
68 | T 18 69 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
70 | T 19 71 | ........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................ 
72 | -------------------------------------------------------------------------------- /tests/test5.phpt: -------------------------------------------------------------------------------- 1 | --TEST-- 2 | Test 5 3 | --SKIPIF-- 4 | 5 | --FILE-- 6 | "熊本県熊本市北区四方寄町", "value" => "北区四方寄町"], 13 | ["key" => "熊本県熊本市北区立福寺町", "value" => "北区立福寺町"] 14 | ]; 15 | 16 | $list = [ 17 | "東京都東京都", 18 | "兵庫県兵庫県", 19 | "奈良県奈良県", 20 | "兵庫県兵庫県", 21 | "兵庫県兵庫県", 22 | "兵庫県兵庫県", 23 | "兵庫県兵庫県", 24 | "埼玉県埼玉県", 25 | "兵庫県兵庫県", 26 | "兵庫県兵庫県", 27 | "兵庫県兵庫県", 28 | "東京都東京都", 29 | "愛知県、大阪府愛知県", 30 | "墨田区錦糸町駅前東京都墨田区錦糸町駅", 31 | "東京都渋谷区東京都渋谷区" 32 | ]; 33 | 34 | $c = ahocorasick_init($data); 35 | foreach ($list as $keyword) { 36 | var_dump($keyword); 37 | $matchedAddressList = ahocorasick_match($keyword, $c); 38 | } 39 | ?> 40 | --EXPECT-- 41 | 42 | string(18) "東京都東京都" 43 | string(18) "兵庫県兵庫県" 44 | string(18) "奈良県奈良県" 45 | string(18) "兵庫県兵庫県" 46 | string(18) "兵庫県兵庫県" 47 | string(18) "兵庫県兵庫県" 48 | string(18) "兵庫県兵庫県" 49 | string(18) "埼玉県埼玉県" 50 | string(18) "兵庫県兵庫県" 51 | string(18) "兵庫県兵庫県" 52 | string(18) "兵庫県兵庫県" 53 | string(18) "東京都東京都" 54 | string(30) "愛知県、大阪府愛知県" 55 | string(54) "墨田区錦糸町駅前東京都墨田区錦糸町駅" 56 | string(36) "東京都渋谷区東京都渋谷区" -------------------------------------------------------------------------------- /tests/test6.phpt: -------------------------------------------------------------------------------- 1 | --TEST-- 2 | Test 6 3 | --SKIPIF-- 4 | 5 | --FILE-- 6 | 'a', 'value' => 'abcd'], 14 | ['key' => 'b', 'value' => 'ghij'], 15 | ['key' => 'c', 'value' => 'defg'], 16 | ['key' => 'd', 'value' => 'defghijkl'] 17 | ]; 18 | 19 | $c = ahocorasick_init($data); 20 | 21 | $firstText = "abcde"; 22 | $secondText = "fghij"; 23 | $thirdText = "klmno"; 24 | 25 | $firstResult = ahocorasick_match($firstText, $c); 26 | print("-----\n"); 27 | $secondResult = ahocorasick_match($secondText, $c); 28 | print("-----\n"); 29 | $thirdResult = ahocorasick_match($thirdText, $c); 30 | print("-----\n"); 31 | 32 | 
ahocorasick_deinit($c); 33 | 34 | var_dump($firstResult); 35 | print "\n"; 36 | var_dump($secondResult); 37 | print "\n"; 38 | var_dump($thirdResult); 39 | ?> 40 | --EXPECT-- 41 | ----- 42 | ----- 43 | ----- 44 | array(1) { 45 | [0]=> 46 | array(4) { 47 | ["pos"]=> 48 | int(4) 49 | ["key"]=> 50 | string(1) "a" 51 | ["start_postion"]=> 52 | int(0) 53 | ["value"]=> 54 | string(4) "abcd" 55 | } 56 | } 57 | 58 | array(1) { 59 | [0]=> 60 | array(4) { 61 | ["pos"]=> 62 | int(5) 63 | ["key"]=> 64 | string(1) "b" 65 | ["start_postion"]=> 66 | int(1) 67 | ["value"]=> 68 | string(4) "ghij" 69 | } 70 | } 71 | 72 | array(0) { 73 | } 74 | --------------------------------------------------------------------------------