├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── config
├── filter
    ├── config
    └── ngx_http_zstd_filter_module.c
├── static
    ├── config
    └── ngx_http_zstd_static_module.c
├── t
    ├── 00-filter.t
    ├── 01-static.t
    └── suite
    │   ├── test
    │   └── test.zst
└── valgrind.suppress


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.t linguist-language=Text
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Object files
 5 | *.o
 6 | *.ko
 7 | *.obj
 8 | *.elf
 9 | 
10 | # Linker output
11 | *.ilk
12 | *.map
13 | *.exp
14 | 
15 | # Precompiled Headers
16 | *.gch
17 | *.pch
18 | 
19 | # Libraries
20 | *.lib
21 | *.a
22 | *.la
23 | *.lo
24 | 
25 | # Shared objects (inc. Windows DLLs)
26 | *.dll
27 | *.so
28 | *.so.*
29 | *.dylib
30 | 
31 | # Executables
32 | *.exe
33 | *.out
34 | *.app
35 | *.i*86
36 | *.x86_64
37 | *.hex
38 | 
39 | # Debug files
40 | *.dSYM/
41 | *.su
42 | *.idb
43 | *.pdb
44 | 
45 | # Kernel Module Compile Results
46 | *.mod*
47 | *.cmd
48 | .tmp_versions/
49 | modules.order
50 | Module.symvers
51 | Mkfile.old
52 | dkms.conf
53 | 
54 | t/servroot/*
55 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 2-Clause License
 2 | 
 3 | Copyright (c) 2018, Alex Zhang
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Name
  2 | zstd-nginx-module - Nginx module for the [Zstandard compression](https://facebook.github.io/zstd/).
  3 | 
  4 | # Table of Contents
  5 | 
  6 | * [Name](#name)
  7 | * [Status](#status)
  8 | * [Synopsis](#synopsis)
  9 | * [Installation](#installation)
 10 | * [Directives](#directives)
 11 |   * [ngx_http_zstd_filter_module](#ngx_http_zstd_filter_module)
 12 |     * [zstd_dict_file](#zstd_dict_file)
 13 |     * [zstd](#zstd)
 14 |     * [zstd_comp_level](#zstd_comp_level)
 15 |     * [zstd_min_length](#zstd_min_length)
 16 |     * [zstd_types](#zstd_types)
 17 |     * [zstd_buffers](#zstd_buffers)
 18 |   * [ngx_http_zstd_static_module](#ngx_http_zstd_static_module)
 19 |     * [zstd_static](#zstd_static)
 20 | * [Variables](#variables)
 21 |   * [ngx_http_zstd_filter_module](#ngx_http_zstd_filter_module-1)
 22 |     * [$zstd_ratio](#zstd_ratio)
 23 | * [Author](#author)
 24 | 
 25 | # Status
 26 | 
 27 | This Nginx module is currently considered experimental. Issues and PRs are welcome if you encounter any problems.
 28 | 
 29 | # Synopsis
 30 | 
 31 | ```nginx
 32 | 
 33 | # specify the dictionary
 34 | zstd_dict_file /path/to/dict;
 35 | 
 36 | server {
 37 |     listen 127.0.0.1:8080;
 38 |     server_name localhost;
 39 | 
 40 |     location / {
 41 |         # enable zstd compression
 42 |         zstd on;
 43 |         zstd_min_length 256; # no less than 256 bytes
 44 |         zstd_comp_level 3; # set the level to 3
 45 | 
 46 |         proxy_pass http://foo.com;
 47 |     }
 48 | }
 49 | 
 50 | server {
 51 |     listen 127.0.0.1:8081;
 52 |     server_name localhost;
 53 | 
 54 |     location / {
 55 |         zstd_static on;
 56 |         root html;
 57 |     }
 58 | }
 59 | ```
 60 | 
 61 | # Installation
 62 | 
 63 | To use these modules, configure your nginx build with `--add-module=/path/to/zstd-nginx-module`. Keep the following points in mind:
 64 | 
 65 | * You can set the environment variables `ZSTD_INC` and `ZSTD_LIB` to specify the directory containing `zstd.h` and the directory containing the zstd library, respectively.
 66 | * The static library is tried before the dynamic library, since this Nginx module uses some **advanced APIs** for which static linking is recommended.
 67 | * The system's zstd installation will be linked if neither `ZSTD_INC` nor `ZSTD_LIB` is specified.
 68 | * Both `ngx_http_zstd_static_module` and `ngx_http_zstd_filter_module` will be configured.
 69 | 
 70 | # Directives
 71 | 
 72 | ## ngx_http_zstd_filter_module
 73 | 
 74 | The `ngx_http_zstd_filter_module` module is a filter that compresses responses using the "zstd" method. This often helps to reduce the size of transmitted data by half or even more.
 75 | 
 76 | ### zstd_dict_file
 77 | 
 78 | **Syntax:** *zstd_dict_file /path/to/dict;*  
 79 | **Default:** *-*  
 80 | **Context:** *http*  
 81 | 
 82 | Specifies the external dictionary.
 83 | 
 84 | **WARNING:** Be careful! The content-coding registration only specifies a means to signal the use of the zstd format, and does not additionally specify any mechanism for advertising/negotiating/synchronizing the use of a specific dictionary between client and server. Use `zstd_dict_file` only if you can ensure that both ends (server and client) are capable of using the same dictionary (e.g. by advertising it with an HTTP header). See https://github.com/tokers/zstd-nginx-module/issues/2 for the details.
 85 | 
 86 | ### zstd
 87 | 
 88 | **Syntax:** *zstd on | off;*  
 89 | **Default:** *zstd off;*  
 90 | **Context:** *http, server, location, if in location*
 91 | 
 92 | Enables or disables zstd compression for response.
 93 | 
 94 | ### zstd_comp_level
 95 | 
 96 | **Syntax:** *zstd_comp_level level;*  
 97 | **Default:** *zstd_comp_level 1;*  
 98 | **Context:** *http, server, location*
 99 | 
100 | Sets a zstd compression level of a response. Acceptable values are in the range from 1 to `ZSTD_maxCLevel()`.
101 | 
102 | ### zstd_min_length
103 | 
104 | **Syntax:** *zstd_min_length length;*  
105 | **Default:** *zstd_min_length 20;*  
106 | **Context:** *http, server, location*
107 | 
108 | Sets the minimum length of a response that will be compressed by zstd. The length is determined only from the "Content-Length" response header field.
109 | 
110 | ### zstd_types
111 | 
112 | **Syntax:** *zstd_types mime-type ...;*  
113 | **Default:** *zstd_types text/html;*  
114 | **Context:** *http, server, location*
115 | 
116 | Enables zstd compression of responses for the specified MIME types in addition to "text/html". The special value "*" matches any MIME type.
117 | 
118 | ### zstd_buffers
119 | 
120 | **Syntax:** *zstd_buffers number size;*  
121 | **Default:** *zstd_buffers 32 4k | 16 8k;*  
122 | **Context:** *http, server, location*
123 | 
124 | Sets the number and size of buffers used to compress a response. By default, the buffer size is equal to one memory page. This is either 4K or 8K, depending on a platform.
125 | 
126 | ## ngx_http_zstd_static_module
127 | 
128 | The `ngx_http_zstd_static_module` module allows sending precompressed files with the ".zst" filename extension instead of regular files.
129 | 
130 | ### zstd_static
131 | 
132 | **Syntax:** *zstd_static on | off | always;*  
133 | **Default:** *zstd_static off;*  
134 | **Context:** *http, server, location*  
135 | 
136 | Enables ("on") or disables ("off") checking the existence of precompressed files. The following directives are also taken into account: gzip_vary.
137 | 
138 | With the "always" value, the precompressed ".zst" file is used in all cases, without checking if the client supports it.
139 | 
140 | 
141 | # Variables
142 | 
143 | ## ngx_http_zstd_filter_module
144 | 
145 | ### $zstd_ratio
146 | 
147 | Achieved compression ratio, computed as the ratio between the original and compressed response sizes.
148 | 
149 | # Author
150 | 
151 | Alex Zhang (张超) zchao1995@gmail, UPYUN Inc.
152 | 
153 | # License
154 | 
155 | This Nginx module is licensed under [BSD 2-Clause License](LICENSE).
156 | 


--------------------------------------------------------------------------------
/config:
--------------------------------------------------------------------------------
 1 | # Set the umbrella addon name, then source the filter module's build script (filter/config overwrites ngx_addon_name with its own module name).
 2 | ngx_addon_name=ngx_zstd
 3 | . $ngx_addon_dir/filter/config
 4 | 
 5 | # Reset the umbrella addon name before sourcing the static module's build script.
 6 | ngx_addon_name=ngx_zstd
 7 | . $ngx_addon_dir/static/config
 8 | 
 9 | # The final name for reporting: restored last so it is the value left set once both sub-configs have run.
10 | ngx_addon_name=ngx_zstd
11 | 
12 | 


--------------------------------------------------------------------------------
/filter/config:
--------------------------------------------------------------------------------
  1 | ngx_feature_incs="#include <zstd.h>"           # probe source shared by every auto/feature run below
  2 | ngx_feature_test="(void) ZSTD_createCCtx();"
  3 | ngx_feature_libs=
  4 | ngx_feature_run=yes
  5 | 
  6 | ngx_zstd_opt_I=    # compiler flags of the last probe tried (a failed final probe exits)
  7 | ngx_zstd_opt_L=    # linker flags of the last probe tried (a failed final probe exits)
  8 | 
  9 | if [ -n "$ZSTD_INC" -o -n "$ZSTD_LIB" ]; then
 10 |     ngx_feature="ZStandard static library in $ZSTD_INC and $ZSTD_LIB"
 11 |     ngx_feature_path=$ZSTD_INC
 12 | 
 13 |     # try the static library first
 14 |     ngx_zstd_opt_I="-I$ZSTD_INC -DZSTD_STATIC_LINKING_ONLY"
 15 |     ngx_zstd_opt_L="$ZSTD_LIB/libzstd.a"
 16 |     SAVED_CC_TEST_FLAGS=$CC_TEST_FLAGS
 17 |     CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS"
 18 |     SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT
 19 |     NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT"
 20 | 
 21 |     . auto/feature
 22 | 
 23 |     # restore the test flags so the next probe starts from a clean state
 24 |     CC_TEST_FLAGS=$SAVED_CC_TEST_FLAGS
 25 |     NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT
 26 | 
 27 |     if [ $ngx_found = no ]; then
 28 |         # then try the dynamic library; nothing may follow "-rpath," but the path, or the rpath is empty and the directory becomes a linker input
 29 |         ngx_feature="ZStandard dynamic library in $ZSTD_INC and $ZSTD_LIB"
 30 |         ngx_zstd_opt_L="-L$ZSTD_LIB -lzstd -Wl,-rpath,$ZSTD_LIB"
 31 | 
 32 |         SAVED_CC_TEST_FLAGS=$CC_TEST_FLAGS
 33 |         CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS"
 34 |         SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT
 35 |         NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT"
 36 | 
 37 |         . auto/feature
 38 | 
 39 |         # restore
 40 |         CC_TEST_FLAGS=$SAVED_CC_TEST_FLAGS
 41 |         NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT
 42 | 
 43 |         if [ $ngx_found = no ]; then
 44 |             cat << END
 45 |             $0: error: ngx_http_zstd_filter_module requires the ZStandard library, please be sure that "\$ZSTD_INC" and "\$ZSTD_LIB" are set correctly.
 46 | END
 47 |             exit 1
 48 |         fi
 49 | 
 50 |     fi
 51 | else
 52 |     # auto-discovery: neither ZSTD_INC nor ZSTD_LIB is set, rely on the default search paths
 53 |     ngx_feature="ZStandard static library"
 54 |     ngx_zstd_opt_I="-DZSTD_STATIC_LINKING_ONLY"
 55 |     ngx_zstd_opt_L="-l:libzstd.a"
 56 | 
 57 |     # the static library is still tried first
 58 |     SAVED_CC_TEST_FLAGS=$CC_TEST_FLAGS
 59 |     CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS"
 60 |     SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT
 61 |     NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT"
 62 | 
 63 |     . auto/feature
 64 | 
 65 |     # restore
 66 |     CC_TEST_FLAGS=$SAVED_CC_TEST_FLAGS
 67 |     NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT
 68 | 
 69 |     if [ $ngx_found = no ]; then
 70 | 
 71 |         ngx_feature="ZStandard dynamic library"
 72 |         ngx_zstd_opt_L="-lzstd"
 73 |         SAVED_CC_TEST_FLAGS=$CC_TEST_FLAGS
 74 |         CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS"
 75 |         SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT
 76 |         NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT"
 77 | 
 78 |         . auto/feature
 79 | 
 80 |         if [ $ngx_found = no ]; then
 81 |             cat << END
 82 |             $0: error: ngx_http_zstd_filter_module requires the ZStandard library.
 83 | END
 84 |             exit 1
 85 |         fi
 86 | 
 87 |         # restore
 88 |         CC_TEST_FLAGS=$SAVED_CC_TEST_FLAGS
 89 |         NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT
 90 | 
 91 |         cat << END
 92 |         $0: warning: ngx_http_zstd_filter_module uses advanced ZStandard APIs (which are still considered experimental) while you are trying to link the dynamic shared library.
 93 | END
 94 |     fi
 95 | 
 96 |     # TODO we need more tries for the different OS port.
 97 | fi
 98 | 
 99 | NGX_LD_OPT="$ngx_zstd_opt_L $NGX_LD_OPT"    # prepend the zstd linker flags chosen by the detection above
100 | 
101 | HTTP_ZSTD_SRCS="$ngx_addon_dir/filter/ngx_http_zstd_filter_module.c"
102 | 
103 | ngx_addon_name=ngx_http_zstd_filter_module    # overwrites the name set by the top-level config
104 | ngx_module_type=HTTP_FILTER
105 | ngx_module_name=ngx_http_zstd_filter_module
106 | ngx_module_incs="$ngx_zstd_opt_I"    # NOTE(review): holds compiler flags (-I/-D), not bare directories -- confirm auto/module accepts this
107 | ngx_module_srcs=$HTTP_ZSTD_SRCS
108 | ngx_module_libs=$NGX_LD_OPT    # NOTE(review): this is the whole NGX_LD_OPT, not only the zstd flags -- confirm intended
109 | ngx_module_order="$ngx_module_name \
110 |                   ngx_pagespeed \
111 |                   ngx_http_postpone_filter_module \
112 |                   ngx_http_ssi_filter_module \
113 |                   ngx_http_charset_filter_module \
114 |                   ngx_http_xslt_filter_module \
115 |                   ngx_http_image_filter_module \
116 |                   ngx_http_sub_filter_module \
117 |                   ngx_http_addition_filter_module \
118 |                   ngx_http_gunzip_filter_module \
119 |                   ngx_http_userid_filter_module \
120 |                   ngx_http_headers_filter_module \
121 |                   ngx_http_copy_filter_module \
122 |                   ngx_http_range_body_filter_module \
123 |                   ngx_http_not_modified_filter_module \
124 |                   ngx_http_slice_filter_module"
125 | 
126 | . auto/module    # source nginx's auto/module with the ngx_module_* variables set above
127 | 
128 | if [ "$ngx_module_link" != DYNAMIC ]; then
129 |     # ngx_module_order doesn't work with static modules, so move this module
130 |     # by hand: drop it from HTTP_FILTER_MODULES and re-insert it right after $next.
131 | 
132 |     if [ "$HTTP_GZIP" = YES ]; then
133 |         next=ngx_http_gzip_filter_module    # first choice: place it right after the gzip filter
134 |     elif echo $HTTP_FILTER_MODULES | grep pagespeed_etag_filter >/dev/null; then
135 |         next=ngx_pagespeed_etag_filter    # no gzip, but the pagespeed etag filter is in the list
136 |     else
137 |         next=ngx_http_range_header_filter_module    # fallback anchor
138 |     fi
139 | 
140 |     HTTP_FILTER_MODULES=`echo $HTTP_FILTER_MODULES \
141 |                          | sed "s/$ngx_module_name//" \
142 |                          | sed "s/$next/$next $ngx_module_name/"`
143 | fi
144 | 
145 | 


--------------------------------------------------------------------------------
/filter/ngx_http_zstd_filter_module.c:
--------------------------------------------------------------------------------
   1 | 
   2 | /*
   3 |  * Copyright (C) Alex Zhang
   4 |  */
   5 | 
   6 | 
   7 | #include <ngx_config.h>
   8 | #include <ngx_core.h>
   9 | #include <ngx_http.h>
  10 | 
  11 | #include <zstd.h>
  12 | 
  13 | 
  14 | #define NGX_HTTP_ZSTD_FILTER_COMPRESS       0
  15 | #define NGX_HTTP_ZSTD_FILTER_FLUSH          1
  16 | #define NGX_HTTP_ZSTD_FILTER_END            2
  17 | 
  18 | 
  19 | typedef struct {
  20 |     ngx_str_t                    dict_file;    /* value of "zstd_dict_file" */
  21 | } ngx_http_zstd_main_conf_t;
  22 | 
  23 | 
  24 | typedef struct {
  25 |     ngx_flag_t                   enable;       /* "zstd" on | off */
  26 |     ngx_int_t                    level;        /* "zstd_comp_level" */
  27 |     ssize_t                      min_length;   /* "zstd_min_length" */
  28 | 
  29 |     ngx_hash_t                   types;        /* tested by ngx_http_test_content_type() in the header filter */
  30 | 
  31 |     ngx_bufs_t                   bufs;         /* "zstd_buffers" number and size */
  32 | 
  33 |     ngx_array_t                 *types_keys;   /* "zstd_types" */
  34 | 
  35 |     ZSTD_CDict                  *dict;         /* presumably built from "zstd_dict_file" -- TODO confirm where it is set */
  36 | } ngx_http_zstd_loc_conf_t;
  37 | 
  38 | 
  39 | typedef struct {
  40 |     ngx_chain_t                 *in;           /* input chain not yet consumed */
  41 |     ngx_chain_t                 *free;         /* free list kept by ngx_chain_update_chains() */
  42 |     ngx_chain_t                 *busy;         /* busy list kept by ngx_chain_update_chains() */
  43 |     ngx_chain_t                 *out;          /* compressed chain for the next body filter */
  44 |     ngx_chain_t                **last_out;     /* where the next output link is appended */
  45 | 
  46 |     ngx_buf_t                   *in_buf;       /* input buf currently being consumed */
  47 |     ngx_buf_t                   *out_buf;      /* output buf currently being filled */
  48 |     ngx_int_t                    bufs;         /* presumably output buffers allocated so far -- TODO confirm */
  49 | 
  50 |     ZSTD_inBuffer                buffer_in;    /* input descriptor passed to ZSTD_compressStream() */
  51 |     ZSTD_outBuffer               buffer_out;   /* output descriptor passed to the ZSTD_*Stream() calls */
  52 | 
  53 |     ZSTD_CStream                *cstream;      /* created on the first body filter call */
  54 | 
  55 |     ngx_http_request_t          *request;      /* set in the header filter */
  56 | 
  57 |     size_t                       bytes_in;     /* presumably total input bytes -- TODO confirm where it is updated */
  58 |     size_t                       bytes_out;    /* sum of the sizes of the emitted output bufs */
  59 | 
  60 |     unsigned                     action:2;     /* one of NGX_HTTP_ZSTD_FILTER_* */
  61 |     unsigned                     last:1;       /* a last_buf input buf was taken */
  62 |     unsigned                     redo:1;       /* call the compressor again before reading new input */
  63 |     unsigned                     flush:1;      /* a flush input buf was taken */
  64 |     unsigned                     done:1;       /* last output emitted, or the filter failed */
  65 |     unsigned                     nomem:1;      /* body filter flushes busy bufs first when set */
  66 | } ngx_http_zstd_ctx_t;
  67 | 
  68 | 
  69 | typedef struct {
  70 |     ngx_conf_post_handler_pt  post_handler;    /* passed as the "post" field of the zstd_comp_level command */
  71 | } ngx_http_zstd_comp_level_bounds_t;
  72 | 
  73 | 
  74 | static ngx_http_output_header_filter_pt  ngx_http_next_header_filter;
  75 | static ngx_http_output_body_filter_pt  ngx_http_next_body_filter;
  76 | 
  77 | static ngx_str_t  ngx_http_zstd_ratio = ngx_string("zstd_ratio");
  78 | 
  79 | 
  80 | static ngx_int_t ngx_http_zstd_header_filter(ngx_http_request_t *r);
  81 | static ngx_int_t ngx_http_zstd_body_filter(ngx_http_request_t *r,
  82 |     ngx_chain_t *in);
  83 | static ngx_int_t ngx_http_zstd_filter_add_data(ngx_http_request_t *r,
  84 |     ngx_http_zstd_ctx_t *ctx);
  85 | static ngx_int_t ngx_http_zstd_filter_get_buf(ngx_http_request_t *r,
  86 |     ngx_http_zstd_ctx_t *ctx);
  87 | static ZSTD_CStream *ngx_http_zstd_filter_create_cstream(ngx_http_request_t *r,
  88 |     ngx_http_zstd_ctx_t *ctx);
  89 | static ngx_int_t ngx_http_zstd_filter_compress(ngx_http_request_t *r,
  90 |     ngx_http_zstd_ctx_t *ctx);
  91 | static ngx_int_t ngx_http_zstd_accept_encoding(ngx_str_t *ae);
  92 | static ngx_int_t ngx_http_zstd_ok(ngx_http_request_t *r);
  93 | static ngx_int_t ngx_http_zstd_filter_init(ngx_conf_t *cf);
  94 | static void * ngx_http_zstd_create_main_conf(ngx_conf_t *cf);
  95 | static char *ngx_http_zstd_init_main_conf(ngx_conf_t *cf, void *conf);
  96 | static void *ngx_http_zstd_create_loc_conf(ngx_conf_t *cf);
  97 | static char *ngx_http_zstd_merge_loc_conf(ngx_conf_t *cf, void *parent,
  98 |     void *child);
  99 | static ngx_int_t ngx_http_zstd_add_variables(ngx_conf_t *cf);
 100 | static ngx_int_t ngx_http_zstd_ratio_variable(ngx_http_request_t *r,
 101 |     ngx_http_variable_value_t *vv, uintptr_t data);
 102 | static void * ngx_http_zstd_filter_alloc(void *opaque, size_t size);
 103 | static void ngx_http_zstd_filter_free(void *opaque, void *address);
 104 | static char *ngx_http_zstd_comp_level(ngx_conf_t *cf, void *post, void *data);
 105 | static char *ngx_conf_zstd_set_num_slot_with_negatives(ngx_conf_t *cf, ngx_command_t *cmd, void *conf);
 106 | 
 107 | 
 108 | static ngx_http_zstd_comp_level_bounds_t  ngx_http_zstd_comp_level_bounds = {
 109 |     ngx_http_zstd_comp_level
 110 | };
 111 | 
 112 | 
 113 | static ngx_command_t  ngx_http_zstd_filter_commands[] = {
 114 | 
 115 |     { ngx_string("zstd"),                     /* on | off switch */
 116 |       NGX_HTTP_MAIN_CONF|NGX_HTTP_SRV_CONF|NGX_HTTP_LOC_CONF|NGX_HTTP_LIF_CONF
 117 |       |NGX_CONF_FLAG,
 118 |       ngx_conf_set_flag_slot,
 119 |       NGX_HTTP_LOC_CONF_OFFSET,
 120 |       offsetof(ngx_http_zstd_loc_conf_t, enable),
 121 |       NULL },
 122 | 
 123 |     { ngx_string("zstd_comp_level"),          /* level, post-processed by ngx_http_zstd_comp_level */
 124 |       NGX_HTTP_MAIN_CONF|NGX_HTTP_SRV_CONF|NGX_HTTP_LOC_CONF|NGX_CONF_TAKE1,
 125 |       ngx_conf_zstd_set_num_slot_with_negatives,
 126 |       NGX_HTTP_LOC_CONF_OFFSET,
 127 |       offsetof(ngx_http_zstd_loc_conf_t, level),
 128 |       &ngx_http_zstd_comp_level_bounds },
 129 | 
 130 |     { ngx_string("zstd_types"),               /* one or more MIME types */
 131 |       NGX_HTTP_MAIN_CONF|NGX_HTTP_SRV_CONF|NGX_HTTP_LOC_CONF|NGX_CONF_1MORE,
 132 |       ngx_http_types_slot,
 133 |       NGX_HTTP_LOC_CONF_OFFSET,
 134 |       offsetof(ngx_http_zstd_loc_conf_t, types_keys),
 135 |       &ngx_http_html_default_types[0] },
 136 | 
 137 |     { ngx_string("zstd_buffers"),             /* number and size */
 138 |       NGX_HTTP_MAIN_CONF|NGX_HTTP_SRV_CONF|NGX_HTTP_LOC_CONF|NGX_CONF_TAKE2,
 139 |       ngx_conf_set_bufs_slot,
 140 |       NGX_HTTP_LOC_CONF_OFFSET,
 141 |       offsetof(ngx_http_zstd_loc_conf_t, bufs),
 142 |       NULL },
 143 | 
 144 |     { ngx_string("zstd_min_length"),          /* exactly one size: the size slot stores a single value */
 145 |       NGX_HTTP_MAIN_CONF|NGX_HTTP_SRV_CONF|NGX_HTTP_LOC_CONF|NGX_CONF_TAKE1,
 146 |       ngx_conf_set_size_slot,
 147 |       NGX_HTTP_LOC_CONF_OFFSET,
 148 |       offsetof(ngx_http_zstd_loc_conf_t, min_length),
 149 |       NULL },
 150 | 
 151 |     { ngx_string("zstd_dict_file"),           /* http-level only: path of the dictionary */
 152 |       NGX_HTTP_MAIN_CONF|NGX_CONF_TAKE1,
 153 |       ngx_conf_set_str_slot,
 154 |       NGX_HTTP_MAIN_CONF_OFFSET,
 155 |       offsetof(ngx_http_zstd_main_conf_t, dict_file),
 156 |       NULL },
 157 | 
 158 |     ngx_null_command
 159 | };
 160 | 
 161 | 
 162 | static ngx_http_module_t  ngx_http_zstd_filter_module_ctx = {
 163 |     ngx_http_zstd_add_variables,            /* preconfiguration */
 164 |     ngx_http_zstd_filter_init,              /* postconfiguration */
 165 | 
 166 |     ngx_http_zstd_create_main_conf,         /* create main configuration */
 167 |     ngx_http_zstd_init_main_conf,           /* init main configuration */
 168 | 
 169 |     NULL,                                   /* create server configuration */
 170 |     NULL,                                   /* merge server configuration */
 171 | 
 172 |     ngx_http_zstd_create_loc_conf,          /* create location configuration */
 173 |     ngx_http_zstd_merge_loc_conf,           /* merge location configuration */
 174 | };
 175 | 
 176 | 
 177 | ngx_module_t  ngx_http_zstd_filter_module = {
 178 |     NGX_MODULE_V1,
 179 |     &ngx_http_zstd_filter_module_ctx,       /* module context */
 180 |     ngx_http_zstd_filter_commands,          /* module directives */
 181 |     NGX_HTTP_MODULE,                        /* module type */
 182 |     NULL,                                   /* init master */
 183 |     NULL,                                   /* init module */
 184 |     NULL,                                   /* init process */
 185 |     NULL,                                   /* init thread */
 186 |     NULL,                                   /* exit thread */
 187 |     NULL,                                   /* exit process */
 188 |     NULL,                                   /* exit master */
 189 |     NGX_MODULE_V1_PADDING
 190 | };
 191 | 
 192 | 
 193 | static ngx_int_t
 194 | ngx_http_zstd_header_filter(ngx_http_request_t *r)
 195 | {
 196 |     ngx_table_elt_t           *h;
 197 |     ngx_http_zstd_loc_conf_t  *zlcf;
 198 |     ngx_http_zstd_ctx_t       *ctx;
 199 |     /* header filter: either mark the response as zstd-encoded and create the filter ctx, or pass it on untouched */
 200 |     zlcf = ngx_http_get_module_loc_conf(r, ngx_http_zstd_filter_module);
 201 | 
 202 |     if (!zlcf->enable                                         /* zstd is off here */
 203 |         || (r->headers_out.status != NGX_HTTP_OK              /* only 200, 403 and 404 are compressed */
 204 |             && r->headers_out.status != NGX_HTTP_FORBIDDEN
 205 |             && r->headers_out.status != NGX_HTTP_NOT_FOUND)
 206 |        || (r->headers_out.content_encoding                    /* already has a Content-Encoding */
 207 |            && r->headers_out.content_encoding->value.len)
 208 |        || (r->headers_out.content_length_n != -1              /* known length below zstd_min_length */
 209 |            && r->headers_out.content_length_n < zlcf->min_length)
 210 |        || ngx_http_test_content_type(r, &zlcf->types) == NULL /* MIME type not in zstd_types */
 211 |        || r->header_only)
 212 |     {
 213 |         return ngx_http_next_header_filter(r);
 214 |     }
 215 | 
 216 |     r->gzip_vary = 1;    /* set before the client check below, so it also applies when zstd is declined */
 217 | 
 218 |     if (ngx_http_zstd_ok(r) != NGX_OK) {    /* presumably the client capability check -- confirm in ngx_http_zstd_ok() */
 219 |         return ngx_http_next_header_filter(r);
 220 |     }
 221 | 
 222 |     ctx = ngx_pcalloc(r->pool, sizeof(ngx_http_zstd_ctx_t));
 223 |     if (ctx == NULL) {
 224 |         return NGX_ERROR;
 225 |     }
 226 | 
 227 |     ngx_http_set_ctx(r, ctx, ngx_http_zstd_filter_module);    /* the body filter only acts when this ctx exists */
 228 | 
 229 |     ctx->request = r;
 230 |     ctx->last_out = &ctx->out;    /* empty output chain: the tail pointer is the head */
 231 | 
 232 |     h = ngx_list_push(&r->headers_out.headers);
 233 |     if (h == NULL) {
 234 |         return NGX_ERROR;
 235 |     }
 236 | 
 237 |     h->hash = 1;    /* NOTE(review): nginx >= 1.23.0 adds ngx_table_elt_t.next -- confirm whether h->next must be cleared here */
 238 |     ngx_str_set(&h->key, "Content-Encoding");
 239 |     ngx_str_set(&h->value, "zstd");
 240 |     r->headers_out.content_encoding = h;
 241 | 
 242 |     r->main_filter_need_in_memory = 1;
 243 | 
 244 |     ngx_http_clear_content_length(r);    /* the compressed size is not known yet */
 245 |     ngx_http_clear_accept_ranges(r);
 246 |     ngx_http_weak_etag(r);
 247 | 
 248 |     return ngx_http_next_header_filter(r);
 249 | }
 250 | 
 251 | 
 252 | static ngx_int_t
 253 | ngx_http_zstd_body_filter(ngx_http_request_t *r, ngx_chain_t *in)
 254 | {
 255 |     size_t                rv;
 256 |     ngx_int_t             flush, rc;
 257 |     ngx_chain_t          *cl;
 258 |     ngx_http_zstd_ctx_t  *ctx;
 259 | 
 260 |     /* body filter: queue "in", compress it with zstd and pass the resulting chain to the next body filter */
 261 |     ctx = ngx_http_get_module_ctx(r, ngx_http_zstd_filter_module);
 262 | 
 263 |     if (ctx == NULL || ctx->done || r->header_only) {    /* not selected by the header filter, or already finished */
 264 |         return ngx_http_next_body_filter(r, in);
 265 |     }
 266 | 
 267 |     ngx_log_debug0(NGX_LOG_DEBUG_HTTP, r->connection->log, 0,
 268 |                    "http zstd filter");
 269 | 
 270 |     if (ctx->cstream == NULL) {    /* first call: create the compression stream */
 271 |         ctx->cstream = ngx_http_zstd_filter_create_cstream(r, ctx);
 272 |         if (ctx->cstream == NULL) {
 273 |             goto failed;
 274 |         }
 275 |     }
 276 | 
 277 |     if (in) {
 278 |         if (ngx_chain_add_copy(r->pool, &ctx->in, in) != NGX_OK) {
 279 |             goto failed;
 280 |         }
 281 | 
 282 |         r->connection->buffered |= NGX_HTTP_GZIP_BUFFERED;    /* reuses the gzip "buffered" bit */
 283 |     }
 284 | 
 285 |     if (ctx->nomem) {
 286 | 
 287 |         /* flush busy buffers, then move the ones that were sent to the free list */
 288 | 
 289 |         if (ngx_http_next_body_filter(r, NULL) == NGX_ERROR) {
 290 |             goto failed;
 291 |         }
 292 | 
 293 |         cl = NULL;
 294 | 
 295 |         ngx_chain_update_chains(r->pool, &ctx->free, &ctx->busy, &cl,
 296 |                                 (ngx_buf_tag_t) &ngx_http_zstd_filter_module);
 297 | 
 298 |         flush = 0;
 299 |         ctx->nomem = 0;
 300 | 
 301 |     } else {
 302 |         flush = ctx->busy ? 1 : 0;    /* busy buffers pending: call the next filter even without new output */
 303 |     }
 304 | 
 305 |     for ( ;; ) {
 306 | 
 307 |         /* cycle while we can write to a client */
 308 | 
 309 |         for ( ;; ) {
 310 | 
 311 |             rc = ngx_http_zstd_filter_add_data(r, ctx);
 312 | 
 313 |             if (rc == NGX_DECLINED) {    /* no input left */
 314 |                 break;
 315 |             }
 316 | 
 317 |             if (rc == NGX_AGAIN) {
 318 |                 continue;
 319 |             }
 320 | 
 321 |             rc = ngx_http_zstd_filter_get_buf(r, ctx);
 322 | 
 323 |             if (rc == NGX_ERROR) {
 324 |                 goto failed;
 325 |             }
 326 | 
 327 |             if (rc == NGX_DECLINED) {    /* no output buffer obtained */
 328 |                 break;
 329 |             }
 330 | 
 331 |             rc = ngx_http_zstd_filter_compress(r, ctx);
 332 | 
 333 |             if (rc == NGX_ERROR) {
 334 |                 goto failed;
 335 |             }
 336 | 
 337 |             if (rc == NGX_OK) {    /* the compressor emitted the final buffer */
 338 |                 break;
 339 |             }
 340 | 
 341 |             /* rc == NGX_AGAIN */
 342 |         }
 343 | 
 344 |         if (ctx->out == NULL && !flush) {    /* nothing to send and nothing to flush */
 345 |             return ctx->busy ? NGX_AGAIN : NGX_OK;
 346 |         }
 347 | 
 348 |         rc = ngx_http_next_body_filter(r, ctx->out);
 349 | 
 350 |         if (rc == NGX_ERROR) {
 351 |             goto failed;
 352 |         }
 353 | 
 354 |         ngx_chain_update_chains(r->pool, &ctx->free, &ctx->busy, &ctx->out,
 355 |                                 (ngx_buf_tag_t) &ngx_http_zstd_filter_module);
 356 | 
 357 |         ctx->last_out = &ctx->out;
 358 |         ctx->nomem = 0;
 359 |         flush = 0;
 360 | 
 361 |         if (ctx->done) {
 362 |             rv = ZSTD_freeCStream(ctx->cstream);
 363 |             if (ZSTD_isError(rv)) {
 364 |                 ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
 365 |                               "ZSTD_freeCStream() failed: %s",
 366 |                               ZSTD_getErrorName(rv));    /* rv is the zstd code; rc is the next filter's result */
 367 | 
 368 |                 rc = NGX_ERROR;
 369 |             }
 370 | 
 371 |             return rc;
 372 |         }
 373 |     }
 374 | 
 375 | failed:
 376 | 
 377 |     ctx->done = 1;    /* later calls take the pass-through branch at the top */
 378 |     rv = ZSTD_freeCStream(ctx->cstream);
 379 |     if (ZSTD_isError(rv)) {
 380 |         ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
 381 |                       "ZSTD_freeCStream() failed: %s", ZSTD_getErrorName(rv));
 382 |     }
 383 | 
 384 |     return NGX_ERROR;
 385 | }
 386 | 
 387 | 
 388 | static ngx_int_t
 389 | ngx_http_zstd_filter_compress(ngx_http_request_t *r, ngx_http_zstd_ctx_t *ctx)
 390 | {
 391 |     size_t        rc, pos_in, pos_out;
 392 |     char         *hint;
 393 |     ngx_chain_t  *cl;
 394 |     ngx_buf_t    *b;
 395 |     /* run one zstd streaming call chosen by ctx->action and, if it produced data, append ctx->out_buf to the output chain */
 396 |     ngx_log_debug8(NGX_LOG_DEBUG_HTTP, r->connection->log, 0,
 397 |                    "zstd compress in: src:%p pos:%uz size: %uz, "
 398 |                    "dst:%p pos:%uz size:%uz flush:%d redo:%d",
 399 |                    ctx->buffer_in.src, ctx->buffer_in.pos, ctx->buffer_in.size,
 400 |                    ctx->buffer_out.dst, ctx->buffer_out.pos,
 401 |                    ctx->buffer_out.size, ctx->flush, ctx->redo);
 402 | 
 403 |     pos_in = ctx->buffer_in.pos;      /* remembered to measure what this call consumed */
 404 |     pos_out = ctx->buffer_out.pos;    /* remembered to measure what this call produced */
 405 | 
 406 |     switch (ctx->action) {
 407 | 
 408 |     case NGX_HTTP_ZSTD_FILTER_FLUSH:
 409 |         hint = "ZSTD_flushStream() ";
 410 |         rc = ZSTD_flushStream(ctx->cstream, &ctx->buffer_out);
 411 |         break;
 412 | 
 413 |     case NGX_HTTP_ZSTD_FILTER_END:
 414 |         hint = "ZSTD_endStream() ";
 415 |         rc = ZSTD_endStream(ctx->cstream, &ctx->buffer_out);
 416 |         break;
 417 | 
 418 |     default:    /* NGX_HTTP_ZSTD_FILTER_COMPRESS */
 419 |         hint = "ZSTD_compressStream() ";
 420 |         rc = ZSTD_compressStream(ctx->cstream, &ctx->buffer_out,
 421 |                                  &ctx->buffer_in);
 422 |         break;
 423 |     }
 424 | 
 425 |     if (ZSTD_isError(rc)) {
 426 |         ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
 427 |                       "%s failed: %s", hint, ZSTD_getErrorName(rc));
 428 | 
 429 |         return NGX_ERROR;
 430 |     }
 431 | 
 432 |     ngx_log_debug6(NGX_LOG_DEBUG_HTTP, r->connection->log, 0,
 433 |                    "zstd compress out: src:%p pos:%uz size: %uz, "
 434 |                    "dst:%p pos:%uz size:%uz",
 435 |                    ctx->buffer_in.src, ctx->buffer_in.pos, ctx->buffer_in.size,
 436 |                    ctx->buffer_out.dst, ctx->buffer_out.pos,
 437 |                    ctx->buffer_out.size);
 438 | 
 439 |     ctx->in_buf->pos += ctx->buffer_in.pos - pos_in;        /* advance by the bytes consumed */
 440 |     ctx->out_buf->last += ctx->buffer_out.pos - pos_out;    /* advance by the bytes produced */
 441 |     ctx->redo = 0;
 442 | 
 443 |     if (rc > 0) {    /* non-zero result: a plain compress is followed by a flush, and the call is repeated */
 444 |         if (ctx->action == NGX_HTTP_ZSTD_FILTER_COMPRESS) {
 445 |             ctx->action = NGX_HTTP_ZSTD_FILTER_FLUSH;
 446 |         }
 447 | 
 448 |         ctx->redo = 1;
 449 | 
 450 |     } else if (ctx->last && ctx->action != NGX_HTTP_ZSTD_FILTER_END) {
 451 |         ctx->redo = 1;
 452 |         ctx->action = NGX_HTTP_ZSTD_FILTER_END;
 453 | 
 454 |         /* pending to call the ZSTD_endStream() */
 455 | 
 456 |         return NGX_AGAIN;
 457 | 
 458 |     } else {
 459 |         ctx->action = NGX_HTTP_ZSTD_FILTER_COMPRESS; /* restore */
 460 |     }
 461 | 
 462 |     if (ngx_buf_size(ctx->out_buf) == 0) {    /* nothing produced yet: do not emit an empty buf */
 463 |         return NGX_AGAIN;
 464 |     }
 465 | 
 466 |     cl = ngx_alloc_chain_link(r->pool);
 467 |     if (cl == NULL) {
 468 |         return NGX_ERROR;
 469 |     }
 470 | 
 471 |     b = ctx->out_buf;
 472 | 
 473 |     if (rc == 0 && (ctx->flush || ctx->last)) {    /* zstd returned 0: the flush or end request is complete */
 474 |         r->connection->buffered &= ~NGX_HTTP_GZIP_BUFFERED;
 475 | 
 476 |         b->flush = ctx->flush;
 477 |         b->last_buf = ctx->last;
 478 | 
 479 |         ctx->done = ctx->last;
 480 |         ctx->flush = 0;
 481 |     }
 482 | 
 483 |     ctx->bytes_out += ngx_buf_size(b);
 484 | 
 485 |     cl->next = NULL;
 486 |     cl->buf = b;
 487 | 
 488 |     *ctx->last_out = cl;          /* append to the output chain */
 489 |     ctx->last_out = &cl->next;
 490 | 
 491 |     ngx_memzero(&ctx->buffer_out, sizeof(ZSTD_outBuffer));    /* reset the output descriptor now that this buf is queued */
 492 | 
 493 |     return ctx->last && rc == 0 ? NGX_OK : NGX_AGAIN;
 494 | }
 495 | 
 496 | 
 497 | static ngx_int_t
 498 | ngx_http_zstd_filter_add_data(ngx_http_request_t *r, ngx_http_zstd_ctx_t *ctx)
 499 | {
 500 |     if (ctx->buffer_in.pos < ctx->buffer_in.size
 501 |         || ctx->flush
 502 |         || ctx->last
 503 |         || ctx->redo)
 504 |     {
 505 |         return NGX_OK;
 506 |     }
 507 | 
 508 |     ngx_log_debug1(NGX_LOG_DEBUG_HTTP, r->connection->log, 0,
 509 |                    "zstd in: %p", ctx->in);
 510 | 
 511 |     if (ctx->in == NULL) {
 512 |         return NGX_DECLINED;
 513 |     }
 514 | 
 515 |     ctx->in_buf = ctx->in->buf;
 516 |     ctx->in = ctx->in->next;
 517 | 
 518 |     if (ctx->in_buf->flush) {
 519 |         ctx->flush = 1;
 520 | 
 521 |     } else if (ctx->in_buf->last_buf) {
 522 |         ctx->last = 1;
 523 |     }
 524 | 
 525 |     ctx->buffer_in.src = ctx->in_buf->pos;
 526 |     ctx->buffer_in.pos = 0;
 527 |     ctx->buffer_in.size = ngx_buf_size(ctx->in_buf);
 528 | 
 529 |     ctx->bytes_in += ngx_buf_size(ctx->in_buf);
 530 | 
 531 |     if (ctx->buffer_in.size == 0) {
 532 |         return NGX_AGAIN;
 533 |     }
 534 | 
 535 |     return NGX_OK;
 536 | }
 537 | 
 538 | 
 539 | static ngx_int_t
 540 | ngx_http_zstd_filter_get_buf(ngx_http_request_t *r, ngx_http_zstd_ctx_t *ctx)
 541 | {
 542 |     ngx_chain_t               *cl;
 543 |     ngx_http_zstd_loc_conf_t  *zlcf;
 544 | 
 545 |     if (ctx->buffer_out.pos < ctx->buffer_out.size) {
 546 |         return NGX_OK;
 547 |     }
 548 | 
 549 |     zlcf = ngx_http_get_module_loc_conf(r, ngx_http_zstd_filter_module);
 550 | 
 551 |     if (ctx->free) {
 552 |         cl = ctx->free;
 553 |         ctx->free = ctx->free->next;
 554 |         ctx->out_buf = cl->buf;
 555 |         ngx_free_chain(r->pool, cl);
 556 | 
 557 |     } else if (ctx->bufs < zlcf->bufs.num) {
 558 |         ctx->out_buf = ngx_create_temp_buf(r->pool, zlcf->bufs.size);
 559 |         if (ctx->out_buf == NULL) {
 560 |             return NGX_ERROR;
 561 |         }
 562 | 
 563 |         ctx->out_buf->tag = (ngx_buf_tag_t) &ngx_http_zstd_filter_module;
 564 |         ctx->out_buf->recycled = 1;
 565 |         ctx->bufs++;
 566 | 
 567 |     } else {
 568 |         ctx->nomem = 1;
 569 |         return NGX_DECLINED;
 570 |     }
 571 | 
 572 |     ctx->buffer_out.dst = ctx->out_buf->pos;
 573 |     ctx->buffer_out.pos = 0;
 574 |     ctx->buffer_out.size = ctx->out_buf->end - ctx->out_buf->start;
 575 | 
 576 |     return NGX_OK;
 577 | }
 578 | 
 579 | 
 580 | static ZSTD_CStream *
 581 | ngx_http_zstd_filter_create_cstream(ngx_http_request_t *r,
 582 |     ngx_http_zstd_ctx_t *ctx)
 583 | {
 584 |     size_t                      rc;
 585 |     ZSTD_CStream               *cstream;
 586 |     ZSTD_customMem              cmem;
 587 |     ngx_http_zstd_loc_conf_t   *zlcf;
 588 | 
 589 |     zlcf = ngx_http_get_module_loc_conf(r, ngx_http_zstd_filter_module);
 590 | 
 591 |     cmem.customAlloc = ngx_http_zstd_filter_alloc;
 592 |     cmem.customFree = ngx_http_zstd_filter_free;
 593 |     cmem.opaque = ctx;
 594 | 
 595 |     cstream = ZSTD_createCStream_advanced(cmem);
 596 |     if (cstream == NULL) {
 597 |         ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
 598 |                       "ZSTD_createCStream_advanced() failed");
 599 | 
 600 |         return NULL;
 601 |     }
 602 | 
 603 |     /* TODO use the advanced initialize functions */
 604 | 
 605 |     if (zlcf->dict) {
 606 | #if ZSTD_VERSION_NUMBER >= 10500
 607 |         rc = ZSTD_CCtx_reset(cstream, ZSTD_reset_session_only);
 608 |         if (ZSTD_isError(rc)) {
 609 |             ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
 610 |                           "ZSTD_CCtx_reset() failed: %s",
 611 |                           ZSTD_getErrorName(rc));
 612 |             goto failed;
 613 |         }
 614 | 
 615 |         rc = ZSTD_CCtx_refCDict(cstream, zlcf->dict);
 616 |         if (ZSTD_isError(rc)) {
 617 |             ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
 618 |                           "ZSTD_CCtx_refCDict() failed: %s",
 619 |                           ZSTD_getErrorName(rc));
 620 |             goto failed;
 621 |         }
 622 | #else
 623 |         rc = ZSTD_initCStream_usingCDict(cstream, zlcf->dict);
 624 | #endif
 625 |         if (ZSTD_isError(rc)) {
 626 |             ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
 627 |                           "ZSTD_initCStream_usingCDict() failed: %s",
 628 |                           ZSTD_getErrorName(rc));
 629 | 
 630 |             goto failed;
 631 |         }
 632 | 
 633 |     } else {
 634 |         rc = ZSTD_initCStream(cstream, zlcf->level);
 635 |         if (ZSTD_isError(rc)) {
 636 |             ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
 637 |                           "ZSTD_initCStream() failed: %s",
 638 |                           ZSTD_getErrorName(rc));
 639 | 
 640 |             goto failed;
 641 |         }
 642 |     }
 643 | 
 644 |     return cstream;
 645 | 
 646 | failed:
 647 |     rc = ZSTD_freeCStream(cstream);
 648 |     if (ZSTD_isError(rc)) {
 649 |         ngx_log_error(NGX_LOG_ALERT, r->connection->log, 0,
 650 |                       "ZSTD_freeCStream() failed: %s", ZSTD_getErrorName(rc));
 651 |     }
 652 | 
 653 |     return NULL;
 654 | }
 655 | 
 656 | 
 657 | static ngx_int_t
 658 | ngx_http_zstd_accept_encoding(ngx_str_t *ae)
 659 | {
 660 |     u_char  *p;
 661 | 
 662 |     p = ngx_strcasestrn(ae->data, "zstd", sizeof("zstd") - 2);
 663 |     if (p == NULL) {
 664 |         return NGX_DECLINED;
 665 |     }
 666 | 
 667 |     if (p == ae->data || (*(p - 1) == ',' || *(p - 1) == ' ')) {
 668 | 
 669 |         p += sizeof("zstd") - 1;
 670 | 
 671 |         if (p == ae->data + ae->len || *p == ',' || *p == ' ' || *p == ';') {
 672 |             return NGX_OK;
 673 |         }
 674 |     }
 675 | 
 676 |     return NGX_DECLINED;
 677 | }
 678 | 
 679 | 
 680 | static ngx_int_t
 681 | ngx_http_zstd_ok(ngx_http_request_t *r)
 682 | {
 683 |     ngx_table_elt_t  *ae;
 684 | 
 685 |     if (r != r->main) {
 686 |         return NGX_DECLINED;
 687 |     }
 688 | 
 689 |     ae = r->headers_in.accept_encoding;
 690 |     if (ae == NULL) {
 691 |         return NGX_DECLINED;
 692 |     }
 693 | 
 694 |     if (ae->value.len < sizeof("zstd") - 1) {
 695 |         return NGX_DECLINED;
 696 |     }
 697 | 
 698 |     if (ngx_memcmp(ae->value.data, "zstd", 4) != 0
 699 |         && ngx_http_zstd_accept_encoding(&ae->value) != NGX_OK)
 700 |     {
 701 |         return NGX_DECLINED;
 702 |     }
 703 | 
 704 | 
 705 |     r->gzip_tested = 1;
 706 |     r->gzip_ok = 0;
 707 | 
 708 |     return NGX_OK;
 709 | }
 710 | 
 711 | 
 712 | static void *
 713 | ngx_http_zstd_create_main_conf(ngx_conf_t *cf)
 714 | {
 715 |     ngx_http_zstd_main_conf_t  *zmcf;
 716 | 
 717 |     zmcf = ngx_pcalloc(cf->pool, sizeof(ngx_http_zstd_main_conf_t));
 718 |     if (zmcf == NULL) {
 719 |         return NULL;
 720 |     }
 721 | 
 722 |     return zmcf;
 723 | }
 724 | 
 725 | 
 726 | static char *
 727 | ngx_http_zstd_init_main_conf(ngx_conf_t *cf, void *conf)
 728 | {
 729 |     ngx_http_zstd_main_conf_t *zmcf = conf;
 730 | 
 731 |     if (zmcf->dict_file.len == 0) {
 732 |         return NGX_CONF_OK;
 733 |     }
 734 | 
 735 |     if (ngx_conf_full_name(cf->cycle, &zmcf->dict_file, 1) != NGX_OK) {
 736 |         return NGX_CONF_ERROR;
 737 |     }
 738 | 
 739 |     return NGX_CONF_OK;
 740 | }
 741 | 
 742 | 
 743 | static void *
 744 | ngx_http_zstd_create_loc_conf(ngx_conf_t *cf)
 745 | {
 746 |     ngx_http_zstd_loc_conf_t  *conf;
 747 | 
 748 |     conf = ngx_pcalloc(cf->pool, sizeof(ngx_http_zstd_loc_conf_t));
 749 |     if (conf == NULL) {
 750 |         return NULL;
 751 |     }
 752 | 
 753 |     /*
 754 |      * set by ngx_pcalloc():
 755 |      *
 756 |      *    conf->bufs.num = 0;
 757 |      *    conf->types = { NULL };
 758 |      *    conf->types_keys = NULL;
 759 |      *    conf->dict = NULL;
 760 |      */
 761 | 
 762 |     conf->enable = NGX_CONF_UNSET;
 763 |     conf->level = NGX_CONF_UNSET;
 764 |     conf->min_length = NGX_CONF_UNSET;
 765 | 
 766 |     return conf;
 767 | }
 768 | 
 769 | 
 770 | static char *
 771 | ngx_http_zstd_merge_loc_conf(ngx_conf_t *cf, void *parent, void *child)
 772 | {
 773 |     ngx_http_zstd_loc_conf_t *prev = parent;
 774 |     ngx_http_zstd_loc_conf_t *conf = child;
 775 | 
 776 |     ngx_fd_t                    fd;
 777 |     size_t                      size;
 778 |     ssize_t                     n;
 779 |     char                       *rc;
 780 |     u_char                     *buf;
 781 |     ngx_file_info_t             info;
 782 |     ngx_http_zstd_main_conf_t  *zmcf;
 783 | 
 784 |     rc = NGX_OK;
 785 |     buf = NULL;
 786 |     fd = NGX_INVALID_FILE;
 787 | 
 788 |     ngx_conf_merge_value(conf->enable, prev->enable, 0);
 789 |     ngx_conf_merge_value(conf->level, prev->level, 1);
 790 |     ngx_conf_merge_value(conf->min_length, prev->min_length, 20);
 791 | 
 792 |     if (ngx_http_merge_types(cf, &conf->types_keys, &conf->types,
 793 |                              &prev->types_keys, &prev->types,
 794 |                              ngx_http_html_default_types))
 795 |     {
 796 |         return NGX_CONF_ERROR;
 797 |     }
 798 | 
 799 |     ngx_conf_merge_ptr_value(conf->dict, prev->dict, NULL);
 800 |     ngx_conf_merge_bufs_value(conf->bufs, prev->bufs,
 801 |                               (128 * 1024) / ngx_pagesize, ngx_pagesize);
 802 | 
 803 |     zmcf = ngx_http_conf_get_module_main_conf(cf, ngx_http_zstd_filter_module);
 804 | 
 805 |     if (conf->enable && zmcf->dict_file.len > 0) {
 806 | 
 807 |         if (conf->level == prev->level) {
 808 |             conf->dict = prev->dict;
 809 | 
 810 |         } else {
 811 |             /*
 812 |              * compression level is different from the outer block,
 813 |              * so we should create a seperate dict object.
 814 |              */
 815 | 
 816 |             fd = ngx_open_file(zmcf->dict_file.data, NGX_FILE_RDONLY,
 817 |                                NGX_FILE_OPEN, 0);
 818 | 
 819 |             if (fd == NGX_INVALID_FILE) {
 820 |                 ngx_conf_log_error(NGX_LOG_EMERG, cf, ngx_errno,
 821 |                                    ngx_open_file_n " \"%V\" failed",
 822 |                                    &zmcf->dict_file);
 823 | 
 824 |                 return NGX_CONF_ERROR;
 825 |             }
 826 | 
 827 |             if (ngx_fd_info(fd, &info) == NGX_FILE_ERROR) {
 828 |                 ngx_conf_log_error(NGX_LOG_EMERG, cf, ngx_errno,
 829 |                                    ngx_fd_info_n " \"%V\" failed",
 830 |                                    &zmcf->dict_file);
 831 | 
 832 |                 rc = NGX_CONF_ERROR;
 833 |                 goto close;
 834 |             }
 835 | 
 836 |             size = ngx_file_size(&info);
 837 |             buf = ngx_palloc(cf->pool, size);
 838 |             if (buf == NULL) {
 839 |                 rc = NGX_CONF_ERROR;
 840 |                 goto close;
 841 |             }
 842 | 
 843 |             n = ngx_read_fd(fd, (void *) buf, size);
 844 |             if (n < 0) {
 845 |                 ngx_conf_log_error(NGX_LOG_EMERG, cf, ngx_errno,
 846 |                                    ngx_read_fd_n " %V\" failed",
 847 |                                    &zmcf->dict_file);
 848 | 
 849 |                 rc = NGX_CONF_ERROR;
 850 |                 goto close;
 851 | 
 852 |             } else if ((size_t) n != size) {
 853 |                 ngx_conf_log_error(NGX_LOG_EMERG, cf, ngx_errno,
 854 |                                    ngx_read_fd_n "\"%V incomplete\"",
 855 |                                    &zmcf->dict_file);
 856 | 
 857 |                 rc = NGX_CONF_ERROR;
 858 |                 goto close;
 859 |             }
 860 | 
 861 |             conf->dict = ZSTD_createCDict_byReference(buf, size, conf->level);
 862 |             if (conf->dict == NULL) {
 863 |                 ngx_conf_log_error(NGX_LOG_EMERG, cf, 0,
 864 |                                    "ZSTD_createCDict_byReference() failed");
 865 |                 rc = NGX_CONF_ERROR;
 866 |                 goto close;
 867 |             }
 868 |         }
 869 |     }
 870 | 
 871 | close:
 872 | 
 873 |     if (fd != NGX_INVALID_FILE && ngx_close_file(fd) == NGX_FILE_ERROR) {
 874 |         ngx_conf_log_error(NGX_LOG_EMERG, cf, ngx_errno,
 875 |                            ngx_close_file_n " \"%V\" failed",
 876 |                            &zmcf->dict_file);
 877 | 
 878 |         rc = NGX_CONF_ERROR;
 879 |     }
 880 | 
 881 |     return rc;
 882 | }
 883 | 
 884 | 
 885 | static ngx_int_t
 886 | ngx_http_zstd_filter_init(ngx_conf_t *cf)
 887 | {
 888 |     ngx_http_next_header_filter = ngx_http_top_header_filter;
 889 |     ngx_http_top_header_filter = ngx_http_zstd_header_filter;
 890 | 
 891 |     ngx_http_next_body_filter = ngx_http_top_body_filter;
 892 |     ngx_http_top_body_filter = ngx_http_zstd_body_filter;
 893 | 
 894 |     return NGX_OK;
 895 | }
 896 | 
 897 | 
 898 | static void *
 899 | ngx_http_zstd_filter_alloc(void *opaque, size_t size)
 900 | {
 901 |     ngx_http_zstd_ctx_t *ctx = opaque;
 902 | 
 903 |     void  *p;
 904 | 
 905 |     p = ngx_palloc(ctx->request->pool, size);
 906 | 
 907 |     ngx_log_debug2(NGX_LOG_DEBUG_HTTP, ctx->request->connection->log, 0,
 908 |                    "zstd alloc: %p, size: %uz", p, size);
 909 | 
 910 |     return p;
 911 | }
 912 | 
 913 | 
 914 | static ngx_int_t
 915 | ngx_http_zstd_add_variables(ngx_conf_t *cf)
 916 | {
 917 |     ngx_http_variable_t  *v;
 918 | 
 919 |     v = ngx_http_add_variable(cf, &ngx_http_zstd_ratio,
 920 |                               NGX_HTTP_VAR_NOCACHEABLE);
 921 |     if (v == NULL) {
 922 |         return NGX_ERROR;
 923 |     }
 924 | 
 925 |     v->get_handler = ngx_http_zstd_ratio_variable;
 926 | 
 927 |     return NGX_OK;
 928 | }
 929 | 
 930 | 
 931 | static ngx_int_t
 932 | ngx_http_zstd_ratio_variable(ngx_http_request_t *r,
 933 |     ngx_http_variable_value_t *vv, uintptr_t data)
 934 | {
 935 |     ngx_uint_t            ratio_int, ratio_frac;
 936 |     ngx_http_zstd_ctx_t  *ctx;
 937 | 
 938 |     ctx = ngx_http_get_module_ctx(r, ngx_http_zstd_filter_module);
 939 |     if (ctx == NULL || !ctx->done || ctx->bytes_out == 0) {
 940 |         vv->not_found = 1;
 941 |         return NGX_OK;
 942 |     }
 943 | 
 944 |     vv->data = ngx_pnalloc(r->pool, NGX_INT32_LEN + 3);
 945 |     if (vv->data == NULL) {
 946 |         return NGX_ERROR;
 947 |     }
 948 | 
 949 |     ratio_int = (ngx_uint_t) ctx->bytes_in / ctx->bytes_out;
 950 |     ratio_frac = (ngx_uint_t) (ctx->bytes_in * 1000 / ctx->bytes_out % 1000);
 951 | 
 952 |     vv->len = ngx_sprintf(vv->data, "%ui.%03ui", ratio_int, ratio_frac)
 953 |               - vv->data;
 954 | 
 955 |     vv->valid = 1;
 956 |     vv->no_cacheable = 1;
 957 | 
 958 |     return NGX_OK;
 959 | }
 960 | 
 961 | 
 962 | static void
 963 | ngx_http_zstd_filter_free(void *opaque, void *address)
 964 | {
 965 | #if (NGX_DEBUG)
 966 | 
 967 |     ngx_http_zstd_ctx_t *ctx = opaque;
 968 | 
 969 |     ngx_log_debug1(NGX_LOG_DEBUG_HTTP, ctx->request->connection->log, 0,
 970 |                    "zstd free: %p", address);
 971 | 
 972 | #endif
 973 | }
 974 | 
 975 | 
 976 | static char *
 977 | ngx_http_zstd_comp_level(ngx_conf_t *cf, void *post, void *data)
 978 | {
 979 |     ngx_int_t  *np = data;
 980 | 
 981 |     if (*np == 0 || *np < (ngx_int_t)ZSTD_minCLevel() || *np > ZSTD_maxCLevel()) {
 982 |         ngx_conf_log_error(NGX_LOG_EMERG, cf, 0,
 983 |                            "zstd compress level must between %i and %i excluding 0",
 984 |                            (ngx_int_t)ZSTD_minCLevel(), ZSTD_maxCLevel());
 985 | 
 986 |         return NGX_CONF_ERROR;
 987 |     }
 988 | 
 989 |     return NGX_CONF_OK;
 990 | }
 991 | 
 992 | static char *
 993 | ngx_conf_zstd_set_num_slot_with_negatives(ngx_conf_t *cf, ngx_command_t *cmd, void *conf)
 994 | {
 995 |     char  *p = conf;
 996 | 
 997 |     ngx_int_t        *np;
 998 |     ngx_str_t        *value;
 999 |     ngx_conf_post_t  *post;
1000 | 
1001 | 
1002 |     np = (ngx_int_t *) (p + cmd->offset);
1003 | 
1004 |     if (*np != NGX_CONF_UNSET) {
1005 |         return "is duplicate";
1006 |     }
1007 | 
1008 |     value = cf->args->elts;
1009 | 
1010 |     if (*(value[1].data) == '-') {
1011 |         // Parse ignoring the leading '-' character
1012 |         *np = ngx_atoi(value[1].data + 1, value[1].len - 1);
1013 | 
1014 |         // NGX_ERROR is -1 so we need to check for that before making the parsed
1015 |         // result negative
1016 |         if (*np == NGX_ERROR) {
1017 |             return "invalid number";
1018 |         }
1019 | 
1020 |         *np = -*np;
1021 |     } else {
1022 |         *np = ngx_atoi(value[1].data, value[1].len);
1023 | 
1024 |         if (*np == NGX_ERROR) {
1025 |             return "invalid number";
1026 |         }
1027 |     }
1028 | 
1029 |     if (cmd->post) {
1030 |         post = cmd->post;
1031 |         return post->post_handler(cf, post, np);
1032 |     }
1033 | 
1034 |     return NGX_CONF_OK;
1035 | }
1036 | 


--------------------------------------------------------------------------------
/static/config:
--------------------------------------------------------------------------------
  1 | ngx_feature_incs="#include <zstd.h>"
  2 | ngx_feature_test="(void) ZSTD_createCCtx();"
  3 | ngx_feature_libs=
  4 | ngx_feature_run=yes
  5 | 
  6 | ngx_zstd_opt_I=
  7 | ngx_zstd_opt_L=
  8 | 
  9 | if [ -n "$ZSTD_INC" -o -n "$ZSTD_LIB" ]; then
 10 |     ngx_feature="ZStandard static library in $ZSTD_INC and $ZSTD_LIB"
 11 |     ngx_feature_path=$ZSTD_INC
 12 | 
 13 |     # we try the static shared library firstly
 14 |     ngx_zstd_opt_I="-I$ZSTD_INC -DZSTD_STATIC_LINKING_ONLY"
 15 |     ngx_zstd_opt_L="$ZSTD_LIB/libzstd.a"
 16 |     SAVED_CC_TAST_FLAGS=$CC_TEST_FLAGS
 17 |     CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS"
 18 |     SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT
 19 |     NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT"
 20 | 
 21 |     . auto/feature
 22 | 
 23 |     # restore
 24 |     CC_TEST_FLAGS=$SAVED_CC_TAST_FLAGS
 25 |     NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT
 26 | 
 27 |     if [ $ngx_found = no ]; then
 28 |         # then try the dynamic shared library
 29 |         ngx_feature="ZStandard dynamic library in $ZSTD_INC and $ZSTD_LIB"
 30 |         ngx_zstd_opt_L="-L$ZSTD_LIB -lzstd -Wl,-rpath, $ZSTD_LIB"
 31 | 
 32 |         SAVED_CC_TAST_FLAGS=$CC_TEST_FLAGS
 33 |         CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS"
 34 |         SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT
 35 |         NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT"
 36 | 
 37 |         . auto/feature
 38 | 
 39 |         # restore
 40 |         CC_TEST_FLAGS=$SAVED_CC_TAST_FLAGS
 41 |         NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT
 42 | 
 43 |         if [ $ngx_found = no ]; then
 44 |             cat << END
 45 |             $0: error: ngx_http_zstd_filter_module requires the ZStandard library, please be sure that "\$ZSTD_INC" and "\$ZSTD_LIB" are set correctly.
 46 | END
 47 |             exit 1
 48 |         fi
 49 | 
 50 |     fi
 51 | else
 52 |     # auto-discovery
 53 |     ngx_feature="ZStandard static library"
 54 |     ngx_zstd_opt_I="-DZSTD_STATIC_LINKING_ONLY"
 55 |     ngx_zstd_opt_L="-l:libzstd.a"
 56 | 
 57 |     # still we consider the static library firstly
 58 |     SAVED_CC_TAST_FLAGS=$CC_TEST_FLAGS
 59 |     CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS"
 60 |     SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT
 61 |     NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT"
 62 | 
 63 |     . auto/feature
 64 | 
 65 |     # restore
 66 |     CC_TEST_FLAGS=$SAVED_CC_TAST_FLAGS
 67 |     NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT
 68 | 
 69 |     if [ $ngx_found = no ]; then
 70 | 
 71 |         ngx_feature="ZStandard dynamic library"
 72 |         ngx_zstd_opt_L="-lzstd"
 73 |         SAVED_CC_TAST_FLAGS=$CC_TEST_FLAGS
 74 |         CC_TEST_FLAGS="$ngx_zstd_opt_I $CC_TEST_FLAGS"
 75 |         SAVED_NGX_TEST_LD_OPT=$NGX_TEST_LD_OPT
 76 |         NGX_TEST_LD_OPT="$ngx_zstd_opt_L $NGX_TEST_LD_OPT"
 77 | 
 78 |         . auto/feature
 79 | 
 80 |         if [ $ngx_found = no ]; then
 81 |             cat << END
 82 |             $0: error: ngx_http_zstd_filter_module requires the ZStandard library.
 83 | END
 84 |             exit 1
 85 |         fi
 86 | 
 87 |         # restore
 88 |         CC_TEST_FLAGS=$SAVED_CC_TAST_FLAGS
 89 |         NGX_TEST_LD_OPT=$SAVED_NGX_TEST_LD_OPT
 90 | 
 91 |         cat << END
 92 |         $0: warning: ngx_http_zstd_filter_module uses advanced ZStandard APIs (which are still considered experimental) while you are trying to link the dynamic shared library.
 93 | END
 94 |     fi
 95 | 
 96 |     # TODO we need more tries for the different OS port.
 97 | fi
 98 | 
 99 | CFLAGS="$ngx_zstd_opt_I $CFLAGS"
100 | NGX_LD_OPT="$ngx_zstd_opt_L $NGX_LD_OPT"
101 | 
102 | # build the ngx_http_zstd_static_module
103 | HTTP_ZSTD_SRCS="$ngx_addon_dir/static/ngx_http_zstd_static_module.c"
104 | 
105 | ngx_addon_name=ngx_http_zstd_static_module
106 | ngx_module_type=HTTP
107 | ngx_module_name=ngx_http_zstd_static_module
108 | ngx_module_incs="$ngx_zstd_opt_I"
109 | ngx_module_srcs=$HTTP_ZSTD_SRCS
110 | 
111 | . auto/module
112 | 


--------------------------------------------------------------------------------
/static/ngx_http_zstd_static_module.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /*
  3 |  * Copyright (C) Alex Zhang
  4 |  */
  5 | 
  6 | 
  7 | #include <ngx_config.h>
  8 | #include <ngx_core.h>
  9 | #include <ngx_http.h>
 10 | 
 11 | 
 12 | #define NGX_HTTP_ZSTD_STATIC_OFF        0
 13 | #define NGX_HTTP_ZSTD_STATIC_ON         1
 14 | #define NGX_HTTP_ZSTD_STATIC_ALWAYS     2
 15 | 
 16 | 
 17 | typedef struct {
 18 |     ngx_uint_t  enable;
 19 | } ngx_http_zstd_static_conf_t;
 20 | 
 21 | 
 22 | static ngx_conf_enum_t  ngx_http_zstd_static[] = {
 23 |     { ngx_string("off"), NGX_HTTP_ZSTD_STATIC_OFF },
 24 |     { ngx_string("on"), NGX_HTTP_ZSTD_STATIC_ON },
 25 |     { ngx_string("always"), NGX_HTTP_ZSTD_STATIC_ALWAYS },
 26 | };
 27 | 
 28 | 
 29 | static ngx_command_t  ngx_http_zstd_static_commands[] = {
 30 | 
 31 |     { ngx_string("zstd_static"),
 32 |       NGX_HTTP_MAIN_CONF|NGX_HTTP_SRV_CONF|NGX_HTTP_LOC_CONF|NGX_CONF_TAKE1,
 33 |       ngx_conf_set_enum_slot,
 34 |       NGX_HTTP_LOC_CONF_OFFSET,
 35 |       offsetof(ngx_http_zstd_static_conf_t, enable),
 36 |       &ngx_http_zstd_static },
 37 | 
 38 |     ngx_null_command
 39 | };
 40 | 
 41 | 
 42 | static ngx_int_t ngx_http_zstd_static_handler(ngx_http_request_t *r);
 43 | static ngx_int_t ngx_http_zstd_accept_encoding(ngx_str_t *ae);
 44 | static ngx_int_t ngx_http_zstd_ok(ngx_http_request_t *r);
 45 | static void * ngx_http_zstd_static_create_loc_conf(ngx_conf_t *cf);
 46 | static char * ngx_http_zstd_static_merge_loc_conf(ngx_conf_t *cf, void *parent,
 47 |     void *child);
 48 | static ngx_int_t ngx_http_zstd_static_init(ngx_conf_t *cf);
 49 | 
 50 | 
 51 | static ngx_http_module_t  ngx_http_zstd_static_module_ctx = {
 52 |     NULL,                                     /* preconfiguration */
 53 |     ngx_http_zstd_static_init,                /* postconfiguration */
 54 | 
 55 |     NULL,                                     /* create main configuration */
 56 |     NULL,                                     /* init main configuration */
 57 | 
 58 |     NULL,                                     /* create server configuration */
 59 |     NULL,                                     /* merge server configuration */
 60 | 
 61 |     ngx_http_zstd_static_create_loc_conf,     /* create location configuration */
 62 |     ngx_http_zstd_static_merge_loc_conf,      /* merge location configuration */
 63 | };
 64 | 
 65 | 
 66 | ngx_module_t  ngx_http_zstd_static_module = {
 67 |     NGX_MODULE_V1,
 68 |     &ngx_http_zstd_static_module_ctx,       /* module context */
 69 |     ngx_http_zstd_static_commands,          /* module directives */
 70 |     NGX_HTTP_MODULE,                        /* module type */
 71 |     NULL,                                   /* init master */
 72 |     NULL,                                   /* init module */
 73 |     NULL,                                   /* init process */
 74 |     NULL,                                   /* init thread */
 75 |     NULL,                                   /* exit thread */
 76 |     NULL,                                   /* exit process */
 77 |     NULL,                                   /* exit master */
 78 |     NGX_MODULE_V1_PADDING
 79 | };
 80 | 
 81 | 
 82 | static ngx_int_t
 83 | ngx_http_zstd_static_handler(ngx_http_request_t *r)
 84 | {
 85 |     u_char                       *p;
 86 |     ngx_int_t                     rc;
 87 |     ngx_uint_t                    level;
 88 |     size_t                        root;
 89 |     ngx_str_t                     path;
 90 |     ngx_buf_t                    *b;
 91 |     ngx_log_t                    *log;
 92 |     ngx_table_elt_t              *h;
 93 |     ngx_chain_t                   out;
 94 |     ngx_open_file_info_t          of;
 95 |     ngx_http_core_loc_conf_t     *clcf;
 96 |     ngx_http_zstd_static_conf_t  *zscf;
 97 | 
 98 |     if (!(r->method & (NGX_HTTP_GET|NGX_HTTP_HEAD))) {
 99 |         return NGX_DECLINED;
100 |     }
101 | 
102 |     if (r->uri.data[r->uri.len - 1] == '/') {
103 |         return NGX_DECLINED;
104 |     }
105 | 
106 |     zscf = ngx_http_get_module_loc_conf(r, ngx_http_zstd_static_module);
107 | 
108 |     if (zscf->enable == NGX_HTTP_ZSTD_STATIC_OFF) {
109 |         return NGX_DECLINED;
110 |     }
111 | 
112 |     if (zscf->enable == NGX_HTTP_ZSTD_STATIC_ON) {
113 |         rc = ngx_http_zstd_ok(r);
114 | 
115 |     } else {
116 |         rc = NGX_OK;
117 |     }
118 | 
119 |     clcf = ngx_http_get_module_loc_conf(r, ngx_http_core_module);
120 | 
121 |     if (!clcf->gzip_vary && rc != NGX_OK) {
122 |         return NGX_DECLINED;
123 |     }
124 | 
125 |     log = r->connection->log;
126 | 
127 |     p = ngx_http_map_uri_to_path(r, &path, &root, sizeof(".zst") - 1);
128 |     if (p == NULL) {
129 |         return NGX_HTTP_INTERNAL_SERVER_ERROR;
130 |     }
131 | 
132 |     *p++ = '.';
133 |     *p++ = 'z';
134 |     *p++ = 's';
135 |     *p++ = 't';
136 |     *p = '\0';
137 | 
138 |     path.len = p - path.data;
139 | 
140 |     ngx_log_debug1(NGX_LOG_DEBUG_HTTP, log, 0,
141 |                    "http filename: \"%s\"", path.data);
142 | 
143 |     ngx_memzero(&of, sizeof(ngx_open_file_info_t));
144 | 
145 |     of.read_ahead = clcf->read_ahead;
146 |     of.directio = clcf->directio;
147 |     of.valid = clcf->open_file_cache_valid;
148 |     of.min_uses = clcf->open_file_cache_min_uses;
149 |     of.errors = clcf->open_file_cache_errors;
150 |     of.events = clcf->open_file_cache_events;
151 | 
152 |     if (ngx_http_set_disable_symlinks(r, clcf, &path, &of) != NGX_OK) {
153 |         return NGX_HTTP_INTERNAL_SERVER_ERROR;
154 |     }
155 | 
156 |     if (ngx_open_cached_file(clcf->open_file_cache, &path, &of, r->pool)
157 |         != NGX_OK)
158 |     {
159 |         switch (of.err) {
160 | 
161 |         case 0:
162 |             return NGX_HTTP_INTERNAL_SERVER_ERROR;
163 | 
164 |         case NGX_ENOENT:
165 |         case NGX_ENOTDIR:
166 |         case NGX_ENAMETOOLONG:
167 | 
168 |             return NGX_DECLINED;
169 | 
170 |         case NGX_EACCES:
171 | #if (NGX_HAVE_OPENAT)
172 |         case NGX_EMLINK:
173 |         case NGX_ELOOP:
174 | #endif
175 | 
176 |             level = NGX_LOG_ERR;
177 |             break;
178 | 
179 |         default:
180 | 
181 |             level = NGX_LOG_CRIT;
182 |             break;
183 |         }
184 | 
185 |         ngx_log_error(level, log, of.err,
186 |                       "%s \"%s\" failed", of.failed, path.data);
187 | 
188 |         return NGX_DECLINED;
189 |     }
190 | 
191 |     if (zscf->enable == NGX_HTTP_ZSTD_STATIC_ON) {
192 |         r->gzip_vary = 1;
193 | 
194 |         if (rc != NGX_OK) {
195 |             return NGX_DECLINED;
196 |         }
197 |     }
198 | 
199 |     ngx_log_debug1(NGX_LOG_DEBUG_HTTP, log, 0, "http static fd: %d", of.fd);
200 | 
201 |     if (of.is_dir) {
202 |         ngx_log_debug0(NGX_LOG_DEBUG_HTTP, log, 0, "http dir");
203 |         return NGX_DECLINED;
204 |     }
205 | 
206 | #if !(NGX_WIN32) /* the not regular files are probably Unix specific */
207 | 
208 |     if (!of.is_file) {
209 |         ngx_log_error(NGX_LOG_CRIT, log, 0,
210 |                       "\"%s\" is not a regular file", path.data);
211 | 
212 |         return NGX_HTTP_NOT_FOUND;
213 |     }
214 | 
215 | #endif
216 | 
217 |     r->root_tested = !r->error_page;
218 | 
219 |     rc = ngx_http_discard_request_body(r);
220 |     if (rc != NGX_OK) {
221 |         return rc;
222 |     }
223 | 
224 |     log->action = "sending response to client";
225 | 
226 |     r->headers_out.status = NGX_HTTP_OK;
227 |     r->headers_out.content_length_n = of.size;
228 |     r->headers_out.last_modified_time = of.mtime;
229 | 
230 |     if (ngx_http_set_etag(r) != NGX_OK) {
231 |         return NGX_HTTP_INTERNAL_SERVER_ERROR;
232 |     }
233 | 
234 |     if (ngx_http_set_content_type(r) != NGX_OK) {
235 |         return NGX_HTTP_INTERNAL_SERVER_ERROR;
236 |     }
237 | 
238 |     h = ngx_list_push(&r->headers_out.headers);
239 |     if (h == NULL) {
240 |         return NGX_HTTP_INTERNAL_SERVER_ERROR;
241 |     }
242 | 
243 |     h->hash = 1;
244 |     ngx_str_set(&h->key, "Content-Encoding");
245 |     ngx_str_set(&h->value, "zstd");
246 |     r->headers_out.content_encoding = h;
247 | 
248 |     b = ngx_calloc_buf(r->pool);
249 |     if (b == NULL) {
250 |         return NGX_HTTP_INTERNAL_SERVER_ERROR;
251 |     }
252 | 
253 |     b->file = ngx_pcalloc(r->pool, sizeof(ngx_file_t));
254 |     if (b->file == NULL) {
255 |         return NGX_HTTP_INTERNAL_SERVER_ERROR;
256 |     }
257 | 
258 |     rc = ngx_http_send_header(r);
259 | 
260 |     if (rc == NGX_ERROR || rc > NGX_OK || r->header_only) {
261 |         return rc;
262 |     }
263 | 
264 |     b->file_pos = 0;
265 |     b->file_last = of.size;
266 | 
267 |     b->in_file = b->file_last ? 1 : 0;
268 |     b->last_buf = (r == r->main) ? 1 : 0;
269 |     b->last_in_chain = 1;
270 | 
271 |     b->file->fd = of.fd;
272 |     b->file->name = path;
273 |     b->file->log = log;
274 |     b->file->directio = of.is_directio;
275 | 
276 |     out.buf = b;
277 |     out.next = NULL;
278 | 
279 |     return ngx_http_output_filter(r, &out);
280 | }
281 | 
282 | 
283 | static ngx_int_t
284 | ngx_http_zstd_ok(ngx_http_request_t *r)
285 | {
286 |     ngx_table_elt_t  *ae;
287 | 
288 |     if (r != r->main) {
289 |         return NGX_DECLINED;
290 |     }
291 | 
292 |     ae = r->headers_in.accept_encoding;
293 |     if (ae == NULL) {
294 |         return NGX_DECLINED;
295 |     }
296 | 
297 |     if (ae->value.len < sizeof("zstd") - 1) {
298 |         return NGX_DECLINED;
299 |     }
300 | 
301 |     if (ngx_memcmp(ae->value.data, "zstd", 4) != 0
302 |         && ngx_http_zstd_accept_encoding(&ae->value) != NGX_OK)
303 |     {
304 |         return NGX_DECLINED;
305 |     }
306 | 
307 | 
308 |     r->gzip_tested = 1;
309 |     r->gzip_ok = 0;
310 | 
311 |     return NGX_OK;
312 | }
313 | 
314 | 
315 | static ngx_int_t
316 | ngx_http_zstd_accept_encoding(ngx_str_t *ae)
317 | {
318 |     u_char  *p;
319 | 
320 |     p = ngx_strcasestrn(ae->data, "zstd", sizeof("zstd") - 1);
321 |     if (p == NULL) {
322 |         return NGX_DECLINED;
323 |     }
324 | 
325 |     if (p == ae->data || (*(p - 1) == ',' || *(p - 1) == ' ')) {
326 | 
327 |         p += sizeof("zstd") - 1;
328 | 
329 |         if (p == ae->data + ae->len || *p == ',' || *p == ' ' || *p == ';') {
330 |             return NGX_OK;
331 |         }
332 |     }
333 | 
334 |     return NGX_DECLINED;
335 | }
336 | 
337 | 
338 | static void *
339 | ngx_http_zstd_static_create_loc_conf(ngx_conf_t *cf)
340 | {
341 |     ngx_http_zstd_static_conf_t  *conf;
342 | 
343 |     conf = ngx_palloc(cf->pool, sizeof(ngx_http_zstd_static_conf_t));
344 |     if (conf == NULL) {
345 |         return NULL;
346 |     }
347 | 
348 |     conf->enable = NGX_CONF_UNSET_UINT;
349 | 
350 |     return conf;
351 | }
352 | 
353 | 
354 | static char *
355 | ngx_http_zstd_static_merge_loc_conf(ngx_conf_t *cf, void *parent, void *child)
356 | {
357 |     ngx_http_zstd_static_conf_t *prev = parent;
358 |     ngx_http_zstd_static_conf_t *conf = child;
359 | 
360 |     ngx_conf_merge_uint_value(conf->enable, prev->enable,
361 |                               NGX_HTTP_ZSTD_STATIC_OFF);
362 | 
363 |     return NGX_CONF_OK;
364 | }
365 | 
366 | 
367 | static ngx_int_t
368 | ngx_http_zstd_static_init(ngx_conf_t *cf)
369 | {
370 |     ngx_http_handler_pt        *h;
371 |     ngx_http_core_main_conf_t  *cmcf;
372 | 
373 |     cmcf = ngx_http_conf_get_module_main_conf(cf, ngx_http_core_module);
374 | 
375 |     h = ngx_array_push(&cmcf->phases[NGX_HTTP_CONTENT_PHASE].handlers);
376 |     if (h == NULL) {
377 |         return NGX_ERROR;
378 |     }
379 | 
380 |     *h = ngx_http_zstd_static_handler;
381 | 
382 |     return NGX_OK;
383 | }
384 | 


--------------------------------------------------------------------------------
/t/00-filter.t:
--------------------------------------------------------------------------------
1 | use Test::Nginx::Socket::Lua;
2 | 
3 | no_long_string();
4 | run_tests();
5 | 
6 | __DATA__
7 | 
8 | 
9 | 


--------------------------------------------------------------------------------
/t/01-static.t:
--------------------------------------------------------------------------------
  1 | use Test::Nginx::Socket;
  2 | use lib 'lib';
  3 | 
  4 | no_long_string();
  5 | log_level 'debug';
  6 | repeat_each(3);
  7 | plan tests => repeat_each() * ((blocks() - 3) * 5 + 3);
  8 | run_tests();
  9 | 
 10 | 
 11 | __DATA__
 12 | 
 13 | 
 14 | === TEST 1: zstd_static off
 15 | --- config
 16 |     location /test {
 17 |         zstd_static off;
 18 |         root ../../t/suite;
 19 |     }
 20 | --- request
 21 | GET /test
 22 | --- response_headers
 23 | Content-Length: 59738
 24 | ETag: "5be17d33-e95a"
 25 | !Content-Encoding
 26 | --- no_error_log
 27 | [error]
 28 | 
 29 | 
 30 | 
 31 | === TEST 2: zstd_static off (with accept-encoding header)
 32 | --- config
 33 |     location /test {
 34 |         zstd_static off;
 35 |         root ../../t/suite;
 36 |     }
 37 | --- request
 38 | GET /test
 39 | --- more_headers
 40 | Accept-Encoding: gzip,zstd
 41 | --- response_headers
 42 | Content-Length: 59738
 43 | ETag: "5be17d33-e95a"
 44 | !Content-Encoding
 45 | --- no_error_log
 46 | [error]
 47 | 
 48 | 
 49 | === TEST 3: zstd_static on
 50 | --- config
 51 |     location /test {
 52 |         zstd_static on;
 53 |         root ../../t/suite;
 54 |     }
 55 | --- request
 56 | GET /test
 57 | --- more_headers
 58 | Accept-Encoding: gzip, zstd
 59 | --- response_headers
 60 | Content-Length: 20706
 61 | ETag: "5be17d33-50e2"
 63 | Content-Encoding: zstd
 64 | --- no_error_log
 65 | [error]
 66 | 
 67 | 
 68 | 
 69 | === TEST 4: zstd_static on (without accept-encoding header)
 70 | --- config
 71 |     location /test {
 72 |         zstd_static on;
 73 |         root ../../t/suite;
 74 |     }
 75 | --- request
 76 | GET /test
 77 | --- response_headers
 78 | Content-Length: 59738
 79 | ETag: "5be17d33-e95a"
 81 | !Content-Encoding
 82 | --- no_error_log
 83 | [error]
 84 | 
 85 | 
 86 | 
 87 | === TEST 5: zstd_static on (without zstd component in accept-encoding header)
 88 | --- config
 89 |     location /test {
 90 |         zstd_static on;
 91 |         root ../../t/suite;
 92 |     }
 93 | --- request
 94 | GET /test
 95 | --- more_headers
 96 | Accept-Encoding: gzip, br
 97 | --- response_headers
 98 | Content-Length: 59738
 99 | ETag: "5be17d33-e95a"
100 | !Content-Encoding
101 | --- no_error_log
102 | [error]
103 | 
104 | 
105 | 
106 | === TEST 6: zstd_static always
107 | --- config
108 |     location /test {
109 |         zstd_static always;
110 |         root ../../t/suite;
111 |     }
112 | --- request
113 | GET /test
114 | --- more_headers
115 | Accept-Encoding: gzip, br
116 | --- response_headers
117 | Content-Length: 20706
118 | ETag: "5be17d33-50e2"
119 | Content-Encoding: zstd
120 | --- no_error_log
121 | [error]
122 | 
123 | 
124 | 
125 | === TEST 6: zstd_static always (without accept-encoding header)
126 | --- config
127 |     location /test {
128 |         zstd_static always;
129 |         root ../../t/suite;
130 |     }
131 | --- request
132 | GET /test
133 | --- response_headers
134 | Content-Length: 20706
135 | ETag: "5be17d33-50e2"
136 | Content-Encoding: zstd
137 | --- no_error_log
138 | [error]
139 | 
140 | 
141 | 
142 | === TEST 7: zstd_static always (without zstd component in accept-encoding header)
143 | --- config
144 |     location /test {
145 |         zstd_static always;
146 |         root ../../t/suite;
147 |     }
148 | --- request
149 | GET /test
150 | --- more_headers
151 | Accept-Encoding: gzip, br
152 | --- response_headers
153 | Content-Length: 20706
154 | ETag: "5be17d33-50e2"
155 | Content-Encoding: zstd
156 | --- no_error_log
157 | [error]
158 | 
159 | 
160 | === TEST 8: zstd_static always (file does not exist)
161 | --- config
162 |     location /test2 {
163 |         zstd_static always;
164 |         root ../../t/suite;
165 |     }
166 | --- request
167 | GET /test2
168 | --- more_headers
169 | Accept-Encoding: gzip, br
170 | --- error_code: 404
171 | 
172 | 
173 | 
174 | === TEST 9: zstd_static on (file does not exist)
175 | --- config
176 |     location /test2 {
177 |         zstd_static on;
178 |         root ../../t/suite;
179 |     }
180 | --- request
181 | GET /test2
182 | --- more_headers
183 | Accept-Encoding: gzip, br
184 | --- error_code: 404
185 | 
186 | 
187 | 
188 | === TEST 10: zstd_static off (file does not exist)
189 | --- config
190 |     location /test2 {
191 |         zstd_static off;
192 |         root ../../t/suite;
193 |     }
194 | --- request
195 | GET /test2
196 | --- more_headers
197 | Accept-Encoding: gzip, br
198 | --- error_code: 404
199 | 


--------------------------------------------------------------------------------
/t/suite/test:
--------------------------------------------------------------------------------
   1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
   2 | "http://www.w3.org/TR/html4/loose.dtd">
   3 | <html>
   4 | <head>
   5 | <meta http-equiv="content-type" content="text/html; charset=UTF-8">
   6 | <title>Regular Expression Matching Can Be Simple And Fast</title>
   7 | <style type="text/css"><!--
   8 | body {
   9 | 	background-color: white;
  10 | 	color: black;
  11 | 	font-family: serif;
  12 | 	font-size: medium;
  13 | 	line-height: 1.2em;
  14 | 	margin-left: 0.5in;
  15 | 	margin-right: 0.5in;
  16 | 	margin-top: 0;
  17 | 	margin-bottom: 0;
  18 | }
  19 | 
  20 | p.lp {
  21 | 	text-indent: 0in;
  22 | 	text-align: justify;
  23 | }
  24 | 
  25 | p.lp-left {
  26 | 	text-indent: 0in;
  27 | 	text-align: left;
  28 | }
  29 | 
  30 | p.tlp {
  31 | 	text-indent: 0in;
  32 | 	text-align: justify;
  33 | 	margin-top: 0.25in;
  34 | }
  35 | 
  36 | p.pp {
  37 | 	text-indent: 0.35in;
  38 | 	text-align: justify;
  39 | }
  40 | 
  41 | code {
  42 | 	font-family: monospace;
  43 | 	font-size: medium;
  44 | }
  45 | 
  46 | h2.sh {
  47 | 	text-indent: 0in;
  48 | 	text-align: left;
  49 | 	margin-top: 2em;
  50 | 	margin-bottom: 0.05in;
  51 | 	font-weight: bold;
  52 | 	font-size: medium
  53 | }
  54 | 
  55 | p.fig {
  56 | 	text-align: center;
  57 | }
  58 | 
  59 | div.fig {
  60 | 	text-align: center;
  61 | 	margin-left: -0.5in;
  62 | 	margin-right: -0.5in;
  63 | }
  64 | 
  65 | .box {
  66 | 	border-style: dashed;
  67 | 	border-width: 1px;
  68 | }
  69 | 
  70 | pre.p1 {
  71 | 	text-indent: 0in;
  72 | 	text-align: left;
  73 | 	line-height: 1.1em;
  74 | 	font-size: 0.9em;
  75 | 	margin-left: 0.5in;
  76 | 	margin-right: 0.5in;
  77 | 	margin-top: 0;
  78 | 	margin-bottom: 0;
  79 | }
  80 | 
  81 | h1.tl {
  82 | 	font-weight: bold;
  83 | 	font-size: medium;
  84 | 	text-align: center;
  85 | 	margin-top: 3em;
  86 | }
  87 | 
  88 | h2.au {
  89 | 	font-weight: normal;
  90 | 	font-size: medium;
  91 | 	text-align: center;
  92 | 	margin-top: 1.5em;
  93 | 	margin-bottom: 3em;
  94 | }
  95 | 
  96 | p.copy {
  97 | 	text-align: center;
  98 | 	text-indent: 0in;
  99 | 	margin-top: 3em;
 100 | 	margin-bottom: 3em;
 101 | 	font-size: small;
 102 | }
 103 | 
 104 | --></style>
 105 | </head>
 106 | <body>
 107 | 
 108 | <h1 class=tl>
 109 | Regular Expression Matching Can Be Simple And Fast
 110 | <br>
 111 | (but is slow in Java, Perl, PHP, Python, Ruby, ...)
 112 | </h1>
 113 | <h2 class=au>
 114 | <a href="http://swtch.com/~rsc/">Russ Cox</a>
 115 | <br>
 116 | <i>rsc@swtch.com</i>
 117 | <br>
 118 | January 2007
 119 | <br>
 120 | <a href="https://plus.google.com/116810148281701144465" rel="author"><IMG src="http://www.google.com/images/icons/ui/gprofile_button-16.png" WIDTH="16" HEIGHT="16"></a> <g:plusone size="small" annotation="none"></g:plusone>
 121 | </h2>
 122 | 
 123 | 
 124 | <h2 class=sh>Introduction</h2>
 125 | 
 126 | <p class=pp>
 127 | This is a tale of two approaches to regular expression matching.
 128 | One of them is in widespread use in the
 129 | standard interpreters for many languages, including Perl.
 130 | The other is used only in a few places, notably most implementations
 131 | of awk and grep.
 132 | The two approaches have wildly different
 133 | performance characteristics:
 134 | </p>
 135 | 
 136 | <div class=fig>
 137 | <center>
 138 | <table cellspacing=0 cellpadding=0 border=0>
 139 | <tr><td valign=bottom><img src=grep3p.png alt="Perl graph" width="301" height="148"><td width=20><td valign=bottom><img src=grep4p.png alt="Thompson NFA graph" width="301" height="148">
 140 | <tr><td height=10>
 141 | <tr><td colspan=3 align=center>
 142 | Time to match <code>a?</code><sup><i>n</i></sup><code>a</code><sup><i>n</i></sup> against <code>a</code><sup><i>n</i></sup>
 143 | </table>
 144 | </center>
 145 | </div>
 146 | 
 147 | <p class=lp>
 148 | Let's use superscripts to denote string repetition,
 149 | so that 
 150 | <code>a?<sup>3</sup>a<sup>3</sup></code>
 151 | is shorthand for
 152 | <code>a?a?a?aaa</code>.
 153 | The two graphs plot the time required by each approach
 154 | to match the regular expression 
 155 | <code>a?</code><sup><i>n</i></sup><code>a</code><sup><i>n</i></sup>
 156 | against the string <code>a</code><sup><i>n</i></sup>.
 157 | </p>
 158 | 
 159 | <p class=pp>
 160 | Notice that Perl requires over sixty seconds to match
 161 | a 29-character string.
 162 | The other approach, labeled Thompson NFA for
 163 | reasons that will be explained later,
 164 | requires twenty <i>microseconds</i> to match the string.
 165 | That's not a typo.  The Perl graph plots time in seconds,
 166 | while the Thompson NFA graph plots time in microseconds:
 167 | the Thompson NFA implementation
 168 | is a million times faster than Perl
 169 | when running on a miniscule 29-character string.
 170 | The trends shown in the graph continue: the
 171 | Thompson NFA handles a 100-character string in under 200 microseconds,
 172 | while Perl would require over 10<sup>15</sup> years.
 173 | (Perl is only the most conspicuous example of a large
 174 | number of popular programs that use the same algorithm;
 175 | the above graph could have been Python, or PHP, or Ruby,
 176 | or many other languages.  A more detailed
 177 | graph later in this article presents data for other implementations.)
 178 | </p>
 179 | 
 180 | <p class=pp>
 181 | It may be hard to believe the graphs: perhaps you've used Perl,
 182 | and it never seemed like regular expression matching was
 183 | particularly slow.
 184 | Most of the time, in fact, regular expression matching in Perl
 185 | is fast enough.  
 186 | As the graph shows, though, it is possible
 187 | to write so-called &ldquo;pathological&rdquo; regular expressions that
 188 | Perl matches very <i>very</i> slowly.
 189 | In contrast, there are no regular expressions that are 
 190 | pathological for the Thompson NFA implementation.
 191 | Seeing the two graphs side by side prompts the question, 
 192 | &ldquo;why doesn't Perl use the Thompson NFA approach?&rdquo;
 193 | It can, it should, and that's what the rest of this article is about.
 194 | </p>
 195 | 
 196 | <p class=pp>
 197 | Historically, regular expressions are one of computer science's
 198 | shining examples of how using good theory leads to good programs.
 199 | They were originally developed by theorists as a
 200 | simple computational model,
 201 | but Ken Thompson introduced them to
 202 | programmers in his implementation of the text editor QED
 203 | for CTSS.
 204 | Dennis Ritchie followed suit in his own implementation
 205 | of QED, for GE-TSS.
 206 | Thompson and Ritchie would go on to create Unix,
 207 | and they brought regular expressions with them.
 208 | By the late 1970s, regular expressions were a key
 209 | feature of the Unix landscape, in tools such as
 210 | ed, sed, grep, egrep, awk, and lex.
 211 | </p>
 212 | 
 213 | <p class=pp>
 214 | Today, regular expressions have also become a shining
 215 | example of how ignoring good theory leads to bad programs.
 216 | The regular expression implementations used by
 217 | today's popular tools are significantly slower
 218 | than the ones used in many of those thirty-year-old Unix tools.
 219 | </p>
 220 | 
 221 | <p class=pp>
 222 | This article reviews the good theory: 
 223 | regular expressions, finite automata, 
 224 | and a regular expression search algorithm
 225 | invented by Ken Thompson in the mid-1960s.
 226 | It also puts the theory into practice, describing 
 227 | a simple implementation of Thompson's algorithm.
 228 | That implementation, less than 400 lines of C,
 229 | is the one that went head to head with Perl above.
 230 | It outperforms the more complex real-world
 231 | implementations used by
 232 | Perl, Python, PCRE, and others.
 233 | The article concludes with a discussion of how 
 234 | theory might yet be converted into practice
 235 | in the real-world implementations.
 236 | </p>
 237 | 
 238 | <h2 class=sh>
 239 | Regular Expressions
 240 | </h2>
 241 | 
 242 | 
 243 | <p class=pp>
 244 | Regular expressions are a notation for
 245 | describing sets of character strings.
 246 | When a particular string is in the set
 247 | described by a regular expression,
 248 | we often say that the regular expression
 249 | <i>matches</i>
 250 | the string.
 251 | </p>
 252 | 
 253 | <p class=pp>
 254 | The simplest regular expression is a single literal character.
 255 | Except for the special metacharacters 
 256 | <code>*+?()|</code>,
 257 | characters match themselves.
 258 | To match a metacharacter, escape it with
 259 | a backslash:
 260 | <code>\+</code>
 261 | matches a literal plus character.
 262 | </p>
 263 | 
 264 | <p class=pp>
 265 | Two regular expressions can be alternated or concatenated to form a new
 266 | regular expression:
 267 | if <i>e</i><sub>1</sub> matches
 268 | <i>s</i>
 269 | and <i>e</i><sub>2</sub> matches
 270 | <i>t</i>,
 271 | then <i>e</i><sub>1</sub><code>|</code><i>e</i><sub>2</sub> matches
 272 | <i>s</i>
 273 | or
 274 | <i>t</i>,
 275 | and
 276 | <i>e</i><sub>1</sub><i>e</i><sub>2</sub>
 277 | matches 
 278 | <i>st</i>.
 279 | </p>
 280 | 
 281 | <p class=pp>
 282 | The metacharacters
 283 | <code>*</code>,
 284 | <code>+</code>,
 285 | and
 286 | <code>?</code>
 287 | are repetition operators:
 288 | <i>e</i><sub>1</sub><code>*</code>
 289 | matches a sequence of zero or more (possibly different)
 290 | strings, each of which match <i>e</i><sub>1</sub>;
 291 | <i>e</i><sub>1</sub><code>+</code>
 292 | matches one or more; 
 293 | <i>e</i><sub>1</sub><code>?</code>
 294 | matches zero or one.
 295 | </p>
 296 | 
 297 | <p class=pp>
 298 | The operator precedence, from weakest to strongest binding, is
 299 | first alternation, then concatenation, and finally the
 300 | repetition operators.
 301 | Explicit parentheses can be used to force different meanings,
 302 | just as in arithmetic expressions.
 303 | Some examples:
 304 | <code>ab|cd</code>
 305 | is equivalent to
 306 | <code>(ab)|(cd)</code>;
 307 | <code>ab*</code>
 308 | is equivalent to
 309 | <code>a(b*)</code>.
 310 | </p>
 311 | 
 312 | <p class=pp>
 313 | The syntax described so far is a subset of the traditional Unix
 314 | egrep
 315 | regular expression syntax.
 316 | This subset suffices to describe all regular
 317 | languages: loosely speaking, a regular language is a set
 318 | of strings that can be matched in a single pass through
 319 | the text using only a fixed amount of memory.
 320 | Newer regular expression facilities (notably Perl and
 321 | those that have copied it) have added 
 322 | <a href="http://www.perl.com/doc/manual/html/pod/perlre.html">many new operators
 323 | and escape sequences</a>.  These additions make the regular
 324 | expressions more concise, and sometimes more cryptic, but usually
 325 | not more powerful:
 326 | these fancy new regular expressions almost always have longer
 327 | equivalents using the traditional syntax.
 328 | </p>
 329 | 
 330 | <p class=pp>
 331 | One common regular expression extension that 
 332 | does provide additional power is called
 333 | <i>backreferences</i>.
 334 | A backreference like
 335 | <code>\1</code>
 336 | or
 337 | <code>\2</code>
 338 | matches the string matched
 339 | by a previous parenthesized expression, and only that string:
 340 | <code>(cat|dog)\1</code>
 341 | matches
 342 | <code>catcat</code>
 343 | and
 344 | <code>dogdog</code>
 345 | but not
 346 | <code>catdog</code>
 347 | nor
 348 | <code>dogcat</code>.
 349 | As far as the theoretical term is concerned,
 350 | regular expressions with backreferences
 351 | are not regular expressions.
 352 | The power that backreferences add comes at great cost:
 353 | in the worst case, the best known implementations require
 354 | exponential search algorithms,
 355 | like the one Perl uses.
 356 | Perl (and the other languages)
 357 | could not now remove backreference support,
 358 | of course, but they could employ much faster algorithms
 359 | when presented with regular expressions that don't have
 360 | backreferences, like the ones considered above.
 361 | This article is about those faster algorithms.
 362 | </p>
 363 | 
 364 | <h2 class=sh>
 365 | Finite Automata
 366 | </h2>
 367 | 
 368 | 
 369 | 
 370 | <p class=pp>
 371 | Another way to describe sets of character strings is with
 372 | finite automata.
 373 | Finite automata are also known as state machines,
 374 | and we will use &ldquo;automaton&rdquo; and &ldquo;machine&rdquo; interchangeably.
 375 | </p>
 376 | 
 377 | <p class=pp>
 378 | As a simple example, here is a machine recognizing
 379 | the set of strings matched by the regular expression
 380 | <code>a(bb)+a</code>:
 381 | </p>
 382 | 
 383 | <p class=fig><img src=fig0.png alt="DFA for a(bb)+a" width="278" height="54"></p>
 384 | 
 385 | <p class=pp>
 386 | A finite automaton is always in one of its states,
 387 | represented in the diagram by circles.
 388 | (The numbers inside the circles are labels to make this
 389 | discussion easier; they are not part of the machine's operation.)
 390 | As it reads the string, it switches from state to state.
 391 | This machine has two special states: the start state <i>s</i><sub>0</sub>
 392 | and the matching state <i>s</i><sub>4</sub>.
 393 | Start states are depicted with lone arrowheads pointing at them,
 394 | and matching states are drawn as a double circle.
 395 | </p>
 396 | 
 397 | <p class=pp>
 398 | The machine reads an input string one character at a time,
 399 | following arrows corresponding to the input to move from
 400 | state to state.
 401 | Suppose the input string is
 402 | <code>abbbba</code>.
 403 | When the machine reads the first letter of the string, the
 404 | <code>a</code>,
 405 | it is in the start state <i>s</i><sub>0</sub>.  It follows the
 406 | <code>a</code>
 407 | arrow to state <i>s</i><sub>1</sub>.
 408 | This process repeats as the machine reads the rest of the string:
 409 | <code>b</code>
 410 | to
 411 | <code><i>s</i><sub>2</sub></code>,
 412 | <code>b</code>
 413 | to
 414 | <code><i>s</i><sub>3</sub></code>,
 415 | <code>b</code>
 416 | to
 417 | <code><i>s</i><sub>2</sub></code>,
 418 | <code>b</code>
 419 | to
 420 | <code><i>s</i><sub>3</sub></code>,
 421 | and finally
 422 | <code>a</code>
 423 | to
 424 | <code><i>s</i><sub>4</sub></code>.
 425 | </p>
 426 | <p class=fig><img src=fig1.png alt="DFA execution on abbbba" width="357" height="426"></p>
 427 | <p class=lp>
 428 | The machine ends in <i>s</i><sub>4</sub>, a matching state, so it
 429 | matches the string.
 430 | If the machine ends in a non-matching state, it does not 
 431 | match the string.
 432 | If, at any point during the machine's execution, there is no
 433 | arrow for it to follow corresponding to the current
 434 | input character, the machine stops executing early.
 435 | </p>
 436 | 
 437 | <p class=pp>
 438 | The machine we have been considering is called a
 439 | <i>deterministic</i>
 440 | finite automaton (DFA),
 441 | because in any state, each possible input letter
 442 | leads to at most one new state.
 443 | We can also create machines
 444 | that must choose between multiple possible next states.
 445 | For example, this machine is equivalent to the previous
 446 | one but is not deterministic:
 447 | </p>
 448 | <p class=fig><img src=fig2.png alt="NFA for a(bb)+a" width="278" height="54"></p>
 449 | <p class=lp>
 450 | The machine is not deterministic because if it reads a
 451 | <code>b</code>
 452 | in state <i>s</i><sub>2</sub>, it has multiple choices for the next state:
 453 | it can go back to <i>s</i><sub>1</sub> in hopes of seeing another
 454 | <code>bb</code>,
 455 | or it can go on to <i>s</i><sub>3</sub> in hopes of seeing the final
 456 | <code>a</code>.
 457 | Since the machine cannot peek ahead to see the rest of
 458 | the string, it has no way to know which is the correct decision.
 459 | In this situation, it turns out to be interesting to
 460 | let the machine
 461 | <i>always guess correctly</i>.
 462 | Such machines are called non-deterministic finite automata
 463 | (NFAs or NDFAs).
 464 | An NFA matches an input string if there is some way 
 465 | it can read the string and follow arrows to a matching state.
 466 | </p>
 467 | 
 468 | <p class=pp>
 469 | Sometimes it is convenient to let NFAs have arrows with no
 470 | corresponding input character.  We will leave these arrows unlabeled.
 471 | An NFA can, at any time, choose to follow an unlabeled arrow
 472 | without reading any input.
 473 | This NFA is equivalent to the previous two, but the unlabeled arrow
 474 | makes the correspondence with
 475 | <code>a(bb)+a</code>
 476 | clearest:
 477 | </p>
 478 | <p class=fig><img src=fig3.png alt="Another NFA for a(bb)+a" width="278" height="39"></p>
 479 | 
 480 | <h2 class=sh>
 481 | Converting Regular Expressions to NFAs
 482 | </h2>
 483 | 
 484 | <p class=pp>
 485 | Regular expressions and NFAs turn out to be exactly
 486 | equivalent in power: every regular expression has an
 487 | equivalent NFA (they match the same strings) and vice versa.
 488 | (It turns out that DFAs are also equivalent in power 
 489 | to NFAs and regular expressions; we will see this later.)
 490 | There are multiple ways to translate regular expressions into NFAs.
 491 | The method described here was first described by Thompson
 492 | in his 1968 CACM paper.
 493 | </p>
 494 | 
 495 | <p class=pp>
 496 | The NFA for a regular expression is built up from partial NFAs
 497 | for each subexpression, with a different construction for
 498 | each operator.  The partial NFAs have
 499 | no matching states: instead they have one or more dangling arrows,
 500 | pointing to nothing.  The construction process will finish by
 501 | connecting these arrows to a matching state.
 502 | </p>
 503 | 
 504 | <p class=pp>
 505 | The NFAs for matching single characters look like:
 506 | </p>
 507 | <p class=fig><img src=fig4.png alt="Single-character NFA" width="113" height="21"></p>
 508 | <p class=lp>
 509 | The NFA for the concatenation <i>e</i><sub>1</sub><i>e</i><sub>2</sub>
 510 | connects the final arrow of the <i>e</i><sub>1</sub> 
 511 | machine to the start of the <i>e</i><sub>2</sub> machine:
 512 | </p>
 513 | <p class=fig><img src=fig5.png alt="Concatenation NFA" width="242" height="20"></p>
 514 | <p class=lp>
 515 | The NFA for the alternation <i>e</i><sub>1</sub><code>|</code><i>e</i><sub>2</sub>
 516 | adds a new start state with a choice of either the
 517 | <i>e</i><sub>1</sub> machine or the <i>e</i><sub>2</sub> machine.
 518 | </p>
 519 | <p class=fig><img src=fig6.png alt="Alternation NFA" width="202" height="62"></p>
 520 | <p class=lp>
 521 | The NFA for <i>e</i><code>?</code> alternates the <i>e</i> machine with an empty path:
 522 | </p>
 523 | <p class=fig><img src=fig7.png alt="Zero or one NFA" width="184" height="56"></p>
 524 | <p class=lp>
 525 | The NFA for <i>e</i><code>*</code> uses the same alternation but loops a 
 526 | matching <i>e</i> machine back to the start:
 527 | </p>
 528 | <p class=fig><img src=fig8.png alt="Zero or more NFA" width="184" height="56"></p>
 529 | <p class=lp>
 530 | The NFA for <i>e</i><code>+</code> also creates a loop, but one that
 531 | requires passing through <i>e</i> at least once:
 532 | </p>
 533 | <p class=fig><img src=fig9.png alt="One or more NFA" width="190" height="41"></p>
 534 | 
 535 | <p class=pp>
 536 | Counting the new states in the diagrams above, we can see
 537 | that this technique creates exactly one state per character
 538 | or metacharacter in the regular expression,
 539 | excluding parentheses.
 540 | Therefore the number of states in the final NFA is at most
 541 | equal to the length of the original regular expression.
 542 | </p>
 543 | 
 544 | <p class=pp>
 545 | Just as with the example NFA discussed earlier, it is always possible
 546 | to remove the unlabeled arrows, and it is also always possible to generate
 547 | the NFA without the unlabeled arrows in the first place.
 548 | Having the unlabeled arrows makes the NFA easier for us to read
 549 | and understand, and they also make the C representation
 550 | simpler, so we will keep them.
 551 | </p>
 552 | 
 553 | <h2 class=sh>
 554 | Regular Expression Search Algorithms
 555 | </h2>
 556 | 
 557 | <p class=pp>
 558 | Now we have a way to test whether a regular expression
 559 | matches a string: convert the regular expression to an NFA
 560 | and then run the NFA using the string as input.
 561 | Remember that NFAs are endowed with the ability to guess
 562 | perfectly when faced with a choice of next state:
 563 | to run the NFA using an ordinary computer, we must find
 564 | a way to simulate this guessing.
 565 | </p>
 566 | 
 567 | <p class=pp>
 568 | One way to simulate perfect guessing is to guess
 569 | one option, and if that doesn't work, try the other.
 570 | For example, consider the NFA for
 571 | <code>abab|abbb</code>
 572 | run on the string
 573 | <code>abbb</code>:
 574 | </p>
 575 | <p class=fig><img src=fig10.png alt="NFA for abab|abbb" width="364" height="79"></p>
 576 | <p class=fig><img src=fig11.png alt="Backtracking execution on abbb" width="729" height="619"></p>
 577 | <p class=lp>
 578 | At step 0, the NFA must make a choice: try to match
 579 | <code>abab</code>
 580 | or
 581 | try to match
 582 | <code>abbb</code>?
 583 | In the diagram, the NFA tries
 584 | <code>abab</code>,
 585 | but that fails after step 3.
 586 | The NFA then tries the other choice, leading to step 4 and eventually a match.
 587 | This backtracking approach
 588 | has a simple recursive implementation
 589 | but can read the input string many times
 590 | before succeeding.
 591 | If the string does not match,
 592 | the machine must try
 593 | <i>all</i>
 594 | possible execution paths before
 595 | giving up.
 596 | The NFA tried only two different paths in the example,
 597 | but in the worst case, there can be exponentially
 598 | many possible execution paths, leading to very slow run times.
 599 | </p>
 600 | 
 601 | <p class=pp>
 602 | A more efficient but more complicated way to simulate perfect
 603 | guessing is to guess both options simultaneously. 
 604 | In this approach, the simulation allows the machine
 605 | to be in multiple states at once.  To process each letter,
 606 | it advances all the states along all the arrows that
 607 | match the letter.
 608 | </p>
 609 | <p class=fig><img src=fig12.png alt="Parallel execution on abbb" width="329" height="511"></p>
 610 | <p class=lp>
 611 | The machine starts in the start state and all the states
 612 | reachable from the start state by unlabeled arrows.
 613 | In steps 1 and 2, the NFA is in two states simultaneously.
 614 | Only at step 3 does the state set narrow down to a single state.
 615 | This multi-state approach tries both paths at the same time,
 616 | reading the input only once.
 617 | In the worst case, the NFA might be in
 618 | <i>every</i>
 619 | state at each step, but this results in at worst a constant amount
 620 | of work independent of the length of the string,
 621 | so arbitrarily
 622 | large input strings can be processed in linear time.
 623 | This is a dramatic improvement over the exponential time
 624 | required by the backtracking approach.
 625 | The efficiency comes from tracking the set of reachable
 626 | states but
 627 | <i>not</i>
 628 | which paths were used to reach them.
 629 | In an NFA with 
 630 | <i>n</i>
 631 | nodes, there can only be 
 632 | <i>n</i>
 633 | reachable states at any step, but there might be
 634 | 2<sup><i>n</i></sup> paths through the NFA.
 635 | </p>
 636 | 
 637 | <h2 class=sh>
 638 | Implementation
 639 | </h2>
 640 | 
 641 | <p class=pp>
 642 | Thompson introduced the multiple-state simulation approach
 643 | in his 1968 paper.
 644 | In his formulation, the states of the NFA were represented
 645 | by small machine-code sequences, and the list of possible states
 646 | was just a sequence of function call instructions.
 647 | In essence, Thompson compiled the regular expression into clever
 648 | machine code.
 649 | Forty years later, computers are much faster and the 
 650 | machine code approach is not as necessary.
 651 | The following sections
 652 | present an implementation written in portable ANSI C.
 653 | The full source code (under 400 lines)
 654 | and the benchmarking scripts are 
 655 | <a href="http://swtch.com/~rsc/regexp/">available online</a>.
 656 | (Readers who are unfamiliar or uncomfortable with C or pointers should
 657 | feel free to read the descriptions and skip over the actual code.)
 658 | </p>
 659 | 
 660 | <h2 class=sh id="compiling">
 661 | Implementation: Compiling to NFA
 662 | </h2>
 663 | 
 664 | <p class=pp>
 665 | The first step is to compile the regular expression
 666 | into an equivalent NFA.
 667 | In our C program, we will represent an NFA as a
 668 | linked collection of 
 669 | <code>State</code>
 670 | structures:
 671 | </p>
 672 | <pre class=p1>
 673 | struct State
 674 | {
 675 | 	int c;
 676 | 	State *out;
 677 | 	State *out1;
 678 | 	int lastlist;
 679 | };
 680 | </pre><p class=lp>
 681 | Each
 682 | <code>State</code>
 683 | represents one of the following three NFA fragments,
 684 | depending on the value of
 685 | <code>c</code>.
 686 | </p>
 687 | <p class=fig><img src=fig13.png alt="Possible per-State NFA fragments" width="340" height="109"></p>
 688 | <p class=lp>
 689 | (<code>Lastlist</code>
 690 | is used during execution and is explained in the next section.)
 691 | </p>
 692 | 
 693 | <p class=pp>
 694 | Following Thompson's paper,
 695 | the compiler builds an NFA from a regular expression in
 696 | <i>postfix</i>
 697 | notation with dot
 698 | (<code>.</code>) added
 699 | as an explicit concatenation operator.
 700 | A separate function
 701 | <code>re2post</code>
 702 | rewrites infix regular expressions like
 703 | &ldquo;<code>a(bb)+a</code>&rdquo;
 704 | into equivalent postfix expressions like
 705 | &ldquo;<code>abb.+.a.</code>&rdquo;.
 706 | (A &ldquo;real&rdquo; implementation would certainly
 707 | need to use dot as the &ldquo;any character&rdquo; metacharacter
 708 | rather than as a concatenation operator.
 709 | A real implementation would also probably build the 
 710 | NFA during parsing rather than build an explicit postfix expression.
 711 | However, the postfix version is convenient and follows 
 712 | Thompson's paper more closely.)
 713 | </p>
 714 | 
 715 | <p class=pp>
 716 | As the compiler scans the postfix expression, it maintains
 717 | a stack of computed NFA fragments.
 718 | Literals push new NFA fragments onto the stack, while
 719 | operators pop fragments off the stack and then
 720 | push a new fragment.
 721 | For example, 
 722 | after compiling the
 723 | <code>abb</code> in <code>abb.+.a.</code>,
 724 | the stack contains NFA fragments for
 725 | <code>a</code>,
 726 | <code>b</code>,
 727 | and
 728 | <code>b</code>.
 729 | The compilation of the
 730 | <code>.</code>
 731 | that follows pops the two
 732 | <code>b</code>
 733 | NFA fragments from the stack and pushes an NFA fragment for the
 734 | concatenation
 735 | <code>bb.</code>.
 736 | Each NFA fragment is defined by its start state and its
 737 | outgoing arrows:
 738 | </p><pre class=p1>
 739 | struct Frag
 740 | {
 741 | 	State *start;
 742 | 	Ptrlist *out;
 743 | };
 744 | </pre><p class=lp>
 745 | <code>Start</code>
 746 | points at the start state for the fragment,
 747 | and
 748 | <code>out</code>
 749 | is a list of pointers to 
 750 | <code>State*</code>
 751 | pointers that are not yet connected to anything.
 752 | These are the dangling arrows in the NFA fragment.
 753 | </p>
 754 | 
 755 | <p class=pp>
 756 | Some helper functions manipulate pointer lists:
 757 | </p><pre class=p1>
 758 | Ptrlist *list1(State **outp);
 759 | Ptrlist *append(Ptrlist *l1, Ptrlist *l2);
 760 | 
 761 | void patch(Ptrlist *l, State *s);
 762 | </pre><p class=lp>
 763 | <code>List1</code>
 764 | creates a new pointer list containing the single pointer
 765 | <code>outp</code>.
 766 | <code>Append</code>
 767 | concatenates two pointer lists, returning the result.
 768 | <code>Patch</code>
 769 | connects the dangling arrows in the pointer list
 770 | <code>l</code>
 771 | to the state
 772 | <code>s</code>:
 773 | it sets
 774 | <code>*outp</code>
 775 | <code>=</code>
 776 | <code>s</code>
 777 | for each pointer
 778 | <code>outp</code>
 779 | in
 780 | <code>l</code>.
 781 | </p>
 782 | 
 783 | <p class=pp>
 784 | Given these primitives and a fragment stack,
 785 | the compiler is a simple loop over the postfix expression.
 786 | At the end, there is a single fragment left:
 787 | patching in a matching state completes the NFA.
 788 | </p><pre class=p1>
 789 | State*
 790 | post2nfa(char *postfix)
 791 | {
 792 | 	char *p;
 793 | 	Frag stack[1000], *stackp, e1, e2, e;
 794 | 	State *s;
 795 | 
 796 | 	#define push(s) *stackp++ = s
 797 | 	#define pop()   *--stackp
 798 | 
 799 | 	stackp = stack;
 800 | 	for(p=postfix; *p; p++){
 801 | 		switch(*p){
 802 | 		/* <i>compilation cases, described below</i> */
 803 | 		}
 804 | 	}
 805 | 	
 806 | 	e = pop();
 807 | 	patch(e.out, matchstate);
 808 | 	return e.start;
 809 | }
 810 | </pre><p class=lp><a id="compile"></a>
 811 | The specific compilation cases mimic the translation 
 812 | steps described earlier.
 813 | </p>
 814 | 
 815 | <table cellspacing=0 cellpadding=0 border=0>
 816 | <tr><td><p class=tlp>
 817 | Literal characters:
 818 | </p><pre class=p1>
 819 | default:
 820 | 	s = state(*p, NULL, NULL);
 821 | 	push(frag(s, list1(&amp;s-&gt;out)));
 822 | 	break;
 823 | </pre>
 824 | <td><img src=fig14.png alt="" width="61" height="24">
 825 | 
 826 | <tr><td><p class=tlp>
 827 | Catenation:
 828 | </p><pre class=p1>
 829 | case '.':
 830 | 	e2 = pop();
 831 | 	e1 = pop();
 832 | 	patch(e1.out, e2.start);
 833 | 	push(frag(e1.start, e2.out));
 834 | 	break;
 835 | </pre>
 836 | <td><img src=fig15.png alt="" width="182" height="20">
 837 | 
 838 | <tr><td><p class=tlp>
 839 | Alternation:
 840 | </p><pre class=p1>
 841 | case '|':
 842 | 	e2 = pop();
 843 | 	e1 = pop();
 844 | 	s = state(Split, e1.start, e2.start);
 845 | 	push(frag(s, append(e1.out, e2.out)));
 846 | 	break;
 847 | </pre>
 848 | <td><img src=fig16.png alt="" width="140" height="62">
 849 | 
 850 | <tr><td><p class=tlp>
 851 | Zero or one:
 852 | </p><pre class=p1>
 853 | case '?':
 854 | 	e = pop();
 855 | 	s = state(Split, e.start, NULL);
 856 | 	push(frag(s, append(e.out, list1(&amp;s-&gt;out1))));
 857 | 	break;
 858 | </pre>
 859 | <td><img src=fig17.png alt="" width="140" height="68">
 860 | 
 861 | <tr><td><p class=tlp>
 862 | Zero or more:
 863 | </p><pre class=p1>
 864 | case '*':
 865 | 	e = pop();
 866 | 	s = state(Split, e.start, NULL);
 867 | 	patch(e.out, s);
 868 | 	push(frag(s, list1(&amp;s-&gt;out1)));
 869 | 	break;
 870 | </pre>
 871 | <td><img src=fig18.png alt="" width="131" height="68">
 872 | 
 873 | <tr><td><p class=tlp>
 874 | One or more:
 875 | </p><pre class=p1>
 876 | case '+':
 877 | 	e = pop();
 878 | 	s = state(Split, e.start, NULL);
 879 | 	patch(e.out, s);
 880 | 	push(frag(e.start, list1(&amp;s-&gt;out1)));
 881 | 	break;
 882 | </pre>
 883 | <td><img src=fig19.png alt="" width="140" height="53">
 884 | </table>
 885 | 
 886 | <h2 class=sh>
 887 | Implementation: Simulating the NFA
 888 | </h2>
 889 | 
 890 | <p class=pp>
 891 | Now that the NFA has been built, we need to simulate it.
 892 | The simulation requires tracking 
 893 | <code>State</code>
 894 | sets, which are stored as a simple array list:
 895 | </p><pre class=p1>
 896 | struct List
 897 | {
 898 | 	State **s;
 899 | 	int n;
 900 | };
 901 | </pre><p class=lp>
 902 | The simulation uses two lists:
 903 | <code>clist</code>
 904 | is the current set of states that the NFA is in,
 905 | and
 906 | <code>nlist</code>
 907 | is the next set of states that the NFA will be in,
 908 | after processing the current character.
 909 | The execution loop initializes
 910 | <code>clist</code>
 911 | to contain just the start state and then
 912 | runs the machine one step at a time.
 913 | </p><pre class=p1>
 914 | int
 915 | match(State *start, char *s)
 916 | {
 917 | 	List *clist, *nlist, *t;
 918 | 
 919 | 	/* l1 and l2 are preallocated globals */
 920 | 	clist = startlist(start, &amp;l1);
 921 | 	nlist = &amp;l2;
 922 | 	for(; *s; s++){
 923 | 		step(clist, *s, nlist);
 924 | 		t = clist; clist = nlist; nlist = t;	/* swap clist, nlist */
 925 | 	}
 926 | 	return ismatch(clist);
 927 | }
 928 | </pre><p class=lp>
 929 | To avoid allocating on every iteration of the loop,
 930 | <code>match</code>
 931 | uses two preallocated lists
 932 | <code>l1</code>
 933 | and
 934 | <code>l2</code>
 935 | as
 936 | <code>clist</code>
 937 | and
 938 | <code>nlist</code>,
 939 | swapping the two after each step.
 940 | </p>
 941 | 
 942 | <p class=pp>
 943 | If the final state list contains the matching state,
 944 | then the string matches.
 945 | </p><pre class=p1>
 946 | int
 947 | ismatch(List *l)
 948 | {
 949 | 	int i;
 950 | 
 951 | 	for(i=0; i&lt;l-&gt;n; i++)
 952 | 		if(l-&gt;s[i] == matchstate)
 953 | 			return 1;
 954 | 	return 0;
 955 | }
 956 | </pre><p class=lp>
 957 | </p>
 958 | 
 959 | <p class=pp>
 960 | <code>Addstate</code>
 961 | adds a state to the list,
 962 | but not if it is already on the list.
 963 | Scanning the entire list for each add would be inefficient;
 964 | instead the variable
 965 | <code>listid</code>
 966 | acts as a list generation number.
 967 | When
 968 | <code>addstate</code>
 969 | adds
 970 | <code>s</code>
 971 | to a list,
 972 | it records
 973 | <code>listid</code>
 974 | in
 975 | <code>s->lastlist</code>.
 976 | If the two are already equal,
 977 | then 
 978 | <code>s</code>
 979 | is already on the list being built.
 980 | <code>Addstate</code>
 981 | also follows unlabeled arrows:
 982 | if 
 983 | <code>s</code>
 984 | is a
 985 | <code>Split</code>
 986 | state with two unlabeled arrows to new states,
 987 | <code>addstate</code>
 988 | adds those states to the list instead of
 989 | <code>s</code>.
 990 | </p><pre class=p1>
 991 | void
 992 | addstate(List *l, State *s)
 993 | {
 994 | 	if(s == NULL || s-&gt;lastlist == listid)
 995 | 		return;
 996 | 	s-&gt;lastlist = listid;
 997 | 	if(s-&gt;c == Split){
 998 | 		/* follow unlabeled arrows */
 999 | 		addstate(l, s-&gt;out);
1000 | 		addstate(l, s-&gt;out1);
1001 | 		return;
1002 | 	}
1003 | 	l-&gt;s[l-&gt;n++] = s;
1004 | }
1005 | </pre><p class=lp>
1006 | </p>
1007 | 
1008 | <p class=pp>
1009 | <code>Startlist</code>
1010 | creates an initial state list by adding just the start state:
1011 | </p><pre class=p1>
1012 | List*
1013 | startlist(State *s, List *l)
1014 | {
1015 | 	listid++;
1016 | 	l-&gt;n = 0;
1017 | 	addstate(l, s);
1018 | 	return l;
1019 | }
1020 | </pre><p class=lp>
1021 | </p>
1022 | 
1023 | <p class=pp>
1024 | Finally,
1025 | <code>step</code>
1026 | advances the NFA past a single character, using
1027 | the current list
1028 | <code>clist</code>
1029 | to compute the next list
1030 | <code>nlist</code>.
1031 | </p><pre class=p1>
1032 | void
1033 | step(List *clist, int c, List *nlist)
1034 | {
1035 | 	int i;
1036 | 	State *s;
1037 | 
1038 | 	listid++;
1039 | 	nlist-&gt;n = 0;
1040 | 	for(i=0; i&lt;clist-&gt;n; i++){
1041 | 		s = clist-&gt;s[i];
1042 | 		if(s-&gt;c == c)
1043 | 			addstate(nlist, s-&gt;out);
1044 | 	}
1045 | }
1046 | </pre>
1047 | 
1048 | <h2 class=sh>
1049 | Performance
1050 | </h2>
1051 | 
1052 | <p class=pp>
1053 | The C implementation just described was not written with performance in mind.
1054 | Even so, a slow implementation of a linear-time algorithm
1055 | can easily outperform a fast implementation of an 
1056 | exponential-time algorithm once the exponent is large enough.
1057 | Testing a variety of popular regular expression engines on 
1058 | a so-called pathological regular expression demonstrates this nicely.
1059 | </p>
1060 | 
1061 | <p class=pp>
1062 | Consider the regular expression
1063 | <code>a?<sup><i>n</i></sup>a<sup><i>n</i></sup></code>.
1064 | It matches the string
1065 | <code>a<sup><i>n</i></sup></code>
1066 | when the
1067 | <code>a?</code>
1068 | are chosen not to match any letters,
1069 | leaving the entire string to be matched by the
1070 | <code>a<sup><i>n</i></sup></code>.
1071 | Backtracking regular expression implementations
1072 | implement the zero-or-one
1073 | <code>?</code>
1074 | by first trying one and then zero.
1075 | There are
1076 | <i>n</i>
1077 | such choices to make, a total of
1078 | 2<sup><i>n</i></sup> possibilities.
1079 | Only the very last
1080 | possibility&mdash;choosing zero for all the <code>?</code>&mdash;will lead to a match.
1081 | The backtracking approach thus requires
1082 | <i>O</i>(2<sup><i>n</i></sup>) time, so it will not scale much beyond <i>n</i>=25.
1083 | </p>
1084 | 
1085 | <p class=pp>
1086 | In contrast, Thompson's algorithm maintains state lists of length
1087 | approximately <i>n</i> and processes the string, also of length <i>n</i>,
1088 | for a total of <i>O</i>(<i>n</i><sup>2</sup>) time.
1089 | (The run time is superlinear,
1090 | because we are not keeping the regular expression constant
1091 | as the input grows.
1092 | For a regular expression of length <i>m</i> run on text of length <i>n</i>,
1093 | the Thompson NFA requires <i>O</i>(<i>mn</i>) time.)
1094 | </p>
1095 | 
1096 | <p class=pp>
1097 | The following graph plots time required to check whether
1098 | <code>a?<sup><i>n</i></sup>a<sup><i>n</i></sup></code>
1099 | matches
1100 | <code>a<sup><i>n</i></sup></code>:
1101 | </p>
1102 | 
1103 | <div class=fig>
1104 | <center>
1105 | <table cellspacing=0 cellpadding=0 border=0><tr><td>
1106 | <div class=box>
1107 | <center>
1108 | <img src=grep1p.png alt="Performance graph" width="779" height="388">
1109 | <br>
1110 | regular expression and text size <i>n</i>
1111 | <br>
1112 | <code>a?</code><sup><i>n</i></sup><code>a</code><sup><i>n</i></sup>
1113 | matching 
1114 | <code>a</code><sup><i>n</i></sup>
1115 | </center>
1116 | </div>
1117 | </table>
1118 | </center>
1119 | </div>
1120 | 
1121 | <p class=lp>
1122 | Notice that the graph's <i>y</i>-axis has a logarithmic scale,
1123 | in order to be able to see a wide variety of times on a single graph.
1124 | </p>
1125 | 
1126 | <p class=pp>
1127 | From the graph it is clear that Perl, PCRE, Python, and Ruby are
1128 | all using recursive backtracking.
1129 | PCRE stops getting the right answer at 
1130 | <i>n</i>=23,
1131 | because it aborts the recursive backtracking after a maximum number
1132 | of steps.
1133 | As of Perl 5.6, Perl's regular expression engine is
1134 | <a href="http://perlmonks.org/index.pl?node_id=502408">said to memoize</a>
1135 | the recursive backtracking search, which should, at some memory cost,
1136 | keep the search from taking exponential amounts of time 
1137 | unless backreferences are being used.
1138 | As the performance graph shows, the memoization is not complete:
1139 | Perl's run time grows exponentially even though there
1140 | are no backreferences
1141 | in the expression.
1142 | Although not benchmarked here, Java uses a backtracking
1143 | implementation too.
1144 | In fact, the
1145 | <code>java.util.regex</code>
1146 | interface requires a backtracking
1147 | implementation, because arbitrary Java code
1148 | can be substituted into the matching path.
1149 | PHP uses the PCRE library.
1150 | </p>
1151 | 
1152 | <p class=pp>
1153 | The thick blue line is the C implementation of Thompson's algorithm given above.
1154 | Awk, Tcl, GNU grep, and GNU awk 
1155 | build DFAs, either precomputing them or using the on-the-fly
1156 | construction described in the next section.
1157 | </p>
1158 | 
1159 | <p class=pp>
1160 | Some might argue that this test is unfair to
1161 | the backtracking implementations, since it focuses on an
1162 | uncommon corner case.
1163 | This argument misses the point:
1164 | given a choice between an implementation
1165 | with a predictable, consistent, fast running time on all inputs
1166 | or one that usually runs quickly but can take
1167 | years of CPU time (or more) on some inputs,
1168 | the decision should be easy.
1169 | Also, while examples as dramatic as this one
1170 | rarely occur in practice, less dramatic ones do occur.
1171 | Examples include using
1172 | <code>(.*)</code>
1173 | <code>(.*)</code>
1174 | <code>(.*)</code>
1175 | <code>(.*)</code>
1176 | <code>(.*)</code>
1177 | to split five space-separated fields, or using
1178 | alternations where the common cases
1179 | are not listed first.
1180 | As a result, programmers often learn which constructs are
1181 | expensive and avoid them, or they turn to so-called
1182 | <a href="http://search.cpan.org/~dankogai/Regexp-Optimizer-0.15/lib/Regexp/Optimizer.pm">optimizers</a>.
1183 | Using Thompson's NFA simulation does not require such adaptation:
1184 | there are no expensive regular expressions.
1185 | </p>
1186 | 
1187 | <h2 class=sh>
1188 | Caching the NFA to build a DFA
1189 | </h2>
1190 | 
1191 | <p class=pp>
1192 | Recall that DFAs are more efficient to execute than NFAs,
1193 | because DFAs are only ever in one state at a time: they never
1194 | have a choice of multiple next states.
1195 | Any NFA can be converted into an equivalent DFA
1196 | in which each DFA state corresponds to a
1197 | list of NFA states.
1198 | </p>
1199 | 
1200 | <p class=pp>
1201 | For example, here is the NFA we used earlier for
1202 | <code>abab|abbb</code>,
1203 | with state numbers added:
1204 | </p>
1205 | <p class=fig><img src=fig20.png alt="NFA for abab|abbb" width="424" height="91"></p>
1206 | <p class=lp>
1207 | The equivalent DFA would be:
1208 | </p>
1209 | <p class=fig><img src=fig21.png alt="DFA for abab|abbb" width="496" height="170"></p>
1210 | <p class=lp>
1211 | Each state in the DFA corresponds to a list of 
1212 | states from the NFA.
1213 | </p>
1214 | 
1215 | <p class=pp>
1216 | In a sense, Thompson's NFA simulation is
1217 | executing the equivalent DFA: each
1218 | <code>List</code>
1219 | corresponds to some DFA state,
1220 | and the 
1221 | <code>step</code>
1222 | function is computing, given a list and a next character,
1223 | the next DFA state to enter.
1224 | Thompson's algorithm simulates the DFA by 
1225 | reconstructing each DFA state as it is needed.
1226 | Rather than throw away this work after each step,
1227 | we could cache the
1228 | <code>Lists</code>
1229 | in spare memory, avoiding the cost of repeating the computation
1230 | in the future
1231 | and essentially computing the equivalent DFA as it is needed.
1232 | This section presents the implementation of such an approach.
1233 | Starting with the NFA implementation from the previous section,
1234 | we need to add fewer than 100 lines to build a DFA implementation.
1235 | </p>
1236 | 
1237 | <p class=pp>
1238 | To implement the cache, we first introduce a new data type
1239 | that represents a DFA state:
1240 | </p><pre class=p1>
1241 | struct DState
1242 | {
1243 | 	List l;
1244 | 	DState *next[256];
1245 | 	DState *left;
1246 | 	DState *right;
1247 | };
1248 | </pre><p class=lp>
1249 | A
1250 | <code>DState</code>
1251 | is the cached copy of the list
1252 | <code>l</code>.
1253 | The array
1254 | <code>next</code>
1255 | contains pointers to the next state for each
1256 | possible input character:
1257 | if the current state is
1258 | <code>d</code>
1259 | and the next input character is
1260 | <code>c</code>,
1261 | then
1262 | <code>d->next[c]</code>
1263 | is the next state.
1264 | If
1265 | <code>d->next[c]</code>
1266 | is null, then the next state has not been computed yet.
1267 | <code>Nextstate</code>
1268 | computes, records, and returns the next state
1269 | for a given state and character.
1270 | </p>
1271 | 
1272 | <p class=pp>
1273 | The regular expression match follows
1274 | <code>d->next[c]</code>
1275 | repeatedly, calling
1276 | <code>nextstate</code>
1277 | to compute new states as needed.
1278 | </p><pre class=p1>
1279 | int
1280 | match(DState *start, char *s)
1281 | {
1282 | 	int c;
1283 | 	DState *d, *next;
1284 | 	
1285 | 	d = start;
1286 | 	for(; *s; s++){
1287 | 		c = *s &amp; 0xFF;
1288 | 		if((next = d-&gt;next[c]) == NULL)
1289 | 			next = nextstate(d, c);
1290 | 		d = next;
1291 | 	}
1292 | 	return ismatch(&amp;d-&gt;l);
1293 | }
1294 | </pre><p class=lp>
1295 | </p>
1296 | 
1297 | <p class=pp>
1298 | All the
1299 | <code>DStates</code>
1300 | that have been computed need to be saved in a 
1301 | structure that lets us look up a
1302 | <code>DState</code>
1303 | by its
1304 | <code>List</code>.
1305 | To do this, we arrange them 
1306 | in a binary tree
1307 | using the sorted
1308 | <code>List</code>
1309 | as the key.
1310 | The
1311 | <code>dstate</code>
1312 | function returns the
1313 | <code>DState</code>
1314 | for a given
1315 | <code>List</code>,
1316 | allocating one if necessary:
1317 | </p><pre class=p1>
1318 | DState*
1319 | dstate(List *l)
1320 | {
1321 | 	int i;
1322 | 	DState **dp, *d;
1323 | 	static DState *alldstates;
1324 | 
1325 | 	qsort(l-&gt;s, l-&gt;n, sizeof l-&gt;s[0], ptrcmp);
1326 | 
1327 | 	/* look in tree for existing DState */
1328 | 	dp = &amp;alldstates;
1329 | 	while((d = *dp) != NULL){
1330 | 		i = listcmp(l, &amp;d-&gt;l);
1331 | 		if(i &lt; 0)
1332 | 			dp = &amp;d-&gt;left;
1333 | 		else if(i &gt; 0)
1334 | 			dp = &amp;d-&gt;right;
1335 | 		else
1336 | 			return d;
1337 | 	}
1338 | 	
1339 | 	/* allocate, initialize new DState */
1340 | 	d = malloc(sizeof *d + l-&gt;n*sizeof l-&gt;s[0]);
1341 | 	memset(d, 0, sizeof *d);
1342 | 	d-&gt;l.s = (State**)(d+1);
1343 | 	memmove(d-&gt;l.s, l-&gt;s, l-&gt;n*sizeof l-&gt;s[0]);
1344 | 	d-&gt;l.n = l-&gt;n;
1345 | 
1346 | 	/* insert in tree */
1347 | 	*dp = d;
1348 | 	return d;
1349 | }
1350 | </pre><p class=lp>
1351 | <code>Nextstate</code> runs the NFA
1352 | <code>step</code>
1353 | and returns the corresponding
1354 | <code>DState</code>:
1355 | </p><pre class=p1>
1356 | DState*
1357 | nextstate(DState *d, int c)
1358 | {
1359 | 	step(&amp;d-&gt;l, c, &amp;l1);
1360 | 	return d-&gt;next[c] = dstate(&amp;l1);
1361 | }
1362 | </pre><p class=lp>
1363 | Finally, the DFA's start state is the
1364 | <code>DState</code>
1365 | corresponding to the NFA's start list:
1366 | </p><pre class=p1>
1367 | DState*
1368 | startdstate(State *start)
1369 | {
1370 | 	return dstate(startlist(start, &amp;l1));
1371 | }
1372 | </pre><p class=lp>
1373 | (As in the NFA simulation,
1374 | <code>l1</code>
1375 | is a preallocated
1376 | <code>List</code>.)
1377 | </p>
1378 | 
1379 | <p class=pp>
1380 | The
1381 | <code>DStates</code>
1382 | correspond to DFA states, but the DFA is only built as needed:
1383 | if a DFA state has not been encountered during the search,
1384 | it does not yet exist in the cache.
1385 | An alternative would be to compute the entire DFA at once.
1386 | Doing so would make
1387 | <code>match</code>
1388 | a little faster by removing the conditional branch,
1389 | but at the cost of increased startup time and
1390 | memory use.
1391 | </p>
1392 | 
1393 | <p class=pp>
1394 | One might also worry about bounding the amount of
1395 | memory used by the on-the-fly DFA construction.
1396 | Since the
1397 | <code>DStates</code>
1398 | are only a cache of the 
1399 | <code>step</code>
1400 | function, the implementation of
1401 | <code>dstate</code>
1402 | could choose to throw away the entire DFA so far
1403 | if the cache grew too large.
1404 | This cache replacement policy 
1405 | only requires a few extra lines of code in 
1406 | <code>dstate</code>
1407 | and in
1408 | <code>nextstate</code>,
1409 | plus around 50 lines of code for memory management.
1410 | An implementation is
1411 | <a href="http://swtch.com/~rsc/regexp/">available online</a>.
1412 | (<a href="http://cm.bell-labs.com/cm/cs/awkbook/">Awk</a>
1413 | uses a similar limited-size cache strategy,
1414 | with a fixed limit of 32 cached states; this explains the discontinuity
1415 | in its performance at <i>n</i>=28 in the graph above.)
1416 | </p>
1417 | 
1418 | <p class=pp>
1419 | NFAs derived from regular expressions
1420 | tend to exhibit good locality: they visit the same states
1421 | and follow the same transition arrows over and over
1422 | when run on most texts.
1423 | This makes the caching worthwhile: the first time an arrow
1424 | is followed, the next state must be computed as in the NFA
1425 | simulation, but future traversals of the arrow are just
1426 | a single memory access.
1427 | Real DFA-based implementations can make use
1428 | of additional optimizations to run even faster.
1429 | A companion article (not yet written) will explore
1430 | DFA-based regular expression implementations in more detail.
1431 | </p>
1432 | 
1433 | 
1434 | <h2 class=sh>
1435 | Real world regular expressions
1436 | </h2>
1437 | 
1438 | <p class=pp>
1439 | Regular expression usage in real programs
1440 | is somewhat more complicated than what the regular expression
1441 | implementations described above can handle.
1442 | This section briefly describes the common complications;
1443 | full treatment of any of these is beyond the scope of this
1444 | introductory article.
1445 | </p>
1446 | 
1447 | <p class=pp>
1448 | <i>Character classes</i>.
1449 | A character class, whether 
1450 | <code>[0-9]</code>
1451 | or
1452 | <code>\w</code>
1453 | or
1454 | <code>.</code> (dot),
1455 | is just a concise representation of an alternation.
1456 | Character classes can be expanded into alternations
1457 | during compilation, though it is more efficient to add
1458 | a new kind of NFA node to represent them explicitly.
1459 | <a href="http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html">POSIX</a>
1460 | defines special character classes
1461 | like <code>[[:upper:]]</code> that change meaning
1462 | depending on the current locale, but the hard part of
1463 | accommodating these is determining their meaning,
1464 | not encoding that meaning into an NFA.
1465 | </p>
1466 | 
1467 | <p class=pp>
1468 | <i>Escape sequences</i>.
1469 | Real regular expression syntaxes need to handle
1470 | escape sequences, both as a way to match metacharacters
1471 | (<code>\(</code>,
1472 | <code>\)</code>,
1473 | <code>\\</code>,
1474 | etc.)
1475 | and to specify otherwise difficult-to-type characters such as
1476 | <code>\n</code>.
1477 | </p>
1478 | 
1479 | <p class=pp>
1480 | <i>Counted repetition</i>.
1481 | Many regular expression implementations provide a counted
1482 | repetition operator
1483 | <code>{<i>n</i>}</code>
1484 | to match exactly 
1485 | <i>n</i>
1486 | strings matching a pattern;
1487 | <code>{</code><i>n</i><code>,</code><i>m</i><code>}</code>
1488 | to match at least 
1489 | <i>n</i>
1490 | but no more than
1491 | <i>m</i>;
1492 | and
1493 | <code>{</code><i>n</i><code>,}</code>
1494 | to match
1495 | <i>n</i>
1496 | or more.
1497 | A recursive backtracking implementation can implement
1498 | counted repetition using a loop; an NFA or DFA-based
1499 | implementation must expand the repetition:
1500 | <i>e</i><code>{3}</code>
1501 | expands to
1502 | <i>eee</i>;
1503 | <i>e</i><code>{3,5}</code>
1504 | expands to
1505 | <i>eeee</i><code>?</code><i>e</i><code>?</code>;
1506 | and
1507 | <i>e</i><code>{3,}</code>
1508 | expands to
1509 | <i>eee</i><code>+</code>.
1510 | </p>
1511 | 
1512 | <p class=pp>
1513 | <i>Submatch extraction</i>.
1514 | When regular expressions are used for splitting or parsing strings,
1515 | it is useful to be able to find out which sections of the input string
1516 | were matched by each subexpression.
1517 | After a regular expression like
1518 | <code>([0-9]+-[0-9]+-[0-9]+)</code>
1519 | <code>([0-9]+:[0-9]+)</code>
1520 | matches a string (say a date and time),
1521 | many regular expression engines make the
1522 | text matched by each parenthesized expression
1523 | available.
1524 | For example, one might write in Perl:
1525 | </p><pre class=p1>
1526 | if(/([0-9]+-[0-9]+-[0-9]+) ([0-9]+:[0-9]+)/){
1527 | 	print "date: $1, time: $2\n";
1528 | }
1529 | </pre><p class=lp>
1530 | The extraction of submatch boundaries has been mostly ignored
1531 | by computer science theorists, and it is perhaps the most
1532 | compelling argument for using recursive backtracking.
1533 | However, Thompson-style algorithms can be adapted to
1534 | track submatch boundaries without giving up efficient performance.
1535 | The Eighth Edition Unix
1536 | <i>regexp</i>(3)
1537 | library implemented such an algorithm as early as 1985,
1538 | though as explained below,
1539 | it was not very widely used or even noticed.
1540 | </p>
1541 | 
1542 | <p class=pp>
1543 | <i>Unanchored matches</i>.
1544 | This article has assumed that regular expressions
1545 | are matched against an entire input string.
1546 | In practice, one often wishes to find a substring
1547 | of the input that matches the regular expression.
1548 | Unix tools traditionally return the longest matching substring
1549 | that starts at the leftmost possible point in the input.
1550 | An unanchored search for 
1551 | <i>e</i>
1552 | is a special case
1553 | of submatch extraction: it is like searching for
1554 | <code>.*(<i>e</i>).*</code>
1555 | where the first
1556 | <code>.*</code>
1557 | is constrained to match as short a string as possible.
1558 | </p>
1559 | 
1560 | <p class=pp>
1561 | <i>Non-greedy operators</i>.
1562 | In traditional Unix regular expressions, the repetition operators
1563 | <code>?</code>,
1564 | <code>*</code>,
1565 | and
1566 | <code>+</code>
1567 | are defined to match as much of the string as possible while
1568 | still allowing the entire regular expression to match:
1569 | when matching
1570 | <code>(.+)(.+)</code>
1571 | against
1572 | <code>abcd</code>,
1573 | the first
1574 | <code>(.+)</code>
1575 | will match
1576 | <code>abc</code>,
1577 | and the second
1578 | will match
1579 | <code>d</code>.
1580 | These operators are now called
1581 | <i>greedy</i>.
1582 | Perl introduced
1583 | <code>??</code>,
1584 | <code>*?</code>,
1585 | and
1586 | <code>+?</code>
1587 | as non-greedy versions, which match as little of the string
1588 | as possible while preserving the overall match:
1589 | when matching
1590 | <code>(.+?)(.+?)</code>
1591 | against
1592 | <code>abcd</code>,
1593 | the first
1594 | <code>(.+?)</code>
1595 | will match only
1596 | <code>a</code>,
1597 | and the second
1598 | will match
1599 | <code>bcd</code>.
1600 | By definition, whether an operator is greedy
1601 | cannot affect whether a regular expression matches a
1602 | particular string as a whole; it only affects the
1603 | choice of submatch boundaries.
1604 | The backtracking algorithm admits a simple implementation
1605 | of non-greedy operators:
1606 | try the shorter match before the longer one.
1607 | For example, in a standard backtracking implementation,
1608 | <code><i>e</i>?</code>
1609 | first tries using
1610 | <i>e</i>
1611 | and then tries not using it;
1612 | <code><i>e</i>??</code>
1613 | uses the other order.
1614 | The submatch-tracking variants of Thompson's algorithm
1615 | can be adapted to accommodate non-greedy operators.
1616 | </p>
1617 | 
1618 | <p class=pp>
1619 | <i>Assertions</i>.
1620 | The traditional regular expression metacharacters
1621 | <code>^</code>
1622 | and
1623 | <code>$</code>
1624 | can be viewed as
1625 | <i>assertions</i>
1626 | about the text around them:
1627 | <code>^</code>
1628 | asserts that the previous character
1629 | is a newline (or the beginning of the string),
1630 | while
1631 | <code>$</code>
1632 | asserts that the next character is a newline
1633 | (or the end of the string).
1634 | Perl added more assertions, like
1635 | the word boundary
1636 | <code>\b</code>,
1637 | which asserts that 
1638 | the previous character is alphanumeric but the next
1639 | is not, or vice versa.
1640 | Perl also generalized the idea to arbitrary
1641 | conditions called lookahead assertions:
1642 | <code>(?=</code><i>re</i><code>)</code>
1643 | asserts that the text after the current input position matches
1644 | <i>re</i>,
1645 | but does not actually advance the input position;
1646 | <code>(?!</code><i>re</i><code>)</code>
1647 | is similar but 
1648 | asserts that the text does not match
1649 | <i>re</i>.
1650 | The lookbehind assertions
1651 | <code>(?&lt;=</code><i>re</i><code>)</code>
1652 | and
1653 | <code>(?&lt;!</code><i>re</i><code>)</code>
1654 | are similar but make assertions about the text
1655 | before the current input position.
1656 | Simple assertions like
1657 | <code>^</code>,
1658 | <code>$</code>,
1659 | and
1660 | <code>\b</code>
1661 | are easy to accommodate in an NFA,
1662 | delaying the match one byte for forward assertions.
1663 | The generalized assertions
1664 | are harder to accommodate but in principle could
1665 | be encoded in the NFA.
1666 | </p>
1667 | 
1668 | <p class=pp>
1669 | <i>Backreferences</i>.
1670 | As mentioned earlier, no one knows how to 
1671 | implement regular expressions with backreferences efficiently,
1672 | though no one can prove that it's impossible either.
1673 | (Specifically, the 
1674 | <a href="http://perl.plover.com/NPC/NPC-3SAT.html">problem is NP-complete</a>, meaning that if
1675 | someone did find an efficient implementation, that would
1676 | be <i>major</i> news to computer scientists and would
1677 | win a <a href="http://www.claymath.org/Popular_Lectures/Minesweeper/">million dollar prize</a>.)
1678 | The simplest, most effective strategy for backreferences,
1679 | taken by the original awk and egrep, is not to implement them.
1680 | This strategy is no longer practical: users have come to
1681 | rely on backreferences for at least occasional use,
1682 | and backreferences are part of
1683 | the
1684 | <a href="http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html">POSIX standard for regular expressions</a>.
1685 | Even so, it would be reasonable to use Thompson's NFA simulation
1686 | for most regular expressions, and only bring out
1687 | backtracking when it is needed.
1688 | A particularly clever implementation could combine the two,
1689 | resorting to backtracking only to accommodate the backreferences.
1690 | </p>
1691 | 
1692 | <p class=pp>
1693 | <i>Backtracking with memoization</i>.
1694 | Perl's approach of using memoization to avoid exponential blowup
1695 | during backtracking
1696 | when possible is a good one.  At least in theory, it should make
1697 | Perl's regular expressions behave more like an NFA and
1698 | less like backtracking.  
1699 | Memoization does not completely solve the problem, though:
1700 | the memoization itself requires a memory footprint roughly 
1701 | equal to the size of the text times the size of the regular expression.
1702 | Memoization also does not address the issue of the stack space used
1703 | by backtracking, which is linear in the size of the text;
1704 | matching long strings typically causes a backtracking
1705 | implementation to run out of stack space:
1706 | </p><pre class=p1>
1707 | $ perl -e '("a" x 100000) =~ /^(ab?)*$/;'
1708 | Segmentation fault (core dumped)
1709 | $
1710 | </pre>
1711 | 
1712 | <p class=pp>
1713 | <i>Character sets</i>.
1714 | Modern regular expression implementations must deal with 
1715 | large non-ASCII character sets such as Unicode.
1716 | The 
1717 | <a href="http://swtch.com/plan9port/unix/"
1718 | >Plan 9 regular expression library</a>
1719 | incorporates Unicode by running an NFA with a
1720 | single Unicode character as the input character for each step.
1721 | That library separates the running of the NFA from decoding
1722 | the input, so that the same regular expression matching code
1723 | is used for both 
1724 | <a href="http://plan9.bell-labs.com/sys/doc/utf.html">UTF-8</a>
1725 | and wide-character inputs.
1726 | </p>
1727 | 
1728 | <h2 class=sh id=History>
1729 | History and References
1730 | </h2>
1731 | 
1732 | 
1733 | <p class=pp>
1734 | <a name="rabin-scott-b"></a>Michael Rabin and Dana Scott
1735 | introduced non-deterministic finite automata
1736 | and the concept of non-determinism in 1959
1737 | [<a href="#rabin-scott">7</a>],
1738 | showing that NFAs can be simulated by
1739 | (potentially much larger) DFAs in which 
1740 | each DFA state corresponds to a set of NFA states.
1741 | (They won the Turing Award in 1976 for the introduction
1742 | of the concept of non-determinism in that paper.)
1743 | </p>
1744 | 
1745 | <p class=pp>
1746 | <a name="mcnaughton-yamada-b">R. McNaughton and H. Yamada
1747 | </a>[<a href="#mcnaughton-yamada">4</a>]
1748 | and 
1749 | <a name="thompson-b"></a>Ken Thompson
1750 | [<a href="#thompson">9</a>]
1751 | are commonly credited with giving the first constructions
1752 | to convert regular expressions into NFAs,
1753 | even though neither paper mentions the
1754 | then-nascent concept of an NFA.
1755 | McNaughton and Yamada's construction
1756 | creates a DFA,
1757 | and Thompson's construction creates IBM 7094 machine code,
1758 | but reading between the lines one can
1759 | see latent NFA constructions underlying both.
1760 | Regular expression to NFA constructions differ only in how they encode 
1761 | the choices that the NFA must make.
1762 | The approach used above, mimicking Thompson,
1763 | encodes the choices with explicit choice
1764 | nodes
1765 | (the
1766 | <code>Split</code>
1767 | nodes above)
1768 | and unlabeled arrows.
1769 | An alternative approach,
1770 | the one most commonly credited to McNaughton and Yamada,
1771 | is to avoid unlabeled arrows, instead allowing NFA states to
1772 | have multiple outgoing arrows with the same label.
1773 | <a name="mcilroy-b"></a>McIlroy
1774 | [<a href="#mcilroy">3</a>]
1775 | gives a particularly elegant implementation of this approach
1776 | in Haskell.
1777 | </p>
1778 | 
1779 | <p class=pp>
1780 | <a name="vanvleck-b"></a>Thompson's regular expression implementation
1781 | was for his QED editor running on the CTSS 
1782 | [<a href="#vanvleck">10</a>]
1783 | operating
1784 | system on the IBM 7094.
1785 | <a name="pierce-b"></a>A copy of the editor can be found in archived CTSS sources
1786 | [<a href="#pierce">5</a>].
1787 | <a name="deutsch-lampson-b"></a>L. Peter Deutsch and Butler Lampson
1788 | [<a href="#deutsch-lampson">1</a>]
1789 | developed the first QED, but
1790 | Thompson's reimplementation was the first to use
1791 | regular expressions.
1792 | <a name="ritchie-b"></a>Dennis Ritchie, author of yet another QED implementation,
1793 | has documented the early history of the QED editor
1794 | [<a href="#ritchie">8</a>].
1795 | (Thompson, Ritchie, and Lampson later won
1796 | Turing awards for work unrelated to QED or finite automata.)
1797 | </p>
1798 | 
1799 | <p class=pp>
1800 | Thompson's paper marked the 
1801 | beginning of a long line of regular expression implementations.
1802 | Thompson chose not to use his algorithm when 
1803 | implementing the text editor ed, which appeared in 
1804 | First Edition Unix (1971), or in its descendant grep,
1805 | which first appeared in the Fourth Edition (1973).
1806 | Instead, these venerable Unix tools used
1807 | recursive backtracking!
1808 | Backtracking was justifiable because the
1809 | regular expression syntax was quite limited:
1810 | it omitted grouping parentheses and the
1811 | <code>|</code>,
1812 | <code>?</code>,
1813 | and
1814 | <code>+</code>
1815 | operators.
1816 | Al Aho's egrep,
1817 | which first appeared in the Seventh Edition (1979),
1818 | was the first Unix tool to provide
1819 | the full regular expression syntax, using a
1820 | precomputed DFA.
1821 | By the Eighth Edition (1985), egrep computed the DFA on the fly,
1822 | like the implementation given above.
1823 | </p>
1824 | 
1825 | <p class=pp>
1826 | <a name="pike-b"></a>While writing the text editor sam 
1827 | [<a href="#pike">6</a>]
1828 | in the early 1980s,
1829 | Rob Pike wrote a new regular expression implementation,
1830 | which Dave Presotto extracted into a library that 
1831 | appeared in the Eighth Edition.
1832 | Pike's implementation
1833 | incorporated submatch tracking into an efficient NFA simulation
1834 | but, like the rest of the Eighth Edition source, was not widely
1835 | distributed.
1836 | Pike himself did not realize that his technique was anything new.
1837 | Henry Spencer reimplemented the Eighth Edition library
1838 | interface from scratch, but using backtracking,
1839 | and
1840 | <a href="http://arglist.com/regex/">released his implementation</a>
1841 | into the public domain.
1842 | It became very widely used, eventually serving as the basis
1843 | for the slow regular expression implementations
1844 | mentioned earlier: Perl, PCRE, Python, and so on.
1845 | (In his defense,
1846 | Spencer knew the routines could be slow,
1847 | and he didn't know that a more efficient algorithm existed.
1848 | He even warned in the documentation,
1849 | &ldquo;Many users have found the speed perfectly adequate,
1850 | although replacing the insides of egrep with this code
1851 | would be a mistake.&rdquo;)
1852 | Pike's regular expression implementation, extended to
1853 | support Unicode, was made freely available
1854 | with sam in 
1855 | <a href="http://groups.google.com/group/comp.os.research/msg/f1783504a2d18051">late 1992</a>,
1856 | but the particularly efficient
1857 | regular expression search algorithm went unnoticed.
1858 | The code is now available in many forms: as 
1859 | <a href="http://plan9.bell-labs.com/sources/plan9/sys/src/cmd/sam/">part of sam</a>,
1860 | as 
1861 | <a href="http://plan9.bell-labs.com/sources/plan9/sys/src/libregexp/">Plan&nbsp;9's regular expression library</a>,
1862 | or
1863 | <a href="http://swtch.com/plan9port/unix/">packaged separately for Unix</a>.
1864 | <a name="laurikari-b"></a>Ville Laurikari independently discovered Pike's algorithm
1865 | in 1999, developing a theoretical foundation as well
1866 | [<a href="#laurikari">2</a>].
1867 | </p>
1868 | 
1869 | 
1870 | <p class=pp>
1871 | Finally, any discussion of regular expressions
1872 | would be incomplete without mentioning 
1873 | Jeffrey Friedl's book
1874 | <i>Mastering Regular Expressions</i>,
1875 | perhaps the most popular reference among today's programmers.
1876 | Friedl's book teaches programmers how best to use today's
1877 | regular expression implementations, but not how best to implement them.
1878 | What little text it devotes to implementation
1879 | issues perpetuates the widespread belief that recursive backtracking
1880 | is the only way to simulate an NFA.
1881 | Friedl makes it clear that he 
1882 | <a href="http://regex.info/blog/2006-09-15/248"
1883 | >neither understands nor respects</a>
1884 | the underlying theory.
1885 | </p>
1886 | 
1887 | <h2 class=sh>
1888 | Summary
1889 | </h2>
1890 | 
1891 | <p class=pp>
1892 | Regular expression matching can be simple and fast, using
1893 | finite automata-based techniques that have been known for decades.
1894 | In contrast, Perl, PCRE, Python, Ruby, Java,
1895 | and many other languages
1896 | have regular expression implementations based on 
1897 | recursive backtracking that are simple but can be
1898 | excruciatingly slow.
1899 | With the exception of backreferences, the features
1900 | provided by the slow backtracking implementations
1901 | can be provided by the automata-based implementations
1902 | at dramatically faster, more consistent speeds.
1903 | </p>
1904 | 
1905 | <p class=pp>
1906 | The next article in this series,
1907 | &ldquo;<a href="regexp2.html">Regular Expression Matching: the Virtual Machine Approach</a>,&rdquo; discusses NFA-based submatch extraction.
1908 | The third article, &ldquo;<a href="regexp3.html">Regular Expression Matching in the Wild</a>,&rdquo; examines a production implementation.
1909 | The fourth article, &ldquo;<a href="regexp4.html">Regular Expression Matching with a Trigram Index</a>,&rdquo; explains how Google Code Search was implemented.
1910 | </p>
1911 | 
1912 | <h2 class=sh>
1913 | Acknowledgements
1914 | </h2>
1915 | 
1916 | <p class=pp>
1917 | Lee Feigenbaum,
1918 | James Grimmelmann,
1919 | Alex Healy,
1920 | William Josephson,
1921 | and
1922 | Arnold Robbins
1923 | read drafts of this article and made many helpful suggestions.
1924 | Rob Pike clarified some of the history surrounding his
1925 | regular expression implementation.
1926 | Thanks to all.
1927 | </p>
1928 | 
1929 | <h2 class=sh>
1930 | References
1931 | </h2>
1932 | 
1933 | <p class=lp-left>
1934 | <a name=deutsch-lampson></a>
1935 | [<a href="#deutsch-lampson-b">1</a>]
1936 | L. Peter Deutsch and Butler Lampson,
1937 | &ldquo;An online editor,&rdquo;
1938 | Communications of the ACM 10(12) (December 1967), pp.&nbsp;793&ndash;799.
1939 | <a href="http://doi.acm.org/10.1145/363848.363863"><i>http://doi.acm.org/10.1145/363848.363863</i></a>
1940 | </p><p class=lp-left>
1941 | <a name=laurikari></a>
1942 | [<a href="#laurikari-b">2</a>]
1943 | Ville Laurikari,
1944 | &ldquo;NFAs with Tagged Transitions,
1945 | their Conversion to Deterministic Automata
1946 | and
1947 | Application to Regular Expressions,&rdquo;
1948 | in Proceedings of the Symposium on String Processing and
1949 | Information Retrieval, September 2000.
1950 | <a href="http://laurikari.net/ville/spire2000-tnfa.ps"><i>http://laurikari.net/ville/spire2000-tnfa.ps</i></a>
1951 | </p><p class=lp-left>
1952 | <a name=mcilroy></a>
1953 | [<a href="#mcilroy-b">3</a>]
1954 | M. Douglas McIlroy,
1955 | &ldquo;Enumerating the strings of regular languages,&rdquo;
1956 | Journal of Functional Programming 14 (2004), pp.&nbsp;503&ndash;518.
1957 | <a href="http://www.cs.dartmouth.edu/~doug/nfa.ps.gz"><i>http://www.cs.dartmouth.edu/~doug/nfa.ps.gz</i></a> (preprint)
1958 | </p><p class=lp-left>
1959 | <a name=mcnaughton-yamada></a>
1960 | [<a href="#mcnaughton-yamada-b">4</a>]
1961 | R. McNaughton and H. Yamada,
1962 | &ldquo;Regular expressions and state graphs for automata,&rdquo;
1963 | IRE Transactions on Electronic Computers EC-9(1) (March 1960), pp.&nbsp;39&ndash;47.
1964 | </p><p class=lp-left>
1965 | <a name=pierce></a>
1966 | [<a href="#pierce-b">5</a>]
1967 | Paul Pierce,
1968 | &ldquo;CTSS source listings.&rdquo;
1969 | <a href="http://www.piercefuller.com/library/ctss.html"><i>http://www.piercefuller.com/library/ctss.html</i></a> 
1970 | (Thompson's QED is in the file
1971 | <code>com5</code>
1972 | in the source listings archive and is marked as
1973 | <code>0QED</code>)
1974 | </p><p class=lp-left>
1975 | <a name=pike></a>
1976 | [<a href="#pike-b">6</a>]
1977 | Rob Pike,
1978 | &ldquo;The text editor sam,&rdquo;
1979 | Software&mdash;Practice &amp; Experience 17(11) (November 1987), pp.&nbsp;813&ndash;845.
1980 | <a href="http://plan9.bell-labs.com/sys/doc/sam/sam.html"><i>http://plan9.bell-labs.com/sys/doc/sam/sam.html</i></a>
1981 | </p><p class=lp-left>
1982 | <a name=rabin-scott></a>
1983 | [<a href="#rabin-scott-b">7</a>]
1984 | Michael Rabin and Dana Scott,
1985 | &ldquo;Finite automata and their decision problems,&rdquo;
1986 | IBM Journal of Research and Development 3 (1959), pp.&nbsp;114&ndash;125.
1987 | <a href="http://www.research.ibm.com/journal/rd/032/ibmrd0302C.pdf"><i>http://www.research.ibm.com/journal/rd/032/ibmrd0302C.pdf</i></a>
1988 | </p><p class=lp-left>
1989 | <a name=ritchie></a>
1990 | [<a href="#ritchie-b">8</a>]
1991 | Dennis Ritchie,
1992 | &ldquo;An incomplete history of the QED text editor.&rdquo;
1993 | <a href="http://plan9.bell-labs.com/~dmr/qed.html"><i>http://plan9.bell-labs.com/~dmr/qed.html</i></a>
1994 | </p><p class=lp-left>
1995 | <a name=thompson></a>
1996 | [<a href="#thompson-b">9</a>]
1997 | Ken Thompson,
1998 | &ldquo;Regular expression search algorithm,&rdquo;
1999 | Communications of the ACM 11(6) (June 1968), pp.&nbsp;419&ndash;422.
2000 | <a href="http://doi.acm.org/10.1145/363347.363387"><i>http://doi.acm.org/10.1145/363347.363387</i></a>
2001 | (<font size=-1><a href="http://www.cs.chalmers.se/~coquand/AUTOMATA/thompson.pdf">PDF</a></font>)
2002 | </p><p class=lp-left>
2003 | <a name=vanvleck></a>
2004 | [<a href="#vanvleck-b">10</a>]
2005 | Tom Van Vleck,
2006 | &ldquo;The IBM 7094 and CTSS.&rdquo;
2007 | <a href="http://www.multicians.org/thvv/7094.html"><i>http://www.multicians.org/thvv/7094.html</i></a>
2008 | </p>
2009 | 
2010 | <br>
2011 | <p class=lp-left>
2012 | Discussion on <a href="http://programming.reddit.com/info/10c60/comments">reddit</a>, <a href="http://perlmonks.org/?node_id=597262">perlmonks</a>, and
2013 | <a href="http://lambda-the-ultimate.org/node/2064">LtU</a>.
2014 | </p>
2015 | 
2016 | <center>
2017 | <p class=copy>
2018 | Copyright &copy; 2007 Russ Cox.  All Rights Reserved.
2019 | <br>
2020 | <a href="http://swtch.com/~rsc/regexp/">http://swtch.com/~rsc/regexp/</a>
2021 | </p>
2022 | </center>
2023 | <script type="text/javascript">
2024 | var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
2025 | document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
2026 | </script>
2027 | <script type="text/javascript">
2028 | var pageTracker = _gat._getTracker("UA-3319603-2");
2029 | pageTracker._initData();
2030 | pageTracker._trackPageview();
2031 | </script>
2032 | <script type="text/javascript">
2033 |   (function() {
2034 |     var po = document.createElement('script'); po.type = 'text/javascript'; po.async = true;
2035 |     po.src = 'https://apis.google.com/js/plusone.js';
2036 |     var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(po, s);
2037 |   })();
2038 | </script>
2039 | </body>
2040 | </html>
2041 | 


--------------------------------------------------------------------------------
/t/suite/test.zst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokers/zstd-nginx-module/f4ba115e0b0eaecde545e5f37db6aa18917d8f4b/t/suite/test.zst


--------------------------------------------------------------------------------
/valgrind.suppress:
--------------------------------------------------------------------------------
  1 | {
  2 |    <insert_a_suppression_name_here>
  3 |    Memcheck:Addr1
  4 |    fun:ngx_init_cycle
  5 |    fun:ngx_master_process_cycle
  6 |    fun:main
  7 | }
  8 | {
  9 |    <insert_a_suppression_name_here>
 10 |    Memcheck:Addr4
 11 |    fun:ngx_init_cycle
 12 |    fun:ngx_master_process_cycle
 13 |    fun:main
 14 | }
 15 | {
 16 |    <insert_a_suppression_name_here>
 17 |    Memcheck:Cond
 18 |    fun:ngx_vslprintf
 19 |    fun:ngx_snprintf
 20 |    fun:ngx_sock_ntop
 21 |    fun:ngx_event_accept
 22 |    fun:ngx_epoll_process_events
 23 |    fun:ngx_process_events_and_timers
 24 | }
 25 | {
 26 |    <insert_a_suppression_name_here>
 27 |    Memcheck:Addr1
 28 |    fun:ngx_vslprintf
 29 |    fun:ngx_snprintf
 30 |    fun:ngx_sock_ntop
 31 |    fun:ngx_event_accept
 32 | }
 33 | {
 34 |    <insert_a_suppression_name_here>
 35 |    exp-sgcheck:SorG
 36 |    fun:ngx_http_lua_ndk_set_var_get
 37 | }
 38 | {
 39 |    <insert_a_suppression_name_here>
 40 |    exp-sgcheck:SorG
 41 |    fun:ngx_http_variables_init_vars
 42 |    fun:ngx_http_block
 43 | }
 44 | {
 45 |    <insert_a_suppression_name_here>
 46 |    exp-sgcheck:SorG
 47 |    fun:ngx_conf_parse
 48 | }
 49 | {
 50 |    <insert_a_suppression_name_here>
 51 |    exp-sgcheck:SorG
 52 |    fun:ngx_vslprintf
 53 |    fun:ngx_log_error_core
 54 | }
 55 | {
 56 |    <insert_a_suppression_name_here>
 57 |    Memcheck:Param
 58 |    epoll_ctl(event)
 59 |    fun:epoll_ctl
 60 | }
 61 | {
 62 |    <insert_a_suppression_name_here>
 63 |    Memcheck:Cond
 64 |    fun:ngx_conf_flush_files
 65 |    fun:ngx_single_process_cycle
 66 | }
 67 | {
 68 |    <insert_a_suppression_name_here>
 69 |    Memcheck:Cond
 70 |    fun:memcpy
 71 |    fun:ngx_vslprintf
 72 |    fun:ngx_log_error_core
 73 |    fun:ngx_http_charset_header_filter
 74 | }
 75 | {
 76 |    <insert_a_suppression_name_here>
 77 |    Memcheck:Param
 78 |    socketcall.setsockopt(optval)
 79 |    fun:setsockopt
 80 |    fun:drizzle_state_connect
 81 | }
 82 | {
 83 |    <insert_a_suppression_name_here>
 84 |    Memcheck:Cond
 85 |    fun:ngx_conf_flush_files
 86 |    fun:ngx_single_process_cycle
 87 |    fun:main
 88 | }
 89 | {
 90 |    <insert_a_suppression_name_here>
 91 |    Memcheck:Leak
 92 |    fun:malloc
 93 |    fun:ngx_alloc
 94 |    fun:ngx_event_process_init
 95 | }
 96 | {
 97 |    <insert_a_suppression_name_here>
 98 |    Memcheck:Param
 99 |    sendmsg(mmsg[0].msg_hdr)
100 |    fun:sendmmsg
101 |    fun:__libc_res_nsend
102 | }
103 | {
104 |    <insert_a_suppression_name_here>
105 |    Memcheck:Param
106 |    sendmsg(msg.msg_iov[0])
107 |    fun:__sendmsg_nocancel
108 |    fun:ngx_write_channel
109 |    fun:ngx_pass_open_channel
110 |    fun:ngx_start_cache_manager_processes
111 | }
112 | {
113 |    <insert_a_suppression_name_here>
114 |    Memcheck:Cond
115 |    fun:ngx_init_cycle
116 |    fun:ngx_master_process_cycle
117 |    fun:main
118 | }
119 | {
120 |    <insert_a_suppression_name_here>
121 |    Memcheck:Cond
122 |    fun:index
123 |    fun:expand_dynamic_string_token
124 |    fun:_dl_map_object
125 |    fun:map_doit
126 |    fun:_dl_catch_error
127 |    fun:do_preload
128 |    fun:dl_main
129 |    fun:_dl_sysdep_start
130 |    fun:_dl_start
131 | }
132 | {
133 |    <insert_a_suppression_name_here>
134 |    Memcheck:Param
135 |    sendmsg(mmsg[0].msg_hdr)
136 |    fun:sendmmsg
137 |    fun:__libc_res_nsend
138 |    fun:__libc_res_nquery
139 |    fun:__libc_res_nquerydomain
140 |    fun:__libc_res_nsearch
141 | }
142 | {
143 |    <insert_a_suppression_name_here>
144 |    Memcheck:Leak
145 |    match-leak-kinds: definite
146 |    fun:malloc
147 |    fun:ngx_alloc
148 |    fun:ngx_set_environment
149 |    fun:ngx_single_process_cycle
150 | }
151 | {
152 |    <insert_a_suppression_name_here>
153 |    Memcheck:Cond
154 |    obj:*
155 | }
156 | {
157 |    <insert_a_suppression_name_here>
158 |    Memcheck:Leak
159 |    match-leak-kinds: definite
160 |    fun:malloc
161 |    fun:ngx_alloc
162 |    fun:ngx_set_environment
163 |    fun:ngx_worker_process_init
164 | }
165 | {
166 |    <insert_a_suppression_name_here>
167 |    Memcheck:Leak
168 |    match-leak-kinds: definite
169 |    fun:malloc
170 |    fun:ngx_alloc
171 |    fun:ngx_create_pool
172 |    fun:main
173 | }
174 | {
175 |    <insert_a_suppression_name_here>
176 |    Memcheck:Param
177 |    epoll_pwait(sigmask)
178 |    fun:epoll_pwait
179 |    fun:epoll_wait
180 |    fun:ngx_epoll_process_events
181 |    fun:ngx_process_events_and_timers
182 | }
183 | {
184 |    <insert_a_suppression_name_here>
185 |    Memcheck:Param
186 |    epoll_pwait(sigmask)
187 |    fun:epoll_pwait
188 |    fun:epoll_wait
189 |    fun:ngx_epoll_test_rdhup
190 |    fun:ngx_epoll_init
191 |    fun:ngx_event_process_init
192 | }
193 | {
194 |    <insert_a_suppression_name_here>
195 |    Memcheck:Param
196 |    epoll_pwait(sigmask)
197 |    fun:epoll_pwait
198 |    fun:ngx_epoll_process_events
199 |    fun:ngx_process_events_and_timers
200 | }
201 | {
202 |    <insert_a_suppression_name_here>
203 |    Memcheck:Param
204 |    epoll_pwait(sigmask)
205 |    fun:epoll_pwait
206 |    fun:ngx_epoll_test_rdhup
207 |    fun:ngx_epoll_init
208 |    fun:ngx_event_process_init
209 | }
210 | {
211 |    <insert_a_suppression_name_here>
212 |    Memcheck:Leak
213 |    match-leak-kinds: possible
214 |    fun:malloc
215 |    fun:ngx_alloc
216 |    fun:ngx_crc32_table_init
217 |    fun:main
218 | }
219 | 


--------------------------------------------------------------------------------