├── screenshot └── grafana.png ├── counter.conf ├── README.md └── lib └── counter.lua /screenshot/grafana.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vovolie/lua-nginx-prometheus/HEAD/screenshot/grafana.png -------------------------------------------------------------------------------- /counter.conf: -------------------------------------------------------------------------------- 1 | # Please copy to nginx's conf.d directory 2 | # Set search paths for pure Lua external libraries (';;' is the default path): 3 | lua_package_path "/Users/zl/Work/Counter/nginx-lua-prometheus/?.lua;;/Users/zl/Work/Counter/lua-resty-consul/lib/resty/?.lua;;/Users/zl/Work/Counter/lib/?.lua;;"; 4 | 5 | # Set Prometheus global dict 6 | lua_shared_dict prometheus_metrics 10M; #init 10M memory 7 | lua_shared_dict uri_by_host 10M; 8 | lua_shared_dict global_set 1M; 9 | # Development option, if deploy production, pls cache on! 10 | lua_code_cache off; 11 | 12 | init_by_lua_block { 13 | counter = require 'counter' 14 | counter.init() 15 | consul_host = "" 16 | consul_port = 17 | } 18 | 19 | log_by_lua_block { 20 | counter.log() 21 | } 22 | 23 | # Expose prometheus's metrics scrape port 24 | server { 25 | listen 9145; 26 | allow 127.0.0.1; 27 | deny all; 28 | access_log off; 29 | location /metrics { 30 | content_by_lua 'prometheus:collect()'; 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lua-nginx-prometheus 2 | 这是一个监控Nginx流量的扩展程序. 
3 | 4 | ## 介绍 5 | 基于Openresty和Prometheus、Consul、Grafana设计的,实现了针对域名和Endpoint级别的流量统计,使用Consul做服务发现、KV存储,Grafana做性能图展示。 6 | 7 | 最终展现图 8 | 9 | ![](screenshot/grafana.png) 10 | 11 | ### 主要实现流程 12 | 13 | POST Json 注册服务 -> Consul Service <- Prometheus -> 定时抓取 http指标接口 Nginx 14 | 15 | POST KV 提交Endpoint -> Consul KV Service <- Nginx 定时更新需要监控的Endpoint 16 | 17 | Grafana 读取 -> Prometheus 18 | 19 | ### 优点 20 | 21 | * 基本实现自动化集中配置,各种接口使用非常简单 22 | * 通过Prometheus提供了非常丰富的查询维度,例如(域名、Endpoint、状态码、协议类型、method),当然还可以非常简单地添加更多。 23 | * Grafana图表功能强大,非常直观地查看各个服务的状态和发现异常。 24 | 25 | ## 安装和使用说明 26 | 27 | 本项目是基于Openresty开发,所以事先安装好Openresty,这个非常简单。 28 | 29 | 安装Consul,这是基于golang开发的服务自动发现工具,详细查看官方文档。`https://www.consul.io/` 30 | 31 | 安装Prometheus,这是一个时序数据库和监控工具,性能和存储十分可靠,把Prometheus配置发现服务使用Consul。官方文档:`https://prometheus.io/docs/operating/configuration/#` 32 | 33 | 安装Grafana。`https://grafana.com/` 34 | 35 | ### 安装 本扩展程序 36 | 37 | 克隆 lua-nginx-prometheus 仓库到Openresty服务器上。 38 | 39 | 克隆依赖Prometheus [nginx-lua-prometheus](https://github.com/knyar/nginx-lua-prometheus) 仓库到服务器上。 40 | 41 | 克隆依赖Consul [lua-resty-consul](https://github.com/hamishforbes/lua-resty-consul) 仓库到服务器上。 42 | 43 | 把lua-nginx-prometheus仓库中的 counter.conf文件复制到Openresty目录下的nginx/conf/conf.d目录内。 44 | 45 | ### 编辑 counter.conf 文件 46 | 47 | ```conf 48 | lua_package_path "/Users/zl/Work/Counter/nginx-lua-prometheus/?.lua;;/Users/zl/Work/Counter/lua-resty-consul/lib/resty/?.lua;;/Users/zl/Work/Counter/lib/?.lua;;"; 49 | ``` 50 | 51 | 修改lua_package_path参数,把 lua-nginx-prometheus、nginx-lua-prometheus、lua-resty-consul三个目录位置指定,目录下一定是包含 ?.lua。 52 | 53 | ``` 54 | consul_host = "" 55 | consul_port = 56 | ``` 57 | 58 | 把consul的地址和端口替换上。 59 | 60 | ``` 61 | server { 62 | listen 9145; 63 | allow 127.0.0.1; 64 | deny all; 65 | access_log off; 66 | location /metrics { 67 | content_by_lua 'prometheus:collect()'; 68 | } 69 | } 70 | ``` 71 | 72 | 添加allow 允许指定ip访问 指标接口。 73 | 74 | 启动Openresty后,试试 `http://:9145/metrics` 75 | 76 | ### 配置 Prometheus 服务发现功能 77 | 
78 | 详细参考这个文档 79 | 80 | `https://prometheus.io/docs/operating/configuration/#` 81 | 82 | 完成后,通过Consul 的 http API进行注册服务。 83 | 84 | ``` 85 | curl -X PUT -d @test.json http://:/v1/agent/service/register 86 | ``` 87 | 88 | ```json 89 | { 90 | "ID": <定义唯一的ID>, 91 | "Name": "对应prometheus consul_sd_config", 92 | "Tags": [ 93 | "" 94 | ], 95 | "Address": , 96 | "Port": 9145 97 | } 98 | ``` 99 | 100 | 注销服务 101 | 102 | ``` 103 | curl http://:/v1/agent/service/deregister/ 104 | ``` 105 | 106 | ### 配置 Consul KV存储 107 | 108 | 增加域名和对应的Endpoint 109 | 110 | ``` 111 | curl --request PUT --data @test.json http://:/v1/kv/domain//routers 112 | ``` 113 | 数组 114 | 115 | ```json 116 | [ 117 | "/users/[0-9]+/followers/", 118 | "/users/[0-9]+/", 119 | "/users/[0-9]+/comments/", 120 | "/news" 121 | ] 122 | ``` 123 | 124 | ### 配置Grafana 到 Prometheus上读取数据 125 | 126 | 详细文档参考 `https://prometheus.io/docs/visualization/grafana/` 127 | 128 | 129 | ### 创建图表 130 | 131 | 常见查询语句 132 | 133 | ``` 134 | sum(irate(nginx_http_request_duration_seconds_count{host="api.qq.com"}[5m])) by (status) 135 | ``` 136 | 多台服务器合计每秒请求量,查询单个域名,group by 用状态码 137 | 138 | ``` 139 | sum(rate(nginx_http_request_duration_seconds_sum{host="api.qq.com",endpoint!="/ws"}[1m])) / sum(rate(nginx_http_request_duration_seconds_count{host="api.qq.com",endpoint!="/ws"}[1m])) 140 | ``` 141 | 接口平均响应时间, 不包含 websocket接口 142 | 143 | histogram 直方图非常有用,可以详细了解一下。 144 | 145 | ``` 146 | topk(5, sum(rate(nginx_http_request_duration_seconds_sum{host="api.qq.com",endpoint!="/ws"}[1h])) by (endpoint)/sum(rate(nginx_http_request_duration_seconds_count{host="api.qq.com",endpoint!="/ws"}[1h])) by (endpoint)) 147 | ``` 148 | 5个响应时间最大的,不包含 websocket接口 149 | 150 | -------------------------------------------------------------------------------- /lib/counter.lua: -------------------------------------------------------------------------------- 1 | local cjson = require('cjson') 2 | local http = require('resty.http') 3 | local pcall = pcall 4 | local 
json_decode = cjson.decode -- completes the `local` declaration that begins on the previous line
local ngx = ngx
local ngx_log = ngx.log
local ngx_err = ngx.ERR
local timer_at = ngx.timer.at
local ngx_sleep = ngx.sleep
-- Interval between Consul polls, in seconds (value passed to ngx.timer.at).
-- An earlier comment claimed "10s"; the value actually in effect is 300.
local delay = 300
local _M = {}

-- Decode a JSON document that is expected to be an array.
-- cjson.decode raises an error on malformed input instead of returning
-- nil, so the call is wrapped in pcall to keep failures out of the
-- request/timer handlers.
-- @param text  JSON text (string)
-- @return      decoded table on success; nil plus an error message otherwise
local function decode_array(text)
    if type(text) ~= "string" or text == "" then
        return nil, "empty JSON document"
    end
    local ok, value = pcall(json_decode, text)
    if not ok then
        return nil, value
    end
    if type(value) ~= "table" then
        return nil, "JSON document is not an array"
    end
    return value
end

-- Build the base URL of the Consul HTTP API.
-- Uses the globals `consul_host` / `consul_port` (the ones counter.conf asks
-- the operator to fill in) when they are set, and falls back to the
-- previously hard-coded "consul_ip:8500" otherwise.
local function consul_base_url()
    local host = _G.consul_host
    local port = _G.consul_port
    if type(host) ~= "string" or host == "" then
        host = "consul_ip"
    end
    if port == nil or port == "" then
        port = 8500
    end
    return "http://" .. host .. ":" .. tostring(port)
end

-- Initialise the shared dictionaries and the Prometheus metrics.
-- Everything assigned here is deliberately global: the other functions of
-- this module read these names, and the nginx configuration calls
-- `prometheus:collect()`.
-- "initted" marks that the first Consul sync was triggered,
-- "looped" marks that the periodic polling timer was started.
function _M.init()
    uris = ngx.shared.uri_by_host
    global_set = ngx.shared.global_set
    global_set:set("initted", false)
    global_set:set("looped", false)
    prometheus = require("prometheus").init("prometheus_metrics")
    metric_get_consul = prometheus:counter("nginx_consul_get_total", "Number of query uri from consul", {"status"})
    metric_latency = prometheus:histogram("nginx_http_request_duration_seconds", "HTTP request latency status", {"host", "status", "scheme", "method", "endpoint"})
end

-- Pull the key/value data from Consul: first the list of domains, then the
-- endpoint list ("routers") of every domain, which is cached as raw JSON in
-- the `uris` shared dict keyed by host.
-- @return true when every host was refreshed, false otherwise
function _M.sync_consul()
    local httpc = http.new()
    httpc:set_timeout(500)
    local base = consul_base_url()

    local res, err = httpc:request_uri(base .. "/v1/kv/domain/Value?raw")
    if not res then
        ngx_log(ngx_err, err)
        metric_get_consul:inc(1, {"failed"})
        return false
    end
    -- A non-200 answer (e.g. missing key) carries no usable host list.
    if res.status ~= 200 then
        ngx_log(ngx_err, "Consul returned status ", res.status, " for the domain list")
        metric_get_consul:inc(1, {"failed"})
        return false
    end
    metric_get_consul:inc(1, {"succ"})

    local hosts, decode_err = decode_array(res.body)
    if hosts == nil then
        ngx_log(ngx_err, "Decode domain list failed: ", decode_err)
        return false
    end

    local all_ok = true
    for i = 1, #hosts do
        local host = hosts[i]
        if type(host) ~= "string" then
            -- e.g. a JSON null or nested value; it cannot be used as a key.
            ngx_log(ngx_err, "Skipping non-string host entry at index ", i)
            all_ok = false
        else
            local routers, req_err = httpc:request_uri(base .. "/v1/kv/domain/" .. host .. "/routers?raw")
            if not routers then
                -- Transport failure: Consul is unreachable, stop this round.
                ngx_log(ngx_err, req_err)
                return false
            end
            local uris_json = routers.body
            local parsed, parse_err
            if routers.status == 200 then
                parsed, parse_err = decode_array(uris_json)
            else
                parse_err = "status " .. tostring(routers.status)
            end
            if parsed == nil then
                -- Keep the previously cached value rather than caching garbage.
                ngx_log(ngx_err, "Invalid routers for host ", host, ": ", parse_err)
                all_ok = false
            else
                local stored, set_err = uris:set(host, uris_json)
                if not stored then
                    ngx_log(ngx_err, "Failed to cache routers for host ", host, ": ", set_err)
                    all_ok = false
                end
            end
        end
    end
    return all_ok
end

-- Trigger the very first Consul sync after nginx has started.
-- Guarded by the "initted" flag so that it is scheduled only once.
function _M.first_init()
    local initted = global_set:get("initted")
    if initted == false then
        global_set:set("initted", true)
        local function handler(premature)
            -- premature is set when the worker is shutting down; skip the sync.
            if premature then
                return
            end
            if not _M.sync_consul() then
                ngx_log(ngx_err, "Call sync_consul failed!")
                return
            end
        end
        -- Zero-delay timer: the sync runs outside the log phase.
        local ok, err = timer_at(0, handler)
        if not ok then
            ngx_log(ngx_err, "Call timer_at failed: ", err)
            return
        end
        ngx_log(ngx_err, "First initialize load consul data!")
    end
end

-- Start the periodic Consul polling loop (every `delay` seconds).
function _M.loop_load()
    -- `premature` is passed by the timer machinery when the worker is going
    -- away (for example during a graceful reload).
    local function loop_handler(premature)
        ngx_log(ngx_err, "Timer prematurely expired: ", premature)
        ngx_log(ngx_err, "Worker exiting: ", ngx.worker.exiting())
        if premature then
            global_set:set("looped", false)
            return
        end
        if not _M.sync_consul() then
            ngx_log(ngx_err, "Call sync_consul failed!")
        end
        -- Re-arm the timer whether or not this round succeeded; otherwise a
        -- single failed sync would stop the polling for good.
        local ok, err = timer_at(delay, loop_handler)
        if not ok then
            ngx_log(ngx_err, "Call timer_at failed: ", err)
            -- Let a later request start the loop again.
            global_set:set("looped", false)
            return
        end
        ngx_log(ngx_err, "Looping in timer!")
    end
    -- Only worker 0 starts the loop, so the timer is not started repeatedly.
    if global_set:get("looped") == false then
        if 0 == ngx.worker.id() then
            local ok, err = timer_at(delay, loop_handler)
            if not ok then
                ngx_log(ngx_err, "Call timer_at failed: ", err)
                return
            end
            global_set:set("looped", true)
            ngx_log(ngx_err, "Starting loop load consul data!")
        end
    end
end

-- Log-phase entry point: makes sure the Consul sync is running, then records
-- the request latency for every configured endpoint pattern of the request's
-- host that matches the request URI.
function _M.log()
    _M.first_init()
    _M.loop_load()
    local request_host = ngx.var.host
    if not request_host then
        return
    end
    local request_uri = ngx.unescape_uri(ngx.var.uri)
    local request_status = ngx.var.status
    local request_scheme = ngx.var.scheme
    local request_method = ngx.var.request_method

    -- Direct lookup by host instead of listing and scanning every key.
    local uris_json = uris:get(request_host)
    if uris_json == nil then
        -- Host is not monitored (or nothing has been synced yet).
        return
    end
    local def_uri = decode_array(uris_json)
    if def_uri == nil then
        ngx_log(ngx_err, "Decode uris err!")
        return
    end

    local latency = ngx.now() - ngx.req.start_time()
    for k = 1, #def_uri do
        local pattern = def_uri[k]
        if type(pattern) == "string" then
            -- Patterns are anchored so that they must match the whole URI.
            local from, _, re_err = ngx.re.find(request_uri, "^" .. pattern .. "$", "isjo")
            if from ~= nil then
                metric_latency:observe(latency, {request_host, request_status, request_scheme, request_method, pattern})
            elseif re_err then
                ngx_log(ngx_err, "Bad endpoint pattern for host ", request_host, ": ", re_err)
            end
        end
    end
end

return _M
--------------------------------------------------------------------------------