├── .gitignore ├── LICENSE ├── NOTICE ├── README.md ├── cfg.example.json ├── control ├── cron ├── cleaner.go └── strategy.go ├── g ├── cfg.go ├── g.go ├── redis.go ├── rpc.go └── var.go ├── http ├── common.go ├── http.go └── info.go ├── main.go ├── rpc ├── receiver.go └── rpc.go └── store ├── func.go ├── history.go ├── judge.go └── linkedlist.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | 26 | *.swp 27 | *.swo 28 | *.log 29 | .idea 30 | .DS_Store 31 | /var 32 | /judge* 33 | /falcon-judge* 34 | /cfg.json 35 | /gitversion 36 | 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Open-Falcon 2 | 3 | Copyright (c) 2014-2015 Xiaomi, Inc. All Rights Reserved. 4 | 5 | This product is licensed to you under the Apache License, Version 2.0 (the "License"). 6 | You may not use this product except in compliance with the License. 7 | 8 | This product may include a number of subcomponents with separate copyright notices 9 | and license terms. Your use of these subcomponents is subject to the terms and 10 | conditions of the subcomponent's license, as noted in the LICENSE file. 
11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | falcon-judge 2 | ============ 3 | 4 | Judge是用于判断是否触发报警条件的组件。 5 | 6 | Transfer的数据不但要打到Graph来存储并绘图,还要打到Judge用于报警判断。Judge先从hbs获取所有策略列表,静等Transfer的数据转发。 7 | 每收到一条Transfer转发过来的数据,立即找到这条数据关联的Strategy、Expression,然后做阈值判断。 8 | 9 | **如何找到关联的Strategy** 10 | push上来的数据带有一个endpoint,endpoint通常都是hostname,hostname隶属于多个HostGroup,HostGroup可以关联多个Template,各个 11 | Template下面就是Strategy,层层顺藤摸瓜可得。但是,如果endpoint不是hostname,并没有被HostGroup管理,那就找不到了。 12 | 13 | **如何找到关联的Expression** 14 | 这是一种更通用的方案,主要针对endpoint不是hostname的情况。push上来的数据通常带有多个tag,比如project=falcon,module=judge, 15 | 假如我们要针对所有打了project=falcon这个tag的数据做qps的阈值判断,那我们可以配置一个这样的表达式: 16 | 17 | ``` 18 | each(metric=qps project=falcon) 19 | ``` 20 | 21 | 如上配置之后,push上来的数据如果发现metric=qps,并且带有project=falcon这个tag,那就说明与这个expression相关,要做相关阈值判断。 22 | 23 | ## Installation 24 | 25 | ```bash 26 | # set $GOPATH and $GOROOT 27 | mkdir -p $GOPATH/src/github.com/open-falcon 28 | cd $GOPATH/src/github.com/open-falcon 29 | git clone https://github.com/open-falcon/judge.git 30 | cd judge 31 | go get ./... 
32 | ./control build 33 | ./control start 34 | ``` 35 | 36 | ## Configuration 37 | 38 | 配置文件中主要是一些连接地址和监听的端口,没啥好说的,看一下alarm的配置,judge报警判断完毕之后会产生报警event,这些event会写入 39 | alarm的redis队列中,不同优先级(配置策略的时候每个策略会配置一个优先级,0-5)写入不同队列,alarm中除了redis地址需要修改,其他 40 | 的建议维持默认。 41 | 42 | alarm中有一个minInterval的配置,单位是秒,默认是300秒,表示同一个event,如果配置报警多次,那么两个报警之间至少间隔300秒。 43 | 这是个经验值,我们觉得报警太频繁没有意义,对工程师来说是干扰。收到报警之后拿出电脑、开机、连上vpn就差不多要3分钟了…… 44 | 45 | -------------------------------------------------------------------------------- /cfg.example.json: -------------------------------------------------------------------------------- 1 | { 2 | "debug": true, 3 | "debugHost": "nil", 4 | "remain": 11, 5 | "http": { 6 | "enabled": true, 7 | "listen": "0.0.0.0:6081" 8 | }, 9 | "rpc": { 10 | "enabled": true, 11 | "listen": "0.0.0.0:6080" 12 | }, 13 | "hbs": { 14 | "servers": ["127.0.0.1:6030"], 15 | "timeout": 300, 16 | "interval": 60 17 | }, 18 | "alarm": { 19 | "enabled": true, 20 | "minInterval": 300, 21 | "queuePattern": "event:p%v", 22 | "redis": { 23 | "dsn": "127.0.0.1:6379", 24 | "maxIdle": 5, 25 | "connTimeout": 5000, 26 | "readTimeout": 5000, 27 | "writeTimeout": 5000 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /control: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORKSPACE=$(cd $(dirname $0)/; pwd) 4 | cd $WORKSPACE 5 | 6 | mkdir -p var 7 | 8 | module=judge 9 | app=falcon-$module 10 | conf=cfg.json 11 | pidfile=var/app.pid 12 | logfile=var/app.log 13 | 14 | [[ -f $module ]] && mv $module $app 15 | [[ -f $conf ]] || cp cfg.example.json $conf 16 | 17 | function check_pid() { 18 | if [ -f $pidfile ];then 19 | pid=`cat $pidfile` 20 | if [ -n $pid ]; then 21 | running=`ps -p $pid|grep -v "PID TTY" |wc -l` 22 | return $running 23 | fi 24 | fi 25 | return 0 26 | } 27 | 28 | function start() { 29 | check_pid 30 | running=$? 
# build compiles the module with `go build`, renames the resulting binary
# from $module to $app and prints its version via `./$app -v`.
# On a failed compile the script exits with go build's own exit status.
function build() {
    go build
    # Capture the status immediately: inside the `if` body, $? holds the
    # status of the `[` test itself (0), so `exit $?` would report success
    # for a failed build.
    local rc=$?
    if [ $rc -ne 0 ]; then
        exit $rc
    fi
    mv $module $app
    ./$app -v
}
"github.com/open-falcon/judge/g" 8 | "log" 9 | "time" 10 | ) 11 | 12 | func SyncStrategies() { 13 | duration := time.Duration(g.Config().Hbs.Interval) * time.Second 14 | for { 15 | syncStrategies() 16 | syncExpression() 17 | time.Sleep(duration) 18 | } 19 | } 20 | 21 | func syncStrategies() { 22 | var strategiesResponse model.StrategiesResponse 23 | err := g.HbsClient.Call("Hbs.GetStrategies", model.NullRpcRequest{}, &strategiesResponse) 24 | if err != nil { 25 | log.Println("[ERROR] Hbs.GetStrategies:", err) 26 | return 27 | } 28 | 29 | rebuildStrategyMap(&strategiesResponse) 30 | } 31 | 32 | func rebuildStrategyMap(strategiesResponse *model.StrategiesResponse) { 33 | // endpoint:metric => [strategy1, strategy2 ...] 34 | m := make(map[string][]model.Strategy) 35 | for _, hs := range strategiesResponse.HostStrategies { 36 | hostname := hs.Hostname 37 | if g.Config().Debug && hostname == g.Config().DebugHost { 38 | log.Println(hostname, "strategies:") 39 | bs, _ := json.Marshal(hs.Strategies) 40 | fmt.Println(string(bs)) 41 | } 42 | for _, strategy := range hs.Strategies { 43 | key := fmt.Sprintf("%s/%s", hostname, strategy.Metric) 44 | if _, exists := m[key]; exists { 45 | m[key] = append(m[key], strategy) 46 | } else { 47 | m[key] = []model.Strategy{strategy} 48 | } 49 | } 50 | } 51 | 52 | g.StrategyMap.ReInit(m) 53 | } 54 | 55 | func syncExpression() { 56 | var expressionResponse model.ExpressionResponse 57 | err := g.HbsClient.Call("Hbs.GetExpressions", model.NullRpcRequest{}, &expressionResponse) 58 | if err != nil { 59 | log.Println("[ERROR] Hbs.GetExpressions:", err) 60 | return 61 | } 62 | 63 | rebuildExpressionMap(&expressionResponse) 64 | } 65 | 66 | func rebuildExpressionMap(expressionResponse *model.ExpressionResponse) { 67 | m := make(map[string][]*model.Expression) 68 | for _, exp := range expressionResponse.Expressions { 69 | for k, v := range exp.Tags { 70 | key := fmt.Sprintf("%s/%s=%s", exp.Metric, k, v) 71 | if _, exists := m[key]; exists { 72 | 
m[key] = append(m[key], exp) 73 | } else { 74 | m[key] = []*model.Expression{exp} 75 | } 76 | } 77 | } 78 | 79 | g.ExpressionMap.ReInit(m) 80 | } 81 | -------------------------------------------------------------------------------- /g/cfg.go: -------------------------------------------------------------------------------- 1 | package g 2 | 3 | import ( 4 | "encoding/json" 5 | "github.com/toolkits/file" 6 | "log" 7 | "sync" 8 | ) 9 | 10 | type HttpConfig struct { 11 | Enabled bool `json:"enabled"` 12 | Listen string `json:"listen"` 13 | } 14 | 15 | type RpcConfig struct { 16 | Enabled bool `json:"enabled"` 17 | Listen string `json:"listen"` 18 | } 19 | 20 | type HbsConfig struct { 21 | Servers []string `json:"servers"` 22 | Timeout int64 `json:"timeout"` 23 | Interval int64 `json:"interval"` 24 | } 25 | 26 | type RedisConfig struct { 27 | Dsn string `json:"dsn"` 28 | MaxIdle int `json:"maxIdle"` 29 | ConnTimeout int `json:"connTimeout"` 30 | ReadTimeout int `json:"readTimeout"` 31 | WriteTimeout int `json:"writeTimeout"` 32 | } 33 | 34 | type AlarmConfig struct { 35 | Enabled bool `json:"enabled"` 36 | MinInterval int64 `json:"minInterval"` 37 | QueuePattern string `json:"queuePattern"` 38 | Redis *RedisConfig `json:"redis"` 39 | } 40 | 41 | type GlobalConfig struct { 42 | Debug bool `json:"debug"` 43 | DebugHost string `json:"debugHost"` 44 | Remain int `json:"remain"` 45 | Http *HttpConfig `json:"http"` 46 | Rpc *RpcConfig `json:"rpc"` 47 | Hbs *HbsConfig `json:"hbs"` 48 | Alarm *AlarmConfig `json:"alarm"` 49 | } 50 | 51 | var ( 52 | ConfigFile string 53 | config *GlobalConfig 54 | configLock = new(sync.RWMutex) 55 | ) 56 | 57 | func Config() *GlobalConfig { 58 | configLock.RLock() 59 | defer configLock.RUnlock() 60 | return config 61 | } 62 | 63 | func ParseConfig(cfg string) { 64 | if cfg == "" { 65 | log.Fatalln("use -c to specify configuration file") 66 | } 67 | 68 | if !file.IsExist(cfg) { 69 | log.Fatalln("config file:", cfg, "is not existent") 70 | } 71 | 
// change log
// 2.0.1: bugfix HistoryData limit
// 2.0.2: clean stale data

// VERSION is the version string of this judge build.
const VERSION = "2.0.2"

// init configures process-wide runtime and logging defaults: it lets the
// scheduler use every available CPU and makes the standard logger prefix
// each entry with date, time and the short source file name.
func init() {
	cpus := runtime.NumCPU()
	runtime.GOMAXPROCS(cpus)

	flags := log.Ldate | log.Ltime | log.Lshortfile
	log.SetFlags(flags)
}
writeTimeout) 29 | if err != nil { 30 | return nil, err 31 | } 32 | return c, err 33 | }, 34 | TestOnBorrow: PingRedis, 35 | } 36 | } 37 | 38 | func PingRedis(c redis.Conn, t time.Time) error { 39 | _, err := c.Do("ping") 40 | if err != nil { 41 | log.Println("[ERROR] ping redis fail", err) 42 | } 43 | return err 44 | } 45 | -------------------------------------------------------------------------------- /g/rpc.go: -------------------------------------------------------------------------------- 1 | package g 2 | 3 | import ( 4 | "github.com/toolkits/net" 5 | "log" 6 | "math" 7 | "net/rpc" 8 | "sync" 9 | "time" 10 | ) 11 | 12 | type SingleConnRpcClient struct { 13 | sync.Mutex 14 | rpcClient *rpc.Client 15 | RpcServers []string 16 | Timeout time.Duration 17 | } 18 | 19 | func (this *SingleConnRpcClient) close() { 20 | if this.rpcClient != nil { 21 | this.rpcClient.Close() 22 | this.rpcClient = nil 23 | } 24 | } 25 | 26 | func (this *SingleConnRpcClient) insureConn() { 27 | if this.rpcClient != nil { 28 | return 29 | } 30 | 31 | var err error 32 | var retry int = 1 33 | 34 | for { 35 | if this.rpcClient != nil { 36 | return 37 | } 38 | 39 | for _, s := range this.RpcServers { 40 | this.rpcClient, err = net.JsonRpcClient("tcp", s, this.Timeout) 41 | if err == nil { 42 | return 43 | } 44 | 45 | log.Printf("dial %s fail: %s", s, err) 46 | } 47 | 48 | if retry > 6 { 49 | retry = 1 50 | } 51 | 52 | time.Sleep(time.Duration(math.Pow(2.0, float64(retry))) * time.Second) 53 | 54 | retry++ 55 | } 56 | } 57 | 58 | func (this *SingleConnRpcClient) Call(method string, args interface{}, reply interface{}) error { 59 | 60 | this.Lock() 61 | defer this.Unlock() 62 | 63 | this.insureConn() 64 | 65 | err := this.rpcClient.Call(method, args, reply) 66 | if err != nil { 67 | this.close() 68 | } 69 | 70 | return err 71 | } 72 | -------------------------------------------------------------------------------- /g/var.go: 
-------------------------------------------------------------------------------- 1 | package g 2 | 3 | import ( 4 | "github.com/open-falcon/common/model" 5 | "sync" 6 | "time" 7 | ) 8 | 9 | type SafeStrategyMap struct { 10 | sync.RWMutex 11 | // endpoint:metric => [strategy1, strategy2 ...] 12 | M map[string][]model.Strategy 13 | } 14 | 15 | type SafeExpressionMap struct { 16 | sync.RWMutex 17 | // metric:tag1 => [exp1, exp2 ...] 18 | // metric:tag2 => [exp1, exp2 ...] 19 | M map[string][]*model.Expression 20 | } 21 | 22 | type SafeEventMap struct { 23 | sync.RWMutex 24 | M map[string]*model.Event 25 | } 26 | 27 | var ( 28 | HbsClient *SingleConnRpcClient 29 | StrategyMap = &SafeStrategyMap{M: make(map[string][]model.Strategy)} 30 | ExpressionMap = &SafeExpressionMap{M: make(map[string][]*model.Expression)} 31 | LastEvents = &SafeEventMap{M: make(map[string]*model.Event)} 32 | ) 33 | 34 | func InitHbsClient() { 35 | HbsClient = &SingleConnRpcClient{ 36 | RpcServers: Config().Hbs.Servers, 37 | Timeout: time.Duration(Config().Hbs.Timeout) * time.Millisecond, 38 | } 39 | } 40 | 41 | func (this *SafeStrategyMap) ReInit(m map[string][]model.Strategy) { 42 | this.Lock() 43 | defer this.Unlock() 44 | this.M = m 45 | } 46 | 47 | func (this *SafeStrategyMap) Get() map[string][]model.Strategy { 48 | this.RLock() 49 | defer this.RUnlock() 50 | return this.M 51 | } 52 | 53 | func (this *SafeExpressionMap) ReInit(m map[string][]*model.Expression) { 54 | this.Lock() 55 | defer this.Unlock() 56 | this.M = m 57 | } 58 | 59 | func (this *SafeExpressionMap) Get() map[string][]*model.Expression { 60 | this.RLock() 61 | defer this.RUnlock() 62 | return this.M 63 | } 64 | 65 | func (this *SafeEventMap) Get(key string) (*model.Event, bool) { 66 | this.RLock() 67 | defer this.RUnlock() 68 | event, exists := this.M[key] 69 | return event, exists 70 | } 71 | 72 | func (this *SafeEventMap) Set(key string, event *model.Event) { 73 | this.Lock() 74 | defer this.Unlock() 75 | this.M[key] = 
// RenderJson serializes v as JSON and writes it to w with a
// "application/json; charset=UTF-8" content type. If v cannot be marshaled,
// the marshal error text is sent as a 500 Internal Server Error instead.
func RenderJson(w http.ResponseWriter, v interface{}) {
	payload, marshalErr := json.Marshal(v)
	if marshalErr == nil {
		w.Header().Set("Content-Type", "application/json; charset=UTF-8")
		w.Write(payload)
		return
	}

	http.Error(w, marshalErr.Error(), http.StatusInternalServerError)
}
http.ResponseWriter, msg string) { 36 | RenderJson(w, map[string]string{"msg": msg}) 37 | } 38 | 39 | func AutoRender(w http.ResponseWriter, data interface{}, err error) { 40 | if err != nil { 41 | RenderMsgJson(w, err.Error()) 42 | return 43 | } 44 | RenderDataJson(w, data) 45 | } 46 | 47 | func Start() { 48 | if !g.Config().Http.Enabled { 49 | return 50 | } 51 | 52 | addr := g.Config().Http.Listen 53 | if addr == "" { 54 | return 55 | } 56 | s := &http.Server{ 57 | Addr: addr, 58 | MaxHeaderBytes: 1 << 30, 59 | } 60 | log.Println("http listening", addr) 61 | log.Fatalln(s.ListenAndServe()) 62 | } 63 | -------------------------------------------------------------------------------- /http/info.go: -------------------------------------------------------------------------------- 1 | package http 2 | 3 | import ( 4 | "fmt" 5 | "github.com/open-falcon/common/utils" 6 | "github.com/open-falcon/judge/g" 7 | "github.com/open-falcon/judge/store" 8 | "net/http" 9 | "strings" 10 | ) 11 | 12 | func configInfoRoutes() { 13 | // e.g. /strategy/lg-dinp-docker01.bj/cpu.idle 14 | http.HandleFunc("/strategy/", func(w http.ResponseWriter, r *http.Request) { 15 | urlParam := r.URL.Path[len("/strategy/"):] 16 | m := g.StrategyMap.Get() 17 | RenderDataJson(w, m[urlParam]) 18 | }) 19 | 20 | // e.g. 
/expression/net.port.listen/port=22 21 | http.HandleFunc("/expression/", func(w http.ResponseWriter, r *http.Request) { 22 | urlParam := r.URL.Path[len("/expression/"):] 23 | m := g.ExpressionMap.Get() 24 | RenderDataJson(w, m[urlParam]) 25 | }) 26 | 27 | http.HandleFunc("/count", func(w http.ResponseWriter, r *http.Request) { 28 | sum := 0 29 | arr := []string{"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f"} 30 | for i := 0; i < 16; i++ { 31 | for j := 0; j < 16; j++ { 32 | sum += store.HistoryBigMap[arr[i]+arr[j]].Len() 33 | } 34 | } 35 | 36 | out := fmt.Sprintf("total: %d\n", sum) 37 | w.Write([]byte(out)) 38 | }) 39 | 40 | http.HandleFunc("/history/", func(w http.ResponseWriter, r *http.Request) { 41 | urlParam := r.URL.Path[len("/history/"):] 42 | pk := utils.Md5(urlParam) 43 | L, exists := store.HistoryBigMap[pk[0:2]].Get(pk) 44 | if !exists || L.Len() == 0 { 45 | w.Write([]byte("not found\n")) 46 | return 47 | } 48 | 49 | arr := []string{} 50 | 51 | datas, _ := L.HistoryData(g.Config().Remain - 1) 52 | for i := 0; i < len(datas); i++ { 53 | if datas[i] == nil { 54 | continue 55 | } 56 | 57 | str := fmt.Sprintf( 58 | "%d %s %v\n", 59 | datas[i].Timestamp, 60 | utils.UnixTsFormat(datas[i].Timestamp), 61 | datas[i].Value, 62 | ) 63 | arr = append(arr, str) 64 | } 65 | 66 | w.Write([]byte(strings.Join(arr, ""))) 67 | }) 68 | 69 | } 70 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "github.com/open-falcon/judge/cron" 7 | "github.com/open-falcon/judge/g" 8 | "github.com/open-falcon/judge/http" 9 | "github.com/open-falcon/judge/rpc" 10 | "github.com/open-falcon/judge/store" 11 | "os" 12 | ) 13 | 14 | func main() { 15 | cfg := flag.String("c", "cfg.json", "configuration file") 16 | version := flag.Bool("v", false, "show version") 17 | flag.Parse() 18 | 
19 | if *version { 20 | fmt.Println(g.VERSION) 21 | os.Exit(0) 22 | } 23 | 24 | g.ParseConfig(*cfg) 25 | 26 | g.InitRedisConnPool() 27 | g.InitHbsClient() 28 | 29 | store.InitHistoryBigMap() 30 | 31 | go http.Start() 32 | go rpc.Start() 33 | 34 | go cron.SyncStrategies() 35 | go cron.CleanStale() 36 | 37 | select {} 38 | } 39 | -------------------------------------------------------------------------------- /rpc/receiver.go: -------------------------------------------------------------------------------- 1 | package rpc 2 | 3 | import ( 4 | "github.com/open-falcon/common/model" 5 | "github.com/open-falcon/judge/g" 6 | "github.com/open-falcon/judge/store" 7 | "time" 8 | ) 9 | 10 | type Judge int 11 | 12 | func (this *Judge) Ping(req model.NullRpcRequest, resp *model.SimpleRpcResponse) error { 13 | return nil 14 | } 15 | 16 | func (this *Judge) Send(items []*model.JudgeItem, resp *model.SimpleRpcResponse) error { 17 | remain := g.Config().Remain 18 | // 把当前时间的计算放在最外层,是为了减少获取时间时的系统调用开销 19 | now := time.Now().Unix() 20 | for _, item := range items { 21 | pk := item.PrimaryKey() 22 | store.HistoryBigMap[pk[0:2]].PushFrontAndMaintain(pk, item, remain, now) 23 | } 24 | return nil 25 | } 26 | -------------------------------------------------------------------------------- /rpc/rpc.go: -------------------------------------------------------------------------------- 1 | package rpc 2 | 3 | import ( 4 | "github.com/open-falcon/judge/g" 5 | "log" 6 | "net" 7 | "net/rpc" 8 | ) 9 | 10 | func Start() { 11 | if !g.Config().Rpc.Enabled { 12 | return 13 | } 14 | addr := g.Config().Rpc.Listen 15 | tcpAddr, err := net.ResolveTCPAddr("tcp", addr) 16 | if err != nil { 17 | log.Fatalf("net.ResolveTCPAddr fail: %s", err) 18 | } 19 | 20 | listener, err := net.ListenTCP("tcp", tcpAddr) 21 | if err != nil { 22 | log.Fatalf("listen %s fail: %s", addr, err) 23 | } else { 24 | log.Println("rpc listening", addr) 25 | } 26 | 27 | rpc.Register(new(Judge)) 28 | 29 | for { 30 | conn, err := 
listener.Accept() 31 | if err != nil { 32 | log.Printf("listener.Accept occur error: %s", err) 33 | continue 34 | } 35 | go rpc.ServeConn(conn) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /store/func.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "fmt" 5 | "github.com/open-falcon/common/model" 6 | "math" 7 | "strconv" 8 | "strings" 9 | ) 10 | 11 | type Function interface { 12 | Compute(L *SafeLinkedList) (vs []*model.HistoryData, leftValue float64, isTriggered bool, isEnough bool) 13 | } 14 | 15 | type MaxFunction struct { 16 | Function 17 | Limit int 18 | Operator string 19 | RightValue float64 20 | } 21 | 22 | func (this MaxFunction) Compute(L *SafeLinkedList) (vs []*model.HistoryData, leftValue float64, isTriggered bool, isEnough bool) { 23 | vs, isEnough = L.HistoryData(this.Limit) 24 | if !isEnough { 25 | return 26 | } 27 | 28 | max := vs[0].Value 29 | for i := 1; i < this.Limit; i++ { 30 | if max < vs[i].Value { 31 | max = vs[i].Value 32 | } 33 | } 34 | 35 | leftValue = max 36 | isTriggered = checkIsTriggered(leftValue, this.Operator, this.RightValue) 37 | return 38 | } 39 | 40 | type MinFunction struct { 41 | Function 42 | Limit int 43 | Operator string 44 | RightValue float64 45 | } 46 | 47 | func (this MinFunction) Compute(L *SafeLinkedList) (vs []*model.HistoryData, leftValue float64, isTriggered bool, isEnough bool) { 48 | vs, isEnough = L.HistoryData(this.Limit) 49 | if !isEnough { 50 | return 51 | } 52 | 53 | min := vs[0].Value 54 | for i := 1; i < this.Limit; i++ { 55 | if min > vs[i].Value { 56 | min = vs[i].Value 57 | } 58 | } 59 | 60 | leftValue = min 61 | isTriggered = checkIsTriggered(leftValue, this.Operator, this.RightValue) 62 | return 63 | } 64 | 65 | type AllFunction struct { 66 | Function 67 | Limit int 68 | Operator string 69 | RightValue float64 70 | } 71 | 72 | func (this AllFunction) Compute(L 
*SafeLinkedList) (vs []*model.HistoryData, leftValue float64, isTriggered bool, isEnough bool) { 73 | vs, isEnough = L.HistoryData(this.Limit) 74 | if !isEnough { 75 | return 76 | } 77 | 78 | isTriggered = true 79 | for i := 0; i < this.Limit; i++ { 80 | isTriggered = checkIsTriggered(vs[i].Value, this.Operator, this.RightValue) 81 | if !isTriggered { 82 | break 83 | } 84 | } 85 | 86 | leftValue = vs[0].Value 87 | return 88 | } 89 | 90 | type SumFunction struct { 91 | Function 92 | Limit int 93 | Operator string 94 | RightValue float64 95 | } 96 | 97 | func (this SumFunction) Compute(L *SafeLinkedList) (vs []*model.HistoryData, leftValue float64, isTriggered bool, isEnough bool) { 98 | vs, isEnough = L.HistoryData(this.Limit) 99 | if !isEnough { 100 | return 101 | } 102 | 103 | sum := 0.0 104 | for i := 0; i < this.Limit; i++ { 105 | sum += vs[i].Value 106 | } 107 | 108 | leftValue = sum 109 | isTriggered = checkIsTriggered(leftValue, this.Operator, this.RightValue) 110 | return 111 | } 112 | 113 | type AvgFunction struct { 114 | Function 115 | Limit int 116 | Operator string 117 | RightValue float64 118 | } 119 | 120 | func (this AvgFunction) Compute(L *SafeLinkedList) (vs []*model.HistoryData, leftValue float64, isTriggered bool, isEnough bool) { 121 | vs, isEnough = L.HistoryData(this.Limit) 122 | if !isEnough { 123 | return 124 | } 125 | 126 | sum := 0.0 127 | for i := 0; i < this.Limit; i++ { 128 | sum += vs[i].Value 129 | } 130 | 131 | leftValue = sum / float64(this.Limit) 132 | isTriggered = checkIsTriggered(leftValue, this.Operator, this.RightValue) 133 | return 134 | } 135 | 136 | type DiffFunction struct { 137 | Function 138 | Limit int 139 | Operator string 140 | RightValue float64 141 | } 142 | 143 | // 只要有一个点的diff触发阈值,就报警 144 | func (this DiffFunction) Compute(L *SafeLinkedList) (vs []*model.HistoryData, leftValue float64, isTriggered bool, isEnough bool) { 145 | // 此处this.Limit要+1,因为通常说diff(#3),是当前点与历史的3个点相比较 146 | // 然而最新点已经在linkedlist的第一个位置,所以…… 147 
| vs, isEnough = L.HistoryData(this.Limit + 1) 148 | if !isEnough { 149 | return 150 | } 151 | 152 | if len(vs) == 0 { 153 | isEnough = false 154 | return 155 | } 156 | 157 | first := vs[0].Value 158 | 159 | isTriggered = false 160 | for i := 1; i < this.Limit+1; i++ { 161 | // diff是当前值减去历史值 162 | leftValue = first - vs[i].Value 163 | isTriggered = checkIsTriggered(leftValue, this.Operator, this.RightValue) 164 | if isTriggered { 165 | break 166 | } 167 | } 168 | 169 | return 170 | } 171 | 172 | // pdiff(#3) 173 | type PDiffFunction struct { 174 | Function 175 | Limit int 176 | Operator string 177 | RightValue float64 178 | } 179 | 180 | func (this PDiffFunction) Compute(L *SafeLinkedList) (vs []*model.HistoryData, leftValue float64, isTriggered bool, isEnough bool) { 181 | vs, isEnough = L.HistoryData(this.Limit + 1) 182 | if !isEnough { 183 | return 184 | } 185 | 186 | if len(vs) == 0 { 187 | isEnough = false 188 | return 189 | } 190 | 191 | first := vs[0].Value 192 | 193 | isTriggered = false 194 | for i := 1; i < this.Limit+1; i++ { 195 | if vs[i].Value == 0 { 196 | continue 197 | } 198 | 199 | leftValue = (first - vs[i].Value) / vs[i].Value * 100.0 200 | isTriggered = checkIsTriggered(leftValue, this.Operator, this.RightValue) 201 | if isTriggered { 202 | break 203 | } 204 | } 205 | 206 | return 207 | } 208 | 209 | // @str: e.g. 
all(#3) sum(#3) avg(#10) diff(#10) 210 | func ParseFuncFromString(str string, operator string, rightValue float64) (fn Function, err error) { 211 | idx := strings.Index(str, "#") 212 | limit, err := strconv.ParseInt(str[idx+1:len(str)-1], 10, 64) 213 | if err != nil { 214 | return nil, err 215 | } 216 | 217 | switch str[:idx-1] { 218 | case "max": 219 | fn = &MaxFunction{Limit: int(limit), Operator: operator, RightValue: rightValue} 220 | case "min": 221 | fn = &MinFunction{Limit: int(limit), Operator: operator, RightValue: rightValue} 222 | case "all": 223 | fn = &AllFunction{Limit: int(limit), Operator: operator, RightValue: rightValue} 224 | case "sum": 225 | fn = &SumFunction{Limit: int(limit), Operator: operator, RightValue: rightValue} 226 | case "avg": 227 | fn = &AvgFunction{Limit: int(limit), Operator: operator, RightValue: rightValue} 228 | case "diff": 229 | fn = &DiffFunction{Limit: int(limit), Operator: operator, RightValue: rightValue} 230 | case "pdiff": 231 | fn = &PDiffFunction{Limit: int(limit), Operator: operator, RightValue: rightValue} 232 | default: 233 | err = fmt.Errorf("not_supported_method") 234 | } 235 | 236 | return 237 | } 238 | 239 | func checkIsTriggered(leftValue float64, operator string, rightValue float64) (isTriggered bool) { 240 | switch operator { 241 | case "=", "==": 242 | isTriggered = math.Abs(leftValue-rightValue) < 0.0001 243 | case "!=": 244 | isTriggered = math.Abs(leftValue-rightValue) > 0.0001 245 | case "<": 246 | isTriggered = leftValue < rightValue 247 | case "<=": 248 | isTriggered = leftValue <= rightValue 249 | case ">": 250 | isTriggered = leftValue > rightValue 251 | case ">=": 252 | isTriggered = leftValue >= rightValue 253 | } 254 | 255 | return 256 | } 257 | -------------------------------------------------------------------------------- /store/history.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "container/list" 5 | 
"github.com/open-falcon/common/model" 6 | "sync" 7 | ) 8 | 9 | type JudgeItemMap struct { 10 | sync.RWMutex 11 | M map[string]*SafeLinkedList 12 | } 13 | 14 | func NewJudgeItemMap() *JudgeItemMap { 15 | return &JudgeItemMap{M: make(map[string]*SafeLinkedList)} 16 | } 17 | 18 | func (this *JudgeItemMap) Get(key string) (*SafeLinkedList, bool) { 19 | this.RLock() 20 | defer this.RUnlock() 21 | val, ok := this.M[key] 22 | return val, ok 23 | } 24 | 25 | func (this *JudgeItemMap) Set(key string, val *SafeLinkedList) { 26 | this.Lock() 27 | defer this.Unlock() 28 | this.M[key] = val 29 | } 30 | 31 | func (this *JudgeItemMap) Len() int { 32 | this.RLock() 33 | defer this.RUnlock() 34 | return len(this.M) 35 | } 36 | 37 | func (this *JudgeItemMap) Delete(key string) { 38 | this.Lock() 39 | defer this.Unlock() 40 | delete(this.M, key) 41 | } 42 | 43 | func (this *JudgeItemMap) BatchDelete(keys []string) { 44 | count := len(keys) 45 | if count == 0 { 46 | return 47 | } 48 | 49 | this.Lock() 50 | defer this.Unlock() 51 | for i := 0; i < count; i++ { 52 | delete(this.M, keys[i]) 53 | } 54 | } 55 | 56 | func (this *JudgeItemMap) CleanStale(before int64) { 57 | keys := []string{} 58 | 59 | this.RLock() 60 | for key, L := range this.M { 61 | front := L.Front() 62 | if front == nil { 63 | continue 64 | } 65 | 66 | if front.Value.(*model.JudgeItem).Timestamp < before { 67 | keys = append(keys, key) 68 | } 69 | } 70 | this.RUnlock() 71 | 72 | this.BatchDelete(keys) 73 | } 74 | 75 | func (this *JudgeItemMap) PushFrontAndMaintain(key string, val *model.JudgeItem, maxCount int, now int64) { 76 | if linkedList, exists := this.Get(key); exists { 77 | needJudge := linkedList.PushFrontAndMaintain(val, maxCount) 78 | if needJudge { 79 | Judge(linkedList, val, now) 80 | } 81 | } else { 82 | NL := list.New() 83 | NL.PushFront(val) 84 | safeList := &SafeLinkedList{L: NL} 85 | this.Set(key, safeList) 86 | Judge(safeList, val, now) 87 | } 88 | } 89 | 90 | // 这是个线程不安全的大Map,需要提前初始化好 91 | var 
HistoryBigMap = make(map[string]*JudgeItemMap) 92 | 93 | func InitHistoryBigMap() { 94 | arr := []string{"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f"} 95 | for i := 0; i < 16; i++ { 96 | for j := 0; j < 16; j++ { 97 | HistoryBigMap[arr[i]+arr[j]] = NewJudgeItemMap() 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /store/judge.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "github.com/open-falcon/common/model" 7 | "github.com/open-falcon/judge/g" 8 | "log" 9 | ) 10 | 11 | func Judge(L *SafeLinkedList, firstItem *model.JudgeItem, now int64) { 12 | CheckStrategy(L, firstItem, now) 13 | CheckExpression(L, firstItem, now) 14 | } 15 | 16 | func CheckStrategy(L *SafeLinkedList, firstItem *model.JudgeItem, now int64) { 17 | key := fmt.Sprintf("%s/%s", firstItem.Endpoint, firstItem.Metric) 18 | strategyMap := g.StrategyMap.Get() 19 | strategies, exists := strategyMap[key] 20 | if !exists { 21 | return 22 | } 23 | 24 | for _, s := range strategies { 25 | // 因为key仅仅是endpoint和metric,所以得到的strategies并不一定是与当前judgeItem相关的 26 | // 比如lg-dinp-docker01.bj配置了两个proc.num的策略,一个name=docker,一个name=agent 27 | // 所以此处要排除掉一部分 28 | related := true 29 | for tagKey, tagVal := range s.Tags { 30 | if myVal, exists := firstItem.Tags[tagKey]; !exists || myVal != tagVal { 31 | related = false 32 | break 33 | } 34 | } 35 | 36 | if !related { 37 | continue 38 | } 39 | 40 | judgeItemWithStrategy(L, s, firstItem, now) 41 | } 42 | } 43 | 44 | func judgeItemWithStrategy(L *SafeLinkedList, strategy model.Strategy, firstItem *model.JudgeItem, now int64) { 45 | fn, err := ParseFuncFromString(strategy.Func, strategy.Operator, strategy.RightValue) 46 | if err != nil { 47 | log.Printf("[ERROR] parse func %s fail: %v. 
strategy id: %d", strategy.Func, err, strategy.Id) 48 | return 49 | } 50 | 51 | historyData, leftValue, isTriggered, isEnough := fn.Compute(L) 52 | if !isEnough { 53 | return 54 | } 55 | 56 | event := &model.Event{ 57 | Id: fmt.Sprintf("s_%d_%s", strategy.Id, firstItem.PrimaryKey()), 58 | Strategy: &strategy, 59 | Endpoint: firstItem.Endpoint, 60 | LeftValue: leftValue, 61 | EventTime: firstItem.Timestamp, 62 | PushedTags: firstItem.Tags, 63 | } 64 | 65 | sendEventIfNeed(historyData, isTriggered, now, event, strategy.MaxStep) 66 | } 67 | 68 | func sendEvent(event *model.Event) { 69 | // update last event 70 | g.LastEvents.Set(event.Id, event) 71 | 72 | bs, err := json.Marshal(event) 73 | if err != nil { 74 | log.Printf("json marshal event %v fail: %v", event, err) 75 | return 76 | } 77 | 78 | // send to redis 79 | redisKey := fmt.Sprintf(g.Config().Alarm.QueuePattern, event.Priority()) 80 | rc := g.RedisConnPool.Get() 81 | defer rc.Close() 82 | rc.Do("LPUSH", redisKey, string(bs)) 83 | } 84 | 85 | func CheckExpression(L *SafeLinkedList, firstItem *model.JudgeItem, now int64) { 86 | keys := buildKeysFromMetricAndTags(firstItem) 87 | if len(keys) == 0 { 88 | return 89 | } 90 | 91 | // expression可能会被多次重复处理,用此数据结构保证只被处理一次 92 | handledExpression := make(map[int]struct{}) 93 | 94 | expressionMap := g.ExpressionMap.Get() 95 | for _, key := range keys { 96 | expressions, exists := expressionMap[key] 97 | if !exists { 98 | continue 99 | } 100 | 101 | related := filterRelatedExpressions(expressions, firstItem) 102 | for _, exp := range related { 103 | if _, ok := handledExpression[exp.Id]; ok { 104 | continue 105 | } 106 | handledExpression[exp.Id] = struct{}{} 107 | judgeItemWithExpression(L, exp, firstItem, now) 108 | } 109 | } 110 | } 111 | 112 | func buildKeysFromMetricAndTags(item *model.JudgeItem) (keys []string) { 113 | for k, v := range item.Tags { 114 | keys = append(keys, fmt.Sprintf("%s/%s=%s", item.Metric, k, v)) 115 | } 116 | keys = append(keys, 
fmt.Sprintf("%s/endpoint=%s", item.Metric, item.Endpoint)) 117 | return 118 | } 119 | 120 | func filterRelatedExpressions(expressions []*model.Expression, firstItem *model.JudgeItem) []*model.Expression { 121 | size := len(expressions) 122 | if size == 0 { 123 | return []*model.Expression{} 124 | } 125 | 126 | exps := make([]*model.Expression, 0, size) 127 | 128 | for _, exp := range expressions { 129 | 130 | related := true 131 | 132 | itemTagsCopy := firstItem.Tags 133 | // 注意:exp.Tags 中可能会有一个endpoint=xxx的tag 134 | if _, ok := exp.Tags["endpoint"]; ok { 135 | itemTagsCopy = copyItemTags(firstItem) 136 | } 137 | 138 | for tagKey, tagVal := range exp.Tags { 139 | if myVal, exists := itemTagsCopy[tagKey]; !exists || myVal != tagVal { 140 | related = false 141 | break 142 | } 143 | } 144 | 145 | if !related { 146 | continue 147 | } 148 | 149 | exps = append(exps, exp) 150 | } 151 | 152 | return exps 153 | } 154 | 155 | func copyItemTags(item *model.JudgeItem) map[string]string { 156 | ret := make(map[string]string) 157 | ret["endpoint"] = item.Endpoint 158 | if item.Tags != nil && len(item.Tags) > 0 { 159 | for k, v := range item.Tags { 160 | ret[k] = v 161 | } 162 | } 163 | return ret 164 | } 165 | 166 | func judgeItemWithExpression(L *SafeLinkedList, expression *model.Expression, firstItem *model.JudgeItem, now int64) { 167 | fn, err := ParseFuncFromString(expression.Func, expression.Operator, expression.RightValue) 168 | if err != nil { 169 | log.Printf("[ERROR] parse func %s fail: %v. 
expression id: %d", expression.Func, err, expression.Id) 170 | return 171 | } 172 | 173 | historyData, leftValue, isTriggered, isEnough := fn.Compute(L) 174 | if !isEnough { 175 | return 176 | } 177 | 178 | event := &model.Event{ 179 | Id: fmt.Sprintf("e_%d_%s", expression.Id, firstItem.PrimaryKey()), 180 | Expression: expression, 181 | Endpoint: firstItem.Endpoint, 182 | LeftValue: leftValue, 183 | EventTime: firstItem.Timestamp, 184 | PushedTags: firstItem.Tags, 185 | } 186 | 187 | sendEventIfNeed(historyData, isTriggered, now, event, expression.MaxStep) 188 | 189 | } 190 | 191 | func sendEventIfNeed(historyData []*model.HistoryData, isTriggered bool, now int64, event *model.Event, maxStep int) { 192 | lastEvent, exists := g.LastEvents.Get(event.Id) 193 | if isTriggered { 194 | event.Status = "PROBLEM" 195 | if !exists || lastEvent.Status[0] == 'O' { 196 | // 本次触发了阈值,之前又没报过警,得产生一个报警Event 197 | event.CurrentStep = 1 198 | 199 | // 但是有些用户把最大报警次数配置成了0,相当于屏蔽了,要检查一下 200 | if maxStep == 0 { 201 | return 202 | } 203 | 204 | sendEvent(event) 205 | return 206 | } 207 | 208 | // 逻辑走到这里,说明之前Event是PROBLEM状态 209 | if lastEvent.CurrentStep >= maxStep { 210 | // 报警次数已经足够多,到达了最多报警次数了,不再报警 211 | return 212 | } 213 | 214 | if historyData[len(historyData)-1].Timestamp <= lastEvent.EventTime { 215 | // 产生过报警的点,就不能再使用来判断了,否则容易出现一分钟报一次的情况 216 | // 只需要拿最后一个historyData来做判断即可,因为它的时间最老 217 | return 218 | } 219 | 220 | if now-lastEvent.EventTime < g.Config().Alarm.MinInterval { 221 | // 报警不能太频繁,两次报警之间至少要间隔MinInterval秒,否则就不能报警 222 | return 223 | } 224 | 225 | event.CurrentStep = lastEvent.CurrentStep + 1 226 | sendEvent(event) 227 | } else { 228 | // 如果LastEvent是Problem,报OK,否则啥都不做 229 | if exists && lastEvent.Status[0] == 'P' { 230 | event.Status = "OK" 231 | event.CurrentStep = 1 232 | sendEvent(event) 233 | } 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /store/linkedlist.go: 
-------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "container/list" 5 | "github.com/open-falcon/common/model" 6 | "sync" 7 | ) 8 | 9 | type SafeLinkedList struct { 10 | sync.RWMutex 11 | L *list.List 12 | } 13 | 14 | func (this *SafeLinkedList) ToSlice() []*model.JudgeItem { 15 | this.RLock() 16 | defer this.RUnlock() 17 | sz := this.L.Len() 18 | if sz == 0 { 19 | return []*model.JudgeItem{} 20 | } 21 | 22 | ret := make([]*model.JudgeItem, 0, sz) 23 | for e := this.L.Front(); e != nil; e = e.Next() { 24 | ret = append(ret, e.Value.(*model.JudgeItem)) 25 | } 26 | return ret 27 | } 28 | 29 | // @param limit 至多返回这些,如果不够,有多少返回多少 30 | // @return bool isEnough 31 | func (this *SafeLinkedList) HistoryData(limit int) ([]*model.HistoryData, bool) { 32 | if limit < 1 { 33 | // 其实limit不合法,此处也返回false吧,上层代码要注意 34 | // 因为false通常使上层代码进入异常分支,这样就统一了 35 | return []*model.HistoryData{}, false 36 | } 37 | 38 | size := this.Len() 39 | if size == 0 { 40 | return []*model.HistoryData{}, false 41 | } 42 | 43 | firstElement := this.Front() 44 | firstItem := firstElement.Value.(*model.JudgeItem) 45 | 46 | var vs []*model.HistoryData 47 | isEnough := true 48 | 49 | judgeType := firstItem.JudgeType[0] 50 | if judgeType == 'G' || judgeType == 'g' { 51 | if size < limit { 52 | // 有多少获取多少 53 | limit = size 54 | isEnough = false 55 | } 56 | vs = make([]*model.HistoryData, limit) 57 | vs[0] = &model.HistoryData{Timestamp: firstItem.Timestamp, Value: firstItem.Value} 58 | i := 1 59 | currentElement := firstElement 60 | for i < limit { 61 | nextElement := currentElement.Next() 62 | vs[i] = &model.HistoryData{ 63 | Timestamp: nextElement.Value.(*model.JudgeItem).Timestamp, 64 | Value: nextElement.Value.(*model.JudgeItem).Value, 65 | } 66 | i++ 67 | currentElement = nextElement 68 | } 69 | } else { 70 | if size < limit+1 { 71 | isEnough = false 72 | limit = size - 1 73 | } 74 | 75 | vs = make([]*model.HistoryData, limit) 76 | 77 | i 
:= 0 78 | currentElement := firstElement 79 | for i < limit { 80 | nextElement := currentElement.Next() 81 | diffVal := currentElement.Value.(*model.JudgeItem).Value - nextElement.Value.(*model.JudgeItem).Value 82 | diffTs := currentElement.Value.(*model.JudgeItem).Timestamp - nextElement.Value.(*model.JudgeItem).Timestamp 83 | vs[i] = &model.HistoryData{ 84 | Timestamp: currentElement.Value.(*model.JudgeItem).Timestamp, 85 | Value: diffVal / float64(diffTs), 86 | } 87 | i++ 88 | currentElement = nextElement 89 | } 90 | } 91 | 92 | return vs, isEnough 93 | } 94 | 95 | func (this *SafeLinkedList) PushFront(v interface{}) *list.Element { 96 | this.Lock() 97 | defer this.Unlock() 98 | return this.L.PushFront(v) 99 | } 100 | 101 | // @return needJudge 如果是false不需要做judge,因为新上来的数据不合法 102 | func (this *SafeLinkedList) PushFrontAndMaintain(v *model.JudgeItem, maxCount int) bool { 103 | this.Lock() 104 | defer this.Unlock() 105 | 106 | sz := this.L.Len() 107 | if sz > 0 { 108 | // 新push上来的数据有可能重复了,或者timestamp不对,这种数据要丢掉 109 | if v.Timestamp <= this.L.Front().Value.(*model.JudgeItem).Timestamp || v.Timestamp <= 0 { 110 | return false 111 | } 112 | } 113 | 114 | this.L.PushFront(v) 115 | 116 | sz++ 117 | if sz <= maxCount { 118 | return true 119 | } 120 | 121 | del := sz - maxCount 122 | for i := 0; i < del; i++ { 123 | this.L.Remove(this.L.Back()) 124 | } 125 | 126 | return true 127 | } 128 | 129 | func (this *SafeLinkedList) Front() *list.Element { 130 | this.RLock() 131 | defer this.RUnlock() 132 | return this.L.Front() 133 | } 134 | 135 | func (this *SafeLinkedList) Len() int { 136 | this.RLock() 137 | defer this.RUnlock() 138 | return this.L.Len() 139 | } 140 | --------------------------------------------------------------------------------