├── .gitignore ├── LICENSE ├── NOTICE ├── README.md ├── cfg.example.json ├── control ├── g ├── cfg.go ├── g.go └── status.go ├── http ├── api_http.go ├── common.go ├── debug_http.go └── http.go ├── main.go ├── perfcounter.json ├── receiver ├── receiver.go └── rpc │ ├── rpc.go │ └── rpc_transfer.go ├── sender ├── conn_pools.go ├── connpool │ ├── conn_pool.go │ ├── conn_pool_manager.go │ └── influxdb_pool.go ├── send_queues.go ├── send_tasks.go ├── sender.go └── sender_cron.go └── test ├── debug └── rpcclient.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | 26 | *.swp 27 | *.swo 28 | *.log 29 | .idea 30 | .DS_Store 31 | /var 32 | /falcon-transfer* 33 | /cfg.json 34 | /test/build 35 | /test/*.go 36 | 37 | gitversion 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Open-Falcon 2 | 3 | Copyright (c) 2014-2015 Xiaomi, Inc. All Rights Reserved. 4 | 5 | This product is licensed to you under the Apache License, Version 2.0 (the "License"). 6 | You may not use this product except in compliance with the License. 7 | 8 | This product may include a number of subcomponents with separate copyright notices 9 | and license terms. Your use of these subcomponents is subject to the terms and 10 | conditions of the subcomponent's license, as noted in the LICENSE file. 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | swtfr介绍 2 | ========= 3 | 4 | 我们将falcon-transfer用于流量采集系统中 5 | influxdb的写入接口。如果关注openfalcon项目,可以发现最新版本的已经有opentsdb接口。其实我们也实验了opentsdb数据来存储流量信息。但在使用中,我们发现opentsdb单机部署的情况下查询效率有点慢,所以我们实现了influxdb作为tsdb数据库。这个选择有优点也有缺点。列举一下,以便大家选择。请所以慎重选择这个版本。 6 | 7 | influxdb: 8 | 9 | 优点: 10 | 11 | 1. 
单机部署简单。 12 | 13 | 1. 查询效率优于opentsdb。(只测试了单机,查询时间快很多,具体数值缺失) 14 | 15 | 1. 自带capacitor,容易做一些报警分析。 16 | 17 | 2. 备份容易。 18 | 19 | 缺点: 20 | 21 | 1. 版本比较新,有些功能未完善,还未到1.0版本。 22 | 23 | 1. 下一个开源版本不再支持集群。 24 | 25 | 2. 查询函数支持比较少。 26 | 27 | 3. 版本更新很快,居然换了底层存储组件。备份有旧数据处理问题。 28 | 29 |   30 | 31 | opentsdb 32 | 33 | 优点: 34 | 35 | 1. 版本较成熟。 36 | 37 | 1. 支持函数多。 38 | 39 | 缺点: 40 | 41 | 1. 依赖hbase,部署比较复杂。 42 | 43 | 1. 查询较慢。单机调优效果不是很明显。 44 | 45 |   46 | 47 | \---------- 48 | 49 | Introduction 50 | ------------ 51 | 52 | 数据收集,是监控系统一个最基本的功能,在Open-Falcon中,Agent采集到的数据,会先发送给Transfer组件。Transfer在接收到客户端发送的数据,做一些数据规整,检查之后,转发到多个后端系统去处理。在转发到每个后端业务系统的时候,Transfer会根据一致性哈希算法,进行数据分片,来达到后端业务系统的水平扩展。Transfer自身是无状态的,挂掉一台或者多台不会有任何影响。 53 | 54 | Transfer支持的业务后端,有三种,Judge、Graph、OpenTSDB(开源版本尚未开放此功能)。Judge是我们开发的高性能告警判定组件,Graph是我们开发的高性能数据存储、归档、查询组件,OpenTSDB是开源的时间序列数据存储服务。每个业务后端,都可以通过Transfer的配置文件来开启。 55 | 56 | Transfer的数据来源,一般有四种: 57 | 58 | 1.Falcon-agent主动采集的基础监控数据。 59 | 2.Falcon-agent执行用户自定义的插件返回的数据。 60 | 3.client-library:线上的业务系统,都嵌入使用了统一的基础库,对于业务系统中每个业务接口,都会主动计算其qps、latency等指标,并上报。 61 | 4.用户产生的一些自定义的指标,由用户自行上报。 62 | 63 | 这四种数据,都会先发送给本机的Proxy-gateway,再由Proxy-gateway转发给Transfer 64 | 65 | 一个推送数据给Proxy-gateway的例子: 66 | 67 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ python 68 | #!-*- coding:utf8 -*- 69 | 70 | import requests 71 | import time 72 | import json 73 | 74 | ts = int(time.time()) 75 | payload = [ 76 | { 77 | "endpoint": "test-endpoint", 78 | "metric": "test-metric", 79 | "timestamp": ts, 80 | "step": 60, 81 | "value": 1, 82 | "counterType": "GAUGE", 83 | "tags": "location=beijing,service=falcon", 84 | }, 85 | ] 86 | r=requests.post("http://127.0.0.1:1988/v1/push",data=json.dumps(payload)) 87 | print r.text 88 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 89 | 90 | Installation 91 | ------------ 92 | 93 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ bash 94 | # set $GOPATH and $GOROOT 95 | 96 | mkdir -p 
$GOPATH/src/github.com/open-falcon 97 | cd $GOPATH/src/github.com/open-falcon 98 | git clone https://github.com/open-falcon/transfer.git 99 | 100 | cd transfer 101 | go get ./... 102 | ./control build 103 | 104 | ./control start 105 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 106 | 107 | Usage 108 | ----- 109 | 110 | send items via transfer's http-api 111 | 112 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ bash 113 | #!/bin/bash 114 | e="test.endpoint.1" 115 | m="test.metric.1" 116 | t="t0=tag0,t1=tag1,t2=tag2" 117 | ts=`date +%s` 118 | curl -s -X POST -d "[{\"metric\":\"$m\", \"endpoint\":\"$e\", \"timestamp\":$ts,\"step\":60, \"value\":9, \"counterType\":\"GAUGE\",\"tags\":\"$t\"}]" "127.0.0.1:6060/api/push" | python -m json.tool 119 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 120 | 121 | u want sending items via python jsonrpc client? turn to one python example: 122 | `./test/rcpclient.py` 123 | 124 | u want sending items via java jsonrpc client? 
turn to one java example: 125 | [jsonrpc4go]() 126 | 127 | Configuration 128 | ------------- 129 | 130 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 131 | debug: true/false, 如果为true,日志中会打印debug信息 132 | 133 | http 134 | - enable: true/false, 表示是否开启该http端口,该端口为控制端口,主要用来对transfer发送控制命令、统计命令、debug命令等 135 | - listen: 表示监听的http端口 136 | 137 | rpc 138 | - enable: true/false, 表示是否开启该jsonrpc数据接收端口, Agent发送数据使用的就是该端口 139 | - listen: 表示监听的http端口 140 | 141 | socket #即将被废弃,请避免使用 142 | - enable: true/false, 表示是否开启该telnet方式的数据接收端口,这是为了方便用户一行行的发送数据给transfer 143 | - listen: 表示监听的http端口 144 | 145 | judge 146 | - enable: true/false, 表示是否开启向judge发送数据 147 | - batch: 数据转发的批量大小,可以加快发送速度,建议保持默认值 148 | - connTimeout: 单位是毫秒,与后端建立连接的超时时间,可以根据网络质量微调,建议保持默认 149 | - callTimeout: 单位是毫秒,发送数据给后端的超时时间,可以根据网络质量微调,建议保持默认 150 | - pingMethod: 后端提供的ping接口,用来探测连接是否可用,必须保持默认 151 | - maxConns: 连接池相关配置,最大连接数,建议保持默认 152 | - maxIdle: 连接池相关配置,最大空闲连接数,建议保持默认 153 | - replicas: 这是一致性hash算法需要的节点副本数量,建议不要变更,保持默认即可 154 | - cluster: key-value形式的字典,表示后端的judge列表,其中key代表后端judge名字,value代表的是具体的ip:port 155 | 156 | graph 157 | - enable: true/false, 表示是否开启向graph发送数据 158 | - batch: 数据转发的批量大小,可以加快发送速度,建议保持默认值 159 | - connTimeout: 单位是毫秒,与后端建立连接的超时时间,可以根据网络质量微调,建议保持默认 160 | - callTimeout: 单位是毫秒,发送数据给后端的超时时间,可以根据网络质量微调,建议保持默认 161 | - pingMethod: 后端提供的ping接口,用来探测连接是否可用,必须保持默认 162 | - maxConns: 连接池相关配置,最大连接数,建议保持默认 163 | - maxIdle: 连接池相关配置,最大空闲连接数,建议保持默认 164 | - replicas: 这是一致性hash算法需要的节点副本数量,建议不要变更,保持默认即可 165 | - migrating: true/false,当我们需要对graph后端列表进行扩容的时候,设置为true, transfer会根据扩容前后的实例信息,对每个数据采集项,进行两次一致性哈希计算,根据计算结果,来决定是否需要发送双份的数据,当新扩容的服务器积累了足够久的数据后,就可以设置为false。 166 | - cluster: key-value形式的字典,表示后端的graph列表,其中key代表后端graph名字,value代表的是具体的ip:port(多个地址 用逗号隔开, transfer会将同一份数据 发送至各个地址) 167 | - clusterMigrating: key-value形式的字典,表示新扩容的后端的graph列表,其中key代表后端graph名字,value代表的是具体的ip:port(多个地址 用逗号隔开, transfer会将同一份数据 发送至各个地址) 168 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 169 
| -------------------------------------------------------------------------------- /cfg.example.json: -------------------------------------------------------------------------------- 1 | { 2 | "debug": true, 3 | "nodepatch": "node.file", 4 | "http": { 5 | "enabled": true, 6 | "listen": "0.0.0.0:6060" 7 | }, 8 | "rpc": { 9 | "enabled": true, 10 | "listen": "0.0.0.0:8433" 11 | }, 12 | "influxdb": { 13 | "enabled": true, 14 | "batch": 200, 15 | "retry": 3, 16 | "username":"influxdbuser", 17 | "password":"influxdbpass", 18 | "database":"openfalcon", 19 | "connTimeout": 1000, 20 | "callTimeout": 5000, 21 | "maxConns": 32, 22 | "maxIdle": 32, 23 | "cluster": { 24 | "influxdb-00": "http://127.0.0.1:8086" 25 | }, 26 | "remove": { 27 | "traffic.lan.in": true, 28 | "traffic.lan.out": true, 29 | "traffic.wan.in": true, 30 | "traffic.wan.out": true, 31 | "switch.if.In": true, 32 | "switch.if.Out": true, 33 | "switch.if.InPkts": true, 34 | "switch.if.OutPkts": true 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /control: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORKSPACE=$( 4 | cd $(dirname $0)/ 5 | pwd 6 | ) 7 | cd $WORKSPACE 8 | 9 | mkdir -p var 10 | 11 | module=swtfr2 12 | app=octopux-$module 13 | conf=cfg.json 14 | dist=$(rpm --showrc | grep dist | grep el | awk '($2=="dist"){print $3 }' | awk -F . 
'{print $2}') 15 | logfile=/var/log/supervisor/${app}.log 16 | iteration=3 17 | progpath="usr/local/octopux/${app}/" 18 | file_list="control cfg.example.json ${app} perfcounter.json" 19 | 20 | function check_pid() { 21 | running=$(ps -C $app | grep -v "PID TTY" | wc -l) 22 | return $running 23 | } 24 | 25 | function supervisorconf() { 26 | cat <${app}.conf 27 | [program:${app}] 28 | command=/${progpath}${app} 29 | numprocs=1 30 | directory=/${progpath} 31 | autostart=true 32 | autorestart=true 33 | startsecs=1 34 | startretries=1000000 35 | exitcodes=0 36 | stopsignal=SIGTERM 37 | stopwaitsecs=300 38 | user=root 39 | redirect_stderr=true 40 | stdout_logfile=/var/log/supervisor/${app}.log 41 | stdout_logfile_maxbytes=1MB 42 | stdout_logfile_backups=10 43 | stdout_capture_maxbytes=1MB 44 | stdout_events_enabled=true 45 | stderr_logfile=/var/log/supervisor/${app}.err.log 46 | stderr_logfile_maxbytes=1MB 47 | stderr_logfile_backups=10 48 | stderr_capture_maxbytes=1MB 49 | stderr_events_enabled=true 50 | 51 | EOF 52 | 53 | 54 | cat << EOF > postinsl.in 55 | supervisorctl reread 56 | supervisorctl update 57 | EOF 58 | 59 | echo "OK" 60 | echo "" 61 | 62 | } 63 | 64 | function run() { 65 | check_pid 66 | running=$? 67 | if [ ${running} -gt 0 ]; then 68 | echo -n "${app} now is running already, pid=" 69 | ps -C $app | grep -v "PID TTY" | awk '{print $1}' 70 | stop 71 | sleep 1 72 | fi 73 | 74 | if ! [ -f ${conf} ]; then 75 | echo "Config file $conf doesn't exist, creating one." 76 | cp cfg.example.json ${conf} 77 | fi 78 | ulimit -HSn 65536 79 | ./$app -c ${conf} >>${logfile} 2>&1 80 | } 81 | 82 | function start() { 83 | check_pid 84 | running=$? 85 | if [ $running -gt 0 ]; then 86 | echo -n "${app} now is running already, pid=" 87 | ps -C ${app} | grep -v "PID TTY" | awk '{print $1}' 88 | return 1 89 | fi 90 | 91 | if ! [ -f $conf ]; then 92 | echo "Config file $conf doesn't exist, creating one." 
93 | cp cfg.example.json $conf 94 | fi 95 | ulimit -HSn 65536 96 | nohup ./${app} -c ${conf} >>${logfile} 2>&1 & 97 | sleep 1 98 | running=$(ps -C ${app} | grep -v "PID TTY" | wc -l) 99 | if [ ${running} -gt 0 ]; then 100 | echo "${app} started..., pid=" $(ps -C ${app} | grep -v "PID TTY" | awk '{print $1}') 101 | else 102 | echo "${app} failed to start." 103 | return 1 104 | fi 105 | } 106 | 107 | function stop() { 108 | pid=$(ps -C ${app} | grep -v "PID TTY" | awk '{print $1}') 109 | kill $(pidof ${app}) 110 | echo "${app} (${pid}) stoped..." 111 | } 112 | 113 | function restart() { 114 | stop 115 | sleep 1 116 | start 117 | } 118 | 119 | function status() { 120 | check_pid 121 | running=$? 122 | if [ $running -gt 0 ]; then 123 | echo started 124 | else 125 | echo stoped 126 | fi 127 | } 128 | 129 | function tailf() { 130 | tail -f ${logfile} 131 | } 132 | 133 | function build() { 134 | commit=$(git log -1 --pretty=%h) 135 | cat <./g/git.go 136 | package g 137 | const ( 138 | COMMIT = "$commit" 139 | ) 140 | EOF 141 | go build -o ${app} 142 | if [ $? -ne 0 ]; then 143 | exit $? 144 | fi 145 | 146 | ./$app -v 147 | } 148 | 149 | function rpm() { 150 | build 151 | 152 | if [ "$dist" == "" ]; then 153 | echo "cant build rpm on this os!" 
154 | exit 1 155 | fi 156 | version=$(./${app} -v) 157 | supervisorconf 158 | mkdir -p "rpm/${progpath}" 159 | mkdir -p rpm/etc/supervisor/conf.d/ 160 | 161 | cp -R ${file_list} rpm/${progpath} 162 | cp cfg.example.json rpm/${progpath}cfg.json 163 | 164 | cp ${app}.conf rpm/etc/supervisor/conf.d 165 | 166 | fpm -s dir -t rpm -n ${app} -m dotwoo_test -v ${version} --iteration ${iteration} --rpm-dist ${dist} --after-install postinsl.in --after-upgrade postinsl.in --after-remove postinsl.in -C rpm --config-files etc/supervisor/conf.d/${app}.conf --config-files ${progpath}cfg.json -f --url http://www.baishancloud.com/ --provides dotwoo@baishancloud.com --vendor dotwoo@baishancloud.com 167 | 168 | rm -fr rpm 169 | } 170 | 171 | function help() { 172 | echo "$0 build|start|stop|restart|status|tail|run|rpm" 173 | } 174 | 175 | if [ "$1" == "" ]; then 176 | help 177 | elif [ "$1" == "stop" ]; then 178 | stop 179 | elif [ "$1" == "start" ]; then 180 | start 181 | elif [ "$1" == "restart" ]; then 182 | restart 183 | elif [ "$1" == "status" ]; then 184 | status 185 | elif [ "$1" == "tail" ]; then 186 | tailf 187 | elif [ "$1" == "run" ]; then 188 | run 189 | elif [ "$1" == "build" ]; then 190 | build 191 | elif [ "$1" == "rpm" ]; then 192 | rpm 193 | else 194 | help 195 | fi 196 | -------------------------------------------------------------------------------- /g/cfg.go: -------------------------------------------------------------------------------- 1 | package g 2 | 3 | import ( 4 | "encoding/json" 5 | "log" 6 | "strings" 7 | "sync" 8 | 9 | "github.com/toolkits/file" 10 | ) 11 | 12 | type HttpConfig struct { 13 | Enabled bool `json:"enabled"` 14 | Listen string `json:"listen"` 15 | } 16 | 17 | type RpcConfig struct { 18 | Enabled bool `json:"enabled"` 19 | Listen string `json:"listen"` 20 | } 21 | 22 | type InfluxdbConfig struct { 23 | Enabled bool `json:"enabled"` 24 | Batch int `json:"batch"` 25 | Username string `json:"username"` 26 | Password string `json:"password"` 27 
| Database string `json:"database"` 28 | ConnTimeout int `json:"connTimeout"` 29 | CallTimeout int `json:"callTimeout"` 30 | MaxConns int `json:"maxConns"` 31 | MaxIdle int `json:"maxIdle"` 32 | MaxRetry int `json:"retry"` 33 | Cluster map[string]string `json:"cluster"` 34 | RemoveMetrics map[string]bool `json:"remove"` 35 | Cluster2 map[string]*ClusterNode `json:"cluster2"` 36 | } 37 | 38 | type GlobalConfig struct { 39 | Debug bool `json:"debug"` 40 | NodePath string `json:"nodepatch"` 41 | Http *HttpConfig `json:"http"` 42 | Rpc *RpcConfig `json:"rpc"` 43 | 44 | Influxdb *InfluxdbConfig `json:"influxdb"` 45 | } 46 | 47 | var ( 48 | ConfigFile string 49 | config *GlobalConfig 50 | configLock = new(sync.RWMutex) 51 | ) 52 | 53 | func Config() *GlobalConfig { 54 | configLock.RLock() 55 | defer configLock.RUnlock() 56 | return config 57 | } 58 | 59 | func ParseConfig(cfg string) { 60 | if cfg == "" { 61 | log.Fatalln("use -c to specify configuration file") 62 | } 63 | 64 | if !file.IsExist(cfg) { 65 | log.Fatalln("config file:", cfg, "is not existent. 
maybe you need `mv cfg.example.json cfg.json`") 66 | } 67 | 68 | ConfigFile = cfg 69 | 70 | configContent, err := file.ToTrimString(cfg) 71 | if err != nil { 72 | log.Fatalln("read config file:", cfg, "fail:", err) 73 | } 74 | 75 | var c GlobalConfig 76 | err = json.Unmarshal([]byte(configContent), &c) 77 | if err != nil { 78 | log.Fatalln("parse config file:", cfg, "fail:", err) 79 | } 80 | 81 | // split cluster config 82 | c.Influxdb.Cluster2 = formatClusterItems(c.Influxdb.Cluster) 83 | 84 | configLock.Lock() 85 | defer configLock.Unlock() 86 | config = &c 87 | 88 | log.Println("g.ParseConfig ok, file ", cfg) 89 | } 90 | 91 | // CLUSTER NODE 92 | type ClusterNode struct { 93 | Addrs []string `json:"addrs"` 94 | } 95 | 96 | func NewClusterNode(addrs []string) *ClusterNode { 97 | return &ClusterNode{addrs} 98 | } 99 | 100 | // map["node"]="host1,host2" --> map["node"]=["host1", "host2"] 101 | func formatClusterItems(cluster map[string]string) map[string]*ClusterNode { 102 | ret := make(map[string]*ClusterNode) 103 | for node, clusterStr := range cluster { 104 | items := strings.Split(clusterStr, ",") 105 | nitems := make([]string, 0) 106 | for _, item := range items { 107 | nitems = append(nitems, strings.TrimSpace(item)) 108 | } 109 | ret[node] = NewClusterNode(nitems) 110 | } 111 | 112 | return ret 113 | } 114 | -------------------------------------------------------------------------------- /g/g.go: -------------------------------------------------------------------------------- 1 | package g 2 | 3 | import ( 4 | "log" 5 | "runtime" 6 | ) 7 | 8 | // changelog: 9 | // 0.0.1: init project 10 | // 0.0.4: bugfix: set replicas before add node 11 | // 0.0.8: change receiver, mv proc cron to proc pkg, add readme, add gitversion, add config reload, add trace tools 12 | // 0.0.9: fix bugs of conn pool(use transfer's private conn pool, named & minimum) 13 | // 0.0.10: use more efficient proc & sema, rm conn_pool status log 14 | // 0.0.11: fix bug: all graphs' traffic 
delined when one graph broken down, modify retry interval 15 | // 0.0.14: support sending multi copies to graph node, align ts for judge, add filter 16 | // 0.1.4: 添加influxdb存储支持;用于流量采集系统,修改程序名称以区分;修改程序启功方式支持supervisor。 17 | // 0.1.5: 修改项目名称 18 | // 0.1.7:删除 judge 和 graph 部分 19 | // 0.1.9: 添加 mallard pfc统计 20 | // 0.2.0: 添加优雅重启支持 21 | // 1.0.0: 修改打包方式 22 | 23 | const ( 24 | VERSION = "1.0.0" 25 | GAUGE = "GAUGE" 26 | COUNTER = "COUNTER" 27 | DERIVE = "DERIVE" 28 | DEFAULT_STEP = 60 29 | MIN_STEP = 30 30 | ) 31 | 32 | func init() { 33 | runtime.GOMAXPROCS(runtime.NumCPU()) 34 | log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile) 35 | } 36 | -------------------------------------------------------------------------------- /g/status.go: -------------------------------------------------------------------------------- 1 | package g 2 | 3 | import "sync" 4 | 5 | type ReceiverStatusManager struct { 6 | sync.WaitGroup 7 | lock sync.RWMutex 8 | isRun bool 9 | } 10 | 11 | func NewReceiverStatusManager() *ReceiverStatusManager { 12 | rsm := &ReceiverStatusManager{} 13 | rsm.isRun = false 14 | return rsm 15 | } 16 | 17 | func (r *ReceiverStatusManager) IsRun() bool { 18 | r.lock.RLock() 19 | defer r.lock.RUnlock() 20 | return r.isRun 21 | } 22 | 23 | func (r *ReceiverStatusManager) Run() { 24 | r.lock.Lock() 25 | defer r.lock.Unlock() 26 | r.isRun = true 27 | 28 | } 29 | 30 | func (r *ReceiverStatusManager) Stop() { 31 | r.lock.Lock() 32 | defer r.lock.Unlock() 33 | r.isRun = false 34 | } 35 | -------------------------------------------------------------------------------- /http/api_http.go: -------------------------------------------------------------------------------- 1 | package http 2 | 3 | import ( 4 | "encoding/json" 5 | "net/http" 6 | 7 | trpc "github.com/baishancloud/octopux-swtfr/receiver/rpc" 8 | cmodel "github.com/open-falcon/common/model" 9 | ) 10 | 11 | func configApiHttpRoutes() { 12 | http.HandleFunc("/api/push", func(w http.ResponseWriter, req *http.Request) { 
13 | if req.ContentLength == 0 { 14 | http.Error(w, "blank body", http.StatusBadRequest) 15 | return 16 | } 17 | 18 | decoder := json.NewDecoder(req.Body) 19 | var metrics []*cmodel.MetricValue 20 | err := decoder.Decode(&metrics) 21 | if err != nil { 22 | http.Error(w, "decode error", http.StatusBadRequest) 23 | return 24 | } 25 | 26 | reply := &cmodel.TransferResponse{} 27 | trpc.RecvMetricValues(metrics, reply, "http") 28 | 29 | RenderDataJson(w, reply) 30 | }) 31 | } 32 | -------------------------------------------------------------------------------- /http/common.go: -------------------------------------------------------------------------------- 1 | package http 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "strings" 7 | 8 | "github.com/baishancloud/octopux-swtfr/g" 9 | "github.com/toolkits/file" 10 | ) 11 | 12 | func configCommonRoutes() { 13 | http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) { 14 | w.Write([]byte("ok\n")) 15 | }) 16 | 17 | http.HandleFunc("/version", func(w http.ResponseWriter, r *http.Request) { 18 | w.Write([]byte(fmt.Sprintf("%s\n", g.VERSION))) 19 | }) 20 | 21 | http.HandleFunc("/workdir", func(w http.ResponseWriter, r *http.Request) { 22 | w.Write([]byte(fmt.Sprintf("%s\n", file.SelfDir()))) 23 | }) 24 | 25 | http.HandleFunc("/config", func(w http.ResponseWriter, r *http.Request) { 26 | RenderDataJson(w, g.Config()) 27 | }) 28 | 29 | http.HandleFunc("/config/reload", func(w http.ResponseWriter, r *http.Request) { 30 | if strings.HasPrefix(r.RemoteAddr, "127.0.0.1") { 31 | g.ParseConfig(g.ConfigFile) 32 | RenderDataJson(w, "ok") 33 | } else { 34 | RenderDataJson(w, "no privilege") 35 | } 36 | }) 37 | } 38 | -------------------------------------------------------------------------------- /http/debug_http.go: -------------------------------------------------------------------------------- 1 | package http 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "strings" 7 | 8 | "github.com/baishancloud/octopux-swtfr/sender" 9 
| ) 10 | 11 | func configDebugHttpRoutes() { 12 | // conn pools 13 | http.HandleFunc("/debug/connpool/", func(w http.ResponseWriter, r *http.Request) { 14 | urlParam := r.URL.Path[len("/debug/connpool/"):] 15 | args := strings.Split(urlParam, "/") 16 | 17 | argsLen := len(args) 18 | if argsLen < 1 { 19 | w.Write([]byte(fmt.Sprintf("bad args\n"))) 20 | return 21 | } 22 | 23 | var result string 24 | receiver := args[0] 25 | switch receiver { 26 | case "tsdb": 27 | result = strings.Join(sender.InfluxdbConnPools.Proc(), "\n") 28 | default: 29 | result = fmt.Sprintf("bad args, module not exist\n") 30 | } 31 | w.Write([]byte(result)) 32 | }) 33 | } 34 | -------------------------------------------------------------------------------- /http/http.go: -------------------------------------------------------------------------------- 1 | package http 2 | 3 | import ( 4 | "encoding/json" 5 | "log" 6 | "net" 7 | "net/http" 8 | _ "net/http/pprof" 9 | 10 | "github.com/baishancloud/octopux-swtfr/g" 11 | ) 12 | 13 | type Dto struct { 14 | Msg string `json:"msg"` 15 | Data interface{} `json:"data"` 16 | } 17 | 18 | var ( 19 | httpserv *http.Server 20 | ln *net.TCPListener 21 | ) 22 | 23 | func Stop() { 24 | if ln != nil { 25 | log.Println("set http listen close!") 26 | ln.Close() 27 | } 28 | } 29 | 30 | func Start() { 31 | go startHTTPServer() 32 | } 33 | func startHTTPServer() { 34 | if !g.Config().Http.Enabled { 35 | return 36 | } 37 | 38 | addr := g.Config().Http.Listen 39 | if addr == "" { 40 | return 41 | } 42 | 43 | configCommonRoutes() 44 | configDebugHttpRoutes() 45 | configApiHttpRoutes() 46 | 47 | httpserv = &http.Server{ 48 | Addr: addr, 49 | MaxHeaderBytes: 1 << 30, 50 | } 51 | 52 | log.Println("http.startHttpServer ok, listening", addr) 53 | if addr == "" { 54 | addr = ":http" 55 | } 56 | hln, err := net.Listen("tcp", addr) 57 | if err != nil { 58 | log.Fatalln("Start listen http error :", err) 59 | return 60 | } 61 | ln = hln.(*net.TCPListener) 62 | 
log.Println(httpserv.Serve(ln)) 63 | } 64 | 65 | func RenderJson(w http.ResponseWriter, v interface{}) { 66 | bs, err := json.Marshal(v) 67 | if err != nil { 68 | http.Error(w, err.Error(), http.StatusInternalServerError) 69 | return 70 | } 71 | w.Header().Set("Content-Type", "application/json; charset=UTF-8") 72 | w.Write(bs) 73 | } 74 | 75 | func RenderDataJson(w http.ResponseWriter, data interface{}) { 76 | RenderJson(w, Dto{Msg: "success", Data: data}) 77 | } 78 | 79 | func RenderMsgJson(w http.ResponseWriter, msg string) { 80 | RenderJson(w, map[string]string{"msg": msg}) 81 | } 82 | 83 | func AutoRender(w http.ResponseWriter, data interface{}, err error) { 84 | if err != nil { 85 | RenderMsgJson(w, err.Error()) 86 | return 87 | } 88 | RenderDataJson(w, data) 89 | } 90 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "log" 7 | _ "net/http/pprof" 8 | "os" 9 | "os/signal" 10 | "runtime" 11 | "runtime/pprof" 12 | "strings" 13 | "syscall" 14 | "time" 15 | 16 | _ "github.com/baishancloud/goperfcounter" 17 | "github.com/baishancloud/octopux-swtfr/g" 18 | "github.com/baishancloud/octopux-swtfr/http" 19 | "github.com/baishancloud/octopux-swtfr/receiver" 20 | "github.com/baishancloud/octopux-swtfr/sender" 21 | ) 22 | 23 | var ( 24 | pid int 25 | progname string 26 | ) 27 | 28 | func init() { 29 | pid = os.Getpid() 30 | paths := strings.Split(os.Args[0], "/") 31 | paths = strings.Split(paths[len(paths)-1], string(os.PathSeparator)) 32 | progname = paths[len(paths)-1] 33 | runtime.MemProfileRate = 1 34 | } 35 | func saveHeapProfile() { 36 | runtime.GC() 37 | f, err := os.Create(fmt.Sprintf("prof/heap_%s_%d_%s.prof", progname, pid, time.Now().Format("2006_01_02_03_04_05"))) 38 | if err != nil { 39 | return 40 | } 41 | defer f.Close() 42 | pprof.Lookup("heap").WriteTo(f, 1) 43 | } 44 | func 
main() { 45 | //defer saveHeapProfile() 46 | cfg := flag.String("c", "cfg.json", "configuration file") 47 | version := flag.Bool("v", false, "show version") 48 | versionGit := flag.Bool("vg", false, "show version") 49 | flag.Parse() 50 | 51 | if *version { 52 | fmt.Println(g.VERSION) 53 | os.Exit(0) 54 | } 55 | if *versionGit { 56 | fmt.Println(g.VERSION, g.COMMIT) 57 | os.Exit(0) 58 | } 59 | 60 | // global config 61 | g.ParseConfig(*cfg) 62 | 63 | rcv, err := receiver.New() 64 | if err != nil { 65 | log.Fatalln("Set receive serve error ", err) 66 | } 67 | rcv.GoServe() 68 | 69 | sender.Start(rcv.Rm) 70 | 71 | http.Start() 72 | 73 | signals := make(chan os.Signal) 74 | signal.Notify(signals, syscall.SIGHUP, syscall.SIGTERM) 75 | for sig := range signals { 76 | if sig == syscall.SIGTERM { 77 | http.Stop() 78 | rcv.Stop() 79 | log.Println("exit SIGTERM", time.Now()) 80 | rcv.Rm.Wait() 81 | log.Println("exit SIGTERM end", time.Now()) 82 | os.Exit(0) 83 | //TODO . timeout exit 84 | } else if sig == syscall.SIGHUP { 85 | http.Stop() 86 | rcv.Stop() 87 | log.Println("exit SIGHUP", time.Now()) 88 | os.Setenv("_GRACEFUL_RESTART", "true") 89 | execSpec := &syscall.ProcAttr{ 90 | Env: os.Environ(), 91 | Files: []uintptr{os.Stdin.Fd(), os.Stdout.Fd(), os.Stderr.Fd()}, 92 | } 93 | // Fork exec the new version of your server 94 | fork, err := syscall.ForkExec(os.Args[0], os.Args, execSpec) 95 | if err != nil { 96 | log.Fatalln("Fail to fork", err) 97 | } 98 | 99 | log.Println("SIGHUP received: fork-exec to", fork) 100 | // Wait for all conections to be finished 101 | rcv.Rm.Wait() 102 | log.Println(os.Getpid(), "Server gracefully shutdown", time.Now()) 103 | os.Exit(0) 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /perfcounter.json: -------------------------------------------------------------------------------- 1 | { 2 | "debug": false, 3 | "hostname": "", 4 | "tags": "cop=bsy,owt=flow,pdl=octopux,module=swtfr", 
5 | "step": 300, 6 | "bases": [], 7 | "push": { 8 | "enabled": true, 9 | "clear": true, 10 | "api": "http://127.0.0.1:10699/v1/push" 11 | }, 12 | "http": { 13 | "enabled": false, 14 | "listen": "0.0.0.0:2015" 15 | } 16 | } 17 | 18 | -------------------------------------------------------------------------------- /receiver/receiver.go: -------------------------------------------------------------------------------- 1 | package receiver 2 | 3 | import ( 4 | "log" 5 | "net" 6 | "time" 7 | 8 | "github.com/baishancloud/octopux-swtfr/g" 9 | "github.com/baishancloud/octopux-swtfr/receiver/rpc" 10 | ) 11 | 12 | type Server struct { 13 | Rm *g.ReceiverStatusManager 14 | rpcsocket *net.TCPListener 15 | } 16 | 17 | func (s *Server) Stop() { 18 | if s.rpcsocket != nil { 19 | s.rpcsocket.SetDeadline(time.Now()) 20 | s.Rm.Stop() 21 | } 22 | } 23 | 24 | func New() (*Server, error) { 25 | s := &Server{} 26 | rln, err := rpc.NewRpcListener() 27 | if err != nil { 28 | log.Println("rpc new Listener error:", err) 29 | return nil, err 30 | 31 | } 32 | s.rpcsocket = rln 33 | s.Rm = &g.ReceiverStatusManager{} 34 | return s, nil 35 | } 36 | 37 | func (s *Server) GoServe() { 38 | s.Rm.Run() 39 | go rpc.RpcServe(s.rpcsocket) 40 | } 41 | 42 | func Start() { 43 | go rpc.StartRpc() 44 | } 45 | -------------------------------------------------------------------------------- /receiver/rpc/rpc.go: -------------------------------------------------------------------------------- 1 | package rpc 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "log" 7 | "net" 8 | "net/rpc" 9 | "net/rpc/jsonrpc" 10 | 11 | "github.com/baishancloud/octopux-swtfr/g" 12 | ) 13 | 14 | func NewRpcListener() (*net.TCPListener, error) { 15 | if !g.Config().Rpc.Enabled { 16 | return nil, errors.New("disable rpc") 17 | } 18 | 19 | addr := g.Config().Rpc.Listen 20 | tcpAddr, err := net.ResolveTCPAddr("tcp", addr) 21 | if err != nil { 22 | log.Fatalf("net.ResolveTCPAddr fail: %s", err) 23 | return nil, err 24 | } 25 | 26 | listener, 
err := net.ListenTCP("tcp", tcpAddr) 27 | if err != nil { 28 | log.Fatalf("listen %s fail: %s", addr, err) 29 | return nil, err 30 | } else { 31 | log.Println("rpc listening", addr) 32 | } 33 | return listener, nil 34 | } 35 | 36 | func RpcServe(ln *net.TCPListener) { 37 | server := rpc.NewServer() 38 | server.Register(new(Transfer)) 39 | 40 | for { 41 | conn, err := ln.Accept() 42 | if err != nil { 43 | if nerr, ok := err.(net.Error); ok && nerr.Timeout() { 44 | ln.Close() 45 | fmt.Println("Stop rpc accepting connections") 46 | return 47 | } 48 | log.Println("listener.Accept occur error:", err) 49 | continue 50 | } 51 | // go rpc.ServeConn(conn) 52 | go server.ServeCodec(jsonrpc.NewServerCodec(conn)) 53 | } 54 | } 55 | 56 | func StartRpc() { 57 | ln, err := NewRpcListener() 58 | if err != nil { 59 | return 60 | } 61 | RpcServe(ln) 62 | } 63 | -------------------------------------------------------------------------------- /receiver/rpc/rpc_transfer.go: -------------------------------------------------------------------------------- 1 | package rpc 2 | 3 | import ( 4 | "fmt" 5 | "strconv" 6 | "time" 7 | 8 | pfc "github.com/baishancloud/goperfcounter" 9 | "github.com/baishancloud/octopux-swtfr/g" 10 | "github.com/baishancloud/octopux-swtfr/sender" 11 | cmodel "github.com/open-falcon/common/model" 12 | cutils "github.com/open-falcon/common/utils" 13 | //"log" 14 | ) 15 | 16 | type Transfer int 17 | 18 | type TransferResp struct { 19 | Msg string 20 | Total int 21 | ErrInvalid int 22 | Latency int64 23 | } 24 | 25 | func (t *TransferResp) String() string { 26 | s := fmt.Sprintf("TransferResp total=%d, err_invalid=%d, latency=%dms", 27 | t.Total, t.ErrInvalid, t.Latency) 28 | if t.Msg != "" { 29 | s = fmt.Sprintf("%s, msg=%s", s, t.Msg) 30 | } 31 | return s 32 | } 33 | 34 | func (this *Transfer) Ping(req cmodel.NullRpcRequest, resp *cmodel.SimpleRpcResponse) error { 35 | return nil 36 | } 37 | 38 | func (t *Transfer) Update(args []*cmodel.MetricValue, reply 
*cmodel.TransferResponse) error { 39 | return RecvMetricValues(args, reply, "rpc") 40 | } 41 | 42 | // process new metric values 43 | func RecvMetricValues(args []*cmodel.MetricValue, reply *cmodel.TransferResponse, from string) error { 44 | start := time.Now() 45 | reply.Invalid = 0 46 | //log.Printf("rpc call \n") 47 | items := []*cmodel.MetaData{} 48 | for _, v := range args { 49 | if v == nil { 50 | reply.Invalid++ 51 | continue 52 | } 53 | 54 | // 历史遗留问题. 55 | // 老版本agent上报的metric=kernel.hostname的数据,其取值为string类型,现在已经不支持了;所以,这里硬编码过滤掉 56 | if v.Metric == "kernel.hostname" { 57 | reply.Invalid++ 58 | continue 59 | } 60 | 61 | if v.Metric == "" || v.Endpoint == "" { 62 | reply.Invalid++ 63 | continue 64 | } 65 | 66 | if v.Type != g.COUNTER && v.Type != g.GAUGE && v.Type != g.DERIVE { 67 | reply.Invalid++ 68 | continue 69 | } 70 | 71 | if v.Value == "" { 72 | reply.Invalid++ 73 | continue 74 | } 75 | 76 | if v.Step <= 0 { 77 | reply.Invalid++ 78 | continue 79 | } 80 | 81 | if len(v.Metric)+len(v.Tags) > 510 { 82 | reply.Invalid++ 83 | continue 84 | } 85 | 86 | // TODO 呵呵,这里需要再优雅一点 87 | now := start.Unix() 88 | if v.Timestamp <= 0 || v.Timestamp > now*2 { 89 | v.Timestamp = now 90 | } 91 | 92 | fv := &cmodel.MetaData{ 93 | Metric: v.Metric, 94 | Endpoint: v.Endpoint, 95 | Timestamp: v.Timestamp, 96 | Step: v.Step, 97 | CounterType: v.Type, 98 | Tags: cutils.DictedTagstring(v.Tags), //TODO tags键值对的个数,要做一下限制 99 | } 100 | 101 | valid := true 102 | var vv float64 103 | var err error 104 | 105 | switch cv := v.Value.(type) { 106 | case string: 107 | vv, err = strconv.ParseFloat(cv, 64) 108 | if err != nil { 109 | valid = false 110 | } 111 | case float64: 112 | vv = cv 113 | case int64: 114 | vv = float64(cv) 115 | default: 116 | valid = false 117 | } 118 | 119 | if !valid { 120 | reply.Invalid += 1 121 | continue 122 | } 123 | 124 | fv.Value = vv 125 | items = append(items, fv) 126 | } 127 | 128 | // statistics 129 | cnt := int64(len(items)) 130 | pfc.Meter("SWTFRRevc", 
cnt) 131 | 132 | cfg := g.Config() 133 | 134 | if cfg.Influxdb.Enabled { 135 | sender.Push2TsdbSendQueue(items) 136 | } 137 | 138 | reply.Message = "ok" 139 | reply.Total = len(args) 140 | reply.Latency = (time.Now().UnixNano() - start.UnixNano()) / 1000000 141 | 142 | return nil 143 | } 144 | -------------------------------------------------------------------------------- /sender/conn_pools.go: -------------------------------------------------------------------------------- 1 | package sender 2 | 3 | import ( 4 | "github.com/baishancloud/octopux-swtfr/g" 5 | cpool "github.com/baishancloud/octopux-swtfr/sender/connpool" 6 | nset "github.com/toolkits/container/set" 7 | ) 8 | 9 | func initConnPools() { 10 | cfg := g.Config() 11 | 12 | if cfg.Influxdb != nil && cfg.Influxdb.Enabled { 13 | influxdbInstances := nset.NewStringSet() 14 | for _, instance := range cfg.Influxdb.Cluster { 15 | influxdbInstances.Add(instance) 16 | } 17 | InfluxdbConnPools = cpool.CreateInfluxdbCliPools(cfg.Influxdb.MaxConns, cfg.Influxdb.MaxIdle, 18 | cfg.Influxdb.ConnTimeout, cfg.Influxdb.CallTimeout, influxdbInstances.ToSlice()) 19 | 20 | } 21 | 22 | } 23 | 24 | func DestroyConnPools() { 25 | 26 | InfluxdbConnPools.Destroy() 27 | } 28 | -------------------------------------------------------------------------------- /sender/connpool/conn_pool.go: -------------------------------------------------------------------------------- 1 | package connpool 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "sync" 7 | "time" 8 | ) 9 | 10 | //TODO: 保存所有的连接, 而不是只保存连接计数 11 | 12 | var ErrMaxConn = fmt.Errorf("maximum connections reached") 13 | 14 | // 15 | type NConn interface { 16 | io.Closer 17 | Name() string 18 | Closed() bool 19 | } 20 | 21 | type ConnPool struct { 22 | sync.RWMutex 23 | 24 | Name string 25 | Address string 26 | MaxConns int 27 | MaxIdle int 28 | Cnt int64 29 | Username string 30 | Password string 31 | Database string 32 | Precision string 33 | 34 | New func(name string) (NConn, error) 35 | 36 | 
active int 37 | free []NConn 38 | all map[string]NConn 39 | } 40 | 41 | func NewConnPool(name string, address string, maxConns int, maxIdle int) *ConnPool { 42 | return &ConnPool{Name: name, Address: address, MaxConns: maxConns, MaxIdle: maxIdle, Cnt: 0, all: make(map[string]NConn)} 43 | } 44 | 45 | func (this *ConnPool) Proc() string { 46 | this.RLock() 47 | defer this.RUnlock() 48 | 49 | return fmt.Sprintf("Name:%s,Cnt:%d,active:%d,all:%d,free:%d", 50 | this.Name, this.Cnt, this.active, len(this.all), len(this.free)) 51 | } 52 | 53 | func (this *ConnPool) Fetch() (NConn, error) { 54 | this.Lock() 55 | defer this.Unlock() 56 | 57 | // get from free 58 | conn := this.fetchFree() 59 | if conn != nil { 60 | return conn, nil 61 | } 62 | 63 | if this.overMax() { 64 | return nil, ErrMaxConn 65 | } 66 | 67 | // create new conn 68 | conn, err := this.newConn() 69 | if err != nil { 70 | return nil, err 71 | } 72 | 73 | this.increActive() 74 | return conn, nil 75 | } 76 | 77 | func (this *ConnPool) Release(conn NConn) { 78 | this.Lock() 79 | defer this.Unlock() 80 | 81 | if this.overMaxIdle() { 82 | this.deleteConn(conn) 83 | this.decreActive() 84 | } else { 85 | this.addFree(conn) 86 | } 87 | } 88 | 89 | func (this *ConnPool) ForceClose(conn NConn) { 90 | this.Lock() 91 | defer this.Unlock() 92 | 93 | this.deleteConn(conn) 94 | this.decreActive() 95 | } 96 | 97 | func (this *ConnPool) Destroy() { 98 | this.Lock() 99 | defer this.Unlock() 100 | 101 | for _, conn := range this.free { 102 | if conn != nil && !conn.Closed() { 103 | conn.Close() 104 | } 105 | } 106 | 107 | for _, conn := range this.all { 108 | if conn != nil && !conn.Closed() { 109 | conn.Close() 110 | } 111 | } 112 | 113 | this.active = 0 114 | this.free = []NConn{} 115 | this.all = map[string]NConn{} 116 | } 117 | 118 | // internal, concurrently unsafe 119 | func (this *ConnPool) newConn() (NConn, error) { 120 | name := fmt.Sprintf("%s_%d_%d", this.Name, this.Cnt, time.Now().Unix()) 121 | conn, err := 
this.New(name) 122 | if err != nil { 123 | if conn != nil { 124 | conn.Close() 125 | } 126 | return nil, err 127 | } 128 | 129 | this.Cnt++ 130 | this.all[conn.Name()] = conn 131 | return conn, nil 132 | } 133 | 134 | func (this *ConnPool) deleteConn(conn NConn) { 135 | if conn != nil { 136 | conn.Close() 137 | } 138 | delete(this.all, conn.Name()) 139 | } 140 | 141 | func (this *ConnPool) addFree(conn NConn) { 142 | this.free = append(this.free, conn) 143 | } 144 | 145 | func (this *ConnPool) fetchFree() NConn { 146 | if len(this.free) == 0 { 147 | return nil 148 | } 149 | 150 | conn := this.free[0] 151 | this.free = this.free[1:] 152 | return conn 153 | } 154 | 155 | func (this *ConnPool) increActive() { 156 | this.active += 1 157 | } 158 | 159 | func (this *ConnPool) decreActive() { 160 | this.active -= 1 161 | } 162 | 163 | func (this *ConnPool) overMax() bool { 164 | return this.active >= this.MaxConns 165 | } 166 | 167 | func (this *ConnPool) overMaxIdle() bool { 168 | return len(this.free) >= this.MaxIdle 169 | } 170 | -------------------------------------------------------------------------------- /sender/connpool/conn_pool_manager.go: -------------------------------------------------------------------------------- 1 | package connpool 2 | 3 | import ( 4 | "fmt" 5 | "net" 6 | "net/rpc" 7 | "sync" 8 | "time" 9 | ) 10 | 11 | // RpcCient, 要实现io.Closer接口 12 | type RpcClient struct { 13 | cli *rpc.Client 14 | name string 15 | } 16 | 17 | func (this RpcClient) Name() string { 18 | return this.name 19 | } 20 | 21 | func (this RpcClient) Closed() bool { 22 | return this.cli == nil 23 | } 24 | 25 | func (this RpcClient) Close() error { 26 | if this.cli != nil { 27 | err := this.cli.Close() 28 | this.cli = nil 29 | return err 30 | } 31 | return nil 32 | } 33 | 34 | func (this RpcClient) Call(method string, args interface{}, reply interface{}) error { 35 | return this.cli.Call(method, args, reply) 36 | } 37 | 38 | // ConnPools Manager 39 | type SafeRpcConnPools struct 
{ 40 | sync.RWMutex 41 | M map[string]*ConnPool 42 | MaxConns int 43 | MaxIdle int 44 | ConnTimeout int 45 | CallTimeout int 46 | } 47 | 48 | func CreateSafeRpcConnPools(maxConns, maxIdle, connTimeout, callTimeout int, cluster []string) *SafeRpcConnPools { 49 | cp := &SafeRpcConnPools{M: make(map[string]*ConnPool), MaxConns: maxConns, MaxIdle: maxIdle, 50 | ConnTimeout: connTimeout, CallTimeout: callTimeout} 51 | 52 | ct := time.Duration(cp.ConnTimeout) * time.Millisecond 53 | for _, address := range cluster { 54 | if _, exist := cp.M[address]; exist { 55 | continue 56 | } 57 | cp.M[address] = createOnePool(address, address, ct, maxConns, maxIdle) 58 | } 59 | 60 | return cp 61 | } 62 | 63 | func (this *SafeRpcConnPools) Proc() []string { 64 | procs := []string{} 65 | for _, cp := range this.M { 66 | procs = append(procs, cp.Proc()) 67 | } 68 | return procs 69 | } 70 | 71 | // 同步发送, 完成发送或超时后 才能返回 72 | func (this *SafeRpcConnPools) Call(addr, method string, args interface{}, resp interface{}) error { 73 | connPool, exists := this.Get(addr) 74 | if !exists { 75 | return fmt.Errorf("%s has no connection pool", addr) 76 | } 77 | 78 | conn, err := connPool.Fetch() 79 | if err != nil { 80 | return fmt.Errorf("%s get connection fail: conn %v, err %v. proc: %s", addr, conn, err, connPool.Proc()) 81 | } 82 | 83 | rpcClient := conn.(RpcClient) 84 | callTimeout := time.Duration(this.CallTimeout) * time.Millisecond 85 | 86 | done := make(chan error) 87 | go func() { 88 | done <- rpcClient.Call(method, args, resp) 89 | }() 90 | 91 | select { 92 | case <-time.After(callTimeout): 93 | connPool.ForceClose(conn) 94 | return fmt.Errorf("%s, call timeout", addr) 95 | case err = <-done: 96 | if err != nil { 97 | connPool.ForceClose(conn) 98 | err = fmt.Errorf("%s, call failed, err %v. 
proc: %s", addr, err, connPool.Proc()) 99 | } else { 100 | connPool.Release(conn) 101 | } 102 | return err 103 | } 104 | } 105 | 106 | func (this *SafeRpcConnPools) Get(address string) (*ConnPool, bool) { 107 | this.RLock() 108 | defer this.RUnlock() 109 | p, exists := this.M[address] 110 | return p, exists 111 | } 112 | 113 | func (this *SafeRpcConnPools) Destroy() { 114 | this.Lock() 115 | defer this.Unlock() 116 | addresses := make([]string, 0, len(this.M)) 117 | for address := range this.M { 118 | addresses = append(addresses, address) 119 | } 120 | 121 | for _, address := range addresses { 122 | this.M[address].Destroy() 123 | delete(this.M, address) 124 | } 125 | } 126 | 127 | func createOnePool(name string, address string, connTimeout time.Duration, maxConns int, maxIdle int) *ConnPool { 128 | p := NewConnPool(name, address, maxConns, maxIdle) 129 | p.New = func(connName string) (NConn, error) { 130 | _, err := net.ResolveTCPAddr("tcp", p.Address) 131 | if err != nil { 132 | //log.Println(p.Address, "format error", err) 133 | return nil, err 134 | } 135 | 136 | conn, err := net.DialTimeout("tcp", p.Address, connTimeout) 137 | if err != nil { 138 | //log.Printf("new conn fail, addr %s, err %v", p.Address, err) 139 | return nil, err 140 | } 141 | 142 | return RpcClient{cli: rpc.NewClient(conn), name: connName}, nil 143 | } 144 | 145 | return p 146 | } 147 | -------------------------------------------------------------------------------- /sender/connpool/influxdb_pool.go: -------------------------------------------------------------------------------- 1 | package connpool 2 | 3 | //Influxdb 4 | import ( 5 | "errors" 6 | "fmt" 7 | "log" 8 | "net/url" 9 | "strings" 10 | "sync" 11 | "time" 12 | 13 | "github.com/baishancloud/octopux-swtfr/g" 14 | "github.com/influxdata/influxdb/client/v2" 15 | ) 16 | 17 | type InfluxdbClient struct { 18 | name string 19 | Address string 20 | Username string 21 | Password string 22 | Database string 23 | UserAgent string 24 | 
Precision string 25 | Timeout time.Duration 26 | UDPPayload int `toml:"udp_payload"` 27 | 28 | cli client.Client 29 | } 30 | 31 | func (this InfluxdbClient) Name() string { 32 | return this.name 33 | } 34 | 35 | func (this InfluxdbClient) Closed() bool { 36 | return this.cli == nil 37 | } 38 | 39 | func (this InfluxdbClient) Close() error { 40 | if this.cli != nil { 41 | this.cli.Close() 42 | this.cli = nil 43 | return nil 44 | } 45 | return nil 46 | } 47 | 48 | func (this *InfluxdbClient) Connect() error { 49 | 50 | // Backward-compatability with single Influx URL config files 51 | // This could eventually be removed in favor of specifying the urls as a list 52 | if this.Address == "" { 53 | return fmt.Errorf("Influxdb url is nil.") 54 | } 55 | 56 | switch { 57 | case strings.HasPrefix(this.Address, "udp"): 58 | parsedURL, err := url.Parse(this.Address) 59 | if err != nil { 60 | return err 61 | } 62 | 63 | if this.UDPPayload == 0 { 64 | this.UDPPayload = client.UDPPayloadSize 65 | } 66 | c, err := client.NewUDPClient(client.UDPConfig{ 67 | Addr: parsedURL.Host, 68 | PayloadSize: this.UDPPayload, 69 | }) 70 | if err != nil { 71 | return err 72 | } 73 | this.cli = c 74 | default: 75 | // If URL doesn't start with "udp", assume HTTP client 76 | c, err := client.NewHTTPClient(client.HTTPConfig{ 77 | Addr: this.Address, 78 | Username: this.Username, 79 | Password: this.Password, 80 | UserAgent: this.UserAgent, 81 | Timeout: this.Timeout, 82 | }) 83 | if err != nil { 84 | return err 85 | } 86 | 87 | this.cli = c 88 | } 89 | 90 | return nil 91 | } 92 | 93 | func newInfluxdbConnPool(name string, address string, connTimeout time.Duration, maxConns int, maxIdle int) *ConnPool { 94 | pool := NewConnPool(name, address, maxConns, maxIdle) 95 | pool.Username = g.Config().Influxdb.Username 96 | pool.Password = g.Config().Influxdb.Password 97 | pool.Database = g.Config().Influxdb.Database 98 | pool.Precision = "s" 99 | 100 | pool.New = func(connName string) (NConn, error) { 101 | 
nconn := InfluxdbClient{ 102 | name: connName, 103 | Address: pool.Address, 104 | Username: pool.Username, 105 | Password: pool.Password, 106 | Database: pool.Database, 107 | Precision: pool.Precision, 108 | Timeout: connTimeout, 109 | } 110 | err := nconn.Connect() 111 | if err != nil { 112 | return nil, err 113 | } 114 | 115 | return nconn, nil 116 | } 117 | 118 | return pool 119 | } 120 | 121 | type InfluxdbConnPoolHelper struct { 122 | sync.RWMutex 123 | M map[string]*ConnPool 124 | MaxConns int 125 | MaxIdle int 126 | ConnTimeout int 127 | CallTimeout int 128 | } 129 | 130 | func (this *InfluxdbConnPoolHelper) Get(address string) (*ConnPool, bool) { 131 | this.RLock() 132 | defer this.RUnlock() 133 | p, exists := this.M[address] 134 | return p, exists 135 | } 136 | 137 | func (this *InfluxdbConnPoolHelper) Destroy() { 138 | this.Lock() 139 | defer this.Unlock() 140 | addresses := make([]string, 0, len(this.M)) 141 | for address := range this.M { 142 | addresses = append(addresses, address) 143 | } 144 | 145 | for _, address := range addresses { 146 | this.M[address].Destroy() 147 | delete(this.M, address) 148 | } 149 | } 150 | 151 | func CreateInfluxdbCliPools(maxConns, maxIdle, connTimeout, callTimeout int, cluster []string) *InfluxdbConnPoolHelper { 152 | tp := &InfluxdbConnPoolHelper{ 153 | M: make(map[string]*ConnPool), MaxConns: maxConns, MaxIdle: maxIdle, 154 | ConnTimeout: connTimeout, CallTimeout: callTimeout, 155 | } 156 | 157 | ct := time.Duration(tp.ConnTimeout) * time.Millisecond 158 | for _, address := range cluster { 159 | if _, exist := tp.M[address]; exist { 160 | continue 161 | } 162 | tp.M[address] = newInfluxdbConnPool(address, address, ct, maxConns, maxIdle) 163 | } 164 | return tp 165 | } 166 | 167 | func (this *InfluxdbConnPoolHelper) Proc() []string { 168 | procs := []string{} 169 | for _, cp := range this.M { 170 | procs = append(procs, cp.Proc()) 171 | } 172 | return procs 173 | } 174 | 175 | func (this *InfluxdbConnPoolHelper) 
Send(addr string, points []*client.Point) (err error) { 176 | connPool, exists := this.Get(addr) 177 | if !exists { 178 | return fmt.Errorf("%s has no connection pool", addr) 179 | } 180 | 181 | conn, err := connPool.Fetch() 182 | if err != nil { 183 | return fmt.Errorf("%s get connection fail: conn %v, err %v. proc: %s", addr, conn, err, connPool.Proc()) 184 | } 185 | 186 | cli := conn.(InfluxdbClient) 187 | 188 | done := make(chan error) 189 | go func() { 190 | err = cli.Write(points) 191 | done <- err 192 | }() 193 | 194 | select { 195 | case <-time.After(time.Duration(this.CallTimeout) * time.Millisecond): 196 | connPool.ForceClose(conn) 197 | return fmt.Errorf("%s, call timeout", conn.Name()) 198 | case err = <-done: 199 | if err != nil { 200 | connPool.ForceClose(conn) 201 | err = fmt.Errorf("%s, call failed, err %v. proc: %s", conn.Name(), err, connPool.Proc()) 202 | } else { 203 | connPool.Release(conn) 204 | } 205 | return err 206 | } 207 | } 208 | 209 | //influx write 210 | func (this *InfluxdbClient) Write(points []*client.Point) error { 211 | bp, _ := client.NewBatchPoints(client.BatchPointsConfig{ 212 | Database: this.Database, 213 | Precision: this.Precision, 214 | }) 215 | 216 | for _, point := range points { 217 | bp.AddPoint(point) 218 | } 219 | 220 | if e := this.cli.Write(bp); e != nil { 221 | log.Println("ERROR: " + e.Error()) 222 | return errors.New("Could not write to any InfluxDB server in cluster") 223 | } 224 | 225 | return nil 226 | } 227 | -------------------------------------------------------------------------------- /sender/send_queues.go: -------------------------------------------------------------------------------- 1 | package sender 2 | 3 | import ( 4 | "github.com/baishancloud/octopux-swtfr/g" 5 | nlist "github.com/toolkits/container/list" 6 | ) 7 | 8 | func initSendQueues() { 9 | cfg := g.Config() 10 | 11 | if cfg.Influxdb != nil && cfg.Influxdb.Enabled { 12 | for tnode, _ := range cfg.Influxdb.Cluster { 13 | Q := 
nlist.NewSafeListLimited(DefaultSendQueueMaxSize) 14 | InfluxdbQueues[tnode] = Q 15 | } 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /sender/send_tasks.go: -------------------------------------------------------------------------------- 1 | package sender 2 | 3 | import ( 4 | "log" 5 | "time" 6 | 7 | pfc "github.com/baishancloud/goperfcounter" 8 | "github.com/baishancloud/octopux-swtfr/g" 9 | "github.com/influxdata/influxdb/client/v2" 10 | nsema "github.com/toolkits/concurrent/semaphore" 11 | "github.com/toolkits/container/list" 12 | ) 13 | 14 | // send 15 | const ( 16 | DefaultSendTaskSleepInterval = time.Millisecond * 50 //默认睡眠间隔为50ms 17 | ) 18 | 19 | // TODO 添加对发送任务的控制,比如stop等 20 | func startSendTasks(server *g.ReceiverStatusManager) { 21 | cfg := g.Config() 22 | // init semaphore 23 | influxdbConcurrent := cfg.Influxdb.MaxIdle 24 | 25 | if influxdbConcurrent < 1 { 26 | influxdbConcurrent = 1 27 | } 28 | 29 | // init send go-routines 30 | if cfg.Influxdb != nil && cfg.Influxdb.Enabled { 31 | for node, _ := range cfg.Influxdb.Cluster { 32 | queue := InfluxdbQueues[node] 33 | go forward2InfluxdbTask(queue, node, influxdbConcurrent, server) 34 | } 35 | } 36 | 37 | } 38 | 39 | // Tsdb定时任务, 将 Tsdb发送缓存中的数据 通过api连接池 发送到Tsdb 40 | func forward2InfluxdbTask(Q *list.SafeListLimited, node string, concurrent int, server *g.ReceiverStatusManager) { 41 | 42 | batch := g.Config().Influxdb.Batch // 一次发送,最多batch条数据 43 | sema := nsema.NewSemaphore(concurrent) 44 | addr := g.Config().Influxdb.Cluster[node] 45 | retry := g.Config().Influxdb.MaxRetry 46 | server.Add(1) 47 | defer server.Done() 48 | 49 | for { 50 | items := Q.PopBackBy(batch) 51 | count := len(items) 52 | if count == 0 { 53 | time.Sleep(DefaultSendTaskSleepInterval) 54 | if server.IsRun() == false && Q.Len() == 0 { 55 | return 56 | } 57 | continue 58 | } 59 | pts := make([]*client.Point, count) 60 | for i := 0; i < count; i++ { 61 | pts[i] = 
items[i].(*client.Point) 62 | } 63 | 64 | sema.Acquire() 65 | go func(addr string, itemList []*client.Point) { 66 | defer sema.Release() 67 | var err error 68 | start := time.Now() 69 | 70 | for i := 0; i < retry; i++ { //最多重试3次 71 | err = InfluxdbConnPools.Send(addr, pts) 72 | if err == nil { 73 | pfc.Meter("SWTFRSendCnt"+node, int64(len(pts))) 74 | break 75 | } 76 | time.Sleep(time.Millisecond * 10) 77 | } 78 | 79 | // statistics 80 | if err != nil { 81 | log.Printf("send to tsdb %s:%s fail: %v", node, addr, err) 82 | pfc.Meter("SWTFRSendFail"+node, int64(len(pts))) 83 | } 84 | pfc.Histogram("SWTFRSendTime"+node, int64(time.Since(start)/time.Millisecond)) 85 | }(addr, pts) 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /sender/sender.go: -------------------------------------------------------------------------------- 1 | package sender 2 | 3 | import ( 4 | "log" 5 | "time" 6 | 7 | pfc "github.com/baishancloud/goperfcounter" 8 | "github.com/baishancloud/octopux-swtfr/g" 9 | cpool "github.com/baishancloud/octopux-swtfr/sender/connpool" 10 | "github.com/influxdata/influxdb/client/v2" 11 | cmodel "github.com/open-falcon/common/model" 12 | nlist "github.com/toolkits/container/list" 13 | ) 14 | 15 | const ( 16 | DefaultSendQueueMaxSize = 102400 //10.24w 17 | ) 18 | 19 | // 服务节点的一致性哈希环 20 | // pk -> node 21 | var () 22 | 23 | // 发送缓存队列 24 | // node -> queue_of_data 25 | var ( 26 | InfluxdbQueues = make(map[string]*nlist.SafeListLimited) 27 | ) 28 | 29 | // 连接池 30 | // node_address -> connection_pool 31 | var ( 32 | InfluxdbConnPools *cpool.InfluxdbConnPoolHelper 33 | ) 34 | 35 | // 初始化数据发送服务, 在main函数中调用 36 | func Start(server *g.ReceiverStatusManager) { 37 | initConnPools() 38 | initSendQueues() 39 | // SendTasks依赖基础组件的初始化,要最后启动 40 | startSendTasks(server) 41 | startSenderCron(server) 42 | log.Println("send.Start, ok") 43 | } 44 | 45 | // 将数据 打入所有的Tsdb的发送缓存队列, 相互备份 46 | func Push2TsdbSendQueue(items 
[]*cmodel.MetaData) { 47 | removeMetrics := g.Config().Influxdb.RemoveMetrics 48 | //log.Printf("Push2TsdbSendQueue") 49 | for _, item := range items { 50 | b, ok := removeMetrics[item.Metric] 51 | //log.Printf ("select:%V,%V,%V", b, ok,item ) 52 | if b && ok { 53 | continue 54 | } 55 | influxPoint := Convert2InfluxPoint(item) 56 | errCnt := 0 57 | for _, Q := range InfluxdbQueues { 58 | if !Q.PushFront(influxPoint) { 59 | errCnt += 1 60 | } 61 | } 62 | 63 | // statistics 64 | if errCnt > 0 { 65 | pfc.Meter("SWTFRSendToInfluxdbDropCnt", int64(errCnt)) 66 | } 67 | 68 | } 69 | } 70 | 71 | // 转化为tsdb格式 72 | func Convert2InfluxPoint(d *cmodel.MetaData) *client.Point { 73 | d.Tags["Endpoint"] = d.Endpoint 74 | pt, _ := client.NewPoint( 75 | d.Metric, 76 | d.Tags, 77 | map[string]interface{}{"value": d.Value}, 78 | time.Unix(d.Timestamp, 0), 79 | ) 80 | return pt 81 | } 82 | 83 | func alignTs(ts int64, period int64) int64 { 84 | return ts - ts%period 85 | } 86 | -------------------------------------------------------------------------------- /sender/sender_cron.go: -------------------------------------------------------------------------------- 1 | package sender 2 | 3 | import ( 4 | "time" 5 | 6 | pfc "github.com/baishancloud/goperfcounter" 7 | "github.com/baishancloud/octopux-swtfr/g" 8 | "github.com/toolkits/container/list" 9 | ) 10 | 11 | const ( 12 | DefaultProcCronPeriod = time.Duration(10) * time.Second //ProcCron的周期,默认1s 13 | ) 14 | 15 | // send_cron程序入口 16 | func startSenderCron(server *g.ReceiverStatusManager) { 17 | go startProcCron(server) 18 | } 19 | 20 | func startProcCron(server *g.ReceiverStatusManager) { 21 | server.Add(1) 22 | defer server.Done() 23 | for { 24 | time.Sleep(DefaultProcCronPeriod) 25 | refreshSendingCacheSize() 26 | if server.IsRun() == false { 27 | return 28 | } 29 | } 30 | } 31 | 32 | func refreshSendingCacheSize() { 33 | pfc.Gauge("SWTFRInfluxdbQueueSize", calcSendCacheSize(InfluxdbQueues)) 34 | 35 | } 36 | func 
calcSendCacheSize(mapList map[string]*list.SafeListLimited) int64 {
	// Sum the lengths of all per-node queues; nil entries count as zero.
	// The loop variable is named q rather than "list" so it no longer
	// shadows the imported list package.
	var cnt int64
	for _, q := range mapList {
		if q != nil {
			cnt += int64(q.Len())
		}
	}
	return cnt
}
--------------------------------------------------------------------------------
/test/debug:
--------------------------------------------------------------------------------
#!/bin/bash
## test home
testdir=$(cd $(dirname $0)/; pwd)
## work home
workdir=$(dirname $testdir)
cd $workdir

module=transfer
app=falcon-$module
pidfile=var/app.pid
logfile=var/app.log
control=./control
httpprex="127.0.0.1:6060"
sockaddr="127.0.0.1"
sockport="4444"

## statistics: dump all internal counters as pretty-printed JSON
function statistics(){
    curl -s "$httpprex/statistics/all" | python -m json.tool
}

## config: show current config, or reload it when $1 == "reload"
function config(){
    action=$1
    case $action in
        "reload")
            curl -s "$httpprex/config/reload" | python -m json.tool
            ;;
        *)
            curl -s "$httpprex/config" | python -m json.tool
            ;;
    esac
}

## status: query the connection-pool debug endpoint (module defaults to graph)
function conn_pool_status(){
    m=$1
    if [ "X$m" == "X" ]; then
        m="graph"
    fi
    curl -s "$httpprex/debug/connpool/$m"
    printf "\n"
}

## trace: watch received values for one endpoint/metric/tag combination
function trace_recv(){
    e="test.endpoint.niean.1"
    m="test.metric.niean.1"
    t="tag0=tag0-niean-1,tag1=tag1-niean-1,tag2=tag2-niean-1"
    curl -s "$httpprex/trace/$e/$m/$t" | python -m json.tool
}

## filter: watch received values matching a comparison (here: value < 5)
function filter_recv(){
    e="test.endpoint.niean.1"
    m="test.metric.niean.1"
    opt="lt"
    val="5"
    t="tag0=tag0-niean-1,tag1=tag1-niean-1,tag2=tag2-niean-1"
    curl -s "$httpprex/filter/$e/$m/$opt/$val/$t" | python -m json.tool
}

## api: push one synthetic GAUGE sample over the HTTP API
function http_post(){
    e="test.endpoint.niean.1"
    m="test.metric.niean.1"
    t="tag0=tag0-niean-1,tag1=tag1-niean-1,tag2=tag2-niean-1"
    ts=`date +%s`
    # value cycles 0-9 once per minute so repeated pushes are distinguishable
    val=`expr $ts / 60 % 10`
    curl -s -X POST -d "[{\"metric\":\"$m\", \"endpoint\":\"$e\", \"timestamp\":$ts,\"step\":60, \"value\":$val, \"counterType\":\"GAUGE\",\"tags\":\"$t\"}]" "$httpprex/api/push" | python -m json.tool
}


## telnet: push $1 samples (default 1) through the telnet/socket interface
function telnet_send(){
    cnt=$1
    if [ "X$cnt" == "X" ];then
        cnt=1
    fi

    e="test.endpoint.niean.1"
    m="test.metric.niean.1"
    type="GAUGE"
    step=60
    # endpoint counter timestamp value [type] [step]
    (for i in `seq 1 $cnt`; do echo "update $e $m `date +%s` $i $type $step"; sleep 1; done; echo "quit") | nc $sockaddr $sockport
    # NOTE(review): $? here is nc's exit status, not the loop's
    if [ $? -eq 0 ]; then
        echo "ok"
    else
        echo "error"
    fi
}

## tail: follow the service log via the control script
function tail_log(){
    $control tail
}

## build / start / stop: thin wrappers around the control script
function build(){
    $control build
    [ $? -eq 0 ] && echo -e "build ok" || { echo -e "build error"; exit 1; }
}
function start(){
    $control start
}
function stop(){
    $control stop
}

## mockagent: locations of the mock-agent source and build artifacts
srcname=mockagent
appname=$srcname-debug
builddir=$testdir/build
masrc=$testdir/$srcname.go
matarget=$builddir/$appname.bin

# compile the mock agent into $matarget; exits non-zero on build failure
function build_mockagent(){
    rm -rf $matarget &&\
    go build -o $matarget $masrc &>/dev/null
    ec=$?
    [ $ec -eq 0 ] && echo -e "mockagent build, ok" || { echo -e "mockagent build, error"; exit $ec;}
}

# remove the mock-agent build directory entirely
function clean_mockagent(){
    rm -rf $builddir
    ec=$?
    [ $ec -eq 0 ] && echo -e "clean mockagent, ok" || { echo -e "clean mockagent, error"; exit $ec; }
}

# kill every running mockagent process found by name (SIGKILL)
function kill_mockagent(){
    pids=`ps -ef | grep $appname.bin | grep -v grep | awk '{print $2}'`
    for pid in $pids
    do
        kill -9 $pid &>/dev/null
        echo -e "kill mockagent, $pid"
        sleep 0.01
    done
    echo -e "kill mockagent ok"
}

# launch $1 mock agents (default 1), each logging to its own file
function start_mockagent(){
    cnt=$1
    if [ "X$cnt" == "X" ];then
        cnt=1
    fi

    for i in `seq 1 $cnt`
    do
        id=malog.`date +%s`
        $matarget -i 3 > $builddir/$id.$i.log &
        echo -e "start mockagent, $id.$i"
        sleep 0.2
    done
}

# dispatch: first CLI argument selects the action; default prints statistics
action=$1
case $action in
    "build")
        build
        ;;
    "start")
        start
        ;;
    "stop")
        stop
        ;;
    "restart")
        stop && build && start
        ;;
    "config")
        config $2
        ;;
    "tail")
        tail_log
        ;;
    "trace")
        trace_recv
        ;;
    "filter")
        filter_recv
        ;;
    "conn")
        conn_pool_status $2
        ;;
    "post")
        http_post
        ;;
    "send")
        telnet_send
        ;;
    "startm")
        start_mockagent $2
        ;;
    "killm")
        kill_mockagent
        ;;
    "cleanm")
        clean_mockagent
        ;;
    "buildm")
        build_mockagent
        ;;
    *)
        statistics
        ;;
esac

--------------------------------------------------------------------------------
/test/rpcclient.py:
--------------------------------------------------------------------------------
import json
import socket
import itertools
import time

# NOTE(review): Python 2 only script (uses iterator .next(), xrange and the
# print statement further below).
class RPCClient(object):
    # Minimal blocking JSON-RPC client over a raw TCP socket.

    def __init__(self, addr, codec=json):
        # addr is a (host, port) tuple; codec must offer dumps/loads (json-like)
        self._socket = socket.create_connection(addr)
        self._id_iter = itertools.count()
        self._codec = codec

    def _message(self, name, *params):
        # Build one request object with a fresh monotonically increasing id.
        return dict(id=self._id_iter.next(),
                    params=list(params),
                    method=name)

    def call(self, name, *params):
        """Send one RPC request and return its result.

        Raises Exception when the response id does not match the request id
        or when the server reports an error.
        """
        req = self._message(name, *params)
        id = req.get('id')

        mesg = self._codec.dumps(req)
        self._socket.sendall(mesg)

        # This will actually have to loop if resp is bigger
        # NOTE(review): a single 4096-byte recv assumes the whole response
        # arrives in one read -- fine for this small test payload only.
        resp = self._socket.recv(4096)
        resp = self._codec.loads(resp)

        if resp.get('id') != id:
            raise Exception("expected id=%s, received id=%s: %s"
                            %(id, resp.get('id'), resp.get('error')))

        if resp.get('error') is not None:
            raise Exception(resp.get('error'))

        return resp.get('result')

    def close(self):
        # Close the underlying TCP connection; the client is unusable after.
        self._socket.close()


if __name__ == '__main__':
    # Flood the local transfer service with 10000 pairs of GAUGE/COUNTER
    # samples via Transfer.Update (expects the service on 127.0.0.1:8433).
    rpc = RPCClient(("127.0.0.1", 8433))
    for i in xrange(10000):
        mv1 = dict(endpoint='host.niean', metric='metric.niean.1', value=i, step=60,
                counterType='GAUGE', tags='tag=t'+str(i), timestamp=int(time.time()))
        mv2 = dict(endpoint='host.niean', metric='metric.niean.2', value=i, step=60,
                counterType='COUNTER', tags='tag=t'+str(i), timestamp=int(time.time()))
        print rpc.call("Transfer.Update", [mv1, mv2])