├── .github └── workflows │ └── ci.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── consts.go ├── dingtalk.png ├── doc ├── example.md └── zh.md ├── example ├── 1gbslice │ └── 1gbslice.go ├── alloc │ └── alloc.go ├── channelblock │ └── channelblock.go ├── cpu_explode │ └── cpu_explode.go ├── deadlock │ └── deadlock.go ├── deadloop │ └── deadloop.go ├── gcheap │ ├── .gitignore │ ├── README.md │ ├── gcheap.go │ ├── go.mod │ ├── go.sum │ ├── memory-spike.png │ └── rand.sh ├── pyroscope_rideshare │ ├── README.md │ ├── admin.png │ ├── bike │ │ └── bike.go │ ├── car │ │ └── car.go │ ├── go.mod │ ├── go.sum │ ├── main.go │ ├── requests.py │ ├── scooter │ │ └── scooter.go │ ├── start_client.sh │ └── utility │ │ └── utility.go ├── run_in_docker │ └── run_in_docker.go ├── slowlyleak │ └── slowlyleak.go └── thread_trigger │ └── thread_trigger.go ├── go.mod ├── go.sum ├── holmes.go ├── holmes_test.go ├── log.go ├── options.go ├── readme.md ├── report.go ├── reporters ├── http_reporter │ ├── http_reporter.go │ ├── http_reporter_test.go │ └── reporter_filename_test ├── pyroscope_reporter │ ├── client_config.go │ ├── flameql │ │ ├── error.go │ │ ├── flameql.go │ │ ├── key.go │ │ ├── parse.go │ │ └── sortedmap.go │ ├── pyroscope_client.go │ └── pyroscope_client_test.go └── reporter_test.go ├── ring.go ├── ring_test.go ├── tool └── build-example.sh └── util.go /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: actions 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | jobs: 8 | golangci-lint: 9 | name: runner / golangci-lint 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Check out code into the Go module directory 13 | uses: actions/checkout@v2 14 | - name: golangci-lint 15 | uses: reviewdog/action-golangci-lint@v1 16 | with: 17 | golangci_lint_flags: --timeout=10m --tests=false --skip-dirs=example 18 | 19 | test: 20 | name: Test 21 | runs-on: ubuntu-latest 22 | steps: 23 | - 
name: Set up Go 24 | uses: actions/setup-go@v1 25 | with: 26 | go-version: 1.14.13 27 | 28 | - name: Check out code 29 | uses: actions/checkout@v1 30 | 31 | - name: holmes test 32 | run: make test 33 | 34 | - name: example 35 | run: make example 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | 3 | *.log 4 | 5 | ./test_case_gen 6 | 7 | vendor 8 | 9 | reporters/*.bin 10 | 11 | example/1gbslice/1gbslice 12 | example/alloc/alloc 13 | example/channelblock/channelblock 14 | example/cpu_explode/cpu_explode 15 | example/deadlock/deadlock 16 | example/deadloop/deadloop 17 | example/gcheap/m 18 | example/run_in_docker/run_in_docker 19 | example/slowlyleak/slowlyleak 20 | example/thread_trigger/thread_trigger 21 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Holmes contributor guide 2 | 3 | Holmes is released under the Apache 2.0 license, and follows a very standard Github development process, using Github tracker for issues and merging pull requests into master. If you would like to contribute something, or simply want to hack on the code this document should help you get started. 4 | 5 | Before we accept a non-trivial patch or pull request we will need you to sign the Contributor License Agreement. Signing the contributor’s agreement does not grant anyone commits rights to the main repository, but it does mean that we can accept your contributions, and you will get an author credit if we do. Active contributors might be asked to join the core team and given the ability to merge pull requests. 6 | 7 | ## Code Conventions 8 | 9 | None of these is essential for a pull request, but they will all help. 10 | 11 | 1. 
Code format 12 | - With the CLI, run `goimports -w yourfile.go` and `golint yourfile.go` to format and check the code style 13 | - With an IDE like GoLand, select 'Group stdlib imports', 'Move all stdlib imports in a single group', 'Move all imports in a single declaration' on the Go->Imports page 14 | - We check the code format when running CI tests, so please ensure that you have built the project before you push your branch. 15 | 2. Make sure all new `.go` files have a simple doc class comment 16 | with at least an `author` tag identifying you, and preferably at least a 17 | paragraph on what the class is for. 18 | 3. Add the ASF license header comment to all new `.go` files (copy from existing files in the project) 19 | 4. Add yourself as an `author` to the `.go` files that you modify substantially (more than cosmetic changes). 20 | 5. Add some docs. 21 | 6. A few unit tests would help a lot as well — someone has to do it. 22 | 7. When writing a commit message please follow [these conventions](https://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html). If you are fixing an existing issue, please add Fixes gh-XXXX at the end of the commit message (where XXXX is the issue number). 23 | 8. Please ensure that code coverage will not decrease. 24 | 9. Contribute a PR following the Gitflow Workflow, and follow the pull request rules. 25 | 26 | ## Version naming convention 27 | 28 | A Holmes version consists of three numbers in the format x.x.x: the first is for compatibility; the second is for new features and enhancements; the last is for bug fixes. 29 | 30 | ## PR review policy for maintainers 31 | 32 | The following strategies are recommended for project maintainers to review code: 33 | 34 | 1. Check the issue associated with this PR 35 | 2. Check the solution's reasonableness 36 | 3. Check the unit test and benchmark results 37 | 4. 
Pay attention to the code which makes the code structure change, the usage of the global variable, the handling of the corner case and concurrency 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | modules=$(shell go list ./... | grep -v example) 2 | test: 3 | GO111MODULE=on go test -gcflags "-N -l" $(modules) 4 | 5 | lint: 6 | golangci-lint run --timeout=10m --exclude-use-default=false --tests=false --skip-dirs=example 7 | 8 | .PHONY: example 9 | example: 10 | bash tool/build-example.sh 11 | -------------------------------------------------------------------------------- /consts.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package holmes 19 | 20 | import ( 21 | "os" 22 | "time" 23 | ) 24 | 25 | const ( 26 | defaultThreadTriggerMin = 10 // 10 threads 27 | defaultThreadTriggerAbs = 70 // 70 threads 28 | defaultThreadTriggerDiff = 25 // 25% 29 | 30 | defaultCPUTriggerMin = 10 // 10% 31 | defaultCPUTriggerAbs = 70 // 70% 32 | defaultCPUTriggerDiff = 25 // 25% 33 | defaultCPUSamplingTime = 5 * time.Second // collect 5s cpu profile 34 | 35 | defaultGoroutineTriggerMin = 3000 // 3000 goroutines 36 | defaultGoroutineTriggerAbs = 200000 // 200k goroutines 37 | defaultGoroutineTriggerDiff = 20 // 20% diff 38 | 39 | defaultMemTriggerMin = 10 // 10% 40 | defaultMemTriggerAbs = 80 // 80% 41 | defaultMemTriggerDiff = 25 // 25% 42 | 43 | defaultGCHeapTriggerMin = 10 // 10% 44 | defaultGCHeapTriggerAbs = 40 // 40% 45 | defaultGCHeapTriggerDiff = 20 // 20% 46 | 47 | defaultCooldown = time.Minute 48 | defaultThreadCoolDown = time.Hour 49 | defaultGoroutineCoolDown = time.Minute * 10 50 | 51 | defaultInterval = 5 * time.Second 52 | defaultDumpProfileType = binaryDump 53 | defaultDumpPath = "/tmp" 54 | defaultLoggerName = "holmes.log" 55 | defaultLoggerFlags = os.O_RDWR | os.O_CREATE | os.O_APPEND 56 | defaultLoggerPerm = 0644 57 | defaultShardLoggerSize = 5242880 // 5m 58 | ) 59 | 60 | type dumpProfileType int 61 | 62 | const ( 63 | binaryDump dumpProfileType = 0 64 | textDump dumpProfileType = 1 65 | ) 66 | 67 | type configureType int 68 | 69 | const ( 70 | mem configureType = iota 71 | cpu 72 | thread 73 | goroutine 74 | gcHeap 75 | ) 76 | 77 | // 
check type to profile name, just align to pprof 78 | var type2name = map[configureType]string{ 79 | mem: "heap", 80 | cpu: "cpu", 81 | thread: "threadcreate", 82 | goroutine: "goroutine", 83 | gcHeap: "heap", 84 | } 85 | 86 | // check type to check name 87 | var check2name = map[configureType]string{ 88 | mem: "mem", 89 | cpu: "cpu", 90 | thread: "thread", 91 | goroutine: "goroutine", 92 | gcHeap: "GCHeap", 93 | } 94 | 95 | const ( 96 | cgroupMemLimitPath = "/sys/fs/cgroup/memory/memory.limit_in_bytes" 97 | cgroupCpuQuotaPath = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us" 98 | cgroupCpuPeriodPath = "/sys/fs/cgroup/cpu/cpu.cfs_period_us" 99 | ) 100 | 101 | const minCollectCyclesBeforeDumpStart = 10 102 | 103 | const ( 104 | // TrimResultTopN trimResult return only reserve the top n. 105 | TrimResultTopN = 10 106 | 107 | // TrimResultMaxBytes trimResultFront return only reserve the front n bytes. 108 | TrimResultMaxBytes = 512000 109 | 110 | // NotSupportTypeMaxConfig means this profile type is 111 | // not support control dump profile by max parameter. 112 | NotSupportTypeMaxConfig = 0 113 | 114 | // UniformLogFormat is the format of uniform logging. 
115 | UniformLogFormat = "[Holmes] %v %v, config_min : %v, config_diff : %v, config_abs : %v, config_max : %v, previous : %v, current: %v" 116 | ) 117 | -------------------------------------------------------------------------------- /dingtalk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mosn/holmes/efb1a7768843e83b645f6e683f7b6c5d826651ab/dingtalk.png -------------------------------------------------------------------------------- /doc/example.md: -------------------------------------------------------------------------------- 1 | * [cases show](#cases-show) 2 | * [RSS peak caused by make a 1GB slice](#rss-peak-caused-by-make-a-1gb-slice) 3 | * [goroutine explosion caused by deadlock](#goroutine-explosion-caused-by-deadlock) 4 | * [goroutine explosion caused by channel block](#goroutine-explosion-caused-by-channel-block) 5 | * [process slowly leaks goroutines](#process-slowly-leaks-goroutines) 6 | * [large memory allocation caused by business logic](#large-memory-allocation-caused-by-business-logic) 7 | * [deadloop caused cpu outage](#deadloop-caused-cpu-outage) 8 | * [large thread allocation caused by cgo block](#large-thread-allocation-caused-by-cgo-block) 9 | 10 | 11 | ## cases show 12 | all example code in [here](../example) 13 | 14 | ### RSS peak caused by make a 1GB slice 15 | 16 | see this [example](example/1gbslice/1gbslice.go) 17 | 18 | after warming up, just curl http://localhost:10003/make1gb for some times, then you'll probably see: 19 | 20 | ``` 21 | heap profile: 0: 0 [1: 1073741824] @ heap/1048576 22 | 0: 0 [1: 1073741824] @ 0x42ba3ef 0x4252254 0x4254095 0x4254fd3 0x425128c 0x40650a1 23 | # 0x42ba3ee main.make1gbslice+0x3e /Users/xargin/go/src/github.com/mosn/holmes/example/1gbslice.go:24 24 | # 0x4252253 net/http.HandlerFunc.ServeHTTP+0x43 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2012 25 | # 0x4254094 net/http.(*ServeMux).ServeHTTP+0x1a4 
/Users/xargin/sdk/go1.14.2/src/net/http/server.go:2387 26 | # 0x4254fd2 net/http.serverHandler.ServeHTTP+0xa2 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2807 27 | # 0x425128b net/http.(*conn).serve+0x86b /Users/xargin/sdk/go1.14.2/src/net/http/server.go:1895 28 | ``` 29 | 30 | 1: 1073741824 means 1 object consuming 1GB of memory. 31 | 32 | ### goroutine explosion caused by deadlock 33 | 34 | See this [example](../example/deadlock/deadlock.go) 35 | 36 | curl localhost:10003/lockorder1 37 | 38 | curl localhost:10003/lockorder2 39 | 40 | After warming up, run wrk -c 100 http://localhost:10003/req, then you'll see the goroutine 41 | count peak caused by the deadlock: 42 | 43 | ``` 44 | 100 @ 0x40380b0 0x4048c80 0x4048c6b 0x40489e7 0x406f72c 0x42badfc 0x42badfd 0x4252b94 0x42549d5 0x4255913 0x4251bcc 0x40659e1 45 | # 0x40489e6 sync.runtime_SemacquireMutex+0x46 /Users/xargin/sdk/go1.14.2/src/runtime/sema.go:71 46 | # 0x406f72b sync.(*Mutex).lockSlow+0xfb /Users/xargin/sdk/go1.14.2/src/sync/mutex.go:138 47 | # 0x42badfb sync.(*Mutex).Lock+0x8b /Users/xargin/sdk/go1.14.2/src/sync/mutex.go:81 48 | # 0x42badfc main.req+0x8c /Users/xargin/go/src/github.com/mosn/holmes/example/deadlock.go:30 49 | # 0x4252b93 net/http.HandlerFunc.ServeHTTP+0x43 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2012 50 | # 0x42549d4 net/http.(*ServeMux).ServeHTTP+0x1a4 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2387 51 | # 0x4255912 net/http.serverHandler.ServeHTTP+0xa2 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2807 52 | # 0x4251bcb net/http.(*conn).serve+0x86b /Users/xargin/sdk/go1.14.2/src/net/http/server.go:1895 53 | 1 @ 0x40380b0 0x4048c80 0x4048c6b 0x40489e7 0x406f72c 0x42bb041 0x42bb042 0x4252b94 0x42549d5 0x4255913 0x4251bcc 0x40659e1 54 | 55 | # 0x40489e6 sync.runtime_SemacquireMutex+0x46 /Users/xargin/sdk/go1.14.2/src/runtime/sema.go:71 56 | # 0x406f72b sync.(*Mutex).lockSlow+0xfb /Users/xargin/sdk/go1.14.2/src/sync/mutex.go:138 57 | # 0x42bb040 sync.(*Mutex).Lock+0xf0 
/Users/xargin/sdk/go1.14.2/src/sync/mutex.go:81 58 | # 0x42bb041 main.lockorder2+0xf1 /Users/xargin/go/src/github.com/mosn/holmes/example/deadlock.go:50 59 | # 0x4252b93 net/http.HandlerFunc.ServeHTTP+0x43 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2012 60 | # 0x42549d4 net/http.(*ServeMux).ServeHTTP+0x1a4 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2387 61 | # 0x4255912 net/http.serverHandler.ServeHTTP+0xa2 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2807 62 | # 0x4251bcb net/http.(*conn).serve+0x86b /Users/xargin/sdk/go1.14.2/src/net/http/server.go:1895 63 | 64 | 1 @ 0x40380b0 0x4048c80 0x4048c6b 0x40489e7 0x406f72c 0x42baf11 0x42baf12 0x4252b94 0x42549d5 0x4255913 0x4251bcc 0x40659e1 65 | # 0x40489e6 sync.runtime_SemacquireMutex+0x46 /Users/xargin/sdk/go1.14.2/src/runtime/sema.go:71 66 | # 0x406f72b sync.(*Mutex).lockSlow+0xfb /Users/xargin/sdk/go1.14.2/src/sync/mutex.go:138 67 | # 0x42baf10 sync.(*Mutex).Lock+0xf0 /Users/xargin/sdk/go1.14.2/src/sync/mutex.go:81 68 | # 0x42baf11 main.lockorder1+0xf1 /Users/xargin/go/src/github.com/mosn/holmes/example/deadlock.go:40 69 | # 0x4252b93 net/http.HandlerFunc.ServeHTTP+0x43 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2012 70 | # 0x42549d4 net/http.(*ServeMux).ServeHTTP+0x1a4 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2387 71 | # 0x4255912 net/http.serverHandler.ServeHTTP+0xa2 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2807 72 | # 0x4251bcb net/http.(*conn).serve+0x86b /Users/xargin/sdk/go1.14.2/src/net/http/server.go:1895 73 | ``` 74 | 75 | The req API was blocked by the deadlock. 76 | 77 | You should set DumpFullStack to true to locate the deadlock bug. 
78 | 79 | ### goroutine explosion caused by channel block 80 | 81 | See this [example](../example/channelblock/channelblock.go) 82 | 83 | After warming up, just run wrk -c100 http://localhost:10003/chanblock: 84 | 85 | ``` 86 | goroutine profile: total 203 87 | 100 @ 0x4037750 0x4007011 0x4006a15 0x42ba3c9 0x4252234 0x4254075 0x4254fb3 0x425126c 0x4065081 88 | # 0x42ba3c8 main.channelBlock+0x38 /Users/xargin/go/src/github.com/mosn/holmes/example/channelblock.go:26 89 | # 0x4252233 net/http.HandlerFunc.ServeHTTP+0x43 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2012 90 | # 0x4254074 net/http.(*ServeMux).ServeHTTP+0x1a4 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2387 91 | # 0x4254fb2 net/http.serverHandler.ServeHTTP+0xa2 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2807 92 | # 0x425126b net/http.(*conn).serve+0x86b /Users/xargin/sdk/go1.14.2/src/net/http/server.go:1895 93 | ``` 94 | 95 | The blocking call site is easy to locate from the stack. 96 | 97 | ### process slowly leaks goroutines 98 | 99 | See this [example](../example/slowlyleak/slowlyleak.go) 100 | 101 | The producer forgets to close the task channel after producing finishes, so every request 102 | to this URI will leak a goroutine. We can curl http://localhost:10003/leak several 103 | times and get the following log: 104 | 105 | ``` 106 | goroutine profile: total 10 107 | 7 @ 0x4038380 0x4008497 0x400819b 0x42bb129 0x4065cb1 108 | # 0x42bb128 main.leak.func1+0x48 /Users/xargin/go/src/github.com/mosn/holmes/example/slowlyleak.go:26 109 | ``` 110 | 111 | It's easy to find the cause of the leak. 112 | 113 | ### large memory allocation caused by business logic 114 | 115 | See this [example](../example/alloc/alloc.go); it is similar to the 1GB slice example. 
116 | 117 | After warming up finished, wrk -c100 http://localhost:10003/alloc: 118 | 119 | ``` 120 | pprof memory, config_min : 3, config_diff : 25, config_abs : 80, previous : [0 0 0 4 0 0 0 0 0 0], current : 4 121 | heap profile: 83: 374069984 [3300: 14768402720] @ heap/1048576 122 | 79: 374063104 [3119: 14768390144] @ 0x40104b3 0x401024f 0x42bb1ba 0x4252ff4 0x4254e35 0x4255d73 0x425202c 0x4065e41 123 | # 0x42bb1b9 main.alloc+0x69 /Users/xargin/go/src/github.com/mosn/holmes/example/alloc.go:25 124 | # 0x4252ff3 net/http.HandlerFunc.ServeHTTP+0x43 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2012 125 | # 0x4254e34 net/http.(*ServeMux).ServeHTTP+0x1a4 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2387 126 | # 0x4255d72 net/http.serverHandler.ServeHTTP+0xa2 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2807 127 | # 0x425202b net/http.(*conn).serve+0x86b /Users/xargin/sdk/go1.14.2/src/net/http/server.go:1895 128 | ``` 129 | 130 | ### deadloop caused cpu outage 131 | 132 | See this [example](example/cpu_explode/cpu_explode.go). 133 | 134 | After warming up finished, curl http://localhost:10003/cpuex several times, then you'll 135 | see the cpu profile dump to your dump path. 136 | 137 | Notice the cpu profile currently doesn't support text mode. 138 | 139 | ``` 140 | go tool pprof cpu.20201028100641.bin 141 | 142 | (pprof) top 143 | Showing nodes accounting for 19.45s, 99.95% of 19.46s total 144 | Dropped 6 nodes (cum <= 0.10s) 145 | flat flat% sum% cum cum% 146 | 17.81s 91.52% 91.52% 19.45s 99.95% main.cpuex.func1 147 | 1.64s 8.43% 99.95% 1.64s 8.43% runtime.asyncPreempt 148 | 149 | (pprof) list func1 150 | Total: 19.46s 151 | ROUTINE ======================== main.cpuex.func1 in /Users/xargin/go/src/github.com/mosn/holmes/example/cpu_explode.go 152 | 17.81s 19.45s (flat, cum) 99.95% of Total 153 | 80ms 80ms 1:/* 154 | * Licensed to the Apache Software Foundation (ASF) under one or more 155 | * contributor license agreements. 
See the NOTICE file distributed with 156 | * this work for additional information regarding copyright ownership. 157 | * The ASF licenses this file to You under the Apache License, Version 2.0 158 | * (the "License"); you may not use this file except in compliance with 159 | * the License. You may obtain a copy of the License at 160 | * 161 | * http://www.apache.org/licenses/LICENSE-2.0 162 | * 163 | * Unless required by applicable law or agreed to in writing, software 164 | * distributed under the License is distributed on an "AS IS" BASIS, 165 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 166 | * See the License for the specific language governing permissions and 167 | * limitations under the License. 168 | */ 169 | 170 | package main 171 | . . 2: 172 | . . 3:import ( 173 | . . 4: "net/http" 174 | . . 5: "time" 175 | . . 6: 176 | . . 7: "github.com/mosn/holmes" 177 | . . 8:) 178 | . . 9: 179 | . . 10:func init() { 180 | . . 11: http.HandleFunc("/cpuex", cpuex) 181 | . . 12: go http.ListenAndServe(":10003", nil) 182 | . . 13:} 183 | . . 14: 184 | . . 15:var h = holmes.New("2s", "1m", "/tmp", false). 185 | . . 16: EnableCPUDump().Config(20, 25, 80) 186 | . . 17: 187 | . . 18:func main() { 188 | . . 19: h.Start() 189 | . . 20: time.Sleep(time.Hour) 190 | . . 21:} 191 | . . 22: 192 | . . 23:func cpuex(wr http.ResponseWriter, req *http.Request) { 193 | . . 24: go func() { 194 | 17.73s 19.37s 25: for { 195 | . . 26: } 196 | . . 27: }() 197 | . . 28:} 198 | 199 | ``` 200 | 201 | So we find out the criminal. 202 | 203 | ### large thread allocation caused by cgo block 204 | 205 | See this [example](./example/thread_trigger/thread_trigger.go) 206 | 207 | This is a cgo block example, massive cgo blocking will cause many threads created. 
208 | 209 | After warming up, curl http://localhost:10003/leak, then the thread profile and goroutine 210 | profile will be dumped to dumpPath: 211 | 212 | ``` 213 | [2020-11-10 19:49:52.145][Holmes] pprof thread, config_min : 10, config_diff : 25, config_abs : 100, previous : [8 8 8 8 8 8 8 8 8 1013], current : 1013 214 | [2020-11-10 19:49:52.146]threadcreate profile: total 1013 215 | 1012 @ 216 | # 0x0 217 | 218 | 1 @ 0x403af6e 0x403b679 0x4037e34 0x4037e35 0x40677d1 219 | # 0x403af6d runtime.allocm+0x14d /Users/xargin/sdk/go1.14.2/src/runtime/proc.go:1390 220 | # 0x403b678 runtime.newm+0x38 /Users/xargin/sdk/go1.14.2/src/runtime/proc.go:1704 221 | # 0x4037e33 runtime.startTemplateThread+0x2c3 /Users/xargin/sdk/go1.14.2/src/runtime/proc.go:1768 222 | # 0x4037e34 runtime.main+0x2c4 /Users/xargin/sdk/go1.14.2/src/runtime/proc.go:186 223 | 224 | goroutine profile: total 1002 225 | 999 @ 0x4004f8b 0x4394a61 0x4394f79 0x40677d1 226 | # 0x4394a60 main._Cfunc_output+0x40 _cgo_gotypes.go:70 227 | # 0x4394f78 main.leak.func1.1+0x48 /Users/xargin/go/src/github.com/mosn/holmes/example/thread_trigger.go:45 228 | 229 | 1 @ 0x4038160 0x40317ca 0x4030d35 0x40c6555 0x40c8db4 0x40c8d96 0x41a8f92 0x41c2a52 0x41c1894 0x42d00cd 0x42cfe17 0x4394c57 0x4394c20 0x4037d82 0x40677d1 230 | # 0x4030d34 internal/poll.runtime_pollWait+0x54 /Users/xargin/sdk/go1.14.2/src/runtime/netpoll.go:203 231 | # 0x40c6554 internal/poll.(*pollDesc).wait+0x44 /Users/xargin/sdk/go1.14.2/src/internal/poll/fd_poll_runtime.go:87 232 | # 0x40c8db3 internal/poll.(*pollDesc).waitRead+0x1d3 /Users/xargin/sdk/go1.14.2/src/internal/poll/fd_poll_runtime.go:92 233 | # 0x40c8d95 internal/poll.(*FD).Accept+0x1b5 /Users/xargin/sdk/go1.14.2/src/internal/poll/fd_unix.go:384 234 | # 0x41a8f91 net.(*netFD).accept+0x41 /Users/xargin/sdk/go1.14.2/src/net/fd_unix.go:238 235 | # 0x41c2a51 net.(*TCPListener).accept+0x31 /Users/xargin/sdk/go1.14.2/src/net/tcpsock_posix.go:139 236 | # 0x41c1893 net.(*TCPListener).Accept+0x63 
/Users/xargin/sdk/go1.14.2/src/net/tcpsock.go:261 237 | # 0x42d00cc net/http.(*Server).Serve+0x25c /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2901 238 | # 0x42cfe16 net/http.(*Server).ListenAndServe+0xb6 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:2830 239 | # 0x4394c56 net/http.ListenAndServe+0x96 /Users/xargin/sdk/go1.14.2/src/net/http/server.go:3086 240 | # 0x4394c1f main.main+0x5f /Users/xargin/go/src/github.com/mosn/holmes/example/thread_trigger.go:55 241 | # 0x4037d81 runtime.main+0x211 /Users/xargin/sdk/go1.14.2/src/runtime/proc.go:203 242 | 243 | 1 @ 0x4038160 0x4055bea 0x4394ead 0x40677d1 244 | # 0x4055be9 time.Sleep+0xb9 /Users/xargin/sdk/go1.14.2/src/runtime/time.go:188 245 | # 0x4394eac main.init.0.func1+0x1dc /Users/xargin/go/src/github.com/mosn/holmes/example/thread_trigger.go:34 246 | 247 | 1 @ 0x43506d5 0x43504f0 0x434d28a 0x4391872 0x43914cf 0x43902c2 0x40677d1 248 | # 0x43506d4 runtime/pprof.writeRuntimeProfile+0x94 /Users/xargin/sdk/go1.14.2/src/runtime/pprof/pprof.go:694 249 | # 0x43504ef runtime/pprof.writeGoroutine+0x9f /Users/xargin/sdk/go1.14.2/src/runtime/pprof/pprof.go:656 250 | # 0x434d289 runtime/pprof.(*Profile).WriteTo+0x3d9 /Users/xargin/sdk/go1.14.2/src/runtime/pprof/pprof.go:329 251 | # 0x4391871 github.com/mosn/holmes.(*Holmes).threadProfile+0x2e1 /Users/xargin/go/src/github.com/mosn/holmes/holmes.go:260 252 | # 0x43914ce github.com/mosn/holmes.(*Holmes).threadCheckAndDump+0x9e /Users/xargin/go/src/github.com/mosn/holmes/holmes.go:241 253 | # 0x43902c1 github.com/mosn/holmes.(*Holmes).startDumpLoop+0x571 /Users/xargin/go/src/github.com/mosn/holmes/holmes.go:158 254 | ``` 255 | 256 | So we know that the threads are blocked by cgo calls. 
257 | -------------------------------------------------------------------------------- /doc/zh.md: -------------------------------------------------------------------------------- 1 | 2 | * [holmes](#holmes) 3 | * [设计](#设计) 4 | * [如何使用](#如何使用) 5 | * [Dump Goroutine profile](#dump-goroutine-profile) 6 | * [Dump cpu profile](#dump-cpu-profile) 7 | * [Dump Heap Memory Profile](#dump-heap-memory-profile) 8 | * [基于Gc周期的Heap Memory Dump](#基于gc周期的heap-memory-dump) 9 | * [动态设置holmes配置](#动态设置holmes配置) 10 | * [Dump事件上报](#dump事件上报) 11 | * [开启全部](#开启全部) 12 | * [在docker 或者cgroup环境下运行 holmes](#在docker-或者cgroup环境下运行-holmes) 13 | * [已知风险](#已知风险) 14 | * [使用示例](#使用示例) 15 | 16 | # holmes 17 | 18 | 基于规则的自动Golang Profile Dumper. 19 | 20 | 作为一名"懒惰"的程序员,如何避免在线上Golang系统半夜宕机 21 | (一般是OOM导致的)时起床保存现场呢?又或者如何dump压测时性能尖刺时刻的profile文件呢? 22 | 23 | holmes 或许能帮助您解决以上问题。 24 | 25 | ## 设计 26 | 27 | holmes 每隔一段时间收集一次以下应用指标: 28 | 29 | * 协程数,通过`runtime.NumGoroutine`。 30 | * 当前应用所占用的RSS,通过[gopsutil](https://github.com/shirou/gopsutil)第三方库。 31 | * CPU使用率,比如8C的机器,如果使用了4C,则使用率为50%,通过[gopsutil](https://github.com/shirou/gopsutil)第三方库。 32 | 33 | 除此之外,holmes还会根据Gc周期收集RSS指标,如果您开启了`GCheap dump`的话。 34 | 35 | 在预热阶段(应用启动后,holmes会收集十次指标)结束后,holmes会比较当前指标是否满足用户所设置的阈值/规则,如果满足的话,则dump profile, 36 | 以日志或者二进制文件的格式保留现场。 37 | 38 | ## 如何使用 39 | 40 | ```shell 41 | go get mosn.io/holmes 42 | ``` 43 | 在应用初始化逻辑加上对应的holmes配置。 44 | ```go 45 | func main() { 46 | 47 | h := initHolmes() 48 | 49 | // start the metrics collect and dump loop 50 | h.Start() 51 | ...... 
52 | 53 | // quit the application and stop the dumper 54 | h.Stop() 55 | } 56 | func initHolmes() *holmes.Holmes{ 57 | h, _ := holmes.New( 58 | holmes.WithCollectInterval("5s"), 59 | holmes.WithDumpPath("/tmp"), 60 | holmes.WithCPUDump(20, 25, 80, time.Minute), 61 | holmes.WithCPUMax(90), 62 | ) 63 | h.EnableCPUDump() 64 | return h 65 | } 66 | 67 | ``` 68 | 69 | holmes 支持对以下几种应用指标进行监控: 70 | 71 | * mem: 内存分配 72 | * cpu: cpu使用率 73 | * thread: 线程数 74 | * goroutine: 协程数 75 | * gcHeap: 基于GC周期的内存分配 76 | 77 | 78 | ### Dump Goroutine profile 79 | 80 | ```go 81 | h, _ := holmes.New( 82 | holmes.WithCollectInterval("5s"), 83 | holmes.WithDumpPath("/tmp"), 84 | holmes.WithTextDump(), 85 | holmes.WithDumpToLogger(true), 86 | holmes.WithGoroutineDump(10, 25, 2000, 10*1000, time.Minute), 87 | ) 88 | h.EnableGoroutineDump() 89 | 90 | // start the metrics collect and dump loop 91 | h.Start() 92 | 93 | // stop the dumper 94 | h.Stop() 95 | ``` 96 | 97 | * WithCollectInterval("5s") 每5s采集一次当前应用的各项指标,该值建议设置为大于1s。 98 | * WithDumpPath("/tmp") profile文件保存路径。 99 | * WithTextDump() 以文本格式保存profile内容。 100 | * WithDumpToLogger() profile内容将会输出到日志。 101 | * WithGoroutineDump(10, 25, 2000, 10*1000, time.Minute) 当goroutine指标满足以下条件时,将会触发dump操作。 102 | current_goroutine_num > `10` && current_goroutine_num < `10*1000` && 103 | current_goroutine_num > `125`% * previous_average_goroutine_num or current_goroutine_num > `2000`. 
104 | `time.Minute` 是两次dump操作之间最小时间间隔,避免频繁profiling对性能产生的影响。 105 | 106 | > WithGoroutineDump(min int, diff int, abs int, max int, coolDown time.Duration) 107 | > 当应用所启动的goroutine number大于`Max` 时,holmes会跳过dump操作,因为当goroutine number很大时, 108 | > dump goroutine profile操作成本很高(STW && dump),有可能拖垮应用。当`Max`=0 时代表没有限制。 109 | 110 | ### Dump cpu profile 111 | 112 | ```go 113 | h, _ := holmes.New( 114 | holmes.WithCollectInterval("5s"), 115 | holmes.WithDumpPath("/tmp"), 116 | holmes.WithCPUDump(20, 25, 80, time.Minute), 117 | holmes.WithCPUMax(90), 118 | ) 119 | h.EnableCPUDump() 120 | 121 | // start the metrics collect and dump loop 122 | h.Start() 123 | 124 | // stop the dumper 125 | h.Stop() 126 | ``` 127 | 128 | * WithCollectInterval("5s") 每5s采集一次当前应用的各项指标,该值建议设置为大于1s。 129 | * WithDumpPath("/tmp") profile文件保存路径。 130 | * cpu profile支持保存文件,不支持输出到日志中,所以WithBinaryDump()和 WithTextDump()在这场景会失效。 131 | * WithCPUDump(10, 25, 80, time.Minute) 会在满足以下条件时dump profile cpu usage > `10%` && 132 | cpu usage > `125%` * previous cpu usage recorded or cpu usage > `80%`. 
133 | `time.Minute` 是两次dump操作之间最小时间间隔,避免频繁profiling对性能产生的影响。 134 | * WithCPUMax 当cpu使用率大于`Max`, holmes会跳过dump操作,以防拖垮系统。 135 | 136 | ### Dump Heap Memory Profile 137 | 138 | ```go 139 | h, _ := holmes.New( 140 | holmes.WithCollectInterval("5s"), 141 | holmes.WithDumpPath("/tmp"), 142 | holmes.WithTextDump(), 143 | holmes.WithMemDump(30, 25, 80, time.Minute), 144 | ) 145 | 146 | h.EnableMemDump() 147 | 148 | // start the metrics collect and dump loop 149 | h.Start() 150 | 151 | // stop the dumper 152 | h.Stop() 153 | ``` 154 | * WithCollectInterval("5s") 每5s采集一次当前应用的各项指标,该值建议设置为大于1s。 155 | * WithDumpPath("/tmp") profile文件保存路径。 156 | * WithTextDump() profile的内容将会输出到日志中。 157 | * WithMemDump(30, 25, 80, time.Minute) 会在满足以下条件时抓取heap profile memory usage > `30%` && 158 | memory usage > `125%` * previous memory usage or memory usage > `80%`, 159 | `time.Minute` 是两次dump操作之间最小时间间隔,避免频繁profiling对性能产生的影响。 160 | 161 | ### 基于Gc周期的Heap Memory Dump 162 | 163 | 在一些场景下,我们无法通过定时的`Memory Dump`保留到现场, 比如应用在一个`CollectInterval`周期内分配了大量内存, 164 | 又快速回收了它们,此时`holmes`在周期前后的采集到内存使用率没有产生过大波动,与实际情况不符。为了解决这种情况,`holmes`开发了基于GC周期的 165 | `Profile`类型,它会在内存使用率飙高的前后两个GC周期内各dump一次profile,然后开发人员可以使用`pprof --base`命令去对比 166 | 两个时刻堆内存之间的差异。 [具体实现介绍](https://uncledou.site/2022/go-pprof-heap/)。 167 | 168 | ```go 169 | h, _ := holmes.New( 170 | holmes.WithDumpPath("/tmp"), 171 | holmes.WithLogger(holmes.NewFileLog("/tmp/holmes.log", mlog.INFO)), 172 | holmes.WithBinaryDump(), 173 | holmes.WithMemoryLimit(100*1024*1024), // 100MB 174 | holmes.WithGCHeapDump(10, 20, 40, time.Minute), 175 | // holmes.WithProfileReporter(reporter), 176 | ) 177 | h.EnableGCHeapDump().Start() 178 | time.Sleep(time.Hour) 179 | ``` 180 | 181 | ### 动态设置holmes配置 182 | 183 | 您可以通过`Set`在系统运行时更新holmes的配置。它的使用十分简单,和初始化时的`New`方法一样。 184 | 185 | ```go 186 | h.Set( 187 | WithCollectInterval("2s"), 188 | WithGoroutineDump(10, 10, 50, 90, time.Minute)) 189 | ``` 190 | 191 | ### Dump事件上报 192 | 193 | 您可以通过实现`Reporter` 来实现以下功能: 194 | * 
发送包含现场的告警信息,当`holmes`触发`Dump`操作时。 195 | * 将`Profiles`上传到其他地方,以防实例被销毁,从而导致profile丢失,或进行分析。 196 | 197 | ```go 198 | type ReporterImpl struct{} 199 | func (r *ReporterImpl) Report(pType string, filename string, reason ReasonType, eventID string, sampleTime time.Time, pprofBytes []byte, scene Scene) error{ 200 | // do something 201 | } 202 | ...... 203 | r := &ReporterImpl{} // an implementation of the holmes.ProfileReporter interface. 204 | h, _ := holmes.New( 205 | holmes.WithProfileReporter(r), 206 | holmes.WithDumpPath("/tmp"), 207 | holmes.WithLogger(holmes.NewFileLog("/tmp/holmes.log", mlog.INFO)), 208 | holmes.WithBinaryDump(), 209 | holmes.WithMemoryLimit(100*1024*1024), // 100MB 210 | holmes.WithGCHeapDump(10, 20, 40, time.Minute), 211 | ) 212 | 213 | ``` 214 | 215 | ### 开启全部 216 | 217 | holmes当然不是只支持一个类型的dump啦,您可以按需选择您需要的dump类型。 218 | 219 | ```go 220 | h, _ := holmes.New( 221 | holmes.WithCollectInterval("5s"), 222 | holmes.WithDumpPath("/tmp"), 223 | holmes.WithTextDump(), 224 | 225 | holmes.WithCPUDump(10, 25, 80, time.Minute), 226 | holmes.WithMemDump(30, 25, 80, time.Minute), 227 | holmes.WithGCHeapDump(10, 20, 40, time.Minute), 228 | holmes.WithGoroutineDump(500, 25, 20000, 0, time.Minute), 229 | ) 230 | 231 | h.EnableCPUDump(). 232 | EnableGoroutineDump(). 233 | EnableMemDump(). 234 | EnableGCHeapDump().Start() 235 | 236 | ``` 237 | 238 | ### 在docker 或者cgroup环境下运行 holmes 239 | 240 | ```go 241 | h, _ := holmes.New( 242 | holmes.WithCollectInterval("5s"), 243 | holmes.WithDumpPath("/tmp"), 244 | holmes.WithTextDump(), 245 | 246 | holmes.WithCPUDump(10, 25, 80, time.Minute), 247 | holmes.WithCGroup(true), // set cgroup to true 248 | ) 249 | ``` 250 | 251 | ## 已知风险 252 | Goroutine dump 会导致 STW,[从而导致时延](https://github.com/golang/go/issues/33250)。 253 | > 目前Go官方已经有一个[CL](https://go-review.googlesource.com/c/go/+/387415/)在优化这个问题了。 254 | 255 | ## 使用示例 256 | [点击这里](./example.md) 257 | 258 | ## Contributing 259 | See our [contributor guide](../CONTRIBUTING.md). 
-------------------------------------------------------------------------------- /example/1gbslice/1gbslice.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package main 19 | 20 | import ( 21 | mlog "mosn.io/pkg/log" 22 | "net/http" 23 | "time" 24 | 25 | "mosn.io/holmes" 26 | ) 27 | 28 | // run `curl http://localhost:10003/make1gb` after 15s (warm up) 29 | func init() { 30 | http.HandleFunc("/make1gb", make1gbslice) 31 | go http.ListenAndServe(":10003", nil) //nolint:errcheck 32 | } 33 | 34 | func main() { 35 | h, _ := holmes.New( 36 | holmes.WithCollectInterval("2s"), 37 | holmes.WithDumpPath("./tmp"), 38 | holmes.WithLogger(holmes.NewFileLog("./tmp/holmes.log", mlog.DEBUG)), 39 | holmes.WithTextDump(), 40 | holmes.WithMemDump(3, 25, 80, time.Minute), 41 | ) 42 | h.EnableMemDump().Start() 43 | time.Sleep(time.Hour) 44 | } 45 | 46 | func make1gbslice(wr http.ResponseWriter, req *http.Request) { 47 | var a = make([]byte, 1073741824) 48 | _ = a 49 | } 50 | -------------------------------------------------------------------------------- /example/alloc/alloc.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package main 19 | 20 | import ( 21 | "fmt" 22 | "net/http" 23 | "time" 24 | 25 | mlog "mosn.io/pkg/log" 26 | 27 | "mosn.io/holmes" 28 | ) 29 | 30 | // run `curl http://localhost:10003/alloc` after 15s (warm up) 31 | func init() { 32 | http.HandleFunc("/alloc", alloc) 33 | go http.ListenAndServe(":10003", nil) //nolint:errcheck 34 | } 35 | 36 | func main() { 37 | h, _ := holmes.New( 38 | holmes.WithCollectInterval("2s"), 39 | holmes.WithDumpPath("./tmp"), 40 | holmes.WithLogger(holmes.NewFileLog("./tmp/holmes.log", mlog.DEBUG)), 41 | holmes.WithTextDump(), 42 | holmes.WithMemDump(3, 25, 80, time.Minute), 43 | ) 44 | h.EnableMemDump().Start() 45 | time.Sleep(time.Hour) 46 | } 47 | 48 | func alloc(wr http.ResponseWriter, req *http.Request) { 49 | var m = make(map[string]string, 1073741824) 50 | for i := 0; i < 1000; i++ { 51 | m[fmt.Sprint(i)] = fmt.Sprint(i) 52 | } 53 | _ = m 54 | } 55 | -------------------------------------------------------------------------------- /example/channelblock/channelblock.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package main 19 | 20 | import ( 21 | mlog "mosn.io/pkg/log" 22 | "net/http" 23 | "time" 24 | 25 | "mosn.io/holmes" 26 | ) 27 | 28 | // run `curl http://localhost:10003/chanblock` after 15s (warm up) 29 | func init() { 30 | http.HandleFunc("/chanblock", channelBlock) 31 | go http.ListenAndServe(":10003", nil) //nolint:errcheck 32 | } 33 | 34 | func main() { 35 | h, _ := holmes.New( 36 | holmes.WithCollectInterval("5s"), 37 | holmes.WithDumpPath("./tmp"), 38 | holmes.WithLogger(holmes.NewFileLog("./tmp/holmes.log", mlog.DEBUG)), 39 | holmes.WithTextDump(), 40 | holmes.WithGoroutineDump(10, 25, 2000, 10000, time.Minute), 41 | ) 42 | h.EnableGoroutineDump().Start() 43 | time.Sleep(time.Hour) 44 | } 45 | 46 | var nilCh chan int 47 | 48 | func channelBlock(wr http.ResponseWriter, req *http.Request) { 49 | nilCh <- 1 50 | } 51 | -------------------------------------------------------------------------------- /example/cpu_explode/cpu_explode.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package main 19 | 20 | import ( 21 | "net/http" 22 | "time" 23 | 24 | mlog "mosn.io/pkg/log" 25 | 26 | "mosn.io/holmes" 27 | ) 28 | 29 | // run `curl http://localhost:10003/cpuex` after 15s (warm up) 30 | func init() { 31 | http.HandleFunc("/cpuex", cpuex) 32 | go http.ListenAndServe(":10003", nil) //nolint:errcheck 33 | } 34 | 35 | func main() { 36 | h, _ := holmes.New( 37 | holmes.WithCollectInterval("2s"), 38 | holmes.WithDumpPath("./tmp"), 39 | holmes.WithLogger(holmes.NewFileLog("./tmp/holmes.log", mlog.DEBUG)), 40 | holmes.WithCPUDump(20, 25, 80, time.Minute), 41 | ) 42 | h.EnableCPUDump().Start() 43 | time.Sleep(time.Hour) 44 | } 45 | 46 | func cpuex(wr http.ResponseWriter, req *http.Request) { 47 | go func() { 48 | for { 49 | 50 | } 51 | }() 52 | } 53 | -------------------------------------------------------------------------------- /example/deadlock/deadlock.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package main 19 | 20 | import ( 21 | mlog "mosn.io/pkg/log" 22 | "net/http" 23 | "sync" 24 | "time" 25 | 26 | "mosn.io/holmes" 27 | ) 28 | 29 | // run `curl http://localhost:10003/lockorder1` after 15s (warm up) 30 | // run `curl http://localhost:10003/lockorder2` after 15s (warm up) 31 | // run `curl http://localhost:10003/req` after 15s (warm up) 32 | func init() { 33 | http.HandleFunc("/lockorder1", lockorder1) 34 | http.HandleFunc("/lockorder2", lockorder2) 35 | http.HandleFunc("/req", req) 36 | go http.ListenAndServe(":10003", nil) //nolint:errcheck 37 | } 38 | 39 | func main() { 40 | h, _ := holmes.New( 41 | holmes.WithCollectInterval("5s"), 42 | holmes.WithDumpPath("./tmp"), 43 | holmes.WithLogger(holmes.NewFileLog("./tmp/holmes.log", mlog.DEBUG)), 44 | holmes.WithTextDump(), 45 | holmes.WithGoroutineDump(10, 25, 2000, 10000, time.Minute), 46 | ) 47 | h.EnableGoroutineDump().Start() 48 | time.Sleep(time.Hour) 49 | } 50 | 51 | var l1 sync.Mutex 52 | var l2 sync.Mutex 53 | 54 | func req(wr http.ResponseWriter, req *http.Request) { 55 | l1.Lock() 56 | defer l1.Unlock() 57 | } 58 | 59 | func lockorder1(wr http.ResponseWriter, req *http.Request) { 60 | l1.Lock() 61 | defer l1.Unlock() 62 | 63 | time.Sleep(time.Minute) 64 | 65 | l2.Lock() 66 | defer l2.Unlock() 67 | } 68 | 69 | func lockorder2(wr http.ResponseWriter, req *http.Request) { 70 | l2.Lock() 71 | defer l2.Unlock() 72 | 73 | time.Sleep(time.Minute) 74 | 75 | l1.Lock() 76 | defer l1.Unlock() 77 | } 78 | -------------------------------------------------------------------------------- /example/deadloop/deadloop.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package main 19 | 20 | import ( 21 | mlog "mosn.io/pkg/log" 22 | "net/http" 23 | "time" 24 | 25 | "mosn.io/holmes" 26 | ) 27 | 28 | // run `curl http://localhost:10003/deadloop` after 15s (warm up) 29 | func init() { 30 | http.HandleFunc("/deadloop", deadloop) 31 | go http.ListenAndServe(":10003", nil) //nolint:errcheck 32 | } 33 | 34 | func main() { 35 | h, _ := holmes.New( 36 | holmes.WithCollectInterval("2s"), 37 | holmes.WithDumpPath("./tmp"), 38 | holmes.WithLogger(holmes.NewFileLog("./tmp/holmes.log", mlog.DEBUG)), 39 | holmes.WithCPUDump(10, 25, 80, time.Minute), 40 | ) 41 | h.EnableCPUDump().Start() 42 | time.Sleep(time.Hour) 43 | } 44 | 45 | func deadloop(wr http.ResponseWriter, req *http.Request) { 46 | for { 47 | select { 48 | case <-req.Context().Done(): 49 | break 50 | default: 51 | time.Sleep(time.Millisecond) 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /example/gcheap/.gitignore: -------------------------------------------------------------------------------- 1 | gcheap 2 | vendor 3 | -------------------------------------------------------------------------------- /example/gcheap/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## How to test 3 | 4 | 1. 
change to the current gcheap directory 5 | ``` 6 | cd /path/to/gcheap/ 7 | ``` 8 | 9 | 2. compile 10 | ``` 11 | go build gcheap.go 12 | ``` 13 | 14 | 3. start the sample http server with the gctrace enabled. 15 | ``` 16 | GODEBUG=gctrace=1 ./gcheap 17 | ``` 18 | 19 | 4. start the rand allocation case 20 | ``` 21 | ./rand.sh 22 | ``` 23 | 24 | The `/rand` API allocates some memory that is recycled soon, 25 | so the internal `heapMarked` stays at about 10 MB and the GC goal stays at about 21 MB. 26 | 27 | We can see the following gctrace log on stdout: 28 | ``` 29 | gc 28 @11.666s 0%: 0.17+0.19+0.017 ms clock, 2.0+0.085/0.19/0.13+0.20 ms cpu, 20->20->10 MB, 21 MB goal, 12 P 30 | gc 29 @12.121s 0%: 0.065+0.21+0.015 ms clock, 0.78+0.11/0.23/0.13+0.18 ms cpu, 20->20->10 MB, 21 MB goal, 12 P 31 | ``` 32 | 33 | Also, we can see the following holmes log in ./tmp/holmes.log: 34 | ``` 35 | [Holmes] NODUMP GCHeap, config_min : 10, config_diff : 20, config_abs : 40, config_max : 0, previous : [10 10 10 10 10 10 10 10 10 10], current: 10 36 | ``` 37 | 38 | Everything works well now. 39 | 40 | 5. memory spike 41 | ```bash 42 | curl http://localhost:10024/spike 43 | ``` 44 | 45 | The `/spike` API allocates 10 MB of memory and keeps it alive for a while. 
46 | 47 | We can see from the gctrace log that the GC goal increased: 48 | ``` 49 | gc 432 @191.430s 0%: 0.14+0.40+0.004 ms clock, 1.7+0.26/0.41/0.52+0.051 ms cpu, 21->22->19 MB, 22 MB goal, 12 P 50 | gc 433 @192.079s 0%: 0.042+0.22+0.002 ms clock, 0.51+0.097/0.38/0.21+0.024 ms cpu, 37->37->10 MB, 38 MB goal, 12 P 51 | ``` 52 | 53 | And the holmes log shows that two profiles were dumped: 54 | ``` 55 | [2022-02-09 14:48:23.103][Holmes] pprof GCHeap, config_min : 10, config_diff : 20, config_abs : 40, config_max : 0, previous : [10 10 10 10 10 10 10 10 10 19], current: 19 56 | [2022-02-09 14:48:23.751][Holmes] pprof GCHeap, config_min : 10, config_diff : 20, config_abs : 40, config_max : 0, previous : [10 10 10 10 10 10 10 10 10 19], current: 10 57 | ``` 58 | 59 | 6. generate flamegraph 60 | 61 | We can find out exactly what caused the GC goal to increase by using the following command 62 | (the profile names are derived from the timestamps in the holmes log): 63 | ``` 64 | go tool pprof -http=:8000 -base GCHeap.20220209144823.103.bin GCHeap.20220209144823.751.bin 65 | ``` 66 | 67 | It clearly shows the reason for the memory spike. 68 | 69 | ![memory spike](memory-spike.png) -------------------------------------------------------------------------------- /example/gcheap/gcheap.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package main 19 | 20 | import ( 21 | "fmt" 22 | 23 | mlog "mosn.io/pkg/log" 24 | 25 | //"mosn.io/holmes/reporters/http_reporter" 26 | "math/rand" 27 | "net/http" 28 | "time" 29 | 30 | "mosn.io/holmes" 31 | ) 32 | 33 | // run `curl http://localhost:10024/rand` after 15s (warm up) 34 | // run `curl http://localhost:10024/spike` after 15s (warm up) 35 | func init() { 36 | http.HandleFunc("/rand", randAlloc) 37 | http.HandleFunc("/spike", spikeAlloc) 38 | go http.ListenAndServe(":10024", nil) 39 | } 40 | 41 | func main() { 42 | // reporter := http_reporter.NewReporter("TOKEN", "URL") 43 | h, _ := holmes.New( 44 | holmes.WithDumpPath("./tmp"), 45 | holmes.WithLogger(holmes.NewFileLog("./tmp/holmes.log", mlog.DEBUG)), 46 | holmes.WithBinaryDump(), 47 | holmes.WithMemoryLimit(100*1024*1024), // 100MB 48 | holmes.WithGCHeapDump(10, 20, 40, time.Minute), 49 | // holmes.WithProfileReporter(reporter), 50 | ) 51 | h.EnableGCHeapDump().Start() 52 | time.Sleep(time.Hour) 53 | } 54 | 55 | var ( 56 | base = make([]byte, 1024*1024*10) // 10 MB long live memory. 
57 | ) 58 | 59 | func randAlloc(wr http.ResponseWriter, req *http.Request) { 60 | var s = make([][]byte, 0) // short live 61 | for i := 0; i < 1024; i++ { 62 | len := rand.Intn(1024) 63 | bytes := make([]byte, len) 64 | 65 | s = append(s, bytes) 66 | 67 | if len == 0 { 68 | s = make([][]byte, 0) 69 | } 70 | } 71 | time.Sleep(time.Millisecond * 10) 72 | fmt.Fprintf(wr, "slice current length: %v\n", len(s)) 73 | } 74 | 75 | func spikeAlloc(wr http.ResponseWriter, req *http.Request) { 76 | var s = make([][]byte, 0, 1024) // spike, 10MB 77 | for i := 0; i < 10; i++ { 78 | bytes := make([]byte, 1024*1024) 79 | s = append(s, bytes) 80 | } 81 | // live for a while 82 | time.Sleep(time.Millisecond * 500) 83 | fmt.Fprintf(wr, "spike slice length: %v\n", len(s)) 84 | } 85 | -------------------------------------------------------------------------------- /example/gcheap/go.mod: -------------------------------------------------------------------------------- 1 | module example.com/m 2 | 3 | go 1.17 4 | 5 | require ( 6 | mosn.io/holmes v0.0.0-20220125114618-8cb365eb42ac 7 | mosn.io/pkg v0.0.0-20220308091858-ea728aacbe63 8 | ) 9 | 10 | require ( 11 | github.com/StackExchange/wmi v0.0.0-20190523213315-cbe66965904d // indirect 12 | github.com/go-ole/go-ole v1.2.4 // indirect 13 | github.com/hashicorp/go-syslog v1.0.0 // indirect 14 | github.com/rcrowley/go-metrics v0.0.0-20200313005456-10cdbea86bc0 // indirect 15 | github.com/shirou/gopsutil v3.20.11+incompatible // indirect 16 | golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e // indirect 17 | gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect 18 | mosn.io/api v0.0.0-20210204052134-5b9a826795fd // indirect 19 | ) 20 | 21 | replace mosn.io/holmes => ../../ 22 | -------------------------------------------------------------------------------- /example/gcheap/memory-spike.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mosn/holmes/efb1a7768843e83b645f6e683f7b6c5d826651ab/example/gcheap/memory-spike.png -------------------------------------------------------------------------------- /example/gcheap/rand.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | while true; do 6 | curl 'http://localhost:10024/rand' 7 | done 8 | -------------------------------------------------------------------------------- /example/pyroscope_rideshare/README.md: -------------------------------------------------------------------------------- 1 | 2 | Enable holmes as a pyroscope client that reports pprof 3 | events to the pyroscope server. 4 | 5 | Note: do NOT set TextDump while using holmes as a pyroscope client, 6 | because pyroscope needs profiles in proto format. 7 | 8 | Step 1 9 | 10 | ``docker run -it -p 4040:4040 pyroscope/pyroscope:latest server`` 11 | 12 | Open a browser on the [pyroscope admin page](http://localhost:4040/) 13 | 14 | Step 2 15 | Run the script `start_client.sh` in `rideshare/` 16 | 17 | Step 3 18 | Wait 15 seconds, refresh the pyroscope admin page, and select 19 | `holmes-client` in the `Application` box as shown below. 
20 | ![admin](./admin.png) 21 | 22 | -------------------------------------------------------------------------------- /example/pyroscope_rideshare/admin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mosn/holmes/efb1a7768843e83b645f6e683f7b6c5d826651ab/example/pyroscope_rideshare/admin.png -------------------------------------------------------------------------------- /example/pyroscope_rideshare/bike/bike.go: -------------------------------------------------------------------------------- 1 | package bike 2 | 3 | import "rideshare/utility" 4 | 5 | func OrderBike(searchRadius int64) { 6 | utility.FindNearestVehicle(searchRadius, "bike") 7 | for i := 0; i < 3; i++ { 8 | go utility.AllocMem() 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /example/pyroscope_rideshare/car/car.go: -------------------------------------------------------------------------------- 1 | package car 2 | 3 | import ( 4 | "rideshare/utility" 5 | ) 6 | 7 | func OrderCar(searchRadius int64) { 8 | utility.FindNearestVehicle(searchRadius, "car") 9 | } 10 | -------------------------------------------------------------------------------- /example/pyroscope_rideshare/go.mod: -------------------------------------------------------------------------------- 1 | module rideshare 2 | 3 | go 1.14 4 | 5 | require mosn.io/holmes v1.1.0 6 | 7 | replace mosn.io/holmes => ../../ 8 | -------------------------------------------------------------------------------- /example/pyroscope_rideshare/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "os" 7 | "time" 8 | 9 | "mosn.io/holmes" 10 | "mosn.io/holmes/reporters/pyroscope_reporter" 11 | "rideshare/bike" 12 | "rideshare/car" 13 | "rideshare/scooter" 14 | ) 15 | 16 | func bikeRoute(w http.ResponseWriter, r *http.Request) { 17 | 
bike.OrderBike(1) 18 | w.Write([]byte("

Bike ordered

")) 19 | } 20 | 21 | func scooterRoute(w http.ResponseWriter, r *http.Request) { 22 | scooter.OrderScooter(2) 23 | w.Write([]byte("

Scooter ordered

")) 24 | } 25 | 26 | func carRoute(w http.ResponseWriter, r *http.Request) { 27 | car.OrderCar(3) 28 | 29 | w.Write([]byte("

Car ordered

")) 30 | } 31 | 32 | func index(w http.ResponseWriter, r *http.Request) { 33 | result := "

environment vars:

" 34 | for _, env := range os.Environ() { 35 | result += env + "
" 36 | } 37 | w.Write([]byte(result)) 38 | } 39 | 40 | var h *holmes.Holmes 41 | 42 | func InitHolmes() { 43 | fmt.Println("holmes initialing") 44 | h, _ = holmes.New( 45 | holmes.WithCollectInterval("1s"), 46 | holmes.WithDumpPath("./log/"), 47 | // can not set text in pyroscope client 48 | ) 49 | fmt.Println("holmes initial success") 50 | h. 51 | EnableCPUDump(). 52 | EnableGoroutineDump(). 53 | EnableMemDump(). 54 | Start() 55 | time.Sleep(11 * time.Second) 56 | fmt.Println("on running") 57 | } 58 | 59 | func main() { 60 | InitHolmes() 61 | region := os.Getenv("region") 62 | port := os.Getenv("port") 63 | fmt.Printf("region is %v port is %v \n", region, port) 64 | cfg := pyroscope_reporter.RemoteConfig{ 65 | //AuthToken: "", 66 | //UpstreamThreads: 4, 67 | UpstreamAddress: "http://localhost:4040", 68 | UpstreamRequestTimeout: 3 * time.Second, 69 | } 70 | 71 | tags := map[string]string{ 72 | "region": region, 73 | } 74 | 75 | pReporter, err := pyroscope_reporter.NewPyroscopeReporter("holmes-client", tags, cfg, holmes.NewStdLogger()) 76 | if err != nil { 77 | fmt.Printf("NewPyroscopeReporter error %v\n", err) 78 | return 79 | } 80 | 81 | err = h.Set( 82 | holmes.WithProfileReporter(pReporter), 83 | holmes.WithGoroutineDump(2, 2, 20, 90, 20*time.Second), 84 | holmes.WithCPUDump(2, 2, 80, 20*time.Second), 85 | holmes.WithMemDump(1, 2, 80, 20*time.Second), 86 | holmes.WithCollectInterval("5s"), 87 | ) 88 | if err != nil { 89 | fmt.Printf("fail to set opts on running time.\n") 90 | return 91 | } 92 | 93 | http.HandleFunc("/", index) 94 | http.HandleFunc("/bike", bikeRoute) 95 | http.HandleFunc("/scooter", scooterRoute) 96 | http.HandleFunc("/car", carRoute) 97 | err = http.ListenAndServe(":"+port, nil) 98 | if err != nil { 99 | panic(err) 100 | } 101 | 102 | time.Sleep(1 * time.Minute) 103 | 104 | } 105 | -------------------------------------------------------------------------------- /example/pyroscope_rideshare/requests.py: 
-------------------------------------------------------------------------------- 1 | import random 2 | import requests 3 | import time 4 | 5 | PORTS = [ 6 | '15011', 7 | '15012', 8 | '15013', 9 | ] 10 | 11 | VEHICLES = [ 12 | 'bike', 13 | 'scooter', 14 | 'car', 15 | ] 16 | 17 | if __name__ == "__main__": 18 | print(f"starting load generator") 19 | time.sleep(3) 20 | while True: 21 | port = PORTS[random.randint(0, len(PORTS) - 1)] 22 | vehicle = VEHICLES[random.randint(0, len(VEHICLES) - 1)] 23 | print(f"requesting {vehicle} from {port}") 24 | resp = requests.get(f'http://localhost:{port}/{vehicle}') 25 | print(f"received {resp}") 26 | time.sleep(random.uniform(0.2, 0.4)) 27 | -------------------------------------------------------------------------------- /example/pyroscope_rideshare/scooter/scooter.go: -------------------------------------------------------------------------------- 1 | package scooter 2 | 3 | import "rideshare/utility" 4 | 5 | func OrderScooter(searchRadius int64) { 6 | utility.FindNearestVehicle(searchRadius, "scooter") 7 | } 8 | -------------------------------------------------------------------------------- /example/pyroscope_rideshare/start_client.sh: -------------------------------------------------------------------------------- 1 | 2 | region=us-east;port=15011 go run main.go & 3 | region=eu-north;port=15012 go run main.go & 4 | region=ap-south;port=15013 go run main.go & 5 | 6 | echo "wait holmes client init" 7 | sleep 15s 8 | echo "init done, start to send request" 9 | 10 | python3 requests.py 11 | -------------------------------------------------------------------------------- /example/pyroscope_rideshare/utility/utility.go: -------------------------------------------------------------------------------- 1 | package utility 2 | 3 | import ( 4 | "os" 5 | "time" 6 | ) 7 | 8 | const durationConstant = time.Duration(200 * time.Millisecond) 9 | 10 | func mutexLock(n int64) { 11 | var i int64 = 0 12 | 13 | // start time is number of seconds 
since epoch 14 | startTime := time.Now() 15 | 16 | // This changes the amplitude of cpu bars 17 | for time.Since(startTime) < time.Duration(n*30)*durationConstant { 18 | i++ 19 | } 20 | } 21 | 22 | func checkDriverAvailability(n int64) { 23 | var i int64 = 0 24 | 25 | // start time is number of seconds since epoch 26 | startTime := time.Now() 27 | 28 | for time.Since(startTime) < time.Duration(n)*durationConstant { 29 | i++ 30 | } 31 | 32 | // Every other minute this will artificially create make requests.py in eu-north region slow 33 | // this is just for demonstration purposes to show how performance impacts show up in the 34 | // flamegraph 35 | force_mutex_lock := time.Now().Minute()%2 == 0 36 | if os.Getenv("REGION") == "eu-north" && force_mutex_lock { 37 | mutexLock(n) 38 | } 39 | 40 | } 41 | 42 | func FindNearestVehicle(searchRadius int64, vehicle string) { 43 | //pyroscope.TagWrapper(context.Background(), pyroscope.Labels("vehicle", vehicle), func(ctx context.Context) { 44 | var i int64 = 0 45 | 46 | startTime := time.Now() 47 | for time.Since(startTime) < time.Duration(searchRadius)*durationConstant { 48 | i++ 49 | } 50 | 51 | if vehicle == "car" { 52 | checkDriverAvailability(searchRadius) 53 | go func() { 54 | go AllocMem() 55 | }() 56 | } 57 | if vehicle == "bike" { 58 | for i := 1; i < 10; i++ { 59 | go func() { 60 | time.Sleep(15 * time.Second) 61 | }() 62 | } 63 | } 64 | //}) 65 | } 66 | 67 | func AllocMem() { 68 | var a = make([]byte, 1073741824) 69 | _ = a 70 | time.Sleep(10 * time.Second) 71 | } 72 | -------------------------------------------------------------------------------- /example/run_in_docker/run_in_docker.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package main 19 | 20 | import ( 21 | "net/http" 22 | "time" 23 | 24 | "mosn.io/holmes" 25 | ) 26 | 27 | func init() { 28 | http.HandleFunc("/docker", dockermake1gb) 29 | http.HandleFunc("/docker/cpu", cpuex) 30 | http.HandleFunc("/docker/cpu_multi_core", cpuMulticore) 31 | go http.ListenAndServe(":10003", nil) //nolint:errcheck 32 | } 33 | 34 | func main() { 35 | h, _ := holmes.New( 36 | holmes.WithCollectInterval("2s"), 37 | holmes.WithDumpPath("/tmp"), 38 | holmes.WithTextDump(), 39 | holmes.WithMemDump(3, 25, 80, time.Minute), 40 | holmes.WithCPUDump(60, 10, 80, time.Minute), 41 | holmes.WithCGroup(true), 42 | ) 43 | h.EnableCPUDump() 44 | h.EnableMemDump() 45 | h.Start() 46 | time.Sleep(time.Hour) 47 | } 48 | 49 | func cpuex(wr http.ResponseWriter, req *http.Request) { 50 | for { 51 | select { 52 | case <-req.Context().Done(): 53 | break 54 | default: 55 | time.Sleep(time.Millisecond) 56 | } 57 | } 58 | } 59 | 60 | func cpuMulticore(wr http.ResponseWriter, req *http.Request) { 61 | for i := 1; i <= 100; i++ { 62 | go func() { 63 | for { 64 | select { 65 | case <-req.Context().Done(): 66 | default: 67 | time.Sleep(time.Millisecond) 68 | } 69 | } 70 | }() 71 | } 72 | 73 | <-req.Context().Done() 74 | } 75 | 76 | func dockermake1gb(wr http.ResponseWriter, req *http.Request) { 77 | var a = make([]byte, 1073741824) 78 | _ = a 79 | } 80 | 
-------------------------------------------------------------------------------- /example/slowlyleak/slowlyleak.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package main 19 | 20 | import ( 21 | "net/http" 22 | "time" 23 | 24 | "mosn.io/holmes" 25 | ) 26 | 27 | func init() { 28 | http.HandleFunc("/leak", leak) 29 | go http.ListenAndServe(":10003", nil) //nolint:errcheck 30 | } 31 | 32 | func main() { 33 | h, _ := holmes.New( 34 | holmes.WithCollectInterval("2s"), 35 | holmes.WithDumpPath("/tmp"), 36 | holmes.WithTextDump(), 37 | holmes.WithGoroutineDump(10, 25, 80, 10000, time.Minute), 38 | ) 39 | h.EnableGoroutineDump().Start() 40 | time.Sleep(time.Hour) 41 | } 42 | 43 | func leak(wr http.ResponseWriter, req *http.Request) { 44 | taskChan := make(chan int) 45 | consumer := func() { 46 | for task := range taskChan { 47 | _ = task // do some tasks 48 | } 49 | } 50 | 51 | producer := func() { 52 | for i := 0; i < 10; i++ { 53 | taskChan <- i // generate some tasks 54 | } 55 | // forget to close the taskChan here 56 | } 57 | 58 | go consumer() 59 | go producer() 60 | } 61 | -------------------------------------------------------------------------------- /example/thread_trigger/thread_trigger.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package main 19 | 20 | /* 21 | #include 22 | #include 23 | #include 24 | void output(char *str) { 25 | sleep(10000); 26 | printf("%s\n", str); 27 | } 28 | */ 29 | import "C" 30 | import ( 31 | "fmt" 32 | "net/http" 33 | "time" 34 | "unsafe" 35 | 36 | _ "net/http/pprof" 37 | 38 | "mosn.io/holmes" 39 | ) 40 | 41 | func init() { 42 | go func() { 43 | h, _ := holmes.New( 44 | holmes.WithCollectInterval("2s"), 45 | holmes.WithDumpPath("/tmp"), 46 | holmes.WithTextDump(), 47 | holmes.WithThreadDump(10, 25, 100, time.Minute), 48 | ) 49 | h.EnableThreadDump().Start() 50 | time.Sleep(time.Hour) 51 | }() 52 | } 53 | 54 | func leak(wr http.ResponseWriter, req *http.Request) { 55 | go func() { 56 | for i := 0; i < 1000; i++ { 57 | go func() { 58 | str := "hello cgo" 59 | //change to char* 60 | cstr := C.CString(str) 61 | C.output(cstr) 62 | C.free(unsafe.Pointer(cstr)) 63 | 64 | }() 65 | } 66 | }() 67 | } 68 | 69 | func main() { 70 | http.HandleFunc("/leak", leak) 71 | err := http.ListenAndServe(":10003", nil) 72 | if err != nil { 73 | fmt.Println(err) 74 | return 75 | } 76 | select {} 77 | } 78 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module mosn.io/holmes 2 | 3 | go 1.14 4 | 5 | require ( 6 | github.com/gin-gonic/gin v1.7.7 7 | github.com/shirou/gopsutil v3.20.11+incompatible 8 | github.com/stretchr/testify v1.7.0 9 | mosn.io/pkg v1.6.0 10 | ) 11 | -------------------------------------------------------------------------------- /holmes.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package holmes 19 | 20 | import ( 21 | "bytes" 22 | "fmt" 23 | "io/ioutil" 24 | "runtime" 25 | "runtime/pprof" 26 | "sync" 27 | "sync/atomic" 28 | "time" 29 | ) 30 | 31 | // Holmes is a self-aware profile dumper. 32 | type Holmes struct { 33 | opts *options 34 | 35 | // stats 36 | collectCount int 37 | gcCycleCount int 38 | threadTriggerCount int 39 | cpuTriggerCount int 40 | memTriggerCount int 41 | grTriggerCount int 42 | gcHeapTriggerCount int 43 | shrinkThreadTriggerCount int 44 | 45 | // cooldown 46 | threadCoolDownTime time.Time 47 | cpuCoolDownTime time.Time 48 | memCoolDownTime time.Time 49 | gcHeapCoolDownTime time.Time 50 | grCoolDownTime time.Time 51 | shrinkThrCoolDownTime time.Time 52 | 53 | // GC heap triggered, need to dump next time. 54 | gcHeapTriggered bool 55 | 56 | // stats ring 57 | memStats ring 58 | cpuStats ring 59 | grNumStats ring 60 | threadStats ring 61 | gcHeapStats ring 62 | 63 | // switch 64 | stopped int64 65 | 66 | // lock Protect the following 67 | sync.Mutex 68 | // channel for GC sweep finalizer event 69 | gcEventsCh chan struct{} 70 | // profiler reporter channels 71 | rptEventsCh chan rptEvent 72 | } 73 | 74 | // New creates a holmes dumper. 
75 | func New(opts ...Option) (*Holmes, error) { 76 | holmes := &Holmes{ 77 | 78 | opts: newOptions(), 79 | stopped: 1, // Initialization should be off 80 | } 81 | 82 | for _, opt := range opts { 83 | if err := opt.apply(holmes.opts); err != nil { 84 | return nil, err 85 | } 86 | } 87 | 88 | return holmes, nil 89 | } 90 | 91 | // EnableThreadDump enables the goroutine dump. 92 | func (h *Holmes) EnableThreadDump() *Holmes { 93 | h.opts.threadOpts.Enable = true 94 | return h 95 | } 96 | 97 | // DisableThreadDump disables the goroutine dump. 98 | func (h *Holmes) DisableThreadDump() *Holmes { 99 | h.opts.threadOpts.Enable = false 100 | return h 101 | } 102 | 103 | // EnableGoroutineDump enables the goroutine dump. 104 | func (h *Holmes) EnableGoroutineDump() *Holmes { 105 | h.opts.grOpts.Enable = true 106 | return h 107 | } 108 | 109 | // DisableGoroutineDump disables the goroutine dump. 110 | func (h *Holmes) DisableGoroutineDump() *Holmes { 111 | h.opts.grOpts.Enable = false 112 | return h 113 | } 114 | 115 | // EnableCPUDump enables the CPU dump. 116 | func (h *Holmes) EnableCPUDump() *Holmes { 117 | h.opts.cpuOpts.Enable = true 118 | return h 119 | } 120 | 121 | // DisableCPUDump disables the CPU dump. 122 | func (h *Holmes) DisableCPUDump() *Holmes { 123 | h.opts.cpuOpts.Enable = false 124 | return h 125 | } 126 | 127 | // EnableMemDump enables the mem dump. 128 | func (h *Holmes) EnableMemDump() *Holmes { 129 | h.opts.memOpts.Enable = true 130 | return h 131 | } 132 | 133 | // DisableMemDump disables the mem dump. 134 | func (h *Holmes) DisableMemDump() *Holmes { 135 | h.opts.memOpts.Enable = false 136 | return h 137 | } 138 | 139 | // EnableGCHeapDump enables the GC heap dump. 140 | func (h *Holmes) EnableGCHeapDump() *Holmes { 141 | h.opts.gCHeapOpts.Enable = true 142 | return h 143 | } 144 | 145 | // DisableGCHeapDump disables the gc heap dump. 
146 | func (h *Holmes) DisableGCHeapDump() *Holmes { 147 | h.opts.gCHeapOpts.Enable = false 148 | return h 149 | } 150 | 151 | // EnableShrinkThread enables shrink thread 152 | func (h *Holmes) EnableShrinkThread() *Holmes { 153 | h.opts.ShrinkThrOptions.Enable = true 154 | return h 155 | } 156 | 157 | // DisableShrinkThread disables shrink thread 158 | func (h *Holmes) DisableShrinkThread() *Holmes { 159 | h.opts.ShrinkThrOptions.Enable = false 160 | return h 161 | } 162 | 163 | func finalizerCallback(gc *gcHeapFinalizer) { 164 | defer func() { 165 | if r := recover(); r != nil { 166 | gc.h.Errorf("Panic in finalizer callback: %v", r) 167 | } 168 | }() 169 | // disable or stop gc clean up normally 170 | if atomic.LoadInt64(&gc.h.stopped) == 1 { 171 | return 172 | } 173 | 174 | // register the finalizer again 175 | runtime.SetFinalizer(gc, finalizerCallback) 176 | 177 | // read channel should be atomic. 178 | ch := gc.h.gcEventsCh 179 | if ch == nil { 180 | return 181 | } 182 | // Notice: here may be a litte race, will panic when ch is closed now. 183 | // we just leave it since it is very small and there is a recover. 184 | select { 185 | case ch <- struct{}{}: 186 | default: 187 | gc.h.Errorf("can not send event to finalizer channel immediately, may be analyzer blocked?") 188 | } 189 | } 190 | 191 | // it won't fit into tiny span since this struct contains point. 192 | type gcHeapFinalizer struct { 193 | h *Holmes 194 | } 195 | 196 | func (h *Holmes) startGCCycleLoop(ch chan struct{}) { 197 | h.gcHeapStats = newRing(minCollectCyclesBeforeDumpStart) 198 | 199 | gc := &gcHeapFinalizer{ 200 | h, 201 | } 202 | 203 | runtime.SetFinalizer(gc, finalizerCallback) 204 | 205 | go gc.h.gcHeapCheckLoop(ch) 206 | } 207 | 208 | // Start starts the dump loop of holmes. 
209 | func (h *Holmes) Start() { 210 | h.Lock() 211 | defer h.Unlock() 212 | 213 | if !atomic.CompareAndSwapInt64(&h.stopped, 1, 0) { 214 | //nolint 215 | h.Errorf("Holmes has started, please don't start it again.") 216 | return 217 | } 218 | 219 | gcEventsCh := make(chan struct{}, 1) 220 | rptCh := make(chan rptEvent, 32) 221 | h.gcEventsCh = gcEventsCh 222 | h.rptEventsCh = rptCh 223 | 224 | h.initEnvironment() 225 | go h.startDumpLoop() 226 | go h.startReporter(rptCh) 227 | 228 | h.startGCCycleLoop(gcEventsCh) 229 | } 230 | 231 | // Stop the dump loop. 232 | func (h *Holmes) Stop() { 233 | h.Lock() 234 | defer h.Unlock() 235 | 236 | if !atomic.CompareAndSwapInt64(&h.stopped, 0, 1) { 237 | //nolint 238 | fmt.Println("Holmes has stop, please don't stop it again.") 239 | return 240 | } 241 | 242 | if gcEventsCh := h.gcEventsCh; gcEventsCh != nil { 243 | h.gcEventsCh = nil 244 | close(gcEventsCh) 245 | } 246 | if rptEventsCh := h.rptEventsCh; rptEventsCh != nil { 247 | h.rptEventsCh = nil 248 | close(rptEventsCh) 249 | } 250 | } 251 | 252 | func (h *Holmes) startDumpLoop() { 253 | // init previous cool down time 254 | now := time.Now() 255 | h.cpuCoolDownTime = now 256 | h.memCoolDownTime = now 257 | h.grCoolDownTime = now 258 | 259 | // init stats ring 260 | h.cpuStats = newRing(minCollectCyclesBeforeDumpStart) 261 | h.memStats = newRing(minCollectCyclesBeforeDumpStart) 262 | h.grNumStats = newRing(minCollectCyclesBeforeDumpStart) 263 | h.threadStats = newRing(minCollectCyclesBeforeDumpStart) 264 | 265 | // dump loop 266 | ticker := time.NewTicker(h.opts.CollectInterval) 267 | defer ticker.Stop() 268 | 269 | for { 270 | select { 271 | case <-h.opts.intervalResetting: 272 | // wait for go version update to 1.15 273 | // can use Reset API directly here. 
pkg.go.dev/time#Ticker.Reset 274 | // we can't use the `for-range` here, because the range loop 275 | // caches the variable to be lopped and then it can't be overwritten 276 | itv := h.opts.CollectInterval 277 | h.Infof("[Holmes] collect interval is resetting to [%v]\n", itv) //nolint:forbidigo 278 | ticker = time.NewTicker(itv) 279 | 280 | default: 281 | // bug fix: https://github.com/mosn/holmes/issues/63 282 | // make sure that the message inside intervalResetting channel 283 | // would be consumed before ticker.C. 284 | <-ticker.C 285 | if atomic.LoadInt64(&h.stopped) == 1 { 286 | h.Infof("[Holmes] dump loop stopped") //nolint:forbidigo 287 | return 288 | } 289 | 290 | cpuCore, err := h.getCPUCore() 291 | if cpuCore == 0 || err != nil { 292 | h.Errorf("[Holmes] get CPU core failed, CPU core: %v, error: %v", cpuCore, err) 293 | return 294 | } 295 | 296 | memoryLimit, err := h.getMemoryLimit() 297 | if memoryLimit == 0 || err != nil { 298 | h.Errorf("[Holmes] get memory limit failed, memory limit: %v, error: %v", memoryLimit, err) 299 | return 300 | } 301 | 302 | cpu, mem, gNum, tNum, err := collect(cpuCore, memoryLimit) 303 | if err != nil { 304 | h.Errorf("failed to collect resource usage: %v", err.Error()) 305 | 306 | continue 307 | } 308 | 309 | h.cpuStats.push(cpu) 310 | h.memStats.push(mem) 311 | h.grNumStats.push(gNum) 312 | h.threadStats.push(tNum) 313 | 314 | h.collectCount++ 315 | if h.collectCount < minCollectCyclesBeforeDumpStart { 316 | // at least collect some cycles 317 | // before start to judge and dump 318 | h.Debugf("[Holmes] warming up cycle : %d", h.collectCount) 319 | 320 | continue 321 | } 322 | 323 | if err := h.EnableDump(cpu); err != nil { 324 | h.Infof("[Holmes] unable to dump: %v", err) 325 | 326 | continue 327 | } 328 | 329 | h.memCheckAndDump(mem) 330 | h.cpuCheckAndDump(cpu) 331 | h.threadCheckAndDump(tNum) 332 | h.threadCheckAndShrink(tNum) 333 | h.goroutineCheckAndDump(gNum) 334 | } 335 | } 336 | } 337 | 338 | // goroutine start. 
339 | func (h *Holmes) goroutineCheckAndDump(gNum int) { 340 | // get a copy instead of locking it 341 | grOpts := h.opts.GetGrOpts() 342 | if !grOpts.Enable { 343 | return 344 | } 345 | 346 | if h.grCoolDownTime.After(time.Now()) { 347 | h.Debugf("[Holmes] goroutine dump is in cooldown") 348 | return 349 | } 350 | // grOpts is a struct, no escape. 351 | if triggered := h.goroutineProfile(gNum, grOpts); triggered { 352 | h.grCoolDownTime = time.Now().Add(grOpts.CoolDown) 353 | h.grTriggerCount++ 354 | } 355 | } 356 | 357 | func (h *Holmes) goroutineProfile(gNum int, c grOptions) bool { 358 | match, reason := matchRule(h.grNumStats, gNum, c.TriggerMin, c.TriggerAbs, c.TriggerDiff, c.GoroutineTriggerNumMax) 359 | if !match { 360 | h.Infof(UniformLogFormat, "NODUMP", check2name[goroutine], 361 | c.TriggerMin, c.TriggerDiff, c.TriggerAbs, 362 | c.GoroutineTriggerNumMax, h.grNumStats.sequentialData(), gNum) 363 | return false 364 | } 365 | 366 | h.Alertf("holmes.goroutine", UniformLogFormat, "pprof ", check2name[goroutine], 367 | c.TriggerMin, c.TriggerDiff, c.TriggerAbs, 368 | c.GoroutineTriggerNumMax, 369 | h.grNumStats.sequentialData(), gNum) 370 | 371 | var buf bytes.Buffer 372 | _ = pprof.Lookup("goroutine").WriteTo(&buf, int(h.opts.DumpProfileType)) // nolint: errcheck 373 | 374 | scene := Scene{ 375 | typeOption: *c.typeOption, 376 | CurVal: gNum, 377 | Avg: h.grNumStats.avg(), 378 | } 379 | 380 | h.ReportProfile(type2name[goroutine], h.writeProfileDataToFile(buf, goroutine, ""), 381 | reason, "", time.Now(), buf.Bytes(), scene) 382 | return true 383 | } 384 | 385 | // memory start. 386 | func (h *Holmes) memCheckAndDump(mem int) { 387 | // get a copy instead of locking it 388 | memOpts := h.opts.GetMemOpts() 389 | if !memOpts.Enable { 390 | return 391 | } 392 | 393 | if h.memCoolDownTime.After(time.Now()) { 394 | h.Debugf("[Holmes] mem dump is in cooldown") 395 | return 396 | } 397 | // memOpts is a struct, no escape. 
398 | if triggered := h.memProfile(mem, memOpts); triggered { 399 | h.memCoolDownTime = time.Now().Add(memOpts.CoolDown) 400 | h.memTriggerCount++ 401 | } 402 | } 403 | 404 | func (h *Holmes) memProfile(rss int, c typeOption) bool { 405 | match, reason := matchRule(h.memStats, rss, c.TriggerMin, c.TriggerAbs, c.TriggerDiff, NotSupportTypeMaxConfig) 406 | if !match { 407 | // let user know why this should not dump 408 | h.Infof(UniformLogFormat, "NODUMP", check2name[mem], 409 | c.TriggerMin, c.TriggerDiff, c.TriggerAbs, NotSupportTypeMaxConfig, 410 | h.memStats.sequentialData(), rss) 411 | 412 | return false 413 | } 414 | 415 | h.Alertf("holmes.memory", UniformLogFormat, "pprof", check2name[mem], 416 | c.TriggerMin, c.TriggerDiff, c.TriggerAbs, 417 | NotSupportTypeMaxConfig, h.memStats, rss) 418 | 419 | var buf bytes.Buffer 420 | _ = pprof.Lookup("heap").WriteTo(&buf, int(h.opts.DumpProfileType)) // nolint: errcheck 421 | 422 | scene := Scene{ 423 | typeOption: c, 424 | CurVal: rss, 425 | Avg: h.memStats.avg(), 426 | } 427 | 428 | h.ReportProfile(type2name[mem], h.writeProfileDataToFile(buf, mem, ""), reason, "", time.Now(), buf.Bytes(), scene) 429 | return true 430 | } 431 | 432 | func (h *Holmes) threadCheckAndShrink(threadNum int) { 433 | opts := h.opts.GetShrinkThreadOpts() 434 | 435 | if !opts.Enable { 436 | return 437 | } 438 | 439 | if h.shrinkThrCoolDownTime.After(time.Now()) { 440 | return 441 | } 442 | 443 | if threadNum > opts.Threshold { 444 | // 100x Delay time a cooldown time as default 445 | delay := opts.Delay * 100 446 | // one hour at least 447 | if delay < time.Hour { 448 | delay = time.Hour 449 | } 450 | if delay > time.Hour*24 { 451 | delay = time.Hour * 24 452 | } 453 | h.shrinkThrCoolDownTime = time.Now().Add(delay) 454 | 455 | h.Alertf("holmes.thread", "current thread number(%v) larger than threshold(%v), will start to shrink thread after %v", threadNum, opts.Threshold, opts.Delay) 456 | 457 | // do not shrink thread immediately 458 | 
time.AfterFunc(opts.Delay, func() { 459 | h.startShrinkThread() 460 | }) 461 | } 462 | } 463 | 464 | // thread start. 465 | func (h *Holmes) threadCheckAndDump(threadNum int) { 466 | threadOpts := h.opts.GetThreadOpts() 467 | if !threadOpts.Enable { 468 | return 469 | } 470 | 471 | if h.threadCoolDownTime.After(time.Now()) { 472 | h.Debugf("[Holmes] thread dump is in cooldown") 473 | return 474 | } 475 | // threadOpts is a struct, no escape. 476 | if triggered := h.threadProfile(threadNum, threadOpts); triggered { 477 | h.threadCoolDownTime = time.Now().Add(threadOpts.CoolDown) 478 | h.threadTriggerCount++ 479 | 480 | // optimize: https://github.com/mosn/holmes/issues/84 481 | // Thread dump information contains goroutine information 482 | // skip goroutine dump 483 | h.goroutineCoolDownByThread() 484 | } 485 | } 486 | 487 | // The thread dump is triggered while operating goroutine dump CoolDown . 488 | // Thread dump information contains goroutine information . 489 | func (h *Holmes) goroutineCoolDownByThread() { 490 | grOpts := h.opts.GetGrOpts() 491 | if !grOpts.Enable { 492 | return 493 | } 494 | 495 | h.grCoolDownTime = time.Now().Add(grOpts.CoolDown) 496 | } 497 | 498 | // TODO: better only shrink the threads that are idle. 499 | func (h *Holmes) startShrinkThread() { 500 | 501 | curThreadNum := getThreadNum() 502 | opts := h.opts.GetShrinkThreadOpts() 503 | 504 | n := curThreadNum - opts.Threshold 505 | 506 | // check again after the timer triggered 507 | if opts.Enable && n > 0 { 508 | h.shrinkThreadTriggerCount++ 509 | h.Infof("[holmes] start to shrink %v threads, now: %v", n, curThreadNum) 510 | 511 | var wg sync.WaitGroup 512 | wg.Add(n) 513 | for i := 0; i < n; i++ { 514 | // avoid close too much thread in batch. 
515 | time.Sleep(time.Millisecond * 100) 516 | 517 | go func() { 518 | defer wg.Done() 519 | runtime.LockOSThread() 520 | }() 521 | } 522 | wg.Wait() 523 | 524 | h.Infof("[holmes] finished shrink threads, now: %v", getThreadNum()) 525 | } 526 | } 527 | 528 | func (h *Holmes) threadProfile(curThreadNum int, c typeOption) bool { 529 | match, reason := matchRule(h.threadStats, curThreadNum, c.TriggerMin, c.TriggerAbs, c.TriggerDiff, NotSupportTypeMaxConfig) 530 | if !match { 531 | // let user know why this should not dump 532 | h.Infof(UniformLogFormat, "NODUMP", check2name[thread], 533 | c.TriggerMin, c.TriggerDiff, c.TriggerAbs, NotSupportTypeMaxConfig, 534 | h.threadStats.sequentialData(), curThreadNum) 535 | 536 | return false 537 | } 538 | 539 | h.Alertf("holmes.thread", UniformLogFormat, "pprof", check2name[thread], 540 | c.TriggerMin, c.TriggerDiff, c.TriggerAbs, 541 | NotSupportTypeMaxConfig, h.threadStats, curThreadNum) 542 | 543 | eventID := fmt.Sprintf("thr-%d", h.threadTriggerCount) 544 | var buf bytes.Buffer 545 | 546 | _ = pprof.Lookup("threadcreate").WriteTo(&buf, int(h.opts.DumpProfileType)) // nolint: errcheck 547 | 548 | scene := Scene{ 549 | typeOption: c, 550 | CurVal: curThreadNum, 551 | Avg: h.threadStats.avg(), 552 | } 553 | 554 | h.ReportProfile(type2name[thread], h.writeProfileDataToFile(buf, thread, eventID), 555 | reason, eventID, time.Now(), buf.Bytes(), scene) 556 | 557 | buf.Reset() 558 | _ = pprof.Lookup("goroutine").WriteTo(&buf, int(h.opts.DumpProfileType)) // nolint: errcheck 559 | 560 | h.ReportProfile(type2name[goroutine], h.writeProfileDataToFile(buf, goroutine, eventID), 561 | reason, eventID, time.Now(), buf.Bytes(), scene) 562 | 563 | return true 564 | } 565 | 566 | // thread end. 567 | 568 | // cpu start. 
569 | func (h *Holmes) cpuCheckAndDump(cpu int) { 570 | cpuOpts := h.opts.GetCPUOpts() 571 | if !cpuOpts.Enable { 572 | return 573 | } 574 | 575 | if h.cpuCoolDownTime.After(time.Now()) { 576 | h.Debugf("[Holmes] cpu dump is in cooldown") 577 | return 578 | } 579 | // cpuOpts is a struct, no escape. 580 | if triggered := h.cpuProfile(cpu, cpuOpts); triggered { 581 | h.cpuCoolDownTime = time.Now().Add(cpuOpts.CoolDown) 582 | h.cpuTriggerCount++ 583 | } 584 | } 585 | 586 | func (h *Holmes) cpuProfile(curCPUUsage int, c typeOption) bool { 587 | match, reason := matchRule(h.cpuStats, curCPUUsage, c.TriggerMin, c.TriggerAbs, c.TriggerDiff, NotSupportTypeMaxConfig) 588 | if !match { 589 | // let user know why this should not dump 590 | h.Infof(UniformLogFormat, "NODUMP", check2name[cpu], 591 | c.TriggerMin, c.TriggerDiff, c.TriggerAbs, NotSupportTypeMaxConfig, 592 | h.cpuStats.sequentialData(), curCPUUsage) 593 | 594 | return false 595 | } 596 | 597 | h.Alertf("holmes.cpu", UniformLogFormat, "pprof dump", check2name[cpu], 598 | c.TriggerMin, c.TriggerDiff, c.TriggerAbs, NotSupportTypeMaxConfig, 599 | h.cpuStats.sequentialData(), curCPUUsage) 600 | 601 | bf, binFileName, err := getBinaryFileNameAndCreate(h.opts.DumpPath, cpu, "") 602 | if err != nil { 603 | h.Errorf("[Holmes] failed to create cpu profile file: %v", err.Error()) 604 | return false 605 | } 606 | defer bf.Close() // nolint: errcheck 607 | 608 | err = pprof.StartCPUProfile(bf) 609 | if err != nil { 610 | h.Errorf("[Holmes] failed to profile cpu: %v", err.Error()) 611 | return false 612 | } 613 | 614 | time.Sleep(h.opts.CPUSamplingTime) 615 | pprof.StopCPUProfile() 616 | 617 | rptOpts, bfCpy := h.opts.GetReporterOpts(), []byte{} 618 | if h.opts.DumpToLogger || rptOpts.active == 1 { 619 | bfCpy, err = ioutil.ReadFile(binFileName) 620 | if err != nil { 621 | h.Errorf("encounter error when dumping profile to logger, failed to read cpu profile file: %v", err) 622 | return true 623 | } 624 | } 625 | 626 | if 
h.opts.DumpToLogger { 627 | h.Infof("[Holmes] CPU profile name : " + "::" + binFileName + " \n" + string(bfCpy)) 628 | } 629 | 630 | scene := Scene{ 631 | typeOption: c, 632 | CurVal: curCPUUsage, 633 | Avg: h.cpuStats.avg(), 634 | } 635 | 636 | if rptOpts.active == 1 { 637 | h.ReportProfile(type2name[cpu], binFileName, 638 | reason, "", time.Now(), bfCpy, scene) 639 | } 640 | 641 | return true 642 | } 643 | 644 | func (h *Holmes) gcHeapCheckLoop(ch chan struct{}) { 645 | for range ch { 646 | h.gcHeapCheckAndDump() 647 | } 648 | } 649 | 650 | func (h *Holmes) gcHeapCheckAndDump() { 651 | gcHeapOpts := h.opts.GetGcHeapOpts() 652 | 653 | if !gcHeapOpts.Enable || atomic.LoadInt64(&h.stopped) == 1 { 654 | return 655 | } 656 | 657 | memStats := new(runtime.MemStats) 658 | runtime.ReadMemStats(memStats) 659 | 660 | // TODO: we can only use NextGC for now since runtime haven't expose heapmarked yet 661 | // and we hard code the gcPercent is 100 here. 662 | // may introduce a new API debug.GCHeapMarked? it can also has better performance(no STW). 
663 | nextGC := memStats.NextGC 664 | prevGC := nextGC / 2 //nolint:gomnd 665 | 666 | memoryLimit, err := h.getMemoryLimit() 667 | if memoryLimit == 0 || err != nil { 668 | h.Errorf("[Holmes] get memory limit failed, memory limit: %v, error: %v", memoryLimit, err) 669 | return 670 | } 671 | 672 | ratio := int(100 * float64(prevGC) / float64(memoryLimit)) 673 | h.gcHeapStats.push(ratio) 674 | 675 | h.gcCycleCount++ 676 | if h.gcCycleCount < minCollectCyclesBeforeDumpStart { 677 | // at least collect some cycles 678 | // before start to judge and dump 679 | h.Debugf("[Holmes] GC cycle warming up : %d", h.gcCycleCount) 680 | return 681 | } 682 | 683 | if h.gcHeapCoolDownTime.After(time.Now()) { 684 | h.Debugf("[Holmes] GC heap dump is in cooldown") 685 | return 686 | } 687 | 688 | if triggered := h.gcHeapProfile(ratio, h.gcHeapTriggered, gcHeapOpts); triggered { 689 | if h.gcHeapTriggered { 690 | // already dump twice, mark it false 691 | h.gcHeapTriggered = false 692 | h.gcHeapCoolDownTime = time.Now().Add(gcHeapOpts.CoolDown) 693 | h.gcHeapTriggerCount++ 694 | } else { 695 | // force dump next time 696 | h.gcHeapTriggered = true 697 | } 698 | } 699 | } 700 | 701 | func (h *Holmes) getCPUCore() (float64, error) { 702 | if h.opts.cpuCore > 0 { 703 | return h.opts.cpuCore, nil 704 | } 705 | 706 | if h.opts.UseGoProcAsCPUCore { 707 | return float64(runtime.GOMAXPROCS(-1)), nil 708 | } 709 | 710 | if h.opts.UseCGroup { 711 | return getCGroupCPUCore() 712 | } 713 | 714 | return float64(runtime.NumCPU()), nil 715 | } 716 | 717 | func (h *Holmes) getMemoryLimit() (uint64, error) { 718 | if h.opts.memoryLimit > 0 { 719 | return h.opts.memoryLimit, nil 720 | } 721 | 722 | if h.opts.UseCGroup { 723 | return getCGroupMemoryLimit() 724 | } 725 | 726 | return getNormalMemoryLimit() 727 | } 728 | 729 | // gcHeapProfile will dump profile twice when triggered once. 730 | // since the current memory profile will be merged after next GC cycle. 
731 | // And we assume the finalizer will be called before next GC cycle(it will be usually). 732 | func (h *Holmes) gcHeapProfile(gc int, force bool, c typeOption) bool { 733 | match, reason := matchRule(h.gcHeapStats, gc, c.TriggerMin, c.TriggerAbs, c.TriggerDiff, NotSupportTypeMaxConfig) 734 | if !force && !match { 735 | // let user know why this should not dump 736 | h.Infof(UniformLogFormat, "NODUMP", check2name[gcHeap], 737 | c.TriggerMin, c.TriggerDiff, c.TriggerAbs, 738 | NotSupportTypeMaxConfig, 739 | h.gcHeapStats.sequentialData(), gc) 740 | 741 | return false 742 | } 743 | 744 | h.Alertf("holmes.gcheap", UniformLogFormat, "pprof", check2name[gcHeap], 745 | c.TriggerMin, c.TriggerDiff, c.TriggerAbs, 746 | NotSupportTypeMaxConfig, h.gcHeapStats, gc) 747 | 748 | // gcHeapTriggerCount only increased after got both two profiles 749 | eventID := fmt.Sprintf("heap-%d", h.gcHeapTriggerCount) 750 | 751 | var buf bytes.Buffer 752 | _ = pprof.Lookup("heap").WriteTo(&buf, int(h.opts.DumpProfileType)) // nolint: errcheck 753 | 754 | scene := Scene{ 755 | typeOption: c, 756 | CurVal: gc, 757 | Avg: h.gcHeapStats.avg(), 758 | } 759 | 760 | h.ReportProfile(type2name[gcHeap], h.writeProfileDataToFile(buf, gcHeap, eventID), 761 | reason, eventID, time.Now(), buf.Bytes(), scene) 762 | return true 763 | } 764 | 765 | func (h *Holmes) writeProfileDataToFile(data bytes.Buffer, dumpType configureType, eventID string) string { 766 | fileName, err := writeFile(data, dumpType, h.opts.DumpOptions, eventID) 767 | if err != nil { 768 | h.Errorf("failed to write profile to file(%v), err: %s", fileName, err.Error()) 769 | return "" 770 | } 771 | 772 | if h.opts.DumpOptions.DumpToLogger { 773 | h.Infof(fmt.Sprintf("[Holmes] %v profile: \n", check2name[dumpType]) + data.String()) 774 | } 775 | 776 | h.Infof("[Holmes] pprof %v profile write to file %v successfully", check2name[dumpType], fileName) 777 | return fileName 778 | } 779 | 780 | func (h *Holmes) initEnvironment() { 781 | // 
whether the max memory is limited by cgroup 782 | if h.opts.UseCGroup { 783 | h.Infof("[Holmes] use cgroup to limit memory") 784 | } else { 785 | h.Infof("[Holmes] use the default memory percent calculated by gopsutil") 786 | } 787 | } 788 | 789 | func (h *Holmes) EnableDump(curCPU int) (err error) { 790 | if h.opts.CPUMaxPercent != 0 && curCPU >= h.opts.CPUMaxPercent { 791 | return fmt.Errorf("current cpu percent [%v] is greater than the CPUMaxPercent [%v]", curCPU, h.opts.CPUMaxPercent) 792 | } 793 | return nil 794 | } 795 | 796 | // Set sets holmes's optional after initialing. 797 | func (h *Holmes) Set(opts ...Option) error { 798 | h.opts.L.Lock() 799 | defer h.opts.L.Unlock() 800 | 801 | for _, opt := range opts { 802 | if err := opt.apply(h.opts); err != nil { 803 | return err 804 | } 805 | } 806 | return nil 807 | } 808 | 809 | func (h *Holmes) DisableProfileReporter() { 810 | atomic.StoreInt32(&h.opts.rptOpts.active, 0) 811 | } 812 | 813 | func (h *Holmes) EnableProfileReporter() { 814 | opt := h.opts.GetReporterOpts() 815 | if opt.reporter == nil { 816 | h.Infof("failed to enable profile reporter since reporter is empty") 817 | return 818 | } 819 | atomic.StoreInt32(&h.opts.rptOpts.active, 1) 820 | } 821 | 822 | func (h *Holmes) ReportProfile(pType string, filename string, reason ReasonType, eventID string, sampleTime time.Time, pprofBytes []byte, scene Scene) { 823 | if filename == "" { 824 | h.Errorf("dump name is empty, type:%s, reason:%s, eventID:%s", pType, reason.String(), eventID) 825 | return 826 | } 827 | 828 | defer func() { 829 | if r := recover(); r != nil { 830 | h.Errorf("Panic during report profile: %v", r) 831 | } 832 | }() 833 | 834 | if atomic.LoadInt64(&h.stopped) == 1 { 835 | return 836 | } 837 | 838 | opts := h.opts.GetReporterOpts() 839 | if opts.active == 0 { 840 | return 841 | } 842 | 843 | msg := rptEvent{ 844 | PType: pType, 845 | FileName: filename, 846 | Reason: reason, 847 | EventID: eventID, 848 | SampleTime: sampleTime, 849 | 
PprofBytes: pprofBytes, 850 | Scene: scene, 851 | } 852 | 853 | // read channel should be atomic. 854 | ch := h.rptEventsCh 855 | if ch == nil { 856 | return 857 | } 858 | // Notice: here may be a litte race, will panic when ch is closed now. 859 | // we just leave it since it is very small and there is a recover. 860 | select { 861 | case ch <- msg: 862 | default: 863 | h.Warnf("reporter channel is full, will ignore it") 864 | } 865 | } 866 | 867 | // startReporter starts a background goroutine to consume event channel, 868 | // and finish it at after receive from cancel channel. 869 | func (h *Holmes) startReporter(ch chan rptEvent) { 870 | go func() { 871 | for evt := range ch { 872 | opts := h.opts.GetReporterOpts() 873 | if opts.reporter == nil { 874 | h.Infof("reporter is nil, please initial it before startReporter") 875 | // drop the event 876 | continue 877 | } 878 | 879 | // It's supposed to be sending judgment, isn't it? 880 | err := opts.reporter.Report(evt.PType, evt.FileName, evt.Reason, evt.EventID, evt.SampleTime, evt.PprofBytes, evt.Scene) // nolint: errcheck 881 | if err != nil { 882 | h.Infof("reporter err:%v", err) 883 | 884 | } 885 | } 886 | }() 887 | } 888 | -------------------------------------------------------------------------------- /holmes_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package holmes 19 | 20 | import ( 21 | "log" 22 | "os" 23 | "runtime" 24 | "testing" 25 | "time" 26 | ) 27 | 28 | var h *Holmes 29 | 30 | func TestMain(m *testing.M) { 31 | log.Println("holmes initialing") 32 | h, _ = New( 33 | WithCollectInterval("1s"), 34 | WithTextDump(), 35 | WithGoroutineDump(10, 25, 80, 90, time.Minute), 36 | ) 37 | log.Println("holmes initial success") 38 | h.EnableGoroutineDump().Start() 39 | time.Sleep(10 * time.Second) 40 | log.Println("on running") 41 | os.Exit(m.Run()) 42 | } 43 | 44 | // -gcflags=all=-l 45 | func TestResetCollectInterval(t *testing.T) { 46 | before := h.collectCount 47 | go func() { 48 | h.Set(WithCollectInterval("2s")) //nolint:errcheck 49 | defer h.Set(WithCollectInterval("1s")) //nolint:errcheck 50 | time.Sleep(6 * time.Second) 51 | // if collect interval not change, collectCount would increase 5 at least 52 | if h.collectCount-before >= 5 { 53 | log.Fatalf("fail, before %v, now %v", before, h.collectCount) 54 | } 55 | }() 56 | time.Sleep(8 * time.Second) 57 | } 58 | 59 | func TestSetGrOpts(t *testing.T) { 60 | // decrease min trigger, if our set api is effective, 61 | // gr profile would be trigger and grCoolDown increase. 
62 | min, diff, abs := 3, 10, 1 63 | before := h.grCoolDownTime 64 | 65 | err := h.Set( 66 | WithGoroutineDump(min, diff, abs, 90, time.Minute)) 67 | if err != nil { 68 | log.Fatalf("fail to set opts on running time.") 69 | } 70 | 71 | time.Sleep(5 * time.Second) 72 | if before.Equal(h.grCoolDownTime) { 73 | log.Fatalf("fail") 74 | } 75 | } 76 | 77 | func TestCpuCore(t *testing.T) { 78 | _ = h.Set( 79 | WithCGroup(false), 80 | WithGoProcAsCPUCore(false), 81 | ) 82 | cpuCore1, _ := h.getCPUCore() 83 | goProc1 := runtime.GOMAXPROCS(-1) 84 | 85 | // system cpu core matches go procs 86 | if cpuCore1 != float64(goProc1) { 87 | log.Fatalf("cpuCore1 %v not equal goProc1 %v", cpuCore1, goProc1) 88 | } 89 | 90 | // go proc = system cpu core + 1 91 | runtime.GOMAXPROCS(goProc1 + 1) 92 | 93 | cpuCore2, _ := h.getCPUCore() 94 | goProc2 := runtime.GOMAXPROCS(-1) 95 | if cpuCore2 != float64(goProc2)-1 { 96 | log.Fatalf("cpuCore2 %v not equal goProc2-1 %v", cpuCore2, goProc2) 97 | } 98 | 99 | // set cpu core directly 100 | _ = h.Set( 101 | WithCPUCore(cpuCore1 + 5), 102 | ) 103 | 104 | cpuCore3, _ := h.getCPUCore() 105 | if cpuCore3 != cpuCore1+5 { 106 | log.Fatalf("cpuCore3 %v not equal cpuCore1+5 %v", cpuCore3, cpuCore1+5) 107 | } 108 | } 109 | 110 | func createThread(n int, blockTime time.Duration) { 111 | for i := 0; i < n; i++ { 112 | go func() { 113 | runtime.LockOSThread() 114 | time.Sleep(blockTime) 115 | 116 | runtime.UnlockOSThread() 117 | }() 118 | } 119 | } 120 | 121 | func TestWithShrinkThread(t *testing.T) { 122 | before := h.shrinkThreadTriggerCount 123 | 124 | err := h.Set( 125 | // delay 5 seconds, after the 50 threads unlocked 126 | WithThreadDump(10, 10, 10, time.Minute), 127 | WithShrinkThread(20, time.Second*5), 128 | WithCollectInterval("1s"), 129 | ) 130 | h.EnableShrinkThread() 131 | if err != nil { 132 | log.Fatalf("fail to set opts on running time.") 133 | } 134 | 135 | threadNum1 := getThreadNum() 136 | // 50 threads exists 3 seconds 137 | 
createThread(50, time.Second*3) 138 | 139 | time.Sleep(time.Second) 140 | threadNum2 := getThreadNum() 141 | if threadNum2-threadNum1 < 40 { 142 | log.Fatalf("create thread failed, before: %v, now: %v", threadNum1, threadNum2) 143 | } 144 | log.Printf("created 50 threads, before: %v, now: %v", threadNum1, threadNum2) 145 | 146 | time.Sleep(10 * time.Second) 147 | 148 | if before+1 != h.shrinkThreadTriggerCount { 149 | log.Fatalf("shrink thread not triggered, before: %v, now: %v", before, h.shrinkThreadTriggerCount) 150 | } 151 | 152 | threadNum3 := getThreadNum() 153 | if threadNum2-threadNum3 < 30 { 154 | log.Fatalf("shrink thread failed, before: %v, now: %v", threadNum2, threadNum3) 155 | } 156 | 157 | h.DisableShrinkThread() 158 | } 159 | -------------------------------------------------------------------------------- /log.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package holmes 19 | 20 | import ( 21 | mlog "mosn.io/pkg/log" 22 | ) 23 | 24 | func (h *Holmes) getLogger() mlog.ErrorLogger { 25 | h.opts.L.RLock() 26 | defer h.opts.L.RUnlock() 27 | return h.opts.logger 28 | } 29 | 30 | func (h *Holmes) Debugf(format string, args ...interface{}) { 31 | logger := h.getLogger() 32 | if logger == nil { 33 | return 34 | } 35 | logger.Debugf(format, args...) 36 | } 37 | 38 | func (h *Holmes) Infof(format string, args ...interface{}) { 39 | logger := h.getLogger() 40 | if logger == nil { 41 | return 42 | } 43 | logger.Infof(format, args...) 44 | } 45 | 46 | func (h *Holmes) Warnf(format string, args ...interface{}) { 47 | logger := h.getLogger() 48 | if logger == nil { 49 | return 50 | } 51 | logger.Warnf(format, args...) 52 | } 53 | 54 | func (h *Holmes) Errorf(format string, args ...interface{}) { 55 | logger := h.getLogger() 56 | if logger == nil { 57 | return 58 | } 59 | logger.Errorf(format, args...) 60 | } 61 | 62 | func (h *Holmes) Alertf(alert string, format string, args ...interface{}) { 63 | logger := h.getLogger() 64 | if logger == nil { 65 | return 66 | } 67 | logger.Alertf(alert, format, args...) 68 | } 69 | 70 | // NewStdLogger create an ErrorLogger interface value that writing to os.Stdout 71 | func NewStdLogger() mlog.ErrorLogger { 72 | logger, _ := mlog.GetOrCreateLogger("stdout", nil) 73 | return &mlog.SimpleErrorLog{ 74 | Logger: logger, 75 | Level: mlog.DEBUG, 76 | } 77 | } 78 | 79 | func NewFileLog(path string, level mlog.Level) mlog.ErrorLogger { 80 | logger, _ := mlog.GetOrCreateLogger(path, nil) 81 | return &mlog.SimpleErrorLog{ 82 | Logger: logger, 83 | Level: level, 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /options.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package holmes 19 | 20 | import ( 21 | "sync" 22 | "sync/atomic" 23 | "time" 24 | 25 | mlog "mosn.io/pkg/log" 26 | ) 27 | 28 | type options struct { 29 | logger mlog.ErrorLogger 30 | 31 | UseGoProcAsCPUCore bool // use the go max procs number as the CPU core number when it's true 32 | UseCGroup bool // use the CGroup to calc cpu/memory when it's true 33 | 34 | // overwrite the system level memory limitation when > 0. 35 | memoryLimit uint64 36 | cpuCore float64 37 | 38 | *ShrinkThrOptions 39 | 40 | *DumpOptions 41 | 42 | // interval for dump loop, default 5s 43 | CollectInterval time.Duration 44 | intervalResetting chan struct{} 45 | 46 | // if current cpu usage percent is greater than CPUMaxPercent, 47 | // holmes would not dump all types profile, cuz this 48 | // move may result of the system crash. 49 | CPUMaxPercent int 50 | 51 | // cpu sampling time 52 | CPUSamplingTime time.Duration 53 | 54 | // if write lock is held mean holmes's 55 | // configuration is being modified. 
56 | L *sync.RWMutex 57 | 58 | // the cooldown time after every type of dump 59 | // interval for cooldown,default 1m 60 | // each check type have different cooldowns of their own 61 | 62 | grOpts *grOptions 63 | 64 | memOpts *typeOption 65 | gCHeapOpts *typeOption 66 | cpuOpts *typeOption 67 | threadOpts *typeOption 68 | 69 | // profile reporter 70 | rptOpts *ReporterOptions 71 | } 72 | 73 | type ReporterOptions struct { 74 | reporter ProfileReporter 75 | active int32 // switch 76 | } 77 | 78 | // newReporterOpts returns ReporterOptions。 79 | func newReporterOpts() *ReporterOptions { 80 | opts := &ReporterOptions{} 81 | 82 | return opts 83 | } 84 | 85 | // DumpOptions contains configuration about dump file. 86 | type DumpOptions struct { 87 | // full path to put the profile files, default /tmp 88 | DumpPath string 89 | // default dump to binary profile, set to true if you want a text profile 90 | DumpProfileType dumpProfileType 91 | // only dump top 10 if set to false, otherwise dump all, only effective when in_text = true 92 | DumpFullStack bool 93 | // dump profile to logger. It will make huge log output if enable DumpToLogger option. issues/90 94 | DumpToLogger bool 95 | } 96 | 97 | // ShrinkThrOptions contains the configuration about shrink thread 98 | type ShrinkThrOptions struct { 99 | // shrink the thread number when it exceeds the max threshold that specified in Threshold 100 | Enable bool 101 | Threshold int 102 | Delay time.Duration // start to shrink thread after the delay time. 103 | } 104 | 105 | // GetReporterOpts returns a copy of rptOpts. 106 | func (o *options) GetReporterOpts() ReporterOptions { 107 | o.L.RLock() 108 | defer o.L.RUnlock() 109 | return *o.rptOpts 110 | } 111 | 112 | // GetShrinkThreadOpts return a copy of ShrinkThrOptions. 113 | func (o *options) GetShrinkThreadOpts() ShrinkThrOptions { 114 | o.L.RLock() 115 | defer o.L.RUnlock() 116 | return *o.ShrinkThrOptions 117 | } 118 | 119 | // GetMemOpts return a copy of typeOption. 
120 | func (o *options) GetMemOpts() typeOption { 121 | o.L.RLock() 122 | defer o.L.RUnlock() 123 | return *o.memOpts 124 | } 125 | 126 | // GetCPUOpts return a copy of typeOption 127 | // if cpuOpts not exist return a empty typeOption and false. 128 | func (o *options) GetCPUOpts() typeOption { 129 | o.L.RLock() 130 | defer o.L.RUnlock() 131 | return *o.cpuOpts 132 | } 133 | 134 | // GetGrOpts return a copy of grOptions 135 | // if grOpts not exist return a empty grOptions and false. 136 | func (o *options) GetGrOpts() grOptions { 137 | o.L.RLock() 138 | defer o.L.RUnlock() 139 | return *o.grOpts 140 | } 141 | 142 | // GetThreadOpts return a copy of typeOption 143 | // if threadOpts not exist return a empty typeOption and false. 144 | func (o *options) GetThreadOpts() typeOption { 145 | o.L.RLock() 146 | defer o.L.RUnlock() 147 | return *o.threadOpts 148 | } 149 | 150 | // GetGcHeapOpts return a copy of typeOption 151 | // if gCHeapOpts not exist return a empty typeOption and false. 152 | func (o *options) GetGcHeapOpts() typeOption { 153 | o.L.RLock() 154 | defer o.L.RUnlock() 155 | return *o.gCHeapOpts 156 | } 157 | 158 | // Option holmes option type. 
159 | type Option interface { 160 | apply(*options) error 161 | } 162 | 163 | type optionFunc func(*options) error 164 | 165 | func (f optionFunc) apply(opts *options) error { 166 | return f(opts) 167 | } 168 | 169 | func newOptions() *options { 170 | o := &options{ 171 | logger: NewStdLogger(), 172 | grOpts: newGrOptions(), 173 | memOpts: newMemOptions(), 174 | gCHeapOpts: newGCHeapOptions(), 175 | cpuOpts: newCPUOptions(), 176 | threadOpts: newThreadOptions(), 177 | CollectInterval: defaultInterval, 178 | intervalResetting: make(chan struct{}, 1), 179 | CPUSamplingTime: defaultCPUSamplingTime, 180 | DumpOptions: &DumpOptions{ 181 | DumpPath: defaultDumpPath, 182 | DumpProfileType: defaultDumpProfileType, 183 | DumpFullStack: false, 184 | }, 185 | ShrinkThrOptions: &ShrinkThrOptions{ 186 | Enable: false, 187 | }, 188 | L: &sync.RWMutex{}, 189 | rptOpts: newReporterOpts(), 190 | } 191 | return o 192 | } 193 | 194 | // WithLogger set the logger 195 | // logger can be created by: NewFileLog("/path/to/log/file", level) 196 | func WithLogger(logger mlog.ErrorLogger) Option { 197 | return optionFunc(func(opts *options) (err error) { 198 | opts.logger = logger 199 | return 200 | }) 201 | } 202 | 203 | // WithDumpPath set the dump path for holmes. 204 | func WithDumpPath(dumpPath string) Option { 205 | return optionFunc(func(opts *options) (err error) { 206 | opts.DumpPath = dumpPath 207 | return 208 | }) 209 | } 210 | 211 | // WithCollectInterval : interval must be valid time duration string, 212 | // eg. "ns", "us" (or "µs"), "ms", "s", "m", "h". 
213 | func WithCollectInterval(interval string) Option { 214 | return optionFunc(func(opts *options) (err error) { 215 | // CollectInterval wouldn't be zero value, because it 216 | // will be initialized as defaultInterval at newOptions() 217 | newInterval, err := time.ParseDuration(interval) 218 | if err != nil || opts.CollectInterval.Seconds() == newInterval.Seconds() { 219 | return 220 | } 221 | 222 | opts.CollectInterval = newInterval 223 | opts.intervalResetting <- struct{}{} 224 | 225 | return 226 | }) 227 | } 228 | 229 | // WithCPUMax : set the CPUMaxPercent parameter as max 230 | func WithCPUMax(max int) Option { 231 | return optionFunc(func(opts *options) (err error) { 232 | opts.CPUMaxPercent = max 233 | return 234 | }) 235 | } 236 | 237 | // WithCPUSamplingTime set cpu sampling time 238 | func WithCPUSamplingTime(duration string) Option { 239 | return optionFunc(func(opts *options) (err error) { 240 | // CPUSamplingTime wouldn't be zero value, because it 241 | // will be initialized as defaultCPUSamplingTime at newOptions() 242 | newDuration, err := time.ParseDuration(duration) 243 | if err != nil { 244 | return 245 | } 246 | 247 | if newDuration <= 0 { 248 | newDuration = defaultCPUSamplingTime 249 | } 250 | 251 | opts.CPUSamplingTime = newDuration 252 | 253 | return 254 | }) 255 | } 256 | 257 | // WithBinaryDump set dump mode to binary. 258 | func WithBinaryDump() Option { 259 | return withDumpProfileType(binaryDump) 260 | } 261 | 262 | // WithTextDump set dump mode to text. 263 | func WithTextDump() Option { 264 | return withDumpProfileType(textDump) 265 | } 266 | 267 | // WithFullStack set to dump full stack or top 10 stack, when dump in text mode. 
268 | func WithFullStack(isFull bool) Option { 269 | return optionFunc(func(opts *options) (err error) { 270 | opts.DumpFullStack = isFull 271 | return 272 | }) 273 | } 274 | 275 | func withDumpProfileType(profileType dumpProfileType) Option { 276 | return optionFunc(func(opts *options) (err error) { 277 | opts.DumpProfileType = profileType 278 | return 279 | }) 280 | } 281 | 282 | type grOptions struct { 283 | // enable the goroutine dumper, should dump if one of the following requirements is matched 284 | // 1. goroutine_num > TriggerMin && goroutine_num < GoroutineTriggerNumMax && goroutine diff percent > TriggerDiff 285 | // 2. goroutine_num > GoroutineTriggerNumAbsNum && goroutine_num < GoroutineTriggerNumMax 286 | *typeOption 287 | GoroutineTriggerNumMax int // goroutine trigger max in number 288 | } 289 | 290 | func newGrOptions() *grOptions { 291 | base := newTypeOpts( 292 | defaultGoroutineTriggerMin, 293 | defaultGoroutineTriggerAbs, 294 | defaultGoroutineTriggerDiff, 295 | defaultGoroutineCoolDown, 296 | ) 297 | return &grOptions{typeOption: base} 298 | } 299 | 300 | // WithGoroutineDump set the goroutine dump options. 
301 | func WithGoroutineDump(min int, diff int, abs int, max int, coolDown time.Duration) Option { 302 | return optionFunc(func(opts *options) (err error) { 303 | opts.grOpts.Set(min, abs, diff, coolDown) 304 | opts.grOpts.GoroutineTriggerNumMax = max 305 | return 306 | }) 307 | } 308 | 309 | func WithDumpToLogger(new bool) Option { 310 | return optionFunc(func(opts *options) (err error) { 311 | opts.DumpToLogger = new 312 | return 313 | }) 314 | } 315 | 316 | type typeOption struct { 317 | Enable bool 318 | // mem/cpu/gcheap trigger minimum in percent, goroutine/thread trigger minimum in number 319 | TriggerMin int 320 | 321 | // mem/cpu/gcheap trigger abs in percent, goroutine/thread trigger abs in number 322 | TriggerAbs int 323 | 324 | // mem/cpu/gcheap/goroutine/thread trigger diff in percent 325 | TriggerDiff int 326 | 327 | // CoolDown skip profile for CoolDown time after done a profile 328 | CoolDown time.Duration 329 | } 330 | 331 | func newTypeOpts(triggerMin, triggerAbs, triggerDiff int, coolDown time.Duration) *typeOption { 332 | return &typeOption{ 333 | Enable: false, 334 | TriggerMin: triggerMin, 335 | TriggerAbs: triggerAbs, 336 | TriggerDiff: triggerDiff, 337 | CoolDown: coolDown, 338 | } 339 | } 340 | 341 | func (base *typeOption) Set(min, abs, diff int, coolDown time.Duration) { 342 | base.TriggerMin, base.TriggerAbs, base.TriggerDiff, base.CoolDown = min, abs, diff, coolDown 343 | } 344 | 345 | // newMemOptions 346 | // enable the heap dumper, should dump if one of the following requirements is matched 347 | // 1. memory usage > TriggerMin && memory usage diff > TriggerDiff 348 | // 2. memory usage > TriggerAbs. 349 | func newMemOptions() *typeOption { 350 | return newTypeOpts( 351 | defaultMemTriggerMin, 352 | defaultMemTriggerAbs, 353 | defaultMemTriggerDiff, 354 | defaultCooldown, 355 | ) 356 | } 357 | 358 | // WithMemDump set the memory dump options. 
359 | func WithMemDump(min int, diff int, abs int, coolDown time.Duration) Option { 360 | return optionFunc(func(opts *options) (err error) { 361 | opts.memOpts.Set(min, abs, diff, coolDown) 362 | return 363 | }) 364 | } 365 | 366 | // newGCHeapOptions 367 | // enable the heap dumper, should dump if one of the following requirements is matched 368 | // 1. GC heap usage > TriggerMin && GC heap usage diff > TriggerDiff 369 | // 2. GC heap usage > TriggerAbs 370 | // 371 | // in percent. 372 | func newGCHeapOptions() *typeOption { 373 | return newTypeOpts( 374 | defaultGCHeapTriggerMin, 375 | defaultGCHeapTriggerAbs, 376 | defaultGCHeapTriggerDiff, 377 | defaultCooldown, 378 | ) 379 | } 380 | 381 | // WithGCHeapDump set the GC heap dump options. 382 | func WithGCHeapDump(min int, diff int, abs int, coolDown time.Duration) Option { 383 | return optionFunc(func(opts *options) (err error) { 384 | opts.gCHeapOpts.Set(min, abs, diff, coolDown) 385 | return 386 | }) 387 | } 388 | 389 | // WithCPUCore overwrite the system level CPU core number when it > 0. 390 | // it's not a good idea to modify it on fly since it affects the CPU percent caculation. 391 | func WithCPUCore(cpuCore float64) Option { 392 | return optionFunc(func(opts *options) (err error) { 393 | opts.cpuCore = cpuCore 394 | return 395 | }) 396 | } 397 | 398 | // WithMemoryLimit overwrite the system level memory limit when it > 0. 399 | func WithMemoryLimit(limit uint64) Option { 400 | return optionFunc(func(opts *options) (err error) { 401 | opts.memoryLimit = limit 402 | return 403 | }) 404 | } 405 | 406 | func newThreadOptions() *typeOption { 407 | return newTypeOpts( 408 | defaultThreadTriggerMin, 409 | defaultThreadTriggerAbs, 410 | defaultThreadTriggerDiff, 411 | defaultThreadCoolDown, 412 | ) 413 | } 414 | 415 | // WithThreadDump set the thread dump options. 
416 | func WithThreadDump(min, diff, abs int, coolDown time.Duration) Option { 417 | return optionFunc(func(opts *options) (err error) { 418 | opts.threadOpts.Set(min, abs, diff, coolDown) 419 | return 420 | }) 421 | } 422 | 423 | // newCPUOptions 424 | // enable the cpu dumper, should dump if one of the following requirements is matched 425 | // in percent 426 | // 1. cpu usage > CPUTriggerMin && cpu usage diff > CPUTriggerDiff 427 | // 2. cpu usage > CPUTriggerAbs 428 | // 429 | // in percent. 430 | func newCPUOptions() *typeOption { 431 | return newTypeOpts( 432 | defaultCPUTriggerMin, 433 | defaultCPUTriggerAbs, 434 | defaultCPUTriggerDiff, 435 | defaultCooldown, 436 | ) 437 | } 438 | 439 | // WithCPUDump set the cpu dump options. 440 | func WithCPUDump(min int, diff int, abs int, coolDown time.Duration) Option { 441 | return optionFunc(func(opts *options) (err error) { 442 | opts.cpuOpts.Set(min, abs, diff, coolDown) 443 | return 444 | }) 445 | } 446 | 447 | // WithGoProcAsCPUCore set holmes use cgroup or not. 448 | func WithGoProcAsCPUCore(enabled bool) Option { 449 | return optionFunc(func(opts *options) (err error) { 450 | opts.UseGoProcAsCPUCore = enabled 451 | return 452 | }) 453 | } 454 | 455 | // WithCGroup set holmes use cgroup or not. 456 | // Use CGroup are best used when resource limits are set. 457 | // refer to: https://github.com/mosn/holmes/issues/135 458 | func WithCGroup(useCGroup bool) Option { 459 | return optionFunc(func(opts *options) (err error) { 460 | opts.UseCGroup = useCGroup 461 | return 462 | }) 463 | } 464 | 465 | // WithShrinkThread enable/disable shrink thread when the thread number exceed the max threshold. 
466 | func WithShrinkThread(threshold int, delay time.Duration) Option { 467 | return optionFunc(func(opts *options) (err error) { 468 | if threshold > 0 { 469 | opts.ShrinkThrOptions.Threshold = threshold 470 | } 471 | opts.ShrinkThrOptions.Delay = delay 472 | return 473 | }) 474 | } 475 | 476 | // WithProfileReporter will enable reporter 477 | // reopens profile reporter through WithProfileReporter(h.opts.rptOpts.reporter) 478 | func WithProfileReporter(r ProfileReporter) Option { 479 | return optionFunc(func(opts *options) (err error) { 480 | if r == nil { 481 | return nil 482 | } 483 | 484 | opts.rptOpts.reporter = r 485 | atomic.StoreInt32(&opts.rptOpts.active, 1) 486 | return 487 | }) 488 | } 489 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ![license](https://img.shields.io/badge/license-Apache--2.0-green.svg) 2 | 3 | * [Holmes](#holmes) 4 | * [Design](#design) 5 | * [How to use](#how-to-use) 6 | * [Dump goroutine when goroutine number spikes](#dump-goroutine-when-goroutine-number-spikes) 7 | * [dump cpu profile when cpu load spikes](#dump-cpu-profile-when-cpu-load-spikes) 8 | * [dump heap profile when RSS spikes](#dump-heap-profile-when-rss-spikes) 9 | * [Dump heap profile when RSS spikes based GC cycle](#dump-heap-profile-when-rss-spikes-based-gc-cycle) 10 | * [Set holmes configurations on fly](#set-holmes-configurations-on-fly) 11 | * [Reporter dump event](#reporter-dump-event) 12 | * [Enable them all\!](#enable-them-all) 13 | * [Running in docker or other cgroup limited environment](#running-in-docker-or-other-cgroup-limited-environment) 14 | * [known risks](#known-risks) 15 | * [Show cases](#show-cases) 16 | 17 | # Holmes 18 | [中文版](./doc/zh.md) 19 | 20 | Self-aware Golang profile dumper. 21 | 22 | Our online system often crashes at midnight (usually killed by the OS due to OOM). 
23 | As lazy developers, we don't want to be woken up at midnight to wait for the online error to recur. 24 | 25 | Holmes comes to the rescue. 26 | 27 | ## Design 28 | 29 | Holmes collects the following stats at every collection interval: 30 | 31 | * Goroutine number by `runtime.NumGoroutine`. 32 | * RSS used by the current process with [gopsutil](https://github.com/shirou/gopsutil) 33 | * CPU usage as a percentage of all cores, e.g. using 4 of 8 cores = 50%, with [gopsutil](https://github.com/shirou/gopsutil) 34 | 35 | In addition, holmes will collect `RSS` based on GC cycle, if you enable `GC heap`. 36 | 37 | After the warm-up phase (10 collections after the application starts) has finished, 38 | Holmes will compare the current stats with the average 39 | of the previously collected stats (10 cycles). If the dump rule is matched, Holmes will dump 40 | the related profile to the log (text mode) or to a binary file (binary mode). 41 | 42 | When you get warning messages sent by your own monitor system, e.g. memory usage exceeding 80%, 43 | OOM killed, CPU usage exceeding 80%, or goroutine number exceeding 100k, the profile has already been dumped 44 | to your dump path. You can just fetch the profile and see what actually happened without pressure. 45 | 46 | 47 | ## How to use 48 | 49 | ```shell 50 | go get mosn.io/holmes 51 | ``` 52 | 53 | ### Dump goroutine when goroutine number spikes 54 | 55 | ```go 56 | h, _ := holmes.New( 57 | holmes.WithCollectInterval("5s"), 58 | holmes.WithDumpPath("/tmp"), 59 | holmes.WithTextDump(), 60 | holmes.WithDumpToLogger(true), 61 | holmes.WithGoroutineDump(10, 25, 2000, 100*1000,time.Minute), 62 | ) 63 | h.EnableGoroutineDump() 64 | 65 | // start the metrics collect and dump loop 66 | h.Start() 67 | 68 | // stop the dumper 69 | h.Stop() 70 | ``` 71 | 72 | * WithCollectInterval("5s") means the system metrics are collected once every 5 seconds 73 | * WithDumpPath("/tmp") means the dump binary file(binary mode) will write content to `/tmp` dir. 
74 | * WithTextDump() means not in binary mode, so it's text mode profiles 75 | * WithDumpToLogger() means profile information will be output to the logger. 76 | * WithGoroutineDump(10, 25, 2000, 100*1000,time.Minute) means dump will happen when current_goroutine_num > 10 && 77 | current_goroutine_num < `100*1000` && current_goroutine_num > `125%` * previous_average_goroutine_num or current_goroutine_num > `2000`, 78 | `time.Minute` means once a dump happened, the next dump will not happen before the cooldown 79 | (1 minute) has finished. 80 | > WithGoroutineDump(min int, diff int, abs int, max int, coolDown time.Duration) 81 | > 100*1000 means max goroutine number, when the current goroutine number is greater than 100k, holmes would not 82 | > dump goroutine profile. Because if the goroutine number is huge, e.g. 100k, the goroutine dump will also become a 83 | > heavy action: STW && stack dump. Max = 0 means no limit. 84 | 85 | ### dump cpu profile when cpu load spikes 86 | 87 | ```go 88 | h, _ := holmes.New( 89 | holmes.WithCollectInterval("5s"), 90 | holmes.WithDumpPath("/tmp"), 91 | holmes.WithCPUDump(20, 25, 80, time.Minute), 92 | holmes.WithCPUMax(90), 93 | ) 94 | h.EnableCPUDump() 95 | 96 | // start the metrics collect and dump loop 97 | h.Start() 98 | 99 | // stop the dumper 100 | h.Stop() 101 | ``` 102 | 103 | * WithCollectInterval("5s") means the system metrics are collected once every 5 seconds 104 | * WithDumpPath("/tmp") means the dump binary file(binary mode) will write content to `/tmp` dir. 105 | * WithBinaryDump() or WithTextDump() doesn't affect the CPU profile dump, because the pprof 106 | standard library doesn't support text mode dump. 107 | * WithCPUDump(20, 25, 80,time.Minute) means dump will happen when cpu usage > `20%` && 108 | cpu usage > `125%` * previous cpu usage recorded or cpu usage > `80%`. 109 | `time.Minute` means once a dump happened, the next dump will not happen before 110 | the cooldown (1 minute) has finished. 
111 | * WithCPUMax means holmes will not dump any type of profile when the current cpu 112 | usage percent is greater than CPUMaxPercent. 113 | 114 | ### dump heap profile when RSS spikes 115 | 116 | ```go 117 | h, _ := holmes.New( 118 | holmes.WithCollectInterval("5s"), 119 | holmes.WithDumpPath("/tmp"), 120 | holmes.WithTextDump(), 121 | holmes.WithMemDump(30, 25, 80,time.Minute), 122 | ) 123 | 124 | h.EnableMemDump() 125 | 126 | // start the metrics collect and dump loop 127 | h.Start() 128 | 129 | // stop the dumper 130 | h.Stop() 131 | ``` 132 | 133 | * WithCollectInterval("5s") means the system metrics are collected once every 5 seconds 134 | * WithDumpPath("/tmp") means the dump binary file(binary mode) will write content to `/tmp` dir. 135 | * WithTextDump() means not in binary mode, so it's text mode profiles 136 | * WithMemDump(30, 25, 80, time.Minute) means dump will happen when memory usage > `30%` && 137 | memory usage > `125%` * previous memory usage or memory usage > `80%`. 138 | `time.Minute` means once a dump happened, the next dump will not happen before 139 | the cooldown (1 minute) has finished. 140 | 141 | ### Dump heap profile when RSS spikes based GC cycle 142 | 143 | In some situations we can not get useful information, such as when the application allocates heap memory and 144 | collects it within one `CollectInterval`. So we designed a new heap memory monitor rule, based on the 145 | GC cycle, to control holmes dump. It will dump two heap profiles in a row when RSS spikes, then devs 146 | can compare the profiles through `pprof base` command. 
147 | 148 | 149 | ```go 150 | h, _ := holmes.New( 151 | holmes.WithDumpPath("/tmp"), 152 | holmes.WithLogger(holmes.NewFileLog("/tmp/holmes.log", mlog.INFO)), 153 | holmes.WithBinaryDump(), 154 | holmes.WithMemoryLimit(100*1024*1024), // 100MB 155 | holmes.WithGCHeapDump(10, 20, 40, time.Minute), 156 | // holmes.WithProfileReporter(reporter), 157 | ) 158 | h.EnableGCHeapDump().Start() 159 | time.Sleep(time.Hour) 160 | ``` 161 | ### Set holmes configurations on fly 162 | You can use the `Set` method to modify holmes' configurations while the application is running. 163 | ```go 164 | h.Set( 165 | WithCollectInterval("2s"), 166 | WithGoroutineDump(min, diff, abs, 90, time.Minute)) 167 | ``` 168 | 169 | ### Reporter dump event 170 | 171 | You can use `Reporter` to implement the following features: 172 | * Send alarm messages that include the scene information when holmes dumps profiles. 173 | * Send profiles to the data center for saving or analyzing. 174 | 175 | ```go 176 | type ReporterImpl struct{} 177 | func (r *ReporterImpl) Report(pType string, filename string, reason ReasonType, eventID string, sampleTime time.Time, pprofBytes []byte, scene Scene) error{ 178 | // do something 179 | } 180 | ...... 181 | reporter := &ReporterImpl{} // an implementation of the holmes.ProfileReporter interface. 182 | h, _ := holmes.New( 183 | holmes.WithProfileReporter(reporter), 184 | holmes.WithDumpPath("/tmp"), 185 | holmes.WithLogger(holmes.NewFileLog("/tmp/holmes.log", mlog.INFO)), 186 | holmes.WithBinaryDump(), 187 | holmes.WithMemoryLimit(100*1024*1024), // 100MB 188 | holmes.WithGCHeapDump(10, 20, 40, time.Minute), 189 | ) 190 | 191 | ``` 192 | 193 | #### Enable holmes as pyroscope client 194 | 195 | Holmes supports uploading your profiles to a [pyroscope](https://github.com/pyroscope-io/pyroscope) server. For more details, 196 | see [here](./example/pyroscope_rideshare/README.md). 197 | 198 | Note: do **NOT** set `TextDump` when you enable holmes as a pyroscope client. 
199 | 200 | ### Enable them all! 201 | 202 | It's easy. 203 | 204 | ```go 205 | h, _ := holmes.New( 206 | holmes.WithCollectInterval("5s"), 207 | holmes.WithDumpPath("/tmp"), 208 | holmes.WithTextDump(), 209 | 210 | holmes.WithCPUDump(10, 25, 80, time.Minute), 211 | holmes.WithMemDump(30, 25, 80, time.Minute), 212 | holmes.WithGCHeapDump(10, 20, 40, time.Minute), 213 | holmes.WithGoroutineDump(500, 25, 20000, 0, time.Minute), 214 | ) 215 | 216 | h.EnableCPUDump(). 217 | EnableGoroutineDump(). 218 | EnableMemDump(). 219 | EnableGCHeapDump().Start() 220 | 221 | ``` 222 | 223 | ### Running in docker or other cgroup limited environment 224 | 225 | ```go 226 | h, _ := holmes.New( 227 | holmes.WithCollectInterval("5s"), 228 | holmes.WithDumpPath("/tmp"), 229 | holmes.WithTextDump(), 230 | 231 | holmes.WithCPUDump(10, 25, 80,time.Minute), 232 | holmes.WithCGroup(true), // set cgroup to true 233 | ) 234 | ``` 235 | 236 | ## known risks 237 | 238 | With Go versions older than 1.19, collecting a goroutine profile itself [may cause a latency spike](https://github.com/golang/go/issues/33250) because of the long STW pause. 239 | In Go 1.19 this was optimized to run concurrently; see this [CL](https://go-review.googlesource.com/c/go/+/387415/). 240 | 241 | ## Show cases 242 | [Click here](./doc/example.md) 243 | 244 | ## Contributing 245 | See our [contributor guide](./CONTRIBUTING.md). 246 | 247 | ## Community 248 | 249 | Scan the QR code below with DingTalk(钉钉) to join the Holmes user group. 
250 | 251 | dingtalk 252 | 253 | 254 | -------------------------------------------------------------------------------- /report.go: -------------------------------------------------------------------------------- 1 | package holmes 2 | 3 | import "time" 4 | 5 | type ProfileReporter interface { 6 | Report(pType string, filename string, reason ReasonType, eventID string, sampleTime time.Time, pprofBytes []byte, scene Scene) error 7 | } 8 | 9 | // rptEvent stands of the args of report event 10 | type rptEvent struct { 11 | PType string 12 | FileName string 13 | Reason ReasonType 14 | EventID string 15 | SampleTime time.Time 16 | PprofBytes []byte 17 | Scene Scene 18 | } 19 | 20 | // Scene contains the scene information when profile triggers, 21 | // including current value, average value and configurations. 22 | type Scene struct { 23 | typeOption 24 | 25 | // current value while dump event occurs 26 | CurVal int 27 | // Avg is the average of the past values 28 | Avg int 29 | } 30 | 31 | type ReasonType uint8 32 | 33 | const ( 34 | // ReasonCurlLessMin means current value is less than min value. 35 | ReasonCurlLessMin ReasonType = iota 36 | // ReasonCurlGreaterMin means current value is greater than min value, 37 | // but don't meet any trigger conditions. 38 | ReasonCurlGreaterMin 39 | // ReasonCurGreaterMax means current value is greater than max value. 40 | ReasonCurGreaterMax 41 | // ReasonCurGreaterAbs means current value meets the trigger condition where 42 | // it is greater than abs value. 43 | ReasonCurGreaterAbs 44 | // ReasonDiff means current value is greater than the value: (diff + 1) * agv. 
45 | ReasonDiff 46 | ) 47 | 48 | func (rt ReasonType) String() string { 49 | var reason string 50 | switch rt { 51 | case ReasonCurlLessMin: 52 | reason = "curVal < ruleMin" 53 | case ReasonCurlGreaterMin: 54 | reason = "curVal >= ruleMin, but don't meet diff trigger condition" 55 | case ReasonCurGreaterMax: 56 | reason = "curVal >= ruleMax" 57 | case ReasonCurGreaterAbs: 58 | reason = "curVal > ruleAbs" 59 | case ReasonDiff: 60 | reason = "curVal >= ruleMin, and meet diff trigger condition" 61 | 62 | } 63 | 64 | return reason 65 | } 66 | -------------------------------------------------------------------------------- /reporters/http_reporter/http_reporter.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package http_reporter 19 | 20 | import ( 21 | "bytes" 22 | "encoding/json" 23 | "fmt" 24 | "io/ioutil" 25 | "mime/multipart" 26 | "net/http" 27 | "time" 28 | 29 | "mosn.io/holmes" 30 | ) 31 | 32 | type HttpReporter struct { 33 | token string 34 | url string 35 | } 36 | 37 | type Response struct { 38 | Code int `json:"code"` 39 | Message string `json:"message"` 40 | } 41 | 42 | func NewReporter(token string, url string) holmes.ProfileReporter { 43 | return &HttpReporter{ 44 | token: token, 45 | url: url, 46 | } 47 | } 48 | 49 | func (r *HttpReporter) Report(ptype string, filename string, reason holmes.ReasonType, eventID string, tt time.Time, bts []byte, scene holmes.Scene) error { 50 | body := &bytes.Buffer{} 51 | writer := multipart.NewWriter(body) 52 | 53 | // read filename 54 | if filename == "" { 55 | return fmt.Errorf("file name is empty") 56 | } 57 | data, err := ioutil.ReadFile(filename) 58 | if err != nil { 59 | return fmt.Errorf("read form File: %s err: %v", filename, err) 60 | } 61 | part, err := writer.CreateFormFile("profile", "go-pprof-profile") 62 | if err != nil { 63 | return fmt.Errorf("create form File err: %v", err) 64 | } 65 | 66 | if _, err := part.Write(data); err != nil { 67 | return fmt.Errorf("write buf to file part err: %v", err) 68 | } 69 | 70 | writer.WriteField("token", r.token) // nolint: errcheck 71 | writer.WriteField("profile_type", ptype) // nolint: errcheck 72 | writer.WriteField("event_id", eventID) // nolint: errcheck 73 | writer.WriteField("comment", reason.String()) // nolint: errcheck 74 | writer.Close() // nolint: errcheck 75 | request, err := http.NewRequest("POST", r.url, body) 76 | if err != nil { 77 | return fmt.Errorf("NewRequest err: %v", err) 78 | } 79 | 80 | request.Header.Add("Content-Type", writer.FormDataContentType()) 81 | client := &http.Client{} 82 | response, err := client.Do(request) 83 | if err != nil { 84 | return fmt.Errorf("do Request err: %v", err) 85 | } 86 | defer response.Body.Close() 
// nolint: errcheck 87 | 88 | respContent, err := ioutil.ReadAll(response.Body) 89 | if err != nil { 90 | return fmt.Errorf("read response err: %v", err) 91 | } 92 | 93 | rsp := &Response{} 94 | if err := json.Unmarshal(respContent, rsp); err != nil { 95 | return fmt.Errorf("failed to decode resp json: %v", err) 96 | } 97 | 98 | if rsp.Code != 1 { 99 | return fmt.Errorf("code: %d, msg: %s", rsp.Code, rsp.Message) 100 | } 101 | return nil 102 | } 103 | -------------------------------------------------------------------------------- /reporters/http_reporter/http_reporter_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package http_reporter 19 | 20 | import ( 21 | "log" 22 | "mosn.io/holmes" 23 | "testing" 24 | "time" 25 | 26 | "github.com/gin-gonic/gin" 27 | ) 28 | 29 | func TestHttpReporter_Report(t *testing.T) { 30 | newMockServer() 31 | 32 | reporter := NewReporter("test", "http://127.0.0.1:8080/profile/upload") 33 | 34 | if err := reporter.Report("goroutine", "reporter_filename_test", holmes.ReasonCurlGreaterMin, "test-id", time.Now(), []byte{}, holmes.Scene{}); err != nil { 35 | log.Fatalf("failed to report: %v", err) 36 | } 37 | } 38 | 39 | func newMockServer() { 40 | r := gin.New() 41 | r.POST("/profile/upload", ProfileUploadHandler) 42 | go r.Run() //nolint:errcheck // listen and serve on 0.0.0.0:8080 (for windows "localhost:8080") 43 | 44 | time.Sleep(time.Millisecond * 100) 45 | } 46 | 47 | func ProfileUploadHandler(c *gin.Context) { 48 | ret := map[string]interface{}{} 49 | ret["code"] = 1 50 | ret["message"] = "success" 51 | c.JSON(200, ret) 52 | } 53 | -------------------------------------------------------------------------------- /reporters/http_reporter/reporter_filename_test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mosn/holmes/efb1a7768843e83b645f6e683f7b6c5d826651ab/reporters/http_reporter/reporter_filename_test -------------------------------------------------------------------------------- /reporters/pyroscope_reporter/client_config.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
/*
Copied from pyroscope-io/client
*/

// UploadFormat is the string type of UploadJob.Format.
type UploadFormat string

// Payload is implemented by values that expose their content as bytes.
type Payload interface {
	Bytes() []byte
}

type ParserState int

// SampleType describes one sample type of a profile; the JSON tags define
// how it is serialized and every field is omitted when empty.
type SampleType struct {
	Units       string `json:"units,omitempty"`
	Aggregation string `json:"aggregation,omitempty"`
	DisplayName string `json:"display-name,omitempty"`
	Sampled     bool   `json:"sampled,omitempty"`
	Cumulative  bool   `json:"cumulative,omitempty"`
}

// UploadJob groups the data of a single profile upload.
type UploadJob struct {
	Name             string
	StartTime        time.Time
	EndTime          time.Time
	SpyName          string
	SampleRate       uint32
	Units            string
	AggregationType  string
	Format           UploadFormat
	Profile          []byte
	PrevProfile      []byte
	SampleTypeConfig map[string]*SampleType
}

// RemoteConfig is the upstream (server) configuration.
type RemoteConfig struct {
	AuthToken              string // holmes not used
	UpstreamThreads        int    // holmes not used
	UpstreamAddress        string
	UpstreamRequestTimeout time.Duration

	ManualStart bool // holmes not used
}

var ErrCloudTokenRequired = errors.New("Please provide an authentication token. You can find it here: https://pyroscope.io/cloud")

var ErrUpload = errors.New("Failed to upload a profile")

var ErrUpgradeServer = errors.New("Newer version of pyroscope server required (>= v0.3.1). Visit https://pyroscope.io/docs/golang/ for more information")

// Pprof is the only constant here that is typed as UploadFormat.
const Pprof UploadFormat = "pprof"

// Trie is an untyped string constant (it carries no explicit type).
const Trie = "trie"

const DefaultUploadRate = 10 * time.Second

const ReservedTagKeyName = "__name__"

// heapSampleTypes lists the heap sample types by name; Cumulative is left
// at its zero value (false) for all of them.
var heapSampleTypes = map[string]*SampleType{
	"alloc_objects": {Units: "objects"},
	"alloc_space":   {Units: "bytes"},
	"inuse_space":   {Units: "bytes", Aggregation: "average"},
	"inuse_objects": {Units: "objects", Aggregation: "average"},
}

// goroutineSampleTypes holds the single sample type of goroutine profiles.
var goroutineSampleTypes = map[string]*SampleType{
	"goroutine": {
		DisplayName: "goroutines",
		Units:       "goroutines",
		Aggregation: "average",
	},
}

// mergeTagsWithAppName validates user input and merges explicitly specified
// tags with tags from app name.
//
// App name may be in the full form including tags (app.name{foo=bar,baz=qux}).
// Returned application name is always short, any tags that were included are
// moved to tags map. When merged with explicitly provided tags (config/CLI),
// last take precedence.
//
// App name may be an empty string. Tags must not contain reserved keys,
// the map is modified in place.
123 | func mergeTagsWithAppName(appName string, tags map[string]string) (string, error) { 124 | k, err := flameql.ParseKey(appName) 125 | if err != nil { 126 | return "", err 127 | } 128 | for tagKey, tagValue := range tags { 129 | if flameql.IsTagKeyReserved(tagKey) { 130 | continue 131 | } 132 | if err = flameql.ValidateTagKey(tagKey); err != nil { 133 | return "", err 134 | } 135 | k.Add(tagKey, tagValue) 136 | } 137 | return k.Normalized(), nil 138 | } 139 | -------------------------------------------------------------------------------- /reporters/pyroscope_reporter/flameql/error.go: -------------------------------------------------------------------------------- 1 | package flameql 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | ) 7 | 8 | /* 9 | Copied from pyroscope-io/client 10 | */ 11 | var ( 12 | ErrInvalidQuerySyntax = errors.New("invalid query syntax") 13 | ErrInvalidAppName = errors.New("invalid application name") 14 | ErrInvalidMatchersSyntax = errors.New("invalid tag matchers syntax") 15 | ErrInvalidTagKey = errors.New("invalid tag key") 16 | ErrInvalidTagValueSyntax = errors.New("invalid tag value syntax") 17 | 18 | ErrAppNameIsRequired = errors.New("application name is required") 19 | ErrTagKeyIsRequired = errors.New("tag key is required") 20 | ErrTagKeyReserved = errors.New("tag key is reserved") 21 | 22 | ErrMatchOperatorIsRequired = errors.New("match operator is required") 23 | ErrUnknownOp = errors.New("unknown tag match operator") 24 | ) 25 | 26 | type Error struct { 27 | Inner error 28 | Expr string 29 | // TODO: add offset? 
30 | } 31 | 32 | func newErr(err error, expr string) *Error { return &Error{Inner: err, Expr: expr} } 33 | 34 | func (e *Error) Error() string { return e.Inner.Error() + ": " + e.Expr } 35 | 36 | func (e *Error) Unwrap() error { return e.Inner } 37 | 38 | func newInvalidTagKeyRuneError(k string, r rune) *Error { 39 | return newInvalidRuneError(ErrInvalidTagKey, k, r) 40 | } 41 | 42 | func newInvalidAppNameRuneError(k string, r rune) *Error { 43 | return newInvalidRuneError(ErrInvalidAppName, k, r) 44 | } 45 | 46 | func newInvalidRuneError(err error, k string, r rune) *Error { 47 | return newErr(err, fmt.Sprintf("%s: character is not allowed: %q", k, r)) 48 | } 49 | -------------------------------------------------------------------------------- /reporters/pyroscope_reporter/flameql/flameql.go: -------------------------------------------------------------------------------- 1 | package flameql 2 | 3 | import "regexp" 4 | 5 | /* 6 | Copied from pyroscope-io/client 7 | */ 8 | type Query struct { 9 | AppName string 10 | Matchers []*TagMatcher 11 | 12 | q string // The original query string. 13 | } 14 | 15 | func (q *Query) String() string { return q.q } 16 | 17 | type TagMatcher struct { 18 | Key string 19 | Value string 20 | Op 21 | 22 | R *regexp.Regexp 23 | } 24 | 25 | type Op int 26 | 27 | const ( 28 | // The order should respect operator priority and cost. 29 | // Negating operators go first. See IsNegation. 30 | _ Op = iota 31 | OpNotEqual // != 32 | OpNotEqualRegex // !~ 33 | OpEqual // = 34 | OpEqualRegex // =~ 35 | ) 36 | 37 | const ( 38 | ReservedTagKeyName = "__name__" 39 | ) 40 | 41 | var reservedTagKeys = []string{ 42 | ReservedTagKeyName, 43 | } 44 | 45 | // IsNegation reports whether the operator assumes negation. 46 | func (o Op) IsNegation() bool { return o < OpEqual } 47 | 48 | // ByPriority is a supplemental type for sorting tag matchers. 
49 | type ByPriority []*TagMatcher 50 | 51 | func (p ByPriority) Len() int { return len(p) } 52 | func (p ByPriority) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 53 | func (p ByPriority) Less(i, j int) bool { return p[i].Op < p[j].Op } 54 | 55 | func (m *TagMatcher) Match(v string) bool { 56 | switch m.Op { 57 | case OpEqual: 58 | return m.Value == v 59 | case OpNotEqual: 60 | return m.Value != v 61 | case OpEqualRegex: 62 | return m.R.Match([]byte(v)) 63 | case OpNotEqualRegex: 64 | return !m.R.Match([]byte(v)) 65 | default: 66 | panic("invalid match operator") 67 | } 68 | } 69 | 70 | // ValidateTagKey report an error if the given key k violates constraints. 71 | // 72 | // The function should be used to validate user input. The function returns 73 | // ErrTagKeyReserved if the key is valid but reserved for internal use. 74 | func ValidateTagKey(k string) error { 75 | if len(k) == 0 { 76 | return ErrTagKeyIsRequired 77 | } 78 | for _, r := range k { 79 | if !IsTagKeyRuneAllowed(r) { 80 | return newInvalidTagKeyRuneError(k, r) 81 | } 82 | } 83 | if IsTagKeyReserved(k) { 84 | return newErr(ErrTagKeyReserved, k) 85 | } 86 | return nil 87 | } 88 | 89 | // ValidateAppName report an error if the given app name n violates constraints. 90 | func ValidateAppName(n string) error { 91 | if len(n) == 0 { 92 | return ErrAppNameIsRequired 93 | } 94 | for _, r := range n { 95 | if !IsAppNameRuneAllowed(r) { 96 | return newInvalidAppNameRuneError(n, r) 97 | } 98 | } 99 | return nil 100 | } 101 | 102 | func IsTagKeyRuneAllowed(r rune) bool { 103 | return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '_' 104 | } 105 | 106 | func IsAppNameRuneAllowed(r rune) bool { 107 | return r == '-' || r == '.' 
|| IsTagKeyRuneAllowed(r) 108 | } 109 | 110 | func IsTagKeyReserved(k string) bool { 111 | for _, s := range reservedTagKeys { 112 | if s == k { 113 | return true 114 | } 115 | } 116 | return false 117 | } 118 | -------------------------------------------------------------------------------- /reporters/pyroscope_reporter/flameql/key.go: -------------------------------------------------------------------------------- 1 | package flameql 2 | 3 | import ( 4 | "errors" 5 | "strconv" 6 | "strings" 7 | "time" 8 | ) 9 | 10 | /* 11 | Copied from pyroscope-io/client 12 | */ 13 | type Key struct { 14 | labels map[string]string 15 | } 16 | 17 | type ParserState int 18 | 19 | const ( 20 | nameParserState ParserState = iota 21 | tagKeyParserState 22 | tagValueParserState 23 | doneParserState 24 | ) 25 | 26 | func NewKey(labels map[string]string) *Key { return &Key{labels: labels} } 27 | 28 | func ParseKey(name string) (*Key, error) { 29 | k := &Key{labels: make(map[string]string)} 30 | p := parser{parserState: nameParserState} 31 | var err error 32 | for _, r := range name + "{" { 33 | switch p.parserState { 34 | case nameParserState: 35 | err = p.nameParserCase(r, k) 36 | case tagKeyParserState: 37 | p.tagKeyParserCase(r) 38 | case tagValueParserState: 39 | err = p.tagValueParserCase(r, k) 40 | } 41 | if err != nil { 42 | return nil, err 43 | } 44 | } 45 | return k, nil 46 | } 47 | 48 | type parser struct { 49 | parserState ParserState 50 | key string 51 | value string 52 | } 53 | 54 | // ParseKey's nameParserState switch case 55 | func (p *parser) nameParserCase(r int32, k *Key) error { 56 | switch r { 57 | case '{': 58 | p.parserState = tagKeyParserState 59 | appName := strings.TrimSpace(p.value) 60 | if err := ValidateAppName(appName); err != nil { 61 | return err 62 | } 63 | k.labels["__name__"] = appName 64 | default: 65 | p.value += string(r) 66 | } 67 | return nil 68 | } 69 | 70 | // ParseKey's tagKeyParserState switch case 71 | func (p *parser) tagKeyParserCase(r 
int32) { 72 | switch r { 73 | case '}': 74 | p.parserState = doneParserState 75 | case '=': 76 | p.parserState = tagValueParserState 77 | p.value = "" 78 | default: 79 | p.key += string(r) 80 | } 81 | } 82 | 83 | // ParseKey's tagValueParserState switch case 84 | func (p *parser) tagValueParserCase(r int32, k *Key) error { 85 | switch r { 86 | case ',', '}': 87 | p.parserState = tagKeyParserState 88 | key := strings.TrimSpace(p.key) 89 | if !IsTagKeyReserved(key) { 90 | if err := ValidateTagKey(key); err != nil { 91 | return err 92 | } 93 | } 94 | k.labels[key] = strings.TrimSpace(p.value) 95 | p.key = "" 96 | default: 97 | p.value += string(r) 98 | } 99 | return nil 100 | } 101 | 102 | func (k *Key) SegmentKey() string { 103 | return k.Normalized() 104 | } 105 | 106 | func TreeKey(k string, depth int, unixTime int64) string { 107 | return k + ":" + strconv.Itoa(depth) + ":" + strconv.FormatInt(unixTime, 10) 108 | } 109 | 110 | func (k *Key) TreeKey(depth int, t time.Time) string { 111 | return TreeKey(k.Normalized(), depth, t.Unix()) 112 | } 113 | 114 | var errKeyInvalid = errors.New("invalid key") 115 | 116 | // ParseTreeKey retrieves tree time and depth level from the given key. 117 | func ParseTreeKey(k string) (time.Time, int, error) { 118 | a := strings.Split(k, ":") 119 | if len(a) < 3 { 120 | return time.Time{}, 0, errKeyInvalid 121 | } 122 | level, err := strconv.Atoi(a[1]) 123 | if err != nil { 124 | return time.Time{}, 0, err 125 | } 126 | v, err := strconv.Atoi(a[2]) 127 | if err != nil { 128 | return time.Time{}, 0, err 129 | } 130 | return time.Unix(int64(v), 0), level, err 131 | } 132 | 133 | func (k *Key) DictKey() string { 134 | return k.labels["__name__"] 135 | } 136 | 137 | // FromTreeToDictKey returns app name from tree key k: given tree key 138 | // "foo{}:0:1234567890", the call returns "foo". 139 | // 140 | // Before tags support, segment key form (i.e. app name + tags: foo{key=value}) 141 | // has been used to reference a dictionary (trie). 
142 | func FromTreeToDictKey(k string) string { 143 | return k[0:strings.IndexAny(k, "{")] 144 | } 145 | 146 | func (k *Key) Normalized() string { 147 | var sb strings.Builder 148 | 149 | sortedMap := New() 150 | for k, v := range k.labels { 151 | if k == "__name__" { 152 | sb.WriteString(v) 153 | } else { 154 | sortedMap.Put(k, v) 155 | } 156 | } 157 | 158 | sb.WriteString("{") 159 | for i, k := range sortedMap.Keys() { 160 | v := sortedMap.Get(k).(string) 161 | if i != 0 { 162 | sb.WriteString(",") 163 | } 164 | sb.WriteString(k) 165 | sb.WriteString("=") 166 | sb.WriteString(v) 167 | } 168 | sb.WriteString("}") 169 | 170 | return sb.String() 171 | } 172 | 173 | func (k *Key) AppName() string { 174 | return k.labels["__name__"] 175 | } 176 | 177 | func (k *Key) Labels() map[string]string { 178 | return k.labels 179 | } 180 | 181 | func (k *Key) Add(key, value string) { 182 | if value == "" { 183 | delete(k.labels, key) 184 | } else { 185 | k.labels[key] = value 186 | } 187 | } 188 | 189 | // Match reports whether the key matches the query. 
190 | func (k *Key) Clone() *Key { 191 | newMap := make(map[string]string) 192 | for k, v := range k.labels { 193 | newMap[k] = v 194 | } 195 | return &Key{labels: newMap} 196 | } 197 | 198 | func (k *Key) Match(q *Query) bool { 199 | if k.AppName() != q.AppName { 200 | return false 201 | } 202 | for _, m := range q.Matchers { 203 | var ok bool 204 | for labelKey, labelValue := range k.labels { 205 | if m.Key != labelKey { 206 | continue 207 | } 208 | if m.Match(labelValue) { 209 | if !m.IsNegation() { 210 | ok = true 211 | break 212 | } 213 | } else if m.IsNegation() { 214 | return false 215 | } 216 | } 217 | if !ok && !m.IsNegation() { 218 | return false 219 | } 220 | } 221 | return true 222 | } 223 | -------------------------------------------------------------------------------- /reporters/pyroscope_reporter/flameql/parse.go: -------------------------------------------------------------------------------- 1 | package flameql 2 | 3 | import ( 4 | "regexp" 5 | "sort" 6 | "strings" 7 | ) 8 | 9 | /* 10 | Copied from pyroscope-io/client 11 | */ 12 | 13 | // ParseQuery parses a string of $app_name<{<$tag_matchers>}> form. 14 | func ParseQuery(s string) (*Query, error) { 15 | s = strings.TrimSpace(s) 16 | q := Query{q: s} 17 | 18 | for offset, c := range s { 19 | switch c { 20 | case '{': 21 | if offset == 0 { 22 | return nil, ErrAppNameIsRequired 23 | } 24 | if s[len(s)-1] != '}' { 25 | return nil, newErr(ErrInvalidQuerySyntax, "expected } at the end") 26 | } 27 | m, err := ParseMatchers(s[offset+1 : len(s)-1]) 28 | if err != nil { 29 | return nil, err 30 | } 31 | q.AppName = s[:offset] 32 | q.Matchers = m 33 | return &q, nil 34 | default: 35 | if !IsAppNameRuneAllowed(c) { 36 | return nil, newErr(ErrInvalidAppName, s[:offset+1]) 37 | } 38 | } 39 | } 40 | 41 | if len(s) == 0 { 42 | return nil, ErrAppNameIsRequired 43 | } 44 | 45 | q.AppName = s 46 | return &q, nil 47 | } 48 | 49 | // ParseMatchers parses a string of $tag_matcher<,$tag_matchers> form. 
50 | func ParseMatchers(s string) ([]*TagMatcher, error) { 51 | var matchers []*TagMatcher 52 | for _, t := range split(s) { 53 | if t == "" { 54 | continue 55 | } 56 | m, err := ParseMatcher(strings.TrimSpace(t)) 57 | if err != nil { 58 | return nil, err 59 | } 60 | matchers = append(matchers, m) 61 | } 62 | if len(matchers) == 0 && len(s) != 0 { 63 | return nil, newErr(ErrInvalidMatchersSyntax, s) 64 | } 65 | sort.Sort(ByPriority(matchers)) 66 | return matchers, nil 67 | } 68 | 69 | // ParseMatcher parses a string of $tag_key$op"$tag_value" form, 70 | // where $op is one of the supported match operators. 71 | func ParseMatcher(s string) (*TagMatcher, error) { 72 | var tm TagMatcher 73 | var offset int 74 | var c rune 75 | 76 | loop: 77 | for offset, c = range s { 78 | r := len(s) - (offset + 1) 79 | switch c { 80 | case '=': 81 | switch { 82 | case r <= 2: 83 | return nil, newErr(ErrInvalidTagValueSyntax, s) 84 | case s[offset+1] == '"': 85 | tm.Op = OpEqual 86 | case s[offset+1] == '~': 87 | if r <= 3 { 88 | return nil, newErr(ErrInvalidTagValueSyntax, s) 89 | } 90 | tm.Op = OpEqualRegex 91 | default: 92 | // Just for more meaningful error message. 
93 | if s[offset+2] != '"' { 94 | return nil, newErr(ErrInvalidTagValueSyntax, s) 95 | } 96 | return nil, newErr(ErrUnknownOp, s) 97 | } 98 | break loop 99 | case '!': 100 | if r <= 3 { 101 | return nil, newErr(ErrInvalidTagValueSyntax, s) 102 | } 103 | switch s[offset+1] { 104 | case '=': 105 | tm.Op = OpNotEqual 106 | case '~': 107 | tm.Op = OpNotEqualRegex 108 | default: 109 | return nil, newErr(ErrUnknownOp, s) 110 | } 111 | break loop 112 | default: 113 | if !IsTagKeyRuneAllowed(c) { 114 | return nil, newInvalidTagKeyRuneError(s, c) 115 | } 116 | } 117 | } 118 | 119 | k := s[:offset] 120 | if IsTagKeyReserved(k) { 121 | return nil, newErr(ErrTagKeyReserved, k) 122 | } 123 | 124 | var v string 125 | var ok bool 126 | switch tm.Op { 127 | default: 128 | return nil, newErr(ErrMatchOperatorIsRequired, s) 129 | case OpEqual: 130 | v, ok = unquote(s[offset+1:]) 131 | case OpNotEqual, OpEqualRegex, OpNotEqualRegex: 132 | v, ok = unquote(s[offset+2:]) 133 | } 134 | if !ok { 135 | return nil, newErr(ErrInvalidTagValueSyntax, v) 136 | } 137 | 138 | // Compile regex, if applicable. 
139 | switch tm.Op { 140 | case OpEqualRegex, OpNotEqualRegex: 141 | r, err := regexp.Compile(v) 142 | if err != nil { 143 | return nil, newErr(err, v) 144 | } 145 | tm.R = r 146 | } 147 | 148 | tm.Key = k 149 | tm.Value = v 150 | return &tm, nil 151 | } 152 | 153 | func unquote(s string) (string, bool) { 154 | if s[0] != '"' || s[len(s)-1] != '"' { 155 | return s, false 156 | } 157 | return s[1 : len(s)-1], true 158 | } 159 | 160 | func split(s string) []string { 161 | var r []string 162 | var x int 163 | var y bool 164 | for i := 0; i < len(s); i++ { 165 | switch { 166 | case s[i] == ',' && !y: 167 | r = append(r, s[x:i]) 168 | x = i + 1 169 | case s[i] == '"': 170 | if y && i > 0 && s[i-1] != '\\' { 171 | y = false 172 | continue 173 | } 174 | y = true 175 | } 176 | } 177 | return append(r, s[x:]) 178 | } 179 | -------------------------------------------------------------------------------- /reporters/pyroscope_reporter/flameql/sortedmap.go: -------------------------------------------------------------------------------- 1 | package flameql 2 | 3 | import ( 4 | "sort" 5 | ) 6 | 7 | type SortedMap struct { 8 | data map[string]interface{} 9 | keys []string 10 | } 11 | 12 | func (s *SortedMap) Put(k string, v interface{}) { 13 | s.data[k] = v 14 | i := sort.Search(len(s.keys), func(i int) bool { return s.keys[i] >= k }) 15 | s.keys = append(s.keys, "") 16 | copy(s.keys[i+1:], s.keys[i:]) 17 | s.keys[i] = k 18 | } 19 | 20 | func (s *SortedMap) Get(k string) (v interface{}) { 21 | return s.data[k] 22 | } 23 | 24 | func (s *SortedMap) Keys() []string { 25 | return s.keys 26 | } 27 | 28 | func New() *SortedMap { 29 | return &SortedMap{ 30 | data: make(map[string]interface{}), 31 | keys: make([]string, 0), 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /reporters/pyroscope_reporter/pyroscope_client.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache 
Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package pyroscope_reporter 19 | 20 | import ( 21 | "bytes" 22 | "encoding/json" 23 | "fmt" 24 | "io/ioutil" 25 | "mime/multipart" 26 | "net/http" 27 | "net/url" 28 | "path/filepath" 29 | "strconv" 30 | "time" 31 | 32 | "mosn.io/holmes" 33 | 34 | mlog "mosn.io/pkg/log" 35 | ) 36 | 37 | /* 38 | Enable holmes to report pprof event to pyroscope as it's client. 
39 | */ 40 | 41 | type PyroscopeReporter struct { 42 | AppName string 43 | Tags map[string]string 44 | 45 | cfg RemoteConfig 46 | client *http.Client 47 | Logger mlog.ErrorLogger 48 | } 49 | 50 | func NewPyroscopeReporter(AppName string, tags map[string]string, cfg RemoteConfig, logger mlog.ErrorLogger) (*PyroscopeReporter, error) { 51 | appName, err := mergeTagsWithAppName(AppName, tags) 52 | if err != nil { 53 | return nil, err 54 | } 55 | 56 | reporter := &PyroscopeReporter{ 57 | cfg: cfg, 58 | client: &http.Client{ 59 | Transport: &http.Transport{ 60 | MaxConnsPerHost: cfg.UpstreamThreads, 61 | }, 62 | Timeout: cfg.UpstreamRequestTimeout, 63 | }, 64 | Logger: logger, 65 | AppName: appName, 66 | } 67 | 68 | // todo: holmes doesn't support auth token temporary 69 | 70 | return reporter, nil 71 | } 72 | 73 | // uploadProfile copied from pyroscope client 74 | func (r *PyroscopeReporter) uploadProfile(j *UploadJob) error { 75 | u, err := url.Parse(r.cfg.UpstreamAddress) 76 | if err != nil { 77 | return fmt.Errorf("url parse: %v", err) 78 | } 79 | 80 | body := &bytes.Buffer{} 81 | 82 | writer := multipart.NewWriter(body) 83 | fw, err := writer.CreateFormFile("profile", "profile.pprof") 84 | fw.Write(j.Profile) // nolint: errcheck 85 | if err != nil { 86 | return err 87 | } 88 | if j.PrevProfile != nil { 89 | fw, err = writer.CreateFormFile("prev_profile", "profile.pprof") 90 | fw.Write(j.PrevProfile) // nolint: errcheck 91 | if err != nil { 92 | return err 93 | } 94 | } 95 | if j.SampleTypeConfig != nil { 96 | fw, err = writer.CreateFormFile("sample_type_config", "sample_type_config.json") 97 | if err != nil { 98 | return err 99 | } 100 | b, err := json.Marshal(j.SampleTypeConfig) 101 | if err != nil { 102 | return err 103 | } 104 | fw.Write(b) 105 | } 106 | writer.Close() // nolint: errcheck 107 | 108 | q := u.Query() 109 | q.Set("name", j.Name) 110 | // TODO: I think these should be renamed to startTime / endTime 111 | q.Set("from", 
strconv.Itoa(int(j.StartTime.Unix()))) 112 | q.Set("until", strconv.Itoa(int(j.EndTime.Unix()))) 113 | q.Set("spyName", j.SpyName) 114 | q.Set("sampleRate", strconv.Itoa(int(j.SampleRate))) 115 | q.Set("units", j.Units) 116 | q.Set("aggregationType", j.AggregationType) 117 | 118 | u.Path = filepath.Join(u.Path, "/ingest") 119 | u.RawQuery = q.Encode() 120 | 121 | r.Logger.Debugf("uploading at %s", u.String()) 122 | 123 | // new a request for the job 124 | request, err := http.NewRequest("POST", u.String(), body) 125 | //r.Logger.Debugf("body is %s", body.String()) 126 | if err != nil { 127 | return fmt.Errorf("new http request: %v", err) 128 | } 129 | contentType := writer.FormDataContentType() 130 | r.Logger.Debugf("content type: %s", contentType) 131 | request.Header.Set("Content-Type", contentType) 132 | // request.Header.Set("Content-Type", "binary/octet-stream+"+string(j.Format)) 133 | 134 | if r.cfg.AuthToken != "" { 135 | request.Header.Set("Authorization", "Bearer "+r.cfg.AuthToken) 136 | } 137 | 138 | // do the request and get the response 139 | response, err := r.client.Do(request) 140 | if err != nil { 141 | return fmt.Errorf("do http request: %v", err) 142 | } 143 | defer response.Body.Close() // nolint: errcheck 144 | 145 | // read all the response body 146 | _, err = ioutil.ReadAll(response.Body) 147 | if err != nil { 148 | return fmt.Errorf("read response body: %v", err) 149 | } 150 | 151 | if response.StatusCode == 422 { 152 | return ErrUpgradeServer 153 | } 154 | if response.StatusCode != 200 { 155 | return ErrUpload 156 | } 157 | 158 | return nil 159 | } 160 | 161 | func (r *PyroscopeReporter) Report(ptype string, filename string, reason holmes.ReasonType, eventID string, sampleTime time.Time, pprofBytes []byte, scene holmes.Scene) error { 162 | endTime := sampleTime.Truncate(DefaultUploadRate) 163 | startTime := endTime.Add(-DefaultUploadRate) 164 | _, _, _, _, _ = ptype, filename, reason, eventID, scene 165 | stc := sampleTypeCfg(ptype) 166 | j 
:= &UploadJob{ 167 | Name: r.AppName, 168 | StartTime: startTime, 169 | EndTime: endTime, 170 | SpyName: "gospy", 171 | SampleRate: 100, 172 | Units: "samples", 173 | AggregationType: "sum", 174 | Format: Pprof, 175 | Profile: pprofBytes, 176 | SampleTypeConfig: stc, 177 | } 178 | 179 | if err := r.uploadProfile(j); err != nil { 180 | return err 181 | } 182 | return nil 183 | } 184 | 185 | func sampleTypeCfg(ptype string) map[string]*SampleType { 186 | switch ptype { 187 | case "heap": 188 | return heapSampleTypes 189 | case "goroutine": 190 | return goroutineSampleTypes 191 | } 192 | return nil 193 | } 194 | -------------------------------------------------------------------------------- /reporters/pyroscope_reporter/pyroscope_client_test.go: -------------------------------------------------------------------------------- 1 | package pyroscope_reporter 2 | 3 | import ( 4 | "log" 5 | "os" 6 | "testing" 7 | "time" 8 | 9 | "github.com/gin-gonic/gin" 10 | 11 | "mosn.io/holmes" 12 | ) 13 | 14 | var h *holmes.Holmes 15 | 16 | func TestMain(m *testing.M) { 17 | log.Println("holmes initialing") 18 | h, _ = holmes.New( 19 | holmes.WithCollectInterval("1s"), 20 | ) 21 | log.Println("holmes initial success") 22 | h.EnableMemDump().EnableGoroutineDump().EnableCPUDump().Start() 23 | time.Sleep(11 * time.Second) 24 | log.Println("on running") 25 | newMockServer() 26 | os.Exit(m.Run()) 27 | } 28 | 29 | var received = false 30 | 31 | func TestPyroscopeClient(t *testing.T) { 32 | 33 | cfg := RemoteConfig{ 34 | //AuthToken: "", 35 | //UpstreamThreads: 4, 36 | UpstreamAddress: "http://localhost:8080", 37 | UpstreamRequestTimeout: 3 * time.Second, 38 | } 39 | tags := map[string]string{ 40 | "region": "zh", 41 | } 42 | pReporter, err := NewPyroscopeReporter("holmes-client-01", tags, cfg, holmes.NewStdLogger()) 43 | if err != nil { 44 | log.Fatalf("NewPyroscopeReporter error %v", err) 45 | } 46 | 47 | err = h.Set( 48 | holmes.WithProfileReporter(pReporter), 49 | 
holmes.WithGoroutineDump(0, 0, 1, 2, time.Second), 50 | holmes.WithCPUDump(0, 2, 80, time.Second), 51 | holmes.WithMemDump(0, 1, 1, time.Second), 52 | holmes.WithCollectInterval("1s"), 53 | ) 54 | if err != nil { 55 | log.Fatalf("fail to set opts on running time.") 56 | } 57 | go cpuex() 58 | time.Sleep(20 * time.Second) 59 | if !received { 60 | t.Errorf("mock pyroscope server didn't received request") 61 | } 62 | } 63 | 64 | func cpuex() { 65 | go func() { 66 | for { 67 | time.Sleep(time.Millisecond) 68 | } 69 | }() 70 | } 71 | 72 | func newMockServer() { 73 | r := gin.New() 74 | r.POST("/ingest", ProfileUploadHandler) 75 | go r.Run() //nolint:errcheck // listen and serve on 0.0.0.0:8080 (for windows "localhost:8080") 76 | 77 | time.Sleep(time.Millisecond * 100) 78 | } 79 | 80 | func ProfileUploadHandler(c *gin.Context) { 81 | ret := map[string]interface{}{} 82 | ret["code"] = 1 83 | ret["message"] = "success" 84 | c.JSON(200, ret) 85 | received = true 86 | } 87 | -------------------------------------------------------------------------------- /reporters/reporter_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package reporters 19 | 20 | import ( 21 | "fmt" 22 | "log" 23 | "os" 24 | "testing" 25 | "time" 26 | 27 | "mosn.io/holmes" 28 | ) 29 | 30 | var h *holmes.Holmes 31 | 32 | func TestMain(m *testing.M) { 33 | log.Println("holmes initialing") 34 | h, _ = holmes.New( 35 | holmes.WithCollectInterval("1s"), 36 | holmes.WithDumpPath("./"), 37 | holmes.WithTextDump(), 38 | ) 39 | log.Println("holmes initial success") 40 | h.EnableGoroutineDump().EnableCPUDump().Start() 41 | time.Sleep(11 * time.Second) 42 | log.Println("on running") 43 | os.Exit(m.Run()) 44 | } 45 | 46 | var grReportCount int 47 | var cpuReportCount int 48 | var unknownReasonTypeErr error 49 | var sceneException error 50 | 51 | type mockReporter struct { 52 | } 53 | 54 | func (m *mockReporter) Report(pType string, filename string, reason holmes.ReasonType, eventID string, sampleTime time.Time, pprofBytes []byte, scene holmes.Scene) error { 55 | log.Printf("call %s , filename %s report \n", pType, filename) 56 | 57 | // read filename 58 | switch pType { 59 | case "goroutine": 60 | grReportCount++ 61 | case "cpu": 62 | cpuReportCount++ 63 | 64 | } 65 | 66 | if len(reason.String()) == 0 { // unknown reason type 67 | unknownReasonTypeErr = fmt.Errorf("reporter: unknown reason type") 68 | return unknownReasonTypeErr 69 | } 70 | 71 | { // test scene 72 | errPrefix := "reporter: scene exception ==> " 73 | if scene.TriggerAbs == 0 { 74 | sceneException = fmt.Errorf(errPrefix + "abs in configuration is 0") 75 | return sceneException 76 | } 77 | if scene.TriggerDiff == 0 { 78 | sceneException = fmt.Errorf(errPrefix + "diff in configuration is 0") 79 | return sceneException 80 | } 81 | } 82 | return nil 83 | } 84 | 85 | var grReopenReportCount int 86 | 87 | type mockReopenReporter struct { 88 | } 89 | 90 | func (m *mockReopenReporter) Report(pType string, filename string, reason 
holmes.ReasonType, eventID string, sampleTime time.Time, pprofBytes []byte, scene holmes.Scene) error { 91 | log.Printf("call %s report \n", pType) 92 | 93 | switch pType { 94 | case "goroutine": 95 | grReopenReportCount++ 96 | } 97 | 98 | if len(reason.String()) == 0 { // unknown reason type 99 | unknownReasonTypeErr = fmt.Errorf("reopen reporter: unknown reason type") 100 | return unknownReasonTypeErr 101 | } 102 | 103 | { // test scene 104 | errPrefix := "reopen reporter: scene exception ==> " 105 | if scene.TriggerAbs == 0 { 106 | sceneException = fmt.Errorf(errPrefix + "abs in configuration is 0") 107 | return sceneException 108 | } 109 | if scene.TriggerDiff == 0 { 110 | sceneException = fmt.Errorf(errPrefix + "diff in configuration is 0") 111 | return sceneException 112 | } 113 | } 114 | return nil 115 | } 116 | 117 | func TestReporter(t *testing.T) { 118 | grReportCount = 0 119 | cpuReportCount = 0 120 | unknownReasonTypeErr = nil 121 | sceneException = nil 122 | 123 | r := &mockReporter{} 124 | err := h.Set( 125 | holmes.WithProfileReporter(r), 126 | holmes.WithGoroutineDump(5, 10, 20, 90, time.Second), 127 | holmes.WithCPUDump(0, 2, 80, time.Second), 128 | holmes.WithCollectInterval("5s"), 129 | ) 130 | if err != nil { 131 | log.Fatalf("fail to set opts on running time.") 132 | } 133 | go cpuex() 134 | time.Sleep(10 * time.Second) 135 | 136 | if grReportCount == 0 { 137 | log.Fatalf("not grReport") 138 | } 139 | 140 | if cpuReportCount == 0 { 141 | log.Fatalf("not cpuReport") 142 | } 143 | 144 | if unknownReasonTypeErr != nil { 145 | log.Fatalf(unknownReasonTypeErr.Error()) 146 | } 147 | 148 | if sceneException != nil { 149 | log.Fatalf(sceneException.Error()) 150 | } 151 | 152 | // test reopen feature 153 | h.Stop() 154 | h.Start() 155 | grReopenReportCount = 0 156 | _ = h.Set( 157 | holmes.WithProfileReporter(&mockReopenReporter{})) 158 | time.Sleep(10 * time.Second) 159 | 160 | time.Sleep(5 * time.Second) 161 | 162 | if grReopenReportCount == 0 { 163 | 
log.Fatalf("fail to reopen") 164 | } 165 | } 166 | 167 | func TestReporterReopen(t *testing.T) { 168 | grReportCount = 0 169 | cpuReportCount = 0 170 | r := &mockReporter{} 171 | err := h.Set( 172 | holmes.WithProfileReporter(r), 173 | holmes.WithGoroutineDump(5, 10, 20, 90, time.Second), 174 | holmes.WithCPUDump(0, 2, 80, time.Second), 175 | holmes.WithCollectInterval("5s"), 176 | holmes.WithDumpToLogger(true), 177 | ) 178 | if err != nil { 179 | log.Fatalf("fail to set opts on running time.") 180 | } 181 | go cpuex() 182 | time.Sleep(10 * time.Second) 183 | 184 | if grReportCount == 0 { 185 | log.Fatalf("not grReport") 186 | } 187 | 188 | if cpuReportCount == 0 { 189 | log.Fatalf("not cpuReport") 190 | } 191 | 192 | // test reopen feature 193 | h.DisableProfileReporter() 194 | 195 | h.EnableProfileReporter() 196 | 197 | grReopenReportCount = 0 198 | _ = h.Set( 199 | holmes.WithProfileReporter(&mockReopenReporter{})) 200 | time.Sleep(10 * time.Second) 201 | 202 | time.Sleep(5 * time.Second) 203 | 204 | if grReopenReportCount == 0 { 205 | log.Fatalf("fail to reopen") 206 | } 207 | } 208 | 209 | func cpuex() { 210 | go func() { 211 | var ch = make(chan struct{}) 212 | for { 213 | select { 214 | case <-ch: 215 | // do nothing 216 | default: 217 | continue 218 | } 219 | } 220 | }() 221 | } 222 | -------------------------------------------------------------------------------- /ring.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package holmes 19 | 20 | type ring struct { 21 | data []int 22 | idx int 23 | sum int 24 | maxLen int 25 | } 26 | 27 | func newRing(maxLen int) ring { 28 | return ring{ 29 | data: make([]int, 0, maxLen), 30 | idx: 0, 31 | maxLen: maxLen, 32 | } 33 | } 34 | 35 | func (r *ring) push(i int) { 36 | if r.maxLen == 0 { 37 | return 38 | } 39 | 40 | // the first round 41 | if len(r.data) < r.maxLen { 42 | r.sum += i 43 | r.data = append(r.data, i) 44 | return 45 | } 46 | 47 | r.sum += i - r.data[r.idx] 48 | 49 | // the ring is expanded, just write to the position 50 | r.data[r.idx] = i 51 | r.idx = (r.idx + 1) % r.maxLen 52 | } 53 | 54 | func (r *ring) avg() int { 55 | // Check if the len(r.data) is zero before dividing 56 | if r.maxLen == 0 || len(r.data) == 0 { 57 | return 0 58 | } 59 | return r.sum / len(r.data) 60 | } 61 | 62 | func (r *ring) sequentialData() []int { 63 | index := r.idx 64 | slice := make([]int, r.maxLen) 65 | // len(r.data) < r.maxLen ( cap > len ), index is not incremented. >>> (r.data = append(r.data, i)). r.idx starts scrolling only when the array is full. 
66 | if index == 0 { 67 | copy(slice, r.data) 68 | return slice 69 | } 70 | copy(slice, r.data[index:]) 71 | copy((slice)[r.maxLen-index:], r.data[:index]) 72 | return slice 73 | } 74 | -------------------------------------------------------------------------------- /ring_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package holmes 19 | 20 | import ( 21 | "testing" 22 | 23 | "github.com/stretchr/testify/assert" 24 | ) 25 | 26 | func TestEmptyRing(t *testing.T) { 27 | var r = newRing(0) 28 | assert.Equal(t, r.avg(), 0) 29 | 30 | r = newRing(1) 31 | assert.Equal(t, r.avg(), 0) 32 | } 33 | 34 | func TestRing(t *testing.T) { 35 | var cases = []struct { 36 | slice []int 37 | maxLen int 38 | avg int 39 | }{ 40 | { 41 | slice: []int{1, 2, 3}, 42 | maxLen: 10, 43 | avg: 2, 44 | }, 45 | { 46 | slice: []int{1, 2, 3}, 47 | maxLen: 1, 48 | avg: 3, 49 | }, 50 | } 51 | 52 | for _, cas := range cases { 53 | var r = newRing(cas.maxLen) 54 | for _, elem := range cas.slice { 55 | r.push(elem) 56 | } 57 | assert.Equal(t, r.avg(), cas.avg) 58 | } 59 | } 60 | 61 | func Test_ring_humanData(t *testing.T) { 62 | r := newRing(5) 63 | var cases = []struct { 64 | except []int 65 | }{ 66 | { 67 | except: []int{1, 0, 0, 0, 0}, 68 | }, 69 | { 70 | except: []int{1, 2, 0, 0, 0}, 71 | }, 72 | { 73 | except: []int{1, 2, 3, 0, 0}, 74 | }, 75 | { 76 | except: []int{1, 2, 3, 4, 0}, 77 | }, 78 | { 79 | except: []int{1, 2, 3, 4, 5}, 80 | }, 81 | { 82 | except: []int{2, 3, 4, 5, 6}, 83 | }, 84 | { 85 | except: []int{3, 4, 5, 6, 7}, 86 | }, 87 | { 88 | except: []int{4, 5, 6, 7, 8}, 89 | }, 90 | { 91 | except: []int{5, 6, 7, 8, 9}, 92 | }, 93 | { 94 | except: []int{6, 7, 8, 9, 10}, 95 | }, 96 | } 97 | for i := 0; i < 10; i++ { 98 | r.push(i + 1) 99 | assert.Equal(t, r.sequentialData(), cases[i].except) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /tool/build-example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # make sure the example can be compiled 4 | 5 | set -e 6 | set -x 7 | 8 | examples=`ls example` 9 | 10 | for file in $examples; do 11 | echo $file 12 | cd example/$file 13 | 14 | go mod tidy 15 | rm -rf vendor 16 | go build . 
17 | 18 | cd - 19 | done 20 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package holmes 19 | 20 | import ( 21 | "bytes" 22 | "fmt" 23 | "io/ioutil" 24 | "math" 25 | "os" 26 | "path/filepath" 27 | "runtime" 28 | "runtime/pprof" 29 | "strconv" 30 | "strings" 31 | "time" 32 | 33 | mem_util "github.com/shirou/gopsutil/mem" 34 | "github.com/shirou/gopsutil/process" 35 | ) 36 | 37 | // copied from https://github.com/containerd/cgroups/blob/318312a373405e5e91134d8063d04d59768a1bff/utils.go#L251 38 | func parseUint(s string, base, bitSize int) (uint64, error) { 39 | v, err := strconv.ParseUint(s, base, bitSize) 40 | if err != nil { 41 | intValue, intErr := strconv.ParseInt(s, base, bitSize) 42 | // 1. Handle negative values greater than MinInt64 (and) 43 | // 2. 
Handle negative values lesser than MinInt64 44 | if intErr == nil && intValue < 0 { 45 | return 0, nil 46 | } else if intErr != nil && 47 | intErr.(*strconv.NumError).Err == strconv.ErrRange && 48 | intValue < 0 { 49 | return 0, nil 50 | } 51 | return 0, err 52 | } 53 | return v, nil 54 | } 55 | 56 | // copied from https://github.com/containerd/cgroups/blob/318312a373405e5e91134d8063d04d59768a1bff/utils.go#L243 57 | func readUint(path string) (uint64, error) { 58 | v, err := ioutil.ReadFile(path) 59 | if err != nil { 60 | return 0, err 61 | } 62 | return parseUint(strings.TrimSpace(string(v)), 10, 64) 63 | } 64 | 65 | // only reserve the top n. 66 | func trimResultTop(buffer bytes.Buffer) []byte { 67 | index := TrimResultTopN 68 | arr := strings.SplitN(buffer.String(), "\n\n", TrimResultTopN+1) 69 | 70 | if len(arr) <= TrimResultTopN { 71 | index = len(arr) - 1 72 | } 73 | 74 | return []byte(strings.Join(arr[:index], "\n\n")) 75 | } 76 | 77 | // only reserve the front n bytes 78 | func trimResultFront(buffer bytes.Buffer) []byte { 79 | if buffer.Len() <= TrimResultMaxBytes { 80 | return buffer.Bytes() 81 | } 82 | return buffer.Bytes()[:TrimResultMaxBytes-1] 83 | } 84 | 85 | // return values: 86 | // 1. cpu percent, not division cpu cores yet, 87 | // 2. RSS mem in bytes, 88 | // 3. goroutine num, 89 | // 4. thread num 90 | func getUsage() (float64, uint64, int, int, error) { 91 | p, err := process.NewProcess(int32(os.Getpid())) 92 | if err != nil { 93 | return 0, 0, 0, 0, err 94 | } 95 | cpuPercent, err := p.Percent(time.Second) 96 | if err != nil { 97 | return 0, 0, 0, 0, err 98 | } 99 | 100 | mem, err := p.MemoryInfo() 101 | if err != nil { 102 | return 0, 0, 0, 0, err 103 | } 104 | 105 | rss := mem.RSS 106 | gNum := runtime.NumGoroutine() 107 | tNum := getThreadNum() 108 | 109 | return cpuPercent, rss, gNum, tNum, nil 110 | } 111 | 112 | // get cpu core number limited by CGroup. 
113 | func getCGroupCPUCore() (float64, error) { 114 | var cpuQuota uint64 115 | 116 | cpuPeriod, err := readUint(cgroupCpuPeriodPath) 117 | if cpuPeriod == 0 || err != nil { 118 | return 0, err 119 | } 120 | 121 | if cpuQuota, err = readUint(cgroupCpuQuotaPath); err != nil { 122 | return 0, err 123 | } 124 | 125 | return float64(cpuQuota) / float64(cpuPeriod), nil 126 | } 127 | 128 | func getCGroupMemoryLimit() (uint64, error) { 129 | usage, err := readUint(cgroupMemLimitPath) 130 | if err != nil { 131 | return 0, err 132 | } 133 | machineMemory, err := mem_util.VirtualMemory() 134 | if err != nil { 135 | return 0, err 136 | } 137 | limit := uint64(math.Min(float64(usage), float64(machineMemory.Total))) 138 | return limit, nil 139 | } 140 | 141 | func getNormalMemoryLimit() (uint64, error) { 142 | machineMemory, err := mem_util.VirtualMemory() 143 | if err != nil { 144 | return 0, err 145 | } 146 | return machineMemory.Total, nil 147 | } 148 | 149 | func getThreadNum() int { 150 | return pprof.Lookup("threadcreate").Count() 151 | } 152 | 153 | // cpu mem goroutine thread err. 
154 | func collect(cpuCore float64, memoryLimit uint64) (int, int, int, int, error) { 155 | cpu, mem, gNum, tNum, err := getUsage() 156 | if err != nil { 157 | return 0, 0, 0, 0, err 158 | } 159 | 160 | // The default percent is from all cores, multiply by cpu core 161 | // but it's inconvenient to calculate the proper percent 162 | // here we divide by core number, so we can set a percent bar more intuitively 163 | cpuPercent := cpu / cpuCore 164 | 165 | memPercent := float64(mem) / float64(memoryLimit) * 100 166 | 167 | return int(cpuPercent), int(memPercent), gNum, tNum, nil 168 | } 169 | 170 | func matchRule(history ring, curVal, ruleMin, ruleAbs, ruleDiff, ruleMax int) (bool, ReasonType) { 171 | // should bigger than rule min 172 | if curVal < ruleMin { 173 | return false, ReasonCurlLessMin 174 | //fmt.Sprintf("curVal [%d]< ruleMin [%d]", curVal, ruleMin) 175 | } 176 | 177 | // if ruleMax is enable and current value bigger max, skip dumping 178 | if ruleMax != NotSupportTypeMaxConfig && curVal >= ruleMax { 179 | return false, ReasonCurGreaterMax 180 | } 181 | 182 | // the current peak load exceed the absolute value 183 | if curVal > ruleAbs { 184 | return true, ReasonCurGreaterAbs 185 | // fmt.Sprintf("curVal [%d] > ruleAbs [%d]", curVal, ruleAbs) 186 | } 187 | 188 | // the peak load matches the rule 189 | avg := history.avg() 190 | if curVal >= avg*(100+ruleDiff)/100 { 191 | return true, ReasonDiff 192 | // fmt.Sprintf("curVal[%d] >= avg[%d]*(100+ruleDiff)/100", curVal, avg) 193 | } 194 | return false, ReasonCurlGreaterMin 195 | } 196 | 197 | func getBinaryFileName(filePath string, dumpType configureType, eventID string) string { 198 | suffix := time.Now().Format("20060102150405.000") + ".log" 199 | if len(eventID) == 0 { 200 | return filepath.Join(filePath, check2name[dumpType]+"."+suffix) 201 | } 202 | 203 | return filepath.Join(filePath, check2name[dumpType]+"."+eventID+"."+suffix) 204 | } 205 | 206 | // fix #89 207 | func getBinaryFileNameAndCreate(dump 
string, dumpType configureType, eventID string) (*os.File, string, error) { 208 | filePath := getBinaryFileName(dump, dumpType, eventID) 209 | f, err := os.OpenFile(filePath, defaultLoggerFlags, defaultLoggerPerm) 210 | if err != nil && os.IsNotExist(err) { 211 | if err = os.MkdirAll(dump, 0o755); err != nil { 212 | return nil, filePath, err 213 | } 214 | f, err = os.OpenFile(filePath, defaultLoggerFlags, defaultLoggerPerm) 215 | if err != nil { 216 | return nil, filePath, err 217 | } 218 | } 219 | return f, filePath, err 220 | } 221 | 222 | func writeFile(data bytes.Buffer, dumpType configureType, dumpOpts *DumpOptions, eventID string) (string, error) { 223 | var buf []byte 224 | if dumpOpts.DumpProfileType == textDump && !dumpOpts.DumpFullStack { 225 | switch dumpType { 226 | case mem, gcHeap, goroutine: 227 | buf = trimResultTop(data) 228 | case thread: 229 | buf = trimResultFront(data) 230 | default: 231 | buf = data.Bytes() 232 | } 233 | } else { 234 | buf = data.Bytes() 235 | } 236 | 237 | file, fileName, err := getBinaryFileNameAndCreate(dumpOpts.DumpPath, dumpType, eventID) 238 | if err != nil { 239 | return fileName, fmt.Errorf("pprof %v open file failed : %w", type2name[dumpType], err) 240 | } 241 | defer file.Close() //nolint:errcheck,gosec 242 | 243 | if _, err = file.Write(buf); err != nil { 244 | return fileName, fmt.Errorf("pprof %v write to file failed : %w", type2name[dumpType], err) 245 | } 246 | return fileName, nil 247 | } 248 | --------------------------------------------------------------------------------