├── .gitignore ├── AUTHOR ├── CHANGE ├── LICENSE ├── README.md ├── doc ├── congestion.md ├── log.md ├── retransmission.md └── transmission.md ├── image ├── congestion1.jpg ├── congestion2.jpg ├── http1.jpg ├── http2.jpg ├── lab.png ├── online.png ├── retransmission1.jpg ├── retransmission2.jpg └── transmission.jpg ├── rpm └── tcpdive-2.6.32-431.17.1.el6-1.0-stable.x86_64.rpm ├── script ├── make_rpm.sh ├── tcpdive └── tcpdive.spec ├── src ├── close.stp ├── congestion.stp ├── estab.stp ├── http.stp ├── memory.stp ├── options.stp ├── recv.stp ├── reset.stp ├── retrans.stp ├── rtt.stp ├── send.stp ├── share.stp └── structs.stp └── tcpdive.sh /.gitignore: -------------------------------------------------------------------------------- 1 | bak 2 | temp 3 | *.swp 4 | *.ko 5 | *.patch 6 | -------------------------------------------------------------------------------- /AUTHOR: -------------------------------------------------------------------------------- 1 | Shaokai Zhang 2 | Blog http://zhangskd.com 3 | -------------------------------------------------------------------------------- /CHANGE: -------------------------------------------------------------------------------- 1 | 2016/01/01 v1.0 A stable version of tcpdive for kernel-2.6.32-431.17.1.el6.x86_64 released. 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. 
By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. 
If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. 
You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 
113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. 
If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 
292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 
331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [tcpdive](https://github.com/fastos/tcpdive) - A TCP performance profiling tool 2 | 3 | Table of Contents 4 | ----------------------------------------- 5 | * [Introduction](#introduction) 6 | * [Background](#background) 7 | * [Advantages](#advantages) 8 | * [Scenarios](#scenarios) 9 | * [Functionality](#functionality) 10 | * [Transmission](#transmission) 11 | * [Loss and Retransmission](#loss-and-retransmission) 12 | * [Congestion Control](#congestion-control) 13 | * [HTTP Processing](#http-processing) 14 | * [Usage](#usage) 15 | * [Kernel Version](#kernel-version) 16 | * [Packages](#packages) 17 | * [Quick Start](#quick-start) 18 | * [Full Guide](#full-guide) 19 | * [Performance](#performance) 20 | * [Lab](#lab) 21 | * [Online](#online) 22 | * [Contact](#contact) 23 | 24 | ## INTRODUCTION ## 25 | 26 | #### Background #### 27 | 28 | Over the last decade, network conditions have changed a lot due to the rapid evolution 29 | and popularity of technologies such as Mobile Networking (2G/3G/4G/WiFi), 30 | Cloud Computing, etc. However, as the most commonly used transport layer protocol, 31 | TCP was not designed to deal with such complicated scenarios. As a result, some companies 32 | have been optimizing TCP to achieve a better user experience.
33 | 34 | However, when it comes to TCP performance optimization, we may be disappointed to find that 35 | there are few suitable tools available. Utilities such as tcpdump, netstat and ss are 36 | not focused on the TCP protocol itself. They provide little performance information 37 | about TCP internals. 38 | 39 | For the reasons mentioned above, we decided to develop tcpdive - a TCP performance 40 | profiling tool. Tcpdive is designed to provide an insight into TCP, by monitoring 41 | and analysing mass data collected from a running linux kernel. Based on systemtap, 42 | tcpdive requires no kernel modifications, which makes it easy to deploy and friendly 43 | to use. 44 | 45 | #### Advantages #### 46 | 47 | Compared to existing tools, tcpdive has some advantages: 48 | - Far more performance information about TCP internals. 49 | - Quantitatively evaluate the improvement of TCP performance. 50 | - Describe the processing of HTTP in the TCP layer, which is independent of HTTP Apps. 51 | - Easy to deploy and friendly to use. 52 | 53 | #### Scenarios #### 54 | 55 | Tcpdive has been deployed in the production environment of SINA to: 56 | - Accelerate the improvement of service quality for Weibo image. 57 | - Accelerate the improvement of service quality for Weibo video. 58 | - Compare the performance of TCP in wired and wireless networks. 59 | - Characterise TCP traffic of different applications. 60 | 61 | ## FUNCTIONALITY ## 62 | 63 | #### Transmission #### 64 | 65 | Performance indicators listed below are used to describe a TCP connection's transmission. 66 | For the meaning of each performance indicator, please refer to [TRANSMISSION](doc/transmission.md). 67 | 68 | ![Transmission](image/transmission.jpg "Transmission") 69 | 70 | #### Loss and Retransmission #### 71 | 72 | TCP uses two primary mechanisms to detect and recover from losses.
73 | - Fast retransmit, triggered by dupacks 74 | - Timeout, triggered by timer 75 | 76 | The first is fast retransmit, where TCP performs a retransmission of the missing segment after 77 | receiving a certain number of duplicate ACKs. As a fallback, whenever fast retransmit is 78 | unsuccessful or when a sender does not receive enough duplicate ACKs, TCP uses the second 79 | mechanism where it waits for a duration of RTO before deducing that a segment is lost and 80 | then performs retransmission. 81 | 82 | ![Retrans Mechanism](image/retransmission1.jpg "Loss detection and recovery") 83 | 84 | Tcpdive can distinguish between Fast retransmit and Timeout, figuring out how much time a connection 85 | spends on each kind of retransmission mechanism. For the meaning of each performance indicator, 86 | please refer to [RETRANSMISSION](doc/retransmission.md). 87 |
88 | 89 | ![Loss and Retransmission](image/retransmission2.jpg "Loss and Retransmission") 90 | 91 | #### Congestion Control #### 92 | 93 | Cubic is a TCP congestion control algorithm and the current default in Linux. 94 | As its name suggests, the window growth function of Cubic is a cubic 95 | function which consists of three components: 96 | - The first is a concave region where the window quickly ramps up to the window size before the last congestion event. 97 | - The middle is a plateau between the concave and convex regions which allows the window size to stabilize for a period of time. 98 | - The last is the convex region where it probes for more bandwidth rapidly. 99 | 100 | Tcpdive explores TCP's congestion control mainly through the design and implementation of Cubic. 101 | Performance indicators listed below are used to profile Cubic. For the meaning of each performance indicator, 102 | please refer to [CONGESTION](doc/congestion.md). 103 | 104 | ![Congestion Control](image/congestion1.jpg "Congestion Control") 105 | 106 | What if we want a more dynamic profile of Cubic rather than just some averaged performance indicators? 107 | Tcpdive supports five kinds of critical points, by the use of which we can depict the fluctuation of a connection. 108 | To figure out what information a critical point can convey, please refer to [CONGESTION](doc/congestion.md). 109 | 110 | ![Advanced Congestion Control](image/congestion2.jpg "Advanced Congestion Control") 111 | 112 | 113 | #### HTTP Processing #### 114 | 115 | As we all know, HTTP is a request-response based protocol. That means the client will initiate 116 | a communication by sending a request and the HTTP server will respond back by sending a response. 117 | While tcpdive is designed to profile the performance of the TCP protocol, it can also be used to 118 | monitor every http request and response. HTTP Keep-Alive is supported.
119 | 120 | ![HTTP Processing](image/http1.jpg "HTTP Processing") 121 | 122 | For a pair of http request and response, performance indicators listed below are provided. 123 | 124 | ![HTTP performance indicators](image/http2.jpg "HTTP performance indicators") 125 | 126 | Tcpdive can profile HTTP processing to some degree, as illustrated above. 127 | Note that all the work is done in the TCP layer, which means it is independent of HTTP applications. 128 | 129 | ## USAGE ## 130 | 131 | #### KERNEL VERSION #### 132 | 133 | Although no kernel modification is required, tcpdive itself is kernel version dependent. 134 | Currently, tcpdive is developed on **kernel 2.6.32-431.17.1** of CentOS 6.5 which is the main system of 135 | our production environment. More kernel versions will be supported in the future. 136 | So check your kernel version first. 137 | If you don't have this version of the kernel, please download the related kernel rpms from [here](http://vault.centos.org/6.5/updates/x86_64/Packages/). 138 | 139 | #### PACKAGES #### 140 | 141 | For a quick start, only **systemtap-runtime** is required. 142 | To make full use of tcpdive, some extra packages are required: 143 | - gcc 144 | - elfutils 145 | - systemtap 146 | - kernel-devel 147 | - kernel-debuginfo 148 | - kernel-debuginfo-common 149 | 150 | Use the rpm command to install gcc, elfutils and systemtap if they are not installed. 151 | Kernel-debuginfo and kernel-debuginfo-common packages of this kernel version can be 152 | downloaded from [here](http://debuginfo.centos.org/6/x86_64/). 153 | 154 | #### QUICK START #### 155 | 156 | Install tcpdive-2.6.32-431.17.1.el6-1.0-stable.x86_64.rpm in [rpm](rpm/). 157 | Modify /usr/bin/tcpdive to change the default configuration. 158 | 159 | LOG_NAME="tcpdive.log" # log file name 160 | LOG_SIZE="500" # per log file's upper size(MB) 161 | LOG_NUM=20 # max number of log files 162 | PORTS="80" # server ports concerned, eg. 80,8080 163 | 164 | Let's get tcpdive to work.
165 | 166 | tcpdive {start|stop|status} 167 | 168 | After running "tcpdive start", you will find a log in the current directory. 169 | Every line in the log represents a TCP connection which is profiled from 170 | multiple dimensions such as Transmission, Loss and Retransmission, and HTTP Processing. 171 | For the default log format of tcpdive, please refer to [LOG](doc/log.md). 172 | 173 | #### FULL GUIDE #### 174 | 175 | To make full use of tcpdive, some flexible and customized ways of usage are suggested. 176 | Make sure [packages](#packages) are already installed. 177 | 178 | Run "./tcpdive.sh -h" to see the help information of tcpdive. 179 | Note that there are two kinds of log formats supported; check [LOG](doc/log.md) for more details. 180 | 181 | USAGE: 182 | ./tcpdive.sh [options] [modules] [filters] 183 | 184 | OPTIONS: 185 | -h # show help 186 | -V # show version 187 | -v # verbose mode for debugging 188 | -t # stop itself after running specified time 189 | -m # compile as tcpdive.ko instead of running directly 190 | -d # detailed logging instead of default format 191 | 192 | MODULES: 193 | -L # Loss and Retransmission 194 | -H # HTTP Performance (1.0/1.1) 195 | -C # Congestion Control 196 | -A # Advanced CC (depict critical points) 197 | -R # Monitor Reset Packet 198 | 199 | FILTERS: 200 | -l # lifetime of connection should greater than 201 | -i # trans time of response should greater than 202 | -s # take one sample from connections 203 | -p # server ports cared, use comma to separate 204 | 205 | -f :-: [-f <...>] # should be last 206 | eg. -f *.*.*.*:80-10.210.136.*:* 207 | 208 | 209 | **1. RUN DIRECTLY** 210 | 211 | Compile and run directly; use Ctrl+C to stop if the -t option is not specified. 212 | Below are some examples. 213 | 214 | ./tcpdive.sh -L -t 60 // or 215 | ./tcpdive.sh -d -L -H -p 80 // or 216 | ./tcpdive.sh -v -L -H -C -f *.*.*.*:8080-10.210.136.*:* 217 | 218 | **2.
MODULE WAY** 219 | 220 | By specifying -m option, we can get a module named tcpdive.ko instead of running directly. 221 | 222 | ./tcpdive.sh -L -t 60 -m // or 223 | ./tcpdive.sh -d -L -H -p 80,8080 -m // or 224 | ./tcpdive.sh -v -L -H -C -R -A 10 -m 225 | 226 | Load tcpdive.ko and run tcpdive in background. 227 | 228 | staprun -D -o log tcpdive.ko // or 229 | staprun -D -S 500,20 -o log tcpdive.ko port_str="80,8080" 230 | 231 | To stop the running of tcpdive, use the following command. 232 | 233 | ps aux|grep stap|grep tcpdive|grep -v grep|awk '{print $2}'|xargs kill 234 | 235 | **3. RPM WAY** 236 | 237 | To deploy tcpdive in production environment, use [make_rpm.sh](script/make_rpm.sh) to 238 | configure and make rpm package. 239 | 240 | sh script/make_rpm.sh 241 | 242 | After installing the rpm, a convenient way of usage is available. 243 | 244 | tcpdive {start|stop|status} 245 | 246 | ## PERFORMANCE ## 247 | 248 | We evaluate tcpdive mainly by comparing the usage of system resources with and without it. 249 | While tcpdive does increase the CPU usage to some degree, it has no significant influence on 250 | other system resources. Commonly used functionalities of tcpdive such as Transmission, Loss and 251 | Retransmission, and HTTP Processing are enabled in the following tests. 252 | 253 | #### LAB #### 254 | 255 | Brief configuration of Nginx: 256 | - Worker number is set to the number of CPU cores, which is 12 in our testbed. 257 | - HTTP keep-alive is disabled for a short connection test. 258 | 259 | 10-Gigabit network cards are used with their tx/rx queues bound. 260 | Http_load running on client fetches a 1KB file from Nginx with different degrees of concurrency to 261 | vary the CPU load of server. 262 | 263 | Here's a figure to demonstrate the increment of Per-core CPU Usage and the decrement of QPS induced 264 | by tcpdive under different circumstances. 
265 | 266 | ![Lab Evaluation](image/lab.png "Lab Evaluation") 267 | 268 | The figure above shows Per-core CPU consumption of tcpdive is less than 10% while QPS is not significantly 269 | influenced, which we believe is acceptable in most cases. However, on condition that Per-core CPU usage 270 | is already very high (greater than 60%), the use of tcpdive is not recommended because QPS will drop a 271 | lot to make room for tcpdive. 272 | 273 | #### ONLINE #### 274 | 275 | As mentioned before, tcpdive has already been deployed in the production environment of SINA. 276 | One typical scenario is using tcpdive on HAProxy server which acts as the load balancer of image service. 277 | The figure below shows the fluctuation of Per-core CPU usage of a 24-core server within 24 hours. 278 | Two 10-Gigabit network cards are used with their tx/rx queues bound. 279 | 280 | ![Online Evaluation](image/online.png "Online Evaluation") 281 | 282 | Based on months of observation, we conclude that tcpdive is stable and robust enough to be 283 | deployed in the production environment. 284 | 285 | ## CONTACT ## 286 | 287 | Blog: [zhangskd.com](http://zhangskd.com) 288 | Email: zhangskd@gmail.com 289 | 290 | If you have any questions, please leave a message on my blog or email me. 291 | -------------------------------------------------------------------------------- /doc/congestion.md: -------------------------------------------------------------------------------- 1 | # Congestion Control 2 | 3 | * [First Loss](#first-loss) 4 | * [Slow Start](#slow-start) 5 | * [Standard](#standard) 6 | * [ACK Train Length](#ack-train-length) 7 | * [Delay Increase](#delay-increase) 8 | * [Abort](#abort) 9 | * [Cong Avoid](#cong-avoid) 10 | * [Epoch Phase](#epoch-phase) 11 | * [Epoch Point](#epoch-point) 12 | * [Advanced CC](#advanced-cc) 13 | 14 | ## First Loss ## 15 | 16 | **fl_phase** 17 | The phase in which the first loss occurs, slow start or congestion avoidance. The value is {ss|cong}. 
18 | 19 | **fl_cwnd** 20 | The congestion window size when first loss happens. The unit is MSS. 21 | 22 | **fl_rtt** 23 | The RTT when first loss happens. The unit is ms. 24 | 25 | ## Slow Start ## 26 | 27 | There are three kinds of slow start algorithms used by Cubic. 28 | - Standard 29 | - ACK Train Length 30 | - Delay Increase 31 | 32 | Besides, when slow start quits due to loss or connection close, we call it Abort. 33 | 34 | #### STANDARD #### 35 | 36 | **std_ss_cnt** 37 | The number of completed Standard slow start. 38 | 39 | **std_ss_time** 40 | Average time of Standard slow start. The unit is RTT. 41 | 42 | **std_start_cwnd** 43 | Average start cwnd of Standard slow start. The unit is MSS. 44 | 45 | **std_end_cwnd** 46 | Average end cwnd of Standard slow start. The unit is MSS. 47 | 48 | #### ACK TRAIN LENGTH #### 49 | 50 | **ack_ss_cnt** 51 | The number of completed ACK Train Length slow start. 52 | 53 | **ack_ss_time** 54 | Average time of ACK Train Length slow start. The unit is RTT. 55 | 56 | **ack_start_cwnd** 57 | Average start cwnd of ACK Train Length slow start. The unit is MSS. 58 | 59 | **ack_end_cwnd** 60 | Average end cwnd of ACK Train Length slow start. The unit is MSS. 61 | 62 | #### DELAY INCREASE #### 63 | 64 | **delay_ss_cnt** 65 | The number of completed Delay Increase slow start. 66 | 67 | **delay_ss_time** 68 | Average time of Delay Increase slow start. The unit is RTT. 69 | 70 | **delay_start_cwnd** 71 | Average start cwnd of Delay Increase slow start. The unit is MSS. 72 | 73 | **delay_end_cwnd** 74 | Average end cwnd of Delay Increase slow start. The unit is MSS. 75 | 76 | #### ABORT #### 77 | 78 | **abort_ss_cnt** 79 | The number of unfinished slow start. 80 | 81 | **abort_ss_time** 82 | Average time of Abort slow start. The unit is RTT. 83 | 84 | **abort_start_cwnd** 85 | Average start cwnd of Abort slow start. The unit is MSS. 86 | 87 | **abort_end_cwnd** 88 | Average end cwnd of Abort slow start. The unit is MSS. 
89 | 90 | ## Cong Avoid ## 91 | 92 | Cubic uses a cubic function of the elapsed time from the last congestion event. 93 | The region between two consecutive congestion events is called an epoch. So actually 94 | the congestion avoidance of Cubic is composed of several epochs. 95 | 96 | #### EPOCH PHASE #### 97 | 98 | An epoch period consists of three phases: 99 | - Searching phase, ramps up to the window size before the last congestion event. 100 | - Stable phase, allows the window size to stabilize for a period. 101 | - Max probing phase, probes for more bandwidth. 102 | 103 | **epoch_cnt** 104 | The number of epochs experienced by a connection. 105 | 106 | **epoch_time** 107 | Total time of epochs. The unit is RTT. 108 | 109 | **search_cnt** 110 | The number of searching phases experienced by a connection. 111 | 112 | **search_time** 113 | Total time of searching phases. The unit is RTT. 114 | 115 | **probe_cnt** 116 | The number of probing phases experienced by a connection. 117 | 118 | **probe_time** 119 | Total time of probing phases. The unit is RTT. 120 | 121 | #### EPOCH POINT #### 122 | 123 | **ep_start_cnt** 124 | The number of epochs' start points. 125 | 126 | **ep_start_cwnd** 127 | Average cwnd of epochs' start points. 128 | 129 | **ep_start_rtt** 130 | Average RTT of epochs' start points. 131 | 132 | **ep_steady_cnt** 133 | The number of epochs' steady points. 134 | 135 | **ep_steady_cwnd** 136 | Average cwnd of epochs' steady points. 137 | 138 | **ep_steady_rtt** 139 | Average RTT of epochs' steady points. 140 | 141 | **ep_end_cnt** 142 | The number of epochs' end points. 143 | 144 | **ep_end_cwnd** 145 | Average cwnd of epochs' end points. 146 | 147 | **ep_end_rtt** 148 | Average RTT of epochs' end points. 149 | 150 | ## Advanced CC ## 151 | 152 | Tcpdive uses five kinds of critical points to depict the fluctuation of a connection. 
153 | A critical point looks like: 154 | 155 | [point] DATE local remote id cwnd rtt time msg 156 | 157 | **point** 158 | The name of a critical point. 159 | - SS start, enter slow start. 160 | - SS end, exit slow start. 161 | - EP start, start point of an epoch. 162 | - EP steady, stable point of an epoch. 163 | - EP end, end point of an epoch. 164 | 165 | **DATE** 166 | year/month/day 167 | 168 | **local** 169 | DIP:DPORT, ip and port of the server. 170 | 171 | **remote** 172 | SIP:SPORT, ip and port of the client. 173 | 174 | **id** 175 | ISN of the server. 176 | 177 | **cwnd** 178 | Congestion window size, the unit is MSS. 179 | 180 | **rtt** 181 | Round trip time, the unit is ms. 182 | 183 | **time** 184 | Time elapsed from the establishment of a connection, the unit is ms. 185 | 186 | **msg** 187 | The messages conveyed by a critical point. 188 | - null, nothing. 189 | - std, use Standard slow start. 190 | - ack, use ACK Train Length slow start. 191 | - delay, use Delay Increase slow start. 192 | - abort, slow start aborts. 193 | - close, the connection is closing. 194 | - search, in searching phase of an epoch. 195 | - probe, in probing phase of an epoch. 196 | 197 | -------------------------------------------------------------------------------- /doc/log.md: -------------------------------------------------------------------------------- 1 | # Log Format 2 | 3 | * [Default Format](#default-format) 4 | * [Detailed Format](#detailed-format) 5 | * [Packet-based Message](#packet-based-message) 6 | 7 | ## DEFAULT FORMAT ## 8 | 9 | Each line of log files represents a TCP connection which is profiled by lots of performance indicators. 10 | Performance indicators in the same line are recorded in "name=value" way, separated by commas. 11 | A performance indicator's value can be -1 in case it hasn't been initialized. 12 | 13 | Performance indicators composing a log line are recorded in the following sequence. 14 | 15 | **1. 
Transmission, default** 16 | 17 | date, start, end, local, remote, id, // Conn ID 18 | data, time, packet, synack_rtx, accept_wait, // Basic 19 | small_swnd, zero_awnd, rst_flag, from_state, to_state, // Exception 20 | init_cwnd, end_cwnd, init_ssthr, end_ssthr, // Cwnd / SSthresh 21 | rtt_avg, rtt_min, rtt_max, rtt_cnt, // RTT 22 | rto_avg, rto_min, rto_max, rto_cnt, // RTO 23 | 24 | **2. Loss & Retransmission, -L option** 25 | 26 | fr_ev, fr_repkts, fr_wait, fr_rec, fr_undo, // Fast retransmit 27 | to_ev, to_repkts, to_wait, to_rec, to_undo, // Timeout 28 | 29 | **3. Congestion control, -C option** 30 | 31 | First Loss 32 | 33 | fl_phase, fl_cwnd, fl_rtt, // First Loss 34 | 35 | 36 | Slow Start 37 | 38 | std_ss_cnt, std_ss_time, std_start_cwnd, std_end_cwnd, // Standard 39 | ack_ss_cnt, ack_ss_time, ack_start_cwnd, ack_end_cwnd, // ACK Train Length 40 | delay_ss_cnt, delay_ss_time, delay_start_cwnd, delay_end_cwnd, // Delay Increase 41 | abort_ss_cnt, abort_ss_time, abort_start_cwnd, abort_end_cwnd, // Abort 42 | 43 | Congestion Avoidance 44 | 45 | cwnd_unlimit, fast_converg, 46 | epoch_cnt, epoch_time, search_cnt, search_time, probe_cnt, probe_time, // Epoch phase 47 | ep_start_cnt, ep_start_cwnd, ep_start_rtt, // Start point 48 | ep_steady_cnt, ep_steady_cwnd, ep_steady_rtt, // Steady point 49 | ep_end_cnt, ep_end_cwnd, ep_end_rtt, // End point 50 | 51 | **4. HTTP information, -H option** 52 | 53 | req_count, 54 | num, time, acked_data, req_wait, resp_wait, trans_time, // First pair 55 | ... 56 | num, time, acked_data, req_wait, resp_wait, trans_time // Last pair 57 | 58 | #### EXAMPLE ##### 59 | 60 | Below is a log line profiling a TCP connection with -L, -C and -H options enabled. 
61 | 62 | 2015/9/8,start=19:12:09,end=19:12:19,local=10.210.136.54:80,remote=10.210.136.53:19497,id=909432482, 63 | data=1048830,time=10219,packet=726,synack_rtx=0,accept_wait=1, 64 | small_swnd=35,zero_awnd=0,rst_flag=0,from_state=FIN_WAIT1,to_state=FIN_WAIT2, 65 | init_cwnd=10,end_cwnd=5,init_ssthr=2147483647,end_ssthr=93, 66 | rtt_avg=557,rtt_min=155,rtt_max=976,rtt_cnt=442, 67 | rto_avg=805,rto_min=416,rto_max=1262,rto_cnt=442, 68 | fr_ev=-1,fr_repkts=-1,fr_wait=-1,fr_rec=-1,fr_undo=-1, 69 | to_ev=1,to_repkts=12,to_wait=1577,to_rec=1242,to_undo=0, 70 | fl_phase=cong,fl_cwnd=133,fl_rtt=939, 71 | std_ss_cnt=-1,std_ss_time=-1,std_start_cwnd=-1,std_end_cwnd=-1, 72 | ack_ss_cnt=-1,ack_ss_time=-1,ack_start_cwnd=-1,ack_end_cwnd=-1, 73 | delay_ss_cnt=1,delay_ss_time=2,delay_start_cwnd=10,delay_end_cwnd=29, 74 | abort_ss_cnt=-1,abort_ss_time=-1,abort_start_cwnd=-1,abort_end_cwnd=-1, 75 | cwnd_unlimit=39,fast_converg=0, 76 | epoch_cnt=1,epoch_time=9,search_cnt=-1,search_time=-1,probe_cnt=1,probe_time=9, 77 | ep_start_cnt=1,ep_start_cwnd=30,ep_start_rtt=227, 78 | ep_steady_cnt=-1,ep_steady_cwnd=-1,ep_steady_rtt=-1, 79 | ep_end_cnt=1,ep_end_cwnd=133,ep_end_rtt=939, 80 | req_count=1, 81 | num=1,time=19:12:09,acked_data=1048830,req_wait=0,resp_wait=1,trans_time=10218 82 | 83 | ## DETAILED FORMAT ## 84 | 85 | Besides the default format, a more human-readable alternative is provided for realtime analysis, 86 | which is called detailed format. Instead of a log line, several tables are used to describe a TCP connection. 87 | 88 | No. 
| TABLE NAME | OPTION | WHEN TO DISPLAY 89 | 1 | TRANS | - | always 90 | 2 | RTT | - | always 91 | 3 | RETRANS | -L | has loss and retransmission 92 | 4 | FIRST LOSS | -C | has loss 93 | 5 | SLOW START | -C | has slow start 94 | 6 | CONG PHASE | -C | has congestion avoidance 95 | 7 | CONG POINT | -C | has congestion avoidance 96 | 8 | HTTP | -H | use HTTP protocol 97 | 98 | #### EXAMPLE ONE#### 99 | 100 | Below is a TCP connection profiled using -L, -C and -H options, with detailed format enabled. 101 | 102 | ========================================================================== 103 | 2015/12/30,start=16:48:55,end=16:48:56,id=890624153 104 | local=10.210.136.54:8080,remote=10.210.136.53:28686 105 | 106 | TRANS TABLE 107 | data 1048815 B 108 | time 898 ms 109 | packet 726 pkts 110 | synack_rtx 0 pkts 111 | accept_wait 0 ms 112 | small_swnd 1 113 | zero_awnd 0 114 | rst_flag 0 115 | from_state FIN_WAIT1 116 | to_state FIN_WAIT2 117 | init_cwnd 10 118 | end_cwnd 63 119 | init_ssthresh 2147483647 120 | end_ssthresh 29 121 | 122 | RTT TABLE avg min max cnt 123 | RTT(ms) 48 8 102 399 124 | RTO(ms) 247 201 267 399 125 | 126 | SLOW START TABLE count s_cwnd e_cwnd time(RTT) 127 | Delay Increase 1 10 29 2 128 | 129 | CONG PHASE TABLE count time(RTT) 130 | epoch 1 13 131 | probing 1 13 132 | 133 | CONG POINT TABLE count cwnd rtt 134 | start 1 30 17 135 | end 1 63 65 136 | 137 | HTTP TABLE time ack_data req_wait resp_wait trans_time 138 | Num.1 16:48:55 1048815 0 0 898 139 | 140 | #### EXAMPLE TWO#### 141 | 142 | Below is a TCP connection profiled using -L, -C and -H options, with detailed format enabled. 
143 | 144 | ========================================================================== 145 | 2015/12/30,start=16:07:31,end=16:07:38,id=1395054193 146 | local=10.210.136.54:8080,remote=10.210.136.53:60212 147 | 148 | TRANS TABLE 149 | data 234812 B 150 | time 6367 ms 151 | packet 163 pkts 152 | synack_rtx 0 pkts 153 | accept_wait 4 ms 154 | small_swnd 36 155 | zero_awnd 0 156 | rst_flag 0 157 | from_state LAST_ACK 158 | to_state CLOSE 159 | init_cwnd 10 160 | end_cwnd 11 161 | init_ssthresh 11 162 | end_ssthresh 9 163 | 164 | RTT TABLE avg min max cnt 165 | RTT(ms) 302 75 456 91 166 | RTO(ms) 574 347 698 92 167 | 168 | RETRANS TABLE events pkts wa_time rec_time undo 169 | Fast recovery 2 8 879 421 0 170 | Timeout 1 8 689 958 0 171 | TO in Recovery 1 8 689 958 172 | 173 | FIRST LOSS TABLE phase cwnd rtt 174 | cong 17 416 175 | 176 | SLOW START TABLE count s_cwnd e_cwnd time(RTT) 177 | Standard 3 8 10 0 178 | 179 | CONG PHASE TABLE count time(RTT) 180 | epoch 3 9 181 | searching 1 1 182 | probing 2 8 183 | 184 | CONG POINT TABLE count cwnd rtt 185 | start 3 11 319 186 | end 3 13 380 187 | 188 | HTTP TABLE time ack_data req_wait resp_wait trans_time 189 | Num.1 16:07:31 218884 7 5 6115 190 | 191 | ## Packet-based Message ## 192 | 193 | Although most of log messages are based on connection, there are some based on packet. 194 | 195 | **Monitor Reset Packet, -R option** 196 | 197 | [TX RST], DATE, local, remote // Active RST sent 198 | [RX RST], DATE, local, remote, state // RST received 199 | 200 | Below are some examples. 201 | 202 | [TX RST],2015/9/8,19:12:10,local=10.210.136.54:80,remote=10.210.136.53:19450 203 | [RX RST],2015/9/8,19:20:09,local=10.210.136.54:80,remote=10.210.136.53:14678,state=CLOSE_WAIT 204 | 205 | **Advanced Congestion Control, -A option** 206 | 207 | [point], DATE, local, remote, id, cwnd, rtt, time, msg // Critical point 208 | 209 | Below are some critical points by the use of which we can depict the fluctuation of a connection. 
210 | 211 | [SS start],2015/9/8,19:12:09,local=10.210.136.54:80,remote=10.210.136.53:19497,id=909432482,cwnd=10,rtt=140,time=156,msg=null 212 | [SS end],2015/9/8,19:12:09,local=10.210.136.54:80,remote=10.210.136.53:19497,id=909432482,cwnd=30,rtt=227,time=407,msg=delay 213 | [EP start],2015/9/8,19:12:09,local=10.210.136.54:80,remote=10.210.136.53:19497,id=909432482,cwnd=30,rtt=227,time=407,msg=probe 214 | [EP end],2015/9/8,19:12:18,local=10.210.136.54:80,remote=10.210.136.53:19497,id=909432482,cwnd=133,rtt=939,time=8977,msg=probe 215 | [SS start],2015/9/8,19:12:18,local=10.210.136.54:80,remote=10.210.136.53:19497,id=909432482,cwnd=2,rtt=784,time=9535,msg=null 216 | [SS end],2015/9/8,19:12:19,local=10.210.136.54:80,remote=10.210.136.53:19497,id=909432482,cwnd=5,rtt=459,time=10219,msg=close 217 | 218 | 219 | -------------------------------------------------------------------------------- /doc/retransmission.md: -------------------------------------------------------------------------------- 1 | # Loss and Retransmission 2 | 3 | * [Fast retransmit](#fast-retransmit) 4 | * [Timeout](#timeout) 5 | 6 | ## Fast retransmit ## 7 | 8 | **fr_ev** 9 | The number of Fast retransmit events that occurred during the lifetime of a connection. 10 | 11 | **fr_repkts** 12 | The number of packets retransmitted during Fast retransmit events. 13 | 14 | **fr_wait** 15 | The wait time of Fast retransmit events. 16 | 17 | **fr_rec** 18 | The recovery time of Fast retransmit events. 19 | 20 | **fr_undo** 21 | The number of times Fast retransmit events turned out to be spurious. 22 | 23 | ## Timeout ## 24 | 25 | **to_ev** 26 | The number of Timeout events that occurred during the lifetime of a connection. 27 | 28 | **to_repkts** 29 | The number of packets retransmitted during Timeout events. 30 | 31 | **to_wait** 32 | The wait time of Timeout events. 33 | 34 | **to_rec** 35 | The recovery time of Timeout events. 36 | 37 | **to_undo** 38 | The number of times Timeout events turned out to be spurious. 
39 | 40 | Furthermore, we classify Timeout events according to the congestion state during which they happen. 41 | - Open 42 | - Disorder 43 | - CWR 44 | - Recovery 45 | - Loss 46 | 47 | -------------------------------------------------------------------------------- /doc/transmission.md: -------------------------------------------------------------------------------- 1 | # Transmission 2 | 3 | * [Conn ID](#conn-id) 4 | * [Basic](#basic) 5 | * [Exception](#exception) 6 | * [Cwnd / SSthresh](#cwnd-ssthresh) 7 | * [RTT / RTO](#rtt-rto) 8 | 9 | ## Conn ID ## 10 | 11 | **date** 12 | year/month/day 13 | 14 | **start** 15 | hour:minute:second, the time when a connection is established. 16 | 17 | **end** 18 | hour:minute:second, the time when a connection is closed. 19 | 20 | **local** 21 | DIP:DPORT, ip and port of the server. 22 | 23 | **remote** 24 | SIP:SPORT, ip and port of the client. 25 | 26 | **id** 27 | ISN of the server. 28 | 29 | ## Basic ## 30 | 31 | **data** 32 | The amount of data sent out by a connection, the unit is Byte. 33 | 34 | **time** 35 | Lifetime of a connection, the unit is millisecond. 36 | 37 | **packet** 38 | The number of packets sent out by a connection. 39 | 40 | **synack_rtx** 41 | The number of times the SYNACK is retransmitted. 42 | 43 | **accept_wait** 44 | Time elapsed between a connection being established and being accepted. The unit is ms. 45 | 46 | ## Exception ## 47 | 48 | **small_swnd** 49 | The number of times the send window of the server is less than MSS. 50 | 51 | **zero_awnd** 52 | The number of times the advertised window of the client is zero. 53 | 54 | **rst_flag** 55 | Whether a packet with the RST flag set is sent or received by the connection. 56 | 57 | **from_state** 58 | The state of a connection before monitoring is finished. 59 | 60 | **to_state** 61 | The state of a connection after monitoring is finished. 62 | 63 | ## Cwnd / SSthresh ## 64 | 65 | **init_cwnd** 66 | Initial congestion window size, the unit is MSS. 
67 | 68 | **end_cwnd** 69 | Final congestion window size, the unit is MSS. 70 | 71 | **init_ssthr** 72 | Initial slow start threshold, the unit is MSS. 73 | 74 | **end_ssthr** 75 | Final slow start threshold, the unit is MSS. 76 | 77 | ## RTT / RTO ## 78 | 79 | **rtt_avg** 80 | Average RTT of a connection, the unit is ms. 81 | 82 | **rtt_min** 83 | Minimal RTT of a connection, the unit is ms. 84 | 85 | **rtt_max** 86 | Maximal RTT of a connection, the unit is ms. 87 | 88 | **rtt_cnt** 89 | The number of RTT samples. 90 | 91 | **rto_avg** 92 | Average RTO of a connection, the unit is ms. 93 | 94 | **rto_min** 95 | Minimal RTO of a connection, the unit is ms. 96 | 97 | **rto_max** 98 | Maximal RTO of a connection, the unit is ms. 99 | 100 | **rto_cnt** 101 | The number of RTO samples. 102 | 103 | -------------------------------------------------------------------------------- /image/congestion1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastos/tcpdive/02b575f5fc11535a619bc80c196e9c123546384b/image/congestion1.jpg -------------------------------------------------------------------------------- /image/congestion2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastos/tcpdive/02b575f5fc11535a619bc80c196e9c123546384b/image/congestion2.jpg -------------------------------------------------------------------------------- /image/http1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastos/tcpdive/02b575f5fc11535a619bc80c196e9c123546384b/image/http1.jpg -------------------------------------------------------------------------------- /image/http2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastos/tcpdive/02b575f5fc11535a619bc80c196e9c123546384b/image/http2.jpg 
-------------------------------------------------------------------------------- /image/lab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastos/tcpdive/02b575f5fc11535a619bc80c196e9c123546384b/image/lab.png -------------------------------------------------------------------------------- /image/online.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastos/tcpdive/02b575f5fc11535a619bc80c196e9c123546384b/image/online.png -------------------------------------------------------------------------------- /image/retransmission1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastos/tcpdive/02b575f5fc11535a619bc80c196e9c123546384b/image/retransmission1.jpg -------------------------------------------------------------------------------- /image/retransmission2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastos/tcpdive/02b575f5fc11535a619bc80c196e9c123546384b/image/retransmission2.jpg -------------------------------------------------------------------------------- /image/transmission.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastos/tcpdive/02b575f5fc11535a619bc80c196e9c123546384b/image/transmission.jpg -------------------------------------------------------------------------------- /rpm/tcpdive-2.6.32-431.17.1.el6-1.0-stable.x86_64.rpm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fastos/tcpdive/02b575f5fc11535a619bc80c196e9c123546384b/rpm/tcpdive-2.6.32-431.17.1.el6-1.0-stable.x86_64.rpm -------------------------------------------------------------------------------- /script/make_rpm.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Build an rpm package of tcpdive for one specific kernel version. Exits non-zero if the module fails to compile or rpmbuild fails. 4 | 5 | # Kernel Version 6 | KERNEL_VER="2.6.32-431.17.1.el6.x86_64" 7 | KERNEL_NAME=`echo $KERNEL_VER|sed 's/\.x86_64//'` 8 | 9 | if [ ! -d "/lib/modules/$KERNEL_VER" ]; then 10 | echo "kernel-devel-$KERNEL_VER.rpm is not installed!" 11 | exit 1 12 | fi 13 | 14 | # Tool Version 15 | TOOL_VER="1.0" 16 | TOOL_RELEASE="stable" 17 | DIR=`dirname $0` 18 | CMD_FILE="$DIR/tcpdive" 19 | SPEC_FILE="$DIR/tcpdive.spec" 20 | MODULE="tcpdive.ko" 21 | 22 | # Revise cmd file 23 | sed -i "4s/.*/TOOL_VER=$TOOL_VER/" $CMD_FILE 24 | sed -i "5s/.*/TOOL_RELEASE=$TOOL_RELEASE/" $CMD_FILE 25 | sed -i "7s/.*/KERNEL=$KERNEL_VER/" $CMD_FILE 26 | 27 | # Revise spec file 28 | sed -i "1s/.*/%define kversion $KERNEL_NAME/" $SPEC_FILE 29 | sed -i "4s/.*/Version: $TOOL_VER/" $SPEC_FILE 30 | sed -i "5s/.*/Release: $TOOL_RELEASE/" $SPEC_FILE 31 | 32 | # Compile module 33 | # use "../tcpdive.sh -h" to see all options supported. 34 | $DIR/../tcpdive.sh -m -L -H > /dev/null 35 | if [[ $? -ne 0 || ! -f $MODULE ]]; then 36 | echo "Compile $MODULE failed!" 37 | exit 1 38 | fi 39 | 40 | # Build rpm 41 | RPMROOT="/root/rpmbuild" 42 | TAR_NAME="tcpdive-$KERNEL_NAME-$TOOL_VER" 43 | 44 | mkdir -p $TAR_NAME 45 | mv $MODULE $TAR_NAME 46 | cp $CMD_FILE $TAR_NAME 47 | tar -czf $TAR_NAME.tar.gz $TAR_NAME/ 48 | rm -rf $TAR_NAME 49 | 50 | mkdir -p $RPMROOT/SOURCES 51 | mv $TAR_NAME.tar.gz $RPMROOT/SOURCES 52 | # Keep rpmbuild's exit status: the cleanup below must still run, but the script has to fail when the build fails. 53 | rpmbuild -bb $SPEC_FILE &> $RPMROOT/log 54 | RET=$? 55 | if [ $RET -eq 0 ]; then 56 | echo "Make rpm package ok!" 57 | mv $RPMROOT/RPMS/x86_64/${TAR_NAME}*.rpm . 58 | rm -f $RPMROOT/RPMS/x86_64/tcpdive* 59 | else 60 | echo "Make rpm package failed!" 
61 | cat $RPMROOT/log 62 | fi 63 | 64 | rm -f $MODULE 65 | rm -f $RPMROOT/log 66 | rm -rf $RPMROOT/SOURCES/${TAR_NAME}* 67 | rm -rf $RPMROOT/BUILD/${TAR_NAME}* 68 | exit $RET 69 | -------------------------------------------------------------------------------- /script/tcpdive: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Version 4 | TOOL_VER=1.0 5 | TOOL_RELEASE=stable 6 | 7 | KERNEL=2.6.32-431.17.1.el6.x86_64 8 | KERNEL_NAME=`echo $KERNEL|sed 's/\.x86_64//'` 9 | 10 | MODULE="/lib/modules/$KERNEL/kernel/net/ipv4/tcpdive.ko" 11 | RPM="tcpdive-$KERNEL_NAME-$TOOL_VER-$TOOL_RELEASE.x86_64.rpm" 12 | 13 | # Log 14 | LOG_NAME="tcpdive.log" # log file name 15 | LOG_SIZE=500 # per log file's upper size(MB) 16 | LOG_NUM=20 # max number of log files 17 | 18 | # Port filter 19 | PORTS="80" # server ports concerned, e.g. 80,8080 20 | 21 | start() { 22 | staprun -D -S $LOG_SIZE,$LOG_NUM -o $LOG_NAME $MODULE \ 23 | port_str=$PORTS > /dev/null 24 | } 25 | 26 | stop() { 27 | PID=`ps aux|grep stap|grep tcpdive|grep -v grep|awk '{print $2}'` 28 | kill -s SIGINT $PID 29 | } 30 | 31 | # 0 - running, 1 - stopped 32 | status() { 33 | local ret=0 34 | ps aux|grep stap|grep tcpdive|grep -v grep &> /dev/null || ret=1 35 | return $ret 36 | } 37 | 38 | check() { 39 | # Check Permission 40 | if [ $UID -ne 0 ]; then 41 | echo "please run as root." 42 | exit 1 43 | fi 44 | 45 | # Check installation 46 | RPM_NAME=`echo $RPM|sed 's/\.rpm//'` 47 | rpm -qa|grep $RPM_NAME &> /dev/null 48 | if [ $? -ne 0 ]; then 49 | echo "$RPM is not installed." 50 | exit 1 51 | fi 52 | 53 | if [ ! -f $MODULE ]; then 54 | MODULE="/lib/modules/$KERNEL_NAME/kernel/net/ipv4/tcpdive.ko" 55 | fi 56 | 57 | if [ ! -f $MODULE ]; then 58 | echo "$MODULE is missing!" 59 | echo "please reinstall $RPM." 
60 | exit 1 61 | fi 62 | 63 | # Check kernel version 64 | KVER=`uname -r` 65 | if [ "$KVER" != "$KERNEL" ]; then 66 | echo "Current kernel version: $KVER" 67 | echo "Installed tcpdive is for kernel version: $KERNEL" 68 | echo "Please install the right version of tcpdive!" 69 | exit 1 70 | fi 71 | } 72 | 73 | case $1 in 74 | start) 75 | check 76 | status 77 | ret=$? 78 | if [ $ret -eq 0 ]; then 79 | echo "tcpdive is already running..." 80 | else 81 | # Check corner case 82 | lsmod|grep tcpdive|grep -v grep &> /dev/null 83 | if [ $? -eq 0 ]; then 84 | echo "tcpdive.ko is already insmod." 85 | echo "Do some cleaning before reinstall." 86 | exit 1 87 | fi 88 | 89 | start 90 | sleep 1 91 | status 92 | if [ $? -eq 0 ]; then 93 | echo "tcpdive is started." 94 | else 95 | echo "tcpdive start failed!" 96 | fi 97 | fi 98 | ;; 99 | stop) 100 | status 101 | ret=$? 102 | if [ $ret -eq 0 ]; then 103 | stop 104 | sleep 1 105 | echo "tcpdive is stopped." 106 | else 107 | echo "tcpdive is already stopped." 108 | fi 109 | ;; 110 | status) 111 | status 112 | ret=$? 113 | if [ $ret -eq 0 ]; then 114 | echo "tcpdive is running..." 115 | else 116 | echo "tcpdive is stopped." 117 | fi 118 | ;; 119 | *) 120 | echo "Usage: $0 {start|stop|status}" 121 | exit 1;; 122 | esac 123 | 124 | 125 | -------------------------------------------------------------------------------- /script/tcpdive.spec: -------------------------------------------------------------------------------- 1 | %define kversion 2.6.32-431.17.1.el6 2 | 3 | Name: tcpdive-%{kversion} 4 | Version: 1.0 5 | Release: stable 6 | Summary: A TCP performance profiling tool 7 | Group: System Environment/Kernel 8 | License: GPL 9 | Source0: %{name}-%{version}.tar.gz 10 | BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) 11 | 12 | Requires: kernel = %{kversion} systemtap-runtime >= 2.3 13 | 14 | %description 15 | A TCP performance profiling tool. 
16 | 17 | %prep 18 | 19 | %setup -q 20 | 21 | %build 22 | 23 | %install 24 | mkdir -p %{buildroot}/usr/bin 25 | mkdir -p %{buildroot}/lib/modules/%{kversion}/kernel/net/ipv4 26 | install -m 744 tcpdive %{buildroot}/usr/bin 27 | install -m 744 tcpdive.ko %{buildroot}/lib/modules/%{kversion}/kernel/net/ipv4 28 | 29 | %clean 30 | rm -rf %{buildroot} 31 | # The default-attribute directive only affects entries listed after it, so it comes first in the files section. 32 | %files 33 | %defattr(-,root,root,-) 34 | /usr/bin/tcpdive 35 | /lib/modules/%{kversion}/kernel/net/ipv4/tcpdive.ko 36 | %doc 37 | 38 | %changelog 39 | * Thu Dec 31 2015 zhangsk 40 | - 1.0 First release 41 | 42 | 43 | -------------------------------------------------------------------------------- /src/close.stp: -------------------------------------------------------------------------------- 1 | # Close 2 | # log stap_info right before connection being closed. 3 | 4 | %{ 5 | #include 6 | #include 7 | 8 | struct stap_info; 9 | %} 10 | 11 | /** 12 | * When a connection is closing. 13 | */ 14 | probe close.Close = 15 | kernel.function("tcp_set_state") 16 | { 17 | if (!$sk || !$sk->sk_protinfo) 18 | next 19 | 20 | new_state = $state 21 | if (new_state == %{TCP_CLOSE%} || 22 | new_state == %{TCP_FIN_WAIT2%}) { 23 | if (!mem_is_stop()) { 24 | check_rst($sk, new_state) 25 | close_update($sk) 26 | 27 | if (filter_output($sk)) { 28 | /* Default */ 29 | log_conn_id($sk) 30 | log_trans($sk, new_state) 31 | log_cwnd_ssthr($sk) 32 | log_rtt_rto($sk) 33 | 34 | /* Optional */ 35 | log_retrans($sk) 36 | log_cong($sk) 37 | log_http($sk) 38 | printf("\n") 39 | } 40 | } 41 | 42 | mem_free($sk) 43 | } 44 | } 45 | 46 | function filter_output:long (sk:long) 47 | { 48 | ret = 1 49 | 50 | if (%{stap_options.bitmap & STAPF_HTTP%}) 51 | ret &= filter_http_transtime(sk) 52 | 53 | ret &= filter_conn_lifetime(sk) 54 | 55 | return ret 56 | } 57 | 58 | function filter_conn_lifetime:long (sk:long) 59 | { 60 | return get_conn_lifetime(sk) >= 
%{stap_options.lifetime%} 61 | } 62 | 63 | function close_update(sk:long) 64 | { 65 | 
cwnd_ssthr_update(sk) 66 | cong_close_update(sk) 67 | } 68 | /* Snapshot the socket's current snd_cwnd and snd_ssthresh into stap_info as end_cwnd / end_ssthr. */ 69 | function cwnd_ssthr_update(sk:long) 70 | %{ 71 | struct sock *sk = (struct sock *)STAP_ARG_sk; 72 | struct stap_info *info = sk->sk_protinfo; 73 | struct tcp_sock *tp = tcp_sk(sk); 74 | 75 | info->end_cwnd = tp->snd_cwnd; 76 | info->end_ssthr = tp->snd_ssthresh; 77 | %} 78 | 79 | /** 80 | * Make sure memory is freed in any condition: when a socket that still has sk_protinfo set is destructed, release it with mem_free(). 81 | */ 82 | probe close.Destruct = 83 | kernel.function("inet_sock_destruct") 84 | { 85 | if (!$sk->sk_protinfo) 86 | next 87 | 88 | mem_free($sk) 89 | } 90 | 91 | /** 92 | * Capture SIGINT and SIGTERM whose pid_name is "stapio" to quit: set the stop flag (only once, guarded by mem_is_stop()) and then call mem_free_active(). 93 | */ 94 | probe close.Signal = 95 | signal.send 96 | { 97 | if (!mem_is_stop() && 98 | (sig_name == "SIGINT" || sig_name == "SIGTERM") 99 | && pid_name == "stapio") { 100 | mem_set_stop() 101 | mem_free_active() 102 | } 103 | } 104 | 105 | /** 106 | * Log ID of a connection: date, start/end time, address string and id (isn). With stap_options.detail_log set, a separator line and a two-line header are printed; otherwise a single comma-separated prefix with no trailing newline. NOTE(review): tm_year+1900 and tm_mon+1 presumably follow struct tm conventions - confirm where the tm field is filled in. 107 | */ 108 | function log_conn_id (sk:long) 109 | { 110 | tm_year = %{STAP_VALUE(STAP_ARG_sk, tm).tm_year%} 111 | tm_mon = %{STAP_VALUE(STAP_ARG_sk, tm).tm_mon%} 112 | tm_mday = %{STAP_VALUE(STAP_ARG_sk, tm).tm_mday%} 113 | tm_hour = %{STAP_VALUE(STAP_ARG_sk, tm).tm_hour%} 114 | tm_min = %{STAP_VALUE(STAP_ARG_sk, tm).tm_min%} 115 | tm_sec = %{STAP_VALUE(STAP_ARG_sk, tm).tm_sec%} 116 | isn = %{STAP_VALUE(STAP_ARG_sk, isn)%} 117 | 118 | time = sprintf("%d/%d/%d,start=%02d:%02d:%02d,end=%s", 119 | tm_year+1900, tm_mon+1, 
tm_mday, tm_hour, tm_min, tm_sec, 120 | get_short_time()) 121 | 122 | addr = get_socket_addr(sk) 123 | 124 | if (%{stap_options.detail_log%}) { 125 | line = "=====================================" 126 | printf("%s%s\n", line, line) 127 | printf("%s,id=%u\n%s\n\n", time, isn, addr) 128 | } else { 129 | printf("%s,%s,id=%u", time, addr, isn) 130 | } 131 | } 132 | 133 | 134 | -------------------------------------------------------------------------------- /src/congestion.stp: -------------------------------------------------------------------------------- 1 | # 
Congestion control (Cubic algorithm)

%{
/* NOTE(review): the header names of the three #include lines below were
 * lost in this dump; restore them from the upstream source before building.
 */
#include
#include
#include

/*
 * Local view of the congestion-control private area returned by
 * inet_csk_ca() (see state_machine() and friends in this file).
 * NOTE(review): presumably a copy of the kernel's private struct bictcp
 * from net/ipv4/tcp_cubic.c -- field order and sizes must match the
 * running kernel exactly; confirm when porting to another kernel.
 */
struct bictcp {
    u32 cnt;
    u32 last_max_cwnd;
    u32 loss_cwnd;
    u32 last_cwnd;
    u32 last_time;
    u32 bic_origin_point;
    u32 bic_K;
    u32 delay_min;
    u32 epoch_start;
    u32 ack_cnt;
    u32 tcp_cwnd;
#define ACK_RATIO_SHIFT 4
#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
    u16 delayed_ack;
    u8 sample_cnt;
    u8 found;
    u32 round_start;
    u32 end_seq;
    u32 last_ack;
    u32 curr_rtt;
};

/* Trace points of the advanced congestion log; values index point_name[] */
enum {
    POINT_NULL = 0,
    POINT_SS_START = 1,
    POINT_SS_END = 2,
    POINT_EP_START = 3,
    POINT_EP_STEADY = 4,
    POINT_EP_END = 5,
    POINT_MAX = 6
};

/* Messages attached to a trace point; values index msg_name[] */
enum {
    MSG_NULL = 0,
    MSG_SS_STD = 1,
    MSG_SS_ACK = 2,
    MSG_SS_DELAY = 3,
    MSG_SS_ABORT = 4,
    MSG_SS_CLOSE = 5,
    MSG_EP_SEARCH = 6,
    MSG_EP_PROBE = 7,
    MSG_MAX = 8
};

/* Printable names, in POINT_* order */
static const char *point_name[POINT_MAX] = {
    "null",
    "SS start",
    "SS end",
    "EP start",
    "EP steady",
    "EP end"
};

/* Printable names, in MSG_* order */
static const char *msg_name[MSG_MAX] = {
    "null",
    "std",
    "ack",
    "delay",
    "abort",
    "close",
    "search",
    "probe"
};

/* Advanced Congestion Control, -A option */
/*
 * Record a pending (point, msg) pair in the connection's stap_cong.
 * Does nothing unless STAPF_ADCONG is set, the connection has a stap_cong
 * and its adcong_flag is still on. Each call bumps adcong_cnt; once the
 * count exceeds stap_options.ad_cong the flag is cleared instead, so no
 * further points are recorded for this connection.
 * The caller must guarantee sk->sk_protinfo is non-NULL.
 */
static void advanced_cc_set(struct sock *sk, int point, int msg)
{
    struct stap_info *info = sk->sk_protinfo;
    struct stap_cong *cong = info->cong;

    if (!(stap_options.bitmap & STAPF_ADCONG) || !cong ||
        !cong->adcong_flag)
        return;

    if (atomic_inc_return(&cong->adcong_cnt) <= stap_options.ad_cong) {
        cong->adcong_point = point;
        cong->adcong_msg = msg;
    } else
        cong->adcong_flag = 0;
}
%}

/**
 * State machine of slow start.
*/
probe cong.StMachine =
    kernel.function("bictcp_cong_avoid")
{
    if (mem_is_stop()) next
    if (!$sk->sk_protinfo)
        next

    if (!cwnd_limited($sk, $in_flight))
        next

    state_machine($sk)
    log_advanced_cc($sk)
}

/**
 * Check whether the connection is cwnd limited for this ACK.
 *
 * Also snapshots the initial cwnd/ssthresh on the first ACK and counts
 * the ACKs for which the connection was not cwnd limited.
 * Returns the result of tcp_is_cwnd_limited(), or 0 when the connection
 * has no stap_cong. The caller must guarantee sk->sk_protinfo is non-NULL.
 */
function cwnd_limited:long (sk:long, in_flight:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct tcp_sock *tp = tcp_sk(sk);
    struct stap_info *info = sk->sk_protinfo;
    struct stap_cong *cong = info->cong;
    int ret;

    if (!cong) {
        /* Be explicit: without stats the probe must not go on */
        STAP_RETVALUE = 0;
        return;
    }

    /* First ACK */
    if (!info->init_cwnd) {
        info->init_cwnd = tp->snd_cwnd;
        info->init_ssthr = tp->snd_ssthresh;
    }

    ret = tcp_is_cwnd_limited(sk, STAP_ARG_in_flight);
    cong->cwnd_unlimit += !ret;
    STAP_RETVALUE = ret;
%}

/**
 * Track entering and leaving slow start.
 *
 * Slow start statistics are kept per exit reason in slots 0..2, selected
 * by ca->found (0, 1 or 2); a value of 3 is accounted in both slot 1 and
 * slot 2. Slot 3 is filled by do_SetSsthresh() when slow start is aborted
 * by a loss.
 */
function state_machine (sk:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct stap_info *info = sk->sk_protinfo;
    struct stap_cong *cong = info->cong;
    struct tcp_sock *tp = tcp_sk(sk);
    struct bictcp *ca = inet_csk_ca(sk);
    int idx;

    if (!cong)
        return;

    /* first ACK */
    if (!info->init_cwnd) {
        info->init_cwnd = tp->snd_cwnd;
        info->init_ssthr = tp->snd_ssthresh;
    }

    /* slow start begin point:
     * 1.connection begin
     * 2.timeout loss
     * 3.cong to slow start
     */
    if (!cong->ss_running && tp->snd_cwnd <= tp->snd_ssthresh) {
        cong->ss_running = 1;
        cong->ss_rtt_cnt = 0;
        cong->ss_enter_cwnd = tp->snd_cwnd;

        /* Advanced CC, SS Start */
        advanced_cc_set(sk, POINT_SS_START, MSG_NULL);
    }

    /* slow start end point:
     * 1.successfully switch to cong avoid
     */
    if (cong->ss_running && tp->snd_cwnd > tp->snd_ssthresh) {
        idx = ca->found;
        idx = idx == 3 ? 1 : idx;

        /* Advanced CC, SS End; idx + 1 maps slot 0..2 to MSG_SS_STD..DELAY */
        advanced_cc_set(sk, POINT_SS_END, idx + 1);

AGAIN:
        cong->ss_cnt[idx]++;
        cong->ss_time[idx] += cong->ss_rtt_cnt;
        cong->ss_start_cwnd[idx] += cong->ss_enter_cwnd;
        cong->ss_end_cwnd[idx] += tp->snd_ssthresh;

        /* found == 3: account the same slow start in slot 2 as well */
        if (ca->found == 3 && idx == 1) {
            idx = 2;
            goto AGAIN;
        }

        /* do cleaning */
        cong->ss_running = 0;
        cong->ss_enter_cwnd = 0;
        cong->ss_rtt_cnt = 0;
        cong->epoch_switch = 0;
        cong->epoch_switch_ts = 0;
    }
%}

/**
 * Count RTTs experienced by slow start.
 */
probe cong.RttCnt =
    kernel.function("bictcp_hystart_reset")
{
    if (mem_is_stop()) next
    if (!$sk->sk_protinfo)
        next

    do_RttCnt($sk)
}

/**
 * Add one to the slow start RTT counter while slow start is running.
 */
function do_RttCnt (sk:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct stap_info *info = sk->sk_protinfo;
    struct stap_cong *cong = info->cong;

    if (!cong)
        return;

    if (cong->ss_running)
        cong->ss_rtt_cnt++;
%}

/**
 * First loss event
 * Slow start abort
 * Epoch end point
 */
probe cong.SetSsthresh =
    kernel.function("bictcp_recalc_ssthresh")
{
    if (mem_is_stop()) next
    if (!$sk->sk_protinfo)
        next

    do_SetSsthresh($sk)

    /* Epoch end point */
    epoch_end($sk)
    log_advanced_cc($sk)
}

/**
 * Account a loss event: remember the first loss of the connection, close
 * a running slow start as "aborted" (slot 3) and count fast convergence.
 */
function do_SetSsthresh (sk:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct stap_info *info = sk->sk_protinfo;
    struct stap_cong *cong = info->cong;
    struct tcp_sock *tp = tcp_sk(sk);
    struct bictcp *ca = inet_csk_ca(sk);

    if (!cong)
        return;

    /* First loss event */
    if (!cong->fl_cwnd) {
        cong->fl_phase = cong->ss_running ? 0 : 1;
        cong->fl_cwnd = tp->snd_cwnd;
        cong->fl_rtt = tp->srtt >> 3;
    }

    /* Slow start abort due to loss */
    if (cong->ss_running) {
        cong->ss_cnt[3]++;
        cong->ss_time[3] += cong->ss_rtt_cnt;
        cong->ss_start_cwnd[3] += cong->ss_enter_cwnd;
        cong->ss_end_cwnd[3] += tp->snd_cwnd;

        cong->ss_running = 0;
        cong->ss_enter_cwnd = 0;
        cong->ss_rtt_cnt = 0;

        /* Advanced CC, SS End */
        advanced_cc_set(sk, POINT_SS_END, MSG_SS_ABORT);
    }

    /* Fast convergence */
    if (tp->snd_cwnd < ca->last_max_cwnd)
        cong->fast_converg++;
%}

/**
 * Account the end of a congestion avoidance epoch.
 *
 * Skipped when there is no stap_cong, when cwnd is below ssthresh, while
 * slow start is running or when no epoch has started. Durations are
 * expressed in RTTs (elapsed jiffies << 3 divided by srtt); every such
 * division is guarded because srtt may still be 0.
 */
function epoch_end(sk:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct tcp_sock *tp = tcp_sk(sk);
    struct bictcp *ca = inet_csk_ca(sk);
    struct stap_info *info = sk->sk_protinfo;
    struct stap_cong *cong = info->cong;
    u32 rtt_cnt = 0, msg;

    if (!cong || tp->snd_cwnd < tp->snd_ssthresh ||
        cong->ss_running || !ca->epoch_start)
        return;

    /* cwnd and rtt at this moment */
    cong->epoch_cwnd[2] += tp->snd_cwnd;
    cong->epoch_rtt[2] += tp->srtt >> 3;
    cong->epoch_cnt[2]++;

    /* Phase time, unit is RTT */
    if (tp->srtt)
        rtt_cnt = ((tcp_time_stamp - ca->epoch_start) << 3) / tp->srtt;

    cong->phase_time[0] += rtt_cnt;
    cong->phase_cnt[0]++;

    if (!ca->bic_K) {
        cong->phase_time[2] += rtt_cnt;
        cong->phase_cnt[2]++;
        msg = MSG_EP_PROBE;
    } else if (cong->epoch_switch) {
        /* Never divide by a zero srtt inside a probe handler */
        if (tp->srtt)
            cong->phase_time[2] +=
                ((tcp_time_stamp - cong->epoch_switch_ts) << 3) / tp->srtt;
        cong->phase_cnt[2]++;
        msg = MSG_EP_PROBE;
    } else {
        cong->phase_time[1] += rtt_cnt;
        cong->phase_cnt[1]++;
        msg = MSG_EP_SEARCH;
    }

    cong->epoch_switch = 0;
    cong->epoch_switch_ts = 0;

    /* Advanced CC, Epoch End */
    advanced_cc_set(sk, POINT_EP_END, msg);
%}

/* Called when conn is gonna to be closed */
function cong_close_update(sk:long)
{
    if ((%{stap_options.bitmap & (STAPF_CONG | STAPF_ADCONG)%})
        && current_algorithm(sk, "cubic")) {

        /* Advanced CC, SS Close point */
        if (%{stap_options.bitmap%} & %{STAPF_ADCONG%})
            ss_close_update(sk)

        /* Epoch end point */
        epoch_end(sk)
        log_advanced_cc(sk)
    }
}

/**
 * Returns 1 when the socket's congestion control algorithm is called
 * @name, else 0 (also when no algorithm is attached).
 */
function current_algorithm:long (sk:long, name:string)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct inet_connection_sock *icsk = inet_csk(sk);
    u32 ret = 0;

    if (icsk->icsk_ca_ops &&
        strcmp(icsk->icsk_ca_ops->name, STAP_ARG_name) == 0)
        ret = 1;

    STAP_RETVALUE = ret;
%}

/**
 * Record "slow start ended by close" when the connection is closed while
 * slow start is still running and cwnd is below ssthresh.
 */
function ss_close_update(sk:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct tcp_sock *tp = tcp_sk(sk);
    struct stap_info *info = sk->sk_protinfo;
    struct stap_cong *cong = info->cong;

    if (cong && tp->snd_cwnd < tp->snd_ssthresh &&
        cong->ss_running) {
        advanced_cc_set(sk, POINT_SS_END, MSG_SS_CLOSE);
    }
%}

/**
 * Recover the socket from a pointer to its congestion-control private
 * area, which is embedded in struct inet_connection_sock as icsk_ca_priv.
 */
function get_sock_from_ca:long (ca:long)
%{
    struct bictcp *ca = (void *)STAP_ARG_ca;
    struct sock *sk = (struct sock *)((char *)ca -
        offsetof(struct inet_connection_sock, icsk_ca_priv));

    STAP_RETVALUE = (long)sk;
%}

/**
 * Epoch start point.
*/
probe cong.EpochStart =
    kernel.statement("bictcp_update@net/ipv4/tcp_cubic.c:223")
{
    if (!mem_is_stop()) {
        owner = get_sock_from_ca($ca)

        /* Only sockets that carry a tcpdive record are accounted */
        if (@cast(owner, "struct sock")->sk_protinfo) {
            do_EpochStart(owner, $cwnd)
            log_advanced_cc(owner)
        }
    }
}

/**
 * Account the start of a congestion avoidance epoch: add the current
 * cwnd and smoothed RTT to the "start" slot (index 0), reset the
 * steady-point switch markers and record the advanced CC point.
 * The message is "probe" when last_max_cwnd <= cwnd, else "search".
 * The caller must guarantee sk->sk_protinfo is non-NULL.
 */
function do_EpochStart (sk:long, cwnd:long)
%{
    struct sock *sock = (struct sock *)STAP_ARG_sk;
    struct stap_info *rec = sock->sk_protinfo;
    struct stap_cong *stats = rec->cong;
    struct tcp_sock *tcp;
    struct bictcp *cubic;

    if (stats == NULL)
        return;

    tcp = tcp_sk(sock);
    cubic = inet_csk_ca(sock);

    /* Slot 0 accumulates the values seen at epoch start */
    stats->epoch_cnt[0] += 1;
    stats->epoch_cwnd[0] += tcp->snd_cwnd;
    stats->epoch_rtt[0] += tcp->srtt >> 3;

    stats->epoch_switch_ts = 0;
    stats->epoch_switch = 0;

    /* Advanced CC, Epoch Start */
    advanced_cc_set(sock, POINT_EP_START,
        (cubic->last_max_cwnd <= STAP_ARG_cwnd) ?
        MSG_EP_PROBE : MSG_EP_SEARCH);
%}

/**
 * Epoch steady point.
*/
probe cong.EpochSteady =
    kernel.statement("bictcp_update@net/ipv4/tcp_cubic.c:254")
{
    if (!mem_is_stop()) {
        owner = get_sock_from_ca($ca)

        /* Only sockets that carry a tcpdive record are accounted */
        if (@cast(owner, "struct sock")->sk_protinfo) {
            do_EpochSteady(owner)
            log_advanced_cc(owner)
        }
    }
}

/**
 * Detect, once per epoch, the moment the elapsed epoch time exceeds
 * bic_K. At that moment the "steady" slot (index 1) gets the current
 * cwnd and smoothed RTT, the time spent so far is added to phase 1 and
 * the switch timestamp is remembered.
 * The caller must guarantee sk->sk_protinfo is non-NULL.
 */
function do_EpochSteady (sk:long)
%{
    struct sock *sock = (struct sock *)STAP_ARG_sk;
    struct stap_info *rec = sock->sk_protinfo;
    struct stap_cong *stats = rec->cong;
    struct tcp_sock *tcp;
    struct bictcp *cubic;
    u32 elapsed;

    if (stats == NULL)
        return;

    tcp = tcp_sk(sock);
    cubic = inet_csk_ca(sock);

    /* Jiffies since epoch start (plus delay_min), scaled by 1024 / HZ */
    elapsed = ((tcp_time_stamp + msecs_to_jiffies(cubic->delay_min>>3)
        - cubic->epoch_start) << 10) / HZ;

    /* Already switched, no K for this epoch, or K not reached yet */
    if (stats->epoch_switch || !cubic->bic_K || elapsed <= cubic->bic_K)
        return;

    stats->epoch_switch = 1;
    stats->epoch_switch_ts = tcp_time_stamp;

    stats->epoch_rtt[1] += tcp->srtt >> 3;
    stats->epoch_cwnd[1] += tcp->snd_cwnd;
    stats->epoch_cnt[1] += 1;

    stats->phase_cnt[1] += 1;
    if (tcp->srtt != 0)
        stats->phase_time[1] +=
            ((tcp_time_stamp - cubic->epoch_start) << 3) / tcp->srtt;

    /* Advanced CC, Epoch Steady point */
    advanced_cc_set(sock, POINT_EP_STEADY, MSG_NULL);
%}

/**
 * Log cwnd and ssthresh.
*/
function log_cwnd_ssthr (sk:long)
{
    init_cwnd = %{STAP_VALUE(STAP_ARG_sk, init_cwnd)%}
    end_cwnd = %{STAP_VALUE(STAP_ARG_sk, end_cwnd)%}
    init_ssthr = %{STAP_VALUE(STAP_ARG_sk, init_ssthr)%}
    end_ssthr = %{STAP_VALUE(STAP_ARG_sk, end_ssthr)%}

    if (!%{stap_options.detail_log%})
        printf(",init_cwnd=%d,end_cwnd=%d,init_ssthr=%d,end_ssthr=%d",
            init_cwnd, end_cwnd, init_ssthr, end_ssthr)
    else {
        printf("%-18s %u\n", "init_cwnd", init_cwnd)
        printf("%-18s %u\n", "end_cwnd", end_cwnd)
        printf("%-18s %u\n", "init_ssthresh", init_ssthr)
        printf("%-18s %u\n", "end_ssthresh", end_ssthr)
        printf("\n")
    }
}

/**
 * Log of advanced congestion control, -A option.
 *
 * Prints the pending point recorded by advanced_cc_set(), if any, and
 * clears it. Returns the point that was logged, or 0 when nothing was
 * printed.
 */
function log_advanced_cc:long (sk:long)
{
    if (!(%{stap_options.bitmap%} & %{STAPF_ADCONG%}) ||
        cong_is_null(sk))
        return 0

    flag = %{STAP_MEM_VALUE(STAP_ARG_sk, cong, adcong_flag)%}
    point = %{STAP_MEM_VALUE(STAP_ARG_sk, cong, adcong_point)%}
    if (!flag || point == %{POINT_NULL%})
        return 0

    isn = %{STAP_VALUE(STAP_ARG_sk, isn)%}
    cwnd = %{((struct tcp_sock *)STAP_ARG_sk)->snd_cwnd%}
    rtt = %{((struct tcp_sock *)STAP_ARG_sk)->srtt >> 3%}

    printf("[%s],%s,%s,id=%d,cwnd=%d,rtt=%d,time=%d,msg=%s\n",
        get_point_name(sk), get_full_time(), get_socket_addr(sk),
        isn, cwnd, rtt, get_time_elapsed(sk),
        get_msg_name(sk))

    advanced_cc_clear(sk)
    return point
}

/**
 * Returns 1 when the connection has no stap_cong attached, else 0.
 * The caller must guarantee sk->sk_protinfo is non-NULL.
 */
function cong_is_null:long (sk:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct stap_info *info = sk->sk_protinfo;
    struct stap_cong *cong = info->cong;

    STAP_RETVALUE = !cong;
%}

/**
 * Name of the pending advanced CC point. Falls back to "null" when there
 * is no stap_cong or the stored value is outside the name table.
 */
function get_point_name:string (sk:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct stap_info *info = sk->sk_protinfo;
    struct stap_cong *cong = info->cong;
    unsigned int idx = cong ? (unsigned int)cong->adcong_point : POINT_NULL;

    /* Never index the table with an unexpected value */
    if (idx >= POINT_MAX)
        idx = POINT_NULL;

    snprintf(STAP_RETVALUE, MAXSTRINGLEN, "%s", point_name[idx]);
%}

/**
 * Name of the pending advanced CC message. Falls back to "null" when
 * there is no stap_cong or the stored value is outside the name table.
 */
function get_msg_name:string (sk:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct stap_info *info = sk->sk_protinfo;
    struct stap_cong *cong = info->cong;
    unsigned int idx = cong ? (unsigned int)cong->adcong_msg : MSG_NULL;

    /* Never index the table with an unexpected value */
    if (idx >= MSG_MAX)
        idx = MSG_NULL;

    snprintf(STAP_RETVALUE, MAXSTRINGLEN, "%s", msg_name[idx]);
%}

/**
 * tcp_time_stamp ticks elapsed since the connection was established.
 */
function get_time_elapsed:long (sk:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct stap_info *info = sk->sk_protinfo;

    STAP_RETVALUE = tcp_time_stamp - info->estab_t;
%}

/**
 * Forget the pending advanced CC point and message.
 */
function advanced_cc_clear(sk:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct stap_info *info = sk->sk_protinfo;
    struct stap_cong *cong = info->cong;

    if (!cong)
        return;

    cong->adcong_point = POINT_NULL;
    cong->adcong_msg = MSG_NULL;
%}

/**
 * Log of congestion control, -C option.
 */
function log_cong (sk:long)
{
    if (%{stap_options.bitmap & STAPF_CONG%} &&
        !cong_is_null(sk)) {
        if (!%{stap_options.detail_log%})
            log_default_cong(sk)
        else
            log_detail_cong(sk)
    }
}

/* log format: default */
function log_default_cong (sk:long)
{
    log_default_first_loss(sk)
    log_default_slow_start(sk)
    log_default_cong_avoid(sk)
}

/* first loss event; all fields are -1 when no loss was seen */
function log_default_first_loss (sk:long)
{
    fl_phase = %{STAP_MEM_VALUE(STAP_ARG_sk, cong, fl_phase)%}
    fl_cwnd = %{STAP_MEM_VALUE(STAP_ARG_sk, cong, fl_cwnd)%}
    fl_rtt = %{STAP_MEM_VALUE(STAP_ARG_sk, cong, fl_rtt)%}

    if (!fl_cwnd)
        printf(",fl_phase=-1,fl_cwnd=-1,fl_rtt=-1")
    else
        printf(",fl_phase=%s,fl_cwnd=%d,fl_rtt=%d",
            fl_phase ? "cong" : "ss", fl_cwnd, fl_rtt)
}

/* slow start info: per exit reason, averages over the number of runs */
function log_default_slow_start (sk:long)
{
    for (i = 0; i < 4; i++) {
        ss_cnt = %{STAP_MEM_VALUE(STAP_ARG_sk, cong,
            ss_cnt[STAP_ARG_i])%}
        ss_time = %{STAP_MEM_VALUE(STAP_ARG_sk, cong,
            ss_time[STAP_ARG_i])%}
        ss_start_cwnd = %{STAP_MEM_VALUE(STAP_ARG_sk, cong,
            ss_start_cwnd[STAP_ARG_i])%}
        ss_end_cwnd = %{STAP_MEM_VALUE(STAP_ARG_sk, cong,
            ss_end_cwnd[STAP_ARG_i])%}

        if (ss_cnt) {
            ss_time /= ss_cnt
            ss_start_cwnd /= ss_cnt
            ss_end_cwnd /= ss_cnt
        } else
            ss_cnt = ss_time = ss_start_cwnd = ss_end_cwnd = -1

        if (i == 0) prefix = "std_"
        else if (i == 1) prefix = "ack_"
        else if (i == 2) prefix = "delay_"
        else prefix = "abort_"

        printf(",%s=%d,%s=%d,%s=%d,%s=%d",
            prefix . "ss_cnt", ss_cnt,
            prefix . "ss_time", ss_time,
            prefix . "start_cwnd", ss_start_cwnd,
            prefix . "end_cwnd", ss_end_cwnd)
    }
}

/* congestion avoidance info */
function log_default_cong_avoid (sk:long)
{
    cwnd_unlimit = %{STAP_MEM_VALUE(STAP_ARG_sk, cong, cwnd_unlimit)%}
    fast_converg = %{STAP_MEM_VALUE(STAP_ARG_sk, cong, fast_converg)%}

    printf(",cwnd_unlimit=%d,fast_converg=%d",
        cwnd_unlimit,fast_converg)

    /* phases: whole epoch, searching, probing */
    for (i = 0; i < 3; i++) {
        phase_cnt = %{STAP_MEM_VALUE(STAP_ARG_sk, cong,
            phase_cnt[STAP_ARG_i])%}
        phase_time = %{STAP_MEM_VALUE(STAP_ARG_sk, cong,
            phase_time[STAP_ARG_i])%}

        if (!phase_cnt)
            phase_cnt = phase_time = -1

        if (i == 0) prefix = "epoch_"
        else if (i == 1) prefix = "search_"
        else prefix = "probe_"

        printf(",%s=%d,%s=%d",
            prefix . "cnt", phase_cnt,
            prefix . "time", phase_time)
    }

    /* points: epoch start, steady, end (averages over the count) */
    for (i = 0; i < 3; i++) {
        epoch_cwnd = %{STAP_MEM_VALUE(STAP_ARG_sk, cong,
            epoch_cwnd[STAP_ARG_i])%}
        epoch_rtt = %{STAP_MEM_VALUE(STAP_ARG_sk, cong,
            epoch_rtt[STAP_ARG_i])%}
        epoch_cnt = %{STAP_MEM_VALUE(STAP_ARG_sk, cong,
            epoch_cnt[STAP_ARG_i])%}

        if (epoch_cnt) {
            epoch_cwnd /= epoch_cnt
            epoch_rtt /= epoch_cnt
        } else
            epoch_cnt = epoch_cwnd = epoch_rtt = -1

        if (i == 0) prefix = "ep_start_"
        else if (i == 1) prefix = "ep_steady_"
        else prefix = "ep_end_"

        printf(",%s=%d,%s=%d,%s=%d",
            prefix . "cnt", epoch_cnt,
            prefix . "cwnd", epoch_cwnd,
            prefix . "rtt", epoch_rtt)
    }
}

/* log format: detail */
function log_detail_cong (sk:long)
{
    print(log_detail_slow_start(sk))
    print(log_detail_cong_avoid(sk))
}

/**
 * Build the "FIRST LOSS TABLE" and "SLOW START TABLE" text.
 *
 * scnprintf() is used so that len is always the number of characters
 * really stored: once the result string is full, further rows are
 * truncated instead of producing a negative remaining size.
 */
function log_detail_slow_start:string (sk:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct stap_info *info = sk->sk_protinfo;
    struct stap_cong *c = info->cong;
    int i, len = 0, flag = 1, maxlen = MAXSTRINGLEN;
    char *buf = (char *)STAP_RETVALUE;
    static const char *ss_name[4] = {
        "Standard",
        "ACK Train Length",
        "Delay Increase",
        "Abort"
    };

    if (!c)
        return;

    if (c->fl_cwnd)
        len = scnprintf(buf, maxlen,
            "%-18s %-10s %-10s %-10s\n%-18s %-10s %-10d %-10d\n\n",
            "FIRST LOSS TABLE", "phase", "cwnd", "rtt", "",
            c->fl_phase ? "cong" : "ss", c->fl_cwnd, c->fl_rtt);

    for (i = 0; i < 4; i++) {
        if (!c->ss_cnt[i])
            continue;

        /* Table header, printed once before the first row */
        if (flag) {
            len += scnprintf(buf + len, maxlen - len,
                "%-18s %-10s %-10s %-10s %-10s\n", "SLOW START TABLE",
                "count", "s_cwnd", "e_cwnd", "time(RTT)");
            flag = 0;
        }

        len += scnprintf(buf + len, maxlen - len,
            "%-18s %-10u %-10u %-10u %-10u\n", ss_name[i],
            c->ss_cnt[i], c->ss_start_cwnd[i] / c->ss_cnt[i],
            c->ss_end_cwnd[i] / c->ss_cnt[i],
            c->ss_time[i] / c->ss_cnt[i]);
    }

    if (!flag)
        scnprintf(buf + len, maxlen - len, "\n");
%}

/**
 * Build the "CONG PHASE TABLE" and "CONG POINT TABLE" text.
 * Uses scnprintf() for the same reason as log_detail_slow_start().
 */
function log_detail_cong_avoid:string (sk:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct stap_info *info = sk->sk_protinfo;
    struct stap_cong *c = info->cong;
    int i, len = 0, flag = 1, maxlen = MAXSTRINGLEN;
    char *buf = (char *)STAP_RETVALUE;
    static const char *phase_name[3] =
        {"epoch", "searching", "probing"};
    /* Named so that it does not shadow the file-level point_name[] */
    static const char *ep_point_name[3] =
        {"start", "steady", "end"};

    if (!c)
        return;

    for (i = 0; i < 3; i++) {
        if (!c->phase_cnt[i])
            continue;

        if (flag) {
            len = scnprintf(buf, maxlen,
                "%-18s %-10s %-10s\n", "CONG PHASE TABLE",
                "count", "time(RTT)");
            flag = 0;
        }

        len += scnprintf(buf + len, maxlen - len,
            "%-18s %-10d %-10d\n", phase_name[i], c->phase_cnt[i],
            c->phase_time[i]);
    }

    /* Blank line after the first table; re-arm the header flag */
    if (!flag) {
        len += scnprintf(buf + len, maxlen - len, "\n");
        flag = 1;
    }

    for (i = 0; i < 3; i++) {
        if (!c->epoch_cnt[i])
            continue;

        if (flag) {
            len += scnprintf(buf + len, maxlen - len,
                "%-18s %-10s %-10s %-10s\n", "CONG POINT TABLE",
                "count", "cwnd", "rtt");
            flag = 0;
        }

        len += scnprintf(buf + len, maxlen - len,
            "%-18s %-10d %-10d %-10d\n", ep_point_name[i],
            c->epoch_cnt[i], c->epoch_cwnd[i] / c->epoch_cnt[i],
            c->epoch_rtt[i] / c->epoch_cnt[i]);
    }

    if (!flag)
        scnprintf(buf + len, maxlen - len, "\n");
%}


--------------------------------------------------------------------------------
/src/estab.stp:
--------------------------------------------------------------------------------
# Establishment, a place to begin

%{
/* NOTE(review): the header names of the three #include lines below were
 * lost in this dump; restore them from the upstream source before building.
 */
#include
#include
#include

struct stap_info;
%}

/**
 * When a connection is established.
 *
 * A record is allocated only for connections that pass the filters and
 * the sampling ratio, and only while tcpdive is not stopping.
 */
probe estab.Estab =
    kernel.statement("tcp_rcv_state_process@net/ipv4/tcp_input.c:5830")
{
    if (mem_is_stop()) next
    if (!filter($sk)) next
    if (!take_sample()) next

    if (mem_alloc($sk)) {
        printf("mem_alloc failed!\n")
        next
    }

    do_Estab($sk)
}

/**
 * Initialize a freshly allocated record: establishment timestamp, id
 * (snd_nxt at this point), retransmissions so far and the wall-clock
 * time of establishment. Arms the advanced CC log when it is enabled.
 */
function do_Estab(sk:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct tcp_sock *tp = tcp_sk(sk);
    struct stap_info *info = sk->sk_protinfo;
    struct stap_cong *cong;
    struct timeval tv;
    unsigned long time;

    if (!info)
        return;

    /* Init vars */
    info->estab_t = tcp_time_stamp;
    info->isn = tp->snd_nxt;
    info->rtx_synack = tp->total_retrans;
    atomic_set(&info->freeing, 0);

    /* Record estab time */
    do_gettimeofday(&tv);
    /* NOTE(review): fixed +8h offset -- presumably UTC+8 local time; confirm */
    time = tv.tv_sec + 8 * 3600;
    rtc_time_to_tm(time, &info->tm);

    /* Advanced CC */
    cong = info->cong;
    if (cong && (stap_options.bitmap & STAPF_ADCONG)) {
        cong->adcong_flag = 1;
        atomic_set(&cong->adcong_cnt, 0);
    }
%}

/**
 * When a connection is accepted.
*/
probe estab.Accept =
    kernel.function("inet_csk_accept").return
{
    if (mem_is_stop())
        next

    /* inet_csk_accept() hands back the accepted socket, or NULL */
    child = $return
    if (child)
        do_Accept(child)
}

/**
 * Record how long the connection waited between establishment and
 * accept, and snapshot cwnd/ssthresh at accept time as the initial
 * values. Sockets without a tcpdive record are ignored.
 */
function do_Accept(sk:long)
%{
    struct sock *sock = (struct sock *)STAP_ARG_sk;
    struct stap_info *rec = sock->sk_protinfo;

    if (rec != NULL) {
        struct tcp_sock *tcp = tcp_sk(sock);

        rec->accept_wait = tcp_time_stamp - rec->estab_t;
        rec->init_cwnd = tcp->snd_cwnd;
        rec->init_ssthr = tcp->snd_ssthresh;
    }
%}


--------------------------------------------------------------------------------
/src/http.stp:
--------------------------------------------------------------------------------
# Log of HTTP
# As for the dealing of http request and response,
# see send.stp and recv.stp

%{
#include
#include
%}

/**
 * Address of the idx-th HTTP entry of a connection, or 0 when the
 * socket has no record or no HTTP array.
 */
function get_stap_http:long (sk:long, idx:long)
%{
    struct stap_info *rec = ((struct sock *)STAP_ARG_sk)->sk_protinfo;
    long slot = 0;

    if (rec && rec->http)
        slot = (long)(rec->http + STAP_ARG_idx);

    STAP_RETVALUE = slot;
%}

/**
 * Log of HTTP transaction.
*/
function log_http (sk:long)
{
    if (%{stap_options.bitmap%} & %{STAPF_HTTP%})
        do_log_http(sk)
}

/**
 * Print one entry per recorded HTTP transaction of the connection.
 *
 * For each transaction the request wait, response wait and transmission
 * time are derived from the stored timestamps (-1 when a timestamp is
 * missing or out of order). A transaction is printed when its
 * transmission time is unknown or at least stap_options.trans_time.
 */
function do_log_http (sk:long)
{
    http_count = %{STAP_VALUE(STAP_ARG_sk, http_count)%}
    estab_t = %{STAP_VALUE(STAP_ARG_sk, estab_t)%}
    req_begin = estab_t
    cnt = 0

    while (cnt < http_count) {
        http = get_stap_http(sk, cnt)

        /* get_stap_http() fails only when the connection has no HTTP
         * array at all, so no later index can succeed either. Stop here;
         * retrying the same index would loop forever.
         */
        if (!http) break

        tm_hour = %{STAP_HTTP_MEM_VALUE(STAP_ARG_http, tm, tm_hour)%}
        tm_min = %{STAP_HTTP_MEM_VALUE(STAP_ARG_http, tm, tm_min)%}
        tm_sec = %{STAP_HTTP_MEM_VALUE(STAP_ARG_http, tm, tm_sec)%}

        rcv_req_ts = %{STAP_HTTP_VALUE(STAP_ARG_http, rcv_req_ts)%}
        xmit_resp_ts = %{STAP_HTTP_VALUE(STAP_ARG_http, xmit_resp_ts)%}
        last_ack_ts = %{STAP_HTTP_VALUE(STAP_ARG_http, last_ack_ts)%}
        start_seq = %{STAP_HTTP_VALUE(STAP_ARG_http, start_seq)%}
        end_snd_una = %{STAP_HTTP_VALUE(STAP_ARG_http, end_snd_una)%}

        req_wait = resp_wait = trans_wait = -1

        if (rcv_req_ts && req_begin && rcv_req_ts >= req_begin)
            req_wait = rcv_req_ts - req_begin

        if (xmit_resp_ts && rcv_req_ts &&
            xmit_resp_ts >= rcv_req_ts)
            resp_wait = xmit_resp_ts - rcv_req_ts

        if (last_ack_ts && xmit_resp_ts &&
            last_ack_ts >= xmit_resp_ts)
            trans_wait = last_ack_ts - xmit_resp_ts

        acked_data = end_snd_una - start_seq
        if (acked_data > 0)
            acked_data--
        else if (acked_data < 0)
            acked_data = -1

        /* From here on cnt is the 1-based number of this transaction */
        cnt++
        time = sprintf("%02d:%02d:%02d", tm_hour, tm_min, tm_sec)

        /* log format: default or detail */
        if (!%{stap_options.detail_log%}) {
            if (cnt == 1)
                printf(",req_count=%u", http_count)

            if (trans_wait >= %{stap_options.trans_time%} ||
                trans_wait == -1) {
                printf(",num=%d,time=%s,acked_data=%d",
                    cnt, time, acked_data)
                printf(",req_wait=%d,resp_wait=%d,trans_time=%d",
                    req_wait, resp_wait, trans_wait)
            }
        } else {
            if (cnt == 1)
                printf("%-18s %-10s %-10s %-10s %-10s %-10s\n",
                    "HTTP TABLE", "time", "ack_data", "req_wait",
                    "resp_wait", "trans_time")

            if (trans_wait >= %{stap_options.trans_time%} ||
                trans_wait == -1) {
                /* NOTE(review): whitespace is collapsed in this dump;
                 * confirm the width of this padding against upstream.
                 */
                blank = " "
                printf("Num.%-14d %s%s %-10d %-10d %-10d %-10d\n",
                    cnt, time, blank, acked_data, req_wait, resp_wait,
                    trans_wait)
            }
        }

        /* The next request is measured from the end of this one */
        req_begin = last_ack_ts
    }
}


--------------------------------------------------------------------------------
/src/memory.stp:
--------------------------------------------------------------------------------
# Memory management

%{
/* NOTE(review): the header names of the two #include lines below were lost
 * in this dump; restore them from the upstream source before building.
 */
#include
#include

extern struct inet_hashinfo tcp_hashinfo;

/* Counters of normal allocations / frees, one per structure kind.
 * mem_log() indexes this struct as an array of atomic64_t, so the
 * field order must stay in sync with active_mem_stat_t.
 */
typedef struct {
    atomic64_t info;
    atomic64_t retrans;
    atomic64_t http;
    atomic64_t cong;
} mem_stat_t;

/* Counters of frees done by mem_free_active(); indexed as int[] */
typedef struct {
    int info;
    int retrans;
    int http;
    int cong;
} active_mem_stat_t;

struct stap_mem_s {
    atomic_t stop_alloc;
    mem_stat_t alloc;
    mem_stat_t free;
    active_mem_stat_t active_free;
};
typedef struct stap_mem_s stap_mem_t;

/* Memory statistics */
stap_mem_t stap_mem;


/*
 * Release the record attached to @sk together with its sub-structures.
 * @active selects which statistics are updated: non-zero for the sweep
 * done by mem_free_active(), zero for the normal paths.
 * info->freeing makes sure only one caller releases a given record.
 */
void do_mem_free(struct sock *sk, int active)
{
    struct stap_info *info = sk->sk_protinfo;
    mem_stat_t *free = &stap_mem.free;
    active_mem_stat_t *active_free = &stap_mem.active_free;

    if (!info)
        return;

    /* Somebody else is already freeing this record */
    if (!atomic_add_unless(&info->freeing, 1, 1))
        return;

    if (info->retrans) {
        kfree(info->retrans);
        info->retrans = NULL;

        if (active)
            active_free->retrans++;
        else
            atomic64_inc(&free->retrans);
    }

    if (info->cong) {
        kfree(info->cong);
        info->cong = NULL;

        if (active)
            active_free->cong++;
        else
            atomic64_inc(&free->cong);
    }

    if (info->http) {
        kfree(info->http);
        info->http = NULL;

        if (active)
            active_free->http += info->http_alloc;
        else
            atomic64_add(info->http_alloc, &free->http);
    }

    /* Detach the record from the socket before releasing it, so the
     * socket never points at freed memory.
     */
    sk->sk_protinfo = NULL;
    kfree(info);

    if (active)
        active_free->info++;
    else
        atomic64_inc(&free->info);
}
%}

/**
 * Allocate the record for a connection and the sub-structures required
 * by the enabled options, and attach it to sk->sk_protinfo.
 *
 * Returns 0 on success and 1 when an allocation failed (everything that
 * was allocated is released again). When tcpdive is stopping nothing is
 * allocated and 0 is returned.
 */
function mem_alloc:long (sk:long)
%{
    struct sock *sk = (struct sock *)STAP_ARG_sk;
    struct stap_info *info;
    gfp_t flags = GFP_ATOMIC | __GFP_ZERO;
    mem_stat_t *alloc = &stap_mem.alloc;
    void *ptr;

    STAP_RETVALUE = 0;
    if (atomic_read(&stap_mem.stop_alloc))
        return;

    ptr = kmalloc(sizeof(struct stap_info), flags);
    if (ptr == NULL)
        goto FAILED;

    sk->sk_protinfo = ptr;
    info = ptr;
    atomic64_inc(&alloc->info);

    if (stap_options.bitmap & STAPF_RETRANS) {
        ptr = kmalloc(sizeof(struct stap_retrans), flags);
        if (ptr == NULL)
            goto FAILED;

        info->retrans = ptr;
        atomic64_inc(&alloc->retrans);
    }

    if (stap_options.bitmap & (STAPF_CONG | STAPF_ADCONG)) {
        ptr = kmalloc(sizeof(struct stap_cong), flags);
        if (ptr == NULL)
            goto FAILED;

        info->cong = ptr;
        atomic64_inc(&alloc->cong);
    }

    if (stap_options.bitmap & STAPF_HTTP) {
        ptr = kmalloc(sizeof(struct stap_http) * STAP_HTTP_INIT_CNT,
            flags);
        if (ptr == NULL)
            goto FAILED;

        info->http = ptr;
        info->http_alloc = STAP_HTTP_INIT_CNT;
        atomic64_add(STAP_HTTP_INIT_CNT, &alloc->http);
    }

    return;

FAILED:
    /* Roll back whatever was attached so far */
    do_mem_free(sk, 0);
    STAP_RETVALUE = 1;
%}

/* After being informed of exit by signal or timer,
 * it should not allocate any more memory.
*/
function mem_set_stop ()
%{
    atomic_set(&stap_mem.stop_alloc, 1);
%}

/* Returns the stop flag set by mem_set_stop() */
function mem_is_stop:long ()
%{
    STAP_RETVALUE = atomic_read(&stap_mem.stop_alloc);
%}

/* Release the record attached to sk (accounted as a normal free) */
function mem_free(sk:long)
%{
    struct sock * sk = (struct sock *)STAP_ARG_sk;
    struct stap_info *info = sk->sk_protinfo;

    do_mem_free(sk, 0);
%}

/*
 * Walk every bucket of tcp_hashinfo's ehash table and release the
 * records still attached to sockets found there (accounted as active
 * frees). Each non-empty bucket is walked under its ehash lock.
 */
function mem_free_active()
%{
    struct inet_hashinfo *hashinfo = &tcp_hashinfo;
    struct inet_ehash_bucket *head;
    struct hlist_nulls_node *node;
    struct sock *sk;
    spinlock_t *lock;
    int i;

    for (i = 0; i < hashinfo->ehash_size; i++)
    {
        head = &hashinfo->ehash[i];
        lock = inet_ehash_lockp(hashinfo, i);

        /* Unlocked peek: skip empty buckets without taking the lock */
        if (hlist_nulls_empty(&head->chain))
            continue;

        spin_lock(lock);
        sk_nulls_for_each(sk, node, &head->chain)
        {
            if (sk->sk_protinfo) {
                do_mem_free(sk, 1);
            }
        }
        spin_unlock(lock);
    }
%}

/**
 * Log memory usage
 *
 * Formats one row of the usage table for structure kind idx
 * (0 = info, 1 = retrans, 2 = http, 3 = cong, following the field order
 * of mem_stat_t / active_mem_stat_t, which are indexed here as arrays).
 * active = alloc - free; leak = active - active_free.
 */
function mem_log:string (idx:long)
%{
    int alloc, free, active, active_free, leak;
    int idx = STAP_ARG_idx;

    alloc = atomic64_read(&((atomic64_t *)&stap_mem.alloc)[idx]);
    free = atomic64_read(&((atomic64_t *)&stap_mem.free)[idx]);
    active_free = ((int *)&stap_mem.active_free)[idx];
    active = alloc - free;
    leak = active - active_free;

    snprintf(STAP_RETVALUE, MAXSTRINGLEN,
        "%-14d%-14d%-14d%-14d%-14d", alloc, free, active,
        active_free, leak);
%}

/* Print the memory usage table; optional rows follow the enabled options */
function log_mem_usage()
{
    printf("%-20s%-14s%-14s%-14s%-14s%-14s\n", "Memory Usage",
        "alloc", "free", "active", "active_free", "leak")

    printf("%-20s%s\n", "stap_info", mem_log(0))

    if (%{stap_options.bitmap%} & %{STAPF_RETRANS%})
        printf("%-20s%s\n", "stap_retrans", mem_log(1))

    if (%{stap_options.bitmap%} & %{STAPF_HTTP%})
        printf("%-20s%s\n", "stap_http", mem_log(2))

    if (%{stap_options.bitmap & (STAPF_CONG | STAPF_ADCONG)%})
        printf("%-20s%s\n", "stap_cong", mem_log(3))

    printf("\n")
}


--------------------------------------------------------------------------------
/src/options.stp:
--------------------------------------------------------------------------------
# Command-line options
# ice and fire

%{
/* NOTE(review): the header name of the #include line below was lost in
 * this dump; restore it from the upstream source before building.
 */
#include
struct stap_info;

#define MAX_CONN_FILTERS 5
#define MAX_PORT_FILTERS 5

/* options */
/*
 * The leading u32 fields are written by position: init_stap_option()
 * treats this struct as a u32 array indexed by the OP_* values below,
 * so their order must match the enum and op_names[].
 */
struct stap_options_s {
    u32 bitmap;
    u32 timeout;
    u32 lifetime;
    u32 trans_time;
    u32 ad_cong;
    u32 detail_log;
    u32 sample_ratio;
    atomic64_t sample_cnt;
};
typedef struct stap_options_s stap_options_t;

/* Position of each option inside stap_options_s */
enum {
    OP_BITMAP = 0,
    OP_TIMEOUT = 1,
    OP_LIFETIME = 2,
    OP_TRANS_TIME = 3,
    OP_AD_CONG = 4,
    OP_DETAIL_LOG = 5,
    OP_SAMPLE_RATIO = 6,
    OP_MAX = 7
};

/* Command-line names, in OP_* order */
static const char *op_names[OP_MAX] = {
    "bitmap",
    "timeout",
    "lifetime",
    "trans_time",
    "ad_cong",
    "detail_log",
    "sample_ratio"
};

/* connection filter */
struct conn_filter_s {
    u32 laddr;
    u32 laddr_mask;
    u32 raddr;
    u32 raddr_mask;
    u16 lport;
    u16 rport;
};
typedef struct conn_filter_s conn_filter_t;

typedef struct {
    u32 num;
    conn_filter_t filters[MAX_CONN_FILTERS];
} conn_filters_t;

/* port filter */
typedef struct {
    u16 num;
    u16 ports[MAX_PORT_FILTERS];
} port_filters_t;

/* global vars */
static conn_filters_t conn_filters;
static port_filters_t port_filters;
stap_options_t stap_options;
%}

/**
 * Allow setting port filters by module argument.
75 | */ 76 | global port_str = "" 77 | 78 | /** 79 | * Parse cmdline arguments. 80 | * Note: conn filter should be the last argument. 81 | */ 82 | function process_cmdline:long () 83 | { 84 | flag = 1 85 | ret = 0 86 | 87 | if (port_str != "") 88 | init_port_filter(port_str) 89 | 90 | for (i = 1; i <= argc; i++) { 91 | name = tokenize(argv[i], "=") 92 | 93 | if (flag && (idx = is_stap_option(name)) >= 0) { 94 | argv[i] = "" 95 | value = strtol(tokenize(argv[i], "="), 10) 96 | init_stap_option(idx, value) 97 | continue 98 | } 99 | 100 | if (flag && name == "ports") { 101 | argv[i] = "" 102 | ports = tokenize(argv[i], "=") 103 | init_port_filter(ports) 104 | continue 105 | } 106 | 107 | flag = 0 108 | if ((ret = init_conn_filter(i)) < 0) 109 | break 110 | } 111 | 112 | return ret 113 | } 114 | 115 | function is_stap_option:long (name:string) 116 | %{ 117 | int j, ret = -1; 118 | 119 | for(j = 0; j < OP_MAX; j++) 120 | if (strcmp(STAP_ARG_name, op_names[j]) == 0) { 121 | ret = j; 122 | break; 123 | } 124 | 125 | STAP_RETVALUE = ret; 126 | %} 127 | 128 | function init_stap_option (idx:long, value:long) 129 | %{ 130 | ((u32 *)&stap_options)[STAP_ARG_idx] = STAP_ARG_value; 131 | %} 132 | 133 | function take_sample:long () 134 | %{ 135 | int ret = 0; 136 | 137 | if (!stap_options.sample_ratio) 138 | ret = 1; 139 | else { 140 | atomic64_inc(&stap_options.sample_cnt); 141 | ret = !(atomic64_read(&stap_options.sample_cnt) % 142 | stap_options.sample_ratio); 143 | } 144 | 145 | STAP_RETVALUE = ret; 146 | %} 147 | 148 | /** 149 | * port filter, -p option. 
150 | */ 151 | function init_port_filter (ports:string) 152 | { 153 | while (strlen(buf = tokenize(ports, ",")) != 0) { 154 | port = strtol(buf, 10) 155 | ports = "" 156 | 157 | if (set_port_filter(port) < 0) 158 | break 159 | } 160 | } 161 | 162 | function set_port_filter:long (port:long) 163 | %{ 164 | int ret = 0; 165 | 166 | if (STAP_ARG_port > 0xffff || port_filters.num >= MAX_PORT_FILTERS) 167 | ret = -1; 168 | else 169 | port_filters.ports[port_filters.num++] = STAP_ARG_port; 170 | 171 | STAP_RETVALUE = ret; 172 | %} 173 | 174 | function use_port_filter:long (sk:long) 175 | { 176 | lport = tcpmib_local_port(sk) 177 | return do_port_filter(lport) 178 | } 179 | 180 | function do_port_filter:long (port:long) 181 | %{ 182 | int i, ret = 0; 183 | 184 | if (!port_filters.num) 185 | ret = 1; 186 | else { 187 | for (i = 0; i < port_filters.num; i++) 188 | if (STAP_ARG_port == port_filters.ports[i]){ 189 | ret = 1; 190 | break; 191 | } 192 | } 193 | 194 | STAP_RETVALUE = ret; 195 | %} 196 | 197 | /* Filter API */ 198 | function filter:long (sk:long) 199 | { 200 | /* Neither conn filter nor port filter is used */ 201 | if (!%{conn_filters.num | port_filters.num%}) 202 | return 1 203 | 204 | /* Both conn filter and port filter are used */ 205 | if (%{conn_filters.num & port_filters.num%}) 206 | return (use_port_filter(sk) && use_conn_filter(sk)) 207 | 208 | /* Only port filter is used */ 209 | if (%{port_filters.num%}) 210 | return use_port_filter(sk) 211 | 212 | /* Only conn filter is used */ 213 | if (%{conn_filters.num%}) 214 | return use_conn_filter(sk) 215 | } 216 | 217 | /** 218 | * connection filter, -f option. 
219 | */ 220 | function init_conn_filter:long (idx:long) 221 | { 222 | local = tokenize(argv[idx], "-") 223 | argv[idx] = "" 224 | remote = tokenize(argv[idx], "-") 225 | 226 | local_addr = tokenize(local, ":") 227 | local = "" 228 | local_port = tokenize(local, ":") 229 | 230 | remote_addr = tokenize(remote, ":") 231 | remote = "" 232 | remote_port = tokenize(remote, ":") 233 | 234 | laddr = ipv4_pton(local_addr, 0) 235 | laddr_mask = ipv4_pton(local_addr, 1) 236 | lport = ipv4_portton(local_port) 237 | 238 | raddr = ipv4_pton(remote_addr, 0) 239 | raddr_mask = ipv4_pton(remote_addr, 1) 240 | rport = ipv4_portton(remote_port) 241 | 242 | if (laddr < 0 || laddr_mask < 0 || 243 | raddr < 0 || raddr_mask < 0 || 244 | lport < 0 || rport < 0) 245 | return -1 246 | 247 | if (set_conn_filter(laddr, laddr_mask, lport, 248 | raddr, raddr_mask, rport) < 0) 249 | return -1 250 | 251 | return 0 252 | } 253 | 254 | function set_conn_filter:long (laddr:long, laddr_mask:long, 255 | lport:long, raddr:long, raddr_mask:long, rport:long) 256 | %{ 257 | u32 cnt = conn_filters.num; 258 | conn_filter_t *filter; 259 | 260 | if (cnt >= MAX_CONN_FILTERS) { 261 | STAP_RETVALUE = -1; 262 | return; 263 | } 264 | 265 | filter = &conn_filters.filters[cnt]; 266 | filter->laddr = STAP_ARG_laddr; 267 | filter->laddr_mask = STAP_ARG_laddr_mask; 268 | filter->lport = STAP_ARG_lport; 269 | filter->raddr = STAP_ARG_raddr; 270 | filter->raddr_mask = STAP_ARG_raddr_mask; 271 | filter->rport = STAP_ARG_rport; 272 | conn_filters.num++; 273 | 274 | STAP_RETVALUE = 0; 275 | %} 276 | 277 | /* 278 | * Convert an ipv4 dot notation address into long. 279 | * Support "*" in any field, treating it as a wildcard 280 | * by making the byte = 0. If make_mask is set, it creates 281 | * a mask based on "*" fields. All non = "*" bytes are 282 | * set to 0xff, all "*" fields are set to 0x0. 
283 | */ 284 | function ipv4_pton:long (addr:string, make_mask:long) 285 | { 286 | i = 32 287 | ip = 0 288 | ips = addr 289 | 290 | while (strlen(byte = tokenize(ips, ".")) != 0) { 291 | i -= 8 292 | ips = "" 293 | 294 | if (byte == "*") { 295 | byte = "0" 296 | } else if (make_mask) 297 | byte = "255" 298 | 299 | j = strtol(byte, 10) 300 | if (j > 0xff) 301 | return -1 302 | 303 | ip = ip + (j << i) 304 | } 305 | 306 | if (i != 0) 307 | return -1 308 | 309 | return ip 310 | } 311 | 312 | /* 313 | * Convert an ascii integer between 0 and 0xffff to a 314 | * u16 port number. "*" is treated as a wildcard and 315 | * will be converted to 0x0. 316 | */ 317 | function ipv4_portton:long (port:string) 318 | { 319 | if (port == "*") 320 | port = "0" 321 | 322 | digit = strtol(port, 10) 323 | if (digit > 0xffff) 324 | return -1 325 | 326 | return digit 327 | } 328 | 329 | /* 330 | * Do connection filtering. 331 | * On success return 1, else return 0. 332 | */ 333 | function use_conn_filter:long (sk:long) 334 | { 335 | laddr = tcpmib_local_addr(sk) 336 | lport = tcpmib_local_port(sk) 337 | raddr = tcpmib_remote_addr(sk) 338 | rport = tcpmib_remote_port(sk) 339 | 340 | return do_conn_filter(laddr, lport, raddr, rport) 341 | } 342 | 343 | function do_conn_filter:long (laddr:long, lport:long, 344 | raddr:long, rport:long) 345 | %{ 346 | int local_valid, remote_valid; 347 | conn_filter_t filter; 348 | int i, ret = 0; 349 | 350 | if (!conn_filters.num) { 351 | STAP_RETVALUE = 1; 352 | return; 353 | } 354 | 355 | for (i = 0; i < conn_filters.num; i++) { 356 | local_valid = remote_valid = 0; 357 | filter = conn_filters.filters[i]; 358 | 359 | /* filter local ip and local port */ 360 | if ((STAP_ARG_laddr & filter.laddr_mask) == filter.laddr) { 361 | if (!filter.lport || (STAP_ARG_lport == filter.lport)) 362 | local_valid = 1; 363 | } 364 | 365 | /* filter remote ip and remote port */ 366 | if ((STAP_ARG_raddr & filter.raddr_mask) == filter.raddr) { 367 | if (!filter.rport || 
(STAP_ARG_rport == filter.rport)) 368 | remote_valid = 1; 369 | } 370 | 371 | if (local_valid && remote_valid) { 372 | ret = 1; 373 | break; 374 | } 375 | } 376 | 377 | STAP_RETVALUE = ret; 378 | %} 379 | 380 | /** 381 | * stop itself after running specified secs. 382 | */ 383 | function stap_update_timer:long (delta:long) 384 | %{ 385 | stap_options.timeout -= STAP_ARG_delta; 386 | STAP_RETVALUE = stap_options.timeout; 387 | %} 388 | 389 | probe timer.s(1) 390 | { 391 | if (%{stap_options.timeout%} == 0) 392 | next 393 | 394 | if (stap_update_timer(1) == 0 && 395 | !mem_is_stop()) { 396 | mem_set_stop() 397 | mem_free_active() 398 | 399 | exit() 400 | } 401 | } 402 | 403 | 404 | -------------------------------------------------------------------------------- /src/recv.stp: -------------------------------------------------------------------------------- 1 | # Recv Routine 2 | 3 | %{ 4 | #include 5 | #include 6 | %} 7 | 8 | /** 9 | * Receive data. 10 | */ 11 | probe trans.Recv = 12 | kernel.function("tcp_event_data_recv") 13 | { 14 | if (mem_is_stop()) next 15 | if (!$sk->sk_protinfo) 16 | next 17 | 18 | do_Recv($sk) 19 | } 20 | 21 | function do_Recv(sk:long) 22 | %{ 23 | struct sock *sk = (struct sock *)STAP_ARG_sk; 24 | struct inet_connection_sock *icsk = inet_csk(sk); 25 | struct stap_info *info = sk->sk_protinfo; 26 | struct stap_http *http; 27 | struct timeval tv; 28 | unsigned long time; 29 | void *ptr; 30 | 31 | /* -H option, HTTP performance */ 32 | if (info->http && info->http_alloc) { 33 | /* run out of pre allocated http structs */ 34 | if (info->http_count >= info->http_alloc) { 35 | if (info->http_alloc >= STAP_HTTP_MAX_CNT) 36 | goto OUT; 37 | 38 | ptr = krealloc(info->http, sizeof(struct stap_http) * 39 | (info->http_alloc + STAP_HTTP_INIT_CNT), 40 | GFP_ATOMIC | __GFP_ZERO); 41 | if (!ptr) 42 | goto OUT; 43 | 44 | info->http = ptr; 45 | info->http_alloc += STAP_HTTP_INIT_CNT; 46 | atomic64_add(STAP_HTTP_INIT_CNT, &stap_mem.alloc.http); 47 | } 48 | 49 
| /* Corner case: request is composed of multi packets */ 50 | if (info->http_count) { 51 | http = info->http + (info->http_count -1); 52 | if (http && !http->xmit_resp_ts) 53 | goto OUT; 54 | } 55 | 56 | info->http_count++; 57 | info->http_offset = tcp_sk(sk)->packets_out? 2 : 1; 58 | 59 | http = info->http + (info->http_count - 1); 60 | if (http) { 61 | http->rcv_req_ts = tcp_time_stamp; 62 | do_gettimeofday(&tv); 63 | time = tv.tv_sec + 8 * 3600; 64 | rtc_time_to_tm(time, &http->tm); 65 | } 66 | } 67 | 68 | OUT: 69 | return; 70 | %} 71 | 72 | /** 73 | * HTTP response is ACKed. 74 | */ 75 | probe http.RespEnd = 76 | kernel.function("tcp_rearm_rto") 77 | { 78 | if (mem_is_stop()) next 79 | if (!$sk->sk_protinfo) 80 | next 81 | 82 | do_RespEnd($sk) 83 | } 84 | 85 | function do_RespEnd(sk:long) 86 | %{ 87 | struct sock *sk = (struct sock *)STAP_ARG_sk; 88 | struct stap_info *info = sk->sk_protinfo; 89 | struct stap_http *http, *next; 90 | 91 | if (!info->http || !info->http_count || 92 | info->http_count < info->http_offset) 93 | return; 94 | 95 | http = info->http + (info->http_count - info->http_offset); 96 | 97 | if (http) { 98 | http->last_ack_ts = tcp_time_stamp; 99 | http->end_snd_una = tcp_sk(sk)->snd_una; 100 | 101 | if (info->http_offset == 2) { 102 | next = http + 1; 103 | http->last_ack_ts = next->rcv_req_ts; 104 | } 105 | info->http_offset = 1; 106 | 107 | /* For HTTP trans_time filter */ 108 | if (http->last_ack_ts && http->xmit_resp_ts && 109 | http->last_ack_ts - http->xmit_resp_ts >= 110 | stap_options.trans_time) 111 | info->http_filter = 1; 112 | } 113 | %} 114 | 115 | /** 116 | * Receive an ACK. 
117 | */ 118 | probe trans.RecvACK = 119 | kernel.statement("tcp_ack@net/ipv4/tcp_input.c:3756") 120 | { 121 | if (mem_is_stop()) next 122 | if (!$sk->sk_protinfo) 123 | next 124 | 125 | do_RecvACK($sk, $skb) 126 | } 127 | 128 | function do_RecvACK(sk:long, skb:long) 129 | %{ 130 | struct sock *sk = (struct sock *)STAP_ARG_sk; 131 | struct sk_buff *skb = (struct sk_buff *)STAP_ARG_skb; 132 | struct stap_info *info = sk->sk_protinfo; 133 | 134 | if (skb && !tcp_hdr(skb)->window) 135 | info->zero_awnd++; 136 | %} 137 | 138 | 139 | -------------------------------------------------------------------------------- /src/reset.stp: -------------------------------------------------------------------------------- 1 | # Monitor Reset packets 2 | 3 | %{ 4 | #include 5 | #include 6 | %} 7 | 8 | /** 9 | * Send Active RST. 10 | */ 11 | probe rst.ActiveSend = 12 | kernel.function("tcp_send_active_reset") 13 | { 14 | if (!filter($sk)) 15 | next 16 | 17 | do_ActiveSend($sk) 18 | } 19 | 20 | function do_ActiveSend(sk:long) 21 | { 22 | printf("[TX RST],%s,%s\n", 23 | get_full_time(), get_socket_addr(sk)) 24 | } 25 | 26 | /** 27 | * Receive RST. 
28 | */ 29 | probe rst.Recv = 30 | kernel.function("tcp_reset") 31 | { 32 | if (!filter($sk)) 33 | next 34 | 35 | do_RstRecv($sk) 36 | } 37 | 38 | function do_RstRecv(sk:long) 39 | { 40 | printf("[RX RST],%s,%s,state=%s\n", 41 | get_full_time(), get_socket_addr(sk), 42 | get_socket_state(sk)) 43 | } 44 | 45 | function check_rst(sk:long, state:long) 46 | %{ 47 | struct sock *sk = (struct sock *)STAP_ARG_sk; 48 | struct stap_info *info = sk->sk_protinfo; 49 | u32 new_state = STAP_ARG_state; 50 | 51 | if (sk->sk_state == TCP_ESTABLISHED && 52 | new_state == TCP_CLOSE) 53 | info->rst_flag = 1; 54 | %} 55 | 56 | 57 | -------------------------------------------------------------------------------- /src/retrans.stp: -------------------------------------------------------------------------------- 1 | # Loss and Retransmission 2 | 3 | %{ 4 | #include 5 | #include 6 | 7 | #define CA_STATE_NUM 5 8 | static const char *ca_state_name[CA_STATE_NUM] = { 9 | "Open", 10 | "Disorder", 11 | "CWR", 12 | "Recovery", 13 | "Loss" 14 | }; 15 | 16 | static int stap_tcp_packet_delayed(struct tcp_sock *tp) 17 | { 18 | return !tp->retrans_stamp || 19 | (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 20 | before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp)); 21 | } 22 | 23 | static int stap_tcp_may_undo(struct tcp_sock *tp) 24 | { 25 | return tp->undo_marker && (!tp->undo_retrans || 26 | stap_tcp_packet_delayed(tp)); 27 | } 28 | %} 29 | 30 | /** 31 | * Fast recovery event. 
32 | */ 33 | probe retran.FrEvent = 34 | kernel.statement("tcp_fastretrans_alert@net/ipv4/tcp_input.c:3139") 35 | { 36 | if (mem_is_stop()) next 37 | if (!$sk->sk_protinfo) 38 | next 39 | 40 | do_FrEvent($sk) 41 | } 42 | 43 | function do_FrEvent(sk:long) 44 | %{ 45 | struct sock *sk = (struct sock *)STAP_ARG_sk; 46 | struct stap_info *info = sk->sk_protinfo; 47 | struct stap_retrans *retrans = info->retrans; 48 | struct sk_buff *skb; 49 | u32 when; 50 | 51 | if (!retrans) 52 | return; 53 | 54 | skb = tcp_write_queue_head(sk); 55 | when = ((struct tcp_skb_cb *)&(skb->cb[0]))->when; 56 | 57 | retrans->fr_ev++; 58 | retrans->fr_s_t = tcp_time_stamp; 59 | retrans->fr_wt += tcp_time_stamp - when; 60 | %} 61 | 62 | /** 63 | * Timeout event distribution. 64 | */ 65 | probe retran.ToEventDistr = 66 | kernel.statement("tcp_retransmit_timer@net/ipv4/tcp_timer.c:334") 67 | { 68 | if (mem_is_stop()) next 69 | if (!$sk->sk_protinfo) 70 | next 71 | 72 | do_ToEventDistr($sk) 73 | } 74 | 75 | function do_ToEventDistr(sk:long) 76 | %{ 77 | struct sock *sk = (struct sock *)STAP_ARG_sk; 78 | struct stap_info *info = sk->sk_protinfo; 79 | struct stap_retrans *retrans = info->retrans; 80 | struct inet_connection_sock *icsk = inet_csk(sk); 81 | struct sk_buff *skb; 82 | u32 when; 83 | 84 | if (!retrans) 85 | return; 86 | 87 | skb = tcp_write_queue_head(sk); 88 | when = ((struct tcp_skb_cb *)&(skb->cb[0]))->when; 89 | 90 | retrans->to_ev++; 91 | retrans->to_s_t = tcp_time_stamp; 92 | retrans->to_wt += tcp_time_stamp - when; 93 | 94 | retrans->ca_bf_to = icsk->icsk_ca_state; 95 | retrans->to_ev_distr[icsk->icsk_ca_state]++; 96 | retrans->to_wt_distr[icsk->icsk_ca_state] += tcp_time_stamp - when; 97 | %} 98 | 99 | /** 100 | * Retrans pkt distribution. 
101 | */ 102 | probe retran.PktDistr = 103 | kernel.statement("tcp_retransmit_skb@net/ipv4/tcp_output.c:2146") 104 | { 105 | if (mem_is_stop()) next 106 | if (!$sk->sk_protinfo) 107 | next 108 | 109 | do_PktDistr($sk, $skb) 110 | } 111 | 112 | function do_PktDistr(sk:long, skb:long) 113 | %{ 114 | struct sock *sk = (struct sock *)STAP_ARG_sk; 115 | struct stap_info *info = sk->sk_protinfo; 116 | struct stap_retrans *retrans = info->retrans; 117 | struct inet_connection_sock *icsk = inet_csk(sk); 118 | int pkts_sent; 119 | 120 | if (!retrans) 121 | return; 122 | 123 | pkts_sent = tcp_skb_pcount((struct sk_buff *)STAP_ARG_skb); 124 | 125 | if (icsk->icsk_ca_state == TCP_CA_Recovery) 126 | retrans->fr_rp += pkts_sent; 127 | else { 128 | retrans->to_rp += pkts_sent; 129 | retrans->to_rp_distr[retrans->ca_bf_to] += pkts_sent; 130 | } 131 | %} 132 | 133 | /** 134 | * Retrans recover time. 135 | */ 136 | probe retran.RecoverTime = 137 | kernel.statement("tcp_fastretrans_alert@net/ipv4/tcp_input.c:3039") 138 | { 139 | if (mem_is_stop()) next 140 | if (!$sk->sk_protinfo) 141 | next 142 | 143 | do_RecoverTime($sk) 144 | if (%{stap_options.bitmap%} & %{STAPF_HTTP%}) 145 | do_RespEnd($sk) 146 | } 147 | 148 | function do_RecoverTime(sk:long) 149 | %{ 150 | struct sock *sk = (struct sock *)STAP_ARG_sk; 151 | struct stap_info *info = sk->sk_protinfo; 152 | struct stap_retrans *retrans = info->retrans; 153 | struct inet_connection_sock *icsk = inet_csk(sk); 154 | u32 now = tcp_time_stamp; 155 | 156 | if (!retrans) 157 | return; 158 | 159 | if (icsk->icsk_ca_state == TCP_CA_Recovery) 160 | retrans->fr_rt += now - retrans->fr_s_t; 161 | else if (icsk->icsk_ca_state == TCP_CA_Loss) { 162 | retrans->to_rt += now - retrans->to_s_t; 163 | retrans->to_rt_distr[retrans->ca_bf_to] += now - retrans->to_s_t; 164 | } 165 | %} 166 | 167 | /** 168 | * Retrans(Fast recovery and Timeout) undo event. 
169 | */ 170 | probe retran.UndoEvent = 171 | kernel.function("tcp_try_undo_recovery") 172 | { 173 | if (mem_is_stop()) next 174 | if (!$sk->sk_protinfo) 175 | next 176 | 177 | if (@cast($sk, "inet_connection_sock")->icsk_ca_state 178 | == %{TCP_CA_Recovery%}) 179 | do_UndoIncr($sk, 1) 180 | else 181 | do_UndoIncr($sk, 0) 182 | } 183 | 184 | function do_UndoIncr(sk:long, fr:long) 185 | %{ 186 | struct sock *sk = (struct sock *)STAP_ARG_sk; 187 | struct tcp_sock *tp = tcp_sk(sk); 188 | struct stap_info *info = sk->sk_protinfo; 189 | struct stap_retrans *retrans = info->retrans; 190 | 191 | if (!retrans) 192 | return; 193 | 194 | if (!stap_tcp_may_undo(tp)) 195 | return; 196 | 197 | if(STAP_ARG_fr) 198 | retrans->fr_undo += 1; 199 | else 200 | retrans->to_undo += 1; 201 | %} 202 | 203 | /** 204 | * Timeout undo event. 205 | */ 206 | probe retran.ToUndo = 207 | kernel.function("tcp_try_undo_loss") 208 | { 209 | if (mem_is_stop()) next 210 | if (!$sk->sk_protinfo) 211 | next 212 | 213 | do_UndoIncr($sk, 0) 214 | } 215 | 216 | /** 217 | * Fast recovery undo event. 218 | */ 219 | probe retran.FrUndo = 220 | kernel.function("tcp_try_undo_partial") 221 | { 222 | if (mem_is_stop()) next 223 | if (!$sk->sk_protinfo) 224 | next 225 | 226 | do_UndoIncr($sk, 1) 227 | } 228 | 229 | /** 230 | * FRTO undo event. 231 | */ 232 | probe retran.FrtoUndo = 233 | kernel.statement("tcp_process_frto@net/ipv4/tcp_input.c:3649") 234 | { 235 | if (mem_is_stop()) next 236 | if (!$sk->sk_protinfo) 237 | next 238 | 239 | do_UndoIncr($sk, 0) 240 | } 241 | 242 | /** 243 | * Log of loss and retransmission. 
244 | */ 245 | function log_retrans(sk:long) 246 | { 247 | if (%{stap_options.bitmap%} & %{STAPF_RETRANS%}) { 248 | if (!%{stap_options.detail_log%}) 249 | print(log_default_retrans(sk)) 250 | else 251 | print(log_detail_retrans(sk)) 252 | } 253 | } 254 | 255 | function log_default_retrans:string (sk:long) 256 | %{ 257 | struct sock *sk = (struct sock *)STAP_ARG_sk; 258 | struct stap_info *info = sk->sk_protinfo; 259 | struct stap_retrans *r = info->retrans; 260 | int len = 0, maxlen = MAXSTRINGLEN; 261 | char *buf = (char *)STAP_RETVALUE; 262 | 263 | if (r->fr_ev) 264 | len = snprintf(buf, maxlen, 265 | ",fr_ev=%u,fr_repkts=%u,fr_wait=%u,fr_rec=%u,fr_undo=%u", 266 | r->fr_ev, r->fr_rp, r->fr_wt, r->fr_rt, r->fr_undo); 267 | else 268 | len = snprintf(buf, maxlen, 269 | ",fr_ev=%d,fr_repkts=%d,fr_wait=%d,fr_rec=%d,fr_undo=%d", 270 | -1, -1, -1, -1, -1); 271 | 272 | if (r->to_ev) 273 | len += snprintf(buf + len, maxlen - len, 274 | ",to_ev=%u,to_repkts=%u,to_wait=%u,to_rec=%u,to_undo=%u", 275 | r->to_ev, r->to_rp, r->to_wt, r->to_rt, r->to_undo); 276 | else 277 | len += snprintf(buf + len, maxlen - len, 278 | ",to_ev=%d,to_repkts=%d,to_wait=%d,to_rec=%d,to_undo=%d", 279 | -1, -1, -1, -1, -1); 280 | %} 281 | 282 | function log_detail_retrans:string (sk:long) 283 | %{ 284 | struct sock *sk = (struct sock *)STAP_ARG_sk; 285 | struct stap_info *info = sk->sk_protinfo; 286 | struct stap_retrans *r = info->retrans; 287 | int i, len = 0, maxlen = MAXSTRINGLEN; 288 | char *buf = (char *)STAP_RETVALUE; 289 | 290 | if (r->fr_ev || r->to_ev) { 291 | len = snprintf(buf, maxlen, "%-19s%-11s%-11s%-11s%-11s%-11s\n", 292 | "RETRANS TABLE", "events", "pkts", "wa_time", "rec_time", 293 | "undo"); 294 | 295 | if (r->fr_ev) 296 | len += snprintf(buf + len, maxlen - len, 297 | "%-19s%-11u%-11u%-11u%-11u%-11u\n", "Fast recovery", 298 | r->fr_ev, r->fr_rp, r->fr_wt, r->fr_rt, r->fr_undo); 299 | 300 | if (r->to_ev) { 301 | len += snprintf(buf + len, maxlen - len, 302 | 
"%-19s%-11u%-11u%-11u%-11u%-11u\n", "Timeout", 303 | r->to_ev, r->to_rp, r->to_wt, r->to_rt, r->to_undo); 304 | 305 | for (i = 0; i < CA_STATE_NUM; i++) { 306 | if (!r->to_ev_distr[i]) 307 | continue; 308 | 309 | len += snprintf(buf + len, maxlen - len, 310 | "TO in %-13s%-11u%-11u%-11u%-11u\n", 311 | ca_state_name[i], r->to_ev_distr[i], r->to_rp_distr[i], 312 | r->to_wt_distr[i], r->to_rt_distr[i]); 313 | } 314 | } 315 | 316 | snprintf(buf + len, maxlen - len, "\n"); 317 | } 318 | %} 319 | 320 | 321 | -------------------------------------------------------------------------------- /src/rtt.stp: -------------------------------------------------------------------------------- 1 | # RTT and RTO 2 | 3 | %{ 4 | #include 5 | #include 6 | %} 7 | 8 | /** 9 | * RTT, round trip time. 10 | */ 11 | probe trans.Rtt = 12 | kernel.function("tcp_valid_rtt_meas") 13 | { 14 | if (mem_is_stop()) next 15 | if (!$sk || !$sk->sk_protinfo) 16 | next 17 | 18 | do_Rtt($sk, $seq_rtt) 19 | } 20 | 21 | function do_Rtt(sk:long, rtt:long) 22 | %{ 23 | struct sock *sk = (struct sock *)STAP_ARG_sk; 24 | struct stap_info *info = sk->sk_protinfo; 25 | u32 new_rtt = STAP_ARG_rtt; 26 | 27 | /* Check carefully 8) */ 28 | if (new_rtt == INIT_VALUE || 29 | sk->sk_state == TCP_LISTEN || 30 | sk->sk_state == TCP_SYN_SENT || 31 | sk->sk_state == TCP_SYN_RECV) 32 | return; 33 | 34 | /* First rtt sample */ 35 | if (!info->rtt_cnt) { 36 | info->rtt[0] = new_rtt; 37 | info->rtt[2] = new_rtt; 38 | } 39 | 40 | /* Update min and max */ 41 | if (new_rtt < info->rtt[0]) 42 | info->rtt[0] = new_rtt; 43 | if (new_rtt > info->rtt[2]) 44 | info->rtt[2] = new_rtt; 45 | 46 | info->rtt[1] += new_rtt; 47 | info->rtt_cnt++; 48 | %} 49 | 50 | /** 51 | * RTO, retransmission timeout. 
52 | */ 53 | probe trans.Rto = 54 | kernel.function("tcp_set_rto") 55 | { 56 | if (mem_is_stop()) next 57 | if (!$sk || !$sk->sk_protinfo) 58 | next 59 | 60 | do_Rto($sk) 61 | } 62 | 63 | function do_Rto(sk:long) 64 | %{ 65 | struct sock *sk = (struct sock *)STAP_ARG_sk; 66 | struct tcp_sock *tp = tcp_sk(sk); 67 | struct stap_info *info = sk->sk_protinfo; 68 | u32 new_rto; 69 | 70 | /* Check carefully 8) */ 71 | if (sk->sk_state == TCP_LISTEN || 72 | sk->sk_state == TCP_SYN_SENT || 73 | sk->sk_state == TCP_SYN_RECV) 74 | return; 75 | 76 | new_rto = (tp->srtt >> 3) + tp->rttvar; 77 | if (new_rto > TCP_RTO_MAX) 78 | new_rto = TCP_RTO_MAX; 79 | 80 | info->rto[1] += new_rto; 81 | info->rto_cnt++; 82 | 83 | /* First rto sample */ 84 | if (!info->rto[0]) 85 | info->rto[0] = new_rto; 86 | if (!info->rto[2]) 87 | info->rto[2] = new_rto; 88 | 89 | /* Update min and max */ 90 | if (new_rto < info->rto[0]) 91 | info->rto[0] = new_rto; 92 | if (new_rto > info->rto[2]) 93 | info->rto[2] = new_rto; 94 | %} 95 | 96 | /** 97 | * Log RTT and RTO. 
98 | */ 99 | function log_rtt_rto (sk:long) 100 | { 101 | rtt_min = %{STAP_VALUE(STAP_ARG_sk, rtt[0])%} 102 | rtt_sum = %{STAP_VALUE(STAP_ARG_sk, rtt[1])%} 103 | rtt_max = %{STAP_VALUE(STAP_ARG_sk, rtt[2])%} 104 | 105 | rto_min = %{STAP_VALUE(STAP_ARG_sk, rto[0])%} 106 | rto_sum = %{STAP_VALUE(STAP_ARG_sk, rto[1])%} 107 | rto_max = %{STAP_VALUE(STAP_ARG_sk, rto[2])%} 108 | 109 | rtt_cnt = %{STAP_VALUE(STAP_ARG_sk, rtt_cnt)%} 110 | rto_cnt = %{STAP_VALUE(STAP_ARG_sk, rto_cnt)%} 111 | 112 | if (!rtt_cnt) 113 | rtt_cnt = rtt_avg = rtt_min = rtt_max = -1 114 | else 115 | rtt_avg = rtt_sum / rtt_cnt 116 | 117 | if (!rto_cnt) 118 | rto_cnt = rto_avg = rto_min = rto_max = -1 119 | else 120 | rto_avg = rto_sum / rto_cnt 121 | 122 | /* log format: default or detail */ 123 | if (!%{stap_options.detail_log%}) { 124 | printf(",rtt_avg=%d,rtt_min=%d,rtt_max=%d,rtt_cnt=%d", 125 | rtt_avg, rtt_min, rtt_max, rtt_cnt) 126 | printf(",rto_avg=%d,rto_min=%d,rto_max=%d,rto_cnt=%d", 127 | rto_avg, rto_min, rto_max, rto_cnt) 128 | } else { 129 | printf("%-18s %-10s %-10s %-10s %-10s\n", 130 | "RTT TABLE", "avg", "min", "max", "cnt") 131 | printf("%-18s %-10d %-10d %-10d %-10d\n", "RTT(ms)", 132 | rtt_avg, rtt_min, rtt_max, rtt_cnt) 133 | printf("%-18s %-10d %-10d %-10d %-10d\n", "RTO(ms)", 134 | rto_avg, rto_min, rto_max, rto_cnt) 135 | printf("\n") 136 | } 137 | } 138 | 139 | 140 | -------------------------------------------------------------------------------- /src/send.stp: -------------------------------------------------------------------------------- 1 | # Send Routine 2 | 3 | %{ 4 | #include 5 | #include 6 | %} 7 | 8 | /** 9 | * Send data out. 
*/
probe trans.Send =
	kernel.statement("tcp_write_xmit@net/ipv4/tcp_output.c:1785")
{
	/* Skip when the memory module reports that tracing is stopped. */
	if (mem_is_stop()) next

	/* Only connections that carry a stap_info are tracked. */
	if (!$sk->sk_protinfo)
		next

	do_Send($sk, $sent_pkts)
}

/**
 * Account for the packets sent by one pass of tcp_write_xmit().
 *
 * @sk:  address of the struct sock being transmitted on
 * @cnt: number of packets sent in this pass
 *
 * Updates the per-connection stap_info hung off sk->sk_protinfo:
 *   - adds @cnt to trans_pkt;
 *   - when HTTP tracking is active and something was sent, stamps the
 *     newest stap_http entry with the response start time and start
 *     sequence (first transmission only);
 *   - bumps small_swnd when the segment at the send head (capped to one
 *     MSS) does not fit inside the peer's advertised window.
 *
 * Does nothing when no stap_info is attached to the socket.
 */
function do_Send(sk:long, cnt:long)
%{
	struct sock *sk = (struct sock *)STAP_ARG_sk;
	struct stap_info *info = sk ? sk->sk_protinfo : NULL;

	if (info) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct sk_buff *skb = sk->sk_send_head;

		info->trans_pkt += STAP_ARG_cnt;

		/* -H option, HTTP performance */
		if (info->http && info->http_count && STAP_ARG_cnt) {
			/* The newest request is the last used slot of the queue. */
			struct stap_http *http = info->http + (info->http_count - 1);

			if (!http->xmit_resp_ts) {
				http->xmit_resp_ts = tcp_time_stamp;
				http->start_seq = tp->snd_una;
			}
		}

		/* Check peer's receive window */
		if (skb) {
			u32 end_seq = TCP_SKB_CB(skb)->end_seq;

			/* Only the first MSS worth of a larger skb has to fit. */
			if (skb->len > tp->mss_cache)
				end_seq = TCP_SKB_CB(skb)->seq + tp->mss_cache;

			if (after(end_seq, tp->snd_una + tp->snd_wnd))
				info->small_swnd++;
		}
	}
%}

/**
 * Log transmission info of a connection.
 *
 * @sk:    address of the struct sock; it must have a stap_info attached
 *         (the STAP_VALUE() reads below dereference sk->sk_protinfo
 *         without checking it)
 * @state: numeric TCP state the socket is moving to
 *
 * Prints either one comma separated record (default) or a "TRANS TABLE"
 * block (detail_log option).
 */
function log_trans (sk:long, state:long)
{
	data = get_conn_data(sk)
	time = get_conn_lifetime(sk)
	/* Never report a lifetime of 0. */
	time = time ? time : 1

	trans_pkt = %{STAP_VALUE(STAP_ARG_sk, trans_pkt)%}
	rtx_synack = %{STAP_VALUE(STAP_ARG_sk, rtx_synack)%}
	accept_wait = %{STAP_VALUE(STAP_ARG_sk, accept_wait)%}
	small_swnd = %{STAP_VALUE(STAP_ARG_sk, small_swnd)%}
	zero_awnd = %{STAP_VALUE(STAP_ARG_sk, zero_awnd)%}
	rst_flag = %{STAP_VALUE(STAP_ARG_sk, rst_flag)%}

	from_state = get_socket_state(sk)
	to_state = socket_state_num2str(state)

	/* log format: default and detail */
	if (!%{stap_options.detail_log%}) {
		printf(",data=%u,time=%u,packet=%u,synack_rtx=%u",
			data, time, trans_pkt, rtx_synack)
		printf(",accept_wait=%u,small_swnd=%u,zero_awnd=%u",
			accept_wait, small_swnd, zero_awnd)
		printf(",rst_flag=%u,from_state=%s,to_state=%s",
			rst_flag, from_state, to_state)
	} else {
		printf("TRANS TABLE\n")
		printf("%-18s %u B\n", "data", data)
		printf("%-18s %u ms\n", "time", time)
		printf("%-18s %u pkts\n", "packet", trans_pkt)
		printf("%-18s %u pkts\n", "synack_rtx", rtx_synack)
		printf("%-18s %u ms\n", "accept_wait", accept_wait)
		printf("%-18s %u\n", "small_swnd", small_swnd)
		printf("%-18s %u\n", "zero_awnd", zero_awnd)
		printf("%-18s %u\n", "rst_flag", rst_flag)
		printf("%-18s %s\n", "from_state", from_state)
		printf("%-18s %s\n", "to_state", to_state)
	}
}


--------------------------------------------------------------------------------
/src/share.stp:
--------------------------------------------------------------------------------
# functions shared by several modules

%{
/* NOTE(review): the header names were missing in this copy of the file.
 * The two below are inferred from the symbols used in this block
 * (struct rtc_time / rtc_time_to_tm, struct timeval / do_gettimeofday);
 * confirm against the upstream source.
 */
#include <linux/rtc.h>
#include <linux/time.h>

/* TCP state names, indexed by the numeric socket state (0 is unused). */
static const char tcp_state_array[][16] = {
	"NULL",
	"ESTABLISHED",
	"SYN_SENT",
	"SYN_RECV",
	"FIN_WAIT1",
	"FIN_WAIT2",
	"TIME_WAIT",
	"CLOSE",
	"CLOSE_WAIT",
	"LAST_ACK",
	"LISTEN",
	"CLOSING"
};

/* Fixed offset added to the wall clock before formatting (UTC+8). */
#define STAP_TZ_OFFSET_SEC	(8 * 3600)

/* Map a numeric TCP state to its name; out-of-range values yield
 * "UNKNOWN" instead of reading past the end of tcp_state_array.
 */
static const char *stap_tcp_state_name(long state)
{
	const long nr = sizeof(tcp_state_array) / sizeof(tcp_state_array[0]);

	if (state < 0 || state >= nr)
		return "UNKNOWN";
	return tcp_state_array[state];
}

/* Fill @tm with the current wall clock time shifted by STAP_TZ_OFFSET_SEC. */
static void stap_local_time(struct rtc_time *tm)
{
	struct timeval tv;
	unsigned long time;

	do_gettimeofday(&tv);
	time = tv.tv_sec + STAP_TZ_OFFSET_SEC;
	rtc_time_to_tm(time, tm);
}
%}

/**
 * Current time of day formatted as "HH:MM:SS" (wall clock + 8 hours).
 */
function get_short_time:string()
%{
	struct rtc_time tm;

	stap_local_time(&tm);

	snprintf(STAP_RETVALUE, MAXSTRINGLEN, "%02d:%02d:%02d",
		tm.tm_hour, tm.tm_min, tm.tm_sec);
%}

/**
 * Current date and time formatted as "YYYY/M/D,HH:MM:SS"
 * (wall clock + 8 hours).
 */
function get_full_time:string()
%{
	struct rtc_time tm;

	stap_local_time(&tm);

	snprintf(STAP_RETVALUE, MAXSTRINGLEN, "%d/%d/%d,%02d:%02d:%02d",
		tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday,
		tm.tm_hour, tm.tm_min, tm.tm_sec);
%}

/**
 * Milliseconds elapsed since the establish timestamp (estab_t) recorded
 * in the connection's stap_info. Returns 0 when no stap_info is attached.
 */
function get_conn_lifetime:long (sk:long)
%{
	struct sock *sk = (struct sock *)STAP_ARG_sk;
	struct stap_info *info = sk ? sk->sk_protinfo : NULL;

	STAP_RETVALUE = info ?
		jiffies_to_msecs(tcp_time_stamp - info->estab_t) : 0;
%}

/**
 * Amount of sequence space sent on the connection: snd_nxt minus the
 * recorded isn, less one when non-zero (presumably to discount the
 * SYN - confirm against where isn is set).
 * Returns 0 when no stap_info is attached.
 */
function get_conn_data:long (sk:long)
%{
	struct sock *sk = (struct sock *)STAP_ARG_sk;
	struct stap_info *info = sk ? sk->sk_protinfo : NULL;

	if (info) {
		u32 len = tcp_sk(sk)->snd_nxt - info->isn;

		STAP_RETVALUE = len ? len - 1 : len;
	} else {
		STAP_RETVALUE = 0;
	}
%}

/**
 * Value of the http_filter flag of the connection's stap_info
 * (0 when no stap_info is attached).
 */
function filter_http_transtime:long (sk:long)
%{
	struct sock *sk = (struct sock *)STAP_ARG_sk;
	struct stap_info *info = sk ? sk->sk_protinfo : NULL;

	STAP_RETVALUE = info ? info->http_filter : 0;
%}

/**
 * Format both endpoints of a socket as
 * "local=<ip>:<port>,remote=<ip>:<port>".
 */
function get_socket_addr:string (sk:long)
{
	laddr = tcpmib_local_addr(sk)
	lport = tcpmib_local_port(sk)
	raddr = tcpmib_remote_addr(sk)
	rport = tcpmib_remote_port(sk)

	local_addr = sprintf("%s:%d", ip_ntop(htonl(laddr)), lport)
	remote_addr = sprintf("%s:%d", ip_ntop(htonl(raddr)), rport)

	return sprintf("local=%s,remote=%s", local_addr, remote_addr)
}

/**
 * Name of the current TCP state (sk_state) of a socket.
 */
function get_socket_state:string (sk:long)
%{
	struct sock *sk = (struct sock *)STAP_ARG_sk;

	snprintf(STAP_RETVALUE, MAXSTRINGLEN, "%s",
		stap_tcp_state_name(sk->sk_state));
%}

/**
 * Name of a numeric TCP state; "UNKNOWN" for values outside the table.
 */
function socket_state_num2str:string (state:long)
%{
	snprintf(STAP_RETVALUE, MAXSTRINGLEN, "%s",
		stap_tcp_state_name(STAP_ARG_state));
%}


--------------------------------------------------------------------------------
/src/structs.stp:
--------------------------------------------------------------------------------
# Data Structure
# What stap_info is to TcpDive is what sk_buff is to TCP/IP.
3 | 4 | %{ 5 | #include 6 | #include 7 | #include 8 | 9 | struct stap_info { 10 | /* Summary, default */ 11 | u32 estab_t; /* Establish time of a connection */ 12 | struct rtc_time tm; /* Date of the connection */ 13 | u32 isn; /* for transmission data counting */ 14 | u32 rtx_synack; /* SYNACK retrans times */ 15 | u32 trans_pkt; /* Transmission pkts */ 16 | u32 accept_wait; /* wait time before being accepted */ 17 | u32 small_swnd; /* snd_wnd less than MSS */ 18 | u32 zero_awnd; /* peer's advertise window is zero */ 19 | u32 rst_flag; /* send or receive a RST */ 20 | atomic_t freeing; /* A flag to avoid reentry of mem free */ 21 | 22 | /* RTT and RTO, default */ 23 | u32 rtt[3]; /* min, avg, max RTT */ 24 | u32 rto[3]; /* min, avg, max RTO */ 25 | u32 rto_cnt; /* for calculating average rto */ 26 | u32 rtt_cnt; /* for calculating average rtt */ 27 | 28 | /* cwnd and ssthresh, default */ 29 | u32 init_cwnd; /* initial cwnd */ 30 | u32 end_cwnd; /* cwnd at the end of connection */ 31 | u32 init_ssthr; /* initial ssthresh */ 32 | u32 end_ssthr; /* sstresh at the end of connection */ 33 | 34 | /* HTTP info, -H option */ 35 | u32 http_count; /* HTTP req/resp count */ 36 | u32 http_alloc; /* HTTP structs alloc */ 37 | u32 http_offset; /* Temp var */ 38 | u32 http_filter; /* A flag to control output */ 39 | struct stap_http *http; /* HTTP req/resp queue */ 40 | 41 | /* Other options */ 42 | struct stap_retrans *retrans; /* Loss and Retransmission, -L option */ 43 | struct stap_cong *cong; /* Congestion control, -C option */ 44 | }; 45 | 46 | /* HTTP req/resp info */ 47 | struct stap_http { 48 | struct rtc_time tm; /* Date of request */ 49 | u32 rcv_req_ts; /* HTTP request recv ts */ 50 | u32 xmit_resp_ts; /* HTTP response start xmit ts */ 51 | u32 last_ack_ts; /* HTTP response finish xmit ts */ 52 | u32 start_seq; /* HTTP response start sequence */ 53 | u32 end_snd_una; /* HTTP response end sequence received */ 54 | }; 55 | 56 | /* Loss and Retransmission */ 57 | struct 
stap_retrans { 58 | /* Retrans events */ 59 | u32 fr_ev; /* Fast recovery event */ 60 | u32 to_ev; /* Timeout event */ 61 | u32 to_ev_distr[5]; /* Timeout event distribution */ 62 | 63 | /* Retrans packets */ 64 | u32 fr_rp; /* Fast recovery retrans pkt */ 65 | u32 to_rp; /* Timeout recovery retrans pkt */ 66 | u32 to_rp_distr[5]; /* Timeout retrans pkt distribution */ 67 | 68 | /* Retrans recovery time */ 69 | u32 fr_rt; /* Fast recovery time */ 70 | u32 to_rt; /* Timeout recovery time */ 71 | u32 to_rt_distr[5]; /* Timeout recovery time distribution */ 72 | 73 | /* Retrans waiting time */ 74 | u32 fr_wt; /* Wait time before entering fast recovery */ 75 | u32 to_wt; /* Wait time before entering timeout */ 76 | u32 to_wt_distr[5]; /* Wait time before entering timeout distribution */ 77 | 78 | /* Retrans undo events */ 79 | u32 fr_undo; /* Fast recovery is spurious */ 80 | u32 to_undo; /* Timeout recovery is spurious */ 81 | 82 | /* Intermediate vars */ 83 | u32 ca_bf_to; /* ca state just before timeout */ 84 | u32 fr_s_t; /* timestap when fast recovery starts */ 85 | u32 to_s_t; /* timestap when timeout recovery starts */ 86 | }; 87 | 88 | /* Congestion control */ 89 | struct stap_cong { 90 | /* Summary */ 91 | u32 cwnd_unlimit; /* times when tcp is no cwnd limited */ 92 | u32 fast_converg; /* fast covergence performed times */ 93 | 94 | /* First loss */ 95 | u32 fl_phase; /* in which phase, 0:ss, 1:cong */ 96 | u32 fl_cwnd; /* cwnd at first loss */ 97 | u32 fl_rtt; /* rtt at first loss */ 98 | 99 | /* Slow Start 100 | * 0 - Standard 101 | * 1 - ACK Train Length 102 | * 2 - Delay Increase 103 | * 3 - Abort due to loss 104 | */ 105 | u32 ss_cnt[4]; /* slow start count */ 106 | u32 ss_time[4]; /* avg time used, unit is RTT */ 107 | u32 ss_start_cwnd[4]; /* avg cwnd when enter slow start */ 108 | u32 ss_end_cwnd[4]; /* avg cwnd when exit slow start */ 109 | 110 | /* Congestion Avoid */ 111 | /* phase_cnt and phase_time: 112 | * 0 - epoch 113 | * 1 - searching 114 | * 
2 - probing 115 | * epoch_cwnd, epoch_rtt and epoch_cnt: 116 | * 0 - start 117 | * 1 - steady 118 | * 2 - end 119 | */ 120 | u32 phase_cnt[3]; /* phase {epoch|search|probe} counts */ 121 | u32 phase_time[3]; /* phase {epoch|search|probe} time */ 122 | u32 epoch_cwnd[3]; /* epoch {start|steady|end} cwnd */ 123 | u32 epoch_rtt[3]; /* epoch {start|steady|end} rtt */ 124 | u32 epoch_cnt[3]; /* epoch {start|steady|end} count */ 125 | 126 | /* Advanced CC option, -A */ 127 | atomic_t adcong_cnt; /* point count */ 128 | u16 adcong_flag; /* on or off */ 129 | u8 adcong_point; /* point type */ 130 | u8 adcong_msg; /* point msg */ 131 | 132 | /* Temp vars */ 133 | u32 ss_running; /* in slow start */ 134 | u32 ss_enter_cwnd; /* cwnd when entering slow start */ 135 | u32 ss_rtt_cnt; /* for slow start time measurement */ 136 | u32 epoch_switch; /* epoch phase switch flag */ 137 | u32 epoch_switch_ts; /* epoch phase switch timestamp */ 138 | }; 139 | 140 | enum { 141 | STAP_RETRANS = 1, 142 | STAP_HTTP = 2, 143 | STAP_CONG = 3, 144 | STAP_RST = 4, 145 | STAP_ADCONG = 5 146 | }; 147 | 148 | enum { 149 | STAPF_RETRANS = (1 << 1), 150 | STAPF_HTTP = (1 << 2), 151 | STAPF_CONG = (1 << 3), 152 | STAPF_RST = (1 << 4), 153 | STAPF_ADCONG = (1 << 5) 154 | }; 155 | 156 | #define INIT_VALUE ((unsigned)(-1)) 157 | 158 | #define STAP_VALUE(sk, name) \ 159 | (((struct stap_info *)((struct sock *) \ 160 | sk)->sk_protinfo)->name) 161 | 162 | #define STAP_MEM_VALUE(sk, mem, name) \ 163 | (((struct stap_info *)((struct sock *) \ 164 | sk)->sk_protinfo)->mem->name) 165 | 166 | #define STAP_HTTP_VALUE(addr, name) \ 167 | (((struct stap_http *)addr)->name) 168 | 169 | #define STAP_HTTP_MEM_VALUE(addr, mem, name) \ 170 | (((struct stap_http *)addr)->mem.name) 171 | 172 | /* Number of HTTP request structs pre allocated. 173 | * With HTTP Keepalive enabled, a value greater than one is suggested. 
174 | */ 175 | #define STAP_HTTP_INIT_CNT 1 176 | 177 | /* Max number of HTTP request structs that a connection can use. 178 | * Prevent too much memory consumed by one connection. 179 | */ 180 | #define STAP_HTTP_MAX_CNT 10000 181 | %} 182 | 183 | function structs_init () {} 184 | 185 | 186 | -------------------------------------------------------------------------------- /tcpdive.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Name: 4 | # tcpdive - A TCP performance profiling tool. 5 | # 6 | # Description: 7 | # Tcpdive is designed to provide an insight into TCP's performance. 8 | # It uses systemtap to collect data from a running linux kernel. 9 | 10 | VERSION="1.0" 11 | DATE="2016-1-1" 12 | AUTHOR="zhangskd@gmail.com" 13 | DIR=`dirname $0` 14 | SOURCE="$DIR/src" 15 | 16 | # MODULES 17 | RETRANS="never" 18 | CONG="never" 19 | RST="never" 20 | HTTP="never" 21 | BITMAP=0 22 | 23 | # OPTIONS 24 | VERBOSE="" 25 | FILTER="" 26 | PORTS="" 27 | FORMAT=0 28 | MODULE_NAME="" 29 | RUN_TIME="" 30 | LIFE_TIME=0 31 | TRANS_TIME=0 32 | SAMPLE=0 33 | ADCONG=0 34 | 35 | function version { 36 | echo -e "Version $VERSION $DATE" 37 | echo -e "$AUTHOR" 38 | echo "tcpdive - A TCP performance profiling tool." 
39 | echo "" 40 | } 41 | 42 | function usage { 43 | echo "USAGE:" 44 | echo " $0 [options] [modules] [filters]" 45 | echo "" 46 | echo "OPTIONS:" 47 | echo " -h # show help" 48 | echo " -V # show version" 49 | echo " -v # verbose mode for debugging" 50 | echo " -t # stop itself after running specified time" 51 | echo " -m # compile as tcpdive.ko instead of running directly" 52 | echo " -d # detailed logging instead of default format" 53 | echo "" 54 | echo "MODULES:" 55 | echo " -L # Loss and Retransmission" 56 | echo " -H # HTTP Performance (1.0/1.1)" 57 | echo " -C # Congestion Control" 58 | echo " -A # Advanced CC (depict critical points)" 59 | echo " -R # Monitor Reset Packet" 60 | echo "" 61 | echo "FILTERS:" 62 | echo " -l # lifetime of connection should greater than " 63 | echo " -i # trans time of response should greater than " 64 | echo " -s # take one sample from connections" 65 | echo " -p # server ports cared, use comma to separate" 66 | echo "" 67 | echo " -f :-: [-f <...>] # should be last" 68 | echo " eg. 
-f *.*.*.*:80-10.210.136.*:*" 69 | echo "" 70 | } 71 | 72 | # Process cmdline options 73 | while getopts hVvdLCRHmA:f:t:l:i:s:p: option 74 | do 75 | case $option in 76 | V) version 77 | exit 0;; 78 | v) VERBOSE="-v";; 79 | L) RETRANS="retran.*" 80 | BITMAP=$(($BITMAP+2));; 81 | H) HTTP="http.*" 82 | BITMAP=$(($BITMAP+4));; 83 | C) CONG="cong.*" 84 | BITMAP=$(($BITMAP+8));; 85 | R) RST="rst.*" 86 | BITMAP=$(($BITMAP+16));; 87 | A) ADCONG=$OPTARG 88 | CONG="cong.*" 89 | BITMAP=$(($BITMAP+32));; 90 | f) FILTER="$OPTARG $FILTER";; 91 | t) RUN_TIME=$OPTARG;; 92 | m) MODULE_NAME="-p 4 -m tcpdive";; 93 | d) FORMAT=1;; 94 | l) LIFE_TIME=$OPTARG;; 95 | i) TRANS_TIME=$OPTARG;; 96 | s) SAMPLE=$OPTARG;; 97 | p) PORTS="ports=$OPTARG";; 98 | h|?|*) 99 | usage 100 | exit 1;; 101 | esac 102 | done 103 | 104 | # In case that no modules specified, set default here 105 | if [ $BITMAP -eq 0 ]; then 106 | RETRANS="never" 107 | CONG="never" 108 | HTTP="never" 109 | RST="never" 110 | fi 111 | 112 | # Use systemtap to compile module 113 | stap -t -w -g -DINTERRUPTIBLE=0 \ 114 | -D MAXACTION=1000000 -D MAXSTRINGLEN=1024 \ 115 | $VERBOSE $MODULE_NAME -I $SOURCE -e ' 116 | probe begin 117 | { 118 | printf("Probe begin...\n") 119 | structs_init() 120 | 121 | if (process_cmdline() < 0) 122 | exit() 123 | } 124 | 125 | # Probe points MUST 126 | probe estab.* {} 127 | probe close.* {} 128 | probe trans.* {} 129 | 130 | # Probe points Optional 131 | probe '$RETRANS' {} 132 | probe '$CONG' {} 133 | probe '$RST' {} 134 | probe '$HTTP' {} 135 | 136 | probe end 137 | { 138 | if (!mem_is_stop()) 139 | mem_free_active() 140 | 141 | printf("\n\nProbe end!\n") 142 | log_mem_usage() 143 | } 144 | ' bitmap=$BITMAP timeout=$RUN_TIME lifetime=$LIFE_TIME \ 145 | trans_time=$TRANS_TIME ad_cong=$ADCONG detail_log=$FORMAT \ 146 | sample_ratio=$SAMPLE $PORTS $FILTER 147 | 148 | 149 | --------------------------------------------------------------------------------