├── Makefile ├── test.sh ├── detail_report.sh ├── merge_all_thread.py ├── report.sh ├── README.md └── ProcessGroupMonitor.cpp /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean 2 | 3 | PGM: clean 4 | g++ -std=c++11 ProcessGroupMonitor.cpp -g -o PGM 5 | 6 | clean: 7 | rm -f PGM 8 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Task start ..." 4 | 5 | for ((i=1; i<=1000000; i ++)) 6 | do 7 | echo $i 8 | done 9 | 10 | echo "Task end ..." 11 | -------------------------------------------------------------------------------- /detail_report.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function run1(){ 4 | awk 'BEGIN{tb=-1;tp=-1;tn=-1;mm=m;mc=0;r=0;tct=0}{if($1 == "cmd" && r>1) print $0 ; if($1=="###"){ r+=1;if(r==1){ tb=0+$4; tp=tb;} }; if(r<2){if($1=="---"){ tn = 0+$4 ;} else if ($1 == "MEM") {if( $2 > mm ) mm=$2 ; } else if ( $1 == "CPU") {t=tn-tp; tct+=t*(0+$2);tp=tn; if($2>mc)mc=$2;} } } END{printf("MEM_max %s KB\nCPU_max %s\nreal_time %s seconds\ncpu_time %s seconds\n",mm,mc,tn-tb,tct/100);}' $1 5 | } 6 | 7 | for x in $* 8 | do 9 | echo "----------------------------" 10 | echo "log_file: $x" 11 | run1 $x 12 | echo "----------------------------" 13 | echo "" 14 | done 15 | 16 | -------------------------------------------------------------------------------- /merge_all_thread.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | data = pd.read_csv('tmp_pglog_all.csv',sep='\t',header=0) 5 | data.columns = ['tip','pid','time','cpu','memory'] 6 | 7 | data['time'] = data['time']-data['time'].min() 8 | data['time'] = data['time'] / 60 9 | data['time'] = data['time'].astype(int) 10 | data = 
data.groupby(['time','pid'])[['cpu','memory']].max().reset_index() 11 | data.to_csv('report_pglog_all.csv',sep='\t',header=True) 12 | 13 | data = data.groupby('time')[['cpu','memory']].sum().reset_index() 14 | print(f'CPU max: {data["cpu"].max()}',flush=True) 15 | print(f'MEM max: {data["memory"].max()}',flush=True) 16 | data.to_csv('report_pglog_all_as_one.csv',sep='\t',header=True) 17 | -------------------------------------------------------------------------------- /report.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FD=`realpath $0` 4 | FD=`dirname $FD` 5 | 6 | mem=`tail -n6 pglog_* |grep MEM_max | awk 'BEGIN{x=0;}{if($2>x)x=$2;}END{print x;}'` 7 | cpu=`tail -n6 pglog_* |grep CPU_max | awk 'BEGIN{x=0;}{if($2>x)x=$2;}END{print x;}'` 8 | t=`tail -n6 pglog_* |grep TIME_total | awk 'BEGIN{x=0;}{if($2>x)x=$2;}END{print x;}'` 9 | ct=`tail -n6 pglog_* |grep -E "CPU_max|TIME_total" | awk 'BEGIN{CT=0;c=0;}{if($1=="CPU_max"){c=0+$2;}else{t=0+$2; ct=c*t; CT=CT+ct; c=0; }} END { printf("%d\n", CT ); }'` 10 | 11 | echo "---------report in single process mode---------------" 12 | echo "----- Total report begin ==========" 13 | echo "----- MEM_max $mem KB." 14 | echo "----- CPU_max $cpu% threads." 15 | echo "----- TIME $t seconds." 16 | echo "----- sum(CPU_max*TIME)*100 $ct " 17 | echo "----- Total report end ==========" 18 | echo "-----------------------------------------------------" 19 | 20 | 21 | echo "---------report in multi process mode---------------" 22 | grep -Inr DataFrame pglog_* >tmp_pglog_all.csv 23 | python3 $FD/merge_all_thread.py 24 | echo "----------------------------------------------------" 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ProcessGroupMonitor 2 | 3 | A simple tool to monitor a process group and record its CPU, MEM and time usage. 
4 | 5 | ## INSTALL 6 | 7 | > make # to make PGM 8 | > make clean # to remove PGM 9 | 10 | ## USAGE 11 | 12 | ### Monitor a running process group 13 | 14 | ./PGM pgid [pgid_num_to_monitor] 15 | 16 | For example, if pgid is 100: 17 | 18 | > ./PGM pgid 100 19 | 20 | Note that you can use the following command to find the pgid for XXX 21 | 22 | > ps -A -o stat,pid,pgid,cmd | grep XXX 23 | 24 | ### Start a new command and monitor it 25 | 26 | ./PGM file_to_run args_list 27 | 28 | For example, if you want to run 29 | 30 | > ./test.sh 10 # this test.sh needs 1 parameter 31 | 32 | and monitor its CPU, MEM and time usage, then you can simply run: 33 | 34 | > ./PGM ./test.sh 10 35 | 36 | ### Output 37 | 38 | #### PGM will generate a log file for each process. 39 | 40 | * the name of the log file is: cmd + pid + start_time. 41 | 42 | * any character in cmd that is neither a digit nor 43 | a letter is replaced by a '_', for example: 44 | 45 | > __share_app_bwa_0_7_12_bwa_index_chr19_standard_contig__206046_1538103905 46 | 47 | #### To get detailed and accurate data for all subcommands 48 | 49 | ``` 50 | ./detail_report.sh pglog_* 51 | ``` 52 | 53 | The report looks like 54 | 55 | ``` 56 | ---------------------------- 57 | log_file: pglog_xxx 58 | cmd 14994 classify --hap paternal.unique.filter.mer --hap maternal.unique.filter.mer --thread 30 --read rel3-nanopore-wgs.fastq --format fastq 59 | MEM_max 21823048 KB 60 | CPU_max 2782 61 | real_time 5435 seconds 62 | cpu_time 130997 seconds 63 | ---------------------------- 64 | 65 | ---------------------------- 66 | log_file: pglog_14994_awk__print__2___83594_1590736593 67 | cmd 14994 awk {print $2} 68 | MEM_max 9936 KB 69 | CPU_max 45.8 70 | real_time 7 seconds 71 | cpu_time 3.185 seconds 72 | ---------------------------- 73 | ``` 74 | 75 | #### To get the overview of a complete process, use 76 | 77 | > report.sh 78 | 79 | You will get a report like 80 | 81 | ``` 82 | Total report begin ========== 83 | MEM_max 327149736 KB. 
84 | CPU_max 1443 threads. 85 | TIME 327149736 seconds. 86 | Total report end ========== 87 | ``` 88 | Or if your want check a specific command : 89 | 90 | > tail -n 6 your_log_file_name 91 | 92 | You will get a report like 93 | 94 | ### Final report 1538103946 ### 95 | cmd /share/app/bwa-0.7.12/bwa index chr19_standard.contig 96 | pid 206046 97 | CPU_max 98.5 98 | MEM_max 126148 99 | TIME 41 seconds 100 | 101 | 102 | If you want more information other than just a simple report, then you can read the whole log file and get information for each snapshot. 103 | 104 | ### Snapshot frequency 105 | 106 | The default value is 5 seconds per snapshot. 107 | 108 | If a different frequency is desired, then you can try to modify the codes of sleep function. 109 | -------------------------------------------------------------------------------- /ProcessGroupMonitor.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | struct AProc 14 | { 15 | bool touched ; 16 | private: 17 | 18 | int pid ; 19 | 20 | long start_time ; 21 | 22 | float CPU_max ; 23 | 24 | float CPU_curr ; 25 | 26 | long Mem_max ; 27 | 28 | long Mem_curr ; 29 | 30 | long curr_time ; 31 | 32 | std::string cmd ; 33 | 34 | std::string cmd_full ; 35 | 36 | std::ofstream * ofs ; 37 | 38 | std::string Photo() 39 | { 40 | std::ostringstream ost ; 41 | ost<<"---\tTouch report\t"<>\t"< 150 ) 97 | path= path.substr(0,150); 98 | std::string file_name =std::string("pglog_")+ path +"_"+ std::to_string(pid)+"_"+std::to_string(curr_time); 99 | std::cerr<<" start of "< CPU_max ) 109 | CPU_max = CPU ; 110 | Mem_curr = MEM ; 111 | if ( Mem_max < MEM_Max ) 112 | Mem_max = MEM ; 113 | curr_time = time_steamp ; 114 | touched = true ; 115 | (*ofs)< datas ; 160 | 161 | long curr_time ; 162 | void TouchStart( long time ) 163 | { 164 | curr_time = time ; 165 | 
for( auto & pair : datas ) 166 | { 167 | pair.second.UnTouch(); 168 | } 169 | }; 170 | 171 | bool Touch( int pid , std::string cmd , std::string cmd_full ) 172 | { 173 | if( datas.find( pid ) == datas.end () ) 174 | { 175 | datas[pid].Init(pid,cmd,cmd_full,curr_time); 176 | } 177 | long MEM , MEM_Max ; 178 | float CPU ; 179 | if( GetProcInfo(pid , CPU , MEM , MEM_Max) ) 180 | { 181 | datas[pid].Touch(CPU ,MEM,MEM_Max , curr_time ); 182 | return true ; 183 | } 184 | return false; 185 | } 186 | 187 | void TouchEnd() 188 | { 189 | std::set dels; 190 | for( auto pair : datas ) 191 | { 192 | if( ! pair.second.touched ) 193 | { 194 | pair.second.End(); 195 | dels.insert(pair.first); 196 | } 197 | } 198 | for( int x :dels ) 199 | datas.erase(x); 200 | } 201 | } pgdata; 202 | 203 | void monitor_pgid(int pgid , int pid_self) 204 | { 205 | int tick = 0 ; 206 | std::ostringstream ost; 207 | //Ssl 1009 1009 /usr/sbin/ModemManager 208 | ost<<"ps -A -o stat,pid,pgid,cmd | grep "< 0) { // Monitor process 317 | monitor_pgid(pid,pid); 318 | //delete []argv_new ; 319 | wait(NULL); 320 | return 0 ; 321 | } 322 | else { // Job process 323 | execv(argv[1] , argv_new); 324 | std::cerr<<"Error : execv failed !!!"<