├── ogg_big_data_heartbeat_report.py └── gg_status_daemon.sh /ogg_big_data_heartbeat_report.py: -------------------------------------------------------------------------------- 1 | #!/bin/python3 2 | # Python program to read ogg heartbeat history json file 3 | 4 | import json 5 | import time 6 | import datetime 7 | import os 8 | import glob 9 | import sys, getopt 10 | 11 | def main(argv): 12 | # Initialize Variables 13 | vLagJsonDir = '' 14 | try: 15 | opts, args = getopt.getopt(argv,"h:j:",["jsondir="]) 16 | if len(opts) == 0: 17 | print('Script Usage: ogg_big_data_heartbeat_report.py -j ') 18 | sys.exit(1) 19 | except getopt.error as err: 20 | print('Script Usage: ogg_big_data_heartbeat_report.py -j ') 21 | sys.exit(2) 22 | for opt, arg in opts: 23 | if opt == '-h': 24 | print('Script Usage: ogg_big_data_heartbeat_report.py -j ') 25 | sys.exit() 26 | #elif opt in ("-j", "--jsondir"): 27 | elif opt == '-j': 28 | vLagJsonDir = arg 29 | elif opt == '--jsondir': 30 | vLagJsonDir = arg 31 | 32 | vTotLag = 0 33 | vTotJsonRecords = 0 34 | vTotLag_1hour = 0 35 | vTotJsonRecords_1hour = 0 36 | vTotLag_4hour = 0 37 | vTotJsonRecords_4hour = 0 38 | vTotLag_8hour = 0 39 | vTotJsonRecords_8hour = 0 40 | vTotLag_24hour = 0 41 | vTotJsonRecords_24hour = 0 42 | now = time.mktime(datetime.datetime.now().timetuple()) 43 | if vLagJsonDir == "": 44 | vLagJsonDir = "/u01/app/oracle/product/oggBd/19.1/gg_1/dirtmp/" 45 | print('JSON Dir defaulted to: ' + str(vLagJsonDir)) 46 | else: 47 | print('JSON Dir is: ' + str(vLagJsonDir)) 48 | lag_records = [] 49 | heartbeat_timestamp_records = [] 50 | replication_path_records = [] 51 | 52 | # Opening JSON file 53 | for filename in glob.glob(vLagJsonDir + '/*-hb-[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9].json'): 54 | #print(os.path.join(vLagJsonDir + "/", filename)) 55 | f = open(os.path.join(vLagJsonDir + "/", filename)) 56 | 57 | # returns JSON object as 58 | # a dictionary 59 | data = json.load(f) 60 | 61 | # Iterating through the json 62 | # list 63 | for i in data['records']: 64 | vIncomingTs = time.mktime(datetime.datetime.strptime(i['incomingHeartbeatTs'][:-3],"%Y-%m-%d %H:%M:%S.%f").timetuple()) 65 | vOutgoingTs = time.mktime(datetime.datetime.strptime(i['outgoingReplicatTs'][:-3],"%Y-%m-%d %H:%M:%S.%f").timetuple()) 66 | vIncomingHeartbeatTs = datetime.datetime.strptime(i['incomingHeartbeatTs'][:-3],"%Y-%m-%d %H:%M:%S.%f").strftime('%Y-%m-%d %H:%M') 67 | heartbeat_timestamp_records.append(vIncomingHeartbeatTs) 68 | #print(str(now - vOutgoingTs)) 69 | if (now - vOutgoingTs) <= 3600: 70 | vTotLag_1hour = vTotLag_1hour + (vOutgoingTs - vIncomingTs) 71 | vTotJsonRecords_1hour = (vTotJsonRecords_1hour + 1) 72 | lag_records.append(i['incomingExtract'] + " => " + i['incomingRoutingPath'] + " => " + i['incomingReplicat'] + " | " + vIncomingHeartbeatTs + " | " + str(vOutgoingTs - vIncomingTs)) 73 | replication_path_records.append(i['incomingExtract'] + " => " + i['incomingRoutingPath'] + " => " + i['incomingReplicat']) 74 | elif (now - vOutgoingTs) <= 14400: 75 | vTotLag_4hour = vTotLag_4hour + (vOutgoingTs - vIncomingTs) 76 | vTotJsonRecords_4hour = (vTotJsonRecords_4hour + 1) 77 | elif (now - vOutgoingTs) <= 28800: 78 | vTotLag_8hour = vTotLag_8hour + (vOutgoingTs - vIncomingTs) 79 | vTotJsonRecords_8hour = (vTotJsonRecords_8hour + 1) 80 | elif (now - vOutgoingTs) <= 86400: 81 | vTotLag_24hour = vTotLag_24hour + (vOutgoingTs - vIncomingTs) 82 | vTotJsonRecords_24hour = (vTotJsonRecords_24hour + 1) 83 | 84 | vTotLag = vTotLag + (vOutgoingTs - vIncomingTs) 85 | vTotJsonRecords = (vTotJsonRecords + 1) 86 | 87 | # Closing file 88 | f.close() 89 | 90 | vMaxHeartbeatTs = datetime.datetime.strptime(max(heartbeat_timestamp_records), '%Y-%m-%d %H:%M') 91 | vMinHeartbeatTs = datetime.datetime.strptime(min(heartbeat_timestamp_records), '%Y-%m-%d %H:%M') 92 | vTimeDiff = round((vMaxHeartbeatTs - vMinHeartbeatTs).total_seconds()/86400,1) 93 | 94 | # Print the array of Extract Lag Information 95 | 96 | replication_path_records = list(dict.fromkeys(replication_path_records)) 97 | print('\nReplication Paths:') 98 | for elem in replication_path_records: 99 | print(elem) 100 | 101 | print('\nCombined Lag Data for Replication Paths:\n') 102 | # Print Average Lag Over Entire Recordset 103 | if vTotJsonRecords_1hour > 0: 104 | print("Average Lag over the past hour: " + str(vTotLag_1hour // vTotJsonRecords_1hour) + " seconds") 105 | if vTotJsonRecords_4hour > 0: 106 | print("Average Lag over the past 4 hours: " + str(vTotLag_4hour // vTotJsonRecords_4hour) + " seconds") 107 | if vTotJsonRecords_8hour > 0: 108 | print("Average Lag over the past 8 hours: " + str(vTotLag_8hour // vTotJsonRecords_8hour) + " seconds") 109 | if vTotJsonRecords_24hour > 0: 110 | print("Average Lag over the past 24 hours: " + str(vTotLag_24hour // vTotJsonRecords_24hour) + " seconds") 111 | print("Average Lag over the dataset (" + str(vTimeDiff) + " Days): " + str(vTotLag // vTotJsonRecords) + " seconds") 112 | 113 | if __name__ == "__main__": 114 | if len(sys.argv) < 1: 115 | print('Script Usage: ogg_big_data_heartbeat_report.py -j ') 116 | sys.exit(2) 117 | else: 118 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /gg_status_daemon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/ksh 2 | ################################################################## 3 | # Name: ggs_status_daemon.ksh # 4 | # PURPOSE: TO MONITOR LAG OF GOLDEN GATE # 5 | # THIS SCRIPT WILL NOTIFY IF REPLICATION LAG IS MORE THEN 30 MIN # 6 | # THIS SCRIPT WILL NOTIFY IF CHECKPOINT LAG IS MORE THAN 15 MIN # 7 | # THIS SCRIPT WILL NOTIFY OBJECTS ARE ABENDED # 8 | ################################################################## 9 | 10 | helpFunction() 11 | { 12 | echo "" 13 | echo "Usage: $0 -o OPERATION (UPDOWN/LAG) -e ENVIRONMENT FILE" 14 | echo -e "\t-o Designate the script to check lag or up/down condition" 15 | echo -e "\t-e Environment file that contains the necessary parameters to execute GoldenGate" 16 | exit 1 # Exit script after printing help 17 | } 18 | 19 | chkGoldenGate() { 20 | 21 | ########################################################################## 22 | # RUNNING SCRIPT TO GET GOLDEN GATE INFORMATION # 23 | ########################################################################## 24 | $OGG_HOME/ggsci << EOF > ${LOGDIR}/ggs_objects_check_${OPERATION}.tmp 25 | info all 26 | exit 27 | EOF 28 | 29 | ################################################################################## 30 | ## EXTRACT DATA ABOUT THE GOLDENGATE OBJECTS ONLY ## 31 | ################################################################################## 32 | 33 | ### Based on host name, you can configure an additional exclusion list. If host match does not occur check all eligible processes 34 | if [[ $(uname -a | awk '{print $2}' | egrep -i 'dvlx|dhpxggatedvlx') ]] 35 | then 36 | egrep "(${PROCESS_LIST})" ${LOGDIR}/ggs_objects_check_${OPERATION}.tmp | egrep -v "Version|${PROCESS_EXCLUSION_LIST}" | tr ":" " "| tr -s '[:space:]'|cut -d" " -f1-9 > ${LOGDIR}/ggs_objects_${OPERATION}.tmp 37 | else 38 | egrep "(${PROCESS_LIST})" ${LOGDIR}/ggs_objects_check_${OPERATION}.tmp | grep -v Version | tr ":" " "| tr -s '[:space:]'|cut -d" " -f1-9 > ${LOGDIR}/ggs_objects_${OPERATION}.tmp 39 | fi 40 | 41 | } 42 | 43 | checkGoldenGateUpDown() { 44 | 45 | ########################################################################## 46 | ## CHECKING FOR ABENDED PROCESS ## 47 | ########################################################################## 48 | 49 | awk -v opath="${LOGDIR}" '{if ( $2 == "ABENDED" ) {print $1 " " $3 " HAS ABENDED -- at -- " d "\n"} else {print "NO ABENDS FOR " $1 " " $3 " " d > opath"/ggs_objects_not_abended.log" }}' d="$(date)" ${LOGDIR}/ggs_objects_${OPERATION}.tmp > ${LOGDIR}/ggs_objects_abended.log 50 | 51 | ########################################################################## 52 | ## CHECKING FOR STOPPED PROCESS ## 53 | ########################################################################## 54 | 55 | awk -v opath="${LOGDIR}" '{if ( $2 == "STOPPED" ) {print $1 " " $3 " IS STOPPED -- at -- " d "\n"} else {print $1 " " $3 " IS NOT STOPPED " d > opath"/ggs_objects_not_stopped.log" }}' d="$(date)" ${LOGDIR}/ggs_objects_${OPERATION}.tmp > ${LOGDIR}/ggs_objects_stopped.log 56 | 57 | if [ -s ${LOGDIR}/ggs_objects_abended.log ] 58 | then 59 | cat ${LOGDIR}/ggs_objects_abended.log >> ${EMAILFile} 60 | fi 61 | 62 | if [ -s ${LOGDIR}/ggs_objects_stopped.log ] 63 | then 64 | cat ${LOGDIR}/ggs_objects_stopped.log >> ${EMAILFile} 65 | fi 66 | 67 | } 68 | 69 | checkPreviousLag() { 70 | ########################################################################## 71 | ## CHECKING IF THERE WAS PREVIOUS LAG IN THE CASE OF PERIODIC ALERTS 72 | ########################################################################## 73 | GGSCI_LAG_CHECK=0 74 | GGSCI_CHECKPOINT_LAG_CHECK=0 75 | 76 | if [ -s ${LOGDIR}/ggs_objects_lag.log ] 77 | then 78 | echo "GGSCI_LAG_CHECK|$(date)" >> ${LOGDIR}/ggs_objects_previous_lag.log 79 | GGSCI_LAG_CHECK=$(grep GGSCI_LAG_CHECK ${LOGDIR}/ggs_objects_previous_lag.log | wc -l) 80 | 81 | if [ ${GGSCI_LAG_CHECK} -ge ${MAX_GGSCI_LAG_INTERVAL} ] 82 | then 83 | mv ${LOGDIR}/ggs_objects_previous_lag.log ${LOGDIR}/ggs_objects_previous_lag.log.${TIMESTAMP} 84 | fi 85 | fi 86 | 87 | if [ -s ${LOGDIR}/ggs_objects_checkpoint_lag.log ] 88 | then 89 | echo "GGSCI_CHECKPOINT_LAG_CHECK|$(date)" >> ${LOGDIR}/ggs_objects_previous_checkpoint_lag.log 90 | GGSCI_CHECKPOINT_LAG_CHECK=$(grep GGSCI_CHECKPOINT_LAG_CHECK ${LOGDIR}/ggs_objects_previous_checkpoint_lag.log | wc -l) 91 | 92 | if [ ${GGSCI_CHECKPOINT_LAG_CHECK} -ge ${MAX_GGSCI_CHECKPOINT_LAG_INTERVAL} ] 93 | then 94 | mv ${LOGDIR}/ggs_objects_previous_checkpoint_lag.log ${LOGDIR}/ggs_objects_previous_checkpoint_lag.log.${TIMESTAMP} 95 | fi 96 | fi 97 | } 98 | 99 | checkGoldenGateLag() { 100 | ########################################################################## 101 | ## CHECKING FOR LAG OF MORE THEN 30 ## 102 | ## AND CHECKPOINT LAG OF MORE THAN 15 ## 103 | ########################################################################## 104 | 105 | awk -v opath="${LOGDIR}" -v lag_hours="${LAG_HOURS}" -v lag_mins="${LAG_MINS}" '{if ( $4 > lag_hours || $5 >= lag_mins ) {print $1 " " $3 " HAS LAG of " $4" hour " $5 " min -- at -- " d } else {print "NO LAG FOR " $1 " " $3 " " d > opath"/ggs_objects_no_lag.log" }}' d="$(date)" ${LOGDIR}/ggs_objects_${OPERATION}.tmp > ${LOGDIR}/ggs_objects_lag.log 106 | 107 | awk -v opath="${LOGDIR}" -v lag_checkpoint_hours="${LAG_CHECKPOINT_HOURS}" -v lag_checkpoint_mins="${LAG_CHECKPOINT_MINS}" '{if ( $7 >= lag_checkpoint_hours && $8 >= lag_checkpoint_mins ) {print $1 " " $3 " HAS CHECKPOINT LAG of " $7" hour " $8 " min -- at -- " d "\n"} else {print "NO CHECKPOINT LAG FOR " $1 " " $3 " " d > opath"/ggs_objects_no_checkpoint_lag.log" }}' d="$(date)" ${LOGDIR}/ggs_objects_${OPERATION}.tmp > ${LOGDIR}/ggs_objects_checkpoint_lag.log 108 | 109 | 110 | ## Determine if there has been previous lag 111 | checkPreviousLag 112 | 113 | if [[ -s ${LOGDIR}/ggs_objects_lag.log && ${GGSCI_LAG_CHECK} -ge ${MAX_GGSCI_LAG_INTERVAL} ]] 114 | then 115 | cat ${LOGDIR}/ggs_objects_lag.log >> ${EMAILFile} 116 | fi 117 | 118 | if [[ -s ${LOGDIR}/ggs_objects_checkpoint_lag.log && ${GGSCI_CHECKPOINT_LAG_CHECK} -ge ${MAX_GGSCI_CHECKPOINT_LAG_INTERVAL} ]] 119 | then 120 | cat ${LOGDIR}/ggs_objects_checkpoint_lag.log >> ${EMAILFile} 121 | fi 122 | 123 | #### Clean up previous lag files if present and the previous check vars are set to 0 #### 124 | #### Handles the situation where a lag was present below the threshold and then no lag was present on next run #### 125 | 126 | if [[ -s ${LOGDIR}/ggs_objects_previous_lag.log && ${GGSCI_LAG_CHECK} -eq 0 ]] 127 | then 128 | rm ${LOGDIR}/ggs_objects_previous_lag.log 129 | fi 130 | 131 | if [[ -s ${LOGDIR}/ggs_objects_previous_checkpoint_lag.log && ${GGSCI_CHECKPOINT_LAG_CHECK} -eq 0 ]] 132 | then 133 | rm ${LOGDIR}/ggs_objects_previous_checkpoint_lag.log 134 | fi 135 | 136 | } 137 | 138 | sendGoldenGateStatus() { 139 | ########################################################## 140 | ## SENDING EMAIL IF ERRORS ARE IN LOGFILE ### 141 | ########################################################## 142 | 143 | if [ -s $EMAILFile ] 144 | then 145 | echo $(date) "-- SCRIPT OPERATION ${OPERATION} -- FOUND PROBLEM AND REACHED THRESHOLD IN GOLDENGATE HOME ${OGG_HOME} -- Sending Email" >> $LOGDIR/ggsci-status-daemon_${DATE}.log 146 | cat $EMAILFile | mailx -s "GGSCI-STATUS-DAEMON DETECTED ${OPERATION} PROBLEM IN ${OGG_HOME} ON: $HOST" $EMAILRECEPIENTS 147 | else 148 | if [[ ((${GGSCI_LAG_CHECK} -gt 0 && ${GGSCI_LAG_CHECK} -lt ${MAX_GGSCI_LAG_INTERVAL}) || (${GGSCI_CHECKPOINT_LAG_CHECK} -gt 0 && ${GGSCI_CHECKPOINT_LAG_CHECK} -lt ${MAX_GGSCI_CHECKPOINT_LAG_INTERVAL})) ]] 149 | then 150 | echo `date` "-- SCRIPT OPERATION ${OPERATION} -- FOUND PROBLEM IN GOLDENGATE HOME ${OGG_HOME} - EMAIL THRESHOLD NOT REACHED" >> $LOGDIR/ggsci-status-daemon_${DATE}.log 151 | if [[ -s ${LOGDIR}/ggs_objects_lag.log ]] 152 | then 153 | while read lag 154 | do 155 | echo ${lag} >> $LOGDIR/ggsci-status-daemon_${DATE}.log 156 | done < ${LOGDIR}/ggs_objects_lag.log 157 | fi 158 | 159 | if [[ -s ${LOGDIR}/ggs_objects_checkpoint_lag.log ]] 160 | then 161 | while read lag 162 | do 163 | echo ${lag} >> $LOGDIR/ggsci-status-daemon_${DATE}.log 164 | done < ${LOGDIR}/ggs_objects_checkpoint_lag.log 165 | fi 166 | else 167 | echo `date` "-- SCRIPT OPERATION ${OPERATION} -- NO ERRORS FOUND IN GOLDENGATE HOME ${OGG_HOME}" >> $LOGDIR/ggsci-status-daemon_${DATE}.log 168 | fi 169 | fi 170 | } 171 | 172 | cleanupFiles(){ 173 | ########################################################## 174 | ## TEMPORARY FILE CLEANUP ## 175 | ########################################################## 176 | 177 | if [ -e "${EMAILFile}" ] 178 | then 179 | mv ${EMAILFile} ${EMAILFile}.${TIMESTAMP} 180 | fi 181 | 182 | find ${LOGDIR} -type f -name "ggsci-status-daemon_*.log" -mtime +7 -delete 2>&1 183 | find ${LOGDIR} -type f -name "ggs_email_${OPERATION}.log.*" -mtime +2 -delete 2>&1 184 | find ${LOGDIR} -type f -name "ggs_objects_previous_lag.log.*" -mtime +2 -delete 2>&1 185 | find ${LOGDIR} -type f -name "ggs_objects_previous_checkpoint_lag.log.*" -mtime +2 -delete 2>&1 186 | 187 | } 188 | 189 | ########################################################## 190 | ## MAINLINE LOGIC ## 191 | ########################################################## 192 | ## Setup environment variables 193 | 194 | while getopts "o:e:" opt 195 | do 196 | case "$opt" in 197 | o ) OPERATION="$OPTARG" ;; 198 | e ) ENVIRONMENT="$OPTARG" ;; 199 | ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent 200 | esac 201 | done 202 | 203 | # Print helpFunction in case parameters are empty 204 | if [ -z "$OPERATION" ] || [ -z "$ENVIRONMENT" ] 205 | then 206 | echo "Some or all of the parameters are empty"; 207 | helpFunction 208 | fi 209 | 210 | SCRIPT_HOME=$(dirname $0) 211 | LOGDIR=${SCRIPT_HOME}/log 212 | HOST=$(uname -a | awk '{print $2}') 213 | EMAILRECEPIENTS="shane.borden@deancare.com" 214 | DATE=$(date '+%m%d%Y') 215 | TIMESTAMP=$(date '+%m%d%Y%H%M%S') 216 | LAG_HOURS=00 217 | LAG_MINS=30 218 | LAG_CHECKPOINT_HOURS=00 219 | LAG_CHECKPOINT_MINS=15 220 | PROCESS_LIST="EXTRACT|REPLICAT|MANAGER|JAGENT|PMSRVR" 221 | PROCESS_EXCLUSION_LIST="IRIMMUNI|RC2ABRPU" 222 | MAX_GGSCI_LAG_INTERVAL=2 223 | MAX_GGSCI_CHECKPOINT_LAG_INTERVAL=2 224 | 225 | ### Check that necessary directories and files exist 226 | if [ ! -d ${LOGDIR} ]; then 227 | mkdir -p ${LOGDIR} 228 | fi 229 | 230 | if [[ -e ${ENVIRONMENT} ]]; then 231 | . ${ENVIRONMENT} 2>&1 > ${LOGDIR}/ggs_environment.out 232 | else 233 | echo "Environment File not valid. Exiting!" 234 | exit 1 235 | fi 236 | 237 | ### Check variations of OGG_HOME in the environment and if it is set differently set the OGG_HOME variable 238 | if [[ ! -z ${GG_HOME} ]]; then 239 | OGG_HOME=${GG_HOME} 240 | fi 241 | 242 | ## Begin Mainline Processing 243 | 244 | if [[ ${OPERATION} = "UPDOWN" ]] 245 | then 246 | EMAILFile=${LOGDIR}/ggs_email_${OPERATION}.log 247 | 248 | ## Clean up temp files in case they exist when script starts 249 | cleanupFiles 250 | 251 | ## Retrieve info all from ggsci 252 | chkGoldenGate 253 | 254 | ## Parse results from ggsci 255 | checkGoldenGateUpDown 256 | 257 | ## Email status if necessary 258 | sendGoldenGateStatus 259 | 260 | elif [[ ${OPERATION} = "LAG" ]] 261 | then 262 | EMAILFile=${LOGDIR}/ggs_email_${OPERATION}.log 263 | 264 | ## Clean up temp files in case they exist when script starts 265 | cleanupFiles 266 | 267 | ## Retrieve info all from ggsci 268 | chkGoldenGate 269 | 270 | ## Parse results from ggsci 271 | checkGoldenGateLag 272 | 273 | ## Email status if necessary 274 | sendGoldenGateStatus 275 | 276 | elif [[ ${OPERATION} = "ALLCHECKS" ]] 277 | then 278 | EMAILFile=${LOGDIR}/ggs_email_${OPERATION}.log 279 | 280 | ## Clean up temp files in case they exist when script starts 281 | cleanupFiles 282 | 283 | ## Retrieve info all from ggsci 284 | chkGoldenGate 285 | 286 | ## Parse results from ggsci 287 | checkGoldenGateUpDown 288 | 289 | ## Parse results from ggsci 290 | checkGoldenGateLag 291 | 292 | ## Email status if necessary 293 | sendGoldenGateStatus 294 | 295 | else 296 | echo "Must provide a valid parameter of 'ALLCHECKS','UPDOWN' or 'LAG'" 297 | fi 298 | 299 | ################# SCRIPT END ###################### --------------------------------------------------------------------------------