#!/usr/bin/ksh93
################################################################
#### usagemsg_psmonitor_k93
####
#### Print the usage/help text for psmonitor_k93 to standard output.
####
#### Arguments:
####   $1 = name to display as the program name in the "Usage:" line;
####        any leading directory path is stripped.
################################################################
function usagemsg_psmonitor_k93 {
  print "
Program: psmonitor_k93

This function reads a list of process arguments from a
process list file and checks the system process list to
see if that process is running, if not it logs an error
in the AIX error log.  This function is intended to be
run from cron once every minute of every day.

Usage: ${1##*/} [-?] [-vVLEu] [-p processListFile] [-c configFile]
                [-l loggerScript] [-r resetDays] [-e emailAddress]

  Where:
    -p processListFile = Use the specified process list file.
                         Default: /etc/psmonitor.list
    -c configFile      = Use the specified configuration file to
                         define error message variables.
                         Default: none
    -l loggerScript    = Execute the external error logging script
                         specified by the file name \"loggerScript\".
                         Default: none
    -r resetDays       = Number of days between configuration resets
                         Default: 1 (NOT IMPLEMENTED AT THIS TIME)
    -e emailAddr       = Email address(s) to send error notification.
    -u = Execute local user customized code section of this function.
    -L = Do NOT log messages to AIX Errorlog.
    -E = Do NOT send email messages for error notification.
    -v = Verbose mode
    -V = Very Verbose Mode

Example: psmonitor_k93

Author: Dana French (dfrench@mtxia.com)
Copyright 2006 by Dana French

\"AutoContent\" enabled
"
}
################################################################
#### psmonitor_k93
####
#### Compare each regular expression in a process list file against
#### the output of "ps -ef".  For every expression that matches no
#### running process an error is reported (stderr, and optionally the
#### AIX error log, email, an external logger script, and a local
#### custom code section).  Expressions that did not match are dropped
#### from a working copy of the list kept under /tmp so they are not
#### reported again on the next run; they are re-added once a matching
#### process is seen again.
####
#### Options: see usagemsg_psmonitor_k93.
####
#### Return codes:
####   0 = processing completed
####   1 = bad command line option, or the process list file is
####       missing or empty
####   2 = reset days value is less than 1
####   3 = configuration file exists but is not executable
####
#### Variables written without typeset (left visible outside this
#### function, as sourced logger/config scripts may rely on them):
####   TOD, PLIST, PROCESSCMD, MSG
################################################################
function psmonitor_k93 {
  typeset TRUE="1"
  typeset FALSE="0"
  typeset RETCODE="0"
  typeset VERBOSE="${FALSE}"
  typeset VERYVERB="${FALSE}"
  typeset LOGGER=""
  typeset PROCLISTFILE="/etc/psmonitor.list"
  typeset CONFIGFILE=""
  typeset RESETDAYS="1"
  typeset TMPFILE="/tmp/psmonitor_k93.${$}.tmp"
  typeset AIXERRLOG="${TRUE}"
  typeset SENDEMAIL="${TRUE}"
  typeset CUSTOMCODE="${FALSE}"
  typeset VERSION="1.0"

#### Error message fields.  Each one may be preset through the
#### environment; otherwise the default shown here is used.  A
#### configuration file (-c) may override them later.
  typeset ErrMsgType="${ErrMsgType:-1-Critical}"
  typeset ErrNtfyCon="${ErrNtfyCon:-Unix on-call}"
  typeset ErrNtfyTim="${ErrNtfyTim:-Normal office hours}"
  typeset ErrCompCls="${ErrCompCls:-Software}"
  typeset ErrCompNam="${ErrCompNam:-AIX}"
  typeset ErrRetCode="${ErrRetCode:-1}"
  typeset ErrLabel="${ErrLabel:-Process not running}"
  typeset ErrDescrip="${ErrDescrip:-Process is not running for frame associated with}"
  typeset ErrEmail="${ErrEmail:-dfrench1@capgeminienergy.com Unix_Team@txu.com}"

#### Message template.  It is single quoted on purpose: the variable
#### references are expanded later by "eval", once PROCESSCMD is known.
  typeset MESSAGE='Error Message Type: ${ErrMsgType}
Error Notification Contact: ${ErrNtfyCon}
Error Notification Time: ${ErrNtfyTim}
Error Component Class: ${ErrCompCls}
Error Component Name: ${ErrCompNam}
Error Return Code: ${ErrRetCode}
Error Label: ${ErrLabel} \"${PROCESSCMD}\"
Error Desription: ${ErrDescrip} \"${PROCESSCMD}\"
Error Email Address: ${ErrEmail}'

#### Parse the command line.  "l:" is included so the documented
#### -l loggerScript option is accepted, and -r now stores its value
#### so the range check on RESETDAYS below is meaningful.
  while getopts ":vVELup:c:l:r#e:" OPTION
  do
    case "${OPTION}" in
      'v') VERBOSE="${TRUE}";;
      'V') VERYVERB="${TRUE}";;
      'p') PROCLISTFILE="${OPTARG}";;
      'c') CONFIGFILE="${OPTARG}";;
      'l') LOGGER="${OPTARG}";;
      'r') RESETDAYS="${OPTARG}";;
      'e') ErrEmail="${OPTARG}";;
      'u') CUSTOMCODE="${TRUE}";;
      'L') AIXERRLOG="${FALSE}";;
      'E') SENDEMAIL="${FALSE}";;
      '?') usagemsg_psmonitor_k93 "${0}" && return 1 ;;
      ':') usagemsg_psmonitor_k93 "${0}" && return 1 ;;
      '#') usagemsg_psmonitor_k93 "${0}" && return 1 ;;
    esac
  done

  shift $(( ${OPTIND} - 1 ))

  (( VERBOSE == TRUE )) && print -u 2 -- "# Version: ${VERSION}"
  (( VERBOSE == TRUE )) && print -u 2 -- "# Process List File: ${PROCLISTFILE}"

################################################################

#### While the start-up checks run, an EXIT trap displays the usage
#### message; it is removed again once all checks have passed.
  trap "usagemsg_psmonitor_k93 ${0}" EXIT

####
#### Check to see if the specified process list file exists
#### and contains data.  If not, display an error message and
#### return from the function with a non-zero return code.
####
  RETCODE="1"
  if ! [[ -s "${PROCLISTFILE}" ]]
  then
    print -u 2 -- "# ERROR: Process List file \"${PROCLISTFILE}\" does not exist"
    print -u 2 -- "#        or contains no data."
    return ${RETCODE}
  fi

####
#### Build a full path file name for the working copy of the
#### process list file, replacing the slashes with bang
#### symbols.  This is so that if this function is executed
#### by multiple users, they are unlikely to overwrite each
#### other's working process list file.
####
  typeset PROCLISTWORK="${PROCLISTFILE}.work"
  if [[ "_${PROCLISTFILE}" != _/* ]]
  then
    typeset PROCLISTWORK="${PWD}/${PROCLISTFILE}.work"
  fi
  PROCLISTWORK="/tmp/${PROCLISTWORK//\//!}"
  PROCLISTWORK="${PROCLISTWORK//!.!/!}"
  (( VERBOSE == TRUE )) && print -u 2 -- "# Working process list File: ${PROCLISTWORK}"

####
#### Check to see if the working process list file exists,
#### if not create it from the user specified or default
#### process list file using sorted and unique record lines.
####
  if ! [[ -f "${PROCLISTWORK}" ]]
  then
    (( VERBOSE == TRUE )) && print -u 2 -- "# Working process list file \"${PROCLISTWORK}\" does not exist"
    (( VERBOSE == TRUE )) && print -u 2 -- "# Creating \"${PROCLISTWORK}\""
    sort "${PROCLISTFILE}" | uniq > "${PROCLISTWORK}"
  fi

####
#### Check to see if the user specified or default
#### process list file has a later time stamp than the
#### working process list file.  If so, rebuild the working
#### copy using sorted and unique record lines.
####
  if [[ "${PROCLISTFILE}" -nt "${PROCLISTWORK}" ]]
  then
    (( VERBOSE == TRUE )) && print -u 2 -- "# Process list file \"${PROCLISTFILE}\" is newer than working copy."
    (( VERBOSE == TRUE )) && print -u 2 -- "# Resetting working copy to resemble newer process list file."
    sort "${PROCLISTFILE}" | uniq > "${PROCLISTWORK}"
  fi

####
#### Check to see if the number of days between working file
#### resets is less than 1, if so display an error message
#### and return from the function with a non-zero return
#### code.
####
  RETCODE="2"
  if (( RESETDAYS <= 0 ))
  then
    print -u 2 -- "# ERROR: Number of days between working file resets is less than 1, MIN=1"
    return ${RETCODE}
  fi

####
#### If a configuration file is specified on the command line,
#### check to see that it exists, has a non-zero file length,
#### and is executable.  If it passes these tests, source it
#### to define the error message variables and values.  A
#### configuration file that is missing or empty is silently
#### ignored.
####
  RETCODE="3"
  if [[ "_${CONFIGFILE}" != "_" ]] && [[ -s "${CONFIGFILE}" ]]
  then
    (( VERBOSE == TRUE )) && print -u 2 -- "# Configuration File: ${CONFIGFILE}"
    if [[ -x "${CONFIGFILE}" ]]
    then
      . "${CONFIGFILE}"
    else
      print -u 2 -- "# ERROR: Configuration file \"${CONFIGFILE}\" is not executable."
      return ${RETCODE}
    fi
  fi

#### All start-up checks passed: clear the error code and remove
#### the usage-on-exit trap.
  RETCODE="0"
  trap "-" EXIT
  (( VERYVERB == TRUE )) && set -x

####
#### Reset the working psmonitor.list file once a day at midnight
#### (that is, when this run starts during minute 00:00).
####
  TOD=$( date +"%H%M" )
  if [[ "_${TOD}" = _0000 ]]
  then
    rm -f -- "${PROCLISTWORK}"
    sort -- "${PROCLISTFILE}" | uniq > "${PROCLISTWORK}"
  fi

################################################################

#### Generate a list of all processes on the system and store
#### the list in an array, one process record line per array
#### element.  IFS is limited to newline while the array is
#### built so each "ps" line stays in one element.
  IFS=$'\n'
  PLIST=( $( ps -ef | grep -v grep ) )
  IFS=$' \t\n'

####
#### Loop through the record lines in the working
#### process list file one line at a time.  Each line is
#### assumed to contain a regular expression representing a
#### process that appears in a system's "ps -ef" output.
####
#### The temporary file is always (re)created empty first, so
#### the "sort" after the loop has a file to read even when
#### none of the listed processes is running.
####
  rm -f -- "${TMPFILE}"
  : > "${TMPFILE}"

  while read -r -- PROCESSCMD
  do
    (( VERBOSE == TRUE )) && print -u 2 -r -- "# Process args regex: \"${PROCESSCMD}\""

#### Test the contents of the process list array to determine
#### if it contains the process identifier read from the
#### working process list file.  If it does not, then the
#### process is not running, so log an error message.
#### IFS is newline here so "${PLIST[*]}" joins the array
#### elements one per line; "print -r" keeps backslashes in
#### the ps output literal.
    IFS=$'\n'
    if ! print -r -- "${PLIST[*]}" | grep -- "${PROCESSCMD}" > /dev/null 2>&1
    then
      IFS=$' \t\n'

      print -u 2 -r -- "# ERROR: Process matching \"${PROCESSCMD}\" does not exist"

#### Evaluate the error message text to cause the dynamically
#### assigned values to be substituted into the message.
      eval MSG="\"${MESSAGE}\""
#     (( VERBOSE == TRUE )) && print -- "${MSG}"

#### Insert the error message into the standard AIX error log
#### using the "errlogger" utility.
      (( AIXERRLOG == TRUE )) && errlogger "${MSG}"

#### Email the error message to the person(s) or groups
#### identified as the recipient of these error messages.
#### This email address may be specified on the command line,
#### configuration file, or as an environment variable.
#### NOTE(review): ErrEmail is passed as ONE argument even when
#### it holds several space separated addresses - confirm the
#### local "mail" command accepts that form.
      (( SENDEMAIL == TRUE )) && print -r -- "${MSG}" | mail -s "$( hostname ) psmonitor_k93" "${ErrEmail}"

#### If an error logging script was specified on the command
#### line, and it is executable, source it.  Assume the script
#### utilizes the appropriate error message variables.
      [[ "_${LOGGER}" != "_" ]] && [[ -x "${LOGGER}" ]] && . "${LOGGER}"

#### If the command line option to execute local user
#### customized code was selected on the command line,
#### execute this section of code.  CHANGE THE BODY OF THE
#### FOLLOWING "if" STATEMENT TO SUIT YOUR INDIVIDUAL NEEDS
#### AND REQUIREMENTS FOR LOGGING ERROR MESSAGES.
      if (( CUSTOMCODE == TRUE ))
      then
        (( VERBOSE == TRUE )) && print -u 2 "# Begin local user custom code section."
        print "# "
        print "# If you had inserted your customized code for error"
        print "# logging and/or notification, this function would be"
        print "# running it now..."
        print "# "
        (( VERBOSE == TRUE )) && print -u 2 "# End local user custom code section."
      fi

    else

#### If the process list array contains the process
#### identifier read from the working process list file, then
#### insert the process identifier into a temporary storage
#### file.  This file will be used during the next invocation
#### of this function as the list of valid process identifiers
#### to test against.
      IFS=$' \t\n'
      print -r -- "${PROCESSCMD}" >> "${TMPFILE}"
    fi

  done < "${PROCLISTWORK}"

#### Sort the list of valid process identifiers and extract
#### only the unique values.  Store these values in the
#### working process list file.
  sort -- "${TMPFILE}" | uniq > "${PROCLISTWORK}"

################################################################

  (( VERBOSE == TRUE )) && print -u 2 -r -- "# Begin checking for restarted processes."

####
#### Now loop through the record lines of the process list
#### file that do not appear in the working process list
#### file, and determine if any running processes match.  If
#### so, add it back to the working process list file.
####
  rm -f -- "${TMPFILE}"
  cp -f -- "${PROCLISTWORK}" "${TMPFILE}"

  sort -- "${PROCLISTFILE}" | uniq | comm -23 - "${PROCLISTWORK}" | while read -r -- PROCESSCMD
  do
    (( VERBOSE == TRUE )) && print -u 2 -r -- "# Check for restarted process: \"${PROCESSCMD}\""

#### Test the contents of the process list array to determine
#### if it contains the process identifier read from the
#### process list file.  If it does, then the process has
#### been restarted, so add it back into the working process
#### list file.
    IFS=$'\n'
    if print -r -- "${PLIST[*]}" | grep -- "${PROCESSCMD}" > /dev/null 2>&1
    then
      IFS=$' \t\n'

#### The expression matches a running process again: append it
#### to the temporary storage file, which becomes the working
#### process list once the loop has finished.
      (( VERBOSE == TRUE )) && print -u 2 -r -- "# Re-adding \"${PROCESSCMD}\" to the working process list."
      print -r -- "${PROCESSCMD}" >> "${TMPFILE}"
    fi
    IFS=$' \t\n'

  done

#### Sort the list of valid process identifiers and extract
#### only the unique values.  Store these values in the
#### working process list file.
  sort -- "${TMPFILE}" | uniq > "${PROCLISTWORK}"
  rm -f -- "${TMPFILE}"

  (( VERBOSE == TRUE )) && print -u 2 -r -- "# End checking for restarted processes."

  return ${RETCODE}
}
################################################################

psmonitor_k93 "${@}"