Using Nagios to monitor OST capacity
1 How to use this plugin
- This plugin relies on check_nrpe, so NRPE must be available on the local host
- This plugin must run on a lustre client
- Keep in mind that many versions of check_nrpe have a 1024 character limit on output, so the per-OST performance data may be truncated on filesystems with many OSTs
- Usage currently requires the -f, -c, and -w arguments, e.g.:
./check_osts.sh -f naaschpc -c 90 -w 89
2 Plugin script
#!/usr/bin/env bash
# Exit statuses understood by Nagios, listed from least to most certain
# trouble-free: the plugin's exit status is what Nagios turns into a state.
UNKNOWN=3
CRITICAL=2
WARNING=1
OK=0
# Print the help text (purpose, the three options, and an example
# invocation built from $0) to stdout. Takes no arguments.
usage() {
  printf '%s\n' \
    'Check the fullness of lustre OSTs.' \
    'Options:' \
    '-f Specify filesystem' \
    '-c Critical threshold as an int (0-100)' \
    '-w Warning threshold as an int (0-100)' \
    "Usage: $0 -f tlustre -c 10 -w 6"
}
# Command-line handling.
#
# Nagios interprets exit status 1 as WARNING, so every usage problem in this
# section exits with UNKNOWN rather than a bare 1, and its message is written
# to stdout so it shows up as the plugin output.

# -f, -c and -w each take a value, so a complete invocation has at least
# six words on the command line.
if [ "$#" -lt 6 ]; then
  usage
  exit "$UNKNOWN"
fi

# Placeholder values, defined up front so the later integer comparisons
# never see an unset variable; -f, -c and -w overwrite them.
FS=mylustre
CRIT=0
WARN=0

# The leading ':' puts getopts in silent mode: it stops printing its own
# diagnostics and instead reports a missing option argument as ':' and an
# unknown option as '?', both with the offending letter in OPTARG.
while getopts ":hc:f:w:" OPTION; do
  case "$OPTION" in
    h)
      # Help was requested: print it and stop instead of running the check.
      usage
      exit "$UNKNOWN"
      ;;
    c)
      CRIT="$OPTARG"
      ;;
    f)
      FS="$OPTARG"
      ;;
    w)
      WARN="$OPTARG"
      ;;
    :)
      echo "UNKNOWN: option -${OPTARG} requires an argument"
      exit "$UNKNOWN"
      ;;
    \?)
      echo "UNKNOWN: invalid option -${OPTARG}"
      exit "$UNKNOWN"
      ;;
  esac
done

# Both thresholds are compared with -gt later on; reject anything that is not
# a plain non-negative integer now, because a failed comparison there would
# otherwise fall through to the "OK" branch.
for THRESHOLD in "$CRIT" "$WARN"; do
  case "$THRESHOLD" in
    '' | *[!0-9]*)
      echo "UNKNOWN: thresholds must be integers (got -c '${CRIT}' -w '${WARN}')"
      exit "$UNKNOWN"
      ;;
  esac
done
# Evaluate how full the OSTs of filesystem $FS are and report the result as
# "<status text> | <perfdata>", where perfdata holds one "<OST>=<percent>% "
# entry per OST in the order lfs df lists them.

# Run lfs df once so the maximum and the perfdata describe the same snapshot.
# A row is kept only when its first column starts with "<FS>-OST" (so a
# filesystem whose name merely ends in $FS is not picked up) and its fifth
# column is an integer percentage. Each kept row becomes "<OST name> <percent>"
# with the "<FS>-" prefix, the "_UUID" suffix and the "%" sign removed.
OST_USAGE=$(lfs df | awk -v fs="$FS" '
  index($1, fs "-OST") == 1 {
    name = substr($1, length(fs) + 2)
    sub(/_UUID/, "", name)
    pct = $5
    sub(/%/, "", pct)
    if (pct ~ /^[0-9]+$/) print name, pct
  }')

# No usable rows (lfs unavailable, filesystem not mounted, wrong -f value):
# say so explicitly instead of falling through to the OK branch below.
if [ -z "$OST_USAGE" ]; then
  echo "UNKNOWN: no OST usage data found for filesystem '${FS}'"
  exit "$UNKNOWN"
fi

# The fullest OST decides between CRITICAL, WARNING and OK.
STATUS=$(printf '%s\n' "$OST_USAGE" |
  awk 'NR == 1 || $2 + 0 > max + 0 { max = $2 } END { print max + 0 }')

# Usage of every OST; "%%" is the awk printf escape for a literal percent sign.
ALLSORT=$(printf '%s\n' "$OST_USAGE" | awk '{ printf "%s=%s%% ", $1, $2 }')

# CRIT is tested first, so the WARNING branch is only reached when STATUS is
# above WARN but not above CRIT; no explicit in-between range check is needed.
if [ "$STATUS" -gt "$CRIT" ]; then
  echo "At least one OST is at ${STATUS}% capacity! | $ALLSORT"
  exit "$CRITICAL"
elif [ "$STATUS" -gt "$WARN" ]; then
  echo "At least one OST is at ${STATUS}% capacity! | $ALLSORT"
  exit "$WARNING"
else
  echo "All OSTs are below warning capacity. | $ALLSORT"
  exit "$OK"
fi