#!/bin/bash
#########################################################################
# File Name: monitor.sh
# Author: wugj
# mail: [email protected]
# Created Time: Monday, 2015-11-16 15:14:19
#########################################################################
# Monitors system load and memory; sends a mail warning when a threshold
# is exceeded.
#
# This section initialises the globals read by the functions below:
#   hostname, ip, cpu_num, stat_path, cur_time, stat_file, err_log, load_warn
# The shebang must be the very first line of the file to take effect, so it
# now precedes the header banner.

# Host name with every literal ".local" removed (the dot is escaped so it no
# longer matches an arbitrary character).
hostname=$(hostname | sed 's/\.local//g')

# Older variant that listed every non-loopback interface:
#ip=`ifconfig |awk '{print $1,$2}'|egrep -e 'inet addr' -e 'Link'|egrep -v 'lo|127.0.0.1'|cut -f 2 -d ":"|sed 's/Link/:/g'`

# IPv4 address of eth0. Accepts both ifconfig layouts:
#   "inet addr:10.0.0.1  Bcast:..."  and  "inet 10.0.0.1  netmask ..."
ip=$(ifconfig eth0 | awk '/inet / { sub(/^addr:/, "", $2); print $2 }')
echo "ip:$ip"

# Number of CPUs. /proc/cpuinfo has no "model name" lines on some
# architectures, and a count of 0 would later cause a division by zero, so
# fall back to getconf and finally to 1.
cpu_num=$(grep -c 'model name' /proc/cpuinfo)
if ! [[ "$cpu_num" =~ ^[1-9][0-9]*$ ]]; then
  cpu_num=$(getconf _NPROCESSORS_ONLN 2>/dev/null)
  [[ "$cpu_num" =~ ^[1-9][0-9]*$ ]] || cpu_num=1
fi
echo "cpu_num:$cpu_num"

# Directory that holds the per-node status reports and the kill log.
stat_path=/share/nas1/wugj/script/shell/log
echo "persent static path:$stat_path"
cur_time=$(date +%y%m%d)

# Per-node, per-day status report file name.
stat_file="${hostname}${cur_time}.xls"
# Per-month log of killed processes; created with a column header on first use.
err_log="$(date +%y%m).log"
if [[ ! -f "$stat_path/$err_log" ]]; then
  touch "$stat_path/$err_log"
  echo user pid host date command >"$stat_path/$err_log"
fi
echo "$stat_file"

# Warning threshold for the per-core load average.
load_warn=0.70
# Functions that sample the local machine follow.
#######################################
# Compare the 15-minute load average per core with the warning levels and
# print an alert line when one is exceeded.
# Globals:
#   cpu_num    (read)  number of cores; treated as 1 if not a positive integer
#   load_warn  (read)  per-core warning threshold
#   hostname   (read)  host name used in the alert text
#   load_15, average_load, average_int, load_now (written)
# Outputs:
#   Diagnostic lines plus, when a threshold is exceeded, one alert line on
#   stdout.
#######################################
watc_cpu_test(){
  local cores=$cpu_num
  # Guard against an empty or zero core count (would divide by zero).
  [[ "$cores" =~ ^[1-9][0-9]*$ ]] || cores=1

  # 15-minute load average: the last field printed by uptime.
  load_15=$(uptime | awk '{print $NF}')
  echo "load_15: $load_15"

  # Load per core with three decimals; printf always emits the leading zero.
  average_load=$(awk -v load="$load_15" -v cores="$cores" \
    'BEGIN { printf "%.3f", load / cores }')
  echo "$average_load"
  average_int=${average_load%%.*}
  echo "average_int:$average_int"

  # Integer part above 0 means the per-core load is >= 1.0: alert at once.
  # Otherwise compare against the configured warning threshold.
  if (( average_int > 0 )); then
    # ${hostname} needs braces: "$hostname15" would expand an unset variable.
    echo "${hostname}15分钟的系统平均负载为${average_load},超过警戒值1.0,请立即处理!!!"
  else
    # Numeric comparison (expr compares decimals as strings). 1 = over.
    load_now=$(awk -v avg="$average_load" -v warn="$load_warn" \
      'BEGIN { print (avg > warn) ? 1 : 0 }')
    if (( load_now == 1 )); then
      echo "${hostname}15分钟的系统平均负载达到 ${average_load},超过警戒值${load_warn},请及时处理。"
    fi
  fi
}
#######################################
# Run a command and send it SIGHUP if it is still running after a limit.
# Globals:
#   TIMEOUT_WAITSEC (read, optional)  limit in seconds; default 5
#   waitsec         (written)         the limit actually used
# Arguments:
#   $@ - command and its arguments; passed through without re-splitting
# Returns:
#   Exit status of the command (non-zero when it failed or was killed).
#######################################
function timeout()
{
  waitsec=${TIMEOUT_WAITSEC:-5}
  local pid watchdog sleeper status=0

  # "$@" keeps each argument intact; an unquoted $* would split on spaces
  # and expand globs.
  ("$@") & pid=$!

  # Watchdog: sleeps for the limit, then signals the command. Its TERM trap
  # stops the sleep early so nothing lingers once the command has finished.
  # Output is discarded so it never holds the caller's stdout open.
  (
    trap 'kill "$sleeper" 2>/dev/null; exit 0' TERM
    sleep "$waitsec" & sleeper=$!
    wait "$sleeper" && kill -HUP "$pid"
  ) >/dev/null 2>&1 & watchdog=$!

  # Collect the command's status, then dismiss the watchdog.
  wait "$pid" 2>/dev/null || status=$?
  kill -TERM "$watchdog" 2>/dev/null
  wait "$watchdog" 2>/dev/null
  return "$status"
}
# Memory usage percentage above which watch_mem reacts.
mem_quota=90

#######################################
# Compute memory usage (excluding free, buffers and page cache) and react
# when it exceeds mem_quota.
# Globals:
#   MEMINFO_FILE (read, optional)  file to parse; default /proc/meminfo
#   mem_quota    (read)            usage threshold in percent
#   memtotal, memfree, cached, buffers (written)  values read from the file
#   mem_usage    (written)         integer usage percentage
#   mem_message  (written)         warning text, empty when within quota
# Returns:
#   0 when usage is within quota
#   1 when usage is over quota (kill_proc has been called)
#   2 when the memory figures could not be read
#######################################
function watch_mem()
{
  local meminfo=${MEMINFO_FILE:-/proc/meminfo}
  local key value
  memtotal=0 memfree=0 cached=0 buffers=0
  mem_message=""

  if [[ ! -r "$meminfo" ]]; then
    echo "watch_mem: cannot read $meminfo" >&2
    return 2
  fi

  # One pass over the file; exact key match so SwapCached is not taken for
  # Cached.
  while read -r key value _; do
    case "$key" in
      MemTotal:) memtotal=$value ;;
      MemFree:)  memfree=$value ;;
      Buffers:)  buffers=$value ;;
      Cached:)   cached=$value ;;
    esac
  done <"$meminfo"

  if ! [[ "$memtotal" =~ ^[0-9]+$ ]] || (( memtotal == 0 )); then
    echo "watch_mem: no usable MemTotal in $meminfo" >&2
    return 2
  fi

  # Sum first, divide once: avoids accumulating three truncation errors.
  mem_usage=$(( 100 - (memfree + buffers + cached) * 100 / memtotal ))

  if (( mem_usage > mem_quota )); then
    mem_message="WARN! The Memory usage is over than ${mem_usage}%"
    kill_proc
    return 1
  fi
  return 0
}
#######################################
# Print cumulative CPU counters from the first line of a /proc/stat-style
# file.
# Arguments:
#   $1 - (optional) file to read; default /proc/stat
# Outputs:
#   "<used> <unused>" where used   = fields 2+3+4
#                           unused = fields 5+6+7+8
#######################################
function get_cpu_info()
{
  local stat_source=${1:-/proc/stat}
  # %.0f keeps large counters in plain integer notation on every awk.
  awk 'NR == 1 { printf "%.0f %.0f\n", $2 + $3 + $4, $5 + $6 + $7 + $8; exit }' \
    "$stat_source"
}
#######################################
# Append the first five lines of `ps aux`, sorted by column 3 in descending
# numeric order, to the status report.
# Globals:
#   stat_path, stat_file (read)  location of the status report
#######################################
function proc_cpu_ps5()
{
  # Quoted target: an unquoted path containing spaces makes bash fail with
  # "ambiguous redirect".
  ps aux | sort -nk3r | head -n 5 >>"$stat_path/$stat_file"
}
#######################################
# Append the top column header and the five busiest processes (by column 9,
# %CPU) to the status report, then mail the report.
# Globals:
#   stat_path, stat_file (read)     location of the status report
#   usr                  (written)  user of the busiest process
#######################################
function proc_cpu_top5()
{
  local report="$stat_path/$stat_file"
  local snapshot ranked

  # One top snapshot, so header, ranking and user all describe the same
  # instant.
  snapshot=$(top -n 1 -b)
  # Process rows are the lines whose first field is a numeric PID.
  ranked=$(printf '%s\n' "$snapshot" | awk '$1 ~ /^[0-9]+$/' | sort -k9nr)

  {
    # Line 7 of batch-mode top output is the column header.
    printf '%s\n' "$snapshot" | sed -n '7p'
    printf '%s\n' "$ranked" | head -n 5
  } >>"$report"

  usr=$(printf '%s\n' "$ranked" | awk 'NR == 1 { print $2 }')

  # NOTE(review): the recipient addresses look like placeholders in this
  # copy of the file - confirm the real addresses.
  mail -s "cpu load high" -c [email protected] [email protected] <"$report"
}
#######################################
# Kill the process with the largest resident memory (RES column of top),
# record it in the kill log and notify by mail.
# Globals:
#   stat_path, err_log (read)  location of the kill log
#   hostname           (read)  host name used in the log and mail text
# Outputs:
#   "killed process <pid>" on stdout; one line appended to the kill log in
#   the column order of its header: user pid host date command.
# Returns:
#   0 when no process row was found; otherwise the status of mail.
#######################################
function kill_proc()
{
  local victim user pid command now

  # Choose the victim in awk but act on it in the shell: process names are
  # untrusted text and must never be spliced into a command line run by
  # system().
  # RES is in KiB unless suffixed; m/g/t suffixes are scaled to KiB so all
  # rows compare on the same unit. Rows are lines whose first field is a
  # numeric PID; column 12 is the first word of the command.
  victim=$(top -n 1 -b -c | awk '
    $1 ~ /^[0-9]+$/ {
      res = $6
      kb = res + 0
      if (res ~ /[tT]$/)      kb *= 1024 * 1024 * 1024
      else if (res ~ /[gG]$/) kb *= 1024 * 1024
      else if (res ~ /[mM]$/) kb *= 1024
      if (pid == "" || kb > best) {
        best = kb; pid = $1; user = $2; cmd = $12
      }
    }
    END { if (pid != "") print user, pid, cmd }')

  [[ -n "$victim" ]] || return 0
  read -r user pid command <<<"$victim"

  # NOTE(review): no owner is excluded, so root-owned processes can be
  # selected as well - confirm this is intended.
  kill -9 "$pid"
  echo "killed process $pid"

  now=$(date)
  echo "$user $pid $hostname $now $command" >>"$stat_path/$err_log"

  # NOTE(review): the first recipient looks like a placeholder in this copy
  # of the file - confirm the real address.
  echo "Dear $user , you process $pid has been killed of $hostname at $now" \
    | mail -s "killed pid warn" [email protected] "$user@biomarker.com.cn"
}
# CPU usage percentage above which watch_cpu reports the busiest processes.
cpu_quota=80

#######################################
# Measure CPU usage between two get_cpu_info samples, record it in the
# status report and, when it exceeds cpu_quota, add a warning and run
# proc_cpu_top5 under timeout.
# Globals:
#   cpu_quota            (read)     usage threshold in percent
#   stat_path, stat_file (read)     location of the status report
#   time_point_1, time_point_2, cpu_usage (written)
#   cpu_message          (written)  set only when over quota
# Arguments:
#   $1 - (optional) seconds between the two samples; default 10
#######################################
function watch_cpu()
{
  local interval=${1:-10}
  local report="$stat_path/$stat_file"

  time_point_1=$(get_cpu_info)
  sleep "$interval"
  time_point_2=$(get_cpu_info)

  # Each sample is "<used> <unused>"; usage is the used share of the total
  # delta. A zero delta reports 0 instead of dividing by zero.
  cpu_usage=$(echo $time_point_1 $time_point_2 | awk '{
    used = $3 - $1
    total = $3 + $4 - $1 - $2
    pct = (total > 0) ? used * 100 / total : 0
    print pct
  }')
  echo "cpu_usage: $cpu_usage" >>"$report"

  # Numeric comparison done in awk: inside [[ ]] the ">" operator compares
  # strings, so "100" would rank below "80" and "9.5" above it.
  if awk -v usage="$cpu_usage" -v quota="$cpu_quota" \
      'BEGIN { exit !(usage > quota) }'; then
    cpu_message="WARN! The CPU Usage is over than ${cpu_quota}%"
    echo "cpu_message: $cpu_message" >>"$report"
    #timeout proc_cpu_ps5
    timeout proc_cpu_top5
    #kill_proc
  fi
}
#######################################
# (Re)write the status report with the host summary. The file is truncated
# on every call.
# Globals:
#   stat_path, stat_file (read)  location of the status report
#   ip, hostname, cpu_num, mem_usage, mem_message (read)  values reported
# Outputs:
#   "make new file successful" on stdout when the report had to be created.
#######################################
function update_file(){
  local report="$stat_path/$stat_file"

  if [[ ! -f "$report" ]]; then
    # Announce success only when the file was actually created.
    touch "$report" && echo make new file successful
  fi

  # printf with %s writes the values verbatim. Splicing them into awk
  # program text breaks as soon as a value contains a quote or backslash.
  {
    printf 'ip: %s \n' "$ip"
    printf 'host: %s\n' "$hostname"
    printf 'cpu_num: %s\n' "$cpu_num"
    printf 'mem_usage: %s%%\n' "$mem_usage"
    printf 'mem_message: %s\n' "$mem_message"
  } >"$report"
}
# Run the checks in order: memory check, (re)write the status report, then
# CPU check, which appends to that report.
# kill_proc is deliberately not called here: watch_mem already calls it when
# memory usage exceeds mem_quota. An unconditional call would SIGKILL the
# largest process on every run, even when memory usage is within quota.
watch_mem
update_file
watch_cpu