blob: 55d9ef7cc4fe13b987697d0c9d7d5d63f932264c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
|
#!/bin/bash
# smart disk monitor for critial parameters
. /etc/crons.conf.d/smart-monitor.conf
long=
tmp=`mktemp`
trap "rm -f ${tmp}" EXIT
if [ x$1 = x--print ]; then
smart_monitor_send_always=1
fi
cache_dir='/var/cache/smart-monitor'
[ -d $cache_dir ] || mkdir -p $cache_dir
# for now, store output into variable since we don't know yet if we are going
# to send it or not (depending on errors)
exec 6>&1
exec 1<>"${tmp}"
echo "==========================================================="
echo " critical parameters table"
echo "==========================================================="
echo ""
echo ""
echo "Critical parameters, if any of these parameters are >0 - INVESTIGATE!"
echo "And start thinking about disk change."
echo ""
echo ""
# draw table header
echo "+------------------------------------------+--------+-----+-----+-----+-----+-----+-----+-----+-----+"
echo "| drive | (1) | (2) | (3) | (4) | (5) | (6) | (7) | (8) | (9) |"
echo "+------------------------------------------+--------+-----+-----+-----+-----+-----+-----+-----+-----+"
# total warning/error count, if higher than 0, script will print report
# to stdout, otherwise it will stay silent
etotal=0
for d in "${smart_monitor_disks[@]}"
do
dl=40 # drive length for printf
vl=3 # value length for printf
smart=`/usr/sbin/smartctl -a $d`
disk_name=`echo $d | rev | cut -f1 -d/ | rev`
total_cached=$(cat $cache_dir/$disk_name 2>/dev/null)
if [ -z "$total_cached" ]; then total_cached=0; fi
# add detailed smart info for later print
long+=$'===========================================================\n'
long+=$' '$disk_name$'\n'
long+=$'===========================================================\n\n\n'
long+="$smart"
long+=$'\n\n\n'
# collect short health status
health=`echo "$smart" | grep "SMART overall" | cut -d':' -f2 | xargs echo`
rrer=`echo "$smart" | grep "Raw_Read_Error_Rate" | awk '{print $10}'`
ucec=`echo "$smart" | grep "UDMA_CRC_Error_Count" | awk '{print $10}'`
rsc=`echo "$smart" | grep "Reallocated_Sector_Ct" | awk '{print $10}'`
ser=`echo "$smart" | grep "Seek_Error_Rate" | awk '{print $10}'`
src=`echo "$smart" | grep "Spin_Retry_Count" | awk '{print $10}'`
rec=`echo "$smart" | grep "Reallocated_Event_Count" | awk '{print $10}'`
cps=`echo "$smart" | grep "Current_Pending_Sector" | awk '{print $10}'`
ou=`echo "$smart" | grep "Offline_Uncorrectable" | awk '{print $10}'`
disk_warns=$((rrer + ucec + rsc + ser + src + rec + cps + ou))
if [ "${health}" != "PASSED" ]; then
disk_warns=$((disk_warns + 1))
fi
if [ $disk_warns -ne $total_cached ]; then
# make sure we only send report once disk value has increased. This
# is usefull when user wants to ignore warning, he may decide that
# UDMA_CRC_Error_Count is of no interest for him, or single
# Reallocated_Sector_Ct does not botter him. Without it we would
# send email each time this script is run. Now we only send report
# when value incraeses.
etotal=$((etotal + disk_warns))
echo $disk_warns > $cache_dir/$disk_name
# else
# do not increase etotal if we already reported problem once, in this
# case etotal will be 0 (if no other errors are detected) and report
# won't be send
fi
# print table line
printf "| %-${dl}s | %s | %${vl}d | %${vl}d | %${vl}d | %${vl}d | %${vl}d | %${vl}d | %${vl}d | %${vl}d |\n" \
$disk_name $health $rrer $ucec $rsc $ser $src $rec $cps $ou
done
echo "+------------------------------------------+--------+-----+-----+-----+-----+-----+-----+-----+-----+"
echo ""
echo "Legend:"
echo -e "\t-(1): overall health"
echo -e "\t-(2): raw read error rate"
echo -e "\t-(3): udma crc error count"
echo -e "\t-(4): realocated sector count"
echo -e "\t-(5): seek error rate"
echo -e "\t-(6): spin retry count"
echo -e "\t-(7): reallocated event count"
echo -e "\t-(8): current pending sector"
echo -e "\t-(9): offline uncorrectable\n"
echo "==========================================================="
echo " detailed health status"
echo "==========================================================="
echo ""
echo ""
echo "$long"
# restore stdout
exec 1>&6 6>&-
if [ ${etotal} -gt 0 ] || [ $smart_monitor_send_always -eq 1 ]; then
# if there was at least one error or user wants report always,
# send and email
cat "${tmp}"
fi
|