2016-11-27 14:32:32 +01:00
|
|
|
#!/bin/bash
|
|
|
|
# Script informed by the collectd monitoring script for smartmontools (using smartctl)
|
|
|
|
# by Samuel B. <samuel_._behan_(at)_dob_._sk> (c) 2012
|
|
|
|
# source at: http://devel.dob.sk/collectd-scripts/
|
|
|
|
|
|
|
|
# TODO: This probably needs to be a little more complex. The raw numbers can have more
|
|
|
|
# data in them than you'd think.
|
|
|
|
# http://arstechnica.com/civis/viewtopic.php?p=22062211
|
|
|
|
|
|
|
|
# awk program applied to the attribute table printed by `smartctl -A`.
#
# A row is processed when its first field is numeric (the SMART id) and its
# second field looks like an attribute name. For each such row four metric
# lines are printed, built from the fields the program reads:
#   $1  -> smart_id label
#   $2  -> metric name prefix (dashes replaced by underscores)
#   $4  -> <name>_value
#   $5  -> <name>_worst
#   $6  -> <name>_threshold
#   $10 -> <name>_raw_value (printed with %e)
# The caller must pass the pre-rendered label list via `-v labels=...`.
#
# Only the attribute name ($2) has its dashes replaced; rewriting the whole
# record would also mangle a leading minus sign in the raw value column.
parse_smartctl_attributes_awk="$(cat << 'SMARTCTLAWK'
$1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ {
  gsub(/-/, "_", $2);
  printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4
  printf "%s_worst{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $5
  printf "%s_threshold{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $6
  printf "%s_raw_value{%s,smart_id=\"%s\"} %e\n", $2, labels, $1, $10
}
SMARTCTLAWK
)"
|
|
|
|
|
|
|
|
# Names of the SMART attributes that are exported, one per line and in lower
# case. The list is used as a filter on generated metric lines.
smartmon_attrs="$(cat << 'SMARTMONATTRS'
airflow_temperature_cel
command_timeout
current_pending_sector
end_to_end_error
erase_fail_count
g_sense_error_rate
hardware_ecc_recovered
host_reads_mib
host_reads_32mib
host_writes_mib
host_writes_32mib
load_cycle_count
media_wearout_indicator
wear_leveling_count
nand_writes_1gib
offline_uncorrectable
power_cycle_count
power_on_hours
program_fail_count
raw_read_error_rate
reallocated_sector_ct
reported_uncorrect
sata_downshift_count
spin_retry_count
spin_up_time
start_stop_count
temperature_case
temperature_celsius
temperature_internal
total_lbas_read
total_lbas_written
udma_crc_error_count
unsafe_shutdown_count
workld_host_reads_perc
workld_media_wear_indic
workload_minutes
SMARTMONATTRS
)"
# Collapse the list into a single `a|b|c` alternation usable inside an
# extended regular expression. Done with parameter expansion so no word
# splitting, globbing or extra processes are involved.
smartmon_attrs="${smartmon_attrs//$'\n'/|}"
|
|
|
|
|
|
|
|
#######################################
# Convert the attribute table of `smartctl -A` (read from stdin) into metric
# lines and keep only the whitelisted attributes.
# Globals:
#   parse_smartctl_attributes_awk (read) - awk program producing the lines
#   smartmon_attrs (read)               - `|`-separated attribute whitelist
# Arguments:
#   $1 - disk, used as the value of the `disk` label
#   $2 - device type, used as the value of the `type` label
# Outputs:
#   Matching metric lines on stdout.
# Returns:
#   Exit status of the final grep (non-zero when no line matched).
#######################################
parse_smartctl_attributes() {
  local disk="$1"
  local disk_type="$2"
  local labels="disk=\"${disk}\",type=\"${disk_type}\""
  # NOTE: the lower-casing step applies to the entire line, so label values
  # are lower-cased together with the metric name.
  # awk diagnostics are discarded on purpose (as in the original pipeline).
  sed 's/^ *//' \
    | awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null \
    | tr '[:upper:]' '[:lower:]' \
    | grep -E "(${smartmon_attrs})"
}
|
|
|
|
|
2018-07-04 00:30:20 +02:00
|
|
|
#######################################
# Convert the `smartctl -A` output of a SCSI device (read from stdin) into
# raw-value metric lines.
#
# Each input line is split at the first `:` or `=`; the left part (leading
# blanks removed, spaces turned into underscores) selects the attribute and
# the first word of the right part is formatted with %e.
# Arguments:
#   $1 - disk, used as the value of the `disk` label
#   $2 - device type, used as the value of the `type` label
# Outputs:
#   One metric line per attribute that was present in the input. Attributes
#   that never appeared are omitted instead of being printed without a
#   value, which would not be a parseable sample.
#######################################
parse_smartctl_scsi_attributes() {
  local disk="$1"
  local disk_type="$2"
  local labels="disk=\"${disk}\",type=\"${disk_type}\""
  local line attr_type attr_value
  local power_on='' temp_cel='' lbas_read='' power_cycle=''

  while read -r line; do
    attr_type="$(printf '%s\n' "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ *//' | tr ' ' '_')"
    attr_value="$(printf '%s\n' "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ *//')"
    # awk's $1 is the first whitespace-separated word, so a trailing unit
    # such as "C" after the temperature is ignored.
    case "${attr_type}" in
      number_of_hours_powered_up_)
        power_on="$(printf '%s\n' "${attr_value}" | awk '{ printf "%e\n", $1 }')"
        ;;
      Current_Drive_Temperature)
        temp_cel="$(printf '%s\n' "${attr_value}" | awk '{ printf "%e\n", $1 }')"
        ;;
      Blocks_read_from_cache_and_sent_to_initiator_)
        lbas_read="$(printf '%s\n' "${attr_value}" | awk '{ printf "%e\n", $1 }')"
        ;;
      Accumulated_start-stop_cycles)
        power_cycle="$(printf '%s\n' "${attr_value}" | awk '{ printf "%e\n", $1 }')"
        ;;
    esac
  done

  if [[ -n "${power_on}" ]]; then
    echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}"
  fi
  if [[ -n "${temp_cel}" ]]; then
    echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}"
  fi
  if [[ -n "${lbas_read}" ]]; then
    echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}"
  fi
  if [[ -n "${power_cycle}" ]]; then
    echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}"
  fi
  return 0
}
|
|
|
|
|
2016-11-27 14:32:32 +01:00
|
|
|
#######################################
# Convert the output of `smartctl -i -H` (read from stdin) into the
# device_info and device_smart_* metric lines.
#
# Every input line is split at the first `:`. The key (spaces replaced by
# underscores) selects which field is filled; the remainder of the line,
# with leading blanks removed, is the value.
# Arguments:
#   $1 - disk, used as the value of the `disk` label
#   $2 - device type, used as the value of the `type` label
# Outputs:
#   Always four lines on stdout: device_info (fields that were not seen are
#   left as empty labels), device_smart_available, device_smart_enabled and
#   device_smart_healthy.
#######################################
parse_smartctl_info() {
  local -i smart_available=0 smart_enabled=0 smart_healthy=0
  local disk="$1" disk_type="$2"
  local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id=''
  local line info_type info_value

  while read -r line; do
    info_type="$(printf '%s\n' "${line}" | cut -f1 -d: | tr ' ' '_')"
    # Values end up inside double-quoted labels, so every backslash and
    # every double quote is escaped (backslashes first, to avoid escaping
    # the escapes that are added for the quotes).
    info_value="$(printf '%s\n' "${line}" | cut -f2- -d: \
      | sed -e 's/^ *//' -e 's/\\/\\\\/g' -e 's/"/\\"/g')"

    case "${info_type}" in
      Model_Family) model_family="${info_value}" ;;
      Device_Model) device_model="${info_value}" ;;
      Serial_Number) serial_number="${info_value}" ;;
      Firmware_Version) fw_version="${info_value}" ;;
      Vendor) vendor="${info_value}" ;;
      Product) product="${info_value}" ;;
      Revision) revision="${info_value}" ;;
      Logical_Unit_id) lun_id="${info_value}" ;;
      SMART_support_is)
        # This key can occur more than once; only the first 7 characters of
        # the value are compared.
        case "${info_value:0:7}" in
          Enabled) smart_enabled=1 ;;
          Availab) smart_available=1 ;;
          Unavail) smart_available=0 ;;
        esac
        ;;
      SMART_overall-health_self-assessment_test_result)
        if [[ "${info_value:0:6}" == 'PASSED' ]]; then
          smart_healthy=1
        fi
        ;;
      SMART_Health_Status)
        if [[ "${info_value:0:2}" == 'OK' ]]; then
          smart_healthy=1
        fi
        ;;
    esac
  done

  echo "device_info{disk=\"${disk}\",type=\"${disk_type}\",vendor=\"${vendor}\",product=\"${product}\",revision=\"${revision}\",lun_id=\"${lun_id}\",model_family=\"${model_family}\",device_model=\"${device_model}\",serial_number=\"${serial_number}\",firmware_version=\"${fw_version}\"} 1"
  echo "device_smart_available{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_available}"
  echo "device_smart_enabled{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_enabled}"
  echo "device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_healthy}"
}
|
|
|
|
|
|
|
|
# awk program that prefixes every metric line with `smartmon_` and prints a
# `# HELP` / `# TYPE` header whenever the metric name changes.
#
# It must be run with `{` as the field separator so that $1 is the metric
# name (everything before the label block). A header is printed each time
# $1 differs from the previous line, so the input has to be grouped by
# metric name (e.g. sorted) for each header to appear only once.
output_format_awk="$(cat << 'OUTPUTAWK'
BEGIN { last_name = "" }
last_name != $1 {
  print "# HELP smartmon_" $1 " SMART metric " $1;
  print "# TYPE smartmon_" $1 " gauge";
  last_name = $1
}
{ print "smartmon_" $0 }
OUTPUTAWK
)"
|
|
|
|
|
|
|
|
#######################################
# Sort the metric lines read from stdin and render them in their final
# form (prefix plus HELP/TYPE headers).
# Globals:
#   output_format_awk (read) - awk program that adds prefix and headers
# Outputs:
#   Formatted metrics on stdout.
#######################################
format_output() {
  # Byte-wise collation keeps the ordering independent of the caller's
  # locale, so the output is reproducible across environments.
  LC_ALL=C sort \
    | awk -F'{' "${output_format_awk}"
}
|
|
|
|
|
|
|
|
# Version string taken from the first line of `smartctl -V` (second word of
# the line that starts with "smartctl"). Empty when it cannot be determined.
smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')"

# The version metric is emitted even when the gate below stops the script.
echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output

# Stop here, without emitting any per-device metrics, when the major version
# is below 6 or when no "<digits>." prefix can be found in the version.
smartctl_major_re='^([0-9]+)\.'
if [[ ! "${smartctl_version}" =~ ${smartctl_major_re} ]] \
  || ((10#${BASH_REMATCH[1]} < 6)); then
  exit 0
fi
|
|
|
|
|
2017-10-18 07:37:47 +02:00
|
|
|
# One "<device>|<type>" entry per device reported by `smartctl --scan-open`
# (first and third word of every line that starts with /dev).
device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')"

# Word splitting of the list is intentional: entries are whitespace
# separated and contain no blanks themselves.
# shellcheck disable=SC2086
for device in ${device_list}; do
  disk="${device%%|*}"
  type="${device#*|}"
  # Timestamp (seconds since the epoch) of this collection run.
  echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"} $(TZ=UTC date '+%s')"
  # Get the SMART information and health
  /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"
  # Get the SMART attributes
  case "${type}" in
    sat)
      /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}"
      ;;
    scsi)
      /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}"
      ;;
    *)
      # The diagnostic goes to stderr so it cannot end up as a bogus metric
      # line, and the remaining devices are still processed.
      echo "disk type is not sat or scsi, ${type}" >&2
      continue
      ;;
  esac
done | format_output
|