From 6aa5cfba6c42ea6a9bd83787b2e595984d91f8d6 Mon Sep 17 00:00:00 2001 From: Christopher Blum Date: Tue, 18 Sep 2018 22:43:20 +0200 Subject: [PATCH] textfile example script rework (#1074) * textfile smartmon.sh Added functions to also parse megaraid disks. Added parsing to also detect the grown_defects counters. * textfile storcli.py Reworked the example file to export lots more information about megaraid attached controllers, VDs and PDs. Signed-off-by: Christopher Blum --- text_collector_examples/smartmon.sh | 111 +++++++------ text_collector_examples/storcli.py | 245 ++++++++++++++++++---------- 2 files changed, 220 insertions(+), 136 deletions(-) diff --git a/text_collector_examples/smartmon.sh b/text_collector_examples/smartmon.sh index 9b0c7d4a..7b873fac 100755 --- a/text_collector_examples/smartmon.sh +++ b/text_collector_examples/smartmon.sh @@ -7,7 +7,11 @@ # data in them than you'd think. # http://arstechnica.com/civis/viewtopic.php?p=22062211 -parse_smartctl_attributes_awk="$(cat << 'SMARTCTLAWK' +# Formatting done via shfmt -i 2 +# https://github.com/mvdan/sh + +parse_smartctl_attributes_awk="$( + cat <<'SMARTCTLAWK' $1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ { gsub(/-/, "_"); printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4 @@ -18,7 +22,8 @@ $1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ { SMARTCTLAWK )" -smartmon_attrs="$(cat << 'SMARTMONATTRS' +smartmon_attrs="$( + cat <<'SMARTMONATTRS' airflow_temperature_cel command_timeout current_pending_sector @@ -64,63 +69,65 @@ parse_smartctl_attributes() { local disk_type="$2" local labels="disk=\"${disk}\",type=\"${disk_type}\"" local vars="$(echo "${smartmon_attrs}" | xargs | tr ' ' '|')" - sed 's/^ \+//g' \ - | awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null \ - | tr A-Z a-z \ - | grep -E "(${smartmon_attrs})" + sed 's/^ \+//g' | + awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null | + tr A-Z a-z | + grep -E "(${smartmon_attrs})" } parse_smartctl_scsi_attributes() { - local disk="$1" - local disk_type="$2" - local labels="disk=\"${disk}\",type=\"${disk_type}\"" - while read line ; do - attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')" - attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')" - case "${attr_type}" in - number_of_hours_powered_up_) power_on="$( echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; - Current_Drive_Temperature) temp_cel="$(echo ${attr_value} | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;; - Blocks_read_from_cache_and_sent_to_initiator_) lbas_read="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;; - Accumulated_start-stop_cycles) power_cycle="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;; - esac - done - echo "power_on_hours_raw_value{"${labels}",smart_id=\"9\"} ${power_on}" - echo "temperature_celsius_raw_value{"${labels}",smart_id=\"194\"} ${temp_cel}" - echo "total_lbas_read_raw_value{"${labels}",smart_id=\"242\"} ${lbas_read}" - echo "power_cycle_count_raw_value{"${labels}",smart_id=\"12\"} ${power_cycle}" + local disk="$1" + local disk_type="$2" + local labels="disk=\"${disk}\",type=\"${disk_type}\"" + while read line; do + attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')" + attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')" + case "${attr_type}" in + number_of_hours_powered_up_) power_on="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + Current_Drive_Temperature) temp_cel="$(echo ${attr_value} | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;; + Blocks_read_from_cache_and_sent_to_initiator_) lbas_read="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;; + Accumulated_start-stop_cycles) power_cycle="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;; + Elements_in_grown_defect_list) grown_defects="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;; + esac + done + [ ! -z "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}" + [ ! -z "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}" + [ ! -z "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}" + [ ! -z "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}" + [ ! -z "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"12\"} ${grown_defects}" } parse_smartctl_info() { local -i smart_available=0 smart_enabled=0 smart_healthy=0 local disk="$1" disk_type="$2" local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id='' - while read line ; do + while read line; do info_type="$(echo "${line}" | cut -f1 -d: | tr ' ' '_')" info_value="$(echo "${line}" | cut -f2- -d: | sed 's/^ \+//g' | sed 's/"/\\"/')" case "${info_type}" in - Model_Family) model_family="${info_value}" ;; - Device_Model) device_model="${info_value}" ;; - Serial_Number) serial_number="${info_value}" ;; - Firmware_Version) fw_version="${info_value}" ;; - Vendor) vendor="${info_value}" ;; - Product) product="${info_value}" ;; - Revision) revision="${info_value}" ;; - Logical_Unit_id) lun_id="${info_value}" ;; + Model_Family) model_family="${info_value}" ;; + Device_Model) device_model="${info_value}" ;; + Serial_Number) serial_number="${info_value}" ;; + Firmware_Version) fw_version="${info_value}" ;; + Vendor) vendor="${info_value}" ;; + Product) product="${info_value}" ;; + Revision) revision="${info_value}" ;; + Logical_Unit_id) lun_id="${info_value}" ;; esac - if [[ "${info_type}" == 'SMART_support_is' ]] ; then + if [[ "${info_type}" == 'SMART_support_is' ]]; then case "${info_value:0:7}" in - Enabled) smart_enabled=1 ;; - Availab) smart_available=1 ;; - Unavail) smart_available=0 ;; + Enabled) smart_enabled=1 ;; + Availab) smart_available=1 ;; + Unavail) smart_available=0 ;; esac fi - if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]] ; then + if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]]; then case "${info_value:0:6}" in - PASSED) smart_healthy=1 ;; + PASSED) smart_healthy=1 ;; esac - elif [[ "${info_type}" == 'SMART_Health_Status' ]] ; then + elif [[ "${info_type}" == 'SMART_Health_Status' ]]; then case "${info_value:0:2}" in - OK) smart_healthy=1 ;; + OK) smart_healthy=1 ;; esac fi done @@ -130,7 +137,8 @@ parse_smartctl_info() { echo "device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_healthy}" } -output_format_awk="$(cat << 'OUTPUTAWK' +output_format_awk="$( + cat <<'OUTPUTAWK' BEGIN { v = "" } v != $1 { print "# HELP smartmon_" $1 " SMART metric " $1; @@ -142,15 +150,15 @@ OUTPUTAWK )" format_output() { - sort \ - | awk -F'{' "${output_format_awk}" + sort | + awk -F'{' "${output_format_awk}" } -smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')" +smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')" echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output -if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]] ; then +if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]]; then exit fi @@ -159,13 +167,18 @@ device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}') for device in ${device_list}; do disk="$(echo ${device} | cut -f1 -d'|')" type="$(echo ${device} | cut -f2 -d'|')" - echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" $(TZ=UTC date '+%s') + echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')" # Get the SMART information and health /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}" # Get the SMART attributes case ${type} in - sat) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;; - scsi) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;; - *) echo "disk type is not sat or scsi, ${type}"; exit ;; + sat) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;; + sat+megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;; + scsi) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;; + megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;; + *) + echo "disk type is not sat, scsi or megaraid but ${type}" + exit + ;; esac done | format_output diff --git a/text_collector_examples/storcli.py b/text_collector_examples/storcli.py index 31662909..48e2bbaf 100755 --- a/text_collector_examples/storcli.py +++ b/text_collector_examples/storcli.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Script to parse StorCLI's JSON output and expose MegaRAID health as Prometheus metrics. @@ -19,110 +19,181 @@ import argparse import json import os import subprocess +import shlex +from dateutil.parser import parse +import collections +from enum import IntEnum DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as Prometheus metrics.""" -VERSION = '0.0.1' +VERSION = '0.0.2' + +storcli_path = '' +metric_prefix = 'megaraid_' +metric_list = {} +metric_list = collections.defaultdict(list) + + +class VD_State(IntEnum): + Optl = 0 # Optimal + Dgrd = 1 # Degraded + Pdgd = 2 # Partially Degraded + OfLn = 3 # Offline + Rec = 4 # Recovery + Cac = 5 # CacheCade def main(args): """ main """ + global storcli_path + storcli_path = args.storcli_path + data = json.loads(get_storcli_json('/cALL show all J')) - # exporter variables - metric_prefix = 'megaraid_' - metric_controller_labels = '{{controller="{}", model="{}"}}' + # All the information is collected underneath the Controllers key + data = data['Controllers'] - data = json.loads(get_storcli_json(args.storcli_path)) + # try: + # overview = status['Response Data']['System Overview'] + # except KeyError: + # pass - # It appears that the data we need will always be present in the first - # item in the Controllers array - status = data['Controllers'][0] + for controller in data: + response = controller['Response Data'] + if response['Version']['Driver Name'] == 'megaraid_sas': + handle_megaraid_controller(response) + elif response['Version']['Driver Name'] == 'mpt3sas': + handle_sas_controller(response) - metrics = { - 'status_code': status['Command Status']['Status Code'], - 'controllers': status['Response Data']['Number of Controllers'], - } - - for name, value in metrics.iteritems(): - print('# HELP {}{} MegaRAID {}'.format(metric_prefix, name, name.replace('_', ' '))) - print('# TYPE {}{} gauge'.format(metric_prefix, name)) - print("{}{} {}".format(metric_prefix, name, value)) - - controller_info = [] - controller_metrics = {} - overview = [] - - try: - overview = status['Response Data']['System Overview'] - except KeyError: - pass - - for controller in overview: - controller_index = controller['Ctl'] - model = controller['Model'] - controller_info.append(metric_controller_labels.format(controller_index, model)) - - controller_metrics = { - # FIXME: Parse dimmer switch options - # 'dimmer_switch': controller['DS'], - - 'battery_backup_healthy': int(controller['BBU'] == 'Opt'), - 'degraded': int(controller['Hlth'] == 'Dgd'), - 'drive_groups': controller['DGs'], - 'emergency_hot_spare': int(controller['EHS'] == 'Y'), - 'failed': int(controller['Hlth'] == 'Fld'), - 'healthy': int(controller['Hlth'] == 'Opt'), - 'physical_drives': controller['PDs'], - 'ports': controller['Ports'], - 'scheduled_patrol_read': int(controller['sPR'] == 'On'), - 'virtual_drives': controller['VDs'], - - # Reverse StorCLI's logic to make metrics consistent - 'drive_groups_optimal': int(controller['DNOpt'] == 0), - 'virtual_drives_optimal': int(controller['VNOpt'] == 0), - } - - for name, value in controller_metrics.iteritems(): - print('# HELP {}{} MegaRAID {}'.format(metric_prefix, name, name.replace('_', ' '))) - print('# TYPE {}{} gauge'.format(metric_prefix, name)) - print('{}{}{{controller="{}"}} {}'.format(metric_prefix, name, - controller_index, value)) - - if controller_info: - print('# HELP {}{} MegaRAID controller info'.format(metric_prefix, 'controller_info')) - print('# TYPE {}{} gauge'.format(metric_prefix, 'controller_info')) - for labels in controller_info: - print('{}{}{} {}'.format(metric_prefix, 'controller_info', labels, 1)) + # print_dict_to_exporter({'controller_info': [1]}, controller_info_list) + # print_dict_to_exporter({'virtual_disk_info': [1]}, vd_info_list) + # print_dict_to_exporter({'physical_disk_info': [1]}, pd_info_list) + # print_all_metrics(vd_metric_list) + print_all_metrics(metric_list) -def get_storcli_json(storcli_path): +def handle_sas_controller(response): + pass + + +def handle_megaraid_controller(response): + controller_index = response['Basics']['Controller'] + baselabel = 'controller="{}"'.format(controller_index) + + controller_info_label = baselabel + ',model="{}",serial="{}",fwversion="{}"'.format( + response['Basics']['Model'], + response['Basics']['Serial Number'], + response['Version']['Firmware Version'], + ) + add_metric('controller_info', controller_info_label, 1) + + add_metric('battery_backup_healthy', baselabel, int(response['Status']['BBU Status'] == 0)) + add_metric('degraded', baselabel, int(response['Status']['Controller Status'] == 'Degraded')) + add_metric('failed', baselabel, int(response['Status']['Controller Status'] == 'Failed')) + add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'Optimal')) + add_metric('drive_groups', baselabel, response['Drive Groups']) + add_metric('virtual_drives', baselabel, response['Virtual Drives']) + add_metric('physical_drives', baselabel, response['Physical Drives']) + add_metric('ports', baselabel, response['HwCfg']['Backend Port Count']) + add_metric('scheduled_patrol_read', baselabel, + int('hrs' in response['Scheduled Tasks']['Patrol Read Reoccurrence'])) + + time_difference_seconds = -1 + system_time = parse(response['Basics'].get('Current System Date/time')) + controller_time = parse(response['Basics'].get('Current Controller Date/Time')) + if system_time and controller_time: + time_difference_seconds = abs(system_time - controller_time).seconds + add_metric('time_difference', baselabel, time_difference_seconds) + + for virtual_drive in response['VD LIST']: + vd_position = virtual_drive.get('DG/VD') + drive_group, volume_group = -1, -1 + if vd_position: + drive_group = vd_position.split('/')[0] + volume_group = vd_position.split('/')[1] + vd_baselabel = 'controller="{}",DG="{}",VG="{}"'.format(controller_index, drive_group, + volume_group) + vd_info_label = vd_baselabel + ',name="{}",cache="{}",type="{}"'.format( + virtual_drive.get('Name'), virtual_drive.get('Cache'), virtual_drive.get('TYPE')) + add_metric('vd_info', vd_info_label, 1) + add_metric('vd_status', vd_baselabel, int(VD_State[virtual_drive.get('State')])) + + if response['Physical Drives'] > 0: + data = json.loads(get_storcli_json('/cALL/eALL/sALL show all J')) + drive_info = data['Controllers'][controller_index]['Response Data'] + for physical_drive in response['PD LIST']: + enclosure = physical_drive.get('EID:Slt').split(':')[0] + slot = physical_drive.get('EID:Slt').split(':')[1] + + pd_baselabel = 'controller="{}",enclosure="{}",slot="{}"'.format( + controller_index, enclosure, slot) + pd_info_label = pd_baselabel + ',disk_id="{}",interface="{}",media="{}",model="{}"'.format( + physical_drive.get('DID'), physical_drive.get('Intf'), physical_drive.get('Med'), + physical_drive.get('Model').strip()) + + drive_identifier = 'Drive /c' + str(controller_index) + '/e' + str(enclosure) + '/s' + str( + slot) + try: + info = drive_info[drive_identifier + ' - Detailed Information'] + state = info[drive_identifier + ' State'] + attributes = info[drive_identifier + ' Device attributes'] + settings = info[drive_identifier + ' Policies/Settings'] + + add_metric('pd_shield_counter', pd_baselabel, state['Shield Counter']) + add_metric('pd_media_errors_total', pd_baselabel, state['Media Error Count']) + add_metric('pd_other_errors_total', pd_baselabel, state['Other Error Count']) + add_metric('pd_predictive_errors_total', pd_baselabel, + state['Predictive Failure Count']) + add_metric('pd_smart_alerted', pd_baselabel, + int(state['S.M.A.R.T alert flagged by drive'] == 'Yes')) + add_metric('pd_link_speed_gbps', pd_baselabel, attributes['Link Speed'].split('.')[0]) + add_metric('pd_device_speed_gbps', pd_baselabel, + attributes['Device Speed'].split('.')[0]) + add_metric('pd_commissioned_spare', pd_baselabel, + int(settings['Commissioned Spare'] == 'Yes')) + add_metric('pd_emergency_spare', pd_baselabel, + int(settings['Emergency Spare'] == 'Yes')) + pd_info_label += ',firmware="{}"'.format(attributes['Firmware Revision']) + except KeyError: + pass + add_metric('pd_info', pd_info_label, 1) + + +def add_metric(name, labels, value): + global metric_list + metric_list[name].append({ + 'labels': labels, + 'value': value, + }) + + +def print_all_metrics(metrics): + for metric, measurements in metrics.items(): + print('# HELP {}{} MegaRAID {}'.format(metric_prefix, metric, metric.replace('_', ' '))) + print('# TYPE {}{} gauge'.format(metric_prefix, metric)) + for measurement in measurements: + print('{}{}{} {}'.format(metric_prefix, metric, '{' + measurement['labels'] + '}', + measurement['value'])) + + +def get_storcli_json(storcli_args): """Get storcli output in JSON format.""" + # Check if storcli is installed and executable + if not (os.path.isfile(storcli_path) and os.access(storcli_path, os.X_OK)): + SystemExit(1) + storcli_cmd = shlex.split(storcli_path + ' ' + storcli_args) + proc = subprocess.Popen( + storcli_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output_json = proc.communicate()[0] - # Check if storcli is installed - if os.path.isfile(storcli_path) and os.access(storcli_path, os.X_OK): - storcli_cmd = [storcli_path, 'show', 'all', 'J'] - proc = subprocess.Popen(storcli_cmd, shell=False, - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - output_json = proc.communicate()[0] - else: - # Create an empty dummy-JSON where storcli not installed. - dummy_json = {"Controllers":[{ - "Command Status": {"Status Code": 0, "Status": "Success", - "Description": "None"}, - "Response Data": {"Number of Controllers": 0}}]} - output_json = json.dumps(dummy_json) + return output_json.decode("utf-8") - return output_json if __name__ == "__main__": - PARSER = argparse.ArgumentParser(description=DESCRIPTION, - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - PARSER.add_argument('--storcli_path', - default='/opt/MegaRAID/storcli/storcli64', - help='path to StorCLi binary') - PARSER.add_argument('--version', - action='version', - version='%(prog)s {}'.format(VERSION)) + PARSER = argparse.ArgumentParser( + description=DESCRIPTION, formatter_class=argparse.ArgumentDefaultsHelpFormatter) + PARSER.add_argument( + '--storcli_path', default='/opt/MegaRAID/storcli/storcli64', help='path to StorCLi binary') + PARSER.add_argument('--version', action='version', version='%(prog)s {}'.format(VERSION)) ARGS = PARSER.parse_args() main(ARGS)