diff --git a/CHANGELOG.md b/CHANGELOG.md index 3264fd36..21599c0b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,11 +4,15 @@ * The netdev collector CLI argument `--collector.netdev.ignored-devices` was renamed to `--collector.netdev.device-blacklist` in order to conform with the systemd collector. #1279 * The label named `state` on `node_systemd_service_restart_total` metrics was changed to `name` to better describe the metric. #1393 - +* Refactoring of the mdadm collector changes several metrics + - `node_md_disks_active` is removed + - `node_md_disks` now has a `state` label for "active", "failed", "spare" disks. + - `node_md_is_active` is replaced by `node_md_state` with a state set of "active", "inactive", "recovering", "resync". ### Changes * [CHANGE] Add `--collector.netdev.device-whitelist`. #1279 +* [CHANGE] Refactor mdadm collector #1403 * [FEATURE] * [ENHANCEMENT] * [BUGFIX] Renamed label `state` to `name` on `node_systemd_service_restart_total`. #1393 diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index c4bbd6fe..b7cf4372 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt @@ -1078,6 +1078,7 @@ node_load5 0.37 node_md_blocks{device="md0"} 248896 node_md_blocks{device="md00"} 4.186624e+06 node_md_blocks{device="md10"} 3.14159265e+08 +node_md_blocks{device="md101"} 322560 node_md_blocks{device="md11"} 4.190208e+06 node_md_blocks{device="md12"} 3.886394368e+09 node_md_blocks{device="md120"} 2.095104e+06 @@ -1095,7 +1096,8 @@ node_md_blocks{device="md9"} 523968 node_md_blocks_synced{device="md0"} 248896 node_md_blocks_synced{device="md00"} 4.186624e+06 node_md_blocks_synced{device="md10"} 3.14159265e+08 -node_md_blocks_synced{device="md11"} 4.190208e+06 +node_md_blocks_synced{device="md101"} 322560 +node_md_blocks_synced{device="md11"} 0 node_md_blocks_synced{device="md12"} 3.886394368e+09 node_md_blocks_synced{device="md120"} 2.095104e+06 
node_md_blocks_synced{device="md126"} 1.855870976e+09 @@ -1106,58 +1108,141 @@ node_md_blocks_synced{device="md4"} 4.883648e+06 node_md_blocks_synced{device="md6"} 1.6775552e+07 node_md_blocks_synced{device="md7"} 7.813735424e+09 node_md_blocks_synced{device="md8"} 1.6775552e+07 -node_md_blocks_synced{device="md9"} 523968 -# HELP node_md_disks Total number of disks of device. +node_md_blocks_synced{device="md9"} 0 +# HELP node_md_disks Number of active/failed/spare disks of device. # TYPE node_md_disks gauge -node_md_disks{device="md0"} 2 -node_md_disks{device="md00"} 1 -node_md_disks{device="md10"} 2 -node_md_disks{device="md11"} 2 -node_md_disks{device="md12"} 2 -node_md_disks{device="md120"} 2 -node_md_disks{device="md126"} 2 -node_md_disks{device="md127"} 2 -node_md_disks{device="md219"} 3 -node_md_disks{device="md3"} 8 -node_md_disks{device="md4"} 2 -node_md_disks{device="md6"} 2 -node_md_disks{device="md7"} 4 -node_md_disks{device="md8"} 2 -node_md_disks{device="md9"} 4 -# HELP node_md_disks_active Number of active disks of device. -# TYPE node_md_disks_active gauge -node_md_disks_active{device="md0"} 2 -node_md_disks_active{device="md00"} 1 -node_md_disks_active{device="md10"} 2 -node_md_disks_active{device="md11"} 2 -node_md_disks_active{device="md12"} 2 -node_md_disks_active{device="md120"} 2 -node_md_disks_active{device="md126"} 2 -node_md_disks_active{device="md127"} 2 -node_md_disks_active{device="md219"} 0 -node_md_disks_active{device="md3"} 8 -node_md_disks_active{device="md4"} 0 -node_md_disks_active{device="md6"} 1 -node_md_disks_active{device="md7"} 3 -node_md_disks_active{device="md8"} 2 -node_md_disks_active{device="md9"} 4 -# HELP node_md_is_active Indicator whether the md-device is active or not. 
-# TYPE node_md_is_active gauge -node_md_is_active{device="md0"} 1 -node_md_is_active{device="md00"} 1 -node_md_is_active{device="md10"} 1 -node_md_is_active{device="md11"} 1 -node_md_is_active{device="md12"} 1 -node_md_is_active{device="md120"} 1 -node_md_is_active{device="md126"} 1 -node_md_is_active{device="md127"} 1 -node_md_is_active{device="md219"} 0 -node_md_is_active{device="md3"} 1 -node_md_is_active{device="md4"} 0 -node_md_is_active{device="md6"} 1 -node_md_is_active{device="md7"} 1 -node_md_is_active{device="md8"} 1 -node_md_is_active{device="md9"} 1 +node_md_disks{device="md0",state="active"} 2 +node_md_disks{device="md0",state="failed"} 0 +node_md_disks{device="md0",state="spare"} 0 +node_md_disks{device="md00",state="active"} 1 +node_md_disks{device="md00",state="failed"} 0 +node_md_disks{device="md00",state="spare"} 0 +node_md_disks{device="md10",state="active"} 2 +node_md_disks{device="md10",state="failed"} 0 +node_md_disks{device="md10",state="spare"} 0 +node_md_disks{device="md101",state="active"} 3 +node_md_disks{device="md101",state="failed"} 0 +node_md_disks{device="md101",state="spare"} 0 +node_md_disks{device="md11",state="active"} 2 +node_md_disks{device="md11",state="failed"} 1 +node_md_disks{device="md11",state="spare"} 2 +node_md_disks{device="md12",state="active"} 2 +node_md_disks{device="md12",state="failed"} 0 +node_md_disks{device="md12",state="spare"} 0 +node_md_disks{device="md120",state="active"} 2 +node_md_disks{device="md120",state="failed"} 0 +node_md_disks{device="md120",state="spare"} 0 +node_md_disks{device="md126",state="active"} 2 +node_md_disks{device="md126",state="failed"} 0 +node_md_disks{device="md126",state="spare"} 0 +node_md_disks{device="md127",state="active"} 2 +node_md_disks{device="md127",state="failed"} 0 +node_md_disks{device="md127",state="spare"} 0 +node_md_disks{device="md219",state="active"} 0 +node_md_disks{device="md219",state="failed"} 0 +node_md_disks{device="md219",state="spare"} 3 
+node_md_disks{device="md3",state="active"} 8 +node_md_disks{device="md3",state="failed"} 0 +node_md_disks{device="md3",state="spare"} 2 +node_md_disks{device="md4",state="active"} 0 +node_md_disks{device="md4",state="failed"} 1 +node_md_disks{device="md4",state="spare"} 1 +node_md_disks{device="md6",state="active"} 1 +node_md_disks{device="md6",state="failed"} 1 +node_md_disks{device="md6",state="spare"} 1 +node_md_disks{device="md7",state="active"} 3 +node_md_disks{device="md7",state="failed"} 1 +node_md_disks{device="md7",state="spare"} 0 +node_md_disks{device="md8",state="active"} 2 +node_md_disks{device="md8",state="failed"} 0 +node_md_disks{device="md8",state="spare"} 2 +node_md_disks{device="md9",state="active"} 4 +node_md_disks{device="md9",state="failed"} 2 +node_md_disks{device="md9",state="spare"} 1 +# HELP node_md_disks_required Total number of disks of device. +# TYPE node_md_disks_required gauge +node_md_disks_required{device="md0"} 2 +node_md_disks_required{device="md00"} 1 +node_md_disks_required{device="md10"} 2 +node_md_disks_required{device="md101"} 3 +node_md_disks_required{device="md11"} 2 +node_md_disks_required{device="md12"} 2 +node_md_disks_required{device="md120"} 2 +node_md_disks_required{device="md126"} 2 +node_md_disks_required{device="md127"} 2 +node_md_disks_required{device="md219"} 0 +node_md_disks_required{device="md3"} 8 +node_md_disks_required{device="md4"} 0 +node_md_disks_required{device="md6"} 2 +node_md_disks_required{device="md7"} 4 +node_md_disks_required{device="md8"} 2 +node_md_disks_required{device="md9"} 4 +# HELP node_md_state Indicates the state of md-device. 
+# TYPE node_md_state gauge +node_md_state{device="md0",state="active"} 1 +node_md_state{device="md0",state="inactive"} 0 +node_md_state{device="md0",state="recovering"} 0 +node_md_state{device="md0",state="resync"} 0 +node_md_state{device="md00",state="active"} 1 +node_md_state{device="md00",state="inactive"} 0 +node_md_state{device="md00",state="recovering"} 0 +node_md_state{device="md00",state="resync"} 0 +node_md_state{device="md10",state="active"} 1 +node_md_state{device="md10",state="inactive"} 0 +node_md_state{device="md10",state="recovering"} 0 +node_md_state{device="md10",state="resync"} 0 +node_md_state{device="md101",state="active"} 1 +node_md_state{device="md101",state="inactive"} 0 +node_md_state{device="md101",state="recovering"} 0 +node_md_state{device="md101",state="resync"} 0 +node_md_state{device="md11",state="active"} 0 +node_md_state{device="md11",state="inactive"} 0 +node_md_state{device="md11",state="recovering"} 0 +node_md_state{device="md11",state="resync"} 1 +node_md_state{device="md12",state="active"} 1 +node_md_state{device="md12",state="inactive"} 0 +node_md_state{device="md12",state="recovering"} 0 +node_md_state{device="md12",state="resync"} 0 +node_md_state{device="md120",state="active"} 1 +node_md_state{device="md120",state="inactive"} 0 +node_md_state{device="md120",state="recovering"} 0 +node_md_state{device="md120",state="resync"} 0 +node_md_state{device="md126",state="active"} 1 +node_md_state{device="md126",state="inactive"} 0 +node_md_state{device="md126",state="recovering"} 0 +node_md_state{device="md126",state="resync"} 0 +node_md_state{device="md127",state="active"} 1 +node_md_state{device="md127",state="inactive"} 0 +node_md_state{device="md127",state="recovering"} 0 +node_md_state{device="md127",state="resync"} 0 +node_md_state{device="md219",state="active"} 0 +node_md_state{device="md219",state="inactive"} 1 +node_md_state{device="md219",state="recovering"} 0 +node_md_state{device="md219",state="resync"} 0 
+node_md_state{device="md3",state="active"} 1 +node_md_state{device="md3",state="inactive"} 0 +node_md_state{device="md3",state="recovering"} 0 +node_md_state{device="md3",state="resync"} 0 +node_md_state{device="md4",state="active"} 0 +node_md_state{device="md4",state="inactive"} 1 +node_md_state{device="md4",state="recovering"} 0 +node_md_state{device="md4",state="resync"} 0 +node_md_state{device="md6",state="active"} 0 +node_md_state{device="md6",state="inactive"} 0 +node_md_state{device="md6",state="recovering"} 1 +node_md_state{device="md6",state="resync"} 0 +node_md_state{device="md7",state="active"} 1 +node_md_state{device="md7",state="inactive"} 0 +node_md_state{device="md7",state="recovering"} 0 +node_md_state{device="md7",state="resync"} 0 +node_md_state{device="md8",state="active"} 0 +node_md_state{device="md8",state="inactive"} 0 +node_md_state{device="md8",state="recovering"} 0 +node_md_state{device="md8",state="resync"} 1 +node_md_state{device="md9",state="active"} 0 +node_md_state{device="md9",state="inactive"} 0 +node_md_state{device="md9",state="recovering"} 0 +node_md_state{device="md9",state="resync"} 1 # HELP node_memory_Active_anon_bytes Memory information field Active_anon_bytes. 
# TYPE node_memory_Active_anon_bytes gauge node_memory_Active_anon_bytes 2.068484096e+09 diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index baef2248..3f5292c4 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -1078,6 +1078,7 @@ node_load5 0.37 node_md_blocks{device="md0"} 248896 node_md_blocks{device="md00"} 4.186624e+06 node_md_blocks{device="md10"} 3.14159265e+08 +node_md_blocks{device="md101"} 322560 node_md_blocks{device="md11"} 4.190208e+06 node_md_blocks{device="md12"} 3.886394368e+09 node_md_blocks{device="md120"} 2.095104e+06 @@ -1095,7 +1096,8 @@ node_md_blocks{device="md9"} 523968 node_md_blocks_synced{device="md0"} 248896 node_md_blocks_synced{device="md00"} 4.186624e+06 node_md_blocks_synced{device="md10"} 3.14159265e+08 -node_md_blocks_synced{device="md11"} 4.190208e+06 +node_md_blocks_synced{device="md101"} 322560 +node_md_blocks_synced{device="md11"} 0 node_md_blocks_synced{device="md12"} 3.886394368e+09 node_md_blocks_synced{device="md120"} 2.095104e+06 node_md_blocks_synced{device="md126"} 1.855870976e+09 @@ -1106,58 +1108,141 @@ node_md_blocks_synced{device="md4"} 4.883648e+06 node_md_blocks_synced{device="md6"} 1.6775552e+07 node_md_blocks_synced{device="md7"} 7.813735424e+09 node_md_blocks_synced{device="md8"} 1.6775552e+07 -node_md_blocks_synced{device="md9"} 523968 -# HELP node_md_disks Total number of disks of device. +node_md_blocks_synced{device="md9"} 0 +# HELP node_md_disks Number of active/failed/spare disks of device. 
# TYPE node_md_disks gauge -node_md_disks{device="md0"} 2 -node_md_disks{device="md00"} 1 -node_md_disks{device="md10"} 2 -node_md_disks{device="md11"} 2 -node_md_disks{device="md12"} 2 -node_md_disks{device="md120"} 2 -node_md_disks{device="md126"} 2 -node_md_disks{device="md127"} 2 -node_md_disks{device="md219"} 3 -node_md_disks{device="md3"} 8 -node_md_disks{device="md4"} 2 -node_md_disks{device="md6"} 2 -node_md_disks{device="md7"} 4 -node_md_disks{device="md8"} 2 -node_md_disks{device="md9"} 4 -# HELP node_md_disks_active Number of active disks of device. -# TYPE node_md_disks_active gauge -node_md_disks_active{device="md0"} 2 -node_md_disks_active{device="md00"} 1 -node_md_disks_active{device="md10"} 2 -node_md_disks_active{device="md11"} 2 -node_md_disks_active{device="md12"} 2 -node_md_disks_active{device="md120"} 2 -node_md_disks_active{device="md126"} 2 -node_md_disks_active{device="md127"} 2 -node_md_disks_active{device="md219"} 0 -node_md_disks_active{device="md3"} 8 -node_md_disks_active{device="md4"} 0 -node_md_disks_active{device="md6"} 1 -node_md_disks_active{device="md7"} 3 -node_md_disks_active{device="md8"} 2 -node_md_disks_active{device="md9"} 4 -# HELP node_md_is_active Indicator whether the md-device is active or not. 
-# TYPE node_md_is_active gauge -node_md_is_active{device="md0"} 1 -node_md_is_active{device="md00"} 1 -node_md_is_active{device="md10"} 1 -node_md_is_active{device="md11"} 1 -node_md_is_active{device="md12"} 1 -node_md_is_active{device="md120"} 1 -node_md_is_active{device="md126"} 1 -node_md_is_active{device="md127"} 1 -node_md_is_active{device="md219"} 0 -node_md_is_active{device="md3"} 1 -node_md_is_active{device="md4"} 0 -node_md_is_active{device="md6"} 1 -node_md_is_active{device="md7"} 1 -node_md_is_active{device="md8"} 1 -node_md_is_active{device="md9"} 1 +node_md_disks{device="md0",state="active"} 2 +node_md_disks{device="md0",state="failed"} 0 +node_md_disks{device="md0",state="spare"} 0 +node_md_disks{device="md00",state="active"} 1 +node_md_disks{device="md00",state="failed"} 0 +node_md_disks{device="md00",state="spare"} 0 +node_md_disks{device="md10",state="active"} 2 +node_md_disks{device="md10",state="failed"} 0 +node_md_disks{device="md10",state="spare"} 0 +node_md_disks{device="md101",state="active"} 3 +node_md_disks{device="md101",state="failed"} 0 +node_md_disks{device="md101",state="spare"} 0 +node_md_disks{device="md11",state="active"} 2 +node_md_disks{device="md11",state="failed"} 1 +node_md_disks{device="md11",state="spare"} 2 +node_md_disks{device="md12",state="active"} 2 +node_md_disks{device="md12",state="failed"} 0 +node_md_disks{device="md12",state="spare"} 0 +node_md_disks{device="md120",state="active"} 2 +node_md_disks{device="md120",state="failed"} 0 +node_md_disks{device="md120",state="spare"} 0 +node_md_disks{device="md126",state="active"} 2 +node_md_disks{device="md126",state="failed"} 0 +node_md_disks{device="md126",state="spare"} 0 +node_md_disks{device="md127",state="active"} 2 +node_md_disks{device="md127",state="failed"} 0 +node_md_disks{device="md127",state="spare"} 0 +node_md_disks{device="md219",state="active"} 0 +node_md_disks{device="md219",state="failed"} 0 +node_md_disks{device="md219",state="spare"} 3 
+node_md_disks{device="md3",state="active"} 8 +node_md_disks{device="md3",state="failed"} 0 +node_md_disks{device="md3",state="spare"} 2 +node_md_disks{device="md4",state="active"} 0 +node_md_disks{device="md4",state="failed"} 1 +node_md_disks{device="md4",state="spare"} 1 +node_md_disks{device="md6",state="active"} 1 +node_md_disks{device="md6",state="failed"} 1 +node_md_disks{device="md6",state="spare"} 1 +node_md_disks{device="md7",state="active"} 3 +node_md_disks{device="md7",state="failed"} 1 +node_md_disks{device="md7",state="spare"} 0 +node_md_disks{device="md8",state="active"} 2 +node_md_disks{device="md8",state="failed"} 0 +node_md_disks{device="md8",state="spare"} 2 +node_md_disks{device="md9",state="active"} 4 +node_md_disks{device="md9",state="failed"} 2 +node_md_disks{device="md9",state="spare"} 1 +# HELP node_md_disks_required Total number of disks of device. +# TYPE node_md_disks_required gauge +node_md_disks_required{device="md0"} 2 +node_md_disks_required{device="md00"} 1 +node_md_disks_required{device="md10"} 2 +node_md_disks_required{device="md101"} 3 +node_md_disks_required{device="md11"} 2 +node_md_disks_required{device="md12"} 2 +node_md_disks_required{device="md120"} 2 +node_md_disks_required{device="md126"} 2 +node_md_disks_required{device="md127"} 2 +node_md_disks_required{device="md219"} 0 +node_md_disks_required{device="md3"} 8 +node_md_disks_required{device="md4"} 0 +node_md_disks_required{device="md6"} 2 +node_md_disks_required{device="md7"} 4 +node_md_disks_required{device="md8"} 2 +node_md_disks_required{device="md9"} 4 +# HELP node_md_state Indicates the state of md-device. 
+# TYPE node_md_state gauge +node_md_state{device="md0",state="active"} 1 +node_md_state{device="md0",state="inactive"} 0 +node_md_state{device="md0",state="recovering"} 0 +node_md_state{device="md0",state="resync"} 0 +node_md_state{device="md00",state="active"} 1 +node_md_state{device="md00",state="inactive"} 0 +node_md_state{device="md00",state="recovering"} 0 +node_md_state{device="md00",state="resync"} 0 +node_md_state{device="md10",state="active"} 1 +node_md_state{device="md10",state="inactive"} 0 +node_md_state{device="md10",state="recovering"} 0 +node_md_state{device="md10",state="resync"} 0 +node_md_state{device="md101",state="active"} 1 +node_md_state{device="md101",state="inactive"} 0 +node_md_state{device="md101",state="recovering"} 0 +node_md_state{device="md101",state="resync"} 0 +node_md_state{device="md11",state="active"} 0 +node_md_state{device="md11",state="inactive"} 0 +node_md_state{device="md11",state="recovering"} 0 +node_md_state{device="md11",state="resync"} 1 +node_md_state{device="md12",state="active"} 1 +node_md_state{device="md12",state="inactive"} 0 +node_md_state{device="md12",state="recovering"} 0 +node_md_state{device="md12",state="resync"} 0 +node_md_state{device="md120",state="active"} 1 +node_md_state{device="md120",state="inactive"} 0 +node_md_state{device="md120",state="recovering"} 0 +node_md_state{device="md120",state="resync"} 0 +node_md_state{device="md126",state="active"} 1 +node_md_state{device="md126",state="inactive"} 0 +node_md_state{device="md126",state="recovering"} 0 +node_md_state{device="md126",state="resync"} 0 +node_md_state{device="md127",state="active"} 1 +node_md_state{device="md127",state="inactive"} 0 +node_md_state{device="md127",state="recovering"} 0 +node_md_state{device="md127",state="resync"} 0 +node_md_state{device="md219",state="active"} 0 +node_md_state{device="md219",state="inactive"} 1 +node_md_state{device="md219",state="recovering"} 0 +node_md_state{device="md219",state="resync"} 0 
+node_md_state{device="md3",state="active"} 1 +node_md_state{device="md3",state="inactive"} 0 +node_md_state{device="md3",state="recovering"} 0 +node_md_state{device="md3",state="resync"} 0 +node_md_state{device="md4",state="active"} 0 +node_md_state{device="md4",state="inactive"} 1 +node_md_state{device="md4",state="recovering"} 0 +node_md_state{device="md4",state="resync"} 0 +node_md_state{device="md6",state="active"} 0 +node_md_state{device="md6",state="inactive"} 0 +node_md_state{device="md6",state="recovering"} 1 +node_md_state{device="md6",state="resync"} 0 +node_md_state{device="md7",state="active"} 1 +node_md_state{device="md7",state="inactive"} 0 +node_md_state{device="md7",state="recovering"} 0 +node_md_state{device="md7",state="resync"} 0 +node_md_state{device="md8",state="active"} 0 +node_md_state{device="md8",state="inactive"} 0 +node_md_state{device="md8",state="recovering"} 0 +node_md_state{device="md8",state="resync"} 1 +node_md_state{device="md9",state="active"} 0 +node_md_state{device="md9",state="inactive"} 0 +node_md_state{device="md9",state="recovering"} 0 +node_md_state{device="md9",state="resync"} 1 # HELP node_memory_Active_anon_bytes Memory information field Active_anon_bytes. 
# TYPE node_memory_Active_anon_bytes gauge node_memory_Active_anon_bytes 2.068484096e+09 diff --git a/collector/fixtures/proc/mdstat b/collector/fixtures/proc/mdstat index 19934ae1..a135435f 100644 --- a/collector/fixtures/proc/mdstat +++ b/collector/fixtures/proc/mdstat @@ -1,5 +1,6 @@ Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5] [raid4] [raid10] -md3 : active raid6 sda1[8] sdh1[7] sdg1[6] sdf1[5] sde1[11] sdd1[3] sdc1[10] sdb1[9] + +md3 : active raid6 sda1[8] sdh1[7] sdg1[6] sdf1[5] sde1[11] sdd1[3] sdc1[10] sdb1[9] sdd1[10](S) sdd2[11](S) 5853468288 blocks super 1.2 level 6, 64k chunk, algorithm 2 [8/8] [UUUUUUUU] md127 : active raid1 sdi2[0] sdj2[1] @@ -8,31 +9,31 @@ md127 : active raid1 sdi2[0] sdj2[1] md0 : active raid1 sdi1[0] sdj1[1] 248896 blocks [2/2] [UU] -md4 : inactive raid1 sda3[0] sdb3[1] +md4 : inactive raid1 sda3[0](F) sdb3[1](S) 4883648 blocks [2/2] [UU] -md6 : active raid1 sdb2[2] sda2[0] +md6 : active raid1 sdb2[2](F) sdc[1](S) sda2[0] 195310144 blocks [2/1] [U_] [=>...................] recovery = 8.5% (16775552/195310144) finish=17.0min speed=259783K/sec -md8 : active raid1 sdb1[1] sda1[0] - 195310144 blocks [2/2] [UU] - [=>...................] resync = 8.5% (16775552/195310144) finish=17.0min speed=259783K/sec +md8 : active raid1 sdb1[1] sda1[0] sdc[2](S) sde[3](S) + 195310144 blocks [2/2] [UU] + [=>...................] 
resync = 8.5% (16775552/195310144) finish=17.0min speed=259783K/sec -md7 : active raid6 sdb1[0] sde1[3] sdd1[2] sdc1[1] +md7 : active raid6 sdb1[0] sde1[3] sdd1[2] sdc1[1](F) 7813735424 blocks super 1.2 level 6, 512k chunk, algorithm 2 [4/3] [U_UU] bitmap: 0/30 pages [0KB], 65536KB chunk -md9 : active raid1 sdc2[2] sdd2[3] sdb2[1] sda2[0] +md9 : active raid1 sdc2[2] sdd2[3] sdb2[1] sda2[0] sde[4](F) sdf[5](F) sdg[6](S) 523968 blocks super 1.2 [4/4] [UUUU] - resync=DELAYED + resync=DELAYED md10 : active raid0 sda1[0] sdb1[1] - 314159265 blocks 64k chunks + 314159265 blocks 64k chunks -md11 : active (auto-read-only) raid1 sdb2[0] sdc2[1] +md11 : active (auto-read-only) raid1 sdb2[0] sdc2[1] sdc3[2](F) hda[4](S) ssdc2[3](S) 4190208 blocks super 1.2 [2/2] [UU] - resync=PENDING + resync=PENDING md12 : active raid0 sdc2[0] sdd2[1] 3886394368 blocks super 1.2 512k chunks @@ -41,12 +42,15 @@ md126 : active raid0 sdb[1] sdc[0] 1855870976 blocks super external:/md127/0 128k chunks md219 : inactive sdb[2](S) sdc[1](S) sda[0](S) - 7932 blocks super external:imsm + 7932 blocks super external:imsm md00 : active raid0 xvdb[0] 4186624 blocks super 1.2 256k chunks md120 : active linear sda1[1] sdb1[0] - 2095104 blocks super 1.2 0k rounding + 2095104 blocks super 1.2 0k rounding + +md101 : active (read-only) raid0 sdb[2] sdd[1] sdc[0] + 322560 blocks super 1.2 512k chunks unused devices: diff --git a/collector/fixtures/proc/mdstat_invalid b/collector/fixtures/proc/mdstat_invalid deleted file mode 100644 index c60c77be..00000000 --- a/collector/fixtures/proc/mdstat_invalid +++ /dev/null @@ -1,5 +0,0 @@ -Personalities : [invalid] -md3 : invalid - 314159265 blocks 64k chunks - -unused devices: diff --git a/collector/mdadm_linux.go b/collector/mdadm_linux.go index b3a0dc3a..ce130f50 100644 --- a/collector/mdadm_linux.go +++ b/collector/mdadm_linux.go @@ -17,229 +17,59 @@ package collector import ( "fmt" - "io/ioutil" "os" - "regexp" - "strconv" - "strings" 
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/log" + "github.com/prometheus/procfs" ) -var ( - statuslineRE = regexp.MustCompile(`(\d+) blocks .*\[(\d+)/(\d+)\] \[[U_]+\]`) - raid0lineRE = regexp.MustCompile(`(\d+) blocks .*\d+k (chunks|rounding)`) - buildlineRE = regexp.MustCompile(`\((\d+)/\d+\)`) - unknownPersonalityLineRE = regexp.MustCompile(`(\d+) blocks (.*)`) - raidPersonalityRE = regexp.MustCompile(`^(linear|raid[0-9]+)$`) -) - -type mdStatus struct { - name string - active bool - disksActive int64 - disksTotal int64 - blocksTotal int64 - blocksSynced int64 -} - type mdadmCollector struct{} func init() { registerCollector("mdadm", defaultEnabled, NewMdadmCollector) } -func evalStatusline(statusline string) (active, total, size int64, err error) { - matches := statuslineRE.FindStringSubmatch(statusline) - - // +1 to make it more obvious that the whole string containing the info is also returned as matches[0]. - if len(matches) < 3+1 { - return 0, 0, 0, fmt.Errorf("too few matches found in statusline: %s", statusline) - } else if len(matches) > 3+1 { - return 0, 0, 0, fmt.Errorf("too many matches found in statusline: %s", statusline) - } - - size, err = strconv.ParseInt(matches[1], 10, 64) - if err != nil { - return 0, 0, 0, fmt.Errorf("%s in statusline: %s", err, statusline) - } - - total, err = strconv.ParseInt(matches[2], 10, 64) - if err != nil { - return 0, 0, 0, fmt.Errorf("%s in statusline: %s", err, statusline) - } - active, err = strconv.ParseInt(matches[3], 10, 64) - if err != nil { - return 0, 0, 0, fmt.Errorf("%s in statusline: %s", err, statusline) - } - - return active, total, size, nil -} - -func evalRaid0line(statusline string) (size int64, err error) { - matches := raid0lineRE.FindStringSubmatch(statusline) - - if len(matches) < 2 { - return 0, fmt.Errorf("invalid raid0 status line: %s", statusline) - } - - size, err = strconv.ParseInt(matches[1], 10, 64) - if err != nil { - return 0, fmt.Errorf("%s in 
statusline: %s", err, statusline) - } - - return size, nil -} - -func evalUnknownPersonalitylineRE(statusline string) (size int64, err error) { - matches := unknownPersonalityLineRE.FindStringSubmatch(statusline) - - if len(matches) != 2+1 { - return 0, fmt.Errorf("invalid unknown personality status line: %s", statusline) - } - - size, err = strconv.ParseInt(matches[1], 10, 64) - if err != nil { - return 0, fmt.Errorf("%s in statusline: %s", err, statusline) - } - - return size, nil -} - -// evalBuildline gets the size that has already been synced out of the sync-line. -func evalBuildline(buildline string) (int64, error) { - matches := buildlineRE.FindStringSubmatch(buildline) - - // +1 to make it more obvious that the whole string containing the info is also returned as matches[0]. - if len(matches) < 1+1 { - return 0, fmt.Errorf("too few matches found in buildline: %s", buildline) - } - - if len(matches) > 1+1 { - return 0, fmt.Errorf("too many matches found in buildline: %s", buildline) - } - - syncedSize, err := strconv.ParseInt(matches[1], 10, 64) - - if err != nil { - return 0, fmt.Errorf("%s in buildline: %s", err, buildline) - } - - return syncedSize, nil -} - -// parseMdstat parses an mdstat-file and returns a struct with the relevant infos. -func parseMdstat(mdStatusFilePath string) ([]mdStatus, error) { - content, err := ioutil.ReadFile(mdStatusFilePath) - if err != nil { - return []mdStatus{}, err - } - - lines := strings.Split(string(content), "\n") - // Each md has at least the deviceline, statusline and one empty line afterwards - // so we will have probably something of the order len(lines)/3 devices - // so we use that for preallocation. - mdStates := make([]mdStatus, 0, len(lines)/3) - for i, line := range lines { - if line == "" { - continue - } - if line[0] == ' ' || line[0] == '\t' { - // Lines starting with white space are not the beginning of a md-section. 
- continue - } - if strings.HasPrefix(line, "Personalities") || strings.HasPrefix(line, "unused") { - // These lines contain general information. - continue - } - - mainLine := strings.Split(line, " ") - if len(mainLine) < 4 { - return mdStates, fmt.Errorf("error parsing mdline: %s", line) - } - md := mdStatus{ - name: mainLine[0], - active: mainLine[2] == "active", - } - - if len(lines) <= i+3 { - return mdStates, fmt.Errorf("error parsing mdstat: entry for %s has fewer lines than expected", md.name) - } - - personality := "" - for _, possiblePersonality := range mainLine[3:] { - if raidPersonalityRE.MatchString(possiblePersonality) { - personality = possiblePersonality - break - } - } - switch { - case personality == "raid0" || personality == "linear": - md.disksActive = int64(len(mainLine) - 4) // Get the number of devices from the main line. - md.disksTotal = md.disksActive // Raid0 active and total is always the same if active. - md.blocksTotal, err = evalRaid0line(lines[i+1]) - case raidPersonalityRE.MatchString(personality): - md.disksActive, md.disksTotal, md.blocksTotal, err = evalStatusline(lines[i+1]) - default: - log.Debugf("Personality unknown: %s", mainLine) - md.disksTotal = int64(len(mainLine) - 3) - md.blocksTotal, err = evalUnknownPersonalitylineRE(lines[i+1]) - } - if err != nil { - return mdStates, fmt.Errorf("error parsing mdstat: %s", err) - } - - if !md.active { - md.disksActive = 0 - } - - syncLine := lines[i+2] - if strings.Contains(syncLine, "bitmap") { - syncLine = lines[i+3] - } - - // If device is syncing at the moment, get the number of currently synced bytes, - // otherwise that number equals the size of the device. 
- if strings.Contains(syncLine, "recovery") || - strings.Contains(syncLine, "resync") && - !strings.Contains(syncLine, "\tresync=") { - md.blocksSynced, err = evalBuildline(syncLine) - if err != nil { - return mdStates, fmt.Errorf("error parsing mdstat: %s", err) - } - } else { - md.blocksSynced = md.blocksTotal - } - - mdStates = append(mdStates, md) - } - - return mdStates, nil -} - // NewMdadmCollector returns a new Collector exposing raid statistics. func NewMdadmCollector() (Collector, error) { return &mdadmCollector{}, nil } var ( - isActiveDesc = prometheus.NewDesc( - prometheus.BuildFQName(namespace, "md", "is_active"), - "Indicator whether the md-device is active or not.", + activeDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "md", "state"), + "Indicates the state of md-device.", []string{"device"}, - nil, + prometheus.Labels{"state": "active"}, + ) + inActiveDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "md", "state"), + "Indicates the state of md-device.", + []string{"device"}, + prometheus.Labels{"state": "inactive"}, + ) + recoveringDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "md", "state"), + "Indicates the state of md-device.", + []string{"device"}, + prometheus.Labels{"state": "recovering"}, + ) + resyncDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "md", "state"), + "Indicates the state of md-device.", + []string{"device"}, + prometheus.Labels{"state": "resync"}, ) - disksActiveDesc = prometheus.NewDesc( - prometheus.BuildFQName(namespace, "md", "disks_active"), - "Number of active disks of device.", - []string{"device"}, + disksDesc = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "md", "disks"), + "Number of active/failed/spare disks of device.", + []string{"device", "state"}, nil, ) disksTotalDesc = prometheus.NewDesc( - prometheus.BuildFQName(namespace, "md", "disks"), + prometheus.BuildFQName(namespace, "md", "disks_required"), "Total number of disks of device.", 
[]string{"device"}, nil, @@ -261,52 +91,96 @@ var ( ) func (c *mdadmCollector) Update(ch chan<- prometheus.Metric) error { - statusfile := procFilePath("mdstat") - mdstate, err := parseMdstat(statusfile) + fs, errFs := procfs.NewFS(*procPath) + + if errFs != nil { + return fmt.Errorf("failed to open procfs: %v", errFs) + } + + mdStats, err := fs.MDStat() + if err != nil { if os.IsNotExist(err) { - log.Debugf("Not collecting mdstat, file does not exist: %s", statusfile) + log.Debugf("Not collecting mdstat, file does not exist: %s", *procPath) return nil } + return fmt.Errorf("error parsing mdstatus: %s", err) } - for _, mds := range mdstate { - log.Debugf("collecting metrics for device %s", mds.name) + for _, mdStat := range mdStats { + log.Debugf("collecting metrics for device %s", mdStat.Name) + + stateVals := make(map[string]float64) + stateVals[mdStat.ActivityState] = 1 - var active float64 - if mds.active { - active = 1 - } - ch <- prometheus.MustNewConstMetric( - isActiveDesc, - prometheus.GaugeValue, - active, - mds.name, - ) - ch <- prometheus.MustNewConstMetric( - disksActiveDesc, - prometheus.GaugeValue, - float64(mds.disksActive), - mds.name, - ) ch <- prometheus.MustNewConstMetric( disksTotalDesc, prometheus.GaugeValue, - float64(mds.disksTotal), - mds.name, + float64(mdStat.DisksTotal), + mdStat.Name, ) + + ch <- prometheus.MustNewConstMetric( + disksDesc, + prometheus.GaugeValue, + float64(mdStat.DisksActive), + mdStat.Name, + "active", + ) + ch <- prometheus.MustNewConstMetric( + disksDesc, + prometheus.GaugeValue, + float64(mdStat.DisksFailed), + mdStat.Name, + "failed", + ) + ch <- prometheus.MustNewConstMetric( + disksDesc, + prometheus.GaugeValue, + float64(mdStat.DisksSpare), + mdStat.Name, + "spare", + ) + ch <- prometheus.MustNewConstMetric( + activeDesc, + prometheus.GaugeValue, + stateVals["active"], + mdStat.Name, + ) + + ch <- prometheus.MustNewConstMetric( + inActiveDesc, + prometheus.GaugeValue, + stateVals["inactive"], + mdStat.Name, + ) 
+ + ch <- prometheus.MustNewConstMetric( + recoveringDesc, + prometheus.GaugeValue, + stateVals["recovering"], + mdStat.Name, + ) + + ch <- prometheus.MustNewConstMetric( + resyncDesc, + prometheus.GaugeValue, + stateVals["resyncing"], + mdStat.Name, + ) + ch <- prometheus.MustNewConstMetric( blocksTotalDesc, prometheus.GaugeValue, - float64(mds.blocksTotal), - mds.name, + float64(mdStat.BlocksTotal), + mdStat.Name, ) ch <- prometheus.MustNewConstMetric( blocksSyncedDesc, prometheus.GaugeValue, - float64(mds.blocksSynced), - mds.name, + float64(mdStat.BlocksSynced), + mdStat.Name, ) } diff --git a/collector/mdadm_linux_test.go b/collector/mdadm_linux_test.go deleted file mode 100644 index 1ca1f3ba..00000000 --- a/collector/mdadm_linux_test.go +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2015 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package collector - -import ( - "testing" -) - -func TestMdadm(t *testing.T) { - mdStates, err := parseMdstat("fixtures/proc/mdstat") - if err != nil { - t.Fatalf("parsing of reference-file failed entirely: %s", err) - } - - refs := map[string]mdStatus{ - // { "<name>", <active>, <disksActive>, <disksTotal>, <blocksTotal>, <blocksSynced> } - "md3": {"md3", true, 8, 8, 5853468288, 5853468288}, - "md127": {"md127", true, 2, 2, 312319552, 312319552}, - "md0": {"md0", true, 2, 2, 248896, 248896}, - "md4": {"md4", false, 0, 2, 4883648, 4883648}, - "md6": {"md6", true, 1, 2, 195310144, 16775552}, - "md8": {"md8", true, 2, 2, 195310144, 16775552}, - "md7": {"md7", true, 3, 4, 7813735424, 7813735424}, - "md9": {"md9", true, 4, 4, 523968, 523968}, - "md10": {"md10", true, 2, 2, 314159265, 314159265}, - "md11": {"md11", true, 2, 2, 4190208, 4190208}, - "md12": {"md12", true, 2, 2, 3886394368, 3886394368}, - "md120": {"md120", true, 2, 2, 2095104, 2095104}, - "md126": {"md126", true, 2, 2, 1855870976, 1855870976}, - "md219": {"md219", false, 0, 3, 7932, 7932}, - "md00": {"md00", true, 1, 1, 4186624, 4186624}, - } - - for _, md := range mdStates { - if md != refs[md.name] { - t.Errorf("failed parsing md-device %s correctly: want %v, got %v", md.name, refs[md.name], md) - } - } - - if len(mdStates) != len(refs) { - t.Errorf("expected number of parsed md-device to be %d, but was %d", len(refs), len(mdStates)) - } -} - -func TestInvalidMdstat(t *testing.T) { - _, err := parseMdstat("fixtures/proc/mdstat_invalid") - if err == nil { - t.Fatalf("parsing of invalid reference file did not find any errors") - } -}