diff --git a/etc/kayobe/ansible/deployment/get-nvme-drives.yml b/etc/kayobe/ansible/deployment/get-nvme-drives.yml
index 1d2404d805..9671c0c208 100644
--- a/etc/kayobe/ansible/deployment/get-nvme-drives.yml
+++ b/etc/kayobe/ansible/deployment/get-nvme-drives.yml
@@ -3,16 +3,33 @@
   hosts: overcloud
   gather_facts: no
   tasks:
-    - name: Retrieve NVMe device information
-      ansible.builtin.command: "nvme list -o json"
-      register: nvme_list
+    - name: Scan for NVMe devices with smartctl
+      ansible.builtin.command: "smartctl --scan -j"
+      register: smartctl_scan
       changed_when: false
       become: true
 
+    # json_query returns null when the scan output has no matching entries;
+    # default([], true) also replaces null, plain default([]) only replaces undefined.
+    - name: Extract NVMe device paths
+      ansible.builtin.set_fact:
+        nvme_devices: "{{ smartctl_scan.stdout | from_json | json_query(\"devices[?type=='nvme'].info_name\") | default([], true) }}"
+      changed_when: false
+
+    - name: Retrieve NVMe device information via smartctl
+      ansible.builtin.command: "smartctl -i -j {{ item }}"
+      register: smartctl_info
+      loop: "{{ nvme_devices }}"
+      loop_control:
+        label: "{{ item }}"
+      changed_when: false
+      become: true
+      when: nvme_devices | length > 0
+
     - name: Parse NVMe device model names
       ansible.builtin.set_fact:
-        nvme_models: "{{ nvme_models | default([]) + [item.ModelNumber] }}"
-      loop: "{{ nvme_list.stdout | from_json | json_query('Devices[].{ModelNumber: ModelNumber}') }}"
+        nvme_models: "{{ nvme_models | default([]) + [item.model_name] }}"
+      loop: "{{ smartctl_info.results | default([]) | map(attribute='stdout') | map('from_json') | selectattr('model_name', 'defined') | list }}"
       changed_when: false
 
     - name: Set unique NVMe models as host facts
diff --git a/etc/kayobe/ansible/deployment/smartmon-tools.yml b/etc/kayobe/ansible/deployment/smartmon-tools.yml
index 6ec35ae0be..354aae31ae 100644
--- a/etc/kayobe/ansible/deployment/smartmon-tools.yml
+++ b/etc/kayobe/ansible/deployment/smartmon-tools.yml
@@ -2,11 +2,10 @@
 - name: Install and set up SMART monitoring tools
   hosts: overcloud
   tasks:
-    - name: Ensure smartmontools, jq, nvme-cli and cron/cronie are installed
+    - name: Ensure smartmontools, jq, and cron/cronie are installed
       ansible.builtin.package:
         name:
           - smartmontools
-          - nvme-cli
           - jq
           - "{{ 'cron' if ansible_facts['distribution'] == 'Ubuntu' else 'cronie' }}"
         state: present
@@ -54,7 +53,7 @@
         enabled: true
       become: true
 
-    - name: Copy smartmon.py and nvmemon.sh from scripts folder
+    - name: Copy smartmon.py from scripts folder
       ansible.builtin.copy:
         src: "{{ lookup('env', 'KAYOBE_CONFIG_PATH') }}/ansible/scripts/{{ item }}"
         dest: /usr/local/bin/{{ item }}
@@ -63,7 +62,6 @@
         mode: "0700"
       loop:
         - smartmon.py
-        - nvmemon.sh
       become: true
 
     - name: Set PATH Variable for cron
@@ -84,17 +82,6 @@
           mv -f /var/lib/docker/volumes/textfile/_data/smartmon.prom.temp /var/lib/docker/volumes/textfile/_data/smartmon.prom
       become: true
 
-    - name: Schedule cronjob to run nvmemon.sh every 5 minutes and save output to file
-      ansible.builtin.cron:
-        name: SMART metrics for drive monitoring using nvmemon.sh
-        user: root
-        minute: "*/5"
-        job: >-
-          umask 0022 && /usr/local/bin/nvmemon.sh >
-          /var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp &&
-          mv -f /var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp /var/lib/docker/volumes/textfile/_data/nvmemon.prom
-      become: true
-
     - name: Remove old cronjobs if present
       ansible.builtin.cron:
         name: SMART metrics for drive monitoring using {{ item }}
@@ -104,11 +91,15 @@
       loop:
         - smartmon
         - nvmemon
+        - nvmemon.sh
 
-    - name: Remove old smartmon.sh if present
+    - name: Remove old monitoring scripts if present
       ansible.builtin.file:
-        path: /usr/local/bin/smartmon.sh
+        path: /usr/local/bin/{{ item }}
         state: absent
+      loop:
+        - smartmon.sh
+        - nvmemon.sh
       become: true
 
     - name: Gather NVMe drives and generate dwpd ratings
diff --git a/etc/kayobe/ansible/scripts/generate_fixtures.py b/etc/kayobe/ansible/scripts/generate_fixtures.py
index 5f8f7cc641..589d3acee6 100644
--- a/etc/kayobe/ansible/scripts/generate_fixtures.py
+++ b/etc/kayobe/ansible/scripts/generate_fixtures.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env
python3 import json import re +import subprocess from pySMART import DeviceList SMARTMON_ATTRS = { @@ -63,6 +64,8 @@ "critical_comp_time", } +SMARTCTL_PATH = "/usr/sbin/smartctl" + DISK_INFO = { "name", "interface", @@ -84,6 +87,17 @@ def camel_to_snake(name): """ return re.sub(r'(? - -# Check if we are root -if [ "$EUID" -ne 0 ]; then - echo "${0##*/}: Please run as root!" >&2 - exit 1 -fi - -# Check if programs are installed -if ! command -v nvme >/dev/null 2>&1; then - echo "${0##*/}: nvme is not installed. Aborting." >&2 - exit 1 -fi - -if ! command -v jq >/dev/null 2>&1; then - echo "${0##*/}: jq is required but not installed. Aborting." >&2 - exit 1 -fi - -# Path to the DWPD ratings JSON file -dwpd_file="/opt/kayobe/etc/monitoring/dwpd_ratings.json" - -declare -A rated_dwpd - -load_dwpd_ratings() { - if [[ -f "$dwpd_file" ]]; then - # Read the JSON; if it fails, default to empty array - dwpd_json="$(cat "$dwpd_file" 2>/dev/null | jq '.' || echo '[]')" - - # We iterate over each array element in dwpd_json - while IFS= read -r line; do - key="$(echo "$line" | jq -r '.model_name')" - value="$(echo "$line" | jq -r '.rated_dwpd')" - - # Clean up trailing whitespace - key="${key%%[[:space:]]*}" - value="${value%%[[:space:]]*}" - - # If we have a valid key, store it in the dictionary - if [[ -n "$key" && "$key" != "null" ]]; then - rated_dwpd["$key"]="$value" - fi - done < <(echo "$dwpd_json" | jq -c '.[]') - else - echo "Warning: DWPD ratings file not found at '$dwpd_file'. Defaulting to rated_dwpd=1." 
>&2 - fi -} - - -load_dwpd_ratings - -output_format_awk="$( - cat <<'OUTPUTAWK' -BEGIN { v = "" } -v != $1 { - print "# HELP nvme_" $1 " SMART metric " $1; - if ($1 ~ /_total$/) - print "# TYPE nvme_" $1 " counter"; - else - print "# TYPE nvme_" $1 " gauge"; - v = $1 -} -{print "nvme_" $0} -OUTPUTAWK -)" - -format_output() { - sort | awk -F'{' "${output_format_awk}" -} - -# Get the nvme-cli version -nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')" -echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output - -# Get devices (DevicePath, PhysicalSize and ModelNumber) -device_info="$(nvme list -o json | jq -c '.Devices[] | {DevicePath, PhysicalSize, ModelNumber, SerialNumber}')" - -# Convert device_info to an array -device_info_array=() -while IFS= read -r line; do - device_info_array+=("$line") -done <<< "$device_info" - -# Loop through the NVMe devices -for device_data in "${device_info_array[@]}"; do - device="$(echo "$device_data" | jq -r '.DevicePath')" - json_check="$(nvme smart-log -o json "${device}")" - disk="${device##*/}" - model_name="$(echo "$device_data" | jq -r '.ModelNumber')" - serial_number="$(echo "$device_data" | jq -r '.SerialNumber')" - - physical_size="$(echo "$device_data" | jq -r '.PhysicalSize')" - echo "physical_size_bytes{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${physical_size}" - - # The temperature value in JSON is in Kelvin, we want Celsius - value_temperature="$(echo "$json_check" | jq '.temperature - 273')" - echo "temperature_celsius{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_temperature}" - - # Get the rated DWPD from the dictionary or default to 1 if not found - value_rated_dwpd="${rated_dwpd[$model_name]:-1}" - echo "rated_dwpd{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_rated_dwpd}" - - value_available_spare="$(echo "$json_check" | jq '.avail_spare / 100')" - echo 
"available_spare_ratio{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_available_spare}" - - value_available_spare_threshold="$(echo "$json_check" | jq '.spare_thresh / 100')" - echo "available_spare_threshold_ratio{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_available_spare_threshold}" - - value_percentage_used="$(echo "$json_check" | jq '.percent_used / 100')" - echo "percentage_used_ratio{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_percentage_used}" - - value_critical_warning="$(echo "$json_check" | jq '.critical_warning')" - echo "critical_warning_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_critical_warning}" - - value_media_errors="$(echo "$json_check" | jq '.media_errors')" - echo "media_errors_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_media_errors}" - - value_num_err_log_entries="$(echo "$json_check" | jq '.num_err_log_entries')" - echo "num_err_log_entries_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_num_err_log_entries}" - - value_power_cycles="$(echo "$json_check" | jq '.power_cycles')" - echo "power_cycles_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_power_cycles}" - - value_power_on_hours="$(echo "$json_check" | jq '.power_on_hours')" - echo "power_on_hours_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_power_on_hours}" - - value_controller_busy_time="$(echo "$json_check" | jq '.controller_busy_time')" - echo "controller_busy_time_seconds{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_controller_busy_time}" - - value_data_units_written="$(echo "$json_check" | jq '.data_units_written')" - echo 
"data_units_written_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_data_units_written}"
-
-  value_data_units_read="$(echo "$json_check" | jq '.data_units_read')"
-  echo "data_units_read_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_data_units_read}"
-
-  value_host_read_commands="$(echo "$json_check" | jq '.host_read_commands')"
-  echo "host_read_commands_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_host_read_commands}"
-
-  value_host_write_commands="$(echo "$json_check" | jq '.host_write_commands')"
-  echo "host_write_commands_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_host_write_commands}"
-done | format_output
diff --git a/etc/kayobe/ansible/scripts/smartmon.py b/etc/kayobe/ansible/scripts/smartmon.py
index 033ddbb94b..c14c717d29 100644
--- a/etc/kayobe/ansible/scripts/smartmon.py
+++ b/etc/kayobe/ansible/scripts/smartmon.py
@@ -69,8 +69,101 @@
     "num_err_log_entries",
     "warning_temp_time",
     "critical_comp_time",
+    "nvme_total_capacity",
+    "nvme_unallocated_capacity",
 }
 
+DATA_UNIT_BYTES = 512000  # NVMe data unit size (1000 * 512 bytes)
+BYTES_PER_TB = 10 ** 12
+DWPD_RATINGS_PATH = "/opt/kayobe/etc/monitoring/dwpd_ratings.json"
+DEFAULT_DWPD = 1.0
+
+
+def canonical_device_path(name):
+    """
+    Ensure device name is an absolute /dev path for smartctl invocations.
+    """
+    if not name:
+        return name
+    return name if name.startswith("/dev/") else f"/dev/{name}"
+
+def coerce_numeric(value):
+    """
+    Best effort conversion of various value types (including pySMART attribute objects)
+    into a float. Returns None when conversion is not possible.
+    """
+    if value is None:
+        return None
+    if isinstance(value, (int, float)):
+        return float(value)
+    for attr in ("value", "raw"):
+        try:
+            candidate = getattr(value, attr)
+        except AttributeError:
+            continue
+        if isinstance(candidate, (int, float)):
+            return float(candidate)
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def load_dwpd_ratings(path=DWPD_RATINGS_PATH):
+    """
+    Load rated DWPD values from JSON file.
+
+    The file is expected to contain either a list of objects with
+    'model_name' and 'rated_dwpd' keys, or a dictionary containing such a list.
+    """
+    mapping = {}
+
+    try:
+        with open(path, "r", encoding="utf-8") as fh:
+            data = json.load(fh)
+    except (OSError, json.JSONDecodeError):
+        # Missing (FileNotFoundError is an OSError), unreadable or malformed
+        # ratings file: return no ratings so lookups use DEFAULT_DWPD.
+        return mapping
+
+    if isinstance(data, dict):
+        if isinstance(data.get("stackhpc_dwpd_ratings"), list):
+            data_iterable = data["stackhpc_dwpd_ratings"]
+        elif isinstance(data.get("dwpd_values"), list):
+            data_iterable = data["dwpd_values"]
+        else:
+            data_iterable = []
+    elif isinstance(data, list):
+        data_iterable = data
+    else:
+        data_iterable = []
+
+    for entry in data_iterable:
+        if not isinstance(entry, dict):
+            continue
+        model_name = str(entry.get("model_name") or "").strip()  # JSON null must not become "None"
+        rated_value = coerce_numeric(entry.get("rated_dwpd"))
+        if not model_name:
+            continue
+        if rated_value is None:
+            continue
+        mapping[model_name.lower()] = rated_value
+
+    return mapping
+
+
+DWPD_RATINGS = load_dwpd_ratings()
+
+
+def get_rated_dwpd(model_name):
+    """
+    Look up DWPD rating for the given model name, defaulting to 1.0.
+    """
+    if not model_name:
+        return DEFAULT_DWPD
+    lookup_key = model_name.lower().strip()
+    return DWPD_RATINGS.get(lookup_key, DEFAULT_DWPD)
+
 def run_command(command, parse_json=False):
     """
     Helper to run a subprocess command and optionally parse JSON output.
@@ -80,6 +173,27 @@ def run_command(command, parse_json=False):
         return json.loads(result.stdout)
     return result.stdout.strip()
 
+
+def smartctl_json(disk_name, disk_type=None, *args):
+    """
+    Execute smartctl with JSON output enabled and return the parsed response.
+
+    Args:
+        disk_name (str): Device path (e.g. /dev/nvme0).
+        disk_type (str): Interface type passed to smartctl -d (optional).
+        *args: Additional smartctl arguments (e.g. "-x", "-n", "standby").
+
+    Returns:
+        dict: Parsed JSON response.
+    """
+    cmd = [SMARTCTL_PATH]
+    cmd.extend(args)
+    if disk_type and disk_type.lower() != "nvme":
+        cmd.extend(["-d", disk_type])
+    cmd.extend(["-j", disk_name])
+    return run_command(cmd, parse_json=True)
+
+
 def camel_to_snake(name):
     """
     Convert a CamelCase string to snake_case.
@@ -132,6 +246,12 @@ def parse_device_info(device):
         f'smartmon_device_smart_healthy{{{metric_labels}}} {float(is_healthy)}'
     )
 
+    # Explicitly collect top-level temperature if available (fixes SCSI temperature issue)
+    # pySMART exposes 'temperature' as a top-level property which we can use for SCSI,
+    # whereas device.if_attributes.temperature is often None for SCSI.
+    if device.temperature is not None:
+        metrics.append(f'smartmon_temperature{{{metric_labels}}} {float(device.temperature)}')
+
     return metrics
 
 def parse_if_attributes(device):
@@ -170,6 +290,139 @@ def parse_if_attributes(device):
 
     return metrics
 
+
+def collect_nvme_metrics(device):
+    """
+    Collect NVMe specific metrics using smartctl JSON output.
+
+    Args:
+        device (Device): pySMART Device instance.
+
+    Returns:
+        List[str]: Prometheus formatted metric strings.
+    """
+    metrics = []
+    disk_name = device.name
+    disk_type = device.interface or ""
+    serial_number = (device.serial or "").lower()
+    labels = f'disk="{disk_name}",serial_number="{serial_number}",type="{disk_type}"'
+    model_name = (device.model or "").strip()
+
+    attr_values = {}
+    if device.if_attributes:
+        for attr_name in dir(device.if_attributes):
+            if attr_name.startswith("_"):
+                continue
+            value = getattr(device.if_attributes, attr_name, None)
+            if callable(value):
+                continue
+            attr_values[camel_to_snake(attr_name)] = value
+
+    smartctl_target = canonical_device_path(disk_name)
+    try:
+        nvme_json = smartctl_json(smartctl_target, disk_type, "-x")
+    except Exception:
+        nvme_json = {}
+
+    if not model_name:
+        model_name = str(nvme_json.get("model_name", "")).strip()
+
+    health_log = nvme_json.get("nvme_smart_health_information_log")
+    if not isinstance(health_log, dict):
+        health_log = {}
+
+    user_capacity = nvme_json.get("user_capacity")
+    if not isinstance(user_capacity, dict):
+        user_capacity = {}
+
+    namespaces = nvme_json.get("nvme_namespaces")
+    if not isinstance(namespaces, list):
+        namespaces = []
+
+    def numeric_value(*sources):
+        for source in sources:
+            value = coerce_numeric(source)
+            if value is not None:
+                return value
+        return None
+
+    namespace_capacity = None
+    for namespace in namespaces:
+        if not isinstance(namespace, dict):
+            continue
+        namespace_capacity = numeric_value(
+            namespace.get("capacity", {}).get("bytes"),
+            namespace.get("size", {}).get("bytes"),
+            namespace.get("utilization", {}).get("bytes"),
+        )
+        if namespace_capacity is not None:
+            break
+
+    total_capacity = numeric_value(
+        attr_values.get("nvme_total_capacity"),
+        nvme_json.get("nvme_total_capacity"),
+        user_capacity.get("bytes"),
+        namespace_capacity,
+    )
+    if total_capacity is not None:
+        metrics.append(f"smartmon_nvme_total_capacity_bytes{{{labels}}} {total_capacity}")
+        metrics.append(f"smartmon_physical_size_bytes{{{labels}}} {total_capacity}")
+
+    rated_dwpd = get_rated_dwpd(model_name)
+    metrics.append(f"smartmon_nvme_rated_dwpd{{{labels}}} {rated_dwpd}")
+
+    unallocated_capacity = numeric_value(
+        attr_values.get("nvme_unallocated_capacity"),
+        nvme_json.get("nvme_unallocated_capacity"),
+    )
+    if unallocated_capacity is not None:
+        metrics.append(f"smartmon_nvme_unallocated_capacity_bytes{{{labels}}} {unallocated_capacity}")
+
+    data_units_read_attr = attr_values.get("data_units_read") is not None  # a None placeholder is not a value
+    data_units_read = numeric_value(
+        attr_values.get("data_units_read"),
+        health_log.get("data_units_read"),
+    )
+    if data_units_read is not None:
+        bytes_read = data_units_read * DATA_UNIT_BYTES
+        if not data_units_read_attr:
+            metrics.append(f"smartmon_data_units_read{{{labels}}} {data_units_read}")
+        metrics.append(f"smartmon_nvme_terabytes_read_total{{{labels}}} {bytes_read / BYTES_PER_TB}")
+
+    data_units_written_attr = attr_values.get("data_units_written") is not None  # a None placeholder is not a value
+    data_units_written = numeric_value(
+        attr_values.get("data_units_written"),
+        health_log.get("data_units_written"),
+    )
+    if data_units_written is not None:
+        bytes_written = data_units_written * DATA_UNIT_BYTES
+        if not data_units_written_attr:
+            metrics.append(f"smartmon_data_units_written{{{labels}}} {data_units_written}")
+        metrics.append(f"smartmon_nvme_terabytes_written_total{{{labels}}} {bytes_written / BYTES_PER_TB}")
+
+    # Collect additional NVMe health log metrics that might be missed by pySMART
+    # due to naming mismatches
+    nvme_health_metrics = [
+        "media_errors",
+        "num_err_log_entries",
+        "warning_temp_time",
+        "critical_comp_time",
+        "host_reads",
+        "host_writes",
+    ]
+
+    for key in nvme_health_metrics:
+        # Skip only when pySMART supplied an actual value; an attribute that is None falls through to smartctl
+        if attr_values.get(key) is not None:
+            continue
+
+        val = numeric_value(health_log.get(key))
+        if val is not None:
+            metrics.append(f"smartmon_{key}{{{labels}}} {val}")
+
+    return metrics
+
+
 def write_metrics_to_textfile(metrics, output_path=None):
     """
     Write metrics to a Prometheus textfile using
prometheus_client.
@@ -228,13 +481,15 @@ def main(output_path=None):
         disk_type = dev.interface or ""
         serial_number = (dev.serial or "").lower()
 
+        if not serial_number or not dev.assessment:
+            continue
+
         run_timestamp = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
         all_metrics.append(f'smartmon_smartctl_run{{disk="{disk_name}",type="{disk_type}"}} {run_timestamp}')
 
         active = 1
         try:
-            cmd = [SMARTCTL_PATH, "-n", "standby", "-d", disk_type, "-j", disk_name]
-            standby_json = run_command(cmd, parse_json=True)
+            standby_json = smartctl_json(canonical_device_path(disk_name), disk_type, "-n", "standby")
             if standby_json.get("power_mode", "") == "standby":
                 active = 0
         except json.JSONDecodeError:
@@ -250,6 +505,11 @@ def main(output_path=None):
 
         all_metrics.extend(parse_device_info(dev))
         all_metrics.extend(parse_if_attributes(dev))
+        disk_basename = os.path.basename(disk_name)
+        disk_type_normalized = (disk_type or "").lower()
+        is_nvme = disk_type_normalized == "nvme" or disk_basename.startswith("nvme")
+        if is_nvme:
+            all_metrics.extend(collect_nvme_metrics(dev))
 
     write_metrics_to_textfile(all_metrics, output_path)
 
diff --git a/etc/kayobe/ansible/scripts/test_smartmon.py b/etc/kayobe/ansible/scripts/test_smartmon.py
index 4749808a5c..2ebd7c16b3 100644
--- a/etc/kayobe/ansible/scripts/test_smartmon.py
+++ b/etc/kayobe/ansible/scripts/test_smartmon.py
@@ -1,11 +1,49 @@
 import glob
 import json
 import os
-import unittest
+import sys
 import tempfile
+import types
+import unittest
 import math
-from time import sleep
 
+CURRENT_DIR = os.path.dirname(__file__)
+if CURRENT_DIR not in sys.path:
+    sys.path.insert(0, CURRENT_DIR)
+
+prometheus_stub = types.ModuleType("prometheus_client")
+
+class DummyCollectorRegistry:
+    pass
+
+
+class DummyGauge:
+    def __init__(self, *args, **kwargs):
+        self._values = {}
+
+    def labels(self, *args, **kwargs):
+        return self
+
+    def set(self, value):
+        self._last_set = value
+
+
+prometheus_stub.CollectorRegistry = DummyCollectorRegistry
+prometheus_stub.Gauge = DummyGauge
+prometheus_stub.write_to_textfile = lambda *args, **kwargs: None
+sys.modules.setdefault("prometheus_client", prometheus_stub)
+
+pySMART_stub = types.ModuleType("pySMART")
+
+class DummyDeviceList:
+    def __init__(self, devices=None):
+        self.devices = devices or []
+
+pySMART_stub.DeviceList = DummyDeviceList
+sys.modules.setdefault("pySMART", pySMART_stub)
+
+# Import after stubbing so smartmon pulls in the lightweight stand-ins above.
+import smartmon
 from unittest.mock import patch, MagicMock
 from smartmon import (
     parse_device_info,
@@ -13,7 +51,10 @@
     main,
     SMARTMON_ATTRS,
     camel_to_snake,
-    write_metrics_to_textfile,
+    collect_nvme_metrics,
+    DATA_UNIT_BYTES,
+    BYTES_PER_TB,
+    DEFAULT_DWPD,
 )
 
 def load_json_fixture(filename):
@@ -28,14 +69,26 @@ def load_json_fixture(filename):
 class TestSmartMon(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        # Collect all *.json files from ./tests/
+        # Collect, in sorted order, all JSON fixtures that include both device metadata and the smartctl JSON payload.
         data_folder = os.path.join(os.path.dirname(__file__), "tests")
-        cls.fixture_files = glob.glob(os.path.join(data_folder, "*.json"))
+        cls.fixture_files = []
+        for path in sorted(glob.glob(os.path.join(data_folder, "*.json"))):
+            with open(path, "r", encoding="utf-8") as fh:
+                try:
+                    data = json.load(fh)
+                except json.JSONDecodeError:
+                    continue
+            if isinstance(data, dict) and "device_info" in data and "smartctl" in data:
+                cls.fixture_files.append(path)
+        if not cls.fixture_files:
+            raise unittest.SkipTest("No SMART fixtures found")
+        cls.primary_fixture = os.path.basename(cls.fixture_files[0])
 
     def create_mock_device_from_json(self, device_info, if_attributes=None):
         """
         Given a 'device_info' dict and optional 'if_attributes', build
-        a MagicMock that mimics a pySMART Device object.
+        a MagicMock that mimics a pySMART Device object so the code under test
+        sees the same shape it would on a live host.
         """
         device = MagicMock()
         device.name = device_info.get("name", "")
@@ -202,15 +255,81 @@ def test_parse_if_attributes(self):
             with self.subTest(fixture=fixture_name):
                 self._test_parse_if_attributes(fixture_name)
 
+    @patch("smartmon.smartctl_json")
+    @patch("smartmon.get_rated_dwpd")
+    def test_collect_nvme_metrics_includes_capacity_and_dwpd(self, mock_get_dwpd, mock_smartctl_json):
+        """
+        Ensure collect_nvme_metrics emits NVMe capacity, DWPD, and TB counters.
+        """
+        # Re-use the real fixture so the smartctl payload matches what the exporter
+        # will see in production.
+        data = load_json_fixture(self.primary_fixture)
+        device_info = data["device_info"]
+        smartctl_payload = data["smartctl"]
+        mock_smartctl_json.return_value = smartctl_payload
+        mock_get_dwpd.return_value = 2.5
+
+        device = self.create_mock_device_from_json(device_info, data.get("if_attributes"))
+
+        metrics = collect_nvme_metrics(device)
+        disk_name = device_info["name"]
+        serial_number = device_info["serial"].lower()
+        disk_type = device_info["interface"]
+        labels = f'disk="{disk_name}",serial_number="{serial_number}",type="{disk_type}"'
+
+        total_capacity = float(smartctl_payload["nvme_total_capacity"])
+        expected_capacity = f"smartmon_nvme_total_capacity_bytes{{{labels}}} {total_capacity}"
+        expected_physical = f"smartmon_physical_size_bytes{{{labels}}} {total_capacity}"
+        expected_unallocated = f"smartmon_nvme_unallocated_capacity_bytes{{{labels}}} {float(smartctl_payload.get('nvme_unallocated_capacity', 0))}"
+        expected_rated = f"smartmon_nvme_rated_dwpd{{{labels}}} 2.5"
+
+        self.assertIn(expected_capacity, metrics)
+        self.assertIn(expected_physical, metrics)
+        self.assertIn(expected_unallocated, metrics)
+        self.assertIn(expected_rated, metrics)
+
+        health_log = smartctl_payload["nvme_smart_health_information_log"]
+        expected_tb_read = (health_log["data_units_read"] * DATA_UNIT_BYTES) / BYTES_PER_TB
+        expected_tb_written = (health_log["data_units_written"] * DATA_UNIT_BYTES) / BYTES_PER_TB
+
+        self.assertTrue(
+            any(
+                line.startswith(f"smartmon_nvme_terabytes_read_total{{{labels}}}") and
+                math.isclose(float(line.split()[-1]), expected_tb_read, rel_tol=1e-9)
+                for line in metrics
+            ),
+            "Expected NVMe TB read metric not found or incorrect value.",
+        )
+        self.assertTrue(
+            any(
+                line.startswith(f"smartmon_nvme_terabytes_written_total{{{labels}}}") and
+                math.isclose(float(line.split()[-1]), expected_tb_written, rel_tol=1e-9)
+                for line in metrics
+            ),
+            "Expected NVMe TB written metric not found or incorrect value.",
+        )
+
+        self.assertFalse(
+            any(line.startswith(f"smartmon_data_units_read{{{labels}}}") for line in metrics),
+            "collect_nvme_metrics should not emit raw data_units_read when already provided by pySMART.",
+        )
+
     @patch("smartmon.run_command")
     @patch("smartmon.DeviceList")
-    @patch("smartmon.write_metrics_to_textfile", wraps=write_metrics_to_textfile)
+    @patch("smartmon.write_metrics_to_textfile")
     def test_main(self, mock_write_metrics, mock_devicelist_class, mock_run_cmd):
         """
         End-to-end test of main() for every JSON fixture in ./tests/.
         This ensures we can handle multiple disks (multiple fixture files).
         Checks metrics written to a temp file, and that write_metrics_to_textfile is called once.
         """
+        def fake_write_metrics(metrics, output_path):
+            # Instead of writing Prometheus text format we simply dump the raw metric
+            # strings so assertions can compare them without the collector library.
+            with open(output_path, "w", encoding="utf-8") as fh:
+                fh.write("\n".join(metrics))
+
+        mock_write_metrics.side_effect = fake_write_metrics
 
         # Patch run_command to return a version & "active" power_mode
         def run_command_side_effect(cmd, parse_json=False):
@@ -227,6 +346,7 @@ def run_command_side_effect(cmd, parse_json=False):
             with self.subTest(msg=f"Testing main() with {fixture_name}"):
                 mock_write_metrics.reset_mock()
                 data = load_json_fixture(fixture_name)
+                smartctl_payload = data.get("smartctl", {})
 
                 device_info = data["device_info"]
                 if_attrs = data.get("if_attributes", {})
@@ -238,41 +358,46 @@ def run_command_side_effect(cmd, parse_json=False):
                 mock_dev_list.devices = [device_mock]
                 mock_devicelist_class.return_value = mock_dev_list
 
-                with tempfile.NamedTemporaryFile(mode="r+", delete_on_close=False) as tmpfile:
-                    path= tmpfile.name
-                    main(output_path=path)
-                    tmpfile.close()
+                with patch("smartmon.smartctl_json", return_value=smartctl_payload), patch("smartmon.get_rated_dwpd", return_value=DEFAULT_DWPD):
+                    with tempfile.NamedTemporaryFile(mode="r+", delete=False) as tmpfile:
+                        path = tmpfile.name
+                        main(output_path=path)
+                        tmpfile.close()
 
-                # Ensure write_metrics_to_textfile was called once
                 self.assertEqual(mock_write_metrics.call_count, 1)
 
-                with open(path, "r") as f:
-                    # Read the metrics from the file
-                    metrics_lines = [line.strip() for line in f.readlines() if line.strip() and not line.startswith('#')]
-                    print(f"Metrics lines: {metrics_lines}")
-
-                    # Generate expected metrics using the parse functions
-                    expected_metrics = []
-                    expected_metrics.extend(parse_device_info(device_mock))
-                    expected_metrics.extend(parse_if_attributes(device_mock))
-
-                    # Check that all expected metrics are present in the file
-                    for expected in expected_metrics:
-                        exp_metric, exp_val_str = expected.rsplit(" ", 1)
-                        exp_val = float(exp_val_str)
-                        found = any(
-                            (exp_metric in line) and
-                            math.isclose(float(line.rsplit(" ", 1)[1]), exp_val)
-                            for line in metrics_lines
-                        )
-                        self.assertTrue(found, f"Expected metric '{expected}' not found")
-
-                    # Check that smartctl_version metric is present
-                    version_found = any(line.startswith("smartmon_smartctl_version{") for line in metrics_lines)
-                    self.assertTrue(version_found, "Expected 'smartmon_smartctl_version' metric not found in output file.")
-
-                    # Check that the output file is not empty
-                    self.assertTrue(metrics_lines, "Metrics output file is empty.")
+                with open(path, "r", encoding="utf-8") as f:
+                    metrics_lines = [
+                        line.strip()
+                        for line in f.readlines()
+                        if line.strip() and not line.startswith("#")
+                    ]
+
+                try:
+                    expected_metrics = []
+                    expected_metrics.extend(parse_device_info(device_mock))
+                    expected_metrics.extend(parse_if_attributes(device_mock))
+
+                    iface = (device_info.get("interface") or "").lower()
+                    if iface == "nvme" or os.path.basename(device_info.get("name", "")).startswith("nvme"):
+                        expected_metrics.extend(collect_nvme_metrics(device_mock))
+
+                    for expected in expected_metrics:
+                        exp_metric, exp_val_str = expected.rsplit(" ", 1)
+                        exp_val = float(exp_val_str)
+                        found = any(
+                            (exp_metric in line) and
+                            math.isclose(float(line.rsplit(" ", 1)[1]), exp_val, rel_tol=1e-9)
+                            for line in metrics_lines
+                        )
+                        self.assertTrue(found, f"Expected metric '{expected}' not found")
+
+                    version_found = any(line.startswith("smartmon_smartctl_version{") for line in metrics_lines)
+                    self.assertTrue(version_found, "Expected 'smartmon_smartctl_version' metric not found in output file.")
+                    self.assertTrue(metrics_lines, "Metrics output file is empty.")
+                finally:
+                    if os.path.exists(path):
+                        os.unlink(path)
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/etc/kayobe/ansible/scripts/tests/Dell_ENT_NVMe_CM6.json b/etc/kayobe/ansible/scripts/tests/Dell_ENT_NVMe_CM6.json
deleted file mode 100644
index d867910ae1..0000000000
--- a/etc/kayobe/ansible/scripts/tests/Dell_ENT_NVMe_CM6.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
-    "device_info": {
-        "assessment": "PASS",
-        "firmware": "2.1.8",
-        "interface":
"nvme", - "model": "Dell Ent NVMe CM6 RI 7.68TB", - "name": "nvme8", - "serial": "Y2Q0A0BPTCF8", - "smart_capable": true, - "smart_enabled": true, - "vendor": "Dell" - }, - "if_attributes": { - "availableSpare": 100, - "availableSpareThreshold": 10, - "controllerBusyTime": 2478, - "criticalWarning": 0, - "dataUnitsRead": 177817765, - "dataUnitsWritten": 127992843, - "percentageUsed": 1, - "powerCycles": 750, - "powerOnHours": 17427, - "temperature": 36, - "unsafeShutdowns": 37 - } -} diff --git a/etc/kayobe/ansible/scripts/tests/INTEL_SSDPE2KX010T8.json b/etc/kayobe/ansible/scripts/tests/INTEL_SSDPE2KX010T8.json new file mode 100644 index 0000000000..9ab1fd4334 --- /dev/null +++ b/etc/kayobe/ansible/scripts/tests/INTEL_SSDPE2KX010T8.json @@ -0,0 +1,133 @@ +{ + "device_info": { + "assessment": "PASS", + "firmware": "VDV10131", + "interface": "nvme", + "model": "INTEL SSDPE2KX010T8", + "name": "nvme1", + "serial": "BTLJ9183038W1P0FGN", + "smart_capable": true, + "smart_enabled": true, + "vendor": "INTEL" + }, + "if_attributes": { + "availableSpare": 100, + "availableSpareThreshold": 10, + "controllerBusyTime": 938, + "criticalWarning": 0, + "dataUnitsRead": 372386419, + "dataUnitsWritten": 172944945, + "percentageUsed": 8, + "powerCycles": 111, + "powerOnHours": 49618, + "temperature": 36, + "unsafeShutdowns": 95 + }, + "smartctl": { + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 2 + ], + "svn_revision": "5155", + "platform_info": "x86_64-linux-5.14.0-427.31.1.el9_4.x86_64", + "build_info": "(local build)", + "argv": [ + "smartctl", + "-x", + "-j", + "/dev/nvme1" + ], + "exit_status": 0 + }, + "device": { + "name": "/dev/nvme1", + "info_name": "/dev/nvme1", + "type": "nvme", + "protocol": "NVMe" + }, + "model_name": "INTEL SSDPE2KX010T8", + "serial_number": "BTLJ9183038W1P0FGN", + "firmware_version": "VDV10131", + "nvme_pci_vendor": { + "id": 32902, + "subsystem_id": 32902 + }, + "nvme_ieee_oui_identifier": 6083300, + 
"nvme_total_capacity": 1000204886016, + "nvme_unallocated_capacity": 0, + "nvme_controller_id": 0, + "nvme_version": { + "string": "1.2", + "value": 66048 + }, + "nvme_number_of_namespaces": 1, + "nvme_namespaces": [ + { + "id": 1, + "size": { + "blocks": 1953525168, + "bytes": 1000204886016 + }, + "capacity": { + "blocks": 1953525168, + "bytes": 1000204886016 + }, + "utilization": { + "blocks": 1953525168, + "bytes": 1000204886016 + }, + "formatted_lba_size": 512, + "eui64": { + "oui": 6083300, + "ext_id": 664757469440 + } + } + ], + "user_capacity": { + "blocks": 1953525168, + "bytes": 1000204886016 + }, + "logical_block_size": 512, + "local_time": { + "time_t": 1762460775, + "asctime": "Thu Nov 6 22:26:15 2025 SAST" + }, + "smart_status": { + "passed": true, + "nvme": { + "value": 0 + } + }, + "nvme_smart_health_information_log": { + "critical_warning": 0, + "temperature": 36, + "available_spare": 100, + "available_spare_threshold": 10, + "percentage_used": 8, + "data_units_read": 372386419, + "data_units_written": 172944945, + "host_reads": 2592438330, + "host_writes": 8381424983, + "controller_busy_time": 938, + "power_cycles": 111, + "power_on_hours": 49618, + "unsafe_shutdowns": 95, + "media_errors": 0, + "num_err_log_entries": 0, + "warning_temp_time": 0, + "critical_comp_time": 0 + }, + "temperature": { + "current": 36 + }, + "power_cycle_count": 111, + "power_on_time": { + "hours": 49618 + } + } +} diff --git a/etc/kayobe/ansible/scripts/tests/nvme.json b/etc/kayobe/ansible/scripts/tests/nvme.json deleted file mode 100644 index bbff19ec01..0000000000 --- a/etc/kayobe/ansible/scripts/tests/nvme.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "device_info": { - "name": "/dev/nvme0", - "interface": "nvme", - "vendor": "AcmeCorp", - "family": "Acme NVMe Family", - "model": "Acme NVMe 1TB", - "serial": "ABCD1234", - "firmware": "3.0.1", - "smart_capable": true, - "smart_enabled": true, - "assessment": "PASS" - }, - "if_attributes": { - "criticalWarning": 0, - 
"temperature": 36, - "availableSpare": 100, - "availableSpareThreshold": 10, - "percentageUsed": 0, - "dataUnitsRead": 117446405, - "dataUnitsWritten": 84630284, - "notInSmartmonAttrs": 999 - } -} diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json index b305502223..1e5b22e287 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json @@ -1,48 +1,5 @@ {% raw %} { - "__inputs": [ - { - "name": "datasource", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__elements": {}, - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "11.4.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "stat", - "name": "Stat", - "version": "" - }, - { - "type": "panel", - "id": "table", - "name": "Table", - "version": "" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], "annotations": { "list": [ { @@ -68,7 +25,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, + "id": 66, "links": [], "panels": [ { @@ -100,8 +57,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -132,7 +88,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -170,8 +126,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -206,7 +161,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -243,8 
+198,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -275,7 +229,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -317,8 +271,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -562,7 +515,7 @@ "showHeader": true, "sortBy": [] }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "$$hashKey": "object:40", @@ -704,8 +657,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -731,11 +683,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -789,8 +742,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -1024,7 +976,7 @@ } ] }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "$$hashKey": "object:40", @@ -1041,7 +993,7 @@ "displayValueWithAlias": "Never", "editorMode": "code", "exemplar": false, - "expr": "label_join(nvme_critical_warning_total{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "expr": "label_join(smartmon_critical_warning{instance=~\"$node\",type=\"nvme\"},\"unique_device\", \"-\", \"instance\", \"disk\")", "format": "table", "instant": true, "interval": "", @@ -1059,7 +1011,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "label_join(nvme_temperature_celsius{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "expr": "label_join(smartmon_temperature{instance=~\"$node\",type=\"nvme\"},\"unique_device\", \"-\", \"instance\", \"disk\")", "format": "table", "hide": false, "instant": true, @@ -1075,7 +1027,7 @@ }, "editorMode": "code", "exemplar": false, - 
"expr": "label_join(nvme_data_units_written_total{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\") * 512", + "expr": "label_join(smartmon_data_units_written{instance=~\"$node\",type=\"nvme\"},\"unique_device\", \"-\", \"instance\", \"disk\") * 512", "format": "table", "hide": false, "instant": true, @@ -1090,7 +1042,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "label_join(nvme_data_units_read_total{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\") * 512", + "expr": "label_join(smartmon_data_units_read{instance=~\"$node\",type=\"nvme\"},\"unique_device\", \"-\", \"instance\", \"disk\") * 512", "format": "table", "hide": false, "instant": true, @@ -1105,7 +1057,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "label_join(nvme_physical_size_bytes{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "expr": "label_join(smartmon_physical_size_bytes{instance=~\"$node\",type=\"nvme\"},\"unique_device\", \"-\", \"instance\", \"disk\")", "format": "table", "hide": false, "instant": true, @@ -1120,7 +1072,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "label_join(delta(nvme_data_units_written_total{instance=~\"$node\"}[24h])*512000,\"unique_device\", \"-\", \"instance\", \"device\")/label_join(nvme_physical_size_bytes{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "expr": "label_join(delta(smartmon_data_units_written{instance=~\"$node\",type=\"nvme\"}[24h])*512000,\"unique_device\", \"-\", \"instance\", \"disk\")/label_join(smartmon_physical_size_bytes{instance=~\"$node\",type=\"nvme\"},\"unique_device\", \"-\", \"instance\", \"disk\")", "format": "table", "hide": false, "instant": true, @@ -1135,7 +1087,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "label_join(nvme_rated_dwpd{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "expr": 
"label_join(smartmon_nvme_rated_dwpd{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"disk\")", "format": "table", "hide": false, "instant": true, @@ -1176,6 +1128,12 @@ "device 5": true, "device 6": true, "device 7": true, + "disk 2": true, + "disk 3": true, + "disk 4": true, + "disk 5": true, + "disk 6": true, + "disk 7": true, "instance 1": false, "instance 2": true, "instance 3": true, @@ -1209,17 +1167,24 @@ "serial_number 5": true, "serial_number 6": true, "serial_number 7": true, + "type 1": true, + "type 2": true, + "type 3": true, + "type 4": true, + "type 5": true, + "type 6": true, + "type 7": true, "unique_device": true }, "includeByName": {}, "indexByName": { - "Time 1": 11, - "Time 2": 15, - "Time 3": 23, - "Time 4": 27, - "Time 5": 32, - "Time 6": 38, - "Time 7": 53, + "Time 1": 10, + "Time 2": 14, + "Time 3": 19, + "Time 4": 21, + "Time 5": 24, + "Time 6": 28, + "Time 7": 36, "Value #Capacity": 6, "Value #DWPD": 8, "Value #Health": 2, @@ -1227,53 +1192,46 @@ "Value #TBR": 5, "Value #TBW": 4, "Value #Temp": 3, - "__name__ 1": 12, - "__name__ 2": 16, - "__name__ 3": 37, - "__name__ 4": 54, - "device 1": 1, - "device 2": 21, - "device 3": 24, - "device 4": 28, - "device 5": 33, - "device 6": 39, - "device 7": 55, - "instance 1": 0, - "instance 2": 17, - "instance 3": 14, - "instance 4": 29, - "instance 5": 34, - "instance 6": 40, - "instance 7": 56, - "job 1": 13, - "job 2": 18, - "job 3": 25, - "job 4": 30, - "job 5": 35, - "job 6": 41, - "job 7": 57, - "model 1": 9, - "model 2": 43, - "model 3": 45, - "model 4": 47, - "model 5": 49, - "model 6": 51, - "model 7": 58, - "original_device 1": 20, - "original_device 2": 22, - "original_device 3": 26, - "original_device 4": 31, - "original_device 5": 36, - "original_device 6": 42, - "original_device 7": 59, - "serial_number 1": 10, - "serial_number 2": 44, - "serial_number 3": 46, - "serial_number 4": 48, - "serial_number 5": 50, - "serial_number 6": 52, - "serial_number 7": 60, - 
"unique_device": 19 + "__name__ 1": 11, + "__name__ 2": 15, + "__name__ 3": 27, + "__name__ 4": 37, + "disk 1": 0, + "disk 2": 42, + "disk 3": 44, + "disk 4": 46, + "disk 5": 48, + "disk 6": 50, + "disk 7": 52, + "instance 1": 1, + "instance 2": 16, + "instance 3": 13, + "instance 4": 22, + "instance 5": 25, + "instance 6": 29, + "instance 7": 38, + "job 1": 12, + "job 2": 17, + "job 3": 20, + "job 4": 23, + "job 5": 26, + "job 6": 30, + "job 7": 39, + "serial_number 1": 9, + "serial_number 2": 31, + "serial_number 3": 32, + "serial_number 4": 33, + "serial_number 5": 34, + "serial_number 6": 35, + "serial_number 7": 40, + "type 1": 41, + "type 2": 43, + "type 3": 45, + "type 4": 47, + "type 5": 49, + "type 6": 51, + "type 7": 53, + "unique_device": 18 }, "renameByName": { "Time 1": "", @@ -1284,6 +1242,7 @@ "Value #TBW": "TBW", "__name__ 1": "", "device 1": "Device", + "disk 1": "Device", "instance 1": "Hostname", "model 1": "Model Name", "serial_number 1": "Serial Number" @@ -1343,8 +1302,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -1366,11 +1324,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -1378,8 +1337,8 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "delta(nvme_data_units_written_total{instance=~\"$node\"}[24h])*512000 / nvme_physical_size_bytes{instance=~\"$node\"}", - "legendFormat": "{{instance}} - {{device}}", + "expr": "delta(smartmon_data_units_written{instance=~\"$node\"}[24h])*512000 / smartmon_physical_size_bytes{instance=~\"$node\"}", + "legendFormat": "{{instance}} - {{disk}}", "range": true, "refId": "A" } @@ -1435,8 +1394,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1462,11 +1420,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, 
"mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -1475,10 +1434,10 @@ }, "editorMode": "code", "exemplar": false, - "expr": "avg_over_time(nvme_temperature_celsius{instance=~\"$node\"}[1h]) ", + "expr": "avg_over_time(smartmon_temperature{instance=~\"$node\"}[1h]) ", "instant": false, "interval": "", - "legendFormat": "{{instance}} - {{device}}", + "legendFormat": "{{instance}} - {{disk}}", "range": true, "refId": "A" } @@ -1487,8 +1446,9 @@ "type": "timeseries" } ], + "preload": false, "refresh": false, - "schemaVersion": 40, + "schemaVersion": 41, "tags": [], "templating": { "list": [ @@ -1543,7 +1503,6 @@ "timezone": "", "title": "Hardware Overview", "uid": "TCN51Y25P", - "version": 10, - "weekStart": "" + "version": 5 } {% endraw %} diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/nvme.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/nvme.json index 1669b02a06..a450ec2a44 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/nvme.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/nvme.json @@ -19,7 +19,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, - "id": 17197, + "id": 85, "links": [], "panels": [ { @@ -35,8 +35,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -68,7 +67,7 @@ "textMode": "name", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -76,9 +75,9 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "nvme_data_units_written_total{serial_number=~\"$serial_number\"}", + "expr": "smartmon_data_units_written{serial_number=~\"$serial_number\"}", "instant": true, - "legendFormat": "{{instance}} - {{device}} - {{serial_number}}", + "legendFormat": "{{instance}} - {{disk}} - {{serial_number}}", "refId": "A" } ], @@ -110,8 +109,7 @@ "mode": "absolute", "steps": 
[ { - "color": "green", - "value": null + "color": "green" } ] }, @@ -143,14 +141,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_physical_size_bytes{serial_number=\"$serial_number\"}", + "expr": "smartmon_physical_size_bytes{serial_number=\"$serial_number\"}", "legendFormat": "Physical Size", "refId": "A" } @@ -172,8 +170,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "yellow", @@ -211,14 +208,14 @@ "showThresholdMarkers": true, "sizing": "auto" }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_temperature_celsius{serial_number=\"$serial_number\"}", + "expr": "smartmon_temperature{serial_number=\"$serial_number\"}", "legendFormat": "Temperature", "refId": "A" } @@ -238,8 +235,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -270,14 +266,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_rated_dwpd{serial_number=\"$serial_number\"}", + "expr": "smartmon_nvme_rated_dwpd{serial_number=\"$serial_number\"}", "legendFormat": "Rated DWPD", "refId": "A" } @@ -297,8 +293,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "orange", @@ -333,14 +328,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_power_on_hours_total{serial_number=\"$serial_number\"}", + "expr": 
"smartmon_power_on_hours{serial_number=\"$serial_number\"}", "legendFormat": "Power Hours", "refId": "A" } @@ -360,8 +355,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] } @@ -392,14 +386,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_power_cycles_total{serial_number=\"$serial_number\"}", + "expr": "smartmon_power_cycles{serial_number=\"$serial_number\"}", "legendFormat": "Power Cycles", "refId": "A" } @@ -435,8 +429,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -471,14 +464,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_critical_warning_total{serial_number=\"$serial_number\"}", + "expr": "smartmon_critical_warning{serial_number=\"$serial_number\"}", "legendFormat": "Critical Warnings", "refId": "A" } @@ -500,8 +493,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "orange", @@ -539,15 +531,17 @@ "showThresholdMarkers": true, "sizing": "auto" }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_percentage_used_ratio{serial_number=\"$serial_number\"} * 100", + "editorMode": "code", + "expr": "smartmon_percentage_used{serial_number=\"$serial_number\"}", "legendFormat": "Percentage Used", + "range": true, "refId": "A" } ], @@ -566,8 +560,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -602,14 +595,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + 
"pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_num_err_log_entries_total{serial_number=\"$serial_number\"}", + "expr": "smartmon_num_err_log_entries{serial_number=\"$serial_number\"}", "legendFormat": "Error Log Entries", "refId": "A" } @@ -629,8 +622,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -665,14 +657,14 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_media_errors_total{serial_number=\"$serial_number\"}", + "expr": "smartmon_media_errors{serial_number=\"$serial_number\"}", "legendFormat": "Media Errors", "refId": "A" } @@ -694,8 +686,7 @@ "mode": "absolute", "steps": [ { - "color": "red", - "value": null + "color": "red" }, { "color": "yellow", @@ -733,15 +724,17 @@ "showThresholdMarkers": true, "sizing": "auto" }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_available_spare_ratio{serial_number=\"$serial_number\"} * 100", + "editorMode": "code", + "expr": "smartmon_available_spare{serial_number=\"$serial_number\"}", "legendFormat": "Available Spare", + "range": true, "refId": "A" } ], @@ -765,8 +758,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] }, @@ -798,15 +790,17 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "expr": "nvme_available_spare_threshold_ratio{serial_number=\"$serial_number\"} * 100", + "editorMode": "code", + "expr": "smartmon_available_spare_threshold{serial_number=\"$serial_number\"}", "legendFormat": "Spare 
Threshold", + "range": true, "refId": "A" } ], @@ -874,8 +868,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -902,11 +895,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -914,7 +908,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(nvme_data_units_read_total{serial_number=\"$serial_number\"}[5m])*512000", + "expr": "rate(smartmon_data_units_read{serial_number=\"$serial_number\"}[5m])*512000", "legendFormat": "Data Read", "range": true, "refId": "A" @@ -925,7 +919,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(nvme_data_units_written_total{serial_number=\"$serial_number\"}[5m])*512000", + "expr": "rate(smartmon_data_units_written{serial_number=\"$serial_number\"}[5m])*512000", "legendFormat": "Data Written", "range": true, "refId": "B" @@ -946,8 +940,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] }, @@ -979,7 +972,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -988,7 +981,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "nvme_data_units_written_total{serial_number=\"$serial_number\"} * 512000", + "expr": "smartmon_data_units_written{serial_number=\"$serial_number\"} * 512000", "instant": false, "legendFormat": "__auto", "range": true, @@ -1010,8 +1003,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" } ] }, @@ -1043,7 +1035,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -1051,7 +1043,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": 
"nvme_data_units_read_total{serial_number=\"$serial_number\"} * 512000", + "expr": "smartmon_data_units_read{serial_number=\"$serial_number\"} * 512000", "legendFormat": "__auto", "range": true, "refId": "A" @@ -1108,8 +1100,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1136,11 +1127,12 @@ "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "single", "sort": "none" } }, - "pluginVersion": "11.4.0", + "pluginVersion": "12.0.1+security-01", "targets": [ { "datasource": { @@ -1148,7 +1140,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "irate(nvme_controller_busy_time_seconds{serial_number=\"$serial_number\"}[5m])", + "expr": "irate(smartmon_controller_busy_time{serial_number=\"$serial_number\"}[5m])", "legendFormat": "Controller Busy Time", "range": true, "refId": "A" @@ -1156,11 +1148,147 @@ ], "title": "Controller Busy Time", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 27 + }, + "id": 27, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1+security-01", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "delta(smartmon_data_units_written{serial_number=\"$serial_number\"}[24h])*512000 / 
smartmon_physical_size_bytes{serial_number=\"$serial_number\"}", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "DWPD", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 27 + }, + "id": 28, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.0.1+security-01", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "(smartmon_data_units_written{serial_number=\"$serial_number\"} * 512000)/smartmon_physical_size_bytes{serial_number=\"$serial_number\"}/(smartmon_power_on_hours{serial_number=\"$serial_number\"} / 24)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Lifetime Average DWPD", + "type": "stat" } ], "preload": false, "refresh": "1m", - "schemaVersion": 40, + "schemaVersion": 41, "tags": [], "templating": { "list": [ @@ -1187,14 +1315,14 @@ "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(nvme_data_units_read_total,serial_number)", + "definition": "label_values(smartmon_data_units_read,serial_number)", "includeAll": false, "label": "Serial Number", "name": "serial_number", "options": [], "query": { "qryType": 1, - "query": "label_values(nvme_data_units_read_total,serial_number)", + "query": 
"label_values(smartmon_data_units_read,serial_number)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1211,7 +1339,6 @@ "timezone": "", "title": "NVMe Monitoring", "uid": "uesjf83hh", - "version": 1, - "weekStart": "" + "version": 2 } {% endraw %}