23c689aa5b
Ships all five phases of the deep-profile overhaul together. Runs now carry a profile (quick/deep/soak); every profile walks the same 11-stage order — Inventory → Firmware → SpecValidate → SMART → CPUStress → Storage → Network → Burn → GPU → PSU → Reporting — with only per-stage durations and concurrency scaled. Phase 1: profiles.ProfileRegistry loaded from vetting.yaml; runs.profile column + CreateWithProfile; threshold table + evaluator seeded per-run from the shared vetting.thresholds block; breach flips result at /sensor + /result. Phase 2: upgraded CPUStress (stress-ng --cpu-method=all --verify + EDAC/MCE poll), Storage (fio --verify=md5 + SMART start/end delta), Network (sustained iperf + /proc/net/dev deltas) with per-profile knobs from Deps. Phase 3: Burn super-stage with goroutine fan-out for CPU + memory + fio + iperf, PSU rails sampled across the Burn window, SensorMux (2 s flush, 500-sample cap) to absorb backpressure. Phase 4: Firmware stage + firmware_snapshots table; probes dmidecode (BIOS), ipmitool (BMC), ethtool -i (NIC), nvme (sysfs + id-ctrl), lspci (HBA), /proc/cpuinfo (microcode). spec.DiffFirmware folds into SpecValidate with pin-by-identifier and fan-out-across-component matching; mismatches park the run in FailedHolding. Phase 5: profile radio on the host start form, profile chip on the run header, Firmware section in the HTML report, coverage artifact uploaded from CI, agent/tests/fakes/ scaffold with Deps.LookPath seam + stress_ng and dmidecode example fakes. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
554 lines
18 KiB
Go
554 lines
18 KiB
Go
package tests
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os/exec"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// Storage is the destructive stage. Phase 2 replaced the old
|
|
// badblocks + 128 MiB fio combo with a single fio run per disk that
|
|
// writes, verifies md5 of what it wrote, and reports p99 latency.
|
|
// Modes:
|
|
//
|
|
// - fio_sample (quick): bounded 1 GiB write per disk, ~3 min runtime.
|
|
// - full_disk (deep/soak): writes the whole device, time-bounded by
|
|
// the fio_time knob (2 h deep, 6 h soak).
|
|
//
|
|
// Pre-gates kept from Phase 1:
|
|
//
|
|
// 1. Device allowlist: only act on /dev/<X> where the kernel-reported
|
|
// serial matches one of Deps.ExpectedDisks. USB sticks and unexpected
|
|
// drives are excluded.
|
|
// 2. Wipe probe: blkid + wipefs --no-act on each target; any filesystem
|
|
// signature, partition table, or LVM metadata → fail with
|
|
// UnexpectedData unless Deps.OverrideWipe is set.
|
|
//
|
|
// After fio, the stage captures a SMART diff (start snapshot taken
|
|
// before any writes; end snapshot after all writes finish) and posts
|
|
// deltas on attributes like Reallocated_Sector_Ct and Current_Pending_Sector.
|
|
// The threshold evaluator isn't seeded to gate smart_delta out of the
|
|
// box — those samples are diagnostic for the report. Fio's p99 latency
|
|
// posts as fio_p99_us so the per-stage Storage warning threshold can
|
|
// fire on a latency cliff.
|
|
func Storage(ctx context.Context, d Deps) Outcome {
|
|
if len(d.ExpectedDisks) == 0 {
|
|
d.Info("Storage: no expected disks in spec — skipping stage")
|
|
return Outcome{
|
|
Passed: true,
|
|
Summary: "skipped (no expected disks)",
|
|
Extras: map[string]any{"skipped": true, "reason": "no_expected_disks"},
|
|
}
|
|
}
|
|
|
|
targets := resolveTargets(d.ExpectedDisks)
|
|
if len(targets) == 0 {
|
|
d.Error("Storage: none of the expected disks are present on this host")
|
|
return Outcome{
|
|
Passed: false,
|
|
Message: "device allowlist matched zero disks",
|
|
Summary: "no allowed disks present",
|
|
Extras: map[string]any{"expected": d.ExpectedDisks},
|
|
}
|
|
}
|
|
|
|
// Non-destructive runs skip wipe-probe (nothing to refuse), fio
|
|
// writes, and SMART delta (nothing changed so no delta to report).
|
|
// Every expected disk is still asserted present so a vanished drive
|
|
// still fails the stage.
|
|
if d.NonDestructive {
|
|
perDisk := map[string]any{}
|
|
for _, t := range targets {
|
|
perDisk[t.Device] = map[string]any{"mode": "non_destructive", "serial": t.Serial}
|
|
}
|
|
d.Info(fmt.Sprintf("Storage: non-destructive — verified %d disk(s) present", len(targets)))
|
|
return Outcome{
|
|
Passed: true,
|
|
Summary: fmt.Sprintf("non-destructive: read-only checks only (%d disks)", len(targets)),
|
|
Extras: map[string]any{"per_disk": perDisk, "non_destructive": true},
|
|
}
|
|
}
|
|
|
|
// Wipe probe on every target. A single dirty disk halts the stage
|
|
// unless the operator has set OverrideWipe via the UI.
|
|
probes := map[string]wipeProbeResult{}
|
|
dirty := []string{}
|
|
for _, t := range targets {
|
|
probe := probeWipe(ctx, t.Device)
|
|
probes[t.Device] = probe
|
|
if probe.HasData {
|
|
dirty = append(dirty, t.Device)
|
|
}
|
|
}
|
|
if len(dirty) > 0 && !d.OverrideWipe {
|
|
d.Error("Storage: wipe probe found existing data on: " + strings.Join(dirty, ", "))
|
|
return Outcome{
|
|
Passed: false,
|
|
Message: "UnexpectedData: " + strings.Join(dirty, ", ") + " (operator override required)",
|
|
Summary: fmt.Sprintf("wipe-probe halt (%d disk(s) have data)", len(dirty)),
|
|
Extras: map[string]any{
|
|
"wipe_probe": probes,
|
|
"override_hint": "click 'Override wipe & retry' in the held tile",
|
|
"dirty_devices": dirty,
|
|
},
|
|
}
|
|
}
|
|
if d.OverrideWipe && len(dirty) > 0 {
|
|
d.Warn("Storage: operator override engaged — proceeding despite data on " + strings.Join(dirty, ", "))
|
|
}
|
|
|
|
// Capture start-of-stage SMART attributes before we write anything
|
|
// so the delta is attributable to *this* stage's writes and not the
|
|
// host's prior history. Per-disk failures are tolerated (e.g. the
|
|
// device doesn't expose SMART); we just can't emit a delta for it.
|
|
startSMART := captureSMARTAttrs(ctx, targets)
|
|
|
|
fioOpts := resolveFioOpts(d.StorageKnobs)
|
|
d.Info(fmt.Sprintf("Storage: fio mode=%s size=%s runtime=%s bs=%s rw=%s verify=%s",
|
|
fioOpts.Mode, fioOpts.Size, fioOpts.Runtime, fioOpts.BS, fioOpts.RW, fioOpts.Verify))
|
|
|
|
var samples []Sample
|
|
var subs []SubStepReport
|
|
perDisk := map[string]any{}
|
|
failed := ""
|
|
for _, t := range targets {
|
|
d.Info(fmt.Sprintf("Storage: running fio %s on %s", fioOpts.Mode, t.Device))
|
|
fioStart := time.Now()
|
|
fr := runFioVerify(ctx, t.Device, fioOpts)
|
|
fioEnd := time.Now()
|
|
fioSummary, _ := json.Marshal(fr)
|
|
subs = append(subs, SubStepReport{
|
|
Name: fmt.Sprintf("fio %s %s", fioOpts.Mode, t.Device),
|
|
Passed: fr.Error == "",
|
|
StartedAt: fioStart,
|
|
CompletedAt: fioEnd,
|
|
SummaryJSON: fioSummary,
|
|
})
|
|
perDisk[t.Device] = map[string]any{"fio": fr}
|
|
|
|
if fr.Error == "" {
|
|
samples = append(samples,
|
|
Sample{Kind: "fio", Key: t.Device + "/read_iops", Value: fr.ReadIOPS, Unit: "iops"},
|
|
Sample{Kind: "fio", Key: t.Device + "/write_iops", Value: fr.WriteIOPS, Unit: "iops"},
|
|
)
|
|
if fr.ReadP99Us > 0 {
|
|
samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/read", Value: fr.ReadP99Us, Unit: "us"})
|
|
}
|
|
if fr.WriteP99Us > 0 {
|
|
samples = append(samples, Sample{Kind: "fio_p99_us", Key: t.Device + "/write", Value: fr.WriteP99Us, Unit: "us"})
|
|
}
|
|
} else if failed == "" {
|
|
failed = t.Device
|
|
}
|
|
}
|
|
|
|
// End-of-stage SMART snapshot + diff. We capture whether or not fio
|
|
// succeeded — a mid-run failure still produces attributable deltas,
|
|
// which is often more interesting than the stage outcome itself.
|
|
endSMART := captureSMARTAttrs(ctx, targets)
|
|
deltas := diffSMARTAttrs(startSMART, endSMART)
|
|
for dev, attrs := range deltas {
|
|
for attr, delta := range attrs {
|
|
samples = append(samples, Sample{Kind: "smart_delta", Key: dev + "/" + attr, Value: delta, Unit: "count"})
|
|
}
|
|
}
|
|
if d.Sensor != nil && len(samples) > 0 {
|
|
_ = d.Sensor(ctx, samples)
|
|
}
|
|
|
|
if failed != "" {
|
|
return Outcome{
|
|
Passed: false,
|
|
Message: "fio verify failed on " + failed,
|
|
Summary: "fio failed on " + failed,
|
|
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
|
|
SubSteps: subs,
|
|
}
|
|
}
|
|
|
|
d.Info(fmt.Sprintf("Storage: %d disk(s) passed fio --verify", len(targets)))
|
|
return Outcome{
|
|
Passed: true,
|
|
Summary: fmt.Sprintf("%d disks passed (%s)", len(targets), fioOpts.Mode),
|
|
Extras: map[string]any{"per_disk": perDisk, "wipe_probe": probes, "smart_delta": deltas, "fio_opts": fioOpts},
|
|
SubSteps: subs,
|
|
}
|
|
}
|
|
|
|
// diskTarget pairs an expected disk's serial (as written in the spec)
// with the device path it resolved to on this host.
type diskTarget struct {
	Serial, Device string
}
|
|
|
|
// resolveTargets maps expected-disk serials to /dev/<X> paths by reading
|
|
// /sys/block. Uses the same mechanism as probes.inventory to avoid drift.
|
|
func resolveTargets(expected []ExpectedDisk) []diskTarget {
|
|
disks, err := listBlockDisks()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
// Build serial → device map from /sys.
|
|
serialOf := map[string]string{}
|
|
for _, dev := range disks {
|
|
name := strings.TrimPrefix(dev, "/dev/")
|
|
s := diskSerialFromSys(name)
|
|
if s != "" {
|
|
serialOf[strings.ToLower(s)] = dev
|
|
}
|
|
}
|
|
var out []diskTarget
|
|
for _, e := range expected {
|
|
if e.Serial == "" {
|
|
continue
|
|
}
|
|
if dev, ok := serialOf[strings.ToLower(e.Serial)]; ok {
|
|
out = append(out, diskTarget{Serial: e.Serial, Device: dev})
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// diskSerialFromSys is a smaller copy of probes.diskSerial; imported
|
|
// from internal/probes would cause a cycle so we duplicate the short
|
|
// lookup. If it drifts from the inventory probe, Storage fails because
|
|
// the serial doesn't match — which is the correct behavior.
|
|
func diskSerialFromSys(name string) string {
|
|
for _, rel := range []string{
|
|
"/sys/block/" + name + "/device/serial",
|
|
"/sys/block/" + name + "/serial",
|
|
} {
|
|
b, err := readFileBytes(rel)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
s := strings.TrimSpace(string(b))
|
|
if s != "" {
|
|
return s
|
|
}
|
|
}
|
|
// Fall back to udevadm — ID_SERIAL_SHORT is more reliable on SCSI.
|
|
out, err := exec.Command("udevadm", "info", "--query=property", "--name="+name).Output()
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
for _, line := range strings.Split(string(out), "\n") {
|
|
if v, ok := strings.CutPrefix(line, "ID_SERIAL_SHORT="); ok {
|
|
return strings.TrimSpace(v)
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func readFileBytes(p string) ([]byte, error) {
|
|
return readFile(p)
|
|
}
|
|
|
|
// ---------- wipe probe ----------
|
|
|
|
// wipeProbeResult is the outcome of the pre-write wipe probe for one
// device. It is embedded in the stage's Extras, hence the JSON tags.
type wipeProbeResult struct {
	// Device is the path that was probed.
	Device string `json:"device"`

	// HasData is true when a probe reported a signature on the device.
	HasData bool `json:"has_data"`

	// Findings lists what was found, each entry prefixed with the name
	// of the tool that reported it.
	Findings []string `json:"findings,omitempty"`
}
|
|
|
|
// probeWipe runs blkid + wipefs -n. Any non-empty output from either is
|
|
// a "has data" signal. This is deliberately conservative: we'd rather
|
|
// halt on a bare ext4 signature than hand fio a disk with real bytes on
|
|
// it.
|
|
func probeWipe(ctx context.Context, device string) wipeProbeResult {
|
|
out := wipeProbeResult{Device: device}
|
|
|
|
if b, err := exec.CommandContext(ctx, "blkid", "-o", "full", device).Output(); err == nil {
|
|
s := strings.TrimSpace(string(b))
|
|
if s != "" {
|
|
out.Findings = append(out.Findings, "blkid: "+s)
|
|
out.HasData = true
|
|
}
|
|
}
|
|
if b, err := exec.CommandContext(ctx, "wipefs", "--no-act", device).Output(); err == nil {
|
|
s := strings.TrimSpace(string(b))
|
|
// wipefs prints a header line even on a clean disk; keep only
|
|
// lines with actual signature data.
|
|
for _, line := range strings.Split(s, "\n") {
|
|
line = strings.TrimSpace(line)
|
|
if line == "" || strings.HasPrefix(line, "DEVICE") || strings.HasPrefix(line, "offset") {
|
|
continue
|
|
}
|
|
out.Findings = append(out.Findings, "wipefs: "+line)
|
|
out.HasData = true
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// ---------- fio ----------
|
|
|
|
// fioOpts is the concrete fio configuration for one run, resolved from
// the stage knobs by resolveFioOpts. It is also reported in the stage's
// Extras, hence the JSON tags.
type fioOpts struct {
	// Mode is "fio_sample" or "full_disk".
	Mode string `json:"mode"`

	// Size is the IO volume, e.g. "1GiB"; only used for fio_sample.
	Size string `json:"size"`

	// Runtime is the bounding time handed to fio.
	Runtime time.Duration `json:"runtime"`

	// BS is the block size, e.g. "4k".
	BS string `json:"bs"`

	// RW is the IO pattern, e.g. "randrw".
	RW string `json:"rw"`

	// Verify is the verification method, e.g. "md5"; "" disables it.
	Verify string `json:"verify"`
}
|
|
|
|
// resolveFioOpts normalizes the knobs into a runnable config. Zero-
|
|
// valued fields fall back to the quick defaults so a stage that's
|
|
// missing its knobs still has coherent behavior (safer than refusing).
|
|
func resolveFioOpts(k StorageKnobs) fioOpts {
|
|
o := fioOpts{
|
|
Mode: firstNonEmpty(k.Mode, "fio_sample"),
|
|
Size: firstNonEmpty(k.FioSize, "1GiB"),
|
|
Runtime: k.FioTime,
|
|
BS: firstNonEmpty(k.FioBS, "4k"),
|
|
RW: firstNonEmpty(k.FioRW, "randrw"),
|
|
Verify: firstNonEmpty(k.Verify, "md5"),
|
|
}
|
|
if o.Runtime <= 0 {
|
|
o.Runtime = 3 * time.Minute
|
|
}
|
|
return o
|
|
}
|
|
|
|
// firstNonEmpty returns the first argument that is not the empty
// string, or "" when every argument is empty or none are given.
func firstNonEmpty(vs ...string) string {
	for i := 0; i < len(vs); i++ {
		if len(vs[i]) > 0 {
			return vs[i]
		}
	}
	return ""
}
|
|
|
|
// fioResult is the per-disk summary of one fio run. It is reported in
// the stage's Extras and in the sub-step summary, hence the JSON tags.
type fioResult struct {
	// Mode echoes the fio mode the run used.
	Mode string `json:"mode"`

	// ReadIOPS and WriteIOPS are fio's iops figures for each direction.
	ReadIOPS  float64 `json:"read_iops"`
	WriteIOPS float64 `json:"write_iops"`

	// ReadBWKBps and WriteBWKBps are fio's bw figures for each direction.
	ReadBWKBps  float64 `json:"read_bw_kbps"`
	WriteBWKBps float64 `json:"write_bw_kbps"`

	// ReadP99Us and WriteP99Us are the 99th-percentile completion
	// latencies in microseconds; zero when fio reported none.
	ReadP99Us  float64 `json:"read_p99_us,omitempty"`
	WriteP99Us float64 `json:"write_p99_us,omitempty"`

	// Error is non-empty when the run failed; the stage treats an empty
	// Error as success.
	Error string `json:"error,omitempty"`

	// OutputTail is the tail of fio's standard output.
	OutputTail string `json:"output_tail,omitempty"`
}
|
|
|
|
// runFioVerify invokes fio with md5-verify semantics. fio_sample mode
|
|
// caps the IO at opts.Size; full_disk drives the whole device bounded
|
|
// by runtime. Both use direct IO to bypass the page cache — we want
|
|
// real disk latency, not Linux' cheerful buffer.
|
|
func runFioVerify(ctx context.Context, device string, opts fioOpts) fioResult {
|
|
// 30s grace over runtime so fio has time to flush + close cleanly.
|
|
runCtx, cancel := context.WithTimeout(ctx, opts.Runtime+30*time.Second)
|
|
defer cancel()
|
|
|
|
args := []string{
|
|
"--name=verify-" + strings.TrimPrefix(device, "/dev/"),
|
|
"--filename=" + device,
|
|
"--rw=" + opts.RW,
|
|
"--bs=" + opts.BS,
|
|
"--numjobs=1",
|
|
"--direct=1",
|
|
"--group_reporting",
|
|
"--output-format=json",
|
|
"--runtime=" + strconv.Itoa(int(opts.Runtime.Seconds())),
|
|
}
|
|
if opts.Verify != "" {
|
|
args = append(args,
|
|
"--verify="+opts.Verify,
|
|
"--verify_pattern=random",
|
|
"--do_verify=1",
|
|
)
|
|
}
|
|
switch opts.Mode {
|
|
case "full_disk":
|
|
// Time-bounded across the full device — fio uses the device's
|
|
// full size when --size is omitted on a block device.
|
|
args = append(args, "--time_based=1")
|
|
default:
|
|
// fio_sample: bounded write. Setting --size= limits the IO
|
|
// volume regardless of runtime.
|
|
args = append(args, "--size="+opts.Size, "--time_based=0")
|
|
}
|
|
|
|
cmd := exec.CommandContext(runCtx, "fio", args...)
|
|
out, err := cmd.Output()
|
|
r := fioResult{Mode: opts.Mode, OutputTail: tailLines(string(out), 20)}
|
|
if err != nil {
|
|
r.Error = err.Error()
|
|
return r
|
|
}
|
|
parsed, perr := parseFioJSON(out)
|
|
if perr != nil {
|
|
r.Error = "parse fio json: " + perr.Error()
|
|
return r
|
|
}
|
|
r.ReadIOPS = parsed.ReadIOPS
|
|
r.WriteIOPS = parsed.WriteIOPS
|
|
r.ReadBWKBps = parsed.ReadBWKBps
|
|
r.WriteBWKBps = parsed.WriteBWKBps
|
|
r.ReadP99Us = parsed.ReadP99Us
|
|
r.WriteP99Us = parsed.WriteP99Us
|
|
return r
|
|
}
|
|
|
|
// parseFioJSON extracts the bits we care about from fio's --output-format=json.
|
|
// Latency percentiles live at .jobs[0].read.clat_ns.percentile["99.000000"];
|
|
// we convert nanoseconds to microseconds for the fio_p99_us sample.
|
|
func parseFioJSON(out []byte) (fioResult, error) {
|
|
var top struct {
|
|
Jobs []struct {
|
|
Read struct {
|
|
IOPS float64 `json:"iops"`
|
|
BW float64 `json:"bw"`
|
|
CLat struct {
|
|
Percentile map[string]float64 `json:"percentile"`
|
|
} `json:"clat_ns"`
|
|
} `json:"read"`
|
|
Write struct {
|
|
IOPS float64 `json:"iops"`
|
|
BW float64 `json:"bw"`
|
|
CLat struct {
|
|
Percentile map[string]float64 `json:"percentile"`
|
|
} `json:"clat_ns"`
|
|
} `json:"write"`
|
|
} `json:"jobs"`
|
|
}
|
|
if err := json.Unmarshal(out, &top); err != nil {
|
|
return fioResult{}, err
|
|
}
|
|
if len(top.Jobs) == 0 {
|
|
return fioResult{}, fmt.Errorf("no jobs in fio output")
|
|
}
|
|
j := top.Jobs[0]
|
|
r := fioResult{
|
|
ReadIOPS: j.Read.IOPS, WriteIOPS: j.Write.IOPS,
|
|
ReadBWKBps: j.Read.BW, WriteBWKBps: j.Write.BW,
|
|
}
|
|
if p := j.Read.CLat.Percentile["99.000000"]; p > 0 {
|
|
r.ReadP99Us = p / 1000.0
|
|
}
|
|
if p := j.Write.CLat.Percentile["99.000000"]; p > 0 {
|
|
r.WriteP99Us = p / 1000.0
|
|
}
|
|
return r, nil
|
|
}
|
|
|
|
// ---------- SMART delta ----------
|
|
|
|
// smartAttrMap is one SMART snapshot: device → attribute name → raw
// counter value. ATA drives contribute named attributes
// (Reallocated_Sector_Ct etc.); NVMe drives contribute the flat keys of
// their health log. Only the whitelisted wear indicators are stored —
// anything else is diagnostic and is left to the raw report output.
type smartAttrMap map[string]map[string]float64
|
|
|
|
// captureSMARTAttrs runs smartctl -aj on each target and pulls the
|
|
// whitelisted attributes. Per-device failures (virtio, permission
|
|
// issues) degrade silently — the delta step just shows no data for
|
|
// that device.
|
|
func captureSMARTAttrs(ctx context.Context, targets []diskTarget) smartAttrMap {
|
|
out := smartAttrMap{}
|
|
for _, t := range targets {
|
|
parsed, err := runSmartctl(ctx, t.Device)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
attrs := extractSMARTAttrs(parsed)
|
|
if len(attrs) > 0 {
|
|
out[t.Device] = attrs
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// smartAttributeWhitelist is the set of attributes diffed across a
// stage: the ones meant to reflect *this stage's* IO damage rather than
// cumulative drive history. Adding attributes is cheap — a name the
// drive does not report is simply never found.
var smartAttributeWhitelist = func() map[string]bool {
	names := []string{
		// ATA SMART attribute names (smartctl normalizes to these).
		"Reallocated_Sector_Ct",
		"Current_Pending_Sector",
		"Offline_Uncorrectable",
		"UDMA_CRC_Error_Count",
		"Reported_Uncorrect",
		"Raw_Read_Error_Rate",
		// NVMe log fields (flat keys of nvme_smart_health_information_log).
		"media_errors",
		"num_err_log_entries",
		"percentage_used",
	}
	set := make(map[string]bool, len(names))
	for _, name := range names {
		set[name] = true
	}
	return set
}()
|
|
|
|
// extractSMARTAttrs walks smartctl's JSON for whitelisted attribute
|
|
// values. Handles both the ATA shape (ata_smart_attributes.table[]) and
|
|
// the NVMe shape (nvme_smart_health_information_log). Returns a map
|
|
// keyed by the canonical attribute name.
|
|
func extractSMARTAttrs(raw map[string]any) map[string]float64 {
|
|
out := map[string]float64{}
|
|
// ATA attributes are in ata_smart_attributes.table[] — each element
|
|
// has {"name": "Reallocated_Sector_Ct", "raw": {"value": N}}.
|
|
if ata, ok := raw["ata_smart_attributes"].(map[string]any); ok {
|
|
if tbl, ok := ata["table"].([]any); ok {
|
|
for _, row := range tbl {
|
|
rm, ok := row.(map[string]any)
|
|
if !ok {
|
|
continue
|
|
}
|
|
name, _ := rm["name"].(string)
|
|
if !smartAttributeWhitelist[name] {
|
|
continue
|
|
}
|
|
if r, ok := rm["raw"].(map[string]any); ok {
|
|
if v, ok := r["value"].(float64); ok {
|
|
out[name] = v
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// NVMe attributes live flat under nvme_smart_health_information_log.
|
|
if nvme, ok := raw["nvme_smart_health_information_log"].(map[string]any); ok {
|
|
for k, v := range nvme {
|
|
if !smartAttributeWhitelist[k] {
|
|
continue
|
|
}
|
|
if n, ok := v.(float64); ok {
|
|
out[k] = n
|
|
}
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// diffSMARTAttrs subtracts start from end per (device, attribute).
|
|
// Only attributes present in both ends produce a delta; missing
|
|
// attributes drop out (can't attribute a zero-to-present delta safely).
|
|
// Negative deltas are kept so a drive that resets a counter is visible.
|
|
func diffSMARTAttrs(start, end smartAttrMap) map[string]map[string]float64 {
|
|
out := map[string]map[string]float64{}
|
|
for dev, endAttrs := range end {
|
|
startAttrs, ok := start[dev]
|
|
if !ok {
|
|
continue
|
|
}
|
|
devOut := map[string]float64{}
|
|
for attr, endV := range endAttrs {
|
|
startV, ok := startAttrs[attr]
|
|
if !ok {
|
|
continue
|
|
}
|
|
devOut[attr] = endV - startV
|
|
}
|
|
if len(devOut) > 0 {
|
|
out[dev] = devOut
|
|
}
|
|
}
|
|
return out
|
|
}
|