Skip to content

Commit 5303e0a

Browse files
authored
wip: Supraseal healthpage output (#325)
* supraffi: healthpage api * wire in health page getter * batchseal: Report nvme metrics to prometheus * make gen * batchseal: Fix data units
1 parent ce37e6e commit 5303e0a

File tree

6 files changed

+345
-3
lines changed

6 files changed

+345
-3
lines changed

lib/supraffi/common.go

+75
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
package supraffi
2+
3+
import "time"
4+
5+
// HealthInfo is the Go-side view of a single NVMe controller's health report.
type HealthInfo struct {
	// CriticalWarning is a bit set; individual bits are named by the
	// Warning* constants and can be tested with HasWarning.
	CriticalWarning byte

	// Temperature information in Celsius.
	Temperature        float64
	TemperatureSensors []float64
	WarningTempTime    time.Duration
	CriticalTempTime   time.Duration

	// Reliability metrics.
	AvailableSpare          uint8
	AvailableSpareThreshold uint8
	PercentageUsed          uint8

	// Usage statistics.
	DataUnitsRead      uint64 // in 512-byte units
	DataUnitsWritten   uint64 // in 512-byte units
	HostReadCommands   uint64
	HostWriteCommands  uint64
	ControllerBusyTime time.Duration

	// Power and error statistics.
	PowerCycles     uint64
	PowerOnHours    time.Duration
	UnsafeShutdowns uint64
	MediaErrors     uint64
	ErrorLogEntries uint64
}

// Bit masks for HealthInfo.CriticalWarning, one bit per warning starting
// at bit 0.
const (
	WarningSpareSpace = 1 << iota
	WarningTemperature
	WarningReliability
	WarningReadOnly
	WarningVolatileMemory
	WarningPersistentMemory
)

// HasWarning reports whether any bit of flag is set in CriticalWarning.
func (h *HealthInfo) HasWarning(flag byte) bool {
	return h.CriticalWarning&flag != 0
}

// GetWarnings returns a description for every warning bit that is set,
// ordered from the lowest bit to the highest. The result is nil when no
// known warning bit is set.
func (h *HealthInfo) GetWarnings() []string {
	descriptions := []struct {
		flag byte
		text string
	}{
		{WarningSpareSpace, "available spare space has fallen below threshold"},
		{WarningTemperature, "temperature is above critical threshold"},
		{WarningReliability, "device reliability has been degraded"},
		{WarningReadOnly, "media has been placed in read only mode"},
		{WarningVolatileMemory, "volatile memory backup device has failed"},
		{WarningPersistentMemory, "persistent memory region has become read-only"},
	}

	var active []string
	for _, d := range descriptions {
		if h.HasWarning(d.flag) {
			active = append(active, d.text)
		}
	}
	return active
}

lib/supraffi/no_supraseal.go

+4
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ func GenerateMultiString(paths []Path) (string, error) {
4747
return buffer.String(), nil
4848
}
4949

50+
func GetHealthInfo() ([]HealthInfo, error) {
51+
panic("GetHealthInfo: supraseal build tag not enabled")
52+
}
53+
5054
// Pc2 performs the pc2 operation.
5155
func Pc2(blockOffset uint64, numSectors int, outputDir string, sectorSize uint64) int {
5256
panic("Pc2: supraseal build tag not enabled")

lib/supraffi/seal.go

+81
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,37 @@ package supraffi
99
#include <stdbool.h>
1010
#include "supra_seal.h"
1111
#include <stdlib.h>
12+
13+
// Per-controller NVMe health record. get_nvme_health_info writes these into
// the caller's buffer and the Go wrapper GetHealthInfo copies each field into
// a HealthInfo value.
// NOTE(review): presumably this layout has to match the definition used by
// the native library - confirm before reordering or resizing any field.
// Use only line comments here: this preamble sits inside a Go block comment.
typedef struct nvme_health_info {
14+
uint8_t critical_warning;
15+
int16_t temperature;
16+
uint8_t available_spare;
17+
uint8_t available_spare_threshold;
18+
uint8_t percentage_used;
19+
uint64_t data_units_read;
20+
uint64_t data_units_written;
21+
uint64_t host_read_commands;
22+
uint64_t host_write_commands;
23+
// GetHealthInfo interprets this value as minutes.
uint64_t controller_busy_time;
24+
uint64_t power_cycles;
25+
// GetHealthInfo interprets this value as hours.
uint64_t power_on_hours;
26+
uint64_t unsafe_shutdowns;
27+
uint64_t media_errors;
28+
uint64_t num_error_info_log_entries;
29+
// GetHealthInfo interprets the two *_temp_time values as minutes.
uint32_t warning_temp_time;
30+
uint32_t critical_temp_time;
31+
// GetHealthInfo drops entries equal to 0 as unused sensor slots.
int16_t temp_sensors[8];
32+
} nvme_health_info_t;
33+
34+
// GetHealthInfo passes a buffer of max_controllers records and treats the
// return value as the number of records that were filled in.
size_t get_nvme_health_info(nvme_health_info_t* health_infos, size_t max_controllers);
35+
1236
*/
1337
import "C"
1438
import (
1539
"bytes"
1640
"encoding/binary"
41+
"fmt"
42+
"time"
1743
"unsafe"
1844
)
1945

@@ -137,6 +163,61 @@ func SupraSealInit(sectorSize uint64, configFile string) {
137163
C.supra_seal_init(C.size_t(sectorSize), cConfigFile)
138164
}
139165

166+
// GetHealthInfo retrieves health information for all NVMe devices
167+
func GetHealthInfo() ([]HealthInfo, error) {
168+
// Allocate space for raw C struct
169+
const maxControllers = 64
170+
rawInfos := make([]C.nvme_health_info_t, maxControllers)
171+
172+
// Get health info from C
173+
count := C.get_nvme_health_info(
174+
(*C.nvme_health_info_t)(unsafe.Pointer(&rawInfos[0])),
175+
C.size_t(maxControllers),
176+
)
177+
178+
if count == 0 {
179+
return nil, fmt.Errorf("no NVMe controllers found")
180+
}
181+
182+
// Convert C structs to Go structs
183+
healthInfos := make([]HealthInfo, count)
184+
for i := 0; i < int(count); i++ {
185+
raw := &rawInfos[i]
186+
187+
// Convert temperature sensors, filtering out unused ones
188+
sensors := make([]float64, 0, 8)
189+
for _, temp := range raw.temp_sensors {
190+
if temp != 0 {
191+
sensors = append(sensors, float64(temp))
192+
}
193+
}
194+
195+
// todo likely not entirely correct
196+
healthInfos[i] = HealthInfo{
197+
CriticalWarning: byte(raw.critical_warning),
198+
Temperature: float64(raw.temperature), // celsius??
199+
TemperatureSensors: sensors,
200+
WarningTempTime: time.Duration(raw.warning_temp_time) * time.Minute,
201+
CriticalTempTime: time.Duration(raw.critical_temp_time) * time.Minute,
202+
AvailableSpare: uint8(raw.available_spare),
203+
AvailableSpareThreshold: uint8(raw.available_spare_threshold),
204+
PercentageUsed: uint8(raw.percentage_used),
205+
DataUnitsRead: uint64(raw.data_units_read),
206+
DataUnitsWritten: uint64(raw.data_units_written),
207+
HostReadCommands: uint64(raw.host_read_commands),
208+
HostWriteCommands: uint64(raw.host_write_commands),
209+
ControllerBusyTime: time.Duration(raw.controller_busy_time) * time.Minute,
210+
PowerCycles: uint64(raw.power_cycles),
211+
PowerOnHours: time.Duration(raw.power_on_hours) * time.Hour,
212+
UnsafeShutdowns: uint64(raw.unsafe_shutdowns),
213+
MediaErrors: uint64(raw.media_errors),
214+
ErrorLogEntries: uint64(raw.num_error_info_log_entries),
215+
}
216+
}
217+
218+
return healthInfos, nil
219+
}
220+
140221
// Pc1 performs the pc1 operation.
141222
func Pc1(blockOffset uint64, replicaIDs [][32]byte, parentsFilename string, sectorSize uint64) int {
142223
flatReplicaIDs := make([]byte, len(replicaIDs)*32)

tasks/sealsupra/metrics.go

+111-2
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,52 @@ import (
77
)
88

99
var (
10-
phaseKey, _ = tag.NewKey("phase")
11-
pre = "sealsupra_"
10+
phaseKey, _ = tag.NewKey("phase")
11+
nvmeDeviceKey, _ = tag.NewKey("nvme_device")
12+
pre = "sealsupra_"
1213
)
1314

1415
// SupraSealMeasures groups all SupraSeal metrics.
1516
var SupraSealMeasures = struct {
1617
PhaseLockCount *stats.Int64Measure
1718
PhaseWaitingCount *stats.Int64Measure
1819
PhaseAvgDuration *stats.Float64Measure
20+
21+
// NVMe Health measures
22+
NVMeTemperature *stats.Float64Measure
23+
NVMeAvailableSpare *stats.Int64Measure
24+
NVMePercentageUsed *stats.Int64Measure
25+
NVMePowerCycles *stats.Int64Measure
26+
NVMePowerOnHours *stats.Float64Measure
27+
NVMeUnsafeShutdowns *stats.Int64Measure
28+
NVMeMediaErrors *stats.Int64Measure
29+
NVMeErrorLogEntries *stats.Int64Measure
30+
NVMeCriticalWarning *stats.Int64Measure
31+
32+
NVMeBytesRead *stats.Int64Measure
33+
NVMeBytesWritten *stats.Int64Measure
34+
NVMeReadIO *stats.Int64Measure
35+
NVMeWriteIO *stats.Int64Measure
1936
}{
2037
PhaseLockCount: stats.Int64(pre+"phase_lock_count", "Number of active locks in each phase", stats.UnitDimensionless),
2138
PhaseWaitingCount: stats.Int64(pre+"phase_waiting_count", "Number of goroutines waiting for a phase lock", stats.UnitDimensionless),
2239
PhaseAvgDuration: stats.Float64(pre+"phase_avg_duration", "Average duration of each phase in seconds", stats.UnitSeconds),
40+
41+
// NVMe Health measures
42+
NVMeTemperature: stats.Float64(pre+"nvme_temperature_celsius", "NVMe Temperature in Celsius", stats.UnitDimensionless),
43+
NVMeAvailableSpare: stats.Int64(pre+"nvme_available_spare", "NVMe Available Spare", stats.UnitDimensionless),
44+
NVMePercentageUsed: stats.Int64(pre+"nvme_percentage_used", "NVMe Percentage Used", stats.UnitDimensionless),
45+
NVMePowerCycles: stats.Int64(pre+"nvme_power_cycles", "NVMe Power Cycles", stats.UnitDimensionless),
46+
NVMePowerOnHours: stats.Float64(pre+"nvme_power_on_hours", "NVMe Power On Hours", stats.UnitDimensionless),
47+
NVMeUnsafeShutdowns: stats.Int64(pre+"nvme_unsafe_shutdowns", "NVMe Unsafe Shutdowns", stats.UnitDimensionless),
48+
NVMeMediaErrors: stats.Int64(pre+"nvme_media_errors", "NVMe Media Errors", stats.UnitDimensionless),
49+
NVMeErrorLogEntries: stats.Int64(pre+"nvme_error_log_entries", "NVMe Error Log Entries", stats.UnitDimensionless),
50+
NVMeCriticalWarning: stats.Int64(pre+"nvme_critical_warning", "NVMe Critical Warning Flags", stats.UnitDimensionless),
51+
52+
NVMeBytesRead: stats.Int64(pre+"nvme_bytes_read", "NVMe Bytes Read", stats.UnitBytes),
53+
NVMeBytesWritten: stats.Int64(pre+"nvme_bytes_written", "NVMe Bytes Written", stats.UnitBytes),
54+
NVMeReadIO: stats.Int64(pre+"nvme_read_io", "NVMe Read IOs", stats.UnitDimensionless),
55+
NVMeWriteIO: stats.Int64(pre+"nvme_write_io", "NVMe Write IOs", stats.UnitDimensionless),
2356
}
2457

2558
// init registers the views for SupraSeal metrics.
@@ -40,6 +73,82 @@ func init() {
4073
Aggregation: view.LastValue(),
4174
TagKeys: []tag.Key{phaseKey},
4275
},
76+
// NVMe Health views
77+
&view.View{
78+
Measure: SupraSealMeasures.NVMeTemperature,
79+
Aggregation: view.LastValue(),
80+
TagKeys: []tag.Key{nvmeDeviceKey},
81+
},
82+
&view.View{
83+
Measure: SupraSealMeasures.NVMeAvailableSpare,
84+
Aggregation: view.LastValue(),
85+
TagKeys: []tag.Key{nvmeDeviceKey},
86+
},
87+
&view.View{
88+
Measure: SupraSealMeasures.NVMePercentageUsed,
89+
Aggregation: view.LastValue(),
90+
TagKeys: []tag.Key{nvmeDeviceKey},
91+
},
92+
&view.View{
93+
Measure: SupraSealMeasures.NVMePowerCycles,
94+
Aggregation: view.LastValue(),
95+
TagKeys: []tag.Key{nvmeDeviceKey},
96+
},
97+
&view.View{
98+
Measure: SupraSealMeasures.NVMePowerOnHours,
99+
Aggregation: view.LastValue(),
100+
TagKeys: []tag.Key{nvmeDeviceKey},
101+
},
102+
&view.View{
103+
Measure: SupraSealMeasures.NVMeUnsafeShutdowns,
104+
Aggregation: view.LastValue(),
105+
TagKeys: []tag.Key{nvmeDeviceKey},
106+
},
107+
&view.View{
108+
Measure: SupraSealMeasures.NVMeMediaErrors,
109+
Aggregation: view.LastValue(),
110+
TagKeys: []tag.Key{nvmeDeviceKey},
111+
},
112+
&view.View{
113+
Measure: SupraSealMeasures.NVMeErrorLogEntries,
114+
Aggregation: view.LastValue(),
115+
TagKeys: []tag.Key{nvmeDeviceKey},
116+
},
117+
&view.View{
118+
Measure: SupraSealMeasures.NVMeCriticalWarning,
119+
Aggregation: view.LastValue(),
120+
TagKeys: []tag.Key{nvmeDeviceKey},
121+
},
122+
&view.View{
123+
Measure: SupraSealMeasures.NVMeBytesRead,
124+
Aggregation: view.Sum(),
125+
TagKeys: []tag.Key{nvmeDeviceKey},
126+
},
127+
&view.View{
128+
Measure: SupraSealMeasures.NVMeBytesWritten,
129+
Aggregation: view.Sum(),
130+
TagKeys: []tag.Key{nvmeDeviceKey},
131+
},
132+
&view.View{
133+
Measure: SupraSealMeasures.NVMeReadIO,
134+
Aggregation: view.Sum(),
135+
TagKeys: []tag.Key{nvmeDeviceKey},
136+
},
137+
&view.View{
138+
Measure: SupraSealMeasures.NVMeWriteIO,
139+
Aggregation: view.Sum(),
140+
TagKeys: []tag.Key{nvmeDeviceKey},
141+
},
142+
&view.View{
143+
Measure: SupraSealMeasures.NVMeReadIO,
144+
Aggregation: view.Sum(),
145+
TagKeys: []tag.Key{nvmeDeviceKey},
146+
},
147+
&view.View{
148+
Measure: SupraSealMeasures.NVMeWriteIO,
149+
Aggregation: view.Sum(),
150+
TagKeys: []tag.Key{nvmeDeviceKey},
151+
},
43152
)
44153
if err != nil {
45154
panic(err)

0 commit comments

Comments
 (0)