diff --git a/agent/app/dto/monitor.go b/agent/app/dto/monitor.go index 393e2cdf4..e8837feaf 100644 --- a/agent/app/dto/monitor.go +++ b/agent/app/dto/monitor.go @@ -44,6 +44,7 @@ type MonitorGPUSearch struct { EndTime time.Time `json:"endTime"` } type MonitorGPUData struct { + GPUType string `json:"gpuType"` ProductNames []string `json:"productNames"` Date []time.Time `json:"date"` GPUValue []float64 `json:"gpuValue"` @@ -58,8 +59,8 @@ type GPUPowerUsageHelper struct { Percent float64 `json:"percent"` } type GPUMemoryUsageHelper struct { - Total int `json:"total"` - Used int `json:"used"` + Total float64 `json:"total"` + Used float64 `json:"used"` Percent float64 `json:"percent"` GPUProcesses []GPUProcess `json:"gpuProcesses"` diff --git a/agent/app/model/monitor.go b/agent/app/model/monitor.go index cc825eb5a..628f5ab67 100644 --- a/agent/app/model/monitor.go +++ b/agent/app/model/monitor.go @@ -39,8 +39,8 @@ type MonitorGPU struct { Temperature int `json:"temperature"` PowerDraw float64 `json:"powerDraw"` MaxPowerLimit float64 `json:"maxPowerLimit"` - MemUsed int `json:"memUsed"` - MemTotal int `json:"memTotal"` + MemUsed float64 `json:"memUsed"` + MemTotal float64 `json:"memTotal"` FanSpeed int `json:"fanSpeed"` Processes string `json:"processes"` } diff --git a/agent/app/service/monitor.go b/agent/app/service/monitor.go index d0b91397b..2e88cd6be 100644 --- a/agent/app/service/monitor.go +++ b/agent/app/service/monitor.go @@ -131,8 +131,10 @@ func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.Monit } if len(req.ProductName) == 0 { if gpuExist { + data.GPUType = "gpu" gpuInfo, err := gpuclient.LoadGpuInfo() if err != nil || len(gpuInfo.GPUs) == 0 { + global.LOG.Error("Load GPU info failed or no GPU found, err: ", err) return data, buserr.New("ErrRecordNotFound") } req.ProductName = gpuInfo.GPUs[0].ProductName @@ -140,8 +142,10 @@ func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.Monit data.ProductNames = 
append(data.ProductNames, item.ProductName) } } else { + data.GPUType = "xpu" xpuInfo, err := xpuClient.LoadGpuInfo() if err != nil || len(xpuInfo.Xpu) == 0 { + global.LOG.Error("Load XPU info failed or no XPU found, err: ", err) return data, buserr.New("ErrRecordNotFound") } req.ProductName = xpuInfo.Xpu[0].Basic.DeviceName @@ -159,15 +163,18 @@ func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.Monit data.Date = append(data.Date, gpu.CreatedAt) data.GPUValue = append(data.GPUValue, gpu.GPUUtil) data.TemperatureValue = append(data.TemperatureValue, gpu.Temperature) - data.PowerValue = append(data.PowerValue, dto.GPUPowerUsageHelper{ - Total: gpu.MaxPowerLimit, - Used: gpu.PowerDraw, - Percent: gpu.PowerDraw / gpu.MaxPowerLimit * 100, - }) + powerItem := dto.GPUPowerUsageHelper{ + Total: gpu.MaxPowerLimit, + Used: gpu.PowerDraw, + } + if powerItem.Total != 0 { + powerItem.Percent = powerItem.Used / powerItem.Total * 100 + } + data.PowerValue = append(data.PowerValue, powerItem) memItem := dto.GPUMemoryUsageHelper{ Total: gpu.MemTotal, Used: gpu.MemUsed, - Percent: float64(gpu.MemUsed) / float64(gpu.MemTotal) * 100, + Percent: gpu.MemUsed / gpu.MemTotal * 100, } var process []dto.GPUProcess if err := json.Unmarshal([]byte(gpu.Processes), &process); err == nil { @@ -564,14 +571,13 @@ func saveGPUDataToDB() { var list []model.MonitorGPU for _, gpuItem := range gpuInfo.GPUs { item := model.MonitorGPU{ - ProductName: gpuItem.ProductName, - GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil), - Temperature: loadGPUInfoInt(gpuItem.Temperature), - PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw), - MaxPowerLimit: loadGPUInfoFloat(gpuItem.MaxPowerLimit), - MemUsed: loadGPUInfoInt(gpuItem.MemUsed), - MemTotal: loadGPUInfoInt(gpuItem.MemTotal), - FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed), + ProductName: gpuItem.ProductName, + GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil), + Temperature: loadGPUInfoInt(gpuItem.Temperature), + PowerDraw:
loadGPUInfoFloat(gpuItem.PowerDraw), + MemUsed: loadGPUInfoFloat(gpuItem.MemUsed), + MemTotal: loadGPUInfoFloat(gpuItem.MemTotal), + FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed), } process, _ := json.Marshal(gpuItem.Processes) if len(process) != 0 { @@ -596,25 +602,28 @@ func saveXPUDataToDB() { var list []model.MonitorGPU for _, xpuItem := range xpuInfo.Xpu { item := model.MonitorGPU{ - ProductName: xpuItem.Basic.DeviceName, - GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil), - Temperature: loadGPUInfoInt(xpuItem.Stats.Temperature), - PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power), - MemUsed: loadGPUInfoInt(xpuItem.Stats.MemoryUsed), - MemTotal: loadGPUInfoInt(xpuItem.Basic.Memory), + ProductName: xpuItem.Basic.DeviceName, + GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil), + Temperature: loadGPUInfoInt(xpuItem.Stats.Temperature), + PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power), + MaxPowerLimit: float64(xpuItem.Config.PowerLimit), + MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed), + MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory), } - var processItem []dto.GPUProcess - for _, ps := range xpuItem.Processes { - processItem = append(processItem, dto.GPUProcess{ - Pid: fmt.Sprintf("%v", ps.PID), - Type: ps.SHR, - ProcessName: ps.Command, - UsedMemory: ps.Memory, - }) - } - process, _ := json.Marshal(processItem) - if len(process) != 0 { - item.Processes = string(process) + if len(xpuItem.Processes) != 0 { + var processItem []dto.GPUProcess + for _, ps := range xpuItem.Processes { + processItem = append(processItem, dto.GPUProcess{ + Pid: fmt.Sprintf("%v", ps.PID), + Type: ps.SHR, + ProcessName: ps.Command, + UsedMemory: ps.Memory, + }) + } + process, _ := json.Marshal(processItem) + if len(process) != 0 { + item.Processes = string(process) + } } list = append(list, item) } @@ -633,6 +642,7 @@ func loadGPUInfoInt(val string) int { } func loadGPUInfoFloat(val string) float64 { valItem := strings.ReplaceAll(val, "W", "") + valItem = 
strings.ReplaceAll(valItem, "MB", "") valItem = strings.ReplaceAll(valItem, "%", "") valItem = strings.TrimSpace(valItem) data, _ := strconv.ParseFloat(valItem, 64) diff --git a/agent/init/migration/migrations/init.go b/agent/init/migration/migrations/init.go index 41984a49a..4eec39de6 100644 --- a/agent/init/migration/migrations/init.go +++ b/agent/init/migration/migrations/init.go @@ -721,7 +721,7 @@ var UpdateDatabase = &gormigrate.Migration{ } var AddGPUMonitor = &gormigrate.Migration{ - ID: "20251119-add-gpu-monitor", + ID: "20251127-add-gpu-monitor", Migrate: func(tx *gorm.DB) error { return global.GPUMonitorDB.AutoMigrate(&model.MonitorGPU{}) }, diff --git a/agent/utils/ai_tools/xpu/xpu_info.go b/agent/utils/ai_tools/xpu/xpu_info.go index 9c7d45656..e7a558e6c 100644 --- a/agent/utils/ai_tools/xpu/xpu_info.go +++ b/agent/utils/ai_tools/xpu/xpu_info.go @@ -10,6 +10,7 @@ type XpuInfo struct { type Xpu struct { Basic Basic `json:"basic"` Stats Stats `json:"stats"` + Config Config `json:"config"` Processes []Process `json:"processes"` } @@ -23,6 +24,11 @@ type Basic struct { PciBdfAddress string `json:"pciBdfAddress"` } +type Config struct { + PowerLimit int `json:"power_limit"` + PowerValidRange string `json:"power_vaild_range"` +} + type Stats struct { Power string `json:"power"` Frequency string `json:"frequency"` diff --git a/frontend/src/api/interface/host.ts b/frontend/src/api/interface/host.ts index ac47d7f6d..f6b6a8fa3 100644 --- a/frontend/src/api/interface/host.ts +++ b/frontend/src/api/interface/host.ts @@ -167,6 +167,7 @@ export namespace Host { endTime: Date; } export interface MonitorGPUData { + gpuType: string; productNames: Array; date: Array; gpuValue: Array; diff --git a/frontend/src/lang/modules/zh.ts b/frontend/src/lang/modules/zh.ts index 1b327b032..745a39e41 100644 --- a/frontend/src/lang/modules/zh.ts +++ b/frontend/src/lang/modules/zh.ts @@ -656,7 +656,7 @@ const message = { gpu: { gpu: 'GPU 监控', base: '基础信息', - gpuHelper: '当前系统未检测到 
NVIDIA-SMI或者XPU-SMI 指令,请检查后重试!', + gpuHelper: '当前系统未检测到 NVIDIA-SMI 或者 XPU-SMI 指令,请检查后重试!', driverVersion: '驱动版本', cudaVersion: 'CUDA 版本', process: '进程信息', diff --git a/frontend/src/views/ai/gpu/index.vue b/frontend/src/views/ai/gpu/index.vue index d4a9c1bc7..d7bf4ef54 100644 --- a/frontend/src/views/ai/gpu/index.vue +++ b/frontend/src/views/ai/gpu/index.vue @@ -30,9 +30,9 @@ - - - + + +