feat: Compatible with XPU monitoring (#11088)

This commit is contained in:
ssongliu 2025-11-26 22:30:15 +08:00 committed by GitHub
parent 18c65c3096
commit 8e03b24b0c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 108 additions and 52 deletions

View file

@ -44,6 +44,7 @@ type MonitorGPUSearch struct {
EndTime time.Time `json:"endTime"` EndTime time.Time `json:"endTime"`
} }
type MonitorGPUData struct { type MonitorGPUData struct {
GPUType string `json:"gpuType"`
ProductNames []string `json:"productNames"` ProductNames []string `json:"productNames"`
Date []time.Time `json:"date"` Date []time.Time `json:"date"`
GPUValue []float64 `json:"gpuValue"` GPUValue []float64 `json:"gpuValue"`
@ -58,8 +59,8 @@ type GPUPowerUsageHelper struct {
Percent float64 `json:"percent"` Percent float64 `json:"percent"`
} }
type GPUMemoryUsageHelper struct { type GPUMemoryUsageHelper struct {
Total int `json:"total"` Total float64 `json:"total"`
Used int `json:"used"` Used float64 `json:"used"`
Percent float64 `json:"percent"` Percent float64 `json:"percent"`
GPUProcesses []GPUProcess `json:"gpuProcesses"` GPUProcesses []GPUProcess `json:"gpuProcesses"`

View file

@ -39,8 +39,8 @@ type MonitorGPU struct {
Temperature int `json:"temperature"` Temperature int `json:"temperature"`
PowerDraw float64 `json:"powerDraw"` PowerDraw float64 `json:"powerDraw"`
MaxPowerLimit float64 `json:"maxPowerLimit"` MaxPowerLimit float64 `json:"maxPowerLimit"`
MemUsed int `json:"memUsed"` MemUsed float64 `json:"memUsed"`
MemTotal int `json:"memTotal"` MemTotal float64 `json:"memTotal"`
FanSpeed int `json:"fanSpeed"` FanSpeed int `json:"fanSpeed"`
Processes string `json:"processes"` Processes string `json:"processes"`
} }

View file

@ -131,8 +131,10 @@ func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.Monit
} }
if len(req.ProductName) == 0 { if len(req.ProductName) == 0 {
if gpuExist { if gpuExist {
data.GPUType = "gpu"
gpuInfo, err := gpuclient.LoadGpuInfo() gpuInfo, err := gpuclient.LoadGpuInfo()
if err != nil || len(gpuInfo.GPUs) == 0 { if err != nil || len(gpuInfo.GPUs) == 0 {
global.LOG.Error("Load GPU info failed or no GPU found, err: ", err)
return data, buserr.New("ErrRecordNotFound") return data, buserr.New("ErrRecordNotFound")
} }
req.ProductName = gpuInfo.GPUs[0].ProductName req.ProductName = gpuInfo.GPUs[0].ProductName
@ -140,8 +142,10 @@ func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.Monit
data.ProductNames = append(data.ProductNames, item.ProductName) data.ProductNames = append(data.ProductNames, item.ProductName)
} }
} else { } else {
data.GPUType = "xpu"
xpuInfo, err := xpuClient.LoadGpuInfo() xpuInfo, err := xpuClient.LoadGpuInfo()
if err != nil || len(xpuInfo.Xpu) == 0 { if err != nil || len(xpuInfo.Xpu) == 0 {
global.LOG.Error("Load XPU info failed or no XPU found, err: ", err)
return data, buserr.New("ErrRecordNotFound") return data, buserr.New("ErrRecordNotFound")
} }
req.ProductName = xpuInfo.Xpu[0].Basic.DeviceName req.ProductName = xpuInfo.Xpu[0].Basic.DeviceName
@ -159,15 +163,18 @@ func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.Monit
data.Date = append(data.Date, gpu.CreatedAt) data.Date = append(data.Date, gpu.CreatedAt)
data.GPUValue = append(data.GPUValue, gpu.GPUUtil) data.GPUValue = append(data.GPUValue, gpu.GPUUtil)
data.TemperatureValue = append(data.TemperatureValue, gpu.Temperature) data.TemperatureValue = append(data.TemperatureValue, gpu.Temperature)
data.PowerValue = append(data.PowerValue, dto.GPUPowerUsageHelper{ powerItem := dto.GPUPowerUsageHelper{
Total: gpu.MaxPowerLimit, Total: gpu.MaxPowerLimit,
Used: gpu.PowerDraw, Used: gpu.PowerDraw,
Percent: gpu.PowerDraw / gpu.MaxPowerLimit * 100, }
}) if powerItem.Total != 0 {
powerItem.Percent = powerItem.Used / powerItem.Total
}
data.PowerValue = append(data.PowerValue, powerItem)
memItem := dto.GPUMemoryUsageHelper{ memItem := dto.GPUMemoryUsageHelper{
Total: gpu.MemTotal, Total: gpu.MemTotal,
Used: gpu.MemUsed, Used: gpu.MemUsed,
Percent: float64(gpu.MemUsed) / float64(gpu.MemTotal) * 100, Percent: gpu.MemUsed / gpu.MemTotal * 100,
} }
var process []dto.GPUProcess var process []dto.GPUProcess
if err := json.Unmarshal([]byte(gpu.Processes), &process); err == nil { if err := json.Unmarshal([]byte(gpu.Processes), &process); err == nil {
@ -564,14 +571,13 @@ func saveGPUDataToDB() {
var list []model.MonitorGPU var list []model.MonitorGPU
for _, gpuItem := range gpuInfo.GPUs { for _, gpuItem := range gpuInfo.GPUs {
item := model.MonitorGPU{ item := model.MonitorGPU{
ProductName: gpuItem.ProductName, ProductName: gpuItem.ProductName,
GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil), GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil),
Temperature: loadGPUInfoInt(gpuItem.Temperature), Temperature: loadGPUInfoInt(gpuItem.Temperature),
PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw), PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw),
MaxPowerLimit: loadGPUInfoFloat(gpuItem.MaxPowerLimit), MemUsed: loadGPUInfoFloat(gpuItem.MemUsed),
MemUsed: loadGPUInfoInt(gpuItem.MemUsed), MemTotal: loadGPUInfoFloat(gpuItem.MemTotal),
MemTotal: loadGPUInfoInt(gpuItem.MemTotal), FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed),
FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed),
} }
process, _ := json.Marshal(gpuItem.Processes) process, _ := json.Marshal(gpuItem.Processes)
if len(process) != 0 { if len(process) != 0 {
@ -596,25 +602,28 @@ func saveXPUDataToDB() {
var list []model.MonitorGPU var list []model.MonitorGPU
for _, xpuItem := range xpuInfo.Xpu { for _, xpuItem := range xpuInfo.Xpu {
item := model.MonitorGPU{ item := model.MonitorGPU{
ProductName: xpuItem.Basic.DeviceName, ProductName: xpuItem.Basic.DeviceName,
GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil), GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil),
Temperature: loadGPUInfoInt(xpuItem.Stats.Temperature), Temperature: loadGPUInfoInt(xpuItem.Stats.Temperature),
PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power), PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power),
MemUsed: loadGPUInfoInt(xpuItem.Stats.MemoryUsed), MaxPowerLimit: float64(xpuItem.Config.PowerLimit),
MemTotal: loadGPUInfoInt(xpuItem.Basic.Memory), MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed),
MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory),
} }
var processItem []dto.GPUProcess if len(xpuItem.Processes) != 0 {
for _, ps := range xpuItem.Processes { var processItem []dto.GPUProcess
processItem = append(processItem, dto.GPUProcess{ for _, ps := range xpuItem.Processes {
Pid: fmt.Sprintf("%v", ps.PID), processItem = append(processItem, dto.GPUProcess{
Type: ps.SHR, Pid: fmt.Sprintf("%v", ps.PID),
ProcessName: ps.Command, Type: ps.SHR,
UsedMemory: ps.Memory, ProcessName: ps.Command,
}) UsedMemory: ps.Memory,
} })
process, _ := json.Marshal(processItem) }
if len(process) != 0 { process, _ := json.Marshal(processItem)
item.Processes = string(process) if len(process) != 0 {
item.Processes = string(process)
}
} }
list = append(list, item) list = append(list, item)
} }
@ -633,6 +642,7 @@ func loadGPUInfoInt(val string) int {
} }
func loadGPUInfoFloat(val string) float64 { func loadGPUInfoFloat(val string) float64 {
valItem := strings.ReplaceAll(val, "W", "") valItem := strings.ReplaceAll(val, "W", "")
valItem = strings.ReplaceAll(valItem, "MB", "")
valItem = strings.ReplaceAll(valItem, "%", "") valItem = strings.ReplaceAll(valItem, "%", "")
valItem = strings.TrimSpace(valItem) valItem = strings.TrimSpace(valItem)
data, _ := strconv.ParseFloat(valItem, 64) data, _ := strconv.ParseFloat(valItem, 64)

View file

@ -721,7 +721,7 @@ var UpdateDatabase = &gormigrate.Migration{
} }
var AddGPUMonitor = &gormigrate.Migration{ var AddGPUMonitor = &gormigrate.Migration{
ID: "20251119-add-gpu-monitor", ID: "20251127-add-gpu-monitor",
Migrate: func(tx *gorm.DB) error { Migrate: func(tx *gorm.DB) error {
return global.GPUMonitorDB.AutoMigrate(&model.MonitorGPU{}) return global.GPUMonitorDB.AutoMigrate(&model.MonitorGPU{})
}, },

View file

@ -10,6 +10,7 @@ type XpuInfo struct {
type Xpu struct { type Xpu struct {
Basic Basic `json:"basic"` Basic Basic `json:"basic"`
Stats Stats `json:"stats"` Stats Stats `json:"stats"`
Config Config `json:"config"`
Processes []Process `json:"processes"` Processes []Process `json:"processes"`
} }
@ -23,6 +24,11 @@ type Basic struct {
PciBdfAddress string `json:"pciBdfAddress"` PciBdfAddress string `json:"pciBdfAddress"`
} }
// Config holds the values decoded from the "config" object of a single XPU
// device entry.
type Config struct {
	// PowerLimit is decoded from the "power_limit" key.
	// NOTE(review): presumably expressed in watts — confirm against the tool output.
	PowerLimit int `json:"power_limit"`
	// PowerValidRange is decoded from the "power_vaild_range" key (sic).
	// NOTE(review): the misspelled key presumably mirrors what the XPU tool
	// emits — confirm before "correcting" it, since changing the tag changes
	// which JSON key is read.
	PowerValidRange string `json:"power_vaild_range"`
}
type Stats struct { type Stats struct {
Power string `json:"power"` Power string `json:"power"`
Frequency string `json:"frequency"` Frequency string `json:"frequency"`

View file

@ -167,6 +167,7 @@ export namespace Host {
endTime: Date; endTime: Date;
} }
export interface MonitorGPUData { export interface MonitorGPUData {
gpuType: string;
productNames: Array<string>; productNames: Array<string>;
date: Array<Date>; date: Array<Date>;
gpuValue: Array<number>; gpuValue: Array<number>;

View file

@ -656,7 +656,7 @@ const message = {
gpu: { gpu: {
gpu: 'GPU 监控', gpu: 'GPU 监控',
base: '基础信息', base: '基础信息',
gpuHelper: '当前系统未检测到 NVIDIA-SMI或者XPU-SMI 指令请检查后重试', gpuHelper: '当前系统未检测到 NVIDIA-SMI 或者 XPU-SMI 指令请检查后重试',
driverVersion: '驱动版本', driverVersion: '驱动版本',
cudaVersion: 'CUDA 版本', cudaVersion: 'CUDA 版本',
process: '进程信息', process: '进程信息',

View file

@ -30,9 +30,9 @@
</div> </div>
</el-card> </el-card>
</div> </div>
<el-row :gutter="7" class="card-interval" v-if="options.length !== 0"> <el-row :gutter="7" v-if="options.length !== 0">
<el-col :span="24"> <el-col v-bind="gpuType === 'gpu' ? fullWidthProps : halfWidthProps">
<el-card style="overflow: inherit"> <el-card class="card-interval" style="overflow: inherit">
<template #header> <template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'"> <div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.gpuUtil') }}</span> <span class="title">{{ $t('monitor.gpuUtil') }}</span>
@ -51,7 +51,7 @@
</el-card> </el-card>
</el-col> </el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12"> <el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit"> <el-card class="card-interval" style="overflow: inherit">
<template #header> <template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'"> <div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.memoryUsage') }}</span> <span class="title">{{ $t('monitor.memoryUsage') }}</span>
@ -70,7 +70,7 @@
</el-card> </el-card>
</el-col> </el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12"> <el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit"> <el-card class="card-interval" style="overflow: inherit">
<template #header> <template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'"> <div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.powerUsage') }}</span> <span class="title">{{ $t('monitor.powerUsage') }}</span>
@ -89,7 +89,7 @@
</el-card> </el-card>
</el-col> </el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12"> <el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit"> <el-card class="card-interval" style="overflow: inherit">
<template #header> <template #header>
<div> <div>
{{ $t('monitor.temperature') }} {{ $t('monitor.temperature') }}
@ -110,8 +110,8 @@
</div> </div>
</el-card> </el-card>
</el-col> </el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12"> <el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12" v-if="gpuType === 'gpu'">
<el-card style="overflow: inherit"> <el-card class="card-interval" style="overflow: inherit">
<template #header> <template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'"> <div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.fanSpeed') }}</span> <span class="title">{{ $t('monitor.fanSpeed') }}</span>
@ -161,8 +161,12 @@ const mobile = computed(() => {
return globalStore.isMobile(); return globalStore.isMobile();
}); });
const fullWidthProps = { span: 24 };
const halfWidthProps = { xs: 24, sm: 24, md: 12, lg: 12, xl: 12 };
const loading = ref(false); const loading = ref(false);
const options = ref([]); const options = ref([]);
const gpuType = ref('gpu');
const timeRangeGlobal = ref<[Date, Date]>([new Date(new Date().setHours(0, 0, 0, 0)), new Date()]); const timeRangeGlobal = ref<[Date, Date]>([new Date(new Date().setHours(0, 0, 0, 0)), new Date()]);
const chartsOption = ref({ const chartsOption = ref({
loadPowerChart: null, loadPowerChart: null,
@ -189,6 +193,7 @@ const search = async () => {
.then((res) => { .then((res) => {
loading.value = false; loading.value = false;
options.value = res.data.productNames || []; options.value = res.data.productNames || [];
gpuType.value = res.data.gpuType || 'gpu';
searchInfo.productName = searchInfo.productName || (options.value.length > 0 ? options.value[0] : ''); searchInfo.productName = searchInfo.productName || (options.value.length > 0 ? options.value[0] : '');
let baseDate = res.data.date.length === 0 ? loadEmptyDate(timeRangeGlobal.value) : res.data.date; let baseDate = res.data.date.length === 0 ? loadEmptyDate(timeRangeGlobal.value) : res.data.date;
let date = baseDate.map(function (item: any) { let date = baseDate.map(function (item: any) {
@ -196,7 +201,11 @@ const search = async () => {
}); });
initCPUCharts(date, res.data.gpuValue); initCPUCharts(date, res.data.gpuValue);
initMemoryCharts(date, res.data.memoryValue); initMemoryCharts(date, res.data.memoryValue);
initPowerCharts(date, res.data.powerValue); if (gpuType.value === 'gpu') {
initPowerCharts(date, res.data.powerValue);
} else {
initXpuPowerCharts(date, res.data.powerValue);
}
initSpeedCharts(date, res.data.speedValue); initSpeedCharts(date, res.data.speedValue);
initTemperatureCharts(date, res.data.temperatureValue); initTemperatureCharts(date, res.data.temperatureValue);
}) })
@ -270,6 +279,33 @@ function initPowerCharts(baseDate: any, items: any) {
formatStr: '%', formatStr: '%',
}; };
} }
/**
 * Builds the option object for the power chart when the device is an XPU and
 * stores it in `chartsOption.value.loadPowerChart`.
 *
 * Each sample is plotted by its `used` value (rounded to two decimals) and
 * labelled with the `W` suffix; the original sample is kept on the point as
 * `data`.
 *
 * @param baseDate x-axis labels, one per sample.
 * @param items    power samples; each one must expose a numeric `used` field.
 */
function initXpuPowerCharts(baseDate: any, items: any) {
    let list = items.map(function (item: any) {
        return { value: Number(item.used.toFixed(2)), data: item };
    });
    // With no samples, fall back to the shared placeholder series.
    list = list.length === 0 ? loadEmptyData2() : list;
    chartsOption.value['loadPowerChart'] = {
        xData: baseDate,
        yData: [
            {
                name: i18n.global.t('monitor.powerUsage'),
                data: list,
            },
        ],
        tooltip: {
            trigger: 'axis',
            formatter: function (list: any) {
                let res = loadDate(list[0].name);
                for (const item of list) {
                    // A point is either a `{ value, data }` object (real sample) or a
                    // bare value (placeholder). Decide by shape, not truthiness: a
                    // reading of exactly 0 is valid and must not fall through to the
                    // whole point object.
                    const isPoint = typeof item.data === 'object' && item.data !== null;
                    res += loadSeries(item, isPoint ? item.data.value : item.data, 'W');
                }
                return res;
            },
        },
        formatStr: 'W',
    };
}
function initTemperatureCharts(baseDate: any, items: any) { function initTemperatureCharts(baseDate: any, items: any) {
let temperatures = items.map(function (item: any) { let temperatures = items.map(function (item: any) {
return Number(item); return Number(item);
@ -334,14 +370,15 @@ function withMemoryProcess(list: any) {
if (!process) { if (!process) {
return res; return res;
} }
let title = gpuType.value === 'gpu' ? i18n.global.t('aiTools.gpu.type') : i18n.global.t('aiTools.gpu.shr');
res += ` res += `
<div style="margin-top: 10px; border-bottom: 1px dashed black;"></div> <div style="margin-top: 10px; border-bottom: 1px dashed black;"></div>
<table style="border-collapse: collapse; margin-top: 20px; font-size: 12px;"> <table style="border-collapse: collapse; margin-top: 20px; font-size: 12px;">
<thead> <thead>
<tr> <tr>
<th style="padding: 6px 8px;">PID</th> <th style="padding: 6px 8px;">PID</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.type')}</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.processName')}</th> <th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.processName')}</th>
<th style="padding: 6px 8px;">${title}</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.processMemoryUsage')}</th> <th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.processMemoryUsage')}</th>
</tr> </tr>
</thead> </thead>
@ -354,10 +391,10 @@ function withMemoryProcess(list: any) {
${row.pid} ${row.pid}
</td> </td>
<td style="padding: 6px 8px; text-align: center;"> <td style="padding: 6px 8px; text-align: center;">
${loadProcessType(row.type)} ${row.processName}
</td> </td>
<td style="padding: 6px 8px; text-align: center;"> <td style="padding: 6px 8px; text-align: center;">
${row.processName} ${loadProcessType(row.type)}
</td> </td>
<td style="padding: 6px 8px; text-align: center;"> <td style="padding: 6px 8px; text-align: center;">
${row.usedMemory} ${row.usedMemory}

View file

@ -69,8 +69,9 @@
<el-button v-if="!cpuShowAll" @click="cpuShowAll = true" icon="More" link size="small" /> <el-button v-if="!cpuShowAll" @click="cpuShowAll = true" icon="More" link size="small" />
<el-button v-if="cpuShowAll" @click="cpuShowAll = false" icon="ArrowUp" link size="small" /> <el-button v-if="cpuShowAll" @click="cpuShowAll = false" icon="ArrowUp" link size="small" />
</div> </div>
<br />
<el-button link size="small" type="primary" class="mt-2 mb-2" @click="showTop = !showTop"> <el-button link size="small" type="primary" class="mt-1 mb-2" @click="showTop = !showTop">
{{ $t('home.cpuTop') }} {{ $t('home.cpuTop') }}
<el-icon v-if="!showTop"><ArrowRight /></el-icon> <el-icon v-if="!showTop"><ArrowRight /></el-icon>
<el-icon v-if="showTop"><ArrowDown /></el-icon> <el-icon v-if="showTop"><ArrowDown /></el-icon>
@ -315,7 +316,7 @@
<span class="input-help" v-else>{{ item.deviceName }}</span> <span class="input-help" v-else>{{ item.deviceName }}</span>
</el-col> </el-col>
</template> </template>
<el-col :xs="6" :sm="6" :md="6" :lg="6" :xl="6" align="center" v-if="totalCount > 5"> <el-col :xs="6" :sm="6" :md="3" :lg="3" :xl="3" align="center" v-if="totalCount > 5">
<el-button v-if="!showMore" link type="primary" @click="changeShowMore(true)" class="buttonClass"> <el-button v-if="!showMore" link type="primary" @click="changeShowMore(true)" class="buttonClass">
{{ $t('tabs.more') }} {{ $t('tabs.more') }}
<el-icon><Bottom /></el-icon> <el-icon><Bottom /></el-icon>