feat: Compatible with XPU monitoring (#11088)

This commit is contained in:
ssongliu 2025-11-26 22:30:15 +08:00 committed by GitHub
parent 18c65c3096
commit 8e03b24b0c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 108 additions and 52 deletions

View file

@ -44,6 +44,7 @@ type MonitorGPUSearch struct {
EndTime time.Time `json:"endTime"`
}
type MonitorGPUData struct {
GPUType string `json:"gpuType"`
ProductNames []string `json:"productNames"`
Date []time.Time `json:"date"`
GPUValue []float64 `json:"gpuValue"`
@ -58,8 +59,8 @@ type GPUPowerUsageHelper struct {
Percent float64 `json:"percent"`
}
type GPUMemoryUsageHelper struct {
Total int `json:"total"`
Used int `json:"used"`
Total float64 `json:"total"`
Used float64 `json:"used"`
Percent float64 `json:"percent"`
GPUProcesses []GPUProcess `json:"gpuProcesses"`

View file

@ -39,8 +39,8 @@ type MonitorGPU struct {
Temperature int `json:"temperature"`
PowerDraw float64 `json:"powerDraw"`
MaxPowerLimit float64 `json:"maxPowerLimit"`
MemUsed int `json:"memUsed"`
MemTotal int `json:"memTotal"`
MemUsed float64 `json:"memUsed"`
MemTotal float64 `json:"memTotal"`
FanSpeed int `json:"fanSpeed"`
Processes string `json:"processes"`
}

View file

@ -131,8 +131,10 @@ func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.Monit
}
if len(req.ProductName) == 0 {
if gpuExist {
data.GPUType = "gpu"
gpuInfo, err := gpuclient.LoadGpuInfo()
if err != nil || len(gpuInfo.GPUs) == 0 {
global.LOG.Error("Load GPU info failed or no GPU found, err: ", err)
return data, buserr.New("ErrRecordNotFound")
}
req.ProductName = gpuInfo.GPUs[0].ProductName
@ -140,8 +142,10 @@ func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.Monit
data.ProductNames = append(data.ProductNames, item.ProductName)
}
} else {
data.GPUType = "xpu"
xpuInfo, err := xpuClient.LoadGpuInfo()
if err != nil || len(xpuInfo.Xpu) == 0 {
global.LOG.Error("Load XPU info failed or no XPU found, err: ", err)
return data, buserr.New("ErrRecordNotFound")
}
req.ProductName = xpuInfo.Xpu[0].Basic.DeviceName
@ -159,15 +163,18 @@ func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.Monit
data.Date = append(data.Date, gpu.CreatedAt)
data.GPUValue = append(data.GPUValue, gpu.GPUUtil)
data.TemperatureValue = append(data.TemperatureValue, gpu.Temperature)
data.PowerValue = append(data.PowerValue, dto.GPUPowerUsageHelper{
powerItem := dto.GPUPowerUsageHelper{
Total: gpu.MaxPowerLimit,
Used: gpu.PowerDraw,
Percent: gpu.PowerDraw / gpu.MaxPowerLimit * 100,
})
}
// Percent is expressed on a 0-100 scale, like the memory usage percent
// computed below. It is only assigned when a power limit is known, which
// also avoids dividing by zero.
if powerItem.Total != 0 {
	powerItem.Percent = powerItem.Used / powerItem.Total * 100
}
data.PowerValue = append(data.PowerValue, powerItem)
memItem := dto.GPUMemoryUsageHelper{
Total: gpu.MemTotal,
Used: gpu.MemUsed,
Percent: float64(gpu.MemUsed) / float64(gpu.MemTotal) * 100,
Percent: gpu.MemUsed / gpu.MemTotal * 100,
}
var process []dto.GPUProcess
if err := json.Unmarshal([]byte(gpu.Processes), &process); err == nil {
@ -568,9 +575,8 @@ func saveGPUDataToDB() {
GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil),
Temperature: loadGPUInfoInt(gpuItem.Temperature),
PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw),
MaxPowerLimit: loadGPUInfoFloat(gpuItem.MaxPowerLimit),
MemUsed: loadGPUInfoInt(gpuItem.MemUsed),
MemTotal: loadGPUInfoInt(gpuItem.MemTotal),
MemUsed: loadGPUInfoFloat(gpuItem.MemUsed),
MemTotal: loadGPUInfoFloat(gpuItem.MemTotal),
FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed),
}
process, _ := json.Marshal(gpuItem.Processes)
@ -600,9 +606,11 @@ func saveXPUDataToDB() {
GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil),
Temperature: loadGPUInfoInt(xpuItem.Stats.Temperature),
PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power),
MemUsed: loadGPUInfoInt(xpuItem.Stats.MemoryUsed),
MemTotal: loadGPUInfoInt(xpuItem.Basic.Memory),
MaxPowerLimit: float64(xpuItem.Config.PowerLimit),
MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed),
MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory),
}
if len(xpuItem.Processes) != 0 {
var processItem []dto.GPUProcess
for _, ps := range xpuItem.Processes {
processItem = append(processItem, dto.GPUProcess{
@ -616,6 +624,7 @@ func saveXPUDataToDB() {
if len(process) != 0 {
item.Processes = string(process)
}
}
list = append(list, item)
}
if err := repo.NewIMonitorRepo().BatchCreateMonitorGPU(list); err != nil {
@ -633,6 +642,7 @@ func loadGPUInfoInt(val string) int {
}
func loadGPUInfoFloat(val string) float64 {
valItem := strings.ReplaceAll(val, "W", "")
valItem = strings.ReplaceAll(valItem, "MB", "")
valItem = strings.ReplaceAll(valItem, "%", "")
valItem = strings.TrimSpace(valItem)
data, _ := strconv.ParseFloat(valItem, 64)

View file

@ -721,7 +721,7 @@ var UpdateDatabase = &gormigrate.Migration{
}
var AddGPUMonitor = &gormigrate.Migration{
ID: "20251119-add-gpu-monitor",
ID: "20251127-add-gpu-monitor",
Migrate: func(tx *gorm.DB) error {
return global.GPUMonitorDB.AutoMigrate(&model.MonitorGPU{})
},

View file

@ -10,6 +10,7 @@ type XpuInfo struct {
type Xpu struct {
Basic Basic `json:"basic"`
Stats Stats `json:"stats"`
Config Config `json:"config"`
Processes []Process `json:"processes"`
}
@ -23,6 +24,11 @@ type Basic struct {
PciBdfAddress string `json:"pciBdfAddress"`
}
// Config is the "config" object decoded for a single XPU entry (see Xpu.Config).
type Config struct {
// PowerLimit is read from the "power_limit" key; saveXPUDataToDB stores it
// as the record's MaxPowerLimit. NOTE(review): unit is presumably watts — confirm.
PowerLimit int `json:"power_limit"`
// NOTE(review): the "power_vaild_range" key looks misspelled, but it presumably
// mirrors the key emitted by the upstream tool — confirm before renaming it.
PowerValidRange string `json:"power_vaild_range"`
}
type Stats struct {
Power string `json:"power"`
Frequency string `json:"frequency"`

View file

@ -167,6 +167,7 @@ export namespace Host {
endTime: Date;
}
export interface MonitorGPUData {
gpuType: string;
productNames: Array<string>;
date: Array<Date>;
gpuValue: Array<number>;

View file

@ -30,9 +30,9 @@
</div>
</el-card>
</div>
<el-row :gutter="7" class="card-interval" v-if="options.length !== 0">
<el-col :span="24">
<el-card style="overflow: inherit">
<el-row :gutter="7" v-if="options.length !== 0">
<el-col v-bind="gpuType === 'gpu' ? fullWidthProps : halfWidthProps">
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.gpuUtil') }}</span>
@ -51,7 +51,7 @@
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit">
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.memoryUsage') }}</span>
@ -70,7 +70,7 @@
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit">
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.powerUsage') }}</span>
@ -89,7 +89,7 @@
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit">
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div>
{{ $t('monitor.temperature') }}
@ -110,8 +110,8 @@
</div>
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit">
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12" v-if="gpuType === 'gpu'">
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.fanSpeed') }}</span>
@ -161,8 +161,12 @@ const mobile = computed(() => {
return globalStore.isMobile();
});
// Column props bound to the GPU utilization card: a full-width row when
// gpuType is 'gpu', otherwise half width from the md breakpoint upwards.
const fullWidthProps = { span: 24 };
const halfWidthProps = { xs: 24, sm: 24, md: 12, lg: 12, xl: 12 };
const loading = ref(false);
const options = ref([]);
const gpuType = ref('gpu');
const timeRangeGlobal = ref<[Date, Date]>([new Date(new Date().setHours(0, 0, 0, 0)), new Date()]);
const chartsOption = ref({
loadPowerChart: null,
@ -189,6 +193,7 @@ const search = async () => {
.then((res) => {
loading.value = false;
options.value = res.data.productNames || [];
gpuType.value = res.data.gpuType || 'gpu';
searchInfo.productName = searchInfo.productName || (options.value.length > 0 ? options.value[0] : '');
let baseDate = res.data.date.length === 0 ? loadEmptyDate(timeRangeGlobal.value) : res.data.date;
let date = baseDate.map(function (item: any) {
@ -196,7 +201,11 @@ const search = async () => {
});
initCPUCharts(date, res.data.gpuValue);
initMemoryCharts(date, res.data.memoryValue);
if (gpuType.value === 'gpu') {
initPowerCharts(date, res.data.powerValue);
} else {
initXpuPowerCharts(date, res.data.powerValue);
}
initSpeedCharts(date, res.data.speedValue);
initTemperatureCharts(date, res.data.temperatureValue);
})
@ -270,6 +279,33 @@ function initPowerCharts(baseDate: any, items: any) {
formatStr: '%',
};
}
/**
 * Builds the power chart option used when the monitored device is an XPU.
 *
 * Each point plots the absolute power draw (`item.used`, rounded to two
 * decimals) and is labelled in watts ('W'). The original sample is kept on
 * the point as `data`.
 *
 * @param baseDate x-axis labels, one per sample
 * @param items    power samples; only the numeric `used` field is read here
 */
function initXpuPowerCharts(baseDate: any, items: any) {
    // Tolerate a missing series so an absent payload renders the empty chart
    // instead of throwing on `.map`.
    let list = (items || []).map(function (item: any) {
        return { value: Number(Number(item.used || 0).toFixed(2)), data: item };
    });
    list = list.length === 0 ? loadEmptyData2() : list;
    chartsOption.value['loadPowerChart'] = {
        xData: baseDate,
        yData: [
            {
                name: i18n.global.t('monitor.powerUsage'),
                data: list,
            },
        ],
        tooltip: {
            trigger: 'axis',
            formatter: function (list: any) {
                let res = loadDate(list[0].name);
                for (const item of list) {
                    const point = item.data;
                    // Unwrap `{ value, data }` points by type, not truthiness:
                    // a 0 W reading must still show as 0, not as the wrapper object.
                    const shown = point !== null && typeof point === 'object' ? point.value : point;
                    res += loadSeries(item, shown, 'W');
                }
                return res;
            },
        },
        formatStr: 'W',
    };
}
function initTemperatureCharts(baseDate: any, items: any) {
let temperatures = items.map(function (item: any) {
return Number(item);
@ -334,14 +370,15 @@ function withMemoryProcess(list: any) {
if (!process) {
return res;
}
let title = gpuType.value === 'gpu' ? i18n.global.t('aiTools.gpu.type') : i18n.global.t('aiTools.gpu.shr');
res += `
<div style="margin-top: 10px; border-bottom: 1px dashed black;"></div>
<table style="border-collapse: collapse; margin-top: 20px; font-size: 12px;">
<thead>
<tr>
<th style="padding: 6px 8px;">PID</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.type')}</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.processName')}</th>
<th style="padding: 6px 8px;">${title}</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.processMemoryUsage')}</th>
</tr>
</thead>
@ -354,10 +391,10 @@ function withMemoryProcess(list: any) {
${row.pid}
</td>
<td style="padding: 6px 8px; text-align: center;">
${loadProcessType(row.type)}
${row.processName}
</td>
<td style="padding: 6px 8px; text-align: center;">
${row.processName}
${loadProcessType(row.type)}
</td>
<td style="padding: 6px 8px; text-align: center;">
${row.usedMemory}

View file

@ -69,8 +69,9 @@
<el-button v-if="!cpuShowAll" @click="cpuShowAll = true" icon="More" link size="small" />
<el-button v-if="cpuShowAll" @click="cpuShowAll = false" icon="ArrowUp" link size="small" />
</div>
<br />
<el-button link size="small" type="primary" class="mt-2 mb-2" @click="showTop = !showTop">
<el-button link size="small" type="primary" class="mt-1 mb-2" @click="showTop = !showTop">
{{ $t('home.cpuTop') }}
<el-icon v-if="!showTop"><ArrowRight /></el-icon>
<el-icon v-if="showTop"><ArrowDown /></el-icon>
@ -315,7 +316,7 @@
<span class="input-help" v-else>{{ item.deviceName }}</span>
</el-col>
</template>
<el-col :xs="6" :sm="6" :md="6" :lg="6" :xl="6" align="center" v-if="totalCount > 5">
<el-col :xs="6" :sm="6" :md="3" :lg="3" :xl="3" align="center" v-if="totalCount > 5">
<el-button v-if="!showMore" link type="primary" @click="changeShowMore(true)" class="buttonClass">
{{ $t('tabs.more') }}
<el-icon><Bottom /></el-icon>