feat: Compatible with XPU monitoring (#11099)

This commit is contained in:
ssongliu 2025-11-27 13:57:24 +08:00 committed by GitHub
parent 74b48a18be
commit 3d2023858c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 137 additions and 97 deletions

View file

@ -126,3 +126,7 @@ func (b *BaseApi) GetIOOptions(c *gin.Context) {
sort.Strings(options)
helper.SuccessWithData(c, options)
}
// GetCPUOptions writes the GPU/XPU option list produced by the monitor
// service into a success response.
//
// NOTE(review): despite the "CPU" in its name, this handler serves GPU
// options (it calls LoadGPUOptions); the name is kept so existing route
// registrations keep compiling.
func (b *BaseApi) GetCPUOptions(c *gin.Context) {
	gpuOptions := monitorService.LoadGPUOptions()
	helper.SuccessWithData(c, gpuOptions)
}

View file

@ -38,17 +38,19 @@ type MonitorSettingUpdate struct {
Value string `json:"value"`
}
// MonitorGPUOptions is the payload listing the accelerator devices that can
// be selected for monitoring.
type MonitorGPUOptions struct {
// GPUType names the accelerator family the options belong to.
GPUType string `json:"gpuType"`
// Options holds one selectable device label per entry.
Options []string `json:"options"`
}
// MonitorGPUSearch describes a query for stored GPU monitoring records.
type MonitorGPUSearch struct {
// ProductName selects the device whose records are requested.
ProductName string `json:"productName"`
// StartTime and EndTime bound the time window of the query.
StartTime time.Time `json:"startTime"`
EndTime time.Time `json:"endTime"`
}
type MonitorGPUData struct {
GPUType string `json:"gpuType"`
ProductNames []string `json:"productNames"`
Date []time.Time `json:"date"`
GPUValue []float64 `json:"gpuValue"`
TemperatureValue []int `json:"temperatureValue"`
TemperatureValue []float64 `json:"temperatureValue"`
PowerValue []GPUPowerUsageHelper `json:"powerValue"`
MemoryValue []GPUMemoryUsageHelper `json:"memoryValue"`
SpeedValue []int `json:"speedValue"`

View file

@ -36,7 +36,7 @@ type MonitorGPU struct {
BaseModel
ProductName string `json:"productName"`
GPUUtil float64 `json:"gpuUtil"`
Temperature int `json:"temperature"`
Temperature float64 `json:"temperature"`
PowerDraw float64 `json:"powerDraw"`
MaxPowerLimit float64 `json:"maxPowerLimit"`
MemUsed float64 `json:"memUsed"`

View file

@ -39,6 +39,7 @@ var monitorCancel context.CancelFunc
type IMonitorService interface {
Run()
LoadMonitorData(req dto.MonitorSearch) ([]dto.MonitorData, error)
LoadGPUOptions() dto.MonitorGPUOptions
LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error)
LoadSetting() (*dto.MonitorSetting, error)
UpdateSetting(key, value string) error
@ -118,42 +119,43 @@ func (m *MonitorService) LoadMonitorData(req dto.MonitorSearch) ([]dto.MonitorDa
return data, nil
}
// LoadGPUOptions reports which accelerator family is available and the list
// of selectable devices for it.
//
// A GPU client takes precedence over an XPU client when both are present.
// For GPUs the devices are ordered by index and rendered as
// "<index> - <product name>"; for XPUs the labels come straight from
// LoadDeviceList. Lookup failures are logged and yield a result with the
// type set but no (or nil) options; when neither client exists the zero
// value is returned.
func (m *MonitorService) LoadGPUOptions() dto.MonitorGPUOptions {
	var result dto.MonitorGPUOptions
	// Both probes are performed up front, before any branch is taken.
	hasGPU, gpuClient := gpu.New()
	hasXPU, xpuClient := xpu.New()

	switch {
	case hasGPU:
		result.GPUType = "gpu"
		info, err := gpuClient.LoadGpuInfo()
		if err != nil || len(info.GPUs) == 0 {
			global.LOG.Error("Load GPU info failed or no GPU found, err: ", err)
			break
		}
		sort.Slice(info.GPUs, func(a, b int) bool {
			return info.GPUs[a].Index < info.GPUs[b].Index
		})
		for _, card := range info.GPUs {
			label := fmt.Sprintf("%d - %s", card.Index, card.ProductName)
			result.Options = append(result.Options, label)
		}
	case hasXPU:
		result.GPUType = "xpu"
		names, err := xpuClient.LoadDeviceList()
		// The list is stored even on failure, exactly as returned.
		result.Options = names
		if err != nil || len(names) == 0 {
			global.LOG.Error("Load XPU info failed or no XPU found, err: ", err)
		}
	}
	return result
}
func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error) {
loc, _ := time.LoadLocation(common.LoadTimeZoneByCmd())
req.StartTime = req.StartTime.In(loc)
req.EndTime = req.EndTime.In(loc)
var data dto.MonitorGPUData
gpuExist, gpuclient := gpu.New()
xpuExist, xpuClient := xpu.New()
if !gpuExist && !xpuExist {
return data, nil
}
if len(req.ProductName) == 0 {
if gpuExist {
data.GPUType = "gpu"
gpuInfo, err := gpuclient.LoadGpuInfo()
if err != nil || len(gpuInfo.GPUs) == 0 {
global.LOG.Error("Load GPU info failed or no GPU found, err: ", err)
return data, buserr.New("ErrRecordNotFound")
}
req.ProductName = gpuInfo.GPUs[0].ProductName
for _, item := range gpuInfo.GPUs {
data.ProductNames = append(data.ProductNames, item.ProductName)
}
} else {
data.GPUType = "xpu"
xpuInfo, err := xpuClient.LoadGpuInfo()
if err != nil || len(xpuInfo.Xpu) == 0 {
global.LOG.Error("Load XPU info failed or no XPU found, err: ", err)
return data, buserr.New("ErrRecordNotFound")
}
req.ProductName = xpuInfo.Xpu[0].Basic.DeviceName
for _, item := range xpuInfo.Xpu {
data.ProductNames = append(data.ProductNames, item.Basic.DeviceName)
}
}
}
gpuList, err := monitorRepo.GetGPU(repo.WithByCreatedAt(req.StartTime, req.EndTime), monitorRepo.WithByProductName(req.ProductName))
if err != nil {
return data, err
@ -571,13 +573,14 @@ func saveGPUDataToDB() {
var list []model.MonitorGPU
for _, gpuItem := range gpuInfo.GPUs {
item := model.MonitorGPU{
ProductName: gpuItem.ProductName,
GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil),
Temperature: loadGPUInfoInt(gpuItem.Temperature),
PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw),
MemUsed: loadGPUInfoFloat(gpuItem.MemUsed),
MemTotal: loadGPUInfoFloat(gpuItem.MemTotal),
FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed),
ProductName: fmt.Sprintf("%d - %s", gpuItem.Index, gpuItem.ProductName),
GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil),
Temperature: loadGPUInfoFloat(gpuItem.Temperature),
PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw),
MaxPowerLimit: loadGPUInfoFloat(gpuItem.MaxPowerLimit),
MemUsed: loadGPUInfoFloat(gpuItem.MemUsed),
MemTotal: loadGPUInfoFloat(gpuItem.MemTotal),
FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed),
}
process, _ := json.Marshal(gpuItem.Processes)
if len(process) != 0 {
@ -602,13 +605,12 @@ func saveXPUDataToDB() {
var list []model.MonitorGPU
for _, xpuItem := range xpuInfo.Xpu {
item := model.MonitorGPU{
ProductName: xpuItem.Basic.DeviceName,
GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil),
Temperature: loadGPUInfoInt(xpuItem.Stats.Temperature),
PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power),
MaxPowerLimit: float64(xpuItem.Config.PowerLimit),
MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed),
MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory),
ProductName: fmt.Sprintf("%d - %s", xpuItem.Basic.DeviceID, xpuItem.Basic.DeviceName),
GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil),
Temperature: loadGPUInfoFloat(xpuItem.Stats.Temperature),
PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power),
MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed),
MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory),
}
if len(xpuItem.Processes) != 0 {
var processItem []dto.GPUProcess
@ -643,6 +645,9 @@ func loadGPUInfoInt(val string) int {
func loadGPUInfoFloat(val string) float64 {
valItem := strings.ReplaceAll(val, "W", "")
valItem = strings.ReplaceAll(valItem, "MB", "")
valItem = strings.ReplaceAll(valItem, "MiB", "")
valItem = strings.ReplaceAll(valItem, "C", "")
valItem = strings.ReplaceAll(valItem, "°C", "")
valItem = strings.ReplaceAll(valItem, "%", "")
valItem = strings.TrimSpace(valItem)
data, _ := strconv.ParseFloat(valItem, 64)

View file

@ -721,7 +721,7 @@ var UpdateDatabase = &gormigrate.Migration{
}
var AddGPUMonitor = &gormigrate.Migration{
ID: "20251127-add-gpu-monitor",
ID: "20251122-add-gpu-monitor",
Migrate: func(tx *gorm.DB) error {
return global.GPUMonitorDB.AutoMigrate(&model.MonitorGPU{})
},

View file

@ -31,6 +31,7 @@ func (s *HostRouter) InitRouter(Router *gin.RouterGroup) {
hostRouter.POST("/monitor/search", baseApi.LoadMonitor)
hostRouter.POST("/monitor/gpu/search", baseApi.LoadGPUMonitor)
hostRouter.POST("/monitor/clean", baseApi.CleanMonitor)
hostRouter.GET("/monitor/gpuoptions", baseApi.GetCPUOptions)
hostRouter.GET("/monitor/netoptions", baseApi.GetNetworkOptions)
hostRouter.GET("/monitor/iooptions", baseApi.GetIOOptions)
hostRouter.GET("/monitor/setting", baseApi.LoadMonitorSetting)

View file

@ -120,6 +120,23 @@ func (x XpuSMI) LoadDashData() ([]XPUSimpleInfo, error) {
return res, nil
}
// LoadDeviceList runs `xpu-smi discovery -j` (5 second timeout) and returns
// one "<device id> - <device name>" label per discovered device.
//
// It returns a nil slice and a nil error when the discovery output lists no
// devices. Command and JSON decoding failures are returned wrapped with %w so
// callers can inspect the underlying cause with errors.Is / errors.As.
func (x XpuSMI) LoadDeviceList() ([]string, error) {
	cmdMgr := cmd.NewCommandMgr(cmd.WithTimeout(5 * time.Second))
	data, err := cmdMgr.RunWithStdoutBashC("xpu-smi discovery -j")
	if err != nil {
		// Wrap (not just format) the error, consistent with the unmarshal path below.
		return nil, fmt.Errorf("calling xpu-smi failed, %w", err)
	}

	var deviceInfo DeviceInfo
	if err := json.Unmarshal([]byte(data), &deviceInfo); err != nil {
		return nil, fmt.Errorf("deviceInfo json unmarshal failed, err: %w", err)
	}

	// Kept as a nil slice when there are no devices so the result is unchanged
	// for callers (and JSON encoding) in the empty case.
	var deviceNames []string
	for _, device := range deviceInfo.DeviceList {
		deviceNames = append(deviceNames, fmt.Sprintf("%d - %s", device.DeviceID, device.DeviceName))
	}
	return deviceNames, nil
}
func (x XpuSMI) LoadGpuInfo() (*XpuInfo, error) {
cmdMgr := cmd.NewCommandMgr(cmd.WithTimeout(5 * time.Second))
data, err := cmdMgr.RunWithStdoutBashC("xpu-smi discovery -j")

View file

@ -10,7 +10,6 @@ type XpuInfo struct {
type Xpu struct {
Basic Basic `json:"basic"`
Stats Stats `json:"stats"`
Config Config `json:"config"`
Processes []Process `json:"processes"`
}
@ -24,11 +23,6 @@ type Basic struct {
PciBdfAddress string `json:"pciBdfAddress"`
}
type Config struct {
PowerLimit int `json:"power_limit"`
PowerValidRange string `json:"power_vaild_range"`
}
type Stats struct {
Power string `json:"power"`
Frequency string `json:"frequency"`

View file

@ -166,9 +166,11 @@ export namespace Host {
startTime: Date;
endTime: Date;
}
export interface MonitorGPUData {
export interface MonitorGPUOptions {
gpuType: string;
productNames: Array<string>;
options: Array<string>;
}
export interface MonitorGPUData {
date: Array<Date>;
gpuValue: Array<number>;
temperatureValue: Array<number>;

View file

@ -65,6 +65,9 @@ export const operateFilterChain = (name: string, op: string) => {
/**
 * Requests host monitoring data for the given search parameters.
 *
 * @param param search criteria sent as the POST body
 * @returns the pending HTTP request resolving to a list of monitor data
 */
export const loadMonitor = (param: Host.MonitorSearch) =>
    http.post<Array<Host.MonitorData>>(`/hosts/monitor/search`, param);
/**
 * Fetches the accelerator type and the selectable device options.
 *
 * @returns the pending HTTP request resolving to the GPU option payload
 */
export const getGPUOptions = () => http.get<Host.MonitorGPUOptions>(`/hosts/monitor/gpuoptions`);
/**
 * Requests GPU monitoring data for the given search parameters.
 *
 * @param param search criteria sent as the POST body
 * @returns the pending HTTP request resolving to the GPU monitor data
 */
export const loadGPUMonitor = (param: Host.MonitorGPUSearch) =>
    http.post<Host.MonitorGPUData>(`/hosts/monitor/gpu/search`, param);

View file

@ -32,25 +32,6 @@
</div>
<el-row :gutter="7" v-if="options.length !== 0">
<el-col v-bind="gpuType === 'gpu' ? fullWidthProps : halfWidthProps">
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.gpuUtil') }}</span>
</div>
</template>
<div class="chart">
<v-charts
height="400px"
id="loadGPUChart"
type="line"
:option="chartsOption['loadGPUChart']"
v-if="chartsOption['loadGPUChart']"
:dataZoom="true"
/>
</div>
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
@ -69,6 +50,25 @@
</div>
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.gpuUtil') }}</span>
</div>
</template>
<div class="chart">
<v-charts
height="400px"
id="loadGPUChart"
type="line"
:option="chartsOption['loadGPUChart']"
v-if="chartsOption['loadGPUChart']"
:dataZoom="true"
/>
</div>
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card class="card-interval" style="overflow: inherit">
<template #header>
@ -148,7 +148,7 @@
<script setup lang="ts">
import { ref, reactive, onMounted, computed } from 'vue';
import { loadGPUMonitor } from '@/api/modules/host';
import { loadGPUMonitor, getGPUOptions } from '@/api/modules/host';
import { dateFormatWithoutYear } from '@/utils/util';
import { GlobalStore } from '@/store';
import { shortcuts } from '@/utils/shortcuts';
@ -183,6 +183,21 @@ const searchInfo = reactive<Host.MonitorGPUSearch>({
endTime: new Date(),
});
/**
 * Loads the accelerator type and device options, selects the first option
 * (or an empty name when there is none) and then kicks off a search.
 *
 * The search is started without being awaited, so its outcome does not
 * affect the error handling here. If fetching or applying the options
 * fails, the loading flag is cleared and the option list is emptied.
 */
const loadOptions = async () => {
    loading.value = true;
    try {
        const { data: payload } = await getGPUOptions();
        gpuType.value = payload.gpuType || 'gpu';
        options.value = payload.options || [];
        const [firstOption] = options.value;
        searchInfo.productName = options.value.length > 0 ? firstOption : '';
        // Intentionally not awaited: the search manages its own completion.
        search();
    } catch {
        loading.value = false;
        options.value = [];
    }
};
const search = async () => {
if (searchTime.value && searchTime.value.length === 2) {
searchInfo.startTime = searchTime.value[0];
@ -192,9 +207,6 @@ const search = async () => {
await loadGPUMonitor(searchInfo)
.then((res) => {
loading.value = false;
options.value = res.data.productNames || [];
gpuType.value = res.data.gpuType || 'gpu';
searchInfo.productName = searchInfo.productName || (options.value.length > 0 ? options.value[0] : '');
let baseDate = res.data.date.length === 0 ? loadEmptyDate(timeRangeGlobal.value) : res.data.date;
let date = baseDate.map(function (item: any) {
return dateFormatWithoutYear(item);
@ -270,7 +282,7 @@ function initPowerCharts(baseDate: any, items: any) {
formatter: function (list: any) {
let res = loadDate(list[0].name);
for (const item of list) {
res += loadSeries(item, item.data.value ? item.data.value : item.data, '%');
res += loadSeries(item, item.data.value, '%');
res += `( ${item.data?.data.used} W / ${item.data?.data.total} W)<br/>`;
}
return res;
@ -298,7 +310,7 @@ function initXpuPowerCharts(baseDate: any, items: any) {
formatter: function (list: any) {
let res = loadDate(list[0].name);
for (const item of list) {
res += loadSeries(item, item.data.value ? item.data.value : item.data, 'W');
res += loadSeries(item, item.data.value, 'W');
}
return res;
},
@ -364,7 +376,7 @@ function withMemoryProcess(list: any) {
if (item.data?.data) {
process = item.data?.data.gpuProcesses || [];
}
res += loadSeries(item, item.data.value ? item.data.value : item.data, '%');
res += loadSeries(item, item.data.value, '%');
res += `( ${item.data?.data.used} MiB / ${item.data?.data.total} MiB)<br/>`;
}
if (!process) {
@ -425,7 +437,7 @@ const loadProcessType = (val: string) => {
};
onMounted(() => {
search();
loadOptions();
});
</script>

View file

@ -20,7 +20,7 @@
</template>
<template #main>
<el-row v-if="data.length > 0" :gutter="20" class="row-box">
<el-col :span="7">
<el-col :xs="24" :sm="24" :md="8" :lg="8" :xl="6">
<el-card>
<el-table
:max-height="loadTableHeight()"
@ -112,7 +112,7 @@
</el-table>
</el-card>
</el-col>
<el-col :span="17">
<el-col :xs="24" :sm="24" :md="16" :lg="16" :xl="18">
<el-card v-if="currentCompose" v-loading="detailLoading">
<el-table
v-if="composeContainers.length > 0"
@ -120,24 +120,24 @@
size="small"
max-height="250"
>
<el-table-column :label="$t('commons.table.name')" prop="name" show-overflow-tooltip>
<el-table-column
:label="$t('commons.table.name')"
prop="name"
show-overflow-tooltip
fixed="left"
>
<template #default="{ row }">
<el-text type="primary" class="cursor-pointer" @click="onInspectContainer(row)">
{{ row.name }}
</el-text>
</template>
</el-table-column>
<el-table-column :label="$t('commons.table.status')" prop="state" width="150">
<el-table-column :label="$t('commons.table.status')" prop="state">
<template #default="{ row }">
<Status :key="row.state" :status="row.state"></Status>
</template>
</el-table-column>
<el-table-column
:label="$t('container.source')"
show-overflow-tooltip
prop="resource"
min-width="150"
>
<el-table-column :label="$t('container.source')" show-overflow-tooltip prop="resource">
<template #default="{ row }">
<div v-if="row.hasLoad">
<div class="source-font">CPU: {{ row.cpuPercent.toFixed(2) }}%</div>
@ -202,7 +202,7 @@
</div>
</template>
</el-table-column>
<el-table-column :label="$t('commons.table.operate')" width="180" fixed="right">
<el-table-column :label="$t('commons.table.operate')">
<template #default="{ row }">
<el-button type="primary" link @click="onOpenTerminal(row)">
{{ $t('menu.terminal') }}

View file

@ -54,7 +54,7 @@
</div>
<NoSuchService v-else name="Firewalld / Ufw / iptables" />
<LayoutContent :divider="true" v-if="!baseInfo.isInit">
<LayoutContent :divider="true" v-if="baseInfo.isExist && !baseInfo.isInit">
<template #main>
<div class="app-warn">
<div class="flex flex-col gap-2 items-center justify-center w-full sm:flex-row">