feat: GPU monitoring data supports persistence (#11051)

Refs #9496
This commit is contained in:
ssongliu 2025-11-24 15:59:15 +08:00 committed by GitHub
parent 3f47a6e701
commit 63f9368e26
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 654 additions and 342 deletions

View file

@ -31,6 +31,27 @@ func (b *BaseApi) LoadMonitor(c *gin.Context) {
helper.SuccessWithData(c, data)
}
// @Tags Monitor
// @Summary Load GPU monitor data
// @Description Returns the stored GPU/XPU monitoring samples that match the search request
// @Param request body dto.MonitorGPUSearch true "request"
// @Success 200 {object} dto.MonitorGPUData
// @Security ApiKeyAuth
// @Security Timestamp
// @Router /hosts/monitor/gpu/search [post]
func (b *BaseApi) LoadGPUMonitor(c *gin.Context) {
	var req dto.MonitorGPUSearch
	// NOTE(review): presumably the helper has already reported the bind or
	// validation failure to the client, as no response is written here — confirm.
	if err := helper.CheckBindAndValidate(&req, c); err != nil {
		return
	}

	data, err := monitorService.LoadGPUMonitorData(req)
	if err != nil {
		helper.InternalServer(c, err)
		return
	}
	helper.SuccessWithData(c, data)
}
// @Tags Monitor
// @Summary Clean monitor data
// @Success 200

View file

@ -11,7 +11,7 @@ type MonitorSearch struct {
}
type MonitorData struct {
Param string `json:"param" validate:"required,oneof=cpu memory load io network"`
Param string `json:"param"`
Date []time.Time `json:"date"`
Value []interface{} `json:"value"`
}
@ -37,3 +37,36 @@ type MonitorSettingUpdate struct {
Key string `json:"key" validate:"required,oneof=MonitorStatus MonitorStoreDays MonitorInterval DefaultNetwork DefaultIO"`
Value string `json:"value"`
}
// MonitorGPUSearch is the request body used to query stored GPU monitoring
// samples.
type MonitorGPUSearch struct {
// ProductName selects the device whose samples are returned. When it is
// empty the monitor service falls back to the first detected device.
ProductName string `json:"productName"`
// StartTime is the start of the queried time window.
StartTime time.Time `json:"startTime"`
// EndTime is the end of the queried time window.
EndTime time.Time `json:"endTime"`
}
// MonitorGPUData is the response payload for a GPU monitoring query. The
// per-sample slices (Date, GPUValue, TemperatureValue, PowerValue,
// MemoryValue, SpeedValue) are filled one element per stored row, so index i
// in each of them describes the same sample.
type MonitorGPUData struct {
// ProductNames lists the detected device names; the service only fills it
// when the request did not name a device.
ProductNames []string `json:"productNames"`
// Date holds the creation time of each sample.
Date []time.Time `json:"date"`
// GPUValue holds the stored utilization value of each sample.
GPUValue []float64 `json:"gpuValue"`
// TemperatureValue holds the stored temperature of each sample.
TemperatureValue []int `json:"temperatureValue"`
// PowerValue holds the power usage of each sample.
PowerValue []GPUPowerUsageHelper `json:"powerValue"`
// MemoryValue holds the memory usage (and process list) of each sample.
MemoryValue []GPUMemoryUsageHelper `json:"memoryValue"`
// SpeedValue holds the stored fan speed of each sample.
SpeedValue []int `json:"speedValue"`
}
// GPUPowerUsageHelper describes the power usage of one sample.
// Values are presumably in watts (the UI labels them "W") — TODO confirm.
type GPUPowerUsageHelper struct {
// Total is the stored maximum power limit.
Total float64 `json:"total"`
// Used is the stored power draw.
Used float64 `json:"used"`
// Percent is Used relative to Total, expressed as a percentage.
Percent float64 `json:"percent"`
}
// GPUMemoryUsageHelper describes the memory usage of one sample.
// Values are presumably in MiB (the UI labels them "MiB") — TODO confirm.
type GPUMemoryUsageHelper struct {
// Total is the stored total device memory.
Total int `json:"total"`
// Used is the stored used device memory.
Used int `json:"used"`
// Percent is Used relative to Total, expressed as a percentage.
Percent float64 `json:"percent"`
// GPUProcesses is the process list decoded from the stored JSON column; it
// stays empty when that column cannot be decoded.
GPUProcesses []GPUProcess `json:"gpuProcesses"`
}
// GPUProcess describes one process that was using a device when a sample was
// taken. All fields are kept as the strings reported by the collector.
type GPUProcess struct {
// Pid is the process ID rendered as text.
Pid string `json:"pid"`
// Type is the process type as reported by the collector (for XPU devices
// the collector stores the SHR column here).
Type string `json:"type"`
// ProcessName is the process name or command.
ProcessName string `json:"processName"`
// UsedMemory is the device memory used by the process, unit included.
UsedMemory string `json:"usedMemory"`
}

View file

@ -31,3 +31,16 @@ type MonitorNetwork struct {
Up float64 `json:"up"`
Down float64 `json:"down"`
}
// MonitorGPU is one persisted GPU/XPU monitoring sample for a single device.
// Numeric fields are parsed from the collector's textual readings with the
// unit suffix stripped; a reading that cannot be parsed is stored as 0.
type MonitorGPU struct {
BaseModel
// ProductName is the device name used to filter samples.
ProductName string `json:"productName"`
// GPUUtil is the utilization reading (for XPU devices the collector stores
// the memory utilization here).
GPUUtil float64 `json:"gpuUtil"`
// Temperature is the temperature reading.
Temperature int `json:"temperature"`
// PowerDraw is the current power draw reading.
PowerDraw float64 `json:"powerDraw"`
// MaxPowerLimit is the maximum power limit; the XPU collector does not set
// it, so it is 0 for XPU rows.
MaxPowerLimit float64 `json:"maxPowerLimit"`
// MemUsed is the used device memory reading.
MemUsed int `json:"memUsed"`
// MemTotal is the total device memory reading.
MemTotal int `json:"memTotal"`
// FanSpeed is the fan speed reading; the XPU collector does not set it.
FanSpeed int `json:"fanSpeed"`
// Processes is the JSON-encoded list of processes using the device.
Processes string `json:"processes"`
}

View file

@ -5,21 +5,27 @@ import (
"github.com/1Panel-dev/1Panel/agent/app/model"
"github.com/1Panel-dev/1Panel/agent/global"
"gorm.io/gorm"
)
// MonitorRepo is the stateless data-access type for the monitoring tables.
type MonitorRepo struct{}
// IMonitorRepo is the data-access contract for monitoring samples.
type IMonitorRepo interface {
// Queries: each applies the given options and returns the matching rows.
GetBase(opts ...DBOption) ([]model.MonitorBase, error)
GetGPU(opts ...DBOption) ([]model.MonitorGPU, error)
GetIO(opts ...DBOption) ([]model.MonitorIO, error)
GetNetwork(opts ...DBOption) ([]model.MonitorNetwork, error)
// Inserts: a single base sample, or a batch of GPU / IO / network samples.
CreateMonitorBase(model model.MonitorBase) error
BatchCreateMonitorGPU(list []model.MonitorGPU) error
BatchCreateMonitorIO(ioList []model.MonitorIO) error
BatchCreateMonitorNet(ioList []model.MonitorNetwork) error
// Retention: each removes the rows created before timeForDelete.
DelMonitorBase(timeForDelete time.Time) error
DelMonitorGPU(timeForDelete time.Time) error
DelMonitorIO(timeForDelete time.Time) error
DelMonitorNet(timeForDelete time.Time) error
// WithByProductName builds a query option filtering rows by product name.
WithByProductName(name string) DBOption
}
func NewIMonitorRepo() IMonitorRepo {
@ -53,10 +59,22 @@ func (u *MonitorRepo) GetNetwork(opts ...DBOption) ([]model.MonitorNetwork, erro
err := db.Find(&data).Error
return data, err
}
// GetGPU returns the GPU monitoring rows from the GPU monitor database after
// applying every supplied query option in the order given.
func (u *MonitorRepo) GetGPU(opts ...DBOption) ([]model.MonitorGPU, error) {
	query := global.GPUMonitorDB
	for i := range opts {
		query = opts[i](query)
	}

	var rows []model.MonitorGPU
	if err := query.Find(&rows).Error; err != nil {
		return rows, err
	}
	return rows, nil
}
// CreateMonitorBase inserts one base monitoring sample into the monitor
// database and returns the insert error, if any.
func (u *MonitorRepo) CreateMonitorBase(model model.MonitorBase) error {
	result := global.MonitorDB.Create(&model)
	return result.Error
}
// BatchCreateMonitorGPU inserts all given GPU samples into the GPU monitor
// database as a single batch sized to the whole list.
func (u *MonitorRepo) BatchCreateMonitorGPU(list []model.MonitorGPU) error {
	batchSize := len(list)
	result := global.GPUMonitorDB.CreateInBatches(&list, batchSize)
	return result.Error
}
// BatchCreateMonitorIO inserts all given disk IO samples into the monitor
// database as a single batch sized to the whole list.
func (u *MonitorRepo) BatchCreateMonitorIO(ioList []model.MonitorIO) error {
	batchSize := len(ioList)
	result := global.MonitorDB.CreateInBatches(ioList, batchSize)
	return result.Error
}
@ -72,3 +90,12 @@ func (u *MonitorRepo) DelMonitorIO(timeForDelete time.Time) error {
// DelMonitorNet removes the network samples created before timeForDelete.
func (u *MonitorRepo) DelMonitorNet(timeForDelete time.Time) error {
	expired := global.MonitorDB.Where("created_at < ?", timeForDelete)
	return expired.Delete(&model.MonitorNetwork{}).Error
}
// DelMonitorGPU removes the GPU samples created before timeForDelete from the
// GPU monitor database.
func (u *MonitorRepo) DelMonitorGPU(timeForDelete time.Time) error {
	expired := global.GPUMonitorDB.Where("created_at < ?", timeForDelete)
	return expired.Delete(&model.MonitorGPU{}).Error
}
// WithByProductName returns a query option that keeps only the rows whose
// product_name column equals name.
func (u *MonitorRepo) WithByProductName(name string) DBOption {
	filter := func(db *gorm.DB) *gorm.DB {
		return db.Where("product_name = ?", name)
	}
	return filter
}

View file

@ -2,7 +2,6 @@ package repo
import (
"errors"
"time"
"github.com/1Panel-dev/1Panel/agent/app/model"
"github.com/1Panel-dev/1Panel/agent/global"
@ -19,12 +18,6 @@ type ISettingRepo interface {
Update(key, value string) error
WithByKey(key string) DBOption
CreateMonitorBase(model model.MonitorBase) error
BatchCreateMonitorIO(ioList []model.MonitorIO) error
BatchCreateMonitorNet(ioList []model.MonitorNetwork) error
DelMonitorBase(timeForDelete time.Time) error
DelMonitorIO(timeForDelete time.Time) error
DelMonitorNet(timeForDelete time.Time) error
UpdateOrCreate(key, value string) error
GetDescription(opts ...DBOption) (model.CommonDescription, error)
@ -85,25 +78,6 @@ func (s *SettingRepo) Update(key, value string) error {
return global.DB.Model(&model.Setting{}).Where("key = ?", key).Updates(map[string]interface{}{"value": value}).Error
}
func (s *SettingRepo) CreateMonitorBase(model model.MonitorBase) error {
return global.MonitorDB.Create(&model).Error
}
func (s *SettingRepo) BatchCreateMonitorIO(ioList []model.MonitorIO) error {
return global.MonitorDB.CreateInBatches(ioList, len(ioList)).Error
}
func (s *SettingRepo) BatchCreateMonitorNet(ioList []model.MonitorNetwork) error {
return global.MonitorDB.CreateInBatches(ioList, len(ioList)).Error
}
func (s *SettingRepo) DelMonitorBase(timeForDelete time.Time) error {
return global.MonitorDB.Where("created_at < ?", timeForDelete).Delete(&model.MonitorBase{}).Error
}
func (s *SettingRepo) DelMonitorIO(timeForDelete time.Time) error {
return global.MonitorDB.Where("created_at < ?", timeForDelete).Delete(&model.MonitorIO{}).Error
}
func (s *SettingRepo) DelMonitorNet(timeForDelete time.Time) error {
return global.MonitorDB.Where("created_at < ?", timeForDelete).Delete(&model.MonitorNetwork{}).Error
}
func (s *SettingRepo) UpdateOrCreate(key, value string) error {
var setting model.Setting
result := global.DB.Where("key = ?", key).First(&setting)

View file

@ -6,6 +6,7 @@ import (
"fmt"
"sort"
"strconv"
"strings"
"time"
"github.com/1Panel-dev/1Panel/agent/app/repo"
@ -15,6 +16,8 @@ import (
"github.com/1Panel-dev/1Panel/agent/app/dto"
"github.com/1Panel-dev/1Panel/agent/app/model"
"github.com/1Panel-dev/1Panel/agent/global"
"github.com/1Panel-dev/1Panel/agent/utils/ai_tools/gpu"
"github.com/1Panel-dev/1Panel/agent/utils/ai_tools/xpu"
"github.com/1Panel-dev/1Panel/agent/utils/common"
"github.com/robfig/cron/v3"
"github.com/shirou/gopsutil/v4/cpu"
@ -35,6 +38,7 @@ var monitorCancel context.CancelFunc
type IMonitorService interface {
Run()
LoadMonitorData(req dto.MonitorSearch) ([]dto.MonitorData, error)
LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error)
LoadSetting() (*dto.MonitorSetting, error)
UpdateSetting(key, value string) error
CleanData() error
@ -113,6 +117,67 @@ func (m *MonitorService) LoadMonitorData(req dto.MonitorSearch) ([]dto.MonitorDa
return data, nil
}
// LoadGPUMonitorData returns the stored GPU/XPU samples of one device inside
// the requested time window.
//
// When neither a GPU nor an XPU client is available an empty result is
// returned without error. When req.ProductName is empty the currently
// detected devices are listed in ProductNames and the first one is queried;
// if they cannot be loaded, or none is reported, ErrRecordNotFound is returned.
// ProductNames is left unset when the request already names a device.
//
// The per-sample slices are always non-nil after a successful query so they
// serialize as JSON arrays, and the percentage fields are 0 whenever the
// stored total is 0.
func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error) {
	var data dto.MonitorGPUData

	// Time.In panics on a nil location, so the window is only converted when
	// the configured time zone can actually be resolved.
	if loc, err := time.LoadLocation(common.LoadTimeZoneByCmd()); err == nil {
		req.StartTime = req.StartTime.In(loc)
		req.EndTime = req.EndTime.In(loc)
	}

	gpuExist, gpuClient := gpu.New()
	xpuExist, xpuClient := xpu.New()
	if !gpuExist && !xpuExist {
		return data, nil
	}

	if len(req.ProductName) == 0 {
		if gpuExist {
			gpuInfo, err := gpuClient.LoadGpuInfo()
			if err != nil || len(gpuInfo.GPUs) == 0 {
				return data, buserr.New("ErrRecordNotFound")
			}
			req.ProductName = gpuInfo.GPUs[0].ProductName
			for _, device := range gpuInfo.GPUs {
				data.ProductNames = append(data.ProductNames, device.ProductName)
			}
		} else {
			xpuInfo, err := xpuClient.LoadGpuInfo()
			if err != nil || len(xpuInfo.Xpu) == 0 {
				return data, buserr.New("ErrRecordNotFound")
			}
			req.ProductName = xpuInfo.Xpu[0].Basic.DeviceName
			for _, device := range xpuInfo.Xpu {
				data.ProductNames = append(data.ProductNames, device.Basic.DeviceName)
			}
		}
	}

	gpuList, err := monitorRepo.GetGPU(repo.WithByCreatedAt(req.StartTime, req.EndTime), monitorRepo.WithByProductName(req.ProductName))
	if err != nil {
		return data, err
	}

	// Allocate explicitly: a nil slice would be encoded as JSON null, while
	// consumers read these fields as arrays.
	count := len(gpuList)
	data.Date = make([]time.Time, 0, count)
	data.GPUValue = make([]float64, 0, count)
	data.TemperatureValue = make([]int, 0, count)
	data.PowerValue = make([]dto.GPUPowerUsageHelper, 0, count)
	data.MemoryValue = make([]dto.GPUMemoryUsageHelper, 0, count)
	data.SpeedValue = make([]int, 0, count)

	for _, item := range gpuList {
		data.Date = append(data.Date, item.CreatedAt)
		data.GPUValue = append(data.GPUValue, item.GPUUtil)
		data.TemperatureValue = append(data.TemperatureValue, item.Temperature)

		// A zero total would produce Inf/NaN, which encoding/json rejects and
		// which would fail the whole response; report 0 % instead.
		powerPercent := 0.0
		if item.MaxPowerLimit != 0 {
			powerPercent = item.PowerDraw / item.MaxPowerLimit * 100
		}
		data.PowerValue = append(data.PowerValue, dto.GPUPowerUsageHelper{
			Total:   item.MaxPowerLimit,
			Used:    item.PowerDraw,
			Percent: powerPercent,
		})

		memPercent := 0.0
		if item.MemTotal != 0 {
			memPercent = float64(item.MemUsed) / float64(item.MemTotal) * 100
		}
		memItem := dto.GPUMemoryUsageHelper{
			Total:   item.MemTotal,
			Used:    item.MemUsed,
			Percent: memPercent,
		}
		// The process list is best-effort: an undecodable column leaves it empty.
		var process []dto.GPUProcess
		if err := json.Unmarshal([]byte(item.Processes), &process); err == nil {
			memItem.GPUProcesses = process
		}
		data.MemoryValue = append(data.MemoryValue, memItem)
		data.SpeedValue = append(data.SpeedValue, item.FanSpeed)
	}
	return data, nil
}
func (m *MonitorService) LoadSetting() (*dto.MonitorSetting, error) {
setting, err := settingRepo.GetList()
if err != nil {
@ -174,10 +239,13 @@ func (m *MonitorService) CleanData() error {
if err := global.MonitorDB.Exec("DELETE FROM monitor_networks").Error; err != nil {
return err
}
_ = global.GPUMonitorDB.Exec("DELETE FROM monitor_gpus").Error
return nil
}
func (m *MonitorService) Run() {
saveGPUDataToDB()
saveXPUDataToDB()
var itemModel model.MonitorBase
totalPercent, _ := cpu.Percent(3*time.Second, false)
if len(totalPercent) == 1 {
@ -207,7 +275,7 @@ func (m *MonitorService) Run() {
}
}
if err := settingRepo.CreateMonitorBase(itemModel); err != nil {
if err := monitorRepo.CreateMonitorBase(itemModel); err != nil {
global.LOG.Errorf("Insert basic monitoring data failed, err: %v", err)
}
@ -220,9 +288,9 @@ func (m *MonitorService) Run() {
}
storeDays, _ := strconv.Atoi(MonitorStoreDays.Value)
timeForDelete := time.Now().AddDate(0, 0, -storeDays)
_ = settingRepo.DelMonitorBase(timeForDelete)
_ = settingRepo.DelMonitorIO(timeForDelete)
_ = settingRepo.DelMonitorNet(timeForDelete)
_ = monitorRepo.DelMonitorBase(timeForDelete)
_ = monitorRepo.DelMonitorIO(timeForDelete)
_ = monitorRepo.DelMonitorNet(timeForDelete)
}
func (m *MonitorService) loadDiskIO() {
@ -302,7 +370,7 @@ func (m *MonitorService) saveIODataToDB(ctx context.Context, interval float64) {
}
}
}
if err := settingRepo.BatchCreateMonitorIO(ioList); err != nil {
if err := monitorRepo.BatchCreateMonitorIO(ioList); err != nil {
global.LOG.Errorf("Insert io monitoring data failed, err: %v", err)
}
m.DiskIO <- ioStat2
@ -341,7 +409,7 @@ func (m *MonitorService) saveNetDataToDB(ctx context.Context, interval float64)
}
}
if err := settingRepo.BatchCreateMonitorNet(netList); err != nil {
if err := monitorRepo.BatchCreateMonitorNet(netList); err != nil {
global.LOG.Errorf("Insert network monitoring data failed, err: %v", err)
}
m.NetIO <- netStat2
@ -482,3 +550,90 @@ func StartMonitor(removeBefore bool, interval string) error {
return nil
}
// saveGPUDataToDB takes one reading of every GPU reported by the GPU client
// and stores the readings as a batch. It does nothing when no GPU client is
// available or the device information cannot be loaded; a failed insert is
// only logged.
func saveGPUDataToDB() {
	available, client := gpu.New()
	if !available {
		return
	}
	info, err := client.LoadGpuInfo()
	if err != nil {
		return
	}

	var rows []model.MonitorGPU
	for _, card := range info.GPUs {
		row := model.MonitorGPU{
			ProductName:   card.ProductName,
			GPUUtil:       loadGPUInfoFloat(card.GPUUtil),
			Temperature:   loadGPUInfoInt(card.Temperature),
			PowerDraw:     loadGPUInfoFloat(card.PowerDraw),
			MaxPowerLimit: loadGPUInfoFloat(card.MaxPowerLimit),
			MemUsed:       loadGPUInfoInt(card.MemUsed),
			MemTotal:      loadGPUInfoInt(card.MemTotal),
			FanSpeed:      loadGPUInfoInt(card.FanSpeed),
		}
		// The process list is stored as JSON; an encoding failure leaves the
		// column empty.
		if encoded, _ := json.Marshal(card.Processes); len(encoded) != 0 {
			row.Processes = string(encoded)
		}
		rows = append(rows, row)
	}

	if err = repo.NewIMonitorRepo().BatchCreateMonitorGPU(rows); err != nil {
		global.LOG.Errorf("batch create gpu monitor data failed, err: %v", err)
	}
}
// saveXPUDataToDB takes one reading of every device reported by the XPU
// client and stores the readings in the GPU sample table as a batch. It does
// nothing when no XPU client is available or the device information cannot be
// loaded; a failed insert is only logged.
func saveXPUDataToDB() {
	available, client := xpu.New()
	if !available {
		return
	}
	info, err := client.LoadGpuInfo()
	if err != nil {
		return
	}

	var rows []model.MonitorGPU
	for _, device := range info.Xpu {
		row := model.MonitorGPU{
			ProductName: device.Basic.DeviceName,
			GPUUtil:     loadGPUInfoFloat(device.Stats.MemoryUtil),
			Temperature: loadGPUInfoInt(device.Stats.Temperature),
			PowerDraw:   loadGPUInfoFloat(device.Stats.Power),
			MemUsed:     loadGPUInfoInt(device.Stats.MemoryUsed),
			MemTotal:    loadGPUInfoInt(device.Basic.Memory),
		}

		// Convert the XPU process entries into the shared process shape
		// before encoding them for storage.
		var procs []dto.GPUProcess
		for _, p := range device.Processes {
			procs = append(procs, dto.GPUProcess{
				Pid:         fmt.Sprintf("%v", p.PID),
				Type:        p.SHR,
				ProcessName: p.Command,
				UsedMemory:  p.Memory,
			})
		}
		if encoded, _ := json.Marshal(procs); len(encoded) != 0 {
			row.Processes = string(encoded)
		}
		rows = append(rows, row)
	}

	if err = repo.NewIMonitorRepo().BatchCreateMonitorGPU(rows); err != nil {
		global.LOG.Errorf("batch create gpu monitor data failed, err: %v", err)
	}
}
// loadGPUInfoInt converts a textual device reading such as "1024 MiB",
// "45 C" or "30 %" into an integer.
//
// The known unit markers ("MiB", "C", "%") are stripped first. If what
// remains is not a plain integer, the leading numeric part is parsed instead
// and truncated toward zero, so readings with a fractional part or another
// unit suffix (for example "45.5 C" or "1024.0 MB") are no longer reported
// as 0. Readings without any leading number (for example "N/A") yield 0.
func loadGPUInfoInt(val string) int {
	valItem := strings.ReplaceAll(val, "MiB", "")
	valItem = strings.ReplaceAll(valItem, "C", "")
	valItem = strings.ReplaceAll(valItem, "%", "")
	valItem = strings.TrimSpace(valItem)
	if data, err := strconv.Atoi(valItem); err == nil {
		return data
	}

	// Fallback: take the longest prefix that looks like a decimal number
	// (optional sign, digits, decimal point) and ignore whatever follows it.
	end := 0
	for end < len(valItem) {
		c := valItem[end]
		isSign := end == 0 && (c == '-' || c == '+')
		if !isSign && c != '.' && (c < '0' || c > '9') {
			break
		}
		end++
	}
	num, err := strconv.ParseFloat(valItem[:end], 64)
	if err != nil {
		return 0
	}
	return int(num)
}
// gpuFloatUnits removes the unit markers accepted by loadGPUInfoFloat.
var gpuFloatUnits = strings.NewReplacer("W", "", "%", "")

// loadGPUInfoFloat converts a textual device reading such as "120.5 W" or
// "35 %" into a float. Every "W" and "%" is removed and surrounding
// whitespace is trimmed before parsing; a reading that still cannot be parsed
// yields the value returned by strconv.ParseFloat alongside its error, which
// is 0 for syntactically invalid input.
func loadGPUInfoFloat(val string) float64 {
	cleaned := strings.TrimSpace(gpuFloatUnits.Replace(val))
	parsed, _ := strconv.ParseFloat(cleaned, 64)
	return parsed
}

View file

@ -13,11 +13,12 @@ import (
)
var (
DB *gorm.DB
MonitorDB *gorm.DB
TaskDB *gorm.DB
CoreDB *gorm.DB
AlertDB *gorm.DB
DB *gorm.DB
MonitorDB *gorm.DB
GPUMonitorDB *gorm.DB
TaskDB *gorm.DB
CoreDB *gorm.DB
AlertDB *gorm.DB
LOG *logrus.Logger
CONF ServerConfig

View file

@ -11,6 +11,7 @@ func Init() {
global.DB = common.LoadDBConnByPath(path.Join(global.Dir.DbDir, "agent.db"), "agent")
global.TaskDB = common.LoadDBConnByPath(path.Join(global.Dir.DbDir, "task.db"), "task")
global.MonitorDB = common.LoadDBConnByPath(path.Join(global.Dir.DbDir, "monitor.db"), "monitor")
global.GPUMonitorDB = common.LoadDBConnByPath(path.Join(global.Dir.DbDir, "gpu_monitor.db"), "gpu_monitor")
global.AlertDB = common.LoadDBConnByPath(path.Join(global.Dir.DbDir, "alert.db"), "alert")
if global.IsMaster {

View file

@ -54,6 +54,7 @@ func InitAgentDB() {
migrations.AddIptablesFilterRuleTable,
migrations.AddCommonDescription,
migrations.UpdateDatabase,
migrations.AddGPUMonitor,
})
if err := m.Migrate(); err != nil {
global.LOG.Error(err)

View file

@ -719,3 +719,10 @@ var UpdateDatabase = &gormigrate.Migration{
return tx.AutoMigrate(&model.Database{})
},
}
// AddGPUMonitor creates or updates the table backing model.MonitorGPU.
// The schema change is applied to global.GPUMonitorDB rather than to the
// transaction handed to the migration.
// NOTE(review): the migration record is presumably kept in the database that
// runs this migration list, not in the GPU monitor database — confirm the
// table is recreated if the GPU monitor database file is ever replaced.
var AddGPUMonitor = &gormigrate.Migration{
	ID: "20251119-add-gpu-monitor",
	Migrate: func(tx *gorm.DB) error {
		gpuTable := &model.MonitorGPU{}
		return global.GPUMonitorDB.AutoMigrate(gpuTable)
	},
}

View file

@ -29,6 +29,7 @@ func (s *HostRouter) InitRouter(Router *gin.RouterGroup) {
hostRouter.POST("/firewall/filter/chain/status", baseApi.LoadChainStatus)
hostRouter.POST("/monitor/search", baseApi.LoadMonitor)
hostRouter.POST("/monitor/gpu/search", baseApi.LoadGPUMonitor)
hostRouter.POST("/monitor/clean", baseApi.CleanMonitor)
hostRouter.GET("/monitor/netoptions", baseApi.GetNetworkOptions)
hostRouter.GET("/monitor/iooptions", baseApi.GetIOOptions)

View file

@ -161,6 +161,38 @@ export namespace Host {
endTime: Date;
}
/** Request body for the GPU monitoring search endpoint. */
export interface MonitorGPUSearch {
// Device to query; an empty string lets the backend pick the first detected device.
productName: string;
// Start of the queried time window.
startTime: Date;
// End of the queried time window.
endTime: Date;
}
/**
 * Response of the GPU monitoring search endpoint. The per-sample arrays are
 * parallel: index i in each of them describes the same sample.
 */
export interface MonitorGPUData {
// Detected device names; the backend only fills this when no device was requested.
productNames: Array<string>;
// Sample timestamps (presumably ISO strings on the wire — verify before calling Date methods).
date: Array<Date>;
// Utilization value per sample.
gpuValue: Array<number>;
// Temperature per sample.
temperatureValue: Array<number>;
// Power usage per sample.
powerValue: Array<GPUPowerUsageHelper>;
// Memory usage (with process list) per sample.
memoryValue: Array<GPUMemoryUsageHelper>;
// Fan speed per sample.
speedValue: Array<number>;
}
/** Power usage of one sample; the charts label total/used with "W". */
export interface GPUPowerUsageHelper {
// Maximum power limit.
total: number;
// Current power draw.
used: number;
// used relative to total, as a percentage.
percent: number;
}
/** Memory usage of one sample; the charts label total/used with "MiB". */
export interface GPUMemoryUsageHelper {
// Total device memory.
total: number;
// Used device memory.
used: number;
// used relative to total, as a percentage.
percent: number;
// Processes that were using the device when the sample was taken.
gpuProcesses: Array<GPUProcess>;
}
/** One process that was using a device; every field is the text reported by the backend. */
export interface GPUProcess {
// Process ID as text.
pid: string;
// Process type code as reported by the backend.
type: string;
// Process name or command.
processName: string;
// Device memory used by the process, unit included.
usedMemory: string;
}
export interface SSHInfo {
autoStart: boolean;
isActive: boolean;

View file

@ -65,6 +65,9 @@ export const operateFilterChain = (name: string, op: string) => {
export const loadMonitor = (param: Host.MonitorSearch) => {
return http.post<Array<Host.MonitorData>>(`/hosts/monitor/search`, param);
};
// Posts the search parameters to the GPU monitoring endpoint and resolves
// with the stored samples for the selected device.
export const loadGPUMonitor = (param: Host.MonitorGPUSearch) =>
    http.post<Host.MonitorGPUData>(`/hosts/monitor/gpu/search`, param);
export const getNetworkOptions = () => {
return http.get<Array<string>>(`/hosts/monitor/netoptions`);
};

View file

@ -1,5 +1,5 @@
<template>
<div>
<div v-loading="loading">
<RouterButton
:buttons="[
{
@ -9,242 +9,129 @@
]"
/>
<div v-if="gpuType == 'nvidia'">
<LayoutContent
v-loading="loading"
:title="$t('aiTools.gpu.gpu')"
:divider="true"
v-if="gpuInfo.driverVersion.length !== 0 && !loading"
>
<template #toolbar>
<el-row>
<el-col :xs="24" :sm="16" :md="16" :lg="16" :xl="16" />
<el-col :xs="24" :sm="8" :md="8" :lg="8" :xl="8">
<TableSetting title="gpu-refresh" @search="refresh()" />
</el-col>
</el-row>
</template>
<template #main>
<el-descriptions direction="vertical" :column="14" border>
<el-descriptions-item :label="$t('aiTools.gpu.driverVersion')" width="50%" :span="7">
{{ gpuInfo.driverVersion }}
</el-descriptions-item>
<el-descriptions-item :label="$t('aiTools.gpu.cudaVersion')" :span="7">
{{ gpuInfo.cudaVersion }}
</el-descriptions-item>
</el-descriptions>
<el-collapse v-model="activeNames" class="card-interval">
<el-collapse-item v-for="item in gpuInfo.gpu" :key="item.index" :name="item.index">
<template #title>
<span class="name-class">{{ item.index + '. ' + item.productName }}</span>
</template>
<span class="title-class">{{ $t('aiTools.gpu.base') }}</span>
<el-descriptions direction="vertical" :column="6" border size="small" class="mt-2">
<el-descriptions-item :label="$t('monitor.gpuUtil')">
{{ item.gpuUtil }}
</el-descriptions-item>
<el-descriptions-item>
<template #label>
<div class="cell-item">
{{ $t('monitor.temperature') }}
<el-tooltip placement="top" :content="$t('aiTools.gpu.temperatureHelper')">
<el-icon class="icon-item"><InfoFilled /></el-icon>
</el-tooltip>
</div>
</template>
{{ item.temperature.replaceAll('C', '°C') }}
</el-descriptions-item>
<el-descriptions-item>
<template #label>
<div class="cell-item">
{{ $t('monitor.performanceState') }}
<el-tooltip
placement="top"
:content="$t('aiTools.gpu.performanceStateHelper')"
>
<el-icon class="icon-item"><InfoFilled /></el-icon>
</el-tooltip>
</div>
</template>
{{ item.performanceState }}
</el-descriptions-item>
<el-descriptions-item :label="$t('monitor.powerUsage')">
{{ item.powerDraw }} / {{ item.maxPowerLimit }}
</el-descriptions-item>
<el-descriptions-item :label="$t('monitor.memoryUsage')">
{{ item.memUsed }} / {{ item.memTotal }}
</el-descriptions-item>
<el-descriptions-item :label="$t('monitor.fanSpeed')">
{{ item.fanSpeed }}
</el-descriptions-item>
<div class="content-container__search" v-if="options.length !== 0">
<el-card>
<div>
<el-date-picker
@change="search()"
v-model="timeRangeGlobal"
type="datetimerange"
range-separator="-"
:start-placeholder="$t('commons.search.timeStart')"
:end-placeholder="$t('commons.search.timeEnd')"
:shortcuts="shortcuts"
style="max-width: 360px; width: 100%"
:size="mobile ? 'small' : 'default'"
></el-date-picker>
<el-select class="p-w-300 ml-2" v-model="searchInfo.productName" @change="search()">
<el-option v-for="item in options" :key="item" :label="item" :value="item" />
</el-select>
<TableRefresh class="float-right" @search="search()" />
</div>
</el-card>
</div>
<el-row :gutter="7" class="card-interval" v-if="options.length !== 0">
<el-col :span="24">
<el-card style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.gpuUtil') }}</span>
</div>
</template>
<div class="chart">
<v-charts
height="400px"
id="loadGPUChart"
type="line"
:option="chartsOption['loadGPUChart']"
v-if="chartsOption['loadGPUChart']"
:dataZoom="true"
/>
</div>
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.memoryUsage') }}</span>
</div>
</template>
<div class="chart">
<v-charts
height="400px"
id="loadMemoryChart"
type="line"
:option="chartsOption['loadMemoryChart']"
v-if="chartsOption['loadMemoryChart']"
:dataZoom="true"
/>
</div>
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.powerUsage') }}</span>
</div>
</template>
<div class="chart">
<v-charts
height="400px"
id="loadPowerChart"
type="line"
:option="chartsOption['loadPowerChart']"
v-if="chartsOption['loadPowerChart']"
:dataZoom="true"
/>
</div>
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit">
<template #header>
<div>
{{ $t('monitor.temperature') }}
<el-tooltip placement="top" :content="$t('aiTools.gpu.temperatureHelper')">
<el-icon size="15"><InfoFilled /></el-icon>
</el-tooltip>
</div>
</template>
<div class="chart">
<v-charts
height="400px"
id="loadTemperatureChart"
type="line"
:option="chartsOption['loadTemperatureChart']"
v-if="chartsOption['loadTemperatureChart']"
:dataZoom="true"
/>
</div>
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.fanSpeed') }}</span>
</div>
</template>
<div class="chart">
<v-charts
height="400px"
id="loadSpeedChart"
type="line"
:option="chartsOption['loadSpeedChart']"
v-if="chartsOption['loadSpeedChart']"
:dataZoom="true"
/>
</div>
</el-card>
</el-col>
</el-row>
<el-descriptions-item :label="$t('aiTools.gpu.busID')">
{{ item.busID }}
</el-descriptions-item>
<el-descriptions-item>
<template #label>
<div class="cell-item">
{{ $t('aiTools.gpu.persistenceMode') }}
<el-tooltip
placement="top"
:content="$t('aiTools.gpu.persistenceModeHelper')"
>
<el-icon class="icon-item"><InfoFilled /></el-icon>
</el-tooltip>
</div>
</template>
{{ $t('aiTools.gpu.' + item.persistenceMode.toLowerCase()) }}
</el-descriptions-item>
<el-descriptions-item :label="$t('aiTools.gpu.displayActive')">
{{
lowerCase(item.displayActive) === 'disabled'
? $t('aiTools.gpu.displayActiveF')
: $t('aiTools.gpu.displayActiveT')
}}
</el-descriptions-item>
<el-descriptions-item>
<template #label>
<div class="cell-item">
Uncorr. ECC
<el-tooltip placement="top" :content="$t('aiTools.gpu.ecc')">
<el-icon class="icon-item"><InfoFilled /></el-icon>
</el-tooltip>
</div>
</template>
{{ loadEcc(item.ecc) }}
</el-descriptions-item>
<el-descriptions-item :label="$t('aiTools.gpu.computeMode')">
<template #label>
<div class="cell-item">
{{ $t('aiTools.gpu.computeMode') }}
<el-tooltip placement="top">
<template #content>
{{ $t('aiTools.gpu.defaultHelper') }}
<br />
{{ $t('aiTools.gpu.exclusiveProcessHelper') }}
<br />
{{ $t('aiTools.gpu.exclusiveThreadHelper') }}
<br />
{{ $t('aiTools.gpu.prohibitedHelper') }}
</template>
<el-icon class="icon-item"><InfoFilled /></el-icon>
</el-tooltip>
</div>
</template>
{{ loadComputeMode(item.computeMode) }}
</el-descriptions-item>
<el-descriptions-item label="MIG.M">
<template #label>
<div class="cell-item">
MIG M.
<el-tooltip placement="top">
<template #content>
{{ $t('aiTools.gpu.migModeHelper') }}
</template>
<el-icon class="icon-item"><InfoFilled /></el-icon>
</el-tooltip>
</div>
</template>
{{
item.migMode === 'N/A'
? $t('aiTools.gpu.migModeNA')
: $t('aiTools.gpu.' + lowerCase(item.migMode))
}}
</el-descriptions-item>
</el-descriptions>
<div class="card-interval">
<span class="title-class">{{ $t('aiTools.gpu.process') }}</span>
</div>
<el-table :data="item.processes" v-if="item.processes?.length !== 0">
<el-table-column label="PID" prop="pid" />
<el-table-column :label="$t('aiTools.gpu.type')" prop="type">
<template #default="{ row }">
{{ loadProcessType(row.type) }}
</template>
</el-table-column>
<el-table-column :label="$t('aiTools.gpu.processName')" prop="processName" />
<el-table-column :label="$t('aiTools.gpu.processMemoryUsage')" prop="usedMemory" />
</el-table>
</el-collapse-item>
</el-collapse>
</template>
</LayoutContent>
</div>
<div v-else>
<LayoutContent
v-loading="loading"
:title="$t('aiTools.gpu.gpu')"
:divider="true"
v-if="xpuInfo.driverVersion.length !== 0 && !loading"
>
<template #toolbar>
<el-row>
<el-col :xs="24" :sm="16" :md="16" :lg="16" :xl="16" />
<el-col :xs="24" :sm="8" :md="8" :lg="8" :xl="8">
<TableSetting title="xpu-refresh" @search="refresh()" />
</el-col>
</el-row>
</template>
<template #main>
<el-descriptions direction="vertical" :column="14" border>
<el-descriptions-item :label="$t('aiTools.gpu.driverVersion')" width="50%" :span="7">
{{ xpuInfo.driverVersion }}
</el-descriptions-item>
</el-descriptions>
<el-collapse v-model="activeNames" class="card-interval">
<el-collapse-item
v-for="item in xpuInfo.xpu"
:key="item.basic.deviceID"
:name="item.basic.deviceID"
>
<template #title>
<span class="name-class">{{ item.basic.deviceID + '. ' + item.basic.deviceName }}</span>
</template>
<span class="title-class">{{ $t('aiTools.gpu.base') }}</span>
<el-descriptions direction="vertical" :column="6" border size="small" class="mt-2">
<el-descriptions-item :label="$t('monitor.gpuUtil')">
{{ item.stats.memoryUtil }}
</el-descriptions-item>
<el-descriptions-item>
<template #label>
<div class="cell-item">
{{ $t('monitor.temperature') }}
<el-tooltip placement="top" :content="$t('aiTools.gpu.temperatureHelper')">
<el-icon class="icon-item"><InfoFilled /></el-icon>
</el-tooltip>
</div>
</template>
{{ item.stats.temperature }}
</el-descriptions-item>
<el-descriptions-item :label="$t('monitor.powerUsage')">
{{ item.stats.power }}
</el-descriptions-item>
<el-descriptions-item :label="$t('monitor.memoryUsage')">
{{ item.stats.memoryUsed }} / {{ item.basic.memory }}
</el-descriptions-item>
<el-descriptions-item :label="$t('aiTools.gpu.busID')">
{{ item.basic.pciBdfAddress }}
</el-descriptions-item>
</el-descriptions>
<div class="card-interval">
<span class="title-class">{{ $t('aiTools.gpu.process') }}</span>
</div>
<el-table :data="item.processes" v-if="item.processes?.length !== 0">
<el-table-column label="PID" prop="pid" />
<el-table-column :label="$t('aiTools.gpu.processName')" prop="command" />
<el-table-column :label="$t('aiTools.gpu.shr')" prop="shr" />
<el-table-column :label="$t('aiTools.gpu.processMemoryUsage')" prop="memory" />
</el-table>
</el-collapse-item>
</el-collapse>
</template>
</LayoutContent>
</div>
<LayoutContent
:title="$t('aiTools.gpu.gpu')"
:divider="true"
v-if="gpuInfo.driverVersion.length === 0 && xpuInfo.driverVersion.length == 0 && !loading"
>
<LayoutContent :title="$t('aiTools.gpu.gpu')" :divider="true" v-else>
<template #main>
<div class="app-warn">
<div class="flx-center">
@ -259,79 +146,237 @@
</div>
</template>
<script lang="ts" setup>
import { onMounted, ref } from 'vue';
import { loadGPUInfo } from '@/api/modules/ai';
import { AI } from '@/api/interface/ai';
<script setup lang="ts">
import { ref, reactive, onMounted, computed } from 'vue';
import { loadGPUMonitor } from '@/api/modules/host';
import { dateFormatWithoutYear } from '@/utils/util';
import { GlobalStore } from '@/store';
import { shortcuts } from '@/utils/shortcuts';
import { Host } from '@/api/interface/host';
import i18n from '@/lang';
const loading = ref();
const activeNames = ref(0);
const gpuInfo = ref<AI.Info>({
cudaVersion: '',
driverVersion: '',
type: 'nvidia',
gpu: [],
const globalStore = GlobalStore();
const mobile = computed(() => {
return globalStore.isMobile();
});
const xpuInfo = ref<AI.XpuInfo>({
driverVersion: '',
type: 'xpu',
xpu: [],
const loading = ref(false);
const options = ref([]);
const timeRangeGlobal = ref<[Date, Date]>([new Date(new Date().setHours(0, 0, 0, 0)), new Date()]);
const chartsOption = ref({
loadPowerChart: null,
loadGPUChart: null,
loadMemoryChart: null,
loadTemperatureChart: null,
loadSpeedChart: null,
});
const searchTime = ref();
const searchInfo = reactive<Host.MonitorGPUSearch>({
productName: '',
startTime: new Date(new Date().setHours(0, 0, 0, 0)),
endTime: new Date(),
});
const gpuType = ref('nvidia');
const search = async () => {
if (searchTime.value && searchTime.value.length === 2) {
searchInfo.startTime = searchTime.value[0];
searchInfo.endTime = searchTime.value[1];
}
loading.value = true;
await loadGPUInfo()
await loadGPUMonitor(searchInfo)
.then((res) => {
loading.value = false;
gpuType.value = res.data.type;
if (res.data.type == 'nvidia') {
gpuInfo.value = res.data;
} else {
xpuInfo.value = res.data;
}
options.value = res.data.productNames || [];
searchInfo.productName = searchInfo.productName || (options.value.length > 0 ? options.value[0] : '');
let baseDate = res.data.date.length === 0 ? loadEmptyDate(timeRangeGlobal.value) : res.data.date;
let date = baseDate.map(function (item: any) {
return dateFormatWithoutYear(item);
});
initCPUCharts(date, res.data.gpuValue);
initMemoryCharts(date, res.data.memoryValue);
initPowerCharts(date, res.data.powerValue);
initSpeedCharts(date, res.data.speedValue);
initTemperatureCharts(date, res.data.temperatureValue);
})
.catch(() => {
loading.value = false;
});
};
const refresh = async () => {
const res = await loadGPUInfo();
gpuInfo.value = res.data;
};
// Builds the GPU utilization chart option: values are rounded to two
// decimals, and an empty series is replaced by the zero placeholder.
function initCPUCharts(baseDate: any, items: any) {
    const rounded = items.map((item: any) => Number(item.toFixed(2)));
    const series = rounded.length === 0 ? loadEmptyData() : rounded;
    chartsOption.value['loadGPUChart'] = {
        xData: baseDate,
        yData: [{ name: i18n.global.t('monitor.gpuUtil'), data: series }],
        formatStr: '%',
    };
}
// Builds the memory usage chart option. Each point carries the rounded
// percentage as its value and the full sample as `data`, which the tooltip
// formatter uses to render absolute usage and the process table.
function initMemoryCharts(baseDate: any, items: any) {
    const points = items.map((item: any) => ({ value: Number(item.percent.toFixed(2)), data: item }));
    const series = points.length === 0 ? loadEmptyData2() : points;
    chartsOption.value['loadMemoryChart'] = {
        xData: baseDate,
        yData: [{ name: i18n.global.t('monitor.memoryUsage'), data: series }],
        tooltip: {
            trigger: 'axis',
            formatter: (list: any) => withMemoryProcess(list),
        },
        formatStr: '%',
    };
}
// Builds the power usage chart option. Each point carries the rounded
// percentage as its value and the full sample as `data`, so the tooltip can
// show the absolute draw and limit next to the percentage.
function initPowerCharts(baseDate: any, items: any) {
    let list = items.map(function (item: any) {
        return { value: Number(item.percent.toFixed(2)), data: item };
    });
    list = list.length === 0 ? loadEmptyData2() : list;
    chartsOption.value['loadPowerChart'] = {
        xData: baseDate,
        yData: [
            {
                name: i18n.global.t('monitor.powerUsage'),
                data: list,
            },
        ],
        tooltip: {
            trigger: 'axis',
            formatter: function (list: any) {
                let res = loadDate(list[0].name);
                for (const item of list) {
                    const point = item.data;
                    // Distinguish "object point" from "plain number" by type,
                    // not truthiness: a 0 % value must still print as 0.
                    const isPoint = typeof point === 'object' && point !== null;
                    const percent = isPoint ? point.value : point;
                    // Placeholder points carry an empty sample; show 0 rather
                    // than "undefined".
                    const sample = isPoint ? point.data : undefined;
                    res += loadSeries(item, percent, '%');
                    res += `( ${sample?.used ?? 0} W / ${sample?.total ?? 0} W)<br/>`;
                }
                return res;
            },
        },
        formatStr: '%',
    };
}
// Builds the temperature chart option; an empty series is replaced by the
// zero placeholder.
function initTemperatureCharts(baseDate: any, items: any) {
    const values = items.map((item: any) => Number(item));
    const series = values.length === 0 ? loadEmptyData() : values;
    chartsOption.value['loadTemperatureChart'] = {
        xData: baseDate,
        yData: [{ name: i18n.global.t('monitor.temperature'), data: series }],
        formatStr: '°C',
    };
}
// Builds the fan speed chart option; an empty series is replaced by the zero
// placeholder.
function initSpeedCharts(baseDate: any, items: any) {
    const values = items.map((item: any) => Number(item));
    const series = values.length === 0 ? loadEmptyData() : values;
    chartsOption.value['loadSpeedChart'] = {
        xData: baseDate,
        yData: [{ name: i18n.global.t('monitor.fanSpeed'), data: series }],
        formatStr: '%',
    };
}
const lowerCase = (val: string) => {
return val.toLowerCase();
};
// Returns the two boundary dates of the selected range, used as the x-axis
// when a query yields no samples. A missing or malformed range yields an
// empty array so callers can always iterate the result.
function loadEmptyDate(timeRange: any) {
    if (!timeRange || timeRange.length !== 2) {
        return [];
    }
    return [new Date(timeRange[0]), new Date(timeRange[1])];
}
// Placeholder series for numeric charts: two zero points, one per boundary
// of the empty time range.
function loadEmptyData() {
    return new Array(2).fill(0);
}
// Placeholder series for charts whose points carry a sample object: two zero
// points, each with its own empty `data` object.
function loadEmptyData2() {
    return Array.from({ length: 2 }, () => ({ value: 0, data: {} }));
}
const loadComputeMode = (val: string) => {
switch (val) {
case 'Default':
return i18n.global.t('aiTools.gpu.default');
case 'Exclusive Process':
return i18n.global.t('aiTools.gpu.exclusiveProcess');
case 'Exclusive Thread':
return i18n.global.t('aiTools.gpu.exclusiveThread');
case 'Prohibited':
return i18n.global.t('aiTools.gpu.prohibited');
function withMemoryProcess(list: any) {
let process;
let res = loadDate(list[0].name);
for (const item of list) {
if (item.data?.data) {
process = item.data?.data.gpuProcesses || [];
}
res += loadSeries(item, item.data.value ? item.data.value : item.data, '%');
res += `( ${item.data?.data.used} MiB / ${item.data?.data.total} MiB)<br/>`;
}
};
const loadEcc = (val: string) => {
if (val === 'N/A') {
return i18n.global.t('aiTools.gpu.migModeNA');
if (!process) {
return res;
}
if (val === 'Disabled') {
return i18n.global.t('aiTools.gpu.disabled');
res += `
<div style="margin-top: 10px; border-bottom: 1px dashed black;"></div>
<table style="border-collapse: collapse; margin-top: 20px; font-size: 12px;">
<thead>
<tr>
<th style="padding: 6px 8px;">PID</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.type')}</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.processName')}</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.processMemoryUsage')}</th>
</tr>
</thead>
<tbody>
`;
for (const row of process) {
res += `
<tr>
<td style="padding: 6px 8px; text-align: center;">
${row.pid}
</td>
<td style="padding: 6px 8px; text-align: center;">
${loadProcessType(row.type)}
</td>
<td style="padding: 6px 8px; text-align: center;">
${row.processName}
</td>
<td style="padding: 6px 8px; text-align: center;">
${row.usedMemory}
</td>
</tr>
`;
}
if (val === 'Enabled') {
return i18n.global.t('aiTools.gpu.enabled');
}
return val || 0;
};
return res;
}
// Renders the tooltip header: the localized "date" label followed by the
// axis label of the hovered point, with line breaks in the label replaced by
// spaces. Returns an HTML string.
function loadDate(name: any) {
return ` <div style="display: inline-block; width: 100%; padding-bottom: 10px;">
${i18n.global.t('commons.search.date')}: ${name.replaceAll('\n', ' ')}
</div>`;
}
// Renders one tooltip row as an HTML string: the series marker and name taken
// from `item`, followed by the given value and unit.
function loadSeries(item: any, data: any, unit: any) {
return `<div style="width: 100%;">
${item.marker} ${item.seriesName}: ${data} ${unit}
</div>`;
}
const loadProcessType = (val: string) => {
if (val === 'C' || val === 'G') {
return i18n.global.t('aiTools.gpu.type' + val);
@ -347,21 +392,19 @@ onMounted(() => {
});
</script>
<style lang="scss" scoped>
.name-class {
font-size: 18px;
font-weight: 500;
}
.title-class {
font-size: 14px;
font-weight: 500;
}
.cell-item {
display: flex;
align-items: center;
.icon-item {
margin-left: 4px;
margin-top: -1px;
<style scoped lang="scss">
.content-container__search {
margin-top: 7px;
.el-card {
--el-card-padding: 12px;
}
}
.title {
font-size: 16px;
font-weight: 500;
}
.chart {
width: 100%;
height: 400px;
}
</style>