feat: GPU monitoring data supports persistence (#11051)

Refs #9496
This commit is contained in:
ssongliu 2025-11-24 15:59:15 +08:00 committed by GitHub
parent 3f47a6e701
commit 63f9368e26
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 654 additions and 342 deletions

View file

@ -31,6 +31,27 @@ func (b *BaseApi) LoadMonitor(c *gin.Context) {
helper.SuccessWithData(c, data) helper.SuccessWithData(c, data)
} }
// @Tags Monitor
// @Summary Load GPU monitor data
// @Param request body dto.MonitorGPUSearch true "request"
// @Success 200 {object} dto.MonitorGPUData
// @Security ApiKeyAuth
// @Security Timestamp
// @Router /hosts/monitor/gpu/search [post]
func (b *BaseApi) LoadGPUMonitor(c *gin.Context) {
	var req dto.MonitorGPUSearch
	// On a bind/validation error the handler returns without writing a
	// response itself; presumably CheckBindAndValidate has already done so.
	if err := helper.CheckBindAndValidate(&req, c); err != nil {
		return
	}

	data, err := monitorService.LoadGPUMonitorData(req)
	if err != nil {
		helper.InternalServer(c, err)
		return
	}
	helper.SuccessWithData(c, data)
}
// @Tags Monitor // @Tags Monitor
// @Summary Clean monitor data // @Summary Clean monitor data
// @Success 200 // @Success 200

View file

@ -11,7 +11,7 @@ type MonitorSearch struct {
} }
type MonitorData struct { type MonitorData struct {
Param string `json:"param" validate:"required,oneof=cpu memory load io network"` Param string `json:"param"`
Date []time.Time `json:"date"` Date []time.Time `json:"date"`
Value []interface{} `json:"value"` Value []interface{} `json:"value"`
} }
@ -37,3 +37,36 @@ type MonitorSettingUpdate struct {
Key string `json:"key" validate:"required,oneof=MonitorStatus MonitorStoreDays MonitorInterval DefaultNetwork DefaultIO"` Key string `json:"key" validate:"required,oneof=MonitorStatus MonitorStoreDays MonitorInterval DefaultNetwork DefaultIO"`
Value string `json:"value"` Value string `json:"value"`
} }
// MonitorGPUSearch is the request body of the GPU monitor search endpoint.
type MonitorGPUSearch struct {
// ProductName selects the device whose samples are returned. When it is
// empty, the service lists the detected devices and uses the first one.
ProductName string `json:"productName"`
// StartTime and EndTime bound the requested sample window; the service hands
// both to repo.WithByCreatedAt.
StartTime time.Time `json:"startTime"`
EndTime time.Time `json:"endTime"`
}
// MonitorGPUData is the response of the GPU monitor search endpoint.
//
// Date, GPUValue, TemperatureValue, PowerValue, MemoryValue and SpeedValue are
// filled in lock-step, one element per stored sample, so index i of every
// slice describes the same sample.
type MonitorGPUData struct {
// ProductNames lists the detected devices. The service only fills it when
// the request did not name a product.
ProductNames []string `json:"productNames"`
// Date holds the creation time of each sample.
Date []time.Time `json:"date"`
// GPUValue holds the stored utilisation reading of each sample.
GPUValue []float64 `json:"gpuValue"`
// TemperatureValue holds the stored temperature reading of each sample.
TemperatureValue []int `json:"temperatureValue"`
PowerValue []GPUPowerUsageHelper `json:"powerValue"`
MemoryValue []GPUMemoryUsageHelper `json:"memoryValue"`
// SpeedValue holds the stored fan speed reading of each sample.
SpeedValue []int `json:"speedValue"`
}
// GPUPowerUsageHelper describes the power reading of one sample.
type GPUPowerUsageHelper struct {
// Total is taken from the sample's MaxPowerLimit and Used from its PowerDraw
// (presumably watts, since a "W" suffix is stripped when readings are parsed
// — TODO confirm).
Total float64 `json:"total"`
Used float64 `json:"used"`
// Percent is Used relative to Total, scaled to 0-100.
Percent float64 `json:"percent"`
}
// GPUMemoryUsageHelper describes the memory reading of one sample.
type GPUMemoryUsageHelper struct {
// Total is taken from the sample's MemTotal and Used from its MemUsed
// (presumably MiB, since a "MiB" suffix is stripped when readings are parsed
// — TODO confirm).
Total int `json:"total"`
Used int `json:"used"`
// Percent is Used relative to Total, scaled to 0-100.
Percent float64 `json:"percent"`
// GPUProcesses is decoded from the JSON stored with the sample; it stays
// empty when that JSON cannot be decoded.
GPUProcesses []GPUProcess `json:"gpuProcesses"`
}
// GPUProcess describes one process that was using a device when a sample was
// taken. All values are kept as the strings reported by the collector.
type GPUProcess struct {
Pid string `json:"pid"`
// Type carries the process type; for XPU samples the collector stores the
// process's SHR value here.
Type string `json:"type"`
ProcessName string `json:"processName"`
UsedMemory string `json:"usedMemory"`
}

View file

@ -31,3 +31,16 @@ type MonitorNetwork struct {
Up float64 `json:"up"` Up float64 `json:"up"`
Down float64 `json:"down"` Down float64 `json:"down"`
} }
// MonitorGPU is one persisted reading of a single GPU/XPU device.
type MonitorGPU struct {
// BaseModel is embedded; the service reads the sample time from its CreatedAt.
BaseModel
// ProductName identifies the device and is the column queries filter on.
ProductName string `json:"productName"`
// GPUUtil is the utilisation reading; for XPU devices the collector stores
// the memory utilisation here.
GPUUtil float64 `json:"gpuUtil"`
Temperature int `json:"temperature"`
PowerDraw float64 `json:"powerDraw"`
// MaxPowerLimit is not set by the XPU collector and is then 0.
MaxPowerLimit float64 `json:"maxPowerLimit"`
MemUsed int `json:"memUsed"`
MemTotal int `json:"memTotal"`
// FanSpeed is not set by the XPU collector and is then 0.
FanSpeed int `json:"fanSpeed"`
// Processes is the JSON encoding of the processes seen on the device.
Processes string `json:"processes"`
}

View file

@ -5,21 +5,27 @@ import (
"github.com/1Panel-dev/1Panel/agent/app/model" "github.com/1Panel-dev/1Panel/agent/app/model"
"github.com/1Panel-dev/1Panel/agent/global" "github.com/1Panel-dev/1Panel/agent/global"
"gorm.io/gorm"
) )
type MonitorRepo struct{} type MonitorRepo struct{}
type IMonitorRepo interface { type IMonitorRepo interface {
GetBase(opts ...DBOption) ([]model.MonitorBase, error) GetBase(opts ...DBOption) ([]model.MonitorBase, error)
GetGPU(opts ...DBOption) ([]model.MonitorGPU, error)
GetIO(opts ...DBOption) ([]model.MonitorIO, error) GetIO(opts ...DBOption) ([]model.MonitorIO, error)
GetNetwork(opts ...DBOption) ([]model.MonitorNetwork, error) GetNetwork(opts ...DBOption) ([]model.MonitorNetwork, error)
CreateMonitorBase(model model.MonitorBase) error CreateMonitorBase(model model.MonitorBase) error
BatchCreateMonitorGPU(list []model.MonitorGPU) error
BatchCreateMonitorIO(ioList []model.MonitorIO) error BatchCreateMonitorIO(ioList []model.MonitorIO) error
BatchCreateMonitorNet(ioList []model.MonitorNetwork) error BatchCreateMonitorNet(ioList []model.MonitorNetwork) error
DelMonitorBase(timeForDelete time.Time) error DelMonitorBase(timeForDelete time.Time) error
DelMonitorGPU(timeForDelete time.Time) error
DelMonitorIO(timeForDelete time.Time) error DelMonitorIO(timeForDelete time.Time) error
DelMonitorNet(timeForDelete time.Time) error DelMonitorNet(timeForDelete time.Time) error
WithByProductName(name string) DBOption
} }
func NewIMonitorRepo() IMonitorRepo { func NewIMonitorRepo() IMonitorRepo {
@ -53,10 +59,22 @@ func (u *MonitorRepo) GetNetwork(opts ...DBOption) ([]model.MonitorNetwork, erro
err := db.Find(&data).Error err := db.Find(&data).Error
return data, err return data, err
} }
// GetGPU reads GPU samples from the GPU monitor database. Every supplied
// option is applied to the query, in the order given, before it is executed.
func (u *MonitorRepo) GetGPU(opts ...DBOption) ([]model.MonitorGPU, error) {
	query := global.GPUMonitorDB
	for i := range opts {
		query = opts[i](query)
	}

	var rows []model.MonitorGPU
	if err := query.Find(&rows).Error; err != nil {
		return rows, err
	}
	return rows, nil
}
func (u *MonitorRepo) CreateMonitorBase(model model.MonitorBase) error { func (u *MonitorRepo) CreateMonitorBase(model model.MonitorBase) error {
return global.MonitorDB.Create(&model).Error return global.MonitorDB.Create(&model).Error
} }
// BatchCreateMonitorGPU inserts the given GPU samples into the GPU monitor
// database. The slice length is used as the batch size, so the whole list is
// handed over as one batch.
func (u *MonitorRepo) BatchCreateMonitorGPU(list []model.MonitorGPU) error {
	return global.GPUMonitorDB.CreateInBatches(&list, len(list)).Error
}
func (u *MonitorRepo) BatchCreateMonitorIO(ioList []model.MonitorIO) error { func (u *MonitorRepo) BatchCreateMonitorIO(ioList []model.MonitorIO) error {
return global.MonitorDB.CreateInBatches(ioList, len(ioList)).Error return global.MonitorDB.CreateInBatches(ioList, len(ioList)).Error
} }
@ -72,3 +90,12 @@ func (u *MonitorRepo) DelMonitorIO(timeForDelete time.Time) error {
func (u *MonitorRepo) DelMonitorNet(timeForDelete time.Time) error { func (u *MonitorRepo) DelMonitorNet(timeForDelete time.Time) error {
return global.MonitorDB.Where("created_at < ?", timeForDelete).Delete(&model.MonitorNetwork{}).Error return global.MonitorDB.Where("created_at < ?", timeForDelete).Delete(&model.MonitorNetwork{}).Error
} }
// DelMonitorGPU deletes every GPU sample whose created_at is earlier than
// timeForDelete.
//
// NOTE(review): the periodic cleanup visible in MonitorService.Run prunes the
// base, IO and network tables only; confirm that something calls this method,
// otherwise GPU samples are never expired.
func (u *MonitorRepo) DelMonitorGPU(timeForDelete time.Time) error {
	return global.GPUMonitorDB.Where("created_at < ?", timeForDelete).Delete(&model.MonitorGPU{}).Error
}
// WithByProductName returns a DBOption that restricts a query to rows whose
// product_name column equals name. The value is bound as a query parameter.
func (u *MonitorRepo) WithByProductName(name string) DBOption {
	return func(g *gorm.DB) *gorm.DB {
		return g.Where("product_name = ?", name)
	}
}

View file

@ -2,7 +2,6 @@ package repo
import ( import (
"errors" "errors"
"time"
"github.com/1Panel-dev/1Panel/agent/app/model" "github.com/1Panel-dev/1Panel/agent/app/model"
"github.com/1Panel-dev/1Panel/agent/global" "github.com/1Panel-dev/1Panel/agent/global"
@ -19,12 +18,6 @@ type ISettingRepo interface {
Update(key, value string) error Update(key, value string) error
WithByKey(key string) DBOption WithByKey(key string) DBOption
CreateMonitorBase(model model.MonitorBase) error
BatchCreateMonitorIO(ioList []model.MonitorIO) error
BatchCreateMonitorNet(ioList []model.MonitorNetwork) error
DelMonitorBase(timeForDelete time.Time) error
DelMonitorIO(timeForDelete time.Time) error
DelMonitorNet(timeForDelete time.Time) error
UpdateOrCreate(key, value string) error UpdateOrCreate(key, value string) error
GetDescription(opts ...DBOption) (model.CommonDescription, error) GetDescription(opts ...DBOption) (model.CommonDescription, error)
@ -85,25 +78,6 @@ func (s *SettingRepo) Update(key, value string) error {
return global.DB.Model(&model.Setting{}).Where("key = ?", key).Updates(map[string]interface{}{"value": value}).Error return global.DB.Model(&model.Setting{}).Where("key = ?", key).Updates(map[string]interface{}{"value": value}).Error
} }
func (s *SettingRepo) CreateMonitorBase(model model.MonitorBase) error {
return global.MonitorDB.Create(&model).Error
}
func (s *SettingRepo) BatchCreateMonitorIO(ioList []model.MonitorIO) error {
return global.MonitorDB.CreateInBatches(ioList, len(ioList)).Error
}
func (s *SettingRepo) BatchCreateMonitorNet(ioList []model.MonitorNetwork) error {
return global.MonitorDB.CreateInBatches(ioList, len(ioList)).Error
}
func (s *SettingRepo) DelMonitorBase(timeForDelete time.Time) error {
return global.MonitorDB.Where("created_at < ?", timeForDelete).Delete(&model.MonitorBase{}).Error
}
func (s *SettingRepo) DelMonitorIO(timeForDelete time.Time) error {
return global.MonitorDB.Where("created_at < ?", timeForDelete).Delete(&model.MonitorIO{}).Error
}
func (s *SettingRepo) DelMonitorNet(timeForDelete time.Time) error {
return global.MonitorDB.Where("created_at < ?", timeForDelete).Delete(&model.MonitorNetwork{}).Error
}
func (s *SettingRepo) UpdateOrCreate(key, value string) error { func (s *SettingRepo) UpdateOrCreate(key, value string) error {
var setting model.Setting var setting model.Setting
result := global.DB.Where("key = ?", key).First(&setting) result := global.DB.Where("key = ?", key).First(&setting)

View file

@ -6,6 +6,7 @@ import (
"fmt" "fmt"
"sort" "sort"
"strconv" "strconv"
"strings"
"time" "time"
"github.com/1Panel-dev/1Panel/agent/app/repo" "github.com/1Panel-dev/1Panel/agent/app/repo"
@ -15,6 +16,8 @@ import (
"github.com/1Panel-dev/1Panel/agent/app/dto" "github.com/1Panel-dev/1Panel/agent/app/dto"
"github.com/1Panel-dev/1Panel/agent/app/model" "github.com/1Panel-dev/1Panel/agent/app/model"
"github.com/1Panel-dev/1Panel/agent/global" "github.com/1Panel-dev/1Panel/agent/global"
"github.com/1Panel-dev/1Panel/agent/utils/ai_tools/gpu"
"github.com/1Panel-dev/1Panel/agent/utils/ai_tools/xpu"
"github.com/1Panel-dev/1Panel/agent/utils/common" "github.com/1Panel-dev/1Panel/agent/utils/common"
"github.com/robfig/cron/v3" "github.com/robfig/cron/v3"
"github.com/shirou/gopsutil/v4/cpu" "github.com/shirou/gopsutil/v4/cpu"
@ -35,6 +38,7 @@ var monitorCancel context.CancelFunc
type IMonitorService interface { type IMonitorService interface {
Run() Run()
LoadMonitorData(req dto.MonitorSearch) ([]dto.MonitorData, error) LoadMonitorData(req dto.MonitorSearch) ([]dto.MonitorData, error)
LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error)
LoadSetting() (*dto.MonitorSetting, error) LoadSetting() (*dto.MonitorSetting, error)
UpdateSetting(key, value string) error UpdateSetting(key, value string) error
CleanData() error CleanData() error
@ -113,6 +117,67 @@ func (m *MonitorService) LoadMonitorData(req dto.MonitorSearch) ([]dto.MonitorDa
return data, nil return data, nil
} }
// LoadGPUMonitorData returns the stored GPU/XPU samples of one device within
// the requested time window.
//
// When req.ProductName is empty, the currently detected devices are listed in
// ProductNames and the first of them is used as the filter; if the device
// list cannot be loaded or is empty, ErrRecordNotFound is returned. When
// neither a GPU nor an XPU client is available the result is empty and the
// error is nil.
func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error) {
	var data dto.MonitorGPUData

	// Time.In panics when given a nil *Location, which is what LoadLocation
	// returns on failure, so fall back to the process-local zone instead of
	// ignoring the error.
	loc, err := time.LoadLocation(common.LoadTimeZoneByCmd())
	if err != nil || loc == nil {
		loc = time.Local
	}
	req.StartTime = req.StartTime.In(loc)
	req.EndTime = req.EndTime.In(loc)

	gpuExist, gpuClient := gpu.New()
	xpuExist, xpuClient := xpu.New()
	if !gpuExist && !xpuExist {
		return data, nil
	}

	if len(req.ProductName) == 0 {
		if gpuExist {
			gpuInfo, err := gpuClient.LoadGpuInfo()
			if err != nil || len(gpuInfo.GPUs) == 0 {
				return data, buserr.New("ErrRecordNotFound")
			}
			req.ProductName = gpuInfo.GPUs[0].ProductName
			for _, item := range gpuInfo.GPUs {
				data.ProductNames = append(data.ProductNames, item.ProductName)
			}
		} else {
			xpuInfo, err := xpuClient.LoadGpuInfo()
			if err != nil || len(xpuInfo.Xpu) == 0 {
				return data, buserr.New("ErrRecordNotFound")
			}
			req.ProductName = xpuInfo.Xpu[0].Basic.DeviceName
			for _, item := range xpuInfo.Xpu {
				data.ProductNames = append(data.ProductNames, item.Basic.DeviceName)
			}
		}
	}

	gpuList, err := monitorRepo.GetGPU(repo.WithByCreatedAt(req.StartTime, req.EndTime), monitorRepo.WithByProductName(req.ProductName))
	if err != nil {
		return data, err
	}

	for _, sample := range gpuList {
		data.Date = append(data.Date, sample.CreatedAt)
		data.GPUValue = append(data.GPUValue, sample.GPUUtil)
		data.TemperatureValue = append(data.TemperatureValue, sample.Temperature)

		// A zero denominator would produce NaN or ±Inf, which JSON cannot
		// represent. XPU samples are stored without a power limit, so the
		// percentage is left at 0 whenever the total is unknown.
		powerItem := dto.GPUPowerUsageHelper{
			Total: sample.MaxPowerLimit,
			Used:  sample.PowerDraw,
		}
		if sample.MaxPowerLimit > 0 {
			powerItem.Percent = sample.PowerDraw / sample.MaxPowerLimit * 100
		}
		data.PowerValue = append(data.PowerValue, powerItem)

		memItem := dto.GPUMemoryUsageHelper{
			Total: sample.MemTotal,
			Used:  sample.MemUsed,
		}
		if sample.MemTotal > 0 {
			memItem.Percent = float64(sample.MemUsed) / float64(sample.MemTotal) * 100
		}
		// The process list is optional: an undecodable value simply leaves
		// GPUProcesses empty rather than failing the whole request.
		var process []dto.GPUProcess
		if err := json.Unmarshal([]byte(sample.Processes), &process); err == nil {
			memItem.GPUProcesses = process
		}
		data.MemoryValue = append(data.MemoryValue, memItem)

		data.SpeedValue = append(data.SpeedValue, sample.FanSpeed)
	}
	return data, nil
}
func (m *MonitorService) LoadSetting() (*dto.MonitorSetting, error) { func (m *MonitorService) LoadSetting() (*dto.MonitorSetting, error) {
setting, err := settingRepo.GetList() setting, err := settingRepo.GetList()
if err != nil { if err != nil {
@ -174,10 +239,13 @@ func (m *MonitorService) CleanData() error {
if err := global.MonitorDB.Exec("DELETE FROM monitor_networks").Error; err != nil { if err := global.MonitorDB.Exec("DELETE FROM monitor_networks").Error; err != nil {
return err return err
} }
_ = global.GPUMonitorDB.Exec("DELETE FROM monitor_gpus").Error
return nil return nil
} }
func (m *MonitorService) Run() { func (m *MonitorService) Run() {
saveGPUDataToDB()
saveXPUDataToDB()
var itemModel model.MonitorBase var itemModel model.MonitorBase
totalPercent, _ := cpu.Percent(3*time.Second, false) totalPercent, _ := cpu.Percent(3*time.Second, false)
if len(totalPercent) == 1 { if len(totalPercent) == 1 {
@ -207,7 +275,7 @@ func (m *MonitorService) Run() {
} }
} }
if err := settingRepo.CreateMonitorBase(itemModel); err != nil { if err := monitorRepo.CreateMonitorBase(itemModel); err != nil {
global.LOG.Errorf("Insert basic monitoring data failed, err: %v", err) global.LOG.Errorf("Insert basic monitoring data failed, err: %v", err)
} }
@ -220,9 +288,9 @@ func (m *MonitorService) Run() {
} }
storeDays, _ := strconv.Atoi(MonitorStoreDays.Value) storeDays, _ := strconv.Atoi(MonitorStoreDays.Value)
timeForDelete := time.Now().AddDate(0, 0, -storeDays) timeForDelete := time.Now().AddDate(0, 0, -storeDays)
_ = settingRepo.DelMonitorBase(timeForDelete) _ = monitorRepo.DelMonitorBase(timeForDelete)
_ = settingRepo.DelMonitorIO(timeForDelete) _ = monitorRepo.DelMonitorIO(timeForDelete)
_ = settingRepo.DelMonitorNet(timeForDelete) _ = monitorRepo.DelMonitorNet(timeForDelete)
} }
func (m *MonitorService) loadDiskIO() { func (m *MonitorService) loadDiskIO() {
@ -302,7 +370,7 @@ func (m *MonitorService) saveIODataToDB(ctx context.Context, interval float64) {
} }
} }
} }
if err := settingRepo.BatchCreateMonitorIO(ioList); err != nil { if err := monitorRepo.BatchCreateMonitorIO(ioList); err != nil {
global.LOG.Errorf("Insert io monitoring data failed, err: %v", err) global.LOG.Errorf("Insert io monitoring data failed, err: %v", err)
} }
m.DiskIO <- ioStat2 m.DiskIO <- ioStat2
@ -341,7 +409,7 @@ func (m *MonitorService) saveNetDataToDB(ctx context.Context, interval float64)
} }
} }
if err := settingRepo.BatchCreateMonitorNet(netList); err != nil { if err := monitorRepo.BatchCreateMonitorNet(netList); err != nil {
global.LOG.Errorf("Insert network monitoring data failed, err: %v", err) global.LOG.Errorf("Insert network monitoring data failed, err: %v", err)
} }
m.NetIO <- netStat2 m.NetIO <- netStat2
@ -482,3 +550,90 @@ func StartMonitor(removeBefore bool, interval string) error {
return nil return nil
} }
// saveGPUDataToDB takes one reading of every GPU reported by the gpu client
// and stores the readings as MonitorGPU rows. It returns silently when no
// client is available or the reading fails; a failed insert is only logged.
func saveGPUDataToDB() {
	available, gpuClient := gpu.New()
	if !available {
		return
	}
	info, loadErr := gpuClient.LoadGpuInfo()
	if loadErr != nil {
		return
	}

	var samples []model.MonitorGPU
	for _, device := range info.GPUs {
		var sample model.MonitorGPU
		sample.ProductName = device.ProductName
		sample.GPUUtil = loadGPUInfoFloat(device.GPUUtil)
		sample.Temperature = loadGPUInfoInt(device.Temperature)
		sample.PowerDraw = loadGPUInfoFloat(device.PowerDraw)
		sample.MaxPowerLimit = loadGPUInfoFloat(device.MaxPowerLimit)
		sample.MemUsed = loadGPUInfoInt(device.MemUsed)
		sample.MemTotal = loadGPUInfoInt(device.MemTotal)
		sample.FanSpeed = loadGPUInfoInt(device.FanSpeed)

		// The marshal error is discarded; Processes stays empty when nothing
		// was encoded.
		if encoded, _ := json.Marshal(device.Processes); len(encoded) != 0 {
			sample.Processes = string(encoded)
		}
		samples = append(samples, sample)
	}

	if err := repo.NewIMonitorRepo().BatchCreateMonitorGPU(samples); err != nil {
		global.LOG.Errorf("batch create gpu monitor data failed, err: %v", err)
	}
}
// saveXPUDataToDB takes one reading of every device reported by the xpu client
// and stores the readings as MonitorGPU rows. The memory utilisation is stored
// as GPUUtil; MaxPowerLimit and FanSpeed are not set for these rows. It
// returns silently when no client is available or the reading fails; a failed
// insert is only logged.
func saveXPUDataToDB() {
	available, xpuClient := xpu.New()
	if !available {
		return
	}
	info, loadErr := xpuClient.LoadGpuInfo()
	if loadErr != nil {
		return
	}

	var samples []model.MonitorGPU
	for _, device := range info.Xpu {
		var sample model.MonitorGPU
		sample.ProductName = device.Basic.DeviceName
		sample.GPUUtil = loadGPUInfoFloat(device.Stats.MemoryUtil)
		sample.Temperature = loadGPUInfoInt(device.Stats.Temperature)
		sample.PowerDraw = loadGPUInfoFloat(device.Stats.Power)
		sample.MemUsed = loadGPUInfoInt(device.Stats.MemoryUsed)
		sample.MemTotal = loadGPUInfoInt(device.Basic.Memory)

		// The slice is kept nil when the device has no processes, so the
		// stored value is the JSON literal null rather than [].
		var procs []dto.GPUProcess
		for _, ps := range device.Processes {
			var proc dto.GPUProcess
			proc.Pid = fmt.Sprintf("%v", ps.PID)
			proc.Type = ps.SHR
			proc.ProcessName = ps.Command
			proc.UsedMemory = ps.Memory
			procs = append(procs, proc)
		}
		// The marshal error is discarded; Processes stays empty when nothing
		// was encoded.
		if encoded, _ := json.Marshal(procs); len(encoded) != 0 {
			sample.Processes = string(encoded)
		}
		samples = append(samples, sample)
	}

	if err := repo.NewIMonitorRepo().BatchCreateMonitorGPU(samples); err != nil {
		global.LOG.Errorf("batch create gpu monitor data failed, err: %v", err)
	}
}
// loadGPUInfoInt converts a textual device reading such as "45 C", "1024 MiB"
// or "30 %" into an int.
//
// The "MiB", "C" and "%" markers are removed and the remainder is parsed as an
// integer. When that fails (for example for a fractional reading like "45.5"
// or a unit that is not in the list), the number at the start of the reading
// is used instead, truncated towards zero. Readings that carry no usable
// number, such as "N/A", yield 0.
func loadGPUInfoInt(val string) int {
	valItem := strings.ReplaceAll(val, "MiB", "")
	valItem = strings.ReplaceAll(valItem, "C", "")
	valItem = strings.ReplaceAll(valItem, "%", "")
	valItem = strings.TrimSpace(valItem)
	if data, err := strconv.Atoi(valItem); err == nil {
		return data
	}

	num, ok := parseGPUNumberPrefix(val)
	// Converting a float outside the int range is implementation-defined, so
	// such readings are treated as unusable.
	const maxInt = int(^uint(0) >> 1)
	if !ok || num >= float64(maxInt) || num <= -float64(maxInt) {
		return 0
	}
	return int(num)
}

// loadGPUInfoFloat converts a textual device reading such as "120.50 W" or
// "45 %" into a float64.
//
// The "W" and "%" markers are removed and the remainder is parsed as a float.
// When that fails, or the result is NaN or infinite, the number at the start
// of the reading is used instead. Readings that carry no usable number, such
// as "N/A", yield 0.
func loadGPUInfoFloat(val string) float64 {
	valItem := strings.ReplaceAll(val, "W", "")
	valItem = strings.ReplaceAll(valItem, "%", "")
	valItem = strings.TrimSpace(valItem)

	// strconv.ParseFloat accepts "NaN" and "Inf" without an error. Both range
	// comparisons are false for NaN and one of them is false for ±Inf, so only
	// ordinary numbers are returned from here.
	const limit = 1e300
	if data, err := strconv.ParseFloat(valItem, 64); err == nil && data > -limit && data < limit {
		return data
	}

	num, _ := parseGPUNumberPrefix(val)
	return num
}

// parseGPUNumberPrefix parses the decimal number at the start of val, ignoring
// surrounding whitespace and anything that follows the number (typically a
// unit). The second result is false when val does not start with a number.
func parseGPUNumberPrefix(val string) (float64, bool) {
	trimmed := strings.TrimSpace(val)

	end := 0
	seenDot := false
scan:
	for ; end < len(trimmed); end++ {
		switch ch := trimmed[end]; {
		case ch >= '0' && ch <= '9':
		case ch == '.' && !seenDot:
			seenDot = true
		case end == 0 && (ch == '-' || ch == '+'):
		default:
			break scan
		}
	}

	num, err := strconv.ParseFloat(trimmed[:end], 64)
	if err != nil {
		return 0, false
	}
	return num, true
}

View file

@ -13,11 +13,12 @@ import (
) )
var ( var (
DB *gorm.DB DB *gorm.DB
MonitorDB *gorm.DB MonitorDB *gorm.DB
TaskDB *gorm.DB GPUMonitorDB *gorm.DB
CoreDB *gorm.DB TaskDB *gorm.DB
AlertDB *gorm.DB CoreDB *gorm.DB
AlertDB *gorm.DB
LOG *logrus.Logger LOG *logrus.Logger
CONF ServerConfig CONF ServerConfig

View file

@ -11,6 +11,7 @@ func Init() {
global.DB = common.LoadDBConnByPath(path.Join(global.Dir.DbDir, "agent.db"), "agent") global.DB = common.LoadDBConnByPath(path.Join(global.Dir.DbDir, "agent.db"), "agent")
global.TaskDB = common.LoadDBConnByPath(path.Join(global.Dir.DbDir, "task.db"), "task") global.TaskDB = common.LoadDBConnByPath(path.Join(global.Dir.DbDir, "task.db"), "task")
global.MonitorDB = common.LoadDBConnByPath(path.Join(global.Dir.DbDir, "monitor.db"), "monitor") global.MonitorDB = common.LoadDBConnByPath(path.Join(global.Dir.DbDir, "monitor.db"), "monitor")
global.GPUMonitorDB = common.LoadDBConnByPath(path.Join(global.Dir.DbDir, "gpu_monitor.db"), "gpu_monitor")
global.AlertDB = common.LoadDBConnByPath(path.Join(global.Dir.DbDir, "alert.db"), "alert") global.AlertDB = common.LoadDBConnByPath(path.Join(global.Dir.DbDir, "alert.db"), "alert")
if global.IsMaster { if global.IsMaster {

View file

@ -54,6 +54,7 @@ func InitAgentDB() {
migrations.AddIptablesFilterRuleTable, migrations.AddIptablesFilterRuleTable,
migrations.AddCommonDescription, migrations.AddCommonDescription,
migrations.UpdateDatabase, migrations.UpdateDatabase,
migrations.AddGPUMonitor,
}) })
if err := m.Migrate(); err != nil { if err := m.Migrate(); err != nil {
global.LOG.Error(err) global.LOG.Error(err)

View file

@ -719,3 +719,10 @@ var UpdateDatabase = &gormigrate.Migration{
return tx.AutoMigrate(&model.Database{}) return tx.AutoMigrate(&model.Database{})
}, },
} }
// AddGPUMonitor creates or updates the table backing model.MonitorGPU.
//
// The migration ignores the tx it is given and runs AutoMigrate on
// global.GPUMonitorDB instead, so the table lives in the GPU monitor database
// even though the migration is listed in InitAgentDB.
//
// NOTE(review): the migration ID is presumably recorded by the migrator's own
// database rather than in the GPU monitor database; confirm that the table is
// re-created if the GPU monitor database file is replaced after this has run.
var AddGPUMonitor = &gormigrate.Migration{
ID: "20251119-add-gpu-monitor",
Migrate: func(tx *gorm.DB) error {
return global.GPUMonitorDB.AutoMigrate(&model.MonitorGPU{})
},
}

View file

@ -29,6 +29,7 @@ func (s *HostRouter) InitRouter(Router *gin.RouterGroup) {
hostRouter.POST("/firewall/filter/chain/status", baseApi.LoadChainStatus) hostRouter.POST("/firewall/filter/chain/status", baseApi.LoadChainStatus)
hostRouter.POST("/monitor/search", baseApi.LoadMonitor) hostRouter.POST("/monitor/search", baseApi.LoadMonitor)
hostRouter.POST("/monitor/gpu/search", baseApi.LoadGPUMonitor)
hostRouter.POST("/monitor/clean", baseApi.CleanMonitor) hostRouter.POST("/monitor/clean", baseApi.CleanMonitor)
hostRouter.GET("/monitor/netoptions", baseApi.GetNetworkOptions) hostRouter.GET("/monitor/netoptions", baseApi.GetNetworkOptions)
hostRouter.GET("/monitor/iooptions", baseApi.GetIOOptions) hostRouter.GET("/monitor/iooptions", baseApi.GetIOOptions)

View file

@ -161,6 +161,38 @@ export namespace Host {
endTime: Date; endTime: Date;
} }
/** Request payload of `/hosts/monitor/gpu/search`. */
export interface MonitorGPUSearch {
/** Device to query; the page sends an empty string until a device is chosen. */
productName: string;
/** Start of the requested sample window. */
startTime: Date;
/** End of the requested sample window. */
endTime: Date;
}
/**
 * Response payload of `/hosts/monitor/gpu/search`.
 * The backend fills `date` and the `*Value` arrays together, one entry per
 * stored sample.
 */
export interface MonitorGPUData {
/** Detected device names; the page treats a missing value as an empty list. */
productNames: Array<string>;
date: Array<Date>;
gpuValue: Array<number>;
temperatureValue: Array<number>;
powerValue: Array<GPUPowerUsageHelper>;
memoryValue: Array<GPUMemoryUsageHelper>;
speedValue: Array<number>;
}
/** Power reading of one sample. */
export interface GPUPowerUsageHelper {
/** Power limit reported for the device. */
total: number;
/** Power drawn when the sample was taken. */
used: number;
/** `used` relative to `total`, scaled to 0-100 by the backend. */
percent: number;
}
/** Memory reading of one sample. */
export interface GPUMemoryUsageHelper {
/** Total device memory. */
total: number;
/** Memory in use when the sample was taken. */
used: number;
/** `used` relative to `total`, scaled to 0-100 by the backend. */
percent: number;
/** Processes that were using the device when the sample was taken. */
gpuProcesses: Array<GPUProcess>;
}
/** One process recorded with a sample; every value is sent as a string. */
export interface GPUProcess {
pid: string;
type: string;
processName: string;
usedMemory: string;
}
export interface SSHInfo { export interface SSHInfo {
autoStart: boolean; autoStart: boolean;
isActive: boolean; isActive: boolean;

View file

@ -65,6 +65,9 @@ export const operateFilterChain = (name: string, op: string) => {
export const loadMonitor = (param: Host.MonitorSearch) => { export const loadMonitor = (param: Host.MonitorSearch) => {
return http.post<Array<Host.MonitorData>>(`/hosts/monitor/search`, param); return http.post<Array<Host.MonitorData>>(`/hosts/monitor/search`, param);
}; };
/** Posts the search parameters to the GPU monitor endpoint and returns the stored samples. */
export const loadGPUMonitor = (param: Host.MonitorGPUSearch) =>
    http.post<Host.MonitorGPUData>(`/hosts/monitor/gpu/search`, param);
export const getNetworkOptions = () => { export const getNetworkOptions = () => {
return http.get<Array<string>>(`/hosts/monitor/netoptions`); return http.get<Array<string>>(`/hosts/monitor/netoptions`);
}; };

View file

@ -1,5 +1,5 @@
<template> <template>
<div> <div v-loading="loading">
<RouterButton <RouterButton
:buttons="[ :buttons="[
{ {
@ -9,242 +9,129 @@
]" ]"
/> />
<div v-if="gpuType == 'nvidia'"> <div class="content-container__search" v-if="options.length !== 0">
<LayoutContent <el-card>
v-loading="loading" <div>
:title="$t('aiTools.gpu.gpu')" <el-date-picker
:divider="true" @change="search()"
v-if="gpuInfo.driverVersion.length !== 0 && !loading" v-model="timeRangeGlobal"
> type="datetimerange"
<template #toolbar> range-separator="-"
<el-row> :start-placeholder="$t('commons.search.timeStart')"
<el-col :xs="24" :sm="16" :md="16" :lg="16" :xl="16" /> :end-placeholder="$t('commons.search.timeEnd')"
<el-col :xs="24" :sm="8" :md="8" :lg="8" :xl="8"> :shortcuts="shortcuts"
<TableSetting title="gpu-refresh" @search="refresh()" /> style="max-width: 360px; width: 100%"
</el-col> :size="mobile ? 'small' : 'default'"
</el-row> ></el-date-picker>
</template> <el-select class="p-w-300 ml-2" v-model="searchInfo.productName" @change="search()">
<template #main> <el-option v-for="item in options" :key="item" :label="item" :value="item" />
<el-descriptions direction="vertical" :column="14" border> </el-select>
<el-descriptions-item :label="$t('aiTools.gpu.driverVersion')" width="50%" :span="7"> <TableRefresh class="float-right" @search="search()" />
{{ gpuInfo.driverVersion }} </div>
</el-descriptions-item> </el-card>
<el-descriptions-item :label="$t('aiTools.gpu.cudaVersion')" :span="7"> </div>
{{ gpuInfo.cudaVersion }} <el-row :gutter="7" class="card-interval" v-if="options.length !== 0">
</el-descriptions-item> <el-col :span="24">
</el-descriptions> <el-card style="overflow: inherit">
<el-collapse v-model="activeNames" class="card-interval"> <template #header>
<el-collapse-item v-for="item in gpuInfo.gpu" :key="item.index" :name="item.index"> <div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<template #title> <span class="title">{{ $t('monitor.gpuUtil') }}</span>
<span class="name-class">{{ item.index + '. ' + item.productName }}</span> </div>
</template> </template>
<span class="title-class">{{ $t('aiTools.gpu.base') }}</span> <div class="chart">
<el-descriptions direction="vertical" :column="6" border size="small" class="mt-2"> <v-charts
<el-descriptions-item :label="$t('monitor.gpuUtil')"> height="400px"
{{ item.gpuUtil }} id="loadGPUChart"
</el-descriptions-item> type="line"
<el-descriptions-item> :option="chartsOption['loadGPUChart']"
<template #label> v-if="chartsOption['loadGPUChart']"
<div class="cell-item"> :dataZoom="true"
{{ $t('monitor.temperature') }} />
<el-tooltip placement="top" :content="$t('aiTools.gpu.temperatureHelper')"> </div>
<el-icon class="icon-item"><InfoFilled /></el-icon> </el-card>
</el-tooltip> </el-col>
</div> <el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
</template> <el-card style="overflow: inherit">
{{ item.temperature.replaceAll('C', '°C') }} <template #header>
</el-descriptions-item> <div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<el-descriptions-item> <span class="title">{{ $t('monitor.memoryUsage') }}</span>
<template #label> </div>
<div class="cell-item"> </template>
{{ $t('monitor.performanceState') }} <div class="chart">
<el-tooltip <v-charts
placement="top" height="400px"
:content="$t('aiTools.gpu.performanceStateHelper')" id="loadMemoryChart"
> type="line"
<el-icon class="icon-item"><InfoFilled /></el-icon> :option="chartsOption['loadMemoryChart']"
</el-tooltip> v-if="chartsOption['loadMemoryChart']"
</div> :dataZoom="true"
</template> />
{{ item.performanceState }} </div>
</el-descriptions-item> </el-card>
<el-descriptions-item :label="$t('monitor.powerUsage')"> </el-col>
{{ item.powerDraw }} / {{ item.maxPowerLimit }} <el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
</el-descriptions-item> <el-card style="overflow: inherit">
<el-descriptions-item :label="$t('monitor.memoryUsage')"> <template #header>
{{ item.memUsed }} / {{ item.memTotal }} <div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
</el-descriptions-item> <span class="title">{{ $t('monitor.powerUsage') }}</span>
<el-descriptions-item :label="$t('monitor.fanSpeed')"> </div>
{{ item.fanSpeed }} </template>
</el-descriptions-item> <div class="chart">
<v-charts
height="400px"
id="loadPowerChart"
type="line"
:option="chartsOption['loadPowerChart']"
v-if="chartsOption['loadPowerChart']"
:dataZoom="true"
/>
</div>
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit">
<template #header>
<div>
{{ $t('monitor.temperature') }}
<el-tooltip placement="top" :content="$t('aiTools.gpu.temperatureHelper')">
<el-icon size="15"><InfoFilled /></el-icon>
</el-tooltip>
</div>
</template>
<div class="chart">
<v-charts
height="400px"
id="loadTemperatureChart"
type="line"
:option="chartsOption['loadTemperatureChart']"
v-if="chartsOption['loadTemperatureChart']"
:dataZoom="true"
/>
</div>
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.fanSpeed') }}</span>
</div>
</template>
<div class="chart">
<v-charts
height="400px"
id="loadSpeedChart"
type="line"
:option="chartsOption['loadSpeedChart']"
v-if="chartsOption['loadSpeedChart']"
:dataZoom="true"
/>
</div>
</el-card>
</el-col>
</el-row>
<el-descriptions-item :label="$t('aiTools.gpu.busID')"> <LayoutContent :title="$t('aiTools.gpu.gpu')" :divider="true" v-else>
{{ item.busID }}
</el-descriptions-item>
<el-descriptions-item>
<template #label>
<div class="cell-item">
{{ $t('aiTools.gpu.persistenceMode') }}
<el-tooltip
placement="top"
:content="$t('aiTools.gpu.persistenceModeHelper')"
>
<el-icon class="icon-item"><InfoFilled /></el-icon>
</el-tooltip>
</div>
</template>
{{ $t('aiTools.gpu.' + item.persistenceMode.toLowerCase()) }}
</el-descriptions-item>
<el-descriptions-item :label="$t('aiTools.gpu.displayActive')">
{{
lowerCase(item.displayActive) === 'disabled'
? $t('aiTools.gpu.displayActiveF')
: $t('aiTools.gpu.displayActiveT')
}}
</el-descriptions-item>
<el-descriptions-item>
<template #label>
<div class="cell-item">
Uncorr. ECC
<el-tooltip placement="top" :content="$t('aiTools.gpu.ecc')">
<el-icon class="icon-item"><InfoFilled /></el-icon>
</el-tooltip>
</div>
</template>
{{ loadEcc(item.ecc) }}
</el-descriptions-item>
<el-descriptions-item :label="$t('aiTools.gpu.computeMode')">
<template #label>
<div class="cell-item">
{{ $t('aiTools.gpu.computeMode') }}
<el-tooltip placement="top">
<template #content>
{{ $t('aiTools.gpu.defaultHelper') }}
<br />
{{ $t('aiTools.gpu.exclusiveProcessHelper') }}
<br />
{{ $t('aiTools.gpu.exclusiveThreadHelper') }}
<br />
{{ $t('aiTools.gpu.prohibitedHelper') }}
</template>
<el-icon class="icon-item"><InfoFilled /></el-icon>
</el-tooltip>
</div>
</template>
{{ loadComputeMode(item.computeMode) }}
</el-descriptions-item>
<el-descriptions-item label="MIG.M">
<template #label>
<div class="cell-item">
MIG M.
<el-tooltip placement="top">
<template #content>
{{ $t('aiTools.gpu.migModeHelper') }}
</template>
<el-icon class="icon-item"><InfoFilled /></el-icon>
</el-tooltip>
</div>
</template>
{{
item.migMode === 'N/A'
? $t('aiTools.gpu.migModeNA')
: $t('aiTools.gpu.' + lowerCase(item.migMode))
}}
</el-descriptions-item>
</el-descriptions>
<div class="card-interval">
<span class="title-class">{{ $t('aiTools.gpu.process') }}</span>
</div>
<el-table :data="item.processes" v-if="item.processes?.length !== 0">
<el-table-column label="PID" prop="pid" />
<el-table-column :label="$t('aiTools.gpu.type')" prop="type">
<template #default="{ row }">
{{ loadProcessType(row.type) }}
</template>
</el-table-column>
<el-table-column :label="$t('aiTools.gpu.processName')" prop="processName" />
<el-table-column :label="$t('aiTools.gpu.processMemoryUsage')" prop="usedMemory" />
</el-table>
</el-collapse-item>
</el-collapse>
</template>
</LayoutContent>
</div>
<div v-else>
<LayoutContent
v-loading="loading"
:title="$t('aiTools.gpu.gpu')"
:divider="true"
v-if="xpuInfo.driverVersion.length !== 0 && !loading"
>
<template #toolbar>
<el-row>
<el-col :xs="24" :sm="16" :md="16" :lg="16" :xl="16" />
<el-col :xs="24" :sm="8" :md="8" :lg="8" :xl="8">
<TableSetting title="xpu-refresh" @search="refresh()" />
</el-col>
</el-row>
</template>
<template #main>
<el-descriptions direction="vertical" :column="14" border>
<el-descriptions-item :label="$t('aiTools.gpu.driverVersion')" width="50%" :span="7">
{{ xpuInfo.driverVersion }}
</el-descriptions-item>
</el-descriptions>
<el-collapse v-model="activeNames" class="card-interval">
<el-collapse-item
v-for="item in xpuInfo.xpu"
:key="item.basic.deviceID"
:name="item.basic.deviceID"
>
<template #title>
<span class="name-class">{{ item.basic.deviceID + '. ' + item.basic.deviceName }}</span>
</template>
<span class="title-class">{{ $t('aiTools.gpu.base') }}</span>
<el-descriptions direction="vertical" :column="6" border size="small" class="mt-2">
<el-descriptions-item :label="$t('monitor.gpuUtil')">
{{ item.stats.memoryUtil }}
</el-descriptions-item>
<el-descriptions-item>
<template #label>
<div class="cell-item">
{{ $t('monitor.temperature') }}
<el-tooltip placement="top" :content="$t('aiTools.gpu.temperatureHelper')">
<el-icon class="icon-item"><InfoFilled /></el-icon>
</el-tooltip>
</div>
</template>
{{ item.stats.temperature }}
</el-descriptions-item>
<el-descriptions-item :label="$t('monitor.powerUsage')">
{{ item.stats.power }}
</el-descriptions-item>
<el-descriptions-item :label="$t('monitor.memoryUsage')">
{{ item.stats.memoryUsed }} / {{ item.basic.memory }}
</el-descriptions-item>
<el-descriptions-item :label="$t('aiTools.gpu.busID')">
{{ item.basic.pciBdfAddress }}
</el-descriptions-item>
</el-descriptions>
<div class="card-interval">
<span class="title-class">{{ $t('aiTools.gpu.process') }}</span>
</div>
<el-table :data="item.processes" v-if="item.processes?.length !== 0">
<el-table-column label="PID" prop="pid" />
<el-table-column :label="$t('aiTools.gpu.processName')" prop="command" />
<el-table-column :label="$t('aiTools.gpu.shr')" prop="shr" />
<el-table-column :label="$t('aiTools.gpu.processMemoryUsage')" prop="memory" />
</el-table>
</el-collapse-item>
</el-collapse>
</template>
</LayoutContent>
</div>
<LayoutContent
:title="$t('aiTools.gpu.gpu')"
:divider="true"
v-if="gpuInfo.driverVersion.length === 0 && xpuInfo.driverVersion.length == 0 && !loading"
>
<template #main> <template #main>
<div class="app-warn"> <div class="app-warn">
<div class="flx-center"> <div class="flx-center">
@ -259,79 +146,237 @@
</div> </div>
</template> </template>
<script lang="ts" setup> <script setup lang="ts">
import { onMounted, ref } from 'vue'; import { ref, reactive, onMounted, computed } from 'vue';
import { loadGPUInfo } from '@/api/modules/ai'; import { loadGPUMonitor } from '@/api/modules/host';
import { AI } from '@/api/interface/ai'; import { dateFormatWithoutYear } from '@/utils/util';
import { GlobalStore } from '@/store';
import { shortcuts } from '@/utils/shortcuts';
import { Host } from '@/api/interface/host';
import i18n from '@/lang'; import i18n from '@/lang';
const loading = ref(); const globalStore = GlobalStore();
const activeNames = ref(0);
const gpuInfo = ref<AI.Info>({ const mobile = computed(() => {
cudaVersion: '', return globalStore.isMobile();
driverVersion: '',
type: 'nvidia',
gpu: [],
}); });
const xpuInfo = ref<AI.XpuInfo>({
driverVersion: '', const loading = ref(false);
type: 'xpu', const options = ref([]);
xpu: [], const timeRangeGlobal = ref<[Date, Date]>([new Date(new Date().setHours(0, 0, 0, 0)), new Date()]);
const chartsOption = ref({
loadPowerChart: null,
loadGPUChart: null,
loadMemoryChart: null,
loadTemperatureChart: null,
loadSpeedChart: null,
});
const searchTime = ref();
const searchInfo = reactive<Host.MonitorGPUSearch>({
productName: '',
startTime: new Date(new Date().setHours(0, 0, 0, 0)),
endTime: new Date(),
}); });
const gpuType = ref('nvidia');
const search = async () => { const search = async () => {
if (searchTime.value && searchTime.value.length === 2) {
searchInfo.startTime = searchTime.value[0];
searchInfo.endTime = searchTime.value[1];
}
loading.value = true; loading.value = true;
await loadGPUInfo() await loadGPUMonitor(searchInfo)
.then((res) => { .then((res) => {
loading.value = false; loading.value = false;
gpuType.value = res.data.type; options.value = res.data.productNames || [];
if (res.data.type == 'nvidia') { searchInfo.productName = searchInfo.productName || (options.value.length > 0 ? options.value[0] : '');
gpuInfo.value = res.data; let baseDate = res.data.date.length === 0 ? loadEmptyDate(timeRangeGlobal.value) : res.data.date;
} else { let date = baseDate.map(function (item: any) {
xpuInfo.value = res.data; return dateFormatWithoutYear(item);
} });
initCPUCharts(date, res.data.gpuValue);
initMemoryCharts(date, res.data.memoryValue);
initPowerCharts(date, res.data.powerValue);
initSpeedCharts(date, res.data.speedValue);
initTemperatureCharts(date, res.data.temperatureValue);
}) })
.catch(() => { .catch(() => {
loading.value = false; loading.value = false;
}); });
}; };
const refresh = async () => { function initCPUCharts(baseDate: any, items: any) {
const res = await loadGPUInfo(); let percents = items.map(function (item: any) {
gpuInfo.value = res.data; return Number(item.toFixed(2));
}; });
let data = percents.length === 0 ? loadEmptyData() : percents;
chartsOption.value['loadGPUChart'] = {
xData: baseDate,
yData: [
{
name: i18n.global.t('monitor.gpuUtil'),
data: data,
},
],
formatStr: '%',
};
}
/**
 * Builds the memory-usage chart option and stores it in `chartsOption.value.loadMemoryChart`.
 *
 * Each sample becomes a point whose `value` is the sample's `percent` rounded to two
 * decimals and whose `data` keeps the raw sample for the tooltip. When there are no
 * samples, the placeholder points from `loadEmptyData2()` are used instead.
 *
 * @param baseDate x-axis labels for the chart
 * @param items    memory samples; each one must expose a numeric `percent`
 */
function initMemoryCharts(baseDate: any, items: any) {
    const points = items.map((sample: any) => ({
        value: Number(sample.percent.toFixed(2)),
        data: sample,
    }));
    const series = points.length !== 0 ? points : loadEmptyData2();
    chartsOption.value.loadMemoryChart = {
        xData: baseDate,
        yData: [
            {
                name: i18n.global.t('monitor.memoryUsage'),
                data: series,
            },
        ],
        tooltip: {
            trigger: 'axis',
            // Only the hovered-series list is forwarded to the shared tooltip renderer.
            formatter: (params: any) => withMemoryProcess(params),
        },
        formatStr: '%',
    };
}
/**
 * Builds the power-usage chart option and stores it in `chartsOption.value['loadPowerChart']`.
 *
 * Each sample becomes a point whose `value` is the sample's `percent` rounded to two
 * decimals and whose `data` keeps the raw sample, so the tooltip can print the
 * used / total wattage. When there are no samples, the placeholder points from
 * `loadEmptyData2()` are used instead.
 *
 * @param baseDate x-axis labels for the chart
 * @param items    power samples; each one must expose a numeric `percent`, and the
 *                 tooltip reads `used` and `total` from it
 */
function initPowerCharts(baseDate: any, items: any) {
    let list = items.map(function (item: any) {
        return { value: Number(item.percent.toFixed(2)), data: item };
    });
    list = list.length === 0 ? loadEmptyData2() : list;
    chartsOption.value['loadPowerChart'] = {
        xData: baseDate,
        yData: [
            {
                name: i18n.global.t('monitor.powerUsage'),
                data: list,
            },
        ],
        tooltip: {
            trigger: 'axis',
            formatter: function (params: any) {
                let res = loadDate(params[0].name);
                for (const item of params) {
                    // `??` instead of a truthiness test: a genuine 0 % reading (or a zero
                    // placeholder point) must print "0", not fall back to the whole data
                    // object and render as "[object Object]".
                    const percent = item.data?.value ?? item.data;
                    // Placeholder points carry an empty `data` object, so every hop is optional.
                    const sample = item.data?.data;
                    res += loadSeries(item, percent, '%');
                    res += `( ${sample?.used ?? '-'} W / ${sample?.total ?? '-'} W)<br/>`;
                }
                return res;
            },
        },
        formatStr: '%',
    };
}
/**
 * Builds the temperature chart option and stores it in
 * `chartsOption.value.loadTemperatureChart`.
 *
 * Every sample is coerced with `Number()`. When there are no samples, the
 * placeholder series from `loadEmptyData()` is used instead.
 *
 * @param baseDate x-axis labels for the chart
 * @param items    temperature samples
 */
function initTemperatureCharts(baseDate: any, items: any) {
    const readings = items.map((raw: any) => Number(raw));
    const series = readings.length !== 0 ? readings : loadEmptyData();
    chartsOption.value.loadTemperatureChart = {
        xData: baseDate,
        yData: [
            {
                name: i18n.global.t('monitor.temperature'),
                data: series,
            },
        ],
        formatStr: '°C',
    };
}
/**
 * Builds the fan-speed chart option and stores it in `chartsOption.value.loadSpeedChart`.
 *
 * Every sample is coerced with `Number()`. When there are no samples, the
 * placeholder series from `loadEmptyData()` is used instead.
 *
 * @param baseDate x-axis labels for the chart
 * @param items    fan-speed samples
 */
function initSpeedCharts(baseDate: any, items: any) {
    const fanValues = items.map((raw: any) => Number(raw));
    const hasSamples = fanValues.length !== 0;
    const series = hasSamples ? fanValues : loadEmptyData();
    chartsOption.value.loadSpeedChart = {
        xData: baseDate,
        yData: [
            {
                name: i18n.global.t('monitor.fanSpeed'),
                data: series,
            },
        ],
        formatStr: '%',
    };
}
const lowerCase = (val: string) => { function loadEmptyDate(timeRange: any) {
return val.toLowerCase(); if (timeRange.length != 2) {
}; return;
}
let date1 = new Date(timeRange[0]);
let date2 = new Date(timeRange[1]);
return [date1, date2];
}
/**
 * Returns a fresh two-point series of zeros, used as the placeholder data for
 * plain numeric charts that have no samples.
 */
function loadEmptyData() {
    return Array.from({ length: 2 }, () => 0);
}
/**
 * Returns a fresh two-point placeholder series for charts whose points are
 * `{ value, data }` objects: each point has value 0 and an empty detail object.
 */
function loadEmptyData2() {
    return Array.from({ length: 2 }, () => ({ value: 0, data: {} }));
}
const loadComputeMode = (val: string) => { function withMemoryProcess(list: any) {
switch (val) { let process;
case 'Default': let res = loadDate(list[0].name);
return i18n.global.t('aiTools.gpu.default'); for (const item of list) {
case 'Exclusive Process': if (item.data?.data) {
return i18n.global.t('aiTools.gpu.exclusiveProcess'); process = item.data?.data.gpuProcesses || [];
case 'Exclusive Thread': }
return i18n.global.t('aiTools.gpu.exclusiveThread'); res += loadSeries(item, item.data.value ? item.data.value : item.data, '%');
case 'Prohibited': res += `( ${item.data?.data.used} MiB / ${item.data?.data.total} MiB)<br/>`;
return i18n.global.t('aiTools.gpu.prohibited');
} }
}; if (!process) {
return res;
const loadEcc = (val: string) => {
if (val === 'N/A') {
return i18n.global.t('aiTools.gpu.migModeNA');
} }
if (val === 'Disabled') { res += `
return i18n.global.t('aiTools.gpu.disabled'); <div style="margin-top: 10px; border-bottom: 1px dashed black;"></div>
<table style="border-collapse: collapse; margin-top: 20px; font-size: 12px;">
<thead>
<tr>
<th style="padding: 6px 8px;">PID</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.type')}</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.processName')}</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.processMemoryUsage')}</th>
</tr>
</thead>
<tbody>
`;
for (const row of process) {
res += `
<tr>
<td style="padding: 6px 8px; text-align: center;">
${row.pid}
</td>
<td style="padding: 6px 8px; text-align: center;">
${loadProcessType(row.type)}
</td>
<td style="padding: 6px 8px; text-align: center;">
${row.processName}
</td>
<td style="padding: 6px 8px; text-align: center;">
${row.usedMemory}
</td>
</tr>
`;
} }
if (val === 'Enabled') { return res;
return i18n.global.t('aiTools.gpu.enabled'); }
} function loadDate(name: any) {
return val || 0; return ` <div style="display: inline-block; width: 100%; padding-bottom: 10px;">
}; ${i18n.global.t('commons.search.date')}: ${name.replaceAll('\n', ' ')}
</div>`;
}
function loadSeries(item: any, data: any, unit: any) {
return `<div style="width: 100%;">
${item.marker} ${item.seriesName}: ${data} ${unit}
</div>`;
}
const loadProcessType = (val: string) => { const loadProcessType = (val: string) => {
if (val === 'C' || val === 'G') { if (val === 'C' || val === 'G') {
return i18n.global.t('aiTools.gpu.type' + val); return i18n.global.t('aiTools.gpu.type' + val);
@ -347,21 +392,19 @@ onMounted(() => {
}); });
</script> </script>
<style lang="scss" scoped> <style scoped lang="scss">
.name-class { .content-container__search {
font-size: 18px; margin-top: 7px;
font-weight: 500; .el-card {
} --el-card-padding: 12px;
.title-class {
font-size: 14px;
font-weight: 500;
}
.cell-item {
display: flex;
align-items: center;
.icon-item {
margin-left: 4px;
margin-top: -1px;
} }
} }
.title {
font-size: 16px;
font-weight: 500;
}
.chart {
width: 100%;
height: 400px;
}
</style> </style>