netmaker/mq/publishers.go
Abhishek K 5a561b3835
Net 1440 batchpeerupdate (#3042)
* NET-1440 scale test changes

* fix UT error and add error info

* load metric data into cacha in startup

* remove debug info for metric

* add server telemetry and hasSuperAdmin to cache

* fix user UT case

* update sqlite connection string for performance

* update check-in TS in cache only if cache enabled

* update metric data in cache only if cache enabled and write to DB once in stop

* update server status in mq topic

* add failover existed to server status update

* only send mq messsage when there is server status change

* batch peerUpdate

* code changes for scale for review

* update UT case

* update mq client check

* mq connection code change

* revert server status update changes

* revert batch peerUpdate

* remove server status update info

* batch peerUpdate

* code changes based on review and setupmqtt in keepalive

* set the mq message order to false for PIN

* remove setupmqtt in keepalive

* add peerUpdate batch size to config

* update batch peerUpdate

* recycle ip in node deletion

* update ip allocation logic

* remove ip addr cap

* remove ippool file

* update get extClient func

* remove ip from cache map when extClient is removed

* add batch peerUpdate switch

* set batch peerUpdate to true by default

---------

Co-authored-by: Max Ma <mayabin@gmail.com>
2024-08-16 15:35:43 +05:30

251 lines
7.1 KiB
Go

package mq
import (
"encoding/json"
"errors"
"fmt"
"sync"
"time"
"github.com/gravitl/netmaker/logger"
"github.com/gravitl/netmaker/logic"
"github.com/gravitl/netmaker/models"
"github.com/gravitl/netmaker/servercfg"
"golang.org/x/exp/slog"
)
var batchSize = servercfg.GetPeerUpdateBatchSize()
var batchUpdate = servercfg.GetBatchPeerUpdate()
// PublishPeerUpdate --- determines and publishes a peer update to all the hosts
func PublishPeerUpdate(replacePeers bool) error {
if !servercfg.IsMessageQueueBackend() {
return nil
}
hosts, err := logic.GetAllHosts()
if err != nil {
logger.Log(1, "err getting all hosts", err.Error())
return err
}
allNodes, err := logic.GetAllNodes()
if err != nil {
return err
}
//if batch peer update disabled
if !batchUpdate {
for _, host := range hosts {
host := host
go func(host models.Host) {
if err = PublishSingleHostPeerUpdate(&host, allNodes, nil, nil, replacePeers, nil); err != nil {
logger.Log(1, "failed to publish peer update to host", host.ID.String(), ": ", err.Error())
}
}(host)
}
return nil
}
//if batch peer update enabled
batchHost := BatchItems(hosts, batchSize)
var wg sync.WaitGroup
for _, v := range batchHost {
hostLen := len(v)
wg.Add(hostLen)
for i := 0; i < hostLen; i++ {
host := hosts[i]
go func(host models.Host) {
if err = PublishSingleHostPeerUpdate(&host, allNodes, nil, nil, replacePeers, &wg); err != nil {
logger.Log(1, "failed to publish peer update to host", host.ID.String(), ": ", err.Error())
}
}(host)
}
wg.Wait()
}
return nil
}
// PublishDeletedNodePeerUpdate --- determines and publishes a peer update
// to all the hosts with a deleted node to account for
func PublishDeletedNodePeerUpdate(delNode *models.Node) error {
if !servercfg.IsMessageQueueBackend() {
return nil
}
hosts, err := logic.GetAllHosts()
if err != nil {
logger.Log(1, "err getting all hosts", err.Error())
return err
}
allNodes, err := logic.GetAllNodes()
if err != nil {
return err
}
for _, host := range hosts {
host := host
if err = PublishSingleHostPeerUpdate(&host, allNodes, delNode, nil, false, nil); err != nil {
logger.Log(1, "failed to publish peer update to host", host.ID.String(), ": ", err.Error())
}
}
return err
}
// PublishDeletedClientPeerUpdate --- determines and publishes a peer update
// to all the hosts with a deleted ext client to account for
func PublishDeletedClientPeerUpdate(delClient *models.ExtClient) error {
if !servercfg.IsMessageQueueBackend() {
return nil
}
hosts, err := logic.GetAllHosts()
if err != nil {
logger.Log(1, "err getting all hosts", err.Error())
return err
}
nodes, err := logic.GetAllNodes()
if err != nil {
return err
}
for _, host := range hosts {
host := host
if host.OS != models.OS_Types.IoT {
if err = PublishSingleHostPeerUpdate(&host, nodes, nil, []models.ExtClient{*delClient}, false, nil); err != nil {
logger.Log(1, "failed to publish peer update to host", host.ID.String(), ": ", err.Error())
}
}
}
return err
}
// PublishSingleHostPeerUpdate --- determines and publishes a peer update to one host
func PublishSingleHostPeerUpdate(host *models.Host, allNodes []models.Node, deletedNode *models.Node, deletedClients []models.ExtClient, replacePeers bool, wg *sync.WaitGroup) error {
if wg != nil {
defer wg.Done()
}
peerUpdate, err := logic.GetPeerUpdateForHost("", host, allNodes, deletedNode, deletedClients)
if err != nil {
return err
}
peerUpdate.ReplacePeers = replacePeers
data, err := json.Marshal(&peerUpdate)
if err != nil {
return err
}
return publish(host, fmt.Sprintf("peers/host/%s/%s", host.ID.String(), servercfg.GetServer()), data)
}
// NodeUpdate -- publishes a node update
func NodeUpdate(node *models.Node) error {
host, err := logic.GetHost(node.HostID.String())
if err != nil {
return nil
}
if !servercfg.IsMessageQueueBackend() {
return nil
}
logger.Log(3, "publishing node update to "+node.ID.String())
//if len(node.NetworkSettings.AccessKeys) > 0 {
//node.NetworkSettings.AccessKeys = []models.AccessKey{} // not to be sent (don't need to spread access keys around the network; we need to know how to reach other nodes, not become them)
//}
data, err := json.Marshal(node)
if err != nil {
logger.Log(2, "error marshalling node update ", err.Error())
return err
}
if err = publish(host, fmt.Sprintf("node/update/%s/%s", node.Network, node.ID), data); err != nil {
logger.Log(2, "error publishing node update to peer ", node.ID.String(), err.Error())
return err
}
return nil
}
// HostUpdate -- publishes a host update to clients
func HostUpdate(hostUpdate *models.HostUpdate) error {
if !servercfg.IsMessageQueueBackend() {
return nil
}
logger.Log(3, "publishing host update to "+hostUpdate.Host.ID.String())
data, err := json.Marshal(hostUpdate)
if err != nil {
logger.Log(2, "error marshalling node update ", err.Error())
return err
}
if err = publish(&hostUpdate.Host, fmt.Sprintf("host/update/%s/%s", hostUpdate.Host.ID.String(), servercfg.GetServer()), data); err != nil {
logger.Log(2, "error publishing host update to", hostUpdate.Host.ID.String(), err.Error())
return err
}
return nil
}
// ServerStartNotify - notifies all non server nodes to pull changes after a restart
func ServerStartNotify() error {
nodes, err := logic.GetAllNodes()
if err != nil {
return err
}
for i := range nodes {
nodes[i].Action = models.NODE_FORCE_UPDATE
if err = NodeUpdate(&nodes[i]); err != nil {
logger.Log(1, "error when notifying node", nodes[i].ID.String(), "of a server startup")
}
}
return nil
}
// PublishMqUpdatesForDeletedNode - published all the required updates for deleted node
func PublishMqUpdatesForDeletedNode(node models.Node, sendNodeUpdate bool, gwClients []models.ExtClient) {
// notify of peer change
node.PendingDelete = true
node.Action = models.NODE_DELETE
if sendNodeUpdate {
if err := NodeUpdate(&node); err != nil {
slog.Error("error publishing node update to node", "node", node.ID, "error", err)
}
}
if err := PublishDeletedNodePeerUpdate(&node); err != nil {
logger.Log(1, "error publishing peer update ", err.Error())
}
if servercfg.IsDNSMode() {
logic.SetDNS()
}
}
func PushMetricsToExporter(metrics models.Metrics) error {
logger.Log(2, "----> Pushing metrics to exporter")
data, err := json.Marshal(metrics)
if err != nil {
return errors.New("failed to marshal metrics: " + err.Error())
}
if mqclient == nil || !mqclient.IsConnectionOpen() {
return errors.New("cannot publish ... mqclient not connected")
}
if token := mqclient.Publish("metrics_exporter", 0, true, data); !token.WaitTimeout(MQ_TIMEOUT*time.Second) || token.Error() != nil {
var err error
if token.Error() == nil {
err = errors.New("connection timeout")
} else {
err = token.Error()
}
return err
}
return nil
}
// sendPeers - retrieve networks, send peer ports to all peers
func sendPeers() {
peer_force_send++
if peer_force_send == 5 {
servercfg.SetHost()
peer_force_send = 0
err := logic.TimerCheckpoint() // run telemetry & log dumps if 24 hours has passed..
if err != nil {
logger.Log(3, "error occurred on timer,", err.Error())
}
}
}