Fix slow shutdown (#2113)

* rearrange shutdown

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>

* http closed is fine

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>

* update changelog

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>

* logging while shutting

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>

---------

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
This commit is contained in:
Kristoffer Dalby 2024-09-09 14:10:22 +02:00 committed by GitHub
parent bac7ea67f4
commit 60b94b0467
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 57 additions and 21 deletions

View file

@ -74,6 +74,7 @@ after improving the test harness as part of adopting [#1460](https://github.com/
- Make sure integration tests cover postgres for all scenarios
- CLI commands (all except `serve`) only requires minimal configuration, no more errors or warnings from unset settings [#2109](https://github.com/juanfont/headscale/pull/2109)
- CLI results are now concistently sent to stdout and errors to stderr [#2109](https://github.com/juanfont/headscale/pull/2109)
- Fix issue where shutting down headscale would hang [#2113](https://github.com/juanfont/headscale/pull/2113)
## 0.22.3 (2023-05-12)

View file

@ -1,6 +1,9 @@
package cli
import (
"errors"
"net/http"
"github.com/rs/zerolog/log"
"github.com/spf13/cobra"
)
@ -22,8 +25,8 @@ var serveCmd = &cobra.Command{
}
err = app.Serve()
if err != nil {
log.Fatal().Caller().Err(err).Msg("Error starting server")
if err != nil && !errors.Is(err, http.ErrServerClosed) {
log.Fatal().Caller().Err(err).Msg("Headscale ran into an error and had to shut down.")
}
},
}

View file

@ -770,7 +770,7 @@ func (h *Headscale) Serve() error {
})
}
default:
trace := log.Trace().Msgf
info := func(msg string) { log.Info().Msg(msg) }
log.Info().
Str("signal", sig.String()).
Msg("Received signal to stop, shutting down gracefully")
@ -778,55 +778,55 @@ func (h *Headscale) Serve() error {
expireNodeCancel()
h.ephemeralGC.Close()
trace("waiting for netmap stream to close")
h.pollNetMapStreamWG.Wait()
// Gracefully shut down servers
ctx, cancel := context.WithTimeout(
context.Background(),
types.HTTPShutdownTimeout,
)
trace("shutting down debug http server")
info("shutting down debug http server")
if err := debugHTTPServer.Shutdown(ctx); err != nil {
log.Error().Err(err).Msg("Failed to shutdown prometheus http")
log.Error().Err(err).Msg("failed to shutdown prometheus http")
}
trace("shutting down main http server")
info("shutting down main http server")
if err := httpServer.Shutdown(ctx); err != nil {
log.Error().Err(err).Msg("Failed to shutdown http")
log.Error().Err(err).Msg("failed to shutdown http")
}
trace("shutting down grpc server (socket)")
info("closing node notifier")
h.nodeNotifier.Close()
info("waiting for netmap stream to close")
h.pollNetMapStreamWG.Wait()
info("shutting down grpc server (socket)")
grpcSocket.GracefulStop()
if grpcServer != nil {
trace("shutting down grpc server (external)")
info("shutting down grpc server (external)")
grpcServer.GracefulStop()
grpcListener.Close()
}
if tailsqlContext != nil {
trace("shutting down tailsql")
info("shutting down tailsql")
tailsqlContext.Done()
}
trace("closing node notifier")
h.nodeNotifier.Close()
// Close network listeners
trace("closing network listeners")
info("closing network listeners")
debugHTTPListener.Close()
httpListener.Close()
grpcGatewayConn.Close()
// Stop listening (and unlink the socket if unix type):
trace("closing socket listener")
info("closing socket listener")
socketListener.Close()
// Close db connections
trace("closing database connection")
info("closing database connection")
err = h.db.Close()
if err != nil {
log.Error().Err(err).Msg("Failed to close db")
log.Error().Err(err).Msg("failed to close db")
}
log.Info().

View file

@ -36,6 +36,7 @@ type Notifier struct {
connected *xsync.MapOf[types.NodeID, bool]
b *batcher
cfg *types.Config
closed bool
}
func NewNotifier(cfg *types.Config) *Notifier {
@ -43,6 +44,7 @@ func NewNotifier(cfg *types.Config) *Notifier {
nodes: make(map[types.NodeID]chan<- types.StateUpdate),
connected: xsync.NewMapOf[types.NodeID, bool](),
cfg: cfg,
closed: false,
}
b := newBatcher(cfg.Tuning.BatchChangeDelay, n)
n.b = b
@ -51,9 +53,19 @@ func NewNotifier(cfg *types.Config) *Notifier {
return n
}
// Close stops the batcher inside the notifier.
// Close stops the batcher and closes all channels.
func (n *Notifier) Close() {
notifierWaitersForLock.WithLabelValues("lock", "close").Inc()
n.l.Lock()
defer n.l.Unlock()
notifierWaitersForLock.WithLabelValues("lock", "close").Dec()
n.closed = true
n.b.close()
for _, c := range n.nodes {
close(c)
}
}
func (n *Notifier) tracef(nID types.NodeID, msg string, args ...any) {
@ -70,6 +82,10 @@ func (n *Notifier) AddNode(nodeID types.NodeID, c chan<- types.StateUpdate) {
notifierWaitersForLock.WithLabelValues("lock", "add").Dec()
notifierWaitForLock.WithLabelValues("add").Observe(time.Since(start).Seconds())
if n.closed {
return
}
// If a channel exists, it means the node has opened a new
// connection. Close the old channel and replace it.
if curr, ok := n.nodes[nodeID]; ok {
@ -96,6 +112,10 @@ func (n *Notifier) RemoveNode(nodeID types.NodeID, c chan<- types.StateUpdate) b
notifierWaitersForLock.WithLabelValues("lock", "remove").Dec()
notifierWaitForLock.WithLabelValues("remove").Observe(time.Since(start).Seconds())
if n.closed {
return true
}
if len(n.nodes) == 0 {
return true
}
@ -154,6 +174,10 @@ func (n *Notifier) NotifyWithIgnore(
update types.StateUpdate,
ignoreNodeIDs ...types.NodeID,
) {
if n.closed {
return
}
notifierUpdateReceived.WithLabelValues(update.Type.String(), types.NotifyOriginKey.Value(ctx)).Inc()
n.b.addOrPassthrough(update)
}
@ -170,6 +194,10 @@ func (n *Notifier) NotifyByNodeID(
notifierWaitersForLock.WithLabelValues("lock", "notify").Dec()
notifierWaitForLock.WithLabelValues("notify").Observe(time.Since(start).Seconds())
if n.closed {
return
}
if c, ok := n.nodes[nodeID]; ok {
select {
case <-ctx.Done():
@ -205,6 +233,10 @@ func (n *Notifier) sendAll(update types.StateUpdate) {
notifierWaitersForLock.WithLabelValues("lock", "send-all").Dec()
notifierWaitForLock.WithLabelValues("send-all").Observe(time.Since(start).Seconds())
if n.closed {
return
}
for id, c := range n.nodes {
// Whenever an update is sent to all nodes, there is a chance that the node
// has disconnected and the goroutine that was supposed to consume the update