improve testing of route failover logic

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
This commit is contained in:
Kristoffer Dalby 2024-04-10 15:35:09 +02:00 committed by Juan Font
parent bf4fd078fc
commit 1704977e76
11 changed files with 518 additions and 143 deletions

View file

@ -3,7 +3,6 @@ package hscontrol
import (
"encoding/binary"
"encoding/json"
"errors"
"io"
"net/http"
@ -12,7 +11,6 @@ import (
"github.com/rs/zerolog/log"
"golang.org/x/net/http2"
"golang.org/x/net/http2/h2c"
"gorm.io/gorm"
"tailscale.com/control/controlbase"
"tailscale.com/control/controlhttp"
"tailscale.com/tailcfg"
@ -103,12 +101,12 @@ func (h *Headscale) NoiseUpgradeHandler(
router.HandleFunc("/machine/map", noiseServer.NoisePollNetMapHandler)
server := http.Server{
ReadTimeout: types.HTTPReadTimeout,
ReadTimeout: types.HTTPTimeout,
}
noiseServer.httpBaseConfig = &http.Server{
Handler: router,
ReadHeaderTimeout: types.HTTPReadTimeout,
ReadHeaderTimeout: types.HTTPTimeout,
}
noiseServer.http2Server = &http2.Server{}
@ -225,15 +223,6 @@ func (ns *noiseServer) NoisePollNetMapHandler(
key.NodePublic{},
)
if err != nil {
if errors.Is(err, gorm.ErrRecordNotFound) {
log.Warn().
Str("handler", "NoisePollNetMap").
Uint64("node.id", node.ID.Uint64()).
Msgf("Ignoring request, cannot find node with key %s", mapRequest.NodeKey.String())
http.Error(writer, "Internal error", http.StatusNotFound)
return
}
log.Error().
Str("handler", "NoisePollNetMap").
Uint64("node.id", node.ID.Uint64()).
@ -242,58 +231,59 @@ func (ns *noiseServer) NoisePollNetMapHandler(
return
}
log.Debug().
Str("handler", "NoisePollNetMap").
Str("node", node.Hostname).
Int("cap_ver", int(mapRequest.Version)).
Uint64("node.id", node.ID.Uint64()).
Msg("A node sending a MapRequest with Noise protocol")
sess := ns.headscale.newMapSession(req.Context(), mapRequest, writer, node)
session := ns.headscale.newMapSession(req.Context(), mapRequest, writer, node)
sess.tracef("a node sending a MapRequest with Noise protocol")
// If a streaming mapSession exists for this node, close it
// and start a new one.
if session.isStreaming() {
log.Debug().
Caller().
Uint64("node.id", node.ID.Uint64()).
Int("cap_ver", int(mapRequest.Version)).
Msg("Aquiring lock to check stream")
if sess.isStreaming() {
sess.tracef("aquiring lock to check stream")
ns.headscale.mapSessionMu.Lock()
if oldSession, ok := ns.headscale.mapSessions[node.ID]; ok {
log.Info().
Caller().
Uint64("node.id", node.ID.Uint64()).
Msg("Node has an open streaming session, replacing")
oldSession.close()
if _, ok := ns.headscale.mapSessions[node.ID]; ok {
// NOTE/TODO(kradalby): From how I understand the protocol, when
// a client connects with stream=true, and already has a streaming
// connection open, the correct way is to close the current channel
// and replace it. However, I cannot manage to get that working with
// some sort of lock/block happening on the cancelCh in the streaming
// session.
// Not closing the channel and replacing it puts us in a weird state
// which keeps a ghost stream open, receiving keep alives, but no updates.
//
// Typically a new connection is opened when one exists as a client which
// is already authenticated reconnects (e.g. down, then up). The client will
// start auth and streaming at the same time, and then cancel the streaming
// when the auth has finished successfully, opening a new connection.
//
// As a work-around to not replacing, abusing the clients "resilience"
// by reject the new connection which will cause the client to immediately
// reconnect and "fix" the issue, as the other connection typically has been
// closed, meaning there is nothing to replace.
//
// sess.infof("node has an open stream(%p), replacing with %p", oldSession, sess)
// oldSession.close()
defer ns.headscale.mapSessionMu.Unlock()
sess.infof("node has an open stream(%p), rejecting new stream", sess)
return
}
ns.headscale.mapSessions[node.ID] = session
ns.headscale.mapSessions[node.ID] = sess
ns.headscale.mapSessionMu.Unlock()
log.Debug().
Caller().
Uint64("node.id", node.ID.Uint64()).
Int("cap_ver", int(mapRequest.Version)).
Msg("Releasing lock to check stream")
sess.tracef("releasing lock to check stream")
}
session.serve()
sess.serve()
if session.isStreaming() {
log.Debug().
Caller().
Uint64("node.id", node.ID.Uint64()).
Int("cap_ver", int(mapRequest.Version)).
Msg("Aquiring lock to remove stream")
if sess.isStreaming() {
sess.tracef("aquiring lock to remove stream")
ns.headscale.mapSessionMu.Lock()
defer ns.headscale.mapSessionMu.Unlock()
delete(ns.headscale.mapSessions, node.ID)
ns.headscale.mapSessionMu.Unlock()
log.Debug().
Caller().
Uint64("node.id", node.ID.Uint64()).
Int("cap_ver", int(mapRequest.Version)).
Msg("Releasing lock to remove stream")
sess.tracef("releasing lock to remove stream")
}
}