aboutsummaryrefslogtreecommitdiffstats
path: root/core/core.go
diff options
context:
space:
mode:
Diffstat (limited to 'core/core.go')
-rw-r--r--core/core.go79
1 files changed, 55 insertions, 24 deletions
diff --git a/core/core.go b/core/core.go
index 6714f6f..4782ba1 100644
--- a/core/core.go
+++ b/core/core.go
@@ -55,7 +55,7 @@ const (
Connected ConnectionState = iota
// Disconnected connection state means that the connection to VPP has been lost.
- Disconnected = iota
+ Disconnected
)
// ConnectionEvent is a notification about change in the VPP connection state.
@@ -85,6 +85,9 @@ type Connection struct {
maxChannelID uint32 // maximum used client ID
pingReqID uint16 // ID if the ControlPing message
pingReplyID uint16 // ID of the ControlPingReply message
+
+ lastReplyLock sync.Mutex // lock for the last reply
+ lastReply time.Time // time of the last received reply from VPP
}
// channelMetadata contains core-local metadata of an API channel.
@@ -271,17 +274,19 @@ func (c *Connection) connectLoop(connChan chan ConnectionEvent) {
// it continues with connectLoop and tries to reconnect.
func (c *Connection) healthCheckLoop(connChan chan ConnectionEvent) {
// create a separate API channel for health check probes
- ch, err := conn.NewAPIChannel()
+ ch, err := conn.NewAPIChannelBuffered(1, 1)
if err != nil {
- log.Error("Error by creating health check API channel, health check will be disabled:", err)
+ log.Error("Failed to create health check API channel, health check will be disabled:", err)
return
}
- failedChecks := 0
- // send health check probes until an error occurs
+ var sinceLastReply time.Duration
+ var failedChecks int
+
+ // send health check probes until an error or timeout occurs
for {
- // wait for healthCheckProbeInterval
- <-time.After(healthCheckProbeInterval)
+ // sleep until next health check probe period
+ time.Sleep(healthCheckProbeInterval)
if atomic.LoadUint32(&c.connected) == 0 {
// Disconnect has been called in the meantime, return the healthcheck - reconnect loop
@@ -289,30 +294,56 @@ func (c *Connection) healthCheckLoop(connChan chan ConnectionEvent) {
return
}
- // send the control ping
- ch.ReqChan <- &api.VppRequest{Message: msgControlPing}
-
- // expect response within timeout period
+ // try draining probe replies from previous request before sending next one
select {
- case vppReply := <-ch.ReplyChan:
- err = vppReply.Error
- case <-time.After(healthCheckReplyTimeout):
- err = errors.New("probe reply not received within the timeout period")
+ case <-ch.ReplyChan:
+ log.Debug("drained old probe reply from reply channel")
+ default:
}
- if err != nil {
- failedChecks++
- log.Warnf("VPP health check failed (%d. time): %v", failedChecks, err)
- } else {
- failedChecks = 0
+ // send the control ping request
+ ch.ReqChan <- &api.VppRequest{Message: msgControlPing}
+
+ for {
+ // expect response within timeout period
+ select {
+ case vppReply := <-ch.ReplyChan:
+ err = vppReply.Error
+
+ case <-time.After(healthCheckReplyTimeout):
+ err = ErrProbeTimeout
+
+ // check if time since last reply from any other
+ // channel is less than health check reply timeout
+ conn.lastReplyLock.Lock()
+ sinceLastReply = time.Since(c.lastReply)
+ conn.lastReplyLock.Unlock()
+
+ if sinceLastReply < healthCheckReplyTimeout {
+ log.Warnf("VPP health check probe timing out, but some request on other channel was received %v ago, continue waiting!", sinceLastReply)
+ continue
+ }
+ }
+ break
}
- if failedChecks > healthCheckThreshold {
- // in case of error, break & disconnect
- log.Errorf("Number of VPP health check fails exceeded treshold (%d)", healthCheckThreshold, err)
- // signal disconnected event via channel
+ if err == ErrProbeTimeout {
+ failedChecks++
+ log.Warnf("VPP health check probe timed out after %v (%d. timeout)", healthCheckReplyTimeout, failedChecks)
+ if failedChecks > healthCheckThreshold {
+ // in case of exceeded treshold disconnect
+ log.Errorf("VPP health check exceeded treshold for timeouts (>%d), assuming disconnect", healthCheckThreshold)
+ connChan <- ConnectionEvent{Timestamp: time.Now(), State: Disconnected}
+ break
+ }
+ } else if err != nil {
+ // in case of error disconnect
+ log.Errorf("VPP health check probe failed: %v", err)
connChan <- ConnectionEvent{Timestamp: time.Now(), State: Disconnected}
break
+ } else if failedChecks > 0 {
+ failedChecks = 0
+ log.Infof("VPP health check probe OK")
}
}