Skip to content

Commit

Permalink
informant: handle signals (#61)
Browse files Browse the repository at this point in the history
  • Loading branch information
Sam Kleinman authored Mar 13, 2023
1 parent 6884919 commit b2f5506
Showing 1 changed file with 78 additions and 34 deletions.
112 changes: 78 additions & 34 deletions cmd/vm-informant/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import (
"net/http"
"os"
"os/exec"
"os/signal"
"syscall"
"time"

"github.com/containerd/cgroups/v3/cgroup2"
Expand All @@ -18,7 +20,12 @@ import (
"github.com/neondatabase/autoscaling/pkg/util"
)

// minSubProcessRestartInterval is the minimum time between one start of the vm-informant
// subprocess and the next: when the subprocess exits sooner than this, runRestartOnFailure waits
// out the remainder of the interval before starting it again.
const minSubProcessRestartInterval = 5 * time.Second

func main() {
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGTERM)
defer cancel()

buildInfo := util.GetBuildInfo()
klog.Infof("buildInfo.GitInfo: %s", buildInfo.GitInfo)
klog.Infof("buildInfo.GoVersion: %s", buildInfo.GoVersion)
Expand Down Expand Up @@ -61,7 +68,7 @@ func main() {
})
}

runRestartOnFailure(args, cleanupHooks)
runRestartOnFailure(ctx, args, cleanupHooks)
return
}

Expand Down Expand Up @@ -117,49 +124,86 @@ func main() {
// We execute ourselves as a subprocess so that it's possible to appropriately cleanup after
// termination by various signals (or an unhandled panic!). This is worthwhile because we *really*
// don't want to leave the cgroup frozen while waiting to restart.
//
// When ctx is cancelled, SIGTERM is forwarded to the running subprocess (if there is one) and the
// restart loop returns instead of starting another subprocess. The cleanupHooks are invoked after
// each attempt to run the subprocess. A subprocess that ran for less than
// minSubProcessRestartInterval is not restarted until that much time has passed since its start.
func runRestartOnFailure(args []string, cleanupHooks []func()) {
func runRestartOnFailure(ctx context.Context, args []string, cleanupHooks []func()) {
selfPath := os.Args[0]

minWaitDuration := time.Second * 5
// A single timer is reused for every restart delay; it is re-armed with Reset further down.
timer := time.NewTimer(0)
defer timer.Stop()

for {
startTime := time.Now()
func() {
// pctx is scoped to this one subprocess: it is cancelled once the subprocess has been
// waited on (or when this closure returns), which stops the signal-forwarding goroutine.
pctx, pcancel := context.WithCancel(context.Background())
defer pcancel()

cmd := exec.Command(selfPath, args...)
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr

klog.Infof("Running vm-informant with args %+v", args)
err := cmd.Start()
if err == nil {
// Forward a termination request (ctx cancelled) to the subprocess as SIGTERM, unless
// the subprocess has already finished.
go func() {
select {
case <-pctx.Done():
return
case <-ctx.Done():
if pctx.Err() != nil {
// the process has already returned
// and we don't need to signal it
return
}
if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
klog.Warningf("could not signal vm-informant process: %v", err)
}
}
}()

// this is blocking, but we should
// have killed the process in the
// wait goroutine, or the process would
// return normally.
err = cmd.Wait()
// stop the goroutine above, as the
// process has already returned.
pcancel()
}

klog.Infof("Running vm-informant with args %+v", args)
cmd := exec.CommandContext(context.TODO(), selfPath, args...)
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
err := cmd.Run()

var exitMode string
if err != nil {
klog.Errorf("vm-informant exited with error: %v", err)
} else {
klog.Warningf("vm-informant exited without error. This should not happen.")
}

if err != nil {
// lint note: the linter's worried about wrapped errors being incorrect with switch, but
// this is cleaner than the alternative (via errors.As) and it's still correct because
// exec.Command.Run() explicitly mentions ExitError.
switch err.(type) { //nolint:errorlint // see above.
case *exec.ExitError:
exitMode = "failed"
klog.Errorf("vm-informant exited with: %v", err)
default:
exitMode = "failed to start"
klog.Errorf("error running vm-informant: %v", err)
for _, h := range cleanupHooks {
h()
}
} else {
exitMode = ""
klog.Warningf("vm-informant exited without error. This should not happen.")
}
}()

// Do not restart if termination was requested while the subprocess was running.
select {
case <-ctx.Done():
klog.Infof("vm-informant restart loop: received termination signal")
return
default:
dur := time.Since(startTime)
if dur < minSubProcessRestartInterval {
// drain the timer before resetting it, required by Timer.Reset:
// NOTE(review): this drain looks unsafe after the first delayed restart. Once a previous
// iteration has received from timer.C in the select below, the timer is expired AND its
// channel is empty, so Stop reports false and the bare receive has nothing to read; it
// would then block forever, and ctx cancellation cannot interrupt it. A non-blocking
// drain (select { case <-timer.C: default: }) avoids this — confirm against the Go
// version in use, since timer channel semantics changed in Go 1.23.
if !timer.Stop() {
<-timer.C
}
timer.Reset(minSubProcessRestartInterval - dur)

for _, h := range cleanupHooks {
h()
}
klog.Infof("vm-informant exited. respecting minimum wait of %s", minSubProcessRestartInterval)
// Wait out the remainder of the interval, but stop waiting (and do not restart) if
// termination is requested in the meantime.
select {
case <-ctx.Done():
klog.Infof("vm-informant restart loop: received termination signal")
return
case <-timer.C:
continue
}
}

dur := time.Since(startTime)
if dur < minWaitDuration {
klog.Infof("vm-informant %s. respecting minimum wait of %s", exitMode, minWaitDuration)
time.Sleep(minWaitDuration - dur)
} else {
klog.Infof("vm-informant restarting immediately")
continue
}
}
}

0 comments on commit b2f5506

Please sign in to comment.