From 11d357ee66b2cd3ff0ceff282bb6f7de30f56da1 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Mon, 26 Sep 2016 15:31:37 +0100 Subject: [PATCH 1/3] ciao-launcher: Show all stacks when panic()ing When panic()ing as part of a timed out termination process it's useful to get stack traces for all running Goroutines. This is necessary as in Go 1.6+ panic() was changed to only show the stack for the Goroutine that called panic(). Signed-off-by: Rob Bradford --- ciao-launcher/main.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ciao-launcher/main.go b/ciao-launcher/main.go index 5a3d6208f..6825a35d2 100644 --- a/ciao-launcher/main.go +++ b/ciao-launcher/main.go @@ -24,6 +24,7 @@ import ( "os" "os/signal" "path" + "runtime/debug" "sync" "syscall" "time" @@ -563,7 +564,7 @@ DONE: glog.Flush() /* We panic here to see which naughty go routines are still running. */ - + debug.SetTraceback("all") panic("Server Loop did not exit within 1 second quitting") } } From c7116c3618fd04d9c1230f15e807e2f7573aa7fb Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Wed, 28 Sep 2016 11:02:59 +0100 Subject: [PATCH 2/3] ciao-launcher: Update doc to say panic() will now show all goroutines Signed-off-by: Rob Bradford --- ciao-launcher/doc.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ciao-launcher/doc.go b/ciao-launcher/doc.go index edc34e4d1..5620634c7 100644 --- a/ciao-launcher/doc.go +++ b/ciao-launcher/doc.go @@ -42,10 +42,7 @@ // it instructs all child go routines to quit and waits for their exit. Note that // it only waits for 1 second. If all child go routines have failed to exit in 1 // second, ciao-launcher panics. The panic is useful as it prints the stack trace of -// all the running go routines, so you can see which ones are blocked. At least -// this was the intention. The default behaviour of the go runtime has changed in -// this regard in 1.6 so a small code change is required, but you get the idea, I -// hope. 
+// all the running go routines, so you can see which ones are blocked. // // The Server go routine // From a59ae919f30984c55b840828d9ad560b1851b234 Mon Sep 17 00:00:00 2001 From: Rob Bradford Date: Mon, 26 Sep 2016 18:49:04 +0100 Subject: [PATCH 3/3] ciao-launcher: Fix potential deadlock at launcher startup Ensure that the connection to the server and overseer is ready before processing commands. This avoids a race where ConnectNotify() was returning a command before Dial() (and its follow-up work) was completed. This would result in a deadlock as the overseer was not ready to receive the command. Fixes: #591 Signed-off-by: Rob Bradford --- ciao-launcher/main.go | 81 ++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/ciao-launcher/main.go b/ciao-launcher/main.go index 6825a35d2..8dbcfb622 100644 --- a/ciao-launcher/main.go +++ b/ciao-launcher/main.go @@ -17,6 +17,7 @@ package main import ( + "context" "flag" "fmt" "log" @@ -29,8 +30,6 @@ import ( "syscall" "time" - "context" - "github.com/01org/ciao/osprepare" "github.com/01org/ciao/payloads" "github.com/01org/ciao/ssntp" @@ -390,50 +389,52 @@ func connectToServer(doneCh chan struct{}, statusCh chan struct{}) { dialCh <- err }() - dialing := true + select { + case err := <-dialCh: + if err != nil { + break + } + clusterConfig, err := client.conn.ClusterConfiguration() + if err != nil { + glog.Errorf("Unable to get Cluster Configuration %v", err) + client.conn.Close() + break + } + computeNet = clusterConfig.Configure.Launcher.ComputeNetwork + mgmtNet = clusterConfig.Configure.Launcher.ManagementNetwork + diskLimit = clusterConfig.Configure.Launcher.DiskLimit + memLimit = clusterConfig.Configure.Launcher.MemoryLimit + if secretPath == "" { + secretPath = clusterConfig.Configure.Storage.SecretPath + } + if cephID == "" { + cephID = clusterConfig.Configure.Storage.CephID + } + printClusterConfig() -DONE: - for { - select { - case err := <-dialCh: - dialing = 
false - if err != nil { - break DONE - } - clusterConfig, err := client.conn.ClusterConfiguration() - if err != nil { - glog.Errorf("Unable to get Cluster Configuration %v", err) - client.conn.Close() - break DONE - } - computeNet = clusterConfig.Configure.Launcher.ComputeNetwork - mgmtNet = clusterConfig.Configure.Launcher.ManagementNetwork - diskLimit = clusterConfig.Configure.Launcher.DiskLimit - memLimit = clusterConfig.Configure.Launcher.MemoryLimit - if secretPath == "" { - secretPath = clusterConfig.Configure.Storage.SecretPath - } - if cephID == "" { - cephID = clusterConfig.Configure.Storage.CephID - } - printClusterConfig() + client.installLauncherDeps() - client.installLauncherDeps() + err = startNetwork(doneCh) + if err != nil { + glog.Errorf("Failed to start network: %v\n", err) + client.conn.Close() + break + } + defer shutdownNetwork() - err = startNetwork(doneCh) - if err != nil { - glog.Errorf("Failed to start network: %v\n", err) - client.conn.Close() - break DONE - } - defer shutdownNetwork() + ovsCh = startOverseer(&wg, client) + case <-doneCh: + client.conn.Close() + <-dialCh + return + } - ovsCh = startOverseer(&wg, client) +DONE: + for { + select { case <-doneCh: client.conn.Close() - if !dialing { - break DONE - } + break DONE case cmd := <-client.cmdCh: /* Double check we're not quitting here. Otherwise a flood of commands