@@ -26,6 +26,7 @@ import (
 	"strconv"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"syscall"
 	"testing"
 	"time"
@@ -242,49 +243,41 @@ func createTapDevice(ctx context.Context, tapName string) error {
 func TestMultipleVMs_Isolated(t *testing.T) {
 	integtest.Prepare(t)
 
-	var err error
+	// Seems there is a deadlock (see #581).
+	// Cancel if running the VMs takes more than 5 minutes.
+	const timeout = 5 * time.Minute
 
 	// numberOfVmsEnvName = NUMBER_OF_VMS ENV and is configurable from buildkite
+	var err error
 	numberOfVms := defaultNumberOfVms
 	if str := os.Getenv(numberOfVmsEnvName); str != "" {
 		numberOfVms, err = strconv.Atoi(str)
 		require.NoError(t, err, "failed to get NUMBER_OF_VMS env")
 	}
 	t.Logf("TestMultipleVMs_Isolated: will run up to %d VMs", numberOfVms)
 
-	// We should be able to run 10 VMs without any issues.
-	if numberOfVms <= 10 {
-		testMultipleVMs(t, 10)
+	const delta = 10
+
+	if numberOfVms <= delta {
+		ctx, cancel := context.WithTimeout(context.Background(), timeout)
+		defer cancel()
+		testMultipleVMs(ctx, t, numberOfVms)
 		return
 	}
 
-	// We have issues running 100 VMs (see #581).
-	// Incrementally increase the number of VMs to find the breaking point.
-	for i := 10; i <= numberOfVms; i += 10 {
-		success := t.Run(fmt.Sprintf("VMs=%d", i), func(t *testing.T) {
-			testMultipleVMs(t, i)
+	// Seems the instability isn't correlated with the number of VMs.
+	// A failure with N VMs doesn't necessarily mean that running more than
+	// N VMs doesn't work at all.
+	for i := delta; i <= numberOfVms; i += delta {
+		t.Run(fmt.Sprintf("VMs=%d", i), func(t *testing.T) {
+			ctx, cancel := context.WithTimeout(context.Background(), timeout)
+			defer cancel()
+			testMultipleVMs(ctx, t, i)
 		})
-		if !success {
-			// If running N VMs doesn't work, no point to go further.
-			return
-		}
 	}
 }
 
-type Event int
-
-const (
-	Created Event = iota
-	Stopped
-)
-
-func testMultipleVMs(t *testing.T, count int) {
-	// This test starts multiple VMs and some may hit firecracker-containerd's
-	// default timeout. So overriding the timeout to wait longer.
-	// One hour should be enough to start a VM, regardless of the load of
-	// the underlying host.
-	const createVMTimeout = 1 * time.Hour
-
+func testMultipleVMs(ctx context.Context, t *testing.T, count int) {
 	// Apparently writing a lot from Firecracker's serial console blocks VMs.
 	// https://github.com/firecracker-microvm/firecracker/blob/v1.1.0/docs/prod-host-setup.md
 	kernelArgs := integtest.DefaultRuntimeConfig.KernelArgs + " 8250.nr_uarts=0 quiet loglevel=1"
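The hunk above drops the incremental early-exit loop and instead gives every run its own deadline via context.WithTimeout, so a hang cancels only that subtest's context. A minimal sketch of the same pattern as a test file; the names runVMs and TestPerSubtestTimeout are hypothetical stand-ins, not part of this repository:

package example

import (
	"context"
	"fmt"
	"testing"
	"time"
)

// runVMs is a hypothetical stand-in for testMultipleVMs: it blocks until
// the simulated work finishes or the context is cancelled.
func runVMs(ctx context.Context, t *testing.T, count int) {
	select {
	case <-time.After(10 * time.Millisecond): // simulated work
	case <-ctx.Done():
		t.Fatalf("running %d VMs timed out: %v", count, ctx.Err())
	}
}

func TestPerSubtestTimeout(t *testing.T) {
	const timeout = 5 * time.Minute
	for _, n := range []int{10, 20, 30} {
		n := n
		t.Run(fmt.Sprintf("VMs=%d", n), func(t *testing.T) {
			// Each subtest gets its own deadline; a hang in one run only
			// cancels that run's context.
			ctx, cancel := context.WithTimeout(context.Background(), timeout)
			defer cancel()
			runVMs(ctx, t, n)
		})
	}
}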
@@ -321,7 +314,7 @@ func testMultipleVMs(t *testing.T, count int) {
 		},
 	}
 
-	testCtx := namespaces.WithNamespace(context.Background(), defaultNamespace)
+	testCtx := namespaces.WithNamespace(ctx, defaultNamespace)
 
 	client, err := containerd.New(integtest.ContainerdSockPath, containerd.WithDefaultRuntime(firecrackerRuntime))
 	require.NoError(t, err, "unable to create client to containerd service at %s, is containerd running?", integtest.ContainerdSockPath)
@@ -333,14 +326,17 @@ func testMultipleVMs(t *testing.T, count int) {
 	cfg, err := config.LoadConfig("")
 	require.NoError(t, err, "failed to load config")
 
-	eventCh := make(chan Event)
-
 	// Creating tap devices without goroutines somehow stabilize this test.
 	var devices []string
 
 	defer func() {
+		// Use a new context to delete all of the devices,
+		// even if the incoming context has been cancelled.
+		ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
+		defer cancel()
+
 		for _, dev := range devices {
-			err := deleteTapDevice(testCtx, dev)
+			err := deleteTapDevice(ctx, dev)
 			assert.NoError(t, err)
 		}
 	}()
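The deferred cleanup above deliberately derives its context from context.Background() rather than reusing the incoming ctx: by the time the defer runs, ctx may already be cancelled by the five-minute timeout, and tap deletion issued with it would fail immediately. A small sketch of that pattern; doWork and cleanup are hypothetical names used only for illustration:

package example

import (
	"context"
	"log"
	"time"
)

// doWork is a sketch: the real test registers tap devices and removes
// them in a deferred cleanup.
func doWork(ctx context.Context, resources []string, cleanup func(context.Context, string) error) error {
	defer func() {
		// Fresh context with its own deadline, so cleanup still runs
		// even if the incoming ctx was already cancelled by a timeout.
		cleanupCtx, cancel := context.WithTimeout(context.Background(), time.Minute)
		defer cancel()
		for _, r := range resources {
			if err := cleanup(cleanupCtx, r); err != nil {
				log.Printf("cleanup of %s failed: %v", r, err)
			}
		}
	}()

	// ... work that respects ctx ...
	<-ctx.Done()
	return ctx.Err()
}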
@@ -355,6 +351,11 @@ func testMultipleVMs(t *testing.T, count int) {
 		devices = append(devices, tapName)
 	}
 
+	var (
+		created int64
+		stopped int64
+	)
+
 	// This test spawns separate VMs in parallel and ensures containers are spawned within each expected VM. It asserts each
 	// container ends up in the right VM by assigning each VM a network device with a unique mac address and having each container
 	// print the mac address it sees inside its VM.
@@ -389,7 +390,7 @@ func testMultipleVMs(t *testing.T, count int) {
 			},
 			ContainerCount: containerCount,
 			JailerConfig:   jailerConfig,
-			TimeoutSeconds: uint32(createVMTimeout / time.Second),
+			TimeoutSeconds: 60,
 			// In tests, our in-VM agent has Go's race detector,
 			// which makes the agent resource-hoggy than its production build
 			// So the default VM size (128MB) is too small.
@@ -417,7 +418,7 @@ func testMultipleVMs(t *testing.T, count int) {
 				createVMErr,
 			)
 		}
-		eventCh <- Created
+		atomic.AddInt64(&created, 1)
 
 		containerEg, containerCtx := errgroup.WithContext(vmEgCtx)
 		for containerID := 0; containerID < int(containerCount); containerID++ {
@@ -478,7 +479,7 @@ func testMultipleVMs(t *testing.T, count int) {
 		}
 
 		_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: strconv.Itoa(vmID), TimeoutSeconds: 5})
-		eventCh <- Stopped
+		atomic.AddInt64(&stopped, 1)
 		return err
 	}
 
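Counting with sync/atomic instead of sending on the unbuffered eventCh means a VM goroutine never blocks waiting for the monitoring loop to receive; if that loop exited early, sends on an unbuffered channel would park the remaining goroutines forever. A self-contained sketch of the counter pattern; runAll and work are illustrative names only:

package example

import (
	"sync"
	"sync/atomic"
)

// runAll is a sketch: each goroutine bumps atomic counters instead of
// sending events on a channel, so it never blocks on a reader, and any
// goroutine can sample progress at any time with atomic.LoadInt64.
func runAll(n int, work func(int)) (created, stopped int64) {
	var c, s int64
	var wg sync.WaitGroup
	for i := 0; i < n; i++ {
		i := i
		wg.Add(1)
		go func() {
			defer wg.Done()
			atomic.AddInt64(&c, 1) // VM "created"
			work(i)
			atomic.AddInt64(&s, 1) // VM "stopped"
		}()
	}
	wg.Wait()
	return atomic.LoadInt64(&c), atomic.LoadInt64(&s)
}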
@@ -494,23 +495,22 @@ func testMultipleVMs(t *testing.T, count int) {
 	ticker := time.NewTicker(10 * time.Second)
 	defer ticker.Stop()
 
-	var created int
-	for stopped := 0; stopped < count; {
+loop:
+	for {
 		select {
 		case <-vmEgCtx.Done():
-			require.NoError(t, vmEg.Wait())
-			return
-		case e := <-eventCh:
-			switch e {
-			case Created:
-				created++
-			case Stopped:
-				stopped++
-			}
+			break loop
 		case <-ticker.C:
-			t.Logf("created=%d/%d stopped=%d/%d", created, count, stopped, count)
+			c := atomic.LoadInt64(&created)
+			s := atomic.LoadInt64(&stopped)
+			t.Logf("%s: created=%d/%d stopped=%d/%d", time.Now(), c, count, s, count)
+			if s == int64(count) {
+				break loop
+			}
 		}
 	}
+
+	require.NoError(t, vmEg.Wait())
 }
 
 func testMultipleExecs(
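In the new monitoring loop, break loop needs its label because a bare break inside select only leaves the select statement, not the enclosing for; once the loop exits, a single vmEg.Wait() surfaces any goroutine error. A stripped-down sketch of that control flow; waitForVMs and done are hypothetical names, and the ticker interval is only illustrative:

package example

import (
	"context"
	"time"

	"golang.org/x/sync/errgroup"
)

// waitForVMs is a sketch of the monitoring loop: poll progress on a
// ticker, leave the outer for via a label, then report goroutine errors
// once through the errgroup.
func waitForVMs(ctx context.Context, done func() bool, eg *errgroup.Group) error {
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()

loop:
	for {
		select {
		case <-ctx.Done():
			break loop // a bare break would only exit the select
		case <-ticker.C:
			if done() {
				break loop
			}
		}
	}

	return eg.Wait()
}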