Skip to content

Commit e14a18a

Browse files
committed
Make TestMultipleVMs_Isolated less painful to run
This commit makes the test less painful to run: (1) by reducing the number of VMs from 100 to 50, and (2) by reducing the timeout so the test fails sooner. Signed-off-by: Kazuyoshi Kato <[email protected]>
1 parent b9404d0 commit e14a18a

File tree

3 files changed

+49
-49
lines changed

3 files changed

+49
-49
lines changed

.buildkite/al2_pipeline.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ steps:
3434
distro: "${BUILDKITE_AGENT_META_DATA_DISTRO}"
3535
hostname: "${BUILDKITE_AGENT_META_DATA_HOSTNAME}"
3636
env:
37-
NUMBER_OF_VMS: "100"
37+
NUMBER_OF_VMS: "10"
3838
EXTRAGOARGS: "-v -count=1 -timeout=1h"
3939
artifact_paths:
4040
- "runtime/logs/*"

.buildkite/pipeline.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,8 @@ steps:
9797
hostname: "${BUILDKITE_AGENT_META_DATA_HOSTNAME}"
9898
env:
9999
DOCKER_IMAGE_TAG: "$BUILDKITE_BUILD_NUMBER"
100-
NUMBER_OF_VMS: 100
101-
EXTRAGOARGS: "-v -count=1 -race -timeout=1h"
100+
NUMBER_OF_VMS: 50
101+
EXTRAGOARGS: "-v -count=1 -race"
102102
FICD_DM_VOLUME_GROUP: fcci-vg
103103
artifact_paths:
104104
- "runtime/logs/*"

runtime/service_integ_test.go

Lines changed: 46 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"strconv"
2727
"strings"
2828
"sync"
29+
"sync/atomic"
2930
"syscall"
3031
"testing"
3132
"time"
@@ -242,49 +243,41 @@ func createTapDevice(ctx context.Context, tapName string) error {
242243
func TestMultipleVMs_Isolated(t *testing.T) {
243244
integtest.Prepare(t)
244245

245-
var err error
246+
// Seems there is a deadlock (see #581).
247+
// Cancel if running VMs takes more than 5 minutes.
248+
const timeout = 5 * time.Minute
246249

247250
// numberOfVmsEnvName = NUMBER_OF_VMS ENV and is configurable from buildkite
251+
var err error
248252
numberOfVms := defaultNumberOfVms
249253
if str := os.Getenv(numberOfVmsEnvName); str != "" {
250254
numberOfVms, err = strconv.Atoi(str)
251255
require.NoError(t, err, "failed to get NUMBER_OF_VMS env")
252256
}
253257
t.Logf("TestMultipleVMs_Isolated: will run up to %d VMs", numberOfVms)
254258

255-
// We should be able to run 10 VMs without any issues.
256-
if numberOfVms <= 10 {
257-
testMultipleVMs(t, 10)
259+
const delta = 10
260+
261+
if numberOfVms <= delta {
262+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
263+
defer cancel()
264+
testMultipleVMs(ctx, t, numberOfVms)
258265
return
259266
}
260267

261-
// We have issues running 100 VMs (see #581).
262-
// Incrementally increase the number of VMs to find the breaking point.
263-
for i := 10; i <= numberOfVms; i += 10 {
264-
success := t.Run(fmt.Sprintf("VMs=%d", i), func(t *testing.T) {
265-
testMultipleVMs(t, i)
268+
// Seems the instability isn't correlating with the number of VMs.
269+
// Having a failure in N VMs doesn't necessary mean running more than N VMs
270+
// doesn't work at all.
271+
for i := delta; i <= numberOfVms; i += delta {
272+
t.Run(fmt.Sprintf("VMs=%d", i), func(t *testing.T) {
273+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
274+
defer cancel()
275+
testMultipleVMs(ctx, t, i)
266276
})
267-
if !success {
268-
// If running N VMs doesn't work, no point to go further.
269-
return
270-
}
271277
}
272278
}
273279

274-
type Event int
275-
276-
const (
277-
Created Event = iota
278-
Stopped
279-
)
280-
281-
func testMultipleVMs(t *testing.T, count int) {
282-
// This test starts multiple VMs and some may hit firecracker-containerd's
283-
// default timeout. So overriding the timeout to wait longer.
284-
// One hour should be enough to start a VM, regardless of the load of
285-
// the underlying host.
286-
const createVMTimeout = 1 * time.Hour
287-
280+
func testMultipleVMs(ctx context.Context, t *testing.T, count int) {
288281
// Apparently writing a lot from Firecracker's serial console blocks VMs.
289282
// https://github.com/firecracker-microvm/firecracker/blob/v1.1.0/docs/prod-host-setup.md
290283
kernelArgs := integtest.DefaultRuntimeConfig.KernelArgs + " 8250.nr_uarts=0 quiet loglevel=1"
@@ -321,7 +314,7 @@ func testMultipleVMs(t *testing.T, count int) {
321314
},
322315
}
323316

324-
testCtx := namespaces.WithNamespace(context.Background(), defaultNamespace)
317+
testCtx := namespaces.WithNamespace(ctx, defaultNamespace)
325318

326319
client, err := containerd.New(integtest.ContainerdSockPath, containerd.WithDefaultRuntime(firecrackerRuntime))
327320
require.NoError(t, err, "unable to create client to containerd service at %s, is containerd running?", integtest.ContainerdSockPath)
@@ -333,14 +326,17 @@ func testMultipleVMs(t *testing.T, count int) {
333326
cfg, err := config.LoadConfig("")
334327
require.NoError(t, err, "failed to load config")
335328

336-
eventCh := make(chan Event)
337-
338329
// Creating tap devices without goroutines somehow stabilize this test.
339330
var devices []string
340331

341332
defer func() {
333+
// It needs a new context to delete all of the devices
334+
// even if the incoming context is being cancelled.
335+
ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
336+
defer cancel()
337+
342338
for _, dev := range devices {
343-
err := deleteTapDevice(testCtx, dev)
339+
err := deleteTapDevice(ctx, dev)
344340
assert.NoError(t, err)
345341
}
346342
}()
@@ -355,6 +351,11 @@ func testMultipleVMs(t *testing.T, count int) {
355351
devices = append(devices, tapName)
356352
}
357353

354+
var (
355+
created int64
356+
stopped int64
357+
)
358+
358359
// This test spawns separate VMs in parallel and ensures containers are spawned within each expected VM. It asserts each
359360
// container ends up in the right VM by assigning each VM a network device with a unique mac address and having each container
360361
// print the mac address it sees inside its VM.
@@ -389,7 +390,7 @@ func testMultipleVMs(t *testing.T, count int) {
389390
},
390391
ContainerCount: containerCount,
391392
JailerConfig: jailerConfig,
392-
TimeoutSeconds: uint32(createVMTimeout / time.Second),
393+
TimeoutSeconds: 60,
393394
// In tests, our in-VM agent has Go's race detector,
394395
// which makes the agent resource-hoggy than its production build
395396
// So the default VM size (128MB) is too small.
@@ -417,7 +418,7 @@ func testMultipleVMs(t *testing.T, count int) {
417418
createVMErr,
418419
)
419420
}
420-
eventCh <- Created
421+
atomic.AddInt64(&created, 1)
421422

422423
containerEg, containerCtx := errgroup.WithContext(vmEgCtx)
423424
for containerID := 0; containerID < int(containerCount); containerID++ {
@@ -478,7 +479,7 @@ func testMultipleVMs(t *testing.T, count int) {
478479
}
479480

480481
_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: strconv.Itoa(vmID), TimeoutSeconds: 5})
481-
eventCh <- Stopped
482+
atomic.AddInt64(&stopped, 1)
482483
return err
483484
}
484485

@@ -494,23 +495,22 @@ func testMultipleVMs(t *testing.T, count int) {
494495
ticker := time.NewTicker(10 * time.Second)
495496
defer ticker.Stop()
496497

497-
var created int
498-
for stopped := 0; stopped < count; {
498+
loop:
499+
for {
499500
select {
500501
case <-vmEgCtx.Done():
501-
require.NoError(t, vmEg.Wait())
502-
return
503-
case e := <-eventCh:
504-
switch e {
505-
case Created:
506-
created++
507-
case Stopped:
508-
stopped++
509-
}
502+
break loop
510503
case <-ticker.C:
511-
t.Logf("created=%d/%d stopped=%d/%d", created, count, stopped, count)
504+
c := atomic.LoadInt64(&created)
505+
s := atomic.LoadInt64(&stopped)
506+
t.Logf("%s: created=%d/%d stopped=%d/%d", time.Now(), c, count, s, count)
507+
if s == int64(count) {
508+
break loop
509+
}
512510
}
513511
}
512+
513+
require.NoError(t, vmEg.Wait())
514514
}
515515

516516
func testMultipleExecs(

0 commit comments

Comments
 (0)