Skip to content

Commit d71e631

Browse files
feat: add failover to on-demand in case request is failing (#3409)
Adding the option to create on-demand instances in case spot is failing. ## Problem This module supports either spot or on-demand instances. When using spot (the default), creation can fail for several reasons. In case there is no sufficient capacity on AWS it makes sense to request on-demand instances. AWS does not support this kind of request via the fleet API. This PR addresses this problem and adds the option (opt-in) to create on-demand instances in case of insufficient capacity. ## Migrations No migrations required ## Opt-in Opt in by setting the variable `enable_runner_on_demand_failover_for_errors` ## Verification Not easy to test the failover. But due to changes in multi-runner, runner module as well as the lambda scale-up and pool, the following checks are required - [x] Ephemeral example with pool - [x] Multi runner example --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent a949ead commit d71e631

File tree

23 files changed

+384
-112
lines changed

23 files changed

+384
-112
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,7 @@ We welcome any improvement to the standard module to make the default as secure
532532
| <a name="input_enable_organization_runners"></a> [enable\_organization\_runners](#input\_enable\_organization\_runners) | Register runners to organization, instead of repo level | `bool` | `false` | no |
533533
| <a name="input_enable_runner_binaries_syncer"></a> [enable\_runner\_binaries\_syncer](#input\_enable\_runner\_binaries\_syncer) | Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI. | `bool` | `true` | no |
534534
| <a name="input_enable_runner_detailed_monitoring"></a> [enable\_runner\_detailed\_monitoring](#input\_enable\_runner\_detailed\_monitoring) | Should detailed monitoring be enabled for the runner. Set this to true if you want to use detailed monitoring. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-cloudwatch-new.html for details. | `bool` | `false` | no |
535+
| <a name="input_enable_runner_on_demand_failover_for_errors"></a> [enable\_runner\_on\_demand\_failover\_for\_errors](#input\_enable\_runner\_on\_demand\_failover\_for\_errors) | Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later. | `list(string)` | `[]` | no |
535536
| <a name="input_enable_runner_workflow_job_labels_check_all"></a> [enable\_runner\_workflow\_job\_labels\_check\_all](#input\_enable\_runner\_workflow\_job\_labels\_check\_all) | If set to true all labels in the workflow job must match the GitHub labels (os, architecture and `self-hosted`). When false if __any__ label matches it will trigger the webhook. | `bool` | `true` | no |
536537
| <a name="input_enable_ssm_on_runners"></a> [enable\_ssm\_on\_runners](#input\_enable\_ssm\_on\_runners) | Enable to allow access to the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances. | `bool` | `false` | no |
537538
| <a name="input_enable_user_data_debug_logging_runner"></a> [enable\_user\_data\_debug\_logging\_runner](#input\_enable\_user\_data\_debug\_logging\_runner) | Option to enable debug logging for user-data, this logs all secrets as well. | `bool` | `false` | no |

examples/multi-runner/templates/runner-configs/linux-x64.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ runner_config:
1515
- m5a.large
1616
runners_maximum_count: 1
1717
enable_ephemeral_runners: true
18+
enable_on_demand_failover_for_errors: ['InsufficientInstanceCapacity']
1819
create_service_linked_role_spot: true
1920
scale_down_schedule_expression: cron(* * * * ? *)
2021
runner_metadata_options:

lambdas/functions/control-plane/jest.config.ts

+4-4
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ const config: Config = {
66
...defaultConfig,
77
coverageThreshold: {
88
global: {
9-
statements: 97.89,
10-
branches: 94.64,
11-
functions: 97.33,
12-
lines: 98.21,
9+
statements: 97.99,
10+
branches: 96.04,
11+
functions: 97.53,
12+
lines: 98.3,
1313
},
1414
},
1515
};

lambdas/functions/control-plane/src/aws/runners.d.ts

+2-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ export interface RunnerInputParameters {
3737
maxSpotPrice?: string;
3838
instanceAllocationStrategy: SpotAllocationStrategy;
3939
};
40-
numberOfRunners?: number;
40+
numberOfRunners: number;
4141
amiIdSsmParameterName?: string;
4242
tracingEnabled?: boolean;
43+
onDemandFailoverOnError?: string[];
4344
}

lambdas/functions/control-plane/src/aws/runners.test.ts

+166-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import {
22
CreateFleetCommand,
33
CreateFleetCommandInput,
4+
CreateFleetInstance,
45
CreateFleetResult,
56
DefaultTargetCapacityType,
67
DescribeInstancesCommand,
@@ -140,6 +141,28 @@ describe('list instances', () => {
140141
const resp = await listEC2Runners();
141142
expect(resp.length).toBe(1);
142143
});
144+
145+
it('Filter instances for state running.', async () => {
146+
mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstances);
147+
await listEC2Runners({ statuses: ['running'] });
148+
expect(mockEC2Client).toHaveReceivedCommandWith(DescribeInstancesCommand, {
149+
Filters: [
150+
{ Name: 'instance-state-name', Values: ['running'] },
151+
{ Name: 'tag:ghr:Application', Values: ['github-action-runner'] },
152+
],
153+
});
154+
});
155+
156+
it('Filter instances with status undefined, fall back to defaults.', async () => {
157+
mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstances);
158+
await listEC2Runners({ statuses: undefined });
159+
expect(mockEC2Client).toHaveReceivedCommandWith(DescribeInstancesCommand, {
160+
Filters: [
161+
{ Name: 'instance-state-name', Values: ['running', 'pending'] },
162+
{ Name: 'tag:ghr:Application', Values: ['github-action-runner'] },
163+
],
164+
});
165+
});
143166
});
144167

145168
describe('terminate runner', () => {
@@ -178,7 +201,6 @@ describe('create runner', () => {
178201
mockEC2Client.reset();
179202
mockSSMClient.reset();
180203

181-
//mockEC2.createFleet.mockImplementation(() => mockCreateFleet);
182204
mockEC2Client.on(CreateFleetCommand).resolves({ Instances: [{ InstanceIds: ['i-1234'] }] });
183205
mockSSMClient.on(GetParameterCommand).resolves({});
184206
});
@@ -333,6 +355,130 @@ describe('create runner with errors', () => {
333355
expect(mockEC2Client).not.toHaveReceivedCommand(CreateFleetCommand);
334356
expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
335357
});
358+
359+
it('Error with undefined Instances and Errors.', async () => {
360+
mockEC2Client.on(CreateFleetCommand).resolvesOnce({ Instances: undefined, Errors: undefined });
361+
await expect(createRunner(createRunnerConfig(defaultRunnerConfig))).rejects.toBeInstanceOf(Error);
362+
});
363+
364+
it('Error with undefined InstanceIds and ErrorCode.', async () => {
365+
mockEC2Client.on(CreateFleetCommand).resolvesOnce({
366+
Instances: [{ InstanceIds: undefined }],
367+
Errors: [
368+
{
369+
ErrorCode: undefined,
370+
},
371+
],
372+
});
373+
await expect(createRunner(createRunnerConfig(defaultRunnerConfig))).rejects.toBeInstanceOf(Error);
374+
});
375+
});
376+
377+
describe('create runner with errors fail over to OnDemand', () => {
378+
const defaultRunnerConfig: RunnerConfig = {
379+
allocationStrategy: SpotAllocationStrategy.CAPACITY_OPTIMIZED,
380+
capacityType: 'spot',
381+
type: 'Repo',
382+
onDemandFailoverOnError: ['InsufficientInstanceCapacity'],
383+
};
384+
const defaultExpectedFleetRequestValues: ExpectedFleetRequestValues = {
385+
type: 'Repo',
386+
capacityType: 'spot',
387+
allocationStrategy: SpotAllocationStrategy.CAPACITY_OPTIMIZED,
388+
totalTargetCapacity: 1,
389+
};
390+
beforeEach(() => {
391+
jest.clearAllMocks();
392+
mockEC2Client.reset();
393+
mockSSMClient.reset();
394+
395+
mockSSMClient.on(PutParameterCommand).resolves({});
396+
mockSSMClient.on(GetParameterCommand).resolves({});
397+
mockEC2Client.on(CreateFleetCommand).resolves({ Instances: [] });
398+
});
399+
400+
it('test InsufficientInstanceCapacity fallback to on demand .', async () => {
401+
const instancesIds = ['i-123'];
402+
createFleetMockWithWithOnDemandFallback(['InsufficientInstanceCapacity'], instancesIds);
403+
404+
const instancesResult = await createRunner(createRunnerConfig(defaultRunnerConfig));
405+
expect(instancesResult).toEqual(instancesIds);
406+
407+
expect(mockEC2Client).toHaveReceivedCommandTimes(CreateFleetCommand, 2);
408+
409+
// first call with spot failuer
410+
expect(mockEC2Client).toHaveReceivedNthCommandWith(1, CreateFleetCommand, {
411+
...expectedCreateFleetRequest({
412+
...defaultExpectedFleetRequestValues,
413+
totalTargetCapacity: 1,
414+
capacityType: 'spot',
415+
}),
416+
});
417+
418+
// second call with with OnDemand failback
419+
expect(mockEC2Client).toHaveReceivedNthCommandWith(2, CreateFleetCommand, {
420+
...expectedCreateFleetRequest({
421+
...defaultExpectedFleetRequestValues,
422+
totalTargetCapacity: 1,
423+
capacityType: 'on-demand',
424+
}),
425+
});
426+
});
427+
428+
it('test InsufficientInstanceCapacity no failback.', async () => {
429+
await expect(
430+
createRunner(createRunnerConfig({ ...defaultRunnerConfig, onDemandFailoverOnError: [] })),
431+
).rejects.toBeInstanceOf(Error);
432+
});
433+
434+
it('test InsufficientInstanceCapacity with mutlipte instances and fallback to on demand .', async () => {
435+
const instancesIds = ['i-123', 'i-456'];
436+
createFleetMockWithWithOnDemandFallback(['InsufficientInstanceCapacity'], instancesIds);
437+
438+
const instancesResult = await createRunner({ ...createRunnerConfig(defaultRunnerConfig), numberOfRunners: 2 });
439+
expect(instancesResult).toEqual(instancesIds);
440+
441+
expect(mockEC2Client).toHaveReceivedCommandTimes(CreateFleetCommand, 2);
442+
443+
// first call with spot failuer
444+
expect(mockEC2Client).toHaveReceivedNthCommandWith(1, CreateFleetCommand, {
445+
...expectedCreateFleetRequest({
446+
...defaultExpectedFleetRequestValues,
447+
totalTargetCapacity: 2,
448+
capacityType: 'spot',
449+
}),
450+
});
451+
452+
// second call with with OnDemand failback, capacity is reduced by 1
453+
expect(mockEC2Client).toHaveReceivedNthCommandWith(2, CreateFleetCommand, {
454+
...expectedCreateFleetRequest({
455+
...defaultExpectedFleetRequestValues,
456+
totalTargetCapacity: 1,
457+
capacityType: 'on-demand',
458+
}),
459+
});
460+
});
461+
462+
it('test UnfulfillableCapacity with mutlipte instances and no fallback to on demand .', async () => {
463+
const instancesIds = ['i-123', 'i-456'];
464+
// fallback to on demand for UnfulfillableCapacity but InsufficientInstanceCapacity is thrown
465+
createFleetMockWithWithOnDemandFallback(['UnfulfillableCapacity'], instancesIds);
466+
467+
await expect(
468+
createRunner({ ...createRunnerConfig(defaultRunnerConfig), numberOfRunners: 2 }),
469+
).rejects.toBeInstanceOf(Error);
470+
471+
expect(mockEC2Client).toHaveReceivedCommandTimes(CreateFleetCommand, 1);
472+
473+
// first call with spot failuer
474+
expect(mockEC2Client).toHaveReceivedNthCommandWith(1, CreateFleetCommand, {
475+
...expectedCreateFleetRequest({
476+
...defaultExpectedFleetRequestValues,
477+
totalTargetCapacity: 2,
478+
capacityType: 'spot',
479+
}),
480+
});
481+
});
336482
});
337483

338484
function createFleetMockWithErrors(errors: string[], instances?: string[]) {
@@ -354,20 +500,37 @@ function createFleetMockWithErrors(errors: string[], instances?: string[]) {
354500
mockEC2Client.on(CreateFleetCommand).resolves(result);
355501
}
356502

503+
function createFleetMockWithWithOnDemandFallback(errors: string[], instances?: string[], numberOfFailures = 1) {
504+
const instanceesFirstCall: CreateFleetInstance = {
505+
InstanceIds: instances?.slice(0, instances.length - numberOfFailures).map((i) => i),
506+
};
507+
508+
const instancesSecondCall: CreateFleetInstance = {
509+
InstanceIds: instances?.slice(instances.length - numberOfFailures, instances.length).map((i) => i),
510+
};
511+
512+
mockEC2Client
513+
.on(CreateFleetCommand)
514+
.resolvesOnce({ Instances: [instanceesFirstCall], Errors: errors.map((e) => ({ ErrorCode: e })) })
515+
.resolvesOnce({ Instances: [instancesSecondCall] });
516+
}
517+
357518
interface RunnerConfig {
358519
type: RunnerType;
359520
capacityType: DefaultTargetCapacityType;
360521
allocationStrategy: SpotAllocationStrategy;
361522
maxSpotPrice?: string;
362523
amiIdSsmParameterName?: string;
363524
tracingEnabled?: boolean;
525+
onDemandFailoverOnError?: string[];
364526
}
365527

366528
function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
367529
return {
368530
environment: ENVIRONMENT,
369531
runnerType: runnerConfig.type,
370532
runnerOwner: REPO_NAME,
533+
numberOfRunners: 1,
371534
launchTemplateName: LAUNCH_TEMPLATE,
372535
ec2instanceCriteria: {
373536
instanceTypes: ['m5.large', 'c5.large'],
@@ -378,6 +541,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
378541
subnets: ['subnet-123', 'subnet-456'],
379542
amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
380543
tracingEnabled: runnerConfig.tracingEnabled,
544+
onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError,
381545
};
382546
}
383547

@@ -451,7 +615,7 @@ function expectedCreateFleetRequest(expectedValues: ExpectedFleetRequestValues):
451615
};
452616

453617
if (expectedValues.imageId) {
454-
for (const config of request?.LaunchTemplateConfigs || []) {
618+
for (const config of request?.LaunchTemplateConfigs ?? []) {
455619
if (config.Overrides) {
456620
for (const override of config.Overrides) {
457621
override.ImageId = expectedValues.imageId;

0 commit comments

Comments
 (0)