Skip to content

Commit 522c147

Browse files
authored
fix to reduce test flakiness (#53)
Adds a debug logging statement that reports the resource status when the status exists but the ARN does not. Additionally adds a random sleep in the `create_sagemaker_resource` method. Many instances of the same resource may be created at the same time, in which case `wait_resource_consumed_by_controller` can take longer than 30 seconds to pass; the random sleep is used instead of adding extra wait time. The resources are all created eventually, but the tests fail at the assert because it only waits 30 seconds.
1 parent 17c03a5 commit 522c147

16 files changed

+104
-56
lines changed

test/e2e/__init__.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import time
1717
import boto3
1818
from pathlib import Path
19+
import random
1920

2021
from acktest.k8s import resource as k8s
2122

@@ -36,12 +37,22 @@ def sagemaker_client():
3637

3738

3839
def create_sagemaker_resource(
39-
resource_plural, resource_name, spec_file, replacements, namespace="default"
40+
resource_plural,
41+
resource_name,
42+
spec_file,
43+
replacements,
44+
namespace="default",
45+
wait_period=3,
46+
period_length=10,
4047
):
4148
"""
4249
Wrapper around k8s.load_and_create_resource to create a SageMaker resource
4350
"""
44-
51+
# Add a random sleep to prevent throttling exception before the call to load and create
52+
# This is because there may be many of the same resource ex: Multiple Models being created at the same time
53+
# If that happens, a throttling exception may be raised and cause the tests to fail; this sleep prevents that.
54+
rand = random.randrange(1, 4)
55+
time.sleep(rand)
4556
reference, spec, resource = k8s.load_and_create_resource(
4657
resource_directory,
4758
CRD_GROUP,
@@ -51,6 +62,8 @@ def create_sagemaker_resource(
5162
spec_file,
5263
replacements,
5364
namespace,
65+
wait_period,
66+
period_length,
5467
)
5568

5669
return reference, spec, resource

test/e2e/common/fixtures.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"""
1515

1616
import pytest
17-
17+
import logging
1818
from e2e import (
1919
create_sagemaker_resource,
2020
wait_sagemaker_endpoint_status,
@@ -51,6 +51,10 @@ def xgboost_churn_endpoint(sagemaker_client):
5151
replacements=replacements,
5252
)
5353
assert model_resource is not None
54+
if k8s.get_resource_arn(model_resource) is None:
55+
logging.debug(
56+
f"ARN for this resource is None, resource status is: {model_resource['status']}"
57+
)
5458
assert k8s.get_resource_arn(model_resource) is not None
5559

5660
(
@@ -64,6 +68,10 @@ def xgboost_churn_endpoint(sagemaker_client):
6468
replacements=replacements,
6569
)
6670
assert endpoint_config_resource is not None
71+
if k8s.get_resource_arn(endpoint_config_resource) is None:
72+
logging.debug(
73+
f"ARN for this resource is None, resource status is: {endpoint_config_resource['status']}"
74+
)
6775
assert k8s.get_resource_arn(endpoint_config_resource) is not None
6876

6977
endpoint_reference, endpoint_spec, endpoint_resource = create_sagemaker_resource(
@@ -73,6 +81,10 @@ def xgboost_churn_endpoint(sagemaker_client):
7381
replacements=replacements,
7482
)
7583
assert endpoint_resource is not None
84+
if k8s.get_resource_arn(endpoint_resource) is None:
85+
logging.debug(
86+
f"ARN for this resource is None, resource status is: {endpoint_resource['status']}"
87+
)
7688
assert k8s.get_resource_arn(endpoint_resource) is not None
7789
wait_sagemaker_endpoint_status(replacements["ENDPOINT_NAME"], "InService")
7890

test/e2e/requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
acktest @ git+https://github.com/aws-controllers-k8s/test-infra.git@5ed60a505afa953096e53c9d3d6779830250915b
1+
acktest @ git+https://github.com/aws-controllers-k8s/test-infra.git@6518b782a765fb57dda1432482dc79c0711b73c2
22
black==20.8b1

test/e2e/service_bootstrap.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,7 @@ def service_bootstrap() -> dict:
123123
logging.getLogger().setLevel(logging.INFO)
124124

125125
return TestBootstrapResources(
126-
create_data_bucket(),
127-
create_execution_role(),
126+
create_data_bucket(), create_execution_role(),
128127
).__dict__
129128

130129

test/e2e/tests/test_adopt_endpoint.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,6 @@ def adopted_endpoint(sdk_endpoint):
123123
(model_input, _, endpoint_config_input, _, endpoint_input, _) = sdk_endpoint
124124

125125
replacements = REPLACEMENT_VALUES.copy()
126-
127126
# adopt model
128127
replacements["ADOPTED_RESOURCE_NAME"] = "adopt-" + model_input["ModelName"]
129128
replacements["TARGET_RESOURCE_AWS"] = replacements[
@@ -260,8 +259,6 @@ def test_smoke(self, sdk_endpoint, adopted_endpoint):
260259
)
261260

262261
assert_endpoint_status_in_sync(
263-
endpoint_name,
264-
endpoint_reference,
265-
cfg.ENDPOINT_STATUS_INSERVICE,
262+
endpoint_name, endpoint_reference, cfg.ENDPOINT_STATUS_INSERVICE,
266263
)
267264
assert k8s.wait_on_condition(endpoint_reference, "ACK.ResourceSynced", "True")

test/e2e/tests/test_endpoint.py

+29-21
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ def single_container_model(name_suffix):
5252
replacements=replacements,
5353
)
5454
assert model_resource is not None
55+
if k8s.get_resource_arn(model_resource) is None:
56+
logging.debug(
57+
f"ARN for this resource is None, resource status is: {model_resource['status']}"
58+
)
5559
assert k8s.get_resource_arn(model_resource) is not None
5660

5761
yield (model_reference, model_resource)
@@ -77,6 +81,10 @@ def multi_variant_config(name_suffix, single_container_model):
7781
replacements=replacements,
7882
)
7983
assert config_resource is not None
84+
if k8s.get_resource_arn(config_resource) is None:
85+
logging.debug(
86+
f"ARN for this resource is None, resource status is: {config_resource['status']}"
87+
)
8088
assert k8s.get_resource_arn(config_resource) is not None
8189

8290
yield (config_reference, config_resource)
@@ -102,6 +110,10 @@ def single_variant_config(name_suffix, single_container_model):
102110
replacements=replacements,
103111
)
104112
assert config_resource is not None
113+
if k8s.get_resource_arn(config_resource) is None:
114+
logging.debug(
115+
f"ARN for this resource is None, resource status is: {config_resource['status']}"
116+
)
105117
assert k8s.get_resource_arn(config_resource) is not None
106118

107119
yield (config_reference, config_resource)
@@ -160,7 +172,10 @@ def faulty_config(name_suffix, single_container_model):
160172
replacements=replacements,
161173
)
162174
assert model_resource is not None
163-
model_resource = k8s.get_resource(model_reference)
175+
if k8s.get_resource_arn(model_resource) is None:
176+
logging.debug(
177+
f"ARN for this resource is None, resource status is: {model_resource['status']}"
178+
)
164179
assert k8s.get_resource_arn(model_resource) is not None
165180
s3.delete_object(model_bucket, model_destination_key)
166181

@@ -177,6 +192,10 @@ def faulty_config(name_suffix, single_container_model):
177192
replacements=replacements,
178193
)
179194
assert config_resource is not None
195+
if k8s.get_resource_arn(config_resource) is None:
196+
logging.debug(
197+
f"ARN for this resource is None, resource status is: {config_resource['status']}"
198+
)
180199
assert k8s.get_resource_arn(config_resource) is not None
181200

182201
yield (config_reference, config_resource)
@@ -246,9 +265,7 @@ def update_endpoint_failed_test(
246265

247266
# endpoint transitions Updating -> InService state
248267
assert_endpoint_status_in_sync(
249-
endpoint_reference.name,
250-
endpoint_reference,
251-
cfg.ENDPOINT_STATUS_UPDATING,
268+
endpoint_reference.name, endpoint_reference, cfg.ENDPOINT_STATUS_UPDATING,
252269
)
253270
assert k8s.wait_on_condition(endpoint_reference, "ACK.ResourceSynced", "False")
254271
endpoint_resource = k8s.get_resource(endpoint_reference)
@@ -258,17 +275,12 @@ def update_endpoint_failed_test(
258275
)
259276

260277
assert_endpoint_status_in_sync(
261-
endpoint_reference.name,
262-
endpoint_reference,
263-
cfg.ENDPOINT_STATUS_INSERVICE,
278+
endpoint_reference.name, endpoint_reference, cfg.ENDPOINT_STATUS_INSERVICE,
264279
)
265280

266281
assert k8s.wait_on_condition(endpoint_reference, "ACK.ResourceSynced", "True")
267282
assert k8s.assert_condition_state_message(
268-
endpoint_reference,
269-
"ACK.Terminal",
270-
"True",
271-
FAIL_UPDATE_ERROR_MESSAGE,
283+
endpoint_reference, "ACK.Terminal", "True", FAIL_UPDATE_ERROR_MESSAGE,
272284
)
273285

274286
endpoint_resource = k8s.get_resource(endpoint_reference)
@@ -279,10 +291,10 @@ def update_endpoint_failed_test(
279291
current_config_name = endpoint_resource["status"].get(
280292
"latestEndpointConfigName"
281293
)
282-
assert (
283-
current_config_name is not None
284-
and current_config_name
285-
== old_config_resource["spec"].get("endpointConfigName", None)
294+
assert current_config_name is not None and current_config_name == old_config_resource[
295+
"spec"
296+
].get(
297+
"endpointConfigName", None
286298
)
287299

288300
def update_endpoint_successful_test(
@@ -306,9 +318,7 @@ def update_endpoint_successful_test(
306318

307319
# endpoint transitions Updating -> InService state
308320
assert_endpoint_status_in_sync(
309-
endpoint_reference.name,
310-
endpoint_reference,
311-
cfg.ENDPOINT_STATUS_UPDATING,
321+
endpoint_reference.name, endpoint_reference, cfg.ENDPOINT_STATUS_UPDATING,
312322
)
313323

314324
assert k8s.wait_on_condition(endpoint_reference, "ACK.ResourceSynced", "False")
@@ -322,9 +332,7 @@ def update_endpoint_successful_test(
322332
)
323333

324334
assert_endpoint_status_in_sync(
325-
endpoint_reference.name,
326-
endpoint_reference,
327-
cfg.ENDPOINT_STATUS_INSERVICE,
335+
endpoint_reference.name, endpoint_reference, cfg.ENDPOINT_STATUS_INSERVICE,
328336
)
329337
assert k8s.wait_on_condition(endpoint_reference, "ACK.ResourceSynced", "True")
330338
assert k8s.assert_condition_state_message(

test/e2e/tests/test_endpoint_config.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
"""Integration tests for the SageMaker EndpointConfig API.
1414
"""
1515

16-
from _pytest import config
1716
import botocore
1817
import pytest
1918
import logging
@@ -43,6 +42,10 @@ def single_variant_config():
4342
replacements=replacements,
4443
)
4544
assert model_resource is not None
45+
if k8s.get_resource_arn(model_resource) is None:
46+
logging.debug(
47+
f"ARN for this resource is None, resource status is: {model_resource['status']}"
48+
)
4649
assert k8s.get_resource_arn(model_resource) is not None
4750

4851
config_reference, config_spec, config_resource = create_sagemaker_resource(

test/e2e/tests/test_hpo.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,11 @@ def xgboost_hpojob():
4545
spec_file="xgboost_hpojob",
4646
replacements=replacements,
4747
)
48-
4948
assert resource is not None
49+
if k8s.get_resource_arn(resource) is None:
50+
logging.debug(
51+
f"ARN for this resource is None, resource status is: {resource['status']}"
52+
)
5053
assert k8s.get_resource_arn(resource) is not None
5154

5255
yield (reference, resource)

test/e2e/tests/test_model_bias_job_definition.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,11 @@ def xgboost_churn_model_bias_job_definition(xgboost_churn_endpoint):
5353
yield (reference, resource)
5454

5555
if k8s.get_resource_exists(reference):
56-
_, deleted = k8s.delete_custom_resource(job_definition_reference, 3, 10)
56+
_, deleted = k8s.delete_custom_resource(reference, 3, 10)
5757
assert deleted
5858

5959

60-
def describe_sagemaker_model_bias_job_definition(sagemaker_client, job_definition_name):
60+
def get_sagemaker_model_bias_job_definition(sagemaker_client, job_definition_name):
6161
try:
6262
return sagemaker_client.describe_model_bias_job_definition(
6363
JobDefinitionName=job_definition_name
@@ -79,7 +79,7 @@ def test_smoke(self, sagemaker_client, xgboost_churn_model_bias_job_definition):
7979
job_definition_name = resource["spec"].get("jobDefinitionName")
8080
assert (
8181
k8s.get_resource_arn(resource)
82-
== describe_sagemaker_model_bias_job_definition(
82+
== get_sagemaker_model_bias_job_definition(
8383
sagemaker_client, job_definition_name
8484
)["JobDefinitionArn"]
8585
)
@@ -88,7 +88,7 @@ def test_smoke(self, sagemaker_client, xgboost_churn_model_bias_job_definition):
8888
_, deleted = k8s.delete_custom_resource(reference, 3, 10)
8989
assert deleted
9090
assert (
91-
describe_sagemaker_model_bias_job_definition(
91+
get_sagemaker_model_bias_job_definition(
9292
sagemaker_client, job_definition_name
9393
)
9494
is None

test/e2e/tests/test_model_explainability_job_definition.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,11 @@ def xgboost_churn_model_explainability_job_definition(xgboost_churn_endpoint):
5353
yield (reference, resource)
5454

5555
if k8s.get_resource_exists(reference):
56-
_, deleted = k8s.delete_custom_resource(job_definition_reference, 3, 10)
56+
_, deleted = k8s.delete_custom_resource(reference, 3, 10)
5757
assert deleted
5858

5959

60-
def describe_sagemaker_model_explainability_job_definition(
60+
def get_sagemaker_model_explainability_job_definition(
6161
sagemaker_client, job_definition_name
6262
):
6363
try:
@@ -83,7 +83,7 @@ def test_smoke(
8383
job_definition_name = resource["spec"].get("jobDefinitionName")
8484
assert (
8585
k8s.get_resource_arn(resource)
86-
== describe_sagemaker_model_explainability_job_definition(
86+
== get_sagemaker_model_explainability_job_definition(
8787
sagemaker_client, job_definition_name
8888
)["JobDefinitionArn"]
8989
)
@@ -92,7 +92,7 @@ def test_smoke(
9292
_, deleted = k8s.delete_custom_resource(reference, 3, 10)
9393
assert deleted
9494
assert (
95-
describe_sagemaker_model_explainability_job_definition(
95+
get_sagemaker_model_explainability_job_definition(
9696
sagemaker_client, job_definition_name
9797
)
9898
is None

test/e2e/tests/test_model_quality_job_definition.py

+4-6
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,11 @@ def xgboost_churn_model_quality_job_definition(xgboost_churn_endpoint):
5353
yield (reference, resource)
5454

5555
if k8s.get_resource_exists(reference):
56-
_, deleted = k8s.delete_custom_resource(job_definition_reference, 3, 10)
56+
_, deleted = k8s.delete_custom_resource(reference, 3, 10)
5757
assert deleted
5858

5959

60-
def describe_sagemaker_model_quality_job_definition(
61-
sagemaker_client, job_definition_name
62-
):
60+
def get_sagemaker_model_quality_job_definition(sagemaker_client, job_definition_name):
6361
try:
6462
return sagemaker_client.describe_model_quality_job_definition(
6563
JobDefinitionName=job_definition_name
@@ -81,7 +79,7 @@ def test_smoke(self, sagemaker_client, xgboost_churn_model_quality_job_definitio
8179
job_definition_name = resource["spec"].get("jobDefinitionName")
8280
assert (
8381
k8s.get_resource_arn(resource)
84-
== describe_sagemaker_model_quality_job_definition(
82+
== get_sagemaker_model_quality_job_definition(
8583
sagemaker_client, job_definition_name
8684
)["JobDefinitionArn"]
8785
)
@@ -90,7 +88,7 @@ def test_smoke(self, sagemaker_client, xgboost_churn_model_quality_job_definitio
9088
_, deleted = k8s.delete_custom_resource(reference, 3, 10)
9189
assert deleted
9290
assert (
93-
describe_sagemaker_model_quality_job_definition(
91+
get_sagemaker_model_quality_job_definition(
9492
sagemaker_client, job_definition_name
9593
)
9694
is None

0 commit comments

Comments
 (0)