From 8b10058f25117bb9cb183d4fedd2916f5de87060 Mon Sep 17 00:00:00 2001 From: Harrryr Date: Thu, 7 Dec 2023 15:29:38 -0800 Subject: [PATCH 1/6] Add Retry Mechanism to E2E EKS Terraform Deployment --- .github/workflows/appsignals-e2e-eks-test.yml | 126 +++++++++++------- 1 file changed, 76 insertions(+), 50 deletions(-) diff --git a/.github/workflows/appsignals-e2e-eks-test.yml b/.github/workflows/appsignals-e2e-eks-test.yml index 09beef68a6..50e3646ffd 100644 --- a/.github/workflows/appsignals-e2e-eks-test.yml +++ b/.github/workflows/appsignals-e2e-eks-test.yml @@ -83,22 +83,6 @@ jobs: with: terraform_wrapper: false - - name: Deploy sample app via terraform - working-directory: testing/terraform/eks - run: | - terraform init - terraform validate - terraform apply -auto-approve \ - -var="test_id=${{ env.TESTING_ID }}" \ - -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \ - -var="kube_directory_path=${{ github.workspace }}/.kube" \ - -var="eks_cluster_name=${{ inputs.test-cluster-name }}" \ - -var="eks_cluster_context_name=$(kubectl config current-context)" \ - -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \ - -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \ - -var="sample_app_image=${{ env.SAMPLE_APP_FRONTEND_SERVICE_IMAGE }}" \ - -var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}" - # Enable App Signals on the test cluster - name: Pull and unzip enablement script from S3 run: aws s3 cp ${{ env.ENABLEMENT_SCRIPT_S3_BUCKET }} . 
&& unzip -j onboarding.zip @@ -107,21 +91,80 @@ jobs: if: inputs.caller-workflow-name == 'main-build' run: "sed -i 's#image:.*#image: ${{ inputs.appsignals-adot-image-name }}#g' instrumentation.yaml" - - name: Enable App Signals - run: | - ./enable-app-signals.sh \ - ${{ inputs.test-cluster-name }} \ - ${{ env.AWS_DEFAULT_REGION }} \ - ${{ env.SAMPLE_APP_NAMESPACE }} - - # Application pods need to be restarted for the - # app signals instrumentation to take effect - - name: Restart the app pods - run: kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }} - - - name: Wait for sample app pods to come up + - name: Deploy sample app via terraform and wait for the endpoint to come online + id: deploy-sample-app + working-directory: testing/terraform/eks run: | - kubectl wait --for=condition=Ready pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }} + terraform init + terraform validate + + # Attempt to deploy the sample app on an EKS instance and wait for its endpoint to come online. + # There may be occasional failures due to transitivity issues, so try up to 2 times. + # Success of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates + # that it failed at some point + retry_counter=0 + max_retry=2 + success=1 + while [ $success -eq 1 ]; do + if [ $retry_counter -ge $max_retry ]; then + exit 1 + fi + success=0 + terraform apply -auto-approve \ + -var="test_id=${{ env.TESTING_ID }}" \ + -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \ + -var="kube_directory_path=${{ github.workspace }}/.kube" \ + -var="eks_cluster_name=${{ inputs.test-cluster-name }}" \ + -var="eks_cluster_context_name=$(kubectl config current-context)" \ + -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \ + -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \ + -var="sample_app_image=${{ env.SAMPLE_APP_FRONTEND_SERVICE_IMAGE }}" \ + -var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}" || success=$? 
+ + # If the success is still 0, then the terraform deployment succeeded and now try to connect to the endpoint + # after installing App Signals. Attempts to connect will be made for up to 10 minutes + if [ $success -eq 0 ]; then + echo "Installing app signals to the sample app" + ../../../enable-app-signals.sh \ + ${{ inputs.test-cluster-name }} \ + ${{ env.AWS_DEFAULT_REGION }} \ + ${{ env.SAMPLE_APP_NAMESPACE }} + + kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }} + kubectl wait --for=condition=Ready pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }} + + echo "Attempting to connect to the endpoint" + sample_app_endpoint=http://$(terraform output sample_app_endpoint) + attempt_counter=0 + max_attempts=60 + until $(curl --output /dev/null --silent --head --fail $(echo "$sample_app_endpoint" | tr -d '"')); do + if [ ${attempt_counter} -eq ${max_attempts} ];then + echo "Max attempts reached" + success=1 + break + fi + + printf '.' + attempt_counter=$(($attempt_counter+1)) + sleep 10 + done + fi + + # If the success is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the + # resources created from terraform and try again. 
+ if [ $success -eq 1 ]; then + kubectl delete namespace ${{ env.SAMPLE_APP_NAMESPACE }} + terraform destroy -auto-approve \ + -var="test_id=${{ env.TESTING_ID }}" \ + -var="kube_directory_path=${{ github.workspace }}/.kube" \ + -var="eks_cluster_name=${{ inputs.test-cluster-name }}" \ + -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \ + -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \ + -var="sample_app_image=${{ env.SAMPLE_APP_IMAGE }}" + + retry_counter=$(($retry_counter+1)) + fi + done - name: Get remote service pod name and IP run: | @@ -139,30 +182,13 @@ jobs: jq '.items[0].status.containerStatuses[0].imageID' - name: Get the sample app endpoint - run: | - echo "APP_ENDPOINT=$(terraform output sample_app_endpoint)" >> $GITHUB_ENV + run: echo "APP_ENDPOINT=$(terraform output sample_app_endpoint)" >> $GITHUB_ENV working-directory: testing/terraform/eks - - name: Wait for app endpoint to come online - id: endpoint-check - run: | - attempt_counter=0 - max_attempts=30 - until $(curl --output /dev/null --silent --head --fail http://${{ env.APP_ENDPOINT }}); do - if [ ${attempt_counter} -eq ${max_attempts} ];then - echo "Max attempts reached" - exit 1 - fi - - printf '.' 
- attempt_counter=$(($attempt_counter+1)) - sleep 10 - done - # Validation for app signals telemetry data - name: Call endpoint and validate generated EMF logs id: log-validation - if: steps.endpoint-check.outcome == 'success' && !cancelled() + if: steps.deploy-sample-app.outcome == 'success' && !cancelled() run: ./gradlew testing:validator:run --args='-c eks/log-validation.yml --testing-id ${{ env.TESTING_ID }} --endpoint http://${{ env.APP_ENDPOINT }} @@ -274,4 +300,4 @@ jobs: --name service-account-${{ env.TESTING_ID }} \ --namespace ${{ env.SAMPLE_APP_NAMESPACE }} \ --cluster ${{ inputs.test-cluster-name }} \ - --region ${{ env.AWS_DEFAULT_REGION }} \ + --region ${{ env.AWS_DEFAULT_REGION }} \ No newline at end of file From 3c676542aa93482252792ffb56d18fb08bdadcab Mon Sep 17 00:00:00 2001 From: Harrryr Date: Fri, 8 Dec 2023 13:41:54 -0800 Subject: [PATCH 2/6] Add Extra Comments --- .github/workflows/appsignals-e2e-eks-test.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/appsignals-e2e-eks-test.yml b/.github/workflows/appsignals-e2e-eks-test.yml index 50e3646ffd..8c580bea7e 100644 --- a/.github/workflows/appsignals-e2e-eks-test.yml +++ b/.github/workflows/appsignals-e2e-eks-test.yml @@ -107,8 +107,10 @@ jobs: success=1 while [ $success -eq 1 ]; do if [ $retry_counter -ge $max_retry ]; then + echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code" exit 1 fi + echo "Attempt $retry_counter" success=0 terraform apply -auto-approve \ -var="test_id=${{ env.TESTING_ID }}" \ @@ -121,6 +123,10 @@ jobs: -var="sample_app_image=${{ env.SAMPLE_APP_FRONTEND_SERVICE_IMAGE }}" \ -var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}" || success=$? + if [ $success -eq 1 ]; then + echo "Terraform deployment was unsuccessful. Will attempt to retry deployment." 
+ fi + # If the success is still 0, then the terraform deployment succeeded and now try to connect to the endpoint # after installing App Signals. Attempts to connect will be made for up to 10 minutes if [ $success -eq 0 ]; then @@ -139,7 +145,7 @@ jobs: max_attempts=60 until $(curl --output /dev/null --silent --head --fail $(echo "$sample_app_endpoint" | tr -d '"')); do if [ ${attempt_counter} -eq ${max_attempts} ];then - echo "Max attempts reached" + echo "Failed to connect to endpoint. Will attempt to redeploy sample app." success=1 break fi @@ -153,6 +159,7 @@ jobs: # If the success is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the # resources created from terraform and try again. if [ $success -eq 1 ]; then + echo "Destroying terraform" kubectl delete namespace ${{ env.SAMPLE_APP_NAMESPACE }} terraform destroy -auto-approve \ -var="test_id=${{ env.TESTING_ID }}" \ From 736fd3c4e581169bec89bd6886ffef796e397fb6 Mon Sep 17 00:00:00 2001 From: Harrryr Date: Fri, 8 Dec 2023 14:14:21 -0800 Subject: [PATCH 3/6] Call Test APIs First before Validation --- .github/workflows/appsignals-e2e-eks-test.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/appsignals-e2e-eks-test.yml b/.github/workflows/appsignals-e2e-eks-test.yml index 8c580bea7e..75dbde30fe 100644 --- a/.github/workflows/appsignals-e2e-eks-test.yml +++ b/.github/workflows/appsignals-e2e-eks-test.yml @@ -192,6 +192,15 @@ jobs: run: echo "APP_ENDPOINT=$(terraform output sample_app_endpoint)" >> $GITHUB_ENV working-directory: testing/terraform/eks + # This step increases the speed of the validation by creating the telemetry data in advance + - name: Call all test APIs + continue-on-error: true + run: | + curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/outgoing-http-call/ + curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/aws-sdk-call/ + curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/remote-service?ip=${{ 
env.REMOTE_SERVICE_POD_IP }}/ + curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/client-call/ + # Validation for app signals telemetry data - name: Call endpoint and validate generated EMF logs id: log-validation From 586abc59df612d8ceec131bb6e12f45951051a5e Mon Sep 17 00:00:00 2001 From: Harrryr Date: Fri, 8 Dec 2023 18:12:37 -0800 Subject: [PATCH 4/6] Add clean-app-signals to retry logic --- .github/workflows/appsignals-e2e-eks-test.yml | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/.github/workflows/appsignals-e2e-eks-test.yml b/.github/workflows/appsignals-e2e-eks-test.yml index 75dbde30fe..625e980262 100644 --- a/.github/workflows/appsignals-e2e-eks-test.yml +++ b/.github/workflows/appsignals-e2e-eks-test.yml @@ -91,6 +91,12 @@ jobs: if: inputs.caller-workflow-name == 'main-build' run: "sed -i 's#image:.*#image: ${{ inputs.appsignals-adot-image-name }}#g' instrumentation.yaml" + - name: Remove log group deletion command + if: always() + run: | + delete_log_group="aws logs delete-log-group --log-group-name '${{ env.LOG_GROUP_NAME }}' --region \$REGION" + sed -i "s#$delete_log_group##g" clean-app-signals.sh + - name: Deploy sample app via terraform and wait for the endpoint to come online id: deploy-sample-app working-directory: testing/terraform/eks @@ -126,7 +132,7 @@ jobs: if [ $success -eq 1 ]; then echo "Terraform deployment was unsuccessful. Will attempt to retry deployment." fi - + # If the success is still 0, then the terraform deployment succeeded and now try to connect to the endpoint # after installing App Signals. Attempts to connect will be made for up to 10 minutes if [ $success -eq 0 ]; then @@ -159,10 +165,19 @@ jobs: # If the success is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the # resources created from terraform and try again. 
if [ $success -eq 1 ]; then + echo "Cleaning up App Signal" + ../../../clean-app-signals.sh \ + ${{ inputs.test-cluster-name }} \ + ${{ env.AWS_DEFAULT_REGION }} \ + ${{ env.SAMPLE_APP_NAMESPACE }} + + # Running clean-app-signals.sh removes the current cluster from the config. Update the cluster again for subsequent runs. + aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ env.AWS_DEFAULT_REGION }} + echo "Destroying terraform" - kubectl delete namespace ${{ env.SAMPLE_APP_NAMESPACE }} terraform destroy -auto-approve \ -var="test_id=${{ env.TESTING_ID }}" \ + -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \ -var="kube_directory_path=${{ github.workspace }}/.kube" \ -var="eks_cluster_name=${{ inputs.test-cluster-name }}" \ -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \ @@ -273,12 +288,6 @@ jobs: # Clean up Procedures - - name: Remove log group deletion command - if: always() - run: | - delete_log_group="aws logs delete-log-group --log-group-name '${{ env.LOG_GROUP_NAME }}' --region \$REGION" - sed -i "s#$delete_log_group##g" clean-app-signals.sh - - name: Clean Up App Signals if: always() continue-on-error: true From b9ec088a2e8efe580d78a734b7586e946e354095 Mon Sep 17 00:00:00 2001 From: Harrryr Date: Wed, 13 Dec 2023 12:47:18 -0800 Subject: [PATCH 5/6] Change App Signal Download Directory and modify if statement for validation --- .github/workflows/appsignals-e2e-eks-test.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/appsignals-e2e-eks-test.yml b/.github/workflows/appsignals-e2e-eks-test.yml index 625e980262..5703047d7a 100644 --- a/.github/workflows/appsignals-e2e-eks-test.yml +++ b/.github/workflows/appsignals-e2e-eks-test.yml @@ -85,6 +85,7 @@ jobs: # Enable App Signals on the test cluster - name: Pull and unzip enablement script from S3 + working-directory: testing/terraform/eks run: aws s3 cp ${{ env.ENABLEMENT_SCRIPT_S3_BUCKET }} . 
&& unzip -j onboarding.zip - name: Change ADOT image if main-build @@ -93,6 +94,7 @@ jobs: - name: Remove log group deletion command if: always() + working-directory: testing/terraform/eks run: | delete_log_group="aws logs delete-log-group --log-group-name '${{ env.LOG_GROUP_NAME }}' --region \$REGION" sed -i "s#$delete_log_group##g" clean-app-signals.sh @@ -137,7 +139,7 @@ jobs: # after installing App Signals. Attempts to connect will be made for up to 10 minutes if [ $success -eq 0 ]; then echo "Installing app signals to the sample app" - ../../../enable-app-signals.sh \ + ./enable-app-signals.sh \ ${{ inputs.test-cluster-name }} \ ${{ env.AWS_DEFAULT_REGION }} \ ${{ env.SAMPLE_APP_NAMESPACE }} @@ -166,7 +168,7 @@ jobs: # resources created from terraform and try again. if [ $success -eq 1 ]; then echo "Cleaning up App Signal" - ../../../clean-app-signals.sh \ + ./clean-app-signals.sh \ ${{ inputs.test-cluster-name }} \ ${{ env.AWS_DEFAULT_REGION }} \ ${{ env.SAMPLE_APP_NAMESPACE }} @@ -236,7 +238,7 @@ jobs: - name: Call endpoints and validate generated metrics id: metric-validation - if: (success() || steps.log-validation.outcome == 'failure') && !cancelled() + if: (steps.deploy-sample-app.outcome == 'success' || steps.log-validation.outcome == 'failure') && !cancelled() run: ./gradlew testing:validator:run --args='-c eks/metric-validation.yml --testing-id ${{ env.TESTING_ID }} --endpoint http://${{ env.APP_ENDPOINT }} @@ -254,7 +256,7 @@ jobs: - name: Call endpoints and validate generated traces id: trace-validation - if: (success() || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled() + if: (steps.deploy-sample-app.outcome == 'success' || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled() run: ./gradlew testing:validator:run --args='-c eks/trace-validation.yml --testing-id ${{ env.TESTING_ID }} --endpoint http://${{ env.APP_ENDPOINT }} From 
a4b1f0dc7dc62ec2edf3689d200504441068b7e9 Mon Sep 17 00:00:00 2001 From: Harrryr Date: Wed, 13 Dec 2023 14:30:05 -0800 Subject: [PATCH 6/6] Modify while loop and refactor code --- .github/workflows/appsignals-e2e-eks-test.yml | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/.github/workflows/appsignals-e2e-eks-test.yml b/.github/workflows/appsignals-e2e-eks-test.yml index 5703047d7a..e97522f133 100644 --- a/.github/workflows/appsignals-e2e-eks-test.yml +++ b/.github/workflows/appsignals-e2e-eks-test.yml @@ -108,18 +108,13 @@ jobs: # Attempt to deploy the sample app on an EKS instance and wait for its endpoint to come online. # There may be occasional failures due to transitivity issues, so try up to 2 times. - # Success of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates + # deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates # that it failed at some point retry_counter=0 max_retry=2 - success=1 - while [ $success -eq 1 ]; do - if [ $retry_counter -ge $max_retry ]; then - echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code" - exit 1 - fi + while [ $retry_counter -lt $max_retry ]; do echo "Attempt $retry_counter" - success=0 + deployment_failed=0 terraform apply -auto-approve \ -var="test_id=${{ env.TESTING_ID }}" \ -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \ @@ -129,15 +124,16 @@ jobs: -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \ -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \ -var="sample_app_image=${{ env.SAMPLE_APP_FRONTEND_SERVICE_IMAGE }}" \ - -var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}" || success=$? + -var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}" \ + || deployment_failed=$? 
- if [ $success -eq 1 ]; then + if [ $deployment_failed -eq 1 ]; then echo "Terraform deployment was unsuccessful. Will attempt to retry deployment." fi - # If the success is still 0, then the terraform deployment succeeded and now try to connect to the endpoint + # If the deployment_failed is still 0, then the terraform deployment succeeded and now try to connect to the endpoint # after installing App Signals. Attempts to connect will be made for up to 10 minutes - if [ $success -eq 0 ]; then + if [ $deployment_failed -eq 0 ]; then echo "Installing app signals to the sample app" ./enable-app-signals.sh \ ${{ inputs.test-cluster-name }} \ @@ -154,7 +150,7 @@ jobs: until $(curl --output /dev/null --silent --head --fail $(echo "$sample_app_endpoint" | tr -d '"')); do if [ ${attempt_counter} -eq ${max_attempts} ];then echo "Failed to connect to endpoint. Will attempt to redeploy sample app." - success=1 + deployment_failed=1 break fi @@ -164,9 +160,9 @@ jobs: done fi - # If the success is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the + # If the deployment_failed is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the # resources created from terraform and try again. - if [ $success -eq 1 ]; then + if [ $deployment_failed -eq 1 ]; then echo "Cleaning up App Signal" ./clean-app-signals.sh \ ${{ inputs.test-cluster-name }} \ @@ -187,6 +183,14 @@ jobs: -var="sample_app_image=${{ env.SAMPLE_APP_IMAGE }}" retry_counter=$(($retry_counter+1)) + else + # If deployment succeeded, then exit the loop + break + fi + + if [ $retry_counter -eq $max_retry ]; then + echo "Max retry reached, failed to deploy terraform and connect to the endpoint. 
Exiting code" + exit 1 fi done @@ -327,4 +331,4 @@ jobs: --name service-account-${{ env.TESTING_ID }} \ --namespace ${{ env.SAMPLE_APP_NAMESPACE }} \ --cluster ${{ inputs.test-cluster-name }} \ - --region ${{ env.AWS_DEFAULT_REGION }} \ No newline at end of file + --region ${{ env.AWS_DEFAULT_REGION }}