Deploy the EKS sample app and enable App Signals in one retrying step.

This patch to .github/workflows/appsignals-e2e-eks-test.yml:
  * runs the enablement-script download, the ADOT image override and the
    clean-up script edit from testing/terraform/eks, the directory the
    onboarding archive is unpacked into;
  * folds terraform apply, enable-app-signals.sh, the pod restart and the
    endpoint probe into the "deploy-sample-app" step, which makes up to two
    attempts and cleans up App Signals and terraform between attempts;
  * records every non-zero terraform exit status as a failed deployment, so
    a failure can never fall through to the "deployment succeeded" branch;
  * calls the test APIs once before validation and gates the validation
    steps on steps.deploy-sample-app.outcome.

Note: the post-image blob id on the "index" line below was not regenerated.

diff --git a/.github/workflows/appsignals-e2e-eks-test.yml b/.github/workflows/appsignals-e2e-eks-test.yml
index 09beef68a6..e97522f133 100644
--- a/.github/workflows/appsignals-e2e-eks-test.yml
+++ b/.github/workflows/appsignals-e2e-eks-test.yml
@@ -83,45 +83,117 @@ jobs:
         with:
           terraform_wrapper: false
 
-      - name: Deploy sample app via terraform
-        working-directory: testing/terraform/eks
-        run: |
-          terraform init
-          terraform validate
-          terraform apply -auto-approve \
-            -var="test_id=${{ env.TESTING_ID }}" \
-            -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
-            -var="kube_directory_path=${{ github.workspace }}/.kube" \
-            -var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
-            -var="eks_cluster_context_name=$(kubectl config current-context)" \
-            -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \
-            -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
-            -var="sample_app_image=${{ env.SAMPLE_APP_FRONTEND_SERVICE_IMAGE }}" \
-            -var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}"
-
       # Enable App Signals on the test cluster
       - name: Pull and unzip enablement script from S3
+        working-directory: testing/terraform/eks
         run: aws s3 cp ${{ env.ENABLEMENT_SCRIPT_S3_BUCKET }} . && unzip -j onboarding.zip
 
       - name: Change ADOT image if main-build
         if: inputs.caller-workflow-name == 'main-build'
+        working-directory: testing/terraform/eks
         run: "sed -i 's#image:.*#image: ${{ inputs.appsignals-adot-image-name }}#g' instrumentation.yaml"
 
-      - name: Enable App Signals
+      - name: Remove log group deletion command
+        if: always()
+        working-directory: testing/terraform/eks
         run: |
-          ./enable-app-signals.sh \
-          ${{ inputs.test-cluster-name }} \
-          ${{ env.AWS_DEFAULT_REGION }} \
-          ${{ env.SAMPLE_APP_NAMESPACE }}
-
-      # Application pods need to be restarted for the
-      # app signals instrumentation to take effect
-      - name: Restart the app pods
-        run: kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
+          delete_log_group="aws logs delete-log-group --log-group-name '${{ env.LOG_GROUP_NAME }}' --region \$REGION"
+          sed -i "s#$delete_log_group##g" clean-app-signals.sh
 
-      - name: Wait for sample app pods to come up
+      - name: Deploy sample app via terraform and wait for the endpoint to come online
+        id: deploy-sample-app
+        working-directory: testing/terraform/eks
         run: |
-          kubectl wait --for=condition=Ready pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
+          terraform init
+          terraform validate
+
+          # Attempt to deploy the sample app on an EKS instance and wait for its endpoint to come online.
+          # There may be occasional failures due to transient issues, so try up to 2 times.
+          # deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates
+          # that it failed at some point (any non-zero terraform exit status is recorded as 1)
+          retry_counter=0
+          max_retry=2
+          while [ $retry_counter -lt $max_retry ]; do
+            echo "Attempt $retry_counter"
+            deployment_failed=0
+            terraform apply -auto-approve \
+              -var="test_id=${{ env.TESTING_ID }}" \
+              -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
+              -var="kube_directory_path=${{ github.workspace }}/.kube" \
+              -var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
+              -var="eks_cluster_context_name=$(kubectl config current-context)" \
+              -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \
+              -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
+              -var="sample_app_image=${{ env.SAMPLE_APP_FRONTEND_SERVICE_IMAGE }}" \
+              -var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}" \
+              || deployment_failed=1
+
+            if [ $deployment_failed -eq 1 ]; then
+              echo "Terraform deployment was unsuccessful. Will attempt to retry deployment."
+            fi
+
+            # If the deployment_failed is still 0, then the terraform deployment succeeded and now try to connect to the endpoint
+            # after installing App Signals. Attempts to connect will be made for up to 10 minutes
+            if [ $deployment_failed -eq 0 ]; then
+              echo "Installing app signals to the sample app"
+              ./enable-app-signals.sh \
+              ${{ inputs.test-cluster-name }} \
+              ${{ env.AWS_DEFAULT_REGION }} \
+              ${{ env.SAMPLE_APP_NAMESPACE }}
+
+              kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
+              kubectl wait --for=condition=Ready pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
+
+              echo "Attempting to connect to the endpoint"
+              sample_app_endpoint=http://$(terraform output sample_app_endpoint)
+              attempt_counter=0
+              max_attempts=60
+              until $(curl --output /dev/null --silent --head --fail $(echo "$sample_app_endpoint" | tr -d '"')); do
+                if [ ${attempt_counter} -eq ${max_attempts} ];then
+                  echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
+                  deployment_failed=1
+                  break
+                fi
+
+                printf '.'
+                attempt_counter=$(($attempt_counter+1))
+                sleep 10
+              done
+            fi
+
+            # If the deployment_failed is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the
+            # resources created from terraform and try again.
+            if [ $deployment_failed -eq 1 ]; then
+              echo "Cleaning up App Signal"
+              ./clean-app-signals.sh \
+              ${{ inputs.test-cluster-name }} \
+              ${{ env.AWS_DEFAULT_REGION }} \
+              ${{ env.SAMPLE_APP_NAMESPACE }}
+
+              # Running clean-app-signals.sh removes the current cluster from the config. Update the cluster again for subsequent runs.
+              aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ env.AWS_DEFAULT_REGION }}
+
+              echo "Destroying terraform"
+              terraform destroy -auto-approve \
+                -var="test_id=${{ env.TESTING_ID }}" \
+                -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
+                -var="kube_directory_path=${{ github.workspace }}/.kube" \
+                -var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
+                -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \
+                -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
+                -var="sample_app_image=${{ env.SAMPLE_APP_IMAGE }}"
+
+              retry_counter=$(($retry_counter+1))
+            else
+              # If deployment succeeded, then exit the loop
+              break
+            fi
+
+            if [ $retry_counter -eq $max_retry ]; then
+              echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code"
+              exit 1
+            fi
+          done
 
       - name: Get remote service pod name and IP
         run: |
@@ -139,30 +211,22 @@ jobs:
           jq '.items[0].status.containerStatuses[0].imageID'
 
       - name: Get the sample app endpoint
-        run: |
-          echo "APP_ENDPOINT=$(terraform output sample_app_endpoint)" >> $GITHUB_ENV
+        run: echo "APP_ENDPOINT=$(terraform output sample_app_endpoint)" >> $GITHUB_ENV
         working-directory: testing/terraform/eks
 
-      - name: Wait for app endpoint to come online
-        id: endpoint-check
+      # This step increases the speed of the validation by creating the telemetry data in advance
+      - name: Call all test APIs
+        continue-on-error: true
         run: |
-          attempt_counter=0
-          max_attempts=30
-          until $(curl --output /dev/null --silent --head --fail http://${{ env.APP_ENDPOINT }}); do
-            if [ ${attempt_counter} -eq ${max_attempts} ];then
-              echo "Max attempts reached"
-              exit 1
-            fi
-
-            printf '.'
-            attempt_counter=$(($attempt_counter+1))
-            sleep 10
-          done
+          curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/outgoing-http-call/
+          curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/aws-sdk-call/
+          curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_POD_IP }}/
+          curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/client-call/
 
       # Validation for app signals telemetry data
       - name: Call endpoint and validate generated EMF logs
         id: log-validation
-        if: steps.endpoint-check.outcome == 'success' && !cancelled()
+        if: steps.deploy-sample-app.outcome == 'success' && !cancelled()
         run: ./gradlew testing:validator:run --args='-c eks/log-validation.yml
           --testing-id ${{ env.TESTING_ID }}
           --endpoint http://${{ env.APP_ENDPOINT }}
@@ -179,7 +243,7 @@ jobs:
 
       - name: Call endpoints and validate generated metrics
         id: metric-validation
-        if: (success() || steps.log-validation.outcome == 'failure') && !cancelled()
+        if: (steps.deploy-sample-app.outcome == 'success' || steps.log-validation.outcome == 'failure') && !cancelled()
         run: ./gradlew testing:validator:run --args='-c eks/metric-validation.yml
           --testing-id ${{ env.TESTING_ID }}
           --endpoint http://${{ env.APP_ENDPOINT }}
@@ -197,7 +261,7 @@ jobs:
 
       - name: Call endpoints and validate generated traces
         id: trace-validation
-        if: (success() || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled()
+        if: (steps.deploy-sample-app.outcome == 'success' || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled()
         run: ./gradlew testing:validator:run --args='-c eks/trace-validation.yml
           --testing-id ${{ env.TESTING_ID }}
           --endpoint http://${{ env.APP_ENDPOINT }}
@@ -231,12 +295,6 @@ jobs:
 
       # Clean up Procedures
 
-      - name: Remove log group deletion command
-        if: always()
-        run: |
-          delete_log_group="aws logs delete-log-group --log-group-name '${{ env.LOG_GROUP_NAME }}' --region \$REGION"
-          sed -i "s#$delete_log_group##g" clean-app-signals.sh
-
       - name: Clean Up App Signals
         if: always()
         continue-on-error: true
@@ -274,4 +332,4 @@ jobs:
             --name service-account-${{ env.TESTING_ID }} \
             --namespace ${{ env.SAMPLE_APP_NAMESPACE }} \
             --cluster ${{ inputs.test-cluster-name }} \
-            --region ${{ env.AWS_DEFAULT_REGION }} \
+            --region ${{ env.AWS_DEFAULT_REGION }}