Skip to content

Commit 3cfa853

Browse files
authored
Add support to publish metrics to cloudwatch (#1414)
* Add support to publish metrics to cloudwatch
* Update README
* Refactor cloudwatch namespace and data published
* Add benchmark buildspec and correct instance types
* Correct github repo
* Remove the checkout for custom branch
* Add some clarifying comments
1 parent 0c5daa3 commit 3cfa853

File tree

12 files changed

+118
-16
lines changed

12 files changed

+118
-16
lines changed

benchmarks/automated/README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ Check out a sample vgg11 model config at the path: `tests/suite/vgg11.yaml`
1010
-- [AmazonEC2ContainerRegistryFullAccess](https://console.aws.amazon.com/iam/home#policies/arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess) <br>
1111
-- [AmazonEC2FullAccess](https://console.aws.amazon.com/iam/home#policies/arn:aws:iam::aws:policy/AmazonEC2FullAccess) <br>
1212
-- [AmazonS3FullAccess](https://console.aws.amazon.com/iam/home#policies/arn:aws:iam::aws:policy/AmazonS3FullAccess) <br>
13-
-- [IAMFullAccess](https://console.aws.amazon.com/iam/home#policies/arn:aws:iam::aws:policy/IAMFullAccess)
14-
<br (or at the least iam:passrole).
13+
-- [IAMFullAccess](https://console.aws.amazon.com/iam/home#policies/arn:aws:iam::aws:policy/IAMFullAccess) (or at the least iam:passrole). <br>
14+
-- [CloudWatchFullAccess](https://console.aws.amazon.com/iam/home#/policies/arn:aws:iam::aws:policy/CloudWatchFullAccess$jsonEditor) <br>
1515

1616
* [Create](https://docs.aws.amazon.com/cli/latest/reference/ecr/create-repository.html) an ECR repository with the name “torchserve-benchmark” in the us-west-2 region, e.g.
1717
```

benchmarks/automated/tests/suite/bert_cpu.yaml

+1-2
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,4 @@ bert:
1919
- "cpu"
2020
- "gpus": "all"
2121
instance_types: #special keyword not recognized as a 'model', define instance types per yaml file.
22-
- "p3.8xlarge"
23-
- "c5.4xlarge"
22+
- "m6i.4xlarge"

benchmarks/automated/tests/suite/bert_multi_gpu.yaml

+1-2
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,4 @@ bert:
1919
- "cpu"
2020
- "gpus": "all"
2121
instance_types: #special keyword not recognized as a 'model', define instance types per yaml file.
22-
- "p3.8xlarge"
23-
- "c5.4xlarge"
22+
- "p3.8xlarge"

benchmarks/automated/tests/suite/fastrcnn.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@ fastrcnn:
2020
- "gpus": "all"
2121
instance_types: #special keyword not recognized as a 'model', define instance types per yaml file.
2222
- "p3.8xlarge"
23-
- "c5.4xlarge"
23+
- "m6i.4xlarge"

benchmarks/automated/tests/suite/mnist.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@ mnist:
2020
- "gpus": "all"
2121
instance_types: #special keyword not recognized as a 'model', define instance types per yaml file.
2222
- "p3.8xlarge"
23-
- "c5.4xlarge"
23+
- "m6i.4xlarge"

benchmarks/automated/tests/suite/vgg11.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,4 @@ vgg11:
3838
- "gpus": "all"
3939
instance_types: #special keyword not recognized as a 'model', define instance types per yaml file.
4040
- "p3.8xlarge"
41-
- "c5.4xlarge"
41+
- "m6i.4xlarge"

benchmarks/automated/tests/suite/vgg16.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,4 @@ vgg16:
3838
- "gpus": "all"
3939
instance_types: #special keyword not recognized as a 'model', define instance types per yaml file.
4040
- "p3.8xlarge"
41-
- "c5.4xlarge"
41+
- "m6i.4xlarge"

benchmarks/automated/tests/utils/apache_bench.py

+30-5
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from invoke.context import Context
2020

2121
from . import DEFAULT_REGION, IAM_INSTANCE_PROFILE, AMI_ID, LOGGER, S3_BUCKET_BENCHMARK_ARTIFACTS
22+
from . import cloudwatch as cloudwatch_utils
2223

2324
TMP_DIR = "/home/ubuntu"
2425
LOCAL_TMP_DIR = "/tmp"
@@ -65,7 +66,7 @@ def run_apache_bench(self, requests, concurrency, input_file, is_workflow=False,
6566
self.connection.run(f"cp {file_name} {os.path.join(TMP_DIR, 'benchmark/input')}")
6667
else:
6768
self.connection.run(f"cp {input_file} {os.path.join(TMP_DIR, 'benchmark/input')}")
68-
69+
6970
predict_flag = "predictions"
7071
model_name = "benchmark"
7172
if is_workflow:
@@ -100,7 +101,7 @@ def extract_metrics(self, connection=None):
100101
temp_uuid = uuid.uuid4()
101102

102103
time.sleep(5)
103-
104+
104105
# Upload to s3 and fetch back to local instance: more reliable than using self.connection.get()
105106
connection.run(f"aws s3 cp {self.result_file} {S3_BUCKET_BENCHMARK_ARTIFACTS}/{temp_uuid}/result.txt")
106107
time.sleep(2)
@@ -136,7 +137,7 @@ def extract_entity(self, data, pattern, index, delim=" "):
136137
if pattern.search(line):
137138
return line.split(delim)[index].strip()
138139

139-
def generate_csv_output(self, requests, concurrency, connection=None):
140+
def generate_csv_output(self, requests, concurrency, batch_size, mode, connection=None):
140141
LOGGER.info("*Generating CSV output...")
141142

142143
batched_requests = requests / concurrency
@@ -147,6 +148,8 @@ def generate_csv_output(self, requests, concurrency, connection=None):
147148
with open(f"{self.local_tmp_dir}/result.txt") as f:
148149
data = f.readlines()
149150
artifacts["Benchmark"] = "AB"
151+
artifacts["Batch Size"] = batch_size
152+
artifacts["Mode"] = mode # This is the pytorch mode i.e. eager or scripted, as specified in <model>.yaml
150153
artifacts["Model"] = self.model_name
151154
artifacts["Concurrency"] = concurrency
152155
artifacts["Requests"] = requests
@@ -178,6 +181,28 @@ def generate_csv_output(self, requests, concurrency, connection=None):
178181

179182
return artifacts
180183

181-
def generate_report(self, requests, concurrency, connection=None):
184+
def push_benchmark_metrics(self, artifacts, connection=None):
    """Tag the benchmark artifacts with the instance type and publish them to CloudWatch.

    :param artifacts: dict of benchmark results produced by generate_csv_output
    :param connection: fabric connection to the remote benchmark instance
    """
    # Query the EC2 instance metadata service for the current instance type.
    # NOTE(review): this is IMDSv1 (no token); confirm IMDSv2 is not enforced on
    # the benchmark instances.
    curr_instance_type = connection.run(
        "curl http://169.254.169.254/latest/meta-data/instance-type", warn=True
    ).stdout

    artifacts["instance_type"] = curr_instance_type

    # 'BENCHMARK_CONTEXT' is set internally at AWS for certain benchmark jobs.
    # When it's not available, 'DevTest' is used.
    dashboard_context = os.getenv("BENCHMARK_CONTEXT", "DevTest")

    # Namespace metrics per model and per pytorch mode (eager/scripted).
    cloudwatch_metrics_handler = cloudwatch_utils.CloudWatchMetricsHandler(
        context=dashboard_context, sub_namespace=f"{self.model_name}/{artifacts.get('Mode')}"
    )

    cloudwatch_metrics_handler.push_benchmark_metrics(artifacts)
200+
201+
def generate_report(self, requests, concurrency, batch_size, mode, connection=None):
    """Extract apache-bench metrics, write the CSV summary, and publish to CloudWatch.

    Note: intended to run after torchserve has been stopped.
    """
    # Pull the raw result file from the remote instance first.
    self.extract_metrics(connection=connection)

    # Summarize the run into an artifacts dict (also written out as CSV)...
    csv_artifacts = self.generate_csv_output(
        requests, concurrency, batch_size=batch_size, mode=mode, connection=connection
    )

    # ...then publish the headline numbers to CloudWatch.
    self.push_benchmark_metrics(csv_artifacts, connection=connection)

benchmarks/automated/tests/utils/benchmark.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def execute_benchmark(
168168

169169
# Generate report (note: needs to happen after torchserve has stopped)
170170
apacheBenchHandler.generate_report(
171-
requests=requests, concurrency=concurrency, connection=self.connection
171+
requests=requests, concurrency=concurrency, batch_size=batch_size, mode=mode,connection=self.connection
172172
)
173173

174174
# Move artifacts into a common folder.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import csv
2+
import os
3+
import time
4+
import re
5+
import boto3
6+
import uuid
7+
8+
import matplotlib.pyplot as plt
9+
import pandas as pd
10+
import numpy as n
11+
12+
from inspect import signature
13+
from retrying import retry
14+
from fabric2 import Connection
15+
from botocore.config import Config
16+
from botocore.exceptions import ClientError
17+
18+
from invoke import run, sudo
19+
from invoke.context import Context
20+
21+
from . import DEFAULT_REGION, IAM_INSTANCE_PROFILE, AMI_ID, LOGGER, S3_BUCKET_BENCHMARK_ARTIFACTS
22+
23+
TMP_DIR = "/home/ubuntu"
24+
LOCAL_TMP_DIR = "/tmp"
25+
26+
27+
class CloudWatchMetricsHandler:
    """Publishes TorchServe benchmark metrics to Amazon CloudWatch.

    Metrics are published under the namespace
    ``TorchServe/<Context>/<SubNamespace>`` so that different benchmark
    contexts (e.g. DevTest vs. internal benchmark jobs) stay separated.
    """

    def __init__(self, context="DevTest", sub_namespace="TestModel"):
        """
        :param context: dashboard context; 'DevTest' unless supplied by the job
        :param sub_namespace: per-model suffix, e.g. '<model_name>/<mode>'
        """
        self.client = boto3.Session(region_name=DEFAULT_REGION).client("cloudwatch")
        self.context = context
        self.namespace = f"TorchServe/{context.title()}/{sub_namespace.title()}"

    def push(self, name, unit, value, metrics_info):
        """Publish a single metric datum.

        :param name: CloudWatch metric name
        :param unit: CloudWatch unit string, e.g. 'Milliseconds'
        :param value: metric value; coerced to float before publishing
        :param metrics_info: dict whose items become CloudWatch dimensions
        :return: the put_metric_data response
        :raises Exception: wraps any client error, chaining the original cause
        """
        # Each key/value pair in metrics_info becomes one dimension.
        dimensions = [
            {"Name": key, "Value": str(val)} for key, val in metrics_info.items()
        ]

        try:
            response = self.client.put_metric_data(
                MetricData=[
                    {
                        "MetricName": name,
                        "Dimensions": dimensions,
                        "Unit": unit,
                        "Value": float(value),
                    }
                ],
                Namespace=self.namespace,
            )
        except Exception as e:
            # Chain the original exception so the underlying traceback survives.
            raise Exception(str(e)) from e

        return response

    def push_benchmark_metrics(self, benchmark_dict):
        """Publish the headline benchmark numbers from an artifacts dict.

        CloudWatch allows a maximum of 10 dimensions for a metric, so only
        the most important ones are attached here.
        """
        info = {
            "Instance Type": benchmark_dict.get("instance_type"),
            "Batch Size": benchmark_dict.get("Batch Size"),
        }

        self.push("Model Latency P90", "Milliseconds", benchmark_dict.get("Model_p90"), info)
        self.push("TS Throughput", "Count/Second", benchmark_dict.get("TS throughput"), info)
        self.push("TS Error Rate", "Count/Second", benchmark_dict.get("TS error rate"), info)

        LOGGER.info("Benchmark metric pushed to cloudwatch")

benchmarks/automated/tests/utils/ts.py

+4
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ def stop_recording_docker_stats(self, model_name, num_workers, batch_size):
154154
self.connection.run("ps axl|grep -e '--no-stream'| grep -v color | awk '{print $3}' | xargs kill -9", warn=True)
155155
self.connection.run(f"cp nohup.out nohup.{model_name}.{num_workers}.{batch_size}", warn=True)
156156
self.connection.run(f"rm nohup.out", warn=True)
157+
time.sleep(3)
157158

158159
def plot_stats_graph(self, model_name, mode_name, num_workers, batch_size):
159160
"""
@@ -163,6 +164,9 @@ def plot_stats_graph(self, model_name, mode_name, num_workers, batch_size):
163164

164165
LOGGER.info(f"Generating graphs")
165166

167+
if not self.is_local_execution:
168+
self.connection.get(f"free.{model_name}.{num_workers}.{batch_size}", f"free.{model_name}.{num_workers}.{batch_size}")
169+
166170
# plot graphs from the utility 'free'
167171
with open(f"free.{model_name}.{num_workers}.{batch_size}") as f:
168172
file_contents = [float(line.strip()) for line in f.readlines()]

ci/benchmark/buildspec.yml

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Build Spec for AWS CodeBuild CI
2+
3+
version: 0.2
4+
5+
phases:
6+
install:
7+
commands:
8+
- apt-get update
9+
- apt-get install sudo -y
10+
- git clone https://www.github.com/pytorch/serve.git
11+
- cd serve
12+
- pip install -r benchmarks/automated/requirements.txt
13+
14+
build:
15+
commands:
16+
- python benchmarks/automated/run_benchmark.py --run-only ${MODEL_NAME}

0 commit comments

Comments
 (0)