Skip to content

Commit 3cfa853

Browse files
authored
Add support to publish metrics to cloudwatch (#1414)
* Add support to publish metrics to cloudwatch
* Update README
* Refactor cloudwatch namespace and data published
* Add benchmark buildspec and correct instance types
* Correct github repo
* Remove the checkout for custom branch
* Add some clarifying comments
1 parent 0c5daa3 commit 3cfa853

File tree

12 files changed

+118
-16
lines changed

12 files changed

+118
-16
lines changed

benchmarks/automated/README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ Check out a sample vgg11 model config at the path: `tests/suite/vgg11.yaml`
1010
-- [AmazonEC2ContainerRegistryFullAccess](https://console.aws.amazon.com/iam/home#policies/arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess) <br>
1111
-- [AmazonEC2FullAccess](https://console.aws.amazon.com/iam/home#policies/arn:aws:iam::aws:policy/AmazonEC2FullAccess) <br>
1212
-- [AmazonS3FullAccess](https://console.aws.amazon.com/iam/home#policies/arn:aws:iam::aws:policy/AmazonS3FullAccess) <br>
13-
-- [IAMFullAccess](https://console.aws.amazon.com/iam/home#policies/arn:aws:iam::aws:policy/IAMFullAccess)
14-
<br (or at the least iam:passrole).
13+
-- [IAMFullAccess](https://console.aws.amazon.com/iam/home#policies/arn:aws:iam::aws:policy/IAMFullAccess) (or at the least iam:passrole). <br>
14+
-- [CloudWatchFullAccess](https://console.aws.amazon.com/iam/home#/policies/arn:aws:iam::aws:policy/CloudWatchFullAccess$jsonEditor) <br>
1515

1616
* [Create](https://docs.aws.amazon.com/cli/latest/reference/ecr/create-repository.html) an ECR repository with the name “torchserve-benchmark” in the us-west-2 region, e.g.
1717
```

benchmarks/automated/tests/suite/bert_cpu.yaml

+1-2
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,4 @@ bert:
1919
- "cpu"
2020
- "gpus": "all"
2121
instance_types: #special keyword not recognized as a 'model', define instance types per yaml file.
22-
- "p3.8xlarge"
23-
- "c5.4xlarge"
22+
- "m6i.4xlarge"

benchmarks/automated/tests/suite/bert_multi_gpu.yaml

+1-2
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,4 @@ bert:
1919
- "cpu"
2020
- "gpus": "all"
2121
instance_types: #special keyword not recognized as a 'model', define instance types per yaml file.
22-
- "p3.8xlarge"
23-
- "c5.4xlarge"
22+
- "p3.8xlarge"

benchmarks/automated/tests/suite/fastrcnn.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@ fastrcnn:
2020
- "gpus": "all"
2121
instance_types: #special keyword not recognized as a 'model', define instance types per yaml file.
2222
- "p3.8xlarge"
23-
- "c5.4xlarge"
23+
- "m6i.4xlarge"

benchmarks/automated/tests/suite/mnist.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@ mnist:
2020
- "gpus": "all"
2121
instance_types: #special keyword not recognized as a 'model', define instance types per yaml file.
2222
- "p3.8xlarge"
23-
- "c5.4xlarge"
23+
- "m6i.4xlarge"

benchmarks/automated/tests/suite/vgg11.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,4 @@ vgg11:
3838
- "gpus": "all"
3939
instance_types: #special keyword not recognized as a 'model', define instance types per yaml file.
4040
- "p3.8xlarge"
41-
- "c5.4xlarge"
41+
- "m6i.4xlarge"

benchmarks/automated/tests/suite/vgg16.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,4 @@ vgg16:
3838
- "gpus": "all"
3939
instance_types: #special keyword not recognized as a 'model', define instance types per yaml file.
4040
- "p3.8xlarge"
41-
- "c5.4xlarge"
41+
- "m6i.4xlarge"

benchmarks/automated/tests/utils/apache_bench.py

+30-5
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from invoke.context import Context
2020

2121
from . import DEFAULT_REGION, IAM_INSTANCE_PROFILE, AMI_ID, LOGGER, S3_BUCKET_BENCHMARK_ARTIFACTS
22+
from . import cloudwatch as cloudwatch_utils
2223

2324
TMP_DIR = "/home/ubuntu"
2425
LOCAL_TMP_DIR = "/tmp"
@@ -65,7 +66,7 @@ def run_apache_bench(self, requests, concurrency, input_file, is_workflow=False,
6566
self.connection.run(f"cp {file_name} {os.path.join(TMP_DIR, 'benchmark/input')}")
6667
else:
6768
self.connection.run(f"cp {input_file} {os.path.join(TMP_DIR, 'benchmark/input')}")
68-
69+
6970
predict_flag = "predictions"
7071
model_name = "benchmark"
7172
if is_workflow:
@@ -100,7 +101,7 @@ def extract_metrics(self, connection=None):
100101
temp_uuid = uuid.uuid4()
101102

102103
time.sleep(5)
103-
104+
104105
# Upload to s3 and fetch back to local instance: more reliable than using self.connection.get()
105106
connection.run(f"aws s3 cp {self.result_file} {S3_BUCKET_BENCHMARK_ARTIFACTS}/{temp_uuid}/result.txt")
106107
time.sleep(2)
@@ -136,7 +137,7 @@ def extract_entity(self, data, pattern, index, delim=" "):
136137
if pattern.search(line):
137138
return line.split(delim)[index].strip()
138139

139-
def generate_csv_output(self, requests, concurrency, connection=None):
140+
def generate_csv_output(self, requests, concurrency, batch_size, mode, connection=None):
140141
LOGGER.info("*Generating CSV output...")
141142

142143
batched_requests = requests / concurrency
@@ -147,6 +148,8 @@ def generate_csv_output(self, requests, concurrency, connection=None):
147148
with open(f"{self.local_tmp_dir}/result.txt") as f:
148149
data = f.readlines()
149150
artifacts["Benchmark"] = "AB"
151+
artifacts["Batch Size"] = batch_size
152+
artifacts["Mode"] = mode # This is the pytorch mode i.e. eager or scripted, as specified in <model>.yaml
150153
artifacts["Model"] = self.model_name
151154
artifacts["Concurrency"] = concurrency
152155
artifacts["Requests"] = requests
@@ -178,6 +181,28 @@ def generate_csv_output(self, requests, concurrency, connection=None):
178181

179182
return artifacts
180183

181-
def generate_report(self, requests, concurrency, connection=None):
184+
def push_benchmark_metrics(self, artifacts, connection=None):
    """Tag the benchmark artifacts with the instance type and publish them to CloudWatch.

    :param artifacts: dict of benchmark results produced by generate_csv_output
    :param connection: fabric connection to the remote benchmark instance
    """
    # Query the EC2 instance metadata service for the current instance type.
    # NOTE(review): this is IMDSv1 (no token); confirm IMDSv2 is not enforced on
    # the benchmark instances.
    curr_instance_type = connection.run(
        "curl http://169.254.169.254/latest/meta-data/instance-type", warn=True
    ).stdout

    artifacts["instance_type"] = curr_instance_type

    # 'BENCHMARK_CONTEXT' is set internally at AWS for certain benchmark jobs.
    # When it's not available, 'DevTest' is used.
    dashboard_context = os.getenv("BENCHMARK_CONTEXT", "DevTest")

    # Namespace metrics per model and per pytorch mode (eager/scripted).
    cloudwatch_metrics_handler = cloudwatch_utils.CloudWatchMetricsHandler(
        context=dashboard_context, sub_namespace=f"{self.model_name}/{artifacts.get('Mode')}"
    )

    cloudwatch_metrics_handler.push_benchmark_metrics(artifacts)
200+
201+
def generate_report(self, requests, concurrency, batch_size, mode, connection=None):
    """Extract apache-bench metrics, write the CSV summary, and publish to CloudWatch.

    Note: intended to run after torchserve has been stopped.
    """
    # Pull the raw result file from the remote instance first.
    self.extract_metrics(connection=connection)

    # Summarize the run into an artifacts dict (also written out as CSV)...
    csv_artifacts = self.generate_csv_output(
        requests, concurrency, batch_size=batch_size, mode=mode, connection=connection
    )

    # ...then publish the headline numbers to CloudWatch.
    self.push_benchmark_metrics(csv_artifacts, connection=connection)

benchmarks/automated/tests/utils/benchmark.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def execute_benchmark(
168168

169169
# Generate report (note: needs to happen after torchserve has stopped)
170170
apacheBenchHandler.generate_report(
171-
requests=requests, concurrency=concurrency, connection=self.connection
171+
requests=requests, concurrency=concurrency, batch_size=batch_size, mode=mode,connection=self.connection
172172
)
173173

174174
# Move artifacts into a common folder.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import csv
2+
import os
3+
import time
4+
import re
5+
import boto3
6+
import uuid
7+
8+
import matplotlib.pyplot as plt
9+
import pandas as pd
10+
import numpy as n
11+
12+
from inspect import signature
13+
from retrying import retry
14+
from fabric2 import Connection
15+
from botocore.config import Config
16+
from botocore.exceptions import ClientError
17+
18+
from invoke import run, sudo
19+
from invoke.context import Context
20+
21+
from . import DEFAULT_REGION, IAM_INSTANCE_PROFILE, AMI_ID, LOGGER, S3_BUCKET_BENCHMARK_ARTIFACTS
22+
23+
TMP_DIR = "/home/ubuntu"
24+
LOCAL_TMP_DIR = "/tmp"
25+
26+
27+
class CloudWatchMetricsHandler:
    """Publishes TorchServe benchmark metrics to Amazon CloudWatch.

    Metrics are published under the namespace
    ``TorchServe/<Context>/<SubNamespace>`` so that different benchmark
    contexts (e.g. DevTest vs. internal benchmark jobs) stay separated.
    """

    def __init__(self, context="DevTest", sub_namespace="TestModel"):
        """
        :param context: dashboard context; 'DevTest' unless supplied by the job
        :param sub_namespace: per-model suffix, e.g. '<model_name>/<mode>'
        """
        self.client = boto3.Session(region_name=DEFAULT_REGION).client("cloudwatch")
        self.context = context
        self.namespace = f"TorchServe/{context.title()}/{sub_namespace.title()}"

    def push(self, name, unit, value, metrics_info):
        """Publish a single metric datum.

        :param name: CloudWatch metric name
        :param unit: CloudWatch unit string, e.g. 'Milliseconds'
        :param value: metric value; coerced to float before publishing
        :param metrics_info: dict whose items become CloudWatch dimensions
        :return: the put_metric_data response
        :raises Exception: wraps any client error, chaining the original cause
        """
        # Each key/value pair in metrics_info becomes one dimension.
        dimensions = [
            {"Name": key, "Value": str(val)} for key, val in metrics_info.items()
        ]

        try:
            response = self.client.put_metric_data(
                MetricData=[
                    {
                        "MetricName": name,
                        "Dimensions": dimensions,
                        "Unit": unit,
                        "Value": float(value),
                    }
                ],
                Namespace=self.namespace,
            )
        except Exception as e:
            # Chain the original exception so the underlying traceback survives.
            raise Exception(str(e)) from e

        return response

    def push_benchmark_metrics(self, benchmark_dict):
        """Publish the headline benchmark numbers from an artifacts dict.

        CloudWatch allows a maximum of 10 dimensions for a metric, so only
        the most important ones are attached here.
        """
        info = {
            "Instance Type": benchmark_dict.get("instance_type"),
            "Batch Size": benchmark_dict.get("Batch Size"),
        }

        self.push("Model Latency P90", "Milliseconds", benchmark_dict.get("Model_p90"), info)
        self.push("TS Throughput", "Count/Second", benchmark_dict.get("TS throughput"), info)
        self.push("TS Error Rate", "Count/Second", benchmark_dict.get("TS error rate"), info)

        LOGGER.info("Benchmark metric pushed to cloudwatch")

benchmarks/automated/tests/utils/ts.py

+4
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ def stop_recording_docker_stats(self, model_name, num_workers, batch_size):
154154
self.connection.run("ps axl|grep -e '--no-stream'| grep -v color | awk '{print $3}' | xargs kill -9", warn=True)
155155
self.connection.run(f"cp nohup.out nohup.{model_name}.{num_workers}.{batch_size}", warn=True)
156156
self.connection.run(f"rm nohup.out", warn=True)
157+
time.sleep(3)
157158

158159
def plot_stats_graph(self, model_name, mode_name, num_workers, batch_size):
159160
"""
@@ -163,6 +164,9 @@ def plot_stats_graph(self, model_name, mode_name, num_workers, batch_size):
163164

164165
LOGGER.info(f"Generating graphs")
165166

167+
if not self.is_local_execution:
168+
self.connection.get(f"free.{model_name}.{num_workers}.{batch_size}", f"free.{model_name}.{num_workers}.{batch_size}")
169+
166170
# plot graphs from the utility 'free'
167171
with open(f"free.{model_name}.{num_workers}.{batch_size}") as f:
168172
file_contents = [float(line.strip()) for line in f.readlines()]

ci/benchmark/buildspec.yml

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Build Spec for AWS CodeBuild CI
2+
3+
version: 0.2
4+
5+
phases:
6+
install:
7+
commands:
8+
- apt-get update
9+
- apt-get install sudo -y
10+
- git clone https://www.github.com/pytorch/serve.git
11+
- cd serve
12+
- pip install -r benchmarks/automated/requirements.txt
13+
14+
build:
15+
commands:
16+
- python benchmarks/automated/run_benchmark.py --run-only ${MODEL_NAME}

0 commit comments

Comments
 (0)