Skip to content

Commit

Permalink
Fix and improve group status metrics graphs and dashboard URLs (#1694)
Browse files Browse the repository at this point in the history
The old links are broken, so build new statsboard graphs using new metrics and new APIs.

The old graphs did not reference the right metrics, so the metrics source has been updated (Count -> Rate).

Note that statsboard links are preferred over rendered graphs, and thus they are moved above the graphs.

Co-authored-by: Omar <[email protected]>

---------

Co-authored-by: Omar <[email protected]>
  • Loading branch information
tylerwowen and osoriano authored Aug 14, 2024
1 parent ac9a4df commit 5fcda63
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 57 deletions.
57 changes: 29 additions & 28 deletions deploy-board/deploy_board/templates/groups/group_details.html
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ <h4 class="panel-title pull-left">Environments</h4>
{% endblock %}

{% block main %}
<!---- Group Metrics Panel --->
<!-- Group Metrics Panel -->
<div class="panel panel-default">
<div class="panel-heading clearfix">
<h4 class="panel-title pull-left pointer-cursor">
Expand All @@ -86,7 +86,32 @@ <h4 class="panel-title pull-left pointer-cursor">
</script>

<div id="metricStatId" class="collapse in panel-body">
<div align="center" id="groupStatsId" class="collapse in panel-body">
<div id="tsdLinksId" style="text-align: left;">
<h4>Check out these links for better visualization</h4>
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{ group_size_url }}"
title="" data-original-title="Click to see more group size information in TSDB">
<strong>Group Size</strong>
</a>
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{ provision_latency_url }}"
title="" data-original-title="Click to see more provision latency information in TSDB">
<strong>Provision Latency</strong>
</a>
{% for env in envs %}
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{ env.firstDeployLatencyLink }}"
title="" data-original-title="Click to see more first deploy latency information in TSDB">
<strong>Deploy Latency for {{ env.envName }}</strong>
</a>
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{ env.firstDeploySRLink }}"
title="" data-original-title="Click to see more first deploy success rate information in TSDB">
<strong>First deploy SR for {{ env.envName }}</strong>
</a>
{% endfor %}
</div>
<div id="groupStatsId" class="collapse in panel-body" style="text-align: center;">
<div id="container" class="chartContainer">
<div id="line_latencystats"></div>
<div id="launch_rate_id"></div>
Expand All @@ -108,37 +133,13 @@ <h4 class="panel-title pull-left pointer-cursor">
</div>
<div id="loadGroupInfo"></div>
</div>
<div align="left" id="tsdLinksId">
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{group_size_url}}"
title="" data-original-title="Click to see more group size information in TSDB">
<strong>Group Size</strong>
</a>
{% for env in envs %}

<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{env.launchlatencylink}}"
title="" data-original-title="Click to see more launch latency information in TSDB">
<strong>Launch Latency for {{ env.envName }}</strong>
</a>
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{env.deploylatencylink}}"
title="" data-original-title="Click to see more deploy latency information in TSDB">
<strong>Deploy Latency for {{ env.envName }}</strong>
<a type="button" class="deployToolTip btn btn-xs" data-toggle="tooltip"
href="{{env.deployfailedlink}}"
title="" data-original-title="Click to see more launch failed count information in TSDB">
<strong>Launch failed count for {{ env.envName }}</strong>
</a>
{% endfor %}
</div>
</div>
</div>

<!--- launch instances button dialog-->
{% include "message_banner.tmpl" %}

<!---- Group Details Panel --->
<!--- Group Details Panel -->
{% if not scaling_down_event_enabled and asg_status == "ENABLED" %}
<div class="panel panel-warning">
{% elif asg_status == "DISABLED" %}
Expand All @@ -162,7 +163,7 @@ <h4 class="panel-title pull-left pointer-cursor">
</a>
</h4>

<!---- Buttons --->
<!--- Buttons -->
{% if not scaling_down_event_enabled and asg_status == "ENABLED" %}
<div class="btn-group pull-right">
<button type="button" class="deployToolTip btn btn-default btn-sm"
Expand Down
6 changes: 3 additions & 3 deletions deploy-board/deploy_board/templates/groups/launch_rate.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@
var data = new google.visualization.DataTable();
data.addColumn("datetime", "Date");

var failure_count_name = "Launch Failure Count";
var failure_count_name = "Launch Failure Rate";
data.addColumn("number", failure_count_name);

var options = {
title: 'Launch Failure Count',
title: 'Launch Failure Rate',
titleFontSize: 15,
height: 300,
min: 0,
Expand Down Expand Up @@ -65,7 +65,7 @@
if (metric_names != null) {
for (var i = 0; i < 1; ++i) {
var metric_name = metric_names[i];
data.addColumn("number", "Launch Failure Count");
data.addColumn("number", "Launch Failure Rate");
data_list = response[metric_name];
for (j = 0; j < data_list.length; ++j) {
var d = new Date(data_list[j][0]);
Expand Down
85 changes: 60 additions & 25 deletions deploy-board/deploy_board/webapp/group_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -1206,6 +1206,12 @@ def get(self, request, group_name):


class GroupDetailView(View):
default_params = {
"begin": "1w",
"reducer_interval": "10m",
}
base_metric_url = "https://statsboard.pinadmin.com/build3?"

def get(self, request, group_name):
autoscaling_summary = autoscaling_groups_helper.get_autoscaling_summary(request, group_name)
if autoscaling_summary is None:
Expand All @@ -1215,32 +1221,10 @@ def get(self, request, group_name):
envs = environs_helper.get_all_envs_by_group(request, group_name)
disabled_actions = autoscaling_groups_helper.get_disabled_asg_actions(request, group_name)
pas_config = autoscaling_groups_helper.get_pas_config(request, group_name)
base_metric_url = "https://statsboard.pinadmin.com/build?"

group_size_url = base_metric_url+'''
{"renderer":"line","title":"Fleet Size", "yAxisLabel":"Group Size", "ymin":"0","from":"1w",
"metrics":[{"agg":"avg", "color":"dodgerblue","db":"tsdb", "dsValue":"10m", "renderer":"line",
"metric":"autoscaling.%s.size"}]}
''' % group_name

for env in envs:
env['launchlatencylink'] = base_metric_url + '''
{"renderer":"line", "yAxisLabel":"Launch Latency","ymin":"0","from":"1w",
"metrics":[{"agg":"avg", "color":"dodgerblue","db":"tsdb", "dsValue":"10m", "renderer":"line",
"metric":"autoscaling.%s.%s.launchlatency"}]}
''' % (env.get('envName'), env.get('stageName'))

env['deploylatencylink'] = base_metric_url + '''
{"renderer":"line", "yAxisLabel":"Deploy Latency", "ymin":"0","from":"1w",
"metrics":[{"agg":"avg", "color":"dodgerblue","db":"tsdb", "dsValue":"10m", "renderer":"line",
"metric":"autoscaling.%s.%s.deploylatency"}]}
''' % (env.get('envName'), env.get('stageName'))

env['deployfailedlink'] = base_metric_url + '''
{"renderer":"line", "yAxisLabel":"Launch Failed", "ymin":"0","from":"1w",
"metrics":[{"agg":"mimmax", "color":"dodgerblue","db":"tsdb", "dsValue":"10m", "renderer":"line",
"metric":"autoscaling.%s.%s.first_deploy.failed"}]}
''' % (env.get('envName'), env.get('stageName'))
env['firstDeploySRLink'] = self.generate_first_deploy_success_rate_link(env)
env['firstDeployLatencyLink'] = self.generate_deploy_latency_link(env)

if "Terminate" in disabled_actions:
scaling_down_event_enabled = False
Expand All @@ -1263,9 +1247,60 @@ def get(self, request, group_name):
"launch_config": launch_config,
"pas_enabled": pas_config['pas_state'] if pas_config else False,
"disallow_autoscaling": _disallow_autoscaling(curr_image),
"group_size_url": group_size_url,
"group_size_url": self.generate_group_size_url(group_name),
"provision_latency_url": self.generate_provision_latency_url(group_name),
})

def generate_deploy_latency_link(self, env):
    """Return the dashboard URL charting first-deploy latency for *env*.

    The chart expression returns the ``max`` series plus a ``mean`` computed
    from the ``deploy_latency.sum`` and ``deploy_latency.count`` series, each
    differenced against its own value shifted by one hour.

    Args:
        env: Mapping read via ``env.get("envName")`` and
            ``env.get("stageName")``; both values are embedded in the
            ``teletraan.<env>.<stage>.deploy_latency.*`` metric names.

    Returns:
        ``self.base_metric_url`` followed by the URL-encoded ``metrics`` and
        ``settings`` parameters and the entries of ``self.default_params``.
    """
    # Function-scope import so the block does not depend on a module-level one.
    import json

    metric_prefix = (
        f'teletraan.{env.get("envName")}.{env.get("stageName")}.deploy_latency'
    )
    query = {
        "cmd": (
            "sd=(s-s.timeShift(1h)).nonNegative()\n"
            "ctd=(ct-ct.timeShift(1h)).nonNegative()\n"
            "mean=sd/ctd\n"
            "return max,mean"
        ),
        "metrics": [
            {"aggregator": "zimsum", "alias": "s", "metric": f"{metric_prefix}.sum"},
            {"aggregator": "zimsum", "alias": "ct", "metric": f"{metric_prefix}.count"},
            {"aggregator": "mimmax", "alias": "max", "metric": f"{metric_prefix}.max"},
        ],
    }
    params = {
        # json.dumps escapes quotes and backslashes in the env/stage names, which
        # hand-assembled JSON did not. Compact separators and ensure_ascii=False
        # keep the output identical to the hand-written form for ordinary names.
        "metrics": json.dumps(query, separators=(",", ":"), ensure_ascii=False),
        "settings": (
            '{"appearance":{"mean":{"color":"#0000ff"},"max":{"color":"#ff8000"}},"title":"mean & max first deploy latency [1h window]","y_axis_label":"Latency","y_min":0,'
            '"note":"The deploy latency is measured from the first deploy start to finish on a single host."}'
        ),
    }
    params.update(self.default_params)
    return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"

def generate_provision_latency_url(self, group_name):
    """Return the dashboard URL charting provision latency for a group.

    The chart expression returns the ``max`` series plus a ``mean`` computed
    from the ``provision_latency.sum`` and ``provision_latency.count`` series,
    each differenced against its own value shifted by one hour.

    Args:
        group_name: Name embedded in the
            ``teletraan.<group_name>.provision_latency.*`` metric names.

    Returns:
        ``self.base_metric_url`` followed by the URL-encoded ``metrics`` and
        ``settings`` parameters and the entries of ``self.default_params``.
    """
    # Function-scope import so the block does not depend on a module-level one.
    import json

    metric_prefix = f"teletraan.{group_name}.provision_latency"
    query = {
        "cmd": (
            "sd=(s-s.timeShift(1h)).nonNegative()\n"
            "ctd=(ct-ct.timeShift(1h)).nonNegative()\n"
            "mean=sd/ctd\n"
            "return max,mean"
        ),
        "metrics": [
            {"aggregator": "zimsum", "alias": "s", "metric": f"{metric_prefix}.sum"},
            {"aggregator": "zimsum", "alias": "ct", "metric": f"{metric_prefix}.count"},
            {"aggregator": "mimmax", "alias": "max", "metric": f"{metric_prefix}.max"},
        ],
    }
    params = {
        # json.dumps escapes quotes and backslashes in the group name, which
        # hand-assembled JSON did not. Compact separators and ensure_ascii=False
        # keep the output identical to the hand-written form for ordinary names.
        "metrics": json.dumps(query, separators=(",", ":"), ensure_ascii=False),
        "settings": (
            '{"appearance":{"mean":{"color":"#0000ff"},"max":{"color":"#ff8000"}},"title":"mean & max provision latency [1h window]","y_axis_label":"Latency","y_min":0,'
            '"note":"The provision latency is measured from the host launch to the first Teletraan ping."}'
        ),
    }
    params.update(self.default_params)
    return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"

def generate_first_deploy_success_rate_link(self, env):
    """Return the dashboard URL charting the first-deploy success rate for *env*.

    The chart expression divides the one-hour delta of the ``first_deploy``
    series tagged ``success=true`` by the one-hour delta of the untagged
    ``first_deploy`` series and multiplies the result by 100.

    Args:
        env: Mapping read via ``env.get("envName")`` and
            ``env.get("stageName")``; both values are embedded in the
            ``teletraan.<env>.<stage>.first_deploy`` metric name.

    Returns:
        ``self.base_metric_url`` followed by the URL-encoded ``metrics`` and
        ``settings`` parameters and the entries of ``self.default_params``.
    """
    # Function-scope import so the block does not depend on a module-level one.
    import json

    metric = f'teletraan.{env.get("envName")}.{env.get("stageName")}.first_deploy'
    query = {
        "cmd": (
            "sd=(suc-suc.timeShift(1h)).nonNegative()\n"
            "totd=(tot-tot.timeShift(1h)).nonNegative()\n"
            "sr=sd/totd*100\n"
            "return sr"
        ),
        "metrics": [
            {
                "aggregator": "zimsum",
                "alias": "suc",
                "metric": metric,
                "tags": {"success": "true"},
            },
            {"aggregator": "zimsum", "alias": "tot", "metric": metric},
        ],
    }
    params = {
        # json.dumps escapes quotes and backslashes in the env/stage names, which
        # hand-assembled JSON did not. Compact separators and ensure_ascii=False
        # keep the output identical to the hand-written form for ordinary names.
        "metrics": json.dumps(query, separators=(",", ":"), ensure_ascii=False),
        "settings": '{"appearance":{"sr":{"disabled":false,"stroke_style":"solid","color":"#00ff00"}},"title":"First deploy success rate [1h window]","y_max":105,"y_min":0}',
    }
    params.update(self.default_params)
    return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"

def generate_group_size_url(self, group_name):
    """Return the dashboard URL charting the size of an autoscaling group.

    Args:
        group_name: Name embedded in the ``autoscaling.<group_name>.size``
            metric name.

    Returns:
        ``self.base_metric_url`` followed by the URL-encoded ``metrics`` and
        ``settings`` parameters and the entries of ``self.default_params``.
    """
    # Function-scope import so the block does not depend on a module-level one.
    import json

    query = {
        "metrics": [
            {
                "aggregator": "zimavg",
                "alias": "size",
                "metric": f"autoscaling.{group_name}.size",
            },
        ],
    }
    params = {
        # json.dumps escapes quotes and backslashes in the group name, which
        # hand-assembled JSON did not. Compact separators and ensure_ascii=False
        # keep the output identical to the hand-written form for ordinary names.
        "metrics": json.dumps(query, separators=(",", ":"), ensure_ascii=False),
        # NOTE(review): the appearance key "d" does not match the series alias
        # "size" used above - confirm the colour is actually applied.
        "settings": '{"appearance":{"d":{"color":"dodgerblue"}},"renderer":"line","title":"Group size","y_axis_label":"Group size","y_min":0}',
    }
    params.update(self.default_params)
    return f"{self.base_metric_url}{urllib.parse.urlencode(params)}"


# generate aws related settings
def get_aws_settings(request):
Expand Down
2 changes: 1 addition & 1 deletion deploy-board/deploy_board/webapp/util_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def get_launch_rate(request, group_name):
try:
util_data["metric_names"] = []
for env in envs:
metric_name = "mimmax:autoscaling.{}.{}.first_deploy.failed".format(
metric_name = "mimmax:rate:teletraan.{}.{}.first_deploy{{success=false}}".format(
env["envName"], env["stageName"])
rate_data_points = autoscaling_metrics_helper.get_raw_metrics(request, metric_name,
settings.DEFAULT_START_TIME)
Expand Down

0 comments on commit 5fcda63

Please sign in to comment.