From 30c825440b2e4fadd02e446d6d900808a9fed87c Mon Sep 17 00:00:00 2001 From: kaihsun Date: Sat, 27 Apr 2024 00:45:15 +0000 Subject: [PATCH 1/2] Use ${datasource} variable and regex SessionName matchers in default Grafana dashboard --- config/grafana/default_grafana_dashboard.json | 156 ++++++++++-------- 1 file changed, 86 insertions(+), 70 deletions(-) diff --git a/config/grafana/default_grafana_dashboard.json b/config/grafana/default_grafana_dashboard.json index bd8a0330db9..96603322971 100644 --- a/config/grafana/default_grafana_dashboard.json +++ b/config/grafana/default_grafana_dashboard.json @@ -23,7 +23,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", "fieldConfig": { "defaults": {}, @@ -94,7 +94,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(max_over_time(ray_tasks{IsRetry=\"0\",State=~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}) by (State), 0)", + "expr": "sum(max_over_time(ray_tasks{IsRetry=\"0\",State=~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (State), 0)", "interval": "", "legendFormat": "{{State}}", "queryType": "randomWalk", @@ -102,7 +102,7 @@ }, { "exemplar": true, - "expr": "sum(max_over_time(ray_tasks{IsRetry!=\"0\",State=~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}) by (State), 0)", + "expr": "sum(max_over_time(ray_tasks{IsRetry!=\"0\",State=~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}[14d])) by (State) or 
clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (State), 0)", "interval": "", "legendFormat": "{{State}} (retry)", "queryType": "randomWalk", @@ -157,7 +157,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "Current number of (live) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", "fieldConfig": { "defaults": {}, @@ -228,7 +228,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}) by (Name)", + "expr": "sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (Name)", "interval": "", "legendFormat": "{{Name}}", "queryType": "randomWalk", @@ -236,7 +236,7 @@ }, { "exemplar": true, - "expr": "sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=\"$SessionName\",}) by (Name)", + "expr": "sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (Name)", "interval": "", "legendFormat": "{{Name}} (retry)", "queryType": "randomWalk", @@ -291,7 +291,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "Current number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.", "fieldConfig": { "defaults": {}, @@ -362,7 +362,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_actors{SessionName=\"$SessionName\",}) by (State)", + "expr": "sum(ray_actors{SessionName=~\"$SessionName\",}) by (State)", "interval": "", "legendFormat": "{{State}}", "queryType": "randomWalk", @@ -417,7 +417,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "Current number of (live) actors with a 
particular name.", "fieldConfig": { "defaults": {}, @@ -488,7 +488,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_actors{State!=\"DEAD\",SessionName=\"$SessionName\",}) by (Name)", + "expr": "sum(ray_actors{State!=\"DEAD\",SessionName=~\"$SessionName\",}) by (Name)", "interval": "", "legendFormat": "{{Name}}", "queryType": "randomWalk", @@ -543,7 +543,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "Logical CPU usage of Ray. The dotted line indicates the total number of CPUs. The logical CPU is allocated by `num_cpus` arguments from tasks and actors. PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU is different from physical CPU usage. Ray's logical CPU is allocated by `num_cpus` arguments.", "fieldConfig": { "defaults": {}, @@ -614,7 +614,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_resources{Name=\"CPU\",State=\"USED\",SessionName=\"$SessionName\",}) by (instance)", + "expr": "sum(ray_resources{Name=\"CPU\",State=\"USED\",SessionName=~\"$SessionName\",}) by (instance)", "interval": "", "legendFormat": "CPU Usage: {{instance}}", "queryType": "randomWalk", @@ -622,7 +622,7 @@ }, { "exemplar": true, - "expr": "sum(ray_resources{Name=\"CPU\",SessionName=\"$SessionName\",})", + "expr": "sum(ray_resources{Name=\"CPU\",SessionName=~\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -630,7 +630,7 @@ }, { "exemplar": true, - "expr": "((sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)) > 
(sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=\"$SessionName\",}) or vector(0)))", + "expr": "((sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)))", "interval": "", "legendFormat": "MAX + PENDING", "queryType": "randomWalk", @@ -685,7 +685,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "Object store memory usage by location. The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. 
Refer to metric_defs.cc for more information.", "fieldConfig": { "defaults": {}, @@ -756,7 +756,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_object_store_memory{SessionName=\"$SessionName\",}) by (Location)", + "expr": "sum(ray_object_store_memory{SessionName=~\"$SessionName\",}) by (Location)", "interval": "", "legendFormat": "{{Location}}", "queryType": "randomWalk", @@ -764,7 +764,7 @@ }, { "exemplar": true, - "expr": "sum(ray_resources{Name=\"object_store_memory\",SessionName=\"$SessionName\",})", + "expr": "sum(ray_resources{Name=\"object_store_memory\",SessionName=~\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -819,7 +819,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "Logical GPU usage of Ray. The dotted line indicates the total number of GPUs. The logical GPU is allocated by `num_gpus` arguments from tasks and actors. PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.", "fieldConfig": { "defaults": {}, @@ -890,7 +890,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_resources{Name=\"GPU\",State=\"USED\",SessionName=\"$SessionName\",}", + "expr": "ray_resources{Name=\"GPU\",State=\"USED\",SessionName=~\"$SessionName\",}", "interval": "", "legendFormat": "GPU Usage: {{instance}}", "queryType": "randomWalk", @@ -898,7 +898,7 @@ }, { "exemplar": true, - "expr": "sum(ray_resources{Name=\"GPU\",SessionName=\"$SessionName\",})", + "expr": "sum(ray_resources{Name=\"GPU\",SessionName=~\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -906,7 +906,7 @@ }, { "exemplar": true, - "expr": "((sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)) and 
(sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=\"$SessionName\",}) or vector(0)))", + "expr": "((sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)))", "interval": "", "legendFormat": "MAX + PENDING", "queryType": "randomWalk", @@ -961,7 +961,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "Current number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.", "fieldConfig": { "defaults": {}, @@ -1032,7 +1032,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_placement_groups{SessionName=\"$SessionName\",}) by (State)", + "expr": "sum(ray_placement_groups{SessionName=~\"$SessionName\",}) by (State)", "interval": "", "legendFormat": "{{State}}", "queryType": "randomWalk", @@ -1087,7 +1087,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "", "fieldConfig": { "defaults": {}, @@ -1158,7 +1158,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_cpu_utilization{instance=~\"$Instance\",SessionName=\"$SessionName\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=\"$SessionName\",} / 100", + "expr": 
"ray_node_cpu_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",} / 100", "interval": "", "legendFormat": "CPU Usage: {{instance}}", "queryType": "randomWalk", @@ -1166,7 +1166,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_cpu_count{SessionName=\"$SessionName\",})", + "expr": "sum(ray_node_cpu_count{SessionName=~\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -1221,7 +1221,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "Node's physical (hardware) GPU usage. The dotted line means the total number of hardware GPUs from the cluster. ", "fieldConfig": { "defaults": {}, @@ -1292,7 +1292,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_gpus_utilization{instance=~\"$Instance\",SessionName=\"$SessionName\",} / 100", + "expr": "ray_node_gpus_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",} / 100", "interval": "", "legendFormat": "GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", "queryType": "randomWalk", @@ -1300,7 +1300,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_gpus_available{SessionName=\"$SessionName\",})", + "expr": "sum(ray_node_gpus_available{SessionName=~\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -1355,7 +1355,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "Node's physical (hardware) disk usage. The dotted line means the total amount of disk space from the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine. 
", "fieldConfig": { "defaults": {}, @@ -1426,7 +1426,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_disk_usage{instance=~\"$Instance\",SessionName=\"$SessionName\",}", + "expr": "ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", "interval": "", "legendFormat": "Disk Used: {{instance}}", "queryType": "randomWalk", @@ -1434,7 +1434,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_disk_free{SessionName=\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=\"$SessionName\",})", + "expr": "sum(ray_node_disk_free{SessionName=~\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=~\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -1489,7 +1489,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "Disk IO per node.", "fieldConfig": { "defaults": {}, @@ -1560,7 +1560,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_disk_io_write_speed{instance=~\"$Instance\",SessionName=\"$SessionName\",}", + "expr": "ray_node_disk_io_write_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", "interval": "", "legendFormat": "Write: {{instance}}", "queryType": "randomWalk", @@ -1568,7 +1568,7 @@ }, { "exemplar": true, - "expr": "ray_node_disk_io_read_speed{instance=~\"$Instance\",SessionName=\"$SessionName\",}", + "expr": "ray_node_disk_io_read_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", "interval": "", "legendFormat": "Read: {{instance}}", "queryType": "randomWalk", @@ -1623,7 +1623,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "The physical (hardware) memory usage for each node. The dotted line means the total amount of memory from the cluster. 
Node memory is a sum of object store memory (shared memory) and heap memory.\n\nNote: If Ray is deployed within a container, the total memory could be lower than the host machine because Ray may reserve some additional memory space outside the container.", "fieldConfig": { "defaults": {}, @@ -1694,7 +1694,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_mem_used{instance=~\"$Instance\",SessionName=\"$SessionName\",}", + "expr": "ray_node_mem_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", "interval": "", "legendFormat": "Memory Used: {{instance}}", "queryType": "randomWalk", @@ -1702,7 +1702,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_mem_total{SessionName=\"$SessionName\",})", + "expr": "sum(ray_node_mem_total{SessionName=~\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -1757,7 +1757,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and the name. https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.", "fieldConfig": { "defaults": {}, @@ -1828,7 +1828,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_memory_manager_worker_eviction_total{instance=~\"$Instance\",SessionName=\"$SessionName\",}", + "expr": "ray_memory_manager_worker_eviction_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", "interval": "", "legendFormat": "OOM Killed: {{Name}}, {{instance}}", "queryType": "randomWalk", @@ -1883,7 +1883,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per proc. 
Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", "fieldConfig": { "defaults": {}, @@ -1954,7 +1954,7 @@ "targets": [ { "exemplar": true, - "expr": "(sum(ray_component_rss_mb{SessionName=\"$SessionName\",} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{SessionName=\"$SessionName\",}) by (Component))", + "expr": "(sum(ray_component_rss_mb{SessionName=~\"$SessionName\",} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{SessionName=~\"$SessionName\",}) by (Component))", "interval": "", "legendFormat": "{{Component}}", "queryType": "randomWalk", @@ -1962,7 +1962,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_mem_shared_bytes{SessionName=\"$SessionName\",})", + "expr": "sum(ray_node_mem_shared_bytes{SessionName=~\"$SessionName\",})", "interval": "", "legendFormat": "shared_memory", "queryType": "randomWalk", @@ -1970,7 +1970,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_mem_total{SessionName=\"$SessionName\",})", + "expr": "sum(ray_node_mem_total{SessionName=~\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -2025,7 +2025,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. 
Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", "fieldConfig": { "defaults": {}, @@ -2096,7 +2096,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(ray_component_cpu_percentage{SessionName=\"$SessionName\",}) by (Component) / 100", + "expr": "sum(ray_component_cpu_percentage{SessionName=~\"$SessionName\",}) by (Component) / 100", "interval": "", "legendFormat": "{{Component}}", "queryType": "randomWalk", @@ -2104,7 +2104,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_cpu_count{SessionName=\"$SessionName\",})", + "expr": "sum(ray_node_cpu_count{SessionName=~\"$SessionName\",})", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -2159,7 +2159,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "The physical (hardware) GPU memory usage for each node. The dotted line means the total amount of GPU memory from the cluster.", "fieldConfig": { "defaults": {}, @@ -2230,7 +2230,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_gram_used{instance=~\"$Instance\",SessionName=\"$SessionName\",} * 1024 * 1024", + "expr": "ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",} * 1024 * 1024", "interval": "", "legendFormat": "Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", "queryType": "randomWalk", @@ -2238,7 +2238,7 @@ }, { "exemplar": true, - "expr": "(sum(ray_node_gram_available{SessionName=\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=\"$SessionName\",})) * 1024 * 1024", + "expr": "(sum(ray_node_gram_available{SessionName=~\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=~\"$SessionName\",})) * 1024 * 1024", "interval": "", "legendFormat": "MAX", "queryType": "randomWalk", @@ -2293,7 +2293,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": 
"${datasource}", "description": "Network speed per node", "fieldConfig": { "defaults": {}, @@ -2364,7 +2364,7 @@ "targets": [ { "exemplar": true, - "expr": "ray_node_network_receive_speed{instance=~\"$Instance\",SessionName=\"$SessionName\",}", + "expr": "ray_node_network_receive_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", "interval": "", "legendFormat": "Recv: {{instance}}", "queryType": "randomWalk", @@ -2372,7 +2372,7 @@ }, { "exemplar": true, - "expr": "ray_node_network_send_speed{instance=~\"$Instance\",SessionName=\"$SessionName\",}", + "expr": "ray_node_network_send_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", "interval": "", "legendFormat": "Send: {{instance}}", "queryType": "randomWalk", @@ -2427,7 +2427,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "A total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there's no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. 
The node is unavailable now because it is being provisioned and initialized.", "fieldConfig": { "defaults": {}, @@ -2498,7 +2498,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(autoscaler_active_nodes{SessionName=\"$SessionName\",}) by (NodeType)", + "expr": "sum(autoscaler_active_nodes{SessionName=~\"$SessionName\",}) by (NodeType)", "interval": "", "legendFormat": "Active Nodes: {{NodeType}}", "queryType": "randomWalk", @@ -2506,7 +2506,7 @@ }, { "exemplar": true, - "expr": "sum(autoscaler_recently_failed_nodes{SessionName=\"$SessionName\",}) by (NodeType)", + "expr": "sum(autoscaler_recently_failed_nodes{SessionName=~\"$SessionName\",}) by (NodeType)", "interval": "", "legendFormat": "Failed Nodes: {{NodeType}}", "queryType": "randomWalk", @@ -2514,7 +2514,7 @@ }, { "exemplar": true, - "expr": "sum(autoscaler_pending_nodes{SessionName=\"$SessionName\",}) by (NodeType)", + "expr": "sum(autoscaler_pending_nodes{SessionName=~\"$SessionName\",}) by (NodeType)", "interval": "", "legendFormat": "Pending Nodes: {{NodeType}}", "queryType": "randomWalk", @@ -2569,7 +2569,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", + "datasource": "${datasource}", "description": "Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) 
across the cluster.", "fieldConfig": { "defaults": {}, @@ -2640,7 +2640,7 @@ "targets": [ { "exemplar": true, - "expr": "avg(ray_node_cpu_utilization{SessionName=\"$SessionName\",})", + "expr": "avg(ray_node_cpu_utilization{SessionName=~\"$SessionName\",})", "interval": "", "legendFormat": "CPU (physical)", "queryType": "randomWalk", @@ -2648,7 +2648,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_gpus_utilization{SessionName=\"$SessionName\",}) / on() (sum(autoscaler_cluster_resources{resource='GPU',SessionName=\"$SessionName\",}) or vector(0))", + "expr": "sum(ray_node_gpus_utilization{SessionName=~\"$SessionName\",}) / on() (sum(autoscaler_cluster_resources{resource='GPU',SessionName=~\"$SessionName\",}) or vector(0))", "interval": "", "legendFormat": "GPU (physical)", "queryType": "randomWalk", @@ -2656,7 +2656,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_mem_used{SessionName=\"$SessionName\",}) / on() (sum(ray_node_mem_total{SessionName=\"$SessionName\",})) * 100", + "expr": "sum(ray_node_mem_used{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_mem_total{SessionName=~\"$SessionName\",})) * 100", "interval": "", "legendFormat": "Memory (RAM)", "queryType": "randomWalk", @@ -2664,7 +2664,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_gram_used{SessionName=\"$SessionName\",}) / on() (sum(ray_node_gram_available{SessionName=\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=\"$SessionName\",})) * 100", + "expr": "sum(ray_node_gram_used{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_gram_available{SessionName=~\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=~\"$SessionName\",})) * 100", "interval": "", "legendFormat": "GRAM", "queryType": "randomWalk", @@ -2672,7 +2672,7 @@ }, { "exemplar": true, - "expr": "sum(ray_object_store_memory{SessionName=\"$SessionName\",}) / on() sum(ray_resources{Name=\"object_store_memory\",SessionName=\"$SessionName\",}) * 100", + "expr": 
"sum(ray_object_store_memory{SessionName=~\"$SessionName\",}) / on() sum(ray_resources{Name=\"object_store_memory\",SessionName=~\"$SessionName\",}) * 100", "interval": "", "legendFormat": "Object Store Memory", "queryType": "randomWalk", @@ -2680,7 +2680,7 @@ }, { "exemplar": true, - "expr": "sum(ray_node_disk_usage{SessionName=\"$SessionName\",}) / on() (sum(ray_node_disk_free{SessionName=\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=\"$SessionName\",})) * 100", + "expr": "sum(ray_node_disk_usage{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_disk_free{SessionName=~\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=~\"$SessionName\",})) * 100", "interval": "", "legendFormat": "Disk", "queryType": "randomWalk", @@ -2735,21 +2735,37 @@ "schemaVersion": 27, "style": "dark", "tags": [ - "rayVersion:2.5.0" + "rayVersion:2.9.0" ], "templating": { "list": [ { - "allValue": null, "current": { "selected": false }, - "datasource": "Prometheus", + "description": "Filter queries of a specific Prometheus type.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".+", + "current": { + "selected": false + }, + "datasource": "${datasource}", "definition": "label_values(ray_node_network_receive_speed{}, SessionName)", "description": "Filter queries to specific ray sessions.", "error": null, "hide": 0, - "includeAll": false, + "includeAll": true, "label": null, "multi": false, "name": "SessionName", @@ -2779,7 +2795,7 @@ "$__all" ] }, - "datasource": "Prometheus", + "datasource": "${datasource}", "definition": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, instance)", "description": null, "error": null, @@ -2817,4 +2833,4 @@ "rayMeta": [ "supportsGlobalFilterOverride" ] -} \ No newline at end of file +} From 7cba8662bc948cf458b17f12823089d641ec5aee Mon Sep 
17 00:00:00 2001 From: kaihsun Date: Tue, 30 Apr 2024 01:14:24 +0000 Subject: [PATCH 2/2] Add data, serve, and serve deployment Grafana dashboards --- config/grafana/data_grafana_dashboard.json | 1511 +++++++++++ .../serve_deployment_grafana_dashboard.json | 2241 +++++++++++++++++ config/grafana/serve_grafana_dashboard.json | 2208 ++++++++++++++++ 3 files changed, 5960 insertions(+) create mode 100644 config/grafana/data_grafana_dashboard.json create mode 100644 config/grafana/serve_deployment_grafana_dashboard.json create mode 100644 config/grafana/serve_grafana_dashboard.json diff --git a/config/grafana/data_grafana_dashboard.json b/config/grafana/data_grafana_dashboard.json new file mode 100644 index 00000000000..6336dab5753 --- /dev/null +++ b/config/grafana/data_grafana_dashboard.json @@ -0,0 +1,1511 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1667344411089, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Amount spilled by dataset operators. 
DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_spilled_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Spilled: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Spilled", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { 
+ "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Amount allocated by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_allocated_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Allocated: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Allocated", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": 
"individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Amount freed by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_freed_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": 
"Bytes Freed: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Freed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Amount of memory store used by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, 
+ "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_current_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Current Usage: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Object Store Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Logical CPUs allocated to dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 2 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + 
"color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_cpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "CPU Usage: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPUs (logical slots)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Logical GPUs allocated to dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 2 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + 
"linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_gpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "GPU Usage: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPUs (logical slots)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Total bytes outputted by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 3 + }, + "hiddenSeries": false, + "id": 7, 
+ "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_output_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Outputted: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Outputted", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "${datasource}", + "description": "Total rows outputted by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 3 + }, + "hiddenSeries": false, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_output_rows{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Rows Outputted: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Rows Outputted", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "rows", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + 
"$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Time spent generating blocks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_block_generation_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Block Generation Time: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Block Generation Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": 
"individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Seconds user thread is blocked by iter_batches()", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_iter_total_blocked_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)", + "interval": 
"", + "legendFormat": "Seconds: {{dataset}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Iteration Blocked Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Seconds spent in user code", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 5 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + 
"stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_iter_user_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)", + "interval": "", + "legendFormat": "Seconds: {{dataset}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Iteration User Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 27, + "style": "dark", + "tags": [ + "rayVersion:2.9.0" + ], + "templating": { + "list": [ + { + "current": { + "selected": false + }, + "description": "Filter queries of a specific Prometheus type.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".+", + "current": { + "selected": false + }, + "datasource": "${datasource}", + "definition": "label_values(ray_data_allocated_bytes{}, SessionName)", + "description": "Filter queries to specific ray sessions.", + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "SessionName", + "options": [], + "query": { + "query": "label_values(ray_data_allocated_bytes{}, SessionName)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": 
false, + "sort": 2, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_data_allocated_bytes{}, dataset)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "DatasetID", + "options": [], + "query": { + "query": "label_values(ray_data_allocated_bytes{}, dataset)", + "refId": "Prometheus-Dataset-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "rayMeta": [ + "excludesSystemRoutes", + "supportsGlobalFilterOverride" + ], + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Data Dashboard", + "uid": "rayDataDashboard", + "version": 1 +} diff --git a/config/grafana/serve_deployment_grafana_dashboard.json b/config/grafana/serve_deployment_grafana_dashboard.json new file mode 100644 index 00000000000..ea80d2a6bd6 --- /dev/null +++ b/config/grafana/serve_deployment_grafana_dashboard.json @@ -0,0 +1,2241 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1667344411089, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of replicas per deployment. 
Ignores \"Route\" variable.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 0, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment)", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Replicas per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "replicas", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": 
"object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "QPS for each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 0, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_serve_deployment_request_counter{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica)", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "QPS per replica", + "tooltip": { + "shared": true, + 
"sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "qps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Error QPS for each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 0, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"sum(rate(ray_serve_deployment_error_counter{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica)", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Error QPS per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "qps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P50 latency per replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 1, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + 
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P50 latency per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P90 latency per replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 1, + 
"w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P90 latency per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { 
+ "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P99 latency per replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 1, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) 
by (application, deployment, replica, le))", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P99 latency per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of requests queued per deployment.
Ignores \"Replica\" and \"Route\" variable.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 2, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_deployment_queued_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment)", + "interval": "", + "legendFormat": "{{application}}, {{deployment}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Queue size per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { +
"$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Pending requests for each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 2, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_replica_pending_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Pending requests per replica", + "tooltip": { + "shared": true, + "sort": 0, 
+ "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Current running requests for each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 2, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"sum(ray_serve_replica_processing_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Running requests per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of multiplexed models for each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 3, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + 
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_num_multiplexed_models{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Multiplexed models per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "models", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of times of multiplexed models loaded for each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 3, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", 
+ "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_multiplexed_models_load_counter{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Multiplexed model loads per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "times", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of times of multiplexed models unloaded for each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 3, + "w": 8, + "h": 8 + }, + 
"hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_multiplexed_models_unload_counter{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Multiplexed model unloads per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "times", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + 
}, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P99 latency of multiplexed model load per replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 4, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P99 latency of multiplexed model loads per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode":
"time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P99 latency of multiplexed model unload per replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 4, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment,
replica, le))", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P99 latency of multiplexed model unloads per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The ids of multiplexed models for each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 4, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + 
PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_serve_registered_multiplexed_model_id{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}", + "interval": "", + "legendFormat": "{{replica}}:{{model_id}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Multiplexed model ids per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "model", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The cache hit rate of multiplexed models for the deployment.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 5, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "(1 - sum(rate(ray_serve_multiplexed_models_load_counter{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m]))/sum(rate(ray_serve_multiplexed_get_model_requests_counter{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])))", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Multiplexed model cache hit rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "%", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 27, + "style": "dark", + "tags": [ + "rayVersion:2.9.0" + ], + "templating": { + "list": [ + { + "current": { + "selected": false + }, + "description": "Filter queries to specific prometheus type.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + 
"skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_replica_healthy{}, application)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Application", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_replica_healthy{}, application)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",}, deployment)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Deployment", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",}, deployment)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",}, replica)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Replica", + "options": [], + "query": { + "query": 
"label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",}, replica)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_request_counter{deployment=~\"$Deployment\",}, route)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Route", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_request_counter{deployment=~\"$Deployment\",}, route)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "rayMeta": [ + "excludesSystemRoutes", + "supportsGlobalFilterOverride" + ], + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Serve Deployment Dashboard", + "uid": "rayServeDeploymentDashboard", + "version": 1 +} diff --git a/config/grafana/serve_grafana_dashboard.json b/config/grafana/serve_grafana_dashboard.json new file mode 100644 index 00000000000..dec53437cf9 --- /dev/null +++ b/config/grafana/serve_grafana_dashboard.json @@ -0,0 +1,2208 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1667344411089, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Aggregated utilization of all physical resources (CPU, GPU, memory, disk, etc.) across the cluster. Ignores application variable.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 0, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "avg(ray_node_cpu_utilization{})", + "interval": "", + "legendFormat": "CPU (physical)", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_gpus_utilization{}) / on() (sum(autoscaler_cluster_resources{resource='GPU',}) or vector(0))", + "interval": "", + "legendFormat": "GPU (physical)", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(ray_node_mem_used{}) / on() (sum(ray_node_mem_total{})) * 100", + "interval": "", + "legendFormat": "Memory (RAM)", + "queryType": "randomWalk", + "refId":
"C" + }, + { + "exemplar": true, + "expr": "sum(ray_node_gram_used{}) / on() (sum(ray_node_gram_available{}) + sum(ray_node_gram_used{})) * 100", + "interval": "", + "legendFormat": "GRAM", + "queryType": "randomWalk", + "refId": "D" + }, + { + "exemplar": true, + "expr": "sum(ray_object_store_memory{}) / on() sum(ray_resources{Name=\"object_store_memory\",}) * 100", + "interval": "", + "legendFormat": "Object Store Memory", + "queryType": "randomWalk", + "refId": "E" + }, + { + "exemplar": true, + "expr": "sum(ray_node_disk_usage{}) / on() (sum(ray_node_disk_free{}) + sum(ray_node_disk_usage{})) * 100", + "interval": "", + "legendFormat": "Disk", + "queryType": "randomWalk", + "refId": "F" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "%", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "QPS for each selected application.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 0, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + 
"lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_serve_num_http_requests{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route)", + "interval": "", + "legendFormat": "{{application, route}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(rate(ray_serve_num_grpc_requests{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method)", + "interval": "", + "legendFormat": "{{application, method}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "QPS per application", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "qps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + 
"bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Error QPS for each selected application.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 0, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_serve_num_http_error_requests{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route)", + "interval": "", + "legendFormat": "{{application, route}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(rate(ray_serve_num_grpc_error_requests{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method)", + "interval": "", + "legendFormat": "{{application, method}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + 
"timeShift": null, + "title": "Error QPS per application", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "qps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P50 latency for selected applications.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 1, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"histogram_quantile(0.5, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))", + "interval": "", + "legendFormat": "{{application, route}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))", + "interval": "", + "legendFormat": "{{application, method}}", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P50 latency per application", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P90 latency for selected applications.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 1, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + 
"alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))", + "interval": "", + "legendFormat": "{{application, route}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))", + "interval": "", + "legendFormat": "{{application, method}}", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "C" + } + ], + "thresholds": [], + 
"timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P90 latency per application", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P99 latency for selected applications.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 1, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + 
"exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))", + "interval": "", + "legendFormat": "{{application, route}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))", + "interval": "", + "legendFormat": "{{application, method}}", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P99 latency per application", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of replicas per deployment. 
Ignores \"Application\" variable.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 2, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_deployment_replica_healthy{}) by (application, deployment)", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Replicas per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "replicas", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": 
null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "QPS for each deployment.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 2, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_serve_deployment_request_counter{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment)", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "QPS per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + 
"show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "qps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Error QPS for each deployment.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 2, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_serve_deployment_error_counter{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment)", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + } + ], +
"thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Error QPS per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "qps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P50 latency per deployment.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 3, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + 
"exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P50 latency per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P90 latency per deployment.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 3, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + 
"percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P90 latency per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + 
"description": "P99 latency per deployment.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 3, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P99 latency per deployment", + "tooltip": { + "shared": true, + "sort": 0, + 
"value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of requests queued per deployment. Ignores \"Application\" variable.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 4, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_deployment_queued_queries{}) by (application, deployment)", + "interval": "", 
+ "legendFormat": "{{application}}, {{deployment}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Queue size per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of nodes in this cluster. Ignores \"Application\" variable.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 4, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": 
true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(autoscaler_active_nodes{}) by (NodeType)", + "interval": "", + "legendFormat": "Active Nodes: {{NodeType}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(autoscaler_recently_failed_nodes{}) by (NodeType)", + "interval": "", + "legendFormat": "Failed Nodes: {{NodeType}}", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(autoscaler_pending_nodes{}) by (NodeType)", + "interval": "", + "legendFormat": "Pending Nodes: {{NodeType}}", + "queryType": "randomWalk", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "nodes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Network speed per node. 
Ignores \"Application\" variable.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 4, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_node_network_receive_speed{}) by (instance)", + "interval": "", + "legendFormat": "Recv: {{instance}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_network_send_speed{}) by (instance)", + "interval": "", + "legendFormat": "Send: {{instance}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node network", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, 
+ "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 27, + "style": "dark", + "tags": [ + "rayVersion:2.9.0" + ], + "templating": { + "list": [ + { + "current": { + "selected": false + }, + "description": "Filter queries of a specific Prometheus type.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_replica_healthy{}, application)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Application", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_replica_healthy{}, application)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_num_http_requests{}, route)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": "HTTP Route", + "multi": true, + "name": "HTTP_Route", + "options": [], + "query": { + "query": "label_values(ray_serve_num_http_requests{}, route)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", 
+ "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_num_grpc_requests{}, method)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": "gRPC Service Method", + "multi": true, + "name": "gRPC_Method", + "options": [], + "query": { + "query": "label_values(ray_serve_num_grpc_requests{}, method)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "rayMeta": [ + "excludesSystemRoutes", + "supportsGlobalFilterOverride" + ], + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Serve Dashboard", + "uid": "rayServeDashboard", + "version": 1 +}