diff --git a/alert-policies/temporal-cloud/FailedWorkflows.yml b/alert-policies/temporal-cloud/FailedWorkflows.yml new file mode 100644 index 0000000000..5cc0f79eb0 --- /dev/null +++ b/alert-policies/temporal-cloud/FailedWorkflows.yml @@ -0,0 +1,29 @@ +# Name of the alert +name: Failed Workflows + +# Description and details +description: |+ + This alert is triggered if the Temporal cloud workflows fail once within a 5-minute window. +type: STATIC + +# NRQL query +nrql: + query: "FROM temporalCloudWorkflowFailed SELECT latest(`data.result-value1`) FACET `data.result-metric-__name__`" + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 1 + # Time in seconds; 120 - 3600 + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 86400 \ No newline at end of file diff --git a/alert-policies/temporal-cloud/ServiceLatency.yml b/alert-policies/temporal-cloud/ServiceLatency.yml new file mode 100644 index 0000000000..1e7c2d8431 --- /dev/null +++ b/alert-policies/temporal-cloud/ServiceLatency.yml @@ -0,0 +1,30 @@ +# Name of the alert +name: Service Latency + +# Description and details +description: |+ + This alert is triggered if the Temporal cloud service latency exceeds 5 seconds for 5 minutes. 
+# Type of alert +type: STATIC + +# NRQL query +nrql: + query: "FROM temporalCloudServiceLatencyCount SELECT latest(`data.result-value1`) FACET `data.result-metric-__name__`" + +# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE) +valueFunction: SINGLE_VALUE + +# List of Critical and Warning thresholds for the condition +terms: + - priority: CRITICAL + # Operator used to compare against the threshold. + operator: ABOVE + # Value that triggers a violation + threshold: 1 + # Time in seconds; 120 - 3600 + thresholdDuration: 300 + # How many data points must be in violation for the duration + thresholdOccurrences: ALL +# Duration after which a violation automatically closes +# Time in seconds; 300 - 2592000 (Default: 86400 [1 day]) +violationTimeLimitSeconds: 86400 \ No newline at end of file diff --git a/dashboards/temporal-cloud/temporal-cloud-01.png b/dashboards/temporal-cloud/temporal-cloud-01.png new file mode 100644 index 0000000000..91a3869e9e Binary files /dev/null and b/dashboards/temporal-cloud/temporal-cloud-01.png differ diff --git a/dashboards/temporal-cloud/temporal-cloud-02.png b/dashboards/temporal-cloud/temporal-cloud-02.png new file mode 100644 index 0000000000..9a2e050179 Binary files /dev/null and b/dashboards/temporal-cloud/temporal-cloud-02.png differ diff --git a/dashboards/temporal-cloud/temporal-cloud-03.png b/dashboards/temporal-cloud/temporal-cloud-03.png new file mode 100644 index 0000000000..93ec212bdb Binary files /dev/null and b/dashboards/temporal-cloud/temporal-cloud-03.png differ diff --git a/dashboards/temporal-cloud/temporal-cloud.json b/dashboards/temporal-cloud/temporal-cloud.json new file mode 100644 index 0000000000..e470ca8ec3 --- /dev/null +++ b/dashboards/temporal-cloud/temporal-cloud.json @@ -0,0 +1,576 @@ +{ + "name": "Temporal Cloud", + "description": null, + "pages": [ + { + "name": "Temporal Cloud", + "description": null, + "widgets": [ + { + "title": "", 
"layout": { + "column": 1, + "row": 1, + "width": 2, + "height": 1 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.markdown" + }, + "rawConfiguration": { + "text": "![Temporal cloud icon](https://docs.temporal.io/img/temporal-logo-dark.svg)\n" + } + }, + { + "title": "Terminated Workflows", + "layout": { + "column": 3, + "row": 1, + "width": 5, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM temporalCloudWorkflowTerminate SELECT latest(`data.result-value1`) AS 'temporal_cloud_v0_workflow_terminate_count' WHERE `data.result-metric-__name__` LIKE 'temporal_cloud_v0_workflow_terminate_count' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Service Latency", + "layout": { + "column": 8, + "row": 1, + "width": 5, + "height": 4 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.stacked-bar" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM temporalCloudServiceLatencyCount SELECT latest(`data.result-value1`) AS 'temporal_cloud_v0_service_latency_count' WHERE `data.result-metric-__name__` LIKE 'temporal_cloud_v0_service_latency_count' TIMESERIES AUTO " + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "units": { + "unit": "MS" + } + } + }, + { + "title": "", + "layout": { + "column": 1, + "row": 2, + "width": 2, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.markdown" + }, + "rawConfiguration": { + "text": "**About**\n\nInstrument Temporal cloud with New Relic - [Add Data](https://one.newrelic.com/)\n\nFollow New Relic [Temporal cloud 
documentation](https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/temporal-cloud-integration/) to instrument Temporal cloud.\n\n\n[Please rate this dashboard](https://docs.google.com/forms/d/e/1FAIpQLSclR38J8WbbB2J1tHnllKUkzWZkJhf4SrJGyavpMd4t82NjnQ/viewform?usp=pp_url&entry.1615922415=Temporal_cloud&entry.358368110=https://onenr.io/0dQeLbY0Yje) here and let us know how we can improve it for you." + } + }, + { + "title": "Rate of Poll Success and Sync", + "layout": { + "column": 1, + "row": 5, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.pie" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": true + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM temporalCloudPollSuccess, temporalCloudPollSuccessSync SELECT rate(latest(`data.result-value1`),1 minute ) FACET `data.result-metric-__name__`, `data.result-metric-temporal_namespace`" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Total Number of Successful Scheduled Actions", + "layout": { + "column": 7, + "row": 5, + "width": 6, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM temporalCloudScheduleActionSuccess SELECT latest(`data.result-value1`) AS 'temporal_cloud_v0_schedule_action_success_count' WHERE `data.result-metric-__name__` LIKE 'temporal_cloud_v0_schedule_action_success_count' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Frontend Service Errors", + "layout": { + "column": 1, + "row": 8, + "width": 5, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + 
"showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM temporalCloudFrontendServiceError SELECT latest(`data.result-value1`) AS 'temporal_cloud_v0_frontend_service_error_count' WHERE `data.result-metric-__name__` LIKE 'temporal_cloud_v0_frontend_service_error_count' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Frontend Service Requests", + "layout": { + "column": 6, + "row": 8, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM temporalCloudFrontendServiceRequest SELECT latest(`data.result-value1`) AS 'temporal_cloud_v0_frontend_service_request_count' WHERE `data.result-metric-__name__` LIKE 'temporal_cloud_v0_frontend_service_request_count' TIMESERIES AUTO " + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "units": { + "unit": "COUNT" + } + } + }, + { + "title": "Failed Workflows", + "layout": { + "column": 10, + "row": 8, + "width": 3, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM temporalCloudWorkflowFailed SELECT latest(`data.result-value1`) AS 'temporal_cloud_v0_workflow_failed_count' WHERE `data.result-metric-__name__` LIKE 'temporal_cloud_v0_workflow_failed_count' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Resource Exhausted Errors", + "layout": { + "column": 1, + "row": 11, + "width": 3, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.stacked-bar" + }, + "rawConfiguration": { + "facet": { + 
"showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM temporalCloudResourceExhaustedErrors SELECT latest(`data.result-value1`) AS 'temporal_cloud_v0_resource_exhausted_error_count' WHERE `data.result-metric-__name__` LIKE 'temporal_cloud_v0_resource_exhausted_error_count' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Cancel Workflows", + "layout": { + "column": 4, + "row": 11, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM temporalCloudWorkflowCancel SELECT latest(`data.result-value1`) AS 'temporal_cloud_v0_workflow_cancel_count' WHERE `data.result-metric-__name__` LIKE 'temporal_cloud_v0_workflow_cancel_count' TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Namespace State Transitions", + "layout": { + "column": 8, + "row": 11, + "width": 5, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.pie" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": true + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "FROM temporalCloudStateTransition SELECT latest(`data.result-value1`) WHERE `data.result-metric-__name__`LIKE 'temporal_cloud_v0_state_transition_count' FACET `data.result-metric-temporal_namespace` " + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + } + ] + }, + { + "name": "Infrastructure", + "description": null, + "widgets": [ + { + "title": "CPU Usage (%)", + "layout": { + "column": 1, + "row": 1, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + 
"showOtherSeries": false + }, + "legend": { + "enabled": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT average(cpuPercent) AS `CPU used %` FROM SystemSample TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "max": 100, + "min": 0, + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Memory Usage (%)", + "layout": { + "column": 5, + "row": 1, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT average(memoryUsedPercent) AS `Memory used %` FROM SystemSample TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "max": 100, + "min": 0, + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Storage Usage (%)", + "layout": { + "column": 9, + "row": 1, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT average(diskUsedPercent) AS `Storage used %` FROM StorageSample TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "max": 100, + "min": 0, + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Network Traffic", + "layout": { + "column": 1, + "row": 4, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.area" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + 
"enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT average(transmitBytesPerSecond) AS `Transmit bytes per second`, average(receiveBytesPerSecond) AS `Receive bytes per second` FROM NetworkSample TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, + { + "title": "Load Average", + "layout": { + "column": 5, + "row": 4, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "legend": { + "enabled": true + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT average(loadAverageOneMinute) AS `1 minute`, average(loadAverageFiveMinute) AS `5 minutes`, average(loadAverageFifteenMinute) AS `15 minutes` FROM SystemSample TIMESERIES AUTO" + } + ], + "platformOptions": { + "ignoreTimeRange": false + }, + "thresholds": { + "isLabelVisible": true + }, + "yAxisLeft": { + "zero": true + }, + "yAxisRight": { + "zero": true + } + } + }, + { + "title": "Processes Running", + "layout": { + "column": 9, + "row": 4, + "width": 4, + "height": 3 + }, + "linkedEntityGuids": null, + "visualization": { + "id": "viz.table" + }, + "rawConfiguration": { + "facet": { + "showOtherSeries": false + }, + "nrqlQueries": [ + { + "accountIds": [], + "query": "SELECT latest(cpuPercent) AS `CPU %`, latest(threadCount) AS `Threads` FROM ProcessSample FACET processId, processDisplayName ORDER BY cpuPercent asc LIMIT 100" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + } + ] + } + ], + "variables": [] +} \ No newline at end of file diff --git a/data-sources/temporal-cloud/config.yml b/data-sources/temporal-cloud/config.yml new file mode 100644 index 0000000000..ab7b4761b3 --- /dev/null +++ b/data-sources/temporal-cloud/config.yml @@ -0,0 +1,15 @@ +id: temporal-cloud +displayName: Temporal cloud +description: | + Optimize your Temporal cloud performance using New Relic Temporal cloud 
monitoring. +icon: logo.svg +install: + primary: + link: + url: https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/temporal-cloud-integration/ +keywords: + - temporal + - temporal cloud + - temporal java sdk + - workflow + - cluster \ No newline at end of file diff --git a/data-sources/temporal-cloud/logo.svg b/data-sources/temporal-cloud/logo.svg new file mode 100644 index 0000000000..3f9daa2621 --- /dev/null +++ b/data-sources/temporal-cloud/logo.svg @@ -0,0 +1 @@ +Temporal \ No newline at end of file diff --git a/quickstarts/temporal-cloud/config.yml b/quickstarts/temporal-cloud/config.yml new file mode 100644 index 0000000000..20d433a069 --- /dev/null +++ b/quickstarts/temporal-cloud/config.yml @@ -0,0 +1,42 @@ +id: 48f80505-4552-4ea2-9d5d-8ce5f6f6266d +slug: temporal-cloud +description: | + ## Why monitor Temporal cloud? + Monitoring allows you to identify performance bottlenecks, such as slow workflows or inefficient resource usage, optimize your Temporal cloud for better performance, and help you detect and diagnose faults or errors in your Temporal workflows. + + ## Comprehensive monitoring quickstart for Temporal cloud + Temporal cloud provides users with workflow execution states and metadata for debugging purposes. Integrating Temporal cloud with New Relic enhances system health maintenance, early issue identification, and smooth operation assurance. This integration leverages the strengths of both systems to offer a comprehensive monitoring solution, encompassing metrics and alerts. Customize the following steps based on the specific programming language and framework of your Temporal cloud services. + + ## What’s included in this quickstart? + The New Relic Temporal cloud monitoring quickstart provides quality out-of-the-box reporting: 
+ + - Dashboards (failed, cancel and terminated workflows, service latency and frontend service requests) + - Alerts (failed workflows and service latency) + +summary: | + Monitor and analyze your Temporal cloud with New Relic +icon: logo.svg +level: New Relic +authors: + - New Relic + - Praveen Kudikyala +title: Temporal cloud +documentation: + - name: Temporal cloud integration documentation + description: | + Enhance the performance monitoring and instrumentation of your Temporal cloud by integrating New Relic. + url: >- + https://docs.newrelic.com/docs/infrastructure/host-integrations/host-integrations-list/temporal-cloud-integration/ +keywords: + - temporal + - temporal cloud + - temporal java sdk + - workflow + - cluster + - NR1_addData +dashboards: + - temporal-cloud +alertPolicies: + - temporal-cloud +dataSourceIds: + - temporal-cloud diff --git a/quickstarts/temporal-cloud/logo.svg b/quickstarts/temporal-cloud/logo.svg new file mode 100644 index 0000000000..3f9daa2621 --- /dev/null +++ b/quickstarts/temporal-cloud/logo.svg @@ -0,0 +1 @@ +Temporal \ No newline at end of file