Skip to content

Commit 5f82330

Browse files
cjaliagasoninaren
authored andcommitted
Logging improvements to DiagnosticEventTableStorageRepository (#10996)
1 parent 441e35c commit 5f82330

File tree

5 files changed

+224
-24
lines changed

5 files changed

+224
-24
lines changed

release_notes.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22

33
<!-- Please add your release notes in the following format:
44
- My change description (#PR)
5-
-->
5+
- Disable Diagnostic Events when Table Storage is not accessible (#10996)
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
// Copyright (c) .NET Foundation. All rights reserved.
2+
// Licensed under the MIT License. See License.txt in the project root for license information.
3+
4+
using System;
5+
using Microsoft.Extensions.Logging;
6+
7+
namespace Microsoft.Azure.WebJobs.Script.WebHost.Diagnostics
8+
{
9+
public partial class DiagnosticEventTableStorageRepository
10+
{
11+
private static class Logger
12+
{
13+
private static readonly Action<ILogger, Exception> _serviceDisabledFailedToCreateClient =
14+
LoggerMessage.Define(
15+
LogLevel.Warning,
16+
new EventId(1, nameof(ServiceDisabledFailedToCreateClient)),
17+
"We couldn’t initialize the Table Storage Client using the 'AzureWebJobsStorage' connection string. We are unable to record diagnostic events, so the diagnostic logging service is being stopped. Please check the 'AzureWebJobsStorage' connection string in Application Settings.");
18+
19+
private static readonly Action<ILogger, Exception> _serviceDisabledUnauthorizedClient =
20+
LoggerMessage.Define(
21+
LogLevel.Warning,
22+
new EventId(2, nameof(ServiceDisabledUnauthorizedClient)),
23+
"We couldn’t access the Table service in the Azure Storage account defined by the 'AzureWebJobsStorage' setting. We are unable to record diagnostic events, so the diagnostic logging service is being stopped. Please ensure the connection string or managed identity has permissions to access the Table service and that any network rules allow connectivity. If you're using an identity-based connection, make sure it has been assigned the 'Storage Table Data Contributor' role.");
24+
25+
private static readonly Action<ILogger, Exception> _serviceDisabledUnableToConnectToStorage =
26+
LoggerMessage.Define(
27+
LogLevel.Warning,
28+
new EventId(3, nameof(ServiceDisabledUnableToConnectToStorage)),
29+
"We couldn’t reach the Table service endpoint specified in the 'AzureWebJobsStorage' setting. We are unable to record diagnostic events, so the diagnostic logging service is being stopped. Please confirm network connectivity and endpoint accessibility.");
30+
31+
private static readonly Action<ILogger, string, Exception> _purgingDiagnosticEvents =
32+
LoggerMessage.Define<string>(LogLevel.Debug, new EventId(4, nameof(PurgingDiagnosticEvents)), "Purging diagnostic events with versions older than '{currentEventVersion}'.");
33+
34+
private static readonly Action<ILogger, string, Exception> _deletingTableWithoutEventVersion =
35+
LoggerMessage.Define<string>(LogLevel.Debug, new EventId(5, nameof(DeletingTableWithoutEventVersion)), "Deleting table '{tableName}' as it contains records without an EventVersion.");
36+
37+
private static readonly Action<ILogger, string, Exception> _deletingTableWithOutdatedEventVersion =
38+
LoggerMessage.Define<string>(LogLevel.Debug, new EventId(6, nameof(DeletingTableWithOutdatedEventVersion)), "Deleting table '{tableName}' as it contains records with an outdated EventVersion.");
39+
40+
private static readonly Action<ILogger, Exception> _errorPurgingDiagnosticEventVersions =
41+
LoggerMessage.Define(LogLevel.Error, new EventId(7, nameof(ErrorPurgingDiagnosticEventVersions)), "Error occurred when attempting to purge previous diagnostic event versions.");
42+
43+
private static readonly Action<ILogger, Exception> _unableToGetTableReference =
44+
LoggerMessage.Define(LogLevel.Error, new EventId(8, nameof(UnableToGetTableReference)), "Unable to get table reference. Aborting write operation.");
45+
46+
private static readonly Action<ILogger, Exception> _unableToGetTableReferenceOrCreateTable =
47+
LoggerMessage.Define(LogLevel.Error, new EventId(9, nameof(UnableToGetTableReferenceOrCreateTable)), "Unable to get table reference or create table. Aborting write operation.");
48+
49+
private static readonly Action<ILogger, Exception> _unableToWriteDiagnosticEvents =
50+
LoggerMessage.Define(LogLevel.Error, new EventId(10, nameof(UnableToWriteDiagnosticEvents)), "Unable to write diagnostic events to table storage.");
51+
52+
private static readonly Action<ILogger, Exception> _primaryHostStateProviderNotAvailable =
53+
LoggerMessage.Define(LogLevel.Debug, new EventId(11, nameof(PrimaryHostStateProviderNotAvailable)), "PrimaryHostStateProvider is not available. Skipping the check for primary host.");
54+
55+
private static readonly Action<ILogger, Exception> _stoppingFlushLogsTimer =
56+
LoggerMessage.Define(LogLevel.Information, new EventId(12, nameof(StoppingFlushLogsTimer)), "Stopping the flush logs timer.");
57+
58+
private static readonly Action<ILogger, Exception> _queueingBackgroundTablePurge =
59+
LoggerMessage.Define(LogLevel.Debug, new EventId(13, nameof(QueueingBackgroundTablePurge)), "Queueing background table purge.");
60+
61+
public static void ServiceDisabledFailedToCreateClient(ILogger logger) => _serviceDisabledFailedToCreateClient(logger, null);
62+
63+
public static void ServiceDisabledUnauthorizedClient(ILogger logger, Exception exception) => _serviceDisabledUnauthorizedClient(logger, exception);
64+
65+
public static void ServiceDisabledUnableToConnectToStorage(ILogger logger, Exception exception) => _serviceDisabledUnableToConnectToStorage(logger, exception);
66+
67+
public static void PurgingDiagnosticEvents(ILogger logger, string currentEventVersion) => _purgingDiagnosticEvents(logger, currentEventVersion, null);
68+
69+
public static void DeletingTableWithoutEventVersion(ILogger logger, string tableName) => _deletingTableWithoutEventVersion(logger, tableName, null);
70+
71+
public static void DeletingTableWithOutdatedEventVersion(ILogger logger, string tableName) => _deletingTableWithOutdatedEventVersion(logger, tableName, null);
72+
73+
public static void ErrorPurgingDiagnosticEventVersions(ILogger<DiagnosticEventTableStorageRepository> logger, Exception exception) => _errorPurgingDiagnosticEventVersions(logger, exception);
74+
75+
public static void UnableToGetTableReference(ILogger logger) => _unableToGetTableReference(logger, null);
76+
77+
public static void UnableToGetTableReferenceOrCreateTable(ILogger logger, Exception exception) => _unableToGetTableReferenceOrCreateTable(logger, exception);
78+
79+
public static void UnableToWriteDiagnosticEvents(ILogger logger, Exception exception) => _unableToWriteDiagnosticEvents(logger, exception);
80+
81+
public static void PrimaryHostStateProviderNotAvailable(ILogger logger) => _primaryHostStateProviderNotAvailable(logger, null);
82+
83+
public static void StoppingFlushLogsTimer(ILogger logger) => _stoppingFlushLogsTimer(logger, null);
84+
85+
public static void QueueingBackgroundTablePurge(ILogger logger) => _queueingBackgroundTablePurge(logger, null);
86+
}
87+
}
88+
}

src/WebJobs.Script.WebHost/Diagnostics/DiagnosticEventTableStorageRepository.cs

Lines changed: 74 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@
55
using System.Collections.Concurrent;
66
using System.Collections.Generic;
77
using System.Linq;
8+
using System.Net;
89
using System.Threading;
910
using System.Threading.Tasks;
11+
using Azure;
1012
using Azure.Data.Tables;
13+
using Azure.Data.Tables.Models;
1114
using Microsoft.Azure.WebJobs.Host.Executors;
1215
using Microsoft.Azure.WebJobs.Hosting;
1316
using Microsoft.Azure.WebJobs.Logging;
@@ -17,7 +20,7 @@
1720

1821
namespace Microsoft.Azure.WebJobs.Script.WebHost.Diagnostics
1922
{
20-
public class DiagnosticEventTableStorageRepository : IDiagnosticEventRepository, IDisposable
23+
public partial class DiagnosticEventTableStorageRepository : IDiagnosticEventRepository, IDisposable
2124
{
2225
internal const string TableNamePrefix = "AzureFunctionsDiagnosticEvents";
2326
private const int LogFlushInterval = 1000 * 60 * 10; // 10 minutes
@@ -59,11 +62,41 @@ internal TableServiceClient TableClient
5962
{
6063
get
6164
{
62-
if (!_environment.IsPlaceholderModeEnabled() && _tableClient == null && !_azureTableStorageProvider.TryCreateHostingTableServiceClient(out _tableClient))
65+
if (_tableClient is null && !_environment.IsPlaceholderModeEnabled())
6366
{
64-
_logger.LogWarning("An error occurred initializing the Table Storage Client. We are unable to record diagnostic events, so the diagnostic logging service is being stopped.");
65-
_isEnabled = false;
66-
StopTimer();
67+
if (!_azureTableStorageProvider.TryCreateHostingTableServiceClient(out _tableClient))
68+
{
69+
DisableService();
70+
Logger.ServiceDisabledFailedToCreateClient(_logger);
71+
return _tableClient;
72+
}
73+
74+
try
75+
{
76+
// When using RBAC, we need "Storage Table Data Contributor" as we require to list, create and delete tables and query/insert/delete entities.
77+
// Testing permissions by listing tables, creating and deleting a test table.
78+
var testTable = _tableClient.GetTableClient($"{TableNamePrefix}Check");
79+
_ = TableStorageHelpers.TableExists(testTable, _tableClient);
80+
_ = testTable.CreateIfNotExists();
81+
_ = testTable.Delete();
82+
}
83+
catch (RequestFailedException rfe) when (rfe.Status == (int)HttpStatusCode.Conflict || rfe.ErrorCode == TableErrorCode.TableBeingDeleted)
84+
{
85+
// The table is being deleted or there could be a conflict for several instances initializing.
86+
// We can ignore this error as it is not a failure and we tested the permissions.
87+
}
88+
catch (RequestFailedException rfe) when (rfe.Status == (int)HttpStatusCode.Forbidden)
89+
{
90+
DisableService();
91+
Logger.ServiceDisabledUnauthorizedClient(_logger, rfe);
92+
}
93+
catch (Exception ex)
94+
{
95+
// We failed to connect to the table storage account. This could be due to a transient error or a configuration issue, such network issues.
96+
// We will disable the service.
97+
DisableService();
98+
Logger.ServiceDisabledUnableToConnectToStorage(_logger, ex);
99+
}
67100
}
68101

69102
return _tableClient;
@@ -84,6 +117,13 @@ internal string HostId
84117

85118
internal ConcurrentDictionary<string, DiagnosticEvent> Events => _events;
86119

120+
private void DisableService()
121+
{
122+
_isEnabled = false;
123+
StopTimer();
124+
_events.Clear();
125+
}
126+
87127
internal TableClient GetDiagnosticEventsTable(DateTime? now = null)
88128
{
89129
if (TableClient != null)
@@ -114,7 +154,7 @@ protected internal virtual async void OnFlushLogs(object state)
114154

115155
private async Task PurgePreviousEventVersions()
116156
{
117-
_logger.LogDebug("Purging diagnostic events with versions older than '{currentEventVersion}'.", DiagnosticEvent.CurrentEventVersion);
157+
Logger.PurgingDiagnosticEvents(_logger, DiagnosticEvent.CurrentEventVersion);
118158

119159
bool tableDeleted = false;
120160

@@ -133,7 +173,7 @@ await Utility.InvokeWithRetriesAsync(async () =>
133173
// Delete table if it doesn't have records with EventVersion
134174
if (string.IsNullOrEmpty(record.EventVersion) == true)
135175
{
136-
_logger.LogDebug("Deleting table '{tableName}' as it contains records without an EventVersion.", table.Name);
176+
Logger.DeletingTableWithoutEventVersion(_logger, table.Name);
137177
await table.DeleteAsync();
138178
tableDeleted = true;
139179
break;
@@ -142,7 +182,7 @@ await Utility.InvokeWithRetriesAsync(async () =>
142182
// If the table does have EventVersion, query if it is an outdated version
143183
if (string.Compare(DiagnosticEvent.CurrentEventVersion, record.EventVersion, StringComparison.Ordinal) > 0)
144184
{
145-
_logger.LogDebug("Deleting table '{tableName}' as it contains records with an outdated EventVersion.", table.Name);
185+
Logger.DeletingTableWithOutdatedEventVersion(_logger, table.Name);
146186
await table.DeleteAsync();
147187
tableDeleted = true;
148188
break;
@@ -154,7 +194,7 @@ await Utility.InvokeWithRetriesAsync(async () =>
154194
}
155195
catch (Exception ex)
156196
{
157-
_logger.LogError(ex, "Error occurred when attempting to purge previous diagnostic event versions.");
197+
Logger.ErrorPurgingDiagnosticEventVersions(_logger, ex);
158198
}
159199
}, maxRetries: 5, retryInterval: TimeSpan.FromSeconds(5));
160200

@@ -170,7 +210,7 @@ internal virtual async Task FlushLogs(TableClient table = null)
170210
// TableClient is initialized lazily and it will stop the timer that schedules flush logs whenever it fails to initialize.
171211
// We need to check if the TableClient is null before proceeding. This helps when the first time the property is accessed is as part of the FlushLogs method.
172212
// We should not have any events stored pending to be written since WriteDiagnosticEvent will check for an initialized TableClient.
173-
if (_environment.IsPlaceholderModeEnabled() || TableClient is null)
213+
if (_environment.IsPlaceholderModeEnabled() || TableClient is null || !IsEnabled())
174214
{
175215
return;
176216
}
@@ -186,21 +226,29 @@ internal virtual async Task FlushLogs(TableClient table = null)
186226

187227
if (table == null)
188228
{
189-
_logger.LogError("Unable to get table reference. Aborting write operation.");
190-
StopTimer();
229+
Logger.UnableToGetTableReference(_logger);
230+
DisableService();
191231
return;
192232
}
193233

194234
bool tableCreated = await TableStorageHelpers.CreateIfNotExistsAsync(table, TableClient, TableCreationMaxRetryCount);
195235
if (tableCreated)
196236
{
197-
_logger.LogDebug("Queueing background table purge.");
237+
Logger.QueueingBackgroundTablePurge(_logger);
198238
TableStorageHelpers.QueueBackgroundTablePurge(table, TableClient, TableNamePrefix, _logger);
199239
}
200240
}
241+
catch (RequestFailedException ex) when (ex.Status == (int)HttpStatusCode.Forbidden)
242+
{
243+
// If we reach this point, we already checked for permissions on TableClient initialization. It is possible that the permissions changed after the initialization or any storage firewall/network configuration changed.
244+
// We will log the error and disable the service.
245+
Logger.UnableToGetTableReferenceOrCreateTable(_logger, ex);
246+
DisableService();
247+
Logger.ServiceDisabledUnauthorizedClient(_logger, ex);
248+
}
201249
catch (Exception ex)
202250
{
203-
_logger.LogError(ex, "Unable to get table reference or create table. Aborting write operation.");
251+
Logger.UnableToGetTableReferenceOrCreateTable(_logger, ex);
204252
// Clearing the memory cache to avoid memory build up.
205253
_events.Clear();
206254
return;
@@ -232,9 +280,18 @@ internal async Task ExecuteBatchAsync(ConcurrentDictionary<string, DiagnosticEve
232280
await table.SubmitTransactionAsync(batch);
233281
events.Clear();
234282
}
283+
catch (RequestFailedException ex) when (ex.Status == (int)HttpStatusCode.Forbidden)
284+
{
285+
// If we reach this point, we already checked for permissions on TableClient initialization.
286+
// It is possible that the permissions changed after the initialization, any firewall/network rules were changed or it's a custom role where we don't have permissions to write entities.
287+
// We will log the error and disable the service.
288+
Logger.UnableToWriteDiagnosticEvents(_logger, ex);
289+
DisableService();
290+
Logger.ServiceDisabledUnauthorizedClient(_logger, ex);
291+
}
235292
catch (Exception ex)
236293
{
237-
_logger.LogError(ex, "Unable to write diagnostic events to table storage.");
294+
Logger.UnableToWriteDiagnosticEvents(_logger, ex);
238295
}
239296
}
240297

@@ -275,7 +332,7 @@ private bool IsPrimaryHost()
275332
var primaryHostStateProvider = _serviceProvider?.GetService<IPrimaryHostStateProvider>();
276333
if (primaryHostStateProvider is null)
277334
{
278-
_logger.LogDebug("PrimaryHostStateProvider is not available. Skipping the check for primary host.");
335+
Logger.PrimaryHostStateProviderNotAvailable(_logger);
279336
return false;
280337
}
281338

@@ -284,7 +341,7 @@ private bool IsPrimaryHost()
284341

285342
private void StopTimer()
286343
{
287-
_logger.LogInformation("Stopping the flush logs timer.");
344+
Logger.StoppingFlushLogsTimer(_logger);
288345
_flushLogsTimer?.Change(Timeout.Infinite, Timeout.Infinite);
289346
}
290347

src/WebJobs.Script.WebHost/Helpers/TableStorageHelpers.cs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ internal static async Task<IEnumerable<TableClient>> ListOldTablesAsync(TableCli
9696

9797
internal static async Task<IEnumerable<TableClient>> ListTablesAsync(TableServiceClient tableClient, string tableNamePrefix)
9898
{
99+
ArgumentNullException.ThrowIfNull(tableClient, nameof(tableClient));
100+
99101
// Azure.Data.Tables doesn't have a direct way to list tables with a prefix so we need to do it manually
100102
var givenValue = tableNamePrefix + "{";
101103
AsyncPageable<TableItem> tablesQuery = tableClient.QueryAsync(p => p.Name.CompareTo(tableNamePrefix) >= 0 && p.Name.CompareTo(givenValue) <= 0);
@@ -120,5 +122,17 @@ internal static async Task<bool> TableExistAsync(TableClient table, TableService
120122

121123
return false;
122124
}
125+
126+
internal static bool TableExists(TableClient table, TableServiceClient tableClient)
127+
{
128+
var query = tableClient.Query(p => p.Name == table.Name);
129+
130+
foreach (var item in query)
131+
{
132+
return true;
133+
}
134+
135+
return false;
136+
}
123137
}
124138
}

0 commit comments

Comments
 (0)