diff --git a/release_notes.md b/release_notes.md index cb92971115..752c8e0012 100644 --- a/release_notes.md +++ b/release_notes.md @@ -12,3 +12,4 @@ - Update Python Worker Version to [4.37.0](https://github.com/Azure/azure-functions-python-worker/releases/tag/4.37.0) - Add runtime and process metrics. (#11034) - Add `win-arm64` and `linux-arm64` to the list of PowerShell runtimes; added filter for `osx` RIDs (includes `osx-x64` and `osx-arm64`) (#11013) +- Disable Diagnostic Events when Table Storage is not accessible (#10996) \ No newline at end of file diff --git a/src/WebJobs.Script.WebHost/Diagnostics/DiagnosticEventTableStorageRepository.Log.cs b/src/WebJobs.Script.WebHost/Diagnostics/DiagnosticEventTableStorageRepository.Log.cs new file mode 100644 index 0000000000..a3547bf1b6 --- /dev/null +++ b/src/WebJobs.Script.WebHost/Diagnostics/DiagnosticEventTableStorageRepository.Log.cs @@ -0,0 +1,88 @@ +// Copyright (c) .NET Foundation. All rights reserved. +// Licensed under the MIT License. See License.txt in the project root for license information. + +using System; +using Microsoft.Extensions.Logging; + +namespace Microsoft.Azure.WebJobs.Script.WebHost.Diagnostics +{ + public partial class DiagnosticEventTableStorageRepository + { + private static class Logger + { + private static readonly Action _serviceDisabledFailedToCreateClient = + LoggerMessage.Define( + LogLevel.Warning, + new EventId(1, nameof(ServiceDisabledFailedToCreateClient)), + "We couldn’t initialize the Table Storage Client using the 'AzureWebJobsStorage' connection string. We are unable to record diagnostic events, so the diagnostic logging service is being stopped. Please check the 'AzureWebJobsStorage' connection string in Application Settings."); + + private static readonly Action _serviceDisabledUnauthorizedClient = + LoggerMessage.Define( + LogLevel.Warning, + new EventId(2, nameof(ServiceDisabledUnauthorizedClient)), + "We couldn’t access the Table service in the Azure Storage account defined by the 'AzureWebJobsStorage' setting. We are unable to record diagnostic events, so the diagnostic logging service is being stopped. Please ensure the connection string or managed identity has permissions to access the Table service and that any network rules allow connectivity. If you're using an identity-based connection, make sure it has been assigned the 'Storage Table Data Contributor' role."); + + private static readonly Action _serviceDisabledUnableToConnectToStorage = + LoggerMessage.Define( + LogLevel.Warning, + new EventId(3, nameof(ServiceDisabledUnableToConnectToStorage)), + "We couldn’t reach the Table service endpoint specified in the 'AzureWebJobsStorage' setting. We are unable to record diagnostic events, so the diagnostic logging service is being stopped. Please confirm network connectivity and endpoint accessibility."); + + private static readonly Action _purgingDiagnosticEvents = + LoggerMessage.Define(LogLevel.Debug, new EventId(4, nameof(PurgingDiagnosticEvents)), "Purging diagnostic events with versions older than '{currentEventVersion}'."); + + private static readonly Action _deletingTableWithoutEventVersion = + LoggerMessage.Define(LogLevel.Debug, new EventId(5, nameof(DeletingTableWithoutEventVersion)), "Deleting table '{tableName}' as it contains records without an EventVersion."); + + private static readonly Action _deletingTableWithOutdatedEventVersion = + LoggerMessage.Define(LogLevel.Debug, new EventId(6, nameof(DeletingTableWithOutdatedEventVersion)), "Deleting table '{tableName}' as it contains records with an outdated EventVersion."); + + private static readonly Action _errorPurgingDiagnosticEventVersions = + LoggerMessage.Define(LogLevel.Error, new EventId(7, nameof(ErrorPurgingDiagnosticEventVersions)), "Error occurred when attempting to purge previous diagnostic event versions."); + + private static readonly Action _unableToGetTableReference = + LoggerMessage.Define(LogLevel.Error, new EventId(8, nameof(UnableToGetTableReference)), "Unable to get table reference. Aborting write operation."); + + private static readonly Action _unableToGetTableReferenceOrCreateTable = + LoggerMessage.Define(LogLevel.Error, new EventId(9, nameof(UnableToGetTableReferenceOrCreateTable)), "Unable to get table reference or create table. Aborting write operation."); + + private static readonly Action _unableToWriteDiagnosticEvents = + LoggerMessage.Define(LogLevel.Error, new EventId(10, nameof(UnableToWriteDiagnosticEvents)), "Unable to write diagnostic events to table storage."); + + private static readonly Action _primaryHostStateProviderNotAvailable = + LoggerMessage.Define(LogLevel.Debug, new EventId(11, nameof(PrimaryHostStateProviderNotAvailable)), "PrimaryHostStateProvider is not available. Skipping the check for primary host."); + + private static readonly Action _stoppingFlushLogsTimer = + LoggerMessage.Define(LogLevel.Information, new EventId(12, nameof(StoppingFlushLogsTimer)), "Stopping the flush logs timer."); + + private static readonly Action _queueingBackgroundTablePurge = + LoggerMessage.Define(LogLevel.Debug, new EventId(13, nameof(QueueingBackgroundTablePurge)), "Queueing background table purge."); + + public static void ServiceDisabledFailedToCreateClient(ILogger logger) => _serviceDisabledFailedToCreateClient(logger, null); + + public static void ServiceDisabledUnauthorizedClient(ILogger logger, Exception exception) => _serviceDisabledUnauthorizedClient(logger, exception); + + public static void ServiceDisabledUnableToConnectToStorage(ILogger logger, Exception exception) => _serviceDisabledUnableToConnectToStorage(logger, exception); + + public static void PurgingDiagnosticEvents(ILogger logger, string currentEventVersion) => _purgingDiagnosticEvents(logger, currentEventVersion, null); + + public static void DeletingTableWithoutEventVersion(ILogger logger, string tableName) => _deletingTableWithoutEventVersion(logger, tableName, null); + + public static void DeletingTableWithOutdatedEventVersion(ILogger logger, string tableName) => _deletingTableWithOutdatedEventVersion(logger, tableName, null); + + public static void ErrorPurgingDiagnosticEventVersions(ILogger logger, Exception exception) => _errorPurgingDiagnosticEventVersions(logger, exception); + + public static void UnableToGetTableReference(ILogger logger) => _unableToGetTableReference(logger, null); + + public static void UnableToGetTableReferenceOrCreateTable(ILogger logger, Exception exception) => _unableToGetTableReferenceOrCreateTable(logger, exception); + + public static void UnableToWriteDiagnosticEvents(ILogger logger, Exception exception) => _unableToWriteDiagnosticEvents(logger, exception); + + public static void PrimaryHostStateProviderNotAvailable(ILogger logger) => _primaryHostStateProviderNotAvailable(logger, null); + + public static void StoppingFlushLogsTimer(ILogger logger) => _stoppingFlushLogsTimer(logger, null); + + public static void QueueingBackgroundTablePurge(ILogger logger) => _queueingBackgroundTablePurge(logger, null); + } + } +} diff --git a/src/WebJobs.Script.WebHost/Diagnostics/DiagnosticEventTableStorageRepository.cs b/src/WebJobs.Script.WebHost/Diagnostics/DiagnosticEventTableStorageRepository.cs index d0397c8928..4b8a6be3bd 100644 --- a/src/WebJobs.Script.WebHost/Diagnostics/DiagnosticEventTableStorageRepository.cs +++ b/src/WebJobs.Script.WebHost/Diagnostics/DiagnosticEventTableStorageRepository.cs @@ -5,9 +5,12 @@ using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; +using System.Net; using System.Threading; using System.Threading.Tasks; +using Azure; using Azure.Data.Tables; +using Azure.Data.Tables.Models; using Microsoft.Azure.WebJobs.Host.Executors; using Microsoft.Azure.WebJobs.Hosting; using Microsoft.Azure.WebJobs.Logging; @@ -17,7 +20,7 @@ namespace Microsoft.Azure.WebJobs.Script.WebHost.Diagnostics { - public class DiagnosticEventTableStorageRepository : IDiagnosticEventRepository, IDisposable + public partial class DiagnosticEventTableStorageRepository : IDiagnosticEventRepository, IDisposable { internal const string TableNamePrefix = "AzureFunctionsDiagnosticEvents"; private const int LogFlushInterval = 1000 * 60 * 10; // 10 minutes @@ -59,11 +62,41 @@ internal TableServiceClient TableClient { get { - if (!_environment.IsPlaceholderModeEnabled() && _tableClient == null && !_azureTableStorageProvider.TryCreateHostingTableServiceClient(out _tableClient)) + if (_tableClient is null && !_environment.IsPlaceholderModeEnabled()) { - _logger.LogWarning("An error occurred initializing the Table Storage Client. We are unable to record diagnostic events, so the diagnostic logging service is being stopped."); - _isEnabled = false; - StopTimer(); + if (!_azureTableStorageProvider.TryCreateHostingTableServiceClient(out _tableClient)) + { + DisableService(); + Logger.ServiceDisabledFailedToCreateClient(_logger); + return _tableClient; + } + + try + { + // When using RBAC, we need "Storage Table Data Contributor" as we require to list, create and delete tables and query/insert/delete entities. + // Testing permissions by listing tables, creating and deleting a test table. + var testTable = _tableClient.GetTableClient($"{TableNamePrefix}Check"); + _ = TableStorageHelpers.TableExists(testTable, _tableClient); + _ = testTable.CreateIfNotExists(); + _ = testTable.Delete(); + } + catch (RequestFailedException rfe) when (rfe.Status == (int)HttpStatusCode.Conflict || rfe.ErrorCode == TableErrorCode.TableBeingDeleted) + { + // The table is being deleted or there could be a conflict for several instances initializing. + // We can ignore this error as it is not a failure and we tested the permissions. + } + catch (RequestFailedException rfe) when (rfe.Status == (int)HttpStatusCode.Forbidden) + { + DisableService(); + Logger.ServiceDisabledUnauthorizedClient(_logger, rfe); + } + catch (Exception ex) + { + // We failed to connect to the table storage account. This could be due to a transient error or a configuration issue, such network issues. + // We will disable the service. + DisableService(); + Logger.ServiceDisabledUnableToConnectToStorage(_logger, ex); + } } return _tableClient; @@ -84,6 +117,13 @@ internal string HostId internal ConcurrentDictionary Events => _events; + private void DisableService() + { + _isEnabled = false; + StopTimer(); + _events.Clear(); + } + internal TableClient GetDiagnosticEventsTable(DateTime? now = null) { if (TableClient != null) @@ -114,7 +154,7 @@ protected internal virtual async void OnFlushLogs(object state) private async Task PurgePreviousEventVersions() { - _logger.LogDebug("Purging diagnostic events with versions older than '{currentEventVersion}'.", DiagnosticEvent.CurrentEventVersion); + Logger.PurgingDiagnosticEvents(_logger, DiagnosticEvent.CurrentEventVersion); bool tableDeleted = false; @@ -133,7 +173,7 @@ await Utility.InvokeWithRetriesAsync(async () => // Delete table if it doesn't have records with EventVersion if (string.IsNullOrEmpty(record.EventVersion) == true) { - _logger.LogDebug("Deleting table '{tableName}' as it contains records without an EventVersion.", table.Name); + Logger.DeletingTableWithoutEventVersion(_logger, table.Name); await table.DeleteAsync(); tableDeleted = true; break; @@ -142,7 +182,7 @@ await Utility.InvokeWithRetriesAsync(async () => // If the table does have EventVersion, query if it is an outdated version if (string.Compare(DiagnosticEvent.CurrentEventVersion, record.EventVersion, StringComparison.Ordinal) > 0) { - _logger.LogDebug("Deleting table '{tableName}' as it contains records with an outdated EventVersion.", table.Name); + Logger.DeletingTableWithOutdatedEventVersion(_logger, table.Name); await table.DeleteAsync(); tableDeleted = true; break; @@ -154,7 +194,7 @@ await Utility.InvokeWithRetriesAsync(async () => } catch (Exception ex) { - _logger.LogError(ex, "Error occurred when attempting to purge previous diagnostic event versions."); + Logger.ErrorPurgingDiagnosticEventVersions(_logger, ex); } }, maxRetries: 5, retryInterval: TimeSpan.FromSeconds(5)); @@ -170,7 +210,7 @@ internal virtual async Task FlushLogs(TableClient table = null) // TableClient is initialized lazily and it will stop the timer that schedules flush logs whenever it fails to initialize. // We need to check if the TableClient is null before proceeding. This helps when the first time the property is accessed is as part of the FlushLogs method. // We should not have any events stored pending to be written since WriteDiagnosticEvent will check for an initialized TableClient. - if (_environment.IsPlaceholderModeEnabled() || TableClient is null) + if (_environment.IsPlaceholderModeEnabled() || TableClient is null || !IsEnabled()) { return; } @@ -186,21 +226,29 @@ internal virtual async Task FlushLogs(TableClient table = null) if (table == null) { - _logger.LogError("Unable to get table reference. Aborting write operation."); - StopTimer(); + Logger.UnableToGetTableReference(_logger); + DisableService(); return; } bool tableCreated = await TableStorageHelpers.CreateIfNotExistsAsync(table, TableClient, TableCreationMaxRetryCount); if (tableCreated) { - _logger.LogDebug("Queueing background table purge."); + Logger.QueueingBackgroundTablePurge(_logger); TableStorageHelpers.QueueBackgroundTablePurge(table, TableClient, TableNamePrefix, _logger); } } + catch (RequestFailedException ex) when (ex.Status == (int)HttpStatusCode.Forbidden) + { + // If we reach this point, we already checked for permissions on TableClient initialization. It is possible that the permissions changed after the initialization or any storage firewall/network configuration changed. + // We will log the error and disable the service. + Logger.UnableToGetTableReferenceOrCreateTable(_logger, ex); + DisableService(); + Logger.ServiceDisabledUnauthorizedClient(_logger, ex); + } catch (Exception ex) { - _logger.LogError(ex, "Unable to get table reference or create table. Aborting write operation."); + Logger.UnableToGetTableReferenceOrCreateTable(_logger, ex); // Clearing the memory cache to avoid memory build up. _events.Clear(); return; @@ -232,9 +280,18 @@ internal async Task ExecuteBatchAsync(ConcurrentDictionary(); if (primaryHostStateProvider is null) { - _logger.LogDebug("PrimaryHostStateProvider is not available. Skipping the check for primary host."); + Logger.PrimaryHostStateProviderNotAvailable(_logger); return false; } @@ -284,7 +341,7 @@ private bool IsPrimaryHost() private void StopTimer() { - _logger.LogInformation("Stopping the flush logs timer."); + Logger.StoppingFlushLogsTimer(_logger); _flushLogsTimer?.Change(Timeout.Infinite, Timeout.Infinite); } diff --git a/src/WebJobs.Script.WebHost/Helpers/TableStorageHelpers.cs b/src/WebJobs.Script.WebHost/Helpers/TableStorageHelpers.cs index e9c528a8cc..a02742abb4 100644 --- a/src/WebJobs.Script.WebHost/Helpers/TableStorageHelpers.cs +++ b/src/WebJobs.Script.WebHost/Helpers/TableStorageHelpers.cs @@ -96,6 +96,8 @@ internal static async Task> ListOldTablesAsync(TableCli internal static async Task> ListTablesAsync(TableServiceClient tableClient, string tableNamePrefix) { + ArgumentNullException.ThrowIfNull(tableClient, nameof(tableClient)); + // Azure.Data.Tables doesn't have a direct way to list tables with a prefix so we need to do it manually var givenValue = tableNamePrefix + "{"; AsyncPageable tablesQuery = tableClient.QueryAsync(p => p.Name.CompareTo(tableNamePrefix) >= 0 && p.Name.CompareTo(givenValue) <= 0); @@ -120,5 +122,17 @@ internal static async Task TableExistAsync(TableClient table, TableService return false; } + + internal static bool TableExists(TableClient table, TableServiceClient tableClient) + { + var query = tableClient.Query(p => p.Name == table.Name); + + foreach (var item in query) + { + return true; + } + + return false; + } } } diff --git a/test/WebJobs.Script.Tests.Integration/Diagnostics/DiagnosticEventTableStorageRepositoryTests.cs b/test/WebJobs.Script.Tests.Integration/Diagnostics/DiagnosticEventTableStorageRepositoryTests.cs index 624eb55053..d9a528535a 100644 --- a/test/WebJobs.Script.Tests.Integration/Diagnostics/DiagnosticEventTableStorageRepositoryTests.cs +++ b/test/WebJobs.Script.Tests.Integration/Diagnostics/DiagnosticEventTableStorageRepositoryTests.cs @@ -19,21 +19,24 @@ using Moq; using Moq.Protected; using Xunit; +using Microsoft.Azure.WebJobs.Script.Tests.Integration.Fixtures; namespace Microsoft.Azure.WebJobs.Script.Tests.Integration.Diagnostics { - public class DiagnosticEventTableStorageRepositoryTests + public class DiagnosticEventTableStorageRepositoryTests : IClassFixture { private const string TestHostId = "testhostid"; private readonly IHostIdProvider _hostIdProvider; private readonly TestLoggerProvider _loggerProvider; private readonly IAzureTableStorageProvider _azureTableStorageProvider; + private readonly AzuriteFixture _azurite; private ILogger _logger; private Mock _scriptHostMock; - public DiagnosticEventTableStorageRepositoryTests() + public DiagnosticEventTableStorageRepositoryTests(AzuriteFixture azurite) { + _azurite = azurite; _hostIdProvider = new FixedHostIdProvider(TestHostId); var mockPrimaryHostStateProvider = new Mock(MockBehavior.Strict); @@ -47,7 +50,12 @@ public DiagnosticEventTableStorageRepositoryTests() loggerFactory.AddProvider(_loggerProvider); _logger = loggerFactory.CreateLogger(); - var configuration = new ConfigurationBuilder().AddEnvironmentVariables().Build(); + var configuration = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary(StringComparer.OrdinalIgnoreCase) + { + { "AzureWebJobsStorage", _azurite.GetConnectionString() }, + }) + .Build(); _azureTableStorageProvider = TestHelpers.GetAzureTableStorageProvider(configuration); } @@ -164,7 +172,8 @@ public void GetDiagnosticEventsTable_LogsError_StorageConnectionStringIsNotPrese var cloudTable = repository.GetDiagnosticEventsTable(dateTime); Assert.Null(cloudTable); var messages = _loggerProvider.GetAllLogMessages(); - Assert.Equal(messages[0].FormattedMessage, "An error occurred initializing the Table Storage Client. We are unable to record diagnostic events, so the diagnostic logging service is being stopped."); + var errorIntializingPresent = messages.Any(m => m.FormattedMessage.Contains("We couldn’t initialize the Table Storage Client using the 'AzureWebJobsStorage' connection string. We are unable to record diagnostic events, so the diagnostic logging service is being stopped. Please check the 'AzureWebJobsStorage' connection string in Application Settings.")); + Assert.True(errorIntializingPresent); Assert.False(repository.IsEnabled()); } @@ -285,7 +294,7 @@ public async Task FlushLogs_OnPrimaryHost_DoesNotTryToPurgeEvents_WhenTableClien await repository.FlushLogs(); // Assert - var createFailureMessagePresent = _loggerProvider.GetAllLogMessages().Any(m => m.FormattedMessage.Contains("An error occurred initializing the Table Storage Client. We are unable to record diagnostic events, so the diagnostic logging service is being stopped.")); + var createFailureMessagePresent = _loggerProvider.GetAllLogMessages().Any(m => m.FormattedMessage.Contains("We couldn’t initialize the Table Storage Client using the 'AzureWebJobsStorage' connection string. We are unable to record diagnostic events, so the diagnostic logging service is being stopped. Please check the 'AzureWebJobsStorage' connection string in Application Settings.")); Assert.True(createFailureMessagePresent); var purgeEventMessagePresent = _loggerProvider.GetAllLogMessages().Any(m => m.FormattedMessage.Contains("Purging diagnostic events with versions older than")); @@ -418,11 +427,35 @@ public async Task ExecuteBatchAsync_LogsError() events.TryAdd("EC123", diagnosticEvent); await repository.ExecuteBatchAsync(events, table); - ExecuteQuery(tableClient, table); string message = _loggerProvider.GetAllLogMessages()[0].FormattedMessage; Assert.True(message.StartsWith("Unable to write diagnostic events to table storage")); } + [Fact] + public async Task FlushLogs_DisablesService_NoPermissions() + { + IEnvironment testEnvironment = new TestEnvironment(); + testEnvironment.SetEnvironmentVariable(EnvironmentSettingNames.AzureWebsitePlaceholderMode, "0"); + + // Tamper the connection string to simulate a permissions issue, we need a valid base64 connection string to create the TableServiceClient. + var connectionString = _azurite.GetConnectionString(); + var configuration = new ConfigurationBuilder() + .AddInMemoryCollection(new Dictionary(StringComparer.OrdinalIgnoreCase) + { + { "AzureWebJobsStorage", connectionString.Replace(AzuriteFixture.AccountKey, TamperKey(AzuriteFixture.AccountKey)) }, + }) + .Build(); + var localStorageProvider = TestHelpers.GetAzureTableStorageProvider(configuration); + + DiagnosticEventTableStorageRepository repository = + new DiagnosticEventTableStorageRepository(_hostIdProvider, testEnvironment, _scriptHostMock.Object, localStorageProvider, _logger); + + // The repository should be initially enabled and then disabled after failing to initialize the TableServiceClient while flushing Logs. + Assert.True(repository.IsEnabled()); + await repository.FlushLogs(); + Assert.False(repository.IsEnabled()); + } + private DiagnosticEvent CreateDiagnosticEvent(DateTime timestamp, string errorCode, LogLevel level, string message, string helpLink, Exception exception, string eventVersion) { var diagnosticEvent = new DiagnosticEvent(TestHostId, timestamp) @@ -459,6 +492,14 @@ internal IEnumerable ExecuteQuery(TableServiceClient tableClient, T return table.Query(); } + private string TamperKey(string key) => + string.Create(key.Length, key, (destination, source) => + { + source.AsSpan().CopyTo(destination); + // We simply replace the first character to maintain a well-formed base64 key + destination[0] = destination[0] == 'A' ? 'B' : 'A'; + }); + private class FixedHostIdProvider : IHostIdProvider { private readonly string _hostId;