Skip to content

Logging improvements to DiagnosticEventTableStorageRepository #10996

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 29, 2025
1 change: 1 addition & 0 deletions release_notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@
- Update Python Worker Version to [4.37.0](https://github.com/Azure/azure-functions-python-worker/releases/tag/4.37.0)
- Add runtime and process metrics. (#11034)
- Add `win-arm64` and `linux-arm64` to the list of PowerShell runtimes; added filter for `osx` RIDs (includes `osx-x64` and `osx-arm64`) (#11013)
- Disable Diagnostic Events when Table Storage is not accessible (#10996)
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the MIT License. See License.txt in the project root for license information.

using System;
using Microsoft.Extensions.Logging;

namespace Microsoft.Azure.WebJobs.Script.WebHost.Diagnostics
{
public partial class DiagnosticEventTableStorageRepository
{
private static class Logger
{
private static readonly Action<ILogger, Exception> _serviceDisabledFailedToCreateClient =
LoggerMessage.Define(
LogLevel.Warning,
new EventId(1, nameof(ServiceDisabledFailedToCreateClient)),
"We couldn’t initialize the Table Storage Client using the 'AzureWebJobsStorage' connection string. We are unable to record diagnostic events, so the diagnostic logging service is being stopped. Please check the 'AzureWebJobsStorage' connection string in Application Settings.");

private static readonly Action<ILogger, Exception> _serviceDisabledUnauthorizedClient =
LoggerMessage.Define(
LogLevel.Warning,
new EventId(2, nameof(ServiceDisabledUnauthorizedClient)),
"We couldn’t access the Table service in the Azure Storage account defined by the 'AzureWebJobsStorage' setting. We are unable to record diagnostic events, so the diagnostic logging service is being stopped. Please ensure the connection string or managed identity has permissions to access the Table service and that any network rules allow connectivity. If you're using an identity-based connection, make sure it has been assigned the 'Storage Table Data Contributor' role.");

private static readonly Action<ILogger, Exception> _serviceDisabledUnableToConnectToStorage =
LoggerMessage.Define(
LogLevel.Warning,
new EventId(3, nameof(ServiceDisabledUnableToConnectToStorage)),
"We couldn’t reach the Table service endpoint specified in the 'AzureWebJobsStorage' setting. We are unable to record diagnostic events, so the diagnostic logging service is being stopped. Please confirm network connectivity and endpoint accessibility.");

private static readonly Action<ILogger, string, Exception> _purgingDiagnosticEvents =
LoggerMessage.Define<string>(LogLevel.Debug, new EventId(4, nameof(PurgingDiagnosticEvents)), "Purging diagnostic events with versions older than '{currentEventVersion}'.");

private static readonly Action<ILogger, string, Exception> _deletingTableWithoutEventVersion =
LoggerMessage.Define<string>(LogLevel.Debug, new EventId(5, nameof(DeletingTableWithoutEventVersion)), "Deleting table '{tableName}' as it contains records without an EventVersion.");

private static readonly Action<ILogger, string, Exception> _deletingTableWithOutdatedEventVersion =
LoggerMessage.Define<string>(LogLevel.Debug, new EventId(6, nameof(DeletingTableWithOutdatedEventVersion)), "Deleting table '{tableName}' as it contains records with an outdated EventVersion.");

private static readonly Action<ILogger, Exception> _errorPurgingDiagnosticEventVersions =
LoggerMessage.Define(LogLevel.Error, new EventId(7, nameof(ErrorPurgingDiagnosticEventVersions)), "Error occurred when attempting to purge previous diagnostic event versions.");

private static readonly Action<ILogger, Exception> _unableToGetTableReference =
LoggerMessage.Define(LogLevel.Error, new EventId(8, nameof(UnableToGetTableReference)), "Unable to get table reference. Aborting write operation.");

private static readonly Action<ILogger, Exception> _unableToGetTableReferenceOrCreateTable =
LoggerMessage.Define(LogLevel.Error, new EventId(9, nameof(UnableToGetTableReferenceOrCreateTable)), "Unable to get table reference or create table. Aborting write operation.");

private static readonly Action<ILogger, Exception> _unableToWriteDiagnosticEvents =
LoggerMessage.Define(LogLevel.Error, new EventId(10, nameof(UnableToWriteDiagnosticEvents)), "Unable to write diagnostic events to table storage.");

private static readonly Action<ILogger, Exception> _primaryHostStateProviderNotAvailable =
LoggerMessage.Define(LogLevel.Debug, new EventId(11, nameof(PrimaryHostStateProviderNotAvailable)), "PrimaryHostStateProvider is not available. Skipping the check for primary host.");

private static readonly Action<ILogger, Exception> _stoppingFlushLogsTimer =
LoggerMessage.Define(LogLevel.Information, new EventId(12, nameof(StoppingFlushLogsTimer)), "Stopping the flush logs timer.");

private static readonly Action<ILogger, Exception> _queueingBackgroundTablePurge =
LoggerMessage.Define(LogLevel.Debug, new EventId(13, nameof(QueueingBackgroundTablePurge)), "Queueing background table purge.");

public static void ServiceDisabledFailedToCreateClient(ILogger logger) => _serviceDisabledFailedToCreateClient(logger, null);

public static void ServiceDisabledUnauthorizedClient(ILogger logger, Exception exception) => _serviceDisabledUnauthorizedClient(logger, exception);

public static void ServiceDisabledUnableToConnectToStorage(ILogger logger, Exception exception) => _serviceDisabledUnableToConnectToStorage(logger, exception);

public static void PurgingDiagnosticEvents(ILogger logger, string currentEventVersion) => _purgingDiagnosticEvents(logger, currentEventVersion, null);

public static void DeletingTableWithoutEventVersion(ILogger logger, string tableName) => _deletingTableWithoutEventVersion(logger, tableName, null);

public static void DeletingTableWithOutdatedEventVersion(ILogger logger, string tableName) => _deletingTableWithOutdatedEventVersion(logger, tableName, null);

public static void ErrorPurgingDiagnosticEventVersions(ILogger<DiagnosticEventTableStorageRepository> logger, Exception exception) => _errorPurgingDiagnosticEventVersions(logger, exception);

public static void UnableToGetTableReference(ILogger logger) => _unableToGetTableReference(logger, null);

public static void UnableToGetTableReferenceOrCreateTable(ILogger logger, Exception exception) => _unableToGetTableReferenceOrCreateTable(logger, exception);

public static void UnableToWriteDiagnosticEvents(ILogger logger, Exception exception) => _unableToWriteDiagnosticEvents(logger, exception);

public static void PrimaryHostStateProviderNotAvailable(ILogger logger) => _primaryHostStateProviderNotAvailable(logger, null);

public static void StoppingFlushLogsTimer(ILogger logger) => _stoppingFlushLogsTimer(logger, null);

public static void QueueingBackgroundTablePurge(ILogger logger) => _queueingBackgroundTablePurge(logger, null);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Threading;
using System.Threading.Tasks;
using Azure;
using Azure.Data.Tables;
using Azure.Data.Tables.Models;
using Microsoft.Azure.WebJobs.Host.Executors;
using Microsoft.Azure.WebJobs.Hosting;
using Microsoft.Azure.WebJobs.Logging;
Expand All @@ -17,7 +20,7 @@

namespace Microsoft.Azure.WebJobs.Script.WebHost.Diagnostics
{
public class DiagnosticEventTableStorageRepository : IDiagnosticEventRepository, IDisposable
public partial class DiagnosticEventTableStorageRepository : IDiagnosticEventRepository, IDisposable
{
internal const string TableNamePrefix = "AzureFunctionsDiagnosticEvents";
private const int LogFlushInterval = 1000 * 60 * 10; // 10 minutes
Expand Down Expand Up @@ -59,11 +62,41 @@ internal TableServiceClient TableClient
{
get
{
if (!_environment.IsPlaceholderModeEnabled() && _tableClient == null && !_azureTableStorageProvider.TryCreateHostingTableServiceClient(out _tableClient))
if (_tableClient is null && !_environment.IsPlaceholderModeEnabled())
{
_logger.LogWarning("An error occurred initializing the Table Storage Client. We are unable to record diagnostic events, so the diagnostic logging service is being stopped.");
_isEnabled = false;
StopTimer();
if (!_azureTableStorageProvider.TryCreateHostingTableServiceClient(out _tableClient))
{
DisableService();
Logger.ServiceDisabledFailedToCreateClient(_logger);
return _tableClient;
}

try
{
// When using RBAC, we need "Storage Table Data Contributor" as we require to list, create and delete tables and query/insert/delete entities.
// Testing permissions by listing tables, creating and deleting a test table.
var testTable = _tableClient.GetTableClient($"{TableNamePrefix}Check");
_ = TableStorageHelpers.TableExists(testTable, _tableClient);
_ = testTable.CreateIfNotExists();
_ = testTable.Delete();
}
catch (RequestFailedException rfe) when (rfe.Status == (int)HttpStatusCode.Conflict || rfe.ErrorCode == TableErrorCode.TableBeingDeleted)
{
// The table is being deleted or there could be a conflict for several instances initializing.
// We can ignore this error as it is not a failure and we tested the permissions.
}
catch (RequestFailedException rfe) when (rfe.Status == (int)HttpStatusCode.Forbidden)
{
DisableService();
Logger.ServiceDisabledUnauthorizedClient(_logger, rfe);
}
catch (Exception ex)
{
// We failed to connect to the table storage account. This could be due to a transient error or a configuration issue, such network issues.
// We will disable the service.
DisableService();
Logger.ServiceDisabledUnableToConnectToStorage(_logger, ex);
}
}

return _tableClient;
Expand All @@ -84,6 +117,13 @@ internal string HostId

internal ConcurrentDictionary<string, DiagnosticEvent> Events => _events;

private void DisableService()
{
_isEnabled = false;
StopTimer();
_events.Clear();
}

internal TableClient GetDiagnosticEventsTable(DateTime? now = null)
{
if (TableClient != null)
Expand Down Expand Up @@ -114,7 +154,7 @@ protected internal virtual async void OnFlushLogs(object state)

private async Task PurgePreviousEventVersions()
{
_logger.LogDebug("Purging diagnostic events with versions older than '{currentEventVersion}'.", DiagnosticEvent.CurrentEventVersion);
Logger.PurgingDiagnosticEvents(_logger, DiagnosticEvent.CurrentEventVersion);

bool tableDeleted = false;

Expand All @@ -133,7 +173,7 @@ await Utility.InvokeWithRetriesAsync(async () =>
// Delete table if it doesn't have records with EventVersion
if (string.IsNullOrEmpty(record.EventVersion) == true)
{
_logger.LogDebug("Deleting table '{tableName}' as it contains records without an EventVersion.", table.Name);
Logger.DeletingTableWithoutEventVersion(_logger, table.Name);
await table.DeleteAsync();
tableDeleted = true;
break;
Expand All @@ -142,7 +182,7 @@ await Utility.InvokeWithRetriesAsync(async () =>
// If the table does have EventVersion, query if it is an outdated version
if (string.Compare(DiagnosticEvent.CurrentEventVersion, record.EventVersion, StringComparison.Ordinal) > 0)
{
_logger.LogDebug("Deleting table '{tableName}' as it contains records with an outdated EventVersion.", table.Name);
Logger.DeletingTableWithOutdatedEventVersion(_logger, table.Name);
await table.DeleteAsync();
tableDeleted = true;
break;
Expand All @@ -154,7 +194,7 @@ await Utility.InvokeWithRetriesAsync(async () =>
}
catch (Exception ex)
{
_logger.LogError(ex, "Error occurred when attempting to purge previous diagnostic event versions.");
Logger.ErrorPurgingDiagnosticEventVersions(_logger, ex);
}
}, maxRetries: 5, retryInterval: TimeSpan.FromSeconds(5));

Expand All @@ -170,7 +210,7 @@ internal virtual async Task FlushLogs(TableClient table = null)
// TableClient is initialized lazily and it will stop the timer that schedules flush logs whenever it fails to initialize.
// We need to check if the TableClient is null before proceeding. This helps when the first time the property is accessed is as part of the FlushLogs method.
// We should not have any events stored pending to be written since WriteDiagnosticEvent will check for an initialized TableClient.
if (_environment.IsPlaceholderModeEnabled() || TableClient is null)
if (_environment.IsPlaceholderModeEnabled() || TableClient is null || !IsEnabled())
{
return;
}
Expand All @@ -186,21 +226,29 @@ internal virtual async Task FlushLogs(TableClient table = null)

if (table == null)
{
_logger.LogError("Unable to get table reference. Aborting write operation.");
StopTimer();
Logger.UnableToGetTableReference(_logger);
DisableService();
return;
}

bool tableCreated = await TableStorageHelpers.CreateIfNotExistsAsync(table, TableClient, TableCreationMaxRetryCount);
if (tableCreated)
{
_logger.LogDebug("Queueing background table purge.");
Logger.QueueingBackgroundTablePurge(_logger);
TableStorageHelpers.QueueBackgroundTablePurge(table, TableClient, TableNamePrefix, _logger);
}
}
catch (RequestFailedException ex) when (ex.Status == (int)HttpStatusCode.Forbidden)
{
// If we reach this point, we already checked for permissions on TableClient initialization. It is possible that the permissions changed after the initialization or any storage firewall/network configuration changed.
// We will log the error and disable the service.
Logger.UnableToGetTableReferenceOrCreateTable(_logger, ex);
DisableService();
Logger.ServiceDisabledUnauthorizedClient(_logger, ex);
}
catch (Exception ex)
{
_logger.LogError(ex, "Unable to get table reference or create table. Aborting write operation.");
Logger.UnableToGetTableReferenceOrCreateTable(_logger, ex);
// Clearing the memory cache to avoid memory build up.
_events.Clear();
return;
Expand Down Expand Up @@ -232,9 +280,18 @@ internal async Task ExecuteBatchAsync(ConcurrentDictionary<string, DiagnosticEve
await table.SubmitTransactionAsync(batch);
events.Clear();
}
catch (RequestFailedException ex) when (ex.Status == (int)HttpStatusCode.Forbidden)
{
// If we reach this point, we already checked for permissions on TableClient initialization.
// It is possible that the permissions changed after the initialization, any firewall/network rules were changed or it's a custom role where we don't have permissions to write entities.
// We will log the error and disable the service.
Logger.UnableToWriteDiagnosticEvents(_logger, ex);
DisableService();
Logger.ServiceDisabledUnauthorizedClient(_logger, ex);
}
catch (Exception ex)
{
_logger.LogError(ex, "Unable to write diagnostic events to table storage.");
Logger.UnableToWriteDiagnosticEvents(_logger, ex);
}
}

Expand Down Expand Up @@ -275,7 +332,7 @@ private bool IsPrimaryHost()
var primaryHostStateProvider = _serviceProvider?.GetService<IPrimaryHostStateProvider>();
if (primaryHostStateProvider is null)
{
_logger.LogDebug("PrimaryHostStateProvider is not available. Skipping the check for primary host.");
Logger.PrimaryHostStateProviderNotAvailable(_logger);
return false;
}

Expand All @@ -284,7 +341,7 @@ private bool IsPrimaryHost()

private void StopTimer()
{
_logger.LogInformation("Stopping the flush logs timer.");
Logger.StoppingFlushLogsTimer(_logger);
_flushLogsTimer?.Change(Timeout.Infinite, Timeout.Infinite);
}

Expand Down
14 changes: 14 additions & 0 deletions src/WebJobs.Script.WebHost/Helpers/TableStorageHelpers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ internal static async Task<IEnumerable<TableClient>> ListOldTablesAsync(TableCli

internal static async Task<IEnumerable<TableClient>> ListTablesAsync(TableServiceClient tableClient, string tableNamePrefix)
{
ArgumentNullException.ThrowIfNull(tableClient, nameof(tableClient));

// Azure.Data.Tables doesn't have a direct way to list tables with a prefix so we need to do it manually
var givenValue = tableNamePrefix + "{";
AsyncPageable<TableItem> tablesQuery = tableClient.QueryAsync(p => p.Name.CompareTo(tableNamePrefix) >= 0 && p.Name.CompareTo(givenValue) <= 0);
Expand All @@ -120,5 +122,17 @@ internal static async Task<bool> TableExistAsync(TableClient table, TableService

return false;
}

internal static bool TableExists(TableClient table, TableServiceClient tableClient)
{
var query = tableClient.Query(p => p.Name == table.Name);

foreach (var item in query)
{
return true;
}

return false;
}
}
}
Loading