Skip to content

Commit f8cd9b2

Browse files
authored
Properly log errors and close container when erorrs happen inside tim… (#20704)
Part of #20703 investigation. In one of the service runs done locally (having original "getSelf" changes applied), I've noticed that error generated as result of hitting assert in ConnectionStateHandler was printed on the screen by runner, i.e. without hitting out logging system or closing container. Examining code, it's obvious that there are a number of code paths where same can happen - most of the activities related to connectivity are run in response to timers or incoming messages / packets. We do not wrap then with try/catch - any error hit there will leave system in inconsistent state, without us every noticing directly (unless someone looks at Loop logs that capture unhandled exceptions). Addressing these issues for all code paths that touch ConnectionStateHandler. More might need to be done in overall space though (for example, likely same needs to be done for all methods on IConnectionManagerFactoryArgs - you can notice that we were hit by same problem as incomingOpHandler has try/catch, but not any other method). https://dev.azure.com/fluidframework/internal/_workitems/edit/7784 tracks follow up.
1 parent aad5ebb commit f8cd9b2

File tree

4 files changed

+40
-11
lines changed

4 files changed

+40
-11
lines changed

packages/loader/container-loader/src/connectionStateHandler.ts

+18-6
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Licensed under the MIT License.
44
*/
55

6-
import { IDeltaManager } from "@fluidframework/container-definitions";
6+
import { IDeltaManager, ICriticalContainerError } from "@fluidframework/container-definitions";
77
import { ITelemetryBaseProperties } from "@fluidframework/core-interfaces";
88
import { assert, Timer } from "@fluidframework/core-utils/internal";
99
import { IAnyDriverError } from "@fluidframework/driver-definitions";
@@ -12,6 +12,7 @@ import { ITelemetryLoggerExt, type TelemetryEventCategory } from "@fluidframewor
1212
import {
1313
PerformanceEvent,
1414
loggerToMonitoringContext,
15+
normalizeError,
1516
} from "@fluidframework/telemetry-utils/internal";
1617

1718
import { CatchUpMonitor, ICatchUpMonitor } from "./catchUpMonitor.js";
@@ -48,6 +49,9 @@ export interface IConnectionStateHandlerInputs {
4849
) => void;
4950
/** Callback to note that an old local client ID is still present in the Quorum that should have left and should now be considered invalid */
5051
clientShouldHaveLeft: (clientId: string) => void;
52+
53+
/** Some critical error was hit. Container should be closed and error logged. */
54+
onCriticalError: (error: ICriticalContainerError) => void;
5155
}
5256

5357
/**
@@ -196,6 +200,10 @@ class ConnectionStateHandlerPassThrough
196200
public clientShouldHaveLeft(clientId: string) {
197201
return this.inputs.clientShouldHaveLeft(clientId);
198202
}
203+
204+
public onCriticalError(error: ICriticalContainerError) {
205+
return this.inputs.onCriticalError(error);
206+
}
199207
}
200208

201209
/**
@@ -357,11 +365,15 @@ class ConnectionStateHandler implements IConnectionStateHandler {
357365
// the max time on server after which leave op is sent.
358366
this.handler.maxClientLeaveWaitTime ?? 300000,
359367
() => {
360-
assert(
361-
this.connectionState !== ConnectionState.Connected,
362-
0x2ac /* "Connected when timeout waiting for leave from previous session fired!" */,
363-
);
364-
this.applyForConnectedState("timeout");
368+
try {
369+
assert(
370+
this.connectionState !== ConnectionState.Connected,
371+
0x2ac /* "Connected when timeout waiting for leave from previous session fired!" */,
372+
);
373+
this.applyForConnectedState("timeout");
374+
} catch (error) {
375+
this.handler.onCriticalError(normalizeError(error));
376+
}
365377
},
366378
);
367379

packages/loader/container-loader/src/container.ts

+6-2
Original file line numberDiff line numberDiff line change
@@ -372,7 +372,7 @@ export class Container
372372

373373
return PerformanceEvent.timedExecAsync(
374374
container.mc.logger,
375-
{ eventName: "Load" },
375+
{ eventName: "Load", ...loadMode },
376376
async (event) =>
377377
new Promise<Container>((resolve, reject) => {
378378
const defaultMode: IContainerLoadMode = { opsBeforeReturn: "cached" };
@@ -397,7 +397,7 @@ export class Container
397397
})
398398
.then(
399399
(props) => {
400-
event.end({ ...props, ...loadMode });
400+
event.end({ ...props });
401401
resolve(container);
402402
},
403403
(error) => {
@@ -721,6 +721,7 @@ export class Container
721721
},
722722
error,
723723
);
724+
this.close(normalizeError(error));
724725
});
725726

726727
const {
@@ -893,6 +894,9 @@ export class Container
893894
clientShouldHaveLeft: (clientId: string) => {
894895
this.clientsWhoShouldHaveLeft.add(clientId);
895896
},
897+
onCriticalError: (error: ICriticalContainerError) => {
898+
this.close(error);
899+
},
896900
},
897901
this.deltaManager,
898902
pendingLocalState?.clientId,

packages/loader/container-loader/src/deltaManager.ts

+12-3
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
* Licensed under the MIT License.
44
*/
55

6-
import { TypedEventEmitter } from "@fluid-internal/client-utils";
76
import {
87
ICriticalContainerError,
98
IDeltaManager,
@@ -47,6 +46,7 @@ import {
4746
isFluidError,
4847
normalizeError,
4948
safeRaiseEvent,
49+
EventEmitterWithErrorHandling,
5050
} from "@fluidframework/telemetry-utils/internal";
5151
import { v4 as uuid } from "uuid";
5252

@@ -149,7 +149,7 @@ function logIfFalse(
149149
* messages in order regardless of possible network conditions or timings causing out of order delivery.
150150
*/
151151
export class DeltaManager<TConnectionManager extends IConnectionManager>
152-
extends TypedEventEmitter<IDeltaManagerInternalEvents>
152+
extends EventEmitterWithErrorHandling<IDeltaManagerInternalEvents>
153153
implements
154154
IDeltaManager<ISequencedDocumentMessage, IDocumentMessage>,
155155
IEventProvider<IDeltaManagerInternalEvents>
@@ -411,7 +411,16 @@ export class DeltaManager<TConnectionManager extends IConnectionManager>
411411
private readonly _active: () => boolean,
412412
createConnectionManager: (props: IConnectionManagerFactoryArgs) => TConnectionManager,
413413
) {
414-
super();
414+
super((name, error) => {
415+
this.logger.sendErrorEvent(
416+
{
417+
eventName: "DeltaManagerEventHandlerException",
418+
name: typeof name === "string" ? name : undefined,
419+
},
420+
error,
421+
);
422+
this.close(normalizeError(error));
423+
});
415424
const props: IConnectionManagerFactoryArgs = {
416425
incomingOpHandler: (messages: ISequencedDocumentMessage[], reason: string) => {
417426
try {

packages/loader/container-loader/src/test/connectionStateHandler.spec.ts

+4
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,10 @@ describe("ConnectionStateHandler Tests", () => {
157157
connectionStateChanged: () => {},
158158
logger: createChildLogger(),
159159
clientShouldHaveLeft: (clientId: string) => {},
160+
onCriticalError: (error) => {
161+
// eslint-disable-next-line @typescript-eslint/no-throw-literal
162+
throw error;
163+
},
160164
};
161165

162166
deltaManagerForCatchingUp = new MockDeltaManagerForCatchingUp();

0 commit comments

Comments
 (0)