Skip to content

Commit c44ca66

Browse files
committed
DPP-97 #resolve Fixed - Supervisor Shutdown Policies
Supervisors must shut down all their children before exiting themselves. Previously we shut them down in parallel, yet many of our restart modes are indicative of inter-relationships between sibling children. The addition of a `ShutdownMode` provides a way to switch to a synchronous shutdown sequence and to control the traversal (i.e., direction) used.
1 parent f496fff commit c44ca66

File tree

2 files changed

+107
-48
lines changed

2 files changed

+107
-48
lines changed

src/Control/Distributed/Process/Platform/Supervisor.hs

Lines changed: 62 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ module Control.Distributed.Process.Platform.Supervisor
239239
, RestartMode(..)
240240
, RestartOrder(..)
241241
, RestartStrategy(..)
242+
, ShutdownMode(..)
242243
, restartOne
243244
, restartAll
244245
, restartLeft
@@ -430,14 +431,21 @@ data RestartMode =
430431
instance Binary RestartMode where
431432
instance NFData RestartMode where
432433

434+
data ShutdownMode = SequentialShutdown !RestartOrder
435+
| ParallelShutdown
436+
deriving (Typeable, Generic, Show, Eq)
437+
instance Binary ShutdownMode where
438+
instance NFData ShutdownMode where
439+
433440
-- | Strategy used by a supervisor to handle child restarts, whether due to
434441
-- unexpected child failure or explicit restart requests from a client.
435442
--
436443
-- Some terminology: We refer to child processes managed by the same supervisor
437444
-- as /siblings/. When restarting a child process, the 'RestartNone' policy
438445
-- indicates that sibling processes should be left alone, whilst the 'RestartAll'
439446
-- policy will cause /all/ children to be restarted (in the same order they were
440-
-- started). ************************************************************************
447+
-- started).
448+
--
441449
-- The other two restart strategies refer to /prior/ and /subsequent/
442450
-- siblings, which describe's those children's configured position
443451
-- (i.e., insertion order). These latter modes allow one to control the order
@@ -446,18 +454,19 @@ instance NFData RestartMode where
446454
--
447455
data RestartStrategy =
448456
RestartOne
449-
{ intensity :: !RestartLimit } -- ^ restart only the failed child process
457+
{ intensity :: !RestartLimit
458+
} -- ^ restart only the failed child process
450459
| RestartAll
451-
{ intensity :: !RestartLimit
452-
, mode :: !RestartMode
460+
{ intensity :: !RestartLimit
461+
, mode :: !RestartMode
453462
} -- ^ also restart all siblings
454463
| RestartLeft
455-
{ intensity :: !RestartLimit
456-
, mode :: !RestartMode
464+
{ intensity :: !RestartLimit
465+
, mode :: !RestartMode
457466
} -- ^ restart prior siblings (i.e., prior /start order/)
458467
| RestartRight
459-
{ intensity :: !RestartLimit
460-
, mode :: !RestartMode
468+
{ intensity :: !RestartLimit
469+
, mode :: !RestartMode
461470
} -- ^ restart subsequent siblings (i.e., subsequent /start order/)
462471
deriving (Typeable, Generic, Show)
463472
instance Binary RestartStrategy where
@@ -542,7 +551,7 @@ instance NFData ChildType where
542551
data RestartPolicy =
543552
Permanent -- ^ a permanent child will always be restarted
544553
| Temporary -- ^ a temporary child will /never/ be restarted
545-
| Transient -- ^ a transient child will be restarted only if it terminates abnormally
554+
| Transient -- ^ A transient child will be restarted only if it terminates abnormally
546555
| Intrinsic -- ^ as 'Transient', but if the child exits normally, the supervisor also exits normally
547556
deriving (Typeable, Generic, Eq, Show)
548557
instance Binary RestartPolicy where
@@ -819,13 +828,14 @@ instance Logger LogSink where
819828
logMessage (LogProcess client') = logMessage client'
820829

821830
data State = State {
822-
_specs :: ChildSpecs
823-
, _active :: Map ProcessId ChildKey
824-
, _strategy :: RestartStrategy
825-
, _restartPeriod :: NominalDiffTime
826-
, _restarts :: [UTCTime]
827-
, _stats :: SupervisorStats
828-
, _logger :: LogSink
831+
_specs :: ChildSpecs
832+
, _active :: Map ProcessId ChildKey
833+
, _strategy :: RestartStrategy
834+
, _restartPeriod :: NominalDiffTime
835+
, _restarts :: [UTCTime]
836+
, _stats :: SupervisorStats
837+
, _logger :: LogSink
838+
, shutdownStrategy :: ShutdownMode
829839
}
830840

831841
--------------------------------------------------------------------------------
@@ -837,13 +847,13 @@ data State = State {
837847
--
838848
-- > start = spawnLocal . run
839849
--
840-
start :: RestartStrategy -> [ChildSpec] -> Process ProcessId
841-
start s cs = spawnLocal $ run s cs
850+
start :: RestartStrategy -> ShutdownMode -> [ChildSpec] -> Process ProcessId
851+
start rs ss cs = spawnLocal $ run rs ss cs
842852

843853
-- | Run the supplied children using the provided restart strategy.
844854
--
845-
run :: RestartStrategy -> [ChildSpec] -> Process ()
846-
run strategy' specs' = MP.pserve (strategy', specs') supInit serverDefinition
855+
run :: RestartStrategy -> ShutdownMode -> [ChildSpec] -> Process ()
856+
run rs ss specs' = MP.pserve (rs, ss, specs') supInit serverDefinition
847857

848858
--------------------------------------------------------------------------------
849859
-- Client Facing API --
@@ -935,8 +945,8 @@ shutdownAndWait sid = do
935945
-- Server Initialisation/Startup --
936946
--------------------------------------------------------------------------------
937947

938-
supInit :: InitHandler (RestartStrategy, [ChildSpec]) State
939-
supInit (strategy', specs') = do
948+
supInit :: InitHandler (RestartStrategy, ShutdownMode, [ChildSpec]) State
949+
supInit (strategy', shutdown', specs') = do
940950
logClient <- Log.client
941951
let client' = case logClient of
942952
Nothing -> LogChan
@@ -946,7 +956,7 @@ supInit (strategy', specs') = do
946956
)
947957
. (strategy ^= strategy')
948958
. (logger ^= client')
949-
$ emptyState
959+
$ emptyState shutdown'
950960
)
951961
-- TODO: should we return Ignore, as per OTP's supervisor, if no child starts?
952962
(foldlM initChild initState specs' >>= return . (flip InitOk) Infinity)
@@ -982,15 +992,16 @@ initialised state spec (Right ref) = do
982992
-- Server Definition/State --
983993
--------------------------------------------------------------------------------
984994

985-
emptyState :: State
986-
emptyState = State {
987-
_specs = Seq.empty
988-
, _active = Map.empty
989-
, _strategy = restartAll
990-
, _restartPeriod = (fromIntegral (0 :: Integer)) :: NominalDiffTime
991-
, _restarts = []
992-
, _stats = emptyStats
993-
, _logger = LogChan
995+
emptyState :: ShutdownMode -> State
996+
emptyState strat = State {
997+
_specs = Seq.empty
998+
, _active = Map.empty
999+
, _strategy = restartAll
1000+
, _restartPeriod = (fromIntegral (0 :: Integer)) :: NominalDiffTime
1001+
, _restarts = []
1002+
, _stats = emptyStats
1003+
, _logger = LogChan
1004+
, shutdownStrategy = strat
9941005
}
9951006

9961007
emptyStats :: SupervisorStats
@@ -1218,7 +1229,6 @@ handleMonitorSignal state (ProcessMonitorNotification _ pid reason) = do
12181229
--------------------------------------------------------------------------------
12191230

12201231
handleShutdown :: State -> ExitReason -> Process ()
1221-
-- TODO: stop all our children from left to right...
12221232
handleShutdown state (ExitOther reason) = terminateChildren state >> die reason
12231233
handleShutdown state _ = terminateChildren state
12241234

@@ -1260,7 +1270,7 @@ tryRestartBranch rs sp dr st = -- TODO: use DiedReason for logging...
12601270
RestartAll _ _ -> childSpecs
12611271
RestartLeft _ _ -> subTreeL
12621272
RestartRight _ _ -> subTreeR
1263-
_ -> error "IllegalState"
1273+
_ -> error "IllegalState"
12641274
proc = case mode' of
12651275
RestartEach _ -> stopStart
12661276
RestartInOrder _ -> restartL
@@ -1628,17 +1638,24 @@ filterInitFailures sup pid ex = do
16281638

16291639
terminateChildren :: State -> Process ()
16301640
terminateChildren state = do
1631-
let allChildren = toList $ state ^. specs
1632-
pids <- forM allChildren $ \ch -> do
1633-
pid <- spawnLocal $ asyncTerminate ch $ (active ^= Map.empty) state
1634-
void $ monitor pid
1635-
return pid
1636-
_ <- collectExits [] pids
1637-
-- TODO: report errs???
1638-
return ()
1641+
case (shutdownStrategy state) of
1642+
ParallelShutdown -> do
1643+
let allChildren = toList $ state ^. specs
1644+
pids <- forM allChildren $ \ch -> do
1645+
pid <- spawnLocal $ void $ syncTerminate ch $ (active ^= Map.empty) state
1646+
void $ monitor pid
1647+
return pid
1648+
void $ collectExits [] pids
1649+
-- TODO: report errs???
1650+
SequentialShutdown ord -> do
1651+
let specs' = state ^. specs
1652+
let allChildren = case ord of
1653+
RightToLeft -> Seq.reverse specs'
1654+
LeftToRight -> specs'
1655+
void $ foldlM (flip syncTerminate) state (toList allChildren)
16391656
where
1640-
asyncTerminate :: Child -> State -> Process ()
1641-
asyncTerminate (cr, cs) state' = void $ doTerminateChild cr cs state'
1657+
syncTerminate :: Child -> State -> Process State
1658+
syncTerminate (cr, cs) state' = doTerminateChild cr cs state'
16421659

16431660
collectExits :: [DiedReason]
16441661
-> [ProcessId]
@@ -1667,7 +1684,7 @@ doTerminateChild ref spec state = do
16671684
)
16681685
where
16691686
shutdownComplete :: State -> ProcessId -> DiedReason -> Process State
1670-
shutdownComplete _ _ DiedNormal = return $ updateStopped
1687+
shutdownComplete _ _ DiedNormal = return $ updateStopped
16711688
shutdownComplete state' pid (r :: DiedReason) = do
16721689
logShutdown (state' ^. logger) chKey pid r >> return state'
16731690

tests/TestSupervisor.hs

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import Control.Exception (throwIO)
1818
import Control.Distributed.Process hiding (call, monitor)
1919
import Control.Distributed.Process.Closure
2020
import Control.Distributed.Process.Node
21-
import Control.Distributed.Process.Platform hiding (__remoteTable, send)
21+
import Control.Distributed.Process.Platform hiding (__remoteTable, send, sendChan)
2222
-- import Control.Distributed.Process.Platform as Alt (monitor)
2323
import Control.Distributed.Process.Platform.Test
2424
import Control.Distributed.Process.Platform.Time
@@ -114,7 +114,7 @@ runInTestContext :: LocalNode
114114
-> Assertion
115115
runInTestContext node lock rs cs proc = do
116116
Ex.bracket (takeMVar lock) (putMVar lock) $ \() -> runProcess node $ do
117-
sup <- Supervisor.start rs cs
117+
sup <- Supervisor.start rs ParallelShutdown cs
118118
(proc sup) `finally` (exit sup ExitShutdown)
119119

120120
verifyChildWasRestarted :: ChildKey -> ProcessId -> ProcessId -> Process ()
@@ -209,6 +209,44 @@ normalStartStop sup = do
209209
shutdown sup
210210
sup `shouldExitWith` DiedNormal
211211

212+
sequentialShutdown :: TestResult (Maybe ()) -> Process ()
213+
sequentialShutdown result = do
214+
(sp, rp) <- newChan
215+
(sg, rg) <- newChan
216+
core' <- toChildStart $ runCore sp
217+
app' <- toChildStart $ runApp sg
218+
let core = (permChild core') { childRegName = Just (LocalName "core")
219+
, childStop = TerminateTimeout (Delay $ within 2 Seconds)
220+
}
221+
let app = (permChild app') { childRegName = Just (LocalName "app")
222+
, childStop = TerminateTimeout (Delay $ within 2 Seconds)
223+
}
224+
225+
sup <- Supervisor.start restartRight
226+
(SequentialShutdown RightToLeft)
227+
[core, app]
228+
229+
() <- receiveChan rg
230+
exit sup ExitShutdown
231+
res <- receiveChanTimeout (asTimeout $ seconds 2) rp
232+
233+
-- whereis "core" >>= liftIO . putStrLn . ("core :" ++) . show
234+
-- whereis "app" >>= liftIO . putStrLn . ("app :" ++) . show
235+
236+
sleepFor 1 Seconds
237+
stash result res
238+
239+
where
240+
runCore :: SendPort () -> Process ()
241+
runCore sp = (expect >>= say) `catchExit` (\_ ExitShutdown -> sendChan sp ())
242+
243+
runApp :: SendPort () -> Process ()
244+
runApp sg = do
245+
Just pid <- whereis "core"
246+
link pid -- if the real "core" exits first, we go too
247+
sendChan sg ()
248+
expect >>= say
249+
212250
configuredTemporaryChildExitsWithIgnore ::
213251
ChildStart
214252
-> (RestartStrategy -> [ChildSpec] -> (ProcessId -> Process ()) -> Assertion)
@@ -973,7 +1011,7 @@ localChildStartLinking :: TestResult Bool -> Process ()
9731011
localChildStartLinking result = do
9741012
s1 <- toChildStart procExpect
9751013
s2 <- toChildStart procLinkExpect
976-
pid <- Supervisor.start restartOne [tempWorker s1, tempWorker s2]
1014+
pid <- Supervisor.start restartOne ParallelShutdown [tempWorker s1, tempWorker s2]
9771015
[(r1, _), (r2, _)] <- listChildren pid
9781016
Just p1 <- resolve r1
9791017
Just p2 <- resolve r2
@@ -1117,6 +1155,10 @@ tests transport = do
11171155
(withSupervisor restartAll []
11181156
(withClosure restartWithoutTempChildren
11191157
$(mkStaticClosure 'blockIndefinitely)))
1158+
, testCase "Sequential Shutdown Ordering"
1159+
(delayedAssertion
1160+
"expected the shutdown order to hold"
1161+
localNode (Just ()) sequentialShutdown)
11201162
]
11211163
, testGroup "Stopping and Restarting Children"
11221164
[

0 commit comments

Comments
 (0)