Skip to content

Commit c51c0a0

Browse files
authored
Merge pull request #18541 from dsouzai/coordsampler
Coordinate Sampler Thread across a Checkpoint/Restore
2 parents 513e863 + 59db93b commit c51c0a0

File tree

4 files changed

+210
-44
lines changed

4 files changed

+210
-44
lines changed

doc/compiler/control/OptionsPostRestore.md

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -139,13 +139,14 @@ consequence of this is that if the same file name is specified then a
139139
new file will be opened. This can have the consequence of overwriting
140140
the previous file if the PID of the restored process is the same.
141141

142-
### Start and Elapsed Time
143-
144-
The Checkpoint phase is conceptually part of building the application;
145-
therefore it does not make sense to expect a user who specifies an
146-
option such as `-XsamplingExpirationTime` to take into account the time
147-
spent executing in the Checkpoint phase. Therefore, on restore, both
148-
the start and elapsed time are reset.
142+
### Start Time
143+
144+
While the checkpoint phase is conceptually part of building the
145+
application, in order to ensure consistency with parts of the compiler
146+
that memoize elapsed time, the start time is reset as though the
147+
JVM started `persistentInfo->getElapsedTime()` milliseconds ago. This
148+
will impact options such as `-XsamplingExpirationTime`. However, such
149+
an option may not make sense in the context of checkpoint/restore.
149150

150151
### `-Xrs`, `-Xtrace`, `-Xjit:disableTraps`, `-Xjit:noResumableTrapHandler`
151152

runtime/compiler/control/CompilationRuntime.hpp

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,8 @@ class CompilationInfo
374374
SAMPLE_THR_FAILED_TO_ATTACH,
375375
SAMPLE_THR_ATTACHED,
376376
SAMPLE_THR_INITIALIZED,
377+
SAMPLE_THR_SUSPENDED,
378+
SAMPLE_THR_RESUMING,
377379
SAMPLE_THR_STOPPING,
378380
SAMPLE_THR_DESTROYED,
379381
SAMPLE_THR_LAST_STATE // must be the last one
@@ -731,9 +733,6 @@ class CompilationInfo
731733
void setVMExceptionEventsHooked(bool trace) { _vmExceptionEventsHooked = trace; }
732734
bool isVMExceptionEventsHooked() { return _vmExceptionEventsHooked; }
733735

734-
bool resetStartAndElapsedTime() { return _resetStartAndElapsedTime; }
735-
void setResetStartAndElapsedTime(bool reset) { _resetStartAndElapsedTime = reset; }
736-
737736
#if defined(J9VM_OPT_JITSERVER)
738737
bool canPerformRemoteCompilationInCRIUMode() { return _canPerformRemoteCompilationInCRIUMode; }
739738
void setCanPerformRemoteCompilationInCRIUMode(bool remoteComp) { _canPerformRemoteCompilationInCRIUMode = remoteComp; }
@@ -1395,7 +1394,6 @@ class CompilationInfo
13951394
TR_CheckpointStatus _checkpointStatus;
13961395
bool _vmMethodTraceEnabled;
13971396
bool _vmExceptionEventsHooked;
1398-
bool _resetStartAndElapsedTime;
13991397
#if defined(J9VM_OPT_JITSERVER)
14001398
bool _canPerformRemoteCompilationInCRIUMode;
14011399
bool _remoteCompilationRequestedAtBootstrap;
@@ -1595,6 +1593,29 @@ class CompilationInfo
15951593
* @return false if the checkpoint is interrupted, true otherwise.
15961594
*/
15971595
bool suspendCompThreadsForCheckpoint(J9VMThread *vmThread);
1596+
1597+
/**
1598+
* @brief Suspend all JIT threads such as
1599+
* * Compilation Threads
1600+
* * Sampler Thread
1601+
*
1602+
* @param vmThread The J9VMThread
1603+
*
1604+
* @return false if the checkpoint is interrupted, true otherwise.
1605+
*/
1606+
bool suspendJITThreadsForCheckpoint(J9VMThread *vmThread);
1607+
1608+
/**
1609+
* @brief Resume all JIT threads suspended by suspendJITThreadsForCheckpoint
1610+
*
1611+
* @param vmThread The J9VMThread
1612+
*/
1613+
void resumeJITThreadsForRestore(J9VMThread *vmThread);
1614+
1615+
/**
1616+
* @brief Reset Start Time post restore
1617+
*/
1618+
void resetStartTime();
15981619
#endif
15991620
}; // CompilationInfo
16001621
}

runtime/compiler/control/CompilationThread.cpp

Lines changed: 80 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1207,8 +1207,6 @@ TR::CompilationInfo::CompilationInfo(J9JITConfig *jitConfig) :
12071207
|| J9_EVENT_IS_RESERVED(jitConfig->javaVM->hookInterface, J9HOOK_VM_EXCEPTION_THROW);
12081208
_vmExceptionEventsHooked = exceptionCatchEventHooked || exceptionThrowEventHooked;
12091209

1210-
_resetStartAndElapsedTime = false;
1211-
12121210
#if defined(J9VM_OPT_JITSERVER)
12131211
_canPerformRemoteCompilationInCRIUMode = false;
12141212
_remoteCompilationRequestedAtBootstrap = false;
@@ -2901,6 +2899,79 @@ bool TR::CompilationInfo::suspendCompThreadsForCheckpoint(J9VMThread *vmThread)
29012899
return true;
29022900
}
29032901

2902+
bool
2903+
TR::CompilationInfo::suspendJITThreadsForCheckpoint(J9VMThread *vmThread)
2904+
{
2905+
// Suspend compilation threads for checkpoint
2906+
if (!suspendCompThreadsForCheckpoint(vmThread))
2907+
return false;
2908+
2909+
// Suspend Sampler Thread
2910+
if (_jitConfig->samplerMonitor)
2911+
{
2912+
j9thread_monitor_enter(_jitConfig->samplerMonitor);
2913+
j9thread_interrupt(_jitConfig->samplerThread);
2914+
2915+
// Determine whether to wait on the CR Monitor.
2916+
//
2917+
// Note, this thread releases the sampler monitor and then
2918+
// acquires the CR monitor inside releaseCompMonitorUntilNotifiedOnCRMonitor.
2919+
while (!shouldCheckpointBeInterrupted()
2920+
&& getSamplingThreadLifetimeState() != TR::CompilationInfo::SAMPLE_THR_SUSPENDED)
2921+
{
2922+
j9thread_monitor_exit(_jitConfig->samplerMonitor);
2923+
releaseCompMonitorUntilNotifiedOnCRMonitor(vmThread);
2924+
j9thread_monitor_enter(_jitConfig->samplerMonitor);
2925+
}
2926+
2927+
j9thread_monitor_exit(_jitConfig->samplerMonitor);
2928+
}
2929+
2930+
return !shouldCheckpointBeInterrupted();
2931+
}
2932+
2933+
void
2934+
TR::CompilationInfo::resumeJITThreadsForRestore(J9VMThread *vmThread)
2935+
{
2936+
// Resume suspended Sampler Thread
2937+
if (_jitConfig->samplerMonitor)
2938+
{
2939+
j9thread_monitor_enter(_jitConfig->samplerMonitor);
2940+
setSamplingThreadLifetimeState(TR::CompilationInfo::SAMPLE_THR_RESUMING);
2941+
j9thread_monitor_notify_all(_jitConfig->samplerMonitor);
2942+
j9thread_monitor_exit(_jitConfig->samplerMonitor);
2943+
}
2944+
2945+
// Resume suspended compilation threads.
2946+
resumeCompilationThread();
2947+
}
2948+
2949+
/* Post-restore, reset the start time. While the Checkpoint phase is
2950+
* conceptually part of building the application, in order to ensure
2951+
* consistency with parts of the compiler that memoize elapsd time,
2952+
* the start time is reset to pretend like the JVM started
2953+
* persistentInfo->getElapsedTime() milliseconds ago. This will impact
2954+
* options such as -XsamplingExpirationTime. However, such an option
2955+
* may not make sense in the context of checkpoint/restore.
2956+
*/
2957+
void
2958+
TR::CompilationInfo::resetStartTime()
2959+
{
2960+
PORT_ACCESS_FROM_JAVAVM(jitConfig->javaVM);
2961+
TR::PersistentInfo *persistentInfo = getPersistentInfo();
2962+
2963+
if (TR::Options::isAnyVerboseOptionSet())
2964+
TR_VerboseLog::writeLineLocked(TR_Vlog_CHECKPOINT_RESTORE, "Start and elapsed time: startTime=%6u, elapsedTime=%6u",
2965+
(uint32_t)persistentInfo->getStartTime(), (uint32_t)persistentInfo->getElapsedTime());
2966+
2967+
uint64_t crtTime = j9time_current_time_millis() - persistentInfo->getElapsedTime();
2968+
persistentInfo->setStartTime(crtTime);
2969+
2970+
if (TR::Options::isAnyVerboseOptionSet())
2971+
TR_VerboseLog::writeLineLocked(TR_Vlog_CHECKPOINT_RESTORE, "Reset start and elapsed time: startTime=%6u, elapsedTime=%6u",
2972+
(uint32_t)persistentInfo->getStartTime(), (uint32_t)persistentInfo->getElapsedTime());
2973+
}
2974+
29042975
void TR::CompilationInfo::prepareForCheckpoint()
29052976
{
29062977
J9JavaVM *vm = _jitConfig->javaVM;
@@ -2933,8 +3004,8 @@ void TR::CompilationInfo::prepareForCheckpoint()
29333004
if (!compileMethodsForCheckpoint(vmThread))
29343005
return;
29353006

2936-
// Suspend compilation threads for checkpoint
2937-
if (!suspendCompThreadsForCheckpoint(vmThread))
3007+
// Suspend JIT threads for checkpoint
3008+
if (!suspendJITThreadsForCheckpoint(vmThread))
29383009
return;
29393010

29403011
#if defined(J9VM_OPT_JITSERVER)
@@ -2967,9 +3038,6 @@ void TR::CompilationInfo::prepareForRestore()
29673038
if (TR::Options::getCmdLineOptions()->getVerboseOption(TR_VerboseCheckpointRestore))
29683039
TR_VerboseLog::writeLineLocked(TR_Vlog_CHECKPOINT_RESTORE, "Preparing for restore");
29693040

2970-
// Inform the Sampler Thread to reset the start and elapsed time it maintains
2971-
setResetStartAndElapsedTime(true);
2972-
29733041
// Process the post-restore options
29743042
J9::OptionsPostRestore::processOptionsPostRestore(vmThread, _jitConfig, this);
29753043

@@ -2981,8 +3049,11 @@ void TR::CompilationInfo::prepareForRestore()
29813049
// Reset the checkpoint in progress flag.
29823050
resetCheckpointInProgress();
29833051

2984-
// Resume suspended compilation threads.
2985-
resumeCompilationThread();
3052+
// Reset the start time.
3053+
resetStartTime();
3054+
3055+
// Resume JIT threads.
3056+
resumeJITThreadsForRestore(vmThread);
29863057
}
29873058

29883059
// Check if there is no swap memory post restore

runtime/compiler/control/HookedByTheJit.cpp

Lines changed: 97 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5928,6 +5928,84 @@ class CompilationDensity
59285928
}
59295929
}; // class CompilationDensity
59305930

5931+
#if defined(J9VM_OPT_CRIU_SUPPORT)
5932+
/**
 * Suspends the calling Sampler Thread for a checkpoint and blocks it on the
 * sampler monitor until its lifetime state leaves SAMPLE_THR_SUSPENDED.
 *
 * The decision to suspend is re-checked with the comp monitor held; if
 * shouldSuspendThreadsForCheckpoint() is no longer true, the function
 * simply releases the comp monitor and returns.
 *
 * @param samplerThread The J9VMThread of the Sampler Thread
 * @param jitConfig     The JIT config that owns the sampler monitor
 * @param compInfo      The compilation info holding the lifetime state
 *                      and the comp/CR monitors
 */
static void suspendSamplerThreadForCheckpoint(J9VMThread *samplerThread, J9JITConfig *jitConfig, TR::CompilationInfo *compInfo)
   {
   compInfo->acquireCompMonitor(samplerThread);
   if (compInfo->shouldSuspendThreadsForCheckpoint())
      {
      PORT_ACCESS_FROM_JITCONFIG(jitConfig);

      // Must acquire this with the comp monitor in hand to ensure
      // consistency with the checkpointing thread.
      j9thread_monitor_enter(jitConfig->samplerMonitor);

      // Because this thread has the comp monitor in hand, and because
      // shouldSuspendThreadsForCheckpoint() returned true, it is not possible
      // for the Sampler Thread to have any state other than SAMPLE_THR_INITIALIZED
      TR_ASSERT_FATAL(compInfo->getSamplingThreadLifetimeState() == TR::CompilationInfo::SAMPLE_THR_INITIALIZED,
                      "Sampler Thread Lifetime State %d is not SAMPLE_THR_INITIALIZED!", compInfo->getSamplingThreadLifetimeState());

      // Update the sampler thread state.
      compInfo->setSamplingThreadLifetimeState(TR::CompilationInfo::SAMPLE_THR_SUSPENDED);

      // Notify the checkpointing thread about the state change.
      //
      // Note, unlike the checkpointing thread, this thread does NOT
      // release the sampler monitor before acquiring the CR monitor.
      // This ensures that the Sampling Thread Lifetime State does not
      // change because of something like Shutdown. However, this
      // can only cause a deadlock if the checkpointing thread
      // decides to re-acquire the sampler monitor with the CR monitor
      // in hand.
      compInfo->acquireCRMonitor();
      compInfo->getCRMonitor()->notifyAll();
      compInfo->releaseCRMonitor();

      if (TR::Options::isAnyVerboseOptionSet())
         TR_VerboseLog::writeLineLocked(TR_Vlog_CHECKPOINT_RESTORE, "Suspending Sampler Thread for Checkpoint");

      // Release the comp monitor before suspending.
      compInfo->releaseCompMonitor(samplerThread);

      // Wait until restore, at which point the lifetime state
      // will be TR::CompilationInfo::SAMPLE_THR_RESUMING. The state is
      // re-checked after every wakeup, so a wakeup that leaves the state
      // at SAMPLE_THR_SUSPENDED just waits again.
      while (compInfo->getSamplingThreadLifetimeState() == TR::CompilationInfo::SAMPLE_THR_SUSPENDED)
         {
         j9thread_monitor_wait(jitConfig->samplerMonitor);
         }

      if (TR::Options::isAnyVerboseOptionSet())
         TR_VerboseLog::writeLineLocked(TR_Vlog_CHECKPOINT_RESTORE, "Resuming Sampler Thread from Checkpoint");

      // Release the sampler monitor before reacquiring both the
      // comp monitor and sampler monitor. This is necessary to
      // ensure consistency with the checkpointing thread.
      j9thread_monitor_exit(jitConfig->samplerMonitor);
      compInfo->acquireCompMonitor(samplerThread);
      j9thread_monitor_enter(jitConfig->samplerMonitor);

      // Ensure the sampler thread was resumed because of a restore
      // rather than something else (such as shutdown)
      if (compInfo->getSamplingThreadLifetimeState() == TR::CompilationInfo::SAMPLE_THR_RESUMING)
         {
         if (TR::Options::isAnyVerboseOptionSet())
            TR_VerboseLog::writeLineLocked(TR_Vlog_CHECKPOINT_RESTORE, "Resetting Sampling Thread Lifetime State");
         compInfo->setSamplingThreadLifetimeState(TR::CompilationInfo::SAMPLE_THR_INITIALIZED);
         }
      else
         {
         // The lifetime state is an enum, so it is printed with %d (as in
         // the assert above); %p expects a pointer argument.
         if (TR::Options::isAnyVerboseOptionSet())
            TR_VerboseLog::writeLineLocked(TR_Vlog_CHECKPOINT_RESTORE, "Sampling Thread Lifetime State is %d which is not %d!",
                                           static_cast<int>(compInfo->getSamplingThreadLifetimeState()),
                                           static_cast<int>(TR::CompilationInfo::SAMPLE_THR_RESUMING));
         }

      // Release the reacquired sampler thread monitor.
      j9thread_monitor_exit(jitConfig->samplerMonitor);
      }
   compInfo->releaseCompMonitor(samplerThread);
   }
6007+
#endif
6008+
59316009
static int32_t J9THREAD_PROC samplerThreadProc(void * entryarg)
59326010
{
59336011
J9JITConfig * jitConfig = (J9JITConfig *) entryarg;
@@ -6124,35 +6202,30 @@ static int32_t J9THREAD_PROC samplerThreadProc(void * entryarg)
61246202
while (!shutdownSamplerThread && // watch for shutdown signals
61256203
j9thread_sleep_interruptable((IDATA) samplingPeriod, 0) == 0) // Anything non-0 is an error condition so we shouldn't do the sampling //!= J9THREAD_INTERRUPTED)
61266204
{
6127-
#if defined(J9VM_OPT_CRIU_SUPPORT)
6128-
// Post-restore, reset the start and elapsed time. The Checkpoint
6129-
// phase is conceptually part of building the application; therefore
6130-
// it does not make sense to expect a user who specifies an option such
6131-
// as -XsamplingExpirationTime to take into account the time spent
6132-
// executing in the Checkpoint phase.
6133-
if (compInfo->resetStartAndElapsedTime())
6134-
{
6135-
if (TR::Options::isAnyVerboseOptionSet())
6136-
TR_VerboseLog::writeLineLocked(TR_Vlog_CHECKPOINT_RESTORE, "Start and elapsed time: startTime=%6u, elapsedTime=%6u",
6137-
(uint32_t)persistentInfo->getStartTime(), (uint32_t)persistentInfo->getElapsedTime());
6138-
6139-
persistentInfo->setStartTime(j9time_current_time_millis());
6140-
persistentInfo->setElapsedTime(0);
6141-
6142-
if (TR::Options::isAnyVerboseOptionSet())
6143-
TR_VerboseLog::writeLineLocked(TR_Vlog_CHECKPOINT_RESTORE, "Reset start and elapsed time: startTime=%6u, elapsedTime=%6u",
6144-
(uint32_t)persistentInfo->getStartTime(), (uint32_t)persistentInfo->getElapsedTime());
6145-
6146-
// Only reset the time once
6147-
compInfo->setResetStartAndElapsedTime(false);
6148-
}
6149-
#endif // #if defined(J9VM_OPT_CRIU_SUPPORT)
6150-
61516205
J9VMThread * currentThread;
61526206

61536207
persistentInfo->updateElapsedTime(samplingPeriod);
61546208
crtTime += samplingPeriod;
61556209

6210+
#if defined(J9VM_OPT_CRIU_SUPPORT)
6211+
if (vm->internalVMFunctions->isCheckpointAllowed(samplerThread)
6212+
/* It's ok to not acquire the comp monitor here. Even if at this
6213+
* point a checkpoint isn't in progress but later it is, the
6214+
* checkpoint will not complete until the sampler thread suspends
6215+
* itself. Therefore, on some iteration of the enclosing while
6216+
* loop, this condition will be true and the sampler thread will
6217+
* go through the process of suspending itself. Conversely, if
6218+
* this condition is true, but after acquiring the comp monitor
6219+
* (in the call to suspendSamplerThreadForCheckpoint) this condition
6220+
* isn't true (e.g., due to shutdown), then the sampler thread will
6221+
* not suspend itself.
6222+
*/
6223+
&& compInfo->shouldSuspendThreadsForCheckpoint())
6224+
{
6225+
suspendSamplerThreadForCheckpoint(samplerThread,jitConfig, compInfo);
6226+
}
6227+
#endif // #if defined(J9VM_OPT_CRIU_SUPPORT)
6228+
61566229
// periodic chores
61576230
// FIXME: make a constant/macro for the period, and make it 100
61586231
if (crtTime - oldSyncTime >= 100) // every 100 ms

0 commit comments

Comments
 (0)