Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Configure the wait time for checkpoint safety #18489

Merged
merged 5 commits into from
Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions runtime/oti/j9nonbuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -4229,6 +4229,7 @@ typedef struct J9CRIUCheckpointState {
I_64 checkpointRestoreTimeDelta;
I_64 lastRestoreTimeMillis;
UDATA maxRetryForNotCheckpointSafe;
UDATA sleepMillisecondsForNotCheckpointSafe;
jclass criuJVMCheckpointExceptionClass;
jclass criuSystemCheckpointExceptionClass;
jclass criuJVMRestoreExceptionClass;
Expand Down
2 changes: 2 additions & 0 deletions runtime/oti/jvminit.h
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,8 @@ enum INIT_STAGE {
#define VMOPT_XSHARECLASSES_DISABLEONRESTORE "-Xshareclasses:disableOnRestore"
#define VMOPT_XXENABLETHROWONDELAYECHECKPOINTOPERATION "-XX:+ThrowOnDelayedCheckpointOperation"
#define VMOPT_XXDISABLETHROWONDELAYECHECKPOINTOPERATION "-XX:-ThrowOnDelayedCheckpointOperation"
#define VMOPT_XXMAXRETRYFORNOTCHECKPOINTSAFE_EQUALS "-XX:maxRetryForNotCheckpointSafe="
#define VMOPT_XXSLEEPMILLISECONDSFORNOTCHECKPOINTSAFE_EQUALS "-XX:sleepMillisecondsForNotCheckpointSafe="
#endif /* defined(J9VM_OPT_CRIU_SUPPORT) */

/* Compatibility options. */
Expand Down
3 changes: 2 additions & 1 deletion runtime/vm/CRIUHelpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1498,6 +1498,7 @@ criuCheckpointJVMImpl(JNIEnv *env,
UDATA success = 0;
bool safePoint = J9_ARE_ANY_BITS_SET(vm->extendedRuntimeFlags, J9_EXTENDED_RUNTIME_OSR_SAFE_POINT);
UDATA maxRetries = vm->checkpointState.maxRetryForNotCheckpointSafe;
UDATA sleepMilliseconds = vm->checkpointState.sleepMillisecondsForNotCheckpointSafe;
BOOLEAN syslogFlagNone = TRUE;
char *syslogOptions = NULL;
I_32 syslogBufferSize = 0;
Expand Down Expand Up @@ -1652,7 +1653,7 @@ criuCheckpointJVMImpl(JNIEnv *env,
for (UDATA i = 0; (0 != notSafeToCheckpoint) && (i <= maxRetries); i++) {
releaseSafeOrExcusiveVMAccess(currentThread, vmFuncs, safePoint);
vmFuncs->internalExitVMToJNI(currentThread);
omrthread_nanosleep(10000);
omrthread_sleep(sleepMilliseconds);
vmFuncs->internalEnterVMFromJNI(currentThread);
acquireSafeOrExcusiveVMAccess(currentThread, vmFuncs, safePoint);
notSafeToCheckpoint = checkIfSafeToCheckpoint(currentThread);
Expand Down
33 changes: 31 additions & 2 deletions runtime/vm/jvminit.c
Original file line number Diff line number Diff line change
Expand Up @@ -2760,6 +2760,37 @@ VMInitStages(J9JavaVM *vm, IDATA stage, void* reserved)
argIndex2 = FIND_NEXT_ARG_IN_VMARGS_FORWARD(STARTSWITH_MATCH, VMOPT_XXGLOBALLOCKRESERVATIONCOLON, NULL, argIndex2);
}

#if defined(J9VM_OPT_CRIU_SUPPORT)
vm->checkpointState.maxRetryForNotCheckpointSafe = 100;
if ((argIndex = FIND_AND_CONSUME_VMARG(STARTSWITH_MATCH, VMOPT_XXMAXRETRYFORNOTCHECKPOINTSAFE_EQUALS, NULL)) >= 0) {
UDATA maxRetryForNotCheckpointSafe = 0;
char *optname = VMOPT_XXMAXRETRYFORNOTCHECKPOINTSAFE_EQUALS;
parseError = GET_INTEGER_VALUE(argIndex, optname, maxRetryForNotCheckpointSafe);
if (OPTION_OK != parseError) {
parseErrorOption = VMOPT_XXMAXRETRYFORNOTCHECKPOINTSAFE_EQUALS;
goto _memParseError;
}
vm->checkpointState.maxRetryForNotCheckpointSafe = maxRetryForNotCheckpointSafe;
}

vm->checkpointState.sleepMillisecondsForNotCheckpointSafe = 10;
if ((argIndex = FIND_AND_CONSUME_VMARG(STARTSWITH_MATCH, VMOPT_XXSLEEPMILLISECONDSFORNOTCHECKPOINTSAFE_EQUALS, NULL)) >= 0) {
UDATA sleepMillisecondsForNotCheckpointSafe = 0;
char *optname = VMOPT_XXSLEEPMILLISECONDSFORNOTCHECKPOINTSAFE_EQUALS;
parseError = GET_INTEGER_VALUE(argIndex, optname, sleepMillisecondsForNotCheckpointSafe);
if (OPTION_OK != parseError) {
parseErrorOption = VMOPT_XXSLEEPMILLISECONDSFORNOTCHECKPOINTSAFE_EQUALS;
goto _memParseError;
}
if (sleepMillisecondsForNotCheckpointSafe < 1) {
parseErrorOption = VMOPT_XXSLEEPMILLISECONDSFORNOTCHECKPOINTSAFE_EQUALS;
parseError = OPTION_OUTOFRANGE;
goto _memParseError;
}
vm->checkpointState.sleepMillisecondsForNotCheckpointSafe = sleepMillisecondsForNotCheckpointSafe;
}
#endif /* defined(J9VM_OPT_CRIU_SUPPORT) */

break;

case BYTECODE_TABLE_SET:
Expand Down Expand Up @@ -3924,8 +3955,6 @@ processVMArgsFromFirstToLast(J9JavaVM * vm)
}

vm->checkpointState.lastRestoreTimeMillis = -1;
/* Its unclear if we need an option for this, so we can keep the init here for the time being */
vm->checkpointState.maxRetryForNotCheckpointSafe = 100;
#endif /* defined(J9VM_OPT_CRIU_SUPPORT) */

{
Expand Down
9 changes: 4 additions & 5 deletions test/functional/cmdLineTests/criu/criu_nonPortable.xml
Original file line number Diff line number Diff line change
Expand Up @@ -292,9 +292,8 @@
<output type="failure" caseSensitive="yes" regex="no">Could not dump the JVM processes, err=-70</output>
</test>

<!--
<test id="Create and Restore Criu Checkpoint Image once - MethodTypeDeadlockTest">
<command>bash $SCRIPPATH$ $TEST_RESROOT$ $JAVA_COMMAND$ "$JVM_OPTIONS$ -XX:+ThrowOnDelayedCheckpointOperation -Xtrace:print=j9vm.731 -fix-add-opens java.base/jdk.internal.misc=ALL-UNNAMED -fix-add-exports java.base/openj9.internal.criu=ALL-UNNAMED" $MAINCLASS_DEADLOCK_TEST$ MethodTypeDeadlockTest 1</command>
<command>bash $SCRIPPATH$ $TEST_RESROOT$ $JAVA_COMMAND$ "$JVM_OPTIONS$ -XX:+ThrowOnDelayedCheckpointOperation -XX:sleepMillisecondsForNotCheckpointSafe=20 -Xtrace:print=j9vm.731 --add-opens java.base/jdk.internal.misc=ALL-UNNAMED --add-exports java.base/openj9.internal.criu=ALL-UNNAMED" $MAINCLASS_DEADLOCK_TEST$ MethodTypeDeadlockTest 1</command>
<output type="success" caseSensitive="yes" regex="no">User requested Java dump using</output>
<output type="success" caseSensitive="yes" regex="no">TEST PASSED</output>
<output type="failure" caseSensitive="yes" regex="no">TEST FAILED</output>
Expand All @@ -303,15 +302,15 @@
<output type="required" caseSensitive="no" regex="no">Killed</output>
<output type="failure" caseSensitive="yes" regex="no">CRIU is not enabled</output>
<output type="failure" caseSensitive="yes" regex="no">Operation not permitted</output>
If CRIU can't acquire the original thread IDs, this test will fail. Nothing can be done about this failure.
<!-- If CRIU can't acquire the original thread IDs, this test will fail. Nothing can be done about this failure. -->
<output type="success" caseSensitive="yes" regex="no">Thread pid mismatch</output>
<output type="success" caseSensitive="yes" regex="no">do not match expected</output>
<output type="success" caseSensitive="yes" regex="no">Unable to create a thread:</output>
<output type="failure" caseSensitive="yes" regex="no">TEST FAILED</output>
In the past, the failure below was caused by an issue where CRIU can't be found on the PATH.
<!-- In the past, the failure below was caused by an issue where CRIU can't be found on the PATH. -->
<output type="failure" caseSensitive="yes" regex="no">Could not dump the JVM processes, err=-70</output>
</test>
-->

<test id="Create and Restore Criu Checkpoint Image once - clinitTest">
<command>bash $SCRIPPATH$ $TEST_RESROOT$ $JAVA_COMMAND$ "$JVM_OPTIONS$ -XX:+ThrowOnDelayedCheckpointOperation -Xdump:system:events=user -Xtrace:print=j9vm.731 --add-opens java.base/jdk.internal.misc=ALL-UNNAMED --add-exports java.base/openj9.internal.criu=ALL-UNNAMED" $MAINCLASS_DEADLOCK_TEST$ ClinitTest 1</command>
<output type="success" caseSensitive="yes" regex="no">User requested Java dump using</output>
Expand Down
4 changes: 4 additions & 0 deletions test/functional/cmdLineTests/criu/playlist.xml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,10 @@
<comment>https://github.com/eclipse-openj9/openj9/issues/18468</comment>
<platform>ppc64le.*</platform>
</disable>
<disable>
<comment>https://github.com/eclipse-openj9/openj9/issues/18570</comment>
<platform>s390x.*</platform>
</disable>
</disables>
<command>
TR_Options=$(Q)exclude={org/openj9/criu/TimeChangeTest.nanoTimeInt()J},dontInline={org/openj9/criu/TimeChangeTest.nanoTimeInt()J|org/openj9/criu/TimeChangeTest.nanoTimeJit()J},{org/openj9/criu/TimeChangeTest.nanoTimeJit()J}(count=1)$(Q) \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ public static void checkpointDeadlock() {

Thread t1 = new Thread(() -> {
synchronized (lock) {
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
try {
Thread.sleep(20000);
} catch (InterruptedException e) {
Expand All @@ -95,7 +95,7 @@ public static void checkpointDeadlock() {
}
});

while (testResult.lockStatus == 0) {
while (testResult.lockStatus.get() == 0) {
Thread.yield();
}

Expand Down Expand Up @@ -143,7 +143,7 @@ public static void notCheckpointSafeDeadlock() {
Thread t1 = new Thread(() -> {
Runnable run = () -> {
synchronized (lock) {
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
try {
Thread.sleep(20000);
} catch (InterruptedException e) {
Expand All @@ -160,7 +160,7 @@ public static void notCheckpointSafeDeadlock() {

CRIUSupport criuSupport = new CRIUSupport(path);

while (testResult.lockStatus == 0) {
while (testResult.lockStatus.get() == 0) {
Thread.yield();
}

Expand All @@ -187,7 +187,7 @@ public static void methodTypeDeadlockTest() {
Path path = Paths.get("cpData");
final TestResult testResult = new TestResult(true, 0);
Runnable run = () -> {
testResult.lockStatus++;
testResult.lockStatus.incrementAndGet();
for (int i = 0; i < 30; i++) {
URL[] urlArray = { A.class.getProtectionDomain().getCodeSource().getLocation() };
URLClassLoader loader = new URLClassLoader(urlArray);
Expand All @@ -203,7 +203,7 @@ public static void methodTypeDeadlockTest() {
thread.start();
}

while (testResult.lockStatus < 5) {
while (testResult.lockStatus.get() < 5) {
Thread.yield();
}

Expand Down Expand Up @@ -258,15 +258,15 @@ public static void clinitTest() {
Path path = Paths.get("cpData");

mainTestResult.testPassed = false;
mainTestResult.lockStatus = 0;
mainTestResult.lockStatus.set(0);

Thread t1 = new Thread(()->{
new ClinitDeadlock();
});

t1.start();

while (mainTestResult.lockStatus == 0) {
while (mainTestResult.lockStatus.get() == 0) {
Thread.yield();
}

Expand Down Expand Up @@ -295,15 +295,15 @@ public static void clinitTest2() {
Path path = Paths.get("cpData");

mainTestResult.testPassed = false;
mainTestResult.lockStatus = 0;
mainTestResult.lockStatus.set(0);

Thread t1 = new Thread(()->{
new ClinitDeadlock();
});

t1.start();

while (mainTestResult.lockStatus == 0) {
while (mainTestResult.lockStatus.get() == 0) {
Thread.yield();
}

Expand Down Expand Up @@ -331,7 +331,7 @@ public static void clinitTest2() {
static class ClinitDeadlock {

static {
mainTestResult.lockStatus = 1;
mainTestResult.lockStatus.set(1);
synchronized(lock) {
try {
System.out.println("Thread waiting");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ private void test(String testName) throws InterruptedException {
throw new RuntimeException("Unrecognized test name: " + testName);
}

while (testResult.lockStatus == 0) {
while (testResult.lockStatus.get() == 0) {
Thread.currentThread().yield();
}
CRIUTestUtils.checkPointJVMNoSetup(criu, CRIUTestUtils.imagePath, false);
Expand Down Expand Up @@ -138,7 +138,7 @@ public static void showMessages(String logStr, long expectedTime, boolean isMill
private void testThreadParkHelper(String testName) {
CRIUTestUtils.showThreadCurrentTime(testName + " before park()");
final long startNanoTime = System.nanoTime();
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
unsafe.park(false, nsTime5s);
final long endNanoTime = System.nanoTime();
CRIUTestUtils.showThreadCurrentTime(testName + " after park()");
Expand All @@ -160,7 +160,7 @@ private void testThreadSleepHelper(String testName) {
CRIUTestUtils.showThreadCurrentTime(testName + " before sleep()");
final long startNanoTime = System.nanoTime();
try {
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
Thread.sleep(msTime5s);
final long endNanoTime = System.nanoTime();
CRIUTestUtils.showThreadCurrentTime(testName + " after sleep()");
Expand Down Expand Up @@ -191,7 +191,7 @@ public void run() {
final long startNanoTime;
synchronized (objWait) {
startNanoTime = System.nanoTime();
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
objWait.wait();
}
final long endNanoTime = System.nanoTime();
Expand Down Expand Up @@ -220,7 +220,7 @@ private void testObjectWaitTimedHelper(String testName, long ms, int ns) {
final long startNanoTime;
synchronized (objWait) {
startNanoTime = System.nanoTime();
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
objWait.wait(ms, ns);
}
final long endNanoTime = System.nanoTime();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,45 +118,45 @@ static void TestConcurrentModePreCheckpointHookPriorities() {
criu.registerPreCheckpointHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with lower priority in CONCURRENT_MODE");
// check if it is the initial value
if (testResult.lockStatus != 0) {
if (testResult.lockStatus.get() != 0) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with lower priority in CONCURRENT_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
}
}, CRIUSupport.HookMode.CONCURRENT_MODE, USER_HOOK_MODE_PRIORITY_LOW);
criu.registerPreCheckpointHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with higher priority in CONCURRENT_MODE");
// check if it is the value set by the hook with USER_HOOK_MODE_PRIORITY_LOW in CONCURRENT_MODE
if (testResult.lockStatus != 1) {
if (testResult.lockStatus.get() != 1) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with higher priority in CONCURRENT_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 2;
testResult.lockStatus.set(2);
}
}, CRIUSupport.HookMode.CONCURRENT_MODE, USER_HOOK_MODE_PRIORITY_HIGH);
criu.registerPreCheckpointHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with lower priority in SINGLE_THREAD_MODE");
// check if it is the value set by the hook with USER_HOOK_MODE_PRIORITY_HIGH in CONCURRENT_MODE
if (testResult.lockStatus != 2) {
if (testResult.lockStatus.get() != 2) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with lower priority in SINGLE_THREAD_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 3;
testResult.lockStatus.set(3);
}
}, CRIUSupport.HookMode.SINGLE_THREAD_MODE, USER_HOOK_MODE_PRIORITY_LOW);
criu.registerPreCheckpointHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with higher priority in SINGLE_THREAD_MODE");
// check if it is the value set by the hook with USER_HOOK_MODE_PRIORITY_LOW in SINGLE_THREAD_MODE
if (testResult.lockStatus != 3) {
if (testResult.lockStatus.get() != 3) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The preCheckpointHook with higher priority in SINGLE_THREAD_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 4;
testResult.lockStatus.set(4);
}
}, CRIUSupport.HookMode.SINGLE_THREAD_MODE, USER_HOOK_MODE_PRIORITY_HIGH);

Expand Down Expand Up @@ -217,45 +217,45 @@ static void TestConcurrentModePostRestoreHookPriorities() {
criu.registerPostRestoreHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with lower priority in CONCURRENT_MODE");
// check if it is the value set by the hook with USER_HOOK_MODE_PRIORITY_LOW in CONCURRENT_MODE
if (testResult.lockStatus != 3) {
if (testResult.lockStatus.get() != 3) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with lower priority in CONCURRENT_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 4;
testResult.lockStatus.set(4);
}
}, CRIUSupport.HookMode.CONCURRENT_MODE, USER_HOOK_MODE_PRIORITY_LOW);
criu.registerPostRestoreHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with higher priority in CONCURRENT_MODE");
// check if it is the value set by the hook with USER_HOOK_MODE_PRIORITY_LOW in SINGLE_THREAD_MODE
if (testResult.lockStatus != 2) {
if (testResult.lockStatus.get() != 2) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with higher priority in CONCURRENT_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 3;
testResult.lockStatus.set(3);
}
}, CRIUSupport.HookMode.CONCURRENT_MODE, USER_HOOK_MODE_PRIORITY_HIGH);
criu.registerPostRestoreHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with lower priority in SINGLE_THREAD_MODE");
// check if it is the value set by the hook with USER_HOOK_MODE_PRIORITY_HIGH in SINGLE_THREAD_MODE
if (testResult.lockStatus != 1) {
if (testResult.lockStatus.get() != 1) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with lower priority in SINGLE_THREAD_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 2;
testResult.lockStatus.set(2);
}
}, CRIUSupport.HookMode.SINGLE_THREAD_MODE, USER_HOOK_MODE_PRIORITY_LOW);
criu.registerPostRestoreHook(() -> {
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with higher priority in SINGLE_THREAD_MODE");
// check if it is the initial value
if (testResult.lockStatus != 0) {
if (testResult.lockStatus.get() != 0) {
testResult.testPassed = false;
CRIUTestUtils.showThreadCurrentTime("The postRestoreHook with higher priority in SINGLE_THREAD_MODE failed with testResult.lockStatus = "
+ testResult.lockStatus);
+ testResult.lockStatus.get());
} else {
testResult.lockStatus = 1;
testResult.lockStatus.set(1);
}
}, CRIUSupport.HookMode.SINGLE_THREAD_MODE, USER_HOOK_MODE_PRIORITY_HIGH);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@
*******************************************************************************/
package org.openj9.criu;

import java.util.concurrent.atomic.AtomicInteger;

public class TestResult {
boolean testPassed;
volatile int lockStatus;
AtomicInteger lockStatus;

TestResult(boolean testPassed, int lockStatus) {
this.testPassed = testPassed;
this.lockStatus = lockStatus;
this.lockStatus = new AtomicInteger(lockStatus);
}
}