Skip to content

Commit 6906b6e

Browse files
authored
Merge pull request flux-framework#4856 from chu11/issue4804_job_manager_checkpoint
job-manager: do not checkpoint on every queue state change
2 parents d0cb296 + 04b3054 commit 6906b6e

File tree

4 files changed

+39
-29
lines changed

4 files changed

+39
-29
lines changed

doc/man1/flux-queue.rst

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -84,7 +84,7 @@ OPTIONS
8484
Be taciturn.
8585

8686
**-a, --all**
87-
Use with *enable*, *disable*, *stop, or *start* subcommands to
87+
Use with *enable*, *disable*, *stop*, or *start* subcommands to
8888
signify intent to affect all queues, when queues are configured but
8989
*--queue* is missing.
9090

src/modules/job-manager/queue.c

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -619,8 +619,6 @@ static void queue_enable_cb (flux_t *h,
619619
if (jobq_enable (q, enable, disable_reason) < 0)
620620
goto error;
621621
}
622-
if (restart_save_state (queue->ctx) < 0)
623-
flux_log_error (h, "problem saving checkpoint after queue change");
624622
if (flux_respond (h, msg, NULL) < 0)
625623
flux_log_error (h, "error responding to job-manager.queue-enable");
626624
return;
@@ -719,8 +717,6 @@ static void queue_start_cb (flux_t *h,
719717
else
720718
queue_stop (queue, name);
721719
}
722-
if (restart_save_state (queue->ctx) < 0)
723-
flux_log_error (h, "problem saving checkpoint after queue change");
724720
if (flux_respond (h, msg, NULL) < 0)
725721
flux_log_error (h, "error responding to job-manager.queue-start");
726722
return;

t/t2219-job-manager-restart.t

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -114,7 +114,6 @@ test_expect_success 'verify that instance can restart after config change' '
114114
'
115115

116116
test_expect_success 'verify that named queue start/stop persists across restart' '
117-
rm -rf /tmp/achu/mylog &&
118117
mkdir -p conf.d &&
119118
cat >conf.d/queues.toml <<-EOT &&
120119
[queues.debug]

t/t2240-queue-cmd.t

Lines changed: 38 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -126,16 +126,16 @@ test_expect_success 'flux-queue: status reports no reason for stop' '
126126
'
127127

128128
test_expect_success HAVE_JQ 'flux-queue: stop with --nocheckpoint works' '
129-
flux queue start &&
130-
flux kvs get checkpoint.job-manager | jq -e ".queue[0].start == true" &&
131-
flux queue stop --nocheckpoint &&
132-
flux kvs get checkpoint.job-manager | jq -e ".queue[0].start == true" &&
133-
flux queue status >status3.out &&
134-
cat <<-EOT >status3.exp &&
135-
Job submission is enabled
136-
Scheduling is stopped
137-
EOT
138-
test_cmp status3.exp status3.out
129+
flux start \
130+
-o,-Scontent.dump=dump_queue_nocheckpoint1.tar \
131+
flux queue stop &&
132+
tar -xvf dump_queue_nocheckpoint1.tar &&
133+
cat checkpoint/job-manager | jq -e ".queue[0].start == false" &&
134+
flux start \
135+
-o,-Scontent.dump=dump_queue_nocheckpoint2.tar \
136+
flux queue stop --nocheckpoint &&
137+
tar -xvf dump_queue_nocheckpoint2.tar &&
138+
cat checkpoint/job-manager | jq -e ".queue[0].start == true"
139139
'
140140

141141
test_expect_success 'flux-queue: submit some jobs' '
@@ -533,19 +533,34 @@ test_expect_success 'previously submitted job run to completion' '
533533
flux jobs -n -o "{state}" $(cat job_batch2.id) | grep INACTIVE
534534
'
535535

536-
# for this test we pick one the first queue's name to stop, but we don't care
537-
# which one it is
538-
test_expect_success HAVE_JQ 'flux-queue: stop with one queue and --nocheckpoint works' '
539-
flux queue start --all &&
540-
flux kvs get checkpoint.job-manager | jq -e ".queue[0].start == true" &&
541-
flux kvs get checkpoint.job-manager | jq -e ".queue[1].start == true" &&
542-
flux kvs get checkpoint.job-manager | jq -r ".queue[0].name" > name.out &&
543-
flux queue stop -q $(cat name.out) --nocheckpoint nocheckpoint &&
544-
flux queue status >mqstatus_nocheckpoint.out &&
545-
test $(grep -c "Scheduling is started" mqstatus_nocheckpoint.out) -eq 1 &&
546-
test $(grep -c "Scheduling is stopped: nocheckpoint" mqstatus_nocheckpoint.out) -eq 1 &&
547-
flux kvs get checkpoint.job-manager | jq -e ".queue[0].start == true" &&
548-
flux kvs get checkpoint.job-manager | jq -e ".queue[1].start == true"
536+
test_expect_success HAVE_JQ 'flux-queue: stop with named queues and --nocheckpoint works' '
537+
mkdir -p conf.d &&
538+
cat >conf.d/queues.toml <<-EOT &&
539+
[queues.debug]
540+
[queues.batch]
541+
EOT
542+
cat >stopqueues.sh <<-EOT &&
543+
flux queue start --all
544+
flux queue stop --all
545+
EOT
546+
cat >stopqueuesnocheckpoint.sh <<-EOT &&
547+
flux queue start --all
548+
flux queue stop --all --nocheckpoint
549+
EOT
550+
chmod +x ./stopqueues.sh &&
551+
chmod +x ./stopqueuesnocheckpoint.sh &&
552+
flux start -o,--config-path=$(pwd)/conf.d \
553+
-o,-Scontent.dump=dump_queue_named_nocheckpoint1.tar \
554+
./stopqueues.sh &&
555+
tar -xvf dump_queue_named_nocheckpoint1.tar &&
556+
cat checkpoint/job-manager | jq -e ".queue[0].start == false" &&
557+
cat checkpoint/job-manager | jq -e ".queue[1].start == false" &&
558+
flux start -o,--config-path=$(pwd)/conf.d \
559+
-o,-Scontent.dump=dump_queue_named_nocheckpoint2.tar \
560+
./stopqueuesnocheckpoint.sh &&
561+
tar -xvf dump_queue_named_nocheckpoint2.tar &&
562+
cat checkpoint/job-manager | jq -e ".queue[0].start == true" &&
563+
cat checkpoint/job-manager | jq -e ".queue[1].start == true"
549564
'
550565

551566
test_expect_success 'flux-queue: quiet option works with one queue' '

0 commit comments

Comments
 (0)