Skip to content

Commit f6bef37

Browse files
authored
Merge pull request #6633 from grondo/reslog-truncate-simple
truncate resource journal at configurable size
2 parents 04c71be + 6f79323 commit f6bef37

File tree

6 files changed

+157
-18
lines changed

6 files changed

+157
-18
lines changed

doc/man5/flux-config-resource.rst

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,14 @@ rediscover
7979
(optional) If true, force rediscovery of resources using HWLOC, rather
8080
than using the R and HWLOC XML from the enclosing instance.
8181

82-
Note that updates to the resource table are ignored until the next Flux
83-
restart.
82+
journal-max
83+
(optional) An integer containing the maximum number of resource eventlog
84+
events held in the resource module for the ``resource.journal`` RPC. The
85+
default is 100,000. This value takes immediate effect on a configuration
86+
update.
87+
88+
Note that, except where noted above, updates to the resource table are
89+
ignored until the next Flux restart.
8490

8591
EXAMPLE
8692
=======

src/modules/resource/reslog.c

Lines changed: 81 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ struct reslog {
3636
struct resource_ctx *ctx;
3737
zlist_t *pending; // list of pending futures
3838
zlist_t *watchers;
39-
json_t *eventlog;
39+
zlistx_t *eventlog;
40+
int journal_max;
4041
struct flux_msglist *consumers;
4142
flux_msg_handler_t **handlers;
4243
};
@@ -196,6 +197,45 @@ int reslog_sync (struct reslog *reslog)
196197
return 0;
197198
}
198199

200+
/* Truncate resource journal if needed to reslog->journal_max
201+
*/
202+
static int reslog_truncate (struct reslog *reslog)
203+
{
204+
int rc = -1;
205+
int count;
206+
double timestamp;
207+
json_t *event = NULL;
208+
209+
if ((count = zlistx_size (reslog->eventlog)) <= reslog->journal_max)
210+
return 0;
211+
212+
/* Detach events until count is decreased to max - 1.
213+
* Save the timestamp of the last removed event for the truncate event.
214+
*/
215+
while (count-- >= reslog->journal_max) {
216+
event = zlistx_first (reslog->eventlog);
217+
if (eventlog_entry_parse (event, &timestamp, NULL, NULL) < 0) {
218+
/* Unlikely: failed to parse timestamp from first event, but
219+
* timestamp of truncate event needs to come before any other
220+
* event, so set it to a small value (not 0., because this will
221+
cause eventlog_entry_create() to use the current timestamp.)
222+
*/
223+
timestamp = 0.1;
224+
}
225+
zlistx_delete (reslog->eventlog, zlistx_cursor (reslog->eventlog));
226+
}
227+
228+
/* Push truncate event onto front of list
229+
*/
230+
if (!(event = eventlog_entry_create (timestamp, "truncate", NULL))
231+
|| !(zlistx_add_start (reslog->eventlog, event)))
232+
goto out;
233+
rc = 0;
234+
out:
235+
json_decref (event);
236+
return rc;
237+
}
238+
199239
int reslog_post_pack (struct reslog *reslog,
200240
const flux_msg_t *request,
201241
double timestamp,
@@ -217,11 +257,12 @@ int reslog_post_pack (struct reslog *reslog,
217257
va_end (ap);
218258
if (!event)
219259
return -1;
220-
if (json_array_append (reslog->eventlog, event) < 0) {
221-
json_decref (event);
260+
if (!zlistx_add_end (reslog->eventlog, event)) {
222261
errno = ENOMEM;
223262
return -1;
224263
}
264+
if (reslog_truncate (reslog) < 0)
265+
flux_log_error (h, "failed to truncate eventlog");
225266
if ((flags & EVENT_NO_COMMIT)) {
226267
if (!(f = flux_future_create (NULL, NULL)))
227268
goto error;
@@ -299,11 +340,11 @@ int reslog_add_callback (struct reslog *reslog, reslog_cb_f cb, void *arg)
299340
static bool send_backlog (struct reslog *reslog, const flux_msg_t *msg)
300341
{
301342
flux_t *h = reslog->ctx->h;
302-
size_t index;
303-
json_t *entry;
304-
json_array_foreach (reslog->eventlog, index, entry) {
343+
json_t *entry = zlistx_first (reslog->eventlog);
344+
while (entry) {
305345
if (notify_one_consumer (reslog, msg, entry) < 0)
306346
goto error;
347+
entry = zlistx_next (reslog->eventlog);
307348
}
308349
if (flux_respond_pack (h, msg, "{s:[]}", "events") < 0) // delimiter
309350
goto error;
@@ -398,34 +439,64 @@ void reslog_destroy (struct reslog *reslog)
398439
flux_msglist_destroy (reslog->consumers);
399440
}
400441
zlist_destroy (&reslog->watchers);
401-
json_decref (reslog->eventlog);
442+
zlistx_destroy (&reslog->eventlog);
402443
free (reslog);
403444
errno = saved_errno;
404445
}
405446
}
406447

407-
struct reslog *reslog_create (struct resource_ctx *ctx, json_t *eventlog)
448+
static void entry_destructor (void **item)
449+
{
450+
if (*item) {
451+
json_decref (*item);
452+
*item = NULL;
453+
}
454+
}
455+
456+
static void *entry_duplicator (const void *item)
457+
{
458+
return json_incref ((json_t *) item);
459+
}
460+
461+
void reslog_set_journal_max (struct reslog *reslog, int max)
462+
{
463+
if (reslog) {
464+
reslog->journal_max = max;
465+
if (reslog_truncate (reslog) < 0)
466+
flux_log_error (reslog->ctx->h,
467+
"resource eventlog truncation failed");
468+
}
469+
}
470+
471+
struct reslog *reslog_create (struct resource_ctx *ctx,
472+
json_t *eventlog,
473+
int journal_max)
408474
{
409475
struct reslog *reslog;
410476

411477
if (!(reslog = calloc (1, sizeof (*reslog))))
412478
return NULL;
413479
reslog->ctx = ctx;
480+
reslog->journal_max = journal_max;
414481
if (!(reslog->pending = zlist_new ())
415482
|| !(reslog->watchers = zlist_new ()))
416483
goto nomem;
417484
zlist_comparefn (reslog->watchers, watcher_compare);
418-
if (!(reslog->eventlog = json_array ()))
485+
if (!(reslog->eventlog = zlistx_new ()))
419486
goto nomem;
487+
zlistx_set_destructor (reslog->eventlog, entry_destructor);
488+
zlistx_set_duplicator (reslog->eventlog, entry_duplicator);
420489
if (eventlog) {
421490
size_t index;
422491
json_t *entry;
423492
json_array_foreach (eventlog, index, entry) {
424493
// historical resource-define events are not helpful
425494
if (match_event (entry, "resource-define"))
426495
continue;
427-
if (json_array_append (reslog->eventlog, entry) < 0)
496+
if (!zlistx_add_end (reslog->eventlog, entry))
428497
goto nomem;
498+
if (reslog_truncate (reslog) < 0)
499+
flux_log_error (ctx->h, "eventlog truncate failed");
429500
}
430501
}
431502
if (!(reslog->consumers = flux_msglist_create ()))

src/modules/resource/reslog.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,13 @@ typedef void (*reslog_cb_f)(struct reslog *reslog,
2222
json_t *context,
2323
void *arg);
2424

25-
struct reslog *reslog_create (struct resource_ctx *ctx, json_t *eventlog);
25+
struct reslog *reslog_create (struct resource_ctx *ctx,
26+
json_t *eventlog,
27+
int journal_max);
2628
void reslog_destroy (struct reslog *reslog);
2729

30+
void reslog_set_journal_max (struct reslog *reslog, int max);
31+
2832
/* Post an event to the eventlog. This function returns immediately,
2933
* and the commit to the eventlog completes asynchronously.
3034
* If 'request' is non-NULL, a success/fail response is sent upon commit

src/modules/resource/resource.c

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,11 @@
6161
* rediscover = false
6262
* Force rediscovery of local resources via hwloc. Do not fetch R or hwloc
6363
* XML from the enclosing instance.
64+
*
65+
* journal-max = 100000
66+
* Maximum number of events allowed in the resource journal before it is truncated.
6467
*/
68+
6569
static int parse_config (struct resource_ctx *ctx,
6670
const flux_conf_t *conf,
6771
struct resource_config *rconfig,
@@ -75,12 +79,13 @@ static int parse_config (struct resource_ctx *ctx,
7579
int norestrict = 0;
7680
int no_update_watch = 0;
7781
int rediscover = 0;
82+
int journal_max = 100000;
7883
json_t *o = NULL;
7984
json_t *config = NULL;
8085

8186
if (flux_conf_unpack (conf,
8287
&error,
83-
"{s?{s?s s?s s?o s?s s?b s?b s?b s?b !}}",
88+
"{s?{s?s s?s s?o s?s s?b s?b s?b s?b s?i !}}",
8489
"resource",
8590
"path", &path,
8691
"scheduling", &scheduling_path,
@@ -89,7 +94,8 @@ static int parse_config (struct resource_ctx *ctx,
8994
"norestrict", &norestrict,
9095
"noverify", &noverify,
9196
"no-update-watch", &no_update_watch,
92-
"rediscover", &rediscover) < 0) {
97+
"rediscover", &rediscover,
98+
"journal-max", &journal_max) < 0) {
9399
errprintf (errp,
94100
"error parsing [resource] configuration: %s",
95101
error.text);
@@ -146,6 +152,7 @@ static int parse_config (struct resource_ctx *ctx,
146152
}
147153
}
148154
if (rconfig) {
155+
rconfig->journal_max = journal_max;
149156
rconfig->exclude_idset = exclude;
150157
rconfig->noverify = noverify ? true : false;
151158
rconfig->norestrict = norestrict ? true : false;
@@ -171,13 +178,15 @@ static void config_reload_cb (flux_t *h,
171178
const flux_conf_t *conf;
172179
flux_error_t error;
173180
const char *errstr = NULL;
181+
struct resource_config config = {0};
174182

175183
if (flux_conf_reload_decode (msg, &conf) < 0)
176184
goto error;
177-
if (parse_config (ctx, conf, NULL, &error) < 0) {
185+
if (parse_config (ctx, conf, &config, &error) < 0) {
178186
errstr = error.text;
179187
goto error;
180188
}
189+
reslog_set_journal_max (ctx->reslog, config.journal_max);
181190
if (flux_set_conf (h, flux_conf_incref (conf)) < 0) {
182191
errstr = "error updating cached configuration";
183192
goto error;
@@ -372,7 +381,7 @@ int mod_main (flux_t *h, int argc, char **argv)
372381
*/
373382
if (upgrade_eventlog (h, &eventlog) < 0)
374383
goto error;
375-
if (!(ctx->reslog = reslog_create (ctx, eventlog)))
384+
if (!(ctx->reslog = reslog_create (ctx, eventlog, config.journal_max)))
376385
goto error;
377386
if (!(ctx->acquire = acquire_create (ctx)))
378387
goto error;

src/modules/resource/resource.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ struct resource_config {
1919
bool norestrict;
2020
bool no_update_watch;
2121
bool monitor_force_up;
22+
int journal_max;
2223
};
2324

2425
struct resource_ctx {

t/t2355-resource-journal.t

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,55 @@ test_expect_success 'last event: resource-define method=kvs' '
158158
jq -e ".name == \"resource-define\"" restartlog.3 &&
159159
jq -e ".context.method == \"kvs\"" restartlog.3
160160
'
161-
161+
test_expect_success 'ensure all ranks are undrained' '
162+
ranks=$(flux resource status -no {ranks} -s drain) &&
163+
if test -n "$ranks"; then
164+
flux resource undrain $ranks
165+
fi
166+
'
167+
test_expect_success 'capture eventlog before truncation' '
168+
flux resource eventlog -H &&
169+
flux resource eventlog -f json > eventlog.pre.out
170+
'
171+
test_expect_success 'set a journal size limit 1 less than current entries' '
172+
limit=$(($(flux resource eventlog | wc -l) - 1)) &&
173+
test_debug "echo limiting resource.journal to $limit entries" &&
174+
echo resource.journal-max=$limit | flux config load
175+
'
176+
test_expect_success 'eventlog is now truncated' '
177+
flux resource eventlog -H &&
178+
flux resource eventlog -f json > eventlog.trunc
179+
'
180+
test_expect_success 'truncated eventlog has expected number of entries' '
181+
test_debug "wc -l eventlog.trunc" &&
182+
test $(wc -l < eventlog.trunc) -eq $limit
183+
'
184+
test_expect_success '1st event is a truncate event' '
185+
head -1 eventlog.trunc > eventlog.trunc.1 &&
186+
jq -e ".name == \"truncate\"" eventlog.trunc.1
187+
'
188+
test_expect_success 'cause another event to be posted to the eventlog' '
189+
flux resource drain 0
190+
'
191+
test_expect_success '1st event is still a truncate event' '
192+
flux resource eventlog -H &&
193+
flux resource eventlog -f json > eventlog2.trunc &&
194+
head -1 eventlog2.trunc | jq -e ".name == \"truncate\""
195+
'
196+
test_expect_success 'truncated eventlog has expected number of entries' '
197+
test $(wc -l < eventlog2.trunc) -eq $limit
198+
'
199+
test_expect_success 'cause another event to be posted to the eventlog' '
200+
flux resource undrain 0
201+
'
202+
test_expect_success '1st event is still a truncate event' '
203+
flux resource eventlog -H &&
204+
flux resource eventlog -f json > eventlog3.trunc &&
205+
head -1 eventlog3.trunc | jq -e ".name == \"truncate\""
206+
'
207+
test_expect_success 'truncated eventlog has expected number of entries' '
208+
test $(wc -l < eventlog3.trunc) -eq $limit
209+
'
162210
test_expect_success 'reload the scheduler' '
163211
flux module load sched-simple
164212
'

0 commit comments

Comments
 (0)