20
20
from parsl .executors .high_throughput .errors import ManagerLost , VersionMismatch
21
21
from parsl .executors .high_throughput .manager_record import ManagerRecord
22
22
from parsl .monitoring .message_type import MessageType
23
+ from parsl .monitoring .radios import MonitoringRadioSender , ZMQRadioSender
23
24
from parsl .process_loggers import wrap_with_logs
24
25
from parsl .serialize import serialize as serialize_object
25
26
from parsl .utils import setproctitle
@@ -216,36 +217,26 @@ def task_puller(self) -> NoReturn:
216
217
task_counter += 1
217
218
logger .debug (f"Fetched { task_counter } tasks so far" )
218
219
219
- def _create_monitoring_channel (self ) -> Optional [zmq .Socket ]:
220
- if self .hub_address and self .hub_zmq_port :
221
- logger .info ("Connecting to MonitoringHub" )
222
- # This is a one-off because monitoring is unencrypted
223
- hub_channel = zmq .Context ().socket (zmq .DEALER )
224
- hub_channel .set_hwm (0 )
225
- hub_channel .connect ("tcp://{}:{}" .format (self .hub_address , self .hub_zmq_port ))
226
- logger .info ("Connected to MonitoringHub" )
227
- return hub_channel
228
- else :
229
- return None
230
-
231
- def _send_monitoring_info (self , hub_channel : Optional [zmq .Socket ], manager : ManagerRecord ) -> None :
232
- if hub_channel :
220
def _send_monitoring_info(self, monitoring_radio: Optional[MonitoringRadioSender], manager: ManagerRecord) -> None:
    """Send a snapshot of a manager record to monitoring as a NODE_INFO message.

    Nothing is sent when ``monitoring_radio`` is falsy (for example ``None``).

    Otherwise a copy of ``manager`` is taken, a ``timestamp`` entry holding
    the current local time is added to the copy, and the copy's
    ``last_heartbeat`` value is converted with
    ``datetime.datetime.fromtimestamp``. The copy is then sent through the
    radio, paired with ``MessageType.NODE_INFO``.

    :param monitoring_radio: sender used to deliver the message, or None.
    :param manager: the manager record to report.
    """
    if not monitoring_radio:
        return

    logger.info("Sending message {} to MonitoringHub".format(manager))

    # Edit a copy so that the top-level entries of the caller's record
    # are not modified by the timestamp rewriting below.
    record: Dict = cast(Dict, manager.copy())
    record['timestamp'] = datetime.datetime.now()
    record['last_heartbeat'] = datetime.datetime.fromtimestamp(record['last_heartbeat'])

    node_info = (MessageType.NODE_INFO, record)
    monitoring_radio.send(node_info)
240
229
241
230
@wrap_with_logs (target = "interchange" )
242
231
def _command_server (self ) -> NoReturn :
243
232
""" Command server to run async command to the interchange
244
233
"""
245
234
logger .debug ("Command Server Starting" )
246
235
247
- # Need to create a new ZMQ socket for command server thread
248
- hub_channel = self ._create_monitoring_channel ()
236
+ if self .hub_address is not None and self .hub_zmq_port is not None :
237
+ monitoring_radio = ZMQRadioSender (self .hub_address , self .hub_zmq_port )
238
+ else :
239
+ monitoring_radio = None
249
240
250
241
reply : Any # the type of reply depends on the command_req received (aka this needs dependent types...)
251
242
@@ -295,7 +286,7 @@ def _command_server(self) -> NoReturn:
295
286
if manager_id in self ._ready_managers :
296
287
m = self ._ready_managers [manager_id ]
297
288
m ['active' ] = False
298
- self ._send_monitoring_info (hub_channel , m )
289
+ self ._send_monitoring_info (monitoring_radio , m )
299
290
else :
300
291
logger .warning ("Worker to hold was not in ready managers list" )
301
292
@@ -330,9 +321,16 @@ def start(self) -> None:
330
321
# parent-process-inheritance problems.
331
322
signal .signal (signal .SIGTERM , signal .SIG_DFL )
332
323
333
- logger .info ("Incoming ports bound " )
324
+ logger .info ("Starting main interchange method " )
334
325
335
- hub_channel = self ._create_monitoring_channel ()
326
+ if self .hub_address is not None and self .hub_zmq_port is not None :
327
+ print ("logging inside if" )
328
+ logger .info ("BENC: inside if" )
329
+ print ("constructing radio sender" )
330
+ monitoring_radio = ZMQRadioSender (self .hub_address , self .hub_zmq_port )
331
+ logger .info ("Created monitoring radio" )
332
+ else :
333
+ monitoring_radio = None
336
334
337
335
poll_period = self .poll_period
338
336
@@ -363,10 +361,10 @@ def start(self) -> None:
363
361
while not kill_event .is_set ():
364
362
self .socks = dict (poller .poll (timeout = poll_period ))
365
363
366
- self .process_task_outgoing_incoming (interesting_managers , hub_channel , kill_event )
367
- self .process_results_incoming (interesting_managers , hub_channel )
368
- self .expire_bad_managers (interesting_managers , hub_channel )
369
- self .expire_drained_managers (interesting_managers , hub_channel )
364
+ self .process_task_outgoing_incoming (interesting_managers , monitoring_radio , kill_event )
365
+ self .process_results_incoming (interesting_managers , monitoring_radio )
366
+ self .expire_bad_managers (interesting_managers , monitoring_radio )
367
+ self .expire_drained_managers (interesting_managers , monitoring_radio )
370
368
self .process_tasks_to_send (interesting_managers )
371
369
372
370
self .zmq_context .destroy ()
@@ -377,7 +375,7 @@ def start(self) -> None:
377
375
def process_task_outgoing_incoming (
378
376
self ,
379
377
interesting_managers : Set [bytes ],
380
- hub_channel : Optional [zmq . Socket ],
378
+ monitoring_radio : Optional [MonitoringRadioSender ],
381
379
kill_event : threading .Event
382
380
) -> None :
383
381
"""Process one message from manager on the task_outgoing channel.
@@ -431,7 +429,7 @@ def process_task_outgoing_incoming(
431
429
m .update (msg ) # type: ignore[typeddict-item]
432
430
433
431
logger .info ("Registration info for manager {!r}: {}" .format (manager_id , msg ))
434
- self ._send_monitoring_info (hub_channel , m )
432
+ self ._send_monitoring_info (monitoring_radio , m )
435
433
436
434
if (msg ['python_v' ].rsplit ("." , 1 )[0 ] != self .current_platform ['python_v' ].rsplit ("." , 1 )[0 ] or
437
435
msg ['parsl_v' ] != self .current_platform ['parsl_v' ]):
@@ -462,7 +460,7 @@ def process_task_outgoing_incoming(
462
460
logger .error (f"Unexpected message type received from manager: { msg ['type' ]} " )
463
461
logger .debug ("leaving task_outgoing section" )
464
462
465
- def expire_drained_managers (self , interesting_managers : Set [bytes ], hub_channel : Optional [zmq . Socket ]) -> None :
463
+ def expire_drained_managers (self , interesting_managers : Set [bytes ], monitoring_radio : Optional [MonitoringRadioSender ]) -> None :
466
464
467
465
for manager_id in list (interesting_managers ):
468
466
# is it always true that a draining manager will be in interesting managers?
@@ -475,7 +473,7 @@ def expire_drained_managers(self, interesting_managers: Set[bytes], hub_channel:
475
473
self ._ready_managers .pop (manager_id )
476
474
477
475
m ['active' ] = False
478
- self ._send_monitoring_info (hub_channel , m )
476
+ self ._send_monitoring_info (monitoring_radio , m )
479
477
480
478
def process_tasks_to_send (self , interesting_managers : Set [bytes ]) -> None :
481
479
# Check if there are tasks that could be sent to managers
@@ -519,7 +517,7 @@ def process_tasks_to_send(self, interesting_managers: Set[bytes]) -> None:
519
517
else :
520
518
logger .debug ("either no interesting managers or no tasks, so skipping manager pass" )
521
519
522
- def process_results_incoming (self , interesting_managers : Set [bytes ], hub_channel : Optional [zmq . Socket ]) -> None :
520
+ def process_results_incoming (self , interesting_managers : Set [bytes ], monitoring_radio : Optional [MonitoringRadioSender ]) -> None :
523
521
# Receive any results and forward to client
524
522
if self .results_incoming in self .socks and self .socks [self .results_incoming ] == zmq .POLLIN :
525
523
logger .debug ("entering results_incoming section" )
@@ -539,11 +537,11 @@ def process_results_incoming(self, interesting_managers: Set[bytes], hub_channel
539
537
elif r ['type' ] == 'monitoring' :
540
538
# the monitoring code makes the assumption that no
541
539
# monitoring messages will be received if monitoring
542
- # is not configured, and that hub_channel will only
540
+ # is not configured, and that monitoring_radio will only
543
541
# be None when monitoring is not configured.
544
- assert hub_channel is not None
542
+ assert monitoring_radio is not None
545
543
546
- hub_channel . send_pyobj (r ['payload' ])
544
+ monitoring_radio . send (r ['payload' ])
547
545
elif r ['type' ] == 'heartbeat' :
548
546
logger .debug (f"Manager { manager_id !r} sent heartbeat via results connection" )
549
547
b_messages .append ((p_message , r ))
@@ -587,15 +585,15 @@ def process_results_incoming(self, interesting_managers: Set[bytes], hub_channel
587
585
interesting_managers .add (manager_id )
588
586
logger .debug ("leaving results_incoming section" )
589
587
590
- def expire_bad_managers (self , interesting_managers : Set [bytes ], hub_channel : Optional [zmq . Socket ]) -> None :
588
+ def expire_bad_managers (self , interesting_managers : Set [bytes ], monitoring_radio : Optional [MonitoringRadioSender ]) -> None :
591
589
bad_managers = [(manager_id , m ) for (manager_id , m ) in self ._ready_managers .items () if
592
590
time .time () - m ['last_heartbeat' ] > self .heartbeat_threshold ]
593
591
for (manager_id , m ) in bad_managers :
594
592
logger .debug ("Last: {} Current: {}" .format (m ['last_heartbeat' ], time .time ()))
595
593
logger .warning (f"Too many heartbeats missed for manager { manager_id !r} - removing manager" )
596
594
if m ['active' ]:
597
595
m ['active' ] = False
598
- self ._send_monitoring_info (hub_channel , m )
596
+ self ._send_monitoring_info (monitoring_radio , m )
599
597
600
598
logger .warning (f"Cancelling htex tasks { m ['tasks' ]} on removed manager" )
601
599
for tid in m ['tasks' ]:
0 commit comments