@@ -85,7 +85,7 @@ def optimize(dag: 'dag_lib.Dag',
85
85
# This function is effectful: mutates every node in 'dag' by setting
86
86
# node.best_resources if it is None.
87
87
dag = Optimizer ._add_dummy_source_sink_nodes (dag )
88
- optimized_dag , unused_best_plan = Optimizer ._optimize_cost (
88
+ optimized_dag , unused_best_plan = Optimizer ._optimize_objective (
89
89
dag ,
90
90
minimize_cost = minimize == OptimizeTarget .COST ,
91
91
blocked_launchable_resources = blocked_launchable_resources ,
@@ -339,6 +339,154 @@ def _optimize_by_dp(
339
339
best_resources = dp_point_backs [node ][best_resources ]
340
340
return best_plan , best_total_objective
341
341
342
@staticmethod
def _optimize_by_ilp(
    graph,
    topo_order: List[Task],
    node_to_cost_map: _TaskToCostMap,
    minimize_cost: bool = True,
) -> Tuple[Dict[Task, resources_lib.Resources], float]:
    """Optimizes a general DAG using an ILP solver.

    Notations:
        V: the set of nodes (tasks).
        E: the set of edges (dependencies).
        k: node -> [r.cost for r in node.resources].
        F: (node i, node j) -> the egress cost/time between node i and j.
        c: node -> one-hot decision vector. c[node][i] = 1 means
            the node is assigned to the i-th resource.
        e: (node i, node j) -> linearization of c[node i] x c[node j].
            e[node i][node j][a][b] = 1 means node i and node j are assigned
            to the a-th and the b-th resources, respectively.

    Objective:
        For cost optimization,
            minimize_{c} sum(c[v]^T @ k[v] for each v in V) +
                         sum(c[u]^T @ F[u][v] @ c[v] for each u, v in E)
            s.t. sum(c[v]) == 1 for each v in V
        which is equivalent (linearized) to,
            minimize_{c, e} sum(c[v]^T @ k[v] for each v in V) +
                            sum(e[u][v]^T @ F[u][v] for each u, v in E)
            s.t. sum(c[v]) == 1 for each v in V (i.e., c is one-hot)
                 sum(e[u][v]) == 1 for each u, v in E (i.e., e is one-hot)
                 e[u][v] = flatten(c[u] @ c[v]^T) for each u, v in E
        The first term of the objective indicates the execution cost
        of the task v, and the second term indicates the egress cost
        of the parent task u to the task v.

        For time optimization,
            minimize_{c} finish_time[sink_node]
            s.t. finish_time[v] >= c[v]^T @ k[v] + finish_time[u] +
                                   c[u]^T @ F[u][v] @ c[v]
                 for each u, v in E
                 sum(c[v]) == 1 for each v in V
        which is equivalent (linearized) to,
            minimize_{c, e} finish_time[sink_node]
            s.t. finish_time[v] >= c[v]^T @ k[v] + finish_time[u] +
                                   e[u][v]^T @ F[u][v]
                 for each u, v in E
                 sum(c[v]) == 1 for each v in V (i.e., c is one-hot)
                 sum(e[u][v]) == 1 for each u, v in E (i.e., e is one-hot)
                 e[u][v] = flatten(c[u] @ c[v]^T) for each u, v in E
        The first term of the constraint indicates the execution time
        of the task v, and the other two terms indicate that the task v
        starts executing no sooner than its parent tasks are finished and
        the output data from the parents has arrived at the task v.

    Args:
        graph: the DAG's graph object; only graph.edges() is used here.
        topo_order: the tasks of the DAG. The last element is treated as
            the sink node when minimizing time.
        node_to_cost_map: task -> {candidate resources: cost or time}.
        minimize_cost: if True, minimize the total cost; otherwise
            minimize the finish time of the sink node.

    Returns:
        A tuple (best_plan, best_total_objective), where best_plan maps
        each task to its chosen resources and best_total_objective is the
        optimal objective value reported by the solver.

    Raises:
        AssertionError: if the solver does not reach an optimal solution.

    This function is effectful: it sets node.best_resources on every task
    in the returned plan.
    """
    import pulp  # pylint: disable=import-outside-toplevel

    if minimize_cost:
        prob = pulp.LpProblem('Sky-Cost-Optimization', pulp.LpMinimize)
    else:
        prob = pulp.LpProblem('Sky-Runtime-Optimization', pulp.LpMinimize)

    # Prepare the constants.
    V = topo_order  # pylint: disable=invalid-name
    E = graph.edges()  # pylint: disable=invalid-name
    k = {
        node: list(resource_cost_map.values())
        for node, resource_cost_map in node_to_cost_map.items()
    }
    # F[u][v] is flattened in row-major order (index = N_v * a + b for the
    # a-th resource of u and the b-th resource of v), matching e[u][v].
    F = collections.defaultdict(dict)  # pylint: disable=invalid-name
    for u, v in E:
        F[u][v] = [
            Optimizer._egress_cost_or_time(minimize_cost, u, r_u, v, r_v)
            for r_u in node_to_cost_map[u]
            for r_v in node_to_cost_map[v]
        ]

    # Define the decision variables.
    c = {
        v: pulp.LpVariable.matrix(v.name, (range(len(k[v])),), cat='Binary')
        for v in V
    }

    e = collections.defaultdict(dict)
    for u, v in E:
        num_vars = len(c[u]) * len(c[v])
        e[u][v] = pulp.LpVariable.matrix(f'({u.name}->{v.name})',
                                         (range(num_vars),),
                                         cat='Binary')

    # Formulate the constraints.
    # 1. c[v] is a one-hot vector.
    for v in V:
        prob += pulp.lpSum(c[v]) == 1

    # 2. e[u][v] is a one-hot vector.
    for u, v in E:
        prob += pulp.lpSum(e[u][v]) == 1

    # 3. e[u][v] linearizes c[u] x c[v].
    for u, v in E:
        e_uv = e[u][v]  # 1-d one-hot vector
        N_u = len(c[u])  # pylint: disable=invalid-name
        N_v = len(c[v])  # pylint: disable=invalid-name

        # Row sums of the (N_u x N_v) matrix recover c[u].
        for row in range(N_u):
            prob += pulp.lpSum(
                e_uv[N_v * row + col] for col in range(N_v)) == c[u][row]

        # Column sums of the (N_u x N_v) matrix recover c[v].
        for col in range(N_v):
            prob += pulp.lpSum(
                e_uv[N_v * row + col] for row in range(N_u)) == c[v][col]

    # Formulate the objective.
    if minimize_cost:
        objective = 0
        for v in V:
            objective += pulp.lpDot(c[v], k[v])
        for u, v in E:
            objective += pulp.lpDot(e[u][v], F[u][v])
    else:
        # We need additional decision variables.
        finish_time = {
            v: pulp.LpVariable(f'lat({v})', lowBound=0) for v in V
        }
        for u, v in E:
            prob += finish_time[v] >= (pulp.lpDot(c[v], k[v]) +
                                       finish_time[u] +
                                       pulp.lpDot(e[u][v], F[u][v]))
        sink_node = V[-1]
        objective = finish_time[sink_node]
    prob += objective

    # Solve the optimization problem.
    prob.solve(solver=pulp.PULP_CBC_CMD(msg=False))
    # Reject every non-optimal outcome (infeasible, unbounded, not solved,
    # undefined), not only infeasibility; otherwise the plan extraction
    # below would fail with an unrelated error. AssertionError is raised
    # explicitly so that the check keeps the exception type of the
    # previous `assert` while still running under `python -O`.
    if prob.status != pulp.LpStatusOptimal:
        raise AssertionError(
            'Cannot solve the optimization problem '
            f'(solver status: {pulp.LpStatus[prob.status]})')
    best_total_objective = prob.objective.value()

    # Find the best plan for the DAG.
    # node -> best resources
    best_plan = {}
    for node, variables in c.items():
        candidates = list(node_to_cost_map[node].keys())
        values = [variable.value() for variable in variables]
        # The solver returns floats that may be only approximately 0/1
        # (e.g., 0.9999999), so take the argmax of the one-hot vector
        # instead of searching for an exact 1.
        selected = max(range(len(values)), key=values.__getitem__)
        best_resources = candidates[selected]
        node.best_resources = best_resources
        best_plan[node] = best_resources
    return best_plan, best_total_objective
489
+
342
490
@staticmethod
343
491
def _compute_total_time (
344
492
graph ,
@@ -510,7 +658,7 @@ def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates):
510
658
f'To list more details, run \' sky show-gpus { acc_name } \' .' )
511
659
512
660
@staticmethod
513
- def _optimize_cost (
661
+ def _optimize_objective (
514
662
dag : 'dag_lib.Dag' ,
515
663
minimize_cost : bool = True ,
516
664
blocked_launchable_resources : Optional [List [
@@ -540,7 +688,8 @@ def _optimize_cost(
540
688
best_plan , best_total_objective = Optimizer ._optimize_by_dp (
541
689
topo_order , node_to_cost_map , minimize_cost )
542
690
else :
543
- raise NotImplementedError ('Currently Sky only supports chain DAGs.' )
691
+ best_plan , best_total_objective = Optimizer ._optimize_by_ilp (
692
+ graph , topo_order , node_to_cost_map , minimize_cost )
544
693
545
694
if minimize_cost :
546
695
total_time = Optimizer ._compute_total_time (graph , topo_order ,
0 commit comments