
Commit 7adb54e

[Spot] Let the controller be aware of the failed setup and fail early (skypilot-org#1479)
* Let the controller be aware of the failed setup and fail early
* format
* Add test
* yapf
* add test yaml
* increase timeout for spot tests
* fix
* Add timeout for final spot status waiting
* yapf
* fix merge error
* get rid of autostop test for spot controller
* reorder
* fix comment
* Add failed setup status for spot
* Update sky/spot/recovery_strategy.py
  Co-authored-by: Zongheng Yang <[email protected]>
* Address comments
* format
* update and variable names
* format
* lint
* address comments
* Address comments
* fix test

Co-authored-by: Zongheng Yang <[email protected]>
1 parent e6167b9 commit 7adb54e

7 files changed: +184 -61 lines changed

sky/skylet/job_lib.py (+27 -6)

@@ -79,6 +79,8 @@ class JobStatus(enum.Enum):
     # The `job_id` has been generated, but the generated ray program has
     # not started yet. skylet can transit the state from INIT to FAILED
     # directly, if the ray program fails to start.
+    # In the 'jobs' table, the `submitted_at` column will be set to the
+    # current time when the job is first created (in the INIT state).
     INIT = 'INIT'
     # Running the user's setup script (only in effect if --detach-setup is
     # set). Our update_job_status() can temporarily (for a short period) set
@@ -90,6 +92,8 @@ class JobStatus(enum.Enum):
     # by the placement constraints.)
     PENDING = 'PENDING'
     # The job is running.
+    # In the 'jobs' table, the `start_at` column will be set to the current
+    # time when the job is first transitioned to RUNNING.
     RUNNING = 'RUNNING'
     # 3 terminal states below: once reached, they do not transition.
     # The job finished successfully.
@@ -290,12 +294,26 @@ def get_latest_job_id() -> Optional[int]:
     return job_id


-def get_job_time_payload(job_id: int, is_end: bool) -> Optional[int]:
-    field = 'end_at' if is_end else 'start_at'
+def get_job_submitted_or_ended_timestamp_payload(job_id: int,
+                                                 get_ended_time: bool) -> str:
+    """Get the job submitted/ended timestamp.
+
+    This function should only be called by the spot controller, for which it
+    is ok to use `submitted_at` instead of `start_at`, because the spot job
+    duration needs to include both setup and running time and the job will
+    not stay in the PENDING state.
+
+    The normal job duration will use `start_at` instead of `submitted_at`
+    (in `format_job_queue()`), because the job may stay in PENDING if the
+    cluster is busy.
+    """
+    field = 'end_at' if get_ended_time else 'submitted_at'
     rows = _CURSOR.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
                            (job_id,))
     for (timestamp,) in rows:
         return common_utils.encode_payload(timestamp)
+    return common_utils.encode_payload(None)


 def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
@@ -684,13 +702,16 @@ def get_job_status(cls, job_ids: Optional[List[int]] = None) -> str:
         return cls._build(code)

     @classmethod
-    def get_job_time_payload(cls,
-                             job_id: Optional[int] = None,
-                             is_end: bool = False) -> str:
+    def get_job_submitted_or_ended_timestamp_payload(
+            cls,
+            job_id: Optional[int] = None,
+            get_ended_time: bool = False) -> str:
         code = [
             f'job_id = {job_id} if {job_id} is not None '
             'else job_lib.get_latest_job_id()',
-            f'job_time = job_lib.get_job_time_payload(job_id, {is_end})',
+            'job_time = '
+            'job_lib.get_job_submitted_or_ended_timestamp_payload('
+            f'job_id, {get_ended_time})',
             'print(job_time, flush=True)',
         ]
         return cls._build(code)

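The renamed payload helper keeps the codegen pattern job_lib uses elsewhere: run a small query on the cluster head and print an encoded value that the controller parses from stdout. A minimal, self-contained sketch of that pattern follows; the in-memory 'jobs' table and the encode_payload stand-in are illustrative assumptions, not the actual SkyPilot helpers.

import json
import sqlite3
import time

# Illustrative in-memory stand-in for the skylet 'jobs' table.
_CONN = sqlite3.connect(':memory:')
_CURSOR = _CONN.cursor()
_CURSOR.execute(
    'CREATE TABLE jobs (job_id INTEGER, submitted_at FLOAT, end_at FLOAT)')
_CURSOR.execute('INSERT INTO jobs VALUES (1, ?, NULL)', (time.time(),))


def encode_payload(payload) -> str:
    # Stand-in for common_utils.encode_payload: serialize the value so it
    # survives a round trip over stdout.
    return json.dumps(payload)


def get_job_submitted_or_ended_timestamp_payload(job_id: int,
                                                 get_ended_time: bool) -> str:
    # Pick `end_at` for a finished job, otherwise `submitted_at` (not
    # `start_at`): the spot job duration should include setup time.
    field = 'end_at' if get_ended_time else 'submitted_at'
    rows = _CURSOR.execute(f'SELECT {field} FROM jobs WHERE job_id=(?)',
                           (job_id,))
    for (timestamp,) in rows:
        return encode_payload(timestamp)
    # No matching job: still return a well-formed payload so the caller can
    # tell "missing" apart from a transport error.
    return encode_payload(None)


print(get_job_submitted_or_ended_timestamp_payload(1, get_ended_time=False))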
sky/spot/controller.py (+11 -7)

@@ -64,9 +64,9 @@ def _run(self):
         logger.info(f'Started monitoring spot task {self._task_name} '
                     f'(id: {self._job_id})')
         spot_state.set_starting(self._job_id)
-        start_at = self._strategy_executor.launch()
+        job_submitted_at = self._strategy_executor.launch()

-        spot_state.set_started(self._job_id, start_time=start_at)
+        spot_state.set_started(self._job_id, start_time=job_submitted_at)
         while True:
             time.sleep(spot_utils.JOB_STATUS_CHECK_GAP_SECONDS)

@@ -120,7 +120,9 @@ def _run(self):
             if job_status is not None and not job_status.is_terminal():
                 # The multi-node job is still running, continue monitoring.
                 continue
-            elif job_status == job_lib.JobStatus.FAILED:
+            elif job_status in [
+                    job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
+            ]:
                 # The user code has probably crashed, fail immediately.
                 end_time = spot_utils.get_job_timestamp(self._backend,
                                                         self._cluster_name,
@@ -132,10 +134,12 @@ def _run(self):
                                        None,
                                        spot_job_id=self._job_id)
                 logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
-                spot_state.set_failed(
-                    self._job_id,
-                    failure_type=spot_state.SpotStatus.FAILED,
-                    end_time=end_time)
+                status_to_set = spot_state.SpotStatus.FAILED
+                if job_status == job_lib.JobStatus.FAILED_SETUP:
+                    status_to_set = spot_state.SpotStatus.FAILED_SETUP
+                spot_state.set_failed(self._job_id,
+                                      failure_type=status_to_set,
+                                      end_time=end_time)
                 break
             # Although the cluster is healthy, we fail to access the
             # job status. Try to recover the job (will not restart the

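The controller change reduces to mapping the cluster-side failure onto the spot-level status before calling spot_state.set_failed, so a broken setup script is distinguishable from a crashed user program. A hedged sketch of that mapping follows, using simplified stand-ins for job_lib.JobStatus and spot_state.SpotStatus.

import enum


class JobStatus(enum.Enum):
    # Simplified stand-in for job_lib.JobStatus.
    RUNNING = 'RUNNING'
    SUCCEEDED = 'SUCCEEDED'
    FAILED = 'FAILED'
    FAILED_SETUP = 'FAILED_SETUP'


class SpotStatus(enum.Enum):
    # Simplified stand-in for spot_state.SpotStatus.
    RUNNING = 'RUNNING'
    SUCCEEDED = 'SUCCEEDED'
    FAILED = 'FAILED'
    FAILED_SETUP = 'FAILED_SETUP'


def spot_failure_status(job_status: JobStatus) -> SpotStatus:
    # FAILED_SETUP is surfaced as its own terminal spot status; everything
    # else that reaches this branch is treated as a user-program failure.
    if job_status == JobStatus.FAILED_SETUP:
        return SpotStatus.FAILED_SETUP
    return SpotStatus.FAILED


assert spot_failure_status(JobStatus.FAILED_SETUP) is SpotStatus.FAILED_SETUP
assert spot_failure_status(JobStatus.FAILED) is SpotStatus.FAILED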
sky/spot/recovery_strategy.py (+21 -17)

@@ -80,7 +80,7 @@ def launch(self) -> Optional[float]:
         It can fail if resource is not available. Need to check the cluster
         status, after calling.

-        Returns: The job's start timestamp, or None if failed to start.
+        Returns: The job's submit timestamp, or None if failed.
         """
         if self.retry_until_up:
             return self._launch(max_retry=None)
@@ -140,8 +140,9 @@ def _launch(self, max_retry=3, raise_on_failure=True) -> Optional[float]:
             raise_on_failure: Whether to raise an exception if the launch fails.

         Returns:
-            The job's start timestamp, or None if failed to start and
-            raise_on_failure is False.
+            The job's submit timestamp, or None if failed to submit the job
+            (either provisioning fails or any error happens in job submission)
+            and raise_on_failure is False.
         """
         # TODO(zhwu): handle the failure during `preparing sky runtime`.
         retry_cnt = 0
@@ -152,8 +153,11 @@ def _launch(self, max_retry=3, raise_on_failure=True) -> Optional[float]:
             exception = None
             try:
                 usage_lib.messages.usage.set_internal()
+                # Detach setup, so that the setup failure can be detected
+                # by the controller process (job_status -> FAILED_SETUP).
                 sky.launch(self.dag,
                            cluster_name=self.cluster_name,
+                           detach_setup=True,
                            detach_run=True,
                            _is_launched_by_spot_controller=True)
                 logger.info('Spot cluster launched.')
@@ -202,7 +206,7 @@ def _launch(self, max_retry=3, raise_on_failure=True) -> Optional[float]:
                 # The cluster can be preempted before the job is launched.
                 # Break to let the retry launch kick in.
                 logger.info('The cluster is preempted before the job '
-                            'starts.')
+                            'is submitted.')
                 # TODO(zhwu): we should recover the preemption with the
                 # recovery strategy instead of the current while loop.
                 retry_launch = True
@@ -223,11 +227,11 @@ def _launch(self, max_retry=3, raise_on_failure=True) -> Optional[float]:
                 continue

             # Check the job status until it is not in initialized status
-            if status is not None and job_lib.JobStatus.PENDING < status:
+            if status is not None and status > job_lib.JobStatus.INIT:
                 try:
-                    launch_time = spot_utils.get_job_timestamp(
+                    job_submitted_at = spot_utils.get_job_timestamp(
                         self.backend, self.cluster_name, get_end_time=False)
-                    return launch_time
+                    return job_submitted_at
                 except Exception as e:  # pylint: disable=broad-except
                     # If we failed to get the job timestamp, we will retry
                     # job checking loop.
@@ -271,16 +275,16 @@ def __init__(self, cluster_name: str, backend: 'backends.Backend',
                                                     'sky.clouds.Region']] = None

     def _launch(self, max_retry=3, raise_on_failure=True) -> Optional[float]:
-        launch_time = super()._launch(max_retry, raise_on_failure)
-        if launch_time is not None:
+        job_submitted_at = super()._launch(max_retry, raise_on_failure)
+        if job_submitted_at is not None:
             # Only record the cloud/region if the launch is successful.
             handle = global_user_state.get_handle_from_cluster_name(
                 self.cluster_name)
             assert handle is not None, 'Cluster should be launched.'
             launched_resources = handle.launched_resources
             self._launched_cloud_region = (launched_resources.cloud,
                                            launched_resources.region)
-        return launch_time
+        return job_submitted_at

     def recover(self) -> float:
         # 1. Cancel the jobs and launch the cluster with the STOPPED status,
@@ -308,11 +312,11 @@ def recover(self) -> float:
                                             region=launched_region)
             task.set_resources({new_resources})
             # Not using self.launch to avoid the retry until up logic.
-            launched_time = self._launch(raise_on_failure=False)
+            job_submitted_at = self._launch(raise_on_failure=False)
             # Restore the original dag, i.e. reset the region constraint.
             task.set_resources({original_resources})
-            if launched_time is not None:
-                return launched_time
+            if job_submitted_at is not None:
+                return job_submitted_at

         # Step 2
         logger.debug('Terminating unhealthy spot cluster and '
@@ -324,9 +328,9 @@ def recover(self) -> float:
             logger.debug('Relaunch the cluster without constraining to prior '
                          'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
-            launched_time = self._launch(max_retry=self._MAX_RETRY_CNT,
-                                         raise_on_failure=False)
-            if launched_time is None:
+            job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
+                                            raise_on_failure=False)
+            if job_submitted_at is None:
                 # Failed to launch the cluster.
                 if self.retry_until_up:
                     gap_seconds = self.RETRY_INIT_GAP_SECONDS
@@ -339,4 +343,4 @@ def recover(self) -> float:
                     f'Failed to recover the spot cluster after retrying '
                     f'{self._MAX_RETRY_CNT} times.')

-        return launched_time
+        return job_submitted_at

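The subtle change in _launch is the submission gate: `status > job_lib.JobStatus.INIT` replaces `job_lib.JobStatus.PENDING < status`. With detach_setup=True, a job still in SETTING_UP already has its `submitted_at` recorded, so only INIT needs to be excluded. Below is a small sketch with an ordered stand-in enum; the numeric values and comparison method are assumptions for illustration, since the real job_lib.JobStatus defines its own ordering.

import enum
from typing import Optional


class JobStatus(enum.Enum):
    # Lifecycle order matches the comments in sky/skylet/job_lib.py above.
    INIT = 0
    SETTING_UP = 1
    PENDING = 2
    RUNNING = 3
    SUCCEEDED = 4
    FAILED = 5
    FAILED_SETUP = 6

    def __gt__(self, other: 'JobStatus') -> bool:
        # Compare by lifecycle position.
        return self.value > other.value


def job_is_submitted(status: Optional[JobStatus]) -> bool:
    # Anything past INIT (SETTING_UP, PENDING, RUNNING, or a terminal state)
    # means the job has reached the spot cluster, so its submitted timestamp
    # can be fetched. The old `PENDING < status` gate would have skipped
    # SETTING_UP and PENDING.
    return status is not None and status > JobStatus.INIT


assert job_is_submitted(JobStatus.SETTING_UP)
assert not job_is_submitted(JobStatus.INIT)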
sky/spot/spot_state.py (+58 -4)

@@ -35,8 +35,9 @@
     recovery_count INTEGER DEFAULT 0,
     job_duration FLOAT DEFAULT 0)""")

-# job_duration is the time a job actually runs before last_recover,
-# excluding the provision and recovery time.
+# job_duration is the time a job actually runs (including the
+# setup duration) before last_recover, excluding the provision
+# and recovery time.
 # If the job is not finished:
 # total_job_duration = now() - last_recovered_at + job_duration
 # If the job is not finished:
@@ -51,17 +52,66 @@


 class SpotStatus(enum.Enum):
-    """Spot job status, designed to be in serverless style"""
+    """Spot job status, designed to be in serverless style.
+
+    The SpotStatus is a higher-level status than the JobStatus.
+    Each spot job submitted to the spot cluster will have a JobStatus
+    on that spot cluster:
+        JobStatus = [INIT, SETTING_UP, PENDING, RUNNING, ...]
+    Whenever the spot cluster is preempted and recovered, the JobStatus
+    will go through the statuses above again.
+    That means during the lifetime of a spot job, its JobStatus could be
+    reset to INIT or SETTING_UP multiple times (depending on the preemptions).
+
+    However, a spot job only has one SpotStatus on the spot controller:
+        SpotStatus = [PENDING, SUBMITTED, STARTING, RUNNING, ...]
+    Mapping from JobStatus to SpotStatus:
+        INIT         -> STARTING/RECOVERING
+        SETTING_UP   -> RUNNING
+        PENDING      -> RUNNING
+        RUNNING      -> RUNNING
+        SUCCEEDED    -> SUCCEEDED
+        FAILED       -> FAILED
+        FAILED_SETUP -> FAILED_SETUP
+    Note that the JobStatus will not be stuck in PENDING, because each spot
+    cluster is dedicated to a spot job, i.e. there should always be enough
+    resources to run the job, and the job will be immediately transitioned
+    to RUNNING.
+    """
+    # PENDING: Waiting for the spot controller to have a slot to run the
+    # controller process.
+    # The submitted_at timestamp of the spot job in the 'spot' table will be
+    # set to the time when the job is first submitted by the user (set to
+    # PENDING).
     PENDING = 'PENDING'
+    # SUBMITTED: The spot controller starts the controller process.
     SUBMITTED = 'SUBMITTED'
+    # STARTING: The controller process is launching the spot cluster for
+    # the spot job.
     STARTING = 'STARTING'
+    # RUNNING: The job is submitted to the spot cluster, and is setting up
+    # or running.
+    # The start_at timestamp of the spot job in the 'spot' table will be set
+    # to the time when the job is first transitioned to RUNNING.
     RUNNING = 'RUNNING'
+    # RECOVERING: The spot cluster is preempted, and the controller process
+    # is recovering the spot cluster (relaunching/failover).
     RECOVERING = 'RECOVERING'
     # Terminal statuses
+    # SUCCEEDED: The job finished successfully.
     SUCCEEDED = 'SUCCEEDED'
+    # FAILED: The job finished with a failure from the user's program.
     FAILED = 'FAILED'
+    # FAILED_SETUP: The job finished with a failure from the user's setup
+    # script.
+    FAILED_SETUP = 'FAILED_SETUP'
+    # FAILED_NO_RESOURCE: The job finished with a failure because there is no
+    # resource available in the cloud provider(s) to launch the spot cluster.
     FAILED_NO_RESOURCE = 'FAILED_NO_RESOURCE'
+    # FAILED_CONTROLLER: The job finished with a failure because of an
+    # unexpected error in the controller process.
     FAILED_CONTROLLER = 'FAILED_CONTROLLER'
+    # CANCELLED: The job is cancelled by the user.
     CANCELLED = 'CANCELLED'

     def is_terminal(self) -> bool:
@@ -83,7 +133,10 @@ def terminal_statuses(cls) -> List['SpotStatus']:

     @classmethod
     def failure_statuses(cls) -> List['SpotStatus']:
-        return [cls.FAILED, cls.FAILED_NO_RESOURCE, cls.FAILED_CONTROLLER]
+        return [
+            cls.FAILED, cls.FAILED_SETUP, cls.FAILED_NO_RESOURCE,
+            cls.FAILED_CONTROLLER
+        ]


 _SPOT_STATUS_TO_COLOR = {
@@ -94,6 +147,7 @@ def failure_statuses(cls) -> List['SpotStatus']:
     SpotStatus.RECOVERING: colorama.Fore.CYAN,
     SpotStatus.SUCCEEDED: colorama.Fore.GREEN,
     SpotStatus.FAILED: colorama.Fore.RED,
+    SpotStatus.FAILED_SETUP: colorama.Fore.RED,
     SpotStatus.FAILED_NO_RESOURCE: colorama.Fore.RED,
     SpotStatus.FAILED_CONTROLLER: colorama.Fore.RED,
     SpotStatus.CANCELLED: colorama.Fore.YELLOW,

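The new docstring describes the JobStatus -> SpotStatus mapping in prose; restated as data it looks like the sketch below. This is illustrative only (the controller derives the spot status from control flow rather than a lookup table), and the enum member names are used as plain strings.

# Documented mapping, one row per cluster-side JobStatus.
JOB_TO_SPOT_STATUS = {
    'SETTING_UP': 'RUNNING',  # setup time counts toward the spot job runtime
    'PENDING': 'RUNNING',     # the cluster is dedicated, so PENDING is brief
    'RUNNING': 'RUNNING',
    'SUCCEEDED': 'SUCCEEDED',
    'FAILED': 'FAILED',
    'FAILED_SETUP': 'FAILED_SETUP',
}


def spot_status_for_init(is_recovery: bool) -> str:
    # INIT maps to STARTING on the first launch and to RECOVERING after a
    # preemption, so it depends on controller state rather than the table.
    return 'RECOVERING' if is_recovery else 'STARTING'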
sky/spot/spot_utils.py (+16 -7)

@@ -43,6 +43,12 @@
 _JOB_CANCELLED_MESSAGE = ('[bold cyan]Waiting for the job status to be updated.'
                           '[/] It may take a minute.')

+# The maximum time to wait for the spot job status to transition to a terminal
+# state after the job finishes. This is a safeguard to avoid the case where
+# the spot job status fails to be updated and keeps `sky spot logs` blocking
+# for a long time.
+_FINAL_SPOT_STATUS_WAIT_TIMEOUT_SECONDS = 10
+

 class UserSignal(enum.Enum):
     """The signal to be sent to the user."""
@@ -119,9 +125,9 @@ def update_spot_job_status(job_id: Optional[int] = None):

 def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
                       get_end_time: bool) -> float:
-    """Get the started/ended time of the job."""
-    code = job_lib.JobLibCodeGen.get_job_time_payload(job_id=None,
-                                                      is_end=get_end_time)
+    """Get the submitted/ended time of the job."""
+    code = job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
+        job_id=None, get_ended_time=get_end_time)
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     returncode, stdout, stderr = backend.run_on_head(handle,
                                                      code,
@@ -317,10 +323,13 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
     # The spot_status may not be in terminal status yet, since the controller
     # has not updated the spot state yet. We wait for a while, until the spot
     # state is updated.
+    wait_seconds = 0
     spot_status = spot_state.get_status(job_id)
     assert spot_status is not None, job_id
-    while not spot_status.is_terminal() and follow:
+    while (not spot_status.is_terminal() and follow and
+           wait_seconds < _FINAL_SPOT_STATUS_WAIT_TIMEOUT_SECONDS):
         time.sleep(1)
+        wait_seconds += 1
         spot_status = spot_state.get_status(job_id)
         assert spot_status is not None, job_id

@@ -351,12 +360,12 @@ def dump_spot_job_queue() -> str:
         if end_at is None:
             end_at = time.time()

-        job_start_at = job['last_recovered_at'] - job['job_duration']
+        job_submitted_at = job['last_recovered_at'] - job['job_duration']
         if job['status'] == spot_state.SpotStatus.RECOVERING:
             # When job is recovering, the duration is exactly job['job_duration']
             job_duration = job['job_duration']
-        elif job_start_at > 0:
-            job_duration = end_at - job_start_at
+        elif job_submitted_at > 0:
+            job_duration = end_at - job_submitted_at
         else:
             # When job_start_at <= 0, that means the last_recovered_at is not
             # set yet, i.e. the job is not started.

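Two pieces of the spot_utils change are easy to miss: the log-streaming wait is now bounded by _FINAL_SPOT_STATUS_WAIT_TIMEOUT_SECONDS, and the queue dump derives a setup-inclusive duration from last_recovered_at - job_duration. A sketch of both follows, where get_status and the job dict are stand-ins for the real spot_state API.

import time

_FINAL_SPOT_STATUS_WAIT_TIMEOUT_SECONDS = 10


def wait_for_terminal_status(job_id, get_status, follow=True):
    # Poll for at most the timeout so `sky spot logs` cannot block forever
    # when the controller never flips the job to a terminal state.
    wait_seconds = 0
    status = get_status(job_id)
    while (not status.is_terminal() and follow and
           wait_seconds < _FINAL_SPOT_STATUS_WAIT_TIMEOUT_SECONDS):
        time.sleep(1)
        wait_seconds += 1
        status = get_status(job_id)
    return status


def spot_job_duration(job, now=None):
    # Mirrors dump_spot_job_queue(): while RECOVERING only the accumulated
    # job_duration counts; otherwise measure from the setup-inclusive
    # submission point implied by last_recovered_at - job_duration.
    end_at = job['end_at'] if job['end_at'] is not None else (now or time.time())
    job_submitted_at = job['last_recovered_at'] - job['job_duration']
    if job['status'] == 'RECOVERING':
        return job['job_duration']
    if job_submitted_at > 0:
        return end_at - job_submitted_at
    return 0.0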