Commit 8e58878

reraise exception back to user
# Conflicts:
#   submitit/core/submission.py
1 parent 4cf1462 · commit 8e58878

File tree

7 files changed (+530, -45 lines)


submitit/core/core.py

Lines changed: 24 additions & 17 deletions
@@ -18,6 +18,8 @@
 
 from . import logger, utils
 
+log = logger.get_logger()
+
 # R as in "Result", so yes it's covariant.
 # pylint: disable=typevar-name-incorrect-variance
 R = tp.TypeVar("R", covariant=True)
@@ -128,12 +130,10 @@ def update(self) -> None:
             return
         self._num_calls += 1
         try:
-            logger.get_logger().debug(f"Call #{self.num_calls} - Command {' '.join(command)}")
+            log.debug(f"Call #{self.num_calls} - Command {' '.join(command)}")
             self._output = subprocess.check_output(command, shell=False)
         except Exception as e:
-            logger.get_logger().warning(
-                f"Call #{self.num_calls} - Bypassing sacct error {e}, status may be inaccurate."
-            )
+            log.warning(f"Call #{self.num_calls} - Bypassing sacct error {e}, status may be inaccurate.")
         else:
             self._info_dict.update(self.read_info(self._output))
         self._last_status_check = _time.time()
@@ -323,18 +323,27 @@ def exception(self) -> tp.Optional[tp.Union[utils.UncompletedJobError, utils.Fai
             return exceptions[0]
 
         try:
-            outcome, trace = self._get_outcome_and_result()
+            outcome, original_err = self._get_outcome_and_result()
+            if outcome == "success":
+                return None
         except utils.UncompletedJobError as e:
             return e
-        if outcome == "error":
+        if isinstance(original_err, str):
+            # Normally original_err is an exception, unless we failed to pickle it.
+            trace = original_err
             return utils.FailedJobError(
                 f"Job (task={self.task_id}) failed during processing with trace:\n"
                 f"----------------------\n{trace}\n"
                 "----------------------\n"
                 f"You can check full logs with 'job.stderr({self.task_id})' and 'job.stdout({self.task_id})'"
                 f"or at paths:\n - {self.paths.stderr}\n - {self.paths.stdout}"
             )
-        return None
+        log.error(
+            f"Job (task={self.task_id}) failed \n."
+            f"You can check full logs with 'job.stderr({self.task_id})' and 'job.stdout({self.task_id})'"
+            f"or at paths:\n - {self.paths.stderr}\n - {self.paths.stdout}"
+        )
+        return original_err
 
     def _get_outcome_and_result(self) -> tp.Tuple[str, tp.Any]:
         """Getter for the output of the submitted function.
@@ -371,17 +380,17 @@ def _get_outcome_and_result(self) -> tp.Tuple[str, tp.Any]:
                 f"Job {self.job_id} (task: {self.task_id}) with path {self.paths.result_pickle}",
                 f"has not produced any output (state: {self.state})",
             ]
-            log = self.stderr()
-            if log:
-                message.extend(["Error stream produced:", "-" * 40, log])
+            stderr = self.stderr()
+            if stderr:
+                message.extend(["Error stream produced:", "-" * 40, stderr])
             elif self.paths.stdout.exists():
-                log = subprocess.check_output(["tail", "-40", str(self.paths.stdout)], encoding="utf-8")
+                stderr = subprocess.check_output(["tail", "-40", str(self.paths.stdout)], encoding="utf-8")
                 message.extend(
-                    [f"No error stream produced. Look at stdout: {self.paths.stdout}", "-" * 40, log]
+                    [f"No error stream produced. Look at stdout: {self.paths.stdout}", "-" * 40, stderr]
                 )
             else:
                 message.append(f"No output/error stream produced ! Check: {self.paths.stdout}")
-            raise utils.UncompletedJobError("\n".join(message))
+            raise utils.JobResultsNotFoundError("\n".join(message))
         try:
             output: tp.Tuple[str, tp.Any] = utils.pickle_load(self.paths.result_pickle)
         except EOFError:
@@ -504,7 +513,7 @@ def __repr__(self) -> str:
         try:
             state = self.state
         except Exception as e:
-            logger.get_logger().warning(f"Bypassing state error:\n{e}")
+            log.warning(f"Bypassing state error:\n{e}")
         return f'{self.__class__.__name__}<job_id={self.job_id}, task_id={self.task_id}, state="{state}">'
 
     def __del__(self) -> None:
@@ -702,9 +711,7 @@ def batch(self, allow_implicit_submissions: bool = False) -> tp.Iterator[None]:
         try:
             yield None
         except Exception as e:
-            logger.get_logger().error(
-                'Caught error within "with executor.batch()" context, submissions are dropped.\n '
-            )
+            log.error('Caught error within "with executor.batch()" context, submissions are dropped.\n ')
             raise e
         else:
             self._submit_delayed_batch()

submitit/core/submission.py

Lines changed: 45 additions & 13 deletions
@@ -17,9 +17,11 @@
 except ImportError:
     pass
 
-from . import job_environment, utils
+from . import job_environment, tblib, utils
 from .logger import get_logger
 
+logger = get_logger()
+
 
 def process_job(folder: Union[Path, str]) -> None:
     """Loads a pickled job, runs it and pickles the output
@@ -36,7 +38,6 @@ def process_job(folder: Union[Path, str]) -> None:
     os.environ["SUBMITIT_FOLDER"] = str(folder)
     env = job_environment.JobEnvironment()
     paths = env.paths
-    logger = get_logger()
     logger.info(f"Starting with {env}")
     logger.info(f"Loading pickle: {paths.submitted_pickle}")
     wait_time = 60
@@ -53,20 +54,51 @@ def process_job(folder: Union[Path, str]) -> None:
         env = job_environment.JobEnvironment()
         env._handle_signals(paths, delayed)
         result = delayed.result()
+        logger.info("Job computed its result")
+        # if it blocks here, you have a race condition that must be solved!
+        del delayed
+    except Exception as error:
+        logger.error("Submitted job triggered an exception")
+        with utils.temporary_save_path(paths.result_pickle) as tmp_path:
+            save_error(error, tmp_path)
+        raise
+    except BaseException:
+        logger.exception("Submitted job encoutered a system error. Will result in an UncompletedJobError")
+        raise
+
+    with utils.temporary_save_path(paths.result_pickle) as tmp_path:
+        save_result(result, tmp_path)
+    # if it blocks here, you have a race condition that must be solved!
+    del result
+    logger.info("Exitting after successful completion")
+
+
+def save_result(result, tmp_path: Path):
+    try:
+        utils.cloudpickle_dump(("success", result), tmp_path)
         logger.info("Job completed successfully")
-        del delayed  # if it blocks here, you have a race condition that must be solved!
-        with utils.temporary_save_path(paths.result_pickle) as tmppath:  # save somewhere else, and move
-            utils.cloudpickle_dump(("success", result), tmppath)
-        del result
-        logger.info("Exitting after successful completion")
-    except Exception as error:  # TODO: check pickle methods for capturing traceback; pickling and raising
+    except Exception as pickle_error:
+        logger.error(f"Could not pickle job result because of {pickle_error}")
+        save_error(pickle_error, tmp_path)
+
+
+def save_error(error: Exception, tmp_path: Path) -> None:
+    """Pickle the full exception with its trace using tblib."""
+    try:
+        # tblib needs to be installed after we have created the exception class
+        # they recommend doing it just before pickling the exception.
+        # This seems to be a limitation of copyreg.
+        tblib.install(error)
+        utils.cloudpickle_dump(("error", error), tmp_path)
+    except Exception as pickle_error:
+        logger.error(f"Could not pickle exception:\n{error}\n\nbecause of {pickle_error}")
+        # Fallbacks to only pickling the trace
         try:
-            with utils.temporary_save_path(paths.result_pickle) as tmppath:
-                utils.cloudpickle_dump(("error", traceback.format_exc()), tmppath)
+            utils.cloudpickle_dump(("error", traceback.format_exc()), tmp_path)
         except Exception as dumperror:
-            logger.error(f"Could not dump error:\n{error}\n\nbecause of {dumperror}")
-        logger.error("Submitted job triggered an exception")
-        raise error
+            logger.error(f"Could not dump exception:\n{error}\n\nbecause of {dumperror}")
+            logger.error("This will trigger a JobResultsNotFoundError")
+            raise
 
 
 def submitit_main() -> None:
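
The new save_error path relies on tblib so the pickled exception keeps its traceback and can be re-raised on the caller's side. The vendored submitit tblib helper is not part of this diff, so the sketch below only approximates the idea with the standalone tblib package (assuming a version whose pickling_support.install accepts an exception instance): register pickling support on the live exception just before dumping it, then unpickle and re-raise with the original traceback attached.

# Approximate illustration with the standalone `tblib` package (pip install tblib);
# submitit ships its own tblib helper, which this sketch does not reproduce exactly.
import pickle
import traceback

from tblib import pickling_support

def compute() -> None:
    raise RuntimeError("boom")

# "Worker side": catch the error, register pickle support on it, dump it.
try:
    compute()
except Exception as error:
    pickling_support.install(error)  # done just before pickling, as in save_error
    payload = pickle.dumps(("error", error))

# "User side": unpickle and re-raise; the traceback still points into compute().
outcome, original_err = pickle.loads(payload)
try:
    raise original_err
except RuntimeError:
    traceback.print_exc()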
