Skip to content

Commit 71e01da

Browse files
committed
Merge branch 'release_20.01' into release_20.05
2 parents 1a8e271 + 535e4c5 commit 71e01da

File tree

3 files changed

+57
-37
lines changed

3 files changed

+57
-37
lines changed

lib/galaxy/jobs/runners/kubernetes.py

+4-21
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
Offload jobs to a Kubernetes cluster.
33
"""
44

5-
import errno
65
import logging
76
import math
87
import os
@@ -465,30 +464,14 @@ def check_watched_item(self, job_state):
465464
# there is no job responding to this job_id, it is either lost or something happened.
466465
log.error("No Jobs are available under expected selector app=%s", job_state.job_id)
467466
self.mark_as_failed(job_state)
468-
try:
469-
with open(job_state.error_file, 'w') as error_file:
470-
error_file.write("No Kubernetes Jobs are available under expected selector app=%s\n" % job_state.job_id)
471-
except EnvironmentError as e:
472-
# Python 2/3 compatible handling of FileNotFoundError
473-
if e.errno == errno.ENOENT:
474-
log.error("Job directory already cleaned up. Assuming already handled for selector app=%s", job_state.job_id)
475-
else:
476-
raise
477-
return job_state
467+
# job is no longer viable - remove from watched jobs
468+
return None
478469
else:
479470
# there is more than one job associated to the expected unique job id used as selector.
480471
log.error("More than one Kubernetes Job associated to job id '%s'", job_state.job_id)
481472
self.mark_as_failed(job_state)
482-
try:
483-
with open(job_state.error_file, 'w') as error_file:
484-
error_file.write("More than one Kubernetes Job associated with job id '%s'\n" % job_state.job_id)
485-
except EnvironmentError as e:
486-
# Python 2/3 compatible handling of FileNotFoundError
487-
if e.errno == errno.ENOENT:
488-
log.error("Job directory already cleaned up. Assuming already handled for selector app=%s", job_state.job_id)
489-
else:
490-
raise
491-
return job_state
473+
# job is no longer viable - remove from watched jobs
474+
return None
492475

493476
def _handle_job_failure(self, job, job_state):
494477
# Figure out why job has failed

packages/test.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ TEST_ENV_DIR=${TEST_ENV_DIR:-$(mktemp -d -t gxpkgtestenvXXXXXX)}
1414

1515
virtualenv -p "$TEST_PYTHON" "$TEST_ENV_DIR"
1616
. "${TEST_ENV_DIR}/bin/activate"
17-
pip install pytest
17+
pip install "pytest<6.1"
1818

1919
# ensure ordered by dependency dag
2020
PACKAGE_DIRS=(

test/integration/test_kubernetes_runner.py

+52-15
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,17 @@ def test_job_environment(self):
216216
job_env = self._run_and_get_environment_properties()
217217
assert job_env.some_env == '42'
218218

219+
@staticmethod
220+
def _wait_for_external_state(sa_session, job, expected):
221+
# Not checking the state here allows the change from queued to running to overwrite
222+
# the change from queued to deleted_new in the API thread - this is a problem because
223+
# the job will still run. See issue https://github.com/galaxyproject/galaxy/issues/4960.
224+
max_tries = 60
225+
while max_tries > 0 and job.job_runner_external_id is None or job.state != expected:
226+
sa_session.refresh(job)
227+
time.sleep(1)
228+
max_tries -= 1
229+
219230
@skip_without_tool('cat_data_and_sleep')
220231
def test_kill_process(self):
221232
with self.dataset_populator.test_history() as history_id:
@@ -234,22 +245,12 @@ def test_kill_process(self):
234245

235246
app = self._app
236247
sa_session = app.model.context.current
237-
external_id = None
238-
state = False
239-
240-
job = sa_session.query(app.model.Job).filter_by(tool_id="cat_data_and_sleep").one()
241-
# Not checking the state here allows the change from queued to running to overwrite
242-
# the change from queued to deleted_new in the API thread - this is a problem because
243-
# the job will still run. See issue https://github.com/galaxyproject/galaxy/issues/4960.
244-
max_tries = 60
245-
while max_tries > 0 and external_id is None or state != app.model.Job.states.RUNNING:
246-
sa_session.refresh(job)
247-
assert not job.finished
248-
external_id = job.job_runner_external_id
249-
state = job.state
250-
time.sleep(1)
251-
max_tries -= 1
248+
job = sa_session.query(app.model.Job).get(app.security.decode_id(job_dict["id"]))
249+
250+
self._wait_for_external_state(sa_session, job, app.model.Job.states.RUNNING)
251+
assert not job.finished
252252

253+
external_id = job.job_runner_external_id
253254
output = unicodify(subprocess.check_output(['kubectl', 'get', 'job', external_id, '-o', 'json']))
254255
status = json.loads(output)
255256
assert status['status']['active'] == 1
@@ -264,6 +265,42 @@ def test_kill_process(self):
264265
subprocess.check_output(['kubectl', 'get', 'job', external_id, '-o', 'json'], stderr=subprocess.STDOUT)
265266
assert "not found" in unicodify(excinfo.value.output)
266267

268+
@skip_without_tool('cat_data_and_sleep')
269+
def test_external_job_delete(self):
270+
with self.dataset_populator.test_history() as history_id:
271+
hda1 = self.dataset_populator.new_dataset(history_id, content="1 2 3")
272+
running_inputs = {
273+
"input1": {"src": "hda", "id": hda1["id"]},
274+
"sleep_time": 240,
275+
}
276+
running_response = self.dataset_populator.run_tool(
277+
"cat_data_and_sleep",
278+
running_inputs,
279+
history_id,
280+
assert_ok=False,
281+
)
282+
job_dict = running_response.json()["jobs"][0]
283+
284+
app = self._app
285+
sa_session = app.model.context.current
286+
job = sa_session.query(app.model.Job).get(app.security.decode_id(job_dict["id"]))
287+
288+
self._wait_for_external_state(sa_session, job, app.model.Job.states.RUNNING)
289+
290+
external_id = job.job_runner_external_id
291+
output = unicodify(subprocess.check_output(['kubectl', 'get', 'job', external_id, '-o', 'json']))
292+
status = json.loads(output)
293+
assert status['status']['active'] == 1
294+
295+
output = unicodify(subprocess.check_output(['kubectl', 'delete', 'job', external_id, '-o', 'name']))
296+
assert 'job.batch/%s' % external_id in output
297+
298+
result = self.dataset_populator.wait_for_tool_run(run_response=running_response, history_id=history_id,
299+
assert_ok=False).json()
300+
details = self.dataset_populator.get_job_details(result['jobs'][0]['id'], full=True).json()
301+
302+
assert details['state'] == app.model.Job.states.ERROR, details
303+
267304
@skip_without_tool('job_properties')
268305
def test_exit_code_127(self):
269306
inputs = {

0 commit comments

Comments (0)