Skip to content

Add button to set failed job to pending #8377

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Mar 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.unreleased.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ For upgrade instructions, please check the [migration guide](MIGRATIONS.released
[Commits](https://github.com/scalableminds/webknossos/compare/25.02.1...HEAD)

### Added
- Super users can now retry failed jobs via a new “Retry” button in the jobs list, which sets the job back to pending. [#8377](https://github.com/scalableminds/webknossos/pull/8377)

### Changed
- When using a zarr link to a wk-served data layer as another layer’s source, the user’s token is used to access the data. [#8322](https://github.com/scalableminds/webknossos/pull/8322/)
Expand Down
2 changes: 1 addition & 1 deletion MIGRATIONS.unreleased.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ User-facing changes are documented in the [changelog](CHANGELOG.released.md).


### Postgres Evolutions:

- [126-mag-real-paths.sql](conf/evolutions/126-mag-real-paths.sql)
- [127-job-retried-by-super-user.sql](conf/evolutions/127-job-retried-by-super-user.sql)
42 changes: 26 additions & 16 deletions app/controllers/JobController.scala
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import com.scalableminds.util.tools.Fox
import models.dataset.{DataStoreDAO, DatasetDAO, DatasetLayerAdditionalAxesDAO, DatasetService}
import models.job._
import models.organization.OrganizationDAO
import models.user.MultiUserDAO
import models.user.{MultiUserDAO, UserService}
import play.api.i18n.Messages
import play.api.libs.json._
import play.api.mvc.{Action, AnyContent, PlayBodyParsers}
Expand Down Expand Up @@ -50,21 +50,21 @@ object AnimationJobOptions {
implicit val jsonFormat: OFormat[AnimationJobOptions] = Json.format[AnimationJobOptions]
}

class JobController @Inject()(
jobDAO: JobDAO,
sil: Silhouette[WkEnv],
datasetDAO: DatasetDAO,
datasetService: DatasetService,
jobService: JobService,
workerService: WorkerService,
workerDAO: WorkerDAO,
datasetLayerAdditionalAxesDAO: DatasetLayerAdditionalAxesDAO,
wkconf: WkConf,
multiUserDAO: MultiUserDAO,
wkSilhouetteEnvironment: WkSilhouetteEnvironment,
slackNotificationService: SlackNotificationService,
organizationDAO: OrganizationDAO,
dataStoreDAO: DataStoreDAO)(implicit ec: ExecutionContext, playBodyParsers: PlayBodyParsers)
class JobController @Inject()(jobDAO: JobDAO,
sil: Silhouette[WkEnv],
datasetDAO: DatasetDAO,
datasetService: DatasetService,
jobService: JobService,
workerService: WorkerService,
workerDAO: WorkerDAO,
datasetLayerAdditionalAxesDAO: DatasetLayerAdditionalAxesDAO,
wkconf: WkConf,
multiUserDAO: MultiUserDAO,
wkSilhouetteEnvironment: WkSilhouetteEnvironment,
slackNotificationService: SlackNotificationService,
organizationDAO: OrganizationDAO,
dataStoreDAO: DataStoreDAO,
userService: UserService)(implicit ec: ExecutionContext, playBodyParsers: PlayBodyParsers)
extends Controller
with Zarr3OutputHelper {

Expand Down Expand Up @@ -112,6 +112,16 @@ class JobController @Inject()(
} yield Ok(js)
}

/**
  * Retries a job on behalf of a super user.
  *
  * Fails with "job.disabled" if the jobs feature is switched off, and with "notAllowed" (FORBIDDEN)
  * if the requesting user is not a super user. Otherwise delegates to jobDAO.retryOne and responds
  * with the public JSON representation of the job as it is stored after the retry was requested.
  *
  * @param id id of the job to retry
  */
def retry(id: ObjectId): Action[AnyContent] = sil.SecuredAction.async { implicit request =>
  for {
    _ <- bool2Fox(wkconf.Features.jobsEnabled) ?~> "job.disabled"
    _ <- userService.assertIsSuperUser(request.identity) ?~> "notAllowed" ~> FORBIDDEN
    // Look the job up before writing, so a failing lookup aborts the request before any update is attempted.
    _ <- jobDAO.findOne(id)
    _ <- jobDAO.retryOne(id)
    // Re-read the job: an instance fetched before retryOne would still carry the pre-retry state,
    // and returning it would report the job as failed although it was just reset.
    jobAfterRetry <- jobDAO.findOne(id)
    js <- jobService.publicWrites(jobAfterRetry)
  } yield Ok(js)
}

// Note that the dataset has to be registered by reserveUpload via the datastore first.
def runConvertToWkwJob(datasetId: ObjectId, scale: String, unit: Option[String]): Action[AnyContent] =
sil.SecuredAction.async { implicit request =>
Expand Down
11 changes: 11 additions & 0 deletions app/models/job/Job.scala
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ case class Job(
_voxelyticsWorkflowHash: Option[String] = None,
latestRunId: Option[String] = None,
returnValue: Option[String] = None,
retriedBySuperUser: Boolean = false,
started: Option[Long] = None,
ended: Option[Long] = None,
created: Instant = Instant.now,
Expand Down Expand Up @@ -126,6 +127,7 @@ class JobDAO @Inject()(sqlClient: SqlClient)(implicit ec: ExecutionContext)
r._VoxelyticsWorkflowhash,
r.latestrunid,
r.returnvalue,
r.retriedbysuperuser,
r.started.map(_.getTime),
r.ended.map(_.getTime),
Instant.fromSql(r.created),
Expand Down Expand Up @@ -253,6 +255,15 @@ class JobDAO @Inject()(sqlClient: SqlClient)(implicit ec: ExecutionContext)
_ <- run(q"""UPDATE webknossos.jobs SET manualState = $manualState WHERE _id = $id""".asUpdate)
} yield ()

/**
  * Sets a failed job back to 'PENDING' and marks it as retried by a super user.
  *
  * Update access for the job is asserted first. The UPDATE only matches rows whose state is
  * 'FAILURE'; the number of affected rows is not inspected, so a job in any other state is
  * left untouched without this being reported to the caller.
  */
def retryOne(id: ObjectId)(implicit ctx: DBAccessContext): Fox[Unit] =
  assertUpdateAccess(id).flatMap { _ =>
    val resetToPending = q"""UPDATE webknossos.jobs
SET state = 'PENDING', retriedBySuperUser = true
WHERE _id = $id
AND state = 'FAILURE'""".asUpdate
    run(resetToPending).map(_ => ())
  }

def updateStatus(jobId: ObjectId, s: JobStatus): Fox[Unit] =
for {
_ <- run(q"""UPDATE webknossos.jobs SET
Expand Down
2 changes: 1 addition & 1 deletion app/models/job/JobService.scala
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class JobService @Inject()(wkConf: WkConf,
s"Failed job$superUserLabel",
msg
)
_ = sendFailedEmailNotification(user, jobAfterChange)
_ = if (!jobAfterChange.retriedBySuperUser) sendFailedEmailNotification(user, jobAfterChange)
} yield ()
()
}
Expand Down
12 changes: 12 additions & 0 deletions conf/evolutions/127-job-retried-by-super-user.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
-- Evolution 127: adds the retriedBySuperUser flag to webknossos.jobs.
-- All statements run in one transaction, so a failed assertion leaves the schema unchanged.
START TRANSACTION;

-- Guard: only apply on top of schema version 126.
do $$ begin ASSERT (select schemaVersion from webknossos.releaseInformation) = 126, 'Previous schema version mismatch'; end; $$ LANGUAGE plpgsql;

-- The jobs_ view is defined via SELECT *, so it is dropped here and recreated below
-- to make it include the newly added column.
DROP VIEW webknossos.jobs_;

-- NOT NULL with DEFAULT false: existing job rows get retriedBySuperUser = false.
ALTER TABLE webknossos.jobs ADD column retriedBySuperUser BOOLEAN NOT NULL DEFAULT false;
UPDATE webknossos.releaseInformation SET schemaVersion = 127;

-- Recreate the view that hides deleted jobs.
CREATE VIEW webknossos.jobs_ AS SELECT * FROM webknossos.jobs WHERE NOT isDeleted;

COMMIT TRANSACTION;
12 changes: 12 additions & 0 deletions conf/evolutions/reversions/127-job-retried-by-super-user.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
-- Reversion of evolution 127: removes the retriedBySuperUser flag from webknossos.jobs.
-- All statements run in one transaction, so a failed assertion leaves the schema unchanged.
START TRANSACTION;

-- Guard: only revert when the database is at schema version 127.
do $$ begin ASSERT (select schemaVersion from webknossos.releaseInformation) = 127, 'Previous schema version mismatch'; end; $$ LANGUAGE plpgsql;

-- The jobs_ view selects all columns of webknossos.jobs, so it is dropped before
-- the column is removed and recreated afterwards.
DROP VIEW webknossos.jobs_;

-- Dropping the column discards the stored retriedBySuperUser values.
ALTER TABLE webknossos.jobs DROP column retriedBySuperUser;
UPDATE webknossos.releaseInformation SET schemaVersion = 126;

-- Recreate the view that hides deleted jobs.
CREATE VIEW webknossos.jobs_ AS SELECT * FROM webknossos.jobs WHERE NOT isDeleted;

COMMIT TRANSACTION;
1 change: 1 addition & 0 deletions conf/webknossos.latest.routes
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,7 @@ POST /jobs/run/findLargestSegmentId/:datasetId
POST /jobs/run/renderAnimation/:datasetId controllers.JobController.runRenderAnimationJob(datasetId: ObjectId)
GET /jobs/:id controllers.JobController.get(id: ObjectId)
PATCH /jobs/:id/cancel controllers.JobController.cancel(id: ObjectId)
PATCH /jobs/:id/retry controllers.JobController.retry(id: ObjectId)
POST /jobs/:id/status controllers.WKRemoteWorkerController.updateJobStatus(key: String, id: ObjectId)
POST /jobs/:id/attachVoxelyticsWorkflow controllers.WKRemoteWorkerController.attachVoxelyticsWorkflow(key: String, id: ObjectId)
POST /jobs/:id/attachDatasetToInference controllers.WKRemoteWorkerController.attachDatasetToInference(key: String, id: ObjectId)
Expand Down
6 changes: 6 additions & 0 deletions frontend/javascripts/admin/api/jobs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ export async function cancelJob(jobId: string): Promise<APIJob> {
});
}

/**
 * Requests a retry of the given job by sending a PATCH to its retry route.
 *
 * @param jobId id of the job to retry
 * @returns the result of Request.receiveJSON for that route, typed as APIJob
 */
export async function retryJob(jobId: string): Promise<APIJob> {
  const retryUrl = `/api/jobs/${jobId}/retry`;
  return Request.receiveJSON(retryUrl, { method: "PATCH" });
}

export async function startConvertToWkwJob(
datasetId: string,
scale: Vector3,
Expand Down
30 changes: 28 additions & 2 deletions frontend/javascripts/admin/job/job_list_view.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,26 @@ import {
EyeOutlined,
InfoCircleOutlined,
LoadingOutlined,
PlayCircleOutlined,
QuestionCircleTwoTone,
} from "@ant-design/icons";
import { PropTypes } from "@scalableminds/prop-types";
import { cancelJob, getJobs } from "admin/admin_rest_api";
import { cancelJob, getJobs, retryJob } from "admin/admin_rest_api";
import { Input, Modal, Spin, Table, Tooltip, Typography } from "antd";
import { AsyncLink } from "components/async_clickables";
import FormattedDate from "components/formatted_date";
import { confirmAsync } from "dashboard/dataset/helper_components";
import { formatWkLibsNdBBox } from "libs/format_utils";
import Persistence from "libs/persistence";
import { useInterval } from "libs/react_helpers";
import Toast from "libs/toast";
import * as Utils from "libs/utils";
import _ from "lodash";
import { getReadableURLPart } from "oxalis/model/accessors/dataset_accessor";
import type { OxalisState } from "oxalis/store";
import type * as React from "react";
import { useEffect, useState } from "react";
import { useSelector } from "react-redux";
import { Link } from "react-router-dom";
import { type APIJob, APIJobType, type APIUserBase } from "types/api_flow_types";

Expand Down Expand Up @@ -133,6 +137,7 @@ function JobListView() {
const [isLoading, setIsLoading] = useState(true);
const [jobs, setJobs] = useState<APIJob[]>([]);
const [searchQuery, setSearchQuery] = useState("");
const isCurrentUserSuperUser = useSelector((state: OxalisState) => state.activeUser?.isSuperUser);

useEffect(() => {
fetchData();
Expand Down Expand Up @@ -293,11 +298,32 @@ function JobListView() {
cancelJob(job.id).then(() => fetchData());
}
}}
icon={<CloseCircleOutlined key="cancel" className="icon-margin-right" />}
icon={<CloseCircleOutlined className="icon-margin-right" />}
>
Cancel
</AsyncLink>
);
} else if (job.state === "FAILURE" && isCurrentUserSuperUser) {
return (
<Tooltip title="Restarts the workflow from the failed task, skipping and reusing artifacts from preceding tasks that were already successful.">
<AsyncLink
href="#"
onClick={async () => {
try {
await retryJob(job.id);
await fetchData();
Toast.success("Job is being retried");
} catch (e) {
console.error("Could not retry job", e);
Toast.error("Failed to start retrying the job");
}
}}
icon={<PlayCircleOutlined className="icon-margin-right" />}
>
Retry
</AsyncLink>
</Tooltip>
);
} else if (
job.type === APIJobType.CONVERT_TO_WKW ||
job.type === APIJobType.COMPUTE_SEGMENT_INDEX_FILE ||
Expand Down
3 changes: 2 additions & 1 deletion tools/postgres/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ CREATE TABLE webknossos.releaseInformation (
schemaVersion BIGINT NOT NULL
);

INSERT INTO webknossos.releaseInformation(schemaVersion) values(126);
INSERT INTO webknossos.releaseInformation(schemaVersion) values(127);
COMMIT TRANSACTION;


Expand Down Expand Up @@ -475,6 +475,7 @@ CREATE TABLE webknossos.jobs(
_voxelytics_workflowHash VARCHAR(512),
latestRunId VARCHAR(1024),
returnValue Text,
retriedBySuperUser BOOLEAN NOT NULL DEFAULT false,
started TIMESTAMPTZ,
ended TIMESTAMPTZ,
created TIMESTAMPTZ NOT NULL DEFAULT NOW(),
Expand Down