
Commit d176f92

Merge pull request #195 from jhiemstrawisc/integrate-dsub
Integrate dsub
2 parents 1bfe497 + 288936a commit d176f92

File tree

5 files changed: +180 -7 lines changed


config/config.yaml

Lines changed: 4 additions & 2 deletions

@@ -3,8 +3,10 @@
 # The length of the hash used to identify a parameter combination
 hash_length: 7

-# Specify the container framework. Current supported versions include 'docker' and
-# 'singularity'. If container_framework is not specified, SPRAS will default to docker.
+# Specify the container framework used by each PRM wrapper. Valid options include:
+# - docker (default if not specified)
+# - singularity -- Also known as apptainer, useful in HPC/HTC environments where docker isn't allowed
+# - dsub -- experimental with limited support, used for running on Google Cloud
 container_framework: docker

 # Only used if container_framework is set to singularity, this will unpack the singularity containers

environment.yml

Lines changed: 7 additions & 0 deletions

@@ -17,6 +17,12 @@ dependencies:
   - scikit-learn=1.2
   - seaborn=0.12
   - spython=0.2
+  # for dsub
+  - python-dateutil<=2.9.0
+  - pytz<=2024.1
+  - pyyaml<=6.0.1
+  - tenacity<=8.2.3
+  - tabulate<=0.9.0
   # Only required for GraphSpace
   - commonmark=0.9
   - docutils=0.19
@@ -27,3 +33,4 @@ dependencies:
   - pip:
     - graphspace_python==1.3.1
     - sphinx-rtd-theme==2.0.0
+    - dsub==0.4.13

spras/config.py

Lines changed: 5 additions & 3 deletions

@@ -62,7 +62,7 @@ def __init__(self, raw_config):
         # __init__ makes clear exactly what is being configured.
         # Directory used for storing output
         self.out_dir = None
-        # Container framework used by PRMs. Valid options are "docker" and "singularity"
+        # Container framework used by PRMs. Valid options are "docker", "dsub", and "singularity"
         self.container_framework = None
         # The container prefix (host and organization) to use for images. Default is "docker.io/reedcompbio"
         self.container_prefix = DEFAULT_CONTAINER_PREFIX
@@ -116,9 +116,11 @@ def process_config(self, raw_config):
         # However, if we get a bad value, we raise an exception.
         if "container_framework" in raw_config:
             container_framework = raw_config["container_framework"].lower()
-            if container_framework not in ("docker", "singularity"):
-                msg = "SPRAS was configured to run with an unknown container framework: '" + raw_config["container_framework"] + "'. Accepted values are 'docker' or 'singularity'."
+            if container_framework not in ("docker", "singularity", "dsub"):
+                msg = "SPRAS was configured to run with an unknown container framework: '" + raw_config["container_framework"] + "'. Accepted values are 'docker', 'singularity' or 'dsub'."
                 raise ValueError(msg)
+            if container_framework == "dsub":
+                print("Warning: 'dsub' framework integration is experimental and may not be fully supported.")
             self.container_framework = container_framework
         else:
             self.container_framework = "docker"
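
As an aside, here is a minimal standalone sketch (not part of this commit) of how the updated check treats the configured value; the raw_config dict below is a hypothetical stand-in for a parsed config.yaml:

# Hypothetical parsed config, as if loaded from config.yaml
raw_config = {"container_framework": "dsub"}

container_framework = raw_config["container_framework"].lower()
if container_framework not in ("docker", "singularity", "dsub"):
    raise ValueError("Accepted values are 'docker', 'singularity' or 'dsub'.")
if container_framework == "dsub":
    # dsub is accepted but flagged as experimental, matching the warning added in this commit
    print("Warning: 'dsub' framework integration is experimental and may not be fully supported.")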

spras/containers.py

Lines changed: 150 additions & 0 deletions

@@ -1,6 +1,7 @@
 import os
 import platform
 import re
+import subprocess
 from pathlib import Path, PurePath, PurePosixPath
 from typing import Any, Dict, List, Optional, Tuple, Union

@@ -42,6 +43,83 @@ def convert_docker_path(src_path: PurePath, dest_path: PurePath, file_path: Unio
     return PurePosixPath(dest_path, rel_path)


+def download_gcs(gcs_path: str, local_path: str, is_dir: bool):
+    # check that output path exists
+    if not os.path.exists(Path(local_path).parent):
+        os.makedirs(Path(local_path).parent)
+
+    # build command
+    cmd = 'gcloud storage'
+    # rsync with checksums to make file transfer faster for larger files
+    cmd = cmd + ' rsync --checksums-only'
+    # check if directory
+    if is_dir:
+        cmd = cmd + ' -r'
+    cmd = cmd + ' ' + gcs_path + ' ' + local_path
+
+    print(cmd)
+    # run command
+    subprocess.run(cmd, shell=True)
+
+    if Path(Path(local_path)/'gcs_temp.txt').exists():
+        Path(Path(local_path)/'gcs_temp.txt').unlink()
+
+
+def upload_gcs(local_path: str, gcs_path: str, is_dir: bool):
+    # check if path exists in cloud storage
+    exists = len(subprocess.run(f'gcloud storage ls {gcs_path}', shell=True, capture_output=True, text=True).stdout)
+    # if path exists rsync
+    if exists > 0:
+        cmd = 'gcloud storage rsync --checksums-only'
+    # if directory is empty
+    elif exists == 0 and len(os.listdir(local_path)) == 0:
+        # create a temporary file because GCS will not recognize empty directories
+        Path(Path(local_path)/'gcs_temp.txt').touch()
+        # copy path to cloud storage
+        cmd = 'gcloud storage cp -c'
+    # else copy path to cloud storage
+    else:
+        cmd = 'gcloud storage cp -c'
+    # check if directory
+    if is_dir:
+        cmd = cmd + ' -r'
+    cmd = cmd + ' ' + str(Path(local_path).resolve()) + ' ' + gcs_path
+
+    print(cmd)
+    # run command
+    subprocess.run(cmd, shell=True)
+
+
+def prepare_dsub_cmd(flags: dict):
+    # set constant flags
+    dsub_command = 'dsub'
+    flags['provider'] = 'google-cls-v2'
+    flags['regions'] = 'us-central1'
+    flags['user-project'] = os.getenv('GOOGLE_PROJECT')
+    flags['project'] = os.getenv('GOOGLE_PROJECT')
+    flags['network'] = 'network'
+    flags['subnetwork'] = 'subnetwork'
+    flags['service-account'] = subprocess.run(['gcloud', 'config', 'get-value', 'account'], capture_output=True, text=True).stdout.replace('\n', '')
+
+    # order flags according to flag_list
+    flag_list = ["provider", "regions", "zones", "location", "user-project", "project", "network", "subnetwork", "service-account", "image", "env",
+                 "logging", "input", "input-recursive", "mount", "output", "output-recursive", "command", "script"]
+    ordered_flags = {f:flags[f] for f in flag_list if f in flags.keys()}
+
+    # iteratively add flags to the command
+    for flag in ordered_flags.keys():
+        if isinstance(ordered_flags.get(flag), list):
+            for f in ordered_flags.get(flag):
+                dsub_command = dsub_command + " --" + flag + " " + f
+        else:
+            dsub_command = dsub_command + " --" + flag + " " + ordered_flags.get(flag)
+
+    # Wait for dsub job to complete
+    dsub_command = dsub_command + " --wait"
+    print(f"dsub command: {dsub_command}")
+    return dsub_command
+
+
 # TODO consider a better default environment variable
 # TODO environment currently a single string (e.g. 'TMPDIR=/OmicsIntegrator1'), should it be a list?
 # run_container_singularity assumes a single string
@@ -65,6 +143,8 @@ def run_container(framework: str, container_suffix: str, command: List[str], vol
         return run_container_docker(container, command, volumes, working_dir, environment)
     elif normalized_framework == 'singularity':
         return run_container_singularity(container, command, volumes, working_dir, environment)
+    elif normalized_framework == 'dsub':
+        return run_container_dsub(container, command, volumes, working_dir, environment)
     else:
         raise ValueError(f'{framework} is not a recognized container framework. Choose "docker" or "singularity".')

@@ -223,6 +303,7 @@ def run_container_singularity(container: str, command: List[str], volumes: List[
                          options=singularity_options,
                          bind=bind_paths)

+
 # Because this is called independently for each file, the same local path can be mounted to multiple volumes
 def prepare_volume(filename: Union[str, PurePath], volume_base: Union[str, PurePath]) -> Tuple[Tuple[PurePath, PurePath], str]:
     """
@@ -258,3 +339,72 @@ def prepare_volume(filename: Union[str, PurePath], volume_base: Union[str, PureP
         src = parent

     return (src, dest), container_filename
+
+
+def run_container_dsub(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, environment: str = 'SPRAS=True') -> str:
+    """
+    Runs a command in the Google Cloud using dsub.
+    @param container: name of the container in the Google Cloud Container Registry
+    @param command: command to run
+    @param volumes: a list of volumes to mount where each item is a (source, destination) tuple
+    @param working_dir: the working directory in the container
+    @param environment: environment variables to set in the container
+    @return: path of output from dsub
+    """
+    # Dictionary of flags for dsub command
+    flags = dict()
+
+    workspace_bucket = os.getenv('WORKSPACE_BUCKET')
+    # Add path in the workspace bucket and label for dsub command for each volume
+    dsub_volumes = [(src, dst, workspace_bucket + str(dst), "INPUT_" + str(i),) for i, (src, dst) in enumerate(volumes)]
+
+    # Prepare command that will be run inside the container for dsub
+    container_command = list()
+    for item in command:
+        # Find if item is volume
+        to_replace = [(str(path[1]), "${"+path[3]+'}') for path in dsub_volumes if str(path[1]) in item]
+        # Replace volume path with dsub volume path
+        if len(to_replace) == 1:
+            # Get path that will be replaced
+            path = to_replace[0][0]
+            # Get dsub input variable that will replace path
+            env_variable = to_replace[0][1]
+            # Replace path with env_variable
+            container_path = item.replace(path, env_variable)
+            # Add / if there is no suffix
+            if container_path == env_variable:
+                container_path = container_path + '/'
+            container_command.append(container_path)
+        else:
+            container_command.append(item)
+
+    # Add a command to copy the volumes to the workspace buckets
+    container_command.append(('; cp -rf ' + f'/mnt/data/input/gs/{workspace_bucket}{working_dir}/*' + ' $OUTPUT').replace('gs://', ''))
+
+    # Make the command into a string
+    flags['command'] = ' '.join(container_command)
+    flags['command'] = "'" + flags['command'] + "'"
+
+    # Push volumes to WORKSPACE_BUCKET
+    for src, _dst, gcs_path, _env in dsub_volumes:
+        upload_gcs(local_path=str(src), gcs_path=gcs_path, is_dir=True)
+
+    # Prepare flags for dsub command
+    flags['image'] = container
+    flags['env'] = environment
+    flags['input-recursive'] = [vol[3]+'='+vol[2] for vol in dsub_volumes]
+    flags['output-recursive'] = "OUTPUT=" + workspace_bucket + working_dir
+    flags['logging'] = workspace_bucket + '/dsub/'
+
+    # Create dsub command
+    dsub_command = prepare_dsub_cmd(flags)
+
+    # Run dsub as subprocess
+    subprocess.run(dsub_command, shell=True)
+
+    # Pull output volumes from WORKSPACE_BUCKET
+    for src, _dst, gcs_path, _env in dsub_volumes:
+        download_gcs(local_path=str(src), gcs_path=gcs_path, is_dir=True)
+
+    # return location of dsub logs in WORKSPACE_BUCKET
+    return 'dsub logs: {logs}'.format(logs=flags['logging'])
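
The least obvious step in run_container_dsub is how container paths in the command are rewritten to dsub input environment variables. The standalone sketch below (not part of the commit) reproduces only that string handling; the bucket name, volume pair, and command are hypothetical placeholders, and no gcloud or dsub calls are made:

from pathlib import PurePosixPath

workspace_bucket = 'gs://example-workspace-bucket'  # stand-in for os.getenv('WORKSPACE_BUCKET')

# One (src, dst) pair as produced by prepare_volume: a local path and its mount point in the container
volumes = [(PurePosixPath('/tmp/spras/out'), PurePosixPath('/spras/ABCDEFG/oi1'))]

# Tag each volume with its workspace-bucket path and an INPUT_i label, as run_container_dsub does
dsub_volumes = [(src, dst, workspace_bucket + str(dst), 'INPUT_' + str(i)) for i, (src, dst) in enumerate(volumes)]

# Hypothetical container command that references the mounted path
command = ['python', 'run.py', '--output', '/spras/ABCDEFG/oi1/output.txt']

# Replace each container path with its ${INPUT_i} variable so dsub can substitute the localized GCS path
container_command = []
for item in command:
    to_replace = [(str(vol[1]), '${' + vol[3] + '}') for vol in dsub_volumes if str(vol[1]) in item]
    if len(to_replace) == 1:
        path, env_variable = to_replace[0]
        rewritten = item.replace(path, env_variable)
        if rewritten == env_variable:
            # bare volume references get a trailing slash
            rewritten = rewritten + '/'
        container_command.append(rewritten)
    else:
        container_command.append(item)

print(' '.join(container_command))  # python run.py --output ${INPUT_0}/output.txt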

spras/meo.py

Lines changed: 14 additions & 2 deletions

@@ -1,3 +1,4 @@
+import os
 from pathlib import Path

 from spras.containers import prepare_volume, run_container
@@ -20,7 +21,7 @@
 # Does not support MINSAT or MAXCSP
 # TODO add parameter validation
 def write_properties(filename=Path('properties.txt'), edges=None, sources=None, targets=None, edge_output=None,
-                     path_output=None, max_path_length=None, local_search=None, rand_restarts=None):
+                     path_output=None, max_path_length=None, local_search=None, rand_restarts=None, framework='docker'):
     """
     Write the properties file for Maximum Edge Orientation
     See https://github.com/agitter/meo/blob/master/sample.props for property descriptions and the default values at
@@ -32,6 +33,17 @@ def write_properties(filename=Path('properties.txt'), edges=None, sources=None,
     if edges is None or sources is None or targets is None or edge_output is None or path_output is None:
         raise ValueError('Required Maximum Edge Orientation properties file arguments are missing')

+    if framework == 'dsub':
+        # Get path inside dsub container
+        workspace_bucket = os.getenv('WORKSPACE_BUCKET')
+        input_prefix = f'/mnt/data/input/gs/{workspace_bucket}'.replace('gs://', '')
+        # Add input prefix to all MEO paths
+        edges = input_prefix + edges
+        sources = input_prefix + sources
+        targets = input_prefix + targets
+        edge_output = input_prefix + edge_output
+        path_output = input_prefix + path_output
+
     with open(filename, 'w') as f:
         # Write the required properties
         f.write(f'edges.file = {Path(edges).as_posix()}\n')
@@ -158,7 +170,7 @@ def run(edges=None, sources=None, targets=None, output_file=None, max_path_lengt
     properties_file_local = Path(out_dir, properties_file)
     write_properties(filename=properties_file_local, edges=edge_file, sources=source_file, targets=target_file,
                      edge_output=mapped_output_file, path_output=mapped_path_output,
-                     max_path_length=max_path_length, local_search=local_search, rand_restarts=rand_restarts)
+                     max_path_length=max_path_length, local_search=local_search, rand_restarts=rand_restarts, framework=container_framework)
     bind_path, properties_file = prepare_volume(str(properties_file_local), work_dir)
     volumes.append(bind_path)
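
For context, a standalone sketch (not from the commit) of the path rewriting that write_properties applies when framework == 'dsub'. The bucket name and edge file path are hypothetical; the prefix mirrors where dsub stages recursive GCS inputs inside the task container:

import os

os.environ['WORKSPACE_BUCKET'] = 'gs://example-workspace-bucket'  # hypothetical workspace bucket

edges = '/spras/ABCDEFG/meo/edges.txt'  # hypothetical container path from prepare_volume

workspace_bucket = os.getenv('WORKSPACE_BUCKET')
# dsub localizes recursive inputs under /mnt/data/input/gs/<bucket>/..., so the MEO properties
# file must reference that location rather than the usual container mount path
input_prefix = f'/mnt/data/input/gs/{workspace_bucket}'.replace('gs://', '')
print(input_prefix + edges)  # /mnt/data/input/gs/example-workspace-bucket/spras/ABCDEFG/meo/edges.txt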
