
Commit 86be501

force workspace download (#196)
* force workspace download
* docs: auto-download workspace
* cleanup
1 parent c2af5eb commit 86be501

2 files changed: +34 additions, -31 deletions
documentation/DCP-documentation/step_1_configuration.md

Lines changed: 4 additions & 1 deletion
```diff
@@ -28,8 +28,10 @@ For more information and examples, see [External Buckets](external_buckets.md).
 This is generally the bucket in the account in which you are running compute.
 * **SOURCE_BUCKET:** The bucket where the image files you will be reading are.
 Often, this is the same as AWS_BUCKET.
-* **WORKSPACE:** The bucket where non-image files you will be reading are (e.g. pipeline, load_data.csv, etc.).
+These files can be downloaded or read directly off the bucket (see `DOWNLOAD_FILES` below for more).
+* **WORKSPACE_BUCKET:** The bucket where non-image files you will be reading are (e.g. pipeline, load_data.csv, etc.).
 Often, this is the same as AWS_BUCKET.
+Workspace files will always be automatically downloaded to your EC2 instance (as of v2.2.1).
 * **DESTINATION_BUCKET:** The bucket where you want to write your output files.
 Often, this is the same as AWS_BUCKET.
 * **UPLOAD_FLAGS:** If you need to add flags to an AWS CLI command to upload files to your DESTINATION_BUCKET, this is where you enter them.
@@ -57,6 +59,7 @@ If you have multiple Dockers running per machine, each Docker will have access t
 This typically requires a larger EBS volume (depending on the size of your image sets, and how many sets are processed per group), but avoids occasional issues with S3FS that can crop up on longer runs.
 By default, DCP uses S3FS to mount the S3 `SOURCE_BUCKET` as a pseudo-file system on each EC2 instance in your spot fleet to avoid file download.
 If you are unable to mount the `SOURCE_BUCKET` (perhaps because of a permissions issue) you should proceed with `DOWNLOAD_FILES = 'True'`.
+Note that as of v2.2.1, all non-image files (e.g. load_data.csv's and pipelines) are downloaded regardless of this setting and regardless of whether `SOURCE_BUCKET` and `WORKSPACE_BUCKET` are the same.
 * **ASSIGN_IP:** Whether or not to assign a public IPv4 address to each instance in the spot fleet.
 If set to 'False' will overwrite whatever is in the Fleet file.
 If set to 'True' will respect whatever is in the Fleet file.
```
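
Taken together, the two hunks describe the four bucket settings plus `DOWNLOAD_FILES`. A minimal sketch of how the corresponding entries might look in a DCP config file, assuming the usual Python `config.py` layout; the bucket names are placeholder values, not from this commit:

```python
# Hypothetical config.py excerpt; the variable names come from the docs
# above, but the bucket names are placeholders.
AWS_BUCKET = 'my-dcp-bucket'          # bucket in the account running compute
SOURCE_BUCKET = 'my-dcp-bucket'       # image files; often the same as AWS_BUCKET
WORKSPACE_BUCKET = 'my-dcp-bucket'    # non-image files: pipeline, load_data.csv, etc.
DESTINATION_BUCKET = 'my-dcp-bucket'  # where output files are written
UPLOAD_FLAGS = ''                     # extra flags for the AWS CLI upload command

# Applies to image files only. As of v2.2.1, workspace files
# (pipelines and load_data.csv's) are always downloaded regardless.
DOWNLOAD_FILES = 'False'
```

With `DOWNLOAD_FILES = 'False'`, images are still read through the S3FS mount, but workspace files are downloaded either way.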

worker/cp-worker.py

Lines changed: 30 additions & 30 deletions
```diff
@@ -109,6 +109,8 @@ def printandlog(text,logger):
 #################################
 
 def runCellProfiler(message):
+    s3client=boto3.client('s3')
+
     #List the directories in the bucket- this prevents a strange s3fs error
     rootlist=os.listdir(DATA_ROOT)
     for eachSubDir in rootlist:
@@ -167,7 +169,6 @@ def runCellProfiler(message):
     # See if this is a message you've already handled, if you've so chosen
     if CHECK_IF_DONE_BOOL.upper() == 'TRUE':
         try:
-            s3client=boto3.client('s3')
             bucketlist=s3client.list_objects(Bucket=DESTINATION_BUCKET,Prefix=f'{remoteOut}/')
             objectsizelist=[k['Size'] for k in bucketlist['Contents']]
             objectsizelist = [i for i in objectsizelist if i >= MIN_FILE_SIZE_BYTES]
@@ -179,26 +180,37 @@ def runCellProfiler(message):
                 return 'SUCCESS'
         except KeyError: #Returned if that folder does not exist
             pass
+
+    # Download load data file
+    data_file_path = os.path.join(localIn,message['data_file'])
+    printandlog(f"Downloading {message['data_file']} from {WORKSPACE_BUCKET}", logger)
+    csv_insubfolders = message['data_file'].split('/')
+    subfolders = '/'.join((csv_insubfolders)[:-1])
+    if not os.path.exists(os.path.join(localIn,subfolders)):
+        os.makedirs(os.path.join(localIn,subfolders), exist_ok=True)
+    try:
+        s3client.download_file(WORKSPACE_BUCKET, message['data_file'], data_file_path)
+    except botocore.exceptions.ClientError:
+        printandlog(f"Can't find load data file in S3. Looking for {message['data_file']} in {WORKSPACE_BUCKET}",logger)
+        printandlog("Aborting. Can't run without load data.",logger)
+        logger.removeHandler(watchtowerlogger)
+        return 'DOWNLOAD_PROBLEM'
+
+    # Download pipeline and update pipeline path in message
+    printandlog(f"Downloading {message['pipeline']} from {WORKSPACE_BUCKET}", logger)
+    pipepath = os.path.join(localIn, message['pipeline'].split('/')[-1])
+    try:
+        s3client.download_file(WORKSPACE_BUCKET, message['pipeline'], pipepath)
+    except botocore.exceptions.ClientError:
+        printandlog(f"Can't find pipeline in S3. Looking for {message['pipeline']} in {WORKSPACE_BUCKET}",logger)
+        printandlog("Aborting. Can't run without pipeline.",logger)
+        logger.removeHandler(watchtowerlogger)
+        return 'DOWNLOAD_PROBLEM'
 
     downloaded_files = []
 
-    # Optional - download all files, bypass S3 mounting
+    # Optional - download image files, bypass S3 mounting
     if DOWNLOAD_FILES.lower() == 'true':
-        # Download load data file and image files
-        data_file_path = os.path.join(localIn,message['data_file'])
-        printandlog(f"Downloading {message['data_file']} from {WORKSPACE_BUCKET}", logger)
-        csv_insubfolders = message['data_file'].split('/')
-        subfolders = '/'.join((csv_insubfolders)[:-1])
-        if not os.path.exists(os.path.join(localIn,subfolders)):
-            os.makedirs(os.path.join(localIn,subfolders), exist_ok=True)
-        s3client=boto3.client('s3')
-        try:
-            s3client.download_file(WORKSPACE_BUCKET, message['data_file'], data_file_path)
-        except botocore.exceptions.ClientError:
-            printandlog(f"Can't find load data file in S3. Looking for {message['data_file']} in {WORKSPACE_BUCKET}",logger)
-            printandlog("Aborting. Can't run without load data.",logger)
-            logger.removeHandler(watchtowerlogger)
-            return 'DOWNLOAD_PROBLEM'
         if message['data_file'][-4:]=='.csv':
             printandlog('Figuring which files to download', logger)
             import pandas
@@ -273,20 +285,8 @@ def runCellProfiler(message):
             printandlog(f'Downloaded {str(len(downloaded_files))} files',logger)
         else:
             printandlog("Couldn't parse data file for file download. Not supported input of .csv or .txt",logger)
-        # Download pipeline and update pipeline path in message
-        printandlog(f"Downloading {message['pipeline']} from {WORKSPACE_BUCKET}", logger)
-        pipepath = os.path.join(localIn, message['pipeline'].split('/')[-1])
-        try:
-            s3client.download_file(WORKSPACE_BUCKET, message['pipeline'], pipepath)
-        except botocore.exceptions.ClientError:
-            printandlog(f"Can't find pipeline in S3. Looking for {message['pipeline']} in {WORKSPACE_BUCKET}",logger)
-            printandlog("Aborting. Can't run without pipeline.",logger)
-            logger.removeHandler(watchtowerlogger)
-            return 'DOWNLOAD_PROBLEM'
-
     else:
-        data_file_path = os.path.join(DATA_ROOT,message['data_file'])
-        pipepath = os.path.join(DATA_ROOT,message["pipeline"])
+        printandlog('Using bucket mount for image files', logger)
 
     # Build and run CellProfiler command
     cpDone = f'{localOut}/cp.is.done'
```
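
The net effect of the worker changes: the `s3client` is created once at the top of `runCellProfiler`, and the load-data and pipeline downloads are hoisted out of the `DOWNLOAD_FILES` branch so they always run, leaving only image downloads optional. A condensed sketch of the resulting control flow, with a hypothetical function name and simplified error handling (not the verbatim worker code):

```python
import os

import boto3
from botocore.exceptions import ClientError


def fetch_job_inputs(message, local_in, workspace_bucket, download_files):
    """Sketch of the post-commit flow: workspace files are always
    downloaded; image files only when DOWNLOAD_FILES is 'true'."""
    s3 = boto3.client('s3')  # one client, created up front

    # Always download the load data file, creating subfolders as needed.
    data_file_path = os.path.join(local_in, message['data_file'])
    os.makedirs(os.path.dirname(data_file_path), exist_ok=True)
    try:
        s3.download_file(workspace_bucket, message['data_file'], data_file_path)
    except ClientError:
        return 'DOWNLOAD_PROBLEM'  # can't run without load data

    # Always download the pipeline next to the other local inputs.
    pipepath = os.path.join(local_in, message['pipeline'].split('/')[-1])
    try:
        s3.download_file(workspace_bucket, message['pipeline'], pipepath)
    except ClientError:
        return 'DOWNLOAD_PROBLEM'  # can't run without a pipeline

    # Only image files remain conditional on DOWNLOAD_FILES.
    if download_files.lower() == 'true':
        pass  # parse the load data CSV and download each image file
    else:
        pass  # rely on the S3FS bucket mount for image files

    return data_file_path, pipepath
```

Either branch leaves the load data file and pipeline on local disk, which is what produces the "force workspace download" behavior named in the commit title.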
