From 0fd8dae482ecd3fee784e7d1bbed3cc6470e0127 Mon Sep 17 00:00:00 2001
From: Jun
Date: Tue, 5 Oct 2021 02:29:28 -0500
Subject: [PATCH] Connected pipelines to s3 bucket

---
 README.md                                     |  2 +-
 aws_module/README.md                          |  2 +-
 aws_module/ec2_deployment/aws_module.sh       |  5 ++-
 docker-compose.AWS.yaml                       |  2 +
 docker-compose.yaml                           | 16 ++++---
 ...nfig_51a6f79ad7d648df8b34268cb9c2d917.yaml |  8 ++++
 pipelines/pipeline_controller/app.py          | 44 +++++++++++++++++--
 .../pipeline_controller/requirements.txt      |  2 +
 .../templates/progress.html                   |  3 +-
 9 files changed, 69 insertions(+), 15 deletions(-)
 create mode 100644 pipelines/deg_pipeline/config_51a6f79ad7d648df8b34268cb9c2d917.yaml

diff --git a/README.md b/README.md
index 18ca28d..79d4b38 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 * Current members: Kicheol Kim, Junhee Yoon
 * Please, leave a message in **Discussions** tab if you have any question and requests
-* Please use docker image to analyze the data. AWS module is ready and Please ask to members for getting auth f AWS is needed to analze data.
+* Please use the docker image to analyze the data. The AWS module is ready; please ask the members for credentials if AWS access is needed.
 * Our data is located in S3 bucket
 
 ### Goal
diff --git a/aws_module/README.md b/aws_module/README.md
index 917b565..5b4f25b 100644
--- a/aws_module/README.md
+++ b/aws_module/README.md
@@ -9,6 +9,6 @@ apt-get install jq
 
 ## Modules List
 | Name | Description | Main exec file |
-|---------|---------|---------|---------|
+|---------|---------|---------|
 | ec2_deployment | Module for EC2 auto-deployment | aws_module.sh |
 | sageMaker_deployment | Module for sageMaker auto-deployment | sagemaker_module.sh |
\ No newline at end of file
diff --git a/aws_module/ec2_deployment/aws_module.sh b/aws_module/ec2_deployment/aws_module.sh
index 64495cc..99f81f0 100644
--- a/aws_module/ec2_deployment/aws_module.sh
+++ b/aws_module/ec2_deployment/aws_module.sh
@@ -69,11 +69,14 @@ echo "Cooling down starts. It takes more than 8 minutes.."
 ## 7m, cooling down while AWS is loading and preparing resources
 sleep 500
 
+## copy aws credential to ec2
+scp -i MSplatform-key.pem -o StrictHostKeyChecking=no $HOME/.aws/credentials ubuntu@$ip_addr:/home/ubuntu/.aws/
+
 ## Running installer
 ssh -i MSplatform-key.pem -o StrictHostKeyChecking=no ubuntu@$ip_addr 'bash -s' < utils/installer.sh
 
 ## Moving credentials to ec2 for s3 connection
-scp -i MSplatform-key.pem -o StrictHostKeyChecking=no credentials ubuntu@$ip_addr:/home/ubuntu/.aws
+#scp -i MSplatform-key.pem -o StrictHostKeyChecking=no credentials ubuntu@$ip_addr:/home/ubuntu/.aws
 
 ## S3 sync from S3 project bucket
 ssh -i MSplatform-key.pem -o StrictHostKeyChecking=no ubuntu@$ip_addr 'bash -s' < utils/s3Sync.sh
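Note on the hunk above: the operator's ~/.aws/credentials file is now copied to the EC2 host before the installer and the S3 sync script run over SSH. A minimal way to confirm the copied credentials are actually usable on the instance is an STS identity check; the snippet below is an illustrative sketch only (it assumes boto3 is installed on the host and is not part of the patch):

    import boto3

    def check_aws_identity():
        """Fail fast if ~/.aws/credentials was not copied correctly."""
        sts = boto3.client("sts")             # reads ~/.aws/credentials by default
        identity = sts.get_caller_identity()  # raises if the credentials are missing or invalid
        print("Authenticated as:", identity["Arn"])

    if __name__ == "__main__":
        check_aws_identity()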
diff --git a/docker-compose.AWS.yaml b/docker-compose.AWS.yaml
index c15ac03..c5be022 100644
--- a/docker-compose.AWS.yaml
+++ b/docker-compose.AWS.yaml
@@ -24,6 +24,7 @@ services:
       - /home/ubuntu/MSProject/multiple_sclerosis_proj/pipelines:/pipelines
       - /home/ubuntu/MSProject/multiple_sclerosis_proj/data:/MainData
       - /home/ubuntu/MSProject/multiple_sclerosis_proj/notebook/resultFiles:/Output
+      - $HOME/.aws/credentials:/root/.aws/credentials:ro
     ports:
       - 80:5000
     depends_on:
@@ -46,6 +47,7 @@ services:
       - /home/ubuntu/MSProject/multiple_sclerosis_proj/pipelines:/pipelines
       - /home/ubuntu/MSProject/multiple_sclerosis_proj/data:/MainData
       - /home/ubuntu/MSProject/multiple_sclerosis_proj/resultFiles:/Output
+      - $HOME/.aws/credentials:/root/.aws/credentials:ro
     working_dir: /pipelines/pipeline_controller/
     command: conda run -n pipeline_controller_base celery -A app.celery worker --loglevel=info
     depends_on:
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 6600719..671d024 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -3,7 +3,7 @@ services:
   notebook: # Notebook
     build:
       context: ./notebook
-    #image: swiri021/openkbc_msproject:notebookcontainer1
+    image: swiri021/openkbc_msproject:notebookcontainer1
     volumes:
       - /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/notebook/notebook_lib:/home/jovyan/work/notebook_lib
       - /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/notebook/notebook_utils:/home/jovyan/work/notebook_utils
@@ -17,15 +17,12 @@ services:
   pipelines: # Pipelines
     build:
       context: ./pipelines
-    #image: swiri021/openkbc_msproject:pipelinecontainer1
-    deploy:
-      resources:
-        limits:
-          memory: 4000m
+    image: swiri021/openkbc_msproject:pipelinecontainer1
     volumes:
       - /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines:/pipelines
       - /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/data:/MainData
       - /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/notebook/resultFiles:/Output
+      - $HOME/.aws/credentials:/root/.aws/credentials:ro
     ports:
       - 80:5000
     depends_on:
@@ -44,11 +41,16 @@ services:
   celery: # celery
     build:
       context: ./pipelines
-    #image: swiri021/openkbc_msproject:celerycontainer1
+    image: swiri021/openkbc_msproject:celerycontainer1
+    deploy:
+      resources:
+        limits:
+          memory: 8000m
     volumes:
       - /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines:/pipelines
       - /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/data:/MainData
       - /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/notebook/resultFiles:/Output
+      - $HOME/.aws/credentials:/root/.aws/credentials:ro
     working_dir: /pipelines/pipeline_controller/
     command: conda run -n pipeline_controller_base celery -A app.celery worker --loglevel=info
     depends_on:
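With the read-only bind mount of $HOME/.aws/credentials into /root/.aws/credentials, boto3 inside the pipelines and celery containers resolves credentials through its default provider chain, so no keys have to be baked into the images or passed as environment variables. A small sketch of what this enables inside a container (the bucket name is the fixed result bucket used later in app.py; sufficient S3 permissions are assumed):

    import boto3
    from botocore.exceptions import ClientError

    RESULT_BUCKET = "openkbc-ms-result-bucket"  # fixed result bucket from app.py

    def can_reach_result_bucket() -> bool:
        """Return True if the mounted credentials can see the result bucket."""
        s3 = boto3.client("s3")  # picks up /root/.aws/credentials inside the container
        try:
            s3.head_bucket(Bucket=RESULT_BUCKET)
            return True
        except ClientError:
            return False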
diff --git a/pipelines/deg_pipeline/config_51a6f79ad7d648df8b34268cb9c2d917.yaml b/pipelines/deg_pipeline/config_51a6f79ad7d648df8b34268cb9c2d917.yaml
new file mode 100644
index 0000000..eb24192
--- /dev/null
+++ b/pipelines/deg_pipeline/config_51a6f79ad7d648df8b34268cb9c2d917.yaml
@@ -0,0 +1,8 @@
+InputFolder: /MainData/rsem_counts
+MetaFile: /MainData/annotation_metadata/EPIC_HCvB_metadata_baseline_updated-share.csv
+SampleColumnName: HCVB_ID
+CondColumnName: DiseaseCourse
+Condition1: RR
+Condition2: CIS
+OutputFolder: ./OutputTest/
+logID: user1
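The new file above is a generated per-run config for the DEG pipeline (the hex string in the file name is presumably the unique run ID the controller creates). A pipeline step would consume it roughly as follows; a sketch only, assuming PyYAML is available in the pipeline environment and using an illustrative helper name:

    import yaml

    def load_run_config(path):
        """Load a controller-generated pipeline config and return its settings."""
        with open(path) as f:
            return yaml.safe_load(f)

    cfg = load_run_config("config_51a6f79ad7d648df8b34268cb9c2d917.yaml")
    # Keys written by the controller:
    #   InputFolder, MetaFile, SampleColumnName, CondColumnName,
    #   Condition1, Condition2, OutputFolder, logID
    print(cfg["Condition1"], "vs", cfg["Condition2"])  # RR vs CIS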
diff --git a/pipelines/pipeline_controller/app.py b/pipelines/pipeline_controller/app.py
index c057bc4..71fdd2f 100644
--- a/pipelines/pipeline_controller/app.py
+++ b/pipelines/pipeline_controller/app.py
@@ -18,6 +18,8 @@
 import uuid
 import os
 import subprocess
+import boto3
+import glob
 
 # Custom form making
 from wtforms.validators import Required
@@ -123,13 +125,11 @@ def workflow_running(pipeline_path, yaml_file):
         line = proc.stdout.readline()
         if not line:
             break
-        print(str(line))
         current_task.update_state(state='PROGRESS', meta={'msg': str(line)})
     return 999
 
 @app.route("/workflow_progress")
 def workflow_progress():
-    print("WORKFLOW RETURN")
     jobid = request.values.get('jobid')
     print(jobid)
     if jobid:
@@ -137,8 +137,27 @@ def workflow_progress():
         print(job.state)
         if job.state == 'PROGRESS':
             return json.dumps(dict( state=job.state, msg=job.result['msg'],))
+        elif job.state == 'SUCCESS':
+            ## S3 Upload process START
+            output_counter = int(session.get('output_count', 0))
+            output_folder_list = [ session.get('output'+str(i), None) for i in range(1, output_counter+1)] # session keys are 'output1'..'outputN'
+            logID = session.get('logID', None)
+            bucket_name = 'openkbc-ms-result-bucket' # fixed bucket
+            #bucket_dest = 's3://'+bucket_name+"/"+logID+"/"
+
+            s3 = boto3.client('s3') # Client set, S3
+            for path in output_folder_list:
+                filelist = glob.glob(path+"/*") # search all files
+                for fname in filelist: # get name
+                    with open(fname, "rb") as f:
+                        s3.upload_fileobj(f, bucket_name, logID+"/"+os.path.basename(fname)) # upload to s3
+            ## S3 Upload process END
+            return json.dumps(dict( state=job.state, msg="done",))
+
+        elif job.state == 'FAILURE':
+            return json.dumps(dict( state=job.state, msg="failure",))
     ## return somewhere to exit
     return '{}'
 
 @app.route("/status")
@@ -190,13 +209,26 @@ def _reform_yamlFile(selected_pipeline, data_dict):
     f = open(yamlFileName, "w") # write file with unique name
 
     nested_items = [] # List for handing nested items
+    output_count=0 # Output key count(Tracking purpose)
     for key, value in data_dict.items():
         if key.find('--')>-1: # Nested key has '--'
             subkeys = key.split('--')# 2 layers keys
             nested_items.append([subkeys[0],subkeys[1],value]) #make list
         else:
-            f.write(key+": "+value+"\n")
+            ## Tracking output path and user ID
+            if key.find("Output") > -1 or key.find("output") > -1: ## key has 'output' string
+                output_count+=1
+                session['output'+str(output_count)]=value # set session for output folder (Tracking purpose)
+                session['output_count'] = output_count # set session for output counter (Tracking purpose)
+
+            if key.find('logID') > -1: # Find log ID
+                session['logID'] = value # set session for ID
+            ## Tracking output path and user ID
+            f.write(key+": "+value+"\n") ## Write new form of yaml
+
+            ### Add error handling here
+            ### Add error handling here
 
     key1_unique=list(set([x[0] for x in nested_items])) # make a list of root key
     for x in key1_unique:
         f.write(x+":"+"\n") # first line of nested key (root key)
@@ -206,6 +238,10 @@
     f.close()
     return yamlFileName
-
+
+def get_filenames(path):
+    filelist = glob.glob(path+"/*")
+    return filelist
+
 if __name__ == '__main__':
     app.run(host='0.0.0.0')
\ No newline at end of file
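The SUCCESS branch added above globs every output folder tracked in the session and uploads each file to the fixed result bucket under a logID/ prefix. Stripped of the Flask session bookkeeping, the upload flow amounts to the following standalone sketch (folder and logID values are illustrative):

    import glob
    import os

    import boto3

    def upload_outputs(output_folders, log_id, bucket="openkbc-ms-result-bucket"):
        """Upload every file in the given folders to s3://<bucket>/<log_id>/."""
        s3 = boto3.client("s3")
        for folder in output_folders:
            for fname in glob.glob(os.path.join(folder, "*")):
                with open(fname, "rb") as f:
                    s3.upload_fileobj(f, bucket, log_id + "/" + os.path.basename(fname))

    # e.g. after a successful DEG run:
    # upload_outputs(["./OutputTest/"], "user1")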

diff --git a/pipelines/pipeline_controller/requirements.txt b/pipelines/pipeline_controller/requirements.txt
index 4bc67c0..4c509de 100644
--- a/pipelines/pipeline_controller/requirements.txt
+++ b/pipelines/pipeline_controller/requirements.txt
@@ -8,6 +8,8 @@ Flask-Bootstrap==3.3.7.1
 flask-nav==0.6
 celery==5.1.2
 redis==3.5.3
+boto3==1.18.54
+awscli==1.20.54
 ##deg requirements
 pip==21.2.2
 pandas==1.3.2
diff --git a/pipelines/pipeline_controller/templates/progress.html b/pipelines/pipeline_controller/templates/progress.html
index 3f99f8b..dea87c5 100644
--- a/pipelines/pipeline_controller/templates/progress.html
+++ b/pipelines/pipeline_controller/templates/progress.html
@@ -22,7 +22,8 @@ Workflow controller
             $("#pct").html("Workflow has been completed");
             if(resp.msg == 'done') {
                 return;
-            } else{
+            }
+            else{
                 $("#pct").html("");
                 setTimeout(poll, 1000.0);
             }