Skip to content

Commit e80a3e3

Browse files
authored
Merge pull request #52 from OpenKBC/engineering_dev
Connected pipelines to s3 bucket
2 parents 8f53fbd + 0fd8dae commit e80a3e3

File tree

8 files changed

+68
-14
lines changed

8 files changed

+68
-14
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
* Current members: Kicheol Kim, Junhee Yoon
44
* Please leave a message in the **Discussions** tab if you have any questions or requests
5-
* Please use docker image to analyze the data. AWS module is ready and Please ask to members for getting auth f AWS is needed to analze data.
5+
* Please use the docker image to analyze the data. The AWS module is ready; please ask members for authorization if AWS access is needed
66
* Our data is located in S3 bucket
77

88
### Goal

aws_module/ec2_deployment/aws_module.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,14 @@ echo "Cooling down starts. It takes more than 8 minutes.."
6969
## 7m, cooling down while AWS is loading and preparing resources
7070
sleep 500
7171

72+
## copy aws credential to ec2
73+
scp -i MSplatform-key.pem -o StrictHostKeyChecking=no $HOME/.aws/credentials ubuntu@$ip_addr:/home/ubuntu/.aws/
74+
7275
## Running installer
7376
ssh -i MSplatform-key.pem -o StrictHostKeyChecking=no ubuntu@$ip_addr 'bash -s' < utils/installer.sh
7477

7578
## Moving credentials to ec2 for s3 connection
76-
scp -i MSplatform-key.pem -o StrictHostKeyChecking=no credentials ubuntu@$ip_addr:/home/ubuntu/.aws
79+
#scp -i MSplatform-key.pem -o StrictHostKeyChecking=no credentials ubuntu@$ip_addr:/home/ubuntu/.aws
7780

7881
## S3 sync from S3 project bucket
7982
ssh -i MSplatform-key.pem -o StrictHostKeyChecking=no ubuntu@$ip_addr 'bash -s' < utils/s3Sync.sh

docker-compose.AWS.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ services:
2424
- /home/ubuntu/MSProject/multiple_sclerosis_proj/pipelines:/pipelines
2525
- /home/ubuntu/MSProject/multiple_sclerosis_proj/data:/MainData
2626
- /home/ubuntu/MSProject/multiple_sclerosis_proj/notebook/resultFiles:/Output
27+
- $HOME/.aws/credentials:/root/.aws/credentials:ro
2728
ports:
2829
- 80:5000
2930
depends_on:
@@ -46,6 +47,7 @@ services:
4647
- /home/ubuntu/MSProject/multiple_sclerosis_proj/pipelines:/pipelines
4748
- /home/ubuntu/MSProject/multiple_sclerosis_proj/data:/MainData
4849
- /home/ubuntu/MSProject/multiple_sclerosis_proj/resultFiles:/Output
50+
- $HOME/.aws/credentials:/root/.aws/credentials:ro
4951
working_dir: /pipelines/pipeline_controller/
5052
command: conda run -n pipeline_controller_base celery -A app.celery worker --loglevel=info
5153
depends_on:

docker-compose.yaml

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ services:
33
notebook: # Notebook
44
build:
55
context: ./notebook
6-
#image: swiri021/openkbc_msproject:notebookcontainer1
6+
image: swiri021/openkbc_msproject:notebookcontainer1
77
volumes:
88
- /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/notebook/notebook_lib:/home/jovyan/work/notebook_lib
99
- /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/notebook/notebook_utils:/home/jovyan/work/notebook_utils
@@ -17,15 +17,12 @@ services:
1717
pipelines: # Pipelines
1818
build:
1919
context: ./pipelines
20-
#image: swiri021/openkbc_msproject:pipelinecontainer1
21-
deploy:
22-
resources:
23-
limits:
24-
memory: 4000m
20+
image: swiri021/openkbc_msproject:pipelinecontainer1
2521
volumes:
2622
- /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines:/pipelines
2723
- /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/data:/MainData
2824
- /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/notebook/resultFiles:/Output
25+
- $HOME/.aws/credentials:/root/.aws/credentials:ro
2926
ports:
3027
- 80:5000
3128
depends_on:
@@ -44,11 +41,16 @@ services:
4441
celery: # celery
4542
build:
4643
context: ./pipelines
47-
#image: swiri021/openkbc_msproject:celerycontainer1
44+
image: swiri021/openkbc_msproject:celerycontainer1
45+
deploy:
46+
resources:
47+
limits:
48+
memory: 8000m
4849
volumes:
4950
- /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/pipelines:/pipelines
5051
- /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/data:/MainData
5152
- /Users/junheeyun/OpenKBC/multiple_sclerosis_proj/notebook/resultFiles:/Output
53+
- $HOME/.aws/credentials:/root/.aws/credentials:ro
5254
working_dir: /pipelines/pipeline_controller/
5355
command: conda run -n pipeline_controller_base celery -A app.celery worker --loglevel=info
5456
depends_on:
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
InputFolder: /MainData/rsem_counts
2+
MetaFile: /MainData/annotation_metadata/EPIC_HCvB_metadata_baseline_updated-share.csv
3+
SampleColumnName: HCVB_ID
4+
CondColumnName: DiseaseCourse
5+
Condition1: RR
6+
Condition2: CIS
7+
OutputFolder: ./OutputTest/
8+
logID: user1

pipelines/pipeline_controller/app.py

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
import uuid
1919
import os
2020
import subprocess
21+
import boto3
22+
import glob
2123

2224
# Custom form making
2325
from wtforms.validators import Required
@@ -123,22 +125,39 @@ def workflow_running(pipeline_path, yaml_file):
123125
line = proc.stdout.readline()
124126
if not line:
125127
break
126-
print(str(line))
127128
current_task.update_state(state='PROGRESS', meta={'msg': str(line)})
128129
return 999
129130

130131
@app.route("/workflow_progress")
131132
def workflow_progress():
132-
print("WORKFLOW RETURN")
133133
jobid = request.values.get('jobid')
134134
print(jobid)
135135
if jobid:
136136
job = AsyncResult(jobid, app=celery)
137137
print(job.state)
138138
if job.state == 'PROGRESS':
139139
return json.dumps(dict( state=job.state, msg=job.result['msg'],))
140+
140141
elif job.state == 'SUCCESS':
142+
## S3 Upload process START
143+
output_counter = int(session.get('output_count', None))
144+
output_folder_list = [ session.get('output'+str(i), None) for i in range(output_counter)]
145+
logID = session.get('logID', None)
146+
bucket_name = 'openkbc-ms-result-bucket' # fixed bucket
147+
#bucket_dest = 's3://'+bucket_name+"/"+logID+"/"
148+
149+
s3 = boto3.client('s3') # Client set, S3
150+
for path in output_folder_list:
151+
filelist = glob.glob(path+"/*") # search all files
152+
for fname in filelist: # get name
153+
with open(fname, "rb") as f:
154+
s3.upload_fileobj(f, bucket_name, logID+"/"+os.path.basename(fname)) # upload to s3
155+
## S3 Upload process END
156+
141157
return json.dumps(dict( state=job.state, msg="done",))
158+
159+
elif job.state == 'FAILURE':
160+
return json.dumps(dict( state=job.state, msg="failture",)) ## return somewhere to exit
142161
return '{}'
143162

144163
@app.route("/status")
@@ -190,13 +209,26 @@ def _reform_yamlFile(selected_pipeline, data_dict):
190209
f = open(yamlFileName, "w") # write file with unique name
191210

192211
nested_items = [] # List for handing nested items
212+
output_count=0 # Output key count(Tracking purpose)
193213
for key, value in data_dict.items():
194214
if key.find('--')>-1: # Nested key has '--'
195215
subkeys = key.split('--')# 2 layers keys
196216
nested_items.append([subkeys[0],subkeys[1],value]) #make list
197217
else:
198-
f.write(key+": "+value+"\n")
218+
## Tracking output path and user ID
219+
if key.find("Output") > -1 or key.find("output") > -1: ## key has 'output' string
220+
output_count+=1
221+
session['output'+str(output_count)]=value # set session for output folder (Tracking purpose)
222+
session['output_count'] = output_count # set session for output counter (Tracking purpose)
223+
224+
if key.find('logID') > -1: # Find log ID
225+
session['logID'] = value # set session for ID
226+
## Tracking output path and user ID
199227

228+
f.write(key+": "+value+"\n") ## Write new form of yaml
229+
230+
### Add error handling here
231+
### Add error handling here
200232
key1_unique=list(set([x[0] for x in nested_items])) # make a list of root key
201233
for x in key1_unique:
202234
f.write(x+":"+"\n") # first line of nested key (root key)
@@ -206,6 +238,10 @@ def _reform_yamlFile(selected_pipeline, data_dict):
206238

207239
f.close()
208240
return yamlFileName
209-
241+
242+
def get_filenames(path):
243+
filelist = glob.glob(path+"/*")
244+
return filelist
245+
210246
if __name__ == '__main__':
211247
app.run(host='0.0.0.0')

pipelines/pipeline_controller/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ Flask-Bootstrap==3.3.7.1
88
flask-nav==0.6
99
celery==5.1.2
1010
redis==3.5.3
11+
boto3==1.18.54
12+
awscli==1.20.54
1113
##deg requirements
1214
pip==21.2.2
1315
pandas==1.3.2

pipelines/pipeline_controller/templates/progress.html

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ <h3>Workflow controller</h3>
2222
$("#pct").html("<b>Workflow has been completed</b>");
2323
if(resp.msg == 'done') {
2424
return;
25-
} else{
25+
}
26+
else{
2627
$("#pct").html("<img src='/static/spinning-loading.gif'>");
2728
setTimeout(poll, 1000.0);
2829
}

0 commit comments

Comments
 (0)