Added parallelized job execution in Batch module #80

Merged (2 commits) on Oct 17, 2021
7 changes: 6 additions & 1 deletion aws_module/batch_deployment/README.md
@@ -1,5 +1,6 @@
## AWS module for running the project
* This module supports running the project code, pipelines, and analyses by launching AWS Batch. It is currently in the development phase and can only run limited code (Activation Score Calculation).
* Parallelized job execution requires input generated by the Lambda function; please run the lambda_deployment section first.

### Requirements on local PC
```
@@ -22,5 +23,9 @@ apt-get install awscli
```
* Then run the module
```
sh batch_module.sh
# Single job
sh batch_module_singleJob.sh

# Parallelized job
sh batch_module_parallel.sh
```
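
For orientation, a minimal sketch of the intended end-to-end order, using the module paths from this PR (the exact invocation is an assumption):

```
# Run the Lambda slicing step first, then the parallelized Batch step.
cd aws_module/lambda_deployment
sh lambda_module.sh            # slices the input matrix into the casting bucket

cd ../batch_deployment
sh batch_module_parallel.sh    # submits one Batch job per sliced file
```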
11 changes: 6 additions & 5 deletions aws_module/batch_deployment/batch_jobs/get_zscore.py
@@ -73,13 +73,14 @@ def uploadFile(bucketName, writeFileName, data):
### Get ENV variables
msigdbName = os.environ['msigdb'] # msigdb.v7.4.entrez.gmt
sampleName = os.environ['inputfile'] # counts_vst_CD4.converted.csv
mainDataBucket = os.environ['mainbucket'] # openkbc-ms-maindata-bucket
uploadDataBucket = os.environ['uploadbucket'] # openkbc-ms-casting-bucket

### Error handling here

### Data preparation
main_bucket = 'openkbc-ms-maindata-bucket'
MSIGDB_PATH = getFile(main_bucket, [msigdbName])
input_df = getFile(main_bucket, [sampleName])
MSIGDB_PATH = getFile('openkbc-ms-maindata-bucket', [msigdbName]) ## This is a FIXED parameter (the msigdb reference always lives in the main data bucket)
input_df = getFile(mainDataBucket, [sampleName])

### Actual job
# .gmt parsing
@@ -108,5 +109,5 @@ def uploadFile(bucketName, writeFileName, data):
zscore_df = pd.concat(zscore_arr, axis=1) # make dataframe

### Result upload
upload_bucket = 'openkbc-ms-casting-bucket'
uploadFile(upload_bucket, 'output.csv', zscore_df)
output_number = sampleName.split('.')[-2] # Input names always follow the same format (name.0.csv, name.1.csv, ...)
uploadFile(uploadDataBucket, 'output.'+output_number+'.csv', zscore_df)
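
The job script is driven entirely by environment variables, which is what lets the parallel module point each job at a different slice. A minimal local sketch (the slice name and bucket values are examples taken from this diff):

```
# Environment variables read by get_zscore.py (example values from this PR):
export msigdb="msigdb.v7.4.entrez.gmt"
export inputfile="counts_vst_CD4.converted.0.csv"   # one sliced input (name.0.csv, name.1.csv, ...)
export mainbucket="openkbc-ms-casting-bucket"       # parallel runs point this at the casting bucket
export uploadbucket="openkbc-ms-batchresult-bucket"

python get_zscore.py   # would upload output.0.csv to the upload bucket
```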
17 changes: 0 additions & 17 deletions aws_module/batch_deployment/batch_module.sh

This file was deleted.

42 changes: 42 additions & 0 deletions aws_module/batch_deployment/batch_module_parallel.sh
@@ -0,0 +1,42 @@
#!/bin/bash

## Job docker images must be pushed before running this module. This is an example of parallelized AWS Batch using a Lambda function.
## The Lambda function generates sliced expression matrices in the casting bucket, and Batch grabs those casting results as input.
## This could be replaced with an array job; the current version is controlled by env variables and creates a single job per input file.

dataBucket='openkbc-ms-casting-bucket'

mkdir -p logs/ json_setfiles/ # -p: do not fail if the directories already exist

echo "Creating compute environment.."
aws batch create-compute-environment --compute-environment-name activation-score-env \
--type MANAGED --compute-resources type=FARGATE,maxvCpus=4,securityGroupIds=sg-08946d1b26a30d376,subnets=[subnet-46231822,subnet-5c5f8b53]

sleep 5

echo "Creating job queue.."
aws batch create-job-queue --job-queue-name activation-score-queue --compute-environment-order order=1,computeEnvironment=activation-score-env --priority 100

# Get AWS s3 input list
inputList=($(aws s3 ls $dataBucket | awk '{print $4}'))

COUNTER=0 # counter
for filename in "${inputList[@]}"
do
## Create a job-definition JSON for this file (swap in the input file name and the input bucket name)
sed "/counts_vst_CD4.converted.csv/s/:.*/: \"${filename}\"/" container_configure.json | sed "/openkbc-ms-maindata-bucket/s/:.*/: \"${dataBucket}\"/" > json_setfiles/container_configure_${COUNTER}.json

## Job registering
echo "Creating $COUNTER -job.."
aws batch register-job-definition --job-definition-name activation-score-job_${COUNTER} --platform-capabilities FARGATE \
--type container --container-properties file://json_setfiles/container_configure_${COUNTER}.json

sleep 3

## Job submit
echo "$COUNTER -job Submit.."
aws batch submit-job --job-name activation-score-job_${COUNTER} --job-queue activation-score-queue --job-definition activation-score-job_${COUNTER} > logs/job.submitted_${COUNTER}
COUNTER=$((COUNTER + 1))
done
echo "Job submission has been completed.."
33 changes: 33 additions & 0 deletions aws_module/batch_deployment/batch_module_singleJob.sh
@@ -0,0 +1,33 @@
#!/bin/bash

## Job docker images must be pushed before running this module. This module is an example of how to run a single job on AWS Batch.
## It generates z-scores using the gene matrix in the main bucket; it is not parallelized.

echo "Creating compute environment.."
aws batch create-compute-environment --compute-environment-name activation-score-env \
--type MANAGED --compute-resources type=FARGATE,maxvCpus=4,securityGroupIds=sg-08946d1b26a30d376,subnets=[subnet-46231822,subnet-5c5f8b53]

echo "Creating job queue.."
aws batch create-job-queue --job-queue-name activation-score-queue --compute-environment-order order=1,computeEnvironment=activation-score-env --priority 100

echo "Creating job.."
aws batch register-job-definition --job-definition-name activation-score-job --platform-capabilities FARGATE \
--type container --container-properties file://container_configure.json

echo "Submit.."
aws batch submit-job --job-name activation-score-job --job-queue activation-score-queue --job-definition activation-score-job > job.submitted

jobID=$(jq -r '.jobId' job.submitted) # -r prints the raw string, so the double quotes are already stripped

## This loop polls the submitted job until it finishes; once the status reaches SUCCEEDED it exits automatically
while [ "$objectState" != "SUCCEEDED" ];do # poll job status
	sleep 1
	objStatuses=$(aws batch describe-jobs --jobs "$jobID")
	objectState=$(jq -r '.jobs | .[] | .status' <<< "${objStatuses}") # -r strips the double quotes
	echo "Job status: $objectState"
done
echo "Job has been completed.."
10 changes: 9 additions & 1 deletion aws_module/batch_deployment/container_configure.json
@@ -13,7 +13,7 @@

"resourceRequirements" : [
{
"value":"1",
"value":"2",
"type":"VCPU"
},
{
@@ -29,6 +29,14 @@
{
"name": "inputfile",
"value": "counts_vst_CD4.converted.csv"
},
{
"name": "mainbucket",
"value": "openkbc-ms-maindata-bucket"
},
{
"name": "uploadbucket",
"value": "openkbc-ms-batchresult-bucket"
}
]
}
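
Note that Fargate only accepts specific vCPU/memory pairings (2 vCPU pairs with 4-16 GB), so the memory value must match the bumped vCPU count. A sketch for checking what actually got registered (job definition name taken from the scripts above):

```
# Inspect the latest registered revision's resource requirements (requires jq):
aws batch describe-job-definitions --job-definition-name activation-score-job \
    | jq '.jobDefinitions[-1].containerProperties.resourceRequirements'
```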
21 changes: 21 additions & 0 deletions aws_module/lambda_deployment/lambda_functions/initial_lambda.log
@@ -0,0 +1,21 @@
{
"FunctionName": "SpliceColumnFunction",
"FunctionArn": "arn:aws:lambda:us-east-1:601333025120:function:SpliceColumnFunction",
"Runtime": "python3.8",
"Role": "arn:aws:iam::601333025120:role/lambda-s3-access-role",
"Handler": "spliceColumns.lambda_handler",
"CodeSize": 1495,
"Description": "",
"Timeout": 120,
"MemorySize": 2000,
"LastModified": "2021-10-16T02:08:26.560+0000",
"CodeSha256": "CK7U2Vxd5lGjHA+mGJaA+usqmineU1OQlpK1CSgJkFY=",
"Version": "$LATEST",
"TracingConfig": {
"Mode": "PassThrough"
},
"RevisionId": "c8b6c575-d399-4ba1-8a3e-469f15fa977e",
"State": "Active",
"LastUpdateStatus": "Successful",
"PackageType": "Zip"
}
1 change: 1 addition & 0 deletions aws_module/lambda_deployment/lambda_functions/response.json
@@ -0,0 +1 @@
{"statusCode": 200, "body": "\"Success\""}
4 changes: 3 additions & 1 deletion aws_module/lambda_deployment/lambda_module.sh
@@ -1,17 +1,19 @@
#!/bin/bash

## This module tests the Lambda function deployment

lambdaRole=arn:aws:iam::601333025120:role/lambda-s3-access-role

## zipping lambda code
#zip -r lambda_functions/spliceColumns.py.zip lambda_functions/spliceColumns.py

echo "Function create.."
## Lambda function create
aws lambda create-function --role $lambdaRole --memory-size 2000 \
--timeout 120 --runtime python3.8 --handler spliceColumns.lambda_handler \
--zip-file fileb://lambda_functions/spliceColumns.py.zip --function-name SpliceColumnFunction > lambda_functions/initial_lambda.log

echo "Invoke lambda function.."
## Invoke Lambda
aws lambda invoke --function-name SpliceColumnFunction lambda_functions/response.json
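
To confirm the invocation succeeded, the saved response can be inspected directly; per the response.json added in this PR, a successful run returns statusCode 200:

```
# Expect 200 on success (response.json is written by the invoke command above):
jq -r '.statusCode' lambda_functions/response.json
```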
