diff --git a/aws_module/batch_deployment/README.md b/aws_module/batch_deployment/README.md
index 377433f..5d84179 100644
--- a/aws_module/batch_deployment/README.md
+++ b/aws_module/batch_deployment/README.md
@@ -1,5 +1,6 @@
 ## AWS module for running the project
 * This module supports to run the project codes, pipelines and analysis by launching AWS Batch. Currently, it is on development phase and this module can run with limited code (Activation Score Calculation).
+* Parallel job execution needs input files generated by the lambda function, so please run the lambda_deployment section first
 
 ### Requirements on local PC
 ```
@@ -22,5 +23,9 @@ apt-get install awscli
 ```
 * And run module
 ```
-sh batch_module.sh
+# Single job
+sh batch_module_singleJob.sh
+
+# Parallelized job
+sh batch_module_parallel.sh
 ```
\ No newline at end of file
diff --git a/aws_module/batch_deployment/batch_jobs/get_zscore.py b/aws_module/batch_deployment/batch_jobs/get_zscore.py
index 1e65b7f..67ccb89 100644
--- a/aws_module/batch_deployment/batch_jobs/get_zscore.py
+++ b/aws_module/batch_deployment/batch_jobs/get_zscore.py
@@ -73,13 +73,14 @@ def uploadFile(bucketName, writeFileName, data):
 ### Get ENV variables
 msigdbName = os.environ['msigdb'] # msigdb.v7.4.entrez.gmt
 sampleName = os.environ['inputfile'] # counts_vst_CD4.converted.csv
+mainDataBucket = os.environ['mainbucket'] # openkbc-ms-maindata-bucket
+uploadDataBucket = os.environ['uploadbucket'] # openkbc-ms-casting-bucket
 
 ### Error handling here
 
 ### Data prepration
-main_bucket = 'openkbc-ms-maindata-bucket'
-MSIGDB_PATH = getFile(main_bucket, [msigdbName])
-input_df = getFile(main_bucket, [sampleName])
+MSIGDB_PATH = getFile('openkbc-ms-maindata-bucket', [msigdbName]) ## This is a FIXED parameter
+input_df = getFile(mainDataBucket, [sampleName])
 
 ### Actual job
 # .gmt parsing
@@ -108,5 +109,5 @@ def uploadFile(bucketName, writeFileName, data):
 zscore_df = pd.concat(zscore_arr, axis=1) # make dataframe
 
 ### Result upload
-upload_bucket = 'openkbc-ms-casting-bucket'
-uploadFile(upload_bucket, 'output.csv', zscore_df)
\ No newline at end of file
+output_number = sampleName.split('.')[-2] # Input name format is always the same (name.0.csv, name.1.csv, ...)
+uploadFile(uploadDataBucket, 'output.' + output_number + '.csv', zscore_df)
\ No newline at end of file
diff --git a/aws_module/batch_deployment/batch_module.sh b/aws_module/batch_deployment/batch_module.sh
deleted file mode 100644
index 08045a3..0000000
--- a/aws_module/batch_deployment/batch_module.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-## Need to push job docker images before running this module
-
-echo "Creating compute environment.."
-aws batch create-compute-environment --compute-environment-name activation-score-env \
---type MANAGED --compute-resources type=FARGATE,maxvCpus=4,securityGroupIds=sg-08946d1b26a30d376,subnets=[subnet-46231822,subnet-5c5f8b53]
-
-echo "Creating job queue.."
-aws batch create-job-queue --job-queue-name activation-score-queueorder=1,computeEnvironment=activation-score-env --priority 100
-
-echo "Creating job.."
-aws batch register-job-definition --job-definition-name activation-score-job --platform-capabilities FARGATE \
---type container --container-properties file://container_configure.json
-
-echo "Submit.."
-aws batch submit-job --job-name activation-score-job --job-queue activation-score-queue --job-definition activation-score-job
\ No newline at end of file
diff --git a/aws_module/batch_deployment/batch_module_parallel.sh b/aws_module/batch_deployment/batch_module_parallel.sh
new file mode 100644
index 0000000..737793d
--- /dev/null
+++ b/aws_module/batch_deployment/batch_module_parallel.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+## Need to push the job docker images before running this module. This is an example of a parallelized AWS Batch run driven by the lambda function.
+## The lambda function writes sliced expression matrices into the casting bucket, and Batch picks those slices up as input.
+## This could be replaced with an array job; the current version is controlled by env variables and creates a single job per input file.
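+## For illustration only (hypothetical listing, inferred from the name.N.csv convention in get_zscore.py), the casting bucket would hold slices like:
+##   counts_vst_CD4.converted.0.csv
+##   counts_vst_CD4.converted.1.csv
+## and the loop below registers and submits one Batch job per slice.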
+
+dataBucket='openkbc-ms-casting-bucket'
+
+mkdir -p logs/
+mkdir -p json_setfiles/
+
+echo "Creating compute environment.."
+aws batch create-compute-environment --compute-environment-name activation-score-env \
+--type MANAGED --compute-resources type=FARGATE,maxvCpus=4,securityGroupIds=sg-08946d1b26a30d376,subnets=[subnet-46231822,subnet-5c5f8b53]
+
+sleep 5
+
+echo "Creating job queue.."
+aws batch create-job-queue --job-queue-name activation-score-queue --compute-environment-order order=1,computeEnvironment=activation-score-env --priority 100
+
+# Get AWS s3 input list
+inputList=($(aws s3 ls $dataBucket | awk '{print $4}'))
+
+COUNTER=0 # job index
+for filename in "${inputList[@]}"
+do
+    ## Create the JSON for job registering (swap in the input file name and the input bucket name)
+    sed "/counts_vst_CD4.converted.csv/s/:.*/: \"${filename}\"/" container_configure.json | sed "/openkbc-ms-maindata-bucket/s/:.*/: \"${dataBucket}\"/" > json_setfiles/container_configure_${COUNTER}.json
+
+    ## Job registering
+    echo "Creating job $COUNTER.."
+    aws batch register-job-definition --job-definition-name activation-score-job_${COUNTER} --platform-capabilities FARGATE \
+    --type container --container-properties file://json_setfiles/container_configure_${COUNTER}.json
+
+    sleep 3
+
+    ## Job submit
+    echo "Submitting job $COUNTER.."
+    aws batch submit-job --job-name activation-score-job_${COUNTER} --job-queue activation-score-queue --job-definition activation-score-job_${COUNTER} > logs/job.submitted_${COUNTER}
+    COUNTER=$((COUNTER + 1))
+done
+echo "Job submission has been completed.."
\ No newline at end of file
diff --git a/aws_module/batch_deployment/batch_module_singleJob.sh b/aws_module/batch_deployment/batch_module_singleJob.sh
new file mode 100644
index 0000000..d2f73cc
--- /dev/null
+++ b/aws_module/batch_deployment/batch_module_singleJob.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+## Need to push the job docker image before running this module. This module is an example of how to run a single job on AWS Batch.
+## It generates z-scores from the gene matrix in the main bucket; it is not parallelized.
+
+echo "Creating compute environment.."
+aws batch create-compute-environment --compute-environment-name activation-score-env \
+--type MANAGED --compute-resources type=FARGATE,maxvCpus=4,securityGroupIds=sg-08946d1b26a30d376,subnets=[subnet-46231822,subnet-5c5f8b53]
+
+echo "Creating job queue.."
+aws batch create-job-queue --job-queue-name activation-score-queue --compute-environment-order order=1,computeEnvironment=activation-score-env --priority 100
+
+echo "Creating job.."
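+## The job definition (container image, vCPU/memory, env variables) is read from container_configure.json in this directory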
+aws batch register-job-definition --job-definition-name activation-score-job --platform-capabilities FARGATE \
+--type container --container-properties file://container_configure.json
+
+echo "Submit.."
+aws batch submit-job --job-name activation-score-job --job-queue activation-score-queue --job-definition activation-score-job > job.submitted
+
+jobID=$(jq '.jobId' job.submitted)
+jobID="${jobID%\"}" # Remove trailing double quote from string
+jobID="${jobID#\"}" # Remove leading double quote from string
+
+## Poll the job status until it reaches a terminal state; the loop exits automatically once the job finishes
+while [ "$objectState" != "SUCCEEDED" ] && [ "$objectState" != "FAILED" ];do # job status polling
+    sleep 1
+    objStatuses=$(aws batch describe-jobs --jobs $jobID)
+    objectState=$( jq --jsonargs '.jobs | .[] | .status' <<< "${objStatuses}" )
+    objectState="${objectState%\"}" # Remove trailing double quote from string
+    objectState="${objectState#\"}" # Remove leading double quote from string
+    echo "Job status: $objectState "
+done
+echo "Job has been completed.."
\ No newline at end of file
diff --git a/aws_module/batch_deployment/container_configure.json b/aws_module/batch_deployment/container_configure.json
index e715a21..f013158 100644
--- a/aws_module/batch_deployment/container_configure.json
+++ b/aws_module/batch_deployment/container_configure.json
@@ -13,7 +13,7 @@
     "resourceRequirements" : [
         {
-            "value":"1",
+            "value":"2",
             "type":"VCPU"
        },
        {
@@ -29,6 +29,14 @@
        {
            "name": "inputfile",
            "value": "counts_vst_CD4.converted.csv"
+        },
+        {
+            "name": "mainbucket",
+            "value": "openkbc-ms-maindata-bucket"
+        },
+        {
+            "name": "uploadbucket",
+            "value": "openkbc-ms-batchresult-bucket"
        }
    ]
 }
\ No newline at end of file
diff --git a/aws_module/lambda_deployment/lambda_functions/initial_lambda.log b/aws_module/lambda_deployment/lambda_functions/initial_lambda.log
new file mode 100644
index 0000000..d8e33d7
--- /dev/null
+++ b/aws_module/lambda_deployment/lambda_functions/initial_lambda.log
@@ -0,0 +1,21 @@
+{
+    "FunctionName": "SpliceColumnFunction",
+    "FunctionArn": "arn:aws:lambda:us-east-1:601333025120:function:SpliceColumnFunction",
+    "Runtime": "python3.8",
+    "Role": "arn:aws:iam::601333025120:role/lambda-s3-access-role",
+    "Handler": "spliceColumns.lambda_handler",
+    "CodeSize": 1495,
+    "Description": "",
+    "Timeout": 120,
+    "MemorySize": 2000,
+    "LastModified": "2021-10-16T02:08:26.560+0000",
+    "CodeSha256": "CK7U2Vxd5lGjHA+mGJaA+usqmineU1OQlpK1CSgJkFY=",
+    "Version": "$LATEST",
+    "TracingConfig": {
+        "Mode": "PassThrough"
+    },
+    "RevisionId": "c8b6c575-d399-4ba1-8a3e-469f15fa977e",
+    "State": "Active",
+    "LastUpdateStatus": "Successful",
+    "PackageType": "Zip"
+}
diff --git a/aws_module/lambda_deployment/lambda_functions/response.json b/aws_module/lambda_deployment/lambda_functions/response.json
new file mode 100644
index 0000000..cddb495
--- /dev/null
+++ b/aws_module/lambda_deployment/lambda_functions/response.json
@@ -0,0 +1 @@
+{"statusCode": 200, "body": "\"Success\""}
\ No newline at end of file
diff --git a/aws_module/lambda_deployment/lambda_module.sh b/aws_module/lambda_deployment/lambda_module.sh
index c7debba..cb632cf 100644
--- a/aws_module/lambda_deployment/lambda_module.sh
+++ b/aws_module/lambda_deployment/lambda_module.sh
@@ -1,17 +1,22 @@
 #!/bin/bash
 
-##
+## This is a testing script for the lambda function
 
 lambdaRole=arn:aws:iam::601333025120:role/lambda-s3-access-role
 
 ## zipping lambda code
 #zip -r lambda_functions/spliceColumns.py.zip lambda_functions/spliceColumns.py
 
+echo "Function create.."
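+## Assumes lambda_functions/spliceColumns.py.zip has already been built with the zip step above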
 ## Lambda function create
 aws lambda create-function --role $lambdaRole --memory-size 2000 \
 --timeout 120 --runtime python3.8 --handler spliceColumns.lambda_handler \
 --zip-file fileb://lambda_functions/spliceColumns.py.zip --function-name SpliceColumnFunction > lambda_functions/initial_lambda.log
 
+echo "Invoke lambda function.."
 ## Invoke Lambda
 aws lambda invoke --function-name SpliceColumnFunction lambda_functions/response.json
 
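+## Optional sanity check (hypothetical step; assumes the invoke wrote {"statusCode": 200, ...} as in response.json above)
+cat lambda_functions/response.json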