Merging from original engineering dev branch for feature extraction batch #96

Merged · 2 commits · Nov 3, 2021
2 changes: 2 additions & 0 deletions aws_module/batch_deployment/activation_score_batch/README.md
@@ -1,3 +1,5 @@
# Activation Score Calculation in AWS Batch

## AWS module for running the project
* This is the initial version of the DEG pipeline on AWS Batch; it provides the same functionality as the pipeline module in this project.
* To change the input, modify the JSON file (a minimal sketch follows below).
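A minimal sketch of such a modification, assuming the input is stored as an environment-style name/value entry in the module's JSON (the file name `job_definition.json` and the key `startFile` are illustrative, not confirmed for this module):

```python
import json

JOB_JSON = "job_definition.json"  # illustrative name; use the JSON file shipped with this module

with open(JOB_JSON) as fh:
    job = json.load(fh)

# Swap the input value in the name/value entries (structure assumed)
for entry in job.get("environment", []):
    if entry.get("name") == "startFile":  # hypothetical key for the input file
        entry["value"] = "counts_vst_CD8.csv"

with open(JOB_JSON, "w") as fh:
    json.dump(job, fh, indent=2)
```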
10 changes: 6 additions & 4 deletions aws_module/batch_deployment/deg_pipeline_batch/README.md
@@ -1,6 +1,8 @@
# DEG Pipeline in AWS Batch

## AWS module for running the project
* This module supports running the project code, pipelines, and analyses by launching AWS Batch. It is currently in the development phase and can run only limited code (Activation Score Calculation).
* Parallel job execution requires the Lambda function input; please use the lambda_deployment section first.
* This module supports getting sample DEGs under certain conditions. It uses AWS Batch and needs a separate run for each cell type's result (CD4, CD8, CD14).
* There is no parallel job for this; it is a sequential job type (see the submission sketch below).
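Since each cell type is submitted separately, the three runs can be scripted; below is a hedged sketch using boto3's Batch client (the job queue and job definition names are placeholders, not the project's actual configuration):

```python
import boto3

batch = boto3.client("batch")

JOB_QUEUE = "deg-pipeline-queue"        # placeholder
JOB_DEFINITION = "deg-pipeline-jobdef"  # placeholder

# One submission per cell type; each job runs sequentially on its own.
# The input JSON still needs to be adjusted per cell type as described above.
for cell in ["CD4", "CD8", "CD14"]:
    batch.submit_job(
        jobName=f"deg-pipeline-{cell}",
        jobQueue=JOB_QUEUE,
        jobDefinition=JOB_DEFINITION,
    )
```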

### Requirements on local PC
```
@@ -25,5 +27,5 @@ apt-get install jq
sh batch_module_singleJob.sh # For CD4 only
```

### Multiple Jobs Flow
![flow1](../../../README_resource/batch_detail2.png)
### Sequential Jobs Flow
![flow1](../../../README_resource/batch_detail2.png)
31 changes: 31 additions & 0 deletions aws_module/batch_deployment/feature_extraction_batch/README.md
@@ -0,0 +1,31 @@
# Feature Extraction Pipeline in AWS Batch

## AWS module for running the project
* This module supports extracting features under certain conditions. It uses AWS Batch and needs a separate run for each cell type's result (CD4, CD8, CD14).
* There is no parallel job for this; it is a sequential job type.

### Requirements on local PC
```
apt-get install awscli
apt-get install jq
```

### Usage on local PC
* To change the cell type (CD4, CD8, CD14) or category, replace the JSON file and run each separately (see the looping sketch below the run command)
```json
{
"name": "msigDBPATH",
"value": "msigdb.v7.4.entrez.gmt"
},
{
"name": "startFile",
"value": "counts_vst_CD4.csv" # Change here for input
},
```
* Then run the module:
```
sh batch_module_singleJob.sh # For CD4 only
```
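To run all three cell types back to back, the JSON can be rewritten before each invocation; a sketch assuming the JSON is a list of name/value entries as in the excerpt above and is named `job.json` (illustrative name):

```python
import json
import subprocess

JOB_JSON = "job.json"  # illustrative; use the JSON file shipped with this module

with open(JOB_JSON) as fh:
    entries = json.load(fh)

for cell in ["CD4", "CD8", "CD14"]:
    # Point startFile at this cell type's vst counts, then launch the batch job
    for entry in entries:
        if entry.get("name") == "startFile":
            entry["value"] = f"counts_vst_{cell}.csv"
    with open(JOB_JSON, "w") as fh:
        json.dump(entries, fh, indent=2)
    subprocess.run(["sh", "batch_module_singleJob.sh"], check=True)
```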

### Sequential Jobs Flow
![flow1](../../../README_resource/batch_detail2.png)
@@ -0,0 +1,12 @@
# Base image with conda preinstalled
FROM continuumio/miniconda

# Directories for input data and pipeline output
RUN mkdir /data
RUN mkdir /output

# Copy the pipeline code into the image
COPY . .

# Create the environment with the Python/R versions used by the pipeline
RUN conda create -n pipeline_controller_base python=3.8.2 R=3.6

# Run all following RUN commands inside the conda environment
SHELL ["conda", "run", "-n", "pipeline_controller_base", "/bin/bash", "-c"]

# Install Python and R dependencies, then make the controller script executable
RUN pip install -r requirements.txt
RUN Rscript installer_Rpackage.R
RUN chmod +x pipeline_controller.sh
@@ -0,0 +1,75 @@
##################################################################################
##### normalization using DESeq2 - get normalized and vst transformed counts #####
##################################################################################
### install DESeq2, tximport packages from Bioconductor
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager", repos='http://cran.us.r-project.org')
# BiocManager::install("DESeq2")
# BiocManager::install("tximport")

library(tidyverse)
library(DESeq2)
library(tximport)

# command-line arguments
args = commandArgs(trailingOnly=TRUE)
inputpath = args[1] # RSEM results directory (with path)
outputFile = args[2] # with path
metadataPath = args[3] # metadata file (with path)

data_path <- "../data/"
print(getwd())
setwd(data_path)

# loading metadata
metadata <- read_csv(metadataPath)
colnames(metadata)[1] <- "sampleID"

# loading rsem filelist
files <- list.files(path=inputpath, pattern="*.genes.results", recursive = TRUE, full.names = TRUE)

# create sample table
sampleTable <- data.frame(sampleID = str_extract(files, "\\d{5}\\w"), CellType = str_extract(files, "CD\\d{1,2}"), files = files) %>%
left_join(., metadata, by="sampleID")
row.names(sampleTable) <- paste(sampleTable$sampleID, sampleTable$CellType, sep=".")


# function for loading rsem counts and normalization
getNormCounts <- function(celltypes, files, sampleTable){
  rsem.gene.sub <- tximport(files[grep(celltypes, files)], type = "rsem", txIn = FALSE, txOut = FALSE)
  sampleTable.sub <- sampleTable %>% filter(files %in% files[grep(celltypes, files)]) %>%
    arrange(match(files, files[grep(celltypes, files)]))

  rsem.gene.sub$length[rsem.gene.sub$length == 0] <- 1
  cds.sub <- DESeqDataSetFromTximport(txi=rsem.gene.sub, colData=sampleTable.sub, design=~1)
  cds.sub <- cds.sub[ rowSums(counts(cds.sub)) > 0, ]
  cds.sub <- DESeq(cds.sub)
  return(cds.sub)
}

# run function
celltypes <- c("CD4","CD8","CD14")

deseq_obj <- list()
for (i in celltypes){
  deseq_obj[[i]] <- getNormCounts(i, files, sampleTable)
}


### save count matrix
# save DESeq2 object
save(deseq_obj, file = "DESeq2_object.RData")

# normalized counts
write.csv(counts(deseq_obj$CD4, normalized = TRUE), "counts_norm_CD4.csv")
write.csv(counts(deseq_obj$CD8, normalized = TRUE), "counts_norm_CD8.csv")
write.csv(counts(deseq_obj$CD14, normalized = TRUE), "counts_norm_CD14.csv")

# vst(variance stabilization transformation) transformed counts
write.csv(assay(vst(deseq_obj$CD4)), "counts_vst_CD4.csv")
write.csv(assay(vst(deseq_obj$CD8)), "counts_vst_CD8.csv")
write.csv(assay(vst(deseq_obj$CD14)), "counts_vst_CD14.csv")

# rlog transformed counts
write.csv(assay(rlog(deseq_obj$CD4)), "counts_rlog_CD4.csv")
write.csv(assay(rlog(deseq_obj$CD8)), "counts_rlog_CD8.csv")
write.csv(assay(rlog(deseq_obj$CD14)), "counts_rlog_CD14.csv")
@@ -0,0 +1,7 @@
### install DESeq2, tximport packages from Bioconductor
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager", repos='http://cran.us.r-project.org')
BiocManager::install("DESeq2")
BiocManager::install("tximport")
BiocManager::install("AnnotationDbi")
BiocManager::install("org.Hs.eg.db")
@@ -0,0 +1,59 @@
import boto3
import os


class botoHandler(object):

    def __init__(self, bucketName):
        s3 = boto3.resource('s3')  # s3 resource
        my_bucket = s3.Bucket(bucketName)  # Bucket set
        self.s3 = s3
        self.my_bucket = my_bucket

    def getDirFiles(self, dirname, destpath='/data/'):
        """
        Get all objects under a specific folder (prefix)
        input: folder name, output destination in container
        output: downloaded objects
        """
        for object_summary in self.my_bucket.objects.filter(Prefix=dirname):
            targetFile = destpath + os.path.basename(object_summary.key)
            self.my_bucket.download_file(object_summary.key, targetFile)

    # Data search
    def search_obj(self, bucketObj, searchList):
        """
        Search function for s3 objects
        input: s3 bucket object in boto3, list of items
        output: list of matching s3 object keys
        """
        result = []
        for target in searchList:
            for candidate in bucketObj.objects.all():
                if str(candidate.key).find(target) > -1:  # if target is found in the key
                    result.append(candidate.key)  # get key
        return result

    # Get data from S3
    def getFile(self, searchList, destpath='/data/'):
        """
        Download a file from S3
        input: list of search terms, container path for samples
        output: local path of the downloaded file
        """
        s3_list = self.search_obj(self.my_bucket, searchList)  # Search file object
        targetFile = destpath + searchList[0]
        self.my_bucket.download_file(s3_list[0], targetFile)

        return targetFile

    # Upload data to S3
    def uploadFile(self, writeFileName, data, datatype='pandas'):
        if datatype == 'pandas':
            self.my_bucket.put_object(Key=os.path.basename(writeFileName), Body=data.to_csv())  # Stream to S3
        elif datatype == 'txt':
            self.my_bucket.put_object(Key=os.path.basename(writeFileName), Body=data)  # Stream to S3
        else:
            raise ValueError('Error for upload data type definition')
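A short usage sketch for the handler above; the bucket name and file names are examples, and the import path depends on where this module lives in the repository:

```python
import pandas as pd
# from <module containing botoHandler> import botoHandler  # adjust to the actual module path

handler = botoHandler("example-project-bucket")            # placeholder bucket name
counts_path = handler.getFile(["counts_vst_CD4.csv"])      # downloads to /data/ and returns the local path
handler.getDirFiles("rsem_results/", destpath="/data/")    # downloads every object under the prefix

scores = pd.read_csv(counts_path, index_col=0)
handler.uploadFile("activation_score_CD4.csv", scores, datatype="pandas")  # streams the CSV back to the bucket
```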
@@ -0,0 +1,66 @@
import pandas as pd
import itertools


class handlers(object):

    @staticmethod
    def get_column(filename_with_path, ext_value, annot='gene_id', header_line=0, sep="\t"):
        """
        filename_with_path = filepath + basename
        ext_value = column name to extract from the file
        annot = column name used as the index
        sep = separator
        """
        # Don't use pandas.read_csv because of memory usage
        index_list = []
        value_list = []
        with open(filename_with_path, 'r') as infile:
            for i, line in enumerate(infile):
                line = line.strip()
                if i == header_line:  # found header
                    header_info = line.split(sep)
                    value_ext_location = header_info.index(ext_value)  # location of the value column
                    index_ext_location = header_info.index(annot)  # location of the index column

                elif i != header_line:
                    line_list = line.split(sep)
                    index_list.append(str(line_list[index_ext_location]))  # index list
                    value_list.append(float(line_list[value_ext_location]))  # value list

        result_df = pd.DataFrame(data={ext_value: value_list}, index=index_list)
        return result_df

    @staticmethod
    def get_samplename(filelist):
        """
        filelist = list of basenames
        Lambda function could be:
        _get_samplename = lambda filelist: [x.split("-")[0] for x in filelist]
        """
        sampleName = [x.split("-")[0] for x in filelist]
        return sampleName

    @staticmethod
    def get_condtionMatrix_by_category(dataframe, sampleColumn, dataColname, conditions: list):
        """
        Transform metadata into a DESeq condition matrix
        Input
            dataframe: metadata input
            sampleColumn: column name for the sample ID in the metadata input
            dataColname: column name for the category value in the metadata input
            conditions: selected conditions, list type with exactly 2 elements

        Output
            result dataframe with 2 columns (colnames: sampleID, conditions)
        """
        assert len(conditions) == 2, "Please make sure that the conditions list has 2 elements"

        sampleList = []  # empty list
        conditionValues = []
        for x in conditions:
            data = dataframe[dataframe[dataColname] == x][sampleColumn]  # get sample names
            sampleList.append(data.values.tolist())  # sampleID
            conditionValues.append([x] * len(data.values.tolist()))  # condition value

        sampleList = list(itertools.chain(*sampleList))  # flatten
        conditionValues = list(itertools.chain(*conditionValues))

        result = pd.DataFrame(data={'sampleID': sampleList, 'conditions': conditionValues}).set_index('sampleID')
        return result
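A hedged usage sketch for the helpers above (file names and the metadata column names are illustrative):

```python
import pandas as pd
# from <module containing handlers> import handlers  # adjust to the actual module path

# Extract one expression column from an RSEM result without loading the whole file into pandas
tpm = handlers.get_column("12345A-CD4.genes.results", ext_value="TPM", annot="gene_id")

# Derive sample IDs from RSEM basenames
samples = handlers.get_samplename(["12345A-CD4.genes.results", "12346B-CD4.genes.results"])

# Build a two-column DESeq condition matrix from metadata (column names assumed)
metadata = pd.read_csv("metadata.csv")
conditions = handlers.get_condtionMatrix_by_category(
    metadata,
    sampleColumn="sampleID",
    dataColname="DiseaseDuration",   # hypothetical category column
    conditions=["short", "long"],
)
```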
@@ -0,0 +1,33 @@
__author__ = "Junhee Yoon"
__version__ = "1.0.0"
__maintainer__ = "Junhee Yoon"
__email__ = "[email protected]"

"""
Reference: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2006-7-10-r93
Description: calculating activation score by using z score
"""

import pandas as pd
import numpy as np

class calculator(object):

    def __init__(self, df):
        if df.empty:
            raise ValueError("Input Dataframe is empty, please try with different one.")
        else:
            self.df = df

    # gene-set z-score against the background of all genes (see reference above)
    def gs_zscore(self, names='Zscore', gene_set=[]):
        zscore = []
        arr1_index = self.df.index.tolist()
        inter = list(set(arr1_index).intersection(gene_set))

        diff_mean = self.df.loc[inter].mean(axis=0).subtract(self.df.mean(axis=0))
        len_norm = self.df.std(ddof=1, axis=0).apply(lambda x: np.sqrt(len(inter)) / x)
        zscore = diff_mean * len_norm
        zscore = zscore.to_frame()
        zscore.columns = [names]
        return zscore
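A toy usage sketch for the calculator above (the expression matrix and gene set are invented):

```python
import pandas as pd
# from <module containing calculator> import calculator  # adjust to the actual module path

# genes x samples expression matrix (toy values)
expr = pd.DataFrame(
    {"sample1": [1.2, 3.4, 0.5, 2.2], "sample2": [2.1, 1.0, 0.7, 3.3]},
    index=["GENE1", "GENE2", "GENE3", "GENE4"],
)

calc = calculator(expr)
# Per-sample activation score for a small gene set, returned as a one-column DataFrame
scores = calc.gs_zscore(names="activation_score", gene_set=["GENE1", "GENE3"])
```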
@@ -0,0 +1,72 @@
__author__ = "Junhee Yoon"
__version__ = "1.0.0"
__maintainer__ = "Junhee Yoon"
__email__ = "[email protected]"

"""
Description: Repeative functions in notebook
"""
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

from scipy.stats import ranksums
import pandas as pd
import numpy as np

class StatHandler(object):
    """
    Statistics handlers
    """

    @staticmethod
    def calculate_ranksum(df, sampleList, controlList):
        """
        Rank-sum statistics wrapper following the notebook

        Input:
            df = sample dataframe
            sampleList = short disease duration
            controlList = long disease duration

        Output:
            dataframe of significant features (fold change and p-value)
        """
        significant_list = []
        for x in df.index.tolist():
            long_data = df[controlList].loc[x]  # long-duration expression values
            short_data = df[sampleList].loc[x]  # short-duration expression values

            s, p = ranksums(long_data.values.tolist(), short_data.values.tolist())  # rank-sum test
            fc = short_data.mean(skipna=True) - long_data.mean(skipna=True)  # fold change

            if p < 0.05:
                significant_list.append([x, fc, p])  # significant list

        sig_df = pd.DataFrame(significant_list, columns=["Names", "fc", "pval"])
        return sig_df

    @staticmethod
    def calculate_RFECV(df, X, y, rankthresh=10):
        ## Log function is needed here
        ## Reference:
        ## https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html

        estimator = SVC(kernel="linear")  # linear kernel
        min_features_to_select = 1
        rfecv = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(2),
                      scoring='accuracy', min_features_to_select=min_features_to_select)
        rfecv.fit(X, y)

        print("Optimal number of features : %d" % rfecv.n_features_)
        return np.where(rfecv.ranking_ <= int(rankthresh))


"""
# Muted visualization part
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(min_features_to_select, len(rfecv.grid_scores_) + min_features_to_select), rfecv.grid_scores_)
plt.show()
"""