Merge pull request #17 from OpenKBC/engineering_dev

Added activate score and minor bug fixes, confirmed
OpenKBC · Sep 17, 2021 · 33901d0 · 33901d0
2 parents 9a96f06 + 6f6caf0
commit 33901d0
Show file tree

Hide file tree

Showing 7 changed files with 90 additions and 26 deletions.
diff --git a/R/IDconverter.R b/R/IDconverter.R
@@ -16,7 +16,7 @@ names(data) <- sub("^X", "", names(data)) # drop "X" string in columns name
 
 ### Warning ###
 # Entrez ID might duplicate for Ensemble ID
-data$entrez = mapIds(org.Hs.eg.db,  keys=row.names(data), column="ENTREZID", keytype="ENSEMBL", multiVals="first")
-row.names(data)<-make.names(data$entrez, unique=TRUE)
+entrez = mapIds(org.Hs.eg.db,  keys=row.names(data), column="ENTREZID", keytype="ENSEMBL", multiVals="first")
+row.names(data)<-make.names(entrez, unique=TRUE)
 row.names(data) <- sub("^X", "", row.names(data)) # drop "X" string in index name
 write.table(data, outputFile, sep=',', row.names = TRUE, col.names = TRUE) # Write result
diff --git a/notebook/notebook_lib/__init__.py b/notebook/notebook_lib/__init__.py
diff --git a/notebook/notebook_lib/gene_zscore/standard_gzscore.py b/notebook/notebook_lib/gene_zscore/standard_gzscore.py
@@ -0,0 +1,33 @@
+__author__ = "Junhee Yoon"
+__version__ = "1.0.0"
+__maintainer__ = "Junhee Yoon"
+__email__ = "[email protected]"
+
+"""
+Reference: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2006-7-10-r93
+Description: calculating activation score by using z score
+"""
+
+import pandas as pd
+import numpy as np
+
+class calculator(object):
+
+    def __init__(self, df):
+        if df.empty:
+            raise ValueError("Input Dataframe is empty, please try with different one.")
+        else:
+            self.df = df
+
+    # function structure
+    def gs_zscore(self, names='Zscore', gene_set=[]):
+        zscore=[]
+        arr1_index = self.df.index.tolist()
+        inter = list(set(arr1_index).intersection(gene_set))
+
+        diff_mean = self.df.loc[inter].mean(axis=0).subtract(self.df.mean(axis=0))
+        len_norm = self.df.std(ddof=1, axis=0).apply(lambda x: np.sqrt(len(inter))/x)
+        zscore = diff_mean*len_norm
+        zscore = zscore.to_frame()
+        zscore.columns = [names]
+        return zscore
diff --git a/notebook/notebook_lib/gene_zscore/threaded_gzscore.py b/notebook/notebook_lib/gene_zscore/threaded_gzscore.py
@@ -4,6 +4,7 @@
 __email__ = "[email protected]"
 
 """
+EXPERIMENTAL CODE
 Manual: https://github.com/swiri021/Threaded_gsZscore
 Reference: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2006-7-10-r93
 Description: calculating activation score by using threaded z score
@@ -13,6 +14,7 @@
 import threading
 import functools
 import itertools
+import math
 
 class funcThread(object):
     def __init__(self):
@@ -29,6 +31,9 @@ def run(*args, **kwargs):
             ####Divide Samples by number of threads
             i_col = len(args[1].columns.tolist())
             contents_numb = i_col/kwargs['nthread']
+            #contents_numb = math.ceil(contents_numb)
+            contents_numb = round(contents_numb) # round for matching thread number
+
             split_columns = [args[1].columns.tolist()[i:i+contents_numb] for i in range(0, len(args[1].columns.tolist()), contents_numb)]
             if len(split_columns)>kwargs['nthread']:
                 split_columns = split_columns[:kwargs['nthread']-1] + [list(itertools.chain(*split_columns[kwargs['nthread']-1:]))]
@@ -37,7 +42,7 @@ def run(*args, **kwargs):
 
             ####Running threads
             for i, item in enumerate(split_columns):
-                threads[i] = threading.Thread(target = func, args=(args[0], args[1].ix[:,item], container, i), kwargs=kwargs)
+                threads[i] = threading.Thread(target = func, args=(args[0], args[1][item], container, i), kwargs=kwargs)
                 threads[i].start()
             for i in range(len(threads)):
                 threads[i].join()
@@ -56,7 +61,7 @@ def __init__(self, df):
             self.df = df
 
     # Wrapper for controlling Threads
-    def gs_zscore(self, nthread=5, gene_set=[]):
+    def gs_zscore(self, nthread=4, gene_set=[]):
         arr1 = self.df
         container = None
         i = None
@@ -66,7 +71,7 @@ def gs_zscore(self, nthread=5, gene_set=[]):
     # function structure
     # args(input, container, thread_index , **kwargs)
     @funcThread()
-    def _calculating(self, arr1, container, i, nthread=5, gene_set=[]):
+    def _calculating(self, arr1, container, i, nthread=4, gene_set=[]):
         zscore=[]
         arr1_index = arr1.index.tolist()
         inter = list(set(arr1_index).intersection(gene_set))

diff --git a/utils/cleanup_normalized_matrix.py b/utils/cleanup_normalized_matrix.py
@@ -10,7 +10,7 @@
 
 import argparse
 import pandas as pd
-import os
+import numpy as np
 
 parser = argparse.ArgumentParser(prog='cleanup_normalized_matrix.py')
 # Input data
@@ -19,6 +19,9 @@
 # Output path
 parser.add_argument('-o','--output', type=str, dest='output', required=True,\
      help='Output file name including path')
+
+parser.add_argument('-v','--vst', dest='vst', action='store_true',default=False,\
+     help='Input data is vst normalized or not, default = False')
 args = parser.parse_args()
 
 if __name__ == "__main__":
@@ -35,6 +38,9 @@
     df.columns = [x.split(".")[0] for x in df.columns.tolist()] # New column names
     df = df[~df.index.duplicated(keep='first')] # Taking first values in duplicated index
 
+    if args.vst==False:
+        df=df.applymap(lambda x: np.log2(x+1)) # Apply log2 for non-vst normalized data
+
     ## Need to add file name handler
     if '.csv' in args.input_df:
         df.to_csv(args.output)

diff --git a/utils/convert_ensembl_to_entrez.py b/utils/convert_ensembl_to_entrez.py
diff --git a/utils/get_all_acitivation_scores.py b/utils/get_all_acitivation_scores.py
@@ -1,15 +1,50 @@
-import glob
-import os
+__author__ = "Junhee Yoon"
+__version__ = "1.0.0"
+__maintainer__ = "Junhee Yoon"
+__email__ = "[email protected]"
+
+"""
+Description: This code generates activation scores by using MsigDB. This code needs expression matrix by EntrezID index
+"""
+
+## Move to setting.env in the future
+MSIGDB_PATH = "/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/data/MsigDB_list/msigdb.v7.4.entrez.gmt"
+
+## sys import path for library calling
+import sys; sys.path.append('/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/notebook/notebook_lib');
 import argparse
 import pandas as pd
+from gene_zscore.standard_gzscore import calculator
 
 parser = argparse.ArgumentParser(prog='get_all_activattion_scores.py')
 # Input data
 parser.add_argument('-i','--input', type=str, dest='input_df', required=True,\
      help='Input data matrix')
 # Output path
-parser.add_argument('-o','--output', type=str, dest='filepath', required=True, default='./',\
-     help='Output directory')
+parser.add_argument('-o','--output', type=str, dest='output', required=True, default='./',\
+     help='Output file')
 args = parser.parse_args()
 
-MSIGDB_PATH = "data/MsigDB_list/msigdb.v7.4.entrez.gmt"
+if __name__ == "__main__":
+     # .gmt parsing
+     count = 0
+     gmt_arr = [] # gmt parsing array
+     with open(MSIGDB_PATH, 'r') as infile:
+        for line in infile:
+             gmt_value = line.strip().split("\t") # splitting line
+             sig_names = gmt_value[0] # signature name
+             gene_list = gmt_value[2:] # gene list
+             gmt_arr.append([sig_names]+gene_list)
+
+     # Sample loading, and some entrezIDs are duplicated in the matrix
+     gexpr = pd.read_csv(args.input_df, index_col=0)
+     gexpr.index = [x.split(".")[0] for x in gexpr.index.tolist()] # remove effect from R make.names
+     gexpr = gexpr.groupby(gexpr.index).max() # keeping max for duplicated index
+
+     zscore_arr = [] # result array
+     zscore_calculator = calculator(gexpr) # Set activation calculator
+     for sig in gmt_arr:
+          zscore_value = zscore_calculator.gs_zscore(names=sig[0], gene_set=sig[1:]) # using standard, threaded version has an error
+          zscore_arr.append(zscore_value)
+     zscore_df = pd.concat(zscore_arr, axis=1) # make dataframe
+     zscore_df.to_csv(args.output)