diff --git a/R/IDconverter.R b/R/IDconverter.R index 2d1d1c4..4d3b2da 100644 --- a/R/IDconverter.R +++ b/R/IDconverter.R @@ -16,7 +16,7 @@ names(data) <- sub("^X", "", names(data)) # drop "X" string in columns name ### Warning ### # Entrez ID might duplicate for Ensemble ID -data$entrez = mapIds(org.Hs.eg.db, keys=row.names(data), column="ENTREZID", keytype="ENSEMBL", multiVals="first") -row.names(data)<-make.names(data$entrez, unique=TRUE) +entrez = mapIds(org.Hs.eg.db, keys=row.names(data), column="ENTREZID", keytype="ENSEMBL", multiVals="first") +row.names(data)<-make.names(entrez, unique=TRUE) row.names(data) <- sub("^X", "", row.names(data)) # drop "X" string in index name write.table(data, outputFile, sep=',', row.names = TRUE, col.names = TRUE) # Write result \ No newline at end of file diff --git a/notebook/notebook_lib/__init__.py b/notebook/notebook_lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/notebook/notebook_lib/gene_zscore/standard_gzscore.py b/notebook/notebook_lib/gene_zscore/standard_gzscore.py new file mode 100644 index 0000000..18f4555 --- /dev/null +++ b/notebook/notebook_lib/gene_zscore/standard_gzscore.py @@ -0,0 +1,33 @@ +__author__ = "Junhee Yoon" +__version__ = "1.0.0" +__maintainer__ = "Junhee Yoon" +__email__ = "swiri021@gmail.com" + +""" +Reference: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2006-7-10-r93 +Description: calculating activation score by using z score +""" + +import pandas as pd +import numpy as np + +class calculator(object): + + def __init__(self, df): + if df.empty: + raise ValueError("Input Dataframe is empty, please try with different one.") + else: + self.df = df + + # function structure + def gs_zscore(self, names='Zscore', gene_set=[]): + zscore=[] + arr1_index = self.df.index.tolist() + inter = list(set(arr1_index).intersection(gene_set)) + + diff_mean = self.df.loc[inter].mean(axis=0).subtract(self.df.mean(axis=0)) + len_norm = self.df.std(ddof=1, axis=0).apply(lambda x: np.sqrt(len(inter))/x) + zscore = diff_mean*len_norm + zscore = zscore.to_frame() + zscore.columns = [names] + return zscore \ No newline at end of file diff --git a/notebook/notebook_lib/gene_zscore/threaded_gzscore.py b/notebook/notebook_lib/gene_zscore/threaded_gzscore.py index fad581a..9ef3a4b 100644 --- a/notebook/notebook_lib/gene_zscore/threaded_gzscore.py +++ b/notebook/notebook_lib/gene_zscore/threaded_gzscore.py @@ -4,6 +4,7 @@ __email__ = "swiri021@gmail.com" """ +EXPERIMENTAL CODE Manual: https://github.com/swiri021/Threaded_gsZscore Reference: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2006-7-10-r93 Description: calculating activation score by using threaded z score @@ -13,6 +14,7 @@ import threading import functools import itertools +import math class funcThread(object): def __init__(self): @@ -29,6 +31,9 @@ def run(*args, **kwargs): ####Divide Samples by number of threads i_col = len(args[1].columns.tolist()) contents_numb = i_col/kwargs['nthread'] + #contents_numb = math.ceil(contents_numb) + contents_numb = round(contents_numb) # round for matching thread number + split_columns = [args[1].columns.tolist()[i:i+contents_numb] for i in range(0, len(args[1].columns.tolist()), contents_numb)] if len(split_columns)>kwargs['nthread']: split_columns = split_columns[:kwargs['nthread']-1] + [list(itertools.chain(*split_columns[kwargs['nthread']-1:]))] @@ -37,7 +42,7 @@ def run(*args, **kwargs): ####Running threads for i, item in enumerate(split_columns): - threads[i] = threading.Thread(target = func, args=(args[0], args[1].ix[:,item], container, i), kwargs=kwargs) + threads[i] = threading.Thread(target = func, args=(args[0], args[1][item], container, i), kwargs=kwargs) threads[i].start() for i in range(len(threads)): threads[i].join() @@ -56,7 +61,7 @@ def __init__(self, df): self.df = df # Wrapper for controlling Threads - def gs_zscore(self, nthread=5, gene_set=[]): + def gs_zscore(self, nthread=4, gene_set=[]): arr1 = self.df container = None i = None @@ -66,7 +71,7 @@ def gs_zscore(self, nthread=5, gene_set=[]): # function structure # args(input, container, thread_index , **kwargs) @funcThread() - def _calculating(self, arr1, container, i, nthread=5, gene_set=[]): + def _calculating(self, arr1, container, i, nthread=4, gene_set=[]): zscore=[] arr1_index = arr1.index.tolist() inter = list(set(arr1_index).intersection(gene_set)) diff --git a/utils/cleanup_normalized_matrix.py b/utils/cleanup_normalized_matrix.py index e098fdb..7677714 100644 --- a/utils/cleanup_normalized_matrix.py +++ b/utils/cleanup_normalized_matrix.py @@ -10,7 +10,7 @@ import argparse import pandas as pd -import os +import numpy as np parser = argparse.ArgumentParser(prog='cleanup_normalized_matrix.py') # Input data @@ -19,6 +19,9 @@ # Output path parser.add_argument('-o','--output', type=str, dest='output', required=True,\ help='Output file name including path') + +parser.add_argument('-v','--vst', dest='vst', action='store_true',default=False,\ + help='Input data is vst normalized or not, default = False') args = parser.parse_args() if __name__ == "__main__": @@ -35,6 +38,9 @@ df.columns = [x.split(".")[0] for x in df.columns.tolist()] # New column names df = df[~df.index.duplicated(keep='first')] # Taking first values in duplicated index + if args.vst==False: + df=df.applymap(lambda x: np.log2(x+1)) # Apply log2 for non-vst normalized data + ## Need to add file name handler if '.csv' in args.input_df: df.to_csv(args.output) diff --git a/utils/convert_ensembl_to_entrez.py b/utils/convert_ensembl_to_entrez.py deleted file mode 100644 index 35eb55b..0000000 --- a/utils/convert_ensembl_to_entrez.py +++ /dev/null @@ -1,15 +0,0 @@ -import glob -import os -import argparse -import pandas as pd - -parser = argparse.ArgumentParser(prog='get_all_activattion_scores.py') -# Input data -parser.add_argument('-i','--input', type=str, dest='input_df', required=True,\ - help='Input data matrix') -# Output path -parser.add_argument('-o','--output', type=str, dest='filepath', required=True, default='./',\ - help='Output directory') -args = parser.parse_args() - -CONVERTING_FILE_PATH = "data/MsigDB_list/msigdb.v7.4.entrez.gmt" diff --git a/utils/get_all_acitivation_scores.py b/utils/get_all_acitivation_scores.py index 62af4bd..1686d77 100644 --- a/utils/get_all_acitivation_scores.py +++ b/utils/get_all_acitivation_scores.py @@ -1,15 +1,50 @@ -import glob -import os +__author__ = "Junhee Yoon" +__version__ = "1.0.0" +__maintainer__ = "Junhee Yoon" +__email__ = "swiri021@gmail.com" + +""" +Description: This code generates activation scores by using MsigDB. This code needs expression matrix by EntrezID index +""" + +## Move to setting.env in the future +MSIGDB_PATH = "/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/data/MsigDB_list/msigdb.v7.4.entrez.gmt" + +## sys import path for library calling +import sys; sys.path.append('/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/notebook/notebook_lib'); import argparse import pandas as pd +from gene_zscore.standard_gzscore import calculator parser = argparse.ArgumentParser(prog='get_all_activattion_scores.py') # Input data parser.add_argument('-i','--input', type=str, dest='input_df', required=True,\ help='Input data matrix') # Output path -parser.add_argument('-o','--output', type=str, dest='filepath', required=True, default='./',\ - help='Output directory') +parser.add_argument('-o','--output', type=str, dest='output', required=True, default='./',\ + help='Output file') args = parser.parse_args() -MSIGDB_PATH = "data/MsigDB_list/msigdb.v7.4.entrez.gmt" +if __name__ == "__main__": + # .gmt parsing + count = 0 + gmt_arr = [] # gmt parsing array + with open(MSIGDB_PATH, 'r') as infile: + for line in infile: + gmt_value = line.strip().split("\t") # splitting line + sig_names = gmt_value[0] # signature name + gene_list = gmt_value[2:] # gene list + gmt_arr.append([sig_names]+gene_list) + + # Sample loading, and some entrezIDs are duplicated in the matrix + gexpr = pd.read_csv(args.input_df, index_col=0) + gexpr.index = [x.split(".")[0] for x in gexpr.index.tolist()] # remove effect from R make.names + gexpr = gexpr.groupby(gexpr.index).max() # keeping max for duplicated index + + zscore_arr = [] # result array + zscore_calculator = calculator(gexpr) # Set activation calculator + for sig in gmt_arr: + zscore_value = zscore_calculator.gs_zscore(names=sig[0], gene_set=sig[1:]) # using standard, threaded version has an error + zscore_arr.append(zscore_value) + zscore_df = pd.concat(zscore_arr, axis=1) # make dataframe + zscore_df.to_csv(args.output) \ No newline at end of file