Skip to content

Commit

Permalink
Merge pull request #17 from OpenKBC/engineering_dev
Browse files Browse the repository at this point in the history
Added activate score and minor bug fixes, confirmed
  • Loading branch information
swiri021 authored Sep 17, 2021
2 parents 9a96f06 + 6f6caf0 commit 33901d0
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 26 deletions.
4 changes: 2 additions & 2 deletions R/IDconverter.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ names(data) <- sub("^X", "", names(data)) # drop "X" string in columns name

### Warning ###
# Entrez IDs may be duplicated after mapping from Ensembl IDs
data$entrez = mapIds(org.Hs.eg.db, keys=row.names(data), column="ENTREZID", keytype="ENSEMBL", multiVals="first")
row.names(data)<-make.names(data$entrez, unique=TRUE)
entrez = mapIds(org.Hs.eg.db, keys=row.names(data), column="ENTREZID", keytype="ENSEMBL", multiVals="first")
row.names(data)<-make.names(entrez, unique=TRUE)
row.names(data) <- sub("^X", "", row.names(data)) # drop "X" string in index name
write.table(data, outputFile, sep=',', row.names = TRUE, col.names = TRUE) # Write result
Empty file.
33 changes: 33 additions & 0 deletions notebook/notebook_lib/gene_zscore/standard_gzscore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
__author__ = "Junhee Yoon"
__version__ = "1.0.0"
__maintainer__ = "Junhee Yoon"
__email__ = "[email protected]"

"""
Reference: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2006-7-10-r93
Description: calculating activation score by using z score
"""

import pandas as pd
import numpy as np

class calculator(object):
    """Compute gene-set activation scores as z scores.

    The wrapped DataFrame is indexed by gene label; one score is produced
    for every column of the DataFrame.
    """

    def __init__(self, df):
        """Store the expression matrix used by later score calculations.

        Args:
            df: pandas DataFrame whose index holds the gene labels.

        Raises:
            ValueError: if ``df`` is empty.
        """
        if df.empty:
            raise ValueError("Input Dataframe is empty, please try with different one.")
        self.df = df

    def gs_zscore(self, names='Zscore', gene_set=None):
        """Return the z score of ``gene_set`` for every column of the matrix.

        For each column the score is::

            (mean of gene-set rows - mean of all rows) * sqrt(k) / std of all rows

        where ``k`` is the number of distinct gene labels of ``gene_set``
        found in the index and the standard deviation uses ``ddof=1``.

        Args:
            names: label given to the single column of the result.
            gene_set: iterable of gene labels; labels missing from the index
                are ignored. ``None`` (the default) means an empty gene set.

        Returns:
            pandas DataFrame indexed by the columns of the matrix, with one
            column called ``names``. Every score is NaN when no label of
            ``gene_set`` is present in the index.
        """
        genes = set() if gene_set is None else set(gene_set)

        # Boolean row mask instead of list(set(...)): rows are selected in
        # index order, so the result does not depend on set iteration order.
        in_set = self.df.index.isin(list(genes))
        n_genes = self.df.index[in_set].nunique()  # distinct matched labels

        col_mean = self.df.mean(axis=0)
        col_std = self.df.std(ddof=1, axis=0)

        diff_mean = self.df.loc[in_set].mean(axis=0).subtract(col_mean)
        len_norm = np.sqrt(n_genes) / col_std  # vectorised over columns
        zscore = (diff_mean * len_norm).to_frame()
        zscore.columns = [names]
        return zscore
11 changes: 8 additions & 3 deletions notebook/notebook_lib/gene_zscore/threaded_gzscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
__email__ = "[email protected]"

"""
EXPERIMENTAL CODE
Manual: https://github.com/swiri021/Threaded_gsZscore
Reference: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2006-7-10-r93
Description: calculating activation score by using threaded z score
Expand All @@ -13,6 +14,7 @@
import threading
import functools
import itertools
import math

class funcThread(object):
def __init__(self):
Expand All @@ -29,6 +31,9 @@ def run(*args, **kwargs):
####Divide Samples by number of threads
i_col = len(args[1].columns.tolist())
contents_numb = i_col/kwargs['nthread']
#contents_numb = math.ceil(contents_numb)
contents_numb = round(contents_numb) # round for matching thread number

split_columns = [args[1].columns.tolist()[i:i+contents_numb] for i in range(0, len(args[1].columns.tolist()), contents_numb)]
if len(split_columns)>kwargs['nthread']:
split_columns = split_columns[:kwargs['nthread']-1] + [list(itertools.chain(*split_columns[kwargs['nthread']-1:]))]
Expand All @@ -37,7 +42,7 @@ def run(*args, **kwargs):

####Running threads
for i, item in enumerate(split_columns):
threads[i] = threading.Thread(target = func, args=(args[0], args[1].ix[:,item], container, i), kwargs=kwargs)
threads[i] = threading.Thread(target = func, args=(args[0], args[1][item], container, i), kwargs=kwargs)
threads[i].start()
for i in range(len(threads)):
threads[i].join()
Expand All @@ -56,7 +61,7 @@ def __init__(self, df):
self.df = df

# Wrapper for controlling Threads
def gs_zscore(self, nthread=5, gene_set=[]):
def gs_zscore(self, nthread=4, gene_set=[]):
arr1 = self.df
container = None
i = None
Expand All @@ -66,7 +71,7 @@ def gs_zscore(self, nthread=5, gene_set=[]):
# function structure
# args(input, container, thread_index , **kwargs)
@funcThread()
def _calculating(self, arr1, container, i, nthread=5, gene_set=[]):
def _calculating(self, arr1, container, i, nthread=4, gene_set=[]):
zscore=[]
arr1_index = arr1.index.tolist()
inter = list(set(arr1_index).intersection(gene_set))
Expand Down
8 changes: 7 additions & 1 deletion utils/cleanup_normalized_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import argparse
import pandas as pd
import os
import numpy as np

parser = argparse.ArgumentParser(prog='cleanup_normalized_matrix.py')
# Input data
Expand All @@ -19,6 +19,9 @@
# Output path
parser.add_argument('-o','--output', type=str, dest='output', required=True,\
help='Output file name including path')

parser.add_argument('-v','--vst', dest='vst', action='store_true',default=False,\
help='Input data is vst normalized or not, default = False')
args = parser.parse_args()

if __name__ == "__main__":
Expand All @@ -35,6 +38,9 @@
df.columns = [x.split(".")[0] for x in df.columns.tolist()] # New column names
df = df[~df.index.duplicated(keep='first')] # Taking first values in duplicated index

if args.vst==False:
df=df.applymap(lambda x: np.log2(x+1)) # Apply log2 for non-vst normalized data

## Need to add file name handler
if '.csv' in args.input_df:
df.to_csv(args.output)
Expand Down
15 changes: 0 additions & 15 deletions utils/convert_ensembl_to_entrez.py

This file was deleted.

45 changes: 40 additions & 5 deletions utils/get_all_acitivation_scores.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,50 @@
import glob
import os
__author__ = "Junhee Yoon"
__version__ = "1.0.0"
__maintainer__ = "Junhee Yoon"
__email__ = "[email protected]"

"""
Description: Generates activation scores for the gene sets in MSigDB. The input expression matrix must be indexed by Entrez ID.
"""

## Move to setting.env in the future
MSIGDB_PATH = "/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/data/MsigDB_list/msigdb.v7.4.entrez.gmt"

## sys import path for library calling
import sys; sys.path.append('/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/notebook/notebook_lib');
import argparse
import pandas as pd
from gene_zscore.standard_gzscore import calculator

parser = argparse.ArgumentParser(prog='get_all_activattion_scores.py')
# Input data
parser.add_argument('-i','--input', type=str, dest='input_df', required=True,\
help='Input data matrix')
# Output path
parser.add_argument('-o','--output', type=str, dest='filepath', required=True, default='./',\
help='Output directory')
parser.add_argument('-o','--output', type=str, dest='output', required=True, default='./',\
help='Output file')
args = parser.parse_args()

MSIGDB_PATH = "data/MsigDB_list/msigdb.v7.4.entrez.gmt"
if __name__ == "__main__":
    # Parse the .gmt file. Each line is tab-separated: the first field is the
    # signature name, the second field is not used here, and the remaining
    # fields are the genes of the signature.
    gmt_arr = []  # one [signature name, gene, gene, ...] list per signature
    with open(MSIGDB_PATH, 'r') as infile:
        for line in infile:
            gmt_value = line.strip().split("\t")
            if not gmt_value[0]:
                continue  # skip blank lines instead of creating an unnamed signature
            gmt_arr.append([gmt_value[0]] + gmt_value[2:])

    # Sample loading; some Entrez IDs are duplicated in the matrix.
    gexpr = pd.read_csv(args.input_df, index_col=0)
    # Keep only the text before the first "." (removes the effect of R
    # make.names). str() guards against labels that read_csv parsed as numbers.
    gexpr.index = [str(x).split(".")[0] for x in gexpr.index.tolist()]
    gexpr = gexpr.groupby(gexpr.index).max()  # keep the max for duplicated index labels

    zscore_calculator = calculator(gexpr)  # activation score calculator
    # Standard (non-threaded) calculator is used; the threaded version has an error.
    zscore_arr = [
        zscore_calculator.gs_zscore(names=sig[0], gene_set=sig[1:])
        for sig in gmt_arr
    ]
    zscore_df = pd.concat(zscore_arr, axis=1)  # one column per signature
    zscore_df.to_csv(args.output)

0 comments on commit 33901d0

Please sign in to comment.