-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #17 from OpenKBC/engineering_dev
Added activate score and minor bug fixes, confirmed
- Loading branch information
Showing
7 changed files
with
90 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
__author__ = "Junhee Yoon" | ||
__version__ = "1.0.0" | ||
__maintainer__ = "Junhee Yoon" | ||
__email__ = "[email protected]" | ||
|
||
""" | ||
Reference: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2006-7-10-r93 | ||
Description: calculates a gene-set activation score using a z-score | ||
""" | ||
|
||
import pandas as pd | ||
import numpy as np | ||
|
||
class calculator(object):
    """Compute gene-set activation scores (z-scores) from an expression DataFrame.

    Rows of the DataFrame are selected by index label against a gene set, and
    one score is produced per column.
    """

    def __init__(self, df):
        """Store the expression matrix used by ``gs_zscore``.

        Args:
            df: pandas DataFrame whose index labels are matched against gene sets.

        Raises:
            ValueError: if ``df`` is empty.
        """
        if df.empty:
            raise ValueError("Input Dataframe is empty, please try with different one.")
        self.df = df

    def gs_zscore(self, names='Zscore', gene_set=None):
        """Return the gene-set z-score of every column as a one-column DataFrame.

        For each column the score is
        ``(mean of gene-set rows - mean of all rows) * sqrt(k) / std(all rows, ddof=1)``,
        where ``k`` is the number of distinct gene-set labels found in the index.

        Args:
            names: label of the single output column.
            gene_set: iterable of index labels; ``None`` (the default) is
                treated as an empty set.

        Returns:
            DataFrame indexed by the input columns with one column ``names``.
            When no gene-set label is present in the index the scores are NaN.
        """
        # None replaces the former mutable ``[]`` default; behaviour is the same.
        if gene_set is None:
            gene_set = ()

        # Only labels that actually exist in the matrix contribute to the score.
        inter = list(set(self.df.index.tolist()).intersection(gene_set))

        diff_mean = self.df.loc[inter].mean(axis=0) - self.df.mean(axis=0)
        # Vectorised equivalent of applying sqrt(k)/x to every column's std.
        len_norm = np.sqrt(len(inter)) / self.df.std(ddof=1, axis=0)

        zscore = (diff_mean * len_norm).to_frame()
        zscore.columns = [names]
        return zscore
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
__email__ = "[email protected]" | ||
|
||
""" | ||
EXPERIMENTAL CODE | ||
Manual: https://github.com/swiri021/Threaded_gsZscore | ||
Reference: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2006-7-10-r93 | ||
Description: calculating activation score by using threaded z score | ||
|
@@ -13,6 +14,7 @@ | |
import threading | ||
import functools | ||
import itertools | ||
import math | ||
|
||
class funcThread(object): | ||
def __init__(self): | ||
|
@@ -29,6 +31,9 @@ def run(*args, **kwargs): | |
####Divide Samples by number of threads | ||
i_col = len(args[1].columns.tolist()) | ||
contents_numb = i_col/kwargs['nthread'] | ||
#contents_numb = math.ceil(contents_numb) | ||
contents_numb = round(contents_numb) # round for matching thread number | ||
|
||
split_columns = [args[1].columns.tolist()[i:i+contents_numb] for i in range(0, len(args[1].columns.tolist()), contents_numb)] | ||
if len(split_columns)>kwargs['nthread']: | ||
split_columns = split_columns[:kwargs['nthread']-1] + [list(itertools.chain(*split_columns[kwargs['nthread']-1:]))] | ||
|
@@ -37,7 +42,7 @@ def run(*args, **kwargs): | |
|
||
####Running threads | ||
for i, item in enumerate(split_columns): | ||
threads[i] = threading.Thread(target = func, args=(args[0], args[1].ix[:,item], container, i), kwargs=kwargs) | ||
threads[i] = threading.Thread(target = func, args=(args[0], args[1][item], container, i), kwargs=kwargs) | ||
threads[i].start() | ||
for i in range(len(threads)): | ||
threads[i].join() | ||
|
@@ -56,7 +61,7 @@ def __init__(self, df): | |
self.df = df | ||
|
||
# Wrapper for controlling Threads | ||
def gs_zscore(self, nthread=5, gene_set=[]): | ||
def gs_zscore(self, nthread=4, gene_set=[]): | ||
arr1 = self.df | ||
container = None | ||
i = None | ||
|
@@ -66,7 +71,7 @@ def gs_zscore(self, nthread=5, gene_set=[]): | |
# function structure | ||
# args(input, container, thread_index , **kwargs) | ||
@funcThread() | ||
def _calculating(self, arr1, container, i, nthread=5, gene_set=[]): | ||
def _calculating(self, arr1, container, i, nthread=4, gene_set=[]): | ||
zscore=[] | ||
arr1_index = arr1.index.tolist() | ||
inter = list(set(arr1_index).intersection(gene_set)) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,50 @@ | ||
import glob | ||
import os | ||
__author__ = "Junhee Yoon" | ||
__version__ = "1.0.0" | ||
__maintainer__ = "Junhee Yoon" | ||
__email__ = "[email protected]" | ||
|
||
""" | ||
Description: This script generates activation scores for MSigDB gene sets. It requires an expression matrix indexed by Entrez ID. | ||
""" | ||
|
||
## Move to setting.env in the future | ||
MSIGDB_PATH = "/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/data/MsigDB_list/msigdb.v7.4.entrez.gmt" | ||
|
||
## sys import path for library calling | ||
import sys; sys.path.append('/Users/junheeyun/OpenKBC/multiple_sclerosis_proj/notebook/notebook_lib'); | ||
import argparse | ||
import pandas as pd | ||
from gene_zscore.standard_gzscore import calculator | ||
|
||
# Command-line interface: input expression matrix and output file.
parser = argparse.ArgumentParser(prog='get_all_activattion_scores.py')
# Input data
parser.add_argument('-i', '--input', type=str, dest='input_df', required=True,
                    help='Input data matrix')
# Output file. '-o/--output' is registered exactly once: registering the same
# option strings twice makes argparse raise a conflicting-option error, and the
# script reads the value back as ``args.output``.
parser.add_argument('-o', '--output', type=str, dest='output', required=True, default='./',
                    help='Output file')
args = parser.parse_args()
|
||
MSIGDB_PATH = "data/MsigDB_list/msigdb.v7.4.entrez.gmt" | ||
if __name__ == "__main__":
    # Parse the .gmt file: one signature per line, tab-separated. Field 0 is the
    # signature name, field 1 is skipped, and fields 2+ are the gene IDs.
    gmt_arr = []  # each entry: [signature_name, gene, gene, ...]
    with open(MSIGDB_PATH, 'r') as infile:
        for line in infile:
            gmt_value = line.strip().split("\t")
            if not gmt_value[0]:
                continue  # blank line: no signature name to score
            gmt_arr.append([gmt_value[0]] + gmt_value[2:])

    # Sample loading; some Entrez IDs are duplicated in the matrix.
    gexpr = pd.read_csv(args.input_df, index_col=0)
    # Strip the ".N" suffix added by R's make.names. str() keeps this working
    # when read_csv infers a numeric index, and makes the labels comparable
    # with the string gene IDs read from the .gmt file.
    gexpr.index = [str(x).split(".")[0] for x in gexpr.index.tolist()]
    gexpr = gexpr.groupby(gexpr.index).max()  # keep the max for duplicated IDs

    # Standard (non-threaded) calculator; the original comment notes that the
    # threaded version has an error.
    zscore_calculator = calculator(gexpr)
    zscore_arr = [
        zscore_calculator.gs_zscore(names=sig[0], gene_set=sig[1:])
        for sig in gmt_arr
    ]
    zscore_df = pd.concat(zscore_arr, axis=1)  # one column per signature
    zscore_df.to_csv(args.output)