-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #16 from OpenKBC/engineering_dev
New updates for engineering, confirmed
- Loading branch information
Showing
16 changed files
with
522 additions
and
161 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# ID converter from Ensembl ID to Entrez ID.
# Clean up the data (e.g. with the project's Python code) before running this script.
# Only CSV input is supported.
#
# Usage:
#   Rscript IDconverter.R path/counts_norm_CD8.csv path/outputfile.csv

library("AnnotationDbi")
library("org.Hs.eg.db")

args = commandArgs(trailingOnly=TRUE)
# Fail early with a usage message instead of an obscure read.table error
# when one or both paths are missing.
if (length(args) < 2) {
  stop("Usage: Rscript IDconverter.R <input.csv> <output.csv>", call.=FALSE)
}
inputFile = args[1] # with path
outputFile = args[2] # with path

# The first CSV column is used as row names; it is passed to mapIds below as ENSEMBL keys.
data <- read.table(inputFile, row.names=1, sep=',', header=TRUE) # Read data
# Drop a leading "X" from column names.
# NOTE(review): this also strips a genuine leading "X" from a sample name - confirm that is acceptable.
names(data) <- sub("^X", "", names(data))

### Warning ###
# Entrez IDs might be duplicated across Ensembl IDs; multiVals="first" keeps only the
# first mapping per key, and make.names(unique=TRUE) de-duplicates the resulting row names.
data$entrez = mapIds(org.Hs.eg.db, keys=row.names(data), column="ENTREZID", keytype="ENSEMBL", multiVals="first")
row.names(data) <- make.names(data$entrez, unique=TRUE)
row.names(data) <- sub("^X", "", row.names(data)) # drop "X" string in index name
write.table(data, outputFile, sep=',', row.names = TRUE, col.names = TRUE) # Write result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# R Utils for the project | ||
|
||
### Requirements | ||
```shell | ||
Rscript notebook/installers/installer_Rpackage.R | ||
``` | ||
|
||
#### 1. deseq2_normalizaiton.R | ||
This script is an example of how to produce a normalized matrix from raw files; it has no usage instructions. | ||
|
||
#### 2. IDconverter.R | ||
This script converts Ensembl IDs to Entrez IDs. The input must be cleaned up beforehand and must be in CSV format. | ||
|
||
**Example:** | ||
```shell | ||
Rscript IDconverter.R inputpath/input.csv outputpath/output.csv | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
feather-format==0.4.1 | ||
feather-format==0.4.1 | ||
scikit-learn==0.24.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
__author__ = "Junhee Yoon" | ||
__version__ = "1.0.0" | ||
__maintainer__ = "Junhee Yoon" | ||
__email__ = "[email protected]" | ||
|
||
""" | ||
Manual: https://github.com/swiri021/Threaded_gsZscore | ||
Reference: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2006-7-10-r93 | ||
Description: calculating activation score by using threaded z score | ||
""" | ||
import pandas as pd | ||
import numpy as np | ||
import threading | ||
import functools | ||
import itertools | ||
|
||
class funcThread(object):
    """Decorator that splits a DataFrame by columns and runs the target on threads.

    The decorated callable must have the shape
    ``func(owner, frame, container, index, **kwargs)`` and store its result in
    ``container[index]`` instead of returning it. The wrapper is called with the
    same positional layout (``args[0]`` is forwarded untouched, ``args[1]`` is
    the DataFrame to split) and must receive ``nthread`` as a keyword argument.
    """

    def __init__(self):
        print("Loaded Threads")

    def __call__(self, func):
        @functools.wraps(func)
        def run(*args, **kwargs):
            """Split ``args[1]`` column-wise, run ``func`` per chunk, concat results.

            Raises:
                KeyError: if ``nthread`` is not passed as a keyword argument.
                ValueError: if ``nthread`` is smaller than 1.
            """
            nthread = kwargs['nthread']
            if nthread < 1:
                raise ValueError("nthread must be a positive integer")
            print("Number of Threads : %d" % nthread)

            frame = args[1]
            columns = frame.columns.tolist()

            # Divide samples by number of threads. Floor division keeps the
            # chunk size an int (range() rejects floats), and the lower bound
            # of 1 covers frames with fewer columns than threads.
            chunk_size = max(len(columns) // nthread, 1)
            split_columns = [
                columns[start:start + chunk_size]
                for start in range(0, len(columns), chunk_size)
            ]
            if len(split_columns) > nthread:
                # Fold the leftover chunks into the last thread's share.
                split_columns = split_columns[:nthread - 1] + [
                    list(itertools.chain(*split_columns[nthread - 1:]))
                ]

            # One result slot per chunk actually created, so no slot or thread
            # entry is left as None when there are fewer chunks than threads.
            container = [None] * len(split_columns)
            threads = []

            # Running threads
            for i, item in enumerate(split_columns):
                worker = threading.Thread(
                    target=func,
                    # .loc replaces DataFrame.ix, which was removed in pandas 1.0
                    args=(args[0], frame.loc[:, item], container, i),
                    kwargs=kwargs,
                )
                worker.start()
                threads.append(worker)
            for worker in threads:
                worker.join()

            return pd.concat(container, axis=0)

        return run
|
||
|
||
class calculator(object):
    """Gene-set z-score calculator over the columns of a DataFrame."""

    def __init__(self, df):
        """Store ``df``; raise ValueError when it is empty."""
        if df.empty:
            raise ValueError("Input Dataframe is empty, please try with different one.")
        self.df = df

    # Wrapper for controlling Threads
    def gs_zscore(self, nthread=5, gene_set=[]):
        """Run the threaded calculation on the stored frame and return its result.

        ``container`` and the thread index are passed as None here; the
        ``funcThread`` wrapper supplies the real values for each thread.
        """
        return self._calculating(self.df, None, None, nthread=nthread, gene_set=gene_set)

    # function structure
    # args(input, container, thread_index , **kwargs)
    @funcThread()
    def _calculating(self, arr1, container, i, nthread=5, gene_set=[]):
        """Compute one z-score per column of ``arr1`` and store it in ``container[i]``.

        For each column the score is
        (mean over gene-set rows - mean over all rows) * sqrt(n) / std,
        where n is the number of ``gene_set`` entries present in the index and
        std is the sample standard deviation (ddof=1) of the column.
        Nothing is returned; the result is a one-column frame named 'Zscore'.
        """
        shared_genes = list(set(arr1.index.tolist()).intersection(gene_set))

        set_mean = arr1.loc[shared_genes].mean(axis=0)
        overall_mean = arr1.mean(axis=0)
        scale = arr1.std(ddof=1, axis=0).apply(lambda sd: np.sqrt(len(shared_genes)) / sd)

        scores = (set_mean.subtract(overall_mean) * scale).to_frame()
        scores.columns = ['Zscore']
        container[i] = scores
        ##No Return
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,17 @@ | ||
from .statistics import STAT | ||
from scipy import stats | ||
import numpy as np | ||
__author__ = "Junhee Yoon" | ||
__version__ = "1.0.0" | ||
__maintainer__ = "Junhee Yoon" | ||
__email__ = "[email protected]" | ||
|
||
""" | ||
Manual: https://github.com/swiri021/NWPV2 | ||
Reference: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3135688/ | ||
Description: Method of combined p-values for getting DEG in dataset | ||
""" | ||
|
||
|
||
from .statistics import STAT | ||
from scipy import stats | ||
import numpy as np | ||
|
||
class nwpv_calculation(object): | ||
def _preprocessing(self, min_adj=1e-16, max_adj=0.9999999999999999): | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from notebook_lib.nwpv.nwpv import nwpv_calculation | ||
from notebook_lib.gene_zscore.threaded_gzscore import calculator as gzscore_class | ||
|
||
class AdvancedCalculators(object):
    """Front-end for the NWPV and gene-set z-score calculators."""

    def nwpv_calculator(self, test : list, contol : list, data, save : bool = True):
        """
        NWPV calculator
        Input
        test : test sample list
        contol : control sample list (parameter spelling kept for caller compatibility)
        data : actual input data
        save : when true, also write the result to resultFiles/nwpv_result.csv
        Output
        the object returned by nwpv_calculation.get_result()
        """
        import os  # local import: only needed when saving

        #NWPV calculation
        result = nwpv_calculation(data, test, contol).get_result()

        if save:
            # Make sure the output directory exists so to_csv does not fail on a fresh checkout
            os.makedirs("resultFiles", exist_ok=True)
            result.to_csv("resultFiles/nwpv_result.csv")

        return result

    def activation_score(self, data, gene_set : list, nthread : int = 4):
        """
        gene zscore calculator
        Input
        data : actual input data
        gene_set : gene set input for calculating activation score
        nthread : number of threads passed to gs_zscore (default 4)
        Output
        the value returned by gs_zscore
        """
        #### Init Class and check input file
        zscore_calculator = gzscore_class(data)

        #### Input list should be EntrezIDs(Pathways)
        return zscore_calculator.gs_zscore(nthread=nthread, gene_set=gene_set)
Oops, something went wrong.