Skip to content

Commit c252264

Browse files
authored
Merge pull request #14 from OpenKBC/engineering_dev
Added docker-compose, dockerfile for jupyter notebook setting, confirmed
2 parents cde32d9 + 08245b2 commit c252264

File tree

8 files changed

+191
-111
lines changed

8 files changed

+191
-111
lines changed

Dockerfile_jupyterNotebook

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Jupyter notebook image with the project's R and Python dependencies installed.
#
# The base image tag is a build argument so a build can be pinned to a specific
# release (docker build --build-arg NOTEBOOK_TAG=<tag> ...). The default keeps
# the previous behaviour of using the image's latest tag.
ARG NOTEBOOK_TAG=latest
FROM jupyter/datascience-notebook:${NOTEBOOK_TAG}

# Copy only the installer inputs, so the install layers below are rebuilt only
# when these two files change.
COPY notebook/installers/installer_Rpackage.R /installer_Rpackage.R
COPY notebook/installers/requirements.txt /requirements.txt

# Run the R package installer script.
RUN Rscript /installer_Rpackage.R

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r /requirements.txt

R/deseq2_normalization.R

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,17 @@
22
##### normalization using DESeq2 - get normalized and vst transformed counts #####
33
##################################################################################
44
### install DESeq2, tximport packages from Bioconductor
5-
# if (!requireNamespace("BiocManager", quietly = TRUE))
6-
# install.packages("BiocManager")
7-
# BiocManager::install("DESeq2")
8-
# BiocManager::install("tximport")
5+
# Install BiocManager and the Bioconductor packages only when they are missing,
# so re-running this script does not reinstall DESeq2/tximport every time.
# CRAN is reached over https rather than plain http.
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager", repos = "https://cloud.r-project.org")
for (bioc_pkg in c("DESeq2", "tximport")) {
  if (!requireNamespace(bioc_pkg, quietly = TRUE))
    BiocManager::install(bioc_pkg)
}
9+
910
library(tidyverse)
1011
library(DESeq2)
1112
library(tximport)
1213

13-
data_path <- "~/Downloads/MS_RNAseq_NAE1"
14+
data_path <- "../data/"
15+
print(getwd())
1416
setwd(data_path)
1517

1618
# loading metadata
@@ -66,3 +68,4 @@ write.csv(assay(vst(deseq_obj$CD14)), "counts_vst_CD14.csv")
6668
write.csv(assay(rlog(deseq_obj$CD4)), "counts_rlog_CD4.csv")
6769
write.csv(assay(rlog(deseq_obj$CD8)), "counts_rlog_CD8.csv")
6870
write.csv(assay(rlog(deseq_obj$CD14)), "counts_rlog_CD14.csv")
71+

docker-compose.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Jupyter notebook service for the project.
#
# Host paths are relative to this compose file so the stack works from any
# checkout location rather than one developer's home directory.
# NOTE(review): assumes this file sits at the project root (next to
# Dockerfile_jupyterNotebook, notebook/ and data/) - confirm.
version: "3"
services:
  notebook:
    build:
      # Build context is the project root; the Dockerfile COPYs from notebook/installers.
      context: .
      dockerfile: Dockerfile_jupyterNotebook
    volumes:
      - ./notebook/notebook_lib:/home/jovyan/work/notebook_lib
      - ./notebook/notebook_utils:/home/jovyan/work/notebook_utils
      - ./notebook/resultFiles:/home/jovyan/work/resultFiles
      - ./data:/home/jovyan/data
    ports:
      # Quoted so YAML always reads the HOST:CONTAINER mapping as a string.
      - "8888:8888"
    container_name: datascience-notebook-container

environment.yaml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,18 @@
1+
## For running in conda
name: utils_v1
channels:
  - defaults
  - conda-forge
  - bioconda
dependencies:
  - ipykernel=6.2.0
  - r-essentials=3.6.1
  # pip is listed explicitly because the "pip:" section below needs it;
  # conda warns when pip-installed dependencies are used without it.
  - pip
  - pip:
    - feather-format==0.4.1
    - numpy==1.21.2
    - pandas==1.3.2
    - pyarrow==5.0.0
    - pytz==2021.1
    - scikit-learn==0.24.2
    - matplotlib==3.4.3
    - seaborn==0.11.2

notebook/TPM_DEG.ipynb

Lines changed: 0 additions & 105 deletions
This file was deleted.

notebook/getDEG_with_nwpv.ipynb

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"source": [
7+
"import pandas as pd\n",
8+
"\n",
9+
"## Utils and Library for notebook\n",
10+
"from notebook_lib.nwpv.nwpv import nwpv_calculation\n",
11+
"from notebook_utils.OpenKbcMSToolkit import ExtractionToolkit as exttoolkit\n",
12+
"\n",
13+
"# Root data path\n",
14+
"DATA_PATH = '../data/'\n",
15+
"\n",
16+
"# Sample loading\n",
17+
"gene_tpm = pd.read_feather(DATA_PATH+\"counts_normalized/counts_vst_CD8.feather\").set_index('index') # Load normalized CD8\n",
18+
"meta_data = pd.read_csv(DATA_PATH+'EPIC_HCvB_metadata_baseline_updated-share.csv')\n",
19+
"\n",
20+
"# Getting Sample Info\n",
21+
"# DiseaseStatus: ['MS' 'Unknown' 'CIS' 'Healthy']\n",
22+
"# DiseaseDuration(Early?)\n",
23+
"# DiseaseCourse: ['RR' 'PP' 'SP' 'RIS' 'CIS' 'Unknown' 'Healthy']\n",
24+
"sample_list, sample_category = exttoolkit.get_sample_name_by_category(dataframe=meta_data, sampleColumn='HCVB_ID', dataColname='DiseaseCourse')\n",
25+
"print(\"Sample Count\")\n",
26+
"count=0 \n",
27+
"for category, values in zip(sample_category, sample_list):\n",
28+
" print(category+\" : \"+str(len(values)) + \", List number : \" + str(count))\n",
29+
" count+=1"
30+
],
31+
"outputs": [
32+
{
33+
"output_type": "stream",
34+
"name": "stdout",
35+
"text": [
36+
"Sample Count\n",
37+
"RR : 82, List number : 0\n",
38+
"PP : 14, List number : 1\n",
39+
"SP : 1, List number : 2\n",
40+
"RIS : 2, List number : 3\n",
41+
"CIS : 40, List number : 4\n",
42+
"Unknown : 1, List number : 5\n",
43+
"Healthy : 22, List number : 6\n"
44+
]
45+
}
46+
],
47+
"metadata": {}
48+
},
49+
{
50+
"cell_type": "code",
51+
"execution_count": 2,
52+
"source": [
53+
"duration_df = meta_data.dropna(subset=['DiseaseDuration']).copy() # data for patient's duration set; explicit copy so the assignment below does not raise SettingWithCopyWarning\n",
"duration_df['DiseaseDuration'] = duration_df['DiseaseDuration'].astype(float) # make float\n",
"\n",
"## Long DD\n",
"longDD_samples = duration_df.loc[ duration_df['DiseaseDuration'] >= duration_df['DiseaseDuration'].median(), 'HCVB_ID'] # Get sampleData which has longDD from metadata\n",
"longDD_sampleList = list(set(gene_tpm.columns.tolist()).intersection(longDD_samples.values.tolist())) # Get intersected sampleID between expr and longDD\n",
"longDD_meta = duration_df[duration_df['HCVB_ID'].isin(longDD_sampleList)] # Get meta with sampleNames\n",
"longDD_gene_expr = gene_tpm[longDD_sampleList] # Get expr with sampleNames\n",
"\n",
"## Short DD\n",
"shortDD_samples = duration_df.loc[ duration_df['DiseaseDuration'] < duration_df['DiseaseDuration'].median(), 'HCVB_ID'] # Get sampleData which has shortDD from metadata\n",
"shortDD_sampleList = list(set(gene_tpm.columns.tolist()).intersection(shortDD_samples.values.tolist())) # Get intersected sampleID between expr and shortDD\n",
"shortDD_meta = duration_df[duration_df['HCVB_ID'].isin(shortDD_sampleList)] # Get meta with sampleNames (intersected list, same as longDD_meta)\n",
"shortDD_gene_expr = gene_tpm[shortDD_sampleList] # Get expr with sampleNames"
67+
],
68+
"outputs": [
69+
{
70+
"output_type": "stream",
71+
"name": "stderr",
72+
"text": [
73+
"/var/folders/sx/0rms4skn47nfn6svhhprv5700000gq/T/ipykernel_36505/3086605768.py:2: SettingWithCopyWarning: \n",
74+
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
75+
"Try using .loc[row_indexer,col_indexer] = value instead\n",
76+
"\n",
77+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
78+
" duration_df['DiseaseDuration'] = duration_df['DiseaseDuration'].astype(float) # make float\n"
79+
]
80+
}
81+
],
82+
"metadata": {}
83+
},
84+
{
85+
"cell_type": "code",
86+
"execution_count": 3,
87+
"source": [
88+
"# NWPV calculation\n",
89+
"#nwpv_class = nwpv_calculation(gene_tpm, shortDD_sampleList, longDD_sampleList)\n",
90+
"#nwpv_class.get_result().to_csv(\"nwpv_result_CD8_vst.csv\")\n",
91+
"#nwpv_df = pd.read_csv('resultFiles/nwpv_result_CD8_vst.csv')\n",
92+
"#nwpv_df[(nwpv_df['combined_pvalue_adj'] < 0.05)]"
93+
],
94+
"outputs": [],
95+
"metadata": {}
96+
},
97+
{
98+
"cell_type": "code",
99+
"execution_count": 10,
100+
"source": [
101+
"X = longDD_gene_expr.values[0]"
102+
],
103+
"outputs": [
104+
{
105+
"output_type": "execute_result",
106+
"data": {
107+
"text/plain": [
108+
"72"
109+
]
110+
},
111+
"metadata": {},
112+
"execution_count": 10
113+
}
114+
],
115+
"metadata": {}
116+
},
117+
{
118+
"cell_type": "code",
119+
"execution_count": null,
120+
"source": [],
121+
"outputs": [],
122+
"metadata": {}
123+
}
124+
],
125+
"metadata": {
126+
"orig_nbformat": 4,
127+
"language_info": {
128+
"name": "python",
129+
"version": "3.9.6",
130+
"mimetype": "text/x-python",
131+
"codemirror_mode": {
132+
"name": "ipython",
133+
"version": 3
134+
},
135+
"pygments_lexer": "ipython3",
136+
"nbconvert_exporter": "python",
137+
"file_extension": ".py"
138+
},
139+
"kernelspec": {
140+
"name": "python3",
141+
"display_name": "Python 3.9.6 64-bit ('utils_v1': conda)"
142+
},
143+
"interpreter": {
144+
"hash": "77a526a359b8fd796eb09814c2228805e7076f62d8d78ef70c860dff672df599"
145+
}
146+
},
147+
"nbformat": 4,
148+
"nbformat_minor": 2
149+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
### Install DESeq2 and tximport from Bioconductor.
# CRAN is reached over https rather than plain http.
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager", repos = "https://cloud.r-project.org")

bioc_packages <- c("DESeq2", "tximport")

# ask = FALSE / update = FALSE: never prompt, and leave already-installed
# packages untouched, so the script is safe to run unattended.
BiocManager::install(bioc_packages, ask = FALSE, update = FALSE)

# A failed package installation is reported as a warning, not an error, so
# verify that every package can be loaded and stop (non-zero exit under
# Rscript) if one cannot.
not_installed <- bioc_packages[
  !vapply(bioc_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(not_installed) > 0)
  stop("Failed to install: ", paste(not_installed, collapse = ", "))

notebook/installers/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
feather-format==0.4.1

0 commit comments

Comments
 (0)