Skip to content

Commit

Permalink
Merge pull request #14 from OpenKBC/engineering_dev
Browse files Browse the repository at this point in the history
Added docker-compose, dockerfile for jupyter notebook setting, confirmed
  • Loading branch information
swiri021 authored Sep 14, 2021
2 parents cde32d9 + 08245b2 commit c252264
Show file tree
Hide file tree
Showing 8 changed files with 191 additions and 111 deletions.
7 changes: 7 additions & 0 deletions Dockerfile_jupyterNotebook
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Jupyter notebook image extended with the project's Bioconductor and Python extras.
# NOTEBOOK_TAG defaults to "latest" so existing builds behave as before; pass
# `--build-arg NOTEBOOK_TAG=<tag>` to pin the base image for reproducible builds.
ARG NOTEBOOK_TAG=latest
FROM jupyter/datascience-notebook:${NOTEBOOK_TAG}

# R packages first: this is the slow step, so keep its layer independent of
# requirements.txt (editing the Python requirements must not re-run it).
COPY notebook/installers/installer_Rpackage.R /installer_Rpackage.R
RUN Rscript /installer_Rpackage.R

# Python packages; --no-cache-dir keeps pip's download cache out of the image layer.
COPY notebook/installers/requirements.txt /requirements.txt
RUN pip install --no-cache-dir -r /requirements.txt
13 changes: 8 additions & 5 deletions R/deseq2_normalization.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,17 @@
##### normalization using DESeq2 - get normalized and vst transformed counts #####
##################################################################################
### Install DESeq2 and tximport from Bioconductor, but only when they are missing,
### so re-running this script does not reinstall them (and need the network) every time.
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager", repos = "https://cloud.r-project.org")  # HTTPS CRAN mirror
for (pkg in c("DESeq2", "tximport")) {
  if (!requireNamespace(pkg, quietly = TRUE))
    # update/ask = FALSE: do not touch already-installed packages and never prompt
    BiocManager::install(pkg, update = FALSE, ask = FALSE)
}

library(tidyverse)
library(DESeq2)
library(tximport)

# Directory used as the working directory for the rest of the script.
# The path is relative, so it is resolved against the directory the script is launched from.
data_path <- "../data/"
print(getwd())  # log the launch directory to make a wrong relative path easy to spot
setwd(data_path)

# loading metadata
Expand Down Expand Up @@ -66,3 +68,4 @@ write.csv(assay(vst(deseq_obj$CD14)), "counts_vst_CD14.csv")
# Write the rlog-transformed assay of the CD4, CD8 and CD14 entries of deseq_obj
# to CSV files in the current working directory.
write.csv(assay(rlog(deseq_obj$CD4)), "counts_rlog_CD4.csv")
write.csv(assay(rlog(deseq_obj$CD8)), "counts_rlog_CD8.csv")
write.csv(assay(rlog(deseq_obj$CD14)), "counts_rlog_CD14.csv")

13 changes: 13 additions & 0 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
version: "3"
services:
  notebook:
    build:
      # Build from the directory that holds this compose file; the Dockerfile
      # COPYs notebook/installers/* relative to this context.
      context: .
      dockerfile: Dockerfile_jupyterNotebook
    volumes:
      # Host paths are relative to this compose file, so the stack works from any
      # checkout location instead of one developer's home directory.
      - ./notebook/notebook_lib:/home/jovyan/work/notebook_lib
      - ./notebook/notebook_utils:/home/jovyan/work/notebook_utils
      - ./notebook/resultFiles:/home/jovyan/work/resultFiles
      - ./data:/home/jovyan/data
    ports:
      # Quoted so YAML never interprets the HOST:CONTAINER pair as a number.
      - "8888:8888"
    container_name: datascience-notebook-container
9 changes: 8 additions & 1 deletion environment.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
## For running in conda
name: utils_v1
channels:
  - defaults
  - conda-forge
  - bioconda
dependencies:
  - ipykernel=6.2.0
  - r-essentials=3.6.1
  # Listed explicitly because the "pip:" section below needs pip inside the environment.
  - pip
  - pip:
      - feather-format==0.4.1
      - numpy==1.21.2
      - pandas==1.3.2
      - pyarrow==5.0.0
      - pytz==2021.1
      - scikit-learn==0.24.2
      - matplotlib==3.4.3
      - seaborn==0.11.2
105 changes: 0 additions & 105 deletions notebook/TPM_DEG.ipynb

This file was deleted.

149 changes: 149 additions & 0 deletions notebook/getDEG_with_nwpv.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"source": [
"import pandas as pd\n",
"\n",
"## Utils and Library for notebook\n",
"from notebook_lib.nwpv.nwpv import nwpv_calculation\n",
"from notebook_utils.OpenKbcMSToolkit import ExtractionToolkit as exttoolkit\n",
"\n",
"# Root data path\n",
"DATA_PATH = '../data/'\n",
"\n",
"# Sample loading\n",
"gene_tpm = pd.read_feather(DATA_PATH+\"counts_normalized/counts_vst_CD8.feather\").set_index('index') # Load normalized CD8\n",
"meta_data = pd.read_csv(DATA_PATH+'EPIC_HCvB_metadata_baseline_updated-share.csv')\n",
"\n",
"# Getting Sample Info\n",
"# DiseaseStatus: ['MS' 'Unknown' 'CIS' 'Healthy']\n",
"# DiseaseDuration(Early?)\n",
"# DiseaseCourse: ['RR' 'PP' 'SP' 'RIS' 'CIS' 'Unknown' 'Healthy']\n",
"sample_list, sample_category = exttoolkit.get_sample_name_by_category(dataframe=meta_data, sampleColumn='HCVB_ID', dataColname='DiseaseCourse')\n",
"print(\"Sample Count\")\n",
"count=0 \n",
"for category, values in zip(sample_category, sample_list):\n",
" print(category+\" : \"+str(len(values)) + \", List number : \" + str(count))\n",
" count+=1"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Sample Count\n",
"RR : 82, List number : 0\n",
"PP : 14, List number : 1\n",
"SP : 1, List number : 2\n",
"RIS : 2, List number : 3\n",
"CIS : 40, List number : 4\n",
"Unknown : 1, List number : 5\n",
"Healthy : 22, List number : 6\n"
]
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 2,
"source": [
"# Keep only rows with a recorded DiseaseDuration. .copy() makes an independent frame so the\n",
"# column assignment below does not raise pandas' SettingWithCopyWarning.\n",
"duration_df = meta_data.dropna(subset=['DiseaseDuration']).copy() # data for patient's duration set\n",
"duration_df['DiseaseDuration'] = duration_df['DiseaseDuration'].astype(float) # make float\n",
"median_duration = duration_df['DiseaseDuration'].median() # split point between short and long DD\n",
"\n",
"## Long DD (duration >= median)\n",
"longDD_samples = duration_df.loc[duration_df['DiseaseDuration'] >= median_duration, 'HCVB_ID'] # Get sampleData which has longDD from metadata\n",
"longDD_sampleList = list(set(gene_tpm.columns.tolist()).intersection(longDD_samples.values.tolist())) # Get intersected sampleID between expr and longDD\n",
"longDD_meta = duration_df[duration_df['HCVB_ID'].isin(longDD_sampleList)] # Get meta with sampleNames\n",
"longDD_gene_expr = gene_tpm[longDD_sampleList] # Get expr with sampleNames\n",
"\n",
"## Short DD (duration < median), built the same way as Long DD\n",
"shortDD_samples = duration_df.loc[duration_df['DiseaseDuration'] < median_duration, 'HCVB_ID']\n",
"shortDD_sampleList = list(set(gene_tpm.columns.tolist()).intersection(shortDD_samples.values.tolist()))\n",
"shortDD_meta = duration_df[duration_df['HCVB_ID'].isin(shortDD_sampleList)] # restrict to samples that also have expression, as for longDD_meta\n",
"shortDD_gene_expr = gene_tpm[shortDD_sampleList]"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 3,
"source": [
"# NWPV calculation\n",
"#nwpv_class = nwpv_calculation(gene_tpm, shortDD_sampleList, longDD_sampleList)\n",
"#nwpv_class.get_result().to_csv(\"nwpv_result_CD8_vst.csv\")\n",
"#nwpv_df = pd.read_csv('resultFiles/nwpv_result_CD8_vst.csv')\n",
"#nwpv_df[(nwpv_df['combined_pvalue_adj'] < 0.05)]"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 10,
"source": [
"X = longDD_gene_expr.values[0]"
],
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"72"
]
},
"metadata": {},
"execution_count": 10
}
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [],
"outputs": [],
"metadata": {}
}
],
"metadata": {
"orig_nbformat": 4,
"language_info": {
"name": "python",
"version": "3.9.6",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3.9.6 64-bit ('utils_v1': conda)"
},
"interpreter": {
"hash": "77a526a359b8fd796eb09814c2228805e7076f62d8d78ef70c860dff672df599"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
5 changes: 5 additions & 0 deletions notebook/installers/installer_Rpackage.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
### Install DESeq2 and tximport from Bioconductor.
### Run non-interactively by Dockerfile_jupyterNotebook, so it must never prompt
### and must exit with an error when a package did not install.
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager", repos = "https://cloud.r-project.org")  # HTTPS CRAN mirror

bioc_packages <- c("DESeq2", "tximport")
# update/ask = FALSE: leave already-installed packages alone and never wait for input
BiocManager::install(bioc_packages, update = FALSE, ask = FALSE)

# BiocManager::install() reports failed installs as warnings only, so check the
# result explicitly; stop() makes Rscript exit non-zero and fails the image build.
not_installed <- bioc_packages[!vapply(bioc_packages, requireNamespace,
                                       logical(1), quietly = TRUE)]
if (length(not_installed) > 0)
  stop("Failed to install Bioconductor package(s): ",
       paste(not_installed, collapse = ", "))
1 change: 1 addition & 0 deletions notebook/installers/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Python packages pip-installed into the notebook image by Dockerfile_jupyterNotebook
feather-format==0.4.1

0 comments on commit c252264

Please sign in to comment.