Merge pull request #14 from OpenKBC/engineering_dev

swiri021 · web-flow · commit c2522643fba6 · 2021-09-14T02:38:20.000-05:00
Added docker-compose and a Dockerfile for the Jupyter notebook setup, confirmed
diff --git a/Dockerfile_jupyterNotebook b/Dockerfile_jupyterNotebook
@@ -0,0 +1,11 @@
+# Jupyter datascience-notebook image with the project's R and Python dependencies installed on top.
+# NOTE(review): the base image is untagged, so builds follow `latest`; pin a tag or digest for reproducible builds.
+FROM jupyter/datascience-notebook
+
+# Copy only the installer inputs so the dependency layers stay cached until these files change.
+COPY notebook/installers/installer_Rpackage.R /installer_Rpackage.R
+COPY notebook/installers/requirements.txt /requirements.txt
+
+RUN Rscript /installer_Rpackage.R
+# --no-cache-dir keeps pip's download cache out of the image layer.
+RUN pip install --no-cache-dir -r /requirements.txt
diff --git a/R/deseq2_normalization.R b/R/deseq2_normalization.R
@@ -2,15 +2,17 @@
 ##### normalization using DESeq2 - get normalized and vst transformed counts #####
 ##################################################################################
 ### install DESeq2, tximport packages from Bioconductor
-# if (!requireNamespace("BiocManager", quietly = TRUE))
-#   install.packages("BiocManager")
-# BiocManager::install("DESeq2")
-# BiocManager::install("tximport")
+if (!requireNamespace("BiocManager", quietly = TRUE))
+  install.packages("BiocManager", repos='http://cran.us.r-project.org')
+# Install each Bioconductor package only when it is missing, so re-running this script does not reinstall them.
+for (pkg in c("DESeq2", "tximport"))
+  if (!requireNamespace(pkg, quietly = TRUE)) BiocManager::install(pkg)
 library(tidyverse)
 library(DESeq2)
 library(tximport)
 
-data_path <- "~/Downloads/MS_RNAseq_NAE1"
+data_path <- "../data/"
+print(getwd())
 setwd(data_path)
 
 # loading metadata
@@ -66,3 +68,4 @@ write.csv(assay(vst(deseq_obj$CD14)), "counts_vst_CD14.csv")
 write.csv(assay(rlog(deseq_obj$CD4)), "counts_rlog_CD4.csv")
 write.csv(assay(rlog(deseq_obj$CD8)), "counts_rlog_CD8.csv")
 write.csv(assay(rlog(deseq_obj$CD14)), "counts_rlog_CD14.csv")
+
diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -0,0 +1,16 @@
+# Builds and runs the Jupyter notebook container defined in Dockerfile_jupyterNotebook.
+version: "3"
+services:
+  notebook:
+    build:
+      context: .
+      dockerfile: Dockerfile_jupyterNotebook
+    # Host paths are relative to this compose file so the setup works on any checkout.
+    volumes:
+      - ./notebook/notebook_lib:/home/jovyan/work/notebook_lib
+      - ./notebook/notebook_utils:/home/jovyan/work/notebook_utils
+      - ./notebook/resultFiles:/home/jovyan/work/resultFiles
+      - ./data:/home/jovyan/data
+    ports:
+      - "8888:8888"
+    container_name: datascience-notebook-container
diff --git a/environment.yaml b/environment.yaml
@@ -1,11 +1,18 @@
+## Conda environment definition (create with: conda env create -f environment.yaml)
 name: utils_v1
 channels:
   - defaults
+  - conda-forge
+  - bioconda
 dependencies:
   - ipykernel=6.2.0
+  - r-essentials=3.6.1
   - pip:
     - feather-format==0.4.1
     - numpy==1.21.2
     - pandas==1.3.2
     - pyarrow==5.0.0
-    - pytz==2021.1
+    - pytz==2021.1
+    - scikit-learn==0.24.2
+    - matplotlib==3.4.3
+    - seaborn==0.11.2
diff --git a/notebook/TPM_DEG.ipynb b/notebook/TPM_DEG.ipynb
diff --git a/notebook/getDEG_with_nwpv.ipynb b/notebook/getDEG_with_nwpv.ipynb
@@ -0,0 +1,149 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "## Utils and Library for notebook\n",
+    "from notebook_lib.nwpv.nwpv import nwpv_calculation\n",
+    "from notebook_utils.OpenKbcMSToolkit import ExtractionToolkit as exttoolkit\n",
+    "\n",
+    "# Root data path\n",
+    "DATA_PATH = '../data/'\n",
+    "\n",
+    "# Sample loading\n",
+    "gene_tpm = pd.read_feather(DATA_PATH+\"counts_normalized/counts_vst_CD8.feather\").set_index('index') # Load normalized CD8\n",
+    "meta_data = pd.read_csv(DATA_PATH+'EPIC_HCvB_metadata_baseline_updated-share.csv')\n",
+    "\n",
+    "# Getting Sample Info\n",
+    "# DiseaseStatus: ['MS' 'Unknown' 'CIS' 'Healthy']\n",
+    "# DiseaseDuration(Early?)\n",
+    "# DiseaseCourse: ['RR' 'PP' 'SP' 'RIS' 'CIS' 'Unknown' 'Healthy']\n",
+    "sample_list, sample_category = exttoolkit.get_sample_name_by_category(dataframe=meta_data, sampleColumn='HCVB_ID', dataColname='DiseaseCourse')\n",
+    "print(\"Sample Count\")\n",
+    "count=0 \n",
+    "for category, values in zip(sample_category, sample_list):\n",
+    "    print(category+\" : \"+str(len(values)) + \", List number : \" + str(count))\n",
+    "    count+=1"
+   ],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Sample Count\n",
+      "RR : 82, List number : 0\n",
+      "PP : 14, List number : 1\n",
+      "SP : 1, List number : 2\n",
+      "RIS : 2, List number : 3\n",
+      "CIS : 40, List number : 4\n",
+      "Unknown : 1, List number : 5\n",
+      "Healthy : 22, List number : 6\n"
+     ]
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "source": [
+    "duration_df = meta_data.dropna(subset=['DiseaseDuration']).copy() # data for patient's duration set; copy so the assignment below does not trigger SettingWithCopyWarning\n",
+    "duration_df['DiseaseDuration'] = duration_df['DiseaseDuration'].astype(float) # make float\n",
+    "\n",
+    "## Long DD\n",
+    "longDD_samples = duration_df.loc[ duration_df['DiseaseDuration'] >= duration_df['DiseaseDuration'].median(), 'HCVB_ID'] # Get sampleData which has longDD from metadata\n",
+    "longDD_sampleList = list(set(gene_tpm.columns.tolist()).intersection(longDD_samples.values.tolist())) # Get intersected sampleID between expr and longDD\n",
+    "longDD_meta = duration_df[duration_df['HCVB_ID'].isin(longDD_sampleList)] # Get meta with sampleNames\n",
+    "longDD_gene_expr = gene_tpm[longDD_sampleList] # Get expr with sampleNames\n",
+    "\n",
+    "## Short DD\n",
+    "shortDD_samples = duration_df.loc[ duration_df['DiseaseDuration'] < duration_df['DiseaseDuration'].median(), 'HCVB_ID'] # Get sampleData which has shortDD from metadata\n",
+    "shortDD_sampleList = list(set(gene_tpm.columns.tolist()).intersection(shortDD_samples.values.tolist())) # Get intersected sampleID between expr and shortDD\n",
+    "shortDD_meta = duration_df[duration_df['HCVB_ID'].isin(shortDD_sampleList)] # Get meta with the intersected sampleNames, same as longDD_meta\n",
+    "shortDD_gene_expr = gene_tpm[shortDD_sampleList] # Get expr with sampleNames"
+   ],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stderr",
+     "text": [
+      "/var/folders/sx/0rms4skn47nfn6svhhprv5700000gq/T/ipykernel_36505/3086605768.py:2: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  duration_df['DiseaseDuration'] = duration_df['DiseaseDuration'].astype(float) # make float\n"
+     ]
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "source": [
+    "# NWPV calculation\n",
+    "#nwpv_class = nwpv_calculation(gene_tpm, shortDD_sampleList, longDD_sampleList)\n",
+    "#nwpv_class.get_result().to_csv(\"nwpv_result_CD8_vst.csv\")\n",
+    "#nwpv_df = pd.read_csv('resultFiles/nwpv_result_CD8_vst.csv')\n",
+    "#nwpv_df[(nwpv_df['combined_pvalue_adj'] < 0.05)]"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "source": [
+    "X = longDD_gene_expr.values[0]"
+   ],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "72"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 10
+    }
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [],
+   "outputs": [],
+   "metadata": {}
+  }
+ ],
+ "metadata": {
+  "orig_nbformat": 4,
+  "language_info": {
+   "name": "python",
+   "version": "3.9.6",
+   "mimetype": "text/x-python",
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "pygments_lexer": "ipython3",
+   "nbconvert_exporter": "python",
+   "file_extension": ".py"
+  },
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3.9.6 64-bit ('utils_v1': conda)"
+  },
+  "interpreter": {
+   "hash": "77a526a359b8fd796eb09814c2228805e7076f62d8d78ef70c860dff672df599"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebook/installers/installer_Rpackage.R b/notebook/installers/installer_Rpackage.R
@@ -0,0 +1,7 @@
+### Install the DESeq2 and tximport packages from Bioconductor.
+if (!requireNamespace("BiocManager", quietly = TRUE)) {
+  install.packages("BiocManager", repos='http://cran.us.r-project.org')
+}
+# The two installs below run on every invocation; only the BiocManager bootstrap is conditional.
+BiocManager::install("DESeq2")
+BiocManager::install("tximport")
diff --git a/notebook/installers/requirements.txt b/notebook/installers/requirements.txt
@@ -0,0 +1 @@
+feather-format==0.4.1