Skip to content

Commit 3a93a96

Browse files
breecummins and danbryce
authored and committed Jun 19, 2020
Bree's notebooks added
1 parent c6e0228 commit 3a93a96

File tree

2 files changed

+599
-0
lines changed

2 files changed

+599
-0
lines changed
 

‎notebooks/TenFoldComparisons.ipynb

+340
Large diffs are not rendered by default.

‎notebooks/wasserstein_analysis.ipynb

+259
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,259 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"data": {
10+
"text/plain": [
11+
"'/work/05258/bcummins/GIT/flow_cytometry_scoring/notebooks'"
12+
]
13+
},
14+
"execution_count": 1,
15+
"metadata": {},
16+
"output_type": "execute_result"
17+
}
18+
],
19+
"source": [
20+
"%pwd"
21+
]
22+
},
23+
{
24+
"cell_type": "code",
25+
"execution_count": 2,
26+
"metadata": {},
27+
"outputs": [
28+
{
29+
"name": "stdout",
30+
"output_type": "stream",
31+
"text": [
32+
"/opt/conda/lib/python3.6/distutils/dist.py:261: UserWarning: Unknown distribution option: 'install_requires'\n",
33+
" warnings.warn(msg)\n",
34+
"running install\n",
35+
"running build\n",
36+
"running build_py\n",
37+
"running install_lib\n",
38+
"running install_egg_info\n",
39+
"Removing /work/05258/bcummins/jupyter_packages/lib/python3.6/site-packages/flow_cytometry_scoring-0.0.1-py3.6.egg-info\n",
40+
"Writing /work/05258/bcummins/jupyter_packages/lib/python3.6/site-packages/flow_cytometry_scoring-0.0.1-py3.6.egg-info\n"
41+
]
42+
}
43+
],
44+
"source": [
45+
"import pandas as pd\n",
46+
"import os\n",
47+
"import json\n",
48+
"import sys\n",
49+
"from os.path import expanduser\n",
50+
"import numpy as np\n",
51+
"import matplotlib.pyplot as plt\n",
52+
"\n",
53+
"\n",
54+
"## Assumes we are inside flow_cytometry_scoring/notebooks\n",
55+
"os.chdir(\"../\")\n",
56+
"# !{sys.executable} setup.py develop --user --uninstall\n",
57+
"!{sys.executable} setup.py install --user \n",
58+
"# %pip uninstall -y flow_cytometry_scoring &> /dev/null || True\n",
59+
"# %pip install . --user\n",
60+
"os.chdir(\"notebooks/\")\n"
61+
]
62+
},
63+
{
64+
"cell_type": "code",
65+
"execution_count": 3,
66+
"metadata": {},
67+
"outputs": [
68+
{
69+
"data": {
70+
"text/plain": [
71+
"['/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Short-Duration-Time-Series-20191208_20200423194115',\n",
72+
" '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Short-Duration-Time-Series-35C_20200423195648']"
73+
]
74+
},
75+
"execution_count": 3,
76+
"metadata": {},
77+
"output_type": "execute_result"
78+
}
79+
],
80+
"source": [
81+
"from flow_cytometry_scoring import rank_order_truth_tables as rank\n",
82+
"\n",
83+
"%load_ext autoreload\n",
84+
"%reload_ext autoreload\n",
85+
"%autoreload 2\n",
86+
" \n",
87+
"\n",
88+
"\n",
89+
"DATA_CONVERGE_PROJECT=\"sd2e-project-43\"\n",
90+
"\n",
91+
"data_converge_base = os.path.join(expanduser(\"~\"), 'sd2e-projects', DATA_CONVERGE_PROJECT)\n",
92+
"experiment_dir = os.path.realpath(os.path.join(data_converge_base, 'test'))\n",
93+
"experiment_dir_contents = [os.path.realpath(os.path.join(experiment_dir, x)) for x in os.listdir(experiment_dir)]\n",
94+
"\n",
95+
"experiments = [x for x in experiment_dir_contents \n",
96+
" if os.path.isdir(x) and \"CRISPR-Short-Duration\" in x]\n",
97+
"\n",
98+
"experiments = {x.split('_')[1]:x for x in sorted(experiments)}\n",
99+
"experiments = list(experiments.values())\n",
100+
"\n",
101+
"experiments\n"
102+
]
103+
},
104+
{
105+
"cell_type": "code",
106+
"execution_count": 4,
107+
"metadata": {},
108+
"outputs": [],
109+
"source": [
110+
"## Data Helper functions\n",
111+
"\n",
112+
"def get_record(experiment):\n",
113+
" record = json.load(open(os.path.join(experiment, \"record.json\")))\n",
114+
" return record\n",
115+
"\n",
116+
"def get_record_file(record, file_type=\"fc_meta\"):\n",
117+
" files = record['files']\n",
118+
" files_of_type = [ x for x in files if file_type in x['name']]\n",
119+
" if len(files_of_type) > 0:\n",
120+
" return files_of_type[0]\n",
121+
" else:\n",
122+
" return None\n",
123+
"\n",
124+
"def get_data(experiment, record, file_type):\n",
125+
" fc_raw_file = get_record_file(record, file_type)\n",
126+
" if fc_raw_file:\n",
127+
" data_df = pd.read_csv(os.path.join(experiment, fc_raw_file['name']))\n",
128+
" return data_df\n",
129+
" else:\n",
130+
" return None\n",
131+
"\n",
132+
"def get_bins(df):\n",
133+
" return [float(x.split(\"_\")[1]) for x in df.columns if \"bin\" in x]\n",
134+
"\n",
135+
"def get_row_values(df,row_name,id_col):\n",
136+
" df_j = df.loc[df[id_col] == row_name]\n",
137+
" df_j = df_j[[x for x in df_j.columns if \"bin\" in x]]\n",
138+
" return df_j.values[0]\n",
139+
" \n",
140+
" "
141+
]
142+
},
143+
{
144+
"cell_type": "code",
145+
"execution_count": 5,
146+
"metadata": {},
147+
"outputs": [],
148+
"source": [
149+
"def do_analysis(experiment,datafile,id_col=\"sample_id\",channel_col=\"channel\",channel_val=\"BL1-A\"):\n",
150+
" # datafile is \"fc_raw_log10_stats.csv\" or \"fc_etl_stats.csv\"\n",
151+
"\n",
152+
" ## load dataset from data converge \n",
153+
" record = get_record(experiment)\n",
154+
" df = get_data(experiment, record, datafile)\n",
155+
" \n",
156+
" if df is None:\n",
157+
" return None\n",
158+
" \n",
159+
" # handle difference between etl and log10 histogram files\n",
160+
" if channel_col in df.columns:\n",
161+
" df = df.loc[df[channel_col] == channel_val]\n",
162+
"\n",
163+
"# ## Truncated for testing\n",
164+
"# df = df.iloc[:5]\n",
165+
"\n",
166+
" bins = get_bins(df) \n",
167+
" ids = list(df[id_col].values) \n",
168+
" res = np.zeros([len(ids),len(ids)])\n",
169+
" \n",
170+
" for j,s in enumerate(ids):\n",
171+
" s_bin_vals = get_row_values(df,s,id_col)\n",
172+
" for k,t in enumerate(ids[j+1:]):\n",
173+
" t_bin_vals = get_row_values(df,t,id_col)\n",
174+
" score = rank.emdist(s_bin_vals, t_bin_vals, bins)\n",
175+
" res[j,j+k+1] = 10**score\n",
176+
" res[j+k+1,j] = 10**score\n",
177+
" df_results = pd.DataFrame(data=res, index=ids, columns=ids)\n",
178+
" return df_results\n",
179+
"\n"
180+
]
181+
},
182+
{
183+
"cell_type": "code",
184+
"execution_count": 6,
185+
"metadata": {},
186+
"outputs": [
187+
{
188+
"name": "stdout",
189+
"output_type": "stream",
190+
"text": [
191+
"dc_YeastSTATES-CRISPR-Short-Duration-Time-Series-20191208_20200423194115\n",
192+
"fc_raw_log10_stats.csv\n",
193+
"dc_YeastSTATES-CRISPR-Short-Duration-Time-Series-20191208_20200423194115\n",
194+
"fc_etl_stats.csv\n"
195+
]
196+
},
197+
{
198+
"name": "stderr",
199+
"output_type": "stream",
200+
"text": [
201+
"/work/05258/bcummins/jupyter_packages/lib/python3.6/site-packages/flow_cytometry_scoring/rank_order_truth_tables.py:37: RuntimeWarning: invalid value encountered in true_divide\n",
202+
" return pyemd.emd(np.asarray(h1)/float(sum(h1)), np.asarray(h2)/float(sum(h2)), bin_dist)\n"
203+
]
204+
},
205+
{
206+
"name": "stdout",
207+
"output_type": "stream",
208+
"text": [
209+
"dc_YeastSTATES-CRISPR-Short-Duration-Time-Series-35C_20200423195648\n",
210+
"fc_raw_log10_stats.csv\n",
211+
"dc_YeastSTATES-CRISPR-Short-Duration-Time-Series-35C_20200423195648\n",
212+
"fc_etl_stats.csv\n"
213+
]
214+
}
215+
],
216+
"source": [
217+
"## Run Wasserstein analysis on all processed data sets\n",
218+
"\n",
219+
"for experiment in experiments:\n",
220+
" for datafile in [\"fc_raw_log10_stats.csv\",\"fc_etl_stats.csv\"]:\n",
221+
" experiment_name = experiment.split(\"/\")[-1]\n",
222+
" print(experiment_name)\n",
223+
" print(datafile)\n",
224+
" fname = experiment_name+\"_\"+datafile.split(\".\")[0]+\"_wasserstein_dists.csv\"\n",
225+
" if not os.path.exists(fname):\n",
226+
" df = do_analysis(experiment,datafile)\n",
227+
" df.to_csv(fname)\n"
228+
]
229+
},
230+
{
231+
"cell_type": "code",
232+
"execution_count": null,
233+
"metadata": {},
234+
"outputs": [],
235+
"source": []
236+
}
237+
],
238+
"metadata": {
239+
"kernelspec": {
240+
"display_name": "Python 3",
241+
"language": "python",
242+
"name": "python3"
243+
},
244+
"language_info": {
245+
"codemirror_mode": {
246+
"name": "ipython",
247+
"version": 3
248+
},
249+
"file_extension": ".py",
250+
"mimetype": "text/x-python",
251+
"name": "python",
252+
"nbconvert_exporter": "python",
253+
"pygments_lexer": "ipython3",
254+
"version": "3.6.8"
255+
}
256+
},
257+
"nbformat": 4,
258+
"nbformat_minor": 2
259+
}

0 commit comments

Comments
 (0)
Please sign in to comment.