output prediction working

danbryce · danbryce · commit 2749f5d281d9 · 2020-06-20T04:26:41.000Z
diff --git a/app/precomputed-data-table-app/fcs_signal_prediction/src/fcs_signal_prediction/predict.py b/app/precomputed-data-table-app/fcs_signal_prediction/src/fcs_signal_prediction/predict.py
@@ -1,5 +1,25 @@
 from pandas import DataFrame
+from typing import Optional, List
 
+from pysd2cat.analysis import correctness 
 
-def predict_signal(df: DataFrame, experiment_identifier: str) -> DataFrame:
-    pass
+def predict_signal(df: DataFrame, 
+                   experiment_identifier: str,
+                   low_control : str,
+                   high_control : str,
+                   id_col : str,
+                   channels : List[str],
+                   strain_col : Optional[str]='strain_name') -> DataFrame:
+          
+    res = correctness.compute_predicted_output(df, 
+                             training_df=None,
+                             data_columns = channels, 
+                             out_dir='.',
+                             strain_col=strain_col,
+                             high_control=high_control, 
+                             low_control=low_control,
+                             id_col=id_col,
+                             use_harness=False,
+                             description=None)
+    
+    return res
diff --git a/app/precomputed-data-table-app/fcs_signal_prediction/src/fcs_signal_prediction/utils/data_utils.py b/app/precomputed-data-table-app/fcs_signal_prediction/src/fcs_signal_prediction/utils/data_utils.py
@@ -0,0 +1,60 @@
+import json
+import os
+import pandas as pd
+
+## Data Helper functions
+
+def get_record(experiment):
+    record = json.load(open(os.path.join(experiment, "record.json")))
+    return record
+
+def json_to_pd(json_data, channels=["FSC-A"]):    
+    df = pd.DataFrame()
+    for sample in json_data:
+        sample_id = sample['sample_id']       
+        sample_df = pd.DataFrame(data={ k:v for k, v in sample.items() if k != "sample_id"})        
+        sample_df.loc[:,'sample_id'] = sample_id
+        df = df.append(sample_df, ignore_index=True)
+    return df
+            
+def get_record_file(record, file_type="fc_meta"):
+    files = record['files']
+    files_of_type = [ x for x in files if file_type in x['name']]
+    if len(files_of_type) > 0:
+        return files_of_type[0]
+    else:
+        return None
+
+def get_meta(experiment, record):
+    meta_file_name = get_record_file(record, file_type="fc_meta.csv")
+    #print(meta_file_name)
+    if meta_file_name:
+        meta_df = pd.read_csv(os.path.join(experiment, meta_file_name['name']))
+        return meta_df
+    else:
+        return None
+    
+
+def get_data(experiment, record):
+    fc_raw_file = get_record_file(record, file_type="fc_raw_events")
+    if fc_raw_file:
+        fc_raw_data = json.load(open(os.path.join(experiment, fc_raw_file['name'])))
+        return json_to_pd(fc_raw_data)
+    else:
+        return None
+
+def get_data_and_metadata(experiment):
+    record = get_record(experiment)
+    data = get_data(experiment, record)
+    meta = get_meta(experiment, record)
+    if data is not None and meta is not None:
+        df = meta.merge(data, on="sample_id", how="inner")
+        return df
+    else:
+        return None
+    
+def get_data_converge_id(path):
+    data_converge_id = path.split("/")[-1].split(".")[0]
+    return data_converge_id
+
+
diff --git a/notebooks/correctness_analysis.ipynb b/notebooks/correctness_analysis.ipynb
@@ -15,45 +15,15 @@
      ]
     },
     {
-     "data": {
-      "text/plain": [
-       "['/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Short-Duration-Time-Series-35C_20200414152048',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Growth-Curves-35C_20200414152809',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-Beta-Estradiol-OR-Gate-Plant-TF-Dose-Response_20200423200308',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-OR-Gate-CRISPR-Dose-Response_20200427004659',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-OR-Gate-CRISPR-Growth-Curves_20200417214333',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Dose-Response_20200414151033',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Long-Duration-Time-Series-20191208_20200414085017',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-Doxycycline-OR-Gate-Plant-TF-Growth-Curves_20200414144659',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Growth-Curves-35C_20200423200123',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-OR-Gate-CRISPR-Dose-Response_20200423202039',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Growth-Curves-with-Plate-Reader-Optimization_20200423194854',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-Beta-Estradiol-OR-Gate-Plant-TF-Growth-Curves_20200327174254',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-OR-Gate-CRISPR-Growth-Curves_20200414145352',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-Beta-Estradiol-OR-Gate-Plant-TF-Dose-Response_20200414091814',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Long-Duration-Time-Series-20191208_20200328071458',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Growth-Curves-with-Plate-Reader-Optimization_20200414151551',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Short-Duration-Time-Series-35C_20200423195648',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Short-Duration-Time-Series-20191208_20200328030858',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Short-Duration-Time-Series-20191208_20200414095504',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-Doxycycline-OR-Gate-Plant-TF-Dose-Response_20200423201302',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Dose-Response_20200327165322',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Long-Duration-Time-Series-20191208_20200423193808',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-Beta-Estradiol-OR-Gate-Plant-TF-Growth-Curves_20200423201016',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-OR-Gate-CRISPR-Growth-Curves_20200423202428',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-Doxycycline-OR-Gate-Plant-TF-Growth-Curves_20200327174015',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-Beta-Estradiol-OR-Gate-Plant-TF-Growth-Curves_20200414150010',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Dose-Response_20200423194728',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-OR-Gate-CRISPR-Growth-Curves_20200327171330',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-CRISPR-Short-Duration-Time-Series-20191208_20200423194115',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_CEN-PK-Inducible-CRISPR-Characterization_20200426200027',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-Doxycycline-OR-Gate-Plant-TF-Dose-Response_20200426192144',\n",
-       " '/work/projects/SD2E-Community/prod/projects/sd2e-project-43/test/dc_YeastSTATES-Doxycycline-OR-Gate-Plant-TF-Growth-Curves_20200423201744']"
-      ]
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'data_converge'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-1-1f0e54a506c8>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     24\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpysd2cat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0manalysis\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcorrectness\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 26\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mdata_converge\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutil\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdata_utils\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mdu\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     28\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0magavepy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0magave\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAgave\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mAgaveError\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'data_converge'"
+     ]
     }
    ],
    "source": [
@@ -1737,7 +1707,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3631,18 +3601,6 @@
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.8"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/demo_fcs_signal_prediction.ipynb b/notebooks/demo_fcs_signal_prediction.ipynb
@@ -0,0 +1,162 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import numpy as np\n",
+    "from os.path import expanduser\n",
+    "from fcs_signal_prediction.predict import predict_signal\n",
+    "from fcs_signal_prediction.utils import data_utils as du\n",
+    "\n",
+    "\n",
+    "%load_ext autoreload\n",
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DATA_CONVERGE_PROJECT=\"sd2e-project-43\"\n",
+    "\n",
+    "data_converge_base = os.path.join(expanduser(\"~\"), 'sd2e-projects', DATA_CONVERGE_PROJECT)\n",
+    "experiment_dir = os.path.realpath(os.path.join(data_converge_base, 'reactor_outputs', 'complete'))\n",
+    "experiment_dir_contents = [os.path.realpath(os.path.join(experiment_dir, x)) for x in os.listdir(experiment_dir)]\n",
+    "\n",
+    "experiment_dir_contents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#experiment_id = \"YeastSTATES-CRISPR-Short-Duration-Time-Series-20191208\"\n",
+    "experiment_id=\"YeastSTATES-OR-Gate-CRISPR-Dose-Response\"\n",
+    "process_dir = os.path.join(experiment_dir, experiment_id)\n",
+    "experiment_id_dir_contents = [os.path.realpath(os.path.join(process_dir, x)) for x in os.listdir(process_dir)]\n",
+    "experiment_id_dir_contents.sort()\n",
+    "last_process = experiment_id_dir_contents[-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "id_col = 'sample_id'\n",
+    "df = du.get_data(last_process, du.get_record(last_process))\n",
+    "meta = du.get_meta(last_process, du.get_record(last_process))\n",
+    "#df = du.get_meta(last_process, du.get_record(last_process))\n",
+    "df = df.merge(meta[[id_col, 'strain_name']])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "channels = list(df.columns)\n",
+    "channels.remove(id_col)\n",
+    "channels.remove(\"strain_name\")\n",
+    "\n",
+    "high_control = 'CRISPR_CEN.PK2_positive_control_NOR_00_24864'\n",
+    "low_control = 'CRISPR_CEN.PK2_negative_control_23970'\n",
+    "strain_col = \"strain_name\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictions = predict_signal(df, experiment_id, low_control, high_control, id_col, channels, strain_col)\n",
+    "mean_prediction = predictions.groupby([id_col]).agg({\"predicted_output\" : [np.mean, np.std]}).reset_index()\n",
+    "mean_prediction.columns  = mean_prediction.columns.map('_'.join)\n",
+    "mean_prediction = mean_prediction.rename(columns={id_col+\"_\": id_col})\n",
+    "result = meta.merge(mean_prediction, on=id_col)\n",
+    "result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_log_gfp(df):\n",
+    "    df['log BL1-A'] = df['BL1-A'].apply(lambda x: x+1.0).apply(np.log10)\n",
+    "    return df.replace([np.inf, -np.inf], np.nan).dropna()\n",
+    "\n",
+    "\n",
+    "log_df = get_log_gfp(df).sample(n=int(10e5))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "samples = log_df.sample_id.unique()\n",
+    "num_samples = len(samples)\n",
+    "\n",
+    "fig, ax = plt.subplots(num_samples, 1, figsize=(3, 3*num_samples))\n",
+    "\n",
+    "num=100000\n",
+    "\n",
+    "high_df = log_df.loc[log_df.strain_name == high_control]#.sample(n=num)\n",
+    "low_df = log_df.loc[log_df.strain_name == low_control]#.sample(n=num)\n",
+    "\n",
+    "for i, sample in enumerate(samples):\n",
+    "    #print(i)\n",
+    "    sample_df = log_df.loc[log_df.sample_id == sample]\n",
+    "    \n",
+    "    #sample_df = sample_df.sample(n=min(num, len(sample_df)))\n",
+    "\n",
+    "    ax[i].hist(high_df['log BL1-A'], label=\"high\")\n",
+    "    ax[i].hist(low_df['log BL1-A'], label=\"low\")\n",
+    "    ax[i].hist(sample_df['log BL1-A'], label=\"sample\")\n",
+    "    ax[i].set_xlim(0, 5)\n",
+    "    ax[i].text(6, 0, \"\\n\".join(result[result.sample_id==sample][[\"predicted_output_mean\", \"predicted_output_std\"]].transpose()[0:].to_string().split(\"\\n\")[1:]))\n",
+    "    ax[i].set_title(sample)\n",
+    "\n",
+    "#    if i >= 9: \n",
+    "#        break\n",
+    "#fig.legend()\n",
+    "\n",
+    "#result[result.sample_id==sample][[\"predicted_output_mean\", \"predicted_output_std\"]]\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/live-dead-transfer-analysis.ipynb b/notebooks/live-dead-transfer-analysis.ipynb