From afaa85ca6c7383ff74921da44911909ef6e1464b Mon Sep 17 00:00:00 2001 From: Greg Way Date: Sat, 20 Mar 2021 16:58:01 -0400 Subject: [PATCH] Adding spherized profiles (#60) * add spherize notebook * add batch 2 spherize processing * add spherized profiles * move folders * add multiple spherize outputs * add spherized data * update READMEs --- README.md | 22 ++- profiles/README.md | 55 ++++-- profiles/{profile.py => profile_cells.py} | 0 profiles/profiling_pipeline.py | 2 +- ...files_with_input_normalized_by_dmso.csv.gz | 3 + ...ith_input_normalized_by_whole_plate.csv.gz | 3 + ...files_with_input_normalized_by_dmso.csv.gz | 3 + ...ith_input_normalized_by_whole_plate.csv.gz | 3 + .../nbconverted/spherize-batch-effects.py | 100 +++++++++++ .../spherize-batch-effects.ipynb | 156 ++++++++++++++++++ 10 files changed, 319 insertions(+), 28 deletions(-) rename profiles/{profile.py => profile_cells.py} (100%) create mode 100644 spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz create mode 100644 spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz create mode 100644 spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz create mode 100644 spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz create mode 100644 spherized_profiles/scripts/nbconverted/spherize-batch-effects.py create mode 100644 spherized_profiles/spherize-batch-effects.ipynb diff --git a/README.md b/README.md index 31f9b48..f784a33 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,29 @@ -# Processed Data for the LINCS Cell Painting Project +# LINCS Cell Painting profile data repository -The repository stores data and data processing scripts for **a subset** of the [Broad Drug Repurposing Hub](https://clue.io/repurposing#home) collection of compounds. 
+The Library of Integrated Network-Based Cellular Signatures (LINCS) Project aims to create publicly available resources to characterize how cells respond to perturbation. +This repository stores Cell Painting readouts and associated data-processing pipelines for the LINCS Cell Painting dataset. -In this project, the [Connectivity Map](https://clue.io/team) team perturbed A549 cells with ~1,500 compounds across 6 doses in 5 technical replicates. +The data represent **a subset** of the [Broad Drug Repurposing Hub](https://clue.io/repurposing#home) collection of compounds. + +In this project, the [Connectivity Map](https://clue.io/team) team perturbed A549 cells with ~1,500 compounds across 6 doses in 5 technical replicates. We refer to this dataset as `LINCS Pilot 1`. +We also include data for the second batch of LINCS Cell Painting data, which we refer to as `LKCP`. -For a specific list of compounds tested, see [`metadata`](https://github.com/broadinstitute/lincs-cell-painting/tree/master/metadata). -Information about the compounds can be interactively explored in the [CLUE Repurposing app](https://clue.io/repurposing-app). +For a specific list of compounds tested, see [`metadata`](https://github.com/broadinstitute/lincs-cell-painting/tree/master/metadata). +You can interactively explore information about the compounds in the [CLUE Repurposing app](https://clue.io/repurposing-app). The [Morphology Connectivity Hub](https://clue.io/morphology) is the primary source of this dataset. -## Image-Based Profiling +## Image-Based profiling -We apply a unified, image-based profiling pipeline to all 136 384-well plates from `LINCS Pilot 1` . +We apply a unified, image-based profiling pipeline to all 136 384-well plates from `LINCS Pilot 1`, and all 135 384-well plates from `LKCP`. We use [pycytominer](https://github.com/cytomining/pycytominer) as the primary tool for image-based profiling. -The profiles are processed and stored in the [profiles/](profiles/) directory. 
+We process and store profiles in the [profiles/](profiles/) directory. See [`profiles/README.md`](profiles/README.md) for more details and for instructions on how to reproduce the pipeline. For more details about image-based profiling in general, please refer to [Caicedo et al. 2017](https://doi.org/10.1038/nmeth.4397). -## Computational Environment +## Computational environment We use [conda](https://docs.conda.io/en/latest/) to manage the computational environment. diff --git a/profiles/README.md b/profiles/README.md index fa18c5e..2081810 100644 --- a/profiles/README.md +++ b/profiles/README.md @@ -1,8 +1,9 @@ -# Image-Based Profiling +# Image-Based profiling Image-based profiling represents a series of data processing steps that turn image-based readouts into more manageable data matrices for downstream analyses ([Caicedo et al. 2017](https://doi.org/10.1038/nmeth.4397)). -Typically, the image-based readouts are derived from CellProfiler ([McQuin et al. 2018](https://doi.org/10.1371/journal.pbio.2005970)) and represent single cell morphology measurements. -In this folder, we process the CellProfiler derived morphology features using [pycytominer](https://github.com/cytomining/pycytominer) - a tool enabling reproducible image-based profiling. +Typically, you derive image-based readouts using software, like CellProfiler ([McQuin et al. 2018](https://doi.org/10.1371/journal.pbio.2005970)), that segment cells and extract so-called hand-engineered single cell morphology measurements. +In this folder, we process the CellProfiler derived morphology features for the LINCS Cell Painting dataset using [pycytominer](https://github.com/cytomining/pycytominer) - a tool enabling reproducible image-based profiling. + Specifically, we include: 1. 
Data processing scripts to perform the full unified, image-based profiling pipeline @@ -11,34 +12,52 @@ Specifically, we include: ## Workflow -![Cytominer Workflow](media/cytominer_workflow.png) +![Cytominer workflow](media/cytominer_workflow.png) Note here that we do not include the intermediate step of generating `.sqlite` files per plate using a tool called [cytominer-database](https://github.com/cytomining/cytominer-database). This repository and workflow begins after we applied cytominer-database. ## Data Levels -### CellProfilier-derived Profiles +We include two batches of Cell Painting data in this repository: `2016_04_01_a549_48hr_batch1` and `2017_12_05_Batch2`. + +### CellProfiler-derived profiles + +For each batch, we include: -| Data Level | Description | File Format | Included in this Repo | +| Data level | Description | File format | Included in this repo? | | :--------- | :---------- | :---------- | :-------------------- | -| Level 1 | Cell Images | `.tif` | No^ | -| Level 2 | Single Cell Profiles | `.sqlite` | No^ | -| Level 3 | Aggregated Profiles with Metadata | `.csv.gz` | Yes | -| Level 4a | Normalized Profiles with Metadata | `.csv.gz` | Yes | -| Level 4b | Normalized and Feature Selected Profiles with Metadata | `.csv.gz` | Yes | -| Level 5 | Consensus Perturbation Profiles | `.csv.gz` | Yes | +| Level 1 | Cell images | `.tif` | No^ | +| Level 2 | Single cell profiles | `.sqlite` | No^ | +| Level 3 | Aggregated profiles with metadata | `.csv.gz` | Yes | +| Level 4a | Normalized profiles with metadata | `.csv.gz` | Yes | +| Level 4b | Normalized and feature selected profiles with metadata | `.csv.gz` | Yes | +| Level 5 | Consensus perturbation profiles | `.csv.gz` | Yes | Importantly, we include files for _two_ different types of normalization: Whole-plate normalization, and DMSO-specific normalization. -See [`profile.py`](profile.py) for more details. +See [`profile_cells.py`](profile_cells.py) for more details. 
+ +#### Batch corrected profiles + +We use a spherize (a.k.a. whiten) transform to adjust for plate position effects. +The spherize transform adjusts for plate position effects by transforming the profile data such that the DMSO profiles are left with an identity covariance matrix. +See [`spherize-batch-effects.ipynb`](spherized_profiles/spherize-batch-effects.ipynb) for implementation details. + +In total, we include four different spherized profiles: two per batch, one for each input normalization scheme. +These data include all level 4b profiles for every batch. -^ Note that these files are being prepared +| Batch | Input data | Spherized output file | +| :---: | :--------: | :-------------------: | +| 2016_04_01_a549_48hr_batch1 | DMSO normalized | 2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz | +| 2016_04_01_a549_48hr_batch1 | Whole plate normalized | 2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz | +| 2017_12_05_Batch2 | DMSO normalized | 2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz | +| 2017_12_05_Batch2 | Whole plate normalized | 2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz | -### DeepProfiler-derived Profiles +### DeepProfiler-derived profiles TBD -## Reproduce Pipeline +## Reproduce pipeline The pipeline can be reproduced by simply executing the following: @@ -53,7 +72,7 @@ python profiling_pipeline.py python profiling_pipeline.py --batch "2017_12_05_Batch2" --plate_prefix "BR" --well_col "Metadata_Well" --plate_col "Metadata_Plate" --extract_cell_line ``` -## Critical Details +## Critical details There are several critical details that are important for understanding data generation and processing. -See [`profile.py`](profile.py) for more details about the specific processing steps and decisions. +See [`profile_cells.py`](profile_cells.py) for more details about the specific processing steps and decisions. 
diff --git a/profiles/profile.py b/profiles/profile_cells.py similarity index 100% rename from profiles/profile.py rename to profiles/profile_cells.py diff --git a/profiles/profiling_pipeline.py b/profiles/profiling_pipeline.py index 7be6f8c..d305009 100644 --- a/profiles/profiling_pipeline.py +++ b/profiles/profiling_pipeline.py @@ -75,7 +75,7 @@ cmd = [ "python", - "profile.py", + "profile_cells.py", "--sql_file", sql_file, "--batch", diff --git a/spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz b/spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz new file mode 100644 index 0000000..10ada2b --- /dev/null +++ b/spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1c20fe49259c4e91454b1e063223edcb18fec8e409298eef2387cd1479a27cb +size 290341784 diff --git a/spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz b/spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz new file mode 100644 index 0000000..ffd2ca2 --- /dev/null +++ b/spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2ef2163c6f9d0a5fb573a261c482257420f682a99ed07e66c1c1a411bc99e32 +size 327458899 diff --git a/spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz b/spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz new file mode 100644 index 0000000..374f044 --- /dev/null +++ 
b/spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd78159dcf8c88a453240025e6ecb19a2c0fe26989a5491d24ebbffb7bd794b8 +size 103902586 diff --git a/spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz b/spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz new file mode 100644 index 0000000..63fe128 --- /dev/null +++ b/spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbe636236325e325743735786c45fc627d3d9a833f512066d795847b7b576eeb +size 268978977 diff --git a/spherized_profiles/scripts/nbconverted/spherize-batch-effects.py b/spherized_profiles/scripts/nbconverted/spherize-batch-effects.py new file mode 100644 index 0000000..7fcbb86 --- /dev/null +++ b/spherized_profiles/scripts/nbconverted/spherize-batch-effects.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +# coding: utf-8 + +# ## Adjust batch effects with a spherize transform +# +# Here, we load in all normalized profiles (level 4a) data across all plates and apply a spherize transform using the DMSO profiles as the background distribution. +# +# We've previously observed that sphering (aka whitening) the data successfully adjusts for technical artifacts induced by batch to batch variation and plate position effects. 
+ +# In[1]: + + +import os +import pathlib +import subprocess +import pandas as pd + +from pycytominer import normalize, feature_select +from pycytominer.cyto_utils import output, infer_cp_features + + +# In[2]: + + +input_dir = pathlib.Path("../profiles/") +batches = ["2016_04_01_a549_48hr_batch1", "2017_12_05_Batch2"] + +suffixes = { + "whole_plate": "_normalized.csv.gz", + "dmso": "_normalized_dmso.csv.gz" +} + +plates = { + batch: [x.name for x in pathlib.Path(f"{input_dir}/{batch}").iterdir() if ".DS_Store" not in x.name] + for batch in batches +} + +files = { + batch: { + suffix: [pathlib.Path(f"{input_dir}/{batch}/{x}/{x}{suffixes[suffix]}") for x in plates[batch]] + for suffix in suffixes + } + for batch in batches +} + +feature_select_ops = [ + "variance_threshold", + "correlation_threshold", + "drop_na_columns", + "blacklist", + "drop_outliers" +] + +na_cut = 0 +corr_threshold = 0.95 +outlier_cutoff = 60 + +output_dir = "profiles" + + +# In[3]: + + +for batch in batches: + for suffix in suffixes: + output_file = pathlib.Path( + f"{output_dir}/{batch}_dmso_spherized_profiles_with_input_normalized_by_{suffix}.csv.gz" + ) + print(f"Now processing {output_file}...") + + profile_df = pd.concat([pd.read_csv(x) for x in files[batch][suffix]]).reset_index(drop=True) + + # Perform feature selection + profile_df = feature_select( + profiles=profile_df, + operation=feature_select_ops, + na_cutoff=0, + corr_threshold=corr_threshold, + outlier_cutoff=outlier_cutoff + ) + + print(profile_df.shape) + profile_df.head() + + spherize_df = normalize( + profiles=profile_df, + features="infer", + meta_features="infer", + samples="Metadata_broad_sample == 'DMSO'", + method="whiten", + ) + + print(spherize_df.shape) + spherize_df.head() + + output( + df=spherize_df, + output_filename=output_file + ) + diff --git a/spherized_profiles/spherize-batch-effects.ipynb b/spherized_profiles/spherize-batch-effects.ipynb new file mode 100644 index 0000000..4ccc929 --- /dev/null +++ 
b/spherized_profiles/spherize-batch-effects.ipynb @@ -0,0 +1,156 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adjust batch effects with a spherize transform\n", + "\n", + "Here, we load in all normalized profiles (level 4a) data across all plates and apply a spherize transform using the DMSO profiles as the background distribution.\n", + "\n", + "We've previously observed that sphering (aka whitening) the data successfully adjusts for technical artifacts induced by batch to batch variation and plate position effects." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pathlib\n", + "import subprocess\n", + "import pandas as pd\n", + "\n", + "from pycytominer import normalize, feature_select\n", + "from pycytominer.cyto_utils import output, infer_cp_features" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "input_dir = pathlib.Path(\"../profiles/\")\n", + "batches = [\"2016_04_01_a549_48hr_batch1\", \"2017_12_05_Batch2\"]\n", + "\n", + "suffixes = {\n", + " \"whole_plate\": \"_normalized.csv.gz\",\n", + " \"dmso\": \"_normalized_dmso.csv.gz\"\n", + "}\n", + "\n", + "plates = {\n", + " batch: [x.name for x in pathlib.Path(f\"{input_dir}/{batch}\").iterdir() if \".DS_Store\" not in x.name]\n", + " for batch in batches\n", + "}\n", + "\n", + "files = {\n", + " batch: {\n", + " suffix: [pathlib.Path(f\"{input_dir}/{batch}/{x}/{x}{suffixes[suffix]}\") for x in plates[batch]]\n", + " for suffix in suffixes\n", + " }\n", + " for batch in batches\n", + "}\n", + "\n", + "feature_select_ops = [\n", + " \"variance_threshold\",\n", + " \"correlation_threshold\",\n", + " \"drop_na_columns\",\n", + " \"blacklist\",\n", + " \"drop_outliers\"\n", + "]\n", + "\n", + "na_cut = 0\n", + "corr_threshold = 0.95\n", + "outlier_cutoff = 60\n", + "\n", + "output_dir = \"profiles\"" + ] + }, + { + 
"cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Now processing profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz...\n", + "(52223, 711)\n", + "(52223, 711)\n", + "Now processing profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz...\n", + "(52223, 632)\n", + "(52223, 632)\n", + "Now processing profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz...\n", + "(51447, 600)\n", + "(51447, 600)\n", + "Now processing profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz...\n", + "(51447, 247)\n", + "(51447, 247)\n" + ] + } + ], + "source": [ + "for batch in batches:\n", + " for suffix in suffixes:\n", + " output_file = pathlib.Path(\n", + " f\"{output_dir}/{batch}_dmso_spherized_profiles_with_input_normalized_by_{suffix}.csv.gz\"\n", + " )\n", + " print(f\"Now processing {output_file}...\")\n", + "\n", + " profile_df = pd.concat([pd.read_csv(x) for x in files[batch][suffix]]).reset_index(drop=True)\n", + "\n", + " # Perform feature selection\n", + " profile_df = feature_select(\n", + " profiles=profile_df,\n", + " operation=feature_select_ops,\n", + " na_cutoff=0,\n", + " corr_threshold=corr_threshold,\n", + " outlier_cutoff=outlier_cutoff\n", + " )\n", + "\n", + " print(profile_df.shape)\n", + " profile_df.head()\n", + "\n", + " spherize_df = normalize(\n", + " profiles=profile_df,\n", + " features=\"infer\",\n", + " meta_features=\"infer\",\n", + " samples=\"Metadata_broad_sample == 'DMSO'\",\n", + " method=\"whiten\",\n", + " )\n", + "\n", + " print(spherize_df.shape)\n", + " spherize_df.head()\n", + "\n", + " output(\n", + " df=spherize_df,\n", + " output_filename=output_file\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": 
"python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}