From afaa85ca6c7383ff74921da44911909ef6e1464b Mon Sep 17 00:00:00 2001 From: Greg Way Date: Sat, 20 Mar 2021 16:58:01 -0400 Subject: [PATCH] Adding spherized profiles (#60) * add spherize notebook * add batch 2 spherize processing * add spherized profiles * move folders * add multiple spherize outputs * add spherized data * update READMEs --- README.md | 22 ++- profiles/README.md | 55 ++++-- profiles/{profile.py => profile_cells.py} | 0 profiles/profiling_pipeline.py | 2 +- ...files_with_input_normalized_by_dmso.csv.gz | 3 + ...ith_input_normalized_by_whole_plate.csv.gz | 3 + ...files_with_input_normalized_by_dmso.csv.gz | 3 + ...ith_input_normalized_by_whole_plate.csv.gz | 3 + .../nbconverted/spherize-batch-effects.py | 100 +++++++++++ .../spherize-batch-effects.ipynb | 156 ++++++++++++++++++ 10 files changed, 319 insertions(+), 28 deletions(-) rename profiles/{profile.py => profile_cells.py} (100%) create mode 100644 spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz create mode 100644 spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz create mode 100644 spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz create mode 100644 spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz create mode 100644 spherized_profiles/scripts/nbconverted/spherize-batch-effects.py create mode 100644 spherized_profiles/spherize-batch-effects.ipynb diff --git a/README.md b/README.md index 31f9b48..f784a33 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,29 @@ -# Processed Data for the LINCS Cell Painting Project +# LINCS Cell Painting profile data repository -The repository stores data and data processing scripts for **a subset** of the [Broad Drug Repurposing Hub](https://clue.io/repurposing#home) collection of compounds. 
+The Library of Integrated Network-Based Cellular Signatures (LINCS) Project aims to create publicly available resources to characterize how cells respond to perturbation. +This repository stores Cell Painting readouts and associated data-processing pipelines for the LINCS Cell Painting dataset. -In this project, the [Connectivity Map](https://clue.io/team) team perturbed A549 cells with ~1,500 compounds across 6 doses in 5 technical replicates. +The data represent **a subset** of the [Broad Drug Repurposing Hub](https://clue.io/repurposing#home) collection of compounds. + +In this project, the [Connectivity Map](https://clue.io/team) team perturbed A549 cells with ~1,500 compounds across 6 doses in 5 technical replicates. We refer to this dataset as `LINCS Pilot 1`. +We also include data for the second batch of LINCS Cell Painting data, which we refer to as `LKCP`. -For a specific list of compounds tested, see [`metadata`](https://github.com/broadinstitute/lincs-cell-painting/tree/master/metadata). -Information about the compounds can be interactively explored in the [CLUE Repurposing app](https://clue.io/repurposing-app). +For a specific list of compounds tested, see [`metadata`](https://github.com/broadinstitute/lincs-cell-painting/tree/master/metadata). +You can interactively explore information about the compounds in the [CLUE Repurposing app](https://clue.io/repurposing-app). The [Morphology Connectivity Hub](https://clue.io/morphology) is the primary source of this dataset. -## Image-Based Profiling +## Image-Based profiling -We apply a unified, image-based profiling pipeline to all 136 384-well plates from `LINCS Pilot 1` . +We apply a unified, image-based profiling pipeline to all 136 384-well plates from `LINCS Pilot 1`, and all 135 384-well plates from `LKCP`. We use [pycytominer](https://github.com/cytomining/pycytominer) as the primary tool for image-based profiling. -The profiles are processed and stored in the [profiles/](profiles/) directory. 
+We process and store profiles in the [profiles/](profiles/) directory. See [`profiles/README.md`](profiles/README.md) for more details and for instructions on how to reproduce the pipeline. For more details about image-based profiling in general, please refer to [Caicedo et al. 2017](https://doi.org/10.1038/nmeth.4397). -## Computational Environment +## Computational environment We use [conda](https://docs.conda.io/en/latest/) to manage the computational environment. diff --git a/profiles/README.md b/profiles/README.md index fa18c5e..2081810 100644 --- a/profiles/README.md +++ b/profiles/README.md @@ -1,8 +1,9 @@ -# Image-Based Profiling +# Image-Based profiling Image-based profiling represents a series of data processing steps that turn image-based readouts into more manageable data matrices for downstream analyses ([Caicedo et al. 2017](https://doi.org/10.1038/nmeth.4397)). -Typically, the image-based readouts are derived from CellProfiler ([McQuin et al. 2018](https://doi.org/10.1371/journal.pbio.2005970)) and represent single cell morphology measurements. -In this folder, we process the CellProfiler derived morphology features using [pycytominer](https://github.com/cytomining/pycytominer) - a tool enabling reproducible image-based profiling. +Typically, you derive image-based readouts using software, like CellProfiler ([McQuin et al. 2018](https://doi.org/10.1371/journal.pbio.2005970)), that segment cells and extract so-called hand-engineered single cell morphology measurements. +In this folder, we process the CellProfiler derived morphology features for the LINCS Cell Painting dataset using [pycytominer](https://github.com/cytomining/pycytominer) - a tool enabling reproducible image-based profiling. + Specifically, we include: 1. 
Data processing scripts to perform the full unified, image-based profiling pipeline @@ -11,34 +12,52 @@ Specifically, we include: ## Workflow -![Cytominer Workflow](media/cytominer_workflow.png) +![Cytominer workflow](media/cytominer_workflow.png) Note here that we do not include the intermediate step of generating `.sqlite` files per plate using a tool called [cytominer-database](https://github.com/cytomining/cytominer-database). This repository and workflow begins after we applied cytominer-database. ## Data Levels -### CellProfilier-derived Profiles +We include two batches of Cell Painting data in this repository: `2016_04_01_a549_48hr_batch1` and `2017_12_05_Batch2`. + +### CellProfiler-derived profiles + +For each batch, we include: -| Data Level | Description | File Format | Included in this Repo | +| Data level | Description | File format | Included in this repo? | | :--------- | :---------- | :---------- | :-------------------- | -| Level 1 | Cell Images | `.tif` | No^ | -| Level 2 | Single Cell Profiles | `.sqlite` | No^ | -| Level 3 | Aggregated Profiles with Metadata | `.csv.gz` | Yes | -| Level 4a | Normalized Profiles with Metadata | `.csv.gz` | Yes | -| Level 4b | Normalized and Feature Selected Profiles with Metadata | `.csv.gz` | Yes | -| Level 5 | Consensus Perturbation Profiles | `.csv.gz` | Yes | +| Level 1 | Cell images | `.tif` | No^ | +| Level 2 | Single cell profiles | `.sqlite` | No^ | +| Level 3 | Aggregated profiles with metadata | `.csv.gz` | Yes | +| Level 4a | Normalized profiles with metadata | `.csv.gz` | Yes | +| Level 4b | Normalized and feature selected profiles with metadata | `.csv.gz` | Yes | +| Level 5 | Consensus perturbation profiles | `.csv.gz` | Yes | Importantly, we include files for _two_ different types of normalization: Whole-plate normalization, and DMSO-specific normalization. -See [`profile.py`](profile.py) for more details. +See [`profile_cells.py`](profile_cells.py) for more details. 
+ +#### Batch corrected profiles + +We use a spherize (a.k.a. whiten) transform to adjust for plate position effects. +The spherize transform adjusts for plate position effects by transforming the profile data such that the DMSO profiles are left with an identity covariance matrix. +See [`spherize-batch-effects.ipynb`](spherized_profiles/spherize-batch-effects.ipynb) for implementation details. + +In total, we include four different spherized profiles: two per batch, one for each input normalization scheme. +These data include all level 4b profiles for every batch. -^ Note that these files are being prepared +| Batch | Input data | Spherized output file | +| :---: | :--------: | :-------------------: | +| 2016_04_01_a549_48hr_batch1 | DMSO normalized | 2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz | +| 2016_04_01_a549_48hr_batch1 | Whole plate normalized | 2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz | +| 2017_12_05_Batch2 | DMSO normalized | 2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz | +| 2017_12_05_Batch2 | Whole plate normalized | 2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz | -### DeepProfiler-derived Profiles +### DeepProfiler-derived profiles TBD -## Reproduce Pipeline +## Reproduce pipeline The pipeline can be reproduced by simply executing the following: @@ -53,7 +72,7 @@ python profiling_pipeline.py python profiling_pipeline.py --batch "2017_12_05_Batch2" --plate_prefix "BR" --well_col "Metadata_Well" --plate_col "Metadata_Plate" --extract_cell_line ``` -## Critical Details +## Critical details There are several critical details that are important for understanding data generation and processing. -See [`profile.py`](profile.py) for more details about the specific processing steps and decisions. +See [`profile_cells.py`](profile_cells.py) for more details about the specific processing steps and decisions. 
diff --git a/profiles/profile.py b/profiles/profile_cells.py similarity index 100% rename from profiles/profile.py rename to profiles/profile_cells.py diff --git a/profiles/profiling_pipeline.py b/profiles/profiling_pipeline.py index 7be6f8c..d305009 100644 --- a/profiles/profiling_pipeline.py +++ b/profiles/profiling_pipeline.py @@ -75,7 +75,7 @@ cmd = [ "python", - "profile.py", + "profile_cells.py", "--sql_file", sql_file, "--batch", diff --git a/spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz b/spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz new file mode 100644 index 0000000..10ada2b --- /dev/null +++ b/spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1c20fe49259c4e91454b1e063223edcb18fec8e409298eef2387cd1479a27cb +size 290341784 diff --git a/spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz b/spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz new file mode 100644 index 0000000..ffd2ca2 --- /dev/null +++ b/spherized_profiles/profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2ef2163c6f9d0a5fb573a261c482257420f682a99ed07e66c1c1a411bc99e32 +size 327458899 diff --git a/spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz b/spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz new file mode 100644 index 0000000..374f044 --- /dev/null +++ 
b/spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd78159dcf8c88a453240025e6ecb19a2c0fe26989a5491d24ebbffb7bd794b8 +size 103902586 diff --git a/spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz b/spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz new file mode 100644 index 0000000..63fe128 --- /dev/null +++ b/spherized_profiles/profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbe636236325e325743735786c45fc627d3d9a833f512066d795847b7b576eeb +size 268978977 diff --git a/spherized_profiles/scripts/nbconverted/spherize-batch-effects.py b/spherized_profiles/scripts/nbconverted/spherize-batch-effects.py new file mode 100644 index 0000000..7fcbb86 --- /dev/null +++ b/spherized_profiles/scripts/nbconverted/spherize-batch-effects.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +# coding: utf-8 + +# ## Adjust batch effects with a spherize transform +# +# Here, we load in all normalized profiles (level 4a) data across all plates and apply a spherize transform using the DMSO profiles as the background distribution. +# +# We've previously observed that sphering (aka whitening) the data successfully adjusts for technical artifacts induced by batch to batch variation and plate position effects. 
+ +# In[1]: + + +import os +import pathlib +import subprocess +import pandas as pd + +from pycytominer import normalize, feature_select +from pycytominer.cyto_utils import output, infer_cp_features + + +# In[2]: + + +input_dir = pathlib.Path("../profiles/") +batches = ["2016_04_01_a549_48hr_batch1", "2017_12_05_Batch2"] + +suffixes = { + "whole_plate": "_normalized.csv.gz", + "dmso": "_normalized_dmso.csv.gz" +} + +plates = { + batch: [x.name for x in pathlib.Path(f"{input_dir}/{batch}").iterdir() if ".DS_Store" not in x.name] + for batch in batches +} + +files = { + batch: { + suffix: [pathlib.Path(f"{input_dir}/{batch}/{x}/{x}{suffixes[suffix]}") for x in plates[batch]] + for suffix in suffixes + } + for batch in batches +} + +feature_select_ops = [ + "variance_threshold", + "correlation_threshold", + "drop_na_columns", + "blacklist", + "drop_outliers" +] + +na_cut = 0 +corr_threshold = 0.95 +outlier_cutoff = 60 + +output_dir = "profiles" + + +# In[3]: + + +for batch in batches: + for suffix in suffixes: + output_file = pathlib.Path( + f"{output_dir}/{batch}_dmso_spherized_profiles_with_input_normalized_by_{suffix}.csv.gz" + ) + print(f"Now processing {output_file}...") + + profile_df = pd.concat([pd.read_csv(x) for x in files[batch][suffix]]).reset_index(drop=True) + + # Perform feature selection + profile_df = feature_select( + profiles=profile_df, + operation=feature_select_ops, + na_cutoff=0, + corr_threshold=corr_threshold, + outlier_cutoff=outlier_cutoff + ) + + print(profile_df.shape) + profile_df.head() + + spherize_df = normalize( + profiles=profile_df, + features="infer", + meta_features="infer", + samples="Metadata_broad_sample == 'DMSO'", + method="whiten", + ) + + print(spherize_df.shape) + spherize_df.head() + + output( + df=spherize_df, + output_filename=output_file + ) + diff --git a/spherized_profiles/spherize-batch-effects.ipynb b/spherized_profiles/spherize-batch-effects.ipynb new file mode 100644 index 0000000..4ccc929 --- /dev/null +++ 
b/spherized_profiles/spherize-batch-effects.ipynb @@ -0,0 +1,156 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adjust batch effects with a spherize transform\n", + "\n", + "Here, we load in all normalized profiles (level 4a) data across all plates and apply a spherize transform using the DMSO profiles as the background distribution.\n", + "\n", + "We've previously observed that sphering (aka whitening) the data successfully adjusts for technical artifacts induced by batch to batch variation and plate position effects." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pathlib\n", + "import subprocess\n", + "import pandas as pd\n", + "\n", + "from pycytominer import normalize, feature_select\n", + "from pycytominer.cyto_utils import output, infer_cp_features" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "input_dir = pathlib.Path(\"../profiles/\")\n", + "batches = [\"2016_04_01_a549_48hr_batch1\", \"2017_12_05_Batch2\"]\n", + "\n", + "suffixes = {\n", + " \"whole_plate\": \"_normalized.csv.gz\",\n", + " \"dmso\": \"_normalized_dmso.csv.gz\"\n", + "}\n", + "\n", + "plates = {\n", + " batch: [x.name for x in pathlib.Path(f\"{input_dir}/{batch}\").iterdir() if \".DS_Store\" not in x.name]\n", + " for batch in batches\n", + "}\n", + "\n", + "files = {\n", + " batch: {\n", + " suffix: [pathlib.Path(f\"{input_dir}/{batch}/{x}/{x}{suffixes[suffix]}\") for x in plates[batch]]\n", + " for suffix in suffixes\n", + " }\n", + " for batch in batches\n", + "}\n", + "\n", + "feature_select_ops = [\n", + " \"variance_threshold\",\n", + " \"correlation_threshold\",\n", + " \"drop_na_columns\",\n", + " \"blacklist\",\n", + " \"drop_outliers\"\n", + "]\n", + "\n", + "na_cut = 0\n", + "corr_threshold = 0.95\n", + "outlier_cutoff = 60\n", + "\n", + "output_dir = \"profiles\"" + ] + }, + { + 
"cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Now processing profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz...\n", + "(52223, 711)\n", + "(52223, 711)\n", + "Now processing profiles/2016_04_01_a549_48hr_batch1_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz...\n", + "(52223, 632)\n", + "(52223, 632)\n", + "Now processing profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz...\n", + "(51447, 600)\n", + "(51447, 600)\n", + "Now processing profiles/2017_12_05_Batch2_dmso_spherized_profiles_with_input_normalized_by_dmso.csv.gz...\n", + "(51447, 247)\n", + "(51447, 247)\n" + ] + } + ], + "source": [ + "for batch in batches:\n", + " for suffix in suffixes:\n", + " output_file = pathlib.Path(\n", + " f\"{output_dir}/{batch}_dmso_spherized_profiles_with_input_normalized_by_{suffix}.csv.gz\"\n", + " )\n", + " print(f\"Now processing {output_file}...\")\n", + "\n", + " profile_df = pd.concat([pd.read_csv(x) for x in files[batch][suffix]]).reset_index(drop=True)\n", + "\n", + " # Perform feature selection\n", + " profile_df = feature_select(\n", + " profiles=profile_df,\n", + " operation=feature_select_ops,\n", + " na_cutoff=0,\n", + " corr_threshold=corr_threshold,\n", + " outlier_cutoff=outlier_cutoff\n", + " )\n", + "\n", + " print(profile_df.shape)\n", + " profile_df.head()\n", + "\n", + " spherize_df = normalize(\n", + " profiles=profile_df,\n", + " features=\"infer\",\n", + " meta_features=\"infer\",\n", + " samples=\"Metadata_broad_sample == 'DMSO'\",\n", + " method=\"whiten\",\n", + " )\n", + "\n", + " print(spherize_df.shape)\n", + " spherize_df.head()\n", + "\n", + " output(\n", + " df=spherize_df,\n", + " output_filename=output_file\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": 
"python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}