Skip to content

Commit

Permalink
Add PRISM drug response metadata annotation (#716)
Browse files Browse the repository at this point in the history
* Added PRISM metadata annotation

* Made PRISM annotation case-invariant

* Likely fix tests

Signed-off-by: Lukas Heumos <[email protected]>

* Refactoring

Signed-off-by: Lukas Heumos <[email protected]>

---------

Signed-off-by: Lukas Heumos <[email protected]>
Co-authored-by: Lukas Heumos <[email protected]>
  • Loading branch information
Lilly-May and Zethson authored Feb 23, 2025
1 parent 3137547 commit 7b9d965
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 13 deletions.
109 changes: 102 additions & 7 deletions pertpy/metadata/_cell_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def __init__(self):
self.proteomics = None
self.drug_response_gdsc1 = None
self.drug_response_gdsc2 = None
self.drug_response_prism = None

def _download_cell_line(self, cell_line_source: Literal["DepMap", "Cancerrxgene"] = "DepMap") -> None:
if cell_line_source == "DepMap":
Expand Down Expand Up @@ -157,7 +158,7 @@ def _download_proteomics(self) -> None:
def _download_gdsc(self, gdsc_dataset: Literal[1, 2] = 1) -> None:
if gdsc_dataset == 1:
# Download GDSC drug response data
# Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s)
# Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s and AUC)
# URL: https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC1_fitted_dose_response_24Jul22.xlsx
drug_response_gdsc1_file_path = Path(settings.cachedir) / "gdsc1_info.csv"
if not Path(drug_response_gdsc1_file_path).exists():
Expand All @@ -181,6 +182,23 @@ def _download_gdsc(self, gdsc_dataset: Literal[1, 2] = 1) -> None:
)
self.drug_response_gdsc2 = pd.read_csv(drug_response_gdsc2_file_path, index_col=0)

def _download_prism(self) -> None:
# Download PRISM drug response data
# Source: DepMap PRISM Repurposing 19Q4 secondary screen dose response curve parameters
drug_response_prism_file_path = Path(settings.cachedir) / "prism_info.csv"
if not Path(drug_response_prism_file_path).exists():
_download(
url="https://figshare.com/ndownloader/files/20237739",
output_file_name="prism_info.csv",
output_path=settings.cachedir,
block_size=4096,
is_zip=False,
)
df = pd.read_csv(drug_response_prism_file_path, index_col=0)[["depmap_id", "name", "ic50", "ec50", "auc"]]
df = df.dropna(subset=["depmap_id", "name"])
df = df.groupby(["depmap_id", "name"]).mean().reset_index()
self.drug_response_prism = df

def annotate(
self,
adata: AnnData,
Expand All @@ -197,13 +215,13 @@ def annotate(
Args:
adata: The data object to annotate.
query_id: The column of `.obs` with cell line information.
query_id: The column of ``.obs`` with cell line information.
reference_id: The type of cell line identifier in the metadata, e.g. ModelID, CellLineName or StrippedCellLineName.
If fetching cell line metadata from Cancerrxgene, it is recommended to choose "stripped_cell_line_name".
fetch: The metadata to fetch.
cell_line_source: The source of cell line metadata, DepMap or Cancerrxgene.
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
copy: Determines whether a copy of the `adata` is returned.
copy: Determines whether a copy of ``adata`` is returned.
Returns:
Returns an AnnData object with cell line annotation.
Expand Down Expand Up @@ -316,7 +334,8 @@ def annotate_bulk_rna(
Args:
adata: The data object to annotate.
query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name" if `cell_line_source` is sanger, otherwise "DepMap_ID".
query_id: The column of `.obs` with cell line information.
Defaults to "cell_line_name" if `cell_line_source` is sanger, otherwise "DepMap_ID".
cell_line_source: The bulk rna expression data from either broad or sanger cell line.
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
copy: Determines whether a copy of the `adata` is returned.
Expand Down Expand Up @@ -491,7 +510,7 @@ def annotate_from_gdsc(
) -> AnnData:
"""Fetch drug response data from GDSC.
For each cell, we fetch drug response data as natural log of the fitted IC50 for its
For each cell, we fetch drug response data as natural log of the fitted IC50 and AUC for its
corresponding cell line and perturbation from GDSC fitted data results file.
Args:
Expand Down Expand Up @@ -554,9 +573,82 @@ def annotate_from_gdsc(
adata.obs = (
adata.obs.reset_index()
.set_index([query_id, query_perturbation])
.assign(ln_ic50=gdsc_data.set_index([reference_id, reference_perturbation]).ln_ic50)
.assign(ln_ic50_gdsc=gdsc_data.set_index([reference_id, reference_perturbation]).ln_ic50)
.assign(auc_gdsc=gdsc_data.set_index([reference_id, reference_perturbation]).auc)
.reset_index()
.set_index(old_index_name)
)

return adata

def annotate_from_prism(
self,
adata: AnnData,
query_id: str = "DepMap_ID",
query_perturbation: str = "perturbation",
verbosity: int | str = 5,
copy: bool = False,
) -> AnnData:
"""Fetch drug response data from PRISM.
For each cell, we fetch drug response data as IC50, EC50 and AUC for its
corresponding cell line and perturbation from PRISM fitted data results file.
Note that all rows where either `depmap_id` or `name` is missing will be dropped.
Args:
adata: The data object to annotate.
query_id: The column of `.obs` with cell line information.
query_perturbation: The column of `.obs` with perturbation information.
verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.
copy: Determines whether a copy of the `adata` is returned.
Returns:
Returns an AnnData object with drug response annotation.
Examples:
>>> import pertpy as pt
>>> adata = pt.dt.mcfarland_2020()
>>> pt_metadata = pt.md.CellLine()
>>> pt_metadata.annotate_from_prism(adata, query_id="DepMap_ID")
"""
if copy:
adata = adata.copy()
if query_id not in adata.obs.columns:
raise ValueError(
f"The specified `query_id` {query_id} can't be found in the `adata.obs`. \n"
"Ensure that you are using one of the available query IDs present in 'adata.obs' for the annotation.\n"
"If the desired query ID is not available, you can fetch the cell line metadata "
"using the `annotate()` function before calling `annotate_from_prism()`. "
"This ensures that the required query ID is included in your data."
)
if self.drug_response_prism is None:
self._download_prism()
prism_data = self.drug_response_prism
# PRISM starts most drug names with a lowercase letter, so we want to make it case-insensitive
prism_data["name_lower"] = prism_data["name"].str.lower()
adata.obs["perturbation_lower"] = adata.obs[query_perturbation].str.lower()

identifier_num_all = len(adata.obs[query_id].unique())
not_matched_identifiers = list(set(adata.obs[query_id]) - set(prism_data["depmap_id"]))
self._warn_unmatch(
total_identifiers=identifier_num_all,
unmatched_identifiers=not_matched_identifiers,
query_id=query_id,
reference_id="depmap_id",
metadata_type="drug response",
verbosity=verbosity,
)

old_index_name = "index" if adata.obs.index.name is None else adata.obs.index.name
adata.obs = (
adata.obs.reset_index()
.set_index([query_id, "perturbation_lower"])
.assign(ic50_prism=prism_data.set_index(["depmap_id", "name"]).ic50)
.assign(ec50_prism=prism_data.set_index(["depmap_id", "name"]).ec50)
.assign(auc_prism=prism_data.set_index(["depmap_id", "name"]).auc)
.reset_index()
.set_index(old_index_name)
.drop(columns="perturbation_lower")
)

return adata
Expand All @@ -577,7 +669,7 @@ def lookup(self) -> LookUp:
>>> pt_metadata = pt.md.CellLine()
>>> lookup = pt_metadata.lookup()
"""
# Fetch the metadata if it hasn't beed downloaded yet
# Fetch the metadata if it hasn't been downloaded yet
if self.depmap is None:
self._download_cell_line(cell_line_source="DepMap")
if self.cancerxgene is None:
Expand All @@ -594,6 +686,8 @@ def lookup(self) -> LookUp:
self._download_gdsc(gdsc_dataset=1)
if self.drug_response_gdsc2 is None:
self._download_gdsc(gdsc_dataset=2)
if self.drug_response_prism is None:
self._download_prism()

# Transfer the data
return LookUp(
Expand All @@ -607,6 +701,7 @@ def lookup(self) -> LookUp:
self.proteomics,
self.drug_response_gdsc1,
self.drug_response_gdsc2,
self.drug_response_prism,
],
)

Expand Down
26 changes: 20 additions & 6 deletions tests/metadata/test_cell_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,14 @@

@pytest.fixture
def adata() -> AnnData:
rng = np.random.default_rng(seed=1)

X = rng.normal(0, 1, (NUM_CELLS, NUM_GENES))
X = np.where(X < 0, 0, X)
X = np.random.default_rng().normal(0, 1, (NUM_CELLS, NUM_GENES))

obs = pd.DataFrame(
{
"DepMap_ID": ["ACH-000016", "ACH-000049", "ACH-001208", "ACH-000956"] * NUM_CELLS_PER_ID,
"perturbation": ["Midostaurin"] * NUM_CELLS_PER_ID * 4,
},
index=[str(i) for i in range(NUM_GENES)],
index=[str(i) for i in range(NUM_CELLS)],
)

var_data = {"gene_name": [f"gene{i}" for i in range(1, NUM_GENES + 1)]}
Expand All @@ -48,7 +45,24 @@ def test_cell_line_annotation(adata):
def test_gdsc_annotation(adata):
pt_metadata.annotate(adata)
pt_metadata.annotate_from_gdsc(adata, query_id="StrippedCellLineName")
assert "ln_ic50" in adata.obs
assert "ln_ic50_gdsc" in adata.obs
assert "auc_gdsc" in adata.obs


def test_prism_annotation(adata):
adata.obs = pd.DataFrame(
{
"DepMap_ID": ["ACH-000879", "ACH-000488", "ACH-000488", "ACH-000008"] * NUM_CELLS_PER_ID,
"perturbation": ["cytarabine", "cytarabine", "secnidazole", "flutamide"] * NUM_CELLS_PER_ID,
},
index=[str(i) for i in range(NUM_CELLS)],
)

pt_metadata.annotate(adata)
pt_metadata.annotate_from_prism(adata, query_id="DepMap_ID")
assert "ic50_prism" in adata.obs
assert "ec50_prism" in adata.obs
assert "auc_prism" in adata.obs


def test_protein_expression_annotation(adata):
Expand Down

0 comments on commit 7b9d965

Please sign in to comment.