Skip to content

Commit 7b9d965

Browse files
Lilly-MayZethson
andauthored
Add PRISM drug response metadata annotation (#716)
* Added PRISM metadata annotation * Made PRISM annotation case-invariant * Likely fix tests Signed-off-by: Lukas Heumos <[email protected]> * Refactoring Signed-off-by: Lukas Heumos <[email protected]> --------- Signed-off-by: Lukas Heumos <[email protected]> Co-authored-by: Lukas Heumos <[email protected]>
1 parent 3137547 commit 7b9d965

File tree

2 files changed

+122
-13
lines changed

2 files changed

+122
-13
lines changed

pertpy/metadata/_cell_line.py

Lines changed: 102 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def __init__(self):
3939
self.proteomics = None
4040
self.drug_response_gdsc1 = None
4141
self.drug_response_gdsc2 = None
42+
self.drug_response_prism = None
4243

4344
def _download_cell_line(self, cell_line_source: Literal["DepMap", "Cancerrxgene"] = "DepMap") -> None:
4445
if cell_line_source == "DepMap":
@@ -157,7 +158,7 @@ def _download_proteomics(self) -> None:
157158
def _download_gdsc(self, gdsc_dataset: Literal[1, 2] = 1) -> None:
158159
if gdsc_dataset == 1:
159160
# Download GDSC drug response data
160-
# Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s)
161+
# Source: https://www.cancerrxgene.org/downloads/bulk_download (Drug Screening - IC50s and AUC)
161162
# URL: https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/GDSC1_fitted_dose_response_24Jul22.xlsx
162163
drug_response_gdsc1_file_path = Path(settings.cachedir) / "gdsc1_info.csv"
163164
if not Path(drug_response_gdsc1_file_path).exists():
@@ -181,6 +182,23 @@ def _download_gdsc(self, gdsc_dataset: Literal[1, 2] = 1) -> None:
181182
)
182183
self.drug_response_gdsc2 = pd.read_csv(drug_response_gdsc2_file_path, index_col=0)
183184

185+
def _download_prism(self) -> None:
186+
# Download PRISM drug response data
187+
# Source: DepMap PRISM Repurposing 19Q4 secondary screen dose response curve parameters
188+
drug_response_prism_file_path = Path(settings.cachedir) / "prism_info.csv"
189+
if not Path(drug_response_prism_file_path).exists():
190+
_download(
191+
url="https://figshare.com/ndownloader/files/20237739",
192+
output_file_name="prism_info.csv",
193+
output_path=settings.cachedir,
194+
block_size=4096,
195+
is_zip=False,
196+
)
197+
df = pd.read_csv(drug_response_prism_file_path, index_col=0)[["depmap_id", "name", "ic50", "ec50", "auc"]]
198+
df = df.dropna(subset=["depmap_id", "name"])
199+
df = df.groupby(["depmap_id", "name"]).mean().reset_index()
200+
self.drug_response_prism = df
201+
184202
def annotate(
185203
self,
186204
adata: AnnData,
@@ -197,13 +215,13 @@ def annotate(
197215
198216
Args:
199217
adata: The data object to annotate.
200-
query_id: The column of `.obs` with cell line information.
218+
query_id: The column of ``.obs`` with cell line information.
201219
reference_id: The type of cell line identifier in the metadata, e.g. ModelID, CellLineName or StrippedCellLineName.
202220
If fetching cell line metadata from Cancerrxgene, it is recommended to choose "stripped_cell_line_name".
203221
fetch: The metadata to fetch.
204222
cell_line_source: The source of cell line metadata, DepMap or Cancerrxgene.
205223
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
206-
copy: Determines whether a copy of the `adata` is returned.
224+
copy: Determines whether a copy of ``adata`` is returned.
207225
208226
Returns:
209227
Returns an AnnData object with cell line annotation.
@@ -316,7 +334,8 @@ def annotate_bulk_rna(
316334
317335
Args:
318336
adata: The data object to annotate.
319-
query_id: The column of `.obs` with cell line information. Defaults to "cell_line_name" if `cell_line_source` is sanger, otherwise "DepMap_ID".
337+
query_id: The column of `.obs` with cell line information.
338+
Defaults to "cell_line_name" if `cell_line_source` is sanger, otherwise "DepMap_ID".
320339
cell_line_source: The bulk rna expression data from either broad or sanger cell line.
321340
verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all".
322341
copy: Determines whether a copy of the `adata` is returned.
@@ -491,7 +510,7 @@ def annotate_from_gdsc(
491510
) -> AnnData:
492511
"""Fetch drug response data from GDSC.
493512
494-
For each cell, we fetch drug response data as natural log of the fitted IC50 for its
513+
For each cell, we fetch drug response data as natural log of the fitted IC50 and AUC for its
495514
corresponding cell line and perturbation from GDSC fitted data results file.
496515
497516
Args:
@@ -554,9 +573,82 @@ def annotate_from_gdsc(
554573
adata.obs = (
555574
adata.obs.reset_index()
556575
.set_index([query_id, query_perturbation])
557-
.assign(ln_ic50=gdsc_data.set_index([reference_id, reference_perturbation]).ln_ic50)
576+
.assign(ln_ic50_gdsc=gdsc_data.set_index([reference_id, reference_perturbation]).ln_ic50)
577+
.assign(auc_gdsc=gdsc_data.set_index([reference_id, reference_perturbation]).auc)
578+
.reset_index()
579+
.set_index(old_index_name)
580+
)
581+
582+
return adata
583+
584+
def annotate_from_prism(
585+
self,
586+
adata: AnnData,
587+
query_id: str = "DepMap_ID",
588+
query_perturbation: str = "perturbation",
589+
verbosity: int | str = 5,
590+
copy: bool = False,
591+
) -> AnnData:
592+
"""Fetch drug response data from PRISM.
593+
594+
For each cell, we fetch drug response data as IC50, EC50 and AUC for its
595+
corresponding cell line and perturbation from PRISM fitted data results file.
596+
Note that all rows where either `depmap_id` or `name` is missing will be dropped.
597+
598+
Args:
599+
adata: The data object to annotate.
600+
query_id: The column of `.obs` with cell line information.
601+
query_perturbation: The column of `.obs` with perturbation information.
602+
verbosity: The number of unmatched identifiers to print, can be either non-negative values or 'all'.
603+
copy: Determines whether a copy of the `adata` is returned.
604+
605+
Returns:
606+
Returns an AnnData object with drug response annotation.
607+
608+
Examples:
609+
>>> import pertpy as pt
610+
>>> adata = pt.dt.mcfarland_2020()
611+
>>> pt_metadata = pt.md.CellLine()
612+
>>> pt_metadata.annotate_from_prism(adata, query_id="DepMap_ID")
613+
"""
614+
if copy:
615+
adata = adata.copy()
616+
if query_id not in adata.obs.columns:
617+
raise ValueError(
618+
f"The specified `query_id` {query_id} can't be found in the `adata.obs`. \n"
619+
"Ensure that you are using one of the available query IDs present in 'adata.obs' for the annotation.\n"
620+
"If the desired query ID is not available, you can fetch the cell line metadata "
621+
"using the `annotate()` function before calling `annotate_from_prism()`. "
622+
"This ensures that the required query ID is included in your data."
623+
)
624+
if self.drug_response_prism is None:
625+
self._download_prism()
626+
prism_data = self.drug_response_prism
627+
# PRISM starts most drug names with a lowercase letter, so we want to make it case-insensitive
628+
prism_data["name_lower"] = prism_data["name"].str.lower()
629+
adata.obs["perturbation_lower"] = adata.obs[query_perturbation].str.lower()
630+
631+
identifier_num_all = len(adata.obs[query_id].unique())
632+
not_matched_identifiers = list(set(adata.obs[query_id]) - set(prism_data["depmap_id"]))
633+
self._warn_unmatch(
634+
total_identifiers=identifier_num_all,
635+
unmatched_identifiers=not_matched_identifiers,
636+
query_id=query_id,
637+
reference_id="depmap_id",
638+
metadata_type="drug response",
639+
verbosity=verbosity,
640+
)
641+
642+
old_index_name = "index" if adata.obs.index.name is None else adata.obs.index.name
643+
adata.obs = (
644+
adata.obs.reset_index()
645+
.set_index([query_id, "perturbation_lower"])
646+
.assign(ic50_prism=prism_data.set_index(["depmap_id", "name"]).ic50)
647+
.assign(ec50_prism=prism_data.set_index(["depmap_id", "name"]).ec50)
648+
.assign(auc_prism=prism_data.set_index(["depmap_id", "name"]).auc)
558649
.reset_index()
559650
.set_index(old_index_name)
651+
.drop(columns="perturbation_lower")
560652
)
561653

562654
return adata
@@ -577,7 +669,7 @@ def lookup(self) -> LookUp:
577669
>>> pt_metadata = pt.md.CellLine()
578670
>>> lookup = pt_metadata.lookup()
579671
"""
580-
# Fetch the metadata if it hasn't beed downloaded yet
672+
# Fetch the metadata if it hasn't been downloaded yet
581673
if self.depmap is None:
582674
self._download_cell_line(cell_line_source="DepMap")
583675
if self.cancerxgene is None:
@@ -594,6 +686,8 @@ def lookup(self) -> LookUp:
594686
self._download_gdsc(gdsc_dataset=1)
595687
if self.drug_response_gdsc2 is None:
596688
self._download_gdsc(gdsc_dataset=2)
689+
if self.drug_response_prism is None:
690+
self._download_prism()
597691

598692
# Transfer the data
599693
return LookUp(
@@ -607,6 +701,7 @@ def lookup(self) -> LookUp:
607701
self.proteomics,
608702
self.drug_response_gdsc1,
609703
self.drug_response_gdsc2,
704+
self.drug_response_prism,
610705
],
611706
)
612707

tests/metadata/test_cell_line.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,14 @@
1616

1717
@pytest.fixture
1818
def adata() -> AnnData:
19-
rng = np.random.default_rng(seed=1)
20-
21-
X = rng.normal(0, 1, (NUM_CELLS, NUM_GENES))
22-
X = np.where(X < 0, 0, X)
19+
X = np.random.default_rng().normal(0, 1, (NUM_CELLS, NUM_GENES))
2320

2421
obs = pd.DataFrame(
2522
{
2623
"DepMap_ID": ["ACH-000016", "ACH-000049", "ACH-001208", "ACH-000956"] * NUM_CELLS_PER_ID,
2724
"perturbation": ["Midostaurin"] * NUM_CELLS_PER_ID * 4,
2825
},
29-
index=[str(i) for i in range(NUM_GENES)],
26+
index=[str(i) for i in range(NUM_CELLS)],
3027
)
3128

3229
var_data = {"gene_name": [f"gene{i}" for i in range(1, NUM_GENES + 1)]}
@@ -48,7 +45,24 @@ def test_cell_line_annotation(adata):
4845
def test_gdsc_annotation(adata):
4946
pt_metadata.annotate(adata)
5047
pt_metadata.annotate_from_gdsc(adata, query_id="StrippedCellLineName")
51-
assert "ln_ic50" in adata.obs
48+
assert "ln_ic50_gdsc" in adata.obs
49+
assert "auc_gdsc" in adata.obs
50+
51+
52+
def test_prism_annotation(adata):
53+
adata.obs = pd.DataFrame(
54+
{
55+
"DepMap_ID": ["ACH-000879", "ACH-000488", "ACH-000488", "ACH-000008"] * NUM_CELLS_PER_ID,
56+
"perturbation": ["cytarabine", "cytarabine", "secnidazole", "flutamide"] * NUM_CELLS_PER_ID,
57+
},
58+
index=[str(i) for i in range(NUM_CELLS)],
59+
)
60+
61+
pt_metadata.annotate(adata)
62+
pt_metadata.annotate_from_prism(adata, query_id="DepMap_ID")
63+
assert "ic50_prism" in adata.obs
64+
assert "ec50_prism" in adata.obs
65+
assert "auc_prism" in adata.obs
5266

5367

5468
def test_protein_expression_annotation(adata):

0 commit comments

Comments
 (0)