From da4169a10e23cbd0782537466a6dd2190ba05196 Mon Sep 17 00:00:00 2001 From: JiriBruthans <136194089+JiriBruthans@users.noreply.github.com> Date: Wed, 26 Feb 2025 09:57:29 +0100 Subject: [PATCH] improve CellLine metadata module UX (#717) * improve CellLine metadata module UX * Update notebooks submodule to latest commit --- pertpy/metadata/_cell_line.py | 53 ++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/pertpy/metadata/_cell_line.py b/pertpy/metadata/_cell_line.py index 4bf621c0..09fbf124 100644 --- a/pertpy/metadata/_cell_line.py +++ b/pertpy/metadata/_cell_line.py @@ -55,6 +55,7 @@ def _download_cell_line(self, cell_line_source: Literal["DepMap", "Cancerrxgene" is_zip=False, ) self.depmap = pd.read_csv(depmap_cell_line_path) + self.depmap = self.depmap.reset_index().rename(columns={"CellLineName": "cell_line_name"}) else: # Download cell line metadata from The Genomics of Drug Sensitivity in Cancer Project # Source: https://www.cancerrxgene.org/celllines @@ -234,7 +235,7 @@ def annotate( >>> adata_annotated = pt_metadata.annotate(adata=adata, >>> reference_id='cell_line_name', >>> query_id='cell_line_name', - >>> fetch=["cell_line_name", "age", "primary_disease"], + >>> fetch=["cell_line_name", "Age", "OncotreePrimaryDisease"], >>> copy=True) """ if copy: @@ -322,7 +323,7 @@ def annotate( def annotate_bulk_rna( self, adata: AnnData, - query_id: str = "cell_line_name", + query_id: str = None, cell_line_source: Literal["broad", "sanger"] = "sanger", verbosity: int | str = 5, gene_identifier: Literal["gene_name", "gene_ID", "both"] = "gene_ID", @@ -338,6 +339,7 @@ def annotate_bulk_rna( Defaults to "cell_line_name" if `cell_line_source` is sanger, otherwise "DepMap_ID". cell_line_source: The bulk rna expression data from either broad or sanger cell line. verbosity: The number of unmatched identifiers to print, can be either non-negative values or "all". + gene_identifier: The type of gene identifier saved in the fetched meta data, 'gene_name', 'gene_ID' or 'both'. copy: Determines whether a copy of the `adata` is returned. Returns: @@ -359,32 +361,44 @@ def annotate_bulk_rna( # Make sure that the specified `cell_line_type` can be found in the bulk rna expression data, # then we can compare these keys and fetch the corresponding metadata. if query_id not in adata.obs.columns: - raise ValueError( - f"The specified `query_id` {query_id} can't be found in the `adata.obs`. \n" - "Ensure that you are using one of the available query IDs present in the adata.obs for the annotation." - "If the desired query ID is not available, you can fetch the cell line metadata " - "using the `annotate()` function before calling 'annotate_bulk_rna()'. " - "This ensures that the required query ID is included in your data, e.g. stripped_cell_line_name, DepMap ID." - ) - + if query_id is not None: + raise ValueError( + f"The specified `query_id` {query_id} can't be found in the `adata.obs`. \n" + "Ensure that you are using one of the available query IDs present in the adata.obs for the annotation." + "If the desired query ID is not available, you can fetch the cell line metadata " + "using the `annotate()` function before calling 'annotate_bulk_rna()'. " + "This ensures that the required query ID is included in your data, e.g. stripped_cell_line_name, DepMap ID." + ) + if query_id is None: + if cell_line_source == "sanger": + query_id = "cell_line_name" + else: + query_id = "DepMap_ID" identifier_num_all = len(adata.obs[query_id].unique()) # Lazily download the bulk rna expression data if cell_line_source == "sanger": + if query_id not in adata.obs.columns: + raise ValueError( + "To annotate bulk RNA data from Wellcome Sanger Institute, `cell_line_name` is used as default reference and query identifier if no `query_id` is given." + "Ensure that you have column `cell_line_name` in `adata.obs` or specify column name in which cell line name is stored." + "If cell line name isn't available in 'adata.obs', use `annotate()` to annotate the cell line first." + ) if self.bulk_rna_sanger is None: self._download_bulk_rna(cell_line_source="sanger") reference_id = "model_name" not_matched_identifiers = list(set(adata.obs[query_id]) - set(self.bulk_rna_sanger.index)) else: + if query_id not in adata.obs.columns: + raise ValueError( + "To annotate bulk RNA data from Broad Institue, `DepMap_ID` is used as default reference and query identifier if no `query_id` is given." + "Ensure that you have column `DepMap_ID` in `adata.obs` or specify column name in which DepMap ID is stored." + "If DepMap ID isn't available in 'adata.obs', use `annotate()` to annotate the cell line first." + ) reference_id = "DepMap_ID" - logger.warning( - "To annotate bulk RNA data from Broad Institue, `DepMap_ID` is used as default reference and query identifier if no `reference_id` is given." - "If `DepMap_ID` isn't available in 'adata.obs', use `annotate()` to annotate the cell line first." - ) + if self.bulk_rna_broad is None: self._download_bulk_rna(cell_line_source="broad") - if query_id == "cell_line_name": - query_id = "DepMap_ID" not_matched_identifiers = list(set(adata.obs[query_id]) - set(self.bulk_rna_broad.index)) self._warn_unmatch( @@ -493,7 +507,8 @@ def annotate_protein_expression( adata.obsm["proteomics_" + protein_information] = ( self.proteomics[[reference_id, protein_id, protein_information]] .pivot(index=reference_id, columns=protein_id, values=protein_information) - .reindex(adata.obs.index) + .reindex(adata.obs[query_id]) + .set_index(adata.obs.index) ) return adata @@ -759,9 +774,9 @@ def correlate( "Dimensions of adata.X do not match those of metadata. Ensure that they have the same gene list." ) if isinstance(adata.obsm[metadata_key], pd.DataFrame): - # Give warning if the genes are not the same + # Raise error if the genes are not the same if sum(adata.obsm[metadata_key].columns != adata.var.index.values) > 0: - logger.warning( + raise ValueError( "Column name of metadata is not the same as the index of adata.var. Ensure that the genes are in the same order." )