|
7 | 7 | import numpy.typing as npt
|
8 | 8 | import pandas as pd
|
9 | 9 | from scipy.stats import kendalltau, pearsonr, spearmanr
|
| 10 | +from statsmodels.stats.multitest import fdrcorrection |
10 | 11 |
|
11 | 12 | if TYPE_CHECKING:
|
12 | 13 | from anndata import AnnData
|
@@ -216,6 +217,78 @@ def calculate_cohens_d(self, de_res_1: pd.DataFrame, de_res_2: pd.DataFrame) ->
|
216 | 217 |
|
217 | 218 | return cohens_d
|
218 | 219 |
|
def de_res_to_anndata(
    self,
    adata: AnnData,
    de_res: pd.DataFrame,
    *,
    groupby: str,
    gene_id_col: str = "gene_symbols",
    score_col: str = "scores",
    pval_col: str = "pvals",
    pval_adj_col: str | None = "pvals_adj",
    lfc_col: str = "logfoldchanges",
    key_added: str = "rank_genes_groups",
) -> None:
    """Add a tabular differential expression result to AnnData as if it was produced by `scanpy.tl.rank_genes_groups`.

    The table is split by `groupby`, every group is ordered by descending score, and the
    per-group columns are stored as record arrays (one field per group) in
    `adata.uns[key_added]`, together with a `params` entry.

    Args:
        adata:
            Annotated data matrix. Modified in place: `adata.uns[key_added]` is overwritten.
        de_res:
            Tabular DE result in long format (one row per gene and group).
        groupby:
            Column in `de_res` that indicates the group. This column must also exist in `adata.obs`.
        gene_id_col:
            Column in `de_res` that holds the gene identifiers.
        score_col:
            Column in `de_res` that holds the score (results will be ordered by score, descending).
        pval_col:
            Column in `de_res` that holds the unadjusted p-value.
        pval_adj_col:
            Column in `de_res` that holds the adjusted p-value.
            If `None`, the unadjusted p-values are FDR-adjusted separately within each group.
        lfc_col:
            Column in `de_res` that holds the log fold change.
        key_added:
            Key under which the results will be stored in `adata.uns`.

    Raises:
        ValueError: If `groupby` is missing from `adata.obs` or `de_res`, if `de_res` contains
            no rows, or if the groups do not all contain the same number of genes.
    """
    if groupby not in adata.obs.columns or groupby not in de_res.columns:
        raise ValueError("groupby column must exist in both adata and de_res.")

    result_keys = ("names", "scores", "pvals", "pvals_adj", "logfoldchanges")
    per_group: dict[str, list] = {key: [] for key in result_keys}
    # Group labels are recorded in the same loop that collects the data so the
    # record-array field names are guaranteed to line up with their columns.
    group_names: list = []

    for group_name, group_df in de_res.groupby(groupby):
        # A categorical group column may yield empty frames for unused categories;
        # they carry no results and would break the rectangular stacking below.
        if group_df.empty:
            continue
        group_df = group_df.sort_values(score_col, ascending=False)
        pvals = group_df[pval_col].values
        if pval_adj_col is not None:
            pvals_adj = group_df[pval_adj_col].values
        else:
            # fdrcorrection returns (rejected, corrected p-values); keep the latter.
            pvals_adj = fdrcorrection(pvals)[1]

        group_names.append(group_name)
        per_group["names"].append(group_df[gene_id_col].values)
        per_group["scores"].append(group_df[score_col].values)
        per_group["pvals"].append(pvals)
        per_group["pvals_adj"].append(pvals_adj)
        per_group["logfoldchanges"].append(group_df[lfc_col].values)

    if not group_names:
        raise ValueError("de_res does not contain any rows to convert.")

    # Each group becomes one field of a record array, so all groups need the same length.
    group_sizes = {name: len(genes) for name, genes in zip(group_names, per_group["names"])}
    if len(set(group_sizes.values())) > 1:
        raise ValueError(
            f"All groups in de_res must contain the same number of genes, but found differing sizes: {group_sizes}"
        )

    res_dict: dict = {
        "params": {
            "groupby": groupby,
            "reference": "rest",
            "method": "other",
            "use_raw": True,
            "layer": None,
            "corr_method": "other",
        },
    }
    for key in result_keys:
        # Stack to (n_groups, n_genes), transpose to (n_genes, n_groups), then
        # convert to a record array with one object-typed field per group.
        res_dict[key] = pd.DataFrame(
            np.vstack(per_group[key]).T,
            columns=group_names,
        ).to_records(index=False, column_dtypes="O")
    adata.uns[key_added] = res_dict
| 291 | + |
219 | 292 | def de_analysis(
|
220 | 293 | self,
|
221 | 294 | adata: AnnData,
|
|
0 commit comments