@@ -57,15 +57,14 @@ def _get_pseudobulks(
57
57
58
58
Copied from `https://github.com/schillerlab/sc-toolbox/blob/397e80dc5e8fb8017b75f6c3fa634a1e1213d484/sc_toolbox/tools/__init__.py#L458`
59
59
60
- # TODO: Replace with decoupler's implementation
61
-
62
60
Args:
63
61
groupby: The key to groupby for pseudobulks
64
62
strategy: The pseudobulking strategy. One of "median" or "mean"
65
63
66
64
Returns:
67
65
A Pandas DataFrame of pseudobulk counts
68
66
"""
67
+ # TODO: Replace with decoupler's implementation
69
68
pseudobulk = {"Genes" : adata .var_names .values }
70
69
71
70
for category in adata .obs .loc [:, groupby ].cat .categories :
@@ -105,18 +104,16 @@ def _pseudobulk_pca(self, adata: AnnData, groupby: str, n_components: int = 50)
105
104
def _scale_data (self , pseudobulks : pd .DataFrame , normalize : bool = True ) -> np .ndarray :
106
105
"""Row-wise mean center and scale by the standard deviation.
107
106
108
- TODO: the `scale` function we implemented to match the R `scale` fn should already contain this functionality.
109
-
110
107
Args:
111
108
pseudobulks: The pseudobulk PCA components.
112
109
normalize: Whether to mimic DIALOGUE behavior or not.
113
110
114
111
Returns:
115
112
The scaled count matrix.
116
113
"""
114
+ # TODO: the `scale` function we implemented to match the R `scale` fn should already contain this functionality.
117
115
# DIALOGUE doesn't scale the data before passing to multicca, unlike what is recommended by sparsecca.
118
116
# However, performing this scaling _does_ increase overall correlation of the end result
119
- # WHEN SAMPLE ORDER AND DIALOGUE2+3 PROCESSING IS IGNORED.
120
117
if normalize :
121
118
return pseudobulks .to_numpy ()
122
119
else :
@@ -371,7 +368,7 @@ def _get_residuals(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
371
368
return np .array (resid )
372
369
373
370
def _iterative_nnls (self , A_orig : np .ndarray , y_orig : np .ndarray , feature_ranks : list [int ], n_iter : int = 1000 ):
374
- """Solves non-negative least squares separately for different feature categories.
371
+ """Solves non-negative least squares separately for different feature categories.
375
372
376
373
Mimics DLG.iterative.nnls.
377
374
Variables are notated according to:
@@ -628,9 +625,8 @@ def calculate_multifactor_PMD(
628
625
>>> dl = pt.tl.Dialogue(sample_id = "clinical.status", celltype_key = "cell.subtypes", n_counts_key = "nCount_RNA", n_mpcs = 3)
629
626
>>> adata, mcps, ws, ct_subs = dl.calculate_multifactor_PMD(adata, normalize=True)
630
627
"""
631
- # IMPORTANT NOTE: the order in which matrices are passed to multicca matters. As such,
632
- # it is important here that to obtain the same result as in R, we pass the matrices in
633
- # in the same order.
628
+ # IMPORTANT NOTE: the order in which matrices are passed to multicca matters.
629
+ # As such, to obtain the same result as in R, the matrices must be passed in the same order.
634
630
if ct_order is not None :
635
631
cell_types = ct_order
636
632
else :
@@ -798,7 +794,7 @@ def multilevel_modeling(
798
794
for mcp in mcps :
799
795
mixed_model_progress .update (mm_task , description = f"[bold blue]Determining mixed effects for { mcp } " )
800
796
801
- # TODO Check that the genes in result{sig_genes_1] are different and if so note that somewhere and explain why
797
+ # TODO: Check whether the genes in result["sig_genes_1"] are different and, if so, note that somewhere and explain why.
802
798
result = {}
803
799
result ["HLM_result_1" ], result ["sig_genes_1" ] = self ._apply_HLM_per_MCP_for_one_pair (
804
800
mcp_name = mcp ,
@@ -868,22 +864,19 @@ def test_association(
868
864
sample_label = self .sample_id
869
865
n_mcps = self .n_mcps
870
866
871
- # create conditions_compare if not supplied
872
867
if conditions_compare is None :
873
868
conditions_compare = list (adata .obs ["path_str" ].cat .categories ) # type: ignore
874
869
if len (conditions_compare ) != 2 :
875
870
raise ValueError ("Please specify conditions to compare or supply an object with only 2 conditions" )
876
871
877
- # create data frames to store results
878
872
pvals = pd .DataFrame (1 , adata .obs [celltype_label ].unique (), ["mcp_" + str (n ) for n in range (0 , n_mcps )])
879
873
tstats = pd .DataFrame (1 , adata .obs [celltype_label ].unique (), ["mcp_" + str (n ) for n in range (0 , n_mcps )])
880
874
pvals_adj = pd .DataFrame (1 , adata .obs [celltype_label ].unique (), ["mcp_" + str (n ) for n in range (0 , n_mcps )])
881
875
882
876
response = adata .obs .groupby (sample_label )[condition_label ].agg (pd .Series .mode )
883
877
for celltype in adata .obs [celltype_label ].unique ():
884
- # subset data to cell type
885
878
df = adata .obs [adata .obs [celltype_label ] == celltype ]
886
- # run t-test for each MCP
879
+
887
880
for mcpnum in ["mcp_" + str (n ) for n in range (0 , n_mcps )]:
888
881
mns = df .groupby (sample_label )[mcpnum ].mean ()
889
882
mns = pd .concat ([mns , response ], axis = 1 )
@@ -893,11 +886,10 @@ def test_association(
893
886
)
894
887
pvals .loc [celltype , mcpnum ] = res [1 ]
895
888
tstats .loc [celltype , mcpnum ] = res [0 ]
896
- # return(res)
897
889
898
- # benjamini-hochberg correction for number of cell types (use BH because correlated MCPs)
899
890
for mcpnum in ["mcp_" + str (n ) for n in range (0 , n_mcps )]:
900
891
pvals_adj [mcpnum ] = multipletests (pvals [mcpnum ], method = "fdr_bh" )[1 ]
892
+
901
893
return {"pvals" : pvals , "tstats" : tstats , "pvals_adj" : pvals_adj }
902
894
903
895
def get_mlm_mcp_genes (
@@ -914,7 +906,7 @@ def get_mlm_mcp_genes(
914
906
celltype: Cell type of interest.
915
907
results: dl.MultilevelModeling result object.
916
908
MCP: MCP key of the result object.
917
- threshhold : Number between [0,1]. The fraction of cell types compared against which must have the associated MCP gene.
909
+ threshold: Number in [0, 1]. The fraction of compared cell types that must have the associated MCP gene.
918
910
Defaults to 0.70.
919
911
focal_celltypes: None (compare against all cell types) or a list of other cell types which you want to compare against.
920
912
Defaults to None.
@@ -938,7 +930,6 @@ def get_mlm_mcp_genes(
938
930
# REMOVE THIS BLOCK ONCE MLM OUTPUT MATCHES STANDARD
939
931
if MCP .startswith ("mcp_" ):
940
932
MCP = MCP .replace ("mcp_" , "MCP" )
941
- # convert from MCPx to MCPx+1
942
933
MCP = "MCP" + str (int (MCP [3 :]) - 1 )
943
934
944
935
# Extract all comparison keys from the results object
@@ -1007,17 +998,16 @@ def _get_extrema_MCP_genes_single(self, ct_subs: dict, mcp: str = "mcp_0", fract
1007
998
objects containing the results of gene ranking analysis.
1008
999
1009
1000
Examples:
1010
- ct_subs = {
1011
- "subpop1": anndata_obj1,
1012
- "subpop2": anndata_obj2,
1013
- # ... more subpopulations ...
1014
- }
1015
- genes_results = _get_extrema_MCP_genes_single(ct_subs, mcp="mcp_4", fraction=0.2)
1001
+ >>> ct_subs = {
1002
+ >>> "subpop1": anndata_obj1,
1003
+ >>> "subpop2": anndata_obj2,
1004
+ >>> # ... more subpopulations ...
1005
+ >>> }
1006
+ >>> genes_results = _get_extrema_MCP_genes_single(ct_subs, mcp="mcp_4", fraction=0.2)
1016
1007
"""
1017
1008
genes = {}
1018
1009
for ct in ct_subs .keys ():
1019
1010
mini = ct_subs [ct ]
1020
- mini .obs [mcp ]
1021
1011
mini .obs ["extrema" ] = pd .qcut (
1022
1012
mini .obs [mcp ],
1023
1013
[0 , 0 + fraction , 1 - fraction , 1.0 ],
@@ -1027,6 +1017,7 @@ def _get_extrema_MCP_genes_single(self, ct_subs: dict, mcp: str = "mcp_0", fract
1027
1017
mini , "extrema" , groups = ["high" + mcp + " " + ct ], reference = "low " + mcp + " " + ct
1028
1018
)
1029
1019
genes [ct ] = mini # .uns['rank_genes_groups']
1020
+
1030
1021
return genes
1031
1022
1032
1023
def get_extrema_MCP_genes (self , ct_subs : dict , fraction : float = 0.1 ):
@@ -1064,11 +1055,12 @@ def get_extrema_MCP_genes(self, ct_subs: dict, fraction: float = 0.1):
1064
1055
rank_dfs [mcp ] = {}
1065
1056
ct_ranked = self ._get_extrema_MCP_genes_single (ct_subs , mcp = mcp , fraction = fraction )
1066
1057
for celltype in ct_ranked .keys ():
1067
- rank_dfs [mcp ][celltype ] = sc .get .rank_genes_groups_df (ct_ranked [celltype ], group = None )
1058
+ rank_dfs [mcp ][celltype ] = sc .get .rank_genes_groups_df (ct_ranked [celltype ])
1068
1059
1069
1060
return rank_dfs
1070
1061
1071
1062
def plot_split_violins (
1063
+ self ,
1072
1064
adata : AnnData ,
1073
1065
split_key : str ,
1074
1066
celltype_key = str ,
@@ -1111,18 +1103,20 @@ def plot_split_violins(
1111
1103
1112
1104
return ax
1113
1105
1114
- def plot_pairplot (adata : AnnData , celltype_key : str , color : str , sample_id : str , mcp : str = "mcp_0" ) -> PairGrid :
1106
+ def plot_pairplot (
1107
+ self , adata : AnnData , celltype_key : str , color : str , sample_id : str , mcp : str = "mcp_0"
1108
+ ) -> PairGrid :
1115
1109
"""Generate a pairplot visualization for multi-cell perturbation (MCP) data.
1116
1110
1117
1111
Computes the mean of a specified MCP feature (mcp) for each combination of sample and cell type,
1118
1112
then creates a pairplot to visualize the relationships between these mean MCP values.
1119
1113
1120
1114
Args:
1121
1115
adata: Annotated data object.
1122
- celltype_key: Key in adata.obs containing cell type annotations.
1123
- color: Key in adata.obs for color annotations. This parameter is used as the hue
1124
- sample_id: Key in adata.obs for the sample annotations.
1125
- mcp: Key in adata.obs for MCP feature values. Defaults to "mcp_0".
1116
+ celltype_key: Key in `adata.obs` containing cell type annotations.
1117
+ color: Key in `adata.obs` for color annotations. This parameter is used as the hue.
1118
+ sample_id: Key in `adata.obs` for the sample annotations.
1119
+ mcp: Key in `adata.obs` for MCP feature values. Defaults to `"mcp_0"`.
1126
1120
1127
1121
Returns:
1128
1122
Seaborn Pairgrid object.
0 commit comments