Skip to content

Commit 7721c51

Browse files
improved zero variance handling of ttest and rank test
1 parent 702692b commit 7721c51

File tree

3 files changed

+106
-22
lines changed

3 files changed

+106
-22
lines changed

diffxpy/testing/det.py

+23-2
Original file line numberDiff line numberDiff line change
@@ -1609,7 +1609,8 @@ def __init__(
16091609
sample_description: pd.DataFrame,
16101610
grouping,
16111611
gene_names,
1612-
is_logged
1612+
is_logged,
1613+
is_sig_zerovar: bool = True
16131614
):
16141615
super().__init__()
16151616
self._X = data
@@ -1647,6 +1648,16 @@ def __init__(
16471648
n0=x0.shape[0],
16481649
n1=x1.shape[0]
16491650
)
1651+
pval[np.where(np.logical_and(
1652+
np.logical_and(mean_x0 == mean_x1, self._mean > 0),
1653+
np.logical_not(self._var_geq_zero)
1654+
))[0]] = 1.0
1655+
if is_sig_zerovar:
1656+
pval[np.where(np.logical_and(
1657+
mean_x0 != mean_x1,
1658+
np.logical_not(self._var_geq_zero)
1659+
))[0]] = 0.0
1660+
16501661
self._pval = pval
16511662

16521663
if is_logged:
@@ -1732,7 +1743,8 @@ def __init__(
17321743
sample_description: pd.DataFrame,
17331744
grouping,
17341745
gene_names,
1735-
is_logged
1746+
is_logged,
1747+
is_sig_zerovar: bool = True
17361748
):
17371749
super().__init__()
17381750
self._X = data
@@ -1772,6 +1784,15 @@ def __init__(
17721784
x0=x0.X[:, idx_run].toarray(),
17731785
x1=x1.X[:, idx_run].toarray()
17741786
)
1787+
pval[np.where(np.logical_and(
1788+
np.logical_and(mean_x0 == mean_x1, self._mean > 0),
1789+
np.logical_not(self._var_geq_zero)
1790+
))[0]] = 1.0
1791+
if is_sig_zerovar:
1792+
pval[np.where(np.logical_and(
1793+
mean_x0 != mean_x1,
1794+
np.logical_not(self._var_geq_zero)
1795+
))[0]] = 0.0
17751796

17761797
self._pval = pval
17771798

diffxpy/testing/tests.py

+45-4
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,7 @@ def t_test(
669669
gene_names: Union[np.ndarray, list] = None,
670670
sample_description: pd.DataFrame = None,
671671
is_logged: bool = False,
672+
is_sig_zerovar: bool = True,
672673
dtype="float64"
673674
):
674675
"""
@@ -686,6 +687,9 @@ def t_test(
686687
:param is_logged:
687688
Whether data is already logged. If True, log-fold changes are computed as fold changes on this data.
688689
If False, log-fold changes are computed as log-fold changes on this data.
690+
:param is_sig_zerovar:
691+
Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
692+
the p-value is set to np.nan.
689693
"""
690694
gene_names = parse_gene_names(data, gene_names)
691695
X = parse_data(data, gene_names)
@@ -698,7 +702,8 @@ def t_test(
698702
sample_description=sample_description,
699703
grouping=grouping,
700704
gene_names=gene_names,
701-
is_logged=is_logged
705+
is_logged=is_logged,
706+
is_sig_zerovar=is_sig_zerovar
702707
)
703708

704709
return de_test
@@ -709,7 +714,8 @@ def rank_test(
709714
grouping: Union[str, np.ndarray, list],
710715
gene_names: Union[np.ndarray, list] = None,
711716
sample_description: pd.DataFrame = None,
712-
is_logged=False,
717+
is_logged: bool = False,
718+
is_sig_zerovar: bool = True,
713719
dtype="float64"
714720
):
715721
"""
@@ -727,6 +733,9 @@ def rank_test(
727733
:param is_logged:
728734
Whether data is already logged. If True, log-fold changes are computed as fold changes on this data.
729735
If False, log-fold changes are computed as log-fold changes on this data.
736+
:param is_sig_zerovar:
737+
Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
738+
the p-value is set to np.nan.
730739
"""
731740
gene_names = parse_gene_names(data, gene_names)
732741
X = parse_data(data, gene_names)
@@ -739,7 +748,8 @@ def rank_test(
739748
sample_description=sample_description,
740749
grouping=grouping,
741750
gene_names=gene_names,
742-
is_logged=is_logged
751+
is_logged=is_logged,
752+
is_sig_zerovar=is_sig_zerovar
743753
)
744754

745755
return de_test
@@ -756,6 +766,7 @@ def two_sample(
756766
size_factors: np.ndarray = None,
757767
batch_size: int = None,
758768
training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
769+
is_sig_zerovar: bool = True,
759770
quick_scale: bool = None,
760771
dtype="float64",
761772
**kwargs
@@ -824,6 +835,9 @@ def two_sample(
824835
`training_strategy(estimator)`.
825836
- list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of
826837
method arguments.
838+
:param is_sig_zerovar:
839+
Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
840+
the p-value is set to np.nan.
827841
:param quick_scale: Depending on the optimizer, `scale` will be fitted faster and maybe less accurate.
828842
829843
Useful in scenarios where fitting the exact `scale` is not absolutely necessary.
@@ -898,13 +912,15 @@ def two_sample(
898912
data=X,
899913
gene_names=gene_names,
900914
grouping=grouping,
915+
is_sig_zerovar=is_sig_zerovar,
901916
dtype=dtype
902917
)
903918
elif test.lower() == 'rank':
904919
de_test = rank_test(
905920
data=X,
906921
gene_names=gene_names,
907922
grouping=grouping,
923+
is_sig_zerovar=is_sig_zerovar,
908924
dtype=dtype
909925
)
910926
else:
@@ -925,6 +941,7 @@ def pairwise(
925941
size_factors: np.ndarray = None,
926942
batch_size: int = None,
927943
training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
944+
is_sig_zerovar: bool = True,
928945
quick_scale: bool = None,
929946
dtype="float64",
930947
pval_correction: str = "global",
@@ -1016,7 +1033,10 @@ def pairwise(
10161033
10171034
- "global": correct all p-values in one operation
10181035
- "by_test": correct the p-values of each test individually
1019-
:param keep_full_test_objs: [Debugging] keep the individual test objects; currently valid for test != "z-test"
1036+
:param keep_full_test_objs: [Debugging] keep the individual test objects; currently valid for test != "z-test".
1037+
:param is_sig_zerovar:
1038+
Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
1039+
the p-value is set to np.nan.
10201040
:param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
10211041
"""
10221042
if len(kwargs) != 0:
@@ -1096,6 +1116,7 @@ def pairwise(
10961116
batch_size=batch_size,
10971117
training_strategy=training_strategy,
10981118
quick_scale=quick_scale,
1119+
is_sig_zerovar=is_sig_zerovar,
10991120
dtype=dtype,
11001121
**kwargs
11011122
)
@@ -1131,6 +1152,7 @@ def versus_rest(
11311152
size_factors: np.ndarray = None,
11321153
batch_size: int = None,
11331154
training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
1155+
is_sig_zerovar: bool = True,
11341156
quick_scale: bool = None,
11351157
dtype="float64",
11361158
pval_correction: str = "global",
@@ -1221,6 +1243,9 @@ def versus_rest(
12211243
12221244
- "global": correct all p-values in one operation
12231245
- "by_test": correct the p-values of each test individually
1246+
:param is_sig_zerovar:
1247+
Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
1248+
the p-value is set to np.nan.
12241249
:param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
12251250
"""
12261251
if len(kwargs) != 0:
@@ -1257,6 +1282,7 @@ def versus_rest(
12571282
training_strategy=training_strategy,
12581283
quick_scale=quick_scale,
12591284
size_factors=size_factors,
1285+
is_sig_zerovar=is_sig_zerovar,
12601286
dtype=dtype,
12611287
**kwargs
12621288
)
@@ -1353,6 +1379,7 @@ def two_sample(
13531379
noise_model: str = None,
13541380
batch_size: int = None,
13551381
training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
1382+
is_sig_zerovar: bool = True,
13561383
**kwargs
13571384
) -> _DifferentialExpressionTestMulti:
13581385
"""
@@ -1388,6 +1415,9 @@ def two_sample(
13881415
`training_strategy(estimator)`.
13891416
- list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of
13901417
method arguments.
1418+
:param is_sig_zerovar:
1419+
Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
1420+
the p-value is set to np.nan.
13911421
:param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
13921422
"""
13931423
DETestsSingle = []
@@ -1403,6 +1433,7 @@ def two_sample(
14031433
size_factors=size_factors[idx] if size_factors is not None else None,
14041434
batch_size=batch_size,
14051435
training_strategy=training_strategy,
1436+
is_sig_zerovar=is_sig_zerovar,
14061437
**kwargs
14071438
))
14081439
return DifferentialExpressionTestByPartition(
@@ -1415,6 +1446,7 @@ def t_test(
14151446
self,
14161447
grouping: Union[str],
14171448
is_logged: bool,
1449+
is_sig_zerovar: bool = True,
14181450
dtype="float64"
14191451
):
14201452
"""
@@ -1428,6 +1460,9 @@ def t_test(
14281460
:param is_logged:
14291461
Whether data is already logged. If True, log-fold changes are computed as fold changes on this data.
14301462
If False, log-fold changes are computed as log-fold changes on this data.
1463+
:param is_sig_zerovar:
1464+
Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
1465+
the p-value is set to np.nan.
14311466
:param dtype:
14321467
"""
14331468
DETestsSingle = []
@@ -1438,6 +1473,7 @@ def t_test(
14381473
is_logged=is_logged,
14391474
gene_names=self.gene_names,
14401475
sample_description=self.sample_description.iloc[idx, :],
1476+
is_sig_zerovar=is_sig_zerovar,
14411477
dtype=dtype
14421478
))
14431479
return DifferentialExpressionTestByPartition(
@@ -1449,6 +1485,7 @@ def t_test(
14491485
def rank_test(
14501486
self,
14511487
grouping: Union[str],
1488+
is_sig_zerovar: bool = True,
14521489
dtype="float64"
14531490
):
14541491
"""
@@ -1460,6 +1497,9 @@ def rank_test(
14601497
14611498
- column in data.obs/sample_description which contains the split of observations into the two groups.
14621499
- array of length `num_observations` containing group labels
1500+
:param is_sig_zerovar:
1501+
Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
1502+
the p-value is set to np.nan.
14631503
:param dtype:
14641504
"""
14651505
DETestsSingle = []
@@ -1469,6 +1509,7 @@ def rank_test(
14691509
grouping=grouping,
14701510
gene_names=self.gene_names,
14711511
sample_description=self.sample_description.iloc[idx, :],
1512+
is_sig_zerovar=is_sig_zerovar,
14721513
dtype=dtype
14731514
))
14741515
return DifferentialExpressionTestByPartition(

diffxpy/unit_test/test_extreme_values.py

+38-16
Original file line numberDiff line numberDiff line change
@@ -11,24 +11,19 @@
1111

1212
class TestExtremeValues(unittest.TestCase):
1313

14-
def test_t_test_zero_variance(self, n_cells: int = 2000, n_genes: int = 100):
14+
def test_t_test_zero_variance(self):
1515
"""
16-
Test if de.t_test() generates a uniform p-value distribution
17-
if it is given data simulated based on the null model. Returns the p-value
18-
of the two-side Kolmgorov-Smirnov test for equality of the observed
19-
p-value distribution and a uniform distribution.
20-
21-
:param n_cells: Number of cells to simulate (number of observations per test).
22-
:param n_genes: Number of genes to simulate (number of tests).
16+
Test if T-test works if it is given genes with zero variance.
2317
"""
2418
logging.getLogger("tensorflow").setLevel(logging.ERROR)
2519
logging.getLogger("batchglm").setLevel(logging.WARNING)
2620
logging.getLogger("diffxpy").setLevel(logging.WARNING)
2721

28-
sim = Simulator(num_observations=n_cells, num_features=n_genes)
22+
sim = Simulator(num_observations=1000, num_features=10)
2923
sim.generate_sample_description(num_batches=0, num_conditions=0)
3024
sim.generate()
31-
sim.data.X[:, 0] = np.exp(sim.a)[0, 0]
25+
sim.data.X[:, 0] = 0
26+
sim.data.X[:, 1] = 5
3227

3328
random_sample_description = pd.DataFrame({
3429
"condition": np.random.randint(2, size=sim.num_observations)
@@ -37,17 +32,44 @@ def test_t_test_zero_variance(self, n_cells: int = 2000, n_genes: int = 100):
3732
test = de.test.t_test(
3833
data=sim.X,
3934
grouping="condition",
40-
sample_description=random_sample_description
35+
sample_description=random_sample_description,
36+
is_sig_zerovar=True
4137
)
4238

43-
# Compare p-value distribution under null model against uniform distribution.
44-
pval_h0 = stats.kstest(test.pval, 'uniform').pvalue
39+
assert np.isnan(test.pval[0]) and test.pval[1] == 1, \
40+
"rank test did not assign p-value of zero to groups with zero variance and same mean, %f, %f" % \
41+
(test.pval[0], test.pval[1])
42+
return True
43+
44+
def test_rank_test_zero_variance(self):
45+
"""
46+
Test if rank test works if it is given genes with zero variance.
47+
"""
48+
logging.getLogger("tensorflow").setLevel(logging.ERROR)
49+
logging.getLogger("batchglm").setLevel(logging.WARNING)
50+
logging.getLogger("diffxpy").setLevel(logging.WARNING)
51+
52+
sim = Simulator(num_observations=1000, num_features=10)
53+
sim.generate_sample_description(num_batches=0, num_conditions=0)
54+
sim.generate()
55+
sim.data.X[:, 0] = 0
56+
sim.data.X[:, 1] = 5
4557

46-
print('KS-test pvalue for null model match of t_test(): %f' % pval_h0)
58+
random_sample_description = pd.DataFrame({
59+
"condition": np.random.randint(2, size=sim.num_observations)
60+
})
4761

48-
assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"
62+
test = de.test.rank_test(
63+
data=sim.X,
64+
grouping="condition",
65+
sample_description=random_sample_description,
66+
is_sig_zerovar=True
67+
)
4968

50-
return pval_h0
69+
assert np.isnan(test.pval[0]) and test.pval[1] == 1, \
70+
"rank test did not assign p-value of zero to groups with zero variance and same mean, %f, %f" % \
71+
(test.pval[0], test.pval[1])
72+
return True
5173

5274

5375
if __name__ == '__main__':

0 commit comments

Comments
 (0)