Improve zero-variance handling of t-test and rank test

davidsebfischer · davidsebfischer · commit 7721c511664a · 2019-08-14T13:28:11.000+02:00
diff --git a/diffxpy/testing/det.py b/diffxpy/testing/det.py
@@ -1609,7 +1609,8 @@ def __init__(
             sample_description: pd.DataFrame,
             grouping,
             gene_names,
-            is_logged
+            is_logged,
+            is_sig_zerovar: bool = True
     ):
         super().__init__()
         self._X = data
@@ -1647,6 +1648,16 @@ def __init__(
             n0=x0.shape[0],
             n1=x1.shape[0]
         )
+        pval[np.where(np.logical_and(
+            np.logical_and(mean_x0 == mean_x1, self._mean > 0),
+            np.logical_not(self._var_geq_zero)
+        ))[0]] = 1.0
+        if is_sig_zerovar:
+            pval[np.where(np.logical_and(
+                mean_x0 != mean_x1,
+                np.logical_not(self._var_geq_zero)
+            ))[0]] = 0.0
+
         self._pval = pval
 
         if is_logged:
@@ -1732,7 +1743,8 @@ def __init__(
             sample_description: pd.DataFrame,
             grouping,
             gene_names,
-            is_logged
+            is_logged,
+            is_sig_zerovar: bool = True
     ):
         super().__init__()
         self._X = data
@@ -1772,6 +1784,15 @@ def __init__(
                 x0=x0.X[:, idx_run].toarray(),
                 x1=x1.X[:, idx_run].toarray()
             )
+        pval[np.where(np.logical_and(
+            np.logical_and(mean_x0 == mean_x1, self._mean > 0),
+            np.logical_not(self._var_geq_zero)
+        ))[0]] = 1.0
+        if is_sig_zerovar:
+            pval[np.where(np.logical_and(
+                mean_x0 != mean_x1,
+                np.logical_not(self._var_geq_zero)
+            ))[0]] = 0.0
 
         self._pval = pval
 
diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py
@@ -669,6 +669,7 @@ def t_test(
         gene_names: Union[np.ndarray, list] = None,
         sample_description: pd.DataFrame = None,
         is_logged: bool = False,
+        is_sig_zerovar: bool = True,
         dtype="float64"
 ):
     """
@@ -686,6 +687,9 @@ def t_test(
     :param is_logged:
         Whether data is already logged. If True, log-fold changes are computed as fold changes on this data.
         If False, log-fold changes are computed as log-fold changes on this data.
+    :param is_sig_zerovar:
+        Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
+        the p-value is set to np.nan.
     """
     gene_names = parse_gene_names(data, gene_names)
     X = parse_data(data, gene_names)
@@ -698,7 +702,8 @@ def t_test(
         sample_description=sample_description,
         grouping=grouping,
         gene_names=gene_names,
-        is_logged=is_logged
+        is_logged=is_logged,
+        is_sig_zerovar=is_sig_zerovar
     )
 
     return de_test
@@ -709,7 +714,8 @@ def rank_test(
         grouping: Union[str, np.ndarray, list],
         gene_names: Union[np.ndarray, list] = None,
         sample_description: pd.DataFrame = None,
-        is_logged=False,
+        is_logged: bool = False,
+        is_sig_zerovar: bool = True,
         dtype="float64"
 ):
     """
@@ -727,6 +733,9 @@ def rank_test(
     :param is_logged:
         Whether data is already logged. If True, log-fold changes are computed as fold changes on this data.
         If False, log-fold changes are computed as log-fold changes on this data.
+    :param is_sig_zerovar:
+        Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
+        the p-value is set to np.nan.
     """
     gene_names = parse_gene_names(data, gene_names)
     X = parse_data(data, gene_names)
@@ -739,7 +748,8 @@ def rank_test(
         sample_description=sample_description,
         grouping=grouping,
         gene_names=gene_names,
-        is_logged=is_logged
+        is_logged=is_logged,
+        is_sig_zerovar=is_sig_zerovar
     )
 
     return de_test
@@ -756,6 +766,7 @@ def two_sample(
         size_factors: np.ndarray = None,
         batch_size: int = None,
         training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
+        is_sig_zerovar: bool = True,
         quick_scale: bool = None,
         dtype="float64",
         **kwargs
@@ -824,6 +835,9 @@ def two_sample(
           `training_strategy(estimator)`.
         - list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of
           method arguments.
+    :param is_sig_zerovar:
+        Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
+        the p-value is set to np.nan.
     :param quick_scale: Depending on the optimizer, `scale` will be fitted faster and maybe less accurate.
 
         Useful in scenarios where fitting the exact `scale` is not absolutely necessary.
@@ -898,13 +912,15 @@ def two_sample(
             data=X,
             gene_names=gene_names,
             grouping=grouping,
+            is_sig_zerovar=is_sig_zerovar,
             dtype=dtype
         )
     elif test.lower() == 'rank':
         de_test = rank_test(
             data=X,
             gene_names=gene_names,
             grouping=grouping,
+            is_sig_zerovar=is_sig_zerovar,
             dtype=dtype
         )
     else:
@@ -925,6 +941,7 @@ def pairwise(
         size_factors: np.ndarray = None,
         batch_size: int = None,
         training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
+        is_sig_zerovar: bool = True,
         quick_scale: bool = None,
         dtype="float64",
         pval_correction: str = "global",
@@ -1016,7 +1033,10 @@ def pairwise(
 
         - "global": correct all p-values in one operation
         - "by_test": correct the p-values of each test individually
-    :param keep_full_test_objs: [Debugging] keep the individual test objects; currently valid for test != "z-test"
+    :param keep_full_test_objs: [Debugging] keep the individual test objects; currently valid for test != "z-test".
+    :param is_sig_zerovar:
+        Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
+        the p-value is set to np.nan.
     :param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
     """
     if len(kwargs) != 0:
@@ -1096,6 +1116,7 @@ def pairwise(
                     batch_size=batch_size,
                     training_strategy=training_strategy,
                     quick_scale=quick_scale,
+                    is_sig_zerovar=is_sig_zerovar,
                     dtype=dtype,
                     **kwargs
                 )
@@ -1131,6 +1152,7 @@ def versus_rest(
         size_factors: np.ndarray = None,
         batch_size: int = None,
         training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
+        is_sig_zerovar: bool = True,
         quick_scale: bool = None,
         dtype="float64",
         pval_correction: str = "global",
@@ -1221,6 +1243,9 @@ def versus_rest(
 
         - "global": correct all p-values in one operation
         - "by_test": correct the p-values of each test individually
+    :param is_sig_zerovar:
+        Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
+        the p-value is set to np.nan.
     :param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
     """
     if len(kwargs) != 0:
@@ -1257,6 +1282,7 @@ def versus_rest(
             training_strategy=training_strategy,
             quick_scale=quick_scale,
             size_factors=size_factors,
+            is_sig_zerovar=is_sig_zerovar,
             dtype=dtype,
             **kwargs
         )
@@ -1353,6 +1379,7 @@ def two_sample(
             noise_model: str = None,
             batch_size: int = None,
             training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO",
+            is_sig_zerovar: bool = True,
             **kwargs
     ) -> _DifferentialExpressionTestMulti:
         """
@@ -1388,6 +1415,9 @@ def two_sample(
               `training_strategy(estimator)`.
             - list of keyword dicts containing method arguments: Will call Estimator.train() once with each dict of
               method arguments.
+        :param is_sig_zerovar:
+            Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
+            the p-value is set to np.nan.
         :param kwargs: [Debugging] Additional arguments will be passed to the _fit method.
         """
         DETestsSingle = []
@@ -1403,6 +1433,7 @@ def two_sample(
                 size_factors=size_factors[idx] if size_factors is not None else None,
                 batch_size=batch_size,
                 training_strategy=training_strategy,
+                is_sig_zerovar=is_sig_zerovar,
                 **kwargs
             ))
         return DifferentialExpressionTestByPartition(
@@ -1415,6 +1446,7 @@ def t_test(
             self,
             grouping: Union[str],
             is_logged: bool,
+            is_sig_zerovar: bool = True,
             dtype="float64"
     ):
         """
@@ -1428,6 +1460,9 @@ def t_test(
         :param is_logged:
             Whether data is already logged. If True, log-fold changes are computed as fold changes on this data.
             If False, log-fold changes are computed as log-fold changes on this data.
+        :param is_sig_zerovar:
+            Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
+            the p-value is set to np.nan.
         :param dtype:
         """
         DETestsSingle = []
@@ -1438,6 +1473,7 @@ def t_test(
                 is_logged=is_logged,
                 gene_names=self.gene_names,
                 sample_description=self.sample_description.iloc[idx, :],
+                is_sig_zerovar=is_sig_zerovar,
                 dtype=dtype
             ))
         return DifferentialExpressionTestByPartition(
@@ -1449,6 +1485,7 @@ def t_test(
     def rank_test(
             self,
             grouping: Union[str],
+            is_sig_zerovar: bool = True,
             dtype="float64"
     ):
         """
@@ -1460,6 +1497,9 @@ def rank_test(
 
             - column in data.obs/sample_description which contains the split of observations into the two groups.
             - array of length `num_observations` containing group labels
+        :param is_sig_zerovar:
+            Whether to assign p-value of 0 to a gene which has zero variance in both groups but not the same mean. If False,
+            the p-value is set to np.nan.
         :param dtype:
         """
         DETestsSingle = []
@@ -1469,6 +1509,7 @@ def rank_test(
                 grouping=grouping,
                 gene_names=self.gene_names,
                 sample_description=self.sample_description.iloc[idx, :],
+                is_sig_zerovar=is_sig_zerovar,
                 dtype=dtype
             ))
         return DifferentialExpressionTestByPartition(
diff --git a/diffxpy/unit_test/test_extreme_values.py b/diffxpy/unit_test/test_extreme_values.py
@@ -11,24 +11,19 @@
 
 class TestExtremeValues(unittest.TestCase):
 
-    def test_t_test_zero_variance(self, n_cells: int = 2000, n_genes: int = 100):
+    def test_t_test_zero_variance(self):
         """
-        Test if de.t_test() generates a uniform p-value distribution
-        if it is given data simulated based on the null model. Returns the p-value
-        of the two-side Kolmgorov-Smirnov test for equality of the observed
-        p-value distribution and a uniform distribution.
-
-        :param n_cells: Number of cells to simulate (number of observations per test).
-        :param n_genes: Number of genes to simulate (number of tests).
+        Test if T-test works if it is given genes with zero variance.
         """
         logging.getLogger("tensorflow").setLevel(logging.ERROR)
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
 
-        sim = Simulator(num_observations=n_cells, num_features=n_genes)
+        sim = Simulator(num_observations=1000, num_features=10)
         sim.generate_sample_description(num_batches=0, num_conditions=0)
         sim.generate()
-        sim.data.X[:, 0] = np.exp(sim.a)[0, 0]
+        sim.data.X[:, 0] = 0
+        sim.data.X[:, 1] = 5
 
         random_sample_description = pd.DataFrame({
             "condition": np.random.randint(2, size=sim.num_observations)
@@ -37,17 +32,44 @@ def test_t_test_zero_variance(self, n_cells: int = 2000, n_genes: int = 100):
         test = de.test.t_test(
             data=sim.X,
             grouping="condition",
-            sample_description=random_sample_description
+            sample_description=random_sample_description,
+            is_sig_zerovar=True
         )
 
-        # Compare p-value distribution under null model against uniform distribution.
-        pval_h0 = stats.kstest(test.pval, 'uniform').pvalue
+        assert np.isnan(test.pval[0]) and test.pval[1] == 1, \
+            "rank test did not assign p-value of zero to groups with zero variance and same mean, %f, %f" % \
+            (test.pval[0], test.pval[1])
+        return True
+
+    def test_rank_test_zero_variance(self):
+        """
+        Test that the rank test yields NaN for an all-zero gene and a p-value of 1 for a constant non-zero gene.
+        """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        sim = Simulator(num_observations=1000, num_features=10)
+        sim.generate_sample_description(num_batches=0, num_conditions=0)
+        sim.generate()
+        sim.data.X[:, 0] = 0
+        sim.data.X[:, 1] = 5
 
-        print('KS-test pvalue for null model match of t_test(): %f' % pval_h0)
+        random_sample_description = pd.DataFrame({
+            "condition": np.random.randint(2, size=sim.num_observations)
+        })
 
-        assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"
+        test = de.test.rank_test(
+            data=sim.X,
+            grouping="condition",
+            sample_description=random_sample_description,
+            is_sig_zerovar=True
+        )
 
-        return pval_h0
+        assert np.isnan(test.pval[0]) and test.pval[1] == 1, \
+            "rank test did not yield p-values (nan, 1) for zero-variance genes with equal group means, %f, %f" % \
+            (test.pval[0], test.pval[1])
+        return True
 
 
 if __name__ == '__main__':