# coding=utf-8
# Copyright 2024 The Perch Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| 16 | +"""Call density estimation using a bernoulli kernel.""" |

from flax import nnx
import jax
from jax import numpy as jnp


def log_sum_exp(xs: jnp.ndarray, axis=None):
  """Computes log(sum(exp(xs))) along `axis` in a numerically stable way."""
  # Log-of-sum-of-exponentials admits a nice, more-stable form using the max
  # of the sequence.
  # https://mc-stan.org/docs/2_27/stan-users-guide/log-sum-of-exponentials.html
  max_x = jnp.max(xs, axis=axis, keepdims=True)
  max_sq = jnp.max(xs, axis=axis)
  sums_x = jnp.log(jnp.sum(jnp.exp(xs - max_x), axis=axis))
  return sums_x + max_sq
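

# A quick numerical illustration (an added note, not in the original file):
# log_sum_exp matches jax.scipy.special.logsumexp. For
# xs = jnp.array([[1.0, 2.0, 3.0]]), both log_sum_exp(xs, axis=1) and
# jax.scipy.special.logsumexp(xs, axis=1) give ~3.4076, since
# log(e^1 + e^2 + e^3) = 3 + log(e^-2 + e^-1 + 1).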


sq_norm = lambda x: jnp.sum(x * x, axis=1)
dots_ab = lambda x, y: jnp.dot(x, y.T)
dists_ab = lambda x, y: (
    -2 * dots_ab(x, y) + sq_norm(x)[:, jnp.newaxis] + sq_norm(y)[jnp.newaxis, :]
)

# Scaled distances.
dists_ab_s = lambda a, b, s: (
    dists_ab(a * s[jnp.newaxis, :], b * s[jnp.newaxis, :])
)
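
# For reference (an added note, not in the original file): dists_ab(x, y)[i, j]
# expands ||x_i - y_j||^2 = ||x_i||^2 - 2 x_i.y_j + ||y_j||^2, so dists_ab is
# the pairwise squared Euclidean distance matrix, and dists_ab_s applies a
# per-feature scale s before computing those distances.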


def scaled_rbf_kernel(
    x: jnp.ndarray, y: jnp.ndarray, scale: jnp.ndarray, bias: float
):
  """Per-feature scaled squared distance between x and y, plus a bias term."""
  return dists_ab_s(x, y, scale) + bias


class BernoulliData(nnx.Variable):
  """Container for groundtruth data and labels used by BernoulliRBF.

  We declare this subclass so that the groundtruth data is not updated
  during training.
  """


class BernoulliRBF(nnx.Module):
  r"""Model P(+|x) ~ \beta(a(x), b(x)).

  Given some input data x, we want to estimate the number of virtual positive
  and negative observations to associate with x. These are used as parameters
  in a beta distribution, allowing us to have both an expected value for
  P(+|x) and a measure of certainty, according to the total weight a(x) + b(x).

  We combine two approaches for estimating a(x), b(x):
  First, a learned RBF kernel over the ground-truth observations acts as a KNN
  classifier, contributing positive and negative observations at arbitrary x
  according to learned similarity between x and the groundtruth.

  Second, we (optionally) directly predict a number of pos/neg observations
  a_f(x), b_f(x) from the features themselves. For example, if one of the
  features is a classifier score, this allows the model to directly use the
  classifier score as a prior, with some learned weight.
  """
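
  # A worked example of the beta parameterization (illustrative numbers, not
  # derived from this module): with a(x) = 3 virtual positives and b(x) = 1
  # virtual negative, E[P(+|x)] = a / (a + b) = 0.75 with total evidence
  # a + b = 4; a point with a(x) = 30, b(x) = 10 has the same expected value
  # but far higher certainty (a much narrower beta distribution).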

  def __init__(
      self,
      data: jnp.ndarray,
      data_labels: jnp.ndarray,
      data_mean: float | None = 0.0,
      data_std: float | None = 1.0,
      learn_feature_weights: bool = False,
      *,
      rngs: nnx.Rngs
  ):
    """Builds the estimator from groundtruth observations.

    Args:
      data: Observed features with shape [num_examples, num_features].
      data_labels: Binary labels for `data` (1 = positive, 0 = negative).
      data_mean: Mean used to normalize features; estimated from `data` when
        None.
      data_std: Standard deviation used to normalize features; estimated from
        `data` when None.
      learn_feature_weights: If True, also predict pos/neg counts directly
        from the features.
      rngs: Random state for parameter initialization.
    """
    key = rngs.params()
    num_features = data.shape[-1]
    self.scales_pos = nnx.Param(jax.random.uniform(key, (num_features,)))
    self.scales_neg = nnx.Param(jax.random.uniform(key, (num_features,)))
    self.weight_bias = nnx.Param(jnp.zeros([2]))
    if data_mean is None:
      self.data_mean = BernoulliData(jnp.mean(data, axis=0, keepdims=True))
    else:
      self.data_mean = BernoulliData(data_mean)
    if data_std is None:
      self.data_stds = BernoulliData(jnp.std(data, axis=0, keepdims=True))
    else:
      self.data_stds = BernoulliData(data_std)
    data_pos, data_neg = self.split_labeled_data(data, data_labels)
    self.data_pos = jax.lax.stop_gradient(
        BernoulliData(self._normalize(data_pos))
    )
    self.data_neg = jax.lax.stop_gradient(
        BernoulliData(self._normalize(data_neg))
    )
    self.data_labels = jax.lax.stop_gradient(BernoulliData(data_labels))
    self.learn_feature_weights = learn_feature_weights

    # Matrices for assigning pos/neg weight directly from features.
    self.feature_weights = nnx.Param(jax.random.uniform(key, (num_features, 2)))
    self.feature_bias = nnx.Param(jax.random.uniform(key, (2,)))

  @classmethod
  def split_labeled_data(cls, data: jnp.ndarray, data_labels: jnp.ndarray):
    pos_idxes = jnp.where(data_labels == 1)[0]
    neg_idxes = jnp.where(data_labels == 0)[0]
    data_pos = data[pos_idxes]
    data_neg = data[neg_idxes]
    return data_pos, data_neg

  def _normalize(self, x):
    return (x - self.data_mean.value) / self.data_stds.value

  def _log_counts(self, x: jnp.ndarray, normalize: bool = True):
    if normalize:
      x = self._normalize(x)
    # Scaled squared distances (plus bias) to the pos/neg groundtruth;
    # exp(-distance) acts as an RBF similarity weight.
    pos_count = scaled_rbf_kernel(
        x, self.data_pos, self.scales_pos, self.weight_bias[0]
    )
    neg_count = scaled_rbf_kernel(
        x, self.data_neg, self.scales_neg, self.weight_bias[1]
    )

    if self.learn_feature_weights:
      feature_count = jnp.dot(x, self.feature_weights.value) + self.feature_bias
      pos_count = jnp.concat([pos_count, feature_count[:, :1]], axis=1)
      neg_count = jnp.concat([neg_count, feature_count[:, 1:]], axis=1)
    # log a(x), log b(x), and log(a(x) + b(x)): logs of the summed RBF weights.
    log_pos_count = log_sum_exp(-pos_count, axis=1)
    log_neg_count = log_sum_exp(-neg_count, axis=1)
    log_weight_count = log_sum_exp(
        jnp.concatenate([-pos_count, -neg_count], axis=1), axis=1
    )
    return log_pos_count, log_neg_count, log_weight_count

  def __call__(self, x: jnp.ndarray, normalize: bool = True):
    """Computes log(P(+|x)) and the log of the total example weight at x."""
    log_pos_count, _, log_weight_count = self._log_counts(x, normalize)
    log_p_x = log_pos_count - log_weight_count
    return log_p_x, log_weight_count
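
  # For instance (an illustrative note, not in the original file): for a
  # feature batch `x_eval` of shape [num_examples, num_features],
  # `jnp.exp(model(x_eval)[0])` gives the estimated P(+|x) for each example,
  # and `jnp.exp(model(x_eval)[1])` the corresponding total virtual
  # observation count a(x) + b(x).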

  def sampled_counts(self, seed: int, x: jnp.ndarray, n_samples: int = 1024):
    """Create sampled positive counts from the learned distribution at x."""
    log_pos_count, log_neg_count, unused_log_wt = self._log_counts(x)
    pos_count = jnp.exp(log_pos_count)[:, jnp.newaxis]
    neg_count = jnp.exp(log_neg_count)[:, jnp.newaxis]

    # Use independent keys for the beta and uniform draws.
    k_beta, k_unif = jax.random.split(jax.random.PRNGKey(seed))
    # For each example, sample P(+|x) ~ Beta(a(x), b(x)); then, for each of the
    # n_samples draws, count the examples which come up positive.
    beta_samp = jax.random.beta(
        k_beta, pos_count, neg_count, shape=[pos_count.shape[0], n_samples]
    )
    sample_counts = jnp.sum(
        jax.random.uniform(k_unif, shape=beta_samp.shape) < beta_samp, axis=0
    )
    return sample_counts
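
  # One illustrative downstream use (an assumption, not part of this module):
  # dividing the sampled counts by the number of evaluated examples gives
  # samples of the positive-class rate, from which an interval estimate
  # follows, e.g.:
  #   rates = model.sampled_counts(seed=0, x=x_eval) / x_eval.shape[0]
  #   lo, hi = jnp.quantile(rates, jnp.array([0.05, 0.95]))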

  def gt_log_likelihood(self):
    """Log likelihood of the groundtruth data (averaged within each class)."""
    # Counts for positive points.
    pos_pos_count = scaled_rbf_kernel(  # [N+, N+]
        self.data_pos, self.data_pos, self.scales_pos, self.weight_bias[0]
    )
    pos_neg_count = scaled_rbf_kernel(
        self.data_pos, self.data_neg, self.scales_neg, self.weight_bias[1]
    )

    # Counts for negative points.
    neg_neg_count = scaled_rbf_kernel(
        self.data_neg, self.data_neg, self.scales_neg, self.weight_bias[1]
    )
    neg_pos_count = scaled_rbf_kernel(
        self.data_neg, self.data_pos, self.scales_pos, self.weight_bias[0]
    )

    # Estimate pos/neg priors from raw features.
    if self.learn_feature_weights:
      pos_feature_count = (
          jnp.dot(self.data_pos.value, self.feature_weights.value)  # [N+, 2]
          + self.feature_bias.value
      )
      neg_feature_count = (
          jnp.dot(self.data_neg.value, self.feature_weights.value)  # [N-, 2]
          + self.feature_bias.value
      )
      # Add feature counts to the list of actual counts from data.
      pos_pos_count = jnp.concat(
          [pos_pos_count, pos_feature_count[:, :1]], axis=-1
      )
      pos_neg_count = jnp.concat(
          [pos_neg_count, pos_feature_count[:, 1:]], axis=-1
      )
      neg_pos_count = jnp.concat(
          [neg_pos_count, neg_feature_count[:, :1]], axis=-1
      )
      neg_neg_count = jnp.concat(
          [neg_neg_count, neg_feature_count[:, 1:]], axis=-1
      )

    # log P(+|x) for positive points and log P(-|x) for negative points.
    pos_log_prob = log_sum_exp(-pos_pos_count, axis=1) - log_sum_exp(
        jnp.concatenate([-pos_pos_count, -pos_neg_count], axis=1), axis=1
    )
    neg_log_prob = log_sum_exp(-neg_neg_count, axis=1) - log_sum_exp(
        jnp.concatenate([-neg_neg_count, -neg_pos_count], axis=1), axis=1
    )

    pos_log_prob = jnp.mean(pos_log_prob)
    neg_log_prob = jnp.mean(neg_log_prob)
    return pos_log_prob + neg_log_prob

  def matching_loss(self):
    """Difference between observed log P(+) and estimated log P(+)."""
    data_log_p_x, _ = self(
        jnp.concatenate([self.data_pos, self.data_neg], axis=0), normalize=False
    )
    target_log_p_x = jnp.log(self.data_pos.shape[0]) - jnp.log(
        self.data_pos.shape[0] + self.data_neg.shape[0]
    )
    return jnp.abs(data_log_p_x.mean() - target_log_p_x)


@nnx.jit
def train_step(
    model: BernoulliRBF, optimizer: nnx.optimizer.Optimizer, mu: float
) -> float:
  """Runs a single optimization step; mu weights the matching loss."""

  def loss_fn(model: BernoulliRBF):
    gt_log_likelihood_loss = -model.gt_log_likelihood()
    matching_loss = model.matching_loss()
    return gt_log_likelihood_loss + mu * matching_loss

  loss, grads = nnx.value_and_grad(loss_fn)(model)
  optimizer.update(grads)
  return loss
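

# Example usage (a sketch under assumed data and hyperparameters; `optax`, the
# synthetic features/labels, and the step count below are illustrative and not
# part of this module):
#
#   import optax
#
#   features = jax.random.normal(jax.random.PRNGKey(0), (200, 8))
#   labels = (features[:, 0] > 0).astype(jnp.int32)
#   model = BernoulliRBF(features, labels, rngs=nnx.Rngs(0))
#   optimizer = nnx.Optimizer(model, optax.adam(1e-2))
#   for _ in range(200):
#     loss = train_step(model, optimizer, mu=1.0)
#   log_p_x, log_weight = model(features)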