
Commit e8c07ef

Remove sd optional kwarg from distributions (#5583)
1 parent a0cff37 commit e8c07ef

23 files changed: +112 -337 lines

RELEASE-NOTES.md (+1)

@@ -97,6 +97,7 @@ All of the above apply to:
 This includes API changes we did not warn about since at least `3.11.0` (2021-01).

 - Setting initial values through `pm.Distribution(testval=...)` is now `pm.Distribution(initval=...)`.
+- Alternative `sd` keyword argument has been removed from all distributions. `sigma` should be used instead (see [#5583](https://github.com/pymc-devs/pymc/pull/5583)).


 ### New features
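
For code that still used the removed alias, the migration is a pure rename. A minimal before/after sketch (hypothetical model, not taken from this commit):

    import pymc as pm

    with pm.Model():
        # Before this commit, the alias was accepted: pm.Normal("x", mu=0.0, sd=1.0)
        # Now the scale must be passed as sigma:
        x = pm.Normal("x", mu=0.0, sigma=1.0)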

benchmarks/benchmarks/benchmarks.py (+10 -10)

@@ -32,17 +32,17 @@ def glm_hierarchical_model(random_seed=123):

     n_counties = len(data.county.unique())
     with pm.Model() as model:
-        mu_a = pm.Normal("mu_a", mu=0.0, sd=100**2)
+        mu_a = pm.Normal("mu_a", mu=0.0, sigma=100**2)
         sigma_a = pm.HalfCauchy("sigma_a", 5)
-        mu_b = pm.Normal("mu_b", mu=0.0, sd=100**2)
+        mu_b = pm.Normal("mu_b", mu=0.0, sigma=100**2)
         sigma_b = pm.HalfCauchy("sigma_b", 5)
-        a = pm.Normal("a", mu=0, sd=1, shape=n_counties)
-        b = pm.Normal("b", mu=0, sd=1, shape=n_counties)
+        a = pm.Normal("a", mu=0, sigma=1, shape=n_counties)
+        b = pm.Normal("b", mu=0, sigma=1, shape=n_counties)
         a = mu_a + sigma_a * a
         b = mu_b + sigma_b * b
         eps = pm.HalfCauchy("eps", 5)
         radon_est = a[county_idx] + b[county_idx] * data.floor.values
-        pm.Normal("radon_like", mu=radon_est, sd=eps, observed=data.log_radon)
+        pm.Normal("radon_like", mu=radon_est, sigma=eps, observed=data.log_radon)
     return model


@@ -58,7 +58,7 @@ def mixture_model(random_seed=1234):

     with pm.Model() as model:
         w = pm.Dirichlet("w", a=np.ones_like(w_true))
-        mu = pm.Normal("mu", mu=0.0, sd=10.0, shape=w_true.shape)
+        mu = pm.Normal("mu", mu=0.0, sigma=10.0, shape=w_true.shape)
         enforce_order = pm.Potential(
             "enforce_order",
             at.switch(mu[0] - mu[1] <= 0, 0.0, -np.inf)
@@ -88,7 +88,7 @@ class OverheadSuite:
     def setup(self, step):
         self.n_steps = 10000
         with pm.Model() as self.model:
-            pm.Normal("x", mu=0, sd=1)
+            pm.Normal("x", mu=0, sigma=1)

     def time_overhead_sample(self, step):
         with self.model:
@@ -133,8 +133,8 @@ def time_drug_evaluation(self):
         sigma_low = 1
         sigma_high = 10
         with pm.Model():
-            group1_mean = pm.Normal("group1_mean", y_mean, sd=y_std)
-            group2_mean = pm.Normal("group2_mean", y_mean, sd=y_std)
+            group1_mean = pm.Normal("group1_mean", y_mean, sigma=y_std)
+            group2_mean = pm.Normal("group2_mean", y_mean, sigma=y_std)
             group1_std = pm.Uniform("group1_std", lower=sigma_low, upper=sigma_high)
             group2_std = pm.Uniform("group2_std", lower=sigma_low, upper=sigma_high)
             lambda_1 = group1_std**-2
@@ -301,7 +301,7 @@ def freefall(y, t, p):
             # If we know one of the parameter values, we can simply pass the value.
             ode_solution = ode_model(y0=[0], theta=[gamma, 9.8])
             # The ode_solution has a shape of (n_times, n_states)
-            Y = pm.Normal("Y", mu=ode_solution, sd=sigma, observed=y)
+            Y = pm.Normal("Y", mu=ode_solution, sigma=sigma, observed=y)

             t0 = time.time()
             idata = pm.sample(500, tune=1000, chains=2, cores=2, random_seed=0)

docs/source/PyMC_and_Aesara.rst (+7 -7)

@@ -188,8 +188,8 @@ example::

     with pm.Model() as model:
         mu = pm.Normal('mu', 0, 1)
-        sd = pm.HalfNormal('sd', 1)
-        y = pm.Normal('y', mu=mu, sigma=sd, observed=data)
+        sigma = pm.HalfNormal('sigma', 1)
+        y = pm.Normal('y', mu=mu, sigma=sigma, observed=data)

 is roughly equivalent to this::

@@ -203,10 +203,10 @@ is roughly equivalent to this::
     model.add_free_variable(sd_log__)
     model.add_logp_term(corrected_logp_half_normal(sd_log__))

-    sd = at.exp(sd_log__)
-    model.add_deterministic_variable(sd)
+    sigma = at.exp(sd_log__)
+    model.add_deterministic_variable(sigma)

-    model.add_logp_term(pm.Normal.dist(mu, sd).logp(data))
+    model.add_logp_term(pm.Normal.dist(mu, sigma).logp(data))

 The return values of the variable constructors are subclasses
 of Aesara variables, so when we define a variable we can use any
@@ -217,5 +217,5 @@ Aesara operation on them::
     # beta is a at.dvector
     beta = pm.Normal('beta', 0, 1, shape=len(design_matrix))
     predict = at.dot(design_matrix, beta)
-    sd = pm.HalfCauchy('sd', beta=2.5)
-    pm.Normal('y', mu=predict, sigma=sd, observed=data)
+    sigma = pm.HalfCauchy('sigma', beta=2.5)
+    pm.Normal('y', mu=predict, sigma=sigma, observed=data)

docs/source/contributing/developer_guide.rst (+2 -2)

@@ -888,8 +888,8 @@ others. The challenge and some summary of the solution could be found in Luciano

     with pm.Model() as m:
         mu = pm.Normal('mu', 0., 1., shape=(5, 1))
-        sd = pm.HalfNormal('sd', 5., shape=(1, 10))
-        pm.Normal('x', mu=mu, sigma=sd, observed=np.random.randn(2, 5, 10))
+        sigma = pm.HalfNormal('sigma', 5., shape=(1, 10))
+        pm.Normal('x', mu=mu, sigma=sigma, observed=np.random.randn(2, 5, 10))
         trace = pm.sample_prior_predictive(100)

     trace['x'].shape # ==> should be (100, 2, 5, 10)
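
The docs example above leans on standard broadcasting: the (5, 1) `mu` and (1, 10) `sigma` broadcast against the (2, 5, 10) observations, so 100 prior draws stack to (100, 2, 5, 10). A quick numpy check of the implied shape (illustrative only, not part of the diff):

    import numpy as np

    mu = np.zeros((5, 1))
    sigma = np.ones((1, 10))
    obs = np.random.randn(2, 5, 10)
    # mu and sigma broadcast to (5, 10), which aligns with the trailing dims of obs
    print(np.broadcast_shapes(mu.shape, sigma.shape, obs.shape))  # (2, 5, 10)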

docs/source/learn/examples/dimensionality.ipynb (+4 -185)

Large diffs are not rendered by default.

docs/source/learn/examples/posterior_predictive.ipynb (+5 -5)

@@ -151,9 +151,9 @@
 "    b = pm.Normal(\"b\", 0.0, 10.0)\n",
 "\n",
 "    mu = a + b * predictor_scaled\n",
-"    sd = pm.Exponential(\"sd\", 1.0)\n",
+"    sigma = pm.Exponential(\"sigma\", 1.0)\n",
 "\n",
-"    pm.Normal(\"obs\", mu=mu, sigma=sd, observed=outcome_scaled)\n",
+"    pm.Normal(\"obs\", mu=mu, sigma=sigma, observed=outcome_scaled)\n",
 "    idata = pm.sample_prior_predictive(samples=50)"
 ]
 },
@@ -212,9 +212,9 @@
 "    b = pm.Normal(\"b\", 0.0, 1.0)\n",
 "\n",
 "    mu = a + b * predictor_scaled\n",
-"    sd = pm.Exponential(\"sd\", 1.0)\n",
+"    sigma = pm.Exponential(\"sigma\", 1.0)\n",
 "\n",
-"    pm.Normal(\"obs\", mu=mu, sigma=sd, observed=outcome_scaled)\n",
+"    pm.Normal(\"obs\", mu=mu, sigma=sigma, observed=outcome_scaled)\n",
 "    idata = pm.sample_prior_predictive(samples=50)"
 ]
 },
@@ -328,7 +328,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"Everything ran smoothly, but it's often difficult to understand what the parameters' values mean when analyzing a trace plot or table summary -- even more so here, as the parameters live in the standardized space. A useful thing to understand your models is... you guessed it: posterior predictive checks! We'll use PyMC's dedicated function to sample data from the posterior. This function will randomly draw 4000 samples of parameters from the trace. Then, for each sample, it will draw 100 random numbers from a normal distribution specified by the values of `mu` and `sd` in that sample:"
+"Everything ran smoothly, but it's often difficult to understand what the parameters' values mean when analyzing a trace plot or table summary -- even more so here, as the parameters live in the standardized space. A useful thing to understand your models is... you guessed it: posterior predictive checks! We'll use PyMC's dedicated function to sample data from the posterior. This function will randomly draw 4000 samples of parameters from the trace. Then, for each sample, it will draw 100 random numbers from a normal distribution specified by the values of `mu` and `sigma` in that sample:"
 ]
 },
 {
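
The markdown cell above describes what posterior predictive sampling does; for orientation, a minimal sketch of the call it refers to (assuming the `model` and `idata` names from the surrounding notebook cells):

    with model:
        # one simulated dataset per posterior draw of (mu, sigma)
        ppc = pm.sample_posterior_predictive(idata)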

pymc/data.py (+2 -2)

@@ -205,8 +205,8 @@ class Minibatch(TensorVariable):

     >>> with pm.Model() as model:
     ...     mu = pm.Flat('mu')
-    ...     sd = pm.HalfNormal('sd')
-    ...     lik = pm.Normal('lik', mu, sd, observed=x, total_size=(100, 100))
+    ...     sigma = pm.HalfNormal('sigma')
+    ...     lik = pm.Normal('lik', mu, sigma, observed=x, total_size=(100, 100))


 Then you can perform regular Variational Inference out of the box
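
The docstring's follow-up refers to variational inference; the usual next call looks like this (a sketch assuming the `model` above):

    with model:
        approx = pm.fit(n=10000)  # runs ADVI by default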

pymc/distributions/continuous.py (+13 -51)

@@ -546,13 +546,10 @@ class Normal(Continuous):
     rv_op = normal

     @classmethod
-    def dist(cls, mu=0, sigma=None, tau=None, sd=None, no_assert=False, **kwargs):
-        if sd is not None:
-            sigma = sd
+    def dist(cls, mu=0, sigma=None, tau=None, no_assert=False, **kwargs):
         tau, sigma = get_tau_sigma(tau=tau, sigma=sigma)
         sigma = at.as_tensor_variable(sigma)

-        # sd = sigma
         # tau = at.as_tensor_variable(tau)
         # mean = median = mode = mu = at.as_tensor_variable(floatX(mu))
         # variance = 1.0 / self.tau
@@ -710,13 +707,11 @@ def dist(
         mu: Optional[DIST_PARAMETER_TYPES] = None,
         sigma: Optional[DIST_PARAMETER_TYPES] = None,
         tau: Optional[DIST_PARAMETER_TYPES] = None,
-        sd: Optional[DIST_PARAMETER_TYPES] = None,
         lower: Optional[DIST_PARAMETER_TYPES] = None,
         upper: Optional[DIST_PARAMETER_TYPES] = None,
         *args,
         **kwargs,
     ) -> RandomVariable:
-        sigma = sd if sd is not None else sigma
         tau, sigma = get_tau_sigma(tau=tau, sigma=sigma)
         sigma = at.as_tensor_variable(sigma)
         tau = at.as_tensor_variable(tau)
@@ -866,10 +861,7 @@ class HalfNormal(PositiveContinuous):
     rv_op = halfnormal

     @classmethod
-    def dist(cls, sigma=None, tau=None, sd=None, *args, **kwargs):
-        if sd is not None:
-            sigma = sd
-
+    def dist(cls, sigma=None, tau=None, *args, **kwargs):
         tau, sigma = get_tau_sigma(tau=tau, sigma=sigma)

         assert_negative_support(tau, "tau", "HalfNormal")
@@ -1226,10 +1218,7 @@ class Beta(UnitContinuous):
     rv_op = aesara.tensor.random.beta

     @classmethod
-    def dist(cls, alpha=None, beta=None, mu=None, sigma=None, sd=None, *args, **kwargs):
-        if sd is not None:
-            sigma = sd
-
+    def dist(cls, alpha=None, beta=None, mu=None, sigma=None, *args, **kwargs):
         alpha, beta = cls.get_alpha_beta(alpha, beta, mu, sigma)
         alpha = at.as_tensor_variable(floatX(alpha))
         beta = at.as_tensor_variable(floatX(beta))
@@ -1785,10 +1774,7 @@ class LogNormal(PositiveContinuous):
     rv_op = lognormal

     @classmethod
-    def dist(cls, mu=0, sigma=None, tau=None, sd=None, *args, **kwargs):
-        if sd is not None:
-            sigma = sd
-
+    def dist(cls, mu=0, sigma=None, tau=None, *args, **kwargs):
         tau, sigma = get_tau_sigma(tau=tau, sigma=sigma)

         mu = at.as_tensor_variable(floatX(mu))
@@ -1914,9 +1900,7 @@ class StudentT(Continuous):
     rv_op = studentt

     @classmethod
-    def dist(cls, nu, mu=0, lam=None, sigma=None, sd=None, *args, **kwargs):
-        if sd is not None:
-            sigma = sd
+    def dist(cls, nu, mu=0, lam=None, sigma=None, *args, **kwargs):
         nu = at.as_tensor_variable(floatX(nu))
         lam, sigma = get_tau_sigma(tau=lam, sigma=sigma)
         sigma = at.as_tensor_variable(sigma)
@@ -2306,10 +2290,7 @@ class Gamma(PositiveContinuous):
     rv_op = gamma

     @classmethod
-    def dist(cls, alpha=None, beta=None, mu=None, sigma=None, sd=None, no_assert=False, **kwargs):
-        if sd is not None:
-            sigma = sd
-
+    def dist(cls, alpha=None, beta=None, mu=None, sigma=None, no_assert=False, **kwargs):
         alpha, beta = cls.get_alpha_beta(alpha, beta, mu, sigma)
         alpha = at.as_tensor_variable(floatX(alpha))
         beta = at.as_tensor_variable(floatX(beta))
@@ -2426,10 +2407,7 @@ class InverseGamma(PositiveContinuous):
     rv_op = invgamma

     @classmethod
-    def dist(cls, alpha=None, beta=None, mu=None, sigma=None, sd=None, *args, **kwargs):
-        if sd is not None:
-            sigma = sd
-
+    def dist(cls, alpha=None, beta=None, mu=None, sigma=None, *args, **kwargs):
         alpha, beta = cls._get_alpha_beta(alpha, beta, mu, sigma)
         alpha = at.as_tensor_variable(floatX(alpha))
         beta = at.as_tensor_variable(floatX(beta))
@@ -2750,11 +2728,7 @@ class HalfStudentT(PositiveContinuous):
     rv_op = halfstudentt

     @classmethod
-    def dist(cls, nu=1, sigma=None, lam=None, sd=None, *args, **kwargs):
-
-        if sd is not None:
-            sigma = sd
-
+    def dist(cls, nu=1, sigma=None, lam=None, *args, **kwargs):
         nu = at.as_tensor_variable(floatX(nu))
         lam, sigma = get_tau_sigma(lam, sigma)
         sigma = at.as_tensor_variable(sigma)
@@ -2886,11 +2860,7 @@ class ExGaussian(Continuous):
     rv_op = exgaussian

     @classmethod
-    def dist(cls, mu=0.0, sigma=None, nu=None, sd=None, *args, **kwargs):
-
-        if sd is not None:
-            sigma = sd
-
+    def dist(cls, mu=0.0, sigma=None, nu=None, *args, **kwargs):
         mu = at.as_tensor_variable(floatX(mu))
         sigma = at.as_tensor_variable(floatX(sigma))
         nu = at.as_tensor_variable(floatX(nu))
@@ -3118,10 +3088,7 @@ class SkewNormal(Continuous):
     rv_op = skewnormal

     @classmethod
-    def dist(cls, alpha=1, mu=0.0, sigma=None, tau=None, sd=None, *args, **kwargs):
-        if sd is not None:
-            sigma = sd
-
+    def dist(cls, alpha=1, mu=0.0, sigma=None, tau=None, *args, **kwargs):
         tau, sigma = get_tau_sigma(tau=tau, sigma=sigma)
         alpha = at.as_tensor_variable(floatX(alpha))
         mu = at.as_tensor_variable(floatX(mu))
@@ -3445,10 +3412,7 @@ class Rice(PositiveContinuous):
     rv_op = rice

     @classmethod
-    def dist(cls, nu=None, sigma=None, b=None, sd=None, *args, **kwargs):
-        if sd is not None:
-            sigma = sd
-
+    def dist(cls, nu=None, sigma=None, b=None, *args, **kwargs):
         nu, b, sigma = cls.get_nu_b(nu, b, sigma)
         b = at.as_tensor_variable(floatX(b))
         sigma = at.as_tensor_variable(floatX(sigma))
@@ -3657,12 +3621,10 @@ class LogitNormal(UnitContinuous):
     rv_op = logit_normal

     @classmethod
-    def dist(cls, mu=0, sigma=None, tau=None, sd=None, **kwargs):
-        if sd is not None:
-            sigma = sd
+    def dist(cls, mu=0, sigma=None, tau=None, **kwargs):
         mu = at.as_tensor_variable(floatX(mu))
         tau, sigma = get_tau_sigma(tau=tau, sigma=sigma)
-        sigma = sd = at.as_tensor_variable(sigma)
+        sigma = at.as_tensor_variable(sigma)
         tau = at.as_tensor_variable(tau)
         assert_negative_support(sigma, "sigma", "LogitNormal")
         assert_negative_support(tau, "tau", "LogitNormal")
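
Note that every `dist` method above still funnels the scale through `get_tau_sigma`, which resolves whichever of `tau` or `sigma` was supplied (tau = 1/sigma**2), so the precision parametrization survives even though the `sd` alias is gone. A minimal sketch of the equivalence:

    import pymc as pm

    # Two parametrizations of the same distribution, since tau = 1 / sigma**2:
    x1 = pm.Normal.dist(mu=0.0, sigma=2.0)
    x2 = pm.Normal.dist(mu=0.0, tau=0.25)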

pymc/distributions/mixture.py (+2 -6)

@@ -514,17 +514,13 @@ class NormalMixture:
         y = pm.NormalMixture("y", w=weights, mu=μ, sigma=σ, observed=data)
     """

-    def __new__(cls, name, w, mu, sigma=None, tau=None, sd=None, comp_shape=(), **kwargs):
-        if sd is not None:
-            sigma = sd
+    def __new__(cls, name, w, mu, sigma=None, tau=None, comp_shape=(), **kwargs):
         _, sigma = get_tau_sigma(tau=tau, sigma=sigma)

         return Mixture(name, w, Normal.dist(mu, sigma=sigma, size=comp_shape), **kwargs)

     @classmethod
-    def dist(cls, w, mu, sigma=None, tau=None, sd=None, comp_shape=(), **kwargs):
-        if sd is not None:
-            sigma = sd
+    def dist(cls, w, mu, sigma=None, tau=None, comp_shape=(), **kwargs):
         _, sigma = get_tau_sigma(tau=tau, sigma=sigma)

         return Mixture.dist(w, Normal.dist(mu, sigma=sigma, size=comp_shape), **kwargs)

pymc/distributions/timeseries.py (+6 -13)

@@ -108,15 +108,10 @@ class AR(distribution.Continuous):
         distribution for initial values (Defaults to Flat())
     """

-    def __init__(
-        self, rho, sigma=None, tau=None, constant=False, init=None, sd=None, *args, **kwargs
-    ):
+    def __init__(self, rho, sigma=None, tau=None, constant=False, init=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        if sd is not None:
-            sigma = sd
-
         tau, sigma = get_tau_sigma(tau=tau, sigma=sigma)
-        self.sigma = self.sd = at.as_tensor_variable(sigma)
+        self.sigma = at.as_tensor_variable(sigma)
         self.tau = at.as_tensor_variable(tau)

         self.mean = at.as_tensor_variable(0.0)
@@ -201,17 +196,15 @@ class GaussianRandomWalk(distribution.Continuous):
         distribution for initial value (Defaults to Flat())
     """

-    def __init__(self, tau=None, init=None, sigma=None, mu=0.0, sd=None, *args, **kwargs):
+    def __init__(self, tau=None, init=None, sigma=None, mu=0.0, *args, **kwargs):
         kwargs.setdefault("shape", 1)
         super().__init__(*args, **kwargs)
         if sum(self.shape) == 0:
             raise TypeError("GaussianRandomWalk must be supplied a non-zero shape argument!")
-        if sd is not None:
-            sigma = sd
         tau, sigma = get_tau_sigma(tau=tau, sigma=sigma)
         self.tau = at.as_tensor_variable(tau)
         sigma = at.as_tensor_variable(sigma)
-        self.sigma = self.sd = sigma
+        self.sigma = sigma
         self.mu = at.as_tensor_variable(mu)
         self.init = init or Flat.dist()
         self.mean = at.as_tensor_variable(0.0)
@@ -400,8 +393,8 @@ def logp(self, x):
         xt = x[:-1]
         f, g = self.sde_fn(x[:-1], *self.sde_pars)
         mu = xt + self.dt * f
-        sd = at.sqrt(self.dt) * g
-        return at.sum(Normal.dist(mu=mu, sigma=sd).logp(x[1:]))
+        sigma = at.sqrt(self.dt) * g
+        return at.sum(Normal.dist(mu=mu, sigma=sigma).logp(x[1:]))

     def _distr_parameters_for_repr(self):
         return ["dt"]
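
For context on the last hunk: the renamed variable is the standard deviation of the Euler-Maruyama transition density that this SDE `logp` implements, a standard discretization stated here for reference:

    x_{t + \Delta t} \mid x_t \sim \mathcal{N}\!\left( x_t + f(x_t)\,\Delta t,\; g(x_t)^2\,\Delta t \right)

with drift `f` and diffusion `g` returned by `self.sde_fn` and step size `self.dt`; the diff's `sigma = at.sqrt(self.dt) * g` is exactly the square root of that variance.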

pymc/model.py (+1 -1)

@@ -482,7 +482,7 @@ def __init__(self, mean=0, sigma=1, name=''):
         Normal('v2', mu=mean, sigma=sd)

         # something more complex is allowed, too
-        half_cauchy = HalfCauchy('sd', beta=10, initval=1.)
+        half_cauchy = HalfCauchy('sigma', beta=10, initval=1.)
         Normal('v3', mu=mean, sigma=half_cauchy)

         # Deterministic variables can be used in usual way
