Skip to content

Commit acebcc9

Browse files
Full support for ShardTensor and Stormscope utils in physicsnemo.diffusion (#1547)
* Support FSDP/ShardTensor in diffusion * Full ShardTensor, sigma_data, sigma bin support * per-channel sigma in preconditioners, domain parallel unit tests * Minor cleanup * update diffusion import rules * dict casting * revert dict handling * condition check * fix typo * Promote domain parallel wrapper to NoiseScheduler * Address feedback * loguniform doctest fix * Fix minor bugs * Fix formatting and clarity in README.md Corrected capitalization in section headers and improved clarity in several sentences throughout the README. * Small fixes --------- Co-authored-by: megnvidia <mmiranda@nvidia.com>
1 parent ab8bc24 commit acebcc9

29 files changed

+2479
-1063
lines changed

docs/api/diffusion/noise_schedulers.rst

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,13 @@ available at three levels:
9999
- the inverse mapping :math:`\sigma^{-1}(\sigma) = t` from noise level back to time
100100
- the discretization of the diffusion time grid
101101

102-
- **Ready-to-use schedules**: Five concrete implementations that work out of
102+
- **Ready-to-use schedules**: Multiple concrete implementations that work out of
103103
the box:
104104

105105
- :class:`EDMNoiseScheduler` --- :math:`\alpha(t)=1`,
106106
:math:`\sigma(t)=t`. The recommended default for most applications.
107+
- :class:`EDMLogUniformNoiseScheduler` --- EDM variant that samples
108+
training times uniformly in log-space instead of from a log-normal.
107109
- :class:`VENoiseScheduler` --- Variance Exploding schedule.
108110
- :class:`VPNoiseScheduler` --- Variance Preserving schedule.
109111
- :class:`IDDPMNoiseScheduler` --- Improved DDPM schedule.
@@ -137,6 +139,14 @@ API Reference
137139
:members:
138140
:exclude-members: __init__
139141

142+
:code:`EDMLogUniformNoiseScheduler`
143+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
144+
145+
.. autoclass:: physicsnemo.diffusion.noise_schedulers.EDMLogUniformNoiseScheduler
146+
:show-inheritance:
147+
:members:
148+
:exclude-members: __init__
149+
140150
:code:`VENoiseScheduler`
141151
~~~~~~~~~~~~~~~~~~~~~~~~
142152

@@ -168,3 +178,10 @@ API Reference
168178
:show-inheritance:
169179
:members:
170180
:exclude-members: __init__
181+
182+
:code:`DomainParallelNoiseScheduler`
183+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
184+
185+
.. autoclass:: physicsnemo.diffusion.noise_schedulers.DomainParallelNoiseScheduler
186+
:members:
187+
:exclude-members: __init__

examples/weather/stormcast/README.md

Lines changed: 85 additions & 57 deletions
Large diffs are not rendered by default.

examples/weather/stormcast/config/inference/stormcast.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ experiment_name: 'stormcast-inference' # Name for the inference experiment being
2020
run_id: 0 # Unique identifier for the inference run
2121
rundir: ./${inference.outdir}/${inference.experiment_name}/${inference.run_id} # Path where experiment outputs will be saved
2222
regression_checkpoint: stormcast_checkpoints/regression/StormCastUNet.0.0.mdlus
23-
diffusion_checkpoint: stormcast_checkpoints/diffusion/EDMPrecond.0.0.mdlus
23+
diffusion_checkpoint: stormcast_checkpoints/diffusion/EDMPreconditioner.0.0.mdlus
2424

2525
# Initial and lead times
2626
initial_time: "2022-11-04T21:00:00" # datetime to initialize forecast with (YYYY-MM-DDTHH:MM:SS)

examples/weather/stormcast/config/sampler/edm_deterministic.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
# Sampler args
18-
# Below are passed as kwargs to physicsnemo.utils.diffusion.determinisitic_sampler
19-
# Also supports stochastic sampling via S_churn and related args.
17+
# Sampler args for physicsnemo.diffusion.samplers.sample
18+
# Uses EDMNoiseScheduler for timestep generation and Heun solver for ODE integration.
19+
# Set S_churn > 0 for stochastic sampling (EDMStochasticHeunSolver).
2020
# See EDM paper for details (https://arxiv.org/abs/2206.00364)
2121

2222
name: 'EDM Deterministic'
@@ -25,6 +25,7 @@ args:
2525
sigma_min: 0.002
2626
sigma_max: 800
2727
rho: 7
28+
solver: heun # "heun" (2nd order) or "euler" (1st order, faster)
2829
S_churn: 0.
2930
S_min: 0.
3031
S_max: .inf

examples/weather/stormcast/config/training/default.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ force_sharding: False
4747
# Performance and optimization (like corrdiff perf section)
4848
perf:
4949
fp_optimizations: fp32 # Floating point mode: "fp32", "amp-fp16", "amp-bf16"
50-
torch_compile: False # Use torch.compile to compile model
50+
torch_compile: False # torch.compile the training loss forward (skipped with domain parallelism)
5151
use_apex_gn: False # Use Apex GroupNorm (enables channels_last memory format)
5252
allow_tf32: False # Allow TF32 for matmul and cuDNN (faster but less precise)
5353
allow_fp16_reduced_precision: False # Allow reduced precision reductions in fp16

examples/weather/stormcast/inference.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
from omegaconf import DictConfig
2424
from physicsnemo.core import Module
2525

26+
from physicsnemo.diffusion.noise_schedulers import EDMNoiseScheduler
27+
2628
from datasets import dataset_classes
2729
from utils.io import (
2830
init_inference_results_zarr,
@@ -81,6 +83,13 @@ def main(cfg: DictConfig):
8183
net = Module.from_checkpoint(cfg.inference.diffusion_checkpoint)
8284
diffusion_model = net.to(device)
8385

86+
sa = dict(cfg.sampler.args)
87+
sampling_scheduler = EDMNoiseScheduler(
88+
sigma_min=sa.get("sigma_min", 0.002),
89+
sigma_max=sa.get("sigma_max", 80.0),
90+
rho=sa.get("rho", 7.0),
91+
)
92+
8493
# initialize zarr
8594
(
8695
group,
@@ -150,7 +159,8 @@ def main(cfg: DictConfig):
150159
diffusion_model,
151160
condition,
152161
state_pred.shape,
153-
sampler_args=dict(cfg.sampler.args),
162+
scheduler=sampling_scheduler,
163+
sampler_args=sa,
154164
lead_time_label=lead_time_label,
155165
)
156166

examples/weather/stormcast/test_training.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,9 @@ def test_training(
162162
if dist.world_size > 1:
163163
torch.distributed.barrier()
164164

165-
net_cls = "EDMPrecond" if net_architecture == "unet" else "EDMPreconditioner"
166-
ckpt_path = os.path.join(rundir, "checkpoints_diffusion", f"{net_cls}.0.10.mdlus")
165+
ckpt_path = os.path.join(
166+
rundir, "checkpoints_diffusion", "EDMPreconditioner.0.10.mdlus"
167+
)
167168
assert os.path.isfile(ckpt_path), "Diffusion checkpoint not found"
168169

169170

@@ -232,8 +233,9 @@ def test_checkpointing(
232233
if num_procs > 1:
233234
torch.distributed.barrier()
234235

235-
net_cls = "EDMPrecond" if net_architecture == "unet" else "EDMPreconditioner"
236-
ckpt_path = os.path.join(rundir, "checkpoints_diffusion", f"{net_cls}.0.20.mdlus")
236+
ckpt_path = os.path.join(
237+
rundir, "checkpoints_diffusion", "EDMPreconditioner.0.20.mdlus"
238+
)
237239
assert os.path.isfile(ckpt_path), (
238240
f"Diffusion checkpoint not found on rank {dist.rank}"
239241
)
@@ -359,7 +361,7 @@ def test_seeding(
359361
360362
- Domain (model-parallel) groups are {0, 1} and {2, 3}.
361363
Ranks within the same domain group must see **identical** sigma
362-
(enforced by ``replicate_in_mesh`` broadcast).
364+
(enforced by ``DomainParallelNoiseScheduler`` broadcast).
363365
- DDP (data-parallel) groups are {0, 2} and {1, 3}.
364366
Ranks in different DDP groups must see **different** sigma
365367
(they process different data and have distinct RNG seeds).
@@ -391,17 +393,26 @@ def test_seeding(
391393

392394
t = trainer.Trainer(cfg)
393395

394-
# -- instrument the loss to capture post-broadcast sigma values ----------
396+
# -- instrument the loss to capture sigma values -------------------------
397+
from physicsnemo.diffusion.noise_schedulers import DomainParallelNoiseScheduler
398+
399+
scheduler = t.train_noise_scheduler
400+
if domain_parallel_size > 1 and not isinstance(
401+
scheduler, DomainParallelNoiseScheduler
402+
):
403+
raise ValueError(
404+
"test_seeding requires a DomainParallelNoiseScheduler on the "
405+
"loss when domain_parallel_size > 1"
406+
)
395407
captured_sigmas: list[torch.Tensor] = []
396-
_orig_replicate = t.loss_fn.replicate_in_mesh
408+
_orig_sample_time = scheduler.sample_time
397409

398-
def _capturing_replicate(x, y):
399-
result = _orig_replicate(x, y)
400-
local = result.to_local() if hasattr(result, "to_local") else result
401-
captured_sigmas.append(local.detach().cpu())
410+
def _capturing_sample_time(*args, **kwargs):
411+
result = _orig_sample_time(*args, **kwargs)
412+
captured_sigmas.append(result.detach().cpu())
402413
return result
403414

404-
t.loss_fn.replicate_in_mesh = _capturing_replicate
415+
scheduler.sample_time = _capturing_sample_time
405416

406417
# -- helper: gather sigmas and assert the expected pattern ---------------
407418
def _check_sigma_pattern(label: str) -> None:
@@ -520,8 +531,7 @@ def test_model_types(
520531
if dist.world_size > 1:
521532
torch.distributed.barrier()
522533

523-
net_cls = "EDMPrecond" if net_architecture == "unet" else "EDMPreconditioner"
524534
ckpt_path = os.path.join(
525-
rundir, "checkpoints_diffusion", f"{net_cls}.0.10.mdlus"
535+
rundir, "checkpoints_diffusion", "EDMPreconditioner.0.10.mdlus"
526536
)
527537
assert os.path.isfile(ckpt_path), "Diffusion checkpoint not found"

examples/weather/stormcast/utils/config.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,9 @@ class PerfConfig:
5454
fp_optimizations: Literal["fp32", "amp-fp16", "amp-bf16"] = (
5555
"fp32" # Floating point mode: "fp32", "amp-fp16", "amp-bf16"
5656
)
57-
torch_compile: bool = False # Use torch.compile to compile model
57+
torch_compile: bool = (
58+
False # torch.compile training loss forward (skipped with domain parallelism)
59+
)
5860
use_apex_gn: bool = (
5961
False # Use Apex GroupNorm (enables channels_last memory format)
6062
)

0 commit comments

Comments
 (0)