diff --git a/shortfin/python/shortfin_apps/sd/components/service.py b/shortfin/python/shortfin_apps/sd/components/service.py
index be0e0bf9e..bd0fb102c 100644
--- a/shortfin/python/shortfin_apps/sd/components/service.py
+++ b/shortfin/python/shortfin_apps/sd/components/service.py
@@ -386,9 +386,11 @@ async def run(self):
                 await self._denoise(device=device)
             if phases[InferencePhase.DECODE]["required"]:
                 await self._decode(device=device)
-                # Postprocessing needs the output data to be on the host.  Even
-                # without postprocessing, we're done with the GPU, so we wait for
-                # it to finish here.
+            else:
+                # Decode and postprocess both need the output data to be on the host.
+                # With decode enabled, decode itself will wait for the data.
+                # With decode disabled, whether or not we're postprocessing,
+                # we're done with the GPU, so we wait for it to finish here.
                 await device
             if phases[InferencePhase.POSTPROCESS]["required"]:
                 await self._postprocess(device=device)
@@ -546,6 +548,11 @@ async def _decode(self, device):
         )
         (cb.images,) = await fn(cb.latents, fiber=self.fiber)
         cb.images_host.copy_from(cb.images)
+
+        # Wait for the device-to-host transfer, so that we can read the
+        # data with .items.
+        await device
+
         image_array = cb.images_host.items
         dtype = image_array.typecode
         if cb.images_host.dtype == sfnp.float16: