diff --git a/shortfin/python/shortfin_apps/sd/components/service.py b/shortfin/python/shortfin_apps/sd/components/service.py index be0e0bf9e..bd0fb102c 100644 --- a/shortfin/python/shortfin_apps/sd/components/service.py +++ b/shortfin/python/shortfin_apps/sd/components/service.py @@ -386,9 +386,11 @@ async def run(self): await self._denoise(device=device) if phases[InferencePhase.DECODE]["required"]: await self._decode(device=device) - # Postprocessing needs the output data to be on the host. Even - # without postprocessing, we're done with the GPU, so we wait for - # it to finish here. + else: + # Decode and postprocess both need the output data to be on the host. + # With decode enabled, decode itself will wait for the data. + # With decode disabled, whether or not we're postprocessing, + # we're done with the GPU, so we wait for it to finish here. await device if phases[InferencePhase.POSTPROCESS]["required"]: await self._postprocess(device=device) @@ -546,6 +548,11 @@ async def _decode(self, device): ) (cb.images,) = await fn(cb.latents, fiber=self.fiber) cb.images_host.copy_from(cb.images) + + # Wait for the device-to-host transfer, so that we can read the + # data with .items. + await device + image_array = cb.images_host.items dtype = image_array.typecode if cb.images_host.dtype == sfnp.float16: