Skip to content

Commit 6bc1178

Browse files
[Img2Img] Fix batch size mismatch prompts vs. init images (#793)
* [Img2Img] Fix batch size mismatch prompts vs. init images
* Remove bogus folder
* fix
* Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py

Co-authored-by: Pedro Cuenca <[email protected]>
Co-authored-by: Pedro Cuenca <[email protected]>
1 parent c1b6ea3 commit 6bc1178

File tree

2 files changed

+60
-2
lines changed

2 files changed

+60
-2
lines changed

src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py

+18-2
Original file line number | Diff line number | Diff line change
@@ -195,6 +195,7 @@ def __call__(
195195
"""
196196
if isinstance(prompt, str):
197197
batch_size = 1
198+
prompt = [prompt]
198199
elif isinstance(prompt, list):
199200
batch_size = len(prompt)
200201
else:
@@ -284,8 +285,23 @@ def __call__(
284285
init_latents = init_latent_dist.sample(generator=generator)
285286
init_latents = 0.18215 * init_latents
286287

287-
# expand init_latents for batch_size
288-
init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
288+
if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0:
289+
# expand init_latents for batch_size
290+
deprecation_message = (
291+
f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
292+
" images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
293+
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
294+
" your script to pass as many init images as text prompts to suppress this warning."
295+
)
296+
deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
297+
additional_image_per_prompt = len(prompt) // init_latents.shape[0]
298+
init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
299+
elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0:
300+
raise ValueError(
301+
f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
302+
)
303+
else:
304+
init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
289305

290306
# get the original timestep using init_timestep
291307
offset = self.scheduler.config.get("steps_offset", 0)

tests/test_pipelines.py

+42
Original file line number | Diff line number | Diff line change
@@ -698,6 +698,48 @@ def test_stable_diffusion_img2img(self):
698698
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
699699
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
700700

701+
def test_stable_diffusion_img2img_multiple_init_images(self):
702+
device = "cpu" # ensure determinism for the device-dependent torch.Generator
703+
unet = self.dummy_cond_unet
704+
scheduler = PNDMScheduler(skip_prk_steps=True)
705+
vae = self.dummy_vae
706+
bert = self.dummy_text_encoder
707+
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
708+
709+
init_image = self.dummy_image.to(device).repeat(2, 1, 1, 1)
710+
711+
# make sure here that pndm scheduler skips prk
712+
sd_pipe = StableDiffusionImg2ImgPipeline(
713+
unet=unet,
714+
scheduler=scheduler,
715+
vae=vae,
716+
text_encoder=bert,
717+
tokenizer=tokenizer,
718+
safety_checker=self.dummy_safety_checker,
719+
feature_extractor=self.dummy_extractor,
720+
)
721+
sd_pipe = sd_pipe.to(device)
722+
sd_pipe.set_progress_bar_config(disable=None)
723+
724+
prompt = 2 * ["A painting of a squirrel eating a burger"]
725+
generator = torch.Generator(device=device).manual_seed(0)
726+
output = sd_pipe(
727+
prompt,
728+
generator=generator,
729+
guidance_scale=6.0,
730+
num_inference_steps=2,
731+
output_type="np",
732+
init_image=init_image,
733+
)
734+
735+
image = output.images
736+
737+
image_slice = image[-1, -3:, -3:, -1]
738+
739+
assert image.shape == (2, 32, 32, 3)
740+
expected_slice = np.array([0.5144, 0.4447, 0.4735, 0.6676, 0.5526, 0.5454, 0.645, 0.5149, 0.4689])
741+
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
742+
701743
def test_stable_diffusion_img2img_k_lms(self):
702744
device = "cpu" # ensure determinism for the device-dependent torch.Generator
703745
unet = self.dummy_cond_unet

0 commit comments

Comments
 (0)