Fix VAE logic and 2B ControlNets, and speed up model loading by loading ControlNets to CUDA if available

brianfitzgerald · brianfitzgerald · commit 7df9edbf5b7b · 2024-11-20T23:09:53.000Z
diff --git a/.gitignore b/.gitignore
@@ -167,4 +167,5 @@ cython_debug/
 #.idea/
 
 .vscode/
-*.out.*
+*.out.*
+*.pt
diff --git a/dit_embedder.py b/dit_embedder.py
@@ -32,6 +32,8 @@ def __init__(
             in_chans=in_chans,
             embed_dim=self.hidden_size,
             strict_img_size=pos_embed_max_size is None,
+            device=device,
+            dtype=dtype,
         )
 
         self.t_embedder = TimestepEmbedder(self.hidden_size, dtype=dtype, device=device)
@@ -41,14 +43,14 @@ def __init__(
 
         self.transformer_blocks = nn.ModuleList(
             DismantledBlock(
-                hidden_size=self.hidden_size, num_heads=num_attention_heads, qkv_bias=True
+                hidden_size=self.hidden_size, num_heads=num_attention_heads, qkv_bias=True, device=device, dtype=dtype
             )
             for _ in range(num_layers)
         )
 
         self.controlnet_blocks = nn.ModuleList([])
         for _ in range(len(self.transformer_blocks)):
-            controlnet_block = nn.Linear(self.hidden_size, self.hidden_size)
+            controlnet_block = nn.Linear(self.hidden_size, self.hidden_size, device=device, dtype=dtype)
             self.controlnet_blocks.append(controlnet_block)
 
         self.pos_embed_input = PatchEmbed(
@@ -57,7 +59,10 @@ def __init__(
             in_chans=in_chans,
             embed_dim=self.hidden_size,
             strict_img_size=False,
+            dtype=dtype,
+            device=device
         )
+        self.using_8b_controlnet: bool = False
 
     def forward(
         self,
@@ -66,10 +71,9 @@ def forward(
         y: Tensor,
         scale: int = 1,
         timestep: Optional[Tensor] = None,
-        is_8b: bool = False
     ) -> Tuple[Tensor, List[Tensor]]:
 
-        if not is_8b:
+        if not self.using_8b_controlnet:
             x = self.x_embedder(x)
         timestep = timestep * 1000
         c = self.t_embedder(timestep, dtype=x.dtype)
@@ -83,7 +87,7 @@ def forward(
 
         for block in self.transformer_blocks:
             out = block(x, c)
-            if is_8b:
+            if self.using_8b_controlnet:
                 x = out
             block_out += (out,)
 
diff --git a/mmditx.py b/mmditx.py
@@ -294,7 +294,7 @@ def post_attention(self, x: torch.Tensor) -> torch.Tensor:
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         (q, k, v) = self.pre_attention(x)
-        x = attention(q, k, v, self.num_heads)
+        x = attention(q, k, v, self.num_heads, self.attn_mode)
         x = self.post_attention(x)
         return x
 
diff --git a/other_impls.py b/other_impls.py
@@ -7,21 +7,33 @@
 import torch
 from torch import nn
 from transformers import CLIPTokenizer, T5TokenizerFast
+from einops import rearrange
+
+try:
+    import xformers.ops
+except ImportError:
+    xformers.ops = None
+    print("xformers not found, attn_mode='xformers' will not work")
 
 #################################################################################################
 ### Core/Utility
 #################################################################################################
 
 
-def attention(q, k, v, heads, mask=None):
+def attention(q, k, v, heads, mask=None, attn_mode: str = "torch"):
     """Convenience wrapper around a basic attention operation"""
     b, _, dim_head = q.shape
     dim_head //= heads
     q, k, v = map(lambda t: t.view(b, -1, heads, dim_head).transpose(1, 2), (q, k, v))
-    out = torch.nn.functional.scaled_dot_product_attention(
-        q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False
-    )
-    return out.transpose(1, 2).reshape(b, -1, heads * dim_head)
+    if attn_mode == "torch":
+        out = torch.nn.functional.scaled_dot_product_attention(
+            q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False
+        )
+        return out.transpose(1, 2).reshape(b, -1, heads * dim_head)
+    elif attn_mode == "xformers":
+        x = xformers.ops.memory_efficient_attention(q, k, v)
+        x = rearrange(x, "b h n d -> b n (h d)")
+        return x
 
 
 class Mlp(nn.Module):
diff --git a/sd3_impls.py b/sd3_impls.py
@@ -148,7 +148,7 @@ def __init__(
                 pooled_projection_size=pooled_projection_size,
                 device=device,
                 dtype=dtype,
-            ).to(device=device, dtype=dtype)
+            )
 
     def apply_model(self, x, sigma, c_crossattn=None, y=None, skip_layers=[], controlnet_cond=None):
         dtype = self.get_dtype()
@@ -159,17 +159,15 @@ def apply_model(self, x, sigma, c_crossattn=None, y=None, skip_layers=[], contro
             controlnet_cond = controlnet_cond.to(dtype=x.dtype, device=x.device)
             controlnet_cond = controlnet_cond.repeat(x.shape[0], 1, 1, 1)
 
-            # 8B ControlNets were trained with a slightly different architecture.
-            is_8b = y_cond.shape[-1] == self.control_model.y_embedder.mlp[0].in_features
-            if not is_8b:
+            if not self.control_model.using_8b_controlnet:
                 y_cond = self.diffusion_model.y_embedder(y)
             
             x_controlnet = x
-            if is_8b:
+            if self.control_model.using_8b_controlnet:
                 hw = x.shape[-2:]
                 x_controlnet = self.diffusion_model.x_embedder(x) + self.diffusion_model.cropped_pos_embed(hw)
             controlnet_hidden_states = self.control_model(
-                x_controlnet, controlnet_cond, y_cond, 1, sigma.to(torch.float32), is_8b
+                x_controlnet, controlnet_cond, y_cond, 1, sigma.to(torch.float32)
             )
         model_output = self.diffusion_model(
             x.to(dtype),
diff --git a/sd3_infer.py b/sd3_infer.py
@@ -17,6 +17,7 @@
 from PIL import Image
 from safetensors import safe_open
 from tqdm import tqdm
+import re
 
 import sd3_impls
 from other_impls import SD3Tokenizer, SDClipModel, SDXLClipG, T5XXLModel
@@ -61,7 +62,9 @@ def load_into(ckpt, model, prefix, device, dtype=None, remap=None):
                 obj.requires_grad_(False)
                 # print(f"K: {model_key}, O: {obj.shape} T: {tensor.shape}")
                 if obj.shape != tensor.shape:
-                    print(f"W: shape mismatch for key {model_key}, {obj.shape} != {tensor.shape}")
+                    print(
+                        f"W: shape mismatch for key {model_key}, {obj.shape} != {tensor.shape}"
+                    )
                 obj.set_(tensor)
             except Exception as e:
                 print(f"Failed to load key '{key}' in safetensors file: {e}")
@@ -148,6 +151,11 @@ class SD3:
     def __init__(
         self, model, shift, control_model_file=None, verbose=False, device="cpu"
     ):
+
+        # NOTE 8B ControlNets were trained with a slightly different forward pass and conditioning, 
+        # so this is a flag to enable that logic.
+        self.using_8b_controlnet = False
+
         with safe_open(model, framework="pt", device="cpu") as f:
             control_model_ckpt = None
             if control_model_file is not None:
@@ -165,9 +173,6 @@ def __init__(
             ).eval()
             load_into(f, self.model, "model.", "cuda", torch.float16)
         if control_model_file is not None:
-            self.model.control_model = self.model.control_model.to(
-                device=device, dtype=torch.float16
-            )
             control_model_ckpt = safe_open(
                 control_model_file, framework="pt", device=device
             )
@@ -179,6 +184,9 @@ def __init__(
                 dtype=torch.float16,
                 remap=CONTROLNET_MAP,
             )
+
+            self.using_8b_controlnet = self.model.control_model.y_embedder.mlp[0].in_features == 2048
+            self.model.control_model.using_8b_controlnet = self.using_8b_controlnet
         control_model_ckpt = None
 
 
@@ -252,7 +260,7 @@ def load(
         model_folder: str = MODEL_FOLDER,
         text_encoder_device: str = "cpu",
         verbose=False,
-        load_tokenizers: bool = True
+        load_tokenizers: bool = True,
     ):
         self.verbose = verbose
         print("Loading tokenizers...")
@@ -374,19 +382,19 @@ def do_sampling(
         self.print("Sampling done")
         return latent
 
-    def vae_encode(self, image, controlnet_cond: bool = False) -> torch.Tensor:
+    def vae_encode(self, image, using_8b_controlnet: bool = False) -> torch.Tensor:
         self.print("Encoding image to latent...")
         image = image.convert("RGB")
         image_np = np.array(image).astype(np.float32) / 255.0
         image_np = np.moveaxis(image_np, 2, 0)
         batch_images = np.expand_dims(image_np, axis=0).repeat(1, axis=0)
         image_torch = torch.from_numpy(batch_images).cuda()
-        if not controlnet_cond:
+        if using_8b_controlnet:
             image_torch = 2.0 * image_torch - 1.0
+        else:
+            image_torch = image_torch * 255
         image_torch = image_torch.cuda()
         self.vae.model = self.vae.model.cuda()
-        if controlnet_cond:
-            image_torch = image_torch * 255
         latent = self.vae.model.encode(image_torch).cpu()
         self.vae.model = self.vae.model.cpu()
         self.print("Encoded")
@@ -411,10 +419,10 @@ def vae_decode(self, latent) -> Image.Image:
         self.print("Decoded")
         return out_image
 
-    def _image_to_latent(self, image, width, height, controlnet_cond: bool = False):
+    def _image_to_latent(self, image, width, height, using_8b_controlnet: bool = False):
         image_data = Image.open(image)
         image_data = image_data.resize((width, height), Image.LANCZOS)
-        latent = self.vae_encode(image_data, controlnet_cond)
+        latent = self.vae_encode(image_data, using_8b_controlnet)
         latent = SD3LatentFormat().process_in(latent)
         return latent
 
@@ -442,7 +450,7 @@ def gen_image(
             latent = latent.cuda()
         if controlnet_cond_image:
             controlnet_cond = self._image_to_latent(
-                controlnet_cond_image, width, height, True
+                controlnet_cond_image, width, height, self.sd3.using_8b_controlnet
             )
         neg_cond = self.get_cond("")
         seed_num = None
@@ -468,8 +476,9 @@ def gen_image(
                 skip_layer_config,
             )
             image = self.vae_decode(sampled_latent)
+            os.makedirs(out_dir, exist_ok=False)
             save_path = os.path.join(out_dir, f"{i:06d}.png")
-            self.print(f"Will save to {save_path}")
+            self.print(f"Saving to to {save_path}")
             image.save(save_path)
             self.print("Done")
 
@@ -553,7 +562,13 @@ def main(
     inferencer = SD3Inferencer()
 
     inferencer.load(
-        model, vae, shift, controlnet_ckpt, model_folder, text_encoder_device, verbose
+        model,
+        vae,
+        shift,
+        controlnet_ckpt,
+        model_folder,
+        text_encoder_device,
+        verbose,
     )
 
     if isinstance(prompt, str):
@@ -563,6 +578,7 @@ def main(
         else:
             prompts = [prompt]
 
+    sanitized_prompt = re.sub(r'[^\w\-\.]', '_', prompt)
     out_dir = os.path.join(
         out_dir,
         (
@@ -573,11 +589,9 @@ def main(
                 else ""
             )
         ),
-        os.path.splitext(os.path.basename(prompt))[0][:50]
+        os.path.splitext(os.path.basename(sanitized_prompt))[0][:50]
         + (postfix or datetime.datetime.now().strftime("_%Y-%m-%dT%H-%M-%S")),
     )
-    print(f"Saving images to {out_dir}")
-    os.makedirs(out_dir, exist_ok=False)
 
     inferencer.gen_image(
         prompts,

-Original file line number
+Diff line change
 #.idea/
 .vscode/
 -*.out.*
 +*.out.*
 +*.pt