Fix VAE-encode input scaling for ControlNet conditioning images based on control type

brianfitzgerald · brianfitzgerald · commit d343ae05383a · 2024-11-26T03:26:28.000Z
diff --git a/dit_embedder.py b/dit_embedder.py
@@ -36,6 +36,9 @@ def __init__(
             dtype=dtype,
         )
 
+        # blur = 0, canny = 1, depth = 2
+        self.control_type = torch.tensor([0], dtype=torch.int32, device=device)
+
         self.t_embedder = TimestepEmbedder(self.hidden_size, dtype=dtype, device=device)
         self.y_embedder = VectorEmbedder(
             pooled_projection_size, self.hidden_size, dtype, device
diff --git a/sd3_infer.py b/sd3_infer.py
@@ -57,7 +57,7 @@ def load_into(ckpt, model, prefix, device, dtype=None, remap=None):
                 continue
             try:
                 tensor = ckpt.get_tensor(key).to(device=device)
-                if dtype is not None:
+                if dtype is not None and tensor.dtype != torch.int32:
                     tensor = tensor.to(dtype=dtype)
                 obj.requires_grad_(False)
                 # print(f"K: {model_key}, O: {obj.shape} T: {tensor.shape}")
@@ -385,15 +385,19 @@ def do_sampling(
         self.print("Sampling done")
         return latent
 
-    def vae_encode(self, image, using_2b_controlnet: bool = False) -> torch.Tensor:
+    def vae_encode(
+        self, image, using_2b_controlnet: bool = False, controlnet_type: int = 0
+    ) -> torch.Tensor:
         self.print("Encoding image to latent...")
         image = image.convert("RGB")
         image_np = np.array(image).astype(np.float32) / 255.0
         image_np = np.moveaxis(image_np, 2, 0)
         batch_images = np.expand_dims(image_np, axis=0).repeat(1, axis=0)
         image_torch = torch.from_numpy(batch_images).cuda()
         if using_2b_controlnet:
-            image_torch = image_torch * 255
+            image_torch = image_torch * 2.0 - 1.0
+        elif controlnet_type == 1: # canny
+            image_torch = image_torch * 255 * 0.5 + 0.5
         else:
             image_torch = 2.0 * image_torch - 1.0
         image_torch = image_torch.cuda()
@@ -422,10 +426,17 @@ def vae_decode(self, latent) -> Image.Image:
         self.print("Decoded")
         return out_image
 
-    def _image_to_latent(self, image, width, height, using_2b_controlnet: bool = False):
+    def _image_to_latent(
+        self,
+        image,
+        width,
+        height,
+        using_2b_controlnet: bool = False,
+        controlnet_type: int = 0,
+    ) -> torch.Tensor:
         image_data = Image.open(image)
         image_data = image_data.resize((width, height), Image.LANCZOS)
-        latent = self.vae_encode(image_data, using_2b_controlnet)
+        latent = self.vae_encode(image_data, using_2b_controlnet, controlnet_type)
         latent = SD3LatentFormat().process_in(latent)
         return latent
 
@@ -452,12 +463,12 @@ def gen_image(
             latent = self.get_empty_latent(1, width, height, seed, "cpu")
             latent = latent.cuda()
         if controlnet_cond_image:
-            using_2b_controlnet = (
-                self.sd3.model.control_model is not None
-                and not self.sd3.using_8b_controlnet
-            )
+            using_2b, control_type = False, 0
+            if self.sd3.model.control_model is not None:
+                using_2b = not self.sd3.using_8b_controlnet
+                control_type = int(self.sd3.model.control_model.control_type.item())
             controlnet_cond = self._image_to_latent(
-                controlnet_cond_image, width, height, using_2b_controlnet
+                controlnet_cond_image, width, height, using_2b, control_type
             )
         neg_cond = self.get_cond("")
         seed_num = None

Original file line number	Diff line number	Diff line change
`@@ -36,6 +36,9 @@ def __init__(`
`36`	`36`	`dtype=dtype,`
`37`	`37`	`)`
`38`	`38`
	`39`	`+ # blur = 0, canny = 1, depth = 2`
	`40`	`+ self.control_type = torch.tensor([0], dtype=torch.int32, device=device)`
	`41`	`+`
`39`	`42`	`self.t_embedder = TimestepEmbedder(self.hidden_size, dtype=dtype, device=device)`
`40`	`43`	`self.y_embedder = VectorEmbedder(`
`41`	`44`	`pooled_projection_size, self.hidden_size, dtype, device`