Commit 07b3bd8

Authored Mar 2, 2025
Add files via upload
1 parent fb693e4

File tree: 2 files changed, +445 -0 lines changed

 

inference.py (+185 lines)
@@ -0,0 +1,185 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import argparse

import cv2
import numpy as np
from PIL import Image
import torch
import torch.utils.checkpoint
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

from diffusers import AutoencoderKL, EulerDiscreteScheduler

from src.modules.head_net import HeadNet
from src.modules.light_net import LightNet
from src.modules.ref_net import RefNet
from src.modules.unet import UNetSpatioTemporalConditionModel
from src.pipelines.pipeline_relightalbepa_composer import RelightablepaPipeline


pretrained_model_name_or_path = "../../stable-video-diffusion-img2vid-xt"

# Load the scheduler, feature extractor, image encoder, VAE, UNet, and condition embedders.
noise_scheduler = EulerDiscreteScheduler.from_pretrained(
    pretrained_model_name_or_path, subfolder="scheduler")
feature_extractor = CLIPImageProcessor.from_pretrained(
    pretrained_model_name_or_path, subfolder="feature_extractor"
)
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    pretrained_model_name_or_path, subfolder="image_encoder", variant="fp16"
)
vae = AutoencoderKL.from_pretrained(
    pretrained_model_name_or_path, subfolder="sd-vae-ft-mse")
# The UNet is built from its config only; the fine-tuned weights are loaded below.
unet = UNetSpatioTemporalConditionModel.from_config(
    pretrained_model_name_or_path,
    subfolder="unet",
    low_cpu_mem_usage=True,
)
head_embedder = HeadNet(noise_latent_channels=320)
light_embedder = LightNet(noise_latent_channels=320)
ref_embedder = RefNet(noise_latent_channels=320)

# Freeze every component; this script only runs inference.
vae.requires_grad_(False)
image_encoder.requires_grad_(False)
unet.requires_grad_(False)
head_embedder.requires_grad_(False)
light_embedder.requires_grad_(False)
ref_embedder.requires_grad_(False)

# Load the fine-tuned weights.
unet.load_state_dict(torch.load("outputs/checkpoint-29000/unet.pth"))
head_embedder.load_state_dict(torch.load("outputs/checkpoint-29000/head_embedder.pth"))
light_embedder.load_state_dict(torch.load("outputs/checkpoint-29000/light_embedder.pth"))
ref_embedder.load_state_dict(torch.load("outputs/checkpoint-29000/app_embedder.pth"))

weight_dtype = torch.float16
device = "cuda"

image_encoder.to(device, dtype=weight_dtype)
vae.to(device, dtype=weight_dtype)
unet.to(device, dtype=weight_dtype)
head_embedder.to(device, dtype=weight_dtype)
light_embedder.to(device, dtype=weight_dtype)
ref_embedder.to(device, dtype=weight_dtype)

# Assemble the pipeline from the pretrained base model and the fine-tuned components.
pipeline = RelightablepaPipeline.from_pretrained(
    pretrained_model_name_or_path,
    unet=unet,
    image_encoder=image_encoder,
    vae=vae,
    head_embedder=head_embedder,
    light_embedder=light_embedder,
    ref_embedder=ref_embedder,
    torch_dtype=weight_dtype,
)
pipeline = pipeline.to(device)
pipeline.set_progress_bar_config(disable=False)


def portrait_animation_and_relighting(video_path, save_path, guidance, inference_steps, driving_mode="relighting"):
    path = "resources/target/"
    path_tmp = "resources/tmp/"
    if not os.path.exists(path):
        os.makedirs(path)
    else:
        os.system(f"rm -r {path}/*")

    if not os.path.exists(path_tmp):
        os.makedirs(path_tmp)
    else:
        os.system(f"rm -r {path_tmp}/*")

    # Split the preprocessed condition video into individual frames.
    os.system(f"ffmpeg -i {video_path} {path}/%5d.png")

    pixel_values = []
    pixel_head = []
    pixel_values_light = []
    # Each preprocessed frame is a horizontal strip of 512x512 tiles:
    # [reference | matte | driving frame | landmark map | shading hint].
    img = np.array(Image.open(path + "00001.png"))
    # img = cv2.resize(img, (img.shape[1], img.shape[0]))
    pixel_ref_values = img[:, :512]
    pixel_ref_mask = img[:, 512:1024]
    pixel_ref_mask = cv2.resize(pixel_ref_mask, (64, 64))
    # pixel_ref_mask = np.ones_like(pixel_ref_mask) * 255

    for i in range(1, len(os.listdir(path)) + 1):
        img = np.array(Image.open(f"{path}/{str(i).zfill(5)}.png"), dtype=np.uint8)
        # img = cv2.resize(img, (img.shape[1], img.shape[0]))
        pixel_values.append(img[:, 1024:1536][None])         # driving frame
        pixel_head.append(img[:, 1536:2048][None])           # landmark map
        pixel_values_light.append(img[:, 2048:2560][None])   # shading hint

    pixel_values = torch.tensor(np.concatenate(pixel_values, axis=0)[None]).to(device, dtype=weight_dtype).permute(0, 1, 4, 2, 3) / 127.5 - 1.0
    pixel_head = torch.tensor(np.concatenate(pixel_head, axis=0)[None]).to(device, dtype=weight_dtype).permute(0, 1, 4, 2, 3) / 255.0
    pixel_values_light = torch.tensor(np.concatenate(pixel_values_light, axis=0)[None]).to(device, dtype=weight_dtype).permute(0, 1, 4, 2, 3) / 255.0

    pixel_ref_values = torch.tensor(pixel_ref_values[None, None]).repeat(1, pixel_values.size(1), 1, 1, 1).to(device, dtype=weight_dtype).permute(0, 1, 4, 2, 3) / 127.5 - 1.0
    pixel_ref_mask = torch.tensor(pixel_ref_mask[None, None]).repeat(1, pixel_values.size(1), 1, 1, 1).to(device, dtype=weight_dtype).permute(0, 1, 4, 2, 3)[:, :, 0:1] / 255.0

    num_frames = pixel_values.size(1)
    pixel_pil = [Image.fromarray(np.uint8((pixel_values.permute(0, 1, 3, 4, 2).cpu().numpy()[0, i] + 1) * 127.5)) for i in range(num_frames)]
    heads_pil = [Image.fromarray(np.uint8((pixel_head.permute(0, 1, 3, 4, 2).cpu().numpy()[0, i]) * 255)) for i in range(num_frames)]
    lights_drv_pil = [Image.fromarray(np.uint8((pixel_values_light.permute(0, 1, 3, 4, 2).cpu().numpy()[0, i]) * 255)) for i in range(num_frames)]
    reference_pil = [Image.fromarray(np.uint8((pixel_ref_values.permute(0, 1, 3, 4, 2).cpu().numpy()[0, 0] + 1) * 127.5))]

    # Conditional / unconditional inputs for classifier-free guidance.
    if driving_mode == "relighting":
        model_args = [{"image_head": None, "image_light": pixel_values_light, "image_ref": pixel_ref_values},  # cond
                      {"image_head": None, "image_light": None, "image_ref": pixel_ref_values}]                # uncond
    elif driving_mode == "landmark":
        model_args = [{"image_head": pixel_head, "image_light": None, "image_ref": pixel_ref_values},  # cond
                      {"image_head": None, "image_light": None, "image_ref": None}]                    # uncond
    else:
        model_args = [{"image_head": None, "image_light": pixel_values_light, "image_ref": pixel_ref_values},  # cond
                      {"image_head": None, "image_light": None, "image_ref": None}]                            # uncond

    frames = pipeline(
        reference_pil, model_args=model_args, image_mask=pixel_ref_mask,
        num_frames=pixel_head.size(1),
        tile_size=16, tile_overlap=6,
        height=512, width=512, fps=7,
        noise_aug_strength=0.02, num_inference_steps=inference_steps,
        generator=None, min_guidance_scale=guidance,
        max_guidance_scale=guidance, decode_chunk_size=8, output_type="pt", device="cuda"
    ).frames.cpu()
    video_frames = (frames.permute(0, 1, 3, 4, 2) * 255.0).to(torch.uint8).numpy()[0]

    final = []
    for i in range(pixel_head.size(1)):
        img = video_frames[i]
        head = np.array(heads_pil[i])
        light = np.array(lights_drv_pil[i])
        tar = np.array(pixel_pil[i])
        ref = np.array(reference_pil[0])
        # final.append(np.concatenate([ref, head, light, img, tar], axis=1))
        # Save [reference | shading | result | driving frame] side by side.
        Image.fromarray(np.uint8(np.concatenate([ref, light, img, tar], axis=1))).save(f"{path_tmp}/{str(i).zfill(5)}.png")

    os.system(f"ffmpeg -r 20 -i {path_tmp}/%05d.png -pix_fmt yuv420p -c:v libx264 {save_path} -y")
    # torchvision.io.write_video(save_path, final, fps=20, video_codec='h264', options={'crf': '10'})


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--video_path", type=str, default="resources/shading.mp4", help="preprocessed condition video (reference, matte, driving frames, landmarks, shading) produced by preprocess.py")
    parser.add_argument("--save_path", type=str, default="result.mp4", help="result save path")
    parser.add_argument("--guidance", type=float, default=4.5, help="guidance scale (controls lighting intensity)")
    parser.add_argument("--inference_steps", type=int, default=25, help="diffusion reverse sampling steps")

    args = parser.parse_args()

    portrait_animation_and_relighting(video_path=args.video_path, save_path=args.save_path, guidance=args.guidance, inference_steps=args.inference_steps, driving_mode="relighting")
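For reference, a minimal sketch of the per-frame layout that inference.py slices above; the tile order matches what preprocess.py (below) writes. The array and the variable names (strip, matte, driving, landmark, shading) are illustrative placeholders, not real data or identifiers from the repository:

# Hypothetical illustration of the 512 x 2560 condition strip consumed by inference.py.
import numpy as np

strip = np.zeros((512, 2560, 3), dtype=np.uint8)  # placeholder frame, not real data
reference = strip[:, 0:512]        # source portrait      -> pixel_ref_values
matte     = strip[:, 512:1024]     # foreground matte     -> pixel_ref_mask
driving   = strip[:, 1024:1536]    # driving video frame  -> pixel_values
landmark  = strip[:, 1536:2048]    # landmark map         -> pixel_head
shading   = strip[:, 2048:2560]    # lighting/shading hint -> pixel_values_light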

preprocess.py (+260 lines)
@@ -0,0 +1,260 @@
from src.facepose.mp_utils import LMKExtractor
from src.facepose.draw_utils import FaceMeshVisualizer
from src.facepose.motion_utils import motion_sync
from src.facematting.u2net_matting import U2NET
from src.decalib.utils import util
from src.decalib.utils.tensor_cropper import transform_points
from src.decalib.deca import DECA
from src.decalib.utils.config import cfg as deca_cfg
from PIL import Image
from tqdm import tqdm
import numpy as np
import torch
import cv2
import os
import argparse


class FaceMatting:
    """U2NET-based portrait matting used to build the foreground matte."""

    def __init__(self) -> None:
        self.net = U2NET(3, 1).cuda()
        self.net.load_state_dict(torch.load("./src/facematting/u2net_human_seg.pth"))

    def portrait_matting(self, rgb_image):
        # ImageNet mean/std normalization, applied per channel before adding the batch dim.
        rgb_image = cv2.resize(rgb_image, (320, 320)) / 255.0
        rgb_image[:, :, 0] = (rgb_image[:, :, 0] - 0.485) / 0.229
        rgb_image[:, :, 1] = (rgb_image[:, :, 1] - 0.456) / 0.224
        rgb_image[:, :, 2] = (rgb_image[:, :, 2] - 0.406) / 0.225
        rgb_image_th = torch.tensor(rgb_image[None], dtype=torch.float32).cuda().permute(0, 3, 1, 2)
        with torch.no_grad():
            d1, d2, d3, d4, d5, d6, d7 = self.net(rgb_image_th)
        # Normalize the predicted saliency map to [0, 1], then binarize it.
        pred = d1[:, 0, :, :]
        ma = torch.max(pred)
        mi = torch.min(pred)
        alpha = (pred - mi) / (ma - mi)
        alpha = alpha.detach().cpu().numpy()[0]
        alpha[alpha > 0.5] = 255
        alpha[alpha <= 0.5] = 0
        alpha = np.dstack([alpha, alpha, alpha])
        alpha = cv2.resize(alpha, (512, 512))
        alpha = cv2.dilate(alpha, np.ones([7, 7]))
        return alpha


class FaceImageRender:
    """DECA-based renderer that produces shading hints under a target lighting."""

    def __init__(self) -> None:
        # Init DECA and the FLAME face-region masks used for rendering.
        self.deca = DECA(config=deca_cfg)
        f_mask = np.load('./src/decalib/data/FLAME_masks_face-id.pkl', allow_pickle=True, encoding='latin1')
        v_mask = np.load('./src/decalib/data/FLAME_masks.pkl', allow_pickle=True, encoding='latin1')
        self.mask = {
            'v_mask': v_mask['face'].tolist(),
            'f_mask': f_mask['face'].tolist()
        }

    def image_to_3dcoeff(self, rgb_image):
        with torch.no_grad():
            codedict, detected_flag = self.deca.img_to_3dcoeff(rgb_image)
        return codedict

    def render_shape(self, shape, exp, pose, cam, light, tform, h, w):
        with torch.no_grad():
            # All parameters come from a DECA codedict.
            verts, landmarks2d, landmarks3d = self.deca.flame(shape_params=shape, expression_params=exp, pose_params=pose)

            # Orthographic projection into image space.
            trans_verts = util.batch_orth_proj(verts, cam)
            trans_verts[:, :, 1:] = -trans_verts[:, :, 1:]

            points_scale = [self.deca.image_size, self.deca.image_size]
            trans_verts = transform_points(trans_verts, tform, points_scale, [h, w])

            shape_images, _, grid, alpha_images, albedo_images = self.deca.render.render_shape(verts, trans_verts, h=h, w=w, lights=light, images=None, return_grid=True, mask=self.mask)
            shape_images = shape_images.permute(0, 2, 3, 1).clamp(0, 1).detach().cpu().numpy()[0] * 255
            albedo_images = albedo_images.permute(0, 2, 3, 1).clamp(0, 1).detach().cpu().numpy()[0] * 255
        return shape_images, albedo_images

    def render_shape_with_light(self, codedict, target_light=None):
        if target_light is None:
            target_light = codedict["light"]
        shape, exp, pose = codedict["shape"], codedict["exp"], codedict["pose"]
        cam, tform, h, w = codedict["cam"], codedict["tform"], codedict["height"], codedict["width"]
        shape_image, albedo_image = self.render_shape(shape, exp, pose, cam, target_light, tform, h, w)
        return shape_image

    def render_motion_single(self, image):
        codedict = self.image_to_3dcoeff(image)
        shading = self.render_shape_with_light(codedict)
        return shading

    def render_motion_single_with_light(self, image, target_light_image):
        codedict = self.image_to_3dcoeff(image)
        target_light = self.image_to_3dcoeff(target_light_image)["light"]
        shading = self.render_shape_with_light(codedict, target_light=target_light)
        return shading

    # NOTE: this definition is shadowed by the second render_motion_sync further down,
    # which is the one in effect at runtime.
    def render_motion_sync(self, ref_image, driver_frames, target_light_image):
        ref_code_dict = self.image_to_3dcoeff(ref_image)
        target_light = self.image_to_3dcoeff(target_light_image)["light"]

        shading_frames = []
        for drv_frm in tqdm(driver_frames):
            codedict = self.image_to_3dcoeff(drv_frm)
            shape, exp, pose = ref_code_dict["shape"], ref_code_dict["exp"], codedict["pose"]
            cam, tform, h, w = ref_code_dict["cam"], ref_code_dict["tform"], ref_code_dict["height"], ref_code_dict["width"]
            shape_image, albedo_image = self.render_shape(shape, exp, pose, cam, target_light, tform, h, w)
            shading_frames.append(shape_image)
        return shading_frames

    def render_motion_sync_relative(self, ref_image, driver_frames, target_light_image):
        # Relative mode: transfer pose/expression deltas (w.r.t. the first driving frame)
        # onto the reference identity.
        ref_codedict = self.image_to_3dcoeff(ref_image)
        target_light = self.image_to_3dcoeff(target_light_image)["light"]

        drv_codedict_list = []
        shading_frames = []
        for drv_frm in tqdm(driver_frames):
            drv_codedict = self.image_to_3dcoeff(drv_frm)
            drv_codedict_list.append(drv_codedict)

        # best_dist = 10000
        # best_pose = None
        # for idx, drv_codedict in enumerate(drv_codedict_list):
        #     dist = torch.mean(torch.abs(ref_codedict["pose"] - drv_codedict["pose"]))
        #     if dist < best_dist:
        #         best_dist = dist
        #         best_pose = drv_codedict["pose"]
        best_pose = drv_codedict_list[0]["pose"]
        best_exp = drv_codedict_list[0]["exp"]
        for drv_codedict in drv_codedict_list:
            relative_pose = drv_codedict["pose"] - best_pose + ref_codedict["pose"]
            relative_exp = drv_codedict["exp"] - best_exp + ref_codedict["exp"]
            shape, exp, pose = ref_codedict["shape"], relative_exp, relative_pose
            cam, tform, h, w = ref_codedict["cam"], ref_codedict["tform"], ref_codedict["height"], ref_codedict["width"]
            shape_image, albedo_image = self.render_shape(shape, exp, pose, cam, target_light, tform, h, w)
            shading_frames.append(shape_image)
        return shading_frames

    def render_motion_sync(self, ref_image, driver_frames, target_light_image):
        # Absolute mode: take expression and pose directly from each driving frame.
        ref_codedict = self.image_to_3dcoeff(ref_image)
        target_light = self.image_to_3dcoeff(target_light_image)["light"]

        drv_codedict_list = []
        shading_frames = []
        for drv_frm in tqdm(driver_frames):
            drv_codedict = self.image_to_3dcoeff(drv_frm)
            drv_codedict_list.append(drv_codedict)

        for drv_codedict in drv_codedict_list:
            shape, exp, pose = ref_codedict["shape"], drv_codedict["exp"], drv_codedict["pose"]
            cam, tform, h, w = ref_codedict["cam"], ref_codedict["tform"], ref_codedict["height"], ref_codedict["width"]
            shape_image, albedo_image = self.render_shape(shape, exp, pose, cam, target_light, tform, h, w)
            shading_frames.append(shape_image)
        return shading_frames


class FaceKPDetector:
    """MediaPipe landmark extraction and landmark-map drawing."""

    def __init__(self) -> None:
        self.vis = FaceMeshVisualizer(draw_iris=False, draw_mouse=True, draw_eye=True, draw_nose=True, draw_eyebrow=True, draw_pupil=True)
        self.lmk_extractor = LMKExtractor()

    def motion_sync(self, ref_image, driver_frames):
        ref_image = cv2.cvtColor(ref_image, cv2.COLOR_RGB2BGR)
        ref_frame = cv2.resize(ref_image, (512, 512))
        ref_det = self.lmk_extractor(ref_frame)

        sequence_driver_det = []
        try:
            for frame in tqdm(driver_frames):
                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                frame = cv2.resize(frame, (512, 512))
                result = self.lmk_extractor(frame)
                assert result is not None, "bad video, face not detected"
                sequence_driver_det.append(result)
        except Exception:
            print("face detection failed")
            exit()

        sequence_det_ms = motion_sync(sequence_driver_det, ref_det)
        pose_frames = [self.vis.draw_landmarks((512, 512), i, normed=False) for i in sequence_det_ms]
        return pose_frames

    def motion_self(self, driver_frames):
        pose_frames = []
        try:
            for frame in tqdm(driver_frames):
                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                frame = cv2.resize(frame, (512, 512))
                frame_det = self.lmk_extractor(frame)
                kpmap = self.vis.draw_landmarks((512, 512), frame_det["lmks"], normed=True)
                pose_frames.append(kpmap)
        except Exception:
            print("face detection failed")
            exit()

        return pose_frames

    def single_kp(self, image):
        frame_det = self.lmk_extractor(image)
        kpmap = self.vis.draw_landmarks((512, 512), frame_det["lmks"], normed=True)
        return kpmap


class InferVideo:
    """Builds the concatenated condition video consumed by inference.py."""

    def __init__(self) -> None:
        self.vis = FaceMeshVisualizer(draw_iris=False, draw_mouse=True, draw_eye=True, draw_nose=True, draw_eyebrow=True, draw_pupil=True)
        self.lmk_extractor = LMKExtractor()

        self.fm = FaceMatting()

        self.fir = FaceImageRender()

        self.fkpd = FaceKPDetector()

    def inference(self, source_path, light_path, video_path, save_path, motion_align="relative"):
        tmp_path = "resources/target/"

        if os.path.exists(tmp_path):
            os.system(f"rm -r {tmp_path}")

        os.mkdir(tmp_path)
        os.system(f"ffmpeg -i {video_path} {tmp_path}/%5d.png")

        # Motion sync: extract landmarks and shading hints from the driving frames.
        source_image = np.array(Image.open(source_path).resize([512, 512]))[..., :3]
        target_lighting = np.array(Image.open(light_path).resize([512, 512]))[..., :3]

        driver_frames = [np.array(Image.open(os.path.join(tmp_path, str(i).zfill(5) + ".png")).resize([512, 512])) for i in range(1, 1 + len(os.listdir(tmp_path)))]

        aligned_kpmaps = self.fkpd.motion_self(driver_frames)

        alpha = self.fm.portrait_matting(source_image)

        if motion_align == "relative":
            aligned_shading = self.fir.render_motion_sync_relative(source_image, driver_frames, target_lighting)
        else:
            aligned_shading = self.fir.render_motion_sync(source_image, driver_frames, target_lighting)

        # Each output frame is the horizontal strip [source | matte | driving frame | landmark | shading].
        for idx, (drv_frame, kpmap, shading) in tqdm(enumerate(zip(driver_frames, aligned_kpmaps, aligned_shading))):
            img = np.concatenate([source_image, alpha, drv_frame, kpmap, shading], axis=1)
            Image.fromarray(np.uint8(img)).save(f"{tmp_path}/{str(idx + 1).zfill(5)}.png")

        # Frame 00000 stores the source image with its own landmarks and shading.
        source_kp = self.fkpd.single_kp(source_image)
        source_shading = self.fir.render_motion_single_with_light(source_image, source_image)

        img = np.concatenate([source_image, alpha, source_image, source_kp, source_shading], axis=1)
        Image.fromarray(np.uint8(img)).save(f"{tmp_path}/{str(0).zfill(5)}.png")
        os.system(f"ffmpeg -r 20 -i {tmp_path}/%05d.png -pix_fmt yuv420p -c:v libx264 {save_path} -y")


if __name__ == "__main__":
    iv = InferVideo()

    parser = argparse.ArgumentParser()
    parser.add_argument("--video_path", type=str, default="resources/WDA_DebbieDingell1_000.mp4", help="driving video path")
    parser.add_argument("--source_path", type=str, default="resources/reference.png", help="reference image path")
    parser.add_argument("--light_path", type=str, default="resources/target_lighting1.png", help="target lighting image path")
    parser.add_argument("--save_path", type=str, default="resources/shading.mp4", help="output path for the condition (shading hints) video")
    parser.add_argument("--motion_align", type=str, default="relative", help="motion alignment mode: relative or absolute")
    args = parser.parse_args()

    iv.inference(source_path=args.source_path, light_path=args.light_path, video_path=args.video_path, save_path=args.save_path, motion_align=args.motion_align)
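A minimal end-to-end sketch of how the two scripts chain together, assuming the default paths, resources, and checkpoints referenced above are in place (this driver script is not part of the commit): preprocess.py writes the condition video, which inference.py then consumes.

# Hypothetical driver script: run preprocessing, then relighting inference.
import os

# Build resources/shading.mp4 from a driving video, a reference portrait, and a target lighting image.
os.system(
    "python preprocess.py"
    " --source_path resources/reference.png"
    " --video_path resources/WDA_DebbieDingell1_000.mp4"
    " --light_path resources/target_lighting1.png"
    " --save_path resources/shading.mp4"
)

# Render the relit portrait video from the condition video.
os.system(
    "python inference.py"
    " --video_path resources/shading.mp4"
    " --save_path result.mp4"
    " --guidance 4.5"
    " --inference_steps 25"
)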
