# optimization_h36m.py
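"""Alternating optimization of 3D human pose and optical flow on Human3.6M.

Starting from METRO 3D pose estimates, the script alternates between
(a) refining the per-frame 3D poses with temporal-smoothness, bone-length,
optical-flow, reprojection and prior loss terms, and (b) finetuning RAFT so
that its predicted flow agrees with the refined poses. DCPose 2D detections
provide the reprojection targets. Runs on Human3.6M subjects S9 and S11.
"""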
from numpy import dot
from numpy.linalg import norm
from matplotlib import image
from matplotlib import pyplot as plt
from scipy.ndimage import gaussian_filter
from PIL import Image
from statistics import mean
from argparse import Namespace
import cv2
import os
import glob
import torch
import random
import numpy as np
import pickle
import math
import argparse
import sys
from scripts.flow_utils import *
from scripts.pose_utils import *
# make the RAFT modules importable
sys.path.append('RAFT/core')
from raft import RAFT
from utils import flow_viz
from utils.utils import InputPadder
DEVICE = 'cuda'
device = torch.device('cuda')

# hyperparameters
lambda_opt = 0.01      # weight of the optical-flow consistency loss
lambda_pos = 300       # weight of the 3D temporal-smoothness loss
lambda_2D = 0.01       # weight of the 2D reprojection loss (DCPose detections)
lambda_pos_2D = 0.00   # weight of the 2D temporal-smoothness loss; not strictly necessary, can be set to e.g. 0.001
lambda_cam = 0.1       # weight of the camera temporal-smoothness loss
lambda_bone = 10000    # weight of the bone-length consistency loss
lambda_3D = 400        # weight of the prior loss towards the initial 3D estimate
pose_steps = 1500      # maximum iterations per pose-optimization cycle
flow_steps = 8         # passes over the frame pairs when finetuning RAFT
neighborhood = 15      # presumably consumed by the flow-map utilities (scripts.flow_utils)
nb_points = 20         # presumably consumed by the flow-map utilities (scripts.flow_utils)
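
# For reference, the per-frame objective assembled in pose_optimization below
# (SmoothL1 throughout; c = camera, p = projected 2D joints, X = 3D joints,
# b = bone lengths, f = optical flow, d = DCPose detections with confidence w):
#
#   L_t = lambda_cam    * |c_t - c_{t-1}|                  (camera smoothness)
#       + lambda_pos_2D * |p_t - p_{t-1}|                  (2D smoothness)
#       + lambda_pos    * |X_t - X_{t-1}|                  (3D smoothness)
#       + lambda_bone   * |b_t - b_{t-1}|                  (bone-length consistency)
#       + lambda_opt    * |f_{t-1 -> t} - (p_t - p_{t-1})| (flow consistency)
#       + lambda_2D     * |w_t d_t - w_t p_t|              (reprojection)
#       + lambda_3D     * |X_t - X_t_init|                 (prior on the initial estimate)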
def pose_optimization(results, file_num, s, loop_counter, base_input_image_path_raft, input_video_file_path):
    """Optimizes the 3D pose using temporal, bone-length and optical-flow information."""
    X = np.zeros((len(results), 1, 14, 3), dtype='float32')
    cam = np.zeros((len(results), 1, 1, 3), dtype='float32')
    # fill the arrays with the current estimates: METRO predictions on the
    # first call, previously optimized values on later calls
    if loop_counter == 0:
        steps = 0
        for frame in range(len(results)):
            X[frame] = results[frame][0]['pred_3d_joints_from_smpl'].cpu().numpy()
            cam[frame][0][0] = results[frame][0]['pred_camera'].cpu().numpy()
    else:
        steps = pose_steps
        for frame in range(len(results)):
            X[frame] = results[frame][0]['pred_3d_joints_from_smpl_optim']
            cam[frame][0][0] = results[frame][0]['camera_optim'].cpu().numpy()
    # get 2D keypoints and confidence values from DCPose
    x_det, confidence = get_joints_dcpose_h36m(file_num, s)
    # create tensors for optimizing; torch.tensor copies X, so X_init keeps the
    # initial estimates while X_tensor is updated by the optimizer
    X_tensor = torch.tensor(X, device=device, requires_grad=True)
    X_init = torch.tensor(X, device=device)
    cam_tensor = torch.tensor(cam, device=device, requires_grad=True)
    # Smooth L1 loss and Adam optimizer
    criterion = torch.nn.SmoothL1Loss(reduction='none')
    optimizer = torch.optim.Adam([X_tensor, cam_tensor], 0.001)
    stop_criterion = 0
    prev_loss = -1
    # finetuning loop starts
    for step in range(steps):
        # project the 3D joints onto the 2D image for all frames of the video
        pj2d = project(X_tensor, results, cam_tensor)
        # compute the bone lengths of the skeletons in all frames
        skel = compute_bone_length(X_tensor)
        loss_list = []
        optimizer.zero_grad()
        count = 0
        for k in range(len(results)):
            for human in range(1):
                pj2d_curr = pj2d[count]
                if count != 0:
                    pj2d_prev = pj2d[count - 1]
                    # temporal loss: camera, projected 2D joints, 3D joints and
                    # bone lengths should change smoothly between frames
                    temp_l = (lambda_cam * criterion(cam_tensor[count], cam_tensor[count - 1]).sum().float()
                              + lambda_pos_2D * criterion(pj2d_curr, pj2d_prev).sum().float()
                              + lambda_pos * criterion(X_tensor[count][human], X_tensor[count - 1][human]).sum().float()
                              + lambda_bone * criterion(skel[count], skel[count - 1]).sum().float())
                    # flow loss: the displacement of the projected joints should
                    # match the optical flow sampled at the previous positions
                    flow_l = lambda_opt * criterion(
                        torch.tensor(get_flow_metro(results[count - 1], pj2d_prev[human]), device=device).float(),
                        (pj2d_curr[human] - pj2d_prev[human]).float()).sum().float()
                else:
                    # the first frame has no predecessor: no temporal or flow loss
                    temp_l = 0
                    flow_l = 0
                # reprojection loss: projected joints should match the DCPose
                # detections, weighted by the detection confidences
                det_l = lambda_2D * criterion(
                    torch.tensor(confidence[count][human], device=device) * torch.tensor(x_det[count][human], device=device),
                    torch.tensor(confidence[count][human], device=device) * pj2d_curr[human]).sum().float()
                # prior term: stay close to the initial estimate
                prior_l = lambda_3D * criterion(X_tensor[count][human], X_init[count][human]).sum().float()
                l = temp_l + det_l + prior_l + flow_l
                loss_list.append(l.float())
                count += 1
        loss = sum(loss_list) / len(loss_list)
        # if step % 100 == 0:
        #     print("Step- ", step)
        #     print('Loss', loss)
        loss.backward()
        optimizer.step()
        # early stopping: break once the loss has been flat
        # (to four decimal places) for 10 consecutive steps
        if format(loss.item(), ".4f") == format(prev_loss, ".4f"):
            stop_criterion += 1
        else:
            stop_criterion = 0
        if stop_criterion == 10:
            break
        prev_loss = loss.item()
    # store the optimized values in the "results" dictionary
    projection = project(X_tensor, results, cam_tensor).detach().cpu().numpy().astype('float32')
    for frame in range(len(results)):
        if loop_counter == 0:
            results[frame][0]['pred_3d_joints_from_smpl'] = X_tensor[frame].detach()
            results[frame][0]['joints_orig'] = projection[frame]
        results[frame][0]['pred_3d_joints_from_smpl_optim'] = X_tensor[frame].detach().cpu().numpy().astype('float32')
        results[frame][0]['joints_optim'] = projection[frame]
        results[frame][0]['camera_optim'] = cam_tensor[frame].detach()
    # extract the video frames that RAFT consumes in the next flow step
    get_video_frames("data/h36m/" + s + input_video_file_path + file_num + ".mp4", results, base_input_image_path_raft)
    return results
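

# NOTE: `project` and `compute_bone_length` come from scripts.pose_utils
# (wildcard import) and are not defined in this file. As a rough, hypothetical
# sketch only (an assumption, not this repo's implementation), a
# weak-perspective projection for a METRO-style camera (scale, tx, ty) could
# look like the unused helper below.
def _project_weak_perspective_sketch(X, cam):
    """X: (..., J, 3) 3D joints; cam: (..., 3) = (scale, tx, ty). Returns (..., J, 2)."""
    scale = cam[..., None, :1]   # broadcast the scale over the joint dimension
    trans = cam[..., None, 1:]   # in-plane translation (tx, ty)
    # drop the depth coordinate, translate, then scale into image coordinates
    return scale * (X[..., :2] + trans)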
def flow_optimization(results, x_det, confidence, file_num, model, loop_count, s=""):
    """Finetunes the RAFT optical-flow model so its flow agrees with the pose."""
    args = Namespace(alternate_corr=False, corr_levels=4, corr_radius=4, dropout=0,
                     mixed_precision=True, model='RAFT/models/raft-things.pth',
                     path='raft_inputs/', small=False)
    if loop_count == 0:
        # load the RAFT model and evaluate it in terms of End Point Error (EPE)
        model = load_model(args, file_num)
        results = infer(model, results, args, 0, s)
        compute_error_flow(results, file_num, 0, s)
    # --------------------------------------------------
    # START OF FLOW OPTIMIZATION
    # get the initial flow estimates
    results = infer(model, results, args, 1, s)
    model = model.train()
    # generate the target flow map from the 2D keypoints
    target, _, _ = generate_flowmap(file_num, x_det, confidence, results, loop_count, s)
    # Smooth L1 loss and Adam optimizer
    criterion = torch.nn.SmoothL1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    # finetuning loop starts
    for step in range(flow_steps):
        images = glob.glob(os.path.join(args.path, '*.png')) + \
                 glob.glob(os.path.join(args.path, '*.jpg'))
        images = sorted(images)
        count = 0
        loss_list = []
        # iterate over consecutive frame pairs
        for imfile1, imfile2 in zip(images[:-1], images[1:]):
            optimizer.zero_grad()
            image1 = load_image(imfile1)
            image2 = load_image(imfile2)
            # pad both images so their dimensions are divisible by 8, as RAFT requires
            padder = InputPadder(image1.shape)
            image1, image2 = padder.pad(image1, image2)
            flow_low, flow_up = model(image1, image2, iters=20, test_mode=True)
            flow_up = padder.unpad(flow_up)
            l = criterion(flow_up[0].permute(1, 2, 0).cuda(), target[count].cuda()).sum()
            loss_list.append(l.float())
            l.backward()
            optimizer.step()
            count += 1
        # if (step + 1) % 4 == 0:
        #     print("Step- ", step)
        #     print('Loss', sum(loss_list) / len(loss_list))
    # finetuning loop ends
    # evaluate the finetuned model
    results = infer(model, results, args, 0, s)
    if loop_count == 0:
        compute_error_flow(results, file_num, 1, s)
    else:
        compute_error_flow(results, file_num, 2, s)
    # err1/err2/err3 and total_frame are presumably module-level accumulators
    # maintained by compute_error_flow (wildcard import from scripts.flow_utils)
    print("Current EPE (first optimization, second optimization, original): ",
          sum(err1) / (32 * sum(total_frame)),
          sum(err3) / (32 * sum(total_frame)),
          sum(err2) / (32 * sum(total_frame)))
    return results, model, x_det, confidence
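

# NOTE: `load_model`, `load_image`, `infer`, `generate_flowmap` and
# `compute_error_flow` come from scripts.flow_utils (wildcard import). As a
# hypothetical sketch of the idea only (an assumption, not the actual
# implementation), a pose-derived target flow map could paint each joint's
# frame-to-frame displacement into a small window around the joint, which is
# plausibly what the `neighborhood` hyperparameter controls, and keep the
# predicted flow everywhere else. This helper is unused in this script.
def _target_flowmap_sketch(pred_flow, kp_prev, kp_curr, radius=neighborhood):
    """pred_flow: (H, W, 2) tensor; kp_prev/kp_curr: (J, 2) keypoints in pixels."""
    target = pred_flow.clone()
    H, W, _ = pred_flow.shape
    for (x0, y0), (x1, y1) in zip(kp_prev.tolist(), kp_curr.tolist()):
        ys = slice(max(0, int(y0) - radius), min(H, int(y0) + radius + 1))
        xs = slice(max(0, int(x0) - radius), min(W, int(x0) + radius + 1))
        target[ys, xs, 0] = x1 - x0   # horizontal joint displacement
        target[ys, xs, 1] = y1 - y0   # vertical joint displacement
    return target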
def get_video_frames(video_file_path, results, base_input_image_path_raft):
    """Extracts every 5th video frame into RAFT's input folder, one per results entry."""
    from scripts.utility import OpenCVCapture
    capture = OpenCVCapture(video_file_path)
    video_length = int(capture.cap.get(cv2.CAP_PROP_FRAME_COUNT))
    # remove all images currently in RAFT's input folder
    r_files = glob.glob(base_input_image_path_raft + "*")
    for r_f in r_files:
        os.remove(r_f)
    frame_id = 0
    for orig_frame_id in range(video_length):
        if frame_id >= len(results):
            break
        frame = capture.read()
        # keep every 5th frame so the saved images stay aligned with `results`
        if (orig_frame_id - 1) % 5 == 0:
            plt.imsave(base_input_image_path_raft + str(frame_id).zfill(4) + '.jpg', frame)
            frame_id += 1
def data_paths_full(s, input_video_file_path):
    """Returns the video file names for subject s."""
    file_path, dirs, files = next(os.walk("data/h36m/" + s + input_video_file_path))
    return files
def main():
    subjects = ["S9", "S11"]
    base_input_image_path_raft = 'raft_inputs/'
    input_video_file_path = '/Videos/'
    os.makedirs(base_input_image_path_raft, exist_ok=True)
    for s in subjects:
        file_path = data_paths_full(s, input_video_file_path)
        print("Total number of videos: ", len(file_path))
        for i in range(len(file_path)):
            file_num = file_path[i].split(".mp4")[0]
            # process only camera 60457274, skipping the concatenated "_ALL" video
            if file_num.split(".")[-1] == "60457274" and file_num != '_ALL.60457274':
                print("Optimizing - ", file_num)
                results = np.load('metro_estimates_h36m/' + s + '/' + file_num + '_results.npz',
                                  allow_pickle=True)['results'][()]
                # initialization pass: loads the METRO estimates and extracts frames for RAFT
                results = pose_optimization(results, file_num, s, 0, base_input_image_path_raft, input_video_file_path)
                print("First cycle of optical flow optimization begins")
                results, model, _, _ = flow_optimization(results, None, None, file_num, None, 0, s)
                print("First cycle of pose optimization begins")
                results = pose_optimization(results, file_num, s, 1, base_input_image_path_raft, input_video_file_path)
                print("Second cycle of optical flow optimization begins")
                results, model, _, _ = flow_optimization(results, None, None, file_num, model, 1, s)
                compute_error(results, file_num, s)
                # e1/e2 are presumably accumulators maintained by compute_error
                # (wildcard import from scripts.pose_utils)
                print("Current MPJPE (first optimization, original): ", sum(e1) / len(e1), sum(e2) / len(e2))
                print("Done optimizing ", file_num)
                print("----------------------------------")


if __name__ == '__main__':
    main()
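
# Expected layout (inferred from the paths used above):
#   data/h36m/<subject>/Videos/*.mp4                      Human3.6M videos for S9 and S11
#   metro_estimates_h36m/<subject>/<video>_results.npz    per-video METRO estimates
#   RAFT/models/raft-things.pth                           pretrained RAFT weights
#   raft_inputs/                                          scratch folder for extracted frames
#
# Usage: python optimization_h36m.py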