# evaluate_model.py (forked from BytesNBitsCL/modelo_lstm_lsch)

import cv2
import numpy as np
from mediapipe.python.solutions.holistic import Holistic
from keras.models import load_model
from helpers import *
from constants import *
from text_to_speech import text_to_speech
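
# NOTE: the wildcard imports from helpers and constants are expected to provide
# get_word_ids, mediapipe_detection, there_hand, extract_keypoints, draw_keypoints,
# words_text and the WORDS_JSON_PATH, MODEL_PATH, MODEL_FRAMES, MIN_LENGTH_FRAMES
# and FONT/FONT_POS/FONT_SIZE values used below.


# Linearly interpolate a keypoint sequence up to target_length frames by blending
# each output frame from its two nearest input frames.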
def interpolate_keypoints(keypoints, target_length=15):
    current_length = len(keypoints)
    if current_length == target_length:
        return keypoints

    indices = np.linspace(0, current_length - 1, target_length)
    interpolated_keypoints = []
    for i in indices:
        lower_idx = int(np.floor(i))
        upper_idx = int(np.ceil(i))
        weight = i - lower_idx
        if lower_idx == upper_idx:
            interpolated_keypoints.append(keypoints[lower_idx])
        else:
            interpolated_point = (1 - weight) * np.array(keypoints[lower_idx]) + weight * np.array(keypoints[upper_idx])
            interpolated_keypoints.append(interpolated_point.tolist())

    return interpolated_keypoints
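

# Resample a keypoint sequence to exactly target_length frames: shorter sequences
# are interpolated up, longer ones are evenly subsampled.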
def normalize_keypoints(keypoints, target_length=15):
    current_length = len(keypoints)
    if current_length < target_length:
        return interpolate_keypoints(keypoints, target_length)
    elif current_length > target_length:
        step = current_length / target_length
        indices = np.arange(0, current_length, step).astype(int)[:target_length]
        return [keypoints[i] for i in indices]
    else:
        return keypoints
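

# Capture video from `src` (or the webcam when src is None), buffer keypoints while a
# hand is visible, and once the hand leaves the frame classify the normalized sequence
# with the loaded model. Predictions above `threshold` are prepended to `sentence`
# and spoken via text_to_speech.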
def evaluate_model(src=None, threshold=0.8, margin_frame=1, delay_frames=3):
    kp_seq, sentence = [], []
    word_ids = get_word_ids(WORDS_JSON_PATH)
    model = load_model(MODEL_PATH)
    count_frame = 0
    fix_frames = 0
    recording = False

    with Holistic() as holistic_model:
        video = cv2.VideoCapture(src or 0)

        while video.isOpened():
            ret, frame = video.read()
            if not ret:
                break

            results = mediapipe_detection(frame, holistic_model)

            # TODO: set a maximum number of frames per sign, i.e. translate even
            # while a hand is still visible once that maximum is reached.
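            # While a hand is visible (or we are in the post-hand grace period),
            # accumulate keypoints, skipping the first `margin_frame` frames.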
            if there_hand(results) or recording:
                recording = False
                count_frame += 1
                if count_frame > margin_frame:
                    kp_frame = extract_keypoints(results)
                    kp_seq.append(kp_frame)
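
            # No hand in frame: give the sign `delay_frames` extra frames before
            # predicting, then drop the margin/delay frames and classify the sequence.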
            else:
                if count_frame >= MIN_LENGTH_FRAMES + margin_frame:
                    fix_frames += 1
                    if fix_frames < delay_frames:
                        recording = True
                        continue

                    kp_seq = kp_seq[:-(margin_frame + delay_frames)]
                    kp_normalized = normalize_keypoints(kp_seq, int(MODEL_FRAMES))
                    res = model.predict(np.expand_dims(kp_normalized, axis=0))[0]

                    print(np.argmax(res), f"({res[np.argmax(res)] * 100:.2f}%)")
                    if res[np.argmax(res)] > threshold:
                        word_id = word_ids[np.argmax(res)].split('-')[0]

                        sent = words_text.get(word_id)
                        sentence.insert(0, sent)
                        text_to_speech(sent)  # ONLY LOCAL (NO SERVER)

                recording = False
                fix_frames = 0
                count_frame = 0
                kp_seq = []
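
            # Draw the overlay and keypoints only when reading from the local webcam.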
            if not src:
                cv2.rectangle(frame, (0, 0), (640, 35), (245, 117, 16), -1)
                cv2.putText(frame, ' | '.join(sentence), FONT_POS, FONT, FONT_SIZE, (255, 255, 255))

                draw_keypoints(frame, results)
                cv2.imshow('Traductor LSP', frame)
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break

        video.release()
        cv2.destroyAllWindows()
        return sentence


if __name__ == "__main__":
    evaluate_model()