# create_movie.py
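"""Create a narrated short movie from a list of images.

Pipeline: download the source images, generate a video clip per scene with
Open-Sora running on Modal, narrate the script with ElevenLabs, generate
background music with MusicGen on Replicate, then assemble everything with
MoviePy, burning subtitles over each scene.
"""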
from moviepy.editor import (
    TextClip, CompositeVideoClip, concatenate_videoclips,
    AudioFileClip, ColorClip, CompositeAudioClip, VideoFileClip,
)
from moviepy.video.fx.all import speedx
from moviepy.config import change_settings
import requests
from PIL import Image
from io import BytesIO
import os
import uuid
import json
import subprocess
from dotenv import load_dotenv
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs
import replicate

# MoviePy reads the ImageMagick path at import time, so setting the
# environment variable after the import has no effect; use change_settings.
change_settings({"IMAGEMAGICK_BINARY": "/opt/homebrew/bin/magick"})

load_dotenv()
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
client = ElevenLabs(api_key=ELEVENLABS_API_KEY)

FONT_PATH = "Arial-Bold.ttf"
MAX_WORDS_PER_LINE = 5

def download_video(remote_video_name, index):
    """Download a generated video from the Modal volume into ./videos."""
    volume_name = "model-cache-volume"
    local_video_dir = "videos"
    local_video_path = os.path.join(local_video_dir, f"gen_video_{index}_0000.mp4")
    os.makedirs(local_video_dir, exist_ok=True)
    command = ["modal", "volume", "get", volume_name, remote_video_name, local_video_dir]
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
        print("Video downloaded from volume successfully.")
        print(f"Standard Output:\n{result.stdout}")
        print(f"Standard Error:\n{result.stderr}")
        return local_video_path
    except subprocess.CalledProcessError as e:
        print(f"Video download failed with exit code {e.returncode}")
        print(f"Standard Output:\n{e.stdout}")
        print(f"Standard Error:\n{e.stderr}")
        return None

def upload_image_to_volume(local_image_path, remote_image_name, volume_name):
    """Upload a local image to the Modal volume, skipping files that already exist."""
    command = ["modal", "volume", "put", volume_name, local_image_path, remote_image_name]
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
        print("Image uploaded to volume successfully.")
        print(f"Standard Output:\n{result.stdout}")
        print(f"Standard Error:\n{result.stderr}")
    except subprocess.CalledProcessError as e:
        if "already exists" in e.stderr:
            print("Image already exists in volume. Skipping upload.")
        else:
            print(f"Image upload failed with exit code {e.returncode}")
            print(f"Standard Output:\n{e.stdout}")
            print(f"Standard Error:\n{e.stderr}")

def run_modal_main(image_paths, prompts, file_mount_names, sample_names, durations, resolutions, aspect_ratios):
    """Upload the source images, then launch video generation on Modal.

    All per-scene arguments are passed to generate_video.py as JSON-encoded
    lists so a single Modal run can process every scene.
    """
    # Upload images to the shared volume
    for idx in range(len(image_paths)):
        upload_image_to_volume(image_paths[idx], file_mount_names[idx], "model-cache-volume")
    # Build the command that runs generate_video.py on Modal
    command = [
        "modal", "run", "generate_video.py::main",
        "--image-paths", json.dumps(image_paths),
        "--prompts", json.dumps(prompts),
        "--file-mount-names", json.dumps(file_mount_names),
        "--sample-names", json.dumps(sample_names),
        "--durations", json.dumps(durations),
        "--resolutions", json.dumps(resolutions),
        "--aspect-ratios", json.dumps(aspect_ratios),
    ]
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
        print("Modal run command executed successfully.")
        print(f"Standard Output:\n{result.stdout}")
        print(f"Standard Error:\n{result.stderr}")
    except subprocess.CalledProcessError as e:
        print(f"Modal run command failed with exit code {e.returncode}")
        print(f"Standard Output:\n{e.stdout}")
        print(f"Standard Error:\n{e.stderr}")

def download_image(image_url, index):
    """Download an image to ./images and return its local path."""
    response = requests.get(image_url)
    if response.status_code == 200:
        image = Image.open(BytesIO(response.content))
        os.makedirs('images', exist_ok=True)
        image_path = os.path.join('images', f"image{index}.png")
        image.save(image_path)
        return image_path
    else:
        raise Exception(f"Failed to download image from {image_url}")

def generate_narration(text, narration_speed):
    """Synthesize narration with ElevenLabs and apply a speed factor.

    Returns the path of the speed-adjusted MP3 in ./audio; the raw synthesis
    is written to ./audio_temp and deleted after adjustment.
    """
    os.makedirs('audio_temp', exist_ok=True)
    os.makedirs('audio', exist_ok=True)
    print(f"Generating narration for text: {text}")
    response = client.text_to_speech.convert(
        voice_id="pNInz6obpgDQGcFmaJgB",  # Adam pre-made voice
        optimize_streaming_latency="0",
        output_format="mp3_22050_32",
        text=text,
        model_id="eleven_turbo_v2",
        voice_settings=VoiceSettings(
            stability=0.0,
            similarity_boost=1.0,
            style=0.0,
            use_speaker_boost=True,
        ),
    )
    save_file_path = os.path.join('audio_temp', f"{uuid.uuid4()}.mp3")
    with open(save_file_path, "wb") as f:
        for chunk in response:
            if chunk:
                f.write(chunk)
    print(f"{save_file_path}: A new audio file was saved successfully!")
    # Adjust narration speed, then drop the unadjusted temp file
    audio_clip = AudioFileClip(save_file_path)
    adjusted_audio_clip = speedx(audio_clip, narration_speed)
    adjusted_audio_path = os.path.join('audio', f"{uuid.uuid4()}_adjusted.mp3")
    adjusted_audio_clip.write_audiofile(adjusted_audio_path)
    audio_clip.close()
    os.remove(save_file_path)
    return adjusted_audio_path

def split_text_into_lines(text, max_words_per_line):
    """Split text into subtitle lines of at most max_words_per_line words."""
    words = text.split()
    lines = []
    current_line = []
    for word in words:
        if len(current_line) < max_words_per_line:
            current_line.append(word)
        else:
            lines.append(" ".join(current_line))
            current_line = [word]
    if current_line:
        lines.append(" ".join(current_line))
    return lines
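
# Example: split_text_into_lines("one two three four five six", 3)
# returns ["one two three", "four five six"].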

def generate_music(prompt, duration):
    """Generate background music with MusicGen via Replicate.

    Returns the path of the downloaded MP3.
    """
    output = replicate.run(
        "meta/musicgen:671ac645ce5e552cc63a54a2bbff63fcf798043055d2dac5fc9e36a837eedcfb",
        input={
            "top_k": 250,
            "top_p": 0,
            "prompt": prompt,
            "duration": int(duration),
            "temperature": 1,
            "continuation": False,
            "model_version": "stereo-melody-large",
            "output_format": "mp3",
            "continuation_start": 0,
            "multi_band_diffusion": False,
            "normalization_strategy": "peak",
            "classifier_free_guidance": 3,
        },
    )
    music_url = output
    if music_url:
        music_response = requests.get(music_url)
        music_file_path = f"{uuid.uuid4()}.mp3"
        with open(music_file_path, "wb") as music_file:
            music_file.write(music_response.content)
        return music_file_path
    else:
        raise Exception("Failed to get music URL from response.")

def remove_quotes(image_descriptions):
    """Strip single and double quotes from each description."""
    return [desc.replace('"', '').replace("'", '') for desc in image_descriptions]

def create_movie(image_urls, image_descriptions, narrations, music_prompt,
                 narration_speed=1.0, output_file="output_video.mp4", fps=24,
                 resolution="480p", aspect_ratio="9:16"):
    """Build the full movie: per-scene videos, narration, music, subtitles."""
    image_descriptions = remove_quotes(image_descriptions)
    audio_clips = []
    audio_files = []
    sum_of_clips_duration = 0
    images_paths = []
    # Generate per-slide narration audio and total up the clip durations
for i, image_url in enumerate(image_urls):
# Download the image and save the path
image_path = download_image(image_url, i+1)
images_paths.append(image_path)
print(f"Downloaded image {i + 1}: {image_path}") # Debug info
# Generate narration audio
narration = narrations[i]
if not narration.strip():
print(f"Skipping empty narration for slide {i + 1}")
continue
audio_file = generate_narration(narration, narration_speed)
audio_clip = AudioFileClip(audio_file)
audio_duration = audio_clip.duration
sum_of_clips_duration += audio_duration
audio_clips.append(audio_clip.set_duration(audio_duration))
audio_files.append(audio_file)
print(f"Generated narration {i + 1}: {audio_file} with duration {audio_duration}")
if not audio_clips:
print("No valid clips to concatenate. Exiting.")
return
    # The per-slide narration clips were generated independently, so their
    # summed length differs slightly from one continuous narration of the
    # whole script. Generate the full narration, then scale every scene's
    # duration by actual_duration / sum_of_clips_duration so the scenes
    # line up with the continuous audio track.
    full_narration_text = " ".join(narrations)
    full_narration_file = generate_narration(full_narration_text, narration_speed)
    final_audio_clip = AudioFileClip(full_narration_file)
    audio_files.append(full_narration_file)
    actual_duration = final_audio_clip.duration
    duration_scale = actual_duration / sum_of_clips_duration
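    # Example: if the per-slide clips sum to 20 s but the continuous
    # narration runs 22 s, duration_scale = 1.1 and every scene is
    # stretched by 10%.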
    # Build per-scene video clips whose durations are scaled to match the
    # continuous narration; scaled_durations is also the length of each scene
    scaled_clips = []
    scaled_durations = []
    prompts = []
    file_mount_names = []
    sample_names = []
    resolutions = []
    aspect_ratios = []
for i, image_path in enumerate(images_paths):
audio_clip = audio_clips[i]
scaled_durations.append(audio_clip.duration * duration_scale)
        # Arguments for the Modal video-generation job
file_mount_names.append(f"image{i}.png")
sample_names.append(f"gen_video_{i}")
resolutions.append(resolution)
aspect_ratios.append(aspect_ratio)
prompts.append(image_descriptions[i])
    # Open-Sora only accepts durations of 2, 4, 8, or 16 s, so request the
    # next size up and trim the generated clip to length afterwards
    durations = []
    for duration in scaled_durations:
        if duration < 2:
            durations.append("2s")
        elif duration < 4:
            durations.append("4s")
        elif duration < 8:
            durations.append("8s")
        else:
            durations.append("16s")
    for prompt in prompts:
        print(f"Prompt: {prompt}")
    # Run video-generation inference for all scenes in a single Modal job
    run_modal_main(images_paths, prompts, file_mount_names, sample_names,
                   durations, resolutions, aspect_ratios)
    print("Finished video inference.")
# Download videos and store paths
video_paths = []
for i in range(len(images_paths)):
video_path = download_video(f"gen_video_{i}_0000.mp4", i)
if video_path:
video_paths.append(video_path)
for i, video_path in enumerate(video_paths):
scaled_duration = scaled_durations[i]
# Create a VideoClip with the scaled duration
video_clip = VideoFileClip(video_path).subclip(0, scaled_duration)
# Split the narration into lines for subtitles
narration = narrations[i]
lines = split_text_into_lines(narration, MAX_WORDS_PER_LINE)
if not lines:
print(f"No lines generated for slide {i + 1}")
continue
        # Create timed subtitle clips for each line
        line_clips = []
        for j, line in enumerate(lines):
            line_duration = scaled_duration / len(lines)
            start_time = j * line_duration
            if j == len(lines) - 1:
                # Extend the last subtitle to the end of the scene to absorb
                # floating-point rounding error
                line_duration = scaled_duration - start_time
            subtitle = TextClip(line, fontsize=30, color='white', font=FONT_PATH,
                                method='caption', size=(video_clip.size[0], None))
            # Semi-transparent black background sized to the text; set_opacity
            # is used instead of an RGBA color tuple, which ColorClip does not
            # composite reliably
            bg_w, bg_h = subtitle.size
            bg_clip = ColorClip(size=(bg_w + 20, bg_h + 10), color=(0, 0, 0)).set_opacity(0.5)
            # Time and position the background/text pair as one clip; without
            # an explicit position the composite would default to the top-left
            # corner instead of the bottom of the frame
            subtitle_with_bg = (CompositeVideoClip([bg_clip.set_position('center'),
                                                    subtitle.set_position('center')])
                                .set_duration(line_duration)
                                .set_start(start_time)
                                .set_position(('center', 'bottom')))
            line_clips.append(subtitle_with_bg)
        # Overlay the subtitle clips on the video
video_clip = CompositeVideoClip([video_clip] + line_clips)
scaled_clips.append(video_clip)
# Concatenate the scaled clips into the final video
final_video = concatenate_videoclips(scaled_clips, method="compose")
# Set the continuous audio to the video
final_video = final_video.set_audio(final_audio_clip)
# Generate music based on the prompt and video duration
music_file_path = generate_music(music_prompt, int(actual_duration)) # Ensure duration is an integer
music_clip = AudioFileClip(music_file_path)
# Adjust the music duration to match the video duration
music_clip = music_clip.subclip(0, min(music_clip.duration, actual_duration))
# Combine the final audio clip and the music clip
combined_audio = CompositeAudioClip([final_video.audio, music_clip])
final_video = final_video.set_audio(combined_audio)
# Write the result to a file with fps specified and ensure audio codec is AAC
final_video.write_videofile(output_file, codec="libx264", audio_codec="aac", fps=fps)
print(f"Video written to {output_file}")
# Clean up audio files
for audio_file in audio_files:
os.remove(audio_file)
os.remove(music_file_path)

if __name__ == "__main__":
    image_urls = ["https://example.com/image1.png", "https://example.com/image2.png"]
    image_descriptions = ["image prompt 1", "image prompt 2"]
    narrations = [
        "Narration for image 1. This is the first narration.",
        "Narration for image 2. This is the second narration.",
    ]
    narration_speed = 1
    music_prompt = "Example music prompt for testing"
    output_file = "movie.mp4"
    fps = 24
    resolution = "480p"
    aspect_ratio = "9:16"
    create_movie(image_urls, image_descriptions, narrations, music_prompt,
                 narration_speed, output_file, fps, resolution, aspect_ratio)