garyuzair committed
Commit eab4ff6 · verified · 1 Parent(s): f60e24b

Update app.py

Files changed (1)
  1. app.py +210 -364
app.py CHANGED
@@ -1,381 +1,227 @@
 import streamlit as st
-from PIL import Image
 import numpy as np
 import torch
-import gc
 import os
 import tempfile
-import math
-import imageio
-import traceback
-import scipy.io.wavfile  # For saving WAV files

-# --- Attempt to import moviepy for video processing ---
 try:
     import moviepy.editor as mpy
-    MOVIEPY_AVAILABLE = True
-except Exception as e:  # Catch any exception during moviepy import
-    MOVIEPY_AVAILABLE = False
-    st.warning(
-        f"MoviePy library could not be loaded (Error: {e}). "
-        "Video syncing features will be disabled. "
-        "If running locally, ensure MoviePy and its dependency ffmpeg are correctly installed."
-    )
-    print(f"MoviePy load error: {e}")
-
-
-# --- Model Configuration ---
-IMAGE_CAPTION_MODEL = "Salesforce/blip-image-captioning-base"
-AUDIO_GEN_MODEL = "facebook/musicgen-small"
-
-# --- Constants ---
-DEFAULT_NUM_FRAMES = 2
-DEFAULT_AUDIO_DURATION_S = 7  # Slightly increased default
-MAX_FRAMES_TO_SHOW_UI = 2  # Reducing for smaller UI footprint
-DEVICE = torch.device("cpu")  # Explicitly use CPU
-
-# --- Page Setup ---
-st.set_page_config(page_title="AI Video Sound Designer (HF Space)", layout="wide", page_icon="🎬")
-
-st.title("🎬 AI Video Sound Designer")
-st.markdown("""
-Upload a short video (MP4, MOV, AVI). The tool will:
-1. Extract frames.
-2. Analyze frames with **BLIP** to generate sound ideas.
-3. Synthesize audio with **MusicGen** based on these ideas.
-4. Optionally, combine the new audio with your video.
----
-**Important:** This app runs on CPU. **Audio generation can be very slow (several minutes for a few seconds of audio).** Please be patient!
-""")
-
-# --- Utility Functions ---
-def clear_memory(model_obj=None, processor_obj=None):
-    if model_obj:
-        del model_obj
-    if processor_obj:
-        del processor_obj
-    gc.collect()
-    print("Memory cleared.")
-
-@st.cache_resource(show_spinner="Loading Image Analysis Model...")
-def load_image_caption_model_and_processor():
-    try:
-        from transformers import BlipProcessor, BlipForConditionalGeneration
-        print(f"Loading Image Captioning Model: {IMAGE_CAPTION_MODEL} to {DEVICE}")
-        processor = BlipProcessor.from_pretrained(IMAGE_CAPTION_MODEL)
-        # Standard loading for BLIP on CPU. No 'low_mem' or 'low_cpu_mem_usage'
-        model = BlipForConditionalGeneration.from_pretrained(IMAGE_CAPTION_MODEL).to(DEVICE)
-        model.eval()  # Set to evaluation mode
-        st.toast("Image Analysis model (BLIP) loaded!", icon="🖼️")
-        return processor, model
-    except Exception as e:
-        st.error(f"Error loading BLIP model ({IMAGE_CAPTION_MODEL}): {e}")
-        st.error(traceback.format_exc())
-        return None, None
-
-@st.cache_resource(show_spinner="Loading Audio Generation Model (can be slow)...")
-def load_audio_gen_model_and_processor():
-    try:
-        from transformers import AutoProcessor, MusicgenForConditionalGeneration
-        print(f"Loading Audio Generation Model: {AUDIO_GEN_MODEL} to {DEVICE}")
-        processor = AutoProcessor.from_pretrained(AUDIO_GEN_MODEL)
-        # Standard loading for MusicGen on CPU.
-        # `low_cpu_mem_usage` could be used here if accelerate is properly configured,
-        # but for simplicity and robustness on free tier, direct .to(DEVICE) is safer.
-        model = MusicgenForConditionalGeneration.from_pretrained(AUDIO_GEN_MODEL).to(DEVICE)
-        model.eval()  # Set to evaluation mode
-        st.toast("Audio Generation model (MusicGen) loaded! (CPU generation will be slow)", icon="🎶")
-        return processor, model
-    except Exception as e:
-        st.error(f"Error loading MusicGen model ({AUDIO_GEN_MODEL}): {e}")
-        st.error(traceback.format_exc())
-        return None, None
-
-def extract_frames_from_video(video_path, num_frames_to_extract):
-    frames = []
-    reader = None
-    try:
-        reader = imageio.get_reader(video_path, "ffmpeg")
-        total_frames_in_video = 0
-        try:  # Try to get frame count
-            total_frames_in_video = reader.count_frames()
-        except Exception:
-            meta_data = reader.get_meta_data()
-            duration = meta_data.get('duration')
-            fps = meta_data.get('fps', 25)  # Default FPS if not found
-            if duration and fps:
-                total_frames_in_video = int(duration * fps)
-
-        if not total_frames_in_video or total_frames_in_video < 1:
-            # Fallback: try to read a few frames directly if length is unknown
-            print("Video length unknown or zero, attempting to read initial frames.")
-            temp_frames = []
-            for i, frame_data in enumerate(reader):
-                temp_frames.append(Image.fromarray(frame_data).convert("RGB"))
-                if len(temp_frames) >= num_frames_to_extract * 2:  # Read a bit more
-                    break
-            if not temp_frames:
-                st.error("Could not extract any frames. Video might be empty or corrupted.")
-                if reader: reader.close()
-                return []
-            # Select frames from what was read
-            indices = np.linspace(0, len(temp_frames) - 1, num_frames_to_extract, dtype=int, endpoint=True)
-            frames = [temp_frames[i] for i in indices]
-            if reader: reader.close()
-            return frames
-
-        # If frame count is known
-        num_to_sample = min(num_frames_to_extract, total_frames_in_video)
-        indices = np.linspace(0, total_frames_in_video - 1, num_to_sample, dtype=int, endpoint=True)
-
-        for i in indices:
-            try:
-                frame_data = reader.get_data(i)
-                frames.append(Image.fromarray(frame_data).convert("RGB"))
-            except Exception as frame_e:
-                st.warning(f"Skipping problematic frame at index {i}: {frame_e}")
-        return frames
-    except (imageio.core.fetching.NeedDownloadError, OSError) as e_ffmpeg:
-        st.error(f"FFmpeg error during frame extraction: {e_ffmpeg}. Ensure ffmpeg is available.")
-        return []
-    except Exception as e:
-        st.error(f"Could not extract frames: {e}")
-        st.error(traceback.format_exc())
-        return []
-    finally:
-        if reader:
-            reader.close()
-
-def generate_sound_prompt_from_frames(frames, caption_proc, caption_mod):
-    if not frames: return "ambient background noise"
-
-    descriptions = []
-    # BLIP doesn't need a complex instruction, it captions directly.
-    # We ask for "sound-producing elements" in post-processing of the descriptions.
-
-    progress_bar = st.progress(0.0, text="Analyzing frames for sound ideas...")
-    for i, frame in enumerate(frames):
-        try:
-            inputs = caption_proc(images=frame, return_tensors="pt").to(DEVICE)
-            generated_ids = caption_mod.generate(**inputs, max_new_tokens=40)  # Shorter captions
-            description = caption_proc.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-            if description: descriptions.append(description)
-            progress_bar.progress((i + 1) / len(frames), text=f"Frame {i+1}/{len(frames)} analyzed.")
-        except Exception as e:
-            st.warning(f"Could not get description for a frame: {e}")
-    progress_bar.empty()

-    if not descriptions: return "general ambiance, subtle environmental sounds"
-
-    unique_descs = list(dict.fromkeys(descriptions))  # Remove duplicates
-    combined_prompt = ". ".join(unique_descs)
-    # Refine for MusicGen
-    final_prompt = f"Soundscape for a scene with: {combined_prompt}. Emphasize distinct sounds and overall mood."
-    # Limit prompt length for MusicGen
-    if len(final_prompt) > 300:  # Arbitrary limit for conciseness
-        final_prompt = final_prompt[:300] + "..."
-    return final_prompt
-
-def generate_audio_from_prompt(prompt, duration_s, audio_proc, audio_mod, guidance, temp):
     try:
-        inputs = audio_proc(text=[prompt], return_tensors="pt", padding=True).to(DEVICE)
-
-        tokens_per_second = audio_mod.config.audio_encoder.token_per_second
-        max_new_tokens = min(int(duration_s * tokens_per_second), 1500)  # Cap for stability
-
-        with st.spinner(f"Synthesizing {duration_s}s audio (CPU: This is the SLOW part!)... Please wait patiently."):
-            audio_values = audio_mod.generate(
-                **inputs,
-                max_new_tokens=max_new_tokens,
-                do_sample=True,  # Sampling is important for diversity
-                guidance_scale=guidance,
-                temperature=temp,
-            )
-
-        audio_array = audio_values[0, 0].cpu().numpy()
-        sampling_rate = audio_mod.config.audio_encoder.sampling_rate
-
-        peak = np.abs(audio_array).max()
-        if peak > 1e-5:  # Avoid division by zero or near-zero
-            audio_array = (audio_array / peak) * 0.9  # Normalize with headroom
         else:
-            audio_array = np.zeros_like(audio_array)  # Output silence if generated audio is too quiet
-        return audio_array, sampling_rate
-    except Exception as e:
-        st.error(f"Error generating audio: {e}")
-        st.error(traceback.format_exc())
-        return None, None
-
-def combine_audio_video(video_path, audio_arr, sr, mix_orig):
-    if not MOVIEPY_AVAILABLE:
-        st.error("MoviePy is unavailable. Cannot combine audio and video.")
-        return None
-
-    out_vid_path = None
-    tmp_audio_path = None
-    vid_clip = gen_audio_clip = final_aud = comp_clip = None
-
-    try:
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_f:
-            scipy.io.wavfile.write(tmp_audio_f.name, sr, audio_arr.astype(np.float32))  # Ensure float32 for some moviepy versions
-            tmp_audio_path = tmp_audio_f.name
-
-        vid_clip = mpy.VideoFileClip(video_path)
-        gen_audio_clip = mpy.AudioFileClip(tmp_audio_path)
-
-        target_duration = vid_clip.duration
-        if gen_audio_clip.duration < target_duration:
-            gen_audio_clip = gen_audio_clip.fx(mpy.afx.audio_loop, duration=target_duration)
-        gen_audio_clip = gen_audio_clip.subclip(0, target_duration)

-        if mix_orig and vid_clip.audio:
-            orig_audio = vid_clip.audio.volumex(0.6)  # Lower original audio
-            gen_audio_clip = gen_audio_clip.volumex(0.9)  # Slightly boost generated
-            final_aud = mpy.CompositeAudioClip([orig_audio, gen_audio_clip]).set_duration(target_duration)
         else:
-            final_aud = gen_audio_clip.set_duration(target_duration)
-
-        comp_clip = vid_clip.set_audio(final_aud)
-
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_out_vid_f:
-            out_vid_path = tmp_out_vid_f.name
-
-        comp_clip.write_videofile(
-            out_vid_path, codec="libx264", audio_codec="aac",
-            threads=max(1, os.cpu_count() // 2), logger=None  # Be less verbose
         )
-        return out_vid_path
-    except Exception as e:
-        st.error(f"Error combining audio/video: {e}")
-        st.error(traceback.format_exc())
-        return None
-    finally:
-        # Close all clips
-        if vid_clip: vid_clip.close()
-        if gen_audio_clip: gen_audio_clip.close()
-        # final_aud is usually a derivative, not needing separate close if others are.
-        if comp_clip: comp_clip.close()
-        if tmp_audio_path and os.path.exists(tmp_audio_path): os.remove(tmp_audio_path)
-
-# --- Sidebar for Settings ---
-with st.sidebar:
-    st.header("⚙️ Settings")
-    num_frames_analysis = st.slider("Frames to Analyze", 1, 4, DEFAULT_NUM_FRAMES, 1,
-                                    help="Fewer frames = faster analysis.")
-    audio_duration = st.slider("Target Audio Duration (s)", 3, 15, DEFAULT_AUDIO_DURATION_S, 1,  # Max 15s for CPU
-                               help="Shorter = MUCH faster on CPU. MusicGen is slow.")
-
-    st.subheader("MusicGen Parameters")
-    guidance = st.slider("Guidance Scale", 1.0, 7.0, 3.0, 0.5)
-    temperature = st.slider("Temperature", 0.5, 1.5, 1.0, 0.1)
-
-    mix_audio = False
-    if MOVIEPY_AVAILABLE:
-        st.subheader("Video Output")
-        mix_audio = st.checkbox("Mix with original video audio", value=False)
-
-# --- Main Application ---
-uploaded_file = st.file_uploader("📤 Upload Video (MP4, MOV, AVI - short clips best):", type=["mp4", "mov", "avi"])
-
-if 'generated_audio_path_sess' not in st.session_state:
-    st.session_state.generated_audio_path_sess = None
-if 'output_video_path_sess' not in st.session_state:
-    st.session_state.output_video_path_sess = None
-
-if uploaded_file is not None:
-    st.video(uploaded_file)
-
-    if st.button("✨ Generate Sound Design!", type="primary", use_container_width=True):
-        # --- Clear previous results from session state and disk ---
-        for key in ['generated_audio_path_sess', 'output_video_path_sess']:
-            if st.session_state.get(key) and os.path.exists(st.session_state[key]):
-                try: os.remove(st.session_state[key])
-                except Exception as e_rem: print(f"Error removing old temp file: {e_rem}")
-            st.session_state[key] = None
-        clear_memory()  # General memory clear
-
-        temp_video_path = None
-        try:
-            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_v:
-                tmp_v.write(uploaded_file.read())  # Use read() for BytesIO from uploader
-                temp_video_path = tmp_v.name
-
-            # === Stage 1: Frame Extraction ===
-            st.markdown("--- \n### 1. Extracting Frames")
-            frames = extract_frames_from_video(temp_video_path, num_frames_analysis)
-            if not frames: st.error("Frame extraction failed. Cannot continue."); st.stop()
-            st.success(f"Extracted {len(frames)} frames.")
-            if frames:
-                cols = st.columns(min(len(frames), MAX_FRAMES_TO_SHOW_UI))
-                for i, frame_img in enumerate(frames[:len(cols)]):
-                    cols[i].image(frame_img, caption=f"Frame {i+1}", use_column_width=True)
-
-            # === Stage 2: Image Captioning (Sound Prompt Generation) ===
-            st.markdown("--- \n### 2. Analyzing Frames for Sound Ideas (BLIP)")
-            cap_proc, cap_model = load_image_caption_model_and_processor()
-            sound_prompt = "ambient environmental sounds"  # Default
-            if cap_proc and cap_model:
-                sound_prompt = generate_sound_prompt_from_frames(frames, cap_proc, cap_model)
-                clear_memory(cap_model, cap_proc)  # Unload BLIP
-            else: st.error("BLIP model failed to load. Using default sound prompt.")
-            st.info(f"✍️ **Sound Prompt for MusicGen:** {sound_prompt}")
-
-            # === Stage 3: Audio Generation ===
-            st.markdown("--- \n### 3. Synthesizing Audio (MusicGen)")
-            st.warning("🎧 This step is very slow on CPU. Your patience is appreciated!")
-            aud_proc, aud_model = load_audio_gen_model_and_processor()
-            gen_aud_arr, s_r = None, None
-            if aud_proc and aud_model:
-                gen_aud_arr, s_r = generate_audio_from_prompt(sound_prompt, audio_duration, aud_proc, aud_model, guidance, temperature)
-                clear_memory(aud_model, aud_proc)  # Unload MusicGen
-            else: st.error("MusicGen model failed to load. Cannot generate audio.")
-
-            if gen_aud_arr is not None and s_r is not None:
-                st.success("Audio successfully generated!")
-                st.audio(gen_aud_arr, sample_rate=s_r)
-                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_a_out:
-                    scipy.io.wavfile.write(tmp_a_out.name, s_r, gen_aud_arr.astype(np.float32))
-                    st.session_state.generated_audio_path_sess = tmp_a_out.name
-                with open(st.session_state.generated_audio_path_sess, "rb") as f_aud:
-                    st.download_button("📥 Download Audio Only (.wav)", f_aud, "generated_sound.wav", "audio/wav")
-
-                # === Stage 4: (Optional) Video and Audio Syncing ===
-                if MOVIEPY_AVAILABLE:
-                    st.markdown("--- \n### 4. Combining Audio with Video")
-                    with st.spinner("Processing video with new audio... (also can be slow)"):
-                        out_vid_p = combine_audio_video(temp_video_path, gen_aud_arr, s_r, mix_audio)
-                    if out_vid_p and os.path.exists(out_vid_p):
-                        st.success("Video processing complete!")
-                        st.video(out_vid_p)
-                        st.session_state.output_video_path_sess = out_vid_p
-                        with open(out_vid_p, "rb") as f_vid:
-                            st.download_button("🎬 Download Video with New Sound (.mp4)", f_vid, "video_with_sound.mp4", "video/mp4")
-                    elif MOVIEPY_AVAILABLE: st.error("Failed to combine audio and video.")
-            else: st.error("Audio generation failed. Video syncing skipped.")
-        except Exception as e_main:
-            st.error(f"An unexpected error occurred in main processing: {e_main}")
-            st.error(traceback.format_exc())
-        finally:
-            if temp_video_path and os.path.exists(temp_video_path): os.remove(temp_video_path)
-            clear_memory()  # Final general clear
-
-# Show download buttons for files from a previous successful run in the same session
-elif st.session_state.generated_audio_path_sess and os.path.exists(st.session_state.generated_audio_path_sess):
-    st.markdown("---")
-    st.write("Previously generated audio available:")
-    st.audio(st.session_state.generated_audio_path_sess)
-    with open(st.session_state.generated_audio_path_sess, "rb") as f_aud_prev:
-        st.download_button("📥 Download Previous Audio (.wav)", f_aud_prev, "generated_sound_prev.wav", "audio/wav", key="prev_aud_dl")
-
-    if st.session_state.output_video_path_sess and os.path.exists(st.session_state.output_video_path_sess) and MOVIEPY_AVAILABLE:
-        st.markdown("---")  # This might appear even if audio only was generated, so careful with flow
-        st.write("Previously generated video available:")
-        st.video(st.session_state.output_video_path_sess)
-        with open(st.session_state.output_video_path_sess, "rb") as f_vid_prev:
-            st.download_button("🎬 Download Previous Video (.mp4)", f_vid_prev, "video_with_sound_prev.mp4", "video/mp4", key="prev_vid_dl")

-else:
-    st.info("☝️ Upload a video to begin.")

-st.markdown("---")
-st.caption("Built for Hugging Face Spaces (CPU). Patience is key for generation times!")
 import streamlit as st
+import imageio
 import numpy as np
+from PIL import Image
+from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
+import soundfile as sf
 import torch
 import os
 import tempfile

+# Try importing moviepy with fallback
 try:
     import moviepy.editor as mpy
+except ModuleNotFoundError:
+    st.error("The 'moviepy' library is not installed. Please ensure 'moviepy==1.0.3' is in requirements.txt and installed.")
+    st.stop()
+
+# Set page title and instructions
+st.title("Story Video Sound Effect Sync Generator")
+st.write("Upload an MP4 video to auto-generate and sync a high-quality sound effect.")
+
+# User-configurable settings
+num_frames_to_extract = st.slider("Number of frames to analyze", 1, 3, 1, help="Fewer frames = faster processing")
+mix_original_audio = st.checkbox("Mix with original audio", value=False, help="Blend sound effect with video’s original sound")
+
+# Enhanced prompt generation function
+def enhance_prompt(base_description):
+    """Generate a detailed, sound-specific prompt from a BLIP caption."""
+    base = base_description.lower().strip()
+
+    # Define action, object, and environment keywords
+    actions = {
+        "walk": "crisp footsteps on a wooden floor",
+        "run": "rapid footsteps and heavy breathing",
+        "drive": "engine roar and tires screeching",
+        "talk": "soft voices and background murmur",
+        "crash": "loud crash and debris scattering",
+        "fall": "thud of impact and rustling debris"
+    }
+    objects = {
+        "person": "human activity with subtle breathing",
+        "dog": "playful barks and pawsteps",
+        "car": "mechanical hum and tire friction",
+        "tree": "rustling leaves in a breeze",
+        "forest": "gentle wind and distant bird calls"
+    }
+    environments = {
+        "room": "echoing footsteps and muffled sounds",
+        "street": "distant traffic and urban hum",
+        "forest": "wind through trees and twigs snapping",
+        "outside": "open air with faint wind"
+    }
+
+    # Extract key elements from the caption
+    sound_description = ""
+    for action, sound in actions.items():
+        if action in base:
+            sound_description = sound
+            break
+    if not sound_description:  # Default to subtle ambient if no action
+        sound_description = "subtle ambient hum"
+
+    # Add object-specific sounds
+    for obj, sound in objects.items():
+        if obj in base:
+            sound_description += f" and {sound}"
+            break
+
+    # Add environment if detected
+    for env, sound in environments.items():
+        if env in base:
+            sound_description += f" in a {env} with {sound}"
+            break
+
+    # Construct final prompt
+    return f"{base} with {sound_description}"
+
+# File uploader for video
+uploaded_file = st.file_uploader("Upload an MP4 video (high resolution)", type=["mp4"])

+if uploaded_file is not None:
     try:
+        # Temporary video file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
+            temp_video.write(uploaded_file.getbuffer())
+            temp_video_path = temp_video.name
+
+        # Progress bar setup
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+
+        # Extract frames
+        status_text.text("Extracting frames...")
+        video = imageio.get_reader(temp_video_path, "ffmpeg")
+        total_frames = len(list(video.iter_data()))
+        step = max(1, total_frames // num_frames_to_extract)
+        frames = [
+            Image.fromarray(video.get_data(i))
+            for i in range(0, min(total_frames, num_frames_to_extract * step), step)
+        ][:num_frames_to_extract]
+        progress_bar.progress(20)
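Note that `len(list(video.iter_data()))` decodes every frame just to count them. A lighter sketch, assuming imageio's ffmpeg reader API (the previous version of this file used the same calls), would ask the reader for the count and fall back to metadata:

# Hypothetical cheaper frame count (sketch, not part of the commit):
try:
    total_frames = video.count_frames()
except Exception:
    meta = video.get_meta_data()  # fall back to duration * fps from metadata
    total_frames = int(meta.get("duration", 0) * meta.get("fps", 25))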
+
+        # Load BLIP model
+        @st.cache_resource
+        def load_blip_model():
+            processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+            model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+            if torch.cuda.is_available():
+                model = model.half().to("cuda")
+            return processor, model
+
+        processor, model = load_blip_model()
+
+        # Generate and enhance text descriptions
+        status_text.text("Analyzing frames...")
+        descriptions = []
+        for i, frame in enumerate(frames):
+            inputs = processor(images=frame, return_tensors="pt")
+            if torch.cuda.is_available():
+                inputs = {k: v.to("cuda") for k, v in inputs.items()}
+            out = model.generate(**inputs)
+            base_description = processor.decode(out[0], skip_special_tokens=True)
+            enhanced_description = enhance_prompt(base_description)
+            descriptions.append(enhanced_description)
+            progress_bar.progress(20 + int(30 * (i + 1) / len(frames)))
+
+        text_prompt = ". ".join(descriptions)
+        st.write("Enhanced text prompt:", text_prompt)
+
+        # Load MusicGen model
+        @st.cache_resource
+        def load_musicgen_model():
+            processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
+            model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
+            if torch.cuda.is_available():
+                model = model.half().to("cuda")
+            return processor, model
+
+        musicgen_processor, musicgen_model = load_musicgen_model()
+
+        # Generate a short sound effect (~5 seconds: 256 tokens at ~50 tokens/s)
+        status_text.text("Generating sound effect...")
+        inputs = musicgen_processor(
+            text=[text_prompt],
+            padding=True,
+            return_tensors="pt",
+        )
+        if torch.cuda.is_available():
+            inputs = {k: v.to("cuda") for k, v in inputs.items()}
+        audio_values = musicgen_model.generate(
+            **inputs,
+            max_new_tokens=256,
+            do_sample=True,
+            guidance_scale=3.0,
+            top_k=50,
+            top_p=0.95
+        )
+        audio_array = audio_values[0].cpu().numpy().astype(np.float32)  # float32 in case the model ran in half precision
+        if audio_array.ndim > 1:
+            audio_array = audio_array.flatten()
+        peak = np.max(np.abs(audio_array))
+        if peak > 0:  # guard against division by zero on silent output
+            audio_array = audio_array / peak * 0.9  # normalize with headroom
+        audio_array = np.clip(audio_array, -1.0, 1.0)
+        sample_rate = musicgen_model.config.audio_encoder.sampling_rate  # 32000 for musicgen-small
+        progress_bar.progress(60)
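`max_new_tokens=256` fixes the clip at roughly five seconds. If a target duration is wanted instead, the token budget can be derived from the audio encoder's frame rate; a sketch, assuming the `frame_rate` field on the MusicGen audio-encoder config (about 50 tokens per second for musicgen-small):

# Hypothetical duration-driven token budget (sketch, not part of the commit):
target_seconds = 8
tokens_per_second = musicgen_model.config.audio_encoder.frame_rate  # ~50 for musicgen-small
max_new_tokens = min(int(target_seconds * tokens_per_second), 1500)  # cap for stability, as the previous version did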
+
+        # Save temporary audio
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
+            sf.write(temp_audio.name, audio_array, sample_rate)
+            temp_audio_path = temp_audio.name
+
+        # Synchronize with video using mpy
+        status_text.text("Syncing audio with video...")
+        video_clip = mpy.VideoFileClip(temp_video_path)
+        video_duration = video_clip.duration
+        audio_clip = mpy.AudioFileClip(temp_audio_path)
+
+        # Adjust audio length
+        if audio_clip.duration < video_duration:
+            loops_needed = int(np.ceil(video_duration / audio_clip.duration))
+            audio_clip = mpy.concatenate_audioclips([audio_clip] * loops_needed).subclip(0, video_duration)
         else:
+            audio_clip = audio_clip.subclip(0, video_duration)
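The concatenate-and-trim loop above achieves what the previous version of this file did with moviepy's built-in looping effect; that one-liner remains an option:

# Equivalent looping via moviepy's audio_loop fx (as used in the previous version of this file):
if audio_clip.duration < video_duration:
    audio_clip = audio_clip.fx(mpy.afx.audio_loop, duration=video_duration)
audio_clip = audio_clip.subclip(0, video_duration)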
+
+        # Mix or replace audio
+        if mix_original_audio and video_clip.audio:
+            # CompositeAudioClip mixes the tracks; AudioClips cannot be added with "+" in moviepy 1.x
+            final_audio = mpy.CompositeAudioClip([video_clip.audio.volumex(0.5), audio_clip.volumex(0.5)])
         else:
+            final_audio = audio_clip
+
+        # Set audio to video
+        final_video = video_clip.set_audio(final_audio)
+
+        # Save final video with high quality
+        output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
+        final_video.write_videofile(
+            output_path,
+            codec="libx264",
+            audio_codec="aac",
+            preset="medium",       # Better quality than ultrafast
+            bitrate="8000k",       # Higher bitrate for video quality
+            audio_bitrate="192k",  # Good audio quality
+            temp_audiofile="temp-audio.m4a",
+            remove_temp=True
         )
+        progress_bar.progress(90)
+
+        # Provide playback and download
+        status_text.text("Done!")
+        st.video(output_path)
+        with open(output_path, "rb") as video_file:
+            st.download_button(
+                label="Download Synced Video",
+                data=video_file,
+                file_name="synced_story_video.mp4",
+                mime="video/mp4"
+            )
+        progress_bar.progress(100)
+
+    except Exception as e:
+        st.error(f"An error occurred: {str(e)}")
+        st.write("Try reducing frames or uploading a smaller video.")
+
+    finally:
+        # Clean up only the temp files that were actually created; looking each name up
+        # avoids a NameError when an early failure means a path variable was never assigned
+        for var_name in ("temp_video_path", "temp_audio_path", "output_path"):
+            path = globals().get(var_name)
+            if path and os.path.exists(path):
+                os.remove(path)
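Since the import fallback above points at requirements.txt, a plausible set of dependencies for this Space follows; only the moviepy==1.0.3 pin is named by the code, everything else is an assumption read off the imports.

# Hypothetical requirements.txt (only the moviepy pin is stated in the code):
streamlit
torch
transformers
imageio
imageio-ffmpeg
soundfile
numpy
Pillow
moviepy==1.0.3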