Update app.py

app.py CHANGED
@@ -1,381 +1,227 @@

Removed (old app.py):

import streamlit as st
import numpy as np
import torch
import gc
import os
import tempfile
import math
import imageio
import traceback
import scipy.io.wavfile # For saving WAV files

#
try:
    import moviepy.editor as mpy
    # … [old lines 16-31 were not captured in the diff view; from later
    #    references they presumably set a MOVIEPY_AVAILABLE flag in the
    #    except branch and defined the IMAGE_CAPTION_MODEL / AUDIO_GEN_MODEL
    #    constants and the PIL import used below] …

DEFAULT_NUM_FRAMES = 2
DEFAULT_AUDIO_DURATION_S = 7 # Slightly increased default
MAX_FRAMES_TO_SHOW_UI = 2 # Reducing for smaller UI footprint
DEVICE = torch.device("cpu") # Explicitly use CPU

# --- Page Setup ---
st.set_page_config(page_title="AI Video Sound Designer (HF Space)", layout="wide", page_icon="🎬")

st.title("🎬 AI Video Sound Designer")
st.markdown("""
Upload a short video (MP4, MOV, AVI). The tool will:
1. Extract frames.
2. Analyze frames with **BLIP** to generate sound ideas.
3. Synthesize audio with **MusicGen** based on these ideas.
4. Optionally, combine the new audio with your video.
---
**Important:** This app runs on CPU. **Audio generation can be very slow (several minutes for a few seconds of audio).** Please be patient!
""")

# --- Utility Functions ---
def clear_memory(model_obj=None, processor_obj=None):
    if model_obj:
        del model_obj
    if processor_obj:
        del processor_obj
    gc.collect()
    print("Memory cleared.")

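# Note: `del` on these local parameter aliases does not actually evict models
# cached by `st.cache_resource` below — the cache keeps its own reference.
# To truly unload a cached model, the loader's documented clear method is the
# route, e.g. (sketch):
#     load_image_caption_model_and_processor.clear()
#     gc.collect()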

@st.cache_resource(show_spinner="Loading Image Analysis Model...")
def load_image_caption_model_and_processor():
    try:
        from transformers import BlipProcessor, BlipForConditionalGeneration
        print(f"Loading Image Captioning Model: {IMAGE_CAPTION_MODEL} to {DEVICE}")
        processor = BlipProcessor.from_pretrained(IMAGE_CAPTION_MODEL)
        # Standard loading for BLIP on CPU. No 'low_mem' or 'low_cpu_mem_usage'
        model = BlipForConditionalGeneration.from_pretrained(IMAGE_CAPTION_MODEL).to(DEVICE)
        model.eval() # Set to evaluation mode
        st.toast("Image Analysis model (BLIP) loaded!", icon="🖼️")
        return processor, model
    except Exception as e:
        st.error(f"Error loading BLIP model ({IMAGE_CAPTION_MODEL}): {e}")
        st.error(traceback.format_exc())
        return None, None

@st.cache_resource(show_spinner="Loading Audio Generation Model (can be slow)...")
def load_audio_gen_model_and_processor():
    try:
        from transformers import AutoProcessor, MusicgenForConditionalGeneration
        print(f"Loading Audio Generation Model: {AUDIO_GEN_MODEL} to {DEVICE}")
        processor = AutoProcessor.from_pretrained(AUDIO_GEN_MODEL)
        # Standard loading for MusicGen on CPU.
        # `low_cpu_mem_usage` could be used here if accelerate is properly configured,
        # but for simplicity and robustness on free tier, direct .to(DEVICE) is safer.
        model = MusicgenForConditionalGeneration.from_pretrained(AUDIO_GEN_MODEL).to(DEVICE)
        model.eval() # Set to evaluation mode
        st.toast("Audio Generation model (MusicGen) loaded! (CPU generation will be slow)", icon="🎶")
        return processor, model
    except Exception as e:
        st.error(f"Error loading MusicGen model ({AUDIO_GEN_MODEL}): {e}")
        st.error(traceback.format_exc())
        return None, None

def extract_frames_from_video(video_path, num_frames_to_extract):
    frames = []
    reader = None
    try:
        reader = imageio.get_reader(video_path, "ffmpeg")
        total_frames_in_video = 0
        try: # Try to get frame count
            total_frames_in_video = reader.count_frames()
        except Exception:
            meta_data = reader.get_meta_data()
            duration = meta_data.get('duration')
            fps = meta_data.get('fps', 25) # Default FPS if not found
            if duration and fps:
                total_frames_in_video = int(duration * fps)

        if not total_frames_in_video or total_frames_in_video < 1:
            # Fallback: try to read a few frames directly if length is unknown
            print("Video length unknown or zero, attempting to read initial frames.")
            temp_frames = []
            for i, frame_data in enumerate(reader):
                temp_frames.append(Image.fromarray(frame_data).convert("RGB"))
                if len(temp_frames) >= num_frames_to_extract * 2: # Read a bit more
                    break
            if not temp_frames:
                st.error("Could not extract any frames. Video might be empty or corrupted.")
                if reader: reader.close()
                return []
            # Select frames from what was read
            indices = np.linspace(0, len(temp_frames) - 1, num_frames_to_extract, dtype=int, endpoint=True)
            frames = [temp_frames[i] for i in indices]
            if reader: reader.close()
            return frames

        # If frame count is known
        num_to_sample = min(num_frames_to_extract, total_frames_in_video)
        indices = np.linspace(0, total_frames_in_video - 1, num_to_sample, dtype=int, endpoint=True)

        for i in indices:
            try:
                frame_data = reader.get_data(i)
                frames.append(Image.fromarray(frame_data).convert("RGB"))
            except Exception as frame_e:
                st.warning(f"Skipping problematic frame at index {i}: {frame_e}")
        return frames
    except (imageio.core.fetching.NeedDownloadError, OSError) as e_ffmpeg:
        st.error(f"FFmpeg error during frame extraction: {e_ffmpeg}. Ensure ffmpeg is available.")
        return []
    except Exception as e:
        st.error(f"Could not extract frames: {e}")
        st.error(traceback.format_exc())
        return []
    finally:
        if reader:
            reader.close()

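# The even-sampling idiom above in isolation: np.linspace over the frame range,
# cast to int, yields evenly spaced indices with both endpoints included, e.g.
#     np.linspace(0, 99, 4, dtype=int)  # -> array([ 0, 33, 66, 99])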

def generate_sound_prompt_from_frames(frames, caption_proc, caption_mod):
    if not frames: return "ambient background noise"

    descriptions = []
    # BLIP doesn't need a complex instruction, it captions directly.
    # We ask for "sound-producing elements" in post-processing of the descriptions.

    progress_bar = st.progress(0.0, text="Analyzing frames for sound ideas...")
    for i, frame in enumerate(frames):
        try:
            inputs = caption_proc(images=frame, return_tensors="pt").to(DEVICE)
            generated_ids = caption_mod.generate(**inputs, max_new_tokens=40) # Shorter captions
            description = caption_proc.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if description: descriptions.append(description)
            progress_bar.progress((i + 1) / len(frames), text=f"Frame {i+1}/{len(frames)} analyzed.")
        except Exception as e:
            st.warning(f"Could not get description for a frame: {e}")
    progress_bar.empty()
    # … [old lines 167-177 not captured in the diff view; presumably the
    #    descriptions were joined into the returned sound prompt here, and the
    #    generate_audio_from_prompt(...) helper called later was defined
    #    around this point] …
    try:
        # … [old lines 181-200 not captured in the diff view] …
        else:
            return audio_array, sampling_rate
    except Exception as e:
        st.error(f"Error generating audio: {e}")
        st.error(traceback.format_exc())
        return None, None

def combine_audio_video(video_path, audio_arr, sr, mix_orig):
    if not MOVIEPY_AVAILABLE:
        st.error("MoviePy is unavailable. Cannot combine audio and video.")
        return None

    out_vid_path = None
    tmp_audio_path = None
    vid_clip = gen_audio_clip = final_aud = comp_clip = None

    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_f:
            scipy.io.wavfile.write(tmp_audio_f.name, sr, audio_arr.astype(np.float32)) # Ensure float32 for some moviepy versions
            tmp_audio_path = tmp_audio_f.name

        vid_clip = mpy.VideoFileClip(video_path)
        gen_audio_clip = mpy.AudioFileClip(tmp_audio_path)

        target_duration = vid_clip.duration
        if gen_audio_clip.duration < target_duration:
            gen_audio_clip = gen_audio_clip.fx(mpy.afx.audio_loop, duration=target_duration)
        gen_audio_clip = gen_audio_clip.subclip(0, target_duration)
        # … [old lines 230-233 not captured in the diff view] …
            final_aud = mpy.CompositeAudioClip([orig_audio, gen_audio_clip]).set_duration(target_duration)
        else:
            # … [old lines 236-245 not captured in the diff view] …
        )
        # … [old lines 247-259, the remainder of combine_audio_video, were
        #    not captured in the diff view] …

# --- Sidebar for Settings ---
with st.sidebar:
    st.header("⚙️ Settings")
    num_frames_analysis = st.slider("Frames to Analyze", 1, 4, DEFAULT_NUM_FRAMES, 1,
                                    help="Fewer frames = faster analysis.")
    audio_duration = st.slider("Target Audio Duration (s)", 3, 15, DEFAULT_AUDIO_DURATION_S, 1, # Max 15s for CPU
                               help="Shorter = MUCH faster on CPU. MusicGen is slow.")

    st.subheader("MusicGen Parameters")
    guidance = st.slider("Guidance Scale", 1.0, 7.0, 3.0, 0.5)
    temperature = st.slider("Temperature", 0.5, 1.5, 1.0, 0.1)

    mix_audio = False
    if MOVIEPY_AVAILABLE:
        st.subheader("Video Output")
        mix_audio = st.checkbox("Mix with original video audio", value=False)

# --- Main Application ---
uploaded_file = st.file_uploader("📤 Upload Video (MP4, MOV, AVI - short clips best):", type=["mp4", "mov", "avi"])

if 'generated_audio_path_sess' not in st.session_state:
    st.session_state.generated_audio_path_sess = None
if 'output_video_path_sess' not in st.session_state:
    st.session_state.output_video_path_sess = None

if uploaded_file is not None:
    st.video(uploaded_file)

    if st.button("✨ Generate Sound Design!", type="primary", use_container_width=True):
        # --- Clear previous results from session state and disk ---
        for key in ['generated_audio_path_sess', 'output_video_path_sess']:
            if st.session_state.get(key) and os.path.exists(st.session_state[key]):
                try: os.remove(st.session_state[key])
                except Exception as e_rem: print(f"Error removing old temp file: {e_rem}")
            st.session_state[key] = None
        clear_memory() # General memory clear

        temp_video_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_v:
                tmp_v.write(uploaded_file.read()) # Use read() for BytesIO from uploader
                temp_video_path = tmp_v.name

            # === Stage 1: Frame Extraction ===
            st.markdown("--- \n### 1. Extracting Frames")
            frames = extract_frames_from_video(temp_video_path, num_frames_analysis)
            if not frames: st.error("Frame extraction failed. Cannot continue."); st.stop()
            st.success(f"Extracted {len(frames)} frames.")
            if frames:
                cols = st.columns(min(len(frames), MAX_FRAMES_TO_SHOW_UI))
                for i, frame_img in enumerate(frames[:len(cols)]):
                    cols[i].image(frame_img, caption=f"Frame {i+1}", use_column_width=True)

            # === Stage 2: Image Captioning (Sound Prompt Generation) ===
            st.markdown("--- \n### 2. Analyzing Frames for Sound Ideas (BLIP)")
            cap_proc, cap_model = load_image_caption_model_and_processor()
            sound_prompt = "ambient environmental sounds" # Default
            if cap_proc and cap_model:
                sound_prompt = generate_sound_prompt_from_frames(frames, cap_proc, cap_model)
                clear_memory(cap_model, cap_proc) # Unload BLIP
            else: st.error("BLIP model failed to load. Using default sound prompt.")
            st.info(f"✍️ **Sound Prompt for MusicGen:** {sound_prompt}")

            # === Stage 3: Audio Generation ===
            st.markdown("--- \n### 3. Synthesizing Audio (MusicGen)")
            st.warning("🎧 This step is very slow on CPU. Your patience is appreciated!")
            aud_proc, aud_model = load_audio_gen_model_and_processor()
            gen_aud_arr, s_r = None, None
            if aud_proc and aud_model:
                gen_aud_arr, s_r = generate_audio_from_prompt(sound_prompt, audio_duration, aud_proc, aud_model, guidance, temperature)
                clear_memory(aud_model, aud_proc) # Unload MusicGen
            else: st.error("MusicGen model failed to load. Cannot generate audio.")

            if gen_aud_arr is not None and s_r is not None:
                st.success("Audio successfully generated!")
                st.audio(gen_aud_arr, sample_rate=s_r)
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_a_out:
                    scipy.io.wavfile.write(tmp_a_out.name, s_r, gen_aud_arr.astype(np.float32))
                    st.session_state.generated_audio_path_sess = tmp_a_out.name
                with open(st.session_state.generated_audio_path_sess, "rb") as f_aud:
                    st.download_button("📥 Download Audio Only (.wav)", f_aud, "generated_sound.wav", "audio/wav")

                # === Stage 4: (Optional) Video and Audio Syncing ===
                if MOVIEPY_AVAILABLE:
                    st.markdown("--- \n### 4. Combining Audio with Video")
                    with st.spinner("Processing video with new audio... (also can be slow)"):
                        out_vid_p = combine_audio_video(temp_video_path, gen_aud_arr, s_r, mix_audio)
                    if out_vid_p and os.path.exists(out_vid_p):
                        st.success("Video processing complete!")
                        st.video(out_vid_p)
                        st.session_state.output_video_path_sess = out_vid_p
                        with open(out_vid_p, "rb") as f_vid:
                            st.download_button("🎬 Download Video with New Sound (.mp4)", f_vid, "video_with_sound.mp4", "video/mp4")
                    elif MOVIEPY_AVAILABLE: st.error("Failed to combine audio and video.")
            else: st.error("Audio generation failed. Video syncing skipped.")
        except Exception as e_main:
            st.error(f"An unexpected error occurred in main processing: {e_main}")
            st.error(traceback.format_exc())
        finally:
            if temp_video_path and os.path.exists(temp_video_path): os.remove(temp_video_path)
            clear_memory() # Final general clear

# Show download buttons for files from a previous successful run in the same session
elif st.session_state.generated_audio_path_sess and os.path.exists(st.session_state.generated_audio_path_sess):
    st.markdown("---")
    st.write("Previously generated audio available:")
    st.audio(st.session_state.generated_audio_path_sess)
    with open(st.session_state.generated_audio_path_sess, "rb") as f_aud_prev:
        st.download_button("📥 Download Previous Audio (.wav)", f_aud_prev, "generated_sound_prev.wav", "audio/wav", key="prev_aud_dl")

    if st.session_state.output_video_path_sess and os.path.exists(st.session_state.output_video_path_sess) and MOVIEPY_AVAILABLE:
        st.markdown("---") # This might appear even if audio only was generated, so careful with flow
        st.write("Previously generated video available:")
        st.video(st.session_state.output_video_path_sess)
        with open(st.session_state.output_video_path_sess, "rb") as f_vid_prev:
            st.download_button("🎬 Download Previous Video (.mp4)", f_vid_prev, "video_with_sound_prev.mp4", "video/mp4", key="prev_vid_dl")

Added (new app.py):

import streamlit as st
import imageio
import numpy as np
from PIL import Image
from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
import soundfile as sf
import torch
import os
import tempfile

# Try importing moviepy with fallback
try:
    import moviepy.editor as mpy
except ModuleNotFoundError:
    st.error("The 'moviepy' library is not installed. Please ensure 'moviepy==1.0.3' is in requirements.txt and installed.")
    st.stop()

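# Note: `moviepy.editor` is the MoviePy 1.x entry point; MoviePy 2.x removed
# that module, so the 1.0.3 pin mentioned in the error message above matters.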

# Set page title and instructions
st.title("Story Video Sound Effect Sync Generator")
st.write("Upload an MP4 video to auto-generate and sync a high-quality sound effect.")

# User-configurable settings
num_frames_to_extract = st.slider("Number of frames to analyze", 1, 3, 1, help="Fewer frames = faster processing")
mix_original_audio = st.checkbox("Mix with original audio", value=False, help="Blend sound effect with video’s original sound")

# Enhanced prompt generation function
def enhance_prompt(base_description):
    """Generate a detailed, sound-specific prompt from BLIP caption."""
    base = base_description.lower().strip()

    # Define action, object, and environment keywords
    actions = {
        "walk": "crisp footsteps on a wooden floor",
        "run": "rapid footsteps and heavy breathing",
        "drive": "engine roar and tires screeching",
        "talk": "soft voices and background murmur",
        "crash": "loud crash and debris scattering",
        "fall": "thud of impact and rustling debris"
    }
    objects = {
        "person": "human activity with subtle breathing",
        "dog": "playful barks and pawsteps",
        "car": "mechanical hum and tire friction",
        "tree": "rustling leaves in a breeze",
        "forest": "gentle wind and distant bird calls"
    }
    environments = {
        "room": "echoing footsteps and muffled sounds",
        "street": "distant traffic and urban hum",
        "forest": "wind through trees and twigs snapping",
        "outside": "open air with faint wind"
    }

    # Extract key elements from the caption
    sound_description = ""
    for action, sound in actions.items():
        if action in base:
            sound_description = sound
            break
    if not sound_description: # Default to subtle ambient if no action
        sound_description = "subtle ambient hum"

    # Add object-specific sounds
    for obj, sound in objects.items():
        if obj in base:
            sound_description += f" and {sound}"
            break

    # Add environment if detected
    for env, sound in environments.items():
        if env in base:
            sound_description += f" in a {env} with {sound}"
            break

    # Construct final prompt
    return f"{base} with {sound_description}"

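# A quick trace of enhance_prompt (not part of the app): for the BLIP caption
# "a dog running in the forest", "run" matches first in actions, "dog" in
# objects, and "forest" in environments, giving:
#   "a dog running in the forest with rapid footsteps and heavy breathing and
#    playful barks and pawsteps in a forest with wind through trees and twigs
#    snapping"
# Matching is plain substring, so e.g. "walk" would also hit "sidewalk".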

# File uploader for video
uploaded_file = st.file_uploader("Upload an MP4 video (high resolution)", type=["mp4"])

if uploaded_file is not None:
    try:
        # Temporary video file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
            temp_video.write(uploaded_file.getbuffer())
            temp_video_path = temp_video.name

        # Progress bar setup
        progress_bar = st.progress(0)
        status_text = st.empty()

        # Extract frames
        status_text.text("Extracting frames...")
        video = imageio.get_reader(temp_video_path, "ffmpeg")
        total_frames = len(list(video.iter_data()))
        step = max(1, total_frames // num_frames_to_extract)
        frames = [
            Image.fromarray(video.get_data(i))
            for i in range(0, min(total_frames, num_frames_to_extract * step), step)
        ][:num_frames_to_extract]
        progress_bar.progress(20)

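        # Note: len(list(video.iter_data())) above decodes every frame just to
        # count them. imageio's ffmpeg reader can usually report this directly —
        # a cheaper sketch on the same reader object:
        #     total_frames = video.count_frames()
        # with a fallback to duration * fps from video.get_meta_data() when
        # count_frames() is unavailable for the container.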

        # Load BLIP model
        @st.cache_resource
        def load_blip_model():
            processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
            model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
            if torch.cuda.is_available():
                model = model.half().to("cuda")
            return processor, model

        processor, model = load_blip_model()

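        # Note: with model.half(), the processor still emits float32 pixel_values,
        # which would need a matching cast (e.g. inputs["pixel_values"].half())
        # before generate() on CUDA; on a CPU-only Space this branch never runs.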

        # Generate and enhance text descriptions
        status_text.text("Analyzing frames...")
        descriptions = []
        for i, frame in enumerate(frames):
            inputs = processor(images=frame, return_tensors="pt")
            if torch.cuda.is_available():
                inputs = {k: v.to("cuda") for k, v in inputs.items()}
            out = model.generate(**inputs)
            base_description = processor.decode(out[0], skip_special_tokens=True)
            enhanced_description = enhance_prompt(base_description)
            descriptions.append(enhanced_description)
            progress_bar.progress(20 + int(30 * (i + 1) / len(frames)))

        text_prompt = ". ".join(descriptions)
        st.write("Enhanced text prompt:", text_prompt)

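        # Note: wrapping the generate() calls in this script in
        # `with torch.no_grad():` would skip autograd bookkeeping and save
        # memory during inference; outputs are unchanged.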

        # Load MusicGen model
        @st.cache_resource
        def load_musicgen_model():
            processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
            model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
            if torch.cuda.is_available():
                model = model.half().to("cuda")
            return processor, model

        musicgen_processor, musicgen_model = load_musicgen_model()

        # Generate the sound effect. MusicGen emits ~50 audio tokens per second,
        # so max_new_tokens=256 yields roughly 5 seconds of audio.
        status_text.text("Generating sound effect...")
        inputs = musicgen_processor(
            text=[text_prompt],
            padding=True,
            return_tensors="pt",
        )
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}
        audio_values = musicgen_model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            guidance_scale=3.0,
            top_k=50,
            top_p=0.95
        )
        # .float() guards against float16 output (when the model was halved on
        # CUDA), which soundfile cannot write directly
        audio_array = audio_values[0].cpu().float().numpy()
        if audio_array.ndim > 1:
            audio_array = audio_array.flatten()
        # Normalize to 90% full scale and clip for safety
        audio_array = audio_array / np.max(np.abs(audio_array)) * 0.9
        audio_array = np.clip(audio_array, -1.0, 1.0)
        sample_rate = 32000  # EnCodec rate used by MusicGen
        progress_bar.progress(60)

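        # To target a duration rather than a fixed token budget, the budget can
        # be derived from it (sketch; assumes MusicGen's 50 tokens-per-second
        # frame rate):
        #     desired_seconds = 8
        #     max_new_tokens = int(desired_seconds * 50)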

        # Save temporary audio
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            sf.write(temp_audio.name, audio_array, sample_rate)
            temp_audio_path = temp_audio.name

        # Synchronize with video using mpy
        status_text.text("Syncing audio with video...")
        video_clip = mpy.VideoFileClip(temp_video_path)
        video_duration = video_clip.duration
        audio_clip = mpy.AudioFileClip(temp_audio_path)

        # Adjust audio length
        if audio_clip.duration < video_duration:
            loops_needed = int(np.ceil(video_duration / audio_clip.duration))
            audio_clip = mpy.concatenate_audioclips([audio_clip] * loops_needed).subclip(0, video_duration)
        else:
            audio_clip = audio_clip.subclip(0, video_duration)

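        # ceil(video / audio) copies guarantee coverage before trimming, e.g. a
        # 5 s effect under a 12 s video loops 3 times (15 s), then subclip
        # trims back to 12 s.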

        # Mix or replace audio. CompositeAudioClip overlays the two tracks;
        # AudioClip objects do not support `+` for mixing in MoviePy 1.x.
        if mix_original_audio and video_clip.audio:
            final_audio = mpy.CompositeAudioClip(
                [video_clip.audio.volumex(0.5), audio_clip.volumex(0.5)]
            )
        else:
            final_audio = audio_clip

        # Set audio to video
        final_video = video_clip.set_audio(final_audio)

        # Save final video with high quality
        output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
        final_video.write_videofile(
            output_path,
            codec="libx264",
            audio_codec="aac",
            preset="medium", # Better quality than ultrafast
            bitrate="8000k", # Higher bitrate for video quality
            audio_bitrate="192k", # Good audio quality
            temp_audiofile="temp-audio.m4a",
            remove_temp=True
        )
        progress_bar.progress(90)

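        # Note: temp_audiofile="temp-audio.m4a" lands in the working directory,
        # so two concurrent sessions would clobber each other's file. A per-run
        # name is safer (sketch; needs `import uuid`):
        #     temp_audiofile=os.path.join(tempfile.gettempdir(), f"temp-audio-{uuid.uuid4().hex}.m4a")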

        # Provide playback and download
        status_text.text("Done!")
        st.video(output_path)
        with open(output_path, "rb") as video_file:
            st.download_button(
                label="Download Synced Video",
                data=video_file,
                file_name="synced_story_video.mp4",
                mime="video/mp4"
            )
        progress_bar.progress(100)

    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
        st.write("Try reducing frames or uploading a smaller video.")

    finally:
        # Clean up temp files, guarding each name in case an early failure
        # meant it was never assigned (the original `'path' in locals()` test
        # was always true inside the loop and could not prevent a NameError
        # when building the list itself)
        for name in ("temp_video_path", "temp_audio_path", "output_path"):
            path = globals().get(name)
            if path and os.path.exists(path):
                os.remove(path)
|