Update app.py

app.py CHANGED
@@ -1,381 +1,227 @@

Removed (old app.py):

import streamlit as st
import numpy as np
import torch
import gc
import os
import tempfile
import math
import imageio
import traceback
import scipy.io.wavfile # For saving WAV files

#
try:
    import moviepy.editor as mpy
    # … [old lines 16-31 were not captured in the diff view; from later
    #    references they presumably set a MOVIEPY_AVAILABLE flag in the
    #    except branch and defined the IMAGE_CAPTION_MODEL / AUDIO_GEN_MODEL
    #    constants and the PIL import used below] …

DEFAULT_NUM_FRAMES = 2
DEFAULT_AUDIO_DURATION_S = 7 # Slightly increased default
MAX_FRAMES_TO_SHOW_UI = 2 # Reducing for smaller UI footprint
DEVICE = torch.device("cpu") # Explicitly use CPU

# --- Page Setup ---
st.set_page_config(page_title="AI Video Sound Designer (HF Space)", layout="wide", page_icon="🎬")

st.title("🎬 AI Video Sound Designer")
st.markdown("""
Upload a short video (MP4, MOV, AVI). The tool will:
1. Extract frames.
2. Analyze frames with **BLIP** to generate sound ideas.
3. Synthesize audio with **MusicGen** based on these ideas.
4. Optionally, combine the new audio with your video.
---
**Important:** This app runs on CPU. **Audio generation can be very slow (several minutes for a few seconds of audio).** Please be patient!
""")

# --- Utility Functions ---
def clear_memory(model_obj=None, processor_obj=None):
    if model_obj:
        del model_obj
    if processor_obj:
        del processor_obj
    gc.collect()
    print("Memory cleared.")

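# Note: `del` on these local parameter aliases does not actually evict models
# cached by `st.cache_resource` below — the cache keeps its own reference.
# To truly unload a cached model, the loader's documented clear method is the
# route, e.g. (sketch):
#     load_image_caption_model_and_processor.clear()
#     gc.collect()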

@st.cache_resource(show_spinner="Loading Image Analysis Model...")
def load_image_caption_model_and_processor():
    try:
        from transformers import BlipProcessor, BlipForConditionalGeneration
        print(f"Loading Image Captioning Model: {IMAGE_CAPTION_MODEL} to {DEVICE}")
        processor = BlipProcessor.from_pretrained(IMAGE_CAPTION_MODEL)
        # Standard loading for BLIP on CPU. No 'low_mem' or 'low_cpu_mem_usage'
        model = BlipForConditionalGeneration.from_pretrained(IMAGE_CAPTION_MODEL).to(DEVICE)
        model.eval() # Set to evaluation mode
        st.toast("Image Analysis model (BLIP) loaded!", icon="🖼️")
        return processor, model
    except Exception as e:
        st.error(f"Error loading BLIP model ({IMAGE_CAPTION_MODEL}): {e}")
        st.error(traceback.format_exc())
        return None, None

@st.cache_resource(show_spinner="Loading Audio Generation Model (can be slow)...")
def load_audio_gen_model_and_processor():
    try:
        from transformers import AutoProcessor, MusicgenForConditionalGeneration
        print(f"Loading Audio Generation Model: {AUDIO_GEN_MODEL} to {DEVICE}")
        processor = AutoProcessor.from_pretrained(AUDIO_GEN_MODEL)
        # Standard loading for MusicGen on CPU.
        # `low_cpu_mem_usage` could be used here if accelerate is properly configured,
        # but for simplicity and robustness on free tier, direct .to(DEVICE) is safer.
        model = MusicgenForConditionalGeneration.from_pretrained(AUDIO_GEN_MODEL).to(DEVICE)
        model.eval() # Set to evaluation mode
        st.toast("Audio Generation model (MusicGen) loaded! (CPU generation will be slow)", icon="🎶")
        return processor, model
    except Exception as e:
        st.error(f"Error loading MusicGen model ({AUDIO_GEN_MODEL}): {e}")
        st.error(traceback.format_exc())
        return None, None

def extract_frames_from_video(video_path, num_frames_to_extract):
    frames = []
    reader = None
    try:
        reader = imageio.get_reader(video_path, "ffmpeg")
        total_frames_in_video = 0
        try: # Try to get frame count
            total_frames_in_video = reader.count_frames()
        except Exception:
            meta_data = reader.get_meta_data()
            duration = meta_data.get('duration')
            fps = meta_data.get('fps', 25) # Default FPS if not found
            if duration and fps:
                total_frames_in_video = int(duration * fps)

        if not total_frames_in_video or total_frames_in_video < 1:
            # Fallback: try to read a few frames directly if length is unknown
            print("Video length unknown or zero, attempting to read initial frames.")
            temp_frames = []
            for i, frame_data in enumerate(reader):
                temp_frames.append(Image.fromarray(frame_data).convert("RGB"))
                if len(temp_frames) >= num_frames_to_extract * 2: # Read a bit more
                    break
            if not temp_frames:
                st.error("Could not extract any frames. Video might be empty or corrupted.")
                if reader: reader.close()
                return []
            # Select frames from what was read
            indices = np.linspace(0, len(temp_frames) - 1, num_frames_to_extract, dtype=int, endpoint=True)
            frames = [temp_frames[i] for i in indices]
            if reader: reader.close()
            return frames

        # If frame count is known
        num_to_sample = min(num_frames_to_extract, total_frames_in_video)
        indices = np.linspace(0, total_frames_in_video - 1, num_to_sample, dtype=int, endpoint=True)

        for i in indices:
            try:
                frame_data = reader.get_data(i)
                frames.append(Image.fromarray(frame_data).convert("RGB"))
            except Exception as frame_e:
                st.warning(f"Skipping problematic frame at index {i}: {frame_e}")
        return frames
    except (imageio.core.fetching.NeedDownloadError, OSError) as e_ffmpeg:
        st.error(f"FFmpeg error during frame extraction: {e_ffmpeg}. Ensure ffmpeg is available.")
        return []
    except Exception as e:
        st.error(f"Could not extract frames: {e}")
        st.error(traceback.format_exc())
        return []
    finally:
        if reader:
            reader.close()

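# The even-sampling idiom above in isolation: np.linspace over the frame range,
# cast to int, yields evenly spaced indices with both endpoints included, e.g.
#     np.linspace(0, 99, 4, dtype=int)  # -> array([ 0, 33, 66, 99])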

def generate_sound_prompt_from_frames(frames, caption_proc, caption_mod):
    if not frames: return "ambient background noise"

    descriptions = []
    # BLIP doesn't need a complex instruction, it captions directly.
    # We ask for "sound-producing elements" in post-processing of the descriptions.

    progress_bar = st.progress(0.0, text="Analyzing frames for sound ideas...")
    for i, frame in enumerate(frames):
        try:
            inputs = caption_proc(images=frame, return_tensors="pt").to(DEVICE)
            generated_ids = caption_mod.generate(**inputs, max_new_tokens=40) # Shorter captions
            description = caption_proc.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if description: descriptions.append(description)
            progress_bar.progress((i + 1) / len(frames), text=f"Frame {i+1}/{len(frames)} analyzed.")
        except Exception as e:
            st.warning(f"Could not get description for a frame: {e}")
    progress_bar.empty()
    # … [old lines 167-177 not captured in the diff view; presumably the
    #    descriptions were joined into the returned sound prompt here, and the
    #    generate_audio_from_prompt(...) helper called later was defined
    #    around this point] …
    try:
        # … [old lines 181-200 not captured in the diff view] …
        else:
            return audio_array, sampling_rate
    except Exception as e:
        st.error(f"Error generating audio: {e}")
        st.error(traceback.format_exc())
        return None, None

def combine_audio_video(video_path, audio_arr, sr, mix_orig):
    if not MOVIEPY_AVAILABLE:
        st.error("MoviePy is unavailable. Cannot combine audio and video.")
        return None

    out_vid_path = None
    tmp_audio_path = None
    vid_clip = gen_audio_clip = final_aud = comp_clip = None

    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio_f:
            scipy.io.wavfile.write(tmp_audio_f.name, sr, audio_arr.astype(np.float32)) # Ensure float32 for some moviepy versions
            tmp_audio_path = tmp_audio_f.name

        vid_clip = mpy.VideoFileClip(video_path)
        gen_audio_clip = mpy.AudioFileClip(tmp_audio_path)

        target_duration = vid_clip.duration
        if gen_audio_clip.duration < target_duration:
            gen_audio_clip = gen_audio_clip.fx(mpy.afx.audio_loop, duration=target_duration)
        gen_audio_clip = gen_audio_clip.subclip(0, target_duration)
        # … [old lines 230-233 not captured in the diff view] …
            final_aud = mpy.CompositeAudioClip([orig_audio, gen_audio_clip]).set_duration(target_duration)
        else:
            # … [old lines 236-245 not captured in the diff view] …
        )
        # … [old lines 247-259, the remainder of combine_audio_video, were
        #    not captured in the diff view] …

# --- Sidebar for Settings ---
with st.sidebar:
    st.header("⚙️ Settings")
    num_frames_analysis = st.slider("Frames to Analyze", 1, 4, DEFAULT_NUM_FRAMES, 1,
                                    help="Fewer frames = faster analysis.")
    audio_duration = st.slider("Target Audio Duration (s)", 3, 15, DEFAULT_AUDIO_DURATION_S, 1, # Max 15s for CPU
                               help="Shorter = MUCH faster on CPU. MusicGen is slow.")

    st.subheader("MusicGen Parameters")
    guidance = st.slider("Guidance Scale", 1.0, 7.0, 3.0, 0.5)
    temperature = st.slider("Temperature", 0.5, 1.5, 1.0, 0.1)

    mix_audio = False
    if MOVIEPY_AVAILABLE:
        st.subheader("Video Output")
        mix_audio = st.checkbox("Mix with original video audio", value=False)

# --- Main Application ---
uploaded_file = st.file_uploader("📤 Upload Video (MP4, MOV, AVI - short clips best):", type=["mp4", "mov", "avi"])

if 'generated_audio_path_sess' not in st.session_state:
    st.session_state.generated_audio_path_sess = None
if 'output_video_path_sess' not in st.session_state:
    st.session_state.output_video_path_sess = None

if uploaded_file is not None:
    st.video(uploaded_file)

    if st.button("✨ Generate Sound Design!", type="primary", use_container_width=True):
        # --- Clear previous results from session state and disk ---
        for key in ['generated_audio_path_sess', 'output_video_path_sess']:
            if st.session_state.get(key) and os.path.exists(st.session_state[key]):
                try: os.remove(st.session_state[key])
                except Exception as e_rem: print(f"Error removing old temp file: {e_rem}")
            st.session_state[key] = None
        clear_memory() # General memory clear

        temp_video_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_v:
                tmp_v.write(uploaded_file.read()) # Use read() for BytesIO from uploader
                temp_video_path = tmp_v.name

            # === Stage 1: Frame Extraction ===
            st.markdown("--- \n### 1. Extracting Frames")
            frames = extract_frames_from_video(temp_video_path, num_frames_analysis)
            if not frames: st.error("Frame extraction failed. Cannot continue."); st.stop()
            st.success(f"Extracted {len(frames)} frames.")
            if frames:
                cols = st.columns(min(len(frames), MAX_FRAMES_TO_SHOW_UI))
                for i, frame_img in enumerate(frames[:len(cols)]):
                    cols[i].image(frame_img, caption=f"Frame {i+1}", use_column_width=True)

            # === Stage 2: Image Captioning (Sound Prompt Generation) ===
            st.markdown("--- \n### 2. Analyzing Frames for Sound Ideas (BLIP)")
            cap_proc, cap_model = load_image_caption_model_and_processor()
            sound_prompt = "ambient environmental sounds" # Default
            if cap_proc and cap_model:
                sound_prompt = generate_sound_prompt_from_frames(frames, cap_proc, cap_model)
                clear_memory(cap_model, cap_proc) # Unload BLIP
            else: st.error("BLIP model failed to load. Using default sound prompt.")
            st.info(f"✍️ **Sound Prompt for MusicGen:** {sound_prompt}")

            # === Stage 3: Audio Generation ===
            st.markdown("--- \n### 3. Synthesizing Audio (MusicGen)")
            st.warning("🎧 This step is very slow on CPU. Your patience is appreciated!")
            aud_proc, aud_model = load_audio_gen_model_and_processor()
            gen_aud_arr, s_r = None, None
            if aud_proc and aud_model:
                gen_aud_arr, s_r = generate_audio_from_prompt(sound_prompt, audio_duration, aud_proc, aud_model, guidance, temperature)
                clear_memory(aud_model, aud_proc) # Unload MusicGen
            else: st.error("MusicGen model failed to load. Cannot generate audio.")

            if gen_aud_arr is not None and s_r is not None:
                st.success("Audio successfully generated!")
                st.audio(gen_aud_arr, sample_rate=s_r)
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_a_out:
                    scipy.io.wavfile.write(tmp_a_out.name, s_r, gen_aud_arr.astype(np.float32))
                    st.session_state.generated_audio_path_sess = tmp_a_out.name
                with open(st.session_state.generated_audio_path_sess, "rb") as f_aud:
                    st.download_button("📥 Download Audio Only (.wav)", f_aud, "generated_sound.wav", "audio/wav")

                # === Stage 4: (Optional) Video and Audio Syncing ===
                if MOVIEPY_AVAILABLE:
                    st.markdown("--- \n### 4. Combining Audio with Video")
                    with st.spinner("Processing video with new audio... (also can be slow)"):
                        out_vid_p = combine_audio_video(temp_video_path, gen_aud_arr, s_r, mix_audio)
                    if out_vid_p and os.path.exists(out_vid_p):
                        st.success("Video processing complete!")
                        st.video(out_vid_p)
                        st.session_state.output_video_path_sess = out_vid_p
                        with open(out_vid_p, "rb") as f_vid:
                            st.download_button("🎬 Download Video with New Sound (.mp4)", f_vid, "video_with_sound.mp4", "video/mp4")
                    elif MOVIEPY_AVAILABLE: st.error("Failed to combine audio and video.")
            else: st.error("Audio generation failed. Video syncing skipped.")
        except Exception as e_main:
            st.error(f"An unexpected error occurred in main processing: {e_main}")
            st.error(traceback.format_exc())
        finally:
            if temp_video_path and os.path.exists(temp_video_path): os.remove(temp_video_path)
            clear_memory() # Final general clear

# Show download buttons for files from a previous successful run in the same session
elif st.session_state.generated_audio_path_sess and os.path.exists(st.session_state.generated_audio_path_sess):
    st.markdown("---")
    st.write("Previously generated audio available:")
    st.audio(st.session_state.generated_audio_path_sess)
    with open(st.session_state.generated_audio_path_sess, "rb") as f_aud_prev:
        st.download_button("📥 Download Previous Audio (.wav)", f_aud_prev, "generated_sound_prev.wav", "audio/wav", key="prev_aud_dl")

    if st.session_state.output_video_path_sess and os.path.exists(st.session_state.output_video_path_sess) and MOVIEPY_AVAILABLE:
        st.markdown("---") # This might appear even if audio only was generated, so careful with flow
        st.write("Previously generated video available:")
        st.video(st.session_state.output_video_path_sess)
        with open(st.session_state.output_video_path_sess, "rb") as f_vid_prev:
            st.download_button("🎬 Download Previous Video (.mp4)", f_vid_prev, "video_with_sound_prev.mp4", "video/mp4", key="prev_vid_dl")

Added (new app.py):

import streamlit as st
import imageio
import numpy as np
from PIL import Image
from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
import soundfile as sf
import torch
import os
import tempfile

# Try importing moviepy with fallback
try:
    import moviepy.editor as mpy
except ModuleNotFoundError:
    st.error("The 'moviepy' library is not installed. Please ensure 'moviepy==1.0.3' is in requirements.txt and installed.")
    st.stop()

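# Note: `moviepy.editor` is the MoviePy 1.x entry point; MoviePy 2.x removed
# that module, so the 1.0.3 pin mentioned in the error message above matters.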

# Set page title and instructions
st.title("Story Video Sound Effect Sync Generator")
st.write("Upload an MP4 video to auto-generate and sync a high-quality sound effect.")

# User-configurable settings
num_frames_to_extract = st.slider("Number of frames to analyze", 1, 3, 1, help="Fewer frames = faster processing")
mix_original_audio = st.checkbox("Mix with original audio", value=False, help="Blend sound effect with video’s original sound")

# Enhanced prompt generation function
def enhance_prompt(base_description):
    """Generate a detailed, sound-specific prompt from BLIP caption."""
    base = base_description.lower().strip()

    # Define action, object, and environment keywords
    actions = {
        "walk": "crisp footsteps on a wooden floor",
        "run": "rapid footsteps and heavy breathing",
        "drive": "engine roar and tires screeching",
        "talk": "soft voices and background murmur",
        "crash": "loud crash and debris scattering",
        "fall": "thud of impact and rustling debris"
    }
    objects = {
        "person": "human activity with subtle breathing",
        "dog": "playful barks and pawsteps",
        "car": "mechanical hum and tire friction",
        "tree": "rustling leaves in a breeze",
        "forest": "gentle wind and distant bird calls"
    }
    environments = {
        "room": "echoing footsteps and muffled sounds",
        "street": "distant traffic and urban hum",
        "forest": "wind through trees and twigs snapping",
        "outside": "open air with faint wind"
    }

    # Extract key elements from the caption
    sound_description = ""
    for action, sound in actions.items():
        if action in base:
            sound_description = sound
            break
    if not sound_description: # Default to subtle ambient if no action
        sound_description = "subtle ambient hum"

    # Add object-specific sounds
    for obj, sound in objects.items():
        if obj in base:
            sound_description += f" and {sound}"
            break

    # Add environment if detected
    for env, sound in environments.items():
        if env in base:
            sound_description += f" in a {env} with {sound}"
            break

    # Construct final prompt
    return f"{base} with {sound_description}"

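# A quick trace of enhance_prompt (not part of the app): for the BLIP caption
# "a dog running in the forest", "run" matches first in actions, "dog" in
# objects, and "forest" in environments, giving:
#   "a dog running in the forest with rapid footsteps and heavy breathing and
#    playful barks and pawsteps in a forest with wind through trees and twigs
#    snapping"
# Matching is plain substring, so e.g. "walk" would also hit "sidewalk".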

# File uploader for video
uploaded_file = st.file_uploader("Upload an MP4 video (high resolution)", type=["mp4"])

if uploaded_file is not None:
    try:
        # Temporary video file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
            temp_video.write(uploaded_file.getbuffer())
            temp_video_path = temp_video.name

        # Progress bar setup
        progress_bar = st.progress(0)
        status_text = st.empty()

        # Extract frames
        status_text.text("Extracting frames...")
        video = imageio.get_reader(temp_video_path, "ffmpeg")
        total_frames = len(list(video.iter_data()))
        step = max(1, total_frames // num_frames_to_extract)
        frames = [
            Image.fromarray(video.get_data(i))
            for i in range(0, min(total_frames, num_frames_to_extract * step), step)
        ][:num_frames_to_extract]
        progress_bar.progress(20)

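        # Note: len(list(video.iter_data())) above decodes every frame just to
        # count them. imageio's ffmpeg reader can usually report this directly —
        # a cheaper sketch on the same reader object:
        #     total_frames = video.count_frames()
        # with a fallback to duration * fps from video.get_meta_data() when
        # count_frames() is unavailable for the container.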

        # Load BLIP model
        @st.cache_resource
        def load_blip_model():
            processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
            model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
            if torch.cuda.is_available():
                model = model.half().to("cuda")
            return processor, model

        processor, model = load_blip_model()

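        # Note: with model.half(), the processor still emits float32 pixel_values,
        # which would need a matching cast (e.g. inputs["pixel_values"].half())
        # before generate() on CUDA; on a CPU-only Space this branch never runs.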

        # Generate and enhance text descriptions
        status_text.text("Analyzing frames...")
        descriptions = []
        for i, frame in enumerate(frames):
            inputs = processor(images=frame, return_tensors="pt")
            if torch.cuda.is_available():
                inputs = {k: v.to("cuda") for k, v in inputs.items()}
            out = model.generate(**inputs)
            base_description = processor.decode(out[0], skip_special_tokens=True)
            enhanced_description = enhance_prompt(base_description)
            descriptions.append(enhanced_description)
            progress_bar.progress(20 + int(30 * (i + 1) / len(frames)))

        text_prompt = ". ".join(descriptions)
        st.write("Enhanced text prompt:", text_prompt)

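        # Note: wrapping the generate() calls in this script in
        # `with torch.no_grad():` would skip autograd bookkeeping and save
        # memory during inference; outputs are unchanged.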

        # Load MusicGen model
        @st.cache_resource
        def load_musicgen_model():
            processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
            model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
            if torch.cuda.is_available():
                model = model.half().to("cuda")
            return processor, model

        musicgen_processor, musicgen_model = load_musicgen_model()

        # Generate the sound effect. MusicGen emits ~50 audio tokens per second,
        # so max_new_tokens=256 yields roughly 5 seconds of audio.
        status_text.text("Generating sound effect...")
        inputs = musicgen_processor(
            text=[text_prompt],
            padding=True,
            return_tensors="pt",
        )
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}
        audio_values = musicgen_model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            guidance_scale=3.0,
            top_k=50,
            top_p=0.95
        )
        # .float() guards against float16 output (when the model was halved on
        # CUDA), which soundfile cannot write directly
        audio_array = audio_values[0].cpu().float().numpy()
        if audio_array.ndim > 1:
            audio_array = audio_array.flatten()
        # Normalize to 90% full scale and clip for safety
        audio_array = audio_array / np.max(np.abs(audio_array)) * 0.9
        audio_array = np.clip(audio_array, -1.0, 1.0)
        sample_rate = 32000  # EnCodec rate used by MusicGen
        progress_bar.progress(60)

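        # To target a duration rather than a fixed token budget, the budget can
        # be derived from it (sketch; assumes MusicGen's 50 tokens-per-second
        # frame rate):
        #     desired_seconds = 8
        #     max_new_tokens = int(desired_seconds * 50)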

        # Save temporary audio
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            sf.write(temp_audio.name, audio_array, sample_rate)
            temp_audio_path = temp_audio.name

        # Synchronize with video using mpy
        status_text.text("Syncing audio with video...")
        video_clip = mpy.VideoFileClip(temp_video_path)
        video_duration = video_clip.duration
        audio_clip = mpy.AudioFileClip(temp_audio_path)

        # Adjust audio length
        if audio_clip.duration < video_duration:
            loops_needed = int(np.ceil(video_duration / audio_clip.duration))
            audio_clip = mpy.concatenate_audioclips([audio_clip] * loops_needed).subclip(0, video_duration)
        else:
            audio_clip = audio_clip.subclip(0, video_duration)

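        # ceil(video / audio) copies guarantee coverage before trimming, e.g. a
        # 5 s effect under a 12 s video loops 3 times (15 s), then subclip
        # trims back to 12 s.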

        # Mix or replace audio. CompositeAudioClip overlays the two tracks;
        # AudioClip objects do not support `+` for mixing in MoviePy 1.x.
        if mix_original_audio and video_clip.audio:
            final_audio = mpy.CompositeAudioClip(
                [video_clip.audio.volumex(0.5), audio_clip.volumex(0.5)]
            )
        else:
            final_audio = audio_clip

        # Set audio to video
        final_video = video_clip.set_audio(final_audio)

        # Save final video with high quality
        output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
        final_video.write_videofile(
            output_path,
            codec="libx264",
            audio_codec="aac",
            preset="medium", # Better quality than ultrafast
            bitrate="8000k", # Higher bitrate for video quality
            audio_bitrate="192k", # Good audio quality
            temp_audiofile="temp-audio.m4a",
            remove_temp=True
        )
        progress_bar.progress(90)

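        # Note: temp_audiofile="temp-audio.m4a" lands in the working directory,
        # so two concurrent sessions would clobber each other's file. A per-run
        # name is safer (sketch; needs `import uuid`):
        #     temp_audiofile=os.path.join(tempfile.gettempdir(), f"temp-audio-{uuid.uuid4().hex}.m4a")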

        # Provide playback and download
        status_text.text("Done!")
        st.video(output_path)
        with open(output_path, "rb") as video_file:
            st.download_button(
                label="Download Synced Video",
                data=video_file,
                file_name="synced_story_video.mp4",
                mime="video/mp4"
            )
        progress_bar.progress(100)

    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
        st.write("Try reducing frames or uploading a smaller video.")

    finally:
        # Clean up temp files, guarding each name in case an early failure
        # meant it was never assigned (the original `'path' in locals()` test
        # was always true inside the loop and could not prevent a NameError
        # when building the list itself)
        for name in ("temp_video_path", "temp_audio_path", "output_path"):
            path = globals().get(name)
            if path and os.path.exists(path):
                os.remove(path)
|