import streamlit as st
import imageio
import numpy as np
from PIL import Image
from transformers import AutoProcessor, BlipForConditionalGeneration, MusicgenForConditionalGeneration
import soundfile as sf
import torch
import os
import tempfile

# Try importing moviepy, with a clear error if it is missing
try:
    import moviepy.editor as mpy
except ModuleNotFoundError:
    st.error("The 'moviepy' library is not installed. Please ensure 'moviepy==1.0.3' is in requirements.txt and installed.")
    st.stop()

# Set page title and instructions
st.title("Story Video Sound Effect Sync Generator")
st.write("Upload an MP4 video to auto-generate and sync a high-quality sound effect.")

# User-configurable settings
num_frames_to_extract = st.slider("Number of frames to analyze", 1, 3, 1, help="Fewer frames = faster processing")
mix_original_audio = st.checkbox("Mix with original audio", value=False, help="Blend the sound effect with the video's original sound")


# Enhanced prompt generation function
def enhance_prompt(base_description):
    """Generate a detailed, sound-specific prompt from a BLIP caption."""
    base = base_description.lower().strip()

    # Keyword-to-sound mappings for actions, objects, and environments
    actions = {
        "walk": "crisp footsteps on a wooden floor",
        "run": "rapid footsteps and heavy breathing",
        "drive": "engine roar and tires screeching",
        "talk": "soft voices and background murmur",
        "crash": "loud crash and debris scattering",
        "fall": "thud of impact and rustling debris",
    }
    objects = {
        "person": "human activity with subtle breathing",
        "dog": "playful barks and pawsteps",
        "car": "mechanical hum and tire friction",
        "tree": "rustling leaves in a breeze",
        "forest": "gentle wind and distant bird calls",
    }
    environments = {
        "room": "echoing footsteps and muffled sounds",
        "street": "distant traffic and urban hum",
        "forest": "wind through trees and twigs snapping",
        "outside": "open air with faint wind",
    }

    # Start from the first matching action; fall back to a subtle ambient bed
    sound_description = ""
    for action, sound in actions.items():
        if action in base:
            sound_description = sound
            break
    if not sound_description:
        sound_description = "subtle ambient hum"

    # Layer in object-specific sounds
    for obj, sound in objects.items():
        if obj in base:
            sound_description += f" and {sound}"
            break

    # Add environment ambience if detected
    for env, sound in environments.items():
        if env in base:
            sound_description += f", set in a {env} with {sound}"
            break

    # Construct the final prompt
    return f"{base} with {sound_description}"


# File uploader for video
uploaded_file = st.file_uploader("Upload an MP4 video (high resolution)", type=["mp4"])

if uploaded_file is not None:
    # Pre-initialize temp paths so cleanup in `finally` never hits a NameError
    # if an exception fires before a later path is assigned
    temp_video_path = temp_audio_path = output_path = None
    try:
        # Write the upload to a temporary video file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
            temp_video.write(uploaded_file.getbuffer())
            temp_video_path = temp_video.name

        # Progress bar setup
        progress_bar = st.progress(0)
        status_text = st.empty()

        # Extract evenly spaced frames; count_frames() avoids decoding the
        # whole video twice the way len(list(video.iter_data())) would
        status_text.text("Extracting frames...")
        video = imageio.get_reader(temp_video_path, "ffmpeg")
        total_frames = video.count_frames()
        step = max(1, total_frames // num_frames_to_extract)
        frames = [
            Image.fromarray(video.get_data(i))
            for i in range(0, total_frames, step)
        ][:num_frames_to_extract]
        video.close()
        progress_bar.progress(20)

        # Load BLIP model (cached across Streamlit reruns)
        @st.cache_resource
        def load_blip_model():
            processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
            model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
            if torch.cuda.is_available():
                model = model.half().to("cuda")
            return processor, model

        processor, model = load_blip_model()

        # Generate and enhance text descriptions
        status_text.text("Analyzing frames...")
        descriptions = []
        for i, frame in enumerate(frames):
            inputs = processor(images=frame, return_tensors="pt")
            if torch.cuda.is_available():
                # Cast float inputs to fp16 to match the half-precision weights;
                # leave integer tensors (e.g. input IDs) untouched
                inputs = {k: v.to("cuda").half() if v.is_floating_point() else v.to("cuda")
                          for k, v in inputs.items()}
            out = model.generate(**inputs)
            base_description = processor.decode(out[0], skip_special_tokens=True)
            descriptions.append(enhance_prompt(base_description))
            progress_bar.progress(20 + int(30 * (i + 1) / len(frames)))

        text_prompt = ". ".join(descriptions)
        st.write("Enhanced text prompt:", text_prompt)

        # Load MusicGen model (cached across Streamlit reruns)
        @st.cache_resource
        def load_musicgen_model():
            processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
            model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
            if torch.cuda.is_available():
                model = model.half().to("cuda")
            return processor, model

        musicgen_processor, musicgen_model = load_musicgen_model()

        # Generate the sound effect (~5 seconds: MusicGen emits ~50 tokens/sec,
        # so 256 tokens ≈ 5 s of audio)
        status_text.text("Generating sound effect...")
        inputs = musicgen_processor(
            text=[text_prompt],
            padding=True,
            return_tensors="pt",
        )
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}
        audio_values = musicgen_model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            guidance_scale=3.0,
            top_k=50,
            top_p=0.95,
        )

        # Flatten to mono float32 (soundfile cannot write fp16) and normalize
        # to 90% full scale, guarding against an all-silent output
        audio_array = audio_values[0].cpu().float().numpy()
        if audio_array.ndim > 1:
            audio_array = audio_array.flatten()
        peak = np.max(np.abs(audio_array))
        if peak > 0:
            audio_array = audio_array / peak * 0.9
        audio_array = np.clip(audio_array, -1.0, 1.0)
        sample_rate = musicgen_model.config.audio_encoder.sampling_rate  # 32000 for musicgen-small
        progress_bar.progress(60)

        # Save temporary audio
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            sf.write(temp_audio.name, audio_array, sample_rate)
            temp_audio_path = temp_audio.name

        # Synchronize with video using moviepy
        status_text.text("Syncing audio with video...")
        video_clip = mpy.VideoFileClip(temp_video_path)
        video_duration = video_clip.duration
        audio_clip = mpy.AudioFileClip(temp_audio_path)

        # Loop or trim the generated audio to match the video duration
        if audio_clip.duration < video_duration:
            loops_needed = int(np.ceil(video_duration / audio_clip.duration))
            audio_clip = mpy.concatenate_audioclips([audio_clip] * loops_needed).subclip(0, video_duration)
        else:
            audio_clip = audio_clip.subclip(0, video_duration)

        # Mix with the original track or replace it outright; CompositeAudioClip
        # mixes the tracks (moviepy 1.0.3 AudioClips do not support `+` for this)
        if mix_original_audio and video_clip.audio:
            final_audio = mpy.CompositeAudioClip([
                video_clip.audio.volumex(0.5),
                audio_clip.volumex(0.5),
            ])
        else:
            final_audio = audio_clip

        # Attach the audio track to the video
        final_video = video_clip.set_audio(final_audio)

        # Save the final video with high-quality encoding settings
        output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
        final_video.write_videofile(
            output_path,
            codec="libx264",
            audio_codec="aac",
            preset="medium",       # Better quality than ultrafast
            bitrate="8000k",       # Higher bitrate for video quality
            audio_bitrate="192k",  # Good audio quality
            temp_audiofile="temp-audio.m4a",
            remove_temp=True,
        )
        # Release ffmpeg handles so the temp files can be deleted cleanly
        video_clip.close()
        audio_clip.close()
        progress_bar.progress(90)

        # Provide playback and download
        status_text.text("Done!")
        st.video(output_path)
        with open(output_path, "rb") as video_file:
            st.download_button(
                label="Download Synced Video",
                data=video_file,
                file_name="synced_story_video.mp4",
                mime="video/mp4",
            )
        progress_bar.progress(100)

    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
        st.write("Try reducing frames or uploading a smaller video.")
    finally:
        # Clean up whichever temporary files were actually created
        for path in (temp_video_path, temp_audio_path, output_path):
            if path and os.path.exists(path):
                os.remove(path)
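
# ---------------------------------------------------------------------------
# Usage note: a minimal sketch, assuming the script is saved as app.py (the
# filename is an assumption, not taken from the code above). Run with:
#
#   streamlit run app.py
#
# A requirements.txt inferred from the imports above; only the moviepy pin is
# stated in the code, the rest are unpinned assumptions (imageio-ffmpeg backs
# imageio's "ffmpeg" reader):
#
#   streamlit
#   imageio
#   imageio-ffmpeg
#   numpy
#   Pillow
#   transformers
#   soundfile
#   torch
#   moviepy==1.0.3
#
# Quick offline sanity check for enhance_prompt (the captions are hypothetical);
# uncomment to eyeball generated prompts without uploading a video:
#
#   for caption in ("a person walking in a forest", "a car driving down a street"):
#       print(enhance_prompt(caption))
# ---------------------------------------------------------------------------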