// Omnilingual ASR transcription demo: MinimapTimeline component (overview of the full timeline)
import React, { useRef, useEffect, useState, useCallback } from 'react';
import { ArrowDownTrayIcon } from '@heroicons/react/24/outline';
import { useTranscriptionStore } from '../stores/transcriptionStore';
interface MinimapTimelineProps {
audioRef: React.RefObject<HTMLAudioElement>;
videoRef: React.RefObject<HTMLVideoElement>;
canvasTimelineRef: React.RefObject<HTMLDivElement>; // Container that scrolls
}
export default function MinimapTimeline({
audioRef,
videoRef,
canvasTimelineRef
}: MinimapTimelineProps) {
const canvasRef = useRef<HTMLCanvasElement>(null);
const containerRef = useRef<HTMLDivElement>(null);
const [isDragging, setIsDragging] = useState(false);
const [dragStartX, setDragStartX] = useState(0);
const [dragStartScrollLeft, setDragStartScrollLeft] = useState(0);
const [waveformData, setWaveformData] = useState<number[]>([]);
const [viewport, setViewport] = useState({ start: 0, end: 30, visible: false });
const {
transcription,
preprocessedAudio,
currentTime,
} = useTranscriptionStore();
// Constants
const MINIMAP_HEIGHT = 80;
const PIXELS_PER_SECOND = 300; // Match the CanvasTimeline scaling
// Get media duration
const getMediaDuration = useCallback(() => {
const audioElement = audioRef.current;
const videoElement = videoRef.current;
if (audioElement && !isNaN(audioElement.duration)) {
return audioElement.duration;
}
if (videoElement && !isNaN(videoElement.duration)) {
return videoElement.duration;
}
return transcription?.total_duration || 0;
}, [audioRef, videoRef, transcription]);
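// Recomputed on every render: a media element's duration is NaN until its metadata loads,
// so we fall back to the transcription's reported total_duration in the meantime.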
const mediaDuration = getMediaDuration();
// Canvas width based on container
const [canvasWidth, setCanvasWidth] = useState(800);
// Update canvas width on resize
useEffect(() => {
const updateCanvasWidth = () => {
if (containerRef.current) {
setCanvasWidth(containerRef.current.clientWidth);
}
};
updateCanvasWidth();
window.addEventListener('resize', updateCanvasWidth);
return () => window.removeEventListener('resize', updateCanvasWidth);
}, []);
// Track Canvas Timeline scroll position and calculate viewport
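// Maps the Canvas Timeline's scrollLeft/clientWidth into a [start, end] time window,
// assuming the timeline is laid out at PIXELS_PER_SECOND pixels per second.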
const updateViewportFromScroll = useCallback(() => {
const canvasContainer = canvasTimelineRef.current;
if (!canvasContainer || mediaDuration === 0) return;
const scrollLeft = canvasContainer.scrollLeft;
const containerWidth = canvasContainer.clientWidth;
const totalCanvasWidth = mediaDuration * PIXELS_PER_SECOND;
// Calculate what time range is currently visible
const startTime = (scrollLeft / totalCanvasWidth) * mediaDuration;
const endTime = ((scrollLeft + containerWidth) / totalCanvasWidth) * mediaDuration;
setViewport({
start: Math.max(0, startTime),
end: Math.min(mediaDuration, endTime),
visible: true
});
}, [canvasTimelineRef, mediaDuration]);
// Listen for scroll events on the Canvas Timeline container
useEffect(() => {
const canvasContainer = canvasTimelineRef.current;
if (!canvasContainer) return;
const handleScroll = () => {
updateViewportFromScroll();
};
const handleLoadOrResize = () => {
// Update viewport when container size changes
updateViewportFromScroll();
};
// Initial viewport calculation
updateViewportFromScroll();
canvasContainer.addEventListener('scroll', handleScroll);
window.addEventListener('resize', handleLoadOrResize);
return () => {
canvasContainer.removeEventListener('scroll', handleScroll);
window.removeEventListener('resize', handleLoadOrResize);
};
}, [updateViewportFromScroll]);
// Generate waveform data from preprocessed audio
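// preprocessedAudio.data is treated as a base64-encoded WAV payload; it is decoded with the
// Web Audio API and reduced to one averaged amplitude value per minimap bar.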
const generateWaveformFromPreprocessedAudio = useCallback(async () => {
if (!preprocessedAudio?.data) {
console.log('No preprocessed audio data available');
return;
}
try {
console.log('Generating waveform from preprocessed audio data');
// Decode base64 audio data
const audioBytes = atob(preprocessedAudio.data);
const audioArrayBuffer = new ArrayBuffer(audioBytes.length);
const audioUint8Array = new Uint8Array(audioArrayBuffer);
for (let i = 0; i < audioBytes.length; i++) {
audioUint8Array[i] = audioBytes.charCodeAt(i);
}
// Create audio context and decode the WAV data
const audioContext = new (window.AudioContext || (window as any).webkitAudioContext)();
const audioBuffer = await audioContext.decodeAudioData(audioArrayBuffer);
// Close the temporary context once decoding is done so repeated regenerations don't leak audio resources
audioContext.close();
// Extract audio data
const channelData = audioBuffer.getChannelData(0);
const samples = Math.min(800, canvasWidth); // Limit samples for performance
const blockSize = Math.floor(channelData.length / samples);
const waveform: number[] = [];
for (let i = 0; i < samples; i++) {
const start = i * blockSize;
const end = Math.min(start + blockSize, channelData.length);
let sum = 0;
for (let j = start; j < end; j++) {
sum += Math.abs(channelData[j]);
}
waveform.push(end > start ? sum / (end - start) : 0); // Guard against zero-length blocks on very short audio
}
// Normalize waveform
const max = Math.max(...waveform);
const normalizedWaveform = max > 0 ? waveform.map(val => val / max) : waveform;
setWaveformData(normalizedWaveform);
console.log(`Generated waveform with ${normalizedWaveform.length} samples from preprocessed audio`);
} catch (error) {
console.error('Error generating waveform from preprocessed audio:', error);
// Fallback to segment-based visualization
generateFallbackWaveform();
}
}, [preprocessedAudio, canvasWidth]);
// Fallback waveform generation from segment data
const generateFallbackWaveform = useCallback(() => {
if (!transcription?.aligned_segments || mediaDuration === 0) return;
console.log('Using fallback waveform generation from segments');
const segments = transcription.aligned_segments;
const samples = Math.floor(Math.min(400, canvasWidth / 2)); // Array length must be an integer
const bars = new Array(samples).fill(0);
// Create waveform based on speech activity in segments
segments.forEach(segment => {
const startIndex = Math.floor((segment.start / mediaDuration) * samples);
const endIndex = Math.ceil((segment.end / mediaDuration) * samples);
for (let i = startIndex; i < Math.min(endIndex, samples); i++) {
// Use segment text length and duration to estimate intensity
const intensity = Math.min(1.0, segment.text.length / 50 + 0.3);
bars[i] = Math.max(bars[i], intensity * (0.7 + Math.random() * 0.3));
}
});
setWaveformData(bars);
console.log(`Generated fallback waveform with ${bars.length} samples`);
}, [transcription, mediaDuration, canvasWidth]);
// Generate waveform when preprocessed audio becomes available
useEffect(() => {
if (preprocessedAudio?.data) {
generateWaveformFromPreprocessedAudio();
} else if (transcription?.aligned_segments) {
// Use fallback if we have segments but no preprocessed audio
generateFallbackWaveform();
}
}, [preprocessedAudio, transcription, generateWaveformFromPreprocessedAudio, generateFallbackWaveform]);
// Draw the minimap
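// Layers, back to front: dark background, waveform bars, per-segment strips along the bottom
// edge, the red playhead, then the highlighted viewport of the Canvas Timeline.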
const draw = useCallback(() => {
const canvas = canvasRef.current;
if (!canvas || mediaDuration === 0) return;
const ctx = canvas.getContext('2d');
if (!ctx) return;
const { width, height } = canvas;
// Clear canvas
ctx.clearRect(0, 0, width, height);
// Draw background
ctx.fillStyle = '#1a1a1a';
ctx.fillRect(0, 0, width, height);
// Draw waveform
if (waveformData.length > 0) {
ctx.fillStyle = '#4a5568';
const barWidth = width / waveformData.length;
waveformData.forEach((amplitude, index) => {
const barHeight = amplitude * (height - 20);
const x = index * barWidth;
const y = (height - barHeight) / 2;
ctx.fillRect(x, y, Math.max(1, barWidth - 1), barHeight);
});
}
// Draw segments as colored bars
if (transcription?.aligned_segments) {
transcription.aligned_segments.forEach((segment, index) => {
const startX = (segment.start / mediaDuration) * width;
const endX = (segment.end / mediaDuration) * width;
const segmentWidth = endX - startX;
// Alternate colors for segments
ctx.fillStyle = index % 2 === 0 ? '#3182ce' : '#38a169';
ctx.fillRect(startX, height - 4, segmentWidth, 4);
});
}
// Draw current time indicator
const currentTimeX = (currentTime / mediaDuration) * width;
ctx.strokeStyle = '#f56565';
ctx.lineWidth = 2;
ctx.beginPath();
ctx.moveTo(currentTimeX, 0);
ctx.lineTo(currentTimeX, height);
ctx.stroke();
// Draw viewport region (what's visible in Canvas Timeline)
if (viewport.visible) {
const viewportStartX = (viewport.start / mediaDuration) * width;
const viewportEndX = (viewport.end / mediaDuration) * width;
// Draw viewport selection area (visible region highlight)
ctx.fillStyle = 'rgba(66, 153, 225, 0.3)';
ctx.fillRect(viewportStartX, 0, viewportEndX - viewportStartX, height);
// Draw left boundary line (start of visible area)
ctx.strokeStyle = '#4299e1';
ctx.lineWidth = 3;
ctx.beginPath();
ctx.moveTo(viewportStartX, 0);
ctx.lineTo(viewportStartX, height);
ctx.stroke();
// Draw right boundary line (end of visible area)
ctx.beginPath();
ctx.moveTo(viewportEndX, 0);
ctx.lineTo(viewportEndX, height);
ctx.stroke();
// Draw border around visible area
ctx.strokeStyle = '#4299e1';
ctx.lineWidth = 1;
ctx.strokeRect(viewportStartX, 0, viewportEndX - viewportStartX, height);
}
}, [waveformData, transcription, currentTime, viewport, mediaDuration]);
// Update canvas size and redraw
useEffect(() => {
const canvas = canvasRef.current;
if (canvas) {
canvas.width = canvasWidth;
canvas.height = MINIMAP_HEIGHT;
draw();
}
}, [canvasWidth, draw]);
// Redraw when dependencies change
useEffect(() => {
draw();
}, [draw]);
// Utility function to get time from X coordinate
const getTimeFromX = useCallback((x: number) => {
return (x / canvasWidth) * mediaDuration;
}, [canvasWidth, mediaDuration]);
// Check if clicking inside the viewport region
const isClickingViewport = useCallback((x: number) => {
if (!viewport.visible) return false;
const viewportStartX = (viewport.start / mediaDuration) * canvasWidth;
const viewportEndX = (viewport.end / mediaDuration) * canvasWidth;
return x >= viewportStartX && x <= viewportEndX;
}, [viewport, mediaDuration, canvasWidth]);
// Scroll Canvas Timeline to show specific time
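// The target time ends up at the left edge of the Canvas Timeline's visible area.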
const scrollToTime = useCallback((time: number) => {
const canvasContainer = canvasTimelineRef.current;
if (!canvasContainer) return;
const totalCanvasWidth = mediaDuration * PIXELS_PER_SECOND;
const targetScrollLeft = Math.max(0, (time / mediaDuration) * totalCanvasWidth);
canvasContainer.scrollLeft = targetScrollLeft;
}, [canvasTimelineRef, mediaDuration]);
// Mouse event handlers
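// Interaction model: pressing inside the highlighted viewport starts a drag that pans the
// Canvas Timeline; pressing anywhere else jumps the timeline straight to that position.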
const handleMouseDown = useCallback((e: React.MouseEvent) => {
const rect = canvasRef.current?.getBoundingClientRect();
if (!rect) return;
const x = e.clientX - rect.left;
if (isClickingViewport(x)) {
// Start dragging the viewport
setIsDragging(true);
setDragStartX(x);
const canvasContainer = canvasTimelineRef.current;
if (canvasContainer) {
setDragStartScrollLeft(canvasContainer.scrollLeft);
}
} else {
// Click outside viewport - jump to that position
const clickTime = getTimeFromX(x);
scrollToTime(clickTime);
}
}, [isClickingViewport, canvasTimelineRef, getTimeFromX, scrollToTime]);
const handleMouseMove = useCallback((e: React.MouseEvent | MouseEvent) => {
if (!isDragging) return;
const rect = canvasRef.current?.getBoundingClientRect();
if (!rect) return;
const x = e.clientX - rect.left;
const deltaX = x - dragStartX;
const canvasContainer = canvasTimelineRef.current;
if (!canvasContainer) return;
// Convert deltaX to scroll delta
const totalCanvasWidth = mediaDuration * PIXELS_PER_SECOND;
const scrollDelta = (deltaX / canvasWidth) * totalCanvasWidth;
const newScrollLeft = Math.max(0, Math.min(
dragStartScrollLeft + scrollDelta,
canvasContainer.scrollWidth - canvasContainer.clientWidth
));
canvasContainer.scrollLeft = newScrollLeft;
}, [isDragging, dragStartX, dragStartScrollLeft, canvasTimelineRef, mediaDuration, canvasWidth]);
const handleMouseUp = useCallback(() => {
setIsDragging(false);
}, []);
// Add global mouse event listeners when dragging
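// Listening on document keeps the drag tracking even when the pointer leaves the minimap canvas.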
useEffect(() => {
if (isDragging) {
const handleGlobalMouseMove = (e: MouseEvent) => {
// handleMouseMove also accepts the native MouseEvent, so no unsafe cast is needed
handleMouseMove(e);
};
const handleGlobalMouseUp = () => {
handleMouseUp();
};
document.addEventListener('mousemove', handleGlobalMouseMove);
document.addEventListener('mouseup', handleGlobalMouseUp);
return () => {
document.removeEventListener('mousemove', handleGlobalMouseMove);
document.removeEventListener('mouseup', handleGlobalMouseUp);
};
}
}, [isDragging, handleMouseMove, handleMouseUp]);
// Change cursor based on hover position
const handleMouseHover = useCallback((e: React.MouseEvent) => {
if (isDragging) return;
const rect = canvasRef.current?.getBoundingClientRect();
if (!rect) return;
const x = e.clientX - rect.left;
const canvas = canvasRef.current;
if (!canvas) return;
if (isClickingViewport(x)) {
canvas.style.cursor = 'move';
} else {
canvas.style.cursor = 'pointer';
}
}, [isDragging, isClickingViewport]);
// Download preprocessed audio as WAV file
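// Reuses the same base64 payload as the waveform generation and triggers the download
// through a temporary object URL and anchor element.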
const downloadPreprocessedAudio = useCallback(() => {
if (!preprocessedAudio?.data) {
console.error('No preprocessed audio data available');
return;
}
try {
// Decode base64 audio data
const audioBytes = atob(preprocessedAudio.data);
const audioArrayBuffer = new ArrayBuffer(audioBytes.length);
const audioUint8Array = new Uint8Array(audioArrayBuffer);
for (let i = 0; i < audioBytes.length; i++) {
audioUint8Array[i] = audioBytes.charCodeAt(i);
}
// Create blob and download
const blob = new Blob([audioUint8Array], { type: 'audio/wav' });
const url = URL.createObjectURL(blob);
// Get original filename without extension
const { file } = useTranscriptionStore.getState();
const originalName = file?.name?.replace(/\.[^/.]+$/, '') || 'audio';
const filename = `${originalName}_preprocessed_16khz_mono_normalized.wav`;
// Create download link
const link = document.createElement('a');
link.href = url;
link.download = filename;
document.body.appendChild(link);
link.click();
document.body.removeChild(link);
// Clean up URL
URL.revokeObjectURL(url);
console.log(`Downloaded preprocessed audio: ${filename}`);
} catch (error) {
console.error('Error downloading preprocessed audio:', error);
}
}, [preprocessedAudio]);
if (!transcription || mediaDuration === 0) {
return null;
}
return (
<div className="bg-gray-800 border-b border-gray-700">
<div className="px-4 py-2">
<div className="flex justify-between items-center text-xs text-gray-400 mb-1">
<div className="flex items-center gap-2">
<span>
Overview - Full Timeline ({Math.round(mediaDuration)}s)
{preprocessedAudio ? ' • Preprocessed Waveform' : ' • Segment-Based View'}
</span>
{preprocessedAudio && (
<div className="tooltip tooltip-bottom" data-tip="Download preprocessed audio as WAV file (16kHz, mono, layer-normalized). This is the exact audio data processed by the AI transcription model after conversion and standardization from the original file.">
<button
onClick={downloadPreprocessedAudio}
className="flex items-center gap-1 px-1.5 py-0.5 text-xs bg-gray-600 hover:bg-gray-500 rounded transition-colors text-white"
>
<ArrowDownTrayIcon className="w-3 h-3" />
.wav
</button>
</div>
)}
</div>
{viewport.visible && (
<span>
Visible: {viewport.start.toFixed(1)}s - {viewport.end.toFixed(1)}s
({Math.round(viewport.end - viewport.start)}s view)
</span>
)}
</div>
<div
ref={containerRef}
className="relative"
style={{ height: MINIMAP_HEIGHT }}
>
<canvas
ref={canvasRef}
onMouseDown={handleMouseDown}
onMouseMove={handleMouseHover}
className="block w-full h-full"
style={{
width: '100%',
height: MINIMAP_HEIGHT,
}}
/>
</div>
</div>
</div>
);
}