Spaces:
Running
Running
| /** | |
| * LFM2-VL Image Processor for WebGPU/ONNX Runtime Web | |
| * | |
| * Implements the image preprocessing logic from Lfm2VlImageProcessorFast: | |
| * 1. Split image into tiles (512x512) | |
| * 2. Extract 16x16 patches from each tile (32x32 = 1024 patches per tile) | |
| * 3. Flatten each patch to 768 values (16*16*3) | |
| * 4. Normalize: (pixel / 255 - 0.5) / 0.5 = pixel / 127.5 - 1 | |
| * | |
| * Output shapes match Python processor: | |
| * - pixel_values: [num_tiles, 1024, 768] | |
| * - pixel_attention_mask: [num_tiles, 1024] | |
| */ | |
// Configuration from preprocessor_config.json
// Keep in sync with the Python Lfm2VlImageProcessorFast configuration.
const CONFIG = {
  tileSize: 512,           // Each high-res tile is 512x512 pixels
  maxTiles: 10,            // Upper bound on tiles in a high-res grid
  minTiles: 2,             // Lower bound on tiles when splitting
  imageMean: [0.5, 0.5, 0.5],  // Per-channel mean for normalization
  imageStd: [0.5, 0.5, 0.5],   // Per-channel std for normalization
  rescaleFactor: 1 / 255,      // uint8 pixel -> [0, 1] rescale factor
  useThumbnail: false, // LFM2-VL-450M does not use thumbnail
  patchSize: 16, // Each patch is 16x16 pixels
  // NOTE: despite the name, this is patches per SIDE of a tile:
  // 512 / 16 = 32 per side, i.e. 32*32 = 1024 patches per tile.
  patchesPerTile: 32,
  downsampleFactor: 2,     // Patch-grid downsampling applied by the model
  minImageTokens: 64,      // Min image tokens after downsampling
  maxImageTokens: 256,     // Max image tokens after downsampling
  maxPixelsTolerance: 2.0, // Slack multiplier before an image must be tiled
};
// Pre-computed normalization constants for faster patch extraction.
// Formula: (pixel / 255 - 0.5) / 0.5 = pixel / 127.5 - 1.0
// (i.e. rescale by 1/255 then normalize with mean = std = 0.5,
// collapsed into one multiply-add per channel)
const NORM_SCALE = 1 / 127.5;
const NORM_OFFSET = -1.0;
// Pre-computed patch info for common live-caption resolutions.
// All entries are square and divisible by 32 (patchSize * downsampleFactor),
// so smartResize() would be a no-op for them; processImage uses this table
// to skip the resize computation entirely.
const PRECOMPUTED_SIZES = {
  256: { width: 256, height: 256, patchesH: 16, patchesW: 16 }, // 256/16 = 16
  384: { width: 384, height: 384, patchesH: 24, patchesW: 24 }, // 384/16 = 24
  448: { width: 448, height: 448, patchesH: 28, patchesW: 28 }, // 448/16 = 28
  512: { width: 512, height: 512, patchesH: 32, patchesW: 32 }, // 512/16 = 32
};
/**
 * Round `number` to the nearest multiple of `factor`.
 * @param {number} number - Value to round
 * @param {number} factor - Alignment factor (must be non-zero)
 * @returns {number} Closest multiple of `factor`
 */
function roundByFactor(number, factor) {
  const multiples = Math.round(number / factor);
  return multiples * factor;
}
/**
 * Round `number` up to the smallest multiple of `factor` that is >= number.
 * @param {number} number - Value to round up
 * @param {number} factor - Alignment factor (must be non-zero)
 * @returns {number} Smallest multiple of `factor` not below `number`
 */
function ceilByFactor(number, factor) {
  const multiples = Math.ceil(number / factor);
  return multiples * factor;
}
/**
 * Round `number` down to the largest multiple of `factor` that is <= number.
 * @param {number} number - Value to round down
 * @param {number} factor - Alignment factor (must be non-zero)
 * @returns {number} Largest multiple of `factor` not above `number`
 */
function floorByFactor(number, factor) {
  const multiples = Math.floor(number / factor);
  return multiples * factor;
}
/**
 * Pick the candidate [w, h] grid whose aspect ratio best matches the input.
 * Mirrors Python's find_closest_aspect_ratio(): the candidate with the
 * smallest |aspectRatio - w/h| wins; on an exact tie, a later candidate
 * replaces the current best only if the original image area exceeds half
 * of that candidate's target area.
 * @param {number} aspectRatio - Input image aspect ratio (width / height)
 * @param {Array<[number, number]>} targetRatios - Candidate [w, h] grids
 * @param {number} width - Original image width in pixels
 * @param {number} height - Original image height in pixels
 * @param {number} imageSize - Tile edge length in pixels
 * @returns {[number, number]} Best-matching [w, h] grid
 */
function findClosestAspectRatio(aspectRatio, targetRatios, width, height, imageSize) {
  let best = [1, 1];
  let bestDiff = Infinity;
  const inputArea = width * height;
  for (const candidate of targetRatios) {
    const [w, h] = candidate;
    const diff = Math.abs(aspectRatio - w / h);
    if (diff < bestDiff) {
      bestDiff = diff;
      best = candidate;
    } else if (diff === bestDiff && inputArea > 0.5 * imageSize * imageSize * w * h) {
      // Tie-break: prefer the grid whose area the original image can fill.
      best = candidate;
    }
  }
  return best;
}
/**
 * Decide whether an image exceeds the single-tile pixel budget and must be
 * split into tiles. Mirrors Python's _is_img_too_large().
 * @param {number} width - Image width in pixels
 * @param {number} height - Image height in pixels
 * @returns {boolean} True when the patch-aligned area exceeds the budget
 */
function isImageTooLarge(width, height) {
  const { patchSize, maxImageTokens, downsampleFactor, maxPixelsTolerance } = CONFIG;
  // Align each dimension to the patch grid (never below one patch).
  const alignedH = Math.max(patchSize, roundByFactor(height, patchSize));
  const alignedW = Math.max(patchSize, roundByFactor(width, patchSize));
  // Budget: max tokens * pixels-per-token, with a tolerance multiplier.
  const pixelBudget =
    maxImageTokens * patchSize * patchSize * downsampleFactor * downsampleFactor * maxPixelsTolerance;
  return alignedH * alignedW > pixelBudget;
}
/**
 * Compute resize targets such that both dimensions are divisible by
 * patchSize * downsampleFactor and the total pixel count falls inside
 * [minPixels, maxPixels]. Mirrors Python's _smart_resize().
 * @param {number} width - Original width in pixels
 * @param {number} height - Original height in pixels
 * @returns {{width: number, height: number}} Aligned target dimensions
 */
function smartResize(width, height) {
  const { patchSize, downsampleFactor, minImageTokens, maxImageTokens } = CONFIG;
  const alignment = patchSize * downsampleFactor; // 32
  const pixelsPerToken = patchSize * patchSize * downsampleFactor * downsampleFactor;
  const minPixels = minImageTokens * pixelsPerToken;
  const maxPixels = maxImageTokens * pixelsPerToken;
  // Start from the nearest aligned dimensions (never below one unit).
  let outH = Math.max(alignment, roundByFactor(height, alignment));
  let outW = Math.max(alignment, roundByFactor(width, alignment));
  if (outH * outW > maxPixels) {
    // Too large: shrink both sides proportionally, flooring onto the grid.
    const scale = Math.sqrt((height * width) / maxPixels);
    outH = Math.max(alignment, floorByFactor(height / scale, alignment));
    outW = Math.max(alignment, floorByFactor(width / scale, alignment));
  } else if (outH * outW < minPixels) {
    // Too small: grow both sides proportionally, ceiling onto the grid.
    const scale = Math.sqrt(minPixels / (height * width));
    outH = ceilByFactor(height * scale, alignment);
    outW = ceilByFactor(width * scale, alignment);
  }
  return { width: outW, height: outH };
}
/**
 * Number of image tokens produced for an image of the given dimensions:
 * the patch grid (floor division by patchSize) downsampled by
 * downsampleFactor with ceiling division. Mirrors Python's _get_tokens_num().
 * @param {number} height - Image height in pixels
 * @param {number} width - Image width in pixels
 * @returns {number} Token count after downsampling
 */
function getTokensNum(height, width) {
  const { patchSize, downsampleFactor } = CONFIG;
  const patchRows = Math.floor(height / patchSize);
  const patchCols = Math.floor(width / patchSize);
  const tokenRows = Math.ceil(patchRows / downsampleFactor);
  const tokenCols = Math.ceil(patchCols / downsampleFactor);
  return tokenRows * tokenCols;
}
/**
 * Calculate the optimal tile grid for an image.
 * Matches Python's _high_res_preprocessor() grid selection:
 * enumerate all (w, h) grids with minTiles <= w*h <= maxTiles, sort them by
 * total tile count, then pick the one whose aspect ratio best matches the
 * image via findClosestAspectRatio().
 * @param {number} width - Image width in pixels
 * @param {number} height - Image height in pixels
 * @returns {{rows: number, cols: number}} - Tile grid dimensions
 */
function calculateTileGrid(width, height) {
  const { tileSize, minTiles, maxTiles } = CONFIG;
  const aspectRatio = width / height;
  // Generate valid patch grid configurations (width, height)
  // Matches Python: [(w, h) for n in range(min_tiles, max_tiles + 1) for w in range(1, n + 1) for h in range(1, n + 1) if min_tiles <= w * h <= max_tiles]
  // PERF: the triple loop re-generates the same (w, h) pairs for every n;
  // dedupe with a Set (O(1) membership) instead of scanning the array with
  // `some()` (O(n)) per candidate. Insertion order is preserved, so the
  // stable sort below keeps the exact same tie order as before.
  const targetRatios = [];
  const seen = new Set();
  for (let n = minTiles; n <= maxTiles; n++) {
    for (let w = 1; w <= n; w++) {
      for (let h = 1; h <= n; h++) {
        const tiles = w * h;
        if (tiles < minTiles || tiles > maxTiles) continue;
        const key = w * (maxTiles + 1) + h; // unique since h <= maxTiles
        if (seen.has(key)) continue;
        seen.add(key);
        targetRatios.push([w, h]);
      }
    }
  }
  // Sort by total tiles (stable: ties keep generation order)
  targetRatios.sort((a, b) => (a[0] * a[1]) - (b[0] * b[1]));
  if (targetRatios.length === 0) {
    return { rows: 1, cols: 1 };
  }
  // Find best matching grid configuration
  const [gridWidth, gridHeight] = findClosestAspectRatio(
    aspectRatio, targetRatios, width, height, tileSize
  );
  return { rows: gridHeight, cols: gridWidth };
}
/**
 * Process an image into flattened, normalized patches for the VL model.
 * Matches Python's _resize_and_maybe_split() logic:
 *  - images over the pixel budget are resized to a tile grid and split into
 *    512x512 tiles (plus an optional trailing thumbnail);
 *  - smaller images are smart-resized into a single, possibly non-square tile.
 * @param {HTMLImageElement|HTMLCanvasElement|ImageData} image - Input image or raw ImageData
 * @returns {Promise<{pixelValues: Float32Array, attentionMask: BigInt64Array, spatialShapes: BigInt64Array, numTiles: number, shape: number[]}>}
 */
export async function processImage(image) {
  let width, height;
  let inputImageData = null; // Set when the caller hands us raw ImageData
  if (image instanceof ImageData) {
    // Direct ImageData input - the aligned fast path below skips canvases
    width = image.width;
    height = image.height;
    inputImageData = image;
  } else if (image instanceof HTMLImageElement) {
    width = image.naturalWidth;
    height = image.naturalHeight;
  } else {
    width = image.width;
    height = image.height;
  }
  // BUG FIX: CanvasRenderingContext2D.drawImage() does not accept ImageData
  // (it is not a CanvasImageSource), so ImageData inputs that needed tiling
  // or resizing previously threw a TypeError. Lazily paint the ImageData
  // onto a scratch canvas once and draw from that instead.
  let drawableCache = null;
  const getDrawable = () => {
    if (drawableCache === null) {
      if (inputImageData !== null) {
        const srcCanvas = document.createElement('canvas');
        srcCanvas.width = width;
        srcCanvas.height = height;
        srcCanvas.getContext('2d').putImageData(inputImageData, 0, 0);
        drawableCache = srcCanvas;
      } else {
        drawableCache = image;
      }
    }
    return drawableCache;
  };
  const { tileSize, patchSize, useThumbnail } = CONFIG;
  const patchesPerSide = CONFIG.patchesPerTile; // 32 (patches per tile side)
  const maxPatchesPerTile = patchesPerSide * patchesPerSide; // 1024
  const patchDim = patchSize * patchSize * 3; // 768
  // Check if image needs splitting (matches Python's _resize_and_maybe_split)
  const needsSplitting = isImageTooLarge(width, height);
  if (needsSplitting) {
    // HIGH-RES PATH: Split into tiles + optional thumbnail
    // Matches Python's _high_res_preprocessor()
    const { rows, cols } = calculateTileGrid(width, height);
    const totalGridTiles = rows * cols;
    // Only use tiling if we get more than 1 tile; otherwise fall through
    // to the single-tile path below.
    if (totalGridTiles > 1) {
      const numTiles = totalGridTiles + (useThumbnail ? 1 : 0);
      // Output arrays - use max patches per tile for uniform shape
      const pixelValues = new Float32Array(numTiles * maxPatchesPerTile * patchDim);
      const attentionMask = new BigInt64Array(numTiles * maxPatchesPerTile);
      const spatialShapes = new BigInt64Array(numTiles * 2);
      // STEP 1: Resize ENTIRE image to target grid dimensions (matches Python)
      const targetWidth = tileSize * cols;
      const targetHeight = tileSize * rows;
      const resizedCanvas = document.createElement('canvas');
      resizedCanvas.width = targetWidth;
      resizedCanvas.height = targetHeight;
      const resizedCtx = resizedCanvas.getContext('2d');
      resizedCtx.drawImage(getDrawable(), 0, 0, targetWidth, targetHeight);
      // STEP 2: Extract tiles by CROPPING from resized image (matches Python)
      let tileIdx = 0;
      for (let row = 0; row < rows; row++) {
        for (let col = 0; col < cols; col++) {
          const tileCanvas = document.createElement('canvas');
          tileCanvas.width = tileSize;
          tileCanvas.height = tileSize;
          const tileCtx = tileCanvas.getContext('2d');
          // Crop tile from resized image
          tileCtx.drawImage(
            resizedCanvas,
            col * tileSize, row * tileSize, tileSize, tileSize, // source crop
            0, 0, tileSize, tileSize // dest (same size, no scaling)
          );
          const tileData = tileCtx.getImageData(0, 0, tileSize, tileSize);
          extractPatchesFromFullTile(tileData, pixelValues, attentionMask, tileIdx, patchesPerSide, maxPatchesPerTile);
          // Spatial shape for this tile (full 32x32 patch grid)
          spatialShapes[tileIdx * 2] = BigInt(patchesPerSide); // height in patches
          spatialShapes[tileIdx * 2 + 1] = BigInt(patchesPerSide); // width in patches
          tileIdx++;
        }
      }
      // STEP 3: Add thumbnail LAST (matches Python - thumbnail is appended)
      // Thumbnail uses smart resize to variable dimensions (like single-tile path)
      if (useThumbnail) {
        const thumbResized = smartResize(width, height);
        const thumbWidth = thumbResized.width;
        const thumbHeight = thumbResized.height;
        const thumbCanvas = document.createElement('canvas');
        thumbCanvas.width = thumbWidth;
        thumbCanvas.height = thumbHeight;
        const thumbCtx = thumbCanvas.getContext('2d');
        thumbCtx.drawImage(getDrawable(), 0, 0, thumbWidth, thumbHeight);
        const thumbData = thumbCtx.getImageData(0, 0, thumbWidth, thumbHeight);
        // smartResize guarantees divisibility by patchSize, so these are integers
        const thumbPatchesH = thumbHeight / patchSize;
        const thumbPatchesW = thumbWidth / patchSize;
        extractPatchesFromVariableSize(thumbData, pixelValues, attentionMask, tileIdx, thumbPatchesH, thumbPatchesW, maxPatchesPerTile);
        // Spatial shape for thumbnail (variable based on smart resize)
        spatialShapes[tileIdx * 2] = BigInt(thumbPatchesH);
        spatialShapes[tileIdx * 2 + 1] = BigInt(thumbPatchesW);
        tileIdx++;
      }
      return {
        pixelValues,
        attentionMask,
        spatialShapes,
        numTiles,
        shape: [numTiles, maxPatchesPerTile, patchDim],
      };
    }
  }
  // SINGLE-TILE PATH: Smart resize only (no splitting)
  // Matches Python's else branch in _resize_and_maybe_split()
  let resizedWidth, resizedHeight, actualPatchesH, actualPatchesW;
  let imageData;
  // OPTIMIZATION: Check if dimensions are pre-computed (32-aligned live caption sizes)
  const precomputed = PRECOMPUTED_SIZES[width];
  const isAlreadyAligned = precomputed && width === height;
  if (inputImageData && isAlreadyAligned) {
    // FAST PATH: Direct ImageData with known dimensions - skip all resizing
    resizedWidth = width;
    resizedHeight = height;
    actualPatchesH = precomputed.patchesH;
    actualPatchesW = precomputed.patchesW;
    imageData = inputImageData;
  } else if (isAlreadyAligned) {
    // Dimensions already 32-aligned, skip smartResize computation
    resizedWidth = width;
    resizedHeight = height;
    actualPatchesH = precomputed.patchesH;
    actualPatchesW = precomputed.patchesW;
    // Still need to rasterize the image to get its ImageData
    const resizedCanvas = document.createElement('canvas');
    resizedCanvas.width = resizedWidth;
    resizedCanvas.height = resizedHeight;
    const resizedCtx = resizedCanvas.getContext('2d');
    resizedCtx.drawImage(getDrawable(), 0, 0, resizedWidth, resizedHeight);
    imageData = resizedCtx.getImageData(0, 0, resizedWidth, resizedHeight);
  } else {
    // Standard path: compute smart resize
    const resized = smartResize(width, height);
    resizedWidth = resized.width;
    resizedHeight = resized.height;
    actualPatchesH = resizedHeight / patchSize;
    actualPatchesW = resizedWidth / patchSize;
    // Create canvas at actual resized dimensions
    const resizedCanvas = document.createElement('canvas');
    resizedCanvas.width = resizedWidth;
    resizedCanvas.height = resizedHeight;
    const resizedCtx = resizedCanvas.getContext('2d');
    resizedCtx.drawImage(getDrawable(), 0, 0, resizedWidth, resizedHeight);
    imageData = resizedCtx.getImageData(0, 0, resizedWidth, resizedHeight);
  }
  const numTiles = 1;
  const pixelValues = new Float32Array(numTiles * maxPatchesPerTile * patchDim);
  const attentionMask = new BigInt64Array(numTiles * maxPatchesPerTile);
  const spatialShapes = new BigInt64Array(numTiles * 2);
  extractPatchesFromVariableSize(imageData, pixelValues, attentionMask, 0, actualPatchesH, actualPatchesW, maxPatchesPerTile);
  spatialShapes[0] = BigInt(actualPatchesH);
  spatialShapes[1] = BigInt(actualPatchesW);
  return {
    pixelValues,
    attentionMask,
    spatialShapes,
    numTiles,
    shape: [numTiles, maxPatchesPerTile, patchDim],
  };
}
/**
 * Extract all patches from a full 512x512 tile (every patch is valid, so the
 * attention mask is all ones and no padding is needed).
 * @param {ImageData} tileData - Tile image data (512x512, RGBA)
 * @param {Float32Array} pixelValues - Output pixel values array (written in place)
 * @param {BigInt64Array} attentionMask - Output attention mask array (written in place)
 * @param {number} tileIdx - Index of this tile within the output arrays
 * @param {number} patchesPerSide - Number of patches per side (32 for 512x512)
 * @param {number} maxPatchesPerTile - Max patches per tile for array indexing (1024)
 */
function extractPatchesFromFullTile(tileData, pixelValues, attentionMask, tileIdx, patchesPerSide, maxPatchesPerTile) {
  const patchSize = CONFIG.patchSize;
  const patchDim = patchSize * patchSize * 3; // RGB values per flattened patch
  const stride = tileData.width; // pixels per source row
  const src = tileData.data; // RGBA bytes
  const valuesBase = tileIdx * maxPatchesPerTile * patchDim;
  const maskBase = tileIdx * maxPatchesPerTile;
  for (let row = 0; row < patchesPerSide; row++) {
    for (let col = 0; col < patchesPerSide; col++) {
      const patchIdx = row * patchesPerSide + col;
      const originX = col * patchSize;
      const originY = row * patchSize;
      // Every patch in a full tile is valid
      attentionMask[maskBase + patchIdx] = 1n;
      // Flatten the patch row by row, normalizing each RGB channel with
      // the pre-computed constants: value * (1/127.5) - 1.0
      let dst = valuesBase + patchIdx * patchDim;
      for (let dy = 0; dy < patchSize; dy++) {
        let srcIdx = ((originY + dy) * stride + originX) * 4; // 4 = RGBA
        for (let dx = 0; dx < patchSize; dx++) {
          pixelValues[dst++] = src[srcIdx] * NORM_SCALE + NORM_OFFSET;
          pixelValues[dst++] = src[srcIdx + 1] * NORM_SCALE + NORM_OFFSET;
          pixelValues[dst++] = src[srcIdx + 2] * NORM_SCALE + NORM_OFFSET;
          srcIdx += 4; // skip alpha, advance to next pixel
        }
      }
    }
  }
}
/**
 * Extract patches from a variable-sized image and pad the remainder of the
 * tile slot up to maxPatchesPerTile.
 * Matches Python's convert_image_to_patches + pad_along_first_dim:
 * valid patches get mask 1; padded slots get mask 0 and zero pixel values.
 * @param {ImageData} imageData - Image data at actual dimensions (RGBA)
 * @param {Float32Array} pixelValues - Output pixel values array (written in place)
 * @param {BigInt64Array} attentionMask - Output attention mask array (written in place)
 * @param {number} tileIdx - Index of this tile within the output arrays
 * @param {number} patchesH - Number of patches in height
 * @param {number} patchesW - Number of patches in width
 * @param {number} maxPatchesPerTile - Max patches per tile for padding (1024)
 */
function extractPatchesFromVariableSize(imageData, pixelValues, attentionMask, tileIdx, patchesH, patchesW, maxPatchesPerTile) {
  const patchSize = CONFIG.patchSize;
  const patchDim = patchSize * patchSize * 3; // RGB values per flattened patch
  const stride = imageData.width; // pixels per source row
  const src = imageData.data; // RGBA bytes
  const valuesBase = tileIdx * maxPatchesPerTile * patchDim;
  const maskBase = tileIdx * maxPatchesPerTile;
  const validPatches = patchesH * patchesW;
  for (let row = 0; row < patchesH; row++) {
    for (let col = 0; col < patchesW; col++) {
      const patchIdx = row * patchesW + col;
      const originX = col * patchSize;
      const originY = row * patchSize;
      // Mark this patch as valid
      attentionMask[maskBase + patchIdx] = 1n;
      // Flatten the patch row by row, normalizing each RGB channel with
      // the pre-computed constants: value * (1/127.5) - 1.0
      let dst = valuesBase + patchIdx * patchDim;
      for (let dy = 0; dy < patchSize; dy++) {
        let srcIdx = ((originY + dy) * stride + originX) * 4; // 4 = RGBA
        for (let dx = 0; dx < patchSize; dx++) {
          pixelValues[dst++] = src[srcIdx] * NORM_SCALE + NORM_OFFSET;
          pixelValues[dst++] = src[srcIdx + 1] * NORM_SCALE + NORM_OFFSET;
          pixelValues[dst++] = src[srcIdx + 2] * NORM_SCALE + NORM_OFFSET;
          srcIdx += 4; // skip alpha, advance to next pixel
        }
      }
    }
  }
  // Zero-mask the padded remainder (pixelValues are already zero-initialized)
  for (let pad = validPatches; pad < maxPatchesPerTile; pad++) {
    attentionMask[maskBase + pad] = 0n;
  }
}
/**
 * Load an image from a URL or data URL.
 * @param {string} src - Image URL or data URL
 * @returns {Promise<HTMLImageElement>} Resolves with the decoded image
 * @throws {Error} Rejects with a descriptive Error when loading fails
 */
export function loadImage(src) {
  return new Promise((resolve, reject) => {
    const img = new Image();
    // Needed so canvas readback (getImageData) is not tainted by CORS
    img.crossOrigin = 'anonymous';
    img.onload = () => resolve(img);
    // FIX: reject with a real Error instead of the raw error Event so
    // callers get a stack trace and a useful message.
    img.onerror = () => reject(new Error(`Failed to load image: ${src}`));
    img.src = src;
  });
}