// LFM2-VL-450M-WebGPU / vl-processor.js
// shubeydoo's picture
// Liquid AI LFM2-VL-450M-WebGPU Demo
// accf76b
/**
* LFM2-VL Image Processor for WebGPU/ONNX Runtime Web
*
* Implements the image preprocessing logic from Lfm2VlImageProcessorFast:
* 1. Split image into tiles (512x512)
* 2. Extract 16x16 patches from each tile (32x32 = 1024 patches per tile)
* 3. Flatten each patch to 768 values (16*16*3)
* 4. Normalize: (pixel / 255 - 0.5) / 0.5 = pixel / 127.5 - 1
*
* Output shapes match Python processor:
* - pixel_values: [num_tiles, 1024, 768]
* - pixel_attention_mask: [num_tiles, 1024]
*/
// Preprocessing settings, taken from preprocessor_config.json
const CONFIG = {
  tileSize: 512, // edge length of one square tile, in pixels
  maxTiles: 10, // upper bound on tiles per image
  minTiles: 2, // lower bound on tiles once an image is split
  imageMean: [0.5, 0.5, 0.5],
  imageStd: [0.5, 0.5, 0.5],
  rescaleFactor: 1 / 255,
  useThumbnail: false, // LFM2-VL-450M does not use thumbnail
  patchSize: 16, // patches are 16x16 pixels
  patchesPerTile: 32, // patches along ONE side of a tile (512 / 16); 32 * 32 = 1024 per tile
  downsampleFactor: 2,
  minImageTokens: 64,
  maxImageTokens: 256,
  maxPixelsTolerance: 2.0,
};

// Normalization folded into a single multiply-add:
//   (pixel / 255 - 0.5) / 0.5  ==  pixel * NORM_SCALE + NORM_OFFSET
const NORM_SCALE = 1 / 127.5;
const NORM_OFFSET = -1.0;

// Square, 32-aligned frame sizes used by live captioning, keyed by side length.
// Each entry carries the frame size plus its patch grid (side / patchSize).
const PRECOMPUTED_SIZES = Object.fromEntries(
  [256, 384, 448, 512].map((side) => [
    side,
    {
      width: side,
      height: side,
      patchesH: side / CONFIG.patchSize,
      patchesW: side / CONFIG.patchSize,
    },
  ]),
);
/**
 * Round `number` to the nearest multiple of `factor`.
 *
 * Exact ties (a quotient whose fractional part is exactly .5) go to the EVEN
 * multiple, which is what Python's built-in round() does. Math.round() sends
 * ties toward +Infinity instead, so 336 with factor 32 (quotient 10.5) used to
 * come out as 352 where Python's round() yields 320.
 * NOTE(review): assumes the Python helper being ported is
 * `round(number / factor) * factor` - confirm against the upstream processor.
 *
 * @param {number} number - Value to round
 * @param {number} factor - Step the result must be a multiple of
 * @returns {number} The multiple of `factor` closest to `number`
 */
function roundByFactor(number, factor) {
  const quotient = number / factor;
  // NaN / +-Infinity have no fractional part to inspect; pass them through
  // the same way Math.round(quotient) * factor would.
  if (!Number.isFinite(quotient)) {
    return quotient * factor;
  }
  const lower = Math.floor(quotient);
  // The fractional part of a finite double is exactly representable,
  // so comparing it against 0.5 detects true ties only.
  const fraction = quotient - lower;
  let rounded;
  if (fraction < 0.5) {
    rounded = lower;
  } else if (fraction > 0.5) {
    rounded = lower + 1;
  } else {
    // Tie: pick whichever neighbour is even.
    rounded = lower % 2 === 0 ? lower : lower + 1;
  }
  return rounded * factor;
}
/**
 * Round `number` up to the next multiple of `factor`.
 * A value that is already a multiple is returned unchanged.
 *
 * @param {number} number - Value to round up
 * @param {number} factor - Step the result must be a multiple of
 * @returns {number} Smallest multiple of `factor` that is >= `number`
 */
function ceilByFactor(number, factor) {
  const steps = Math.ceil(number / factor);
  return steps * factor;
}
/**
 * Round `number` down to the previous multiple of `factor`.
 * A value that is already a multiple is returned unchanged.
 *
 * @param {number} number - Value to round down
 * @param {number} factor - Step the result must be a multiple of
 * @returns {number} Largest multiple of `factor` that is <= `number`
 */
function floorByFactor(number, factor) {
  const steps = Math.floor(number / factor);
  return steps * factor;
}
/**
 * Pick the [cols, rows] grid whose aspect ratio is nearest to the image's.
 * Counterpart of Python's find_closest_aspect_ratio().
 *
 * Candidates are scanned in order. A strictly closer candidate always wins.
 * A candidate that is exactly as close as the current best replaces it only
 * when the image area exceeds half of the area that grid would cover
 * (imageSize^2 * cols * rows).
 *
 * @param {number} aspectRatio - Image width / height
 * @param {Array<number[]>} targetRatios - Candidate grids as [cols, rows]
 * @param {number} width - Image width in pixels
 * @param {number} height - Image height in pixels
 * @param {number} imageSize - Edge length of one tile in pixels
 * @returns {number[]} The chosen entry of targetRatios, or [1, 1] if none was selected
 */
function findClosestAspectRatio(aspectRatio, targetRatios, width, height, imageSize) {
  const imageArea = width * height;
  let winner = [1, 1];
  let smallestGap = Infinity;

  for (const candidate of targetRatios) {
    const gap = Math.abs(aspectRatio - candidate[0] / candidate[1]);

    if (gap < smallestGap) {
      smallestGap = gap;
      winner = candidate;
      continue;
    }
    if (gap !== smallestGap) {
      continue;
    }

    // Tie on aspect ratio: take the newer grid only if the image is big enough for it
    const gridArea = imageSize * imageSize * candidate[0] * candidate[1];
    if (imageArea > 0.5 * gridArea) {
      winner = candidate;
    }
  }

  return winner;
}
/**
 * Decide whether an image exceeds the single-tile pixel budget.
 * Counterpart of Python's _is_img_too_large().
 *
 * Both sides are snapped to a multiple of patchSize (never below one patch)
 * and the resulting area is compared with
 * maxImageTokens * patchSize^2 * downsampleFactor^2 * maxPixelsTolerance.
 *
 * @param {number} width - Image width in pixels
 * @param {number} height - Image height in pixels
 * @returns {boolean} true when the snapped area is strictly above the budget
 */
function isImageTooLarge(width, height) {
  const { patchSize, maxImageTokens, downsampleFactor, maxPixelsTolerance } = CONFIG;
  const snapToPatch = (side) => Math.max(patchSize, roundByFactor(side, patchSize));

  const snappedHeight = snapToPatch(height);
  const snappedWidth = snapToPatch(width);
  const pixelBudget = maxImageTokens * (patchSize ** 2) * (downsampleFactor ** 2) * maxPixelsTolerance;

  return snappedHeight * snappedWidth > pixelBudget;
}
/**
 * Choose output dimensions for the single-tile path.
 * Counterpart of Python's _smart_resize().
 *
 * Each side is first snapped to a multiple of patchSize * downsampleFactor
 * (at least one such step). If the snapped area is above
 * maxImageTokens * patchSize^2 * downsampleFactor^2 the original sides are
 * scaled down by a common factor and floored to the step; if it is below the
 * minImageTokens equivalent they are scaled up and ceiled to the step.
 *
 * @param {number} width - Original width in pixels
 * @param {number} height - Original height in pixels
 * @returns {{width: number, height: number}} Target dimensions
 */
function smartResize(width, height) {
  const { patchSize, downsampleFactor, minImageTokens, maxImageTokens } = CONFIG;
  const step = patchSize * downsampleFactor; // 32 with the shipped config
  const minPixels = minImageTokens * (patchSize ** 2) * (downsampleFactor ** 2);
  const maxPixels = maxImageTokens * (patchSize ** 2) * (downsampleFactor ** 2);

  const snappedHeight = Math.max(step, roundByFactor(height, step));
  const snappedWidth = Math.max(step, roundByFactor(width, step));
  const snappedArea = snappedHeight * snappedWidth;

  if (snappedArea > maxPixels) {
    // Too many pixels: shrink both sides by the same ratio, rounding down
    const shrink = Math.sqrt((height * width) / maxPixels);
    const shrunkHeight = Math.max(step, floorByFactor(height / shrink, step));
    const shrunkWidth = Math.max(step, floorByFactor(width / shrink, step));
    return { width: shrunkWidth, height: shrunkHeight };
  }

  if (snappedArea < minPixels) {
    // Too few pixels: grow both sides by the same ratio, rounding up
    const grow = Math.sqrt(minPixels / (height * width));
    const grownHeight = ceilByFactor(height * grow, step);
    const grownWidth = ceilByFactor(width * grow, step);
    return { width: grownWidth, height: grownHeight };
  }

  return { width: snappedWidth, height: snappedHeight };
}
/**
 * Count the image tokens produced for an image of the given size.
 * Counterpart of Python's _get_tokens_num().
 *
 * Per axis: whole patches that fit (floor), then divided by downsampleFactor
 * and rounded up. The token count is the product of the two axes.
 *
 * @param {number} height - Image height in pixels
 * @param {number} width - Image width in pixels
 * @returns {number} Number of tokens
 */
function getTokensNum(height, width) {
  const { patchSize, downsampleFactor } = CONFIG;
  const tokensAlong = (pixels) => {
    const wholePatches = Math.floor(pixels / patchSize);
    return Math.ceil(wholePatches / downsampleFactor);
  };
  return tokensAlong(height) * tokensAlong(width);
}
/**
 * Choose the tile grid (rows x cols) used to split a large image.
 * Counterpart of the grid selection in Python's _high_res_preprocessor().
 *
 * Every [cols, rows] pair whose tile count lies in [minTiles, maxTiles] is a
 * candidate. Candidates are ordered by tile count (ties keep generation order)
 * and handed to findClosestAspectRatio() together with the image size.
 *
 * @param {number} width - Image width
 * @param {number} height - Image height
 * @returns {{rows: number, cols: number}} - Tile grid dimensions
 */
function calculateTileGrid(width, height) {
  const { tileSize, minTiles, maxTiles } = CONFIG;

  // Enumerate grids in the same order as the Python comprehension
  //   for n in range(min_tiles, max_tiles + 1)
  //     for w in range(1, n + 1) for h in range(1, n + 1)
  // keeping only the first occurrence of each pair.
  const seen = new Set();
  const candidates = [];
  for (let limit = minTiles; limit <= maxTiles; limit++) {
    for (let cols = 1; cols <= limit; cols++) {
      for (let rows = 1; rows <= limit; rows++) {
        const tiles = cols * rows;
        const key = `${cols}x${rows}`;
        if (tiles >= minTiles && tiles <= maxTiles && !seen.has(key)) {
          seen.add(key);
          candidates.push([cols, rows]);
        }
      }
    }
  }

  // Nothing to choose from: treat the image as a single tile
  if (candidates.length === 0) {
    return { rows: 1, cols: 1 };
  }

  // Fewest tiles first; the sort is stable so equal counts keep insertion order
  candidates.sort((left, right) => (left[0] * left[1]) - (right[0] * right[1]));

  const chosen = findClosestAspectRatio(width / height, candidates, width, height, tileSize);
  return { rows: chosen[1], cols: chosen[0] };
}
/**
 * Copy raw ImageData onto a fresh canvas.
 * drawImage() does not accept ImageData, so raw pixels must be placed on a
 * canvas before they can be scaled or cropped.
 * @param {ImageData} imageData - Raw RGBA pixels
 * @returns {HTMLCanvasElement} Canvas of the same size holding those pixels
 */
function imageDataToCanvas(imageData) {
  const canvas = document.createElement('canvas');
  canvas.width = imageData.width;
  canvas.height = imageData.height;
  canvas.getContext('2d').putImageData(imageData, 0, 0);
  return canvas;
}

/**
 * Draw `source` stretched to width x height on a new canvas.
 * @param {HTMLImageElement|HTMLCanvasElement} source - Anything drawImage() accepts
 * @param {number} width - Target width in pixels
 * @param {number} height - Target height in pixels
 * @returns {CanvasRenderingContext2D} 2D context of the canvas that was drawn
 */
function drawScaled(source, width, height) {
  const canvas = document.createElement('canvas');
  canvas.width = width;
  canvas.height = height;
  const ctx = canvas.getContext('2d');
  ctx.drawImage(source, 0, 0, width, height);
  return ctx;
}

/**
 * Fail loudly when a patch grid does not fit into one tile slot.
 * Without this check the surplus patches would be written past the end of the
 * output buffers, which typed arrays silently ignore.
 * @param {number} patchesH - Patch rows
 * @param {number} patchesW - Patch columns
 * @param {number} maxPatchesPerTile - Capacity of one tile slot
 * @throws {RangeError} If patchesH * patchesW exceeds the capacity
 */
function assertPatchesFit(patchesH, patchesW, maxPatchesPerTile) {
  if (patchesH * patchesW > maxPatchesPerTile) {
    throw new RangeError(
      `processImage: ${patchesH}x${patchesW} patch grid exceeds the ${maxPatchesPerTile}-patch tile capacity`,
    );
  }
}

/**
 * Process an image into flattened patches for VL model
 * Follows Python's _resize_and_maybe_split() logic:
 * - images above the single-tile budget are stretched onto a cols x rows grid
 *   of tileSize tiles and every tile is cut into patches (plus an optional
 *   thumbnail appended last when CONFIG.useThumbnail is set);
 * - all other images are resized once (or used as-is for the precomputed
 *   square sizes) and padded up to one full tile slot.
 *
 * @param {HTMLImageElement|HTMLCanvasElement|ImageData} image - Input image or raw ImageData
 * @returns {Promise<{pixelValues: Float32Array, attentionMask: BigInt64Array, spatialShapes: BigInt64Array, numTiles: number, shape: number[]}>}
 *   pixelValues is laid out as `shape` = [numTiles, maxPatchesPerTile, patchDim],
 *   attentionMask holds one 0/1 entry per patch slot, and spatialShapes holds
 *   [patch rows, patch cols] per tile.
 * @throws {Error} If the image reports a non-positive width or height
 * @throws {RangeError} If the resized image needs more patches than one tile slot holds
 */
export async function processImage(image) {
  const isRawImageData = image instanceof ImageData;

  let width;
  let height;
  if (!isRawImageData && image instanceof HTMLImageElement) {
    // Use the intrinsic size, not the layout size
    width = image.naturalWidth;
    height = image.naturalHeight;
  } else {
    width = image.width;
    height = image.height;
  }

  // An image that has not loaded (or an empty canvas) reports 0; NaN/undefined
  // also fail this test. Bail out before the math turns into NaN sizes.
  if (!(width > 0 && height > 0)) {
    throw new Error(`processImage: image has no pixels (${width}x${height}); is it fully loaded?`);
  }

  const { tileSize, patchSize, useThumbnail } = CONFIG;
  const patchesPerSide = CONFIG.patchesPerTile; // 32
  const maxPatchesPerTile = patchesPerSide * patchesPerSide; // 1024
  const patchDim = patchSize * patchSize * 3; // 768

  // ImageData cannot be passed to drawImage(); wrap it in a canvas on demand
  const getDrawable = () => (isRawImageData ? imageDataToCanvas(image) : image);

  if (isImageTooLarge(width, height)) {
    // HIGH-RES PATH: split into tiles + optional thumbnail
    const { rows, cols } = calculateTileGrid(width, height);
    const totalGridTiles = rows * cols;

    // Only use tiling if we get more than 1 tile
    if (totalGridTiles > 1) {
      const drawable = getDrawable();
      const numTiles = totalGridTiles + (useThumbnail ? 1 : 0);

      // Output arrays - every tile gets a full-size slot for a uniform shape
      const pixelValues = new Float32Array(numTiles * maxPatchesPerTile * patchDim);
      const attentionMask = new BigInt64Array(numTiles * maxPatchesPerTile);
      const spatialShapes = new BigInt64Array(numTiles * 2);

      // STEP 1: stretch the ENTIRE image onto the grid
      const gridCtx = drawScaled(drawable, tileSize * cols, tileSize * rows);

      // STEP 2: read each tile straight out of the grid canvas, row-major
      let tileIdx = 0;
      for (let row = 0; row < rows; row++) {
        for (let col = 0; col < cols; col++) {
          const tileData = gridCtx.getImageData(col * tileSize, row * tileSize, tileSize, tileSize);
          extractPatchesFromFullTile(tileData, pixelValues, attentionMask, tileIdx, patchesPerSide, maxPatchesPerTile);
          spatialShapes[tileIdx * 2] = BigInt(patchesPerSide); // height in patches
          spatialShapes[tileIdx * 2 + 1] = BigInt(patchesPerSide); // width in patches
          tileIdx++;
        }
      }

      // STEP 3: thumbnail goes LAST and uses smart resize (variable dimensions)
      if (useThumbnail) {
        const thumbSize = smartResize(width, height);
        const thumbPatchesH = thumbSize.height / patchSize;
        const thumbPatchesW = thumbSize.width / patchSize;
        assertPatchesFit(thumbPatchesH, thumbPatchesW, maxPatchesPerTile);

        const thumbData = drawScaled(drawable, thumbSize.width, thumbSize.height)
          .getImageData(0, 0, thumbSize.width, thumbSize.height);
        extractPatchesFromVariableSize(thumbData, pixelValues, attentionMask, tileIdx, thumbPatchesH, thumbPatchesW, maxPatchesPerTile);
        spatialShapes[tileIdx * 2] = BigInt(thumbPatchesH);
        spatialShapes[tileIdx * 2 + 1] = BigInt(thumbPatchesW);
        tileIdx++;
      }

      return {
        pixelValues,
        attentionMask,
        spatialShapes,
        numTiles,
        shape: [numTiles, maxPatchesPerTile, patchDim],
      };
    }
  }

  // SINGLE-TILE PATH: resize only (no splitting)
  const precomputed = PRECOMPUTED_SIZES[width];
  const isAlreadyAligned = Boolean(precomputed) && width === height;

  let resizedWidth = width;
  let resizedHeight = height;
  let actualPatchesH;
  let actualPatchesW;
  if (isAlreadyAligned) {
    // Known square size: keep the dimensions and reuse the stored patch grid
    actualPatchesH = precomputed.patchesH;
    actualPatchesW = precomputed.patchesW;
  } else {
    const resized = smartResize(width, height);
    resizedWidth = resized.width;
    resizedHeight = resized.height;
    actualPatchesH = resizedHeight / patchSize;
    actualPatchesW = resizedWidth / patchSize;
  }
  assertPatchesFit(actualPatchesH, actualPatchesW, maxPatchesPerTile);

  // FAST PATH: aligned raw ImageData is used directly, no canvas involved.
  // Everything else is drawn once at the target size and read back.
  const imageData = isRawImageData && isAlreadyAligned
    ? image
    : drawScaled(getDrawable(), resizedWidth, resizedHeight).getImageData(0, 0, resizedWidth, resizedHeight);

  const numTiles = 1;
  const pixelValues = new Float32Array(numTiles * maxPatchesPerTile * patchDim);
  const attentionMask = new BigInt64Array(numTiles * maxPatchesPerTile);
  const spatialShapes = new BigInt64Array(numTiles * 2);
  extractPatchesFromVariableSize(imageData, pixelValues, attentionMask, 0, actualPatchesH, actualPatchesW, maxPatchesPerTile);
  spatialShapes[0] = BigInt(actualPatchesH);
  spatialShapes[1] = BigInt(actualPatchesW);

  return {
    pixelValues,
    attentionMask,
    spatialShapes,
    numTiles,
    shape: [numTiles, maxPatchesPerTile, patchDim],
  };
}
/**
 * Cut a full tile into patches and write them, normalized, into the outputs.
 * Every patch of a full tile is valid, so each one gets attention mask 1.
 *
 * Patches are visited row by row. Inside a patch the pixels are flattened
 * row-major with the R, G, B channels interleaved (alpha is skipped), and each
 * channel value is mapped with `value * NORM_SCALE + NORM_OFFSET`.
 *
 * @param {ImageData} tileData - Tile image data (512x512)
 * @param {Float32Array} pixelValues - Output pixel values array
 * @param {BigInt64Array} attentionMask - Output attention mask array
 * @param {number} tileIdx - Index of this tile
 * @param {number} patchesPerSide - Number of patches per side (32 for 512x512)
 * @param {number} maxPatchesPerTile - Max patches per tile for array indexing (1024)
 */
function extractPatchesFromFullTile(tileData, pixelValues, attentionMask, tileIdx, patchesPerSide, maxPatchesPerTile) {
  const { patchSize } = CONFIG;
  const valuesPerPatch = patchSize * patchSize * 3;
  const rowStride = tileData.width;
  const rgba = tileData.data;

  // Start of this tile's slot in each output buffer
  const valuesBase = tileIdx * maxPatchesPerTile * valuesPerPatch;
  const maskBase = tileIdx * maxPatchesPerTile;

  let patchNo = 0;
  for (let gridRow = 0; gridRow < patchesPerSide; gridRow++) {
    const top = gridRow * patchSize;
    for (let gridCol = 0; gridCol < patchesPerSide; gridCol++) {
      const left = gridCol * patchSize;

      attentionMask[maskBase + patchNo] = 1n;

      // `dst` walks the output, `src` walks the RGBA source four bytes at a time
      let dst = valuesBase + patchNo * valuesPerPatch;
      for (let y = 0; y < patchSize; y++) {
        let src = ((top + y) * rowStride + left) * 4;
        for (let x = 0; x < patchSize; x++) {
          pixelValues[dst++] = rgba[src] * NORM_SCALE + NORM_OFFSET;
          pixelValues[dst++] = rgba[src + 1] * NORM_SCALE + NORM_OFFSET;
          pixelValues[dst++] = rgba[src + 2] * NORM_SCALE + NORM_OFFSET;
          src += 4;
        }
      }

      patchNo++;
    }
  }
}
/**
 * Cut an image of arbitrary patch grid into patches and pad the tile slot.
 * Counterpart of Python's convert_image_to_patches + pad_along_first_dim.
 *
 * The first patchesH * patchesW slots receive normalized pixels (row-major
 * patches, row-major pixels, interleaved R, G, B; alpha skipped) and mask 1.
 * The remaining slots up to maxPatchesPerTile get mask 0; their pixel values
 * are not written, so they keep whatever the output buffer already holds.
 *
 * @param {ImageData} imageData - Image data at actual dimensions
 * @param {Float32Array} pixelValues - Output pixel values array
 * @param {BigInt64Array} attentionMask - Output attention mask array
 * @param {number} tileIdx - Index of this tile
 * @param {number} patchesH - Number of patches in height
 * @param {number} patchesW - Number of patches in width
 * @param {number} maxPatchesPerTile - Max patches per tile for padding (1024)
 */
function extractPatchesFromVariableSize(imageData, pixelValues, attentionMask, tileIdx, patchesH, patchesW, maxPatchesPerTile) {
  const { patchSize } = CONFIG;
  const valuesPerPatch = patchSize * patchSize * 3;
  const rowStride = imageData.width;
  const rgba = imageData.data;

  // Start of this tile's slot in each output buffer
  const valuesBase = tileIdx * maxPatchesPerTile * valuesPerPatch;
  const maskBase = tileIdx * maxPatchesPerTile;

  let patchNo = 0;
  for (let gridRow = 0; gridRow < patchesH; gridRow++) {
    const top = gridRow * patchSize;
    for (let gridCol = 0; gridCol < patchesW; gridCol++) {
      const left = gridCol * patchSize;

      // Real image content
      attentionMask[maskBase + patchNo] = 1n;

      // `dst` walks the output, `src` walks the RGBA source four bytes at a time
      let dst = valuesBase + patchNo * valuesPerPatch;
      for (let y = 0; y < patchSize; y++) {
        let src = ((top + y) * rowStride + left) * 4;
        for (let x = 0; x < patchSize; x++) {
          pixelValues[dst++] = rgba[src] * NORM_SCALE + NORM_OFFSET;
          pixelValues[dst++] = rgba[src + 1] * NORM_SCALE + NORM_OFFSET;
          pixelValues[dst++] = rgba[src + 2] * NORM_SCALE + NORM_OFFSET;
          src += 4;
        }
      }

      patchNo++;
    }
  }

  // Padding slots: mask them out (pixel values are left as they are)
  const usedSlots = patchesH * patchesW;
  for (let slot = usedSlots; slot < maxPatchesPerTile; slot++) {
    attentionMask[maskBase + slot] = 0n;
  }
}
/**
 * Load an image from URL or data URL
 *
 * crossOrigin is set to 'anonymous' before the source is assigned so the
 * request is made in CORS mode.
 *
 * @param {string} src - Image URL or data URL
 * @returns {Promise<HTMLImageElement>} Resolves with the element once `onload` fires
 * @throws {Error} Rejects with an Error naming the source when loading fails;
 *   the original error event is attached as `cause`
 */
export function loadImage(src) {
  return new Promise((resolve, reject) => {
    const img = new Image();
    img.crossOrigin = 'anonymous';
    img.onload = () => resolve(img);
    img.onerror = (event) => {
      // The error event itself carries no message; reject with a real Error
      // instead. Data URLs can be huge, so only a prefix goes into the text.
      const text = String(src);
      const label = text.length > 120 ? `${text.slice(0, 120)}...` : text;
      reject(new Error(`Failed to load image: ${label}`, { cause: event }));
    };
    img.src = src;
  });
}