Spaces:
Running
Running
| /** | |
| * LFM2-VL Image Processor for WebGPU/ONNX Runtime Web | |
| * | |
| * Implements the image preprocessing logic from Lfm2VlImageProcessorFast: | |
| * 1. Split image into tiles (512x512) | |
| * 2. Extract 16x16 patches from each tile (32x32 = 1024 patches per tile) | |
| * 3. Flatten each patch to 768 values (16*16*3) | |
| * 4. Normalize: (pixel / 255 - 0.5) / 0.5 = pixel / 127.5 - 1 | |
| * | |
| * Output shapes match Python processor: | |
| * - pixel_values: [num_tiles, 1024, 768] | |
| * - pixel_attention_mask: [num_tiles, 1024] | |
| */ | |
// Configuration from preprocessor_config.json
// Keep in sync with the Python Lfm2VlImageProcessorFast configuration.
const CONFIG = {
  tileSize: 512,           // Each high-res tile is 512x512 pixels
  maxTiles: 10,            // Upper bound on tiles in a high-res grid
  minTiles: 2,             // Lower bound on tiles when splitting
  imageMean: [0.5, 0.5, 0.5],  // Per-channel mean for normalization
  imageStd: [0.5, 0.5, 0.5],   // Per-channel std for normalization
  rescaleFactor: 1 / 255,      // uint8 pixel -> [0, 1] rescale factor
  useThumbnail: false, // LFM2-VL-450M does not use thumbnail
  patchSize: 16, // Each patch is 16x16 pixels
  // NOTE: despite the name, this is patches per SIDE of a tile:
  // 512 / 16 = 32 per side, i.e. 32*32 = 1024 patches per tile.
  patchesPerTile: 32,
  downsampleFactor: 2,     // Patch-grid downsampling applied by the model
  minImageTokens: 64,      // Min image tokens after downsampling
  maxImageTokens: 256,     // Max image tokens after downsampling
  maxPixelsTolerance: 2.0, // Slack multiplier before an image must be tiled
};
// Pre-computed normalization constants for faster patch extraction.
// Formula: (pixel / 255 - 0.5) / 0.5 = pixel / 127.5 - 1.0
// (i.e. rescale by 1/255 then normalize with mean = std = 0.5,
// collapsed into one multiply-add per channel)
const NORM_SCALE = 1 / 127.5;
const NORM_OFFSET = -1.0;
// Pre-computed patch info for common live-caption resolutions.
// All entries are square and divisible by 32 (patchSize * downsampleFactor),
// so smartResize() would be a no-op for them; processImage uses this table
// to skip the resize computation entirely.
const PRECOMPUTED_SIZES = {
  256: { width: 256, height: 256, patchesH: 16, patchesW: 16 }, // 256/16 = 16
  384: { width: 384, height: 384, patchesH: 24, patchesW: 24 }, // 384/16 = 24
  448: { width: 448, height: 448, patchesH: 28, patchesW: 28 }, // 448/16 = 28
  512: { width: 512, height: 512, patchesH: 32, patchesW: 32 }, // 512/16 = 32
};
/**
 * Round `number` to the nearest multiple of `factor`.
 * @param {number} number - Value to round
 * @param {number} factor - Alignment factor (must be non-zero)
 * @returns {number} Closest multiple of `factor`
 */
function roundByFactor(number, factor) {
  const multiples = Math.round(number / factor);
  return multiples * factor;
}
/**
 * Round `number` up to the smallest multiple of `factor` that is >= number.
 * @param {number} number - Value to round up
 * @param {number} factor - Alignment factor (must be non-zero)
 * @returns {number} Smallest multiple of `factor` not below `number`
 */
function ceilByFactor(number, factor) {
  const multiples = Math.ceil(number / factor);
  return multiples * factor;
}
/**
 * Round `number` down to the largest multiple of `factor` that is <= number.
 * @param {number} number - Value to round down
 * @param {number} factor - Alignment factor (must be non-zero)
 * @returns {number} Largest multiple of `factor` not above `number`
 */
function floorByFactor(number, factor) {
  const multiples = Math.floor(number / factor);
  return multiples * factor;
}
/**
 * Pick the candidate [w, h] grid whose aspect ratio best matches the input.
 * Mirrors Python's find_closest_aspect_ratio(): the candidate with the
 * smallest |aspectRatio - w/h| wins; on an exact tie, a later candidate
 * replaces the current best only if the original image area exceeds half
 * of that candidate's target area.
 * @param {number} aspectRatio - Input image aspect ratio (width / height)
 * @param {Array<[number, number]>} targetRatios - Candidate [w, h] grids
 * @param {number} width - Original image width in pixels
 * @param {number} height - Original image height in pixels
 * @param {number} imageSize - Tile edge length in pixels
 * @returns {[number, number]} Best-matching [w, h] grid
 */
function findClosestAspectRatio(aspectRatio, targetRatios, width, height, imageSize) {
  let best = [1, 1];
  let bestDiff = Infinity;
  const inputArea = width * height;
  for (const candidate of targetRatios) {
    const [w, h] = candidate;
    const diff = Math.abs(aspectRatio - w / h);
    if (diff < bestDiff) {
      bestDiff = diff;
      best = candidate;
    } else if (diff === bestDiff && inputArea > 0.5 * imageSize * imageSize * w * h) {
      // Tie-break: prefer the grid whose area the original image can fill.
      best = candidate;
    }
  }
  return best;
}
/**
 * Decide whether an image exceeds the single-tile pixel budget and must be
 * split into tiles. Mirrors Python's _is_img_too_large().
 * @param {number} width - Image width in pixels
 * @param {number} height - Image height in pixels
 * @returns {boolean} True when the patch-aligned area exceeds the budget
 */
function isImageTooLarge(width, height) {
  const { patchSize, maxImageTokens, downsampleFactor, maxPixelsTolerance } = CONFIG;
  // Align each dimension to the patch grid (never below one patch).
  const alignedH = Math.max(patchSize, roundByFactor(height, patchSize));
  const alignedW = Math.max(patchSize, roundByFactor(width, patchSize));
  // Budget: max tokens * pixels-per-token, with a tolerance multiplier.
  const pixelBudget =
    maxImageTokens * patchSize * patchSize * downsampleFactor * downsampleFactor * maxPixelsTolerance;
  return alignedH * alignedW > pixelBudget;
}
/**
 * Compute resize targets such that both dimensions are divisible by
 * patchSize * downsampleFactor and the total pixel count falls inside
 * [minPixels, maxPixels]. Mirrors Python's _smart_resize().
 * @param {number} width - Original width in pixels
 * @param {number} height - Original height in pixels
 * @returns {{width: number, height: number}} Aligned target dimensions
 */
function smartResize(width, height) {
  const { patchSize, downsampleFactor, minImageTokens, maxImageTokens } = CONFIG;
  const alignment = patchSize * downsampleFactor; // 32
  const pixelsPerToken = patchSize * patchSize * downsampleFactor * downsampleFactor;
  const minPixels = minImageTokens * pixelsPerToken;
  const maxPixels = maxImageTokens * pixelsPerToken;
  // Start from the nearest aligned dimensions (never below one unit).
  let outH = Math.max(alignment, roundByFactor(height, alignment));
  let outW = Math.max(alignment, roundByFactor(width, alignment));
  if (outH * outW > maxPixels) {
    // Too large: shrink both sides proportionally, flooring onto the grid.
    const scale = Math.sqrt((height * width) / maxPixels);
    outH = Math.max(alignment, floorByFactor(height / scale, alignment));
    outW = Math.max(alignment, floorByFactor(width / scale, alignment));
  } else if (outH * outW < minPixels) {
    // Too small: grow both sides proportionally, ceiling onto the grid.
    const scale = Math.sqrt(minPixels / (height * width));
    outH = ceilByFactor(height * scale, alignment);
    outW = ceilByFactor(width * scale, alignment);
  }
  return { width: outW, height: outH };
}
/**
 * Number of image tokens produced for an image of the given dimensions:
 * the patch grid (floor division by patchSize) downsampled by
 * downsampleFactor with ceiling division. Mirrors Python's _get_tokens_num().
 * @param {number} height - Image height in pixels
 * @param {number} width - Image width in pixels
 * @returns {number} Token count after downsampling
 */
function getTokensNum(height, width) {
  const { patchSize, downsampleFactor } = CONFIG;
  const patchRows = Math.floor(height / patchSize);
  const patchCols = Math.floor(width / patchSize);
  const tokenRows = Math.ceil(patchRows / downsampleFactor);
  const tokenCols = Math.ceil(patchCols / downsampleFactor);
  return tokenRows * tokenCols;
}
/**
 * Calculate the optimal tile grid for an image.
 * Matches Python's _high_res_preprocessor() grid selection:
 * enumerate all (w, h) grids with minTiles <= w*h <= maxTiles, sort them by
 * total tile count, then pick the one whose aspect ratio best matches the
 * image via findClosestAspectRatio().
 * @param {number} width - Image width in pixels
 * @param {number} height - Image height in pixels
 * @returns {{rows: number, cols: number}} - Tile grid dimensions
 */
function calculateTileGrid(width, height) {
  const { tileSize, minTiles, maxTiles } = CONFIG;
  const aspectRatio = width / height;
  // Generate valid patch grid configurations (width, height)
  // Matches Python: [(w, h) for n in range(min_tiles, max_tiles + 1) for w in range(1, n + 1) for h in range(1, n + 1) if min_tiles <= w * h <= max_tiles]
  // PERF: the triple loop re-generates the same (w, h) pairs for every n;
  // dedupe with a Set (O(1) membership) instead of scanning the array with
  // `some()` (O(n)) per candidate. Insertion order is preserved, so the
  // stable sort below keeps the exact same tie order as before.
  const targetRatios = [];
  const seen = new Set();
  for (let n = minTiles; n <= maxTiles; n++) {
    for (let w = 1; w <= n; w++) {
      for (let h = 1; h <= n; h++) {
        const tiles = w * h;
        if (tiles < minTiles || tiles > maxTiles) continue;
        const key = w * (maxTiles + 1) + h; // unique since h <= maxTiles
        if (seen.has(key)) continue;
        seen.add(key);
        targetRatios.push([w, h]);
      }
    }
  }
  // Sort by total tiles (stable: ties keep generation order)
  targetRatios.sort((a, b) => (a[0] * a[1]) - (b[0] * b[1]));
  if (targetRatios.length === 0) {
    return { rows: 1, cols: 1 };
  }
  // Find best matching grid configuration
  const [gridWidth, gridHeight] = findClosestAspectRatio(
    aspectRatio, targetRatios, width, height, tileSize
  );
  return { rows: gridHeight, cols: gridWidth };
}
/**
 * Process an image into flattened, normalized patches for the VL model.
 * Matches Python's _resize_and_maybe_split() logic:
 *  - images over the pixel budget are resized to a tile grid and split into
 *    512x512 tiles (plus an optional trailing thumbnail);
 *  - smaller images are smart-resized into a single, possibly non-square tile.
 * @param {HTMLImageElement|HTMLCanvasElement|ImageData} image - Input image or raw ImageData
 * @returns {Promise<{pixelValues: Float32Array, attentionMask: BigInt64Array, spatialShapes: BigInt64Array, numTiles: number, shape: number[]}>}
 */
export async function processImage(image) {
  let width, height;
  let inputImageData = null; // Set when the caller hands us raw ImageData
  if (image instanceof ImageData) {
    // Direct ImageData input - the aligned fast path below skips canvases
    width = image.width;
    height = image.height;
    inputImageData = image;
  } else if (image instanceof HTMLImageElement) {
    width = image.naturalWidth;
    height = image.naturalHeight;
  } else {
    width = image.width;
    height = image.height;
  }
  // BUG FIX: CanvasRenderingContext2D.drawImage() does not accept ImageData
  // (it is not a CanvasImageSource), so ImageData inputs that needed tiling
  // or resizing previously threw a TypeError. Lazily paint the ImageData
  // onto a scratch canvas once and draw from that instead.
  let drawableCache = null;
  const getDrawable = () => {
    if (drawableCache === null) {
      if (inputImageData !== null) {
        const srcCanvas = document.createElement('canvas');
        srcCanvas.width = width;
        srcCanvas.height = height;
        srcCanvas.getContext('2d').putImageData(inputImageData, 0, 0);
        drawableCache = srcCanvas;
      } else {
        drawableCache = image;
      }
    }
    return drawableCache;
  };
  const { tileSize, patchSize, useThumbnail } = CONFIG;
  const patchesPerSide = CONFIG.patchesPerTile; // 32 (patches per tile side)
  const maxPatchesPerTile = patchesPerSide * patchesPerSide; // 1024
  const patchDim = patchSize * patchSize * 3; // 768
  // Check if image needs splitting (matches Python's _resize_and_maybe_split)
  const needsSplitting = isImageTooLarge(width, height);
  if (needsSplitting) {
    // HIGH-RES PATH: Split into tiles + optional thumbnail
    // Matches Python's _high_res_preprocessor()
    const { rows, cols } = calculateTileGrid(width, height);
    const totalGridTiles = rows * cols;
    // Only use tiling if we get more than 1 tile; otherwise fall through
    // to the single-tile path below.
    if (totalGridTiles > 1) {
      const numTiles = totalGridTiles + (useThumbnail ? 1 : 0);
      // Output arrays - use max patches per tile for uniform shape
      const pixelValues = new Float32Array(numTiles * maxPatchesPerTile * patchDim);
      const attentionMask = new BigInt64Array(numTiles * maxPatchesPerTile);
      const spatialShapes = new BigInt64Array(numTiles * 2);
      // STEP 1: Resize ENTIRE image to target grid dimensions (matches Python)
      const targetWidth = tileSize * cols;
      const targetHeight = tileSize * rows;
      const resizedCanvas = document.createElement('canvas');
      resizedCanvas.width = targetWidth;
      resizedCanvas.height = targetHeight;
      const resizedCtx = resizedCanvas.getContext('2d');
      resizedCtx.drawImage(getDrawable(), 0, 0, targetWidth, targetHeight);
      // STEP 2: Extract tiles by CROPPING from resized image (matches Python)
      let tileIdx = 0;
      for (let row = 0; row < rows; row++) {
        for (let col = 0; col < cols; col++) {
          const tileCanvas = document.createElement('canvas');
          tileCanvas.width = tileSize;
          tileCanvas.height = tileSize;
          const tileCtx = tileCanvas.getContext('2d');
          // Crop tile from resized image
          tileCtx.drawImage(
            resizedCanvas,
            col * tileSize, row * tileSize, tileSize, tileSize, // source crop
            0, 0, tileSize, tileSize // dest (same size, no scaling)
          );
          const tileData = tileCtx.getImageData(0, 0, tileSize, tileSize);
          extractPatchesFromFullTile(tileData, pixelValues, attentionMask, tileIdx, patchesPerSide, maxPatchesPerTile);
          // Spatial shape for this tile (full 32x32 patch grid)
          spatialShapes[tileIdx * 2] = BigInt(patchesPerSide); // height in patches
          spatialShapes[tileIdx * 2 + 1] = BigInt(patchesPerSide); // width in patches
          tileIdx++;
        }
      }
      // STEP 3: Add thumbnail LAST (matches Python - thumbnail is appended)
      // Thumbnail uses smart resize to variable dimensions (like single-tile path)
      if (useThumbnail) {
        const thumbResized = smartResize(width, height);
        const thumbWidth = thumbResized.width;
        const thumbHeight = thumbResized.height;
        const thumbCanvas = document.createElement('canvas');
        thumbCanvas.width = thumbWidth;
        thumbCanvas.height = thumbHeight;
        const thumbCtx = thumbCanvas.getContext('2d');
        thumbCtx.drawImage(getDrawable(), 0, 0, thumbWidth, thumbHeight);
        const thumbData = thumbCtx.getImageData(0, 0, thumbWidth, thumbHeight);
        // smartResize guarantees divisibility by patchSize, so these are integers
        const thumbPatchesH = thumbHeight / patchSize;
        const thumbPatchesW = thumbWidth / patchSize;
        extractPatchesFromVariableSize(thumbData, pixelValues, attentionMask, tileIdx, thumbPatchesH, thumbPatchesW, maxPatchesPerTile);
        // Spatial shape for thumbnail (variable based on smart resize)
        spatialShapes[tileIdx * 2] = BigInt(thumbPatchesH);
        spatialShapes[tileIdx * 2 + 1] = BigInt(thumbPatchesW);
        tileIdx++;
      }
      return {
        pixelValues,
        attentionMask,
        spatialShapes,
        numTiles,
        shape: [numTiles, maxPatchesPerTile, patchDim],
      };
    }
  }
  // SINGLE-TILE PATH: Smart resize only (no splitting)
  // Matches Python's else branch in _resize_and_maybe_split()
  let resizedWidth, resizedHeight, actualPatchesH, actualPatchesW;
  let imageData;
  // OPTIMIZATION: Check if dimensions are pre-computed (32-aligned live caption sizes)
  const precomputed = PRECOMPUTED_SIZES[width];
  const isAlreadyAligned = precomputed && width === height;
  if (inputImageData && isAlreadyAligned) {
    // FAST PATH: Direct ImageData with known dimensions - skip all resizing
    resizedWidth = width;
    resizedHeight = height;
    actualPatchesH = precomputed.patchesH;
    actualPatchesW = precomputed.patchesW;
    imageData = inputImageData;
  } else if (isAlreadyAligned) {
    // Dimensions already 32-aligned, skip smartResize computation
    resizedWidth = width;
    resizedHeight = height;
    actualPatchesH = precomputed.patchesH;
    actualPatchesW = precomputed.patchesW;
    // Still need to rasterize the image to get its ImageData
    const resizedCanvas = document.createElement('canvas');
    resizedCanvas.width = resizedWidth;
    resizedCanvas.height = resizedHeight;
    const resizedCtx = resizedCanvas.getContext('2d');
    resizedCtx.drawImage(getDrawable(), 0, 0, resizedWidth, resizedHeight);
    imageData = resizedCtx.getImageData(0, 0, resizedWidth, resizedHeight);
  } else {
    // Standard path: compute smart resize
    const resized = smartResize(width, height);
    resizedWidth = resized.width;
    resizedHeight = resized.height;
    actualPatchesH = resizedHeight / patchSize;
    actualPatchesW = resizedWidth / patchSize;
    // Create canvas at actual resized dimensions
    const resizedCanvas = document.createElement('canvas');
    resizedCanvas.width = resizedWidth;
    resizedCanvas.height = resizedHeight;
    const resizedCtx = resizedCanvas.getContext('2d');
    resizedCtx.drawImage(getDrawable(), 0, 0, resizedWidth, resizedHeight);
    imageData = resizedCtx.getImageData(0, 0, resizedWidth, resizedHeight);
  }
  const numTiles = 1;
  const pixelValues = new Float32Array(numTiles * maxPatchesPerTile * patchDim);
  const attentionMask = new BigInt64Array(numTiles * maxPatchesPerTile);
  const spatialShapes = new BigInt64Array(numTiles * 2);
  extractPatchesFromVariableSize(imageData, pixelValues, attentionMask, 0, actualPatchesH, actualPatchesW, maxPatchesPerTile);
  spatialShapes[0] = BigInt(actualPatchesH);
  spatialShapes[1] = BigInt(actualPatchesW);
  return {
    pixelValues,
    attentionMask,
    spatialShapes,
    numTiles,
    shape: [numTiles, maxPatchesPerTile, patchDim],
  };
}
/**
 * Extract all patches from a full 512x512 tile (every patch is valid, so the
 * attention mask is all ones and no padding is needed).
 * @param {ImageData} tileData - Tile image data (512x512, RGBA)
 * @param {Float32Array} pixelValues - Output pixel values array (written in place)
 * @param {BigInt64Array} attentionMask - Output attention mask array (written in place)
 * @param {number} tileIdx - Index of this tile within the output arrays
 * @param {number} patchesPerSide - Number of patches per side (32 for 512x512)
 * @param {number} maxPatchesPerTile - Max patches per tile for array indexing (1024)
 */
function extractPatchesFromFullTile(tileData, pixelValues, attentionMask, tileIdx, patchesPerSide, maxPatchesPerTile) {
  const patchSize = CONFIG.patchSize;
  const patchDim = patchSize * patchSize * 3; // RGB values per flattened patch
  const stride = tileData.width; // pixels per source row
  const src = tileData.data; // RGBA bytes
  const valuesBase = tileIdx * maxPatchesPerTile * patchDim;
  const maskBase = tileIdx * maxPatchesPerTile;
  for (let row = 0; row < patchesPerSide; row++) {
    for (let col = 0; col < patchesPerSide; col++) {
      const patchIdx = row * patchesPerSide + col;
      const originX = col * patchSize;
      const originY = row * patchSize;
      // Every patch in a full tile is valid
      attentionMask[maskBase + patchIdx] = 1n;
      // Flatten the patch row by row, normalizing each RGB channel with
      // the pre-computed constants: value * (1/127.5) - 1.0
      let dst = valuesBase + patchIdx * patchDim;
      for (let dy = 0; dy < patchSize; dy++) {
        let srcIdx = ((originY + dy) * stride + originX) * 4; // 4 = RGBA
        for (let dx = 0; dx < patchSize; dx++) {
          pixelValues[dst++] = src[srcIdx] * NORM_SCALE + NORM_OFFSET;
          pixelValues[dst++] = src[srcIdx + 1] * NORM_SCALE + NORM_OFFSET;
          pixelValues[dst++] = src[srcIdx + 2] * NORM_SCALE + NORM_OFFSET;
          srcIdx += 4; // skip alpha, advance to next pixel
        }
      }
    }
  }
}
/**
 * Extract patches from a variable-sized image and pad the remainder of the
 * tile slot up to maxPatchesPerTile.
 * Matches Python's convert_image_to_patches + pad_along_first_dim:
 * valid patches get mask 1; padded slots get mask 0 and zero pixel values.
 * @param {ImageData} imageData - Image data at actual dimensions (RGBA)
 * @param {Float32Array} pixelValues - Output pixel values array (written in place)
 * @param {BigInt64Array} attentionMask - Output attention mask array (written in place)
 * @param {number} tileIdx - Index of this tile within the output arrays
 * @param {number} patchesH - Number of patches in height
 * @param {number} patchesW - Number of patches in width
 * @param {number} maxPatchesPerTile - Max patches per tile for padding (1024)
 */
function extractPatchesFromVariableSize(imageData, pixelValues, attentionMask, tileIdx, patchesH, patchesW, maxPatchesPerTile) {
  const patchSize = CONFIG.patchSize;
  const patchDim = patchSize * patchSize * 3; // RGB values per flattened patch
  const stride = imageData.width; // pixels per source row
  const src = imageData.data; // RGBA bytes
  const valuesBase = tileIdx * maxPatchesPerTile * patchDim;
  const maskBase = tileIdx * maxPatchesPerTile;
  const validPatches = patchesH * patchesW;
  for (let row = 0; row < patchesH; row++) {
    for (let col = 0; col < patchesW; col++) {
      const patchIdx = row * patchesW + col;
      const originX = col * patchSize;
      const originY = row * patchSize;
      // Mark this patch as valid
      attentionMask[maskBase + patchIdx] = 1n;
      // Flatten the patch row by row, normalizing each RGB channel with
      // the pre-computed constants: value * (1/127.5) - 1.0
      let dst = valuesBase + patchIdx * patchDim;
      for (let dy = 0; dy < patchSize; dy++) {
        let srcIdx = ((originY + dy) * stride + originX) * 4; // 4 = RGBA
        for (let dx = 0; dx < patchSize; dx++) {
          pixelValues[dst++] = src[srcIdx] * NORM_SCALE + NORM_OFFSET;
          pixelValues[dst++] = src[srcIdx + 1] * NORM_SCALE + NORM_OFFSET;
          pixelValues[dst++] = src[srcIdx + 2] * NORM_SCALE + NORM_OFFSET;
          srcIdx += 4; // skip alpha, advance to next pixel
        }
      }
    }
  }
  // Zero-mask the padded remainder (pixelValues are already zero-initialized)
  for (let pad = validPatches; pad < maxPatchesPerTile; pad++) {
    attentionMask[maskBase + pad] = 0n;
  }
}
/**
 * Load an image from a URL or data URL.
 * @param {string} src - Image URL or data URL
 * @returns {Promise<HTMLImageElement>} Resolves with the decoded image
 * @throws {Error} Rejects with a descriptive Error when loading fails
 */
export function loadImage(src) {
  return new Promise((resolve, reject) => {
    const img = new Image();
    // Needed so canvas readback (getImageData) is not tainted by CORS
    img.crossOrigin = 'anonymous';
    img.onload = () => resolve(img);
    // FIX: reject with a real Error instead of the raw error Event so
    // callers get a stack trace and a useful message.
    img.onerror = () => reject(new Error(`Failed to load image: ${src}`));
    img.src = src;
  });
}