| | import { |
| | AutoTokenizer, |
| | CLIPTextModelWithProjection, |
| | AutoProcessor, |
| | CLIPVisionModelWithProjection, |
| | RawImage, |
| | dot, |
| | softmax, |
| | } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.5.0"; |
| |
|
| | |
// Cache references to the UI elements this script drives.
const $ = (id) => document.getElementById(id);

const status = $("status");
const container = $("container");
const video = $("video");
const labelsInput = $("labels");
const templateInput = $("template");
const overlay = $("overlay");

// Let the user know the (fairly large) model download has started.
status.textContent = "Loading model (88MB)...";
| |
|
const model_id = "Xenova/mobileclip_s0";
let tokenizer, text_model, processor, vision_model;
try {
  // The four artifacts are independent of each other, so download and
  // initialize them in parallel instead of awaiting each sequentially.
  [tokenizer, text_model, processor, vision_model] = await Promise.all([
    AutoTokenizer.from_pretrained(model_id),
    CLIPTextModelWithProjection.from_pretrained(model_id, {
      device: "wasm", // text encoder only runs when the labels change
      dtype: "q8",
    }),
    AutoProcessor.from_pretrained(model_id),
    CLIPVisionModelWithProjection.from_pretrained(model_id, {
      device: "webgpu", // vision encoder runs every frame — keep it on GPU
      dtype: "fp32",
    }),
  ]);
} catch (err) {
  // Surface the failure in the page and to the user, then abort the module.
  console.error(err);
  status.textContent = err.message;
  alert(err.message);
  throw err;
}

// Everything loaded — unlock the inputs and report readiness.
labelsInput.disabled = false;
templateInput.disabled = false;

status.textContent = "Ready";
| |
|
| | |
// exp(logit_scale) from the CLIP checkpoint; raw cosine similarities are
// multiplied by this before the softmax.
const exp_logit_scale = Math.exp(4.6052);

// Offscreen canvas used to downscale each video frame to the model's
// expected square input resolution.
const IMAGE_SIZE = 224;
const canvas = document.createElement("canvas");
canvas.width = IMAGE_SIZE;
canvas.height = IMAGE_SIZE;
const context = canvas.getContext("2d", { willReadFrequently: true });

// Mutable state shared with the render loop below.
let isProcessing = false; // guards against overlapping async frame work
let previousTime; // timestamp of the previous completed frame (for FPS)
let textEmbeddings; // cached, normalized label embeddings (null-ish until set)
let prevTextInputs; // last-seen labels input, to detect edits
let prevTemplate; // last-seen template input, to detect edits
let labels; // parsed label list aligned with textEmbeddings
| |
|
/**
 * Per-frame loop: re-encode the label texts when the inputs change, encode
 * the current video frame, and render ranked similarity scores into the
 * overlay. Reschedules itself via requestAnimationFrame; at most one async
 * inference pipeline runs at a time (guarded by isProcessing).
 */
function onFrameUpdate() {
  if (!isProcessing) {
    isProcessing = true;
    (async function () {
      try {
        // Re-embed the labels only when the user edited label/template text.
        if (
          prevTextInputs !== labelsInput.value ||
          prevTemplate !== templateInput.value
        ) {
          textEmbeddings = null;
          prevTextInputs = labelsInput.value;
          prevTemplate = templateInput.value;
          labels = prevTextInputs.split(/\s*,\s*/).filter((x) => x);

          if (labels.length > 0) {
            // Fill each label into the prompt template, e.g. "a photo of {}".
            const texts = labels.map((x) =>
              templateInput.value.replaceAll("{}", x),
            );

            const text_inputs = tokenizer(texts, {
              padding: "max_length",
              truncation: true,
            });

            // Compute and L2-normalize the text embeddings once; they are
            // reused for every frame until the inputs change again.
            const { text_embeds } = await text_model(text_inputs);
            textEmbeddings = text_embeds.normalize().tolist();
          } else {
            overlay.innerHTML = "";
          }
        }

        if (textEmbeddings) {
          // Downscale the current frame and wrap the RGBA pixels for the
          // processor.
          context.drawImage(video, 0, 0, IMAGE_SIZE, IMAGE_SIZE);
          const pixelData = context.getImageData(
            0,
            0,
            IMAGE_SIZE,
            IMAGE_SIZE,
          ).data;
          const image = new RawImage(pixelData, IMAGE_SIZE, IMAGE_SIZE, 4);

          const image_inputs = await processor(image);

          // Encode the frame and L2-normalize the embedding.
          const { image_embeds } = await vision_model(image_inputs);
          const imageEmbedding = image_embeds.normalize().tolist()[0];

          // Scaled cosine similarities (both embeddings are unit-norm).
          const similarities = textEmbeddings.map(
            (x) => dot(x, imageEmbedding) * exp_logit_scale,
          );

          // Softmax over labels, sorted best-first.
          const sortedIndices = softmax(similarities)
            .map((x, i) => [x, i])
            .sort((a, b) => b[0] - a[0]);

          // Render one "label: score" line per label.
          overlay.innerHTML = "";
          for (const [score, index] of sortedIndices) {
            overlay.appendChild(
              document.createTextNode(`${labels[index]}: ${score.toFixed(2)}`),
            );
            overlay.appendChild(document.createElement("br"));
          }
        }

        if (previousTime !== undefined) {
          const fps = 1000 / (performance.now() - previousTime);
          status.textContent = `FPS: ${fps.toFixed(2)}`;
        }
        previousTime = performance.now();
      } catch (err) {
        // Surface inference errors instead of failing silently.
        console.error(err);
        status.textContent = err.message;
      } finally {
        // BUG FIX: previously isProcessing was only reset on the success
        // path, so a thrown error left it stuck at true and the loop
        // stopped processing frames forever.
        isProcessing = false;
      }
    })();
  }

  window.requestAnimationFrame(onFrameUpdate);
}
| |
|
| | |
// Start the webcam, size the UI to the stream, and kick off the render loop.
// Written with top-level await (already used above for model loading) so
// play() rejections — e.g. autoplay policy — are caught too.
try {
  const stream = await navigator.mediaDevices.getUserMedia({ video: true });

  video.srcObject = stream;
  // play() returns a promise; await it so a rejection doesn't go unhandled.
  await video.play();

  const videoTrack = stream.getVideoTracks()[0];
  const { width, height } = videoTrack.getSettings();

  video.width = width;
  video.height = height;

  // Fit the container to the stream's aspect ratio within a 720x405 box.
  const ar = width / height;
  const [cw, ch] = ar > 720 / 405 ? [720, 720 / ar] : [405 * ar, 405];
  container.style.width = `${cw}px`;
  container.style.height = `${ch}px`;

  // Start the processing loop.
  window.requestAnimationFrame(onFrameUpdate);
} catch (error) {
  // Keep the page state consistent with the failure, not just an alert.
  console.error(error);
  status.textContent = error.message ?? String(error);
  alert(error);
}
| |
|