| | import { |
| | AutoTokenizer, |
| | CLIPTextModelWithProjection, |
| | AutoProcessor, |
| | CLIPVisionModelWithProjection, |
| | RawImage, |
| | dot, |
| | softmax, |
| | } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.5.0"; |
| |
|
| | |
// Cache references to the UI elements this script drives.
const $ = (id) => document.getElementById(id);

const status = $("status");
const container = $("container");
const video = $("video");
const labelsInput = $("labels");
const templateInput = $("template");
const overlay = $("overlay");

// Let the user know the (fairly large) model download has started.
status.textContent = "Loading model (88MB)...";
| |
|
const model_id = "Xenova/mobileclip_s0";
let tokenizer, text_model, processor, vision_model;
try {
  // The four artifacts are independent of each other, so download and
  // initialize them in parallel instead of awaiting each sequentially.
  [tokenizer, text_model, processor, vision_model] = await Promise.all([
    AutoTokenizer.from_pretrained(model_id),
    CLIPTextModelWithProjection.from_pretrained(model_id, {
      device: "wasm", // text encoder only runs when the labels change
      dtype: "q8",
    }),
    AutoProcessor.from_pretrained(model_id),
    CLIPVisionModelWithProjection.from_pretrained(model_id, {
      device: "webgpu", // vision encoder runs every frame — keep it on GPU
      dtype: "fp32",
    }),
  ]);
} catch (err) {
  // Surface the failure in the page and to the user, then abort the module.
  console.error(err);
  status.textContent = err.message;
  alert(err.message);
  throw err;
}

// Everything loaded — unlock the inputs and report readiness.
labelsInput.disabled = false;
templateInput.disabled = false;

status.textContent = "Ready";
| |
|
| | |
// exp(logit_scale) from the CLIP checkpoint; raw cosine similarities are
// multiplied by this before the softmax.
const exp_logit_scale = Math.exp(4.6052);

// Offscreen canvas used to downscale each video frame to the model's
// expected square input resolution.
const IMAGE_SIZE = 224;
const canvas = document.createElement("canvas");
canvas.width = IMAGE_SIZE;
canvas.height = IMAGE_SIZE;
const context = canvas.getContext("2d", { willReadFrequently: true });

// Mutable state shared with the render loop below.
let isProcessing = false; // guards against overlapping async frame work
let previousTime; // timestamp of the previous completed frame (for FPS)
let textEmbeddings; // cached, normalized label embeddings (null-ish until set)
let prevTextInputs; // last-seen labels input, to detect edits
let prevTemplate; // last-seen template input, to detect edits
let labels; // parsed label list aligned with textEmbeddings
| |
|
/**
 * Per-frame loop: re-encode the label texts when the inputs change, encode
 * the current video frame, and render ranked similarity scores into the
 * overlay. Reschedules itself via requestAnimationFrame; at most one async
 * inference pipeline runs at a time (guarded by isProcessing).
 */
function onFrameUpdate() {
  if (!isProcessing) {
    isProcessing = true;
    (async function () {
      try {
        // Re-embed the labels only when the user edited label/template text.
        if (
          prevTextInputs !== labelsInput.value ||
          prevTemplate !== templateInput.value
        ) {
          textEmbeddings = null;
          prevTextInputs = labelsInput.value;
          prevTemplate = templateInput.value;
          labels = prevTextInputs.split(/\s*,\s*/).filter((x) => x);

          if (labels.length > 0) {
            // Fill each label into the prompt template, e.g. "a photo of {}".
            const texts = labels.map((x) =>
              templateInput.value.replaceAll("{}", x),
            );

            const text_inputs = tokenizer(texts, {
              padding: "max_length",
              truncation: true,
            });

            // Compute and L2-normalize the text embeddings once; they are
            // reused for every frame until the inputs change again.
            const { text_embeds } = await text_model(text_inputs);
            textEmbeddings = text_embeds.normalize().tolist();
          } else {
            overlay.innerHTML = "";
          }
        }

        if (textEmbeddings) {
          // Downscale the current frame and wrap the RGBA pixels for the
          // processor.
          context.drawImage(video, 0, 0, IMAGE_SIZE, IMAGE_SIZE);
          const pixelData = context.getImageData(
            0,
            0,
            IMAGE_SIZE,
            IMAGE_SIZE,
          ).data;
          const image = new RawImage(pixelData, IMAGE_SIZE, IMAGE_SIZE, 4);

          const image_inputs = await processor(image);

          // Encode the frame and L2-normalize the embedding.
          const { image_embeds } = await vision_model(image_inputs);
          const imageEmbedding = image_embeds.normalize().tolist()[0];

          // Scaled cosine similarities (both embeddings are unit-norm).
          const similarities = textEmbeddings.map(
            (x) => dot(x, imageEmbedding) * exp_logit_scale,
          );

          // Softmax over labels, sorted best-first.
          const sortedIndices = softmax(similarities)
            .map((x, i) => [x, i])
            .sort((a, b) => b[0] - a[0]);

          // Render one "label: score" line per label.
          overlay.innerHTML = "";
          for (const [score, index] of sortedIndices) {
            overlay.appendChild(
              document.createTextNode(`${labels[index]}: ${score.toFixed(2)}`),
            );
            overlay.appendChild(document.createElement("br"));
          }
        }

        if (previousTime !== undefined) {
          const fps = 1000 / (performance.now() - previousTime);
          status.textContent = `FPS: ${fps.toFixed(2)}`;
        }
        previousTime = performance.now();
      } catch (err) {
        // Surface inference errors instead of failing silently.
        console.error(err);
        status.textContent = err.message;
      } finally {
        // BUG FIX: previously isProcessing was only reset on the success
        // path, so a thrown error left it stuck at true and the loop
        // stopped processing frames forever.
        isProcessing = false;
      }
    })();
  }

  window.requestAnimationFrame(onFrameUpdate);
}
| |
|
| | |
// Start the webcam, size the UI to the stream, and kick off the render loop.
// Written with top-level await (already used above for model loading) so
// play() rejections — e.g. autoplay policy — are caught too.
try {
  const stream = await navigator.mediaDevices.getUserMedia({ video: true });

  video.srcObject = stream;
  // play() returns a promise; await it so a rejection doesn't go unhandled.
  await video.play();

  const videoTrack = stream.getVideoTracks()[0];
  const { width, height } = videoTrack.getSettings();

  video.width = width;
  video.height = height;

  // Fit the container to the stream's aspect ratio within a 720x405 box.
  const ar = width / height;
  const [cw, ch] = ar > 720 / 405 ? [720, 720 / ar] : [405 * ar, 405];
  container.style.width = `${cw}px`;
  container.style.height = `${ch}px`;

  // Start the processing loop.
  window.requestAnimationFrame(onFrameUpdate);
} catch (error) {
  // Keep the page state consistent with the failure, not just an alert.
  console.error(error);
  status.textContent = error.message ?? String(error);
  alert(error);
}
| |
|