Instructions to use PierrunoYT/moondream3-preview with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use PierrunoYT/moondream3-preview with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="PierrunoYT/moondream3-preview", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("PierrunoYT/moondream3-preview", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use PierrunoYT/moondream3-preview with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "PierrunoYT/moondream3-preview"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "PierrunoYT/moondream3-preview",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker
docker model run hf.co/PierrunoYT/moondream3-preview
- SGLang
How to use PierrunoYT/moondream3-preview with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "PierrunoYT/moondream3-preview" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "PierrunoYT/moondream3-preview",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "PierrunoYT/moondream3-preview" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "PierrunoYT/moondream3-preview",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
- Docker Model Runner
How to use PierrunoYT/moondream3-preview with Docker Model Runner:
docker model run hf.co/PierrunoYT/moondream3-preview
File size: 2,299 Bytes
20e6e81 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | import functools
import os
import shutil
import torch
from pathlib import Path
from urllib.request import Request, urlopen
from typing import Optional
def variant_cache_dir():
    """Return the directory in which variant checkpoints are cached.

    The base directory is chosen in this order of precedence:

    1. ``$HF_HUB_CACHE`` when that environment variable is set;
    2. ``$HF_HOME/hub`` when ``HF_HOME`` is set;
    3. ``~/.cache/huggingface/hub`` (with ``~`` expanded) otherwise.

    ``md_variants`` is appended to the chosen base. The directory is not
    created here.
    """
    hub_cache = os.environ.get("HF_HUB_CACHE")
    home = os.environ.get("HF_HOME")

    if hub_cache is not None:
        base = Path(hub_cache)
    elif home is not None:
        base = Path(home) / "hub"
    else:
        base = Path("~/.cache/huggingface/hub").expanduser()

    return base / "md_variants"
def cached_variant_path(variant_id: str):
    """Return the local path of a variant checkpoint, downloading it if needed.

    ``variant_id`` has the form ``"<variant>"`` or ``"<variant>/<step>"``;
    when no step is given, ``"final"`` is used. The checkpoint is stored as
    ``<variant_cache_dir()>/<variant>/<step>.pt``.

    If the file is not cached, it is fetched from
    ``{MOONDREAM_ENDPOINT}/v1/variants/{variant_id}/download`` (the endpoint
    defaults to ``https://api.moondream.ai``). When ``MOONDREAM_API_KEY`` is
    set, it is sent in the ``X-Moondream-Auth`` header.

    The download is streamed into a temporary sibling file and moved into
    place only after it completes, so an interrupted transfer never leaves a
    truncated file that later calls would mistake for a valid cache entry.

    Raises:
        Whatever ``urllib.request.urlopen`` or the file operations raise
        (e.g. ``urllib.error.URLError``, ``OSError``); the partial temporary
        file is removed before the exception propagates.
    """
    variant, *rest = variant_id.split("/", 1)
    step = rest[0] if rest else "final"

    cache_dir = variant_cache_dir() / variant
    os.makedirs(cache_dir, exist_ok=True)
    dest = cache_dir / f"{step}.pt"
    if dest.exists():
        return dest

    md_endpoint = os.getenv("MOONDREAM_ENDPOINT", "https://api.moondream.ai")

    headers = {"User-Agent": "moondream-torch"}
    api_key = os.getenv("MOONDREAM_API_KEY")
    if api_key is not None:
        headers["X-Moondream-Auth"] = api_key

    req = Request(f"{md_endpoint}/v1/variants/{variant_id}/download", headers=headers)

    # Write next to the destination (same filesystem) so os.replace is an
    # atomic rename; the PID suffix keeps concurrent processes from sharing
    # one temporary file.
    tmp = dest.with_name(f"{dest.name}.{os.getpid()}.tmp")
    try:
        with urlopen(req) as r, open(tmp, "wb") as f:
            shutil.copyfileobj(r, f)
        os.replace(tmp, dest)
    except BaseException:
        # Best-effort cleanup of the partial download; the original error is
        # what the caller needs to see, so cleanup failures are ignored.
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise

    return dest
def nest(flat):
    """Convert a flat mapping with dotted keys into nested dictionaries.

    Each key is split on ``"."``; every component except the last names a
    nested dict (created on demand), and the last component is the key under
    which the value is stored. For example ``{"a.b": 1, "a.c": 2}`` becomes
    ``{"a": {"b": 1, "c": 2}}``.
    """
    root = {}
    for dotted_key, value in flat.items():
        *path, leaf = dotted_key.split(".")
        node = root
        for component in path:
            node = node.setdefault(component, {})
        node[leaf] = value
    return root
@functools.lru_cache(maxsize=5)
def variant_state_dict(variant_id: Optional[str] = None, device: str = "cpu"):
    """Load a variant checkpoint and return its weights as a nested dict.

    Returns ``None`` when ``variant_id`` is ``None``. Otherwise the checkpoint
    located by ``cached_variant_path(variant_id)`` is loaded with
    ``torch.load`` (``weights_only=True``) onto ``device``, its keys are
    rewritten by a fixed list of substring substitutions, and the resulting
    flat mapping is turned into nested dictionaries via ``nest``.

    Results are memoized by ``functools.lru_cache`` (up to 5 entries), so
    repeated calls with the same arguments return the same object.
    """
    if variant_id is None:
        return None

    checkpoint = torch.load(
        cached_variant_path(variant_id), map_location=device, weights_only=True
    )

    # TODO: Move these into the training code that saves checkpoints...
    # Applied in order to every key; a rule that does not match is a no-op.
    substitutions = (
        ("text_model.transformer.h", "text.blocks"),
        (".mixer", ".attn"),
        (".out_proj", ".proj"),
        (".Wqkv", ".qkv"),
        (".parametrizations.weight.0", ""),
    )

    renamed = {}
    for name, tensor in checkpoint.items():
        for needle, replacement in substitutions:
            name = name.replace(needle, replacement)
        renamed[name] = tensor

    return nest(renamed)
|