Spaces:

raphael-gl
/

test-openai

Sleeping

test-openai / app.py

Raphael Glon

wip

a25030f unverified about 1 month ago

4.4 kB

	# Copied/Adapted from https://huggingface.co/spaces/akhaliq/MobileLLM-Pro

	import spaces

	import logging
	import os
	import re
	import threading
	from typing import List, Tuple, Dict

	import torch
	import gradio as gr
	from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
	from huggingface_hub import login

	# Checkpoint identifier passed to the tokenizer/model loaders.
	MODEL_ID = "openai/gpt-oss-20b"

	# Root logging is configured at DEBUG; LOG is this module's logger.
	logging.basicConfig(level=logging.DEBUG)
	LOG = logging.getLogger(__name__)

	# Fixed generation settings.
	MAX_NEW_TOKENS = 256
	TEMPERATURE = 0.7
	TOP_P = 0.95

	# Captures everything that precedes the literal "assistantfinal" marker in
	# the decoded stream (DOTALL so the capture may span newlines).
	ANALYSIS_PATTERN = re.compile(r"^(.*)assistantfinal", re.DOTALL)
	# Second module-level name bound to the same compiled pattern.
	analysis_match = ANALYSIS_PATTERN

	# --- Silent Hub auth via env/Space Secret (no UI) ---
	# The token is taken from the first of these environment variables that is set.
	HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
	if HF_TOKEN:
	    try:
	        login(token=HF_TOKEN)
	    except Exception as exc:
	        # Best effort: a failed login must not stop the app from starting,
	        # but leave a trace in the logs instead of hiding it completely.
	        # Only the exception type is logged so nothing secret can leak.
	        LOG.warning(
	            "Hugging Face Hub login failed (%s); continuing without it",
	            type(exc).__name__,
	        )

	# Module-level cache for the tokenizer, the model and its device, so the
	# (expensive) load happens only once; all three start out unset.
	_tokenizer = _model = _device = None


	def _ensure_loaded():
	    """Load the tokenizer and model for ``MODEL_ID`` into module globals, once.

	    Populates ``_tokenizer``, ``_model`` and ``_device``. Calls made after
	    both the tokenizer and the model are set return immediately.
	    """
	    global _tokenizer, _model, _device
	    if _tokenizer is not None and _model is not None:
	        return

	    # Logged only when a load really happens (not on every call).
	    LOG.info("Loading model and tokenizer")
	    _tokenizer = AutoTokenizer.from_pretrained(
	        MODEL_ID, trust_remote_code=True
	    )
	    _model = AutoModelForCausalLM.from_pretrained(
	        MODEL_ID,
	        trust_remote_code=True,
	        # No explicit dtype is passed; the earlier override is kept for reference:
	        # torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
	        low_cpu_mem_usage=True,
	        device_map="auto" if torch.cuda.is_available() else None,
	    )
	    # Reuse EOS as the padding token when the tokenizer defines none.
	    if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
	        _tokenizer.pad_token = _tokenizer.eos_token
	    _model.eval()
	    # Input tensors are later moved to the device of the first parameter.
	    _device = next(_model.parameters()).device


	# Load eagerly at import time, i.e. before any call to the @spaces.GPU-wrapped
	# generate_stream (whose own _ensure_loaded() call is commented out).
	_ensure_loaded()
	LOG.info("DEVICE %s", _device)


	def _history_to_messages(history: List) -> List[Dict[str, str]]:
	    """Convert chat history into a list of role/content message dicts.

	    Args:
	        history: Previous turns. Each item is either a ``(user_msg, bot_msg)``
	            pair or a dict that already carries ``"role"`` and ``"content"``
	            keys. ``None`` is treated as an empty history.

	    Returns:
	        Messages in conversation order. Empty/``None`` entries are skipped.
	    """
	    msgs: List[Dict[str, str]] = []
	    for turn in history or []:
	        if isinstance(turn, dict):
	            # Already message-shaped: keep only role/content and drop any
	            # extra keys. Unpacking a dict as a pair would yield its keys.
	            role, content = turn.get("role"), turn.get("content")
	            if role and content:
	                msgs.append({"role": role, "content": content})
	            continue
	        user_msg, bot_msg = turn
	        if user_msg:
	            msgs.append({"role": "user", "content": user_msg})
	        if bot_msg:
	            msgs.append({"role": "assistant", "content": bot_msg})
	    return msgs


	@spaces.GPU(duration=120)
	def generate_stream(message: str, history: List[Tuple[str, str]]):
	    """
	    Minimal streaming chat function for gr.ChatInterface.
	    Uses instruct chat template. No token UI. No extra controls.

	    Args:
	        message: The new user message.
	        history: Previous turns, converted with ``_history_to_messages``.

	    Yields:
	        The cumulative reply so far. Until the literal ``assistantfinal``
	        marker shows up in the decoded stream, everything is rendered under
	        an "Analysis:" heading; afterwards the text before the marker stays
	        under "Analysis:" and the text after it goes under "Answer:".
	    """

	    # FIXME: check the memory footprint doing so. We should rather do this before the spaces wrapper...
	    # _ensure_loaded()

	    messages = _history_to_messages(history) + [{"role": "user", "content": message}]
	    inputs = _tokenizer.apply_chat_template(
	        messages,
	        return_tensors="pt",
	        add_generation_prompt=True,
	    )
	    # The result may be a bare tensor or a mapping-like encoding (which is
	    # not necessarily a dict subclass), so test for the tensor case.
	    input_ids = inputs if isinstance(inputs, torch.Tensor) else inputs["input_ids"]
	    input_ids = input_ids.to(_device)

	    # IMPORTANT: don't stream the prompt (prevents system/user text from appearing)
	    streamer = TextIteratorStreamer(
	        _tokenizer,
	        skip_special_tokens=True,
	        skip_prompt=True,  # <-- key fix
	    )

	    gen_kwargs = dict(
	        input_ids=input_ids,
	        max_new_tokens=MAX_NEW_TOKENS,
	        do_sample=TEMPERATURE > 0.0,
	        temperature=float(TEMPERATURE),
	        top_p=float(TOP_P),
	        pad_token_id=_tokenizer.pad_token_id,
	        eos_token_id=_tokenizer.eos_token_id,
	        streamer=streamer,
	    )

	    # Generation runs in a worker thread; this generator consumes the streamer.
	    thread = threading.Thread(target=_model.generate, kwargs=gen_kwargs)
	    thread.start()

	    analysis_prefix = r'^analysis\s*'
	    analysis = ""
	    output = ""
	    # Explicit state flag: the captured analysis may legitimately be empty,
	    # so its truthiness cannot tell whether the marker was already seen.
	    in_answer = False
	    for new_text in streamer:
	        output += new_text
	        if not in_answer:
	            m = ANALYSIS_PATTERN.match(output)
	            if m:
	                analysis = re.sub(analysis_prefix, '', m.group(1))
	                # Keep whatever followed the marker in this same chunk
	                # instead of throwing it away.
	                output = output[m.end():]
	                in_answer = True

	        LOG.info("NEW TEXT: %s, OUTPUT: %s", new_text, output.encode())
	        if not in_answer:
	            # Same prefix stripping as the final analysis, so the display
	            # does not change shape when the marker arrives.
	            answer = f"Analysis:\n{re.sub(analysis_prefix, '', output)}"
	        else:
	            answer = f"Analysis:\n{analysis}\nAnswer:\n{output}"
	        yield answer

	    # The streamer is exhausted, so generation is over; reap the worker.
	    thread.join()


	# UI: a Markdown header followed by a chat interface wired to generate_stream.
	with gr.Blocks(title="OpenAI GPT-OSS 20B Chat") as demo:
	    gr.Markdown(
	        """
	# Chat
	Streaming chat with openai/gpt-oss-20b (instruct)
	""")
	    chat_window = gr.Chatbot(height=420, label="OpenAI")
	    gr.ChatInterface(
	        fn=generate_stream,
	        chatbot=chat_window,
	        # The Markdown block above already serves as the header.
	        title=None,
	        description=None,
	    )

	# Script entry point: listen on all interfaces, on $PORT if set, else 7860.
	if __name__ == "__main__":
	    listen_port = int(os.environ.get("PORT", 7860))
	    demo.launch(server_name="0.0.0.0", server_port=listen_port)