import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "dphn/Dolphin3.0-Llama3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

def chat(message, history):
    # With type="messages" below, history arrives as OpenAI-style
    # role/content dicts, so it can be passed straight to the chat template.
    messages = history + [{"role": "user", "content": message}]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,  # append the assistant header so the model replies
        return_tensors="pt",
    ).to(model.device)
    outputs = model.generate(inputs, max_new_tokens=512)
    # Decode only the newly generated tokens, not the echoed prompt.
    reply = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
    # ChatInterface manages history itself; return just the assistant reply.
    return reply

gr.ChatInterface(fn=chat, type="messages", title="Dolphin 3.0 Chat").launch()
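
# Optional streaming variant (a sketch under the same setup as above):
# gr.ChatInterface also accepts generator functions, so the reply can be
# streamed token by token with transformers' TextIteratorStreamer.
# `chat_stream` is an illustrative name introduced here, not part of any API.
from threading import Thread

from transformers import TextIteratorStreamer

def chat_stream(message, history):
    messages = history + [{"role": "user", "content": message}]
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    # skip_prompt=True drops the echoed input; skip_special_tokens is passed
    # through to tokenizer.decode for each streamed chunk.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # generate() blocks, so run it on a worker thread and consume the streamer here.
    Thread(
        target=model.generate,
        kwargs=dict(inputs=inputs, max_new_tokens=512, streamer=streamer),
    ).start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial  # Gradio re-renders the assistant message on each yield

# To use it, swap the generator in for the blocking handler:
# gr.ChatInterface(fn=chat_stream, type="messages", title="Dolphin 3.0 Chat").launch()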