# Hugging Face Space: Rbpppp/ticket-ocr (status: Sleeping)
# File size: 3,196 Bytes

import gradio as gr
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image
import spaces
import json
import re

# Hugging Face Hub checkpoint used for both the model weights and the
# processor; kept in one constant so the two cannot drift apart.
MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct"

# Load the model and its processor once, at import time, so every request
# served by analyze_ticket() reuses the same objects.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # `dtype` replaces the deprecated `torch_dtype` keyword in the
    # transformers releases that provide Qwen3VLForConditionalGeneration.
    dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Instruction text sent to the model together with the ticket image. It is
# written in Spanish and tells the model to answer with nothing but a JSON
# object that follows the schema spelled out below (merchant, date/time,
# currency, amounts, payment method, category and a list of line items),
# using null for missing data and real numbers instead of numeric strings.
# Despite the name, analyze_ticket() appends this text to the user turn; it
# is not sent as a separate system-role message.
SYSTEM_PROMPT = """Eres un asistente que recibe la imagen de un ticket de compra y responde SOLO con JSON válido.
Esquema requerido:
{
  "merchant": string,
  "date": string | null,
  "time": string | null,
  "currency": string | null,
  "subtotal": number | null,
  "tax": number | null,
  "total": number | null,
  "paymentMethod": string | null,
  "category": string | null,
  "items": [
    { "name": string, "quantity": number | null, "unitPrice": number | null, "total": number | null }
  ]
}
Reglas:
- No inventes valores: si falta un dato, usa null.
- Los números deben ser numéricos, no strings.
- La salida debe ser SOLO ese JSON, sin texto extra ni bloques de código."""

def _parse_json_response(output_text):
    """Convert the raw model reply into the value handed back to the caller.

    The reply is first parsed after stripping a Markdown code fence at its
    very start and/or end. If that fails, the text between the first ``{``
    and the last ``}`` is tried, which rescues replies where the model wrapped
    the JSON object in extra prose.

    Args:
        output_text: Decoded text generated by the model.

    Returns:
        The decoded JSON value on success, otherwise a dict with the keys
        ``raw_response`` (the untouched reply) and ``parse_error`` (True).
    """
    cleaned = output_text.strip()
    cleaned = re.sub(r'^```(?:json)?\s*', '', cleaned)
    cleaned = re.sub(r'\s*```$', '', cleaned)

    # Candidate 1: the fence-stripped reply as a whole.
    # Candidate 2: the outermost {...} span, when it differs from candidate 1.
    candidates = [cleaned]
    start = cleaned.find("{")
    end = cleaned.rfind("}")
    if 0 <= start < end:
        embedded = cleaned[start:end + 1]
        if embedded != cleaned:
            candidates.append(embedded)

    first_error = None
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            # Report the error of the primary attempt; it describes the reply
            # as the model actually produced it.
            if first_error is None:
                first_error = e

    print(f"Error parseando JSON: {first_error}")
    return {"raw_response": output_text, "parse_error": True}


@spaces.GPU(duration=120)
def analyze_ticket(image):
    """Analyze a receipt image and return the extracted data as JSON.

    Args:
        image: The receipt picture (the Gradio input is configured with
            ``type="pil"``), or None when nothing was uploaded.

    Returns:
        The JSON value decoded from the model reply. When no image is given,
        a dict with an ``error`` key. When the reply cannot be decoded, a dict
        with ``raw_response`` and ``parse_error`` keys.
    """
    if image is None:
        return {"error": "No se proporcionó imagen"}

    # Single user turn carrying both the image and the instructions.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": SYSTEM_PROMPT + "\n\nAnaliza este ticket."}
            ]
        }
    ]

    # Tokenize the conversation and move the tensors to the model's device.
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    )
    inputs = inputs.to(model.device)

    # Greedy decoding (do_sample=False); gradients are not needed.
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,
        )

    # generate() returns prompt + completion; drop the prompt tokens so only
    # the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    # Log a bounded prefix of the reply for debugging.
    print(f"Respuesta del modelo: {output_text[:500]}")

    return _parse_json_response(output_text)

# Gradio UI: one image in, one JSON panel out; the function is also exposed
# through the API under the name "predict".
ticket_input = gr.Image(label="Imagen del ticket", type="pil")
extracted_output = gr.JSON(label="Datos extraídos")

demo = gr.Interface(
    title="🧾 Ticket OCR",
    description="Sube una imagen de un ticket para extraer la información estructurada.",
    fn=analyze_ticket,
    inputs=ticket_input,
    outputs=extracted_output,
    api_name="predict",
)

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()