Spaces:

Rbpppp
/

ticket-ocr

Sleeping

App Files Files Community

Rbpppp committed 9 days ago

Commit

4ad7141

verified ·

1 Parent(s): ef66f3a

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -110

app.py CHANGED Viewed

@@ -1,110 +1,112 @@
-import gradio as gr
-import spaces
-import torch
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
-from qwen_vl_utils import process_vision_info
-import json
-import re
-# Cargar modelo y procesador (se hace una vez al iniciar)
-MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-)
-processor = AutoProcessor.from_pretrained(MODEL_ID)
-SYSTEM_PROMPT = """Eres un asistente que recibe la imagen de un ticket de compra y responde SOLO con JSON válido.
-Esquema requerido:
-{
-  "merchant": string,
-  "date": string | null,
-  "time": string | null,
-  "currency": string | null,
-  "subtotal": number | null,
-  "tax": number | null,
-  "total": number | null,
-  "paymentMethod": string | null,
-  "category": string | null,
-  "items": [
-    { "name": string, "quantity": number | null, "unitPrice": number | null, "total": number | null }
-  ]
-}
-Reglas:
-- No inventes valores: si falta un dato, usa null.
-- Los números deben ser numéricos, no strings.
-- La salida debe ser SOLO ese JSON, sin texto extra ni bloques de código."""
-@spaces.GPU
-def analyze_ticket(image):
-    """Analiza una imagen de ticket y devuelve JSON estructurado."""
-    if image is None:
-        return {"error": "No se proporcionó imagen"}
-    messages = [
-        {
-            "role": "system",
-            "content": SYSTEM_PROMPT
-        },
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": "Analiza este ticket y extrae la información en formato JSON."}
-            ],
-        }
-    ]
-    # Preparar inputs
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    ).to(model.device)
-    # Generar respuesta
-    generated_ids = model.generate(
-        **inputs,
-        max_new_tokens=1024,
-        do_sample=False,
-    )
-    # Decodificar solo los tokens generados
-    generated_ids_trimmed = [
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed,
-        skip_special_tokens=True,
-        clean_up_tokenization_spaces=False
-    )[0]
-    # Intentar parsear como JSON
-    try:
-        # Limpiar posibles bloques de código markdown
-        cleaned = re.sub(r'^```(?:json)?\s*', '', output_text.strip())
-        cleaned = re.sub(r'\s*```$', '', cleaned)
-        result = json.loads(cleaned)
-        return result
-    except json.JSONDecodeError:
-        # Si falla el parseo, devolver el texto crudo
-        return {"raw_response": output_text, "parse_error": True}
-# Crear interfaz Gradio
-demo = gr.Interface(
-    fn=analyze_ticket,
-    inputs=gr.Image(type="pil", label="Imagen del ticket"),
-    outputs=gr.JSON(label="Datos extraídos"),
-    title="🧾 Ticket OCR",
-    description="Sube una imagen de un ticket para extraer la información estructurada.",
-    api_name="predict"
-)
-if __name__ == "__main__":
-    demo.launch()

+import gradio as gr
+import spaces
+import torch
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from PIL import Image
+import json
+import re
+# Cargar modelo más pequeño (2B en lugar de 3B)
+MODEL_ID = "Qwen/Qwen2.5-VL-2B-Instruct"
+print("Cargando modelo...")
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.float16,
+    device_map="auto",
+    trust_remote_code=True,
+)
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+print("Modelo cargado!")
+SYSTEM_PROMPT = """Eres un asistente que recibe la imagen de un ticket de compra y responde SOLO con JSON válido.
+Esquema requerido:
+{
+  "merchant": string,
+  "date": string | null,
+  "time": string | null,
+  "currency": string | null,
+  "subtotal": number | null,
+  "tax": number | null,
+  "total": number | null,
+  "paymentMethod": string | null,
+  "category": string | null,
+  "items": [
+    { "name": string, "quantity": number | null, "unitPrice": number | null, "total": number | null }
+  ]
+}
+Reglas:
+- No inventes valores: si falta un dato, usa null.
+- Los números deben ser numéricos, no strings.
+- La salida debe ser SOLO ese JSON, sin texto extra ni bloques de código."""
+@spaces.GPU
+def analyze_ticket(image):
+    """Analiza una imagen de ticket y devuelve JSON estructurado."""
+    if image is None:
+        return {"error": "No se proporcionó imagen"}
+    # Construir mensajes para el modelo
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": SYSTEM_PROMPT + "\n\nAnaliza este ticket y extrae la información."}
+            ],
+        }
+    ]
+    # Preparar inputs
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor(
+        text=[text],
+        images=[image],
+        padding=True,
+        return_tensors="pt",
+    ).to(model.device)
+    # Generar respuesta
+    with torch.no_grad():
+        generated_ids = model.generate(
+            **inputs,
+            max_new_tokens=1024,
+            do_sample=False,
+            pad_token_id=processor.tokenizer.pad_token_id,
+        )
+    # Decodificar solo los tokens generados
+    generated_ids_trimmed = generated_ids[:, inputs.input_ids.shape[1]:]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False
+    )[0]
+    print(f"Respuesta del modelo: {output_text[:500]}")
+    # Intentar parsear como JSON
+    try:
+        # Limpiar posibles bloques de código markdown
+        cleaned = output_text.strip()
+        cleaned = re.sub(r'^```(?:json)?\s*', '', cleaned)
+        cleaned = re.sub(r'\s*```$', '', cleaned)
+        result = json.loads(cleaned)
+        return result
+    except json.JSONDecodeError as e:
+        print(f"Error parseando JSON: {e}")
+        return {"raw_response": output_text, "parse_error": True}
+# Crear interfaz Gradio
+demo = gr.Interface(
+    fn=analyze_ticket,
+    inputs=gr.Image(type="pil", label="Imagen del ticket"),
+    outputs=gr.JSON(label="Datos extraídos"),
+    title="🧾 Ticket OCR",
+    description="Sube una imagen de un ticket para extraer la información estructurada.",
+    api_name="predict"
+)
+if __name__ == "__main__":
+    demo.launch()