AdithyaSK committed
Commit db64b10 · 1 Parent(s): 4392e56

Refactor app.py: update demo description, enhance PDF handling, and improve model loading functions

Files changed (1)
  1. app.py +323 -287
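The main functional change in the diff below is that pages are now embedded in small batches instead of a single `process_images` call over the whole PDF. A minimal sketch of that pattern in plain PyTorch; `model` and `processor` are hypothetical stand-ins for the colpali-engine objects that `app.py` loads, not a documented API:

```python
from typing import List

import torch
from PIL import Image


def embed_pages_in_batches(images: List[Image.Image], model, processor,
                           device: str = "cuda", batch_size: int = 2) -> torch.Tensor:
    """Embed page images a few at a time to keep peak GPU memory low."""
    embeddings_list = []
    for i in range(0, len(images), batch_size):
        batch = images[i:i + batch_size]
        # processor.process_images / model(**inputs) mirror the calls made in app.py;
        # both objects are placeholders here, so adapt to your own encoder pair.
        batch_inputs = processor.process_images(batch).to(device)
        with torch.no_grad():  # inference only, no autograd graph
            embeddings = model(**batch_inputs)
        embeddings_list.append(embeddings.cpu())  # move results off the GPU between batches
    return torch.cat(embeddings_list, dim=0)
```

Keeping each batch's embeddings on CPU and concatenating at the end is what lets the Space cap uploads at ~150 pages without running out of GPU memory, as the new `index_*_images` functions in the diff do.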
app.py CHANGED
@@ -1,28 +1,22 @@
  """
- Gradio Demo for Document Retrieval - Hugging Face Spaces with ZeroGPU
-
- This script creates a Gradio interface for testing both BiGemma3 and ColGemma3 models
- with PDF document upload, automatic conversion to images, and query-based retrieval.
-
- Features:
- - PDF upload with automatic conversion to images
- - Model selection: NetraEmbed (BiGemma3), ColNetraEmbed (ColGemma3), or Both
- - Query input with top-k selection (default: 5)
- - Similarity score display
- - Side-by-side comparison when both models are selected
- - ZeroGPU integration for efficient GPU usage
- """

- import io
- import gc
- import math
- from typing import List, Optional, Tuple

- import gradio as gr
- import torch
  import spaces
  from pdf2image import convert_from_path
  from PIL import Image
  import matplotlib.pyplot as plt
  import numpy as np
  import seaborn as sns
@@ -33,8 +27,6 @@ from colpali_engine.models import BiGemma3, BiGemmaProcessor3, ColGemma3, ColGem
  from colpali_engine.interpretability import get_similarity_maps_from_embeddings
  from colpali_engine.interpretability.similarity_map_utils import normalize_similarity_map

- # Configuration
- MAX_BATCH_SIZE = 32 # Maximum pages to process at once
  device = "cuda" if torch.cuda.is_available() else "cpu"

  print(f"Device: {device}")
@@ -54,146 +46,144 @@ class DocumentIndex:

  doc_index = DocumentIndex()

- # Helper functions
- def pdf_to_images(pdf_path: str) -> List[Image.Image]:
- """Convert PDF to list of PIL Images with error handling."""
- try:
- print(f"Converting PDF to images: {pdf_path}")
- images = convert_from_path(pdf_path, dpi=200)
- print(f"Converted {len(images)} pages")
- return images
- except Exception as e:
- print(f"❌ PDF conversion error: {str(e)}")
- raise gr.Error(f"Failed to convert PDF: {str(e)}")

  @spaces.GPU
  def load_bigemma_model():
  """Load BiGemma3 model and processor."""
  if doc_index.bigemma_model is None:
  print("Loading BiGemma3 (NetraEmbed)...")
- try:
- doc_index.bigemma_processor = BiGemmaProcessor3.from_pretrained(
- "Cognitive-Lab/NetraEmbed",
- use_fast=True,
- )
- doc_index.bigemma_model = BiGemma3.from_pretrained(
- "Cognitive-Lab/NetraEmbed",
- torch_dtype=torch.bfloat16,
- device_map=device,
- )
- doc_index.bigemma_model.eval()
- print("✓ BiGemma3 loaded successfully")
- except Exception as e:
- print(f"❌ Failed to load BiGemma3: {str(e)}")
- raise gr.Error(f"Failed to load BiGemma3: {str(e)}")
- return "✅ BiGemma3 loaded"

  @spaces.GPU
  def load_colgemma_model():
  """Load ColGemma3 model and processor."""
  if doc_index.colgemma_model is None:
  print("Loading ColGemma3 (ColNetraEmbed)...")
  try:
- doc_index.colgemma_model = ColGemma3.from_pretrained(
- "Cognitive-Lab/ColNetraEmbed",
- dtype=torch.bfloat16,
- device_map=device,
- )
- doc_index.colgemma_model.eval()
- doc_index.colgemma_processor = ColGemmaProcessor3.from_pretrained(
- "Cognitive-Lab/ColNetraEmbed",
- use_fast=True,
- )
- print("✓ ColGemma3 loaded successfully")
  except Exception as e:
- print(f"❌ Failed to load ColGemma3: {str(e)}")
- raise gr.Error(f"Failed to load ColGemma3: {str(e)}")
- return "✅ ColGemma3 loaded"

- def unload_models():
- """Unload models and free GPU memory."""
- try:
- if doc_index.bigemma_model is not None:
- del doc_index.bigemma_model
- del doc_index.bigemma_processor
- doc_index.bigemma_model = None
- doc_index.bigemma_processor = None
-
- if doc_index.colgemma_model is not None:
- del doc_index.colgemma_model
- del doc_index.colgemma_processor
- doc_index.colgemma_model = None
- doc_index.colgemma_processor = None
-
- # Clear embeddings and images
- doc_index.bigemma_embeddings = None
- doc_index.colgemma_embeddings = None
- doc_index.images = []
-
- # Force garbage collection
- gc.collect()
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
- torch.cuda.synchronize()
-
- return "✅ Models unloaded and GPU memory cleared"
- except Exception as e:
- return f"❌ Error unloading models: {str(e)}"

  @spaces.GPU
- def index_bigemma_images(images: List[Image.Image]) -> torch.Tensor:
- """Index images with BiGemma3 model."""
- # Ensure model is loaded
- if doc_index.bigemma_model is None:
- load_bigemma_model()

- model, processor = doc_index.bigemma_model, doc_index.bigemma_processor
- batch_images = processor.process_images(images).to(device)
- embeddings = model(**batch_images, embedding_dim=768)
- return embeddings

  @spaces.GPU
- def index_colgemma_images(images: List[Image.Image]) -> torch.Tensor:
- """Index images with ColGemma3 model."""
- # Ensure model is loaded
- if doc_index.colgemma_model is None:
- load_colgemma_model()

- model, processor = doc_index.colgemma_model, doc_index.colgemma_processor
- batch_images = processor.process_images(images).to(device)
- embeddings = model(**batch_images)
- return embeddings

- def index_document(pdf_file, model_choice: str):
- """Upload and index a PDF document."""
- if pdf_file is None:
- return "⚠️ Please upload a PDF document first."

  try:
- status = []

- # Convert PDF to images
- status.append("⏳ Converting PDF to images...")
- doc_index.images = pdf_to_images(pdf_file.name)
  num_pages = len(doc_index.images)
- status.append(f"✓ Converted PDF to {num_pages} images")
-
- if num_pages > MAX_BATCH_SIZE:
- status.append(f"⚠️ Large PDF ({num_pages} pages). Processing in batches...")

  # Index with BiGemma3
  if model_choice in ["NetraEmbed (BiGemma3)", "Both"]:
- status.append("⏳ Loading & encoding with BiGemma3...")
  doc_index.bigemma_embeddings = index_bigemma_images(doc_index.images)
- status.append(f"✓ Indexed with BiGemma3 (shape: {doc_index.bigemma_embeddings.shape})")

  # Index with ColGemma3
  if model_choice in ["ColNetraEmbed (ColGemma3)", "Both"]:
- status.append("⏳ Loading & encoding with ColGemma3...")
  doc_index.colgemma_embeddings = index_colgemma_images(doc_index.images)
- status.append(f"✓ Indexed with ColGemma3 (shape: {doc_index.colgemma_embeddings.shape})")

- return "\n".join(status) + "\n\n✅ Document ready for querying!"

  except Exception as e:
  import traceback
@@ -201,67 +191,59 @@ def index_document(pdf_file, model_choice: str):
  print(f"Indexing error: {error_details}")
  return f"❌ Error indexing document: {str(e)}"

  @spaces.GPU
  def generate_colgemma_heatmap(
  image: Image.Image,
- query: str,
  query_embedding: torch.Tensor,
  image_embedding: torch.Tensor,
- model,
- processor,
  ) -> Image.Image:
  """Generate heatmap overlay for ColGemma3 results."""
  try:
- # Re-process the single image to get the proper batch_images dict for image mask
  batch_images = processor.process_images([image]).to(device)

- # Create image mask manually
  if "input_ids" in batch_images and hasattr(model.config, "image_token_id"):
  image_token_id = model.config.image_token_id
  image_mask = batch_images["input_ids"] == image_token_id
  else:
  image_mask = torch.ones(
- image_embedding.shape[0], image_embedding.shape[1], dtype=torch.bool, device=device
  )

- # Calculate n_patches from actual number of image tokens
  num_image_tokens = image_mask.sum().item()
  n_side = int(math.sqrt(num_image_tokens))
-
- if n_side * n_side == num_image_tokens:
- n_patches = (n_side, n_side)
- else:
- n_patches = (16, 16)

  # Generate similarity maps
  similarity_maps_list = get_similarity_maps_from_embeddings(
- image_embeddings=image_embedding,
- query_embeddings=query_embedding,
  n_patches=n_patches,
  image_mask=image_mask,
  )

  similarity_map = similarity_maps_list[0]
-
- # Aggregate across all query tokens
  if similarity_map.dtype == torch.bfloat16:
  similarity_map = similarity_map.float()
  aggregated_map = torch.mean(similarity_map, dim=0)

- # Convert the image to an array
  img_array = np.array(image.convert("RGBA"))
-
- # Normalize the similarity map
  similarity_map_array = normalize_similarity_map(aggregated_map).to(torch.float32).cpu().numpy()
  similarity_map_array = rearrange(similarity_map_array, "h w -> w h")

- # Create PIL image from similarity map
  similarity_map_image = Image.fromarray((similarity_map_array * 255).astype("uint8")).resize(
  image.size, Image.Resampling.BICUBIC
  )

  # Create matplotlib figure
- _, ax = plt.subplots(figsize=(10, 10))
  ax.imshow(img_array)
  ax.imshow(
  similarity_map_image,
@@ -284,210 +266,261 @@ generate_colgemma_heatmap(
  print(f"❌ Heatmap generation error: {str(e)}")
  return image

- @spaces.GPU
- def query_bigemma(query: str, top_k: int) -> Tuple[str, List]:
- """Query indexed documents with BiGemma3."""
- # Ensure model is loaded
- if doc_index.bigemma_model is None:
- load_bigemma_model()
-
- model, processor = doc_index.bigemma_model, doc_index.bigemma_processor
-
- # Encode query
- batch_query = processor.process_texts([query]).to(device)
- query_embedding = model(**batch_query, embedding_dim=768)
-
- # Compute scores
- scores = processor.score(qs=query_embedding, ps=doc_index.bigemma_embeddings)
-
- # Get top-k results
- top_k_actual = min(top_k, len(doc_index.images))
- top_indices = scores[0].argsort(descending=True)[:top_k_actual]
-
- # Format results
- results_text = "### BiGemma3 (NetraEmbed) Results\n\n"
- gallery_images = []
-
- for rank, idx in enumerate(top_indices):
- score = scores[0, idx].item()
- results_text += f"**Rank {rank + 1}:** Page {idx.item() + 1} - Score: {score:.4f}\n"
- gallery_images.append(
- (doc_index.images[idx.item()], f"Rank {rank + 1} - Page {idx.item() + 1} (Score: {score:.4f})")
- )
-
- return results_text, gallery_images

  @spaces.GPU
- def query_colgemma(query: str, top_k: int, show_heatmap: bool = False) -> Tuple[str, List]:
- """Query indexed documents with ColGemma3."""
- # Ensure model is loaded
- if doc_index.colgemma_model is None:
- load_colgemma_model()
-
- model, processor = doc_index.colgemma_model, doc_index.colgemma_processor
-
- # Encode query
- batch_query = processor.process_queries([query]).to(device)
- query_embedding = model(**batch_query)
-
- # Compute scores
- scores = processor.score_multi_vector(qs=query_embedding, ps=doc_index.colgemma_embeddings)
-
- # Get top-k results
- top_k_actual = min(top_k, len(doc_index.images))
- top_indices = scores[0].argsort(descending=True)[:top_k_actual]
-
- # Format results
- results_text = "### ColGemma3 (ColNetraEmbed) Results\n\n"
- gallery_images = []
-
- for rank, idx in enumerate(top_indices):
- score = scores[0, idx].item()
- results_text += f"**Rank {rank + 1}:** Page {idx.item() + 1} - Score: {score:.2f}\n"
-
- # Generate heatmap if requested
- if show_heatmap:
- heatmap_image = generate_colgemma_heatmap(
- image=doc_index.images[idx.item()],
- query=query,
- query_embedding=query_embedding,
- image_embedding=doc_index.colgemma_embeddings[idx.item()].unsqueeze(0),
- model=model,
- processor=processor,
- )
- gallery_images.append(
- (heatmap_image, f"Rank {rank + 1} - Page {idx.item() + 1} (Score: {score:.2f})")
- )
- else:
- gallery_images.append(
- (doc_index.images[idx.item()], f"Rank {rank + 1} - Page {idx.item() + 1} (Score: {score:.2f})")
- )
-
- return results_text, gallery_images
-
  def query_documents(
  query: str, model_choice: str, top_k: int, show_heatmap: bool = False
- ) -> Tuple[Optional[str], Optional[str], Optional[List], Optional[List]]:
  """Query the indexed documents."""
  if not doc_index.images:
- return "⚠️ Please upload and index a document first.", None, None, None

  if not query.strip():
- return "⚠️ Please enter a query.", None, None, None

  try:
- results_bi = None
- results_col = None
- gallery_images_bi = []
- gallery_images_col = []

  # Query with BiGemma3
  if model_choice in ["NetraEmbed (BiGemma3)", "Both"]:
  if doc_index.bigemma_embeddings is None:
- return "⚠️ Please index the document with BiGemma3 first.", None, None, None
- results_bi, gallery_images_bi = query_bigemma(query, top_k)

  # Query with ColGemma3
  if model_choice in ["ColNetraEmbed (ColGemma3)", "Both"]:
  if doc_index.colgemma_embeddings is None:
- return "⚠️ Please index the document with ColGemma3 first.", None, None, None
- results_col, gallery_images_col = query_colgemma(query, top_k, show_heatmap)

  # Return results based on model choice
  if model_choice == "NetraEmbed (BiGemma3)":
- return results_bi, None, gallery_images_bi, None
  elif model_choice == "ColNetraEmbed (ColGemma3)":
- return results_col, None, None, gallery_images_col
  else: # Both
- return results_bi, results_col, gallery_images_bi, gallery_images_col

  except Exception as e:
  import traceback
  error_details = traceback.format_exc()
  print(f"Query error: {error_details}")
- return f"❌ Error during query: {str(e)}", None, None, None

  # Create Gradio interface
  with gr.Blocks(title="NetraEmbed Demo") as demo:
  # Header section
- gr.Markdown("# NetraEmbed")
- gr.HTML(
- """
- <div style="display: flex; gap: 8px; flex-wrap: wrap; margin-bottom: 15px;">
- <a href="https://arxiv.org/abs/2512.03514" target="_blank">
- <img src="https://img.shields.io/badge/arXiv-2512.03514-b31b1b.svg" alt="Paper">
- </a>
- <a href="https://github.com/adithya-s-k/colpali" target="_blank">
- <img src="https://img.shields.io/badge/GitHub-colpali-181717?logo=github" alt="GitHub">
- </a>
- <a href="https://huggingface.co/Cognitive-Lab/ColNetraEmbed" target="_blank">
- <img src="https://img.shields.io/badge/🤗%20HuggingFace-Model-yellow" alt="Model">
- </a>
- </div>
- """
- )
- gr.Markdown(
- """
- **🚀 Universal Multilingual Multimodal Document Retrieval**

- Upload a PDF document, select your model(s), and query using semantic search.

- **Available Models:**
- - **NetraEmbed (BiGemma3)**: Single-vector embedding - Fast retrieval with cosine similarity
- - **ColNetraEmbed (ColGemma3)**: Multi-vector embedding - High-quality retrieval with MaxSim scoring and heatmaps
- """
- )

  with gr.Row():
- # Column 1: Model Selection
  with gr.Column(scale=1):
- gr.Markdown("### 🤖 Model Selection")
  model_select = gr.Radio(
  choices=["NetraEmbed (BiGemma3)", "ColNetraEmbed (ColGemma3)", "Both"],
  value="Both",
  label="Select Model(s)",
  )

- # Column 2: Document Upload
- with gr.Column(scale=1):
- gr.Markdown("### 📄 Upload & Index")
- pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
- index_btn = gr.Button("📥 Index Document", variant="primary")
- index_status = gr.Textbox(label="Status", lines=6, interactive=False)

- # Column 3: Query
- with gr.Column(scale=1):
- gr.Markdown("### 🔎 Query Document")
  query_input = gr.Textbox(
  label="Enter Query",
  placeholder="e.g., financial report, organizational structure...",
  lines=2,
  )
  with gr.Row():
- top_k_slider = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Top K", scale=2)
- heatmap_checkbox = gr.Checkbox(label="Heatmaps", value=False, scale=1)
- query_btn = gr.Button("🔍 Search", variant="primary")

  gr.Markdown("---")

  # Results section
- gr.Markdown("### 📊 Results")
- with gr.Row():
  with gr.Column(scale=1):
- bigemma_results = gr.Markdown(value="*BiGemma3 results will appear here...*")
  bigemma_gallery = gr.Gallery(
  label="BiGemma3 - Top Retrieved Pages",
  columns=2,
  height="auto",
  )
  with gr.Column(scale=1):
- colgemma_results = gr.Markdown(value="*ColGemma3 results will appear here...*")
  colgemma_gallery = gr.Gallery(
  label="ColGemma3 - Top Retrieved Pages",
  columns=2,
  height="auto",
  )

  # Event handlers
  index_btn.click(
  fn=index_document,
@@ -498,8 +531,11 @@ with gr.Blocks(title="NetraEmbed Demo") as demo:
  query_btn.click(
  fn=query_documents,
  inputs=[query_input, model_select, top_k_slider, heatmap_checkbox],
- outputs=[bigemma_results, colgemma_results, bigemma_gallery, colgemma_gallery],
  )

- # Launch the app
- demo.launch()

  """
+ NetraEmbed Demo - Document Retrieval with BiGemma3 and ColGemma3
+
+ This demo allows you to:
+ 1. Select a model (NetraEmbed, ColNetraEmbed, or Both)
+ 2. Upload PDF files and index them
+ 3. Search for relevant pages based on your query
+
+ HuggingFace Spaces deployment with ZeroGPU support.
+ """

  import spaces
+ import torch
+ import gradio as gr
  from pdf2image import convert_from_path
  from PIL import Image
+ from typing import List, Tuple, Optional
+ import math
+ import io
  import matplotlib.pyplot as plt
  import numpy as np
  import seaborn as sns
  from colpali_engine.interpretability import get_similarity_maps_from_embeddings
  from colpali_engine.interpretability.similarity_map_utils import normalize_similarity_map

  device = "cuda" if torch.cuda.is_available() else "cpu"

  print(f"Device: {device}")

  doc_index = DocumentIndex()

  @spaces.GPU
  def load_bigemma_model():
  """Load BiGemma3 model and processor."""
  if doc_index.bigemma_model is None:
  print("Loading BiGemma3 (NetraEmbed)...")
+ doc_index.bigemma_processor = BiGemmaProcessor3.from_pretrained(
+ "Cognitive-Lab/NetraEmbed",
+ use_fast=True,
+ )
+ doc_index.bigemma_model = BiGemma3.from_pretrained(
+ "Cognitive-Lab/NetraEmbed",
+ torch_dtype=torch.bfloat16,
+ device_map=device,
+ ).eval()
+ print("✓ BiGemma3 loaded successfully")
+ return doc_index.bigemma_model, doc_index.bigemma_processor
+

  @spaces.GPU
  def load_colgemma_model():
  """Load ColGemma3 model and processor."""
  if doc_index.colgemma_model is None:
  print("Loading ColGemma3 (ColNetraEmbed)...")
+ doc_index.colgemma_model = ColGemma3.from_pretrained(
+ "Cognitive-Lab/ColNetraEmbed",
+ dtype=torch.bfloat16,
+ device_map=device,
+ ).eval()
+ doc_index.colgemma_processor = ColGemmaProcessor3.from_pretrained(
+ "Cognitive-Lab/ColNetraEmbed",
+ use_fast=True,
+ )
+ print("✓ ColGemma3 loaded successfully")
+ return doc_index.colgemma_model, doc_index.colgemma_processor
+
+
+ def pdf_to_images(pdf_paths: List[str]) -> List[Image.Image]:
+ """Convert PDF files to list of PIL Images."""
+ images = []
+ for pdf_path in pdf_paths:
  try:
+ print(f"Converting PDF to images: {pdf_path}")
+ page_images = convert_from_path(pdf_path, dpi=200)
+ images.extend(page_images)
+ print(f"Converted {len(page_images)} pages from {pdf_path}")
  except Exception as e:
+ print(f"❌ PDF conversion error for {pdf_path}: {str(e)}")
+ raise gr.Error(f"Failed to convert PDF: {str(e)}")
+
+ if len(images) >= 150:
+ raise gr.Error("The number of images should be less than 150.")
+
+ return images

  @spaces.GPU
+ def index_bigemma_images(images: List[Image.Image]):
+ """Index images with BiGemma3."""
+ model, processor = load_bigemma_model()
+
+ print(f"Indexing {len(images)} images with BiGemma3...")
+ embeddings_list = []
+
+ # Process in smaller batches to avoid memory issues
+ batch_size = 2
+ for i in range(0, len(images), batch_size):
+ batch = images[i:i+batch_size]
+ batch_images = processor.process_images(batch).to(device)
+
+ with torch.no_grad():
+ embeddings = model(**batch_images, embedding_dim=768)
+ embeddings_list.append(embeddings.cpu())
+
+ # Concatenate all embeddings
+ all_embeddings = torch.cat(embeddings_list, dim=0)
+ print(f"✓ Indexed {len(images)} pages with BiGemma3 (shape: {all_embeddings.shape})")
+
+ return all_embeddings

  @spaces.GPU
+ def index_colgemma_images(images: List[Image.Image]):
+ """Index images with ColGemma3."""
+ model, processor = load_colgemma_model()
+
+ print(f"Indexing {len(images)} images with ColGemma3...")
+ embeddings_list = []
+
+ # Process in smaller batches to avoid memory issues
+ batch_size = 2
+ for i in range(0, len(images), batch_size):
+ batch = images[i:i+batch_size]
+ batch_images = processor.process_images(batch).to(device)
+
+ with torch.no_grad():
+ embeddings = model(**batch_images)
+ embeddings_list.append(embeddings.cpu())
+
+ # Concatenate all embeddings
+ all_embeddings = torch.cat(embeddings_list, dim=0)
+ print(f"✓ Indexed {len(images)} pages with ColGemma3 (shape: {all_embeddings.shape})")

+ return all_embeddings

+
+ def index_document(pdf_files, model_choice: str) -> str:
+ """Upload and index PDF documents."""
+ if not pdf_files:
+ return "⚠️ Please upload PDF documents first."
+
+ if not model_choice:
+ return "⚠️ Please select a model first."
  try:
+ status_messages = []
+ # Convert PDFs to images
+ status_messages.append("⏳ Converting PDFs to images...")
+ pdf_paths = [f.name for f in pdf_files]
+ doc_index.images = pdf_to_images(pdf_paths)
  num_pages = len(doc_index.images)
+ status_messages.append(f"✓ Converted to {num_pages} images")

  # Index with BiGemma3
  if model_choice in ["NetraEmbed (BiGemma3)", "Both"]:
+ status_messages.append("⏳ Indexing with BiGemma3...")
  doc_index.bigemma_embeddings = index_bigemma_images(doc_index.images)
+ status_messages.append("✓ Indexed with BiGemma3")

  # Index with ColGemma3
  if model_choice in ["ColNetraEmbed (ColGemma3)", "Both"]:
+ status_messages.append("⏳ Indexing with ColGemma3...")
  doc_index.colgemma_embeddings = index_colgemma_images(doc_index.images)
+ status_messages.append("✓ Indexed with ColGemma3")

+ final_status = "\n".join(status_messages) + "\n\n✅ Document ready for querying!"
+ return final_status

  except Exception as e:
  import traceback
  print(f"Indexing error: {error_details}")
  return f"❌ Error indexing document: {str(e)}"

+
  @spaces.GPU
  def generate_colgemma_heatmap(
  image: Image.Image,
  query_embedding: torch.Tensor,
  image_embedding: torch.Tensor,
  ) -> Image.Image:
  """Generate heatmap overlay for ColGemma3 results."""
  try:
+ model, processor = load_colgemma_model()
+
+ # Re-process the single image
  batch_images = processor.process_images([image]).to(device)

+ # Create image mask
  if "input_ids" in batch_images and hasattr(model.config, "image_token_id"):
  image_token_id = model.config.image_token_id
  image_mask = batch_images["input_ids"] == image_token_id
  else:
  image_mask = torch.ones(
+ image_embedding.shape[0], image_embedding.shape[1],
+ dtype=torch.bool, device=device
  )

+ # Calculate n_patches
  num_image_tokens = image_mask.sum().item()
  n_side = int(math.sqrt(num_image_tokens))
+ n_patches = (n_side, n_side) if n_side * n_side == num_image_tokens else (16, 16)

  # Generate similarity maps
  similarity_maps_list = get_similarity_maps_from_embeddings(
+ image_embeddings=image_embedding.unsqueeze(0).to(device),
+ query_embeddings=query_embedding.to(device),
  n_patches=n_patches,
  image_mask=image_mask,
  )

  similarity_map = similarity_maps_list[0]
  if similarity_map.dtype == torch.bfloat16:
  similarity_map = similarity_map.float()
  aggregated_map = torch.mean(similarity_map, dim=0)

+ # Create heatmap overlay
  img_array = np.array(image.convert("RGBA"))
  similarity_map_array = normalize_similarity_map(aggregated_map).to(torch.float32).cpu().numpy()
  similarity_map_array = rearrange(similarity_map_array, "h w -> w h")

  similarity_map_image = Image.fromarray((similarity_map_array * 255).astype("uint8")).resize(
  image.size, Image.Resampling.BICUBIC
  )

  # Create matplotlib figure
+ fig, ax = plt.subplots(figsize=(10, 10))
  ax.imshow(img_array)
  ax.imshow(
  similarity_map_image,
  print(f"❌ Heatmap generation error: {str(e)}")
  return image

  @spaces.GPU
  def query_documents(
  query: str, model_choice: str, top_k: int, show_heatmap: bool = False
+ ) -> Tuple[Optional[List], Optional[str], Optional[List], Optional[str]]:
  """Query the indexed documents."""
  if not doc_index.images:
+ return None, "⚠️ Please upload and index a document first.", None, None

  if not query.strip():
+ return None, "⚠️ Please enter a query.", None, None

  try:
+ bigemma_results = []
+ bigemma_text = ""
+ colgemma_results = []
+ colgemma_text = ""

  # Query with BiGemma3
  if model_choice in ["NetraEmbed (BiGemma3)", "Both"]:
  if doc_index.bigemma_embeddings is None:
+ return None, "⚠️ Please index the document with BiGemma3 first.", None, None
+
+ model, processor = load_bigemma_model()
+
+ # Encode query
+ batch_query = processor.process_texts([query]).to(device)
+ with torch.no_grad():
+ query_embedding = model(**batch_query, embedding_dim=768)
+
+ # Compute scores
+ scores = processor.score(
+ qs=[query_embedding[0].cpu()],
+ ps=list(torch.unbind(doc_index.bigemma_embeddings)),
+ device=device,
+ )
+
+ # Get top-k results
+ top_k_actual = min(top_k, len(doc_index.images))
+ top_indices = scores[0].argsort(descending=True)[:top_k_actual]
+
+ # Format results
+ bigemma_text = "### BiGemma3 (NetraEmbed) Results\n\n"
+ for rank, idx in enumerate(top_indices):
+ score = scores[0, idx].item()
+ bigemma_text += f"**Rank {rank + 1}:** Page {idx.item() + 1} - Score: {score:.4f}\n"
+ bigemma_results.append(
+ (doc_index.images[idx.item()], f"Rank {rank + 1} - Page {idx.item() + 1} (Score: {score:.4f})")
+ )

  # Query with ColGemma3
  if model_choice in ["ColNetraEmbed (ColGemma3)", "Both"]:
  if doc_index.colgemma_embeddings is None:
+ return bigemma_results if bigemma_results else None, bigemma_text if bigemma_text else "⚠️ Please index the document with ColGemma3 first.", None, None
+
+ model, processor = load_colgemma_model()
+
+ # Encode query
+ batch_query = processor.process_queries([query]).to(device)
+ with torch.no_grad():
+ query_embedding = model(**batch_query)
+
+ # Compute scores
+ scores = processor.score_multi_vector(
+ qs=[query_embedding[0].cpu()],
+ ps=list(torch.unbind(doc_index.colgemma_embeddings)),
+ device=device,
+ )
+
+ # Get top-k results
+ top_k_actual = min(top_k, len(doc_index.images))
+ top_indices = scores[0].argsort(descending=True)[:top_k_actual]
+
+ # Format results
+ colgemma_text = "### ColGemma3 (ColNetraEmbed) Results\n\n"
+ for rank, idx in enumerate(top_indices):
+ score = scores[0, idx].item()
+ colgemma_text += f"**Rank {rank + 1}:** Page {idx.item() + 1} - Score: {score:.2f}\n"
+
+ # Generate heatmap if requested
+ if show_heatmap:
+ heatmap_image = generate_colgemma_heatmap(
+ image=doc_index.images[idx.item()],
+ query_embedding=query_embedding,
+ image_embedding=doc_index.colgemma_embeddings[idx.item()],
+ )
+ colgemma_results.append(
+ (heatmap_image, f"Rank {rank + 1} - Page {idx.item() + 1} (Score: {score:.2f})")
+ )
+ else:
+ colgemma_results.append(
+ (doc_index.images[idx.item()], f"Rank {rank + 1} - Page {idx.item() + 1} (Score: {score:.2f})")
+ )

  # Return results based on model choice
  if model_choice == "NetraEmbed (BiGemma3)":
+ return bigemma_results, bigemma_text, None, None
  elif model_choice == "ColNetraEmbed (ColGemma3)":
+ return None, None, colgemma_results, colgemma_text
  else: # Both
+ return bigemma_results, bigemma_text, colgemma_results, colgemma_text

  except Exception as e:
  import traceback
  error_details = traceback.format_exc()
  print(f"Query error: {error_details}")
+ return None, f"❌ Error during query: {str(e)}", None, None
+

  # Create Gradio interface
  with gr.Blocks(title="NetraEmbed Demo") as demo:
  # Header section
+ with gr.Row():
+ with gr.Column(scale=1):
+ gr.Markdown("# NetraEmbed")
+ gr.HTML(
+ """
+ <div style="display: flex; gap: 8px; flex-wrap: wrap; margin-bottom: 15px;">
+ <a href="https://arxiv.org/abs/2512.03514" target="_blank">
+ <img src="https://img.shields.io/badge/arXiv-2512.03514-b31b1b.svg" alt="Paper">
+ </a>
+ <a href="https://github.com/adithya-s-k/colpali" target="_blank">
+ <img src="https://img.shields.io/badge/GitHub-colpali-181717?logo=github" alt="GitHub">
+ </a>
+ <a href="https://huggingface.co/Cognitive-Lab/ColNetraEmbed" target="_blank">
+ <img src="https://img.shields.io/badge/🤗%20HuggingFace-Model-yellow" alt="Model">
+ </a>
+ <a href="https://www.cognitivelab.in/blog/introducing-netraembed" target="_blank">
+ <img src="https://img.shields.io/badge/Blog-CognitiveLab-blue" alt="Blog">
+ </a>
+ <a href="https://cloud.cognitivelab.in" target="_blank">
+ <img src="https://img.shields.io/badge/Demo-Try%20it%20out-green" alt="Demo">
+ </a>
+ </div>
+ """
+ )
+ gr.Markdown(
+ """
+ **🚀 Universal Multilingual Multimodal Document Retrieval**

+ Upload a PDF document, select your model(s), and query using semantic search.
+
+ **Available Models:**
+ - **NetraEmbed (BiGemma3)**: Single-vector embedding with Matryoshka representation
+ Fast retrieval with cosine similarity
+ - **ColNetraEmbed (ColGemma3)**: Multi-vector embedding with late interaction
+ High-quality retrieval with MaxSim scoring and attention heatmaps
+
+ """
+ )

+ with gr.Column(scale=1):
+ gr.HTML(
+ """
+ <div style="text-align: center;">
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/6442d975ad54813badc1ddf7/-fYMikXhSuqRqm-UIdulK.png"
+ alt="NetraEmbed Banner"
+ style="width: 100%; height: auto; border-radius: 8px;">
+ </div>
+ """
+ )
+
+ gr.Markdown("---")
+
+ # Main interface
  with gr.Row():
+ # Column 1: Model & Upload
  with gr.Column(scale=1):
+ gr.Markdown("### 🤖 Select Model & Upload")
  model_select = gr.Radio(
  choices=["NetraEmbed (BiGemma3)", "ColNetraEmbed (ColGemma3)", "Both"],
  value="Both",
  label="Select Model(s)",
  )

+ pdf_upload = gr.File(
+ label="Upload PDFs",
+ file_types=[".pdf"],
+ file_count="multiple"
+ )
+ index_btn = gr.Button("📥 Index Documents", variant="primary", size="sm")

+ index_status = gr.Textbox(
+ label="Indexing Status",
+ lines=8,
+ interactive=False,
+ value="Select model and upload PDFs to start",
+ )
+
+ # Column 2: Query & Results
+ with gr.Column(scale=2):
+ gr.Markdown("### 🔎 Query Documents")
  query_input = gr.Textbox(
  label="Enter Query",
  placeholder="e.g., financial report, organizational structure...",
  lines=2,
  )
+
  with gr.Row():
+ top_k_slider = gr.Slider(
+ minimum=1,
+ maximum=10,
+ value=5,
+ step=1,
+ label="Top K Results",
+ scale=2,
+ )
+ heatmap_checkbox = gr.Checkbox(
+ label="Show Heatmaps (ColGemma3)",
+ value=False,
+ scale=1,
+ )
+
+ query_btn = gr.Button("🔍 Search", variant="primary", size="sm")

  gr.Markdown("---")
+ gr.Markdown("### 📊 Results")

  # Results section
+ with gr.Row(equal_height=True):
  with gr.Column(scale=1):
+ bigemma_results_text = gr.Markdown(
+ value="*BiGemma3 results will appear here...*",
+ )
  bigemma_gallery = gr.Gallery(
  label="BiGemma3 - Top Retrieved Pages",
+ show_label=True,
  columns=2,
  height="auto",
+ object_fit="contain",
  )
  with gr.Column(scale=1):
+ colgemma_results_text = gr.Markdown(
+ value="*ColGemma3 results will appear here...*",
+ )
  colgemma_gallery = gr.Gallery(
  label="ColGemma3 - Top Retrieved Pages",
+ show_label=True,
  columns=2,
  height="auto",
+ object_fit="contain",
  )

+ # Tips
+ with gr.Accordion("💡 Tips", open=False):
+ gr.Markdown(
+ """
+ - **Both models**: Compare results side-by-side
+ - **Scores**: BiGemma3 uses cosine similarity (-1 to 1), ColGemma3 uses MaxSim (higher is better)
+ - **Heatmaps**: Enable to visualize ColGemma3 attention patterns (brighter = higher attention)
+ - **Refresh**: If you change documents, refresh the page to clear the index
+ """
+ )
+
  # Event handlers
  index_btn.click(
  fn=index_document,
  query_btn.click(
  fn=query_documents,
  inputs=[query_input, model_select, top_k_slider, heatmap_checkbox],
+ outputs=[bigemma_gallery, bigemma_results_text, colgemma_gallery, colgemma_results_text],
  )

+ # Enable queue for handling multiple requests
+ demo.queue(max_size=20)
+
+ if __name__ == "__main__":
+ demo.launch(debug=True)
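After the refactor, the pieces compose as `pdf_to_images` → `index_*_images` → `query_documents`. A minimal, hypothetical smoke test of that flow outside the Gradio UI, assuming the new `app.py` shown above is importable, a CUDA device is available, and `sample.pdf` is a placeholder path:

```python
# Hypothetical smoke test for the refactored helpers; names are taken from the new app.py above.
from app import doc_index, pdf_to_images, index_bigemma_images, query_documents

doc_index.images = pdf_to_images(["sample.pdf"])           # new list-of-paths signature
doc_index.bigemma_embeddings = index_bigemma_images(doc_index.images)

gallery, text, _, _ = query_documents(
    query="quarterly revenue table",
    model_choice="NetraEmbed (BiGemma3)",
    top_k=3,
)
print(text)  # markdown summary of the ranked pages, as rendered in the Results panel
```

Importing `app` builds the Blocks interface and enables the queue but does not launch it, since `demo.launch(debug=True)` is now guarded by `if __name__ == "__main__":`.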