Update app.py

app.py CHANGED
@@ -11,7 +11,6 @@ from controlnet_aux.processor import Processor
 from PIL import Image
 from safetensors.torch import load_file
 from huggingface_hub import hf_hub_download, snapshot_download
-import torchvision.transforms as transforms

 # Import pipeline and model
 from videox_fun.pipeline import ZImageControlPipeline
@@ -93,7 +92,6 @@ tokenizer = AutoTokenizer.from_pretrained(
     subfolder="tokenizer"
 )

-# Qwen3ForCausalLM is still needed as the Text Encoder for the pipeline
 text_encoder = Qwen3ForCausalLM.from_pretrained(
     MODEL_REPO,
     subfolder="text_encoder",
@@ -138,25 +136,6 @@ def rescale_image(image, scale, divisible_by=16):
     resized = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
     return resized, new_width, new_height

-def get_image_latent(image):
-    """Convert PIL image to VAE latent representation."""
-    # Normalize image
-    transform = transforms.Compose([
-        transforms.ToTensor(),
-        transforms.Normalize([0.5], [0.5])
-    ])
-
-    # FIX: Only unsqueeze(0) for Batch dimension [B, C, H, W]
-    # Removed the second unsqueeze(2) which caused the 5D error
-    img_tensor = transform(image).unsqueeze(0)
-    img_tensor = img_tensor.to(device, weight_dtype)
-
-    with torch.no_grad():
-        latent = pipe.vae.encode(img_tensor).latent_dist.sample()
-        latent = latent * pipe.vae.config.scaling_factor
-
-    return latent
-
 @spaces.GPU()
 def generate_image(
     prompt,
@@ -203,19 +182,17 @@ def generate_image(
     control_image_rescaled, width, height = rescale_image(input_image, image_scale, 16)

     # Run Processor
+    # We resize to 1024 temporarily for the preprocessor to work best, then resize back to target
     temp_image = control_image_rescaled.resize((1024, 1024))
     processed_image_pil = processor(temp_image, to_pil=True)
     processed_image_pil = processed_image_pil.resize((width, height))

-    # Convert to Latent
-    progress(0.4, desc="Encoding control image...")
-    # FIX: Passed result directly without sample_size args which aren't used in new function
-    control_image_latent = get_image_latent(processed_image_pil)
-
     # 3. Generate
     progress(0.5, desc="Generating...")

     try:
+        # FIX: Pass the processed PIL image directly.
+        # The pipeline handles VAE encoding internally.
         result = pipe(
             prompt=prompt,
             negative_prompt=negative_prompt,
@@ -223,7 +200,7 @@ def generate_image(
             width=width,
             generator=generator,
             guidance_scale=guidance_scale,
-            control_image=
+            control_image=processed_image_pil,
             num_inference_steps=num_inference_steps,
             control_context_scale=control_context_scale,
         )
@@ -283,7 +260,7 @@ button.primary:hover {
 }
 """

-with gr.Blocks(title="Z-Image Turbo ControlNet") as demo:
+with gr.Blocks(title="Z-Image Turbo ControlNet", css=apple_css) as demo:

     gr.HTML("""
     <div class="header-container">
@@ -365,5 +342,4 @@ with gr.Blocks(title="Z-Image Turbo ControlNet") as demo:
     )

 if __name__ == "__main__":
-    demo.launch(share=False
-    css=apple_css)
+    demo.launch(share=False)
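A few notes on what changed; the sketches below are illustrative and not part of the commit.

The deleted get_image_latent helper hand-encoded the control image into VAE latents, and its own comments record a shape bug: an extra unsqueeze had produced a 5D tensor where an image VAE wants 4D [B, C, H, W]. With the pipeline now encoding the control image itself (per the new comments in generate_image), the helper, its torchvision import, and the dangling control_image= argument all go away. A minimal sketch of the shape arithmetic, using the same torchvision transforms the old helper used:

import torchvision.transforms as transforms
from PIL import Image

image = Image.new("RGB", (512, 512))  # stand-in for the processed control image
transform = transforms.Compose([
    transforms.ToTensor(),               # -> [C, H, W], floats in [0, 1]
    transforms.Normalize([0.5], [0.5]),  # -> [-1, 1], the range VAEs expect
])

t = transform(image)    # torch.Size([3, 512, 512])
t4d = t.unsqueeze(0)    # torch.Size([1, 3, 512, 512]): valid [B, C, H, W]
t5d = t4d.unsqueeze(2)  # torch.Size([1, 3, 1, 512, 512]): the 5D shape the old comment blames
print(t.shape, t4d.shape, t5d.shape)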
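The preprocessor round-trip that the new comment documents is a common pattern: controlnet_aux annotators behave best near a fixed working resolution, so detection runs at 1024x1024 and the map is resized back to the generation size. The same pattern in isolation; the "canny" processor id and the input file name are assumptions, since the diff does not show which annotator app.py instantiates:

from controlnet_aux.processor import Processor
from PIL import Image

processor = Processor("canny")    # processor id is an assumption
source = Image.open("input.png")  # hypothetical input file
width, height = 768, 512          # target generation size (multiples of 16)

# Detect at the annotator's working size, then return to the target size.
detected = processor(source.resize((1024, 1024)), to_pil=True)
control_image = detected.resize((width, height))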
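rescale_image itself is untouched; only its last two lines appear as context. From its signature, it scales the input and snaps both sides to a multiple of divisible_by (16) so the latent grid divides evenly. A plausible reading consistent with the visible tail; the floor-rounding is an assumption:

from PIL import Image

def rescale_image(image, scale, divisible_by=16):
    # Floor each scaled side to the nearest multiple of divisible_by
    # (the exact rounding rule is not shown in the diff).
    new_width = max(divisible_by, int(image.width * scale) // divisible_by * divisible_by)
    new_height = max(divisible_by, int(image.height * scale) // divisible_by * divisible_by)
    resized = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
    return resized, new_width, new_height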
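The last two hunks are one fix split in two. As shown, the old file ended with demo.launch(share=False on one line and css=apple_css) on the next, which is not even syntactically valid; in any case Gradio takes css on the gr.Blocks constructor, not on launch(). The corrected shape, reduced to a self-contained sketch:

import gradio as gr

apple_css = "button.primary { border-radius: 12px; }"  # stand-in for the real stylesheet

# css is a gr.Blocks constructor argument; demo.launch() does not accept one.
with gr.Blocks(title="Z-Image Turbo ControlNet", css=apple_css) as demo:
    gr.HTML("<div class='header-container'>Z-Image Turbo ControlNet</div>")

if __name__ == "__main__":
    demo.launch(share=False)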