Update app.py

app.py CHANGED
@@ -11,7 +11,6 @@ from controlnet_aux.processor import Processor
 from PIL import Image
 from safetensors.torch import load_file
 from huggingface_hub import hf_hub_download, snapshot_download
-import torchvision.transforms as transforms

 # Import pipeline and model
 from videox_fun.pipeline import ZImageControlPipeline
@@ -93,7 +92,6 @@ tokenizer = AutoTokenizer.from_pretrained(
     subfolder="tokenizer"
 )

-# Qwen3ForCausalLM is still needed as the Text Encoder for the pipeline
 text_encoder = Qwen3ForCausalLM.from_pretrained(
     MODEL_REPO,
     subfolder="text_encoder",
@@ -138,25 +136,6 @@ def rescale_image(image, scale, divisible_by=16):
     resized = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
     return resized, new_width, new_height

-def get_image_latent(image):
-    """Convert PIL image to VAE latent representation."""
-    # Normalize image
-    transform = transforms.Compose([
-        transforms.ToTensor(),
-        transforms.Normalize([0.5], [0.5])
-    ])
-
-    # FIX: Only unsqueeze(0) for Batch dimension [B, C, H, W]
-    # Removed the second unsqueeze(2) which caused the 5D error
-    img_tensor = transform(image).unsqueeze(0)
-    img_tensor = img_tensor.to(device, weight_dtype)
-
-    with torch.no_grad():
-        latent = pipe.vae.encode(img_tensor).latent_dist.sample()
-        latent = latent * pipe.vae.config.scaling_factor
-
-    return latent
-
 @spaces.GPU()
 def generate_image(
     prompt,
@@ -203,19 +182,17 @@ def generate_image(
     control_image_rescaled, width, height = rescale_image(input_image, image_scale, 16)

     # Run Processor
+    # We resize to 1024 temporarily for the preprocessor to work best, then resize back to target
     temp_image = control_image_rescaled.resize((1024, 1024))
     processed_image_pil = processor(temp_image, to_pil=True)
     processed_image_pil = processed_image_pil.resize((width, height))

-    # Convert to Latent
-    progress(0.4, desc="Encoding control image...")
-    # FIX: Passed result directly without sample_size args which aren't used in new function
-    control_image_latent = get_image_latent(processed_image_pil)
-
     # 3. Generate
     progress(0.5, desc="Generating...")

     try:
+        # FIX: Pass the processed PIL image directly.
+        # The pipeline handles VAE encoding internally.
         result = pipe(
             prompt=prompt,
             negative_prompt=negative_prompt,
@@ -223,7 +200,7 @@ def generate_image(
             width=width,
             generator=generator,
             guidance_scale=guidance_scale,
-            control_image=
+            control_image=processed_image_pil,
             num_inference_steps=num_inference_steps,
             control_context_scale=control_context_scale,
         )
@@ -283,7 +260,7 @@ button.primary:hover {
 }
 """

-with gr.Blocks(title="Z-Image Turbo ControlNet") as demo:
+with gr.Blocks(title="Z-Image Turbo ControlNet", css=apple_css) as demo:

     gr.HTML("""
     <div class="header-container">
@@ -365,5 +342,4 @@ with gr.Blocks(title="Z-Image Turbo ControlNet") as demo:
     )

 if __name__ == "__main__":
-    demo.launch(share=False
-    css=apple_css)
+    demo.launch(share=False)
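A few notes on what changed; the sketches below are illustrative and not part of the commit.

The deleted get_image_latent helper hand-encoded the control image into VAE latents, and its own comments record a shape bug: an extra unsqueeze had produced a 5D tensor where an image VAE wants 4D [B, C, H, W]. With the pipeline now encoding the control image itself (per the new comments in generate_image), the helper, its torchvision import, and the dangling control_image= argument all go away. A minimal sketch of the shape arithmetic, using the same torchvision transforms the old helper used:

import torchvision.transforms as transforms
from PIL import Image

image = Image.new("RGB", (512, 512))  # stand-in for the processed control image
transform = transforms.Compose([
    transforms.ToTensor(),               # -> [C, H, W], floats in [0, 1]
    transforms.Normalize([0.5], [0.5]),  # -> [-1, 1], the range VAEs expect
])

t = transform(image)    # torch.Size([3, 512, 512])
t4d = t.unsqueeze(0)    # torch.Size([1, 3, 512, 512]): valid [B, C, H, W]
t5d = t4d.unsqueeze(2)  # torch.Size([1, 3, 1, 512, 512]): the 5D shape the old comment blames
print(t.shape, t4d.shape, t5d.shape)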
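The preprocessor round-trip that the new comment documents is a common pattern: controlnet_aux annotators behave best near a fixed working resolution, so detection runs at 1024x1024 and the map is resized back to the generation size. The same pattern in isolation; the "canny" processor id and the input file name are assumptions, since the diff does not show which annotator app.py instantiates:

from controlnet_aux.processor import Processor
from PIL import Image

processor = Processor("canny")    # processor id is an assumption
source = Image.open("input.png")  # hypothetical input file
width, height = 768, 512          # target generation size (multiples of 16)

# Detect at the annotator's working size, then return to the target size.
detected = processor(source.resize((1024, 1024)), to_pil=True)
control_image = detected.resize((width, height))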
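rescale_image itself is untouched; only its last two lines appear as context. From its signature, it scales the input and snaps both sides to a multiple of divisible_by (16) so the latent grid divides evenly. A plausible reading consistent with the visible tail; the floor-rounding is an assumption:

from PIL import Image

def rescale_image(image, scale, divisible_by=16):
    # Floor each scaled side to the nearest multiple of divisible_by
    # (the exact rounding rule is not shown in the diff).
    new_width = max(divisible_by, int(image.width * scale) // divisible_by * divisible_by)
    new_height = max(divisible_by, int(image.height * scale) // divisible_by * divisible_by)
    resized = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
    return resized, new_width, new_height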
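The last two hunks are one fix split in two. As shown, the old file ended with demo.launch(share=False on one line and css=apple_css) on the next, which is not even syntactically valid; in any case Gradio takes css on the gr.Blocks constructor, not on launch(). The corrected shape, reduced to a self-contained sketch:

import gradio as gr

apple_css = "button.primary { border-radius: 12px; }"  # stand-in for the real stylesheet

# css is a gr.Blocks constructor argument; demo.launch() does not accept one.
with gr.Blocks(title="Z-Image Turbo ControlNet", css=apple_css) as demo:
    gr.HTML("<div class='header-container'>Z-Image Turbo ControlNet</div>")

if __name__ == "__main__":
    demo.launch(share=False)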