akhaliq (HF Staff) committed
Commit 4ed82d8 · verified · 1 Parent(s): 54c2271

Update app.py

Pass the processed control image to ZImageControlPipeline as a PIL image
instead of manually encoding it to a VAE latent: drop get_image_latent()
and the torchvision import, since the pipeline handles VAE encoding
internally. Also move the custom CSS from demo.launch() to the
gr.Blocks() constructor.

Files changed (1)
  1. app.py +6 -30
app.py CHANGED
@@ -11,7 +11,6 @@ from controlnet_aux.processor import Processor
 from PIL import Image
 from safetensors.torch import load_file
 from huggingface_hub import hf_hub_download, snapshot_download
-import torchvision.transforms as transforms
 
 # Import pipeline and model
 from videox_fun.pipeline import ZImageControlPipeline
@@ -93,7 +92,6 @@ tokenizer = AutoTokenizer.from_pretrained(
     subfolder="tokenizer"
 )
 
-# Qwen3ForCausalLM is still needed as the Text Encoder for the pipeline
 text_encoder = Qwen3ForCausalLM.from_pretrained(
     MODEL_REPO,
     subfolder="text_encoder",
@@ -138,25 +136,6 @@ def rescale_image(image, scale, divisible_by=16):
     resized = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
     return resized, new_width, new_height
 
-def get_image_latent(image):
-    """Convert PIL image to VAE latent representation."""
-    # Normalize image
-    transform = transforms.Compose([
-        transforms.ToTensor(),
-        transforms.Normalize([0.5], [0.5])
-    ])
-
-    # FIX: Only unsqueeze(0) for Batch dimension [B, C, H, W]
-    # Removed the second unsqueeze(2) which caused the 5D error
-    img_tensor = transform(image).unsqueeze(0)
-    img_tensor = img_tensor.to(device, weight_dtype)
-
-    with torch.no_grad():
-        latent = pipe.vae.encode(img_tensor).latent_dist.sample()
-        latent = latent * pipe.vae.config.scaling_factor
-
-    return latent
-
 @spaces.GPU()
 def generate_image(
     prompt,
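
Note: the deleted helper reproduced, in app code, the encoding step the pipeline performs on its own (see the FIX comment in the next hunk). For reference, a minimal sketch of that step, assuming ZImageControlPipeline follows the standard diffusers VAE convention the removed code itself used; encode_to_latent is an illustrative name, and pipe, device, and weight_dtype are as defined elsewhere in app.py:

    import torch
    import torchvision.transforms as transforms

    def encode_to_latent(pil_image):
        # Map pixels to [-1, 1], add a batch dim -> [B, C, H, W],
        # encode with the VAE, then apply the latent scaling factor.
        to_tensor = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ])
        img = to_tensor(pil_image).unsqueeze(0).to(device, weight_dtype)
        with torch.no_grad():
            latent = pipe.vae.encode(img).latent_dist.sample()
        return latent * pipe.vae.config.scaling_factor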
@@ -203,19 +182,17 @@
     control_image_rescaled, width, height = rescale_image(input_image, image_scale, 16)
 
     # Run Processor
+    # We resize to 1024 temporarily for the preprocessor to work best, then resize back to target
     temp_image = control_image_rescaled.resize((1024, 1024))
     processed_image_pil = processor(temp_image, to_pil=True)
     processed_image_pil = processed_image_pil.resize((width, height))
 
-    # Convert to Latent
-    progress(0.4, desc="Encoding control image...")
-    # FIX: Passed result directly without sample_size args which aren't used in new function
-    control_image_latent = get_image_latent(processed_image_pil)
-
    # 3. Generate
     progress(0.5, desc="Generating...")
 
     try:
+        # FIX: Pass the processed PIL image directly.
+        # The pipeline handles VAE encoding internally.
         result = pipe(
             prompt=prompt,
             negative_prompt=negative_prompt,
@@ -223,7 +200,7 @@
             width=width,
             generator=generator,
             guidance_scale=guidance_scale,
-            control_image=control_image_latent,
+            control_image=processed_image_pil,
             num_inference_steps=num_inference_steps,
             control_context_scale=control_context_scale,
         )
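
Note: controlnet_aux detectors generally behave best near their training resolution, which is why the code above round-trips through 1024×1024 before resizing the control map back to the target size. A standalone sketch of that pattern; the "canny" processor id and input path are illustrative assumptions, not necessarily what this Space uses:

    from PIL import Image
    from controlnet_aux.processor import Processor

    processor = Processor("canny")           # hypothetical processor id
    image = Image.open("input.png")          # hypothetical input path
    temp = image.resize((1024, 1024))        # detector-friendly resolution
    control = processor(temp, to_pil=True)   # returns a PIL image
    control = control.resize(image.size)     # back to the generation size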
@@ -283,7 +260,7 @@ button.primary:hover {
 }
 """
 
-with gr.Blocks(title="Z-Image Turbo ControlNet") as demo:
+with gr.Blocks(title="Z-Image Turbo ControlNet", css=apple_css) as demo:
 
     gr.HTML("""
     <div class="header-container">
@@ -365,5 +342,4 @@ with gr.Blocks(title="Z-Image Turbo ControlNet") as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch(share=False,
-                css=apple_css)
+    demo.launch(share=False)
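
Note on the two hunks above: in Gradio, custom CSS is a constructor argument of gr.Blocks; Blocks.launch() does not accept a css keyword, so the old call would fail at startup with a TypeError in current Gradio releases. A minimal sketch of the corrected pattern (the CSS string and button are illustrative):

    import gradio as gr

    css = "button.primary { background: #0071e3; }"   # illustrative CSS

    with gr.Blocks(title="Demo", css=css) as demo:    # css belongs here...
        gr.Button("Generate", variant="primary")

    if __name__ == "__main__":
        demo.launch(share=False)                      # ...not here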
 