Z-Image-Turbo-controlnet

Running on Zero

App Files Files Community

akhaliq HF Staff committed 5 days ago

Commit

8c2280a

verified ·

1 Parent(s): f33ac27

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -62

app.py CHANGED Viewed

@@ -11,20 +11,12 @@ from controlnet_aux.processor import Processor
 from PIL import Image
 from safetensors.torch import load_file
 from huggingface_hub import hf_hub_download, snapshot_download
 # Import pipeline and model
-# Ensure the videox_fun folder is in your current directory
 from videox_fun.pipeline import ZImageControlPipeline
 from videox_fun.models import ZImageControlTransformer2DModel
-# Try to import prompt utility, define fallback if missing
-try:
-    from utils.prompt_utils import polish_prompt
-except ImportError:
-    print("utils.prompt_utils not found. Using passthrough for prompt polishing.")
-    def polish_prompt(prompt):
-        return prompt
 # --- Configuration & Paths ---
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 1280
@@ -40,20 +32,15 @@ weight_dtype = torch.bfloat16
 # --- FIX: Download Transformer Config & Weights Locally ---
 print("Downloading transformer files...")
-# This downloads the 'transformer' subfolder to a local cache and returns the path
 transformer_path = snapshot_download(
     repo_id=MODEL_REPO,
     allow_patterns=["transformer/*"],
     local_dir="models/transformer",
     local_dir_use_symlinks=False
 )
-# The snapshot puts files in models/transformer/transformer, we need to point to the inner one
-# depending on how snapshot_download behaves with 'allow_patterns'.
-# Usually it preserves structure. Let's ensure we point to the folder containing config.json.
 local_transformer_path = os.path.join(transformer_path, "transformer")
 if not os.path.exists(os.path.join(local_transformer_path, "config.json")):
-    # Fallback if structure is flat or different
     local_transformer_path = transformer_path
 print(f"Transformer files located at: {local_transformer_path}")
@@ -61,7 +48,7 @@ print(f"Transformer files located at: {local_transformer_path}")
 # --- 1. Load Transformer ---
 print("Initializing Transformer...")
 transformer = ZImageControlTransformer2DModel.from_pretrained(
-    local_transformer_path,  # Pass the LOCAL path now
     transformer_additional_kwargs={
         "control_layers_places": [0, 5, 10, 15, 20, 25],
         "control_in_dim": 16
@@ -69,7 +56,6 @@ transformer = ZImageControlTransformer2DModel.from_pretrained(
 ).to(device, weight_dtype)
 # --- 2. Download & Load ControlNet Weights ---
-# Check if weights exist locally; if not, download them
 if not os.path.exists(CONTROLNET_FILENAME):
     print(f"Downloading ControlNet weights from {CONTROLNET_REPO}...")
     try:
@@ -87,9 +73,7 @@ if CONTROLNET_WEIGHTS:
     print(f"Loading ControlNet weights from {CONTROLNET_WEIGHTS}")
     try:
         state_dict = load_file(CONTROLNET_WEIGHTS)
-        # Handle potential nesting of state_dict
         state_dict = state_dict.get("state_dict", state_dict)
         m, u = transformer.load_state_dict(state_dict, strict=False)
         print(f"ControlNet Weights Loaded - Missing keys: {len(m)}, Unexpected keys: {len(u)}")
     except Exception as e:
@@ -99,8 +83,6 @@ else:
 # --- 3. Load Core Components ---
 print("Loading VAE, Tokenizer, and Text Encoder...")
-# These standard libraries usually handle Hub IDs fine, but we can download if they fail too.
-# For now, standard diffusers/transformers components usually work with Hub IDs.
 vae = AutoencoderKL.from_pretrained(
     MODEL_REPO,
     subfolder="vae",
@@ -111,6 +93,7 @@ tokenizer = AutoTokenizer.from_pretrained(
     subfolder="tokenizer"
 )
 text_encoder = Qwen3ForCausalLM.from_pretrained(
     MODEL_REPO,
     subfolder="text_encoder",
@@ -144,11 +127,9 @@ def rescale_image(image, scale, divisible_by=16):
     new_width = int(width * scale)
     new_height = int(height * scale)
-    # Make dimensions divisible by divisible_by
     new_width = (new_width // divisible_by) * divisible_by
     new_height = (new_height // divisible_by) * divisible_by
-    # Clamp to max size
     if new_width > MAX_IMAGE_SIZE:
         new_width = MAX_IMAGE_SIZE
     if new_height > MAX_IMAGE_SIZE:
@@ -157,17 +138,17 @@ def rescale_image(image, scale, divisible_by=16):
     resized = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
     return resized, new_width, new_height
-def get_image_latent(image, sample_size):
     """Convert PIL image to VAE latent representation."""
-    import torchvision.transforms as transforms
     # Normalize image
     transform = transforms.Compose([
         transforms.ToTensor(),
         transforms.Normalize([0.5], [0.5])
     ])
-    img_tensor = transform(image).unsqueeze(0).unsqueeze(2)  # [B, C, 1, H, W]
     img_tensor = img_tensor.to(device, weight_dtype)
     with torch.no_grad():
@@ -188,36 +169,22 @@ def generate_image(
     guidance_scale=1.0,
     seed=42,
     randomize_seed=True,
-    is_polish_prompt=True,
     progress=gr.Progress(track_tqdm=True)
 ):
-    timestamp = time.time()
     if not prompt.strip():
         raise gr.Error("Please enter a prompt to generate an image.")
-    # 1. Polish Prompt
-    final_prompt = prompt
-    if is_polish_prompt:
-        progress(0.1, desc="Polishing prompt...")
-        try:
-            final_prompt = polish_prompt(prompt)
-        except Exception as e:
-            print(f"Prompt polish failed: {e}")
-            final_prompt = prompt
-    # 2. Set Seed
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     generator = torch.Generator(device).manual_seed(seed)
-    # 3. Process Control Image
     if input_image is None:
         raise gr.Error("Please upload a control image.")
     progress(0.2, desc=f"Processing {control_mode}...")
-    # Map control mode to processor ID
     processor_map = {
         'Canny': 'canny',
         'HED': 'softedge_hed',
@@ -227,34 +194,30 @@ def generate_image(
     }
     processor_id = processor_map.get(control_mode, 'canny')
-    # Initialize processor
     try:
         processor = Processor(processor_id)
     except Exception as e:
         print(f"Failed to load processor {processor_id}, falling back to Canny. Error: {e}")
         processor = Processor('canny')
-    # Resize input for processing
     control_image_rescaled, width, height = rescale_image(input_image, image_scale, 16)
-    # Run Processor (requires resizing to 1024x1024 typically for best results with these models, then back)
     temp_image = control_image_rescaled.resize((1024, 1024))
     processed_image_pil = processor(temp_image, to_pil=True)
     processed_image_pil = processed_image_pil.resize((width, height))
     # Convert to Latent
     progress(0.4, desc="Encoding control image...")
-    control_image_latent = get_image_latent(
-        processed_image_pil,
-        sample_size=[height, width]
-    )[:, :, 0]
-    # 4. Generate
     progress(0.5, desc="Generating...")
     try:
         result = pipe(
-            prompt=final_prompt,
             negative_prompt=negative_prompt,
             height=height,
             width=width,
@@ -268,7 +231,7 @@ def generate_image(
         image = result.images[0]
         progress(1.0, desc="Complete!")
-        return image, seed, processed_image_pil, final_prompt
     except Exception as e:
         raise gr.Error(f"Generation failed: {str(e)}")
@@ -320,13 +283,13 @@ button.primary:hover {
 }
 """
-with gr.Blocks(title="Z-Image Turbo ControlNet") as demo:
     gr.HTML("""
         <div class="header-container">
             <div class="info-badge">✓ ControlNet Union</div>
             <h1 class="main-title">Z-Image Turbo</h1>
-            <p class="subtitle">Multi-Control Generation with LLM Prompt Polishing</p>
         </div>
     """)
@@ -339,9 +302,7 @@ with gr.Blocks(title="Z-Image Turbo ControlNet") as demo:
                 lines=3
             )
-            with gr.Row():
-                is_polish_prompt = gr.Checkbox(label="Polish Prompt with LLM", value=True)
-                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
             negative_prompt = gr.Textbox(
                 label="Negative Prompt",
@@ -381,7 +342,6 @@ with gr.Blocks(title="Z-Image Turbo ControlNet") as demo:
             output_image = gr.Image(label="Generated Image", type="pil")
             with gr.Accordion("Details & Debug", open=True):
-                polished_prompt_output = gr.Textbox(label="Actual Polished Prompt", interactive=False, lines=2)
                 with gr.Row():
                     seed_output = gr.Number(label="Seed Used", precision=0)
                 control_output = gr.Image(label="Preprocessor Output", type="pil")
@@ -399,11 +359,10 @@ with gr.Blocks(title="Z-Image Turbo ControlNet") as demo:
         inputs=[
             prompt, negative_prompt, input_image, control_mode,
             control_context_scale, image_scale, num_inference_steps,
-            guidance_scale, seed, randomize_seed, is_polish_prompt
         ],
-        outputs=[output_image, seed_output, control_output, polished_prompt_output]
     )
 if __name__ == "__main__":
-    demo.launch(share=False,
-               css=apple_css)

 from PIL import Image
 from safetensors.torch import load_file
 from huggingface_hub import hf_hub_download, snapshot_download
+import torchvision.transforms as transforms
 # Import pipeline and model
 from videox_fun.pipeline import ZImageControlPipeline
 from videox_fun.models import ZImageControlTransformer2DModel
 # --- Configuration & Paths ---
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 1280
 # --- FIX: Download Transformer Config & Weights Locally ---
 print("Downloading transformer files...")
 transformer_path = snapshot_download(
     repo_id=MODEL_REPO,
     allow_patterns=["transformer/*"],
     local_dir="models/transformer",
     local_dir_use_symlinks=False
 )
 local_transformer_path = os.path.join(transformer_path, "transformer")
 if not os.path.exists(os.path.join(local_transformer_path, "config.json")):
     local_transformer_path = transformer_path
 print(f"Transformer files located at: {local_transformer_path}")
 # --- 1. Load Transformer ---
 print("Initializing Transformer...")
 transformer = ZImageControlTransformer2DModel.from_pretrained(
+    local_transformer_path,
     transformer_additional_kwargs={
         "control_layers_places": [0, 5, 10, 15, 20, 25],
         "control_in_dim": 16
 ).to(device, weight_dtype)
 # --- 2. Download & Load ControlNet Weights ---
 if not os.path.exists(CONTROLNET_FILENAME):
     print(f"Downloading ControlNet weights from {CONTROLNET_REPO}...")
     try:
     print(f"Loading ControlNet weights from {CONTROLNET_WEIGHTS}")
     try:
         state_dict = load_file(CONTROLNET_WEIGHTS)
         state_dict = state_dict.get("state_dict", state_dict)
         m, u = transformer.load_state_dict(state_dict, strict=False)
         print(f"ControlNet Weights Loaded - Missing keys: {len(m)}, Unexpected keys: {len(u)}")
     except Exception as e:
 # --- 3. Load Core Components ---
 print("Loading VAE, Tokenizer, and Text Encoder...")
 vae = AutoencoderKL.from_pretrained(
     MODEL_REPO,
     subfolder="vae",
     subfolder="tokenizer"
 )
+# Qwen3ForCausalLM is still needed as the Text Encoder for the pipeline
 text_encoder = Qwen3ForCausalLM.from_pretrained(
     MODEL_REPO,
     subfolder="text_encoder",
     new_width = int(width * scale)
     new_height = int(height * scale)
     new_width = (new_width // divisible_by) * divisible_by
     new_height = (new_height // divisible_by) * divisible_by
     if new_width > MAX_IMAGE_SIZE:
         new_width = MAX_IMAGE_SIZE
     if new_height > MAX_IMAGE_SIZE:
     resized = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
     return resized, new_width, new_height
+def get_image_latent(image):
     """Convert PIL image to VAE latent representation."""
     # Normalize image
     transform = transforms.Compose([
         transforms.ToTensor(),
         transforms.Normalize([0.5], [0.5])
     ])
+    # FIX: Only unsqueeze(0) for Batch dimension [B, C, H, W]
+    # Removed the second unsqueeze(2) which caused the 5D error
+    img_tensor = transform(image).unsqueeze(0)
     img_tensor = img_tensor.to(device, weight_dtype)
     with torch.no_grad():
     guidance_scale=1.0,
     seed=42,
     randomize_seed=True,
     progress=gr.Progress(track_tqdm=True)
 ):
     if not prompt.strip():
         raise gr.Error("Please enter a prompt to generate an image.")
+    # 1. Set Seed
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     generator = torch.Generator(device).manual_seed(seed)
+    # 2. Process Control Image
     if input_image is None:
         raise gr.Error("Please upload a control image.")
     progress(0.2, desc=f"Processing {control_mode}...")
     processor_map = {
         'Canny': 'canny',
         'HED': 'softedge_hed',
     }
     processor_id = processor_map.get(control_mode, 'canny')
     try:
         processor = Processor(processor_id)
     except Exception as e:
         print(f"Failed to load processor {processor_id}, falling back to Canny. Error: {e}")
         processor = Processor('canny')
     control_image_rescaled, width, height = rescale_image(input_image, image_scale, 16)
+    # Run Processor
     temp_image = control_image_rescaled.resize((1024, 1024))
     processed_image_pil = processor(temp_image, to_pil=True)
     processed_image_pil = processed_image_pil.resize((width, height))
     # Convert to Latent
     progress(0.4, desc="Encoding control image...")
+    # FIX: Passed result directly without sample_size args which aren't used in new function
+    control_image_latent = get_image_latent(processed_image_pil)
+    # 3. Generate
     progress(0.5, desc="Generating...")
     try:
         result = pipe(
+            prompt=prompt,
             negative_prompt=negative_prompt,
             height=height,
             width=width,
         image = result.images[0]
         progress(1.0, desc="Complete!")
+        return image, seed, processed_image_pil
     except Exception as e:
         raise gr.Error(f"Generation failed: {str(e)}")
 }
 """
+with gr.Blocks(title="Z-Image Turbo ControlNet", css=apple_css) as demo:
     gr.HTML("""
         <div class="header-container">
             <div class="info-badge">✓ ControlNet Union</div>
             <h1 class="main-title">Z-Image Turbo</h1>
+            <p class="subtitle">Multi-Control Generation</p>
         </div>
     """)
                 lines=3
             )
+            randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
             negative_prompt = gr.Textbox(
                 label="Negative Prompt",
             output_image = gr.Image(label="Generated Image", type="pil")
             with gr.Accordion("Details & Debug", open=True):
                 with gr.Row():
                     seed_output = gr.Number(label="Seed Used", precision=0)
                 control_output = gr.Image(label="Preprocessor Output", type="pil")
         inputs=[
             prompt, negative_prompt, input_image, control_mode,
             control_context_scale, image_scale, num_inference_steps,
+            guidance_scale, seed, randomize_seed
         ],
+        outputs=[output_image, seed_output, control_output]
     )
 if __name__ == "__main__":
+    demo.launch(share=False)