Qwen-Image-Edit-Rapid-AIO-Loras-Experimental

Running on Zero

App Files Files Community

Professional Noob committed on Feb 3

Commit

e0e7a89

verified ·

1 Parent(s): 5b5f568

Update app.py

Browse files

Files changed (1) hide show

app.py +144 -198

app.py CHANGED Viewed

@@ -7,7 +7,8 @@ import numpy as np
 import spaces
 import torch
 import random
-from PIL import Image
 from typing import Iterable, Optional
 from transformers import (
@@ -136,7 +137,6 @@ def _normalize_version(raw: str) -> Optional[str]:
         return None
     if _VER_RE.fullmatch(s):
         return s
-    # forgiving: allow "21" -> "v21"
     if _DIGITS_RE.fullmatch(s):
         return f"v{s}"
     return None
@@ -180,10 +180,9 @@ def _load_pipe_with_version(version: str) -> QwenImageEditPlusPipeline:
     return p
-# Forgiving load: try env/default version, fallback to v19 if it fails
 try:
     pipe = _load_pipe_with_version(AIO_VERSION)
-except Exception as e:
     print("❌ Failed to load requested AIO_VERSION. Falling back to v19.")
     print("---- exception ----")
     print(traceback.format_exc())
@@ -192,7 +191,6 @@ except Exception as e:
     AIO_VERSION_SOURCE = "fallback_to_v19"
     pipe = _load_pipe_with_version(AIO_VERSION)
-# Apply FA3 Optimization
 try:
     pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
     print("Flash Attention 3 Processor set successfully.")
@@ -202,47 +200,36 @@ except Exception as e:
 MAX_SEED = np.iinfo(np.int32).max
 # ============================================================
-# Derived conditioning (Pose + Depth)
-#   - Pose: DWPose via rtmlib (ONNX). Includes face/hands in wholebody.
-#   - Depth: Depth Anything V2 Small (Transformers-compatible)
 # ============================================================
-# Depth (Transformers)
 DEPTH_MODEL_ID = "depth-anything/Depth-Anything-V2-Small-hf"
-# Pose (rtmlib)
-_DWPOSE_CACHE = {}   # key: "cpu" / "cuda" -> Wholebody instance
-_DEPTH_CACHE = {}    # key: "cpu" / "cuda" -> (processor, model)
 def _derived_device(use_gpu: bool) -> torch.device:
     return torch.device("cuda" if (use_gpu and torch.cuda.is_available()) else "cpu")
-def _to_cv2_bgr(pil: Image.Image):
-    # Avoid importing cv2 at module import time if not installed yet
-    import cv2  # noqa: F401
-    arr = np.array(pil.convert("RGB"))
-    # RGB -> BGR
-    return arr[:, :, ::-1].copy()
-def _bgr_to_pil_rgb(bgr: np.ndarray) -> Image.Image:
-    rgb = bgr[:, :, ::-1]
-    return Image.fromarray(rgb.astype(np.uint8), mode="RGB")
-def _load_dwpose_model(use_gpu: bool):
     """
-    DWPose Wholebody via rtmlib.
-    Notes:
-    - This path avoids easy-dwpose (which hard-pins an old huggingface_hub).
-    - Uses ONNXRuntime backend by default.
-    - If user selects GPU, we try device='cuda'. If onnxruntime-gpu is not installed,
-      rtmlib may raise; we catch and fall back to CPU.
     """
-    key = "cuda" if (use_gpu and torch.cuda.is_available()) else "cpu"
     if key in _DWPOSE_CACHE:
         return _DWPOSE_CACHE[key]
@@ -250,29 +237,25 @@ def _load_dwpose_model(use_gpu: bool):
         from rtmlib import Wholebody
     except Exception as e:
         raise gr.Error(
-            "Missing dependency for DWPose: rtmlib. "
-            "Add `rtmlib`, `onnxruntime`, and OpenCV (headless recommended) to requirements.txt.\n"
             f"Import error: {e}"
         )
-    backend = "onnxruntime"
-    dev = "cuda" if key == "cuda" else "cpu"
     try:
-        # to_openpose=True => OpenPose-style keypoint layout + drawing (incl face/hands for wholebody)
         model = Wholebody(
-            to_openpose=True,
-            mode="balanced",   # 'performance'/'lightweight'/'balanced'
-            backend=backend,
-            device=dev,
         )
     except Exception as e:
-        if key == "cuda":
-            print(f"⚠️ rtmlib cuda init failed ({e}); falling back to CPU.")
             model = Wholebody(
-                to_openpose=True,
-                mode="balanced",
-                backend=backend,
                 device="cpu",
             )
         else:
@@ -282,72 +265,99 @@ def _load_dwpose_model(use_gpu: bool):
     return model
-def _load_depth_models(dev: torch.device):
-    key = str(dev)
-    if key in _DEPTH_CACHE:
-        return _DEPTH_CACHE[key]
-    proc = AutoImageProcessor.from_pretrained(DEPTH_MODEL_ID)
-    model = AutoModelForDepthEstimation.from_pretrained(DEPTH_MODEL_ID).to(dev)
-    model.eval()
-    _DEPTH_CACHE[key] = (proc, model)
-    return _DEPTH_CACHE[key]
-def make_dwpose_map(
     img: Image.Image,
     *,
     use_gpu: bool,
-    max_people: int = 4,
     kp_thresh: float = 0.20,
 ) -> Image.Image:
     """
-    Returns an OpenPose-style pose map (RGB) using rtmlib Wholebody (DWPose 133 kpts).
-    Includes face + hands.
-    We draw on a black canvas (like ControlNet pose maps).
     """
     img = img.convert("RGB")
-    wb = _load_dwpose_model(use_gpu=bool(use_gpu))
-    # rtmlib expects cv2 BGR image
-    bgr = _to_cv2_bgr(img)
     try:
-        keypoints, scores = wb(bgr)
     except Exception as e:
-        # If anything goes wrong, fail gracefully
-        print("⚠️ DWPose inference failed:", e)
-        return Image.new("RGB", img.size, (0, 0, 0))
-    # keypoints: (N, K, 2), scores: (N, K)
-    if keypoints is None or len(keypoints) == 0:
-        return Image.new("RGB", img.size, (0, 0, 0))
-    # Limit people
     try:
-        n = int(max_people)
-        if n > 0 and keypoints.shape[0] > n:
-            keypoints = keypoints[:n]
-            scores = scores[:n]
-    except Exception:
-        pass
-    # Draw on black canvas
     try:
         from rtmlib import draw_skeleton
     except Exception as e:
         raise gr.Error(f"rtmlib draw_skeleton import failed: {e}")
     canvas = np.zeros_like(bgr, dtype=np.uint8)
-    # rtmlib uses kpt_thr threshold
-    canvas = draw_skeleton(canvas, keypoints, scores, kpt_thr=float(kp_thresh))
-    out = _bgr_to_pil_rgb(canvas)
-    # Ensure same size as input
-    if out.size != img.size:
-        out = out.resize(img.size, Image.BILINEAR)
-    return out
 def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
@@ -362,10 +372,7 @@ def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
     with torch.no_grad():
         out = model(**inputs)
-    # predicted_depth: (B, H, W)
     pred = out.predicted_depth
-    # Upsample to original image size
     pred = torch.nn.functional.interpolate(
         pred.unsqueeze(1),
         size=(img.height, img.width),
@@ -463,7 +470,7 @@ ADAPTER_SPECS = {
         "weights": "bfs_head_v5_2511_original.safetensors",
         "adapter_name": "BFS-Best-Faceswap",
         "strength": 1.0,
-        "needs_alpha_fix": True,  # <-- fixes KeyError 'img_in.alpha'
     },
     "BFS-Best-FaceSwap-merge": {
         "type": "single",
@@ -473,7 +480,7 @@ ADAPTER_SPECS = {
         "weights": "bfs_head_v5_2511_merged_version_rank_32_fp32.safetensors",
         "adapter_name": "BFS-Best-Faceswap-merge",
         "strength": 1.1,
-        "needs_alpha_fix": True,  # <-- fixes KeyError 'img_in.alpha'
     },
     "F2P": {
         "type": "single",
@@ -559,16 +566,12 @@ LORA_PRESET_PROMPTS = {
     "BFS-Best-FaceSwap-merge": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
 }
-# Track what is currently loaded in memory (adapter_name values)
 LOADED_ADAPTERS = set()
 # ============================================================
 # Helpers: resolution
 # ============================================================
-# We prefer *area-based* sizing (≈ megapixels) over long-edge sizing.
-# This aligns better with Qwen-Image-Edit's internal assumptions and reduces FOV drift.
 def _round_to_multiple(x: int, m: int) -> int:
     return max(m, (int(x) // m) * m)
@@ -577,14 +580,9 @@ def compute_canvas_dimensions_from_area(
     target_area: int,
     multiple_of: int,
 ) -> tuple[int, int]:
-    """Compute (width, height) that matches image aspect ratio and approximates target_area.
-    The result is floored to be divisible by multiple_of (typically vae_scale_factor*2).
-    """
     w, h = image.size
     aspect = w / h if h else 1.0
-    # Use the pipeline's own area->(w,h) helper for consistency.
     from qwenimage.pipeline_qwenimage_edit_plus import calculate_dimensions
     width, height = calculate_dimensions(int(target_area), float(aspect))
@@ -597,13 +595,6 @@ def get_target_area_for_lora(
     lora_adapter: str,
     user_target_megapixels: float,
 ) -> int:
-    """Return target pixel area for the canvas.
-    Priority:
-      1) Adapter spec: target_area (pixels) or target_megapixels
-      2) Adapter spec: target_long_edge (legacy) -> converted to area using image aspect
-      3) User slider target megapixels
-    """
     spec = ADAPTER_SPECS.get(lora_adapter, {})
     if "target_area" in spec:
@@ -619,7 +610,6 @@ def get_target_area_for_lora(
         except Exception:
             pass
-    # Legacy support (e.g. Upscale2K)
     if "target_long_edge" in spec:
         try:
             long_edge = int(spec["target_long_edge"])
@@ -634,7 +624,6 @@ def get_target_area_for_lora(
         except Exception:
             pass
-    # User default
     return int(float(user_target_megapixels) * 1024 * 1024)
 # ============================================================
@@ -644,67 +633,43 @@ def get_target_area_for_lora(
 def lora_requires_two_images(lora_adapter: str) -> bool:
     """Return True when the adapter's spec has a truthy ``requires_two_images`` entry.

     Adapters missing from ``ADAPTER_SPECS`` are treated as having an empty spec.
     """
     spec = ADAPTER_SPECS.get(lora_adapter, {})
     flag = spec.get("requires_two_images", False)
     return bool(flag)
 def image2_label_for_lora(lora_adapter: str) -> str:
     """Return the Image 2 upload label for the adapter.

     Uses the spec's ``image2_label`` entry when present, otherwise the
     generic default label; the value is always coerced to ``str``.
     """
     spec = ADAPTER_SPECS.get(lora_adapter, {})
     label = spec.get("image2_label", "Upload Reference (Image 2)")
     return str(label)
 def _to_pil_rgb(x) -> Optional[Image.Image]:
-    """
-    Accepts PIL / numpy / (image, caption) tuples from gr.Gallery and returns PIL RGB.
-    Gradio Gallery commonly yields tuples like (image, caption).
-    """
     if x is None:
         return None
-    # Gallery often returns (image, caption)
     if isinstance(x, tuple) and len(x) >= 1:
         x = x[0]
         if x is None:
             return None
     if isinstance(x, Image.Image):
         return x.convert("RGB")
     if isinstance(x, np.ndarray):
         return Image.fromarray(x).convert("RGB")
-    # Best-effort fallback
     try:
         return Image.fromarray(np.array(x)).convert("RGB")
     except Exception:
         return None
 def build_labeled_images(
     img1: Image.Image,
     img2: Optional[Image.Image],
     extra_imgs: Optional[list[Image.Image]],
 ) -> dict[str, Image.Image]:
-    """
-    Creates labels image_1, image_2, image_3... based on what is actually uploaded:
-      - img1 is always image_1
-      - img2 becomes image_2 only if present
-      - extras start immediately after the last present base box
-    The pipeline receives images in this exact order.
-    """
     labeled: dict[str, Image.Image] = {}
     idx = 1
     labeled[f"image_{idx}"] = img1
     idx += 1
     if img2 is not None:
         labeled[f"image_{idx}"] = img2
         idx += 1
     if extra_imgs:
         for im in extra_imgs:
             if im is None:
                 continue
             labeled[f"image_{idx}"] = im
             idx += 1
     return labeled
 # ============================================================
@@ -712,17 +677,7 @@ def build_labeled_images(
 # ============================================================
 def _inject_missing_alpha_keys(state_dict: dict) -> dict:
-    """
-    Diffusers' Qwen LoRA converter expects '<module>.alpha' keys.
-    BFS safetensors omits them. We inject alpha = rank (neutral scaling).
-    IMPORTANT: diffusers may strip 'diffusion_model.' before lookup, so we
-    inject BOTH:
-      - diffusion_model.xxx.alpha
-      - xxx.alpha
-    """
     bases = {}
     for k, v in state_dict.items():
         if not isinstance(v, torch.Tensor):
             continue
@@ -733,22 +688,17 @@ def _inject_missing_alpha_keys(state_dict: dict) -> dict:
     for base, rank in bases.items():
         alpha_tensor = torch.tensor(float(rank), dtype=torch.float32)
         full_alpha = f"{base}.alpha"
         if full_alpha not in state_dict:
             state_dict[full_alpha] = alpha_tensor
         if base.startswith("diffusion_model."):
             stripped_base = base[len("diffusion_model.") :]
             stripped_alpha = f"{stripped_base}.alpha"
             if stripped_alpha not in state_dict:
                 state_dict[stripped_alpha] = alpha_tensor
     return state_dict
 def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
-    """Return (filtered_state_dict, stats)."""
     keep_suffixes = (
         ".lora_up.weight",
         ".lora_down.weight",
@@ -756,7 +706,6 @@ def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
         ".alpha",
         ".lora_alpha",
     )
     dropped_patch = 0
     dropped_other = 0
     kept = 0
@@ -767,15 +716,12 @@ def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
         if not isinstance(v, torch.Tensor):
             dropped_other += 1
             continue
         if k.endswith(".diff") or k.endswith(".diff_b"):
             dropped_patch += 1
             continue
         if not k.endswith(keep_suffixes):
             dropped_other += 1
             continue
         if k.endswith(".lora_alpha"):
             base = k[: -len(".lora_alpha")]
             k2 = f"{base}.alpha"
@@ -783,7 +729,6 @@ def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
             normalized_alpha += 1
             kept += 1
             continue
         out[k] = v
         kept += 1
@@ -795,7 +740,6 @@ def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
     }
     return out, stats
 def _duplicate_stripped_prefix_keys(state_dict: dict, prefix: str = "diffusion_model.") -> dict:
     out = dict(state_dict)
     for k, v in list(state_dict.items()):
@@ -806,7 +750,6 @@ def _duplicate_stripped_prefix_keys(state_dict: dict, prefix: str = "diffusion_m
             out[stripped] = v
     return out
 def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name: str, needs_alpha_fix: bool = False):
     try:
         pipe.load_lora_weights(repo, weight_name=weight_name, adapter_name=adapter_name)
@@ -814,12 +757,10 @@ def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name:
     except (KeyError, ValueError) as e:
         if not needs_alpha_fix:
             raise
         print(
             "⚠️ LoRA load failed (will try safe dict fallback). "
             f"Adapter={adapter_name!r} file={weight_name!r} error={type(e).__name__}: {e}"
         )
         local_path = hf_hub_download(repo_id=repo, filename=weight_name)
         sd = safetensors_load_file(local_path)
@@ -832,11 +773,9 @@ def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name:
             f"kept={stats['kept']} dropped_patch={stats['dropped_patch']} "
             f"dropped_other={stats['dropped_other']} normalized_alpha={stats['normalized_alpha']}"
         )
         pipe.load_lora_weights(sd, adapter_name=adapter_name)
         return
 def _ensure_loaded_and_get_active_adapters(selected_lora: str):
     spec = ADAPTER_SPECS.get(selected_lora)
     if not spec:
@@ -849,7 +788,6 @@ def _ensure_loaded_and_get_active_adapters(selected_lora: str):
         parts = spec.get("parts", [])
         if not parts:
             raise gr.Error(f"Package spec has no parts: {selected_lora}")
         for part in parts:
             repo = part["repo"]
             weights = part["weights"]
@@ -871,10 +809,8 @@ def _ensure_loaded_and_get_active_adapters(selected_lora: str):
                     raise gr.Error(f"Failed to load adapter part {selected_lora}/{adapter_name}: {e}")
             else:
                 print(f"--- Adapter part already loaded: {selected_lora} / {adapter_name} ---")
             adapter_names.append(adapter_name)
             adapter_weights.append(strength)
     else:
         repo = spec["repo"]
         weights = spec["weights"]
@@ -902,13 +838,11 @@ def _ensure_loaded_and_get_active_adapters(selected_lora: str):
     return adapter_names, adapter_weights
 # ============================================================
 # UI handlers
 # ============================================================
 def on_lora_change_ui(selected_lora, current_prompt, current_extras_condition_only):
-    # Preset prompt (fill only if empty)
     if selected_lora != NONE_LORA:
         preset = LORA_PRESET_PROMPTS.get(selected_lora, "")
         if preset and (current_prompt is None or str(current_prompt).strip() == ""):
@@ -918,13 +852,11 @@ def on_lora_change_ui(selected_lora, current_prompt, current_extras_condition_on
     else:
         prompt_update = gr.update(value=current_prompt)
-    # Image2 visibility/label
     if lora_requires_two_images(selected_lora):
         img2_update = gr.update(visible=True, label=image2_label_for_lora(selected_lora))
     else:
         img2_update = gr.update(visible=False, value=None, label="Upload Reference (Image 2)")
-    # Extra references routing default:
     if selected_lora in ("BFS-Best-FaceSwap", "BFS-Best-FaceSwap-merge", "AnyPose"):
         extras_update = gr.update(value=True)
     else:
@@ -941,21 +873,26 @@ def set_output_as_image1(last):
         raise gr.Error("No output available yet.")
     return gr.update(value=last)
 def set_output_as_image2(last):
     if last is None:
         raise gr.Error("No output available yet.")
     return gr.update(value=last)
 def set_output_as_extra(last, existing_extra):
     if last is None:
         raise gr.Error("No output available yet.")
     return _append_to_gallery(existing_extra, last)
 @spaces.GPU
-def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu, derived_max_people):
     if img1 is None:
         raise gr.Error("Please upload Image 1 first.")
@@ -964,12 +901,13 @@ def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu, derived
     base = img1.convert("RGB")
-    if derived_type == "Pose (DWPose / rtmlib)":
-        derived = make_dwpose_map(
             base,
             use_gpu=bool(derived_use_gpu),
-            max_people=int(derived_max_people),
             kp_thresh=0.20,
         )
     elif derived_type == "Depth (Depth Anything V2 Small)":
         derived = make_depth_map(base, use_gpu=bool(derived_use_gpu))
@@ -979,7 +917,6 @@ def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu, derived
     new_gallery = _append_to_gallery(existing_extra, derived)
     return gr.update(value=new_gallery), gr.update(visible=True, value=derived)
 # ============================================================
 # Inference
 # ============================================================
@@ -988,7 +925,7 @@ def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu, derived
 def infer(
     input_image_1,
     input_image_2,
-    input_images_extra,  # gallery multi-image box
     prompt,
     lora_adapter,
     seed,
@@ -1007,7 +944,6 @@ def infer(
     if input_image_1 is None:
         raise gr.Error("Please upload Image 1.")
-    # Handle "None"
     if lora_adapter == NONE_LORA:
         try:
             pipe.set_adapters([], adapter_weights=[])
@@ -1030,7 +966,6 @@ def infer(
     img1 = input_image_1.convert("RGB")
     img2 = input_image_2.convert("RGB") if input_image_2 is not None else None
-    # Normalize extra images (Gallery) to PIL RGB (handles tuples from Gallery)
     extra_imgs: list[Image.Image] = []
     if input_images_extra:
         for item in input_images_extra:
@@ -1038,19 +973,15 @@ def infer(
             if pil is not None:
                 extra_imgs.append(pil)
-    # Enforce existing 2-image LoRA behavior (image_1 + image_2 required)
     if lora_requires_two_images(lora_adapter) and img2 is None:
         raise gr.Error("This LoRA needs two images. Please upload Image 2 as well.")
-    # Label images as image_1, image_2, image_3...
     labeled = build_labeled_images(img1, img2, extra_imgs)
-    # Pass to pipeline in labeled order. Keep single-image call when only one is present.
     pipe_images = list(labeled.values())
     if len(pipe_images) == 1:
         pipe_images = pipe_images[0]
-    # Resolution derived from Image 1 (base/body/target)
     target_area = get_target_area_for_lora(img1, lora_adapter, float(target_megapixels))
     width, height = compute_canvas_dimensions_from_area(
         img1,
@@ -1058,7 +989,6 @@ def infer(
         multiple_of=int(pipe.vae_scale_factor * 2),
     )
-    # Decide which images participate in the VAE latent stream.
     vae_image_indices = None
     if extras_condition_only:
         if isinstance(pipe_images, list) and len(pipe_images) > 2:
@@ -1066,8 +996,8 @@ def infer(
     try:
         print(
-             "[DEBUG][infer] submitting request | "
-             f"lora_adapter={lora_adapter!r} seed={seed} prompt={prompt!r}"
         )
         result = pipe(
@@ -1088,7 +1018,6 @@ def infer(
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
 @spaces.GPU
 def infer_example(input_image, prompt, lora_adapter):
     if input_image is None:
@@ -1099,7 +1028,6 @@ def infer_example(input_image, prompt, lora_adapter):
     result, seed, last = infer(input_pil, None, None, prompt, lora_adapter, 0, True, guidance_scale, steps, 1.0, True, True)
     return result, seed, last
 # ============================================================
 # UI
 # ============================================================
@@ -1181,20 +1109,33 @@ with gr.Blocks() as demo:
                             label="Derived Type (from Image 1)",
                             choices=[
                                 "None",
-                                "Pose (DWPose / rtmlib)",
                                 "Depth (Depth Anything V2 Small)",
                             ],
                             value="None",
                         )
                         derived_use_gpu = gr.Checkbox(label="Use GPU for derived model", value=False)
                         derived_max_people = gr.Slider(
-                            label="Max people (pose)",
                             minimum=1,
                             maximum=10,
                             step=1,
                             value=4,
                         )
-                        add_derived_btn = gr.Button("➕ Add derived ref to Extras (conditioning-only recommended)")
                     seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                     randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
@@ -1216,7 +1157,6 @@ with gr.Blocks() as demo:
                         value=True,
                     )
-        # On LoRA selection: preset prompt + toggle Image 2
         lora_adapter.change(
             fn=on_lora_change_ui,
             inputs=[lora_adapter, prompt, extras_condition_only],
@@ -1276,15 +1216,21 @@ with gr.Blocks() as demo:
         outputs=[output_image, seed, last_output],
     )
-    # Output routing buttons
     btn_out_to_img1.click(fn=set_output_as_image1, inputs=[last_output], outputs=[input_image_1])
     btn_out_to_img2.click(fn=set_output_as_image2, inputs=[last_output], outputs=[input_image_2])
     btn_out_to_extra.click(fn=set_output_as_extra, inputs=[last_output, input_images_extra], outputs=[input_images_extra])
-    # Derived conditioning: append pose/depth map as extra ref (UI shows preview)
     add_derived_btn.click(
         fn=add_derived_ref,
-        inputs=[input_image_1, input_images_extra, derived_type, derived_use_gpu, derived_max_people],
         outputs=[input_images_extra, derived_preview],
     )

 import spaces
 import torch
 import random
+import cv2
+from PIL import Image, ImageDraw
 from typing import Iterable, Optional
 from transformers import (
         return None
     if _VER_RE.fullmatch(s):
         return s
     if _DIGITS_RE.fullmatch(s):
         return f"v{s}"
     return None
     return p
 try:
     pipe = _load_pipe_with_version(AIO_VERSION)
+except Exception:
     print("❌ Failed to load requested AIO_VERSION. Falling back to v19.")
     print("---- exception ----")
     print(traceback.format_exc())
     AIO_VERSION_SOURCE = "fallback_to_v19"
     pipe = _load_pipe_with_version(AIO_VERSION)
 try:
     pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
     print("Flash Attention 3 Processor set successfully.")
 MAX_SEED = np.iinfo(np.int32).max
 # ============================================================
+# Derived conditioning: DWPose + Depth
 # ============================================================
 DEPTH_MODEL_ID = "depth-anything/Depth-Anything-V2-Small-hf"
+# Lazy caches: _DEPTH_CACHE is keyed by str(device); _DWPOSE_CACHE by (device, to_openpose, mode, backend)
+_DWPOSE_CACHE = {}
+_DEPTH_CACHE = {}
 def _derived_device(use_gpu: bool) -> torch.device:
     return torch.device("cuda" if (use_gpu and torch.cuda.is_available()) else "cpu")
+def _load_depth_models(dev: torch.device):
+    key = str(dev)
+    if key in _DEPTH_CACHE:
+        return _DEPTH_CACHE[key]
+    proc = AutoImageProcessor.from_pretrained(DEPTH_MODEL_ID)
+    model = AutoModelForDepthEstimation.from_pretrained(DEPTH_MODEL_ID).to(dev)
+    model.eval()
+    _DEPTH_CACHE[key] = (proc, model)
+    return _DEPTH_CACHE[key]
+def _load_dwpose(use_gpu: bool, *, to_openpose: bool = True, mode: str = "balanced", backend: str = "onnxruntime"):
     """
+    DWPose-ish wholebody via rtmlib Wholebody (RTMW-DW by default in rtmlib downloads).
     """
+    key = ("cuda" if (use_gpu and torch.cuda.is_available()) else "cpu", bool(to_openpose), str(mode), str(backend))
     if key in _DWPOSE_CACHE:
         return _DWPOSE_CACHE[key]
         from rtmlib import Wholebody
     except Exception as e:
         raise gr.Error(
+            "rtmlib not available. Add `rtmlib` to requirements.txt.\n"
             f"Import error: {e}"
         )
+    device_str = "cuda" if (use_gpu and torch.cuda.is_available()) else "cpu"
     try:
         model = Wholebody(
+            to_openpose=bool(to_openpose),
+            mode=str(mode),
+            backend=str(backend),
+            device=device_str,
         )
     except Exception as e:
+        if device_str == "cuda":
+            print(f"⚠️ rtmlib Wholebody CUDA init failed: {e} -> falling back to CPU")
             model = Wholebody(
+                to_openpose=bool(to_openpose),
+                mode=str(mode),
+                backend=str(backend),
                 device="cpu",
             )
         else:
     return model
+def make_dwpose_map_debug(
     img: Image.Image,
     *,
     use_gpu: bool,
     kp_thresh: float = 0.20,
+    to_openpose: bool = True,
+    openpose_skeleton: Optional[bool] = None,
 ) -> Image.Image:
     """
+    Run rtmlib Wholebody and attempt to draw with rtmlib.draw_skeleton,
+    BUT includes verbose debugging prints so we can see shapes / K.
+    IMPORTANT:
+      - If to_openpose=True, outputs commonly have K=134 (openpose wholebody).
+      - If to_openpose=False, outputs commonly have K=133 (coco wholebody).
+      - rtmlib.draw_skeleton needs correct openpose_skeleton flag AND correct shape.
     """
     img = img.convert("RGB")
+    wb = _load_dwpose(use_gpu=bool(use_gpu), to_openpose=bool(to_openpose))
+    bgr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+    keypoints, scores = wb(bgr)
+    # -------------------- DEBUGGER --------------------
+    kps = np.asarray(keypoints)
+    sc = np.asarray(scores)
+    print("[DWPose debug] keypoints type:", type(keypoints), "scores type:", type(scores))
+    print("[DWPose debug] kps.shape:", getattr(kps, "shape", None), "dtype:", getattr(kps, "dtype", None))
+    print("[DWPose debug] sc.shape :", getattr(sc, "shape", None), "dtype:", getattr(sc, "dtype", None))
+    if isinstance(keypoints, list):
+        print("[DWPose debug] keypoints list len:", len(keypoints))
+    if isinstance(scores, list):
+        print("[DWPose debug] scores list len:", len(scores))
     try:
+        if hasattr(kps, "shape") and len(kps.shape) >= 2:
+            K = kps.shape[-2]  # works for (K,2) and (N,K,2)
+            print("[DWPose debug] inferred K (num keypoints):", int(K))
     except Exception as e:
+        print("[DWPose debug] could not infer K:", e)
     try:
+        if kps.ndim == 3:
+            print("[DWPose debug] first person first 3 kpts:", kps[0, :3, :])
+            if sc.ndim >= 2:
+                print("[DWPose debug] first person first 3 scores:", sc[0, :3])
+        elif kps.ndim == 2:
+            print("[DWPose debug] first 3 kpts:", kps[:3, :])
+            if sc.ndim >= 1:
+                print("[DWPose debug] first 3 scores:", sc[:3])
+    except Exception as e:
+        print("[DWPose debug] sample print failed:", e)
+    # ------------------ END DEBUGGER ------------------
+    # Attempt to draw (this is what currently errors for you)
     try:
         from rtmlib import draw_skeleton
     except Exception as e:
         raise gr.Error(f"rtmlib draw_skeleton import failed: {e}")
     canvas = np.zeros_like(bgr, dtype=np.uint8)
+    # IMPORTANT: draw_skeleton in your pasted code infers skeleton by num_keypoints,
+    # but also needs correct openpose_skeleton flag depending on whether K is openpose-style.
+    # For debug run, we allow:
+    #   - openpose_skeleton override if provided
+    #   - else: default to 'to_openpose' (best guess)
+    if openpose_skeleton is None:
+        openpose_skeleton = bool(to_openpose)
+    # Normalize shapes BEFORE calling draw_skeleton so it doesn't mis-read K as 2
+    kps2 = np.asarray(keypoints)
+    sc2 = np.asarray(scores)
+    # If single instance comes back as (K,2) we must expand before draw_skeleton reads shape[1]
+    if kps2.ndim == 2 and kps2.shape[-1] == 2:
+        kps2 = kps2[None, :, :]
+    if sc2.ndim == 1:
+        sc2 = sc2[None, :]
+    # Now call rtmlib's draw
+    out = draw_skeleton(
+        canvas,
+        kps2,
+        sc2,
+        openpose_skeleton=bool(openpose_skeleton),
+        kpt_thr=float(kp_thresh),
+    )
+    out_rgb = cv2.cvtColor(out, cv2.COLOR_BGR2RGB)
+    return Image.fromarray(out_rgb)
 def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
     with torch.no_grad():
         out = model(**inputs)
     pred = out.predicted_depth
     pred = torch.nn.functional.interpolate(
         pred.unsqueeze(1),
         size=(img.height, img.width),
         "weights": "bfs_head_v5_2511_original.safetensors",
         "adapter_name": "BFS-Best-Faceswap",
         "strength": 1.0,
+        "needs_alpha_fix": True,
     },
     "BFS-Best-FaceSwap-merge": {
         "type": "single",
         "weights": "bfs_head_v5_2511_merged_version_rank_32_fp32.safetensors",
         "adapter_name": "BFS-Best-Faceswap-merge",
         "strength": 1.1,
+        "needs_alpha_fix": True,
     },
     "F2P": {
         "type": "single",
     "BFS-Best-FaceSwap-merge": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
 }
 LOADED_ADAPTERS = set()
 # ============================================================
 # Helpers: resolution
 # ============================================================
 def _round_to_multiple(x: int, m: int) -> int:
     return max(m, (int(x) // m) * m)
     target_area: int,
     multiple_of: int,
 ) -> tuple[int, int]:
     w, h = image.size
     aspect = w / h if h else 1.0
     from qwenimage.pipeline_qwenimage_edit_plus import calculate_dimensions
     width, height = calculate_dimensions(int(target_area), float(aspect))
     lora_adapter: str,
     user_target_megapixels: float,
 ) -> int:
     spec = ADAPTER_SPECS.get(lora_adapter, {})
     if "target_area" in spec:
         except Exception:
             pass
     if "target_long_edge" in spec:
         try:
             long_edge = int(spec["target_long_edge"])
         except Exception:
             pass
     return int(float(user_target_megapixels) * 1024 * 1024)
 # ============================================================
 def lora_requires_two_images(lora_adapter: str) -> bool:
     """Report whether the adapter's spec sets a truthy ``requires_two_images`` flag.
     Adapters missing from ``ADAPTER_SPECS``, or lacking the key, yield ``False``.
     """
     adapter_spec = ADAPTER_SPECS.get(lora_adapter, {})
     flag = adapter_spec.get("requires_two_images", False)
     return bool(flag)
 def image2_label_for_lora(lora_adapter: str) -> str:
     """Return the UI label for the Image 2 input of the given adapter.
     Uses the spec's ``image2_label`` entry when present, otherwise the generic
     default label; the result is always coerced to ``str``.
     """
     adapter_spec = ADAPTER_SPECS.get(lora_adapter, {})
     label = adapter_spec.get("image2_label", "Upload Reference (Image 2)")
     return str(label)
 def _to_pil_rgb(x) -> Optional[Image.Image]:
     if x is None:
         return None
     if isinstance(x, tuple) and len(x) >= 1:
         x = x[0]
         if x is None:
             return None
     if isinstance(x, Image.Image):
         return x.convert("RGB")
     if isinstance(x, np.ndarray):
         return Image.fromarray(x).convert("RGB")
     try:
         return Image.fromarray(np.array(x)).convert("RGB")
     except Exception:
         return None
 def build_labeled_images(
     img1: Image.Image,
     img2: Optional[Image.Image],
     extra_imgs: Optional[list[Image.Image]],
 ) -> dict[str, Image.Image]:
     """Assign consecutive ``image_<n>`` keys (starting at 1) to the supplied images.
     ``img1`` always takes ``image_1``. ``img2`` follows when it is not ``None``,
     then every non-``None`` entry of ``extra_imgs`` in its original order.
     Skipped entries do not consume a number.
     """
     ordered = [img1]
     if img2 is not None:
         ordered.append(img2)
     if extra_imgs:
         ordered.extend(picture for picture in extra_imgs if picture is not None)
     return {f"image_{position}": picture for position, picture in enumerate(ordered, start=1)}
 # ============================================================
 # ============================================================
 def _inject_missing_alpha_keys(state_dict: dict) -> dict:
     bases = {}
     for k, v in state_dict.items():
         if not isinstance(v, torch.Tensor):
             continue
     for base, rank in bases.items():
         alpha_tensor = torch.tensor(float(rank), dtype=torch.float32)
         full_alpha = f"{base}.alpha"
         if full_alpha not in state_dict:
             state_dict[full_alpha] = alpha_tensor
         if base.startswith("diffusion_model."):
             stripped_base = base[len("diffusion_model.") :]
             stripped_alpha = f"{stripped_base}.alpha"
             if stripped_alpha not in state_dict:
                 state_dict[stripped_alpha] = alpha_tensor
     return state_dict
 def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
     keep_suffixes = (
         ".lora_up.weight",
         ".lora_down.weight",
         ".alpha",
         ".lora_alpha",
     )
     dropped_patch = 0
     dropped_other = 0
     kept = 0
         if not isinstance(v, torch.Tensor):
             dropped_other += 1
             continue
         if k.endswith(".diff") or k.endswith(".diff_b"):
             dropped_patch += 1
             continue
         if not k.endswith(keep_suffixes):
             dropped_other += 1
             continue
         if k.endswith(".lora_alpha"):
             base = k[: -len(".lora_alpha")]
             k2 = f"{base}.alpha"
             normalized_alpha += 1
             kept += 1
             continue
         out[k] = v
         kept += 1
     }
     return out, stats
 def _duplicate_stripped_prefix_keys(state_dict: dict, prefix: str = "diffusion_model.") -> dict:
     out = dict(state_dict)
     for k, v in list(state_dict.items()):
             out[stripped] = v
     return out
 def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name: str, needs_alpha_fix: bool = False):
     try:
         pipe.load_lora_weights(repo, weight_name=weight_name, adapter_name=adapter_name)
     except (KeyError, ValueError) as e:
         if not needs_alpha_fix:
             raise
         print(
             "⚠️ LoRA load failed (will try safe dict fallback). "
             f"Adapter={adapter_name!r} file={weight_name!r} error={type(e).__name__}: {e}"
         )
         local_path = hf_hub_download(repo_id=repo, filename=weight_name)
         sd = safetensors_load_file(local_path)
             f"kept={stats['kept']} dropped_patch={stats['dropped_patch']} "
             f"dropped_other={stats['dropped_other']} normalized_alpha={stats['normalized_alpha']}"
         )
         pipe.load_lora_weights(sd, adapter_name=adapter_name)
         return
 def _ensure_loaded_and_get_active_adapters(selected_lora: str):
     spec = ADAPTER_SPECS.get(selected_lora)
     if not spec:
         parts = spec.get("parts", [])
         if not parts:
             raise gr.Error(f"Package spec has no parts: {selected_lora}")
         for part in parts:
             repo = part["repo"]
             weights = part["weights"]
                     raise gr.Error(f"Failed to load adapter part {selected_lora}/{adapter_name}: {e}")
             else:
                 print(f"--- Adapter part already loaded: {selected_lora} / {adapter_name} ---")
             adapter_names.append(adapter_name)
             adapter_weights.append(strength)
     else:
         repo = spec["repo"]
         weights = spec["weights"]
     return adapter_names, adapter_weights
 # ============================================================
 # UI handlers
 # ============================================================
 def on_lora_change_ui(selected_lora, current_prompt, current_extras_condition_only):
     if selected_lora != NONE_LORA:
         preset = LORA_PRESET_PROMPTS.get(selected_lora, "")
         if preset and (current_prompt is None or str(current_prompt).strip() == ""):
     else:
         prompt_update = gr.update(value=current_prompt)
     if lora_requires_two_images(selected_lora):
         img2_update = gr.update(visible=True, label=image2_label_for_lora(selected_lora))
     else:
         img2_update = gr.update(visible=False, value=None, label="Upload Reference (Image 2)")
     if selected_lora in ("BFS-Best-FaceSwap", "BFS-Best-FaceSwap-merge", "AnyPose"):
         extras_update = gr.update(value=True)
     else:
         raise gr.Error("No output available yet.")
     return gr.update(value=last)
 def set_output_as_image2(last):
     """Return a Gradio update that sets the target component's value to ``last``.
     Raises ``gr.Error`` when ``last`` is ``None``, i.e. there is no output to reuse.
     """
     if last is not None:
         return gr.update(value=last)
     raise gr.Error("No output available yet.")
 def set_output_as_extra(last, existing_extra):
     """Append ``last`` to the existing extras gallery via ``_append_to_gallery``.
     Raises ``gr.Error`` when ``last`` is ``None``, i.e. there is no output to reuse.
     """
     if last is not None:
         return _append_to_gallery(existing_extra, last)
     raise gr.Error("No output available yet.")
 @spaces.GPU
+def add_derived_ref(
+    img1,
+    existing_extra,
+    derived_type,
+    derived_use_gpu,
+    derived_max_people,  # kept for UI compatibility; not used by dwpose here
+    derived_dwpose_to_openpose,
+    derived_dwpose_openpose_flag,
+):
     if img1 is None:
         raise gr.Error("Please upload Image 1 first.")
     base = img1.convert("RGB")
+    if derived_type == "Pose (DWPose / rtmlib) [DEBUG]":
+        derived = make_dwpose_map_debug(
             base,
             use_gpu=bool(derived_use_gpu),
             kp_thresh=0.20,
+            to_openpose=bool(derived_dwpose_to_openpose),
+            openpose_skeleton=(None if derived_dwpose_openpose_flag == "Auto" else (derived_dwpose_openpose_flag == "True")),
         )
     elif derived_type == "Depth (Depth Anything V2 Small)":
         derived = make_depth_map(base, use_gpu=bool(derived_use_gpu))
     new_gallery = _append_to_gallery(existing_extra, derived)
     return gr.update(value=new_gallery), gr.update(visible=True, value=derived)
 # ============================================================
 # Inference
 # ============================================================
 def infer(
     input_image_1,
     input_image_2,
+    input_images_extra,
     prompt,
     lora_adapter,
     seed,
     if input_image_1 is None:
         raise gr.Error("Please upload Image 1.")
     if lora_adapter == NONE_LORA:
         try:
             pipe.set_adapters([], adapter_weights=[])
     img1 = input_image_1.convert("RGB")
     img2 = input_image_2.convert("RGB") if input_image_2 is not None else None
     extra_imgs: list[Image.Image] = []
     if input_images_extra:
         for item in input_images_extra:
             if pil is not None:
                 extra_imgs.append(pil)
     if lora_requires_two_images(lora_adapter) and img2 is None:
         raise gr.Error("This LoRA needs two images. Please upload Image 2 as well.")
     labeled = build_labeled_images(img1, img2, extra_imgs)
     pipe_images = list(labeled.values())
     if len(pipe_images) == 1:
         pipe_images = pipe_images[0]
     target_area = get_target_area_for_lora(img1, lora_adapter, float(target_megapixels))
     width, height = compute_canvas_dimensions_from_area(
         img1,
         multiple_of=int(pipe.vae_scale_factor * 2),
     )
     vae_image_indices = None
     if extras_condition_only:
         if isinstance(pipe_images, list) and len(pipe_images) > 2:
     try:
         print(
+            "[DEBUG][infer] submitting request | "
+            f"lora_adapter={lora_adapter!r} seed={seed} prompt={prompt!r}"
         )
         result = pipe(
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
 @spaces.GPU
 def infer_example(input_image, prompt, lora_adapter):
     if input_image is None:
     result, seed, last = infer(input_pil, None, None, prompt, lora_adapter, 0, True, guidance_scale, steps, 1.0, True, True)
     return result, seed, last
 # ============================================================
 # UI
 # ============================================================
                             label="Derived Type (from Image 1)",
                             choices=[
                                 "None",
+                                "Pose (DWPose / rtmlib) [DEBUG]",
                                 "Depth (Depth Anything V2 Small)",
                             ],
                             value="None",
                         )
                         derived_use_gpu = gr.Checkbox(label="Use GPU for derived model", value=False)
+                        # kept for UI compatibility (not used by dwpose here)
                         derived_max_people = gr.Slider(
+                            label="Max people (unused for dwpose)",
                             minimum=1,
                             maximum=10,
                             step=1,
                             value=4,
                         )
+                        derived_dwpose_to_openpose = gr.Checkbox(
+                            label="DWPose output: to_openpose=True (likely K=134)",
+                            value=True,
+                        )
+                        derived_dwpose_openpose_flag = gr.Dropdown(
+                            label="draw_skeleton openpose_skeleton flag",
+                            choices=["Auto", "True", "False"],
+                            value="Auto",
+                        )
+                        add_derived_btn = gr.Button("➕ Add derived ref to Extras (and print debug to logs)")
                     seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                     randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                         value=True,
                     )
         lora_adapter.change(
             fn=on_lora_change_ui,
             inputs=[lora_adapter, prompt, extras_condition_only],
         outputs=[output_image, seed, last_output],
     )
     btn_out_to_img1.click(fn=set_output_as_image1, inputs=[last_output], outputs=[input_image_1])
     btn_out_to_img2.click(fn=set_output_as_image2, inputs=[last_output], outputs=[input_image_2])
     btn_out_to_extra.click(fn=set_output_as_extra, inputs=[last_output, input_images_extra], outputs=[input_images_extra])
     add_derived_btn.click(
         fn=add_derived_ref,
+        inputs=[
+            input_image_1,
+            input_images_extra,
+            derived_type,
+            derived_use_gpu,
+            derived_max_people,
+            derived_dwpose_to_openpose,
+            derived_dwpose_openpose_flag,
+        ],
         outputs=[input_images_extra, derived_preview],
     )