Qwen-Image-Edit-Rapid-AIO-Loras-Experimental

Running on Zero

App Files Files Community

Professional Noob committed on Jan 31

Commit

d8b9abb

verified ·

1 Parent(s): e32d379

Update app.py

Browse files

Files changed (1) hide show

app.py +332 -14

app.py CHANGED Viewed

@@ -7,9 +7,17 @@ import numpy as np
 import spaces
 import torch
 import random
-from PIL import Image
 from typing import Iterable, Optional
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file as safetensors_load_file
@@ -194,6 +202,221 @@ except Exception as e:
 MAX_SEED = np.iinfo(np.int32).max
 # ============================================================
 # LoRA adapters + presets
 # ============================================================
@@ -777,6 +1000,55 @@ def on_lora_change_ui(selected_lora, current_prompt, current_extras_condition_on
         extras_update = gr.update(value=current_extras_condition_only)
     return prompt_update, img2_update, extras_update
 # ============================================================
@@ -869,12 +1141,9 @@ def infer(
     try:
         print(
              "[DEBUG][infer] submitting request | "
-             f"lora_adapter={lora_adapter!r} seed={seed} prompt={prompt!r} "
-             f"canvas={width}x{height} target_area={target_area} "
-             f"extras_condition_only={extras_condition_only} vae_image_indices={vae_image_indices} "
-             f"pad_to_canvas={bool(pad_to_canvas)}"
         )
         result = pipe(
             image=pipe_images,
             prompt=prompt,
@@ -887,7 +1156,7 @@ def infer(
             vae_image_indices=vae_image_indices,
             pad_to_canvas=bool(pad_to_canvas),
         ).images[0]
-        return result, seed
     finally:
         gc.collect()
         if torch.cuda.is_available():
@@ -897,13 +1166,14 @@ def infer(
 @spaces.GPU
 def infer_example(input_image, prompt, lora_adapter):
     if input_image is None:
-        return None, 0
     input_pil = input_image.convert("RGB")
     guidance_scale = 1.0
     steps = 4
     # Examples don't supply Image 2 or extra images; and example list doesn't include AnyPose/BFS.
-    result, seed = infer(input_pil, None, None, prompt, lora_adapter, 0, True, guidance_scale, steps, 1.0, True, True)
-    return result, seed
 # ============================================================
@@ -958,6 +1228,21 @@ with gr.Blocks() as demo:
             with gr.Column():
                 output_image = gr.Image(label="Output Image", interactive=False, format="png", height=353)
                 with gr.Row():
                     lora_choices = [NONE_LORA] + list(ADAPTER_SPECS.keys())
                     lora_adapter = gr.Dropdown(
@@ -967,6 +1252,27 @@ with gr.Blocks() as demo:
                     )
                 with gr.Accordion("Advanced Settings", open=False, visible=True):
                     seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                     randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                     guidance_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, step=0.1, value=1.0)
@@ -987,7 +1293,7 @@ with gr.Blocks() as demo:
                         value=True,
                     )
-        # On LoRA selection: preset prompt + toggle Image 2 + default extras routing
         lora_adapter.change(
             fn=on_lora_change_ui,
             inputs=[lora_adapter, prompt, extras_condition_only],
@@ -1022,7 +1328,7 @@ with gr.Blocks() as demo:
                 ["examples/11.jpg", "Upscale this picture to 4K resolution.", "Upscale2K"],
             ],
             inputs=[input_image_1, prompt, lora_adapter],
-            outputs=[output_image, seed],
             fn=infer_example,
             cache_examples=False,
             label="Examples",
@@ -1044,9 +1350,21 @@ with gr.Blocks() as demo:
             extras_condition_only,
             pad_to_canvas,
         ],
-        outputs=[output_image, seed],
     )
 if __name__ == "__main__":
     demo.queue(max_size=30).launch(
         css=css,
@@ -1054,4 +1372,4 @@ if __name__ == "__main__":
         mcp_server=True,
         ssr_mode=False,
         show_error=True,
-    )

 import spaces
 import torch
 import random
+from PIL import Image, ImageDraw
 from typing import Iterable, Optional
+from transformers import (
+    AutoProcessor,
+    RTDetrForObjectDetection,
+    VitPoseForPoseEstimation,
+    AutoImageProcessor,
+    AutoModelForDepthEstimation,
+)
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file as safetensors_load_file
 MAX_SEED = np.iinfo(np.int32).max
+# ============================================================
+# Derived conditioning (Transformers): Pose + Depth
+# ============================================================
+# Pose estimation uses ViTPose (top-down). Official docs show RT-DETR -> ViTPose flow:
+# https://huggingface.co/docs/transformers/model_doc/vitpose
+# Depth uses Depth Anything V2 Small (Transformers-compatible):
+# https://huggingface.co/depth-anything/Depth-Anything-V2-Small-hf
# Hugging Face Hub identifiers of the checkpoints used for derived conditioning.
POSE_MODEL_ID: str = "usyd-community/vitpose-base-simple"
POSE_DETECTOR_ID: str = "PekingU/rtdetr_r50vd_coco_o365"
DEPTH_MODEL_ID: str = "depth-anything/Depth-Anything-V2-Small-hf"

# Loaded models are memoised here, one entry per device string ("cpu" / "cuda"),
# so each checkpoint is only instantiated the first time it is needed.
_POSE_CACHE: dict = {}
_DEPTH_CACHE: dict = {}

# Pairs of COCO-17 keypoint indices joined by a line when drawing the
# "OpenPose-like" stick figure.
COCO17_EDGES = [
    # head
    (0, 1),
    (0, 2),
    (1, 3),
    (2, 4),
    # shoulders
    (5, 6),
    # left arm
    (5, 7),
    (7, 9),
    # right arm
    (6, 8),
    (8, 10),
    # torso
    (5, 11),
    (6, 12),
    (11, 12),
    # left leg
    (11, 13),
    (13, 15),
    # right leg
    (12, 14),
    (14, 16),
]
def _derived_device(use_gpu: bool) -> torch.device:
    """Pick the torch device on which the derived-conditioning models run.

    CUDA is chosen only when the caller asked for it *and* it is actually
    available; in every other case the CPU device is returned.
    """
    if use_gpu and torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")
def _load_pose_models(dev: torch.device):
    """Return ``(det_proc, det_model, pose_proc, pose_model)`` for *dev*.

    The RT-DETR detector and the ViTPose model (plus their processors) are
    instantiated on first use, moved to *dev*, switched to eval mode and
    stored in ``_POSE_CACHE`` under ``str(dev)``; later calls for the same
    device return the cached tuple.
    """
    cache_key = str(dev)
    try:
        return _POSE_CACHE[cache_key]
    except KeyError:
        pass  # first request for this device: load everything below

    # Person detector (supplies the boxes used in multi-person mode).
    detector_processor = AutoProcessor.from_pretrained(POSE_DETECTOR_ID)
    detector = RTDetrForObjectDetection.from_pretrained(POSE_DETECTOR_ID).to(dev)

    # Top-down pose estimator.
    pose_processor = AutoProcessor.from_pretrained(POSE_MODEL_ID)
    pose_estimator = VitPoseForPoseEstimation.from_pretrained(POSE_MODEL_ID).to(dev)

    for module in (detector, pose_estimator):
        module.eval()

    bundle = (detector_processor, detector, pose_processor, pose_estimator)
    _POSE_CACHE[cache_key] = bundle
    return bundle
def _load_depth_models(dev: torch.device):
    """Return ``(processor, model)`` of the depth estimator for *dev*.

    The pair is created on first use (model moved to *dev* and put in eval
    mode) and memoised in ``_DEPTH_CACHE`` under ``str(dev)``.
    """
    cache_key = str(dev)
    try:
        return _DEPTH_CACHE[cache_key]
    except KeyError:
        pass  # not loaded for this device yet

    depth_processor = AutoImageProcessor.from_pretrained(DEPTH_MODEL_ID)
    depth_model = AutoModelForDepthEstimation.from_pretrained(DEPTH_MODEL_ID).to(dev)
    depth_model.eval()

    bundle = (depth_processor, depth_model)
    _DEPTH_CACHE[cache_key] = bundle
    return bundle
def _draw_skeleton_on_blank(
    size: tuple[int, int],
    persons_keypoints: list[np.ndarray],
    persons_scores: list[np.ndarray],
    kp_thresh: float = 0.20,
    point_r: int = 3,
    line_w: int = 3,
) -> Image.Image:
    """Draw white stick-figure skeletons on a black RGB canvas.

    Args:
        size: ``(width, height)`` of the canvas in pixels.
        persons_keypoints: one array per person; row ``i`` is read as the
            ``(x, y)`` position of keypoint ``i``.
        persons_scores: one array per person holding a confidence value per
            keypoint, aligned with ``persons_keypoints``.
        kp_thresh: keypoints scoring below this value are skipped, as is any
            edge touching such a keypoint.
        point_r: half-size in pixels of the dot drawn for each keypoint.
        line_w: width in pixels of the lines drawn for ``COCO17_EDGES``.

    Returns:
        The rendered canvas.
    """
    w, h = size
    canvas = Image.new("RGB", (w, h), (0, 0, 0))
    draw = ImageDraw.Draw(canvas)
    white = (255, 255, 255)

    for kps, sc in zip(persons_keypoints, persons_scores):
        # Only indices present in BOTH arrays are safe to read. Using the same
        # bound for edges and points avoids an IndexError on the edge pass when
        # the keypoint array is shorter than the score array.
        n = min(len(sc), len(kps))

        # Draw edges
        for a, b in COCO17_EDGES:
            if a >= n or b >= n:
                continue
            if sc[a] < kp_thresh or sc[b] < kp_thresh:
                continue
            start = (float(kps[a, 0]), float(kps[a, 1]))
            end = (float(kps[b, 0]), float(kps[b, 1]))
            draw.line([start, end], fill=white, width=line_w)

        # Draw keypoints
        for i in range(n):
            if sc[i] < kp_thresh:
                continue
            x, y = float(kps[i, 0]), float(kps[i, 1])
            draw.ellipse(
                [(x - point_r, y - point_r), (x + point_r, y + point_r)],
                fill=white,
                outline=None,
            )
    return canvas
def make_pose_map(
    img: Image.Image,
    *,
    use_gpu: bool,
    mode: str,
    det_thresh: float = 0.30,
    max_people: int = 4,
) -> Image.Image:
    """Return an OpenPose-like skeleton map (RGB) using Transformers models.

    Args:
        img: source image; it is converted to RGB before processing.
        use_gpu: run the models on CUDA when it is available, else on CPU.
        mode:
          - "fast": full-frame box (no detector pass). Good when the image is
            already a single subject.
          - any other value (the UI passes "detect"): RT-DETR person boxes ->
            ViTPose. Better for multi-person scenes. Falls back to the
            full-frame box when no person is detected.
        det_thresh: score threshold handed to the detector post-processing.
        max_people: upper bound on the number of person boxes sent to the pose
            model; values below 1 are treated as 1.

    Returns:
        An RGB image the same size as ``img`` with white skeletons on black,
        or an all-black image when the pose model yields no results.
    """
    img = img.convert("RGB")
    dev = _derived_device(use_gpu)
    det_proc, det_model, pose_proc, pose_model = _load_pose_models(dev)
    w, h = img.size

    # Single box covering the whole image, COCO format [x, y, w, h].
    full_frame = np.array([[0.0, 0.0, float(w), float(h)]], dtype=np.float32)
    # A non-positive limit would hand the pose model an empty box list.
    limit = max(1, int(max_people))

    if mode == "fast":
        boxes = full_frame
    else:
        # Detect people
        inputs = det_proc(images=img, return_tensors="pt").to(dev)
        with torch.no_grad():
            outputs = det_model(**inputs)
        results = det_proc.post_process_object_detection(
            outputs,
            target_sizes=torch.tensor([(h, w)], device=dev),
            threshold=det_thresh,
        )[0]
        # COCO label 0 is "person" for COCO-trained detectors
        is_person = results["labels"] == 0
        person_boxes = results["boxes"][is_person]
        if person_boxes.shape[0] == 0:
            # Fallback to full-frame
            boxes = full_frame
        else:
            # Rank by confidence explicitly so that truncating to `limit`
            # keeps the strongest detections regardless of output order.
            person_scores = results["scores"][is_person]
            keep = torch.argsort(person_scores, descending=True)[:limit]
            boxes = person_boxes[keep].detach().cpu().numpy().astype(np.float32)
            # Convert VOC x1,y1,x2,y2 to COCO x,y,w,h
            boxes[:, 2] -= boxes[:, 0]
            boxes[:, 3] -= boxes[:, 1]

    if boxes.shape[0] > limit:
        boxes = boxes[:limit]

    pose_inputs = pose_proc(img, boxes=[boxes], return_tensors="pt").to(dev)
    with torch.no_grad():
        pose_outputs = pose_model(**pose_inputs)
    pose_results = pose_proc.post_process_pose_estimation(pose_outputs, boxes=[boxes])[0]

    persons_kps = [pr["keypoints"].detach().cpu().numpy() for pr in pose_results]
    persons_sc = [pr["scores"].detach().cpu().numpy() for pr in pose_results]
    if not persons_kps:
        # No pose found; return black canvas
        return Image.new("RGB", img.size, (0, 0, 0))
    return _draw_skeleton_on_blank(img.size, persons_kps, persons_sc)
def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
    """Estimate depth for *img* and return it as a grayscale image in RGB mode.

    The prediction is resized to the input resolution and min-max normalised
    to the 0-255 range before being converted to an image.
    """
    rgb = img.convert("RGB")
    dev = _derived_device(use_gpu)
    proc, model = _load_depth_models(dev)

    encoded = proc(images=rgb, return_tensors="pt")
    batch = {name: tensor.to(dev) for name, tensor in encoded.items()}
    with torch.no_grad():
        prediction = model(**batch)

    # predicted_depth is (B, H, W); add a channel axis so it can be resized
    # to the original resolution, then drop it again and take the first item.
    resized = torch.nn.functional.interpolate(
        prediction.predicted_depth.unsqueeze(1),
        size=(rgb.height, rgb.width),
        mode="bicubic",
        align_corners=False,
    )
    depth = resized.squeeze(1)[0].detach().float().cpu().numpy()

    # Min-max normalise; the epsilon keeps a flat map from dividing by zero.
    shifted = depth - float(depth.min())
    scaled = shifted / (float(shifted.max()) + 1e-8)
    depth8 = (scaled * 255.0).clip(0, 255).astype(np.uint8)
    return Image.fromarray(depth8, mode="L").convert("RGB")
def _append_to_gallery(existing, new_img: Image.Image):
    """Return a new list: the usable items of *existing* followed by *new_img*.

    Each existing item is passed through ``_to_pil_rgb``; items for which it
    returns ``None`` are dropped. An empty or missing gallery yields a list
    containing only *new_img*.
    """
    kept = [
        converted
        for entry in (existing or [])
        if (converted := _to_pil_rgb(entry)) is not None
    ]
    return [*kept, new_img]
 # ============================================================
 # LoRA adapters + presets
 # ============================================================
         extras_update = gr.update(value=current_extras_condition_only)
     return prompt_update, img2_update, extras_update
+# ============================================================
+# UI helpers: output routing + derived conditioning
+# ============================================================
def set_output_as_image1(last):
    """Build the update that places the last generated output into Image 1.

    Raises:
        gr.Error: if nothing has been generated yet (``last`` is ``None``).
    """
    if last is not None:
        return gr.update(value=last)
    raise gr.Error("No output available yet.")
def set_output_as_image2(last):
    """Build the update that places the last generated output into Image 2.

    Raises:
        gr.Error: if nothing has been generated yet (``last`` is ``None``).
    """
    if last is not None:
        return gr.update(value=last)
    raise gr.Error("No output available yet.")
def set_output_as_extra(last, existing_extra):
    """Return the extra-reference gallery with the last output appended.

    Raises:
        gr.Error: if nothing has been generated yet (``last`` is ``None``).
    """
    if last is not None:
        return _append_to_gallery(existing_extra, last)
    raise gr.Error("No output available yet.")
@spaces.GPU
def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu, derived_max_people):
    """Derive a pose or depth map from Image 1 and append it to the extras.

    Returns a pair of updates: the new extras gallery and the preview image.
    When ``derived_type`` is "None" the gallery is left as-is and the preview
    is hidden.

    Raises:
        gr.Error: if Image 1 is missing or ``derived_type`` is not recognised.
    """
    if img1 is None:
        raise gr.Error("Please upload Image 1 first.")
    if derived_type == "None":
        return gr.update(value=existing_extra), gr.update(visible=False, value=None)

    base = img1.convert("RGB")

    # Each entry defers its work so that only the selected generator runs and
    # its arguments are coerced only when that option is actually chosen.
    generators = (
        (
            "Pose (ViTPose, fast)",
            lambda: make_pose_map(base, use_gpu=bool(derived_use_gpu), mode="fast"),
        ),
        (
            "Pose (ViTPose + RT-DETR detect)",
            lambda: make_pose_map(
                base,
                use_gpu=bool(derived_use_gpu),
                mode="detect",
                max_people=int(derived_max_people),
            ),
        ),
        (
            "Depth (Depth Anything V2 Small)",
            lambda: make_depth_map(base, use_gpu=bool(derived_use_gpu)),
        ),
    )
    generate = next((fn for label, fn in generators if derived_type == label), None)
    if generate is None:
        raise gr.Error(f"Unknown derived type: {derived_type}")

    derived = generate()
    gallery = _append_to_gallery(existing_extra, derived)
    return gr.update(value=gallery), gr.update(visible=True, value=derived)
 # ============================================================
     try:
         print(
              "[DEBUG][infer] submitting request | "
+             f"lora_adapter={lora_adapter!r} seed={seed} prompt={prompt!r}"
         )
         result = pipe(
             image=pipe_images,
             prompt=prompt,
             vae_image_indices=vae_image_indices,
             pad_to_canvas=bool(pad_to_canvas),
         ).images[0]
+        return result, seed, result
     finally:
         gc.collect()
         if torch.cuda.is_available():
 @spaces.GPU
 def infer_example(input_image, prompt, lora_adapter):
     """Run one of the bundled examples through ``infer`` with fixed settings.
     Uses guidance scale 1.0 and 4 steps, with no Image 2 and no extra images.
     Returns ``(result, seed, last)`` as produced by ``infer``, or
     ``(None, 0, None)`` when no input image was supplied.
     """
     if input_image is None:
         return None, 0, None
     input_pil = input_image.convert("RGB")
     guidance_scale = 1.0
     steps = 4
     # Examples don't supply Image 2 or extra images; and example list doesn't include AnyPose/BFS.
     # NOTE(review): the previous revision passed three more trailing positional
     # arguments (1.0, True, True) to infer; confirm its defaults cover them.
     result, seed, last = infer(input_pil, None, None, prompt, lora_adapter, 0, True, guidance_scale, steps)
     # A stray ", result" line used to follow this return and made the module
     # fail to parse; the three-value return is the complete statement.
     return result, seed, last
 # ============================================================
             with gr.Column():
                 output_image = gr.Image(label="Output Image", interactive=False, format="png", height=353)
+                last_output = gr.State(value=None)
+                with gr.Row():
+                    btn_out_to_img1 = gr.Button("⬅️ Output → Image 1", variant="secondary")
+                    btn_out_to_img2 = gr.Button("⬅️ Output → Image 2", variant="secondary")
+                    btn_out_to_extra = gr.Button("➕ Output → Extra Ref", variant="secondary")
+                derived_preview = gr.Image(
+                    label="Derived Conditioning Preview",
+                    interactive=False,
+                    format="png",
+                    height=200,
+                    visible=False,
+                )
                 with gr.Row():
                     lora_choices = [NONE_LORA] + list(ADAPTER_SPECS.keys())
                     lora_adapter = gr.Dropdown(
                     )
                 with gr.Accordion("Advanced Settings", open=False, visible=True):
+                    with gr.Accordion("Derived Conditioning (Pose / Depth)", open=False):
+                        derived_type = gr.Dropdown(
+                            label="Derived Type (from Image 1)",
+                            choices=[
+                                "None",
+                                "Pose (ViTPose, fast)",
+                                "Pose (ViTPose + RT-DETR detect)",
+                                "Depth (Depth Anything V2 Small)",
+                            ],
+                            value="None",
+                        )
+                        derived_use_gpu = gr.Checkbox(label="Use GPU for derived model", value=False)
+                        derived_max_people = gr.Slider(
+                            label="Max people (pose detect mode)",
+                            minimum=1,
+                            maximum=10,
+                            step=1,
+                            value=4,
+                        )
+                        add_derived_btn = gr.Button("➕ Add derived ref to Extras (conditioning-only recommended)")
                     seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                     randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                     guidance_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, step=0.1, value=1.0)
                         value=True,
                     )
+        # On LoRA selection: preset prompt + toggle Image 2
         lora_adapter.change(
             fn=on_lora_change_ui,
             inputs=[lora_adapter, prompt, extras_condition_only],
                 ["examples/11.jpg", "Upscale this picture to 4K resolution.", "Upscale2K"],
             ],
             inputs=[input_image_1, prompt, lora_adapter],
+            outputs=[output_image, seed, last_output],
             fn=infer_example,
             cache_examples=False,
             label="Examples",
             extras_condition_only,
             pad_to_canvas,
         ],
+        outputs=[output_image, seed, last_output],
     )
# Gradio only accepts event-listener registration while a Blocks context is
# active; calling `.click` at module level raises at startup. Re-enter `demo`
# so these listeners are attached to the same app.
# NOTE(review): equivalently, these calls can be indented into the original
# `with gr.Blocks() as demo:` body — confirm which form the project prefers.
with demo:
    # Output routing buttons
    btn_out_to_img1.click(fn=set_output_as_image1, inputs=[last_output], outputs=[input_image_1])
    btn_out_to_img2.click(fn=set_output_as_image2, inputs=[last_output], outputs=[input_image_2])
    btn_out_to_extra.click(
        fn=set_output_as_extra,
        inputs=[last_output, input_images_extra],
        outputs=[input_images_extra],
    )
    # Derived conditioning: append pose/depth map as extra ref (UI shows preview)
    add_derived_btn.click(
        fn=add_derived_ref,
        inputs=[input_image_1, input_images_extra, derived_type, derived_use_gpu, derived_max_people],
        outputs=[input_images_extra, derived_preview],
    )
 if __name__ == "__main__":
     demo.queue(max_size=30).launch(
         css=css,
         mcp_server=True,
         ssr_mode=False,
         show_error=True,
+    )