Spaces:
Running on Zero
Running on Zero
Professional Noob commited on
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,29 +1,14 @@
|
|
| 1 |
# app.py
|
| 2 |
-
# Complete, self-contained Gradio app with:
|
| 3 |
-
# 1) Robust "ensure_pil_rgb" to avoid /tmp/gradio path issues across ZeroGPU workers
|
| 4 |
-
# 2) Per-process GRADIO_TEMP_DIR to reduce temp collisions
|
| 5 |
-
# 3) Qwen2.5-VL RoPE patch to avoid cublasSgemmStridedBatched failures (broadcast multiply instead)
|
| 6 |
-
# 4) Extra debug logging around inputs, image routing, and prompt/token lengths
|
| 7 |
-
|
| 8 |
import os
|
| 9 |
import re
|
| 10 |
import gc
|
|
|
|
|
|
|
|
|
|
| 11 |
import traceback
|
| 12 |
import random
|
| 13 |
-
import time
|
| 14 |
from typing import Iterable, Optional
|
| 15 |
|
| 16 |
-
# -------------------------------
|
| 17 |
-
# Temp-dir hardening (helps when multiple users hit the Space)
|
| 18 |
-
# -------------------------------
|
| 19 |
-
# IMPORTANT: This doesn't magically share files between runtimes, but it reduces collisions
|
| 20 |
-
# and makes temp behavior more deterministic.
|
| 21 |
-
if not os.environ.get("GRADIO_TEMP_DIR"):
|
| 22 |
-
_pid = os.getpid()
|
| 23 |
-
_tmp = f"/tmp/gradio_{_pid}"
|
| 24 |
-
os.makedirs(_tmp, exist_ok=True)
|
| 25 |
-
os.environ["GRADIO_TEMP_DIR"] = _tmp
|
| 26 |
-
|
| 27 |
import gradio as gr
|
| 28 |
import numpy as np
|
| 29 |
import spaces
|
|
@@ -44,68 +29,132 @@ from safetensors.torch import load_file as safetensors_load_file
|
|
| 44 |
from gradio.themes import Soft
|
| 45 |
from gradio.themes.utils import colors, fonts, sizes
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
# ============================================================
|
| 48 |
-
# Qwen2.5-VL RoPE
|
| 49 |
# ============================================================
|
| 50 |
|
| 51 |
-
def
|
| 52 |
"""
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
| 60 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
try:
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
Rotary = getattr(m, "Qwen2_5_VLRotaryEmbedding", None)
|
| 65 |
-
if Rotary is None:
|
| 66 |
-
print("[patch] Qwen2_5_VLRotaryEmbedding not found; skipping RoPE patch.")
|
| 67 |
-
return
|
| 68 |
-
|
| 69 |
-
# Guard: only patch once
|
| 70 |
-
if getattr(Rotary, "_rope_mul_patch_applied", False):
|
| 71 |
-
print("[patch] RoPE patch already applied.")
|
| 72 |
-
return
|
| 73 |
-
|
| 74 |
-
def safe_forward(self, x, position_ids):
|
| 75 |
-
"""
|
| 76 |
-
Return (cos, sin) with shapes compatible with original.
|
| 77 |
-
"""
|
| 78 |
-
inv_freq = self.inv_freq.to(device=x.device)
|
| 79 |
-
|
| 80 |
-
# position_ids is typically (bs, seq) or (seq,)
|
| 81 |
-
if position_ids.dim() == 1:
|
| 82 |
-
position_ids_ = position_ids.unsqueeze(0) # (1, seq)
|
| 83 |
-
else:
|
| 84 |
-
position_ids_ = position_ids # (bs, seq)
|
| 85 |
-
|
| 86 |
-
# Outer product via broadcast:
|
| 87 |
-
# inv: (1,1,dim/2,1)
|
| 88 |
-
# pos: (bs,1,1,seq)
|
| 89 |
-
# freqs: (bs,1,seq,dim/2)
|
| 90 |
-
inv = inv_freq[None, None, :, None].float()
|
| 91 |
-
pos = position_ids_[:, None, None, :].float()
|
| 92 |
-
freqs = (inv * pos).transpose(2, 3)
|
| 93 |
-
|
| 94 |
-
# Original commonly duplicates for sin/cos on last dim
|
| 95 |
-
emb = torch.cat((freqs, freqs), dim=-1) # (bs,1,seq,dim)
|
| 96 |
-
|
| 97 |
-
cos = emb.cos().to(dtype=x.dtype)
|
| 98 |
-
sin = emb.sin().to(dtype=x.dtype)
|
| 99 |
-
return cos, sin
|
| 100 |
-
|
| 101 |
-
Rotary.forward = safe_forward
|
| 102 |
-
Rotary._rope_mul_patch_applied = True
|
| 103 |
-
print("[patch] Patched Qwen2.5-VL RoPE to avoid cublas batched GEMM.")
|
| 104 |
except Exception as e:
|
| 105 |
-
print("[patch]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
-
patch_qwen25vl_rope_gemm_to_mul()
|
| 109 |
|
| 110 |
# ============================================================
|
| 111 |
# Theme
|
|
@@ -126,7 +175,6 @@ colors.orange_red = colors.Color(
|
|
| 126 |
c950="#802200",
|
| 127 |
)
|
| 128 |
|
| 129 |
-
|
| 130 |
class OrangeRedTheme(Soft):
|
| 131 |
def __init__(
|
| 132 |
self,
|
|
@@ -182,17 +230,15 @@ class OrangeRedTheme(Soft):
|
|
| 182 |
block_label_background_fill="*primary_200",
|
| 183 |
)
|
| 184 |
|
| 185 |
-
|
| 186 |
orange_red_theme = OrangeRedTheme()
|
| 187 |
|
|
|
|
| 188 |
# ============================================================
|
| 189 |
-
# Device
|
| 190 |
# ============================================================
|
| 191 |
|
| 192 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 193 |
|
| 194 |
-
print("===== Application Startup at", time.strftime("%Y-%m-%d %H:%M:%S"), "=====")
|
| 195 |
-
print("GRADIO_TEMP_DIR =", os.environ.get("GRADIO_TEMP_DIR"))
|
| 196 |
print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
|
| 197 |
print("torch.__version__ =", torch.__version__)
|
| 198 |
print("torch.version.cuda =", torch.version.cuda)
|
|
@@ -203,6 +249,14 @@ if torch.cuda.is_available():
|
|
| 203 |
print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
|
| 204 |
print("Using device:", device)
|
| 205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
# ============================================================
|
| 207 |
# AIO version (Space variable)
|
| 208 |
# ============================================================
|
|
@@ -213,7 +267,6 @@ DEFAULT_AIO_VERSION = "v19"
|
|
| 213 |
_VER_RE = re.compile(r"^v\d+$")
|
| 214 |
_DIGITS_RE = re.compile(r"^\d+$")
|
| 215 |
|
| 216 |
-
|
| 217 |
def _normalize_version(raw: str) -> Optional[str]:
|
| 218 |
if raw is None:
|
| 219 |
return None
|
|
@@ -226,7 +279,6 @@ def _normalize_version(raw: str) -> Optional[str]:
|
|
| 226 |
return f"v{s}"
|
| 227 |
return None
|
| 228 |
|
| 229 |
-
|
| 230 |
_AIO_ENV_RAW = os.environ.get("AIO_VERSION", "")
|
| 231 |
_AIO_ENV_NORM = _normalize_version(_AIO_ENV_RAW)
|
| 232 |
|
|
@@ -237,6 +289,7 @@ print(f"AIO_VERSION (env raw) = {_AIO_ENV_RAW!r}")
|
|
| 237 |
print(f"AIO_VERSION (normalized) = {_AIO_ENV_NORM!r}")
|
| 238 |
print(f"Using AIO_VERSION = {AIO_VERSION} ({AIO_VERSION_SOURCE})")
|
| 239 |
|
|
|
|
| 240 |
# ============================================================
|
| 241 |
# Pipeline
|
| 242 |
# ============================================================
|
|
@@ -248,7 +301,6 @@ from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
|
|
| 248 |
|
| 249 |
dtype = torch.bfloat16
|
| 250 |
|
| 251 |
-
|
| 252 |
def _load_pipe_with_version(version: str) -> QwenImageEditPlusPipeline:
|
| 253 |
sub = f"{version}/transformer"
|
| 254 |
print(f"📦 Loading AIO transformer: {AIO_REPO_ID} / {sub}")
|
|
@@ -264,8 +316,6 @@ def _load_pipe_with_version(version: str) -> QwenImageEditPlusPipeline:
|
|
| 264 |
).to(device)
|
| 265 |
return p
|
| 266 |
|
| 267 |
-
|
| 268 |
-
# Forgiving load: try env/default version, fallback to v19 if it fails
|
| 269 |
try:
|
| 270 |
pipe = _load_pipe_with_version(AIO_VERSION)
|
| 271 |
except Exception:
|
|
@@ -286,64 +336,9 @@ except Exception as e:
|
|
| 286 |
|
| 287 |
MAX_SEED = np.iinfo(np.int32).max
|
| 288 |
|
| 289 |
-
# ============================================================
|
| 290 |
-
# Robust image coercion (prevents /tmp/gradio FileNotFound crashes)
|
| 291 |
-
# ============================================================
|
| 292 |
-
|
| 293 |
-
def ensure_pil_rgb(x, *, label: str = "") -> Optional[Image.Image]:
|
| 294 |
-
"""
|
| 295 |
-
Accepts:
|
| 296 |
-
- PIL.Image
|
| 297 |
-
- numpy arrays
|
| 298 |
-
- Gallery tuples (img, caption)
|
| 299 |
-
- dict payloads with path/name
|
| 300 |
-
- file path strings (best-effort)
|
| 301 |
-
Returns PIL RGB or None.
|
| 302 |
-
"""
|
| 303 |
-
if x is None:
|
| 304 |
-
return None
|
| 305 |
-
|
| 306 |
-
# Gallery often returns (img, caption)
|
| 307 |
-
if isinstance(x, tuple) and len(x) >= 1:
|
| 308 |
-
x = x[0]
|
| 309 |
-
|
| 310 |
-
# Sometimes dict payloads
|
| 311 |
-
if isinstance(x, dict):
|
| 312 |
-
path = x.get("path") or x.get("name")
|
| 313 |
-
if path:
|
| 314 |
-
x = path
|
| 315 |
-
|
| 316 |
-
if isinstance(x, Image.Image):
|
| 317 |
-
return x.convert("RGB")
|
| 318 |
-
|
| 319 |
-
if isinstance(x, np.ndarray):
|
| 320 |
-
try:
|
| 321 |
-
return Image.fromarray(x).convert("RGB")
|
| 322 |
-
except Exception:
|
| 323 |
-
return None
|
| 324 |
-
|
| 325 |
-
if isinstance(x, str):
|
| 326 |
-
# Path-based input: may fail on ZeroGPU worker if file isn't present.
|
| 327 |
-
exists = os.path.exists(x)
|
| 328 |
-
print(f"[DEBUG][ensure_pil_rgb] {label} got filepath: {x!r} exists={exists}")
|
| 329 |
-
if not exists:
|
| 330 |
-
# Return None so caller can show a meaningful error (instead of crashing).
|
| 331 |
-
return None
|
| 332 |
-
try:
|
| 333 |
-
return Image.open(x).convert("RGB")
|
| 334 |
-
except Exception as e:
|
| 335 |
-
print(f"[DEBUG][ensure_pil_rgb] {label} PIL open failed: {e!r}")
|
| 336 |
-
return None
|
| 337 |
-
|
| 338 |
-
# Last resort
|
| 339 |
-
try:
|
| 340 |
-
return Image.fromarray(np.array(x)).convert("RGB")
|
| 341 |
-
except Exception:
|
| 342 |
-
return None
|
| 343 |
-
|
| 344 |
|
| 345 |
# ============================================================
|
| 346 |
-
# Derived conditioning (Transformers): Pose + Depth
|
| 347 |
# ============================================================
|
| 348 |
|
| 349 |
POSE_MODEL_ID = "usyd-community/vitpose-base-simple"
|
|
@@ -363,11 +358,9 @@ COCO17_EDGES = [
|
|
| 363 |
(12, 14), (14, 16),
|
| 364 |
]
|
| 365 |
|
| 366 |
-
|
| 367 |
def _derived_device(use_gpu: bool) -> torch.device:
|
| 368 |
return torch.device("cuda" if (use_gpu and torch.cuda.is_available()) else "cpu")
|
| 369 |
|
| 370 |
-
|
| 371 |
def _load_pose_models(dev: torch.device):
|
| 372 |
key = str(dev)
|
| 373 |
if key in _POSE_CACHE:
|
|
@@ -385,7 +378,6 @@ def _load_pose_models(dev: torch.device):
|
|
| 385 |
_POSE_CACHE[key] = (det_proc, det_model, pose_proc, pose_model)
|
| 386 |
return _POSE_CACHE[key]
|
| 387 |
|
| 388 |
-
|
| 389 |
def _load_depth_models(dev: torch.device):
|
| 390 |
key = str(dev)
|
| 391 |
if key in _DEPTH_CACHE:
|
|
@@ -398,7 +390,6 @@ def _load_depth_models(dev: torch.device):
|
|
| 398 |
_DEPTH_CACHE[key] = (proc, model)
|
| 399 |
return _DEPTH_CACHE[key]
|
| 400 |
|
| 401 |
-
|
| 402 |
def _draw_skeleton_on_blank(
|
| 403 |
size: tuple[int, int],
|
| 404 |
persons_keypoints: list[np.ndarray],
|
|
@@ -425,15 +416,9 @@ def _draw_skeleton_on_blank(
|
|
| 425 |
if sc[i] < kp_thresh:
|
| 426 |
continue
|
| 427 |
x, y = float(kps[i, 0]), float(kps[i, 1])
|
| 428 |
-
draw.ellipse(
|
| 429 |
-
[(x - point_r, y - point_r), (x + point_r, y + point_r)],
|
| 430 |
-
fill=(255, 255, 255),
|
| 431 |
-
outline=None,
|
| 432 |
-
)
|
| 433 |
-
|
| 434 |
return canvas
|
| 435 |
|
| 436 |
-
|
| 437 |
def make_pose_map(
|
| 438 |
img: Image.Image,
|
| 439 |
*,
|
|
@@ -481,17 +466,14 @@ def make_pose_map(
|
|
| 481 |
|
| 482 |
persons_kps, persons_sc = [], []
|
| 483 |
for pr in pose_results:
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
persons_kps.append(kps)
|
| 487 |
-
persons_sc.append(sc)
|
| 488 |
|
| 489 |
if not persons_kps:
|
| 490 |
return Image.new("RGB", img.size, (0, 0, 0))
|
| 491 |
|
| 492 |
return _draw_skeleton_on_blank(img.size, persons_kps, persons_sc)
|
| 493 |
|
| 494 |
-
|
| 495 |
def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
|
| 496 |
img = img.convert("RGB")
|
| 497 |
dev = _derived_device(use_gpu)
|
|
@@ -520,19 +502,86 @@ def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
|
|
| 520 |
return Image.fromarray(depth8, mode="L").convert("RGB")
|
| 521 |
|
| 522 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 523 |
def _append_to_gallery(existing, new_img: Image.Image):
|
| 524 |
items = []
|
| 525 |
if existing:
|
| 526 |
for it in existing:
|
| 527 |
-
pil =
|
| 528 |
if pil is not None:
|
| 529 |
items.append(pil)
|
| 530 |
items.append(new_img)
|
| 531 |
return items
|
| 532 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 533 |
|
| 534 |
# ============================================================
|
| 535 |
-
# LoRA adapters + presets
|
| 536 |
# ============================================================
|
| 537 |
|
| 538 |
NONE_LORA = "None"
|
|
@@ -689,8 +738,8 @@ LORA_PRESET_PROMPTS = {
|
|
| 689 |
"Any2Real_2601": "change the picture 1 to realistic photograph",
|
| 690 |
"Semirealistic-photo-detailer": "transform the image to semi-realistic image",
|
| 691 |
"AnyPose": "Make the person in image 1 do the exact same pose of the person in image 2. Changing the style and background of the image of the person in image 1 is undesirable, so don't do it. The new pose should be pixel accurate to the pose we are trying to copy. The position of the arms and head and legs should be the same as the pose we are trying to copy. Change the field of view and angle to match exactly image 2. Head tilt and eye gaze pose should match the person in image 2.",
|
| 692 |
-
"Hyperrealistic-Portrait": "Transform the image into an ultra-realistic photorealistic portrait with strict identity preservation, facing straight to the camera.",
|
| 693 |
-
"Ultrarealistic-Portrait": "Transform the image into an ultra-realistic glamour portrait while strictly preserving the subject’s identity.",
|
| 694 |
"Upscale2K": "Upscale this picture to 4K resolution.",
|
| 695 |
"BFS-Best-FaceSwap": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
|
| 696 |
"BFS-Best-FaceSwap-merge": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
|
|
@@ -698,14 +747,14 @@ LORA_PRESET_PROMPTS = {
|
|
| 698 |
|
| 699 |
LOADED_ADAPTERS = set()
|
| 700 |
|
|
|
|
| 701 |
# ============================================================
|
| 702 |
-
# Helpers: resolution
|
| 703 |
# ============================================================
|
| 704 |
|
| 705 |
def _round_to_multiple(x: int, m: int) -> int:
|
| 706 |
return max(m, (int(x) // m) * m)
|
| 707 |
|
| 708 |
-
|
| 709 |
def compute_canvas_dimensions_from_area(
|
| 710 |
image: Image.Image,
|
| 711 |
target_area: int,
|
|
@@ -713,35 +762,29 @@ def compute_canvas_dimensions_from_area(
|
|
| 713 |
) -> tuple[int, int]:
|
| 714 |
w, h = image.size
|
| 715 |
aspect = w / h if h else 1.0
|
| 716 |
-
|
| 717 |
from qwenimage.pipeline_qwenimage_edit_plus import calculate_dimensions
|
| 718 |
-
|
| 719 |
width, height = calculate_dimensions(int(target_area), float(aspect))
|
| 720 |
width = _round_to_multiple(int(width), int(multiple_of))
|
| 721 |
height = _round_to_multiple(int(height), int(multiple_of))
|
| 722 |
return width, height
|
| 723 |
|
| 724 |
-
|
| 725 |
def get_target_area_for_lora(
|
| 726 |
image: Image.Image,
|
| 727 |
lora_adapter: str,
|
| 728 |
user_target_megapixels: float,
|
| 729 |
) -> int:
|
| 730 |
spec = ADAPTER_SPECS.get(lora_adapter, {})
|
| 731 |
-
|
| 732 |
if "target_area" in spec:
|
| 733 |
try:
|
| 734 |
return int(spec["target_area"])
|
| 735 |
except Exception:
|
| 736 |
pass
|
| 737 |
-
|
| 738 |
if "target_megapixels" in spec:
|
| 739 |
try:
|
| 740 |
mp = float(spec["target_megapixels"])
|
| 741 |
return int(mp * 1024 * 1024)
|
| 742 |
except Exception:
|
| 743 |
pass
|
| 744 |
-
|
| 745 |
if "target_long_edge" in spec:
|
| 746 |
try:
|
| 747 |
long_edge = int(spec["target_long_edge"])
|
|
@@ -755,47 +798,19 @@ def get_target_area_for_lora(
|
|
| 755 |
return int(new_w * new_h)
|
| 756 |
except Exception:
|
| 757 |
pass
|
| 758 |
-
|
| 759 |
return int(float(user_target_megapixels) * 1024 * 1024)
|
| 760 |
|
| 761 |
|
| 762 |
# ============================================================
|
| 763 |
-
# Helpers:
|
| 764 |
# ============================================================
|
| 765 |
|
| 766 |
def lora_requires_two_images(lora_adapter: str) -> bool:
|
| 767 |
return bool(ADAPTER_SPECS.get(lora_adapter, {}).get("requires_two_images", False))
|
| 768 |
|
| 769 |
-
|
| 770 |
def image2_label_for_lora(lora_adapter: str) -> str:
|
| 771 |
return str(ADAPTER_SPECS.get(lora_adapter, {}).get("image2_label", "Upload Reference (Image 2)"))
|
| 772 |
|
| 773 |
-
|
| 774 |
-
def build_labeled_images(
|
| 775 |
-
img1: Image.Image,
|
| 776 |
-
img2: Optional[Image.Image],
|
| 777 |
-
extra_imgs: Optional[list[Image.Image]],
|
| 778 |
-
) -> dict[str, Image.Image]:
|
| 779 |
-
labeled: dict[str, Image.Image] = {}
|
| 780 |
-
idx = 1
|
| 781 |
-
labeled[f"image_{idx}"] = img1
|
| 782 |
-
idx += 1
|
| 783 |
-
if img2 is not None:
|
| 784 |
-
labeled[f"image_{idx}"] = img2
|
| 785 |
-
idx += 1
|
| 786 |
-
if extra_imgs:
|
| 787 |
-
for im in extra_imgs:
|
| 788 |
-
if im is None:
|
| 789 |
-
continue
|
| 790 |
-
labeled[f"image_{idx}"] = im
|
| 791 |
-
idx += 1
|
| 792 |
-
return labeled
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
# ============================================================
|
| 796 |
-
# Helpers: BFS alpha key fix
|
| 797 |
-
# ============================================================
|
| 798 |
-
|
| 799 |
def _inject_missing_alpha_keys(state_dict: dict) -> dict:
|
| 800 |
bases = {}
|
| 801 |
for k, v in state_dict.items():
|
|
@@ -808,6 +823,7 @@ def _inject_missing_alpha_keys(state_dict: dict) -> dict:
|
|
| 808 |
|
| 809 |
for base, rank in bases.items():
|
| 810 |
alpha_tensor = torch.tensor(float(rank), dtype=torch.float32)
|
|
|
|
| 811 |
full_alpha = f"{base}.alpha"
|
| 812 |
if full_alpha not in state_dict:
|
| 813 |
state_dict[full_alpha] = alpha_tensor
|
|
@@ -817,10 +833,8 @@ def _inject_missing_alpha_keys(state_dict: dict) -> dict:
|
|
| 817 |
stripped_alpha = f"{stripped_base}.alpha"
|
| 818 |
if stripped_alpha not in state_dict:
|
| 819 |
state_dict[stripped_alpha] = alpha_tensor
|
| 820 |
-
|
| 821 |
return state_dict
|
| 822 |
|
| 823 |
-
|
| 824 |
def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
|
| 825 |
keep_suffixes = (
|
| 826 |
".lora_up.weight",
|
|
@@ -863,7 +877,6 @@ def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
|
|
| 863 |
}
|
| 864 |
return out, stats
|
| 865 |
|
| 866 |
-
|
| 867 |
def _duplicate_stripped_prefix_keys(state_dict: dict, prefix: str = "diffusion_model.") -> dict:
|
| 868 |
out = dict(state_dict)
|
| 869 |
for k, v in list(state_dict.items()):
|
|
@@ -874,7 +887,6 @@ def _duplicate_stripped_prefix_keys(state_dict: dict, prefix: str = "diffusion_m
|
|
| 874 |
out[stripped] = v
|
| 875 |
return out
|
| 876 |
|
| 877 |
-
|
| 878 |
def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name: str, needs_alpha_fix: bool = False):
|
| 879 |
try:
|
| 880 |
pipe.load_lora_weights(repo, weight_name=weight_name, adapter_name=adapter_name)
|
|
@@ -900,11 +912,9 @@ def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name:
|
|
| 900 |
f"kept={stats['kept']} dropped_patch={stats['dropped_patch']} "
|
| 901 |
f"dropped_other={stats['dropped_other']} normalized_alpha={stats['normalized_alpha']}"
|
| 902 |
)
|
| 903 |
-
|
| 904 |
pipe.load_lora_weights(sd, adapter_name=adapter_name)
|
| 905 |
return
|
| 906 |
|
| 907 |
-
|
| 908 |
def _ensure_loaded_and_get_active_adapters(selected_lora: str):
|
| 909 |
spec = ADAPTER_SPECS.get(selected_lora)
|
| 910 |
if not spec:
|
|
@@ -942,7 +952,6 @@ def _ensure_loaded_and_get_active_adapters(selected_lora: str):
|
|
| 942 |
|
| 943 |
adapter_names.append(adapter_name)
|
| 944 |
adapter_weights.append(strength)
|
| 945 |
-
|
| 946 |
else:
|
| 947 |
repo = spec["repo"]
|
| 948 |
weights = spec["weights"]
|
|
@@ -998,41 +1007,34 @@ def on_lora_change_ui(selected_lora, current_prompt, current_extras_condition_on
|
|
| 998 |
return prompt_update, img2_update, extras_update
|
| 999 |
|
| 1000 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1001 |
def set_output_as_image1(last):
|
| 1002 |
if last is None:
|
| 1003 |
raise gr.Error("No output available yet.")
|
| 1004 |
return gr.update(value=last)
|
| 1005 |
|
| 1006 |
-
|
| 1007 |
def set_output_as_image2(last):
|
| 1008 |
if last is None:
|
| 1009 |
raise gr.Error("No output available yet.")
|
| 1010 |
return gr.update(value=last)
|
| 1011 |
|
| 1012 |
-
|
| 1013 |
def set_output_as_extra(last, existing_extra):
|
| 1014 |
if last is None:
|
| 1015 |
raise gr.Error("No output available yet.")
|
| 1016 |
return _append_to_gallery(existing_extra, last)
|
| 1017 |
|
| 1018 |
-
|
| 1019 |
-
# ============================================================
|
| 1020 |
-
# Derived conditioning UI action
|
| 1021 |
-
# ============================================================
|
| 1022 |
-
|
| 1023 |
@spaces.GPU
|
| 1024 |
def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu, derived_max_people):
|
| 1025 |
-
|
| 1026 |
-
|
| 1027 |
-
raise gr.Error(
|
| 1028 |
-
"Image 1 could not be read in the GPU worker. "
|
| 1029 |
-
"If you uploaded via API or a prior session, re-upload the image."
|
| 1030 |
-
)
|
| 1031 |
|
| 1032 |
if derived_type == "None":
|
| 1033 |
return gr.update(value=existing_extra), gr.update(visible=False, value=None)
|
| 1034 |
|
| 1035 |
-
base =
|
| 1036 |
|
| 1037 |
if derived_type == "Pose (ViTPose, fast)":
|
| 1038 |
derived = make_pose_map(base, use_gpu=bool(derived_use_gpu), mode="fast")
|
|
@@ -1053,18 +1055,29 @@ def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu, derived
|
|
| 1053 |
|
| 1054 |
|
| 1055 |
# ============================================================
|
| 1056 |
-
#
|
| 1057 |
# ============================================================
|
| 1058 |
|
| 1059 |
-
def
|
| 1060 |
if not torch.cuda.is_available():
|
| 1061 |
return
|
| 1062 |
try:
|
| 1063 |
free, total = torch.cuda.mem_get_info()
|
| 1064 |
-
print(f"{prefix} mem free={free/1e9:.2f}GB total={total/1e9:.2f}GB")
|
| 1065 |
-
except Exception
|
| 1066 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1067 |
|
|
|
|
|
|
|
|
|
|
| 1068 |
|
| 1069 |
@spaces.GPU
|
| 1070 |
def infer(
|
|
@@ -1086,31 +1099,12 @@ def infer(
|
|
| 1086 |
if torch.cuda.is_available():
|
| 1087 |
torch.cuda.empty_cache()
|
| 1088 |
|
| 1089 |
-
|
| 1090 |
-
img1 = ensure_pil_rgb(input_image_1, label="input_image_1")
|
| 1091 |
-
img2 = ensure_pil_rgb(input_image_2, label="input_image_2") if input_image_2 is not None else None
|
| 1092 |
|
| 1093 |
-
|
| 1094 |
-
|
| 1095 |
-
type(input_image_1), type(input_image_2), type(input_images_extra))
|
| 1096 |
-
print("[DEBUG][infer] img1_ok:", img1 is not None, "img2_ok:", (img2 is not None))
|
| 1097 |
|
| 1098 |
-
|
| 1099 |
-
raise gr.Error(
|
| 1100 |
-
"Could not read Image 1 inside the GPU worker. "
|
| 1101 |
-
"This can happen if Gradio passed a temp file path that isn't present in the worker. "
|
| 1102 |
-
"Please re-upload Image 1 and try again."
|
| 1103 |
-
)
|
| 1104 |
-
|
| 1105 |
-
# Normalize extra images (Gallery) to PIL RGB
|
| 1106 |
-
extra_imgs: list[Image.Image] = []
|
| 1107 |
-
if input_images_extra:
|
| 1108 |
-
for i, item in enumerate(input_images_extra):
|
| 1109 |
-
pil = ensure_pil_rgb(item, label=f"gallery[{i}]")
|
| 1110 |
-
if pil is not None:
|
| 1111 |
-
extra_imgs.append(pil)
|
| 1112 |
-
|
| 1113 |
-
# -------------------- Adapter handling --------------------
|
| 1114 |
if lora_adapter == NONE_LORA:
|
| 1115 |
try:
|
| 1116 |
pipe.set_adapters([], adapter_weights=[])
|
|
@@ -1121,7 +1115,6 @@ def infer(
|
|
| 1121 |
adapter_names, adapter_weights = _ensure_loaded_and_get_active_adapters(lora_adapter)
|
| 1122 |
pipe.set_adapters(adapter_names, adapter_weights=adapter_weights)
|
| 1123 |
|
| 1124 |
-
# -------------------- Seed / prompts --------------------
|
| 1125 |
if randomize_seed:
|
| 1126 |
seed = random.randint(0, MAX_SEED)
|
| 1127 |
|
|
@@ -1132,17 +1125,34 @@ def infer(
|
|
| 1132 |
"extra digit, fewer digits, cropped, jpeg artifacts, signature, watermark, username, blurry"
|
| 1133 |
)
|
| 1134 |
|
| 1135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1136 |
if lora_requires_two_images(lora_adapter) and img2 is None:
|
| 1137 |
raise gr.Error("This LoRA needs two images. Please upload Image 2 as well.")
|
| 1138 |
|
| 1139 |
-
# Label images as image_1, image_2, image_3...
|
| 1140 |
labeled = build_labeled_images(img1, img2, extra_imgs)
|
|
|
|
| 1141 |
pipe_images = list(labeled.values())
|
| 1142 |
if len(pipe_images) == 1:
|
| 1143 |
pipe_images = pipe_images[0]
|
| 1144 |
|
| 1145 |
-
# Canvas sizing
|
| 1146 |
target_area = get_target_area_for_lora(img1, lora_adapter, float(target_megapixels))
|
| 1147 |
width, height = compute_canvas_dimensions_from_area(
|
| 1148 |
img1,
|
|
@@ -1150,42 +1160,29 @@ def infer(
|
|
| 1150 |
multiple_of=int(pipe.vae_scale_factor * 2),
|
| 1151 |
)
|
| 1152 |
|
| 1153 |
-
# VAE indices (conditioning-only extras)
|
| 1154 |
vae_image_indices = None
|
| 1155 |
if extras_condition_only:
|
| 1156 |
if isinstance(pipe_images, list) and len(pipe_images) > 2:
|
| 1157 |
vae_image_indices = [0, 1] if len(pipe_images) >= 2 else [0]
|
| 1158 |
|
| 1159 |
-
#
|
| 1160 |
-
|
|
|
|
| 1161 |
print(
|
| 1162 |
"[DEBUG][infer] submitting request | "
|
| 1163 |
-
f"lora_adapter={lora_adapter!r} seed={seed} "
|
| 1164 |
-
f"
|
| 1165 |
-
f"
|
| 1166 |
-
f"
|
| 1167 |
-
f"vae_image_indices={vae_image_indices} pad_to_canvas={bool(pad_to_canvas)}"
|
| 1168 |
)
|
| 1169 |
-
print("[DEBUG][infer] image_1 size:
|
| 1170 |
-
|
| 1171 |
-
|
| 1172 |
|
| 1173 |
-
# Optional: log token count (if tokenizer exists)
|
| 1174 |
-
try:
|
| 1175 |
-
tok = getattr(pipe, "tokenizer", None)
|
| 1176 |
-
if tok is not None and prompt_s:
|
| 1177 |
-
ids = tok(prompt_s, return_tensors="pt", truncation=True).input_ids
|
| 1178 |
-
print("[DEBUG][infer] prompt token_count:", int(ids.shape[-1]))
|
| 1179 |
-
except Exception as e:
|
| 1180 |
-
print("[DEBUG][infer] token_count failed:", repr(e))
|
| 1181 |
-
|
| 1182 |
-
_debug_cuda_mem(prefix="[DEBUG][cuda][before]")
|
| 1183 |
-
|
| 1184 |
-
# -------------------- Run --------------------
|
| 1185 |
try:
|
| 1186 |
result = pipe(
|
| 1187 |
image=pipe_images,
|
| 1188 |
-
prompt=
|
| 1189 |
negative_prompt=negative_prompt,
|
| 1190 |
height=height,
|
| 1191 |
width=width,
|
|
@@ -1196,13 +1193,15 @@ def infer(
|
|
| 1196 |
pad_to_canvas=bool(pad_to_canvas),
|
| 1197 |
).images[0]
|
| 1198 |
return result, seed, result
|
| 1199 |
-
|
|
|
|
| 1200 |
print("---- [ERROR][infer] exception ----")
|
| 1201 |
print(traceback.format_exc())
|
| 1202 |
print("---------------------------------")
|
| 1203 |
raise
|
|
|
|
| 1204 |
finally:
|
| 1205 |
-
|
| 1206 |
gc.collect()
|
| 1207 |
if torch.cuda.is_available():
|
| 1208 |
torch.cuda.empty_cache()
|
|
@@ -1210,24 +1209,17 @@ def infer(
|
|
| 1210 |
|
| 1211 |
@spaces.GPU
|
| 1212 |
def infer_example(input_image, prompt, lora_adapter):
|
| 1213 |
-
|
| 1214 |
-
if img is None:
|
| 1215 |
return None, 0, None
|
|
|
|
| 1216 |
guidance_scale = 1.0
|
| 1217 |
steps = 4
|
| 1218 |
result, seed, last = infer(
|
| 1219 |
-
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
-
0,
|
| 1225 |
-
True,
|
| 1226 |
-
guidance_scale,
|
| 1227 |
-
steps,
|
| 1228 |
-
1.0,
|
| 1229 |
-
True,
|
| 1230 |
-
True,
|
| 1231 |
)
|
| 1232 |
return result, seed, last
|
| 1233 |
|
|
@@ -1253,9 +1245,9 @@ with gr.Blocks() as demo:
|
|
| 1253 |
with gr.Column(elem_id="col-container"):
|
| 1254 |
gr.Markdown("# **Qwen-Image-Edit-2511-LoRAs-Fast**", elem_id="main-title")
|
| 1255 |
gr.Markdown(
|
| 1256 |
-
"Perform diverse image edits using specialized
|
| 1257 |
-
"Qwen-Image-Edit-2511
|
| 1258 |
-
"
|
| 1259 |
)
|
| 1260 |
gr.Markdown(aio_status_line)
|
| 1261 |
|
|
@@ -1283,6 +1275,7 @@ with gr.Blocks() as demo:
|
|
| 1283 |
|
| 1284 |
with gr.Column():
|
| 1285 |
output_image = gr.Image(label="Output Image", interactive=False, format="png", height=353)
|
|
|
|
| 1286 |
last_output = gr.State(value=None)
|
| 1287 |
|
| 1288 |
with gr.Row():
|
|
@@ -1348,7 +1341,6 @@ with gr.Blocks() as demo:
|
|
| 1348 |
value=True,
|
| 1349 |
)
|
| 1350 |
|
| 1351 |
-
# On LoRA selection: preset prompt + toggle Image 2
|
| 1352 |
lora_adapter.change(
|
| 1353 |
fn=on_lora_change_ui,
|
| 1354 |
inputs=[lora_adapter, prompt, extras_condition_only],
|
|
@@ -1360,6 +1352,26 @@ with gr.Blocks() as demo:
|
|
| 1360 |
["examples/5.jpg", "Remove shadows and relight the image using soft lighting.", "Light-Restoration"],
|
| 1361 |
["examples/4.jpg", "Use a subtle golden-hour filter with smooth light diffusion.", "Relight"],
|
| 1362 |
["examples/2.jpeg", "Rotate the camera 45 degrees to the left.", "Multiple-Angles"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1363 |
["examples/11.jpg", "Upscale this picture to 4K resolution.", "Upscale2K"],
|
| 1364 |
],
|
| 1365 |
inputs=[input_image_1, prompt, lora_adapter],
|
|
@@ -1388,12 +1400,10 @@ with gr.Blocks() as demo:
|
|
| 1388 |
outputs=[output_image, seed, last_output],
|
| 1389 |
)
|
| 1390 |
|
| 1391 |
-
# Output routing buttons
|
| 1392 |
btn_out_to_img1.click(fn=set_output_as_image1, inputs=[last_output], outputs=[input_image_1])
|
| 1393 |
btn_out_to_img2.click(fn=set_output_as_image2, inputs=[last_output], outputs=[input_image_2])
|
| 1394 |
btn_out_to_extra.click(fn=set_output_as_extra, inputs=[last_output, input_images_extra], outputs=[input_images_extra])
|
| 1395 |
|
| 1396 |
-
# Derived conditioning button
|
| 1397 |
add_derived_btn.click(
|
| 1398 |
fn=add_derived_ref,
|
| 1399 |
inputs=[input_image_1, input_images_extra, derived_type, derived_use_gpu, derived_max_people],
|
|
|
|
| 1 |
# app.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
import gc
|
| 5 |
+
import uuid
|
| 6 |
+
import time
|
| 7 |
+
import math
|
| 8 |
import traceback
|
| 9 |
import random
|
|
|
|
| 10 |
from typing import Iterable, Optional
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
import gradio as gr
|
| 13 |
import numpy as np
|
| 14 |
import spaces
|
|
|
|
| 29 |
from gradio.themes import Soft
|
| 30 |
from gradio.themes.utils import colors, fonts, sizes
|
| 31 |
|
| 32 |
+
|
| 33 |
+
# ============================================================
|
| 34 |
+
# Process-unique temp dir (helps avoid /tmp collisions)
|
| 35 |
+
# ============================================================
|
| 36 |
+
|
| 37 |
+
def _ensure_unique_gradio_tmp():
    """
    Point GRADIO_TEMP_DIR at a directory that is unique to this process.

    ZeroGPU/Spaces can serve multiple users across recycled containers, and
    Gradio may use /tmp/gradio by default. If GRADIO_TEMP_DIR is already set,
    it is respected and only logged.

    The directory is created with tempfile.mkdtemp, so creation is atomic and
    the environment variable is exported only once the directory really
    exists. If creation fails, the variable is left unset and the failure is
    logged instead of being silently swallowed.
    """
    import tempfile  # local import so this block does not depend on file-level imports

    preset = os.environ.get("GRADIO_TEMP_DIR")
    if preset:
        print(f"GRADIO_TEMP_DIR = {preset}")
        return

    # Keep the PID in the name so the owning process is easy to spot in /tmp.
    prefix = f"gradio_{os.getpid()}_"
    try:
        tmp = tempfile.mkdtemp(prefix=prefix, dir="/tmp")
    except OSError as e:
        # Do not export a path that does not exist.
        print(f"[WARN] could not create a private Gradio temp dir under /tmp: {e}")
        return

    os.environ["GRADIO_TEMP_DIR"] = tmp
    print(f"GRADIO_TEMP_DIR = {tmp}")
|
| 53 |
+
|
| 54 |
+
_ensure_unique_gradio_tmp()
|
| 55 |
+
|
| 56 |
+
|
| 57 |
# ============================================================
|
| 58 |
+
# Patch: Qwen2.5-VL RoPE (avoid cublas batched GEMM; preserve shapes)
|
| 59 |
# ============================================================
|
| 60 |
|
| 61 |
+
def _rope_cos_sin_no_gemm(inv_freq, position_ids, rope_dim, attention_scaling, out_dtype):
    """
    Compute rotary (cos, sin) tables with a broadcast multiply instead of a matmul.

    Args:
        inv_freq: 1-D tensor of inverse frequencies (length rope_dim / 2).
        position_ids: integer/float tensor of positions.
            * rank 3, e.g. (3, bs, seq): the batch axis is preserved and the
              result is (3, bs, seq, rope_dim).
            * any other rank: legacy behaviour of this patch is kept, i.e.
              ids are flattened to (n, seq) when rank > 3 and the result is
              (n, 1, seq, rope_dim).
        rope_dim: expected size of the last output axis.
        attention_scaling: optional multiplier applied to cos and sin.
        out_dtype: dtype of the returned tensors.

    Returns:
        (cos, sin) tensors of identical shape.
    """
    # Compute on the same device as inv_freq.
    pos = position_ids.to(device=inv_freq.device)
    inv = inv_freq.float()

    if pos.ndim == 3:
        # Keep (3, bs, seq) intact; flattening here would fold the batch axis
        # into the sequence axis and break shapes for bs > 1.
        posf = pos.float()
    else:
        if pos.ndim > 3:
            pos = pos.reshape(pos.shape[0], -1)
        posf = pos[:, None, :].float()  # (n, 1, seq)

    # Outer product via broadcasting: (..., seq, 1) * (dim/2,) -> (..., seq, dim/2)
    freqs = posf.unsqueeze(-1) * inv

    # Double to full rotary dim, then enforce the exact expected size.
    emb = torch.cat((freqs, freqs), dim=-1)
    if emb.shape[-1] != rope_dim:
        emb = emb[..., :rope_dim]

    cos = emb.cos()
    sin = emb.sin()

    # Respect attention scaling if present.
    if attention_scaling is not None:
        cos = cos * attention_scaling
        sin = sin * attention_scaling

    return cos.to(dtype=out_dtype), sin.to(dtype=out_dtype)


def patch_qwen25vl_rope_no_gemm():
    """
    Patch Qwen2.5-VL rotary embedding to avoid the matmul that can trigger:
      CUBLAS_STATUS_INVALID_VALUE (cublasSgemmStridedBatched)
    on some ZeroGPU/H200 MIG configurations.

    CRITICAL: Preserve exact output shapes used by apply_multimodal_rotary_pos_emb,
    otherwise you get split_with_sizes mismatches. For rank-3 position ids the
    batch axis is therefore kept as-is rather than flattened.

    The patch is skipped when DISABLE_ROPE_PATCH=1, when the transformers
    module or the rotary class cannot be found, or when the class has already
    been patched by a previous call. Set DEBUG_ROPE=1 for per-call shape logs.
    """
    if os.environ.get("DISABLE_ROPE_PATCH", "").strip() == "1":
        print("[patch][rope] DISABLE_ROPE_PATCH=1 -> skipping patch.")
        return

    try:
        from transformers.models.qwen2_5_vl import modeling_qwen2_5_vl as qvl
    except Exception as e:
        print(f"[patch][rope] could not import qwen2_5_vl modeling: {e}")
        return

    Rotary = None
    for name in ["Qwen2_5_VLRotaryEmbedding", "Qwen2_5RotaryEmbedding", "RotaryEmbedding"]:
        Rotary = getattr(qvl, name, None)
        if Rotary is not None:
            break

    if Rotary is None:
        print("[patch][rope] rotary embedding class not found; no patch applied.")
        return

    orig_forward = Rotary.forward

    # Avoid stacking wrappers if this function is called more than once.
    if getattr(orig_forward, "_rope_no_gemm_patched", False):
        print("[patch][rope] already patched; skipping.")
        return

    def forward_no_gemm(self, x, position_ids):
        # Fallback to original if structure differs
        if not hasattr(self, "inv_freq") or position_ids is None:
            return orig_forward(self, x, position_ids)

        # Determine rotary dim from module config (NOT x.shape[-1])
        if hasattr(self, "dim") and isinstance(self.dim, int):
            rope_dim = int(self.dim)
        else:
            rope_dim = int(self.inv_freq.numel() * 2)

        # Match dtype expectations (upstream typically returns same dtype as x)
        cos, sin = _rope_cos_sin_no_gemm(
            self.inv_freq,
            position_ids,
            rope_dim,
            getattr(self, "attention_scaling", None),
            x.dtype,
        )

        # Optional debug (enable by env)
        if os.environ.get("DEBUG_ROPE", "").strip() == "1":
            ms = getattr(self, "mrope_section", None)
            if ms is not None:
                try:
                    ms_list = list(ms)
                    print(f"[DEBUG][rope] rope_dim={rope_dim} cos_last={cos.shape[-1]} mrope_sum={sum(ms_list)} mrope={ms_list}")
                except Exception:
                    print(f"[DEBUG][rope] rope_dim={rope_dim} cos_last={cos.shape[-1]} (mrope_section unreadable)")
            else:
                print(f"[DEBUG][rope] rope_dim={rope_dim} cos_last={cos.shape[-1]} (no mrope_section attr)")

        return cos, sin

    forward_no_gemm._rope_no_gemm_patched = True
    Rotary.forward = forward_no_gemm
    print("[patch] Patched Qwen2.5-VL RoPE matmul -> broadcast multiply (shape-preserving).")
|
| 155 |
+
|
| 156 |
+
patch_qwen25vl_rope_no_gemm()
|
| 157 |
|
|
|
|
| 158 |
|
| 159 |
# ============================================================
|
| 160 |
# Theme
|
|
|
|
| 175 |
c950="#802200",
|
| 176 |
)
|
| 177 |
|
|
|
|
| 178 |
class OrangeRedTheme(Soft):
|
| 179 |
def __init__(
|
| 180 |
self,
|
|
|
|
| 230 |
block_label_background_fill="*primary_200",
|
| 231 |
)
|
| 232 |
|
|
|
|
| 233 |
orange_red_theme = OrangeRedTheme()
|
| 234 |
|
| 235 |
+
|
| 236 |
# ============================================================
|
| 237 |
+
# Device / Env debug
|
| 238 |
# ============================================================
|
| 239 |
|
| 240 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 241 |
|
|
|
|
|
|
|
| 242 |
print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
|
| 243 |
print("torch.__version__ =", torch.__version__)
|
| 244 |
print("torch.version.cuda =", torch.version.cuda)
|
|
|
|
| 249 |
print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
|
| 250 |
print("Using device:", device)
|
| 251 |
|
| 252 |
+
# Optional: allow TF32 kernels for float32 matmul/cuDNN ops (faster on Ampere+ GPUs, at slightly reduced precision)
|
| 253 |
+
try:
|
| 254 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
| 255 |
+
torch.backends.cudnn.allow_tf32 = True
|
| 256 |
+
except Exception:
|
| 257 |
+
pass
|
| 258 |
+
|
| 259 |
+
|
| 260 |
# ============================================================
|
| 261 |
# AIO version (Space variable)
|
| 262 |
# ============================================================
|
|
|
|
| 267 |
_VER_RE = re.compile(r"^v\d+$")
|
| 268 |
_DIGITS_RE = re.compile(r"^\d+$")
|
| 269 |
|
|
|
|
| 270 |
def _normalize_version(raw: str) -> Optional[str]:
|
| 271 |
if raw is None:
|
| 272 |
return None
|
|
|
|
| 279 |
return f"v{s}"
|
| 280 |
return None
|
| 281 |
|
|
|
|
| 282 |
_AIO_ENV_RAW = os.environ.get("AIO_VERSION", "")
|
| 283 |
_AIO_ENV_NORM = _normalize_version(_AIO_ENV_RAW)
|
| 284 |
|
|
|
|
| 289 |
print(f"AIO_VERSION (normalized) = {_AIO_ENV_NORM!r}")
|
| 290 |
print(f"Using AIO_VERSION = {AIO_VERSION} ({AIO_VERSION_SOURCE})")
|
| 291 |
|
| 292 |
+
|
| 293 |
# ============================================================
|
| 294 |
# Pipeline
|
| 295 |
# ============================================================
|
|
|
|
| 301 |
|
| 302 |
dtype = torch.bfloat16
|
| 303 |
|
|
|
|
| 304 |
def _load_pipe_with_version(version: str) -> QwenImageEditPlusPipeline:
|
| 305 |
sub = f"{version}/transformer"
|
| 306 |
print(f"📦 Loading AIO transformer: {AIO_REPO_ID} / {sub}")
|
|
|
|
| 316 |
).to(device)
|
| 317 |
return p
|
| 318 |
|
|
|
|
|
|
|
| 319 |
try:
|
| 320 |
pipe = _load_pipe_with_version(AIO_VERSION)
|
| 321 |
except Exception:
|
|
|
|
| 336 |
|
| 337 |
MAX_SEED = np.iinfo(np.int32).max
|
| 338 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
|
| 340 |
# ============================================================
|
| 341 |
+
# Derived conditioning (Transformers): Pose + Depth (v1-style)
|
| 342 |
# ============================================================
|
| 343 |
|
| 344 |
POSE_MODEL_ID = "usyd-community/vitpose-base-simple"
|
|
|
|
| 358 |
(12, 14), (14, 16),
|
| 359 |
]
|
| 360 |
|
|
|
|
| 361 |
def _derived_device(use_gpu: bool) -> torch.device:
    """Return the CUDA device only when it is both requested and available; otherwise CPU."""
    if use_gpu and torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")
|
| 363 |
|
|
|
|
| 364 |
def _load_pose_models(dev: torch.device):
|
| 365 |
key = str(dev)
|
| 366 |
if key in _POSE_CACHE:
|
|
|
|
| 378 |
_POSE_CACHE[key] = (det_proc, det_model, pose_proc, pose_model)
|
| 379 |
return _POSE_CACHE[key]
|
| 380 |
|
|
|
|
| 381 |
def _load_depth_models(dev: torch.device):
|
| 382 |
key = str(dev)
|
| 383 |
if key in _DEPTH_CACHE:
|
|
|
|
| 390 |
_DEPTH_CACHE[key] = (proc, model)
|
| 391 |
return _DEPTH_CACHE[key]
|
| 392 |
|
|
|
|
| 393 |
def _draw_skeleton_on_blank(
|
| 394 |
size: tuple[int, int],
|
| 395 |
persons_keypoints: list[np.ndarray],
|
|
|
|
| 416 |
if sc[i] < kp_thresh:
|
| 417 |
continue
|
| 418 |
x, y = float(kps[i, 0]), float(kps[i, 1])
|
| 419 |
+
draw.ellipse([(x - point_r, y - point_r), (x + point_r, y + point_r)], fill=(255, 255, 255))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
return canvas
|
| 421 |
|
|
|
|
| 422 |
def make_pose_map(
|
| 423 |
img: Image.Image,
|
| 424 |
*,
|
|
|
|
| 466 |
|
| 467 |
persons_kps, persons_sc = [], []
|
| 468 |
for pr in pose_results:
|
| 469 |
+
persons_kps.append(pr["keypoints"].detach().cpu().numpy())
|
| 470 |
+
persons_sc.append(pr["scores"].detach().cpu().numpy())
|
|
|
|
|
|
|
| 471 |
|
| 472 |
if not persons_kps:
|
| 473 |
return Image.new("RGB", img.size, (0, 0, 0))
|
| 474 |
|
| 475 |
return _draw_skeleton_on_blank(img.size, persons_kps, persons_sc)
|
| 476 |
|
|
|
|
| 477 |
def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
|
| 478 |
img = img.convert("RGB")
|
| 479 |
dev = _derived_device(use_gpu)
|
|
|
|
| 502 |
return Image.fromarray(depth8, mode="L").convert("RGB")
|
| 503 |
|
| 504 |
|
| 505 |
+
# ============================================================
|
| 506 |
+
# Helpers: gallery normalization + debug-friendly PIL conversion
|
| 507 |
+
# ============================================================
|
| 508 |
+
|
| 509 |
+
def _to_pil_rgb(x) -> Optional[Image.Image]:
    """
    Best-effort conversion of a Gradio image payload to a PIL RGB image.

    Accepted inputs:
      - PIL images and numpy arrays
      - (image, caption) tuples (only the first element is used)
      - file paths given as str / os.PathLike
      - dicts carrying a file path under 'name' or 'path'

    Returns None when the input is None, when a referenced file is missing or
    cannot be opened, or when the value cannot be turned into an image.
    """
    if x is None:
        return None

    # Gallery often returns (image, caption)
    if isinstance(x, tuple) and len(x) >= 1:
        x = x[0]
        if x is None:
            return None

    # Some gradio versions can return dict with a temp file path
    # (common keys: 'name' or 'path'). Unwrap it so the path branch below
    # handles dict payloads and bare paths the same way.
    if isinstance(x, dict):
        p = x.get("name") or x.get("path")
        if isinstance(p, (str, os.PathLike)):
            x = p

    if isinstance(x, (str, os.PathLike)):
        if not os.path.exists(x):
            print(f"[WARN] extra image path missing, skipping: {x}")
            return None
        try:
            # convert() returns an independent copy, so the file handle can be
            # closed as soon as the conversion is done.
            with Image.open(x) as opened:
                return opened.convert("RGB")
        except Exception as e:
            print(f"[WARN] failed to open extra image path {x}: {e}")
            return None

    if isinstance(x, Image.Image):
        return x.convert("RGB")

    if isinstance(x, np.ndarray):
        try:
            return Image.fromarray(x).convert("RGB")
        except Exception:
            return None

    # last resort: anything numpy can coerce into an array
    try:
        return Image.fromarray(np.array(x)).convert("RGB")
    except Exception:
        return None
|
| 551 |
+
|
| 552 |
def _append_to_gallery(existing, new_img: Image.Image):
    """
    Return a new gallery list: every readable item of `existing`, then `new_img`.

    Existing entries are normalized through _to_pil_rgb; entries it cannot
    convert (None result) are dropped. `new_img` is appended unchanged.
    """
    normalized = (_to_pil_rgb(entry) for entry in (existing or ()))
    gallery = [picture for picture in normalized if picture is not None]
    gallery.append(new_img)
    return gallery
|
| 561 |
|
| 562 |
+
def build_labeled_images(
    img1: Image.Image,
    img2: Optional[Image.Image],
    extra_imgs: Optional[list[Image.Image]],
) -> dict[str, Image.Image]:
    """
    Map the given images to consecutive keys "image_1", "image_2", ...

    Order is img1, then img2 (if not None), then every non-None entry of
    extra_imgs. Numbering has no gaps: skipped entries do not consume an index.
    """
    ordered = [img1]
    if img2 is not None:
        ordered.append(img2)
    ordered.extend(extra for extra in (extra_imgs or ()) if extra is not None)
    return {f"image_{position}": picture for position, picture in enumerate(ordered, start=1)}
|
| 581 |
+
|
| 582 |
|
| 583 |
# ============================================================
|
| 584 |
+
# LoRA adapters + presets (your v1 config)
|
| 585 |
# ============================================================
|
| 586 |
|
| 587 |
NONE_LORA = "None"
|
|
|
|
| 738 |
"Any2Real_2601": "change the picture 1 to realistic photograph",
|
| 739 |
"Semirealistic-photo-detailer": "transform the image to semi-realistic image",
|
| 740 |
"AnyPose": "Make the person in image 1 do the exact same pose of the person in image 2. Changing the style and background of the image of the person in image 1 is undesirable, so don't do it. The new pose should be pixel accurate to the pose we are trying to copy. The position of the arms and head and legs should be the same as the pose we are trying to copy. Change the field of view and angle to match exactly image 2. Head tilt and eye gaze pose should match the person in image 2.",
|
| 741 |
+
"Hyperrealistic-Portrait": "Transform the image into an ultra-realistic photorealistic portrait with strict identity preservation, facing straight to the camera. Enhance pore-level skin textures, realistic moisture effects, and natural wet hair clumping against the skin. Apply cool-toned soft-box lighting with subtle highlights and shadows, maintain realistic green-hazel eye catchlights without synthetic gloss, and preserve soft natural lip texture. Use shallow depth of field with a clean bokeh background, an 85mm macro photographic look, and raw photo grading without retouching to maintain realism and original details.",
|
| 742 |
+
"Ultrarealistic-Portrait": "Transform the image into an ultra-realistic glamour portrait while strictly preserving the subject’s identity. Apply a close-up composition with a slight head tilt and a hand near the face, enhance cinematic directional lighting with dramatic fashion-style highlights, and refine makeup details including glowing skin, glossy lips, luminous highlighter, and defined eyes. Increase skin realism with detailed epidermal textures such as micropores, microhairs, subtle oil sheen, natural highlights, soft wrinkles, and subsurface scattering. Maintain a luxury fashion-magazine look in a 9:16 aspect ratio, preserving realism, facial structure, and original details without over-smoothing or retouching.",
|
| 743 |
"Upscale2K": "Upscale this picture to 4K resolution.",
|
| 744 |
"BFS-Best-FaceSwap": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
|
| 745 |
"BFS-Best-FaceSwap-merge": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
|
|
|
|
| 747 |
|
| 748 |
LOADED_ADAPTERS = set()
|
| 749 |
|
| 750 |
+
|
| 751 |
# ============================================================
|
| 752 |
+
# Helpers: resolution (area-based sizing)
|
| 753 |
# ============================================================
|
| 754 |
|
| 755 |
def _round_to_multiple(x: int, m: int) -> int:
    """Floor `x` to a multiple of `m`, never returning less than `m` itself."""
    whole = int(x)
    floored = whole - whole % m
    return floored if floored > m else m
|
| 757 |
|
|
|
|
| 758 |
def compute_canvas_dimensions_from_area(
|
| 759 |
image: Image.Image,
|
| 760 |
target_area: int,
|
|
|
|
| 762 |
) -> tuple[int, int]:
|
| 763 |
w, h = image.size
|
| 764 |
aspect = w / h if h else 1.0
|
|
|
|
| 765 |
from qwenimage.pipeline_qwenimage_edit_plus import calculate_dimensions
|
|
|
|
| 766 |
width, height = calculate_dimensions(int(target_area), float(aspect))
|
| 767 |
width = _round_to_multiple(int(width), int(multiple_of))
|
| 768 |
height = _round_to_multiple(int(height), int(multiple_of))
|
| 769 |
return width, height
|
| 770 |
|
|
|
|
| 771 |
def get_target_area_for_lora(
|
| 772 |
image: Image.Image,
|
| 773 |
lora_adapter: str,
|
| 774 |
user_target_megapixels: float,
|
| 775 |
) -> int:
|
| 776 |
spec = ADAPTER_SPECS.get(lora_adapter, {})
|
|
|
|
| 777 |
if "target_area" in spec:
|
| 778 |
try:
|
| 779 |
return int(spec["target_area"])
|
| 780 |
except Exception:
|
| 781 |
pass
|
|
|
|
| 782 |
if "target_megapixels" in spec:
|
| 783 |
try:
|
| 784 |
mp = float(spec["target_megapixels"])
|
| 785 |
return int(mp * 1024 * 1024)
|
| 786 |
except Exception:
|
| 787 |
pass
|
|
|
|
| 788 |
if "target_long_edge" in spec:
|
| 789 |
try:
|
| 790 |
long_edge = int(spec["target_long_edge"])
|
|
|
|
| 798 |
return int(new_w * new_h)
|
| 799 |
except Exception:
|
| 800 |
pass
|
|
|
|
| 801 |
return int(float(user_target_megapixels) * 1024 * 1024)
|
| 802 |
|
| 803 |
|
| 804 |
# ============================================================
|
| 805 |
+
# Helpers: LoRA routing + BFS alpha fixes (your v1 logic)
|
| 806 |
# ============================================================
|
| 807 |
|
| 808 |
def lora_requires_two_images(lora_adapter: str) -> bool:
    """Return True when the adapter's ADAPTER_SPECS entry sets a truthy 'requires_two_images'."""
    spec = ADAPTER_SPECS.get(lora_adapter, {})
    flag = spec.get("requires_two_images", False)
    return bool(flag)
|
| 810 |
|
|
|
|
| 811 |
def image2_label_for_lora(lora_adapter: str) -> str:
    """Return the adapter's 'image2_label' from ADAPTER_SPECS, or the generic default label."""
    spec = ADAPTER_SPECS.get(lora_adapter, {})
    label = spec.get("image2_label", "Upload Reference (Image 2)")
    return str(label)
|
| 813 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 814 |
def _inject_missing_alpha_keys(state_dict: dict) -> dict:
|
| 815 |
bases = {}
|
| 816 |
for k, v in state_dict.items():
|
|
|
|
| 823 |
|
| 824 |
for base, rank in bases.items():
|
| 825 |
alpha_tensor = torch.tensor(float(rank), dtype=torch.float32)
|
| 826 |
+
|
| 827 |
full_alpha = f"{base}.alpha"
|
| 828 |
if full_alpha not in state_dict:
|
| 829 |
state_dict[full_alpha] = alpha_tensor
|
|
|
|
| 833 |
stripped_alpha = f"{stripped_base}.alpha"
|
| 834 |
if stripped_alpha not in state_dict:
|
| 835 |
state_dict[stripped_alpha] = alpha_tensor
|
|
|
|
| 836 |
return state_dict
|
| 837 |
|
|
|
|
| 838 |
def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
|
| 839 |
keep_suffixes = (
|
| 840 |
".lora_up.weight",
|
|
|
|
| 877 |
}
|
| 878 |
return out, stats
|
| 879 |
|
|
|
|
| 880 |
def _duplicate_stripped_prefix_keys(state_dict: dict, prefix: str = "diffusion_model.") -> dict:
|
| 881 |
out = dict(state_dict)
|
| 882 |
for k, v in list(state_dict.items()):
|
|
|
|
| 887 |
out[stripped] = v
|
| 888 |
return out
|
| 889 |
|
|
|
|
| 890 |
def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name: str, needs_alpha_fix: bool = False):
|
| 891 |
try:
|
| 892 |
pipe.load_lora_weights(repo, weight_name=weight_name, adapter_name=adapter_name)
|
|
|
|
| 912 |
f"kept={stats['kept']} dropped_patch={stats['dropped_patch']} "
|
| 913 |
f"dropped_other={stats['dropped_other']} normalized_alpha={stats['normalized_alpha']}"
|
| 914 |
)
|
|
|
|
| 915 |
pipe.load_lora_weights(sd, adapter_name=adapter_name)
|
| 916 |
return
|
| 917 |
|
|
|
|
| 918 |
def _ensure_loaded_and_get_active_adapters(selected_lora: str):
|
| 919 |
spec = ADAPTER_SPECS.get(selected_lora)
|
| 920 |
if not spec:
|
|
|
|
| 952 |
|
| 953 |
adapter_names.append(adapter_name)
|
| 954 |
adapter_weights.append(strength)
|
|
|
|
| 955 |
else:
|
| 956 |
repo = spec["repo"]
|
| 957 |
weights = spec["weights"]
|
|
|
|
| 1007 |
return prompt_update, img2_update, extras_update
|
| 1008 |
|
| 1009 |
|
| 1010 |
+
# ============================================================
|
| 1011 |
+
# UI helpers: output routing + derived conditioning
|
| 1012 |
+
# ============================================================
|
| 1013 |
+
|
| 1014 |
def set_output_as_image1(last):
    """
    Build a Gradio update that sets a component's value to the last output.

    Raises:
        gr.Error: if no output has been produced yet (`last` is None).
    """
    if last is not None:
        return gr.update(value=last)
    raise gr.Error("No output available yet.")
|
| 1018 |
|
|
|
|
| 1019 |
def set_output_as_image2(last):
    """
    Build a Gradio update that sets a component's value to the last output.

    Raises:
        gr.Error: if no output has been produced yet (`last` is None).
    """
    if last is not None:
        return gr.update(value=last)
    raise gr.Error("No output available yet.")
|
| 1023 |
|
|
|
|
| 1024 |
def set_output_as_extra(last, existing_extra):
    """
    Return the extra-images gallery with the last output appended.

    Raises:
        gr.Error: if no output has been produced yet (`last` is None).
    """
    if last is not None:
        return _append_to_gallery(existing_extra, last)
    raise gr.Error("No output available yet.")
|
| 1028 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1029 |
@spaces.GPU
|
| 1030 |
def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu, derived_max_people):
|
| 1031 |
+
if img1 is None:
|
| 1032 |
+
raise gr.Error("Please upload Image 1 first.")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1033 |
|
| 1034 |
if derived_type == "None":
|
| 1035 |
return gr.update(value=existing_extra), gr.update(visible=False, value=None)
|
| 1036 |
|
| 1037 |
+
base = img1.convert("RGB")
|
| 1038 |
|
| 1039 |
if derived_type == "Pose (ViTPose, fast)":
|
| 1040 |
derived = make_pose_map(base, use_gpu=bool(derived_use_gpu), mode="fast")
|
|
|
|
| 1055 |
|
| 1056 |
|
| 1057 |
# ============================================================
|
| 1058 |
+
# Debug helpers (CUDA mem + token count)
|
| 1059 |
# ============================================================
|
| 1060 |
|
| 1061 |
+
def _cuda_mem(prefix: str):
    """
    Print free/total CUDA memory (in GB) tagged with `prefix`.

    Debug-only helper: does nothing without CUDA and deliberately ignores any
    error raised while querying or printing.
    """
    if torch.cuda.is_available():
        try:
            free_bytes, total_bytes = torch.cuda.mem_get_info()
            print(f"[DEBUG][cuda][{prefix}] mem free={free_bytes/1e9:.2f}GB total={total_bytes/1e9:.2f}GB")
        except Exception:
            pass
|
| 1069 |
+
|
| 1070 |
+
def _approx_token_count(text: str) -> int:
    """
    Estimate the token count of `text` for debug output only.

    Uses the ~4 characters per token rule of thumb (no tokenizer call).
    Empty or None text counts as 0; any non-empty text counts as at least 1.
    """
    if text:
        estimate = int(math.ceil(len(text) / 4.0))
        return estimate if estimate > 1 else 1
    return 0
|
| 1076 |
+
|
| 1077 |
|
| 1078 |
+
# ============================================================
|
| 1079 |
+
# Inference
|
| 1080 |
+
# ============================================================
|
| 1081 |
|
| 1082 |
@spaces.GPU
|
| 1083 |
def infer(
|
|
|
|
| 1099 |
if torch.cuda.is_available():
|
| 1100 |
torch.cuda.empty_cache()
|
| 1101 |
|
| 1102 |
+
print("[DEBUG][infer] input types:", type(input_image_1), type(input_image_2), type(input_images_extra))
|
|
|
|
|
|
|
| 1103 |
|
| 1104 |
+
if input_image_1 is None:
|
| 1105 |
+
raise gr.Error("Please upload Image 1.")
|
|
|
|
|
|
|
| 1106 |
|
| 1107 |
+
# Handle "None"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1108 |
if lora_adapter == NONE_LORA:
|
| 1109 |
try:
|
| 1110 |
pipe.set_adapters([], adapter_weights=[])
|
|
|
|
| 1115 |
adapter_names, adapter_weights = _ensure_loaded_and_get_active_adapters(lora_adapter)
|
| 1116 |
pipe.set_adapters(adapter_names, adapter_weights=adapter_weights)
|
| 1117 |
|
|
|
|
| 1118 |
if randomize_seed:
|
| 1119 |
seed = random.randint(0, MAX_SEED)
|
| 1120 |
|
|
|
|
| 1125 |
"extra digit, fewer digits, cropped, jpeg artifacts, signature, watermark, username, blurry"
|
| 1126 |
)
|
| 1127 |
|
| 1128 |
+
img1 = input_image_1.convert("RGB") if isinstance(input_image_1, Image.Image) else _to_pil_rgb(input_image_1)
|
| 1129 |
+
if img1 is None:
|
| 1130 |
+
raise gr.Error("Image 1 could not be read (unexpected input type/path).")
|
| 1131 |
+
|
| 1132 |
+
img2 = None
|
| 1133 |
+
if input_image_2 is not None:
|
| 1134 |
+
img2 = input_image_2.convert("RGB") if isinstance(input_image_2, Image.Image) else _to_pil_rgb(input_image_2)
|
| 1135 |
+
if img2 is None:
|
| 1136 |
+
raise gr.Error("Image 2 could not be read (unexpected input type/path).")
|
| 1137 |
+
|
| 1138 |
+
# Normalize extra images (Gallery)
|
| 1139 |
+
extra_imgs: list[Image.Image] = []
|
| 1140 |
+
if input_images_extra:
|
| 1141 |
+
for item in input_images_extra:
|
| 1142 |
+
pil = _to_pil_rgb(item)
|
| 1143 |
+
if pil is not None:
|
| 1144 |
+
extra_imgs.append(pil)
|
| 1145 |
+
|
| 1146 |
+
# Enforce 2-image LoRA behavior
|
| 1147 |
if lora_requires_two_images(lora_adapter) and img2 is None:
|
| 1148 |
raise gr.Error("This LoRA needs two images. Please upload Image 2 as well.")
|
| 1149 |
|
|
|
|
| 1150 |
labeled = build_labeled_images(img1, img2, extra_imgs)
|
| 1151 |
+
|
| 1152 |
pipe_images = list(labeled.values())
|
| 1153 |
if len(pipe_images) == 1:
|
| 1154 |
pipe_images = pipe_images[0]
|
| 1155 |
|
|
|
|
| 1156 |
target_area = get_target_area_for_lora(img1, lora_adapter, float(target_megapixels))
|
| 1157 |
width, height = compute_canvas_dimensions_from_area(
|
| 1158 |
img1,
|
|
|
|
| 1160 |
multiple_of=int(pipe.vae_scale_factor * 2),
|
| 1161 |
)
|
| 1162 |
|
|
|
|
| 1163 |
vae_image_indices = None
|
| 1164 |
if extras_condition_only:
|
| 1165 |
if isinstance(pipe_images, list) and len(pipe_images) > 2:
|
| 1166 |
vae_image_indices = [0, 1] if len(pipe_images) >= 2 else [0]
|
| 1167 |
|
| 1168 |
+
# Debug summary
|
| 1169 |
+
n_images = len(pipe_images) if isinstance(pipe_images, list) else 1
|
| 1170 |
+
tok_est = _approx_token_count(prompt or "")
|
| 1171 |
print(
|
| 1172 |
"[DEBUG][infer] submitting request | "
|
| 1173 |
+
f"lora_adapter={lora_adapter!r} seed={seed} prompt_len={len(prompt or '')} "
|
| 1174 |
+
f"steps={steps} true_cfg_scale={guidance_scale} target_mp={target_megapixels} "
|
| 1175 |
+
f"canvas=({width}x{height}) n_images={n_images} vae_image_indices={vae_image_indices} "
|
| 1176 |
+
f"pad_to_canvas={bool(pad_to_canvas)}"
|
|
|
|
| 1177 |
)
|
| 1178 |
+
print(f"[DEBUG][infer] image_1 size: {img1.size} image_2 size: {img2.size if img2 else None}")
|
| 1179 |
+
print(f"[DEBUG][infer] prompt token_estimate: {tok_est}")
|
| 1180 |
+
_cuda_mem("before")
|
| 1181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1182 |
try:
|
| 1183 |
result = pipe(
|
| 1184 |
image=pipe_images,
|
| 1185 |
+
prompt=prompt,
|
| 1186 |
negative_prompt=negative_prompt,
|
| 1187 |
height=height,
|
| 1188 |
width=width,
|
|
|
|
| 1193 |
pad_to_canvas=bool(pad_to_canvas),
|
| 1194 |
).images[0]
|
| 1195 |
return result, seed, result
|
| 1196 |
+
|
| 1197 |
+
except Exception as e:
|
| 1198 |
print("---- [ERROR][infer] exception ----")
|
| 1199 |
print(traceback.format_exc())
|
| 1200 |
print("---------------------------------")
|
| 1201 |
raise
|
| 1202 |
+
|
| 1203 |
finally:
|
| 1204 |
+
_cuda_mem("after")
|
| 1205 |
gc.collect()
|
| 1206 |
if torch.cuda.is_available():
|
| 1207 |
torch.cuda.empty_cache()
|
|
|
|
| 1209 |
|
| 1210 |
@spaces.GPU
def infer_example(input_image, prompt, lora_adapter):
    """
    Run `infer` for a gallery example with fixed fast settings.

    Returns (None, 0, None) when no image is supplied; otherwise the
    (result, seed, last) triple produced by `infer`.
    """
    if input_image is None:
        return None, 0, None

    rgb_image = input_image.convert("RGB")
    fast_guidance = 1.0
    fast_steps = 4

    # Positional order must match infer()'s signature, which is not visible
    # here -- presumably: image 1, image 2, extra images, prompt, adapter,
    # seed, randomize_seed, guidance scale, steps, target megapixels and two
    # boolean flags. TODO confirm against infer().
    call_args = (
        rgb_image, None, None,
        prompt, lora_adapter,
        0, True,
        fast_guidance, fast_steps, 1.0,
        True, True,
    )
    result, seed, last = infer(*call_args)
    return result, seed, last
|
| 1225 |
|
|
|
|
| 1245 |
with gr.Column(elem_id="col-container"):
|
| 1246 |
gr.Markdown("# **Qwen-Image-Edit-2511-LoRAs-Fast**", elem_id="main-title")
|
| 1247 |
gr.Markdown(
|
| 1248 |
+
"Perform diverse image edits using specialized "
|
| 1249 |
+
"[LoRA](https://huggingface.co/models?other=base_model:adapter:Qwen/Qwen-Image-Edit-2511) adapters for the "
|
| 1250 |
+
"[Qwen-Image-Edit](https://huggingface.co/Qwen/Qwen-Image-Edit-2511) model. Uses a Diffusers compatible extraction of the transformers from Phr00t's Rapid AIO merge."
|
| 1251 |
)
|
| 1252 |
gr.Markdown(aio_status_line)
|
| 1253 |
|
|
|
|
| 1275 |
|
| 1276 |
with gr.Column():
|
| 1277 |
output_image = gr.Image(label="Output Image", interactive=False, format="png", height=353)
|
| 1278 |
+
|
| 1279 |
last_output = gr.State(value=None)
|
| 1280 |
|
| 1281 |
with gr.Row():
|
|
|
|
| 1341 |
value=True,
|
| 1342 |
)
|
| 1343 |
|
|
|
|
| 1344 |
lora_adapter.change(
|
| 1345 |
fn=on_lora_change_ui,
|
| 1346 |
inputs=[lora_adapter, prompt, extras_condition_only],
|
|
|
|
| 1352 |
["examples/5.jpg", "Remove shadows and relight the image using soft lighting.", "Light-Restoration"],
|
| 1353 |
["examples/4.jpg", "Use a subtle golden-hour filter with smooth light diffusion.", "Relight"],
|
| 1354 |
["examples/2.jpeg", "Rotate the camera 45 degrees to the left.", "Multiple-Angles"],
|
| 1355 |
+
[
|
| 1356 |
+
"examples/12.jpg",
|
| 1357 |
+
"flatcolor Desaturate the image and lower the contrast to create a flat, ungraded look similar to a camera log profile. Preserve details in the highlights and shadows.",
|
| 1358 |
+
"Flat-Log",
|
| 1359 |
+
],
|
| 1360 |
+
["examples/7.jpg", "Light source from the Right Rear", "Multi-Angle-Lighting"],
|
| 1361 |
+
["examples/10.jpeg", "Upscale the image.", "Upscale-Image"],
|
| 1362 |
+
["examples/7.jpg", "Light source from the Below", "Multi-Angle-Lighting"],
|
| 1363 |
+
["examples/2.jpeg", "Switch the camera to a top-down right corner view.", "Multiple-Angles"],
|
| 1364 |
+
[
|
| 1365 |
+
"examples/9.jpg",
|
| 1366 |
+
"The camera moves slightly forward as sunlight breaks through the clouds, casting a soft glow around the character's silhouette in the mist. Realistic cinematic style, atmospheric depth.",
|
| 1367 |
+
"Next-Scene",
|
| 1368 |
+
],
|
| 1369 |
+
["examples/8.jpg", "Make the subjects skin details more prominent and natural.", "Edit-Skin"],
|
| 1370 |
+
["examples/6.jpg", "Switch the camera to a bottom-up view.", "Multiple-Angles"],
|
| 1371 |
+
["examples/6.jpg", "Rotate the camera 180 degrees upside down.", "Multiple-Angles"],
|
| 1372 |
+
["examples/4.jpg", "Rotate the camera 45 degrees to the right.", "Multiple-Angles"],
|
| 1373 |
+
["examples/4.jpg", "Switch the camera to a top-down view.", "Multiple-Angles"],
|
| 1374 |
+
["examples/4.jpg", "Switch the camera to a wide-angle lens.", "Multiple-Angles"],
|
| 1375 |
["examples/11.jpg", "Upscale this picture to 4K resolution.", "Upscale2K"],
|
| 1376 |
],
|
| 1377 |
inputs=[input_image_1, prompt, lora_adapter],
|
|
|
|
| 1400 |
outputs=[output_image, seed, last_output],
|
| 1401 |
)
|
| 1402 |
|
|
|
|
| 1403 |
btn_out_to_img1.click(fn=set_output_as_image1, inputs=[last_output], outputs=[input_image_1])
|
| 1404 |
btn_out_to_img2.click(fn=set_output_as_image2, inputs=[last_output], outputs=[input_image_2])
|
| 1405 |
btn_out_to_extra.click(fn=set_output_as_extra, inputs=[last_output, input_images_extra], outputs=[input_images_extra])
|
| 1406 |
|
|
|
|
| 1407 |
add_derived_btn.click(
|
| 1408 |
fn=add_derived_ref,
|
| 1409 |
inputs=[input_image_1, input_images_extra, derived_type, derived_use_gpu, derived_max_people],
|