Professional Noob commited on
Commit
c549fb8
·
verified ·
1 Parent(s): 45b0d64

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +302 -292
app.py CHANGED
@@ -1,29 +1,14 @@
1
  # app.py
2
- # Complete, self-contained Gradio app with:
3
- # 1) Robust "ensure_pil_rgb" to avoid /tmp/gradio path issues across ZeroGPU workers
4
- # 2) Per-process GRADIO_TEMP_DIR to reduce temp collisions
5
- # 3) Qwen2.5-VL RoPE patch to avoid cublasSgemmStridedBatched failures (broadcast multiply instead)
6
- # 4) Extra debug logging around inputs, image routing, and prompt/token lengths
7
-
8
  import os
9
  import re
10
  import gc
 
 
 
11
  import traceback
12
  import random
13
- import time
14
  from typing import Iterable, Optional
15
 
16
- # -------------------------------
17
- # Temp-dir hardening (helps when multiple users hit the Space)
18
- # -------------------------------
19
- # IMPORTANT: This doesn't magically share files between runtimes, but it reduces collisions
20
- # and makes temp behavior more deterministic.
21
- if not os.environ.get("GRADIO_TEMP_DIR"):
22
- _pid = os.getpid()
23
- _tmp = f"/tmp/gradio_{_pid}"
24
- os.makedirs(_tmp, exist_ok=True)
25
- os.environ["GRADIO_TEMP_DIR"] = _tmp
26
-
27
  import gradio as gr
28
  import numpy as np
29
  import spaces
@@ -44,68 +29,132 @@ from safetensors.torch import load_file as safetensors_load_file
44
  from gradio.themes import Soft
45
  from gradio.themes.utils import colors, fonts, sizes
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  # ============================================================
48
- # Qwen2.5-VL RoPE patch (avoid cublas batched GEMM)
49
  # ============================================================
50
 
51
- def patch_qwen25vl_rope_gemm_to_mul():
52
  """
53
- The observed crash:
54
- RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE
55
- ... modeling_qwen2_5_vl.py line ~521
56
- freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
57
 
58
- This patches Qwen2.5-VL rotary embedding to compute the outer product
59
- via broadcast multiply instead of batched GEMM.
60
  """
 
 
 
 
61
  try:
62
- import transformers.models.qwen2_5_vl.modeling_qwen2_5_vl as m
63
-
64
- Rotary = getattr(m, "Qwen2_5_VLRotaryEmbedding", None)
65
- if Rotary is None:
66
- print("[patch] Qwen2_5_VLRotaryEmbedding not found; skipping RoPE patch.")
67
- return
68
-
69
- # Guard: only patch once
70
- if getattr(Rotary, "_rope_mul_patch_applied", False):
71
- print("[patch] RoPE patch already applied.")
72
- return
73
-
74
- def safe_forward(self, x, position_ids):
75
- """
76
- Return (cos, sin) with shapes compatible with original.
77
- """
78
- inv_freq = self.inv_freq.to(device=x.device)
79
-
80
- # position_ids is typically (bs, seq) or (seq,)
81
- if position_ids.dim() == 1:
82
- position_ids_ = position_ids.unsqueeze(0) # (1, seq)
83
- else:
84
- position_ids_ = position_ids # (bs, seq)
85
-
86
- # Outer product via broadcast:
87
- # inv: (1,1,dim/2,1)
88
- # pos: (bs,1,1,seq)
89
- # freqs: (bs,1,seq,dim/2)
90
- inv = inv_freq[None, None, :, None].float()
91
- pos = position_ids_[:, None, None, :].float()
92
- freqs = (inv * pos).transpose(2, 3)
93
-
94
- # Original commonly duplicates for sin/cos on last dim
95
- emb = torch.cat((freqs, freqs), dim=-1) # (bs,1,seq,dim)
96
-
97
- cos = emb.cos().to(dtype=x.dtype)
98
- sin = emb.sin().to(dtype=x.dtype)
99
- return cos, sin
100
-
101
- Rotary.forward = safe_forward
102
- Rotary._rope_mul_patch_applied = True
103
- print("[patch] Patched Qwen2.5-VL RoPE to avoid cublas batched GEMM.")
104
  except Exception as e:
105
- print("[patch] Failed to patch RoPE:", repr(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
 
 
 
 
107
 
108
- patch_qwen25vl_rope_gemm_to_mul()
109
 
110
  # ============================================================
111
  # Theme
@@ -126,7 +175,6 @@ colors.orange_red = colors.Color(
126
  c950="#802200",
127
  )
128
 
129
-
130
  class OrangeRedTheme(Soft):
131
  def __init__(
132
  self,
@@ -182,17 +230,15 @@ class OrangeRedTheme(Soft):
182
  block_label_background_fill="*primary_200",
183
  )
184
 
185
-
186
  orange_red_theme = OrangeRedTheme()
187
 
 
188
  # ============================================================
189
- # Device
190
  # ============================================================
191
 
192
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
193
 
194
- print("===== Application Startup at", time.strftime("%Y-%m-%d %H:%M:%S"), "=====")
195
- print("GRADIO_TEMP_DIR =", os.environ.get("GRADIO_TEMP_DIR"))
196
  print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
197
  print("torch.__version__ =", torch.__version__)
198
  print("torch.version.cuda =", torch.version.cuda)
@@ -203,6 +249,14 @@ if torch.cuda.is_available():
203
  print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
204
  print("Using device:", device)
205
 
 
 
 
 
 
 
 
 
206
  # ============================================================
207
  # AIO version (Space variable)
208
  # ============================================================
@@ -213,7 +267,6 @@ DEFAULT_AIO_VERSION = "v19"
213
  _VER_RE = re.compile(r"^v\d+$")
214
  _DIGITS_RE = re.compile(r"^\d+$")
215
 
216
-
217
  def _normalize_version(raw: str) -> Optional[str]:
218
  if raw is None:
219
  return None
@@ -226,7 +279,6 @@ def _normalize_version(raw: str) -> Optional[str]:
226
  return f"v{s}"
227
  return None
228
 
229
-
230
  _AIO_ENV_RAW = os.environ.get("AIO_VERSION", "")
231
  _AIO_ENV_NORM = _normalize_version(_AIO_ENV_RAW)
232
 
@@ -237,6 +289,7 @@ print(f"AIO_VERSION (env raw) = {_AIO_ENV_RAW!r}")
237
  print(f"AIO_VERSION (normalized) = {_AIO_ENV_NORM!r}")
238
  print(f"Using AIO_VERSION = {AIO_VERSION} ({AIO_VERSION_SOURCE})")
239
 
 
240
  # ============================================================
241
  # Pipeline
242
  # ============================================================
@@ -248,7 +301,6 @@ from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
248
 
249
  dtype = torch.bfloat16
250
 
251
-
252
  def _load_pipe_with_version(version: str) -> QwenImageEditPlusPipeline:
253
  sub = f"{version}/transformer"
254
  print(f"📦 Loading AIO transformer: {AIO_REPO_ID} / {sub}")
@@ -264,8 +316,6 @@ def _load_pipe_with_version(version: str) -> QwenImageEditPlusPipeline:
264
  ).to(device)
265
  return p
266
 
267
-
268
- # Forgiving load: try env/default version, fallback to v19 if it fails
269
  try:
270
  pipe = _load_pipe_with_version(AIO_VERSION)
271
  except Exception:
@@ -286,64 +336,9 @@ except Exception as e:
286
 
287
  MAX_SEED = np.iinfo(np.int32).max
288
 
289
- # ============================================================
290
- # Robust image coercion (prevents /tmp/gradio FileNotFound crashes)
291
- # ============================================================
292
-
293
- def ensure_pil_rgb(x, *, label: str = "") -> Optional[Image.Image]:
294
- """
295
- Accepts:
296
- - PIL.Image
297
- - numpy arrays
298
- - Gallery tuples (img, caption)
299
- - dict payloads with path/name
300
- - file path strings (best-effort)
301
- Returns PIL RGB or None.
302
- """
303
- if x is None:
304
- return None
305
-
306
- # Gallery often returns (img, caption)
307
- if isinstance(x, tuple) and len(x) >= 1:
308
- x = x[0]
309
-
310
- # Sometimes dict payloads
311
- if isinstance(x, dict):
312
- path = x.get("path") or x.get("name")
313
- if path:
314
- x = path
315
-
316
- if isinstance(x, Image.Image):
317
- return x.convert("RGB")
318
-
319
- if isinstance(x, np.ndarray):
320
- try:
321
- return Image.fromarray(x).convert("RGB")
322
- except Exception:
323
- return None
324
-
325
- if isinstance(x, str):
326
- # Path-based input: may fail on ZeroGPU worker if file isn't present.
327
- exists = os.path.exists(x)
328
- print(f"[DEBUG][ensure_pil_rgb] {label} got filepath: {x!r} exists={exists}")
329
- if not exists:
330
- # Return None so caller can show a meaningful error (instead of crashing).
331
- return None
332
- try:
333
- return Image.open(x).convert("RGB")
334
- except Exception as e:
335
- print(f"[DEBUG][ensure_pil_rgb] {label} PIL open failed: {e!r}")
336
- return None
337
-
338
- # Last resort
339
- try:
340
- return Image.fromarray(np.array(x)).convert("RGB")
341
- except Exception:
342
- return None
343
-
344
 
345
  # ============================================================
346
- # Derived conditioning (Transformers): Pose + Depth
347
  # ============================================================
348
 
349
  POSE_MODEL_ID = "usyd-community/vitpose-base-simple"
@@ -363,11 +358,9 @@ COCO17_EDGES = [
363
  (12, 14), (14, 16),
364
  ]
365
 
366
-
367
  def _derived_device(use_gpu: bool) -> torch.device:
368
  return torch.device("cuda" if (use_gpu and torch.cuda.is_available()) else "cpu")
369
 
370
-
371
  def _load_pose_models(dev: torch.device):
372
  key = str(dev)
373
  if key in _POSE_CACHE:
@@ -385,7 +378,6 @@ def _load_pose_models(dev: torch.device):
385
  _POSE_CACHE[key] = (det_proc, det_model, pose_proc, pose_model)
386
  return _POSE_CACHE[key]
387
 
388
-
389
  def _load_depth_models(dev: torch.device):
390
  key = str(dev)
391
  if key in _DEPTH_CACHE:
@@ -398,7 +390,6 @@ def _load_depth_models(dev: torch.device):
398
  _DEPTH_CACHE[key] = (proc, model)
399
  return _DEPTH_CACHE[key]
400
 
401
-
402
  def _draw_skeleton_on_blank(
403
  size: tuple[int, int],
404
  persons_keypoints: list[np.ndarray],
@@ -425,15 +416,9 @@ def _draw_skeleton_on_blank(
425
  if sc[i] < kp_thresh:
426
  continue
427
  x, y = float(kps[i, 0]), float(kps[i, 1])
428
- draw.ellipse(
429
- [(x - point_r, y - point_r), (x + point_r, y + point_r)],
430
- fill=(255, 255, 255),
431
- outline=None,
432
- )
433
-
434
  return canvas
435
 
436
-
437
  def make_pose_map(
438
  img: Image.Image,
439
  *,
@@ -481,17 +466,14 @@ def make_pose_map(
481
 
482
  persons_kps, persons_sc = [], []
483
  for pr in pose_results:
484
- kps = pr["keypoints"].detach().cpu().numpy()
485
- sc = pr["scores"].detach().cpu().numpy()
486
- persons_kps.append(kps)
487
- persons_sc.append(sc)
488
 
489
  if not persons_kps:
490
  return Image.new("RGB", img.size, (0, 0, 0))
491
 
492
  return _draw_skeleton_on_blank(img.size, persons_kps, persons_sc)
493
 
494
-
495
  def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
496
  img = img.convert("RGB")
497
  dev = _derived_device(use_gpu)
@@ -520,19 +502,86 @@ def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
520
  return Image.fromarray(depth8, mode="L").convert("RGB")
521
 
522
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
  def _append_to_gallery(existing, new_img: Image.Image):
524
  items = []
525
  if existing:
526
  for it in existing:
527
- pil = ensure_pil_rgb(it, label="gallery_item")
528
  if pil is not None:
529
  items.append(pil)
530
  items.append(new_img)
531
  return items
532
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
533
 
534
  # ============================================================
535
- # LoRA adapters + presets
536
  # ============================================================
537
 
538
  NONE_LORA = "None"
@@ -689,8 +738,8 @@ LORA_PRESET_PROMPTS = {
689
  "Any2Real_2601": "change the picture 1 to realistic photograph",
690
  "Semirealistic-photo-detailer": "transform the image to semi-realistic image",
691
  "AnyPose": "Make the person in image 1 do the exact same pose of the person in image 2. Changing the style and background of the image of the person in image 1 is undesirable, so don't do it. The new pose should be pixel accurate to the pose we are trying to copy. The position of the arms and head and legs should be the same as the pose we are trying to copy. Change the field of view and angle to match exactly image 2. Head tilt and eye gaze pose should match the person in image 2.",
692
- "Hyperrealistic-Portrait": "Transform the image into an ultra-realistic photorealistic portrait with strict identity preservation, facing straight to the camera.",
693
- "Ultrarealistic-Portrait": "Transform the image into an ultra-realistic glamour portrait while strictly preserving the subject’s identity.",
694
  "Upscale2K": "Upscale this picture to 4K resolution.",
695
  "BFS-Best-FaceSwap": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
696
  "BFS-Best-FaceSwap-merge": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
@@ -698,14 +747,14 @@ LORA_PRESET_PROMPTS = {
698
 
699
  LOADED_ADAPTERS = set()
700
 
 
701
  # ============================================================
702
- # Helpers: resolution
703
  # ============================================================
704
 
705
  def _round_to_multiple(x: int, m: int) -> int:
706
  return max(m, (int(x) // m) * m)
707
 
708
-
709
  def compute_canvas_dimensions_from_area(
710
  image: Image.Image,
711
  target_area: int,
@@ -713,35 +762,29 @@ def compute_canvas_dimensions_from_area(
713
  ) -> tuple[int, int]:
714
  w, h = image.size
715
  aspect = w / h if h else 1.0
716
-
717
  from qwenimage.pipeline_qwenimage_edit_plus import calculate_dimensions
718
-
719
  width, height = calculate_dimensions(int(target_area), float(aspect))
720
  width = _round_to_multiple(int(width), int(multiple_of))
721
  height = _round_to_multiple(int(height), int(multiple_of))
722
  return width, height
723
 
724
-
725
  def get_target_area_for_lora(
726
  image: Image.Image,
727
  lora_adapter: str,
728
  user_target_megapixels: float,
729
  ) -> int:
730
  spec = ADAPTER_SPECS.get(lora_adapter, {})
731
-
732
  if "target_area" in spec:
733
  try:
734
  return int(spec["target_area"])
735
  except Exception:
736
  pass
737
-
738
  if "target_megapixels" in spec:
739
  try:
740
  mp = float(spec["target_megapixels"])
741
  return int(mp * 1024 * 1024)
742
  except Exception:
743
  pass
744
-
745
  if "target_long_edge" in spec:
746
  try:
747
  long_edge = int(spec["target_long_edge"])
@@ -755,47 +798,19 @@ def get_target_area_for_lora(
755
  return int(new_w * new_h)
756
  except Exception:
757
  pass
758
-
759
  return int(float(user_target_megapixels) * 1024 * 1024)
760
 
761
 
762
  # ============================================================
763
- # Helpers: multi-input routing
764
  # ============================================================
765
 
766
  def lora_requires_two_images(lora_adapter: str) -> bool:
767
  return bool(ADAPTER_SPECS.get(lora_adapter, {}).get("requires_two_images", False))
768
 
769
-
770
  def image2_label_for_lora(lora_adapter: str) -> str:
771
  return str(ADAPTER_SPECS.get(lora_adapter, {}).get("image2_label", "Upload Reference (Image 2)"))
772
 
773
-
774
- def build_labeled_images(
775
- img1: Image.Image,
776
- img2: Optional[Image.Image],
777
- extra_imgs: Optional[list[Image.Image]],
778
- ) -> dict[str, Image.Image]:
779
- labeled: dict[str, Image.Image] = {}
780
- idx = 1
781
- labeled[f"image_{idx}"] = img1
782
- idx += 1
783
- if img2 is not None:
784
- labeled[f"image_{idx}"] = img2
785
- idx += 1
786
- if extra_imgs:
787
- for im in extra_imgs:
788
- if im is None:
789
- continue
790
- labeled[f"image_{idx}"] = im
791
- idx += 1
792
- return labeled
793
-
794
-
795
- # ============================================================
796
- # Helpers: BFS alpha key fix
797
- # ============================================================
798
-
799
  def _inject_missing_alpha_keys(state_dict: dict) -> dict:
800
  bases = {}
801
  for k, v in state_dict.items():
@@ -808,6 +823,7 @@ def _inject_missing_alpha_keys(state_dict: dict) -> dict:
808
 
809
  for base, rank in bases.items():
810
  alpha_tensor = torch.tensor(float(rank), dtype=torch.float32)
 
811
  full_alpha = f"{base}.alpha"
812
  if full_alpha not in state_dict:
813
  state_dict[full_alpha] = alpha_tensor
@@ -817,10 +833,8 @@ def _inject_missing_alpha_keys(state_dict: dict) -> dict:
817
  stripped_alpha = f"{stripped_base}.alpha"
818
  if stripped_alpha not in state_dict:
819
  state_dict[stripped_alpha] = alpha_tensor
820
-
821
  return state_dict
822
 
823
-
824
  def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
825
  keep_suffixes = (
826
  ".lora_up.weight",
@@ -863,7 +877,6 @@ def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
863
  }
864
  return out, stats
865
 
866
-
867
  def _duplicate_stripped_prefix_keys(state_dict: dict, prefix: str = "diffusion_model.") -> dict:
868
  out = dict(state_dict)
869
  for k, v in list(state_dict.items()):
@@ -874,7 +887,6 @@ def _duplicate_stripped_prefix_keys(state_dict: dict, prefix: str = "diffusion_m
874
  out[stripped] = v
875
  return out
876
 
877
-
878
  def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name: str, needs_alpha_fix: bool = False):
879
  try:
880
  pipe.load_lora_weights(repo, weight_name=weight_name, adapter_name=adapter_name)
@@ -900,11 +912,9 @@ def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name:
900
  f"kept={stats['kept']} dropped_patch={stats['dropped_patch']} "
901
  f"dropped_other={stats['dropped_other']} normalized_alpha={stats['normalized_alpha']}"
902
  )
903
-
904
  pipe.load_lora_weights(sd, adapter_name=adapter_name)
905
  return
906
 
907
-
908
  def _ensure_loaded_and_get_active_adapters(selected_lora: str):
909
  spec = ADAPTER_SPECS.get(selected_lora)
910
  if not spec:
@@ -942,7 +952,6 @@ def _ensure_loaded_and_get_active_adapters(selected_lora: str):
942
 
943
  adapter_names.append(adapter_name)
944
  adapter_weights.append(strength)
945
-
946
  else:
947
  repo = spec["repo"]
948
  weights = spec["weights"]
@@ -998,41 +1007,34 @@ def on_lora_change_ui(selected_lora, current_prompt, current_extras_condition_on
998
  return prompt_update, img2_update, extras_update
999
 
1000
 
 
 
 
 
1001
  def set_output_as_image1(last):
1002
  if last is None:
1003
  raise gr.Error("No output available yet.")
1004
  return gr.update(value=last)
1005
 
1006
-
1007
  def set_output_as_image2(last):
1008
  if last is None:
1009
  raise gr.Error("No output available yet.")
1010
  return gr.update(value=last)
1011
 
1012
-
1013
  def set_output_as_extra(last, existing_extra):
1014
  if last is None:
1015
  raise gr.Error("No output available yet.")
1016
  return _append_to_gallery(existing_extra, last)
1017
 
1018
-
1019
- # ============================================================
1020
- # Derived conditioning UI action
1021
- # ============================================================
1022
-
1023
  @spaces.GPU
1024
  def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu, derived_max_people):
1025
- img1_pil = ensure_pil_rgb(img1, label="image_1_for_derived")
1026
- if img1_pil is None:
1027
- raise gr.Error(
1028
- "Image 1 could not be read in the GPU worker. "
1029
- "If you uploaded via API or a prior session, re-upload the image."
1030
- )
1031
 
1032
  if derived_type == "None":
1033
  return gr.update(value=existing_extra), gr.update(visible=False, value=None)
1034
 
1035
- base = img1_pil.convert("RGB")
1036
 
1037
  if derived_type == "Pose (ViTPose, fast)":
1038
  derived = make_pose_map(base, use_gpu=bool(derived_use_gpu), mode="fast")
@@ -1053,18 +1055,29 @@ def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu, derived
1053
 
1054
 
1055
  # ============================================================
1056
- # Inference
1057
  # ============================================================
1058
 
1059
- def _debug_cuda_mem(prefix="[DEBUG][cuda]"):
1060
  if not torch.cuda.is_available():
1061
  return
1062
  try:
1063
  free, total = torch.cuda.mem_get_info()
1064
- print(f"{prefix} mem free={free/1e9:.2f}GB total={total/1e9:.2f}GB")
1065
- except Exception as e:
1066
- print(f"{prefix} mem_get_info failed: {e!r}")
 
 
 
 
 
 
 
 
1067
 
 
 
 
1068
 
1069
  @spaces.GPU
1070
  def infer(
@@ -1086,31 +1099,12 @@ def infer(
1086
  if torch.cuda.is_available():
1087
  torch.cuda.empty_cache()
1088
 
1089
- # -------------------- Robust image reads --------------------
1090
- img1 = ensure_pil_rgb(input_image_1, label="input_image_1")
1091
- img2 = ensure_pil_rgb(input_image_2, label="input_image_2") if input_image_2 is not None else None
1092
 
1093
- # Debug what we actually received (helps track unexpected path payloads)
1094
- print("[DEBUG][infer] input types:",
1095
- type(input_image_1), type(input_image_2), type(input_images_extra))
1096
- print("[DEBUG][infer] img1_ok:", img1 is not None, "img2_ok:", (img2 is not None))
1097
 
1098
- if img1 is None:
1099
- raise gr.Error(
1100
- "Could not read Image 1 inside the GPU worker. "
1101
- "This can happen if Gradio passed a temp file path that isn't present in the worker. "
1102
- "Please re-upload Image 1 and try again."
1103
- )
1104
-
1105
- # Normalize extra images (Gallery) to PIL RGB
1106
- extra_imgs: list[Image.Image] = []
1107
- if input_images_extra:
1108
- for i, item in enumerate(input_images_extra):
1109
- pil = ensure_pil_rgb(item, label=f"gallery[{i}]")
1110
- if pil is not None:
1111
- extra_imgs.append(pil)
1112
-
1113
- # -------------------- Adapter handling --------------------
1114
  if lora_adapter == NONE_LORA:
1115
  try:
1116
  pipe.set_adapters([], adapter_weights=[])
@@ -1121,7 +1115,6 @@ def infer(
1121
  adapter_names, adapter_weights = _ensure_loaded_and_get_active_adapters(lora_adapter)
1122
  pipe.set_adapters(adapter_names, adapter_weights=adapter_weights)
1123
 
1124
- # -------------------- Seed / prompts --------------------
1125
  if randomize_seed:
1126
  seed = random.randint(0, MAX_SEED)
1127
 
@@ -1132,17 +1125,34 @@ def infer(
1132
  "extra digit, fewer digits, cropped, jpeg artifacts, signature, watermark, username, blurry"
1133
  )
1134
 
1135
- # If a LoRA requires image2, enforce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1136
  if lora_requires_two_images(lora_adapter) and img2 is None:
1137
  raise gr.Error("This LoRA needs two images. Please upload Image 2 as well.")
1138
 
1139
- # Label images as image_1, image_2, image_3...
1140
  labeled = build_labeled_images(img1, img2, extra_imgs)
 
1141
  pipe_images = list(labeled.values())
1142
  if len(pipe_images) == 1:
1143
  pipe_images = pipe_images[0]
1144
 
1145
- # Canvas sizing
1146
  target_area = get_target_area_for_lora(img1, lora_adapter, float(target_megapixels))
1147
  width, height = compute_canvas_dimensions_from_area(
1148
  img1,
@@ -1150,42 +1160,29 @@ def infer(
1150
  multiple_of=int(pipe.vae_scale_factor * 2),
1151
  )
1152
 
1153
- # VAE indices (conditioning-only extras)
1154
  vae_image_indices = None
1155
  if extras_condition_only:
1156
  if isinstance(pipe_images, list) and len(pipe_images) > 2:
1157
  vae_image_indices = [0, 1] if len(pipe_images) >= 2 else [0]
1158
 
1159
- # -------------------- Extra debug logs --------------------
1160
- prompt_s = "" if prompt is None else str(prompt)
 
1161
  print(
1162
  "[DEBUG][infer] submitting request | "
1163
- f"lora_adapter={lora_adapter!r} seed={seed} "
1164
- f"prompt_len={len(prompt_s)} steps={steps} true_cfg_scale={guidance_scale} "
1165
- f"target_mp={target_megapixels} canvas=({width}x{height}) "
1166
- f"n_images={(len(pipe_images) if isinstance(pipe_images, list) else 1)} "
1167
- f"vae_image_indices={vae_image_indices} pad_to_canvas={bool(pad_to_canvas)}"
1168
  )
1169
- print("[DEBUG][infer] image_1 size:", img1.size, "image_2 size:", (img2.size if img2 else None))
1170
- if extra_imgs:
1171
- print("[DEBUG][infer] extra_imgs:", [im.size for im in extra_imgs][:8], ("..." if len(extra_imgs) > 8 else ""))
1172
 
1173
- # Optional: log token count (if tokenizer exists)
1174
- try:
1175
- tok = getattr(pipe, "tokenizer", None)
1176
- if tok is not None and prompt_s:
1177
- ids = tok(prompt_s, return_tensors="pt", truncation=True).input_ids
1178
- print("[DEBUG][infer] prompt token_count:", int(ids.shape[-1]))
1179
- except Exception as e:
1180
- print("[DEBUG][infer] token_count failed:", repr(e))
1181
-
1182
- _debug_cuda_mem(prefix="[DEBUG][cuda][before]")
1183
-
1184
- # -------------------- Run --------------------
1185
  try:
1186
  result = pipe(
1187
  image=pipe_images,
1188
- prompt=prompt_s,
1189
  negative_prompt=negative_prompt,
1190
  height=height,
1191
  width=width,
@@ -1196,13 +1193,15 @@ def infer(
1196
  pad_to_canvas=bool(pad_to_canvas),
1197
  ).images[0]
1198
  return result, seed, result
1199
- except Exception:
 
1200
  print("---- [ERROR][infer] exception ----")
1201
  print(traceback.format_exc())
1202
  print("---------------------------------")
1203
  raise
 
1204
  finally:
1205
- _debug_cuda_mem(prefix="[DEBUG][cuda][after]")
1206
  gc.collect()
1207
  if torch.cuda.is_available():
1208
  torch.cuda.empty_cache()
@@ -1210,24 +1209,17 @@ def infer(
1210
 
1211
  @spaces.GPU
1212
  def infer_example(input_image, prompt, lora_adapter):
1213
- img = ensure_pil_rgb(input_image, label="example_image")
1214
- if img is None:
1215
  return None, 0, None
 
1216
  guidance_scale = 1.0
1217
  steps = 4
1218
  result, seed, last = infer(
1219
- img,
1220
- None,
1221
- None,
1222
- prompt,
1223
- lora_adapter,
1224
- 0,
1225
- True,
1226
- guidance_scale,
1227
- steps,
1228
- 1.0,
1229
- True,
1230
- True,
1231
  )
1232
  return result, seed, last
1233
 
@@ -1253,9 +1245,9 @@ with gr.Blocks() as demo:
1253
  with gr.Column(elem_id="col-container"):
1254
  gr.Markdown("# **Qwen-Image-Edit-2511-LoRAs-Fast**", elem_id="main-title")
1255
  gr.Markdown(
1256
- "Perform diverse image edits using specialized LoRA adapters for "
1257
- "Qwen-Image-Edit-2511. Includes safeguards for ZeroGPU file-path uploads "
1258
- "and a RoPE patch to avoid certain CUDA cublas failures."
1259
  )
1260
  gr.Markdown(aio_status_line)
1261
 
@@ -1283,6 +1275,7 @@ with gr.Blocks() as demo:
1283
 
1284
  with gr.Column():
1285
  output_image = gr.Image(label="Output Image", interactive=False, format="png", height=353)
 
1286
  last_output = gr.State(value=None)
1287
 
1288
  with gr.Row():
@@ -1348,7 +1341,6 @@ with gr.Blocks() as demo:
1348
  value=True,
1349
  )
1350
 
1351
- # On LoRA selection: preset prompt + toggle Image 2
1352
  lora_adapter.change(
1353
  fn=on_lora_change_ui,
1354
  inputs=[lora_adapter, prompt, extras_condition_only],
@@ -1360,6 +1352,26 @@ with gr.Blocks() as demo:
1360
  ["examples/5.jpg", "Remove shadows and relight the image using soft lighting.", "Light-Restoration"],
1361
  ["examples/4.jpg", "Use a subtle golden-hour filter with smooth light diffusion.", "Relight"],
1362
  ["examples/2.jpeg", "Rotate the camera 45 degrees to the left.", "Multiple-Angles"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1363
  ["examples/11.jpg", "Upscale this picture to 4K resolution.", "Upscale2K"],
1364
  ],
1365
  inputs=[input_image_1, prompt, lora_adapter],
@@ -1388,12 +1400,10 @@ with gr.Blocks() as demo:
1388
  outputs=[output_image, seed, last_output],
1389
  )
1390
 
1391
- # Output routing buttons
1392
  btn_out_to_img1.click(fn=set_output_as_image1, inputs=[last_output], outputs=[input_image_1])
1393
  btn_out_to_img2.click(fn=set_output_as_image2, inputs=[last_output], outputs=[input_image_2])
1394
  btn_out_to_extra.click(fn=set_output_as_extra, inputs=[last_output, input_images_extra], outputs=[input_images_extra])
1395
 
1396
- # Derived conditioning button
1397
  add_derived_btn.click(
1398
  fn=add_derived_ref,
1399
  inputs=[input_image_1, input_images_extra, derived_type, derived_use_gpu, derived_max_people],
 
1
  # app.py
 
 
 
 
 
 
2
  import os
3
  import re
4
  import gc
5
+ import uuid
6
+ import time
7
+ import math
8
  import traceback
9
  import random
 
10
  from typing import Iterable, Optional
11
 
 
 
 
 
 
 
 
 
 
 
 
12
  import gradio as gr
13
  import numpy as np
14
  import spaces
 
29
  from gradio.themes import Soft
30
  from gradio.themes.utils import colors, fonts, sizes
31
 
32
+
33
+ # ============================================================
34
+ # Process-unique temp dir (helps avoid /tmp collisions)
35
+ # ============================================================
36
+
37
def _ensure_unique_gradio_tmp():
    """
    Point Gradio at a process-unique temp directory unless one is already set.

    If GRADIO_TEMP_DIR is already set to a non-empty value it is only logged
    and left untouched. Otherwise the variable is set to
    ``/tmp/gradio_<pid>_<8 hex chars>`` and the directory is created on a
    best-effort basis: a creation failure is reported but never raised.
    """
    configured = os.environ.get("GRADIO_TEMP_DIR")
    if configured:
        print(f"GRADIO_TEMP_DIR = {configured}")
        return

    tmp = f"/tmp/gradio_{os.getpid()}_{uuid.uuid4().hex[:8]}"
    os.environ["GRADIO_TEMP_DIR"] = tmp
    try:
        os.makedirs(tmp, exist_ok=True)
    except OSError as e:
        # Still best-effort (startup must not crash here), but surface the
        # failure: the env var now names a directory that does not exist.
        print(f"[WARN] could not create GRADIO_TEMP_DIR {tmp}: {e}")
    print(f"GRADIO_TEMP_DIR = {tmp}")
53
+
54
+ _ensure_unique_gradio_tmp()
55
+
56
+
57
  # ============================================================
58
+ # Patch: Qwen2.5-VL RoPE (avoid cublas batched GEMM; preserve shapes)
59
  # ============================================================
60
 
61
def _rope_cos_sin_broadcast(inv_freq, position_ids, rope_dim=None, attention_scaling=None, dtype=None):
    """
    Build rotary (cos, sin) tables with a broadcast multiply instead of a matmul.

    inv_freq is expected to be 1-D with dim/2 entries. position_ids may be
      * 3-D ``(k, bs, seq)``  -> result ``(k, bs, seq, dim)``  (every leading axis kept)
      * 2-D ``(bs, seq)``     -> result ``(bs, 1, seq, dim)``  (legacy layout of this patch)

    The elementwise product equals the inner-dimension-1 matmul it replaces.
    rope_dim, when given, trims the last axis; attention_scaling, when given,
    scales both tables; dtype, when given, is the output dtype.
    """
    pos = position_ids.to(device=inv_freq.device).float()
    if pos.ndim == 2:
        pos = pos[:, None, :]  # keep the historical (bs, 1, seq, dim) output for 2-D input

    # (..., seq, 1) * (dim/2,) -> (..., seq, dim/2); no batch axis is flattened.
    freqs = pos[..., None] * inv_freq.float()

    # Double to the full rotary dim.
    emb = torch.cat((freqs, freqs), dim=-1)
    if rope_dim is not None and emb.shape[-1] != rope_dim:
        emb = emb[..., :rope_dim]

    cos = emb.cos()
    sin = emb.sin()

    if attention_scaling is not None:
        cos = cos * attention_scaling
        sin = sin * attention_scaling

    if dtype is not None:
        cos = cos.to(dtype=dtype)
        sin = sin.to(dtype=dtype)

    return cos, sin


def patch_qwen25vl_rope_no_gemm():
    """
    Patch Qwen2.5-VL rotary embedding to avoid the matmul that can trigger:
      CUBLAS_STATUS_INVALID_VALUE (cublasSgemmStridedBatched)
    on some ZeroGPU/H200 MIG configurations.

    CRITICAL: Preserve exact output shapes used by apply_multimodal_rotary_pos_emb,
    otherwise you get split_with_sizes mismatches. In particular the batch axis of a
    3-D position_ids tensor must not be folded into the sequence axis.

    Set DISABLE_ROPE_PATCH=1 to skip the patch and DEBUG_ROPE=1 to print shape info
    on every forward call. The function is a no-op (with a log line) when the
    modeling module or a rotary class cannot be found, and it does not patch twice.
    """
    if os.environ.get("DISABLE_ROPE_PATCH", "").strip() == "1":
        print("[patch][rope] DISABLE_ROPE_PATCH=1 -> skipping patch.")
        return

    try:
        from transformers.models.qwen2_5_vl import modeling_qwen2_5_vl as qvl
    except Exception as e:
        print(f"[patch][rope] could not import qwen2_5_vl modeling: {e}")
        return

    Rotary = None
    for name in ["Qwen2_5_VLRotaryEmbedding", "Qwen2_5RotaryEmbedding", "RotaryEmbedding"]:
        Rotary = getattr(qvl, name, None)
        if Rotary is not None:
            break

    if Rotary is None:
        print("[patch][rope] rotary embedding class not found; no patch applied.")
        return

    # Avoid stacking the wrapper on itself if this function is called again.
    if getattr(Rotary.forward, "_rope_no_gemm_patch", False):
        return

    orig_forward = Rotary.forward

    def forward_no_gemm(self, x, position_ids):
        # Fallback to original if structure differs from what the broadcast path handles
        if (
            not hasattr(self, "inv_freq")
            or position_ids is None
            or position_ids.ndim not in (2, 3)
        ):
            return orig_forward(self, x, position_ids)

        # Determine rotary dim from module config (NOT x.shape[-1])
        if hasattr(self, "dim") and isinstance(self.dim, int):
            rope_dim = int(self.dim)
        else:
            rope_dim = int(self.inv_freq.numel() * 2)

        # Computed on inv_freq's device; cast to x.dtype like the upstream forward.
        cos, sin = _rope_cos_sin_broadcast(
            self.inv_freq,
            position_ids,
            rope_dim=rope_dim,
            attention_scaling=getattr(self, "attention_scaling", None),
            dtype=x.dtype,
        )

        # Optional debug (enable by env)
        if os.environ.get("DEBUG_ROPE", "").strip() == "1":
            ms = getattr(self, "mrope_section", None)
            if ms is not None:
                try:
                    ms_list = list(ms)
                    print(f"[DEBUG][rope] rope_dim={rope_dim} cos_last={cos.shape[-1]} mrope_sum={sum(ms_list)} mrope={ms_list}")
                except Exception:
                    print(f"[DEBUG][rope] rope_dim={rope_dim} cos_last={cos.shape[-1]} (mrope_section unreadable)")
            else:
                print(f"[DEBUG][rope] rope_dim={rope_dim} cos_last={cos.shape[-1]} (no mrope_section attr)")

        return cos, sin

    forward_no_gemm._rope_no_gemm_patch = True
    Rotary.forward = forward_no_gemm
    print("[patch] Patched Qwen2.5-VL RoPE matmul -> broadcast multiply (shape-preserving).")
155
+
156
+ patch_qwen25vl_rope_no_gemm()
157
 
 
158
 
159
  # ============================================================
160
  # Theme
 
175
  c950="#802200",
176
  )
177
 
 
178
  class OrangeRedTheme(Soft):
179
  def __init__(
180
  self,
 
230
  block_label_background_fill="*primary_200",
231
  )
232
 
 
233
  orange_red_theme = OrangeRedTheme()
234
 
235
+
236
  # ============================================================
237
+ # Device / Env debug
238
  # ============================================================
239
 
240
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
241
 
 
 
242
  print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
243
  print("torch.__version__ =", torch.__version__)
244
  print("torch.version.cuda =", torch.version.cuda)
 
249
  print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
250
  print("Using device:", device)
251
 
252
+ # Optional: allow TF32 in matmul/cuDNN kernels (faster on Ampere+ GPUs, at slightly reduced float32 precision)
253
+ try:
254
+ torch.backends.cuda.matmul.allow_tf32 = True
255
+ torch.backends.cudnn.allow_tf32 = True
256
+ except Exception:
257
+ pass
258
+
259
+
260
  # ============================================================
261
  # AIO version (Space variable)
262
  # ============================================================
 
267
  _VER_RE = re.compile(r"^v\d+$")
268
  _DIGITS_RE = re.compile(r"^\d+$")
269
 
 
270
  def _normalize_version(raw: str) -> Optional[str]:
271
  if raw is None:
272
  return None
 
279
  return f"v{s}"
280
  return None
281
 
 
282
  _AIO_ENV_RAW = os.environ.get("AIO_VERSION", "")
283
  _AIO_ENV_NORM = _normalize_version(_AIO_ENV_RAW)
284
 
 
289
  print(f"AIO_VERSION (normalized) = {_AIO_ENV_NORM!r}")
290
  print(f"Using AIO_VERSION = {AIO_VERSION} ({AIO_VERSION_SOURCE})")
291
 
292
+
293
  # ============================================================
294
  # Pipeline
295
  # ============================================================
 
301
 
302
  dtype = torch.bfloat16
303
 
 
304
  def _load_pipe_with_version(version: str) -> QwenImageEditPlusPipeline:
305
  sub = f"{version}/transformer"
306
  print(f"📦 Loading AIO transformer: {AIO_REPO_ID} / {sub}")
 
316
  ).to(device)
317
  return p
318
 
 
 
319
  try:
320
  pipe = _load_pipe_with_version(AIO_VERSION)
321
  except Exception:
 
336
 
337
  MAX_SEED = np.iinfo(np.int32).max
338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
 
340
  # ============================================================
341
+ # Derived conditioning (Transformers): Pose + Depth (v1-style)
342
  # ============================================================
343
 
344
  POSE_MODEL_ID = "usyd-community/vitpose-base-simple"
 
358
  (12, 14), (14, 16),
359
  ]
360
 
 
361
def _derived_device(use_gpu: bool) -> torch.device:
    """Return the CUDA device when requested and available, otherwise the CPU device."""
    if use_gpu and torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")
363
 
 
364
  def _load_pose_models(dev: torch.device):
365
  key = str(dev)
366
  if key in _POSE_CACHE:
 
378
  _POSE_CACHE[key] = (det_proc, det_model, pose_proc, pose_model)
379
  return _POSE_CACHE[key]
380
 
 
381
  def _load_depth_models(dev: torch.device):
382
  key = str(dev)
383
  if key in _DEPTH_CACHE:
 
390
  _DEPTH_CACHE[key] = (proc, model)
391
  return _DEPTH_CACHE[key]
392
 
 
393
  def _draw_skeleton_on_blank(
394
  size: tuple[int, int],
395
  persons_keypoints: list[np.ndarray],
 
416
  if sc[i] < kp_thresh:
417
  continue
418
  x, y = float(kps[i, 0]), float(kps[i, 1])
419
+ draw.ellipse([(x - point_r, y - point_r), (x + point_r, y + point_r)], fill=(255, 255, 255))
 
 
 
 
 
420
  return canvas
421
 
 
422
  def make_pose_map(
423
  img: Image.Image,
424
  *,
 
466
 
467
  persons_kps, persons_sc = [], []
468
  for pr in pose_results:
469
+ persons_kps.append(pr["keypoints"].detach().cpu().numpy())
470
+ persons_sc.append(pr["scores"].detach().cpu().numpy())
 
 
471
 
472
  if not persons_kps:
473
  return Image.new("RGB", img.size, (0, 0, 0))
474
 
475
  return _draw_skeleton_on_blank(img.size, persons_kps, persons_sc)
476
 
 
477
  def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
478
  img = img.convert("RGB")
479
  dev = _derived_device(use_gpu)
 
502
  return Image.fromarray(depth8, mode="L").convert("RGB")
503
 
504
 
505
+ # ============================================================
506
+ # Helpers: gallery normalization + debug-friendly PIL conversion
507
+ # ============================================================
508
+
509
def _to_pil_rgb(x) -> Optional[Image.Image]:
    """
    Convert a Gradio-style image value to a PIL RGB image.

    Accepts PIL images, numpy arrays, (image, caption) tuples, dicts carrying a
    file path under 'name' or 'path', and plain file paths (str / os.PathLike).
    Returns None for None input, for missing or unreadable paths, and for any
    value that cannot be converted.
    """
    if x is None:
        return None

    # Gallery often returns (image, caption)
    if isinstance(x, tuple) and len(x) >= 1:
        x = x[0]
        if x is None:
            return None

    # Resolve a file path from a dict payload or from a bare path value.
    path = None
    if isinstance(x, dict):
        # common keys: 'name' or 'path'
        candidate = x.get("name") or x.get("path")
        if isinstance(candidate, str):
            path = candidate
    elif isinstance(x, (str, os.PathLike)):
        path = os.fspath(x)

    if path is not None:
        if not os.path.exists(path):
            print(f"[WARN] extra image path missing, skipping: {path}")
            return None
        try:
            # Close the file handle once the pixels are copied into the RGB image.
            with Image.open(path) as opened:
                return opened.convert("RGB")
        except Exception as e:
            print(f"[WARN] failed to open extra image path {path}: {e}")
            return None

    if isinstance(x, Image.Image):
        return x.convert("RGB")

    if isinstance(x, np.ndarray):
        try:
            return Image.fromarray(x).convert("RGB")
        except Exception:
            return None

    # last resort: anything numpy can turn into an array (dicts without a path end up here too)
    try:
        return Image.fromarray(np.array(x)).convert("RGB")
    except Exception:
        return None
551
+
552
def _append_to_gallery(existing, new_img: Image.Image):
    """
    Return a new list: every convertible item of `existing` as PIL RGB, then `new_img`.

    Items that `_to_pil_rgb` cannot convert are dropped; `new_img` is appended as-is.
    """
    converted = (_to_pil_rgb(entry) for entry in (existing or ()))
    gallery = [picture for picture in converted if picture is not None]
    gallery.append(new_img)
    return gallery
561
 
562
def build_labeled_images(
    img1: Image.Image,
    img2: Optional[Image.Image],
    extra_imgs: Optional[list[Image.Image]],
) -> dict[str, Image.Image]:
    """
    Map the supplied images to consecutive keys "image_1", "image_2", ...

    img1 always comes first, img2 follows when it is not None, then every
    non-None entry of extra_imgs in order. Numbering has no gaps.
    """
    ordered = [img1]
    if img2 is not None:
        ordered.append(img2)
    ordered.extend(extra for extra in (extra_imgs or []) if extra is not None)
    return {f"image_{position}": picture for position, picture in enumerate(ordered, start=1)}
581
+
582
 
583
  # ============================================================
584
+ # LoRA adapters + presets (your v1 config)
585
  # ============================================================
586
 
587
  NONE_LORA = "None"
 
738
  "Any2Real_2601": "change the picture 1 to realistic photograph",
739
  "Semirealistic-photo-detailer": "transform the image to semi-realistic image",
740
  "AnyPose": "Make the person in image 1 do the exact same pose of the person in image 2. Changing the style and background of the image of the person in image 1 is undesirable, so don't do it. The new pose should be pixel accurate to the pose we are trying to copy. The position of the arms and head and legs should be the same as the pose we are trying to copy. Change the field of view and angle to match exactly image 2. Head tilt and eye gaze pose should match the person in image 2.",
741
+ "Hyperrealistic-Portrait": "Transform the image into an ultra-realistic photorealistic portrait with strict identity preservation, facing straight to the camera. Enhance pore-level skin textures, realistic moisture effects, and natural wet hair clumping against the skin. Apply cool-toned soft-box lighting with subtle highlights and shadows, maintain realistic green-hazel eye catchlights without synthetic gloss, and preserve soft natural lip texture. Use shallow depth of field with a clean bokeh background, an 85mm macro photographic look, and raw photo grading without retouching to maintain realism and original details.",
742
+ "Ultrarealistic-Portrait": "Transform the image into an ultra-realistic glamour portrait while strictly preserving the subject’s identity. Apply a close-up composition with a slight head tilt and a hand near the face, enhance cinematic directional lighting with dramatic fashion-style highlights, and refine makeup details including glowing skin, glossy lips, luminous highlighter, and defined eyes. Increase skin realism with detailed epidermal textures such as micropores, microhairs, subtle oil sheen, natural highlights, soft wrinkles, and subsurface scattering. Maintain a luxury fashion-magazine look in a 9:16 aspect ratio, preserving realism, facial structure, and original details without over-smoothing or retouching.",
743
  "Upscale2K": "Upscale this picture to 4K resolution.",
744
  "BFS-Best-FaceSwap": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
745
  "BFS-Best-FaceSwap-merge": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
 
747
 
748
  LOADED_ADAPTERS = set()
749
 
750
+
751
  # ============================================================
752
+ # Helpers: resolution (area-based sizing)
753
  # ============================================================
754
 
755
def _round_to_multiple(x: int, m: int) -> int:
    """Round int(x) down to a multiple of m, but never return less than m itself."""
    whole_steps, _ = divmod(int(x), m)
    floored = whole_steps * m
    return floored if floored > m else m
757
 
 
758
  def compute_canvas_dimensions_from_area(
759
  image: Image.Image,
760
  target_area: int,
 
762
  ) -> tuple[int, int]:
763
  w, h = image.size
764
  aspect = w / h if h else 1.0
 
765
  from qwenimage.pipeline_qwenimage_edit_plus import calculate_dimensions
 
766
  width, height = calculate_dimensions(int(target_area), float(aspect))
767
  width = _round_to_multiple(int(width), int(multiple_of))
768
  height = _round_to_multiple(int(height), int(multiple_of))
769
  return width, height
770
 
 
771
  def get_target_area_for_lora(
772
  image: Image.Image,
773
  lora_adapter: str,
774
  user_target_megapixels: float,
775
  ) -> int:
776
  spec = ADAPTER_SPECS.get(lora_adapter, {})
 
777
  if "target_area" in spec:
778
  try:
779
  return int(spec["target_area"])
780
  except Exception:
781
  pass
 
782
  if "target_megapixels" in spec:
783
  try:
784
  mp = float(spec["target_megapixels"])
785
  return int(mp * 1024 * 1024)
786
  except Exception:
787
  pass
 
788
  if "target_long_edge" in spec:
789
  try:
790
  long_edge = int(spec["target_long_edge"])
 
798
  return int(new_w * new_h)
799
  except Exception:
800
  pass
 
801
  return int(float(user_target_megapixels) * 1024 * 1024)
802
 
803
 
804
  # ============================================================
805
+ # Helpers: LoRA routing + BFS alpha fixes (your v1 logic)
806
  # ============================================================
807
 
808
def lora_requires_two_images(lora_adapter: str) -> bool:
    """Report whether the adapter's spec carries a truthy ``requires_two_images`` flag."""
    spec = ADAPTER_SPECS.get(lora_adapter, {})
    needs_pair = spec.get("requires_two_images", False)
    return bool(needs_pair)
810
 
 
811
def image2_label_for_lora(lora_adapter: str) -> str:
    """Return the adapter spec's ``image2_label`` as a string, or the generic default label."""
    spec = ADAPTER_SPECS.get(lora_adapter, {})
    label = spec.get("image2_label", "Upload Reference (Image 2)")
    return str(label)
813
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
814
  def _inject_missing_alpha_keys(state_dict: dict) -> dict:
815
  bases = {}
816
  for k, v in state_dict.items():
 
823
 
824
  for base, rank in bases.items():
825
  alpha_tensor = torch.tensor(float(rank), dtype=torch.float32)
826
+
827
  full_alpha = f"{base}.alpha"
828
  if full_alpha not in state_dict:
829
  state_dict[full_alpha] = alpha_tensor
 
833
  stripped_alpha = f"{stripped_base}.alpha"
834
  if stripped_alpha not in state_dict:
835
  state_dict[stripped_alpha] = alpha_tensor
 
836
  return state_dict
837
 
 
838
  def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
839
  keep_suffixes = (
840
  ".lora_up.weight",
 
877
  }
878
  return out, stats
879
 
 
880
  def _duplicate_stripped_prefix_keys(state_dict: dict, prefix: str = "diffusion_model.") -> dict:
881
  out = dict(state_dict)
882
  for k, v in list(state_dict.items()):
 
887
  out[stripped] = v
888
  return out
889
 
 
890
  def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name: str, needs_alpha_fix: bool = False):
891
  try:
892
  pipe.load_lora_weights(repo, weight_name=weight_name, adapter_name=adapter_name)
 
912
  f"kept={stats['kept']} dropped_patch={stats['dropped_patch']} "
913
  f"dropped_other={stats['dropped_other']} normalized_alpha={stats['normalized_alpha']}"
914
  )
 
915
  pipe.load_lora_weights(sd, adapter_name=adapter_name)
916
  return
917
 
 
918
  def _ensure_loaded_and_get_active_adapters(selected_lora: str):
919
  spec = ADAPTER_SPECS.get(selected_lora)
920
  if not spec:
 
952
 
953
  adapter_names.append(adapter_name)
954
  adapter_weights.append(strength)
 
955
  else:
956
  repo = spec["repo"]
957
  weights = spec["weights"]
 
1007
  return prompt_update, img2_update, extras_update
1008
 
1009
 
1010
+ # ============================================================
1011
+ # UI helpers: output routing + derived conditioning
1012
+ # ============================================================
1013
+
1014
def set_output_as_image1(last):
    """Return a Gradio update carrying `last` as the new value; raise gr.Error if there is none."""
    if last is not None:
        return gr.update(value=last)
    raise gr.Error("No output available yet.")
1018
 
 
1019
def set_output_as_image2(last):
    """Return a Gradio update carrying `last` as the new value; raise gr.Error if there is none."""
    if last is not None:
        return gr.update(value=last)
    raise gr.Error("No output available yet.")
1023
 
 
1024
def set_output_as_extra(last, existing_extra):
    """Return `existing_extra` normalized via _append_to_gallery with `last` appended; raise gr.Error if `last` is None."""
    if last is not None:
        return _append_to_gallery(existing_extra, last)
    raise gr.Error("No output available yet.")
1028
 
 
 
 
 
 
1029
  @spaces.GPU
1030
  def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu, derived_max_people):
1031
+ if img1 is None:
1032
+ raise gr.Error("Please upload Image 1 first.")
 
 
 
 
1033
 
1034
  if derived_type == "None":
1035
  return gr.update(value=existing_extra), gr.update(visible=False, value=None)
1036
 
1037
+ base = img1.convert("RGB")
1038
 
1039
  if derived_type == "Pose (ViTPose, fast)":
1040
  derived = make_pose_map(base, use_gpu=bool(derived_use_gpu), mode="fast")
 
1055
 
1056
 
1057
  # ============================================================
1058
+ # Debug helpers (CUDA mem + token count)
1059
  # ============================================================
1060
 
1061
def _cuda_mem(prefix: str):
    """Print free/total CUDA memory in GB tagged with `prefix`; do nothing without CUDA or on any error."""
    if torch.cuda.is_available():
        try:
            free, total = torch.cuda.mem_get_info()
            print(f"[DEBUG][cuda][{prefix}] mem free={free/1e9:.2f}GB total={total/1e9:.2f}GB")
        except Exception:
            # Debug-only output: never let a failed query affect the caller.
            pass
1069
+
1070
def _approx_token_count(text: str) -> int:
    """
    Estimate the token count of `text` for debug output only (no tokenizer call).

    Uses the ~4 characters per token rule of thumb, rounded up; empty or None
    text counts as 0 and any non-empty text as at least 1.
    """
    if text:
        quarters = -(-len(text) // 4)  # integer ceiling of len(text) / 4
        return quarters if quarters > 1 else 1
    return 0
1076
+
1077
 
1078
+ # ============================================================
1079
+ # Inference
1080
+ # ============================================================
1081
 
1082
  @spaces.GPU
1083
  def infer(
 
1099
  if torch.cuda.is_available():
1100
  torch.cuda.empty_cache()
1101
 
1102
+ print("[DEBUG][infer] input types:", type(input_image_1), type(input_image_2), type(input_images_extra))
 
 
1103
 
1104
+ if input_image_1 is None:
1105
+ raise gr.Error("Please upload Image 1.")
 
 
1106
 
1107
+ # Handle "None"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1108
  if lora_adapter == NONE_LORA:
1109
  try:
1110
  pipe.set_adapters([], adapter_weights=[])
 
1115
  adapter_names, adapter_weights = _ensure_loaded_and_get_active_adapters(lora_adapter)
1116
  pipe.set_adapters(adapter_names, adapter_weights=adapter_weights)
1117
 
 
1118
  if randomize_seed:
1119
  seed = random.randint(0, MAX_SEED)
1120
 
 
1125
  "extra digit, fewer digits, cropped, jpeg artifacts, signature, watermark, username, blurry"
1126
  )
1127
 
1128
+ img1 = input_image_1.convert("RGB") if isinstance(input_image_1, Image.Image) else _to_pil_rgb(input_image_1)
1129
+ if img1 is None:
1130
+ raise gr.Error("Image 1 could not be read (unexpected input type/path).")
1131
+
1132
+ img2 = None
1133
+ if input_image_2 is not None:
1134
+ img2 = input_image_2.convert("RGB") if isinstance(input_image_2, Image.Image) else _to_pil_rgb(input_image_2)
1135
+ if img2 is None:
1136
+ raise gr.Error("Image 2 could not be read (unexpected input type/path).")
1137
+
1138
+ # Normalize extra images (Gallery)
1139
+ extra_imgs: list[Image.Image] = []
1140
+ if input_images_extra:
1141
+ for item in input_images_extra:
1142
+ pil = _to_pil_rgb(item)
1143
+ if pil is not None:
1144
+ extra_imgs.append(pil)
1145
+
1146
+ # Enforce 2-image LoRA behavior
1147
  if lora_requires_two_images(lora_adapter) and img2 is None:
1148
  raise gr.Error("This LoRA needs two images. Please upload Image 2 as well.")
1149
 
 
1150
  labeled = build_labeled_images(img1, img2, extra_imgs)
1151
+
1152
  pipe_images = list(labeled.values())
1153
  if len(pipe_images) == 1:
1154
  pipe_images = pipe_images[0]
1155
 
 
1156
  target_area = get_target_area_for_lora(img1, lora_adapter, float(target_megapixels))
1157
  width, height = compute_canvas_dimensions_from_area(
1158
  img1,
 
1160
  multiple_of=int(pipe.vae_scale_factor * 2),
1161
  )
1162
 
 
1163
  vae_image_indices = None
1164
  if extras_condition_only:
1165
  if isinstance(pipe_images, list) and len(pipe_images) > 2:
1166
  vae_image_indices = [0, 1] if len(pipe_images) >= 2 else [0]
1167
 
1168
+ # Debug summary
1169
+ n_images = len(pipe_images) if isinstance(pipe_images, list) else 1
1170
+ tok_est = _approx_token_count(prompt or "")
1171
  print(
1172
  "[DEBUG][infer] submitting request | "
1173
+ f"lora_adapter={lora_adapter!r} seed={seed} prompt_len={len(prompt or '')} "
1174
+ f"steps={steps} true_cfg_scale={guidance_scale} target_mp={target_megapixels} "
1175
+ f"canvas=({width}x{height}) n_images={n_images} vae_image_indices={vae_image_indices} "
1176
+ f"pad_to_canvas={bool(pad_to_canvas)}"
 
1177
  )
1178
+ print(f"[DEBUG][infer] image_1 size: {img1.size} image_2 size: {img2.size if img2 else None}")
1179
+ print(f"[DEBUG][infer] prompt token_estimate: {tok_est}")
1180
+ _cuda_mem("before")
1181
 
 
 
 
 
 
 
 
 
 
 
 
 
1182
  try:
1183
  result = pipe(
1184
  image=pipe_images,
1185
+ prompt=prompt,
1186
  negative_prompt=negative_prompt,
1187
  height=height,
1188
  width=width,
 
1193
  pad_to_canvas=bool(pad_to_canvas),
1194
  ).images[0]
1195
  return result, seed, result
1196
+
1197
+ except Exception as e:
1198
  print("---- [ERROR][infer] exception ----")
1199
  print(traceback.format_exc())
1200
  print("---------------------------------")
1201
  raise
1202
+
1203
  finally:
1204
+ _cuda_mem("after")
1205
  gc.collect()
1206
  if torch.cuda.is_available():
1207
  torch.cuda.empty_cache()
 
1209
 
1210
@spaces.GPU
def infer_example(input_image, prompt, lora_adapter):
    """
    Run `infer` on a single image with fixed settings and return its (result, seed, last) triple.

    Returns (None, 0, None) without calling `infer` when no image is given.
    NOTE(review): presumably the callback behind the examples table - confirm at the
    gr.Examples wiring. The input is assumed to expose `.convert` (a PIL image).
    """
    if input_image is None:
        return None, 0, None

    guidance_scale = 1.0
    steps = 4
    # Positional order must match infer's signature exactly; only image 1 is supplied.
    example_args = (
        input_image.convert("RGB"), None, None,
        prompt, lora_adapter,
        0, True,
        guidance_scale, steps, 1.0,
        True, True,
    )
    result, seed, last = infer(*example_args)
    return result, seed, last
1225
 
 
1245
  with gr.Column(elem_id="col-container"):
1246
  gr.Markdown("# **Qwen-Image-Edit-2511-LoRAs-Fast**", elem_id="main-title")
1247
  gr.Markdown(
1248
+ "Perform diverse image edits using specialized "
1249
+ "[LoRA](https://huggingface.co/models?other=base_model:adapter:Qwen/Qwen-Image-Edit-2511) adapters for the "
1250
+ "[Qwen-Image-Edit](https://huggingface.co/Qwen/Qwen-Image-Edit-2511) model. Uses a Diffusers compatible extraction of the transformers from Phr00t's Rapid AIO merge."
1251
  )
1252
  gr.Markdown(aio_status_line)
1253
 
 
1275
 
1276
  with gr.Column():
1277
  output_image = gr.Image(label="Output Image", interactive=False, format="png", height=353)
1278
+
1279
  last_output = gr.State(value=None)
1280
 
1281
  with gr.Row():
 
1341
  value=True,
1342
  )
1343
 
 
1344
  lora_adapter.change(
1345
  fn=on_lora_change_ui,
1346
  inputs=[lora_adapter, prompt, extras_condition_only],
 
1352
  ["examples/5.jpg", "Remove shadows and relight the image using soft lighting.", "Light-Restoration"],
1353
  ["examples/4.jpg", "Use a subtle golden-hour filter with smooth light diffusion.", "Relight"],
1354
  ["examples/2.jpeg", "Rotate the camera 45 degrees to the left.", "Multiple-Angles"],
1355
+ [
1356
+ "examples/12.jpg",
1357
+ "flatcolor Desaturate the image and lower the contrast to create a flat, ungraded look similar to a camera log profile. Preserve details in the highlights and shadows.",
1358
+ "Flat-Log",
1359
+ ],
1360
+ ["examples/7.jpg", "Light source from the Right Rear", "Multi-Angle-Lighting"],
1361
+ ["examples/10.jpeg", "Upscale the image.", "Upscale-Image"],
1362
+ ["examples/7.jpg", "Light source from the Below", "Multi-Angle-Lighting"],
1363
+ ["examples/2.jpeg", "Switch the camera to a top-down right corner view.", "Multiple-Angles"],
1364
+ [
1365
+ "examples/9.jpg",
1366
+ "The camera moves slightly forward as sunlight breaks through the clouds, casting a soft glow around the character's silhouette in the mist. Realistic cinematic style, atmospheric depth.",
1367
+ "Next-Scene",
1368
+ ],
1369
+ ["examples/8.jpg", "Make the subjects skin details more prominent and natural.", "Edit-Skin"],
1370
+ ["examples/6.jpg", "Switch the camera to a bottom-up view.", "Multiple-Angles"],
1371
+ ["examples/6.jpg", "Rotate the camera 180 degrees upside down.", "Multiple-Angles"],
1372
+ ["examples/4.jpg", "Rotate the camera 45 degrees to the right.", "Multiple-Angles"],
1373
+ ["examples/4.jpg", "Switch the camera to a top-down view.", "Multiple-Angles"],
1374
+ ["examples/4.jpg", "Switch the camera to a wide-angle lens.", "Multiple-Angles"],
1375
  ["examples/11.jpg", "Upscale this picture to 4K resolution.", "Upscale2K"],
1376
  ],
1377
  inputs=[input_image_1, prompt, lora_adapter],
 
1400
  outputs=[output_image, seed, last_output],
1401
  )
1402
 
 
1403
  btn_out_to_img1.click(fn=set_output_as_image1, inputs=[last_output], outputs=[input_image_1])
1404
  btn_out_to_img2.click(fn=set_output_as_image2, inputs=[last_output], outputs=[input_image_2])
1405
  btn_out_to_extra.click(fn=set_output_as_extra, inputs=[last_output, input_images_extra], outputs=[input_images_extra])
1406
 
 
1407
  add_derived_btn.click(
1408
  fn=add_derived_ref,
1409
  inputs=[input_image_1, input_images_extra, derived_type, derived_use_gpu, derived_max_people],