Professional Noob committed on
Commit
1d96c5d
·
verified ·
1 Parent(s): c08afc1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +535 -828
app.py CHANGED
@@ -1,24 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import re
3
  import gc
4
  import traceback
5
- import base64
6
- import io
 
7
  import gradio as gr
8
  import numpy as np
9
  import spaces
10
  import torch
11
- import random
12
  from PIL import Image
13
- from typing import Iterable, Optional, Tuple
14
-
15
- from transformers import (
16
- AutoImageProcessor,
17
- AutoModelForDepthEstimation,
18
- )
19
 
 
20
  from huggingface_hub import hf_hub_download
21
- from huggingface_hub import InferenceClient
22
  from safetensors.torch import load_file as safetensors_load_file
23
 
24
  from gradio.themes import Soft
@@ -43,7 +51,6 @@ colors.orange_red = colors.Color(
43
  c950="#802200",
44
  )
45
 
46
-
47
  class OrangeRedTheme(Soft):
48
  def __init__(
49
  self,
@@ -99,7 +106,6 @@ class OrangeRedTheme(Soft):
99
  block_label_background_fill="*primary_200",
100
  )
101
 
102
-
103
  orange_red_theme = OrangeRedTheme()
104
 
105
  # ============================================================
@@ -107,7 +113,6 @@ orange_red_theme = OrangeRedTheme()
107
  # ============================================================
108
 
109
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
110
-
111
  print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
112
  print("torch.__version__ =", torch.__version__)
113
  print("torch.version.cuda =", torch.version.cuda)
@@ -118,17 +123,18 @@ if torch.cuda.is_available():
118
  print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
119
  print("Using device:", device)
120
 
 
 
 
121
  # ============================================================
122
  # AIO version (Space variable)
123
  # ============================================================
124
 
125
  AIO_REPO_ID = "Pr0f3ssi0n4ln00b/Phr00t-Qwen-Rapid-AIO"
126
  DEFAULT_AIO_VERSION = "v19"
127
-
128
  _VER_RE = re.compile(r"^v\d+$")
129
  _DIGITS_RE = re.compile(r"^\d+$")
130
 
131
-
132
  def _normalize_version(raw: str) -> Optional[str]:
133
  if raw is None:
134
  return None
@@ -141,13 +147,10 @@ def _normalize_version(raw: str) -> Optional[str]:
141
  return f"v{s}"
142
  return None
143
 
144
-
145
  _AIO_ENV_RAW = os.environ.get("AIO_VERSION", "")
146
  _AIO_ENV_NORM = _normalize_version(_AIO_ENV_RAW)
147
-
148
  AIO_VERSION = _AIO_ENV_NORM or DEFAULT_AIO_VERSION
149
  AIO_VERSION_SOURCE = "env" if _AIO_ENV_NORM else "default(v19)"
150
-
151
  print(f"AIO_VERSION (env raw) = {_AIO_ENV_RAW!r}")
152
  print(f"AIO_VERSION (normalized) = {_AIO_ENV_NORM!r}")
153
  print(f"Using AIO_VERSION = {AIO_VERSION} ({AIO_VERSION_SOURCE})")
@@ -161,12 +164,9 @@ from qwenimage.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
161
  from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel
162
  from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
163
 
164
- dtype = torch.bfloat16
165
-
166
-
167
  def _load_pipe_with_version(version: str) -> QwenImageEditPlusPipeline:
168
  sub = f"{version}/transformer"
169
- print(f"📦 Loading AIO transformer: {AIO_REPO_ID} / {sub}")
170
  p = QwenImageEditPlusPipeline.from_pretrained(
171
  "Qwen/Qwen-Image-Edit-2511",
172
  transformer=QwenImageTransformer2DModel.from_pretrained(
@@ -179,12 +179,13 @@ def _load_pipe_with_version(version: str) -> QwenImageEditPlusPipeline:
179
  ).to(device)
180
  return p
181
 
182
-
183
  try:
184
  pipe = _load_pipe_with_version(AIO_VERSION)
185
  except Exception:
186
  print("❌ Failed to load requested AIO_VERSION. Falling back to v19.")
 
187
  print(traceback.format_exc())
 
188
  AIO_VERSION = DEFAULT_AIO_VERSION
189
  AIO_VERSION_SOURCE = "fallback_to_v19"
190
  pipe = _load_pipe_with_version(AIO_VERSION)
@@ -195,33 +196,57 @@ try:
195
  except Exception as e:
196
  print(f"Warning: Could not set FA3 processor: {e}")
197
 
198
- MAX_SEED = np.iinfo(np.int32).max
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
  # ============================================================
201
- # Derived conditioning (Depth Anything) ONLY — ViTPose removed
202
  # ============================================================
203
 
204
  DEPTH_MODEL_ID = "depth-anything/Depth-Anything-V2-Small-hf"
205
  _DEPTH_CACHE = {}
206
 
207
-
208
  def _derived_device(use_gpu: bool) -> torch.device:
209
  return torch.device("cuda" if (use_gpu and torch.cuda.is_available()) else "cpu")
210
 
211
-
212
  def _load_depth_models(dev: torch.device):
213
  key = str(dev)
214
  if key in _DEPTH_CACHE:
215
  return _DEPTH_CACHE[key]
216
-
217
  proc = AutoImageProcessor.from_pretrained(DEPTH_MODEL_ID)
218
  model = AutoModelForDepthEstimation.from_pretrained(DEPTH_MODEL_ID).to(dev)
219
  model.eval()
220
-
221
  _DEPTH_CACHE[key] = (proc, model)
222
  return _DEPTH_CACHE[key]
223
 
224
-
225
  def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
226
  img = img.convert("RGB")
227
  dev = _derived_device(use_gpu)
@@ -233,7 +258,7 @@ def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
233
  with torch.no_grad():
234
  out = model(**inputs)
235
 
236
- pred = out.predicted_depth
237
  pred = torch.nn.functional.interpolate(
238
  pred.unsqueeze(1),
239
  size=(img.height, img.width),
@@ -249,30 +274,6 @@ def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
249
  depth8 = (arr * 255.0).clip(0, 255).astype(np.uint8)
250
  return Image.fromarray(depth8, mode="L").convert("RGB")
251
 
252
-
253
- def _to_pil_rgb(item):
254
- if item is None:
255
- return None
256
- if isinstance(item, (tuple, list)) and len(item) >= 1:
257
- item = item[0]
258
- if isinstance(item, Image.Image):
259
- return item.convert("RGB")
260
- if isinstance(item, np.ndarray):
261
- return Image.fromarray(item).convert("RGB")
262
- return None
263
-
264
-
265
- def _append_to_gallery(existing, new_img: Image.Image):
266
- items = []
267
- if existing:
268
- for it in existing:
269
- pil = _to_pil_rgb(it)
270
- if pil is not None:
271
- items.append(pil)
272
- items.append(new_img)
273
- return items
274
-
275
-
276
  # ============================================================
277
  # LoRA adapters + presets
278
  # ============================================================
@@ -297,7 +298,7 @@ ADAPTER_SPECS = {
297
  "AnyPose": {
298
  "type": "package",
299
  "requires_two_images": True,
300
- "image2_label": "Upload Pose Reference (Image 2)",
301
  "parts": [
302
  {
303
  "repo": "lilylilith/AnyPose",
@@ -337,7 +338,7 @@ ADAPTER_SPECS = {
337
  "BFS-Best-FaceSwap": {
338
  "type": "single",
339
  "requires_two_images": True,
340
- "image2_label": "Upload Head/Face Donor (Image 2)",
341
  "repo": "Alissonerdx/BFS-Best-Face-Swap",
342
  "weights": "bfs_head_v5_2511_original.safetensors",
343
  "adapter_name": "BFS-Best-Faceswap",
@@ -347,7 +348,7 @@ ADAPTER_SPECS = {
347
  "BFS-Best-FaceSwap-merge": {
348
  "type": "single",
349
  "requires_two_images": True,
350
- "image2_label": "Upload Head/Face Donor (Image 2)",
351
  "repo": "Alissonerdx/BFS-Best-Face-Swap",
352
  "weights": "bfs_head_v5_2511_merged_version_rank_32_fp32.safetensors",
353
  "adapter_name": "BFS-Best-Faceswap-merge",
@@ -430,12 +431,30 @@ ADAPTER_SPECS = {
430
  LORA_PRESET_PROMPTS = {
431
  "Any2Real_2601": "change the picture 1 to realistic photograph",
432
  "Semirealistic-photo-detailer": "transform the image to semi-realistic image",
433
- "AnyPose": "Make the person in image 1 do the exact same pose of the person in image 2. Changing the style and background of the image of the person in image 1 is undesirable, so don't do it. The new pose should be pixel accurate to the pose we are trying to copy. The position of the arms and head and legs should be the same as the pose we are trying to copy. Change the field of view and angle to match exactly image 2. Head tilt and eye gaze pose should match the person in image 2.",
434
- "Hyperrealistic-Portrait": "Transform the image into an ultra-realistic photorealistic portrait with strict identity preservation, facing straight to the camera. Enhance pore-level skin textures, realistic moisture effects, and natural wet hair clumping against the skin. Apply cool-toned soft-box lighting with subtle highlights and shadows, maintain realistic green-hazel eye catchlights without synthetic gloss, and preserve soft natural lip texture. Use shallow depth of field with a clean background, an 85mm macro photographic look, and raw photo grading without retouching to maintain realism and original details.",
435
- "Ultrarealistic-Portrait": "Transform the image into an ultra-realistic glamour portrait while strictly preserving the subject’s identity. Apply a close-up composition with a slight head tilt and a hand near the face, enhance cinematic directional lighting with dramatic fashion-style highlights, and refine makeup details including glowing skin, glossy lips, luminous highlighter, and defined eyes. Increase skin realism with detailed epidermal textures such as micropores, microhairs, subtle oil sheen, natural highlights, soft wrinkles, and subsurface scattering. Maintain a luxury fashion-magazine look in a 9:16 aspect ratio, preserving realism, facial structure, and original details without over-smoothing or retouching.",
 
 
 
 
 
 
 
 
 
 
 
 
436
  "Upscale2K": "Upscale this picture to 4K resolution.",
437
- "BFS-Best-FaceSwap": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
438
- "BFS-Best-FaceSwap-merge": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
 
 
 
 
 
 
439
  }
440
 
441
  LOADED_ADAPTERS = set()
@@ -444,856 +463,544 @@ LOADED_ADAPTERS = set()
444
  # Helpers: resolution
445
  # ============================================================
446
 
447
-
448
  def _round_to_multiple(x: int, m: int) -> int:
 
449
  return max(m, (int(x) // m) * m)
450
 
 
 
 
 
 
 
 
 
451
 
452
- def compute_canvas_dimensions_from_area(
453
- image: Image.Image,
454
- target_area: int,
455
- multiple_of: int = 64,
456
- ) -> Tuple[int, int]:
457
- w0, h0 = image.size
458
- if w0 <= 0 or h0 <= 0:
459
- return 512, 512
460
- aspect = w0 / h0
461
- w = int((target_area * aspect) ** 0.5)
462
- h = int(w / aspect) if aspect != 0 else int((target_area) ** 0.5)
463
- w = _round_to_multiple(w, multiple_of)
464
- h = _round_to_multiple(h, multiple_of)
465
- w = max(multiple_of, w)
466
- h = max(multiple_of, h)
467
- return w, h
468
-
469
-
470
- def get_target_area_for_lora(image: Image.Image, lora_adapter: str, target_megapixels: float) -> int:
471
  spec = ADAPTER_SPECS.get(lora_adapter, {})
472
- long_edge = spec.get("target_long_edge", None)
473
-
474
- if long_edge:
475
- w0, h0 = image.size
476
- if w0 <= 0 or h0 <= 0:
477
- return int(1.0 * 1024 * 1024)
478
- scale = float(long_edge) / float(max(w0, h0))
479
- w = int(w0 * scale)
480
- h = int(h0 * scale)
481
- return max(64 * 64, w * h)
482
-
483
- mp = float(target_megapixels)
484
- return max(64 * 64, int(mp * 1_000_000))
485
-
 
 
 
 
 
 
 
 
 
 
 
486
 
487
  # ============================================================
488
- # Helpers: LoRA loading + alpha fix
489
  # ============================================================
490
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
491
 
492
- def _download_from_hf(repo_id: str, filename: str) -> str:
493
- return hf_hub_download(repo_id=repo_id, filename=filename)
494
-
495
-
496
- def _maybe_apply_alpha_fix(state_dict: dict) -> dict:
497
- if "img_in.alpha" not in state_dict:
498
- for k in list(state_dict.keys()):
499
- if k.endswith("img_in.weight") or k.endswith("img_in.bias"):
500
- t = state_dict[k]
501
- if hasattr(t, "new_zeros"):
502
- state_dict["img_in.alpha"] = t.new_zeros(())
503
- break
504
- return state_dict
505
-
506
-
507
- def _load_single_lora(spec: dict):
508
- local_path = _download_from_hf(spec["repo"], spec["weights"])
509
- sd = safetensors_load_file(local_path)
510
- if spec.get("needs_alpha_fix", False):
511
- sd = _maybe_apply_alpha_fix(sd)
512
- pipe.load_lora_weights(sd, adapter_name=spec["adapter_name"])
513
- LOADED_ADAPTERS.add(spec["adapter_name"])
514
-
515
-
516
- def _ensure_loaded_and_get_active_adapters(lora_adapter: str):
517
- spec = ADAPTER_SPECS.get(lora_adapter, None)
518
- if spec is None:
519
- return [], []
520
-
521
- if spec["type"] == "single":
522
- if spec["adapter_name"] not in LOADED_ADAPTERS:
523
- _load_single_lora(spec)
524
- return [spec["adapter_name"]], [spec.get("strength", 1.0)]
525
-
526
- adapter_names = []
527
- weights = []
528
- for part in spec["parts"]:
529
- if part["adapter_name"] not in LOADED_ADAPTERS:
530
- _load_single_lora(part)
531
- adapter_names.append(part["adapter_name"])
532
- weights.append(part.get("strength", 1.0))
533
- return adapter_names, weights
534
-
535
 
536
  def lora_requires_two_images(lora_adapter: str) -> bool:
537
- spec = ADAPTER_SPECS.get(lora_adapter, {})
538
- return bool(spec.get("requires_two_images", False))
539
-
540
-
541
- def get_image2_label_for_lora(lora_adapter: str) -> str:
542
- spec = ADAPTER_SPECS.get(lora_adapter, {})
543
- return spec.get("image2_label", "Upload Reference (Image 2)")
544
-
545
-
546
- def build_labeled_images(img1: Image.Image, img2: Optional[Image.Image], extras: list[Image.Image]):
547
- labeled = {"image_1": img1}
548
- if img2 is not None:
549
- labeled["image_2"] = img2
550
- for ex in extras:
551
- labeled[f"image_{len(labeled) + 1}"] = ex
552
- return labeled
553
 
 
 
554
 
555
  # ============================================================
556
- # UI: lora change handler
557
  # ============================================================
558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559
 
560
- def on_lora_change_ui(lora_adapter, current_prompt, current_extras_condition_only):
561
- preset = LORA_PRESET_PROMPTS.get(lora_adapter, None)
562
- prompt_update = gr.update(value=preset) if preset else gr.update(value=current_prompt)
563
-
564
- needs_two = lora_requires_two_images(lora_adapter)
565
- img2_update = gr.update(visible=needs_two, label=get_image2_label_for_lora(lora_adapter))
566
-
567
- extras_update = gr.update(value=True) if needs_two else gr.update(value=current_extras_condition_only)
568
- return prompt_update, img2_update, extras_update
569
-
570
-
571
- # ============================================================
572
- # Output routing + derived conditioning
573
- # ============================================================
574
-
575
-
576
- def set_output_as_image1(last):
577
- if last is None:
578
- raise gr.Error("No output available yet.")
579
- return gr.update(value=last)
580
-
581
-
582
- def set_output_as_image2(last):
583
- if last is None:
584
- raise gr.Error("No output available yet.")
585
- return gr.update(value=last)
586
-
587
-
588
- def set_output_as_extra(last, existing_extra):
589
- if last is None:
590
- raise gr.Error("No output available yet.")
591
- return _append_to_gallery(existing_extra, last)
592
-
593
-
594
- @spaces.GPU
595
- def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu):
596
- if img1 is None:
597
- raise gr.Error("Please upload Image 1 first.")
598
-
599
- if derived_type == "None":
600
- return gr.update(value=existing_extra), gr.update(visible=False, value=None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
 
602
- base = img1.convert("RGB")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
603
 
604
- if derived_type == "Depth (Depth Anything V2 Small)":
605
- derived = make_depth_map(base, use_gpu=bool(derived_use_gpu))
606
  else:
607
- raise gr.Error(f"Unknown derived type: {derived_type}")
 
 
 
 
 
 
 
 
608
 
609
- new_gallery = _append_to_gallery(existing_extra, derived)
610
- return gr.update(value=new_gallery), gr.update(visible=True, value=derived)
611
 
 
612
 
613
  # ============================================================
614
- # Prompt Helper (outsourced VLM calls, UI stays clean)
615
  # ============================================================
616
 
617
- # Configuration via env vars (no UI clutter)
618
- HF_TOKEN = os.environ.get("HF_TOKEN", "").strip() or os.environ.get("HUGGINGFACEHUB_API_TOKEN", "").strip()
619
- HF_PROVIDER = os.environ.get("HF_PROVIDER", "nebius").strip()
620
- HF_VLM_MODEL = os.environ.get("HF_VLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct").strip()
621
-
622
- _client_cache = {}
623
-
624
-
625
- def _get_client() -> InferenceClient:
626
- key = (HF_PROVIDER, bool(HF_TOKEN))
627
- if key in _client_cache:
628
- return _client_cache[key]
629
- if not HF_TOKEN:
630
- raise gr.Error("Captioning is not configured (missing HF_TOKEN).")
631
- client = InferenceClient(provider=HF_PROVIDER, api_key=HF_TOKEN)
632
- _client_cache[key] = client
633
- return client
634
-
635
-
636
- def _encode_image_data_url(img: Image.Image, max_side: int = 1536, fmt: str = "PNG") -> str:
637
- """
638
- Converts PIL to data URL (base64). Downscales to keep payload reasonable.
639
- """
640
- img = img.convert("RGB")
641
  w, h = img.size
642
- scale = min(1.0, float(max_side) / float(max(w, h))) if max(w, h) > 0 else 1.0
643
- if scale < 1.0:
644
- img = img.resize((max(1, int(w * scale)), max(1, int(h * scale))), Image.LANCZOS)
645
-
646
- buf = io.BytesIO()
647
- img.save(buf, format=fmt)
648
- b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
649
- mime = "image/png" if fmt.upper() == "PNG" else "image/jpeg"
650
- return f"data:{mime};base64,{b64}"
651
-
652
-
653
- def _chat_with_image(
654
- system_prompt: str,
655
- user_text: str,
656
- image: Image.Image,
657
- *,
658
- max_tokens: int,
659
- temperature: float,
660
- ) -> str:
661
- client = _get_client()
662
- data_url = _encode_image_data_url(image)
663
-
664
- messages = [
665
- {"role": "system", "content": system_prompt},
666
- {
667
- "role": "user",
668
- "content": [
669
- {"type": "text", "text": user_text},
670
- {"type": "image_url", "image_url": {"url": data_url}},
671
- ],
672
- },
673
- ]
674
-
675
- # Hugging Face chat.completions interface
676
- resp = client.chat.completions.create(
677
- model=HF_VLM_MODEL,
678
- messages=messages,
679
- max_tokens=int(max_tokens),
680
- temperature=float(temperature),
681
- )
682
- return (resp.choices[0].message.content or "").strip()
683
-
684
-
685
- def _chat_text_only(
686
- system_prompt: str,
687
- user_text: str,
688
- *,
689
- max_tokens: int,
690
- temperature: float,
691
- ) -> str:
692
- client = _get_client()
693
- messages = [
694
- {"role": "system", "content": system_prompt},
695
- {"role": "user", "content": [{"type": "text", "text": user_text}]},
696
- ]
697
- resp = client.chat.completions.create(
698
- model=HF_VLM_MODEL,
699
- messages=messages,
700
- max_tokens=int(max_tokens),
701
- temperature=float(temperature),
702
- )
703
- return (resp.choices[0].message.content or "").strip()
704
-
705
-
706
- def _has_header(text: str, header: str) -> bool:
707
- return header in (text or "")
708
-
709
-
710
- def _enforce_once_retry_image(system_prompt: str, user_text: str, image: Image.Image, header: str, max_tokens: int, temperature: float) -> str:
711
- out = _chat_with_image(system_prompt, user_text, image, max_tokens=max_tokens, temperature=temperature)
712
- if _has_header(out, header):
713
- return out
714
-
715
- # one strict retry
716
- retry_user = (
717
- user_text
718
- + "\n\nIMPORTANT: You did not follow the required output format. "
719
- + f"Return EXACTLY the block starting with {header} and fill each line. No extra text."
720
- )
721
- out2 = _chat_with_image(system_prompt, retry_user, image, max_tokens=max_tokens, temperature=temperature)
722
- return out2
723
-
724
-
725
- def _enforce_once_retry_text(system_prompt: str, user_text: str, header: str, max_tokens: int, temperature: float) -> str:
726
- out = _chat_text_only(system_prompt, user_text, max_tokens=max_tokens, temperature=temperature)
727
- if _has_header(out, header):
728
- return out
729
-
730
- retry_user = (
731
- user_text
732
- + "\n\nIMPORTANT: You did not follow the required output format. "
733
- + f"Return EXACTLY the sections starting with {header}. No extra text."
734
- )
735
- return _chat_text_only(system_prompt, retry_user, max_tokens=max_tokens, temperature=temperature)
736
-
737
-
738
- # --------- BASE (Pic1) extraction prompt (no identity) ----------
739
- BFS_BASE_SYSTEM = """You are extracting non-identity facial and contextual signals from Picture 1 (BASE) for a head/face swap.
740
-
741
- CRITICAL: DO NOT describe identity/likeness traits. That means:
742
- - No age, ethnicity/race/nationality guesses, attractiveness judgments, “looks like X”
743
- - No skin tone, facial structure descriptions, “round face”, “strong jaw”, etc.
744
- - No hair color/style as identity markers (only mention hair if it occludes the face, e.g. “hair covering left eye”)
745
-
746
- Focus ONLY on:
747
- - Head pose (yaw/pitch/roll, tilt, chin/jaw position)
748
- - Gaze and eyelids (direction, openness)
749
- - Micro-expressions / muscle cues (brow knit/raise, squint, lip tension, mouth corners, cheek tension, jaw set)
750
- - Mouth details (open/closed, teeth, tongue if visible)
751
- - Mood inference (max 2 labels) with visible evidence cues
752
- - Occlusions and interactions (hands, objects, glasses, shadows) relevant to face recreation
753
- - Visibility notes (unclear/occluded/shadowed)
754
-
755
- Output format (return exactly this block, nothing else):
756
-
757
- [BASE_SIGNALS_PIC1]
758
- Head pose:
759
- Gaze & eyelids:
760
- Expression (muscle cues):
761
- Mouth details:
762
- Mood (max 2 labels):
763
- Evidence for mood (visible cues only):
764
- Occlusions & interactions:
765
- Visibility notes (unclear/occluded/shadowed areas):
766
- """
767
-
768
- BFS_BASE_USER = """Analyze the single provided image as Picture 1 (BASE).
769
- Fill every line with either an observation or the word "unclear". Keep it concise."""
770
-
771
- # --------- DONOR (Pic2) extraction prompt (identity only) ----------
772
- BFS_DONOR_SYSTEM = """You are extracting inherent identity/likeness traits from Picture 2 (DONOR) for a head/face swap.
773
-
774
- CRITICAL: DO NOT describe expression, mood, gaze direction, head pose/rotation, body pose, or actions.
775
-
776
- Focus ONLY on visible physical traits:
777
- - Face shape & proportions (jawline, cheekbones, chin shape)
778
- - Skin tone/undertone + texture (freckles/moles only if visible)
779
- - Eyes (color, shape), brows (shape/thickness)
780
- - Nose structure (bridge, tip, nostrils)
781
- - Lips/mouth shape (fullness, cupid’s bow)
782
- - Chin/jaw details
783
- - Hair (color, style, hairline)
784
- - Distinctive traits (scars/moles/freckles if visible)
785
- - Visibility notes (unclear/occluded/shadowed)
786
-
787
- Output format (return exactly this block, nothing else):
788
-
789
- [DONOR_TRAITS_PIC2]
790
- Face shape & proportions:
791
- Skin tone & texture:
792
- Eyes & brows:
793
- Nose structure:
794
- Lips & mouth shape:
795
- Chin/jaw details:
796
- Hair (color, style, hairline):
797
- Distinctive traits (scars/moles/freckles if visible):
798
- Visibility notes (unclear/occluded/shadowed areas):
799
- """
800
-
801
- BFS_DONOR_USER = """Analyze the single provided image as Picture 2 (DONOR).
802
- Fill every line with either an observation or the word "unclear". Keep it concise."""
803
-
804
- # --------- Text-only prompt builder ----------
805
- BFS_BUILDER_SYSTEM = """You are a prompt editor for BFS-BestFaceSwap.
806
-
807
- Input you may receive:
808
- - A core prompt (already includes head_swap instructions)
809
- - BASE_SIGNALS_PIC1 text (pose/expression/mood/occlusions; non-identity)
810
- - Optional DONOR_TRAITS_PIC2 text (identity-only traits)
811
-
812
- Your job:
813
- - Produce a compact addendum that improves expressiveness transfer and reduces ambiguity.
814
- - Do NOT add any identity traits from the base signals.
815
- - Do NOT add any pose/expression/mood from donor traits.
816
- - Prefer concrete, visible cues over vague adjectives.
817
- - Keep it short (ideally 6–14 lines total).
818
- - If donor traits are missing or mostly "unclear", omit donor section entirely.
819
-
820
- Output EXACTLY two sections (donor section may be omitted if not provided/usable):
821
- [ADDENDUM_BASE]
822
- (bullets or short lines; use the best cues from BASE_SIGNALS)
823
-
824
- [ADDENDUM_DONOR]
825
- (optional; only if donor traits contain useful visible info; no pose/expression)
826
- """
827
-
828
 
829
- def scrub_placeholder(text: str, enabled: bool) -> str:
830
- # Placeholder for future strict scrubber pass (no-op).
831
- return text
832
 
 
 
 
 
 
 
833
 
834
  @spaces.GPU
835
- def caption_base_pic1(
836
- img1,
837
- max_new_tokens: int,
838
- temperature: float,
839
- strict_scrubber: bool,
840
- show_debug: bool,
 
 
 
 
 
 
 
 
 
 
 
 
 
841
  ):
842
  if img1 is None:
843
- raise gr.Error("Please upload Image 1 (base) first.")
844
-
845
- raw = _enforce_once_retry_image(
846
- BFS_BASE_SYSTEM,
847
- BFS_BASE_USER,
848
- img1,
849
- header="[BASE_SIGNALS_PIC1]",
850
- max_tokens=int(max_new_tokens),
851
- temperature=float(temperature),
852
- )
853
- out = scrub_placeholder(raw, enabled=bool(strict_scrubber))
854
- debug = raw if bool(show_debug) else ""
855
- return out, debug
856
 
 
 
857
 
858
- @spaces.GPU
859
- def caption_donor_pic2(
860
- img2,
861
- max_new_tokens: int,
862
- temperature: float,
863
- strict_scrubber: bool,
864
- show_debug: bool,
865
- ):
866
- if img2 is None:
867
- raise gr.Error("Please upload Image 2 (donor) first.")
868
-
869
- raw = _enforce_once_retry_image(
870
- BFS_DONOR_SYSTEM,
871
- BFS_DONOR_USER,
872
- img2,
873
- header="[DONOR_TRAITS_PIC2]",
874
- max_tokens=int(max_new_tokens),
875
- temperature=float(temperature),
876
- )
877
- out = scrub_placeholder(raw, enabled=bool(strict_scrubber))
878
- debug = raw if bool(show_debug) else ""
879
- return out, debug
880
 
 
 
881
 
882
- def _compose_final_prompt(core_prompt: str, addendum_text: str, mode: str) -> str:
883
- core = (core_prompt or "").strip()
884
- addendum = (addendum_text or "").strip()
885
- if not addendum:
886
- return core
 
 
 
 
887
 
888
- if (mode or "").lower().startswith("inject"):
889
- injected = core
890
- if "{BFS_ADDENDUM}" in injected:
891
- injected = injected.replace("{BFS_ADDENDUM}", addendum + "\n")
892
- return injected.strip()
893
 
894
- return (core + "\n\n" + addendum).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
895
 
 
896
 
897
- @spaces.GPU
898
- def build_bfs_addendum_and_final_prompt(
899
- core_prompt: str,
900
- base_caption: str,
901
- donor_caption: str,
902
- integration_mode: str,
903
- max_new_tokens: int,
904
- temperature: float,
905
- show_debug: bool,
906
- ):
907
- base = (base_caption or "").strip()
908
- donor = (donor_caption or "").strip()
909
- core = (core_prompt or "").strip()
910
-
911
- if not base:
912
- raise gr.Error("Generate BASE signals (Pic1) first (or paste them) before building an addendum.")
913
-
914
- user_text = (
915
- "CORE PROMPT:\n"
916
- f"{core}\n\n"
917
- "BASE_SIGNALS_PIC1:\n"
918
- f"{base}\n\n"
919
- "DONOR_TRAITS_PIC2:\n"
920
- f"{donor if donor else '(none)'}\n\n"
921
- "Produce the addendum now."
922
- )
923
 
924
- raw = _enforce_once_retry_text(
925
- BFS_BUILDER_SYSTEM,
926
- user_text,
927
- header="[ADDENDUM_BASE]",
928
- max_tokens=int(max_new_tokens),
929
- temperature=float(temperature),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
930
  )
931
 
932
- final_prompt = _compose_final_prompt(core, raw, integration_mode)
933
- debug = raw if bool(show_debug) else ""
934
- return raw, final_prompt, debug
935
 
 
936
 
937
- # ============================================================
938
- # Inference
939
- # ============================================================
940
-
941
 
942
- @spaces.GPU
943
- def infer(
944
- input_image_1,
945
- input_image_2,
946
- input_images_extra,
947
- prompt,
948
- lora_adapter,
949
- seed,
950
- randomize_seed,
951
- guidance_scale,
952
- steps,
953
- target_megapixels,
954
- extras_condition_only,
955
- pad_to_canvas,
956
- progress=gr.Progress(track_tqdm=True),
957
- ):
958
  gc.collect()
959
  if torch.cuda.is_available():
960
  torch.cuda.empty_cache()
961
 
962
- if input_image_1 is None:
963
- raise gr.Error("Please upload Image 1.")
964
-
965
- if lora_adapter == NONE_LORA:
966
- try:
967
- pipe.set_adapters([], adapter_weights=[])
968
- except Exception:
969
- if LOADED_ADAPTERS:
970
- pipe.set_adapters(list(LOADED_ADAPTERS), adapter_weights=[0.0] * len(LOADED_ADAPTERS))
971
- else:
972
- adapter_names, adapter_weights = _ensure_loaded_and_get_active_adapters(lora_adapter)
973
- pipe.set_adapters(adapter_names, adapter_weights=adapter_weights)
974
-
975
- if randomize_seed:
976
- seed = random.randint(0, MAX_SEED)
977
-
978
- generator = torch.Generator(device=device).manual_seed(seed)
979
- negative_prompt = (
980
- "worst quality, low quality, bad anatomy, bad hands, text, error, missing fingers, "
981
- "extra digit, fewer digits, cropped, jpeg artifacts, signature, watermark, username, blurry"
982
- )
983
-
984
- img1 = input_image_1.convert("RGB")
985
- img2 = input_image_2.convert("RGB") if input_image_2 is not None else None
986
-
987
- extra_imgs: list[Image.Image] = []
988
- if input_images_extra:
989
- for item in input_images_extra:
990
- pil = _to_pil_rgb(item)
991
- if pil is not None:
992
- extra_imgs.append(pil)
993
 
994
- if lora_requires_two_images(lora_adapter) and img2 is None:
995
- raise gr.Error("This LoRA needs two images. Please upload Image 2 as well.")
 
996
 
997
- labeled = build_labeled_images(img1, img2, extra_imgs)
998
- pipe_images = list(labeled.values())
999
- if len(pipe_images) == 1:
1000
- pipe_images = pipe_images[0]
1001
 
1002
- target_area = get_target_area_for_lora(img1, lora_adapter, float(target_megapixels))
1003
- width, height = compute_canvas_dimensions_from_area(
1004
- img1,
1005
- target_area=target_area,
1006
- multiple_of=int(pipe.vae_scale_factor * 2),
1007
- )
1008
 
1009
- vae_image_indices = None
1010
- if extras_condition_only:
1011
- if isinstance(pipe_images, list) and len(pipe_images) > 2:
1012
- vae_image_indices = [0, 1] if len(pipe_images) >= 2 else [0]
1013
 
1014
- try:
1015
- result = pipe(
1016
- image=pipe_images,
1017
- prompt=prompt,
1018
- negative_prompt=negative_prompt,
1019
- height=height,
1020
- width=width,
1021
- num_inference_steps=steps,
1022
- generator=generator,
1023
- true_cfg_scale=guidance_scale,
1024
- vae_image_indices=vae_image_indices,
1025
- pad_to_canvas=bool(pad_to_canvas),
1026
- ).images[0]
1027
- return result, seed, result
1028
- finally:
1029
- gc.collect()
1030
- if torch.cuda.is_available():
1031
- torch.cuda.empty_cache()
1032
 
 
 
1033
 
1034
- @spaces.GPU
1035
- def infer_example(input_image, prompt, lora_adapter):
1036
- if input_image is None:
1037
- return None, 0, None
1038
- input_pil = input_image.convert("RGB")
1039
- guidance_scale = 1.0
1040
- steps = 4
1041
- result, seed, last = infer(
1042
- input_pil,
1043
- None,
1044
- None,
1045
- prompt,
1046
- lora_adapter,
1047
- 0,
1048
- True,
1049
- guidance_scale,
1050
- steps,
1051
- 1.0,
1052
- True,
1053
- True,
1054
- )
1055
- return result, seed, last
1056
 
 
 
 
 
 
1057
 
1058
- # ============================================================
1059
- # UI
1060
- # ============================================================
 
 
 
1061
 
1062
- css = """
1063
- #col-container { margin: 0 auto; max-width: 960px; }
1064
- #main-title h1 { font-size: 2.1em !important; }
1065
  """
 
1066
 
1067
- aio_status_line = (
1068
- f"**AIO transformer version:** `{AIO_VERSION}` "
1069
- f"({AIO_VERSION_SOURCE}; env `AIO_VERSION`={_AIO_ENV_RAW!r})"
1070
- )
1071
-
1072
- with gr.Blocks() as demo:
1073
- with gr.Column(elem_id="col-container"):
1074
- gr.Markdown("# **Qwen-Image-Edit-2511-LoRAs-Fast**", elem_id="main-title")
1075
- gr.Markdown(
1076
- "Perform diverse image edits using specialized "
1077
- "[LoRA](https://huggingface.co/models?other=base_model:adapter:Qwen/Qwen-Image-Edit-2511) adapters for the "
1078
- "[Qwen-Image-Edit](https://huggingface.co/Qwen/Qwen-Image-Edit-2511) model."
1079
- )
1080
- gr.Markdown(aio_status_line)
1081
-
1082
- with gr.Row(equal_height=True):
1083
- with gr.Column():
1084
- input_image_1 = gr.Image(label="Upload Image 1 (Base / Target)", type="pil", height=290)
1085
- input_image_2 = gr.Image(label="Upload Reference (Image 2)", type="pil", height=290, visible=False)
1086
-
1087
- input_images_extra = gr.Gallery(
1088
- label="Upload Additional Images (auto-indexed after Image 1/2)",
1089
- type="pil",
1090
- height=290,
1091
- columns=4,
1092
- rows=2,
1093
- interactive=True,
 
 
 
 
 
 
 
 
 
 
 
 
1094
  )
1095
-
1096
- prompt = gr.Text(
1097
- label="Edit Prompt",
1098
- show_label=True,
1099
- placeholder="e.g., transform into photo..",
1100
  )
1101
 
1102
- with gr.Accordion("BFS Prompt Helper", open=False):
1103
- with gr.Row():
1104
- helper_max_tokens = gr.Slider(label="Max new tokens", minimum=64, maximum=1024, step=16, value=384)
1105
- helper_temperature = gr.Slider(label="Temperature (0 = deterministic)", minimum=0.0, maximum=1.2, step=0.05, value=0.2)
1106
-
1107
- with gr.Row():
1108
- strict_scrubber = gr.Checkbox(label="Strict scrubber (placeholder, no-op)", value=False)
1109
- show_debug = gr.Checkbox(label="Show debug outputs", value=False)
1110
-
1111
- with gr.Row():
1112
- btn_cap_base = gr.Button("Generate BASE signals (Pic1)", variant="secondary")
1113
- btn_cap_donor = gr.Button("Generate DONOR traits (Pic2) (optional)", variant="secondary")
1114
-
1115
- with gr.Row():
1116
- caption_pic1 = gr.Textbox(label="BASE signals (from Image 1)", lines=12, value="")
1117
- caption_pic2 = gr.Textbox(label="DONOR traits (from Image 2) (optional)", lines=12, value="")
1118
-
1119
- with gr.Row():
1120
- debug_base = gr.Textbox(label="Debug: raw BASE output", lines=8, visible=False)
1121
- debug_donor = gr.Textbox(label="Debug: raw DONOR output", lines=8, visible=False)
1122
-
1123
- integration_mode = gr.Radio(
1124
- label="How to apply addendum to the core prompt",
1125
- choices=["Concatenate", "Inject (placeholder {BFS_ADDENDUM})"],
1126
- value="Concatenate",
1127
- )
1128
-
1129
- with gr.Row():
1130
- btn_build_addendum = gr.Button("Build addendum + final prompt", variant="primary")
1131
- btn_apply_final = gr.Button("Apply final prompt → Edit Prompt", variant="secondary")
1132
-
1133
- bfs_addendum = gr.Textbox(label="Built addendum (editable)", lines=10, value="")
1134
- bfs_final_prompt = gr.Textbox(label="Final prompt preview (editable)", lines=10, value="")
1135
- debug_builder = gr.Textbox(label="Debug: raw builder output", lines=8, visible=False)
1136
-
1137
- run_button = gr.Button("Edit Image", variant="primary")
1138
-
1139
- with gr.Column():
1140
- output_image = gr.Image(label="Output Image", interactive=False, format="png", height=353)
1141
- last_output = gr.State(value=None)
1142
-
1143
- with gr.Row():
1144
- btn_out_to_img1 = gr.Button("⬅️ Output → Image 1", variant="secondary")
1145
- btn_out_to_img2 = gr.Button("⬅️ Output → Image 2", variant="secondary")
1146
- btn_out_to_extra = gr.Button("➕ Output → Extra Ref", variant="secondary")
1147
-
1148
- derived_preview = gr.Image(
1149
- label="Derived Conditioning Preview",
1150
- interactive=False,
1151
- format="png",
1152
- height=200,
1153
- visible=False,
1154
  )
 
1155
 
1156
- with gr.Row():
1157
- lora_choices = [NONE_LORA] + list(ADAPTER_SPECS.keys())
1158
- lora_adapter = gr.Dropdown(
1159
- label="Choose Editing Style",
1160
- choices=lora_choices,
1161
- value=NONE_LORA,
1162
- )
1163
-
1164
- with gr.Accordion("Advanced Settings", open=False, visible=True):
1165
- with gr.Accordion("Derived Conditioning (Depth)", open=False):
1166
- derived_type = gr.Dropdown(
1167
- label="Derived Type (from Image 1)",
1168
- choices=["None", "Depth (Depth Anything V2 Small)"],
1169
- value="None",
1170
- )
1171
- derived_use_gpu = gr.Checkbox(label="Use GPU for derived model", value=False)
1172
- add_derived_btn = gr.Button("➕ Add derived ref to Extras (conditioning-only recommended)")
1173
-
1174
- seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
1175
- randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
1176
- guidance_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, step=0.1, value=1.0)
1177
- steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=4)
1178
- target_megapixels = gr.Slider(
1179
- label="Target Megapixels (canvas)",
1180
- minimum=0.5,
1181
- maximum=6.0,
1182
- step=0.1,
1183
- value=1.0,
1184
- )
1185
- extras_condition_only = gr.Checkbox(
1186
- label="Extra references are conditioning-only (exclude from VAE)",
1187
- value=True,
1188
- )
1189
- pad_to_canvas = gr.Checkbox(
1190
- label="Pad images to canvas aspect (avoid warping)",
1191
- value=True,
1192
- )
1193
-
1194
- # LoRA selection: preset prompt + toggle Image 2
1195
- lora_adapter.change(
1196
- fn=on_lora_change_ui,
1197
- inputs=[lora_adapter, prompt, extras_condition_only],
1198
- outputs=[prompt, input_image_2, extras_condition_only],
1199
- )
1200
 
1201
- # Debug visibility toggles
1202
- show_debug.change(
1203
- fn=lambda x: (
1204
- gr.update(visible=bool(x)),
1205
- gr.update(visible=bool(x)),
1206
- gr.update(visible=bool(x)),
1207
- ),
1208
- inputs=[show_debug],
1209
- outputs=[debug_base, debug_donor, debug_builder],
1210
- )
1211
 
1212
- # Caption buttons (single-image)
1213
- btn_cap_base.click(
1214
- fn=caption_base_pic1,
1215
- inputs=[input_image_1, helper_max_tokens, helper_temperature, strict_scrubber, show_debug],
1216
- outputs=[caption_pic1, debug_base],
1217
- )
1218
 
1219
- btn_cap_donor.click(
1220
- fn=caption_donor_pic2,
1221
- inputs=[input_image_2, helper_max_tokens, helper_temperature, strict_scrubber, show_debug],
1222
- outputs=[caption_pic2, debug_donor],
1223
- )
1224
 
1225
- # Builder (text-only)
1226
- btn_build_addendum.click(
1227
- fn=build_bfs_addendum_and_final_prompt,
1228
- inputs=[
1229
- prompt,
1230
- caption_pic1,
1231
- caption_pic2,
1232
- integration_mode,
1233
- helper_max_tokens,
1234
- helper_temperature,
1235
- show_debug,
1236
- ],
1237
- outputs=[bfs_addendum, bfs_final_prompt, debug_builder],
1238
- )
1239
 
1240
- # Apply final prompt to the Edit Prompt box
1241
- btn_apply_final.click(
1242
- fn=lambda x: gr.update(value=x),
1243
- inputs=[bfs_final_prompt],
1244
- outputs=[prompt],
1245
- )
1246
 
1247
- gr.Examples(
1248
- examples=[
1249
- ["examples/5.jpg", "Remove shadows and relight the image using soft lighting.", "Light-Restoration"],
1250
- ["examples/4.jpg", "Use a subtle golden-hour filter with smooth light diffusion.", "Relight"],
1251
- ["examples/2.jpeg", "Rotate the camera 45 degrees to the left.", "Multiple-Angles"],
1252
- ["examples/11.jpg", "Upscale this picture to 4K resolution.", "Upscale2K"],
1253
- ],
1254
- inputs=[input_image_1, prompt, lora_adapter],
1255
- outputs=[output_image, seed, last_output],
1256
- fn=infer_example,
1257
- cache_examples=False,
1258
- label="Examples",
1259
- )
1260
 
1261
- run_button.click(
1262
- fn=infer,
 
1263
  inputs=[
1264
- input_image_1,
1265
- input_image_2,
1266
- input_images_extra,
1267
  prompt,
1268
  lora_adapter,
1269
  seed,
1270
  randomize_seed,
1271
- guidance_scale,
1272
  steps,
1273
  target_megapixels,
 
 
 
1274
  extras_condition_only,
1275
- pad_to_canvas,
 
 
 
1276
  ],
1277
- outputs=[output_image, seed, last_output],
1278
  )
1279
 
1280
- # Output routing
1281
- btn_out_to_img1.click(fn=set_output_as_image1, inputs=[last_output], outputs=[input_image_1])
1282
- btn_out_to_img2.click(fn=set_output_as_image2, inputs=[last_output], outputs=[input_image_2])
1283
- btn_out_to_extra.click(fn=set_output_as_extra, inputs=[last_output, input_images_extra], outputs=[input_images_extra])
1284
-
1285
- # Derived conditioning: append depth map
1286
- add_derived_btn.click(
1287
- fn=add_derived_ref,
1288
- inputs=[input_image_1, input_images_extra, derived_type, derived_use_gpu],
1289
- outputs=[input_images_extra, derived_preview],
1290
- )
1291
 
1292
- if __name__ == "__main__":
1293
- demo.queue(max_size=30).launch(
1294
- css=css,
1295
- theme=orange_red_theme,
1296
- mcp_server=True,
1297
- ssr_mode=False,
1298
- show_error=True,
1299
- )
 
1
+ ---
2
+
3
+ ## 2) `app.py`
4
+
5
+ > Replace your existing `app.py` with this.
6
+ >
7
+ > Notes:
8
+ > - **ViTPose removed** (no imports, no model loading)
9
+ > - Depth conditioning is the only derived conditioning mode
10
+ > - **Picture 1 / Picture 2** labels
11
+ > - Output routing buttons included
12
+ > - LCD step dropdown (32/56/112) controls both canvas snapping and pipeline snapping
13
+
14
+ ```python
15
  import os
16
  import re
17
  import gc
18
  import traceback
19
+ import random
20
+ from typing import Iterable, Optional
21
+
22
  import gradio as gr
23
  import numpy as np
24
  import spaces
25
  import torch
 
26
  from PIL import Image
 
 
 
 
 
 
27
 
28
+ from transformers import AutoImageProcessor, AutoModelForDepthEstimation
29
  from huggingface_hub import hf_hub_download
 
30
  from safetensors.torch import load_file as safetensors_load_file
31
 
32
  from gradio.themes import Soft
 
51
  c950="#802200",
52
  )
53
 
 
54
  class OrangeRedTheme(Soft):
55
  def __init__(
56
  self,
 
106
  block_label_background_fill="*primary_200",
107
  )
108
 
 
109
  orange_red_theme = OrangeRedTheme()
110
 
111
  # ============================================================
 
113
  # ============================================================
114
 
115
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
116
  print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
117
  print("torch.__version__ =", torch.__version__)
118
  print("torch.version.cuda =", torch.version.cuda)
 
123
  print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
124
  print("Using device:", device)
125
 
126
+ dtype = torch.bfloat16
127
+ MAX_SEED = np.iinfo(np.int32).max
128
+
129
  # ============================================================
130
  # AIO version (Space variable)
131
  # ============================================================
132
 
133
  AIO_REPO_ID = "Pr0f3ssi0n4ln00b/Phr00t-Qwen-Rapid-AIO"
134
  DEFAULT_AIO_VERSION = "v19"
 
135
  _VER_RE = re.compile(r"^v\d+$")
136
  _DIGITS_RE = re.compile(r"^\d+$")
137
 
 
138
  def _normalize_version(raw: str) -> Optional[str]:
139
  if raw is None:
140
  return None
 
147
  return f"v{s}"
148
  return None
149
 
 
150
  _AIO_ENV_RAW = os.environ.get("AIO_VERSION", "")
151
  _AIO_ENV_NORM = _normalize_version(_AIO_ENV_RAW)
 
152
  AIO_VERSION = _AIO_ENV_NORM or DEFAULT_AIO_VERSION
153
  AIO_VERSION_SOURCE = "env" if _AIO_ENV_NORM else "default(v19)"
 
154
  print(f"AIO_VERSION (env raw) = {_AIO_ENV_RAW!r}")
155
  print(f"AIO_VERSION (normalized) = {_AIO_ENV_NORM!r}")
156
  print(f"Using AIO_VERSION = {AIO_VERSION} ({AIO_VERSION_SOURCE})")
 
164
  from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel
165
  from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
166
 
 
 
 
167
  def _load_pipe_with_version(version: str) -> QwenImageEditPlusPipeline:
168
  sub = f"{version}/transformer"
169
+ print(f"Loading AIO transformer: {AIO_REPO_ID} / {sub}")
170
  p = QwenImageEditPlusPipeline.from_pretrained(
171
  "Qwen/Qwen-Image-Edit-2511",
172
  transformer=QwenImageTransformer2DModel.from_pretrained(
 
179
  ).to(device)
180
  return p
181
 
 
182
  try:
183
  pipe = _load_pipe_with_version(AIO_VERSION)
184
  except Exception:
185
  print("❌ Failed to load requested AIO_VERSION. Falling back to v19.")
186
+ print("---- exception ----")
187
  print(traceback.format_exc())
188
+ print("-------------------")
189
  AIO_VERSION = DEFAULT_AIO_VERSION
190
  AIO_VERSION_SOURCE = "fallback_to_v19"
191
  pipe = _load_pipe_with_version(AIO_VERSION)
 
196
  except Exception as e:
197
  print(f"Warning: Could not set FA3 processor: {e}")
198
 
199
+ # ============================================================
200
+ # VAE tiling toggle (UI-controlled; OFF by default)
201
+ # ============================================================
202
+
203
def _apply_vae_tiling(enabled: bool):
    """
    Switch VAE tiling on or off for the module-level pipeline.

    The pipeline-level method is preferred; when it is absent the VAE's own
    toggle is used instead. Any failure is logged and swallowed so that a
    tiling problem never aborts the request that triggered it.
    """
    try:
        if enabled:
            # Preferred entry point on the pipeline itself.
            if hasattr(pipe, "enable_vae_tiling"):
                pipe.enable_vae_tiling()
                print("✅ VAE tiling ENABLED (per UI).")
                return
            # Otherwise talk to the VAE directly.
            if hasattr(pipe, "vae") and hasattr(pipe.vae, "enable_tiling"):
                pipe.vae.enable_tiling()
                print("✅ VAE tiling ENABLED via pipe.vae.enable_tiling() (per UI).")
                return
            print("⚠️ No enable_vae_tiling()/vae.enable_tiling() found; cannot enable.")
            return

        if hasattr(pipe, "disable_vae_tiling"):
            pipe.disable_vae_tiling()
            print("VAE tiling DISABLED (per UI).")
            return
        if hasattr(pipe, "vae") and hasattr(pipe.vae, "disable_tiling"):
            pipe.vae.disable_tiling()
            print("VAE tiling DISABLED via pipe.vae.disable_tiling() (per UI).")
            return
        print("⚠️ No disable_vae_tiling()/vae.disable_tiling() found; leaving current state unchanged.")
    except Exception as e:
        # Best-effort toggle: report and carry on.
        print(f"⚠️ VAE tiling toggle failed: {e}")
229
 
230
  # ============================================================
231
+ # Derived conditioning (Depth only) — ViTPose REMOVED
232
  # ============================================================
233
 
234
  DEPTH_MODEL_ID = "depth-anything/Depth-Anything-V2-Small-hf"
235
  _DEPTH_CACHE = {}
236
 
 
237
  def _derived_device(use_gpu: bool) -> torch.device:
238
  return torch.device("cuda" if (use_gpu and torch.cuda.is_available()) else "cpu")
239
 
 
240
def _load_depth_models(dev: torch.device):
    """
    Return the ``(processor, model)`` pair for ``DEPTH_MODEL_ID`` on ``dev``.

    Pairs are memoised in ``_DEPTH_CACHE`` keyed by ``str(dev)``, so each
    device loads the depth model at most once per process.
    """
    cache_key = str(dev)
    try:
        return _DEPTH_CACHE[cache_key]
    except KeyError:
        pass  # first request for this device: load below

    processor = AutoImageProcessor.from_pretrained(DEPTH_MODEL_ID)
    depth_model = AutoModelForDepthEstimation.from_pretrained(DEPTH_MODEL_ID).to(dev)
    depth_model.eval()

    _DEPTH_CACHE[cache_key] = (processor, depth_model)
    return _DEPTH_CACHE[cache_key]
249
 
 
250
  def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
251
  img = img.convert("RGB")
252
  dev = _derived_device(use_gpu)
 
258
  with torch.no_grad():
259
  out = model(**inputs)
260
 
261
+ pred = out.predicted_depth # (B,H,W)
262
  pred = torch.nn.functional.interpolate(
263
  pred.unsqueeze(1),
264
  size=(img.height, img.width),
 
274
  depth8 = (arr * 255.0).clip(0, 255).astype(np.uint8)
275
  return Image.fromarray(depth8, mode="L").convert("RGB")
276
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  # ============================================================
278
  # LoRA adapters + presets
279
  # ============================================================
 
298
  "AnyPose": {
299
  "type": "package",
300
  "requires_two_images": True,
301
+ "image2_label": "Picture 2 (Pose Reference)",
302
  "parts": [
303
  {
304
  "repo": "lilylilith/AnyPose",
 
338
  "BFS-Best-FaceSwap": {
339
  "type": "single",
340
  "requires_two_images": True,
341
+ "image2_label": "Picture 2 (Head/Face Donor)",
342
  "repo": "Alissonerdx/BFS-Best-Face-Swap",
343
  "weights": "bfs_head_v5_2511_original.safetensors",
344
  "adapter_name": "BFS-Best-Faceswap",
 
348
  "BFS-Best-FaceSwap-merge": {
349
  "type": "single",
350
  "requires_two_images": True,
351
+ "image2_label": "Picture 2 (Head/Face Donor)",
352
  "repo": "Alissonerdx/BFS-Best-Face-Swap",
353
  "weights": "bfs_head_v5_2511_merged_version_rank_32_fp32.safetensors",
354
  "adapter_name": "BFS-Best-Faceswap-merge",
 
431
  LORA_PRESET_PROMPTS = {
432
  "Any2Real_2601": "change the picture 1 to realistic photograph",
433
  "Semirealistic-photo-detailer": "transform the image to semi-realistic image",
434
+ "AnyPose": (
435
+ "Make the person in image 1 do the exact same pose of the person in image 2. "
436
+ "Changing the style and background of the image of the person in image 1 is undesirable, so don't do it. "
437
+ "The new pose should be pixel accurate to the pose we are trying to copy. "
438
+ "Change the field of view and angle to match exactly image 2."
439
+ ),
440
+ "Hyperrealistic-Portrait": (
441
+ "Transform the image into an ultra-realistic photorealistic portrait with strict identity preservation, "
442
+ "facing straight to the camera. Enhance pore-level skin textures, realistic moisture effects, and natural wet hair clumping. "
443
+ "Use shallow depth of field with a clean background."
444
+ ),
445
+ "Ultrarealistic-Portrait": (
446
+ "Transform the image into an ultra-realistic glamour portrait while strictly preserving the subject’s identity. "
447
+ "Enhance cinematic directional lighting and keep realism without over-smoothing."
448
+ ),
449
  "Upscale2K": "Upscale this picture to 4K resolution.",
450
+ "BFS-Best-FaceSwap": (
451
+ "head_swap: start with Picture 1 as the base image. replace the head with Picture 2, preserving identity of Picture 2. "
452
+ "copy eye direction and micro-expressions from Picture 1. high quality, sharp details, 4k"
453
+ ),
454
+ "BFS-Best-FaceSwap-merge": (
455
+ "head_swap: start with Picture 1 as the base image. replace the head with Picture 2, preserving identity of Picture 2. "
456
+ "copy eye direction and micro-expressions from Picture 1. high quality, sharp details, 4k"
457
+ ),
458
  }
459
 
460
  LOADED_ADAPTERS = set()
 
463
  # Helpers: resolution
464
  # ============================================================
465
 
 
466
  def _round_to_multiple(x: int, m: int) -> int:
467
+ m = max(1, int(m))
468
  return max(m, (int(x) // m) * m)
469
 
470
def compute_canvas_dimensions_from_area(image: Image.Image, target_area: int, multiple_of: int) -> tuple[int, int]:
    """
    Derive a ``(width, height)`` canvas for ``image`` from a target pixel area.

    The image's aspect ratio (1.0 when its height is zero) and the area are
    handed to the pipeline module's ``calculate_dimensions``; both results are
    then snapped with ``_round_to_multiple`` to ``multiple_of``.
    """
    src_w, src_h = image.size
    aspect = 1.0 if not src_h else src_w / src_h

    # Imported lazily, exactly where it is needed.
    from qwenimage.pipeline_qwenimage_edit_plus import calculate_dimensions

    step = int(multiple_of)
    raw_w, raw_h = calculate_dimensions(int(target_area), float(aspect), multiple=step)
    snapped_w = _round_to_multiple(int(raw_w), step)
    snapped_h = _round_to_multiple(int(raw_h), step)
    return snapped_w, snapped_h
478
 
479
def get_target_area_for_lora(image: Image.Image, lora_adapter: str, user_target_megapixels: float) -> int:
    """
    Choose the target canvas area (in pixels) for the selected adapter.

    The adapter's spec is consulted in priority order: ``target_area``,
    ``target_megapixels``, ``target_long_edge``. The first key that is present
    and converts cleanly wins; a key whose value fails to convert is skipped.
    When nothing in the spec applies, the user's megapixel setting is used
    (1 MP = 1024 * 1024 pixels).
    """
    spec = ADAPTER_SPECS.get(lora_adapter, {})

    def _explicit_area() -> int:
        return int(spec["target_area"])

    def _from_megapixels() -> int:
        return int(float(spec["target_megapixels"]) * 1024 * 1024)

    def _from_long_edge() -> int:
        long_edge = int(spec["target_long_edge"])
        w, h = image.size
        # Scale the shorter side so the source aspect ratio is preserved.
        if w >= h:
            new_w, new_h = long_edge, int(round(long_edge * (h / w)))
        else:
            new_w, new_h = int(round(long_edge * (w / h))), long_edge
        return int(new_w * new_h)

    resolvers = (
        ("target_area", _explicit_area),
        ("target_megapixels", _from_megapixels),
        ("target_long_edge", _from_long_edge),
    )
    for spec_key, resolve in resolvers:
        if spec_key not in spec:
            continue
        try:
            return resolve()
        except Exception:
            # Malformed spec value: fall through to the next option.
            continue

    return int(float(user_target_megapixels) * 1024 * 1024)
506
 
507
  # ============================================================
508
+ # Helpers: gallery normalization
509
  # ============================================================
510
 
511
+ def _to_pil_rgb(x) -> Optional[Image.Image]:
512
+ if x is None:
513
+ return None
514
+ if isinstance(x, tuple) and len(x) >= 1:
515
+ x = x[0]
516
+ if x is None:
517
+ return None
518
+ if isinstance(x, Image.Image):
519
+ return x.convert("RGB")
520
+ if isinstance(x, np.ndarray):
521
+ return Image.fromarray(x).convert("RGB")
522
+ try:
523
+ return Image.fromarray(np.array(x)).convert("RGB")
524
+ except Exception:
525
+ return None
526
 
527
def _append_to_gallery(existing, new_img: Image.Image):
    """
    Build a fresh gallery list: every convertible existing item, then ``new_img``.

    Existing items go through ``_to_pil_rgb``; items it cannot convert are
    dropped. ``new_img`` is appended as-is.
    """
    converted = (_to_pil_rgb(entry) for entry in (existing or ()))
    gallery = [pil for pil in converted if pil is not None]
    gallery.append(new_img)
    return gallery
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
536
 
537
def lora_requires_two_images(lora_adapter: str) -> bool:
    """Return whether the adapter's spec sets ``requires_two_images``."""
    spec = ADAPTER_SPECS.get(lora_adapter, {})
    needs_second = spec.get("requires_two_images", False)
    return bool(needs_second)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
 
540
def image2_label_for_lora(lora_adapter: str) -> str:
    """Return the adapter's ``image2_label``, defaulting to ``"Picture 2"``."""
    spec = ADAPTER_SPECS.get(lora_adapter, {})
    label = spec.get("image2_label", "Picture 2")
    return str(label)
542
 
543
  # ============================================================
544
+ # Helpers: BFS alpha key fix / strict filtering for merged safetensors
545
  # ============================================================
546
 
547
+ def _inject_missing_alpha_keys(state_dict: dict) -> dict:
548
+ bases = {}
549
+ for k, v in state_dict.items():
550
+ if not isinstance(v, torch.Tensor):
551
+ continue
552
+ if k.endswith(".lora_down.weight") and v.ndim >= 1:
553
+ base = k[: -len(".lora_down.weight")]
554
+ rank = int(v.shape[0])
555
+ bases[base] = rank
556
+
557
+ for base, rank in bases.items():
558
+ alpha_tensor = torch.tensor(float(rank), dtype=torch.float32)
559
+ full_alpha = f"{base}.alpha"
560
+ if full_alpha not in state_dict:
561
+ state_dict[full_alpha] = alpha_tensor
562
+ if base.startswith("diffusion_model."):
563
+ stripped_base = base[len("diffusion_model.") :]
564
+ stripped_alpha = f"{stripped_base}.alpha"
565
+ if stripped_alpha not in state_dict:
566
+ state_dict[stripped_alpha] = alpha_tensor
567
+ return state_dict
568
 
569
+ def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
570
+ keep_suffixes = (
571
+ ".lora_up.weight",
572
+ ".lora_down.weight",
573
+ ".lora_mid.weight",
574
+ ".alpha",
575
+ ".lora_alpha",
576
+ )
577
+ dropped_patch = 0
578
+ dropped_other = 0
579
+ kept = 0
580
+ normalized_alpha = 0
581
+
582
+ out = {}
583
+ for k, v in state_dict.items():
584
+ if not isinstance(v, torch.Tensor):
585
+ dropped_other += 1
586
+ continue
587
+ if k.endswith(".diff") or k.endswith(".diff_b"):
588
+ dropped_patch += 1
589
+ continue
590
+ if not k.endswith(keep_suffixes):
591
+ dropped_other += 1
592
+ continue
593
+ if k.endswith(".lora_alpha"):
594
+ base = k[: -len(".lora_alpha")]
595
+ k2 = f"{base}.alpha"
596
+ out[k2] = v.float() if v.dtype != torch.float32 else v
597
+ normalized_alpha += 1
598
+ kept += 1
599
+ continue
600
+ out[k] = v
601
+ kept += 1
602
+
603
+ stats = {
604
+ "kept": kept,
605
+ "dropped_patch": dropped_patch,
606
+ "dropped_other": dropped_other,
607
+ "normalized_alpha": normalized_alpha,
608
+ }
609
+ return out, stats
610
+
611
+ def _duplicate_stripped_prefix_keys(state_dict: dict, prefix: str = "diffusion_model.") -> dict:
612
+ out = dict(state_dict)
613
+ for k, v in list(state_dict.items()):
614
+ if not k.startswith(prefix):
615
+ continue
616
+ stripped = k[len(prefix) :]
617
+ if stripped not in out:
618
+ out[stripped] = v
619
+ return out
620
+
621
def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name: str, needs_alpha_fix: bool = False):
    """
    Load a LoRA into the global pipeline, with a repaired-dict retry.

    The normal hub-based load is tried first. If it raises ``KeyError`` or
    ``ValueError`` and ``needs_alpha_fix`` is set, the safetensors file is
    downloaded, missing alpha keys are injected, non-LoRA keys are filtered
    out, prefix-free aliases are added, and the resulting dict is loaded
    instead. Without ``needs_alpha_fix`` the original error is re-raised.
    """
    try:
        pipe.load_lora_weights(repo, weight_name=weight_name, adapter_name=adapter_name)
    except (KeyError, ValueError) as e:
        if not needs_alpha_fix:
            raise

        print(
            "⚠️ LoRA load failed (will try safe dict fallback). "
            f"Adapter={adapter_name!r} file={weight_name!r} error={type(e).__name__}: {e}"
        )
        checkpoint_path = hf_hub_download(repo_id=repo, filename=weight_name)
        repaired = _inject_missing_alpha_keys(safetensors_load_file(checkpoint_path))
        repaired, stats = _filter_to_diffusers_lora_keys(repaired)
        repaired = _duplicate_stripped_prefix_keys(repaired)
        print("LoRA dict stats:", stats)
        pipe.load_lora_weights(repaired, adapter_name=adapter_name)
641
+
642
def _ensure_loaded_and_get_active_adapters(selected_lora: str):
    """
    Load the selected adapter(s) if needed and return what to activate.

    A spec of type ``"package"`` contributes one adapter per entry in its
    ``parts`` list; any other spec contributes itself as a single adapter.
    Adapters already recorded in ``LOADED_ADAPTERS`` are not loaded again.

    Returns:
        ``(adapter_names, adapter_weights)`` as parallel lists.

    Raises:
        gr.Error: If no spec exists for ``selected_lora`` or a package spec
            has no parts.
    """
    spec = ADAPTER_SPECS.get(selected_lora)
    if not spec:
        raise gr.Error(f"Configuration not found for: {selected_lora}")

    if spec.get("type") == "package":
        entries = spec.get("parts", [])
        if not entries:
            raise gr.Error(f"Package spec has no parts: {selected_lora}")
    else:
        # A single-adapter spec carries the same fields as a package part.
        entries = [spec]

    adapter_names = []
    adapter_weights = []
    for entry in entries:
        repo = entry["repo"]
        weights = entry["weights"]
        name = entry["adapter_name"]
        strength = float(entry.get("strength", 1.0))
        needs_alpha_fix = bool(entry.get("needs_alpha_fix", False))

        if name not in LOADED_ADAPTERS:
            _load_lora_weights_with_fallback(repo, weights, name, needs_alpha_fix=needs_alpha_fix)
            LOADED_ADAPTERS.add(name)

        adapter_names.append(name)
        adapter_weights.append(strength)

    return adapter_names, adapter_weights
683
 
684
  # ============================================================
685
+ # UI helpers
686
  # ============================================================
687
 
688
+ def _fmt_img_info(img: Optional[Image.Image]) -> str:
689
+ if img is None:
690
+ return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
691
  w, h = img.size
692
+ mp = (w * h) / (1024 * 1024)
693
+ ar = (w / h) if h else 0
694
+ return f"**{w}×{h}** **{mp:.2f} MP** **AR {ar:.3f}**"
695
+
696
def _bfs_tooltip(selected_lora: str) -> dict:
    """
    Build the update for the hint shown under the adapter selector.

    The two BFS face-swap adapters and AnyPose get a visible hint explaining
    the roles of Picture 1 and Picture 2; any other selection hides the hint
    and clears its text.

    The return annotation is ``dict`` rather than ``gr.Update``: annotations
    are evaluated when the function is defined, and only the ``gr.update()``
    helper is a documented Gradio name, so referencing ``gr.Update`` can fail
    at import time.

    Returns:
        The value produced by ``gr.update(...)``.
    """
    if selected_lora in ("BFS-Best-FaceSwap", "BFS-Best-FaceSwap-merge"):
        return gr.update(
            visible=True,
            value="ℹ️ **BFS FaceSwap:** Picture 1 = **Base** (scene), Picture 2 = **Donor** (head/face).",
        )
    if selected_lora == "AnyPose":
        return gr.update(
            visible=True,
            value="ℹ️ **AnyPose:** Picture 1 = **Subject**, Picture 2 = **Pose reference**.",
        )
    return gr.update(visible=False, value="")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
708
 
709
+ # ============================================================
710
+ # Inference
711
+ # ============================================================
712
 
713
+ def _seed_everything(seed: int):
714
+ random.seed(seed)
715
+ np.random.seed(seed)
716
+ torch.manual_seed(seed)
717
+ if torch.cuda.is_available():
718
+ torch.cuda.manual_seed_all(seed)
719
 
720
@spaces.GPU
def infer(
    img1: Image.Image,
    img2: Optional[Image.Image],
    extra_gallery,
    prompt: str,
    lora_adapter: str,
    seed: int,
    randomize_seed: bool,
    guidance_scale: float,
    steps: int,
    target_megapixels: float,
    use_input_area: bool,
    keep_2x_output: bool,
    vae_tiling: bool,
    extras_condition_only: bool,
    resolution_multiple: int,
    vae_ref_megapixels: float,
    use_depth: bool,
    derived_on_gpu: bool,
):
    """
    Run one image edit through the global pipeline.

    Args:
        img1: Base image; required.
        img2: Second image. Used only when the selected adapter declares
            ``requires_two_images``; ignored otherwise.
        extra_gallery: Optional gallery items appended after the base image(s).
        prompt: Edit instruction passed to the pipeline.
        lora_adapter: Adapter name, or ``NONE_LORA`` for no adapter.
        seed: Seed to use when ``randomize_seed`` is false.
        randomize_seed: Draw a fresh random seed instead of using ``seed``.
        guidance_scale: Passed to the pipeline as ``true_cfg_scale``.
        steps: Number of inference steps.
        target_megapixels: Canvas area in MP; values <= 0 mean "use the
            input image's area".
        use_input_area: Force the canvas area to Picture 1's pixel count.
        keep_2x_output: Return the 2x generation instead of downsampling it
            back to the base canvas size.
        vae_tiling: Enable or disable VAE tiling for this call.
        extras_condition_only: When true, only the base image(s) are listed
            in ``vae_image_indices``; otherwise every image except the
            derived depth map is.
        resolution_multiple: Step both canvas sides are snapped to; also
            forwarded to the pipeline.
        vae_ref_megapixels: When > 0, converted to a pixel area and passed as
            ``vae_ref_area`` (applied by the pipeline from index
            ``base_count`` onwards — semantics live in the pipeline).
        use_depth: Append a depth map of Picture 1 as an extra reference.
        derived_on_gpu: Run the depth model on GPU when available.

    Returns:
        ``(result_image, seed_used, depth_preview_or_None)``.

    Raises:
        gr.Error: If Picture 1 is missing, or the adapter needs Picture 2 and
            none was given.
    """
    if img1 is None:
        raise gr.Error("Picture 1 is required.")

    img1 = img1.convert("RGB")
    img2 = img2.convert("RGB") if img2 is not None else None

    # Seed
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    seed = int(seed) % MAX_SEED
    _seed_everything(seed)

    # VAE tiling toggle
    _apply_vae_tiling(bool(vae_tiling))

    # Load / activate LoRA
    if lora_adapter != NONE_LORA:
        adapter_names, adapter_weights = _ensure_loaded_and_get_active_adapters(lora_adapter)
        pipe.set_adapters(adapter_names, adapter_weights)
    else:
        try:
            pipe.set_adapters([])
        except Exception:
            # Best-effort deactivation; presumably fails when no adapter has
            # been loaded yet, which is harmless here.
            pass

    # Images list: Picture1, Picture2 (only if the adapter needs it), extras..., derived (optional)
    images = [img1]
    base_count = 1

    if lora_requires_two_images(lora_adapter):
        if img2 is None:
            raise gr.Error(f"{lora_adapter} requires Picture 2.")
        images.append(img2)
        base_count = 2

    if extra_gallery:
        converted = (_to_pil_rgb(item) for item in extra_gallery)
        images.extend(pil for pil in converted if pil is not None)

    derived_preview = None
    derived_index = None
    if use_depth:
        derived_preview = make_depth_map(img1, use_gpu=bool(derived_on_gpu))
        images.append(derived_preview)
        derived_index = len(images) - 1

    # Canvas sizing
    res_mult = int(resolution_multiple)
    if use_input_area or float(target_megapixels) <= 0.0:
        target_area = int(img1.width * img1.height)
    else:
        target_area = int(get_target_area_for_lora(img1, lora_adapter, float(target_megapixels)))

    base_w, base_h = compute_canvas_dimensions_from_area(img1, target_area, res_mult)

    # Generate at 2x, then downsample unless keep_2x_output
    gen_w, gen_h = int(base_w * 2), int(base_h * 2)

    # Extra refs routing (VAE vs conditioning-only)
    if extras_condition_only:
        vae_indices = list(range(base_count))
    else:
        vae_indices = list(range(len(images)))

    # Derived depth should ALWAYS be conditioning-only
    if derived_index is not None and derived_index in vae_indices:
        vae_indices = [i for i in vae_indices if i != derived_index]

    # VAE ref size override for extras only
    vae_ref_area = None
    if float(vae_ref_megapixels) > 0.0:
        vae_ref_area = int(float(vae_ref_megapixels) * 1024 * 1024)

    try:
        out = pipe(
            image=images,
            prompt=prompt,
            true_cfg_scale=float(guidance_scale),
            num_inference_steps=int(steps),
            width=int(gen_w),
            height=int(gen_h),
            pad_to_canvas=True,
            vae_image_indices=vae_indices,
            resolution_multiple=int(res_mult),
            vae_ref_area=vae_ref_area,
            vae_ref_start_index=int(base_count),
            generator=torch.Generator(device=device).manual_seed(seed),
        )

        result = out.images[0] if hasattr(out, "images") else out[0][0]
        if isinstance(result, np.ndarray):
            result = Image.fromarray(result)

        result = result.convert("RGB")

        if not keep_2x_output:
            result = result.resize((base_w, base_h), Image.Resampling.LANCZOS)
    finally:
        # Release memory even when generation fails (e.g. CUDA OOM), so a
        # failed request does not leave the next one short of memory.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return result, seed, derived_preview
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
851
 
852
+ # ============================================================
853
+ # UI
854
+ # ============================================================
855
 
856
def _on_lora_change(selected_lora: str):
    """Build the UI updates triggered by picking a different LoRA.

    Args:
        selected_lora: The adapter name currently chosen in the dropdown.

    Returns:
        A 3-tuple ``(prompt_update, img2_update, tooltip_update)``:
        the prompt is only overwritten when ``LORA_PRESET_PROMPTS`` holds a
        non-empty preset for the adapter; Picture 2 is always kept visible
        and only its label changes; the last item is whatever
        ``_bfs_tooltip`` returns for the adapter.
    """
    # An empty/missing preset must leave the user's prompt untouched.
    preset_text = LORA_PRESET_PROMPTS.get(selected_lora, "")
    prompt_update = gr.update(value=preset_text) if preset_text else gr.update()

    # Picture 2 stays visible in both cases (it is optional for single-image
    # adapters); two-image adapters just get a more specific label.
    if lora_requires_two_images(selected_lora):
        second_label = image2_label_for_lora(selected_lora)
    else:
        second_label = "Picture 2"
    img2_update = gr.update(visible=True, label=second_label)

    return prompt_update, img2_update, _bfs_tooltip(selected_lora)
 
 
 
869
 
870
def _out_to_pic1(out_img):
    """Copy the current output image into the Picture 1 input.

    Args:
        out_img: The value of the output image component, or ``None`` when
            nothing has been generated yet.

    Returns:
        A Gradio update setting Picture 1 to ``out_img``. When ``out_img`` is
        ``None`` a no-op update is returned instead, so pressing the button
        before a run does not wipe the image the user already loaded (same
        guard as ``_out_to_extras``).
    """
    if out_img is None:
        return gr.update()
    return gr.update(value=out_img)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
872
 
873
def _out_to_pic2(out_img):
    """Copy the current output image into the Picture 2 input.

    Args:
        out_img: The value of the output image component, or ``None`` when
            nothing has been generated yet.

    Returns:
        A Gradio update setting Picture 2 to ``out_img``. When ``out_img`` is
        ``None`` a no-op update is returned instead, so pressing the button
        before a run does not wipe the image the user already loaded (same
        guard as ``_out_to_extras``).
    """
    if out_img is None:
        return gr.update()
    return gr.update(value=out_img)
875
 
876
def _out_to_extras(existing, out_img):
    """Append the current output image to the extra-references gallery.

    Args:
        existing: The gallery's current value, passed through to
            ``_append_to_gallery``.
        out_img: The output image to append, or ``None`` if there is none.

    Returns:
        A Gradio update carrying the result of ``_append_to_gallery``, or a
        no-op update when there is no output image to add.
    """
    if out_img is not None:
        return gr.update(value=_append_to_gallery(existing, out_img))
    # Nothing generated yet: leave the gallery exactly as it is.
    return gr.update()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
880
 
881
# NOTE(review): the nesting of the `with` blocks below was reconstructed from a
# flattened listing — confirm it against the rendered layout.
with gr.Blocks(theme=orange_red_theme) as demo:
    # Header: feature summary plus the AIO version resolved at import time.
    gr.Markdown(
        f"""
# Qwen Image Edit — Rapid AIO LoRAs (Merged)
This experimental space for **QIE-2511** uses an extracted Rapid AIO transformer with LoRA support and extra routing features.

**Enabled features**
- Optional conditioning-only routing for extra reference latents
- Uncapped canvas sizing (MP-based) + **2× generation with optional downsample**
- Optional **VAE tiling** (for high resolutions)
- Optional **Depth mapping** for conditioning
- Optional output routing back to inputs

**Active AIO version:** `{AIO_VERSION}` *(source: {AIO_VERSION_SOURCE})*
"""
    )

    with gr.Row():
        # ---- Left column: image inputs and derived conditioning ----
        with gr.Column(scale=1):
            img1 = gr.Image(label="Picture 1", type="pil")
            img1_info = gr.Markdown("—")
            img2 = gr.Image(label="Picture 2", type="pil")
            img2_info = gr.Markdown("—")

            # Hidden until the LoRA change handler supplies an update for it.
            bfs_tip = gr.Markdown(visible=False)

            extra_gallery = gr.Gallery(
                label="Extra references (optional)",
                columns=4,
                height=180,
            )

            with gr.Row():
                use_depth = gr.Checkbox(
                    label="Use Depth conditioning (adds a derived reference)",
                    value=False,
                )
                derived_on_gpu = gr.Checkbox(
                    label="Run depth on GPU (if available)",
                    value=True,
                )

            derived_preview = gr.Image(
                label="Derived conditioning preview",
                interactive=False,
                format="png",
            )

        # ---- Right column: generation settings, run button, output ----
        with gr.Column(scale=1):
            lora_adapter = gr.Dropdown(
                label="LoRA",
                choices=[NONE_LORA, *sorted(ADAPTER_SPECS.keys())],
                value=NONE_LORA,
            )

            prompt = gr.Textbox(
                label="Prompt",
                lines=4,
                placeholder="Describe the edit…",
            )

            with gr.Row():
                steps = gr.Slider(
                    minimum=1, maximum=80, value=40, step=1, label="Steps"
                )
                guidance = gr.Slider(
                    minimum=1.0,
                    maximum=10.0,
                    value=4.0,
                    step=0.1,
                    label="CFG (true_cfg_scale)",
                )

            with gr.Row():
                resolution_multiple = gr.Dropdown(
                    label="Resolution step (LCD lattice)",
                    choices=[32, 56, 112],
                    value=32,
                )
                vae_ref_megapixels = gr.Slider(
                    minimum=0.0,
                    maximum=4.0,
                    value=0.0,
                    step=0.1,
                    label="VAE ref MP override (extras only, 0 = off)",
                )

            with gr.Row():
                target_megapixels = gr.Slider(
                    minimum=0.0,
                    maximum=12.0,
                    value=1.0,
                    step=0.1,
                    label="Canvas megapixels (0 = same as Picture 1)",
                )
                use_input_area = gr.Checkbox(
                    label="Use Picture 1 pixel area", value=False
                )

            with gr.Row():
                keep_2x_output = gr.Checkbox(
                    label="Keep output (otherwise downsample)", value=False
                )
                extras_condition_only = gr.Checkbox(
                    label="Route extras as conditioning-only (no VAE)",
                    value=True,
                )

            with gr.Row():
                vae_tiling = gr.Checkbox(label="VAE tiling", value=False)
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

            seed = gr.Number(label="Seed", value=0, precision=0)

            run_btn = gr.Button("Run", variant="primary")
            out_img = gr.Image(label="Output", type="pil")

            with gr.Row():
                to_pic1 = gr.Button("Output → Picture 1")
                to_pic2 = gr.Button("Output → Picture 2")
                to_extras = gr.Button("Output → Extras (append)")

    # Refresh the info line under each picture whenever its image changes.
    img1.change(fn=lambda x: _fmt_img_info(x), inputs=[img1], outputs=[img1_info])
    img2.change(fn=lambda x: _fmt_img_info(x), inputs=[img2], outputs=[img2_info])

    # Switching LoRA may replace the prompt, relabel Picture 2 and update the tip.
    lora_adapter.change(
        fn=_on_lora_change,
        inputs=[lora_adapter],
        outputs=[prompt, img2, bfs_tip],
    )

    # Components handed to `infer`, in the order they are passed positionally.
    _run_inputs = [
        img1,
        img2,
        extra_gallery,
        prompt,
        lora_adapter,
        seed,
        randomize_seed,
        guidance,
        steps,
        target_megapixels,
        use_input_area,
        keep_2x_output,
        vae_tiling,
        extras_condition_only,
        resolution_multiple,
        vae_ref_megapixels,
        use_depth,
        derived_on_gpu,
    ]
    # `seed` is also an output so the seed returned by `infer` is shown back
    # in the Seed field.
    run_btn.click(
        fn=infer,
        inputs=_run_inputs,
        outputs=[out_img, seed, derived_preview],
    )

    # Feed the latest output back into one of the input slots.
    to_pic1.click(fn=_out_to_pic1, inputs=[out_img], outputs=[img1])
    to_pic2.click(fn=_out_to_pic2, inputs=[out_img], outputs=[img2])
    to_extras.click(
        fn=_out_to_extras,
        inputs=[extra_gallery, out_img],
        outputs=[extra_gallery],
    )

# Queue up to 32 pending requests, then start serving the app.
demo.queue(max_size=32).launch()