[FR] Please support ref image and multiple control videos in Wan VACE

@a-r-r-o-w

Is your feature request related to a problem? Please describe.
Not a problem, but this would be a good feature to support in the Wan VACE pipeline, since the model itself supports it.

Describe the solution you'd like.

from controlnet_aux import OpenposeDetector, MidasDetector
from diffusers.utils import load_video, export_to_video, load_image
import torch
from diffusers import AutoencoderKLWan, WanVACEPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler

# Usage sketch for the feature request: derive two control videos (OpenPose and
# MiDaS depth) from one source clip, then hand both control videos plus a
# reference image to WanVACEPipeline in a single call.
# NOTE(review): the list-of-videos `video=[...]` argument and the `image=`
# argument below illustrate the API being requested; confirm against the
# pipeline's actual signature before running.

# Free video downloaded from Pexels
video = load_video("https://huggingface.co/datasets/newgenai79/testing/resolve/main/2795750-hd_832_480_25fps.mp4?download=true")
# Normalize every source frame to RGB at 832x480 before running the detectors.
video = [frame.convert("RGB").resize((832, 480)) for frame in video]
ref_image = load_image("https://huggingface.co/datasets/newgenai79/testing/resolve/main/00766-4015437123.png?download=true")

# Stage 1: run the OpenPose detector on each frame and save the result as a
# 25 fps video file that is reloaded further down.
open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
open_pose.to("cuda")
openpose_video = [open_pose(frame) for frame in video]
export_to_video(openpose_video, "openpose-man-contemporary-dance.mp4", fps=25)
# Drop the detector and its output frames; presumably to release memory before
# the next model is loaded.
del open_pose, openpose_video

# Stage 2: same procedure with the MiDaS depth detector.
depth_midas = MidasDetector.from_pretrained("lllyasviel/Annotators")
depth_midas.to("cuda")
depth_video = [depth_midas(frame) for frame in video]
export_to_video(depth_video, "depth-man-contemporary-dance.mp4", fps=25)
# The source frames are no longer needed after this point either.
del depth_midas, video, depth_video



# Stage 3: build the Wan VACE pipeline. The VAE is loaded in float32 while the
# rest of the pipeline is loaded in bfloat16.
model_id = "a-r-r-o-w/Wan-VACE-1.3B-diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
flow_shift = 3.0  # 5.0 for 720P, 3.0 for 480P
# Swap in a UniPC scheduler built from the pipeline's existing scheduler config,
# overriding only flow_shift.
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
pipe.to("cuda")

prompt = "An alien-like creature with a resemblance of leaves, branches and twigs is dancing gracefully in a post-apocalyptic world. The creature has a humanoid shape, with long, flowing limbs that resemble branches. Its skin is textured like bark, and its eyes glow softly. The background is a desolate landscape with remnants of a once-thriving city, now overgrown with vegetation. The lighting is soft and ethereal, casting a magical glow on the scene. The camera captures the creature from a low angle, emphasizing its height and gracefulness as it moves fluidly through the air."
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

height = 480
width = 832
# NOTE(review): diffusers' Wan pipelines describe num_frames in the form 4k + 1
# (e.g. 49 or 81); confirm how a value of 50 is handled.
num_frames = 50
# Reload the saved control videos, keep only the first num_frames frames, and
# resize each frame to the generation resolution.
open_pose_video = load_video("openpose-man-contemporary-dance.mp4")[:num_frames]
open_pose_video = [frame.convert("RGB").resize((width, height)) for frame in open_pose_video]

depth_video = load_video("depth-man-contemporary-dance.mp4")[:num_frames]
depth_video = [frame.convert("RGB").resize((width, height)) for frame in depth_video]

"""
In this scenario either prompt or image without prompt..
"""

# Requested call shape: both control videos are passed together as a list, and
# the reference image is passed alongside (or instead of) the text prompt.
# The first entry of the returned `frames` is exported as a 16 fps video.
output = pipe(
    video=[open_pose_video, depth_video],
    prompt=prompt,
    # and / or
    image=ref_image,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=30,
    guidance_scale=5.0,
    generator=torch.Generator().manual_seed(42),
).frames[0]
export_to_video(output, "output.mp4", fps=16)

Describe alternatives you've considered.
This is supported in Comfy (as shown in the video), and I got the idea when it was requested here:
deepbeepmeep/Wan2GP#266

Additional context.
Discussion

@a-r-r-o-w @DN6

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[FR] Please support ref image and multiple control videos in Wan VACE #11674

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[FR] Please support ref image and multiple control videos in Wan VACE #11674

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions