Open
Description
Is your feature request related to a problem? Please describe.
Not a problem, but a good feature to support in the Wan VACE pipeline, since the model itself supports it.
Describe the solution you'd like.
from controlnet_aux import OpenposeDetector, MidasDetector
from diffusers.utils import load_video, export_to_video, load_image
import torch
from diffusers import AutoencoderKLWan, WanVACEPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
# --- Step 1: build the control videos (pose + depth) from one source clip ---
# The source clip is a free video downloaded from Pexels.
SOURCE_VIDEO_URL = "https://huggingface.co/datasets/newgenai79/testing/resolve/main/2795750-hd_832_480_25fps.mp4?download=true"
REFERENCE_IMAGE_URL = "https://huggingface.co/datasets/newgenai79/testing/resolve/main/00766-4015437123.png?download=true"
ANNOTATOR_REPO = "lllyasviel/Annotators"
CONTROL_FPS = 25

# Every frame is converted to RGB and resized to 832x480 before annotation.
video = [frame.convert("RGB").resize((832, 480)) for frame in load_video(SOURCE_VIDEO_URL)]
ref_image = load_image(REFERENCE_IMAGE_URL)

# Pose control: run the OpenPose annotator on each frame and save the result.
open_pose = OpenposeDetector.from_pretrained(ANNOTATOR_REPO)
open_pose.to("cuda")
openpose_video = list(map(open_pose, video))
export_to_video(openpose_video, "openpose-man-contemporary-dance.mp4", fps=CONTROL_FPS)
# Drop the detector and its output; the frames are re-read from disk later.
del open_pose, openpose_video

# Depth control: same procedure with the MiDaS annotator.
depth_midas = MidasDetector.from_pretrained(ANNOTATOR_REPO)
depth_midas.to("cuda")
depth_video = list(map(depth_midas, video))
export_to_video(depth_video, "depth-man-contemporary-dance.mp4", fps=CONTROL_FPS)
# The source frames are no longer needed once both control videos are exported.
del depth_midas, video, depth_video
# --- Step 2: assemble the Wan VACE pipeline ---
model_id = "a-r-r-o-w/Wan-VACE-1.3B-diffusers"

# The VAE is loaded in float32 while the rest of the pipeline uses bfloat16.
vae = AutoencoderKLWan.from_pretrained(
    model_id,
    subfolder="vae",
    torch_dtype=torch.float32,
)
pipe = WanVACEPipeline.from_pretrained(
    model_id,
    vae=vae,
    torch_dtype=torch.bfloat16,
)

# Flow shift depends on the target resolution: 5.0 for 720P, 3.0 for 480P.
flow_shift = 3.0
unipc_scheduler = UniPCMultistepScheduler.from_config(
    pipe.scheduler.config,
    flow_shift=flow_shift,
)
pipe.scheduler = unipc_scheduler
pipe.to("cuda")
# --- Step 3: text conditioning, output geometry and control inputs ---
prompt = "An alien-like creature with a resemblance of leaves, branches and twigs is dancing gracefully in a post-apocalyptic world. The creature has a humanoid shape, with long, flowing limbs that resemble branches. Its skin is textured like bark, and its eyes glow softly. The background is a desolate landscape with remnants of a once-thriving city, now overgrown with vegetation. The lighting is soft and ethereal, casting a magical glow on the scene. The camera captures the creature from a low angle, emphasizing its height and gracefulness as it moves fluidly through the air."
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

# Output resolution (matches the 480P flow-shift setting) and clip length.
height, width = 480, 832
# NOTE(review): Wan pipelines appear to expect frame counts of the form 4k + 1;
# 50 may be rounded by the pipeline -- confirm against the pipeline docs.
num_frames = 50

# Re-read the exported control videos, keep the first `num_frames` frames and
# bring each frame to RGB at the generation resolution.
open_pose_video = [
    frame.convert("RGB").resize((width, height))
    for frame in load_video("openpose-man-contemporary-dance.mp4")[:num_frames]
]
depth_video = [
    frame.convert("RGB").resize((width, height))
    for frame in load_video("depth-man-contemporary-dance.mp4")[:num_frames]
]
# --- Step 4: proposed call with several control videos at once ---
# In this scenario the conditioning is either the prompt, or the reference
# image without a prompt (and / or both, as marked on the arguments below).
seeded_generator = torch.Generator().manual_seed(42)  # fixed seed for reproducibility
result = pipe(
    # Requested feature: one control video per control type (pose, depth).
    video=[open_pose_video, depth_video],
    prompt=prompt,
    # and / or
    image=ref_image,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=30,
    guidance_scale=5.0,
    generator=seeded_generator,
)
# Only the first generated video of the batch is kept and written to disk.
output = result.frames[0]
export_to_video(output, "output.mp4", fps=16)
Describe alternatives you've considered.
This is supported in Comfy (as shown in the video), and I got the idea when it was requested here:
deepbeepmeep/Wan2GP#266
Additional context.
Discussion
Metadata
Metadata
Assignees
Labels
No labels