Problems with Kontext quantization

I am running into some problems with quantized Kontext models.

FP8 quantization
This is my code:

import torch
from diffusers import FluxKontextPipeline, FluxTransformer2DModel, AutoencoderKL, FlowMatchEulerDiscreteScheduler, GGUFQuantizationConfig
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5Tokenizer
from diffusers.utils import load_image
from huggingface_hub import login

# Repro script: build a FluxKontextPipeline whose transformer comes from a
# single-file FP8 checkpoint, then run one image-editing call and save the result.

# Base repository that supplies every component except the transformer.
model_path = "black-forest-labs/FLUX.1-Kontext-dev"
fp8_checkpoint = "https://huggingface.co/Comfy-Org/flux1-kontext-dev_ComfyUI/blob/main/split_files/diffusion_models/flux1-dev-kontext_fp8_scaled.safetensors"

# The transformer is loaded straight from the checkpoint URL; no dtype or
# config argument is passed at this point.
transformer = FluxTransformer2DModel.from_single_file(fp8_checkpoint)

# Text encoders and VAE are requested in bfloat16 from the base repository.
bf16_kwargs = {"torch_dtype": torch.bfloat16}
clip_model = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder", **bf16_kwargs)
t5_model = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder_2", **bf16_kwargs)
vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae", **bf16_kwargs)

# Tokenizers and scheduler take no dtype argument.
tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
tokenizer_2 = T5Tokenizer.from_pretrained(model_path, subfolder="tokenizer_2")
scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")

# Assemble the pipeline by hand from the individually loaded parts.
components = {
    "vae": vae,
    "text_encoder": clip_model,
    "text_encoder_2": t5_model,
    "transformer": transformer,
    "tokenizer": tokenizer,
    "tokenizer_2": tokenizer_2,
    "scheduler": scheduler,
}
pipe = FluxKontextPipeline(**components)

# Move to the GPU first, then cast the whole pipeline to bfloat16.
pipe_on_gpu = pipe.to("cuda")
pipe_on_gpu.to(torch.bfloat16)

input_image = load_image("https://images.squarespace-cdn.com/content/v1/5691c07b69492ec8d7400b4c/1606213149449-YYZKF3NPCITJGMLM3VAG/Boris%2BPoplavsky.jpg")
prompt = "Colorize this photograph. Crisp intricate textured detailing. Rich tonality. Crisp details. Chromatic richness and range. Make it indistinguishable from a top quality large-format UHD or 50MP+ or 12k+ professional photo taken using a DSLR camera in 2025, upscale quality and carefully restore any blemished or blurred details. Retain facial features and proportions with great fidelity to the source. Add fine textural detailing only where missing. Render tones consistent with real life. Leave everything else about the photo unchanged."

# Fixed seed so the run is repeatable.
seeded_generator = torch.Generator().manual_seed(42)

result = pipe(
    image=input_image,
    prompt=prompt,
    guidance_scale=2.5,
    num_inference_steps=10,
    generator=seeded_generator,
)
image = result.images[0]

image.save("image.png")

The resulting image was completely blank, like this:

Does Diffusers support this fp8 checkpoint?

GGUF quantization with unquantized LoRA
The code I used for this:

import torch
from diffusers import FluxKontextPipeline, FluxTransformer2DModel, AutoencoderKL, FlowMatchEulerDiscreteScheduler, GGUFQuantizationConfig
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5Tokenizer
from diffusers.utils import load_image

# Repro script: load a GGUF-quantized Kontext transformer, attach and fuse a
# LoRA adapter, then run one image-editing call and save the result.

# Base repository for the non-transformer components, and the GGUF checkpoint
# used for the transformer.
model_path = "black-forest-labs/FLUX.1-Kontext-dev"
ckpt_path = "https://huggingface.co/QuantStack/FLUX.1-Kontext-dev-GGUF/blob/main/flux1-kontext-dev-Q3_K_M.gguf"

# GGUF weights are loaded with bfloat16 as the compute dtype; the model config
# is taken from the "transformer" subfolder of the base repository.
gguf_config = GGUFQuantizationConfig(compute_dtype=torch.bfloat16)
transformer = FluxTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=gguf_config,
    torch_dtype=torch.bfloat16,
    config="black-forest-labs/FLUX.1-Kontext-dev",
    subfolder="transformer",
)

# Text encoders and VAE are requested in bfloat16 from the base repository.
bf16_kwargs = {"torch_dtype": torch.bfloat16}
clip_model = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder", **bf16_kwargs)
t5_model = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder_2", **bf16_kwargs)
vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae", **bf16_kwargs)

# Tokenizers and scheduler take no dtype argument.
tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
tokenizer_2 = T5Tokenizer.from_pretrained(model_path, subfolder="tokenizer_2")
scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")

# Assemble the pipeline by hand from the individually loaded parts.
components = {
    "transformer": transformer,
    "text_encoder": clip_model,
    "text_encoder_2": t5_model,
    "vae": vae,
    "tokenizer": tokenizer,
    "tokenizer_2": tokenizer_2,
    "scheduler": scheduler,
}
pipe = FluxKontextPipeline(**components)

# Load the (unquantized) LoRA adapter, fuse it, and only then move to the GPU.
adapter_id = "alimama-creative/FLUX.1-Turbo-Alpha"
pipe.load_lora_weights(adapter_id, adapter_name="flux-Turbo-Alpha")
pipe.fuse_lora()
pipe.to("cuda")

input_image = load_image("https://images.squarespace-cdn.com/content/v1/5691c07b69492ec8d7400b4c/1606213149449-YYZKF3NPCITJGMLM3VAG/Boris%2BPoplavsky.jpg")
prompt = "Colorize this photograph. Crisp intricate textured detailing. Rich tonality. Crisp details. Chromatic richness and range. Make it indistinguishable from a top quality large-format UHD or 50MP+ or 12k+ professional photo taken using a DSLR camera in 2025, upscale quality and carefully restore any blemished or blurred details. Retain facial features and proportions with great fidelity to the source. Add fine textural detailing only where missing. Render tones consistent with real life. Leave everything else about the photo unchanged."

# Fixed seed so the run is repeatable.
seeded_generator = torch.Generator().manual_seed(42)

result = pipe(
    image=input_image,
    prompt=prompt,
    guidance_scale=2.5,
    num_inference_steps=10,
    generator=seeded_generator,
)
image = result.images[0]

image.save("image.png")

Running this code fails with the following error:
RuntimeError: The size of tensor a (512) must match the size of tensor b (256) at non-singleton dimension 1
How can I fix this?

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Problems with Kontext quantization #11992

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Problems with Kontext quantization #11992

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions