Mirror of https://github.com/huggingface/diffusers.git (synced 2026-01-27 22:15:00 +08:00)

Compare commits: modular-do... → hunyuan-i2... (2 commits)
| Author | SHA1 | Date |
|---|---|---|
| | cb11196de5 | |
| | 42c27dbfc0 | |
@@ -344,7 +344,7 @@ class HunyuanVideoImageToVideoPipeline(DiffusionPipeline, HunyuanVideoLoraLoader
         )
         prompt_embeds = self.text_encoder(
             **expanded_inputs,
-            pixel_value=image_embeds,
+            pixel_values=image_embeds,
             output_hidden_states=True,
         ).hidden_states[-(num_hidden_layers_to_skip + 1)]
         prompt_embeds = prompt_embeds.to(dtype=dtype)
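This hunk fixes a typo'd keyword argument. With the text encoder now being `LlavaForConditionalGeneration` (see the test changes below), the forward signature expects `pixel_values`; the old `pixel_value=` would be rejected as an unexpected keyword. A quick way to confirm the parameter name, assuming only that `transformers` is installed:

```python
import inspect

from transformers import LlavaForConditionalGeneration

# The forward signature is the source of truth for the keyword name.
params = inspect.signature(LlavaForConditionalGeneration.forward).parameters
print("pixel_values" in params)  # True
print("pixel_value" in params)   # False: the old kwarg raises a TypeError at call time
```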
@@ -24,9 +24,11 @@ from transformers import (
     CLIPTextModel,
     CLIPTokenizer,
     LlamaConfig,
-    LlamaModel,
-    LlamaTokenizer,
+    LlamaTokenizerFast,
+    LlavaConfig,
+    LlavaForConditionalGeneration,
 )
+from transformers.models.clip import CLIPVisionConfig

 from diffusers import (
     AutoencoderKLHunyuanVideo,
@@ -116,7 +118,7 @@ class HunyuanVideoImageToVideoPipelineFastTests(
         torch.manual_seed(0)
         scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)

-        llama_text_encoder_config = LlamaConfig(
+        text_config = LlamaConfig(
             bos_token_id=0,
             eos_token_id=2,
             hidden_size=16,
@@ -124,11 +126,21 @@ class HunyuanVideoImageToVideoPipelineFastTests(
             layer_norm_eps=1e-05,
             num_attention_heads=4,
             num_hidden_layers=2,
-            pad_token_id=1,
+            pad_token_id=100,
             vocab_size=1000,
             hidden_act="gelu",
             projection_dim=32,
         )
+        vision_config = CLIPVisionConfig(
+            hidden_size=8,
+            intermediate_size=37,
+            projection_dim=32,
+            num_attention_heads=4,
+            num_hidden_layers=2,
+            image_size=224,
+        )
+        llava_text_encoder_config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)

         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
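`LlavaConfig` takes the vision and text sub-configs as its first two positional arguments, so the one-liner above wires `vision_config` and `text_config` into a single composite config. A minimal sketch checking that wiring with the same values (standard `transformers` API; nothing here is specific to diffusers):

```python
from transformers import CLIPVisionConfig, LlamaConfig, LlavaConfig

text_config = LlamaConfig(vocab_size=1000, pad_token_id=100)
vision_config = CLIPVisionConfig(image_size=224)
config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)

# The image placeholder token must be a valid ID in the text vocabulary.
assert config.image_token_index < config.text_config.vocab_size  # 101 < 1000
assert config.vision_config.image_size == 224
```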
@@ -144,8 +156,8 @@ class HunyuanVideoImageToVideoPipelineFastTests(
         )

         torch.manual_seed(0)
-        text_encoder = LlamaModel(llama_text_encoder_config)
-        tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
+        text_encoder = LlavaForConditionalGeneration(llava_text_encoder_config)
+        tokenizer = LlamaTokenizerFast.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")

         torch.manual_seed(0)
         text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
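The model swap is what makes the `pixel_values` call in the pipeline hunk meaningful: `LlamaModel` is text-only, while `LlavaForConditionalGeneration` routes `pixel_values` through its CLIP vision tower. The tokenizer moves from the slow to the fast class over the same hub files; a quick check (assumes network access to the `finetrainers/dummy-hunyaunvideo` repo the test already uses):

```python
from transformers import LlamaTokenizerFast

tokenizer = LlamaTokenizerFast.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
print(type(tokenizer).__name__, tokenizer.is_fast)  # LlamaTokenizerFast True
```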
@@ -153,14 +165,14 @@ class HunyuanVideoImageToVideoPipelineFastTests(

         torch.manual_seed(0)
         image_processor = CLIPImageProcessor(
-            crop_size=336,
+            crop_size=224,
             do_center_crop=True,
             do_normalize=True,
             do_resize=True,
             image_mean=[0.48145466, 0.4578275, 0.40821073],
             image_std=[0.26862954, 0.26130258, 0.27577711],
             resample=3,
-            size=336,
+            size=224,
         )

         components = {
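The 336 → 224 change keeps the processor output aligned with `image_size=224` in the new `CLIPVisionConfig`; the Llava vision tower would otherwise receive inputs at the wrong resolution. A sketch of the shape contract (assumes `transformers`, `torch`, and `numpy`; the zero image is a stand-in for any input):

```python
import numpy as np
from transformers import CLIPImageProcessor

processor = CLIPImageProcessor(size=224, crop_size=224, do_resize=True, do_center_crop=True)
image = np.zeros((336, 336, 3), dtype=np.uint8)  # larger inputs get resized, then center-cropped
pixel_values = processor(images=image, return_tensors="pt").pixel_values
print(tuple(pixel_values.shape))  # (1, 3, 224, 224), matching CLIPVisionConfig(image_size=224)
```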
@@ -190,6 +202,10 @@ class HunyuanVideoImageToVideoPipelineFastTests(
             "prompt_template": {
                 "template": "{}",
                 "crop_start": 0,
+                "image_emb_len": 49,
+                "image_emb_start": 5,
+                "image_emb_end": 54,
+                "double_return_token_id": 0,
             },
             "generator": generator,
             "num_inference_steps": 2,
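Reading the new fields as the position of the image-embedding span inside the tokenized prompt (an interpretation from the field names, not stated in the diff), the three values are mutually consistent:

```python
image_emb_start, image_emb_end, image_emb_len = 5, 54, 49
assert image_emb_end - image_emb_start == image_emb_len  # 54 - 5 == 49
```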
@@ -197,7 +213,7 @@ class HunyuanVideoImageToVideoPipelineFastTests(
             "height": image_height,
             "width": image_width,
             "num_frames": 9,
-            "max_sequence_length": 16,
+            "max_sequence_length": 64,
             "output_type": "pt",
         }
         return inputs
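A plausible reading of the 16 → 64 bump, inferred from the values above rather than stated in the diff: the tokenized prompt must now be long enough to hold the full image-embedding span, which ends at position 54, so 16 would have truncated it.

```python
max_sequence_length = 64
image_emb_end = 54  # from the prompt_template hunk above
assert max_sequence_length >= image_emb_end  # 16 < 54 would cut off the image span
```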