Mirror of https://github.com/huggingface/diffusers.git (synced 2026-01-31 07:55:01 +08:00)

Compare commits: ltx2-impro...main (5 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | ec6b2bcccb |  |
|  | 6a1904eb06 |  |
|  | f5b6b6625a |  |
|  | 1be2f7e8c5 |  |
|  | 314cfddf3a |  |
```diff
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.9.0-runtime-ubuntu20.04
+FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04
 LABEL maintainer="Hugging Face"
 LABEL repository="diffusers"

@@ -36,7 +36,8 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 RUN uv pip install --no-cache-dir \
     torch \
     torchvision \
-    torchaudio
+    torchaudio \
+    --index-url https://download.pytorch.org/whl/cu129

 # Install compatible versions of numba/llvmlite for Python 3.10+
 RUN uv pip install --no-cache-dir \
```
```diff
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.9.0-runtime-ubuntu20.04
+FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04
 LABEL maintainer="Hugging Face"
 LABEL repository="diffusers"

@@ -36,7 +36,8 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 RUN uv pip install --no-cache-dir \
     torch \
     torchvision \
-    torchaudio
+    torchaudio \
+    --index-url https://download.pytorch.org/whl/cu129

 # Install compatible versions of numba/llvmlite for Python 3.10+
 RUN uv pip install --no-cache-dir \
```
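The Dockerfile changes above move to a CUDA 12.9.1 / Ubuntu 24.04 base image and pin the PyTorch install to the cu129 wheel index. A minimal sanity check one might run inside the rebuilt image to confirm the CUDA-enabled wheels were actually picked up; the exact version strings depend on the image and on a GPU being visible at runtime:

```python
import torch

# Quick check that the wheels came from the cu129 index and that CUDA is usable.
# Output values are illustrative; they depend on the built image and the host GPU.
print(torch.__version__)          # expect a "+cu129"-style build string
print(torch.version.cuda)         # CUDA version the wheel was built against
print(torch.cuda.is_available())  # True only when the container can see a GPU
```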
```diff
@@ -106,6 +106,8 @@ video, audio = pipe(
     output_type="np",
     return_dict=False,
 )
+video = (video * 255).round().astype("uint8")
+video = torch.from_numpy(video)

 encode_video(
     video[0],
```
```diff
@@ -183,6 +185,8 @@ video, audio = pipe(
     output_type="np",
     return_dict=False,
 )
+video = (video * 255).round().astype("uint8")
+video = torch.from_numpy(video)

 encode_video(
     video[0],
```
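The two added lines convert the pipeline's `output_type="np"` result, a float array with values in [0, 1], into the uint8 tensor that `encode_video` now expects. A small self-contained sketch of that conversion; the array here is random stand-in data, not a real pipeline output:

```python
import numpy as np
import torch

# Stand-in for a pipeline output with output_type="np":
# float values in [0, 1], shape [batch, frames, height, width, channels].
video = np.random.rand(1, 8, 64, 64, 3).astype(np.float32)

# Scale to [0, 255] and round *before* casting; a bare astype("uint8")
# would truncate and bias pixel values downward.
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)

print(video.shape, video.dtype)  # torch.Size([1, 8, 64, 64, 3]) torch.uint8
```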
```diff
@@ -35,8 +35,8 @@ from . import BaseDiffusersCLICommand
 def conversion_command_factory(args: Namespace):
     if args.use_auth_token:
         warnings.warn(
-            "The `--use_auth_token` flag is deprecated and will be removed in a future version. Authentication is now"
-            " handled automatically if user is logged in."
+            "The `--use_auth_token` flag is deprecated and will be removed in a future version."
+            "Authentication is now handled automatically if the user is logged in."
         )
     return FP16SafetensorsCommand(args.ckpt_id, args.fp16, args.use_safetensors)
```
```diff
@@ -92,8 +92,8 @@ class FP16SafetensorsCommand(BaseDiffusersCLICommand):
         pipeline_class = getattr(import_module("diffusers"), pipeline_class_name)
         self.logger.info(f"Pipeline class imported: {pipeline_class_name}.")

-        # Load the appropriate pipeline. We could have use `DiffusionPipeline`
-        # here, but just to avoid any rough edge cases.
+        # Load the appropriate pipeline. We could have used `DiffusionPipeline`
+        # here, but just to avoid potential edge cases.
         pipeline = pipeline_class.from_pretrained(
             self.ckpt_id, torch_dtype=torch.float16 if self.fp16 else torch.float32
         )
```
```diff
@@ -407,8 +407,8 @@ class GlmImagePipeline(DiffusionPipeline):

         if len(source_grids) > 0:
             prior_token_image_embed = self.vision_language_encoder.get_image_features(
-                inputs["pixel_values"], source_grids, return_dict=False
-            )
+                inputs["pixel_values"], source_grids
+            ).pooler_output
             prior_token_image_embed = torch.cat(prior_token_image_embed, dim=0)
             prior_token_image_ids_d32 = self.vision_language_encoder.get_image_tokens(
                 prior_token_image_embed, source_grids
```
```diff
@@ -13,14 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from collections.abc import Generator, Iterator
 from fractions import Fraction
-from typing import List, Optional, Tuple, Union
+from typing import Optional

 import numpy as np
-import PIL.Image
 import torch
-from tqdm import tqdm

 from ...utils import is_av_available
```
```diff
@@ -105,52 +101,11 @@ def _write_audio(


 def encode_video(
-    video: Union[List[PIL.Image.Image], np.ndarray, torch.Tensor, Iterator[torch.Tensor]],
-    fps: int,
-    audio: Optional[torch.Tensor],
-    audio_sample_rate: Optional[int],
-    output_path: str,
-    video_chunks_number: int = 1,
+    video: torch.Tensor, fps: int, audio: Optional[torch.Tensor], audio_sample_rate: Optional[int], output_path: str
 ) -> None:
     """
     Encodes a video with audio using the PyAV library. Based on code from the original LTX-2 repo:
     https://github.com/Lightricks/LTX-2/blob/4f410820b198e05074a1e92de793e3b59e9ab5a0/packages/ltx-pipelines/src/ltx_pipelines/utils/media_io.py#L182
-
-    Args:
-        video (`List[PIL.Image.Image]` or `np.ndarray` or `torch.Tensor`):
-            A video tensor of shape [frames, height, width, channels] with integer pixel values in [0, 255]. If the
-            input is a `np.ndarray`, it is expected to be a float array with values in [0, 1] (which is what pipelines
-            usually return with `output_type="np"`).
-        fps (`int`)
-            The frames per second (FPS) of the encoded video.
-        audio (`torch.Tensor`, *optional*):
-            An audio waveform of shape [audio_channels, samples].
-        audio_sample_rate: (`int`, *optional*):
-            The sampling rate of the audio waveform. For LTX 2, this is typically 24000 (24 kHz).
-        output_path (`str`):
-            The path to save the encoded video to.
-        video_chunks_number (`int`, *optional*, defaults to `1`):
-            The number of chunks to split the video into for encoding. Each chunk will be encoded separately. The
-            number of chunks to use often depends on the tiling config for the video VAE.
     """
-    if isinstance(video, list) and isinstance(video[0], PIL.Image.Image):
-        # Pipeline output_type="pil"
-        video_frames = [np.array(frame) for frame in video]
-        video = np.stack(video_frames, axis=0)
-        video = torch.from_numpy(video)
-    elif isinstance(video, np.ndarray):
-        # Pipeline output_type="np"
-        is_denormalized = np.logical_and(np.zeros_like(video) <= video, video <= np.ones_like(video))
-        if np.all(is_denormalized):
-            video = (video * 255).round().astype("uint8")
-        video = torch.from_numpy(video)
-
-    if isinstance(video, torch.Tensor):
-        video = iter([video])
-
-    first_chunk = next(video)
-
-    _, height, width, _ = first_chunk.shape
+    video_np = video.cpu().numpy()
+
+    _, height, width, _ = video_np.shape

     container = av.open(output_path, mode="w")
     stream = container.add_stream("libx264", rate=int(fps))
```
```diff
@@ -164,18 +119,10 @@ def encode_video(

     audio_stream = _prepare_audio_stream(container, audio_sample_rate)

-    def all_tiles(
-        first_chunk: torch.Tensor, tiles_generator: Generator[Tuple[torch.Tensor, int], None, None]
-    ) -> Generator[Tuple[torch.Tensor, int], None, None]:
-        yield first_chunk
-        yield from tiles_generator
-
-    for video_chunk in tqdm(all_tiles(first_chunk, video), total=video_chunks_number):
-        video_chunk_cpu = video_chunk.to("cpu").numpy()
-        for frame_array in video_chunk_cpu:
-            frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
-            for packet in stream.encode(frame):
-                container.mux(packet)
+    for frame_array in video_np:
+        frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
+        for packet in stream.encode(frame):
+            container.mux(packet)

     # Flush encoder
     for packet in stream.encode():
```
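The simplified body follows the standard PyAV encode/mux/flush pattern. A minimal standalone sketch of that pattern, assuming an existing uint8 array `frames` of shape [num_frames, height, width, 3]; the audio muxing the real function also performs is omitted here:

```python
import av
import numpy as np

# Assumed stand-in input: 24 RGB frames of 64x64 noise.
frames = np.random.randint(0, 256, size=(24, 64, 64, 3), dtype=np.uint8)
fps = 24

container = av.open("example.mp4", mode="w")
stream = container.add_stream("libx264", rate=fps)
stream.width = frames.shape[2]
stream.height = frames.shape[1]
stream.pix_fmt = "yuv420p"

for frame_array in frames:
    frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
    for packet in stream.encode(frame):  # the encoder may buffer, so packets can lag frames
        container.mux(packet)

for packet in stream.encode():           # flush whatever the encoder still buffers
    container.mux(packet)
container.close()
```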
```diff
@@ -69,6 +69,8 @@ EXAMPLE_DOC_STRING = """
         ...     output_type="np",
         ...     return_dict=False,
         ... )
+        >>> video = (video * 255).round().astype("uint8")
+        >>> video = torch.from_numpy(video)

         >>> encode_video(
         ...     video[0],
```
```diff
@@ -75,6 +75,8 @@ EXAMPLE_DOC_STRING = """
         ...     output_type="np",
         ...     return_dict=False,
         ... )
+        >>> video = (video * 255).round().astype("uint8")
+        >>> video = torch.from_numpy(video)

         >>> encode_video(
         ...     video[0],
```
```diff
@@ -76,6 +76,8 @@ EXAMPLE_DOC_STRING = """
         ...     output_type="np",
         ...     return_dict=False,
         ... )[0]
+        >>> video = (video * 255).round().astype("uint8")
+        >>> video = torch.from_numpy(video)

         >>> encode_video(
         ...     video[0],
```
```diff
@@ -496,8 +496,13 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
         num_frames = max(num_frames, 1)

-        h_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[1]
-        w_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[2]
+        patch_size = (
+            self.transformer.config.patch_size
+            if self.transformer is not None
+            else self.transformer_2.config.patch_size
+        )
+        h_multiple_of = self.vae_scale_factor_spatial * patch_size[1]
+        w_multiple_of = self.vae_scale_factor_spatial * patch_size[2]
         calc_height = height // h_multiple_of * h_multiple_of
         calc_width = width // w_multiple_of * w_multiple_of
         if height != calc_height or width != calc_width:
```
```diff
@@ -637,8 +637,13 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
         num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
         num_frames = max(num_frames, 1)

-        h_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[1]
-        w_multiple_of = self.vae_scale_factor_spatial * self.transformer.config.patch_size[2]
+        patch_size = (
+            self.transformer.config.patch_size
+            if self.transformer is not None
+            else self.transformer_2.config.patch_size
+        )
+        h_multiple_of = self.vae_scale_factor_spatial * patch_size[1]
+        w_multiple_of = self.vae_scale_factor_spatial * patch_size[2]
         calc_height = height // h_multiple_of * h_multiple_of
         calc_width = width // w_multiple_of * w_multiple_of
         if height != calc_height or width != calc_width:
```
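Both Wan pipelines snap the requested height and width down to a multiple of `vae_scale_factor_spatial * patch_size`, now reading the patch size from whichever transformer is loaded. A small sketch of that rounding with assumed example values; the real factors come from the pipeline's VAE and transformer configs:

```python
def snap_down(value: int, multiple_of: int) -> int:
    # Mirrors `height // h_multiple_of * h_multiple_of`: round down to the nearest multiple.
    return value // multiple_of * multiple_of

# Assumed example values, not read from a real config.
vae_scale_factor_spatial = 8
patch_size = (1, 2, 2)  # (temporal, height, width) patching

h_multiple_of = vae_scale_factor_spatial * patch_size[1]  # 16
w_multiple_of = vae_scale_factor_spatial * patch_size[2]  # 16

print(snap_down(721, h_multiple_of), snap_down(1281, w_multiple_of))  # 720 1280
```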
```diff
@@ -227,7 +227,7 @@ _cosmos_guardrail_available, _cosmos_guardrail_version = _is_package_available("
 _sageattention_available, _sageattention_version = _is_package_available("sageattention")
 _flash_attn_available, _flash_attn_version = _is_package_available("flash_attn")
 _flash_attn_3_available, _flash_attn_3_version = _is_package_available("flash_attn_3")
-_aiter_available, _aiter_version = _is_package_available("aiter")
+_aiter_available, _aiter_version = _is_package_available("aiter", get_dist_name=True)
 _kornia_available, _kornia_version = _is_package_available("kornia")
 _nvidia_modelopt_available, _nvidia_modelopt_version = _is_package_available("modelopt", get_dist_name=True)
 _av_available, _av_version = _is_package_available("av")
```
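The `get_dist_name=True` flag, already used for `modelopt` above, suggests the version is resolved from the installed distribution's metadata rather than from the imported module. A hedged illustration of that distinction using only the standard library; `_is_package_available` itself is a diffusers-internal helper and may work differently:

```python
from importlib.metadata import PackageNotFoundError, version

def dist_version(dist_name: str):
    # Look up a version from installed-distribution metadata; this works even when
    # the pip distribution name differs from the importable module name, or when
    # the module does not expose a __version__ attribute.
    try:
        return version(dist_name)
    except PackageNotFoundError:
        return None

# "aiter" is used here only as an example lookup key; the actual distribution
# name on a given system may differ, in which case this simply returns None.
print(dist_version("aiter"))
```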