Compare commits

...

35 Commits

Author SHA1 Message Date
Sayak Paul
d1c05927bf Merge branch 'main' into lora-device-map 2024-12-16 08:35:20 +05:30
sayakpaul
f7c4706f79 resolve conflicts. 2024-12-08 13:51:31 +05:30
Sayak Paul
d8336d6e4d Merge branch 'main' into lora-device-map 2024-11-01 10:26:16 +05:30
sayakpaul
334173919a fixes 2024-11-01 08:08:41 +05:30
sayakpaul
569f99e3d3 fix 2024-11-01 08:02:12 +05:30
sayakpaul
4d7986a126 Merge branch 'main' into lora-device-map 2024-11-01 07:26:29 +05:30
sayakpaul
ccd8d2ad80 resolve conflicts. 2024-10-31 20:47:24 +05:30
sayakpaul
a61b754fe5 fixes 2024-10-31 20:40:59 +05:30
sayakpaul
0bd40cbff3 skip properly. 2024-10-31 19:10:18 +05:30
sayakpaul
03377b7afc fixes 2024-10-31 19:02:45 +05:30
Sayak Paul
61903c8080 Merge branch 'main' into lora-device-map 2024-10-31 18:34:46 +05:30
Sayak Paul
2db5d48743 Merge branch 'main' into lora-device-map 2024-10-23 12:51:33 +05:30
Sayak Paul
fe2cca8766 Update docs/source/en/training/distributed_inference.md
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2024-10-23 12:51:05 +05:30
Sayak Paul
4b6124a07a Merge branch 'main' into lora-device-map 2024-10-22 16:00:09 +05:30
sayakpaul
c0dee879d4 quality 2024-10-19 18:20:29 +05:30
Sayak Paul
f64751e37d Merge branch 'main' into lora-device-map 2024-10-19 18:19:49 +05:30
sayakpaul
5ea1173aeb add hardware note. 2024-10-19 18:10:24 +05:30
sayakpaul
2334f78c3b add: tests, docs. 2024-10-19 18:06:41 +05:30
sayakpaul
f62afac640 fix-copies 2024-10-19 16:18:15 +05:30
sayakpaul
71989e3edf better error messages. 2024-10-19 16:17:12 +05:30
sayakpaul
ea727a3b32 minors 2024-10-19 16:05:33 +05:30
Sayak Paul
eefda549cd Merge branch 'main' into lora-device-map 2024-10-19 16:02:10 +05:30
Sayak Paul
f63b04c5e0 Merge branch 'main' into lora-device-map 2024-10-15 15:21:03 +05:30
Sayak Paul
e42ec19fbf Merge branch 'main' into lora-device-map 2024-10-10 21:00:21 +05:30
Sayak Paul
8f670e24e9 Merge branch 'main' into lora-device-map 2024-10-08 21:47:25 +05:30
Sayak Paul
5f3cae2bf5 Merge branch 'main' into lora-device-map 2024-10-06 10:00:48 +04:00
Sayak Paul
d2d59c38d7 Merge branch 'main' into lora-device-map 2024-10-02 15:48:04 +02:00
Sayak Paul
1ed0eb0af1 Merge branch 'main' into lora-device-map 2024-09-28 10:54:22 +05:30
Sayak Paul
2846549eaa Merge branch 'main' into lora-device-map 2024-09-27 09:35:57 +05:30
Sayak Paul
5479198085 Apply suggestions from code review
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2024-09-24 19:53:34 +05:30
Sayak Paul
d4bd94b026 Merge branch 'main' into lora-device-map 2024-09-24 09:52:58 +05:30
Sayak Paul
6d03c12dc3 Merge branch 'main' into lora-device-map 2024-09-22 16:22:33 +05:30
sayakpaul
64b3ad14da empty
Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
2024-09-17 19:36:14 +05:30
sayakpaul
949a9298e3 better attibutung 2024-09-17 19:34:32 +05:30
sayakpaul
dc1aee2718 fix: lora loading when using with a device_mapped model. 2024-09-17 07:22:22 +05:30
26 changed files with 738 additions and 11 deletions

View File

@@ -237,3 +237,5 @@ with torch.no_grad():
 ```
 By selectively loading and unloading the models you need at a given stage and sharding the largest models across multiple GPUs, it is possible to run inference with large models on consumer GPUs.
+
+This workflow is also compatible with LoRAs via [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. However, only LoRAs without text encoder components are currently supported in this workflow.
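For readers of this compare view, a minimal sketch of the documented workflow, modeled on the Flux sharding test added later in this diff; the checkpoint and LoRA repository names are taken from that test, and memory limits are omitted for brevity:

```python
import torch
from diffusers import FluxPipeline, FluxTransformer2DModel

ckpt_id = "black-forest-labs/FLUX.1-dev"

# Shard the largest component (the transformer) across the available GPUs.
transformer = FluxTransformer2DModel.from_pretrained(
    ckpt_id, subfolder="transformer", device_map="auto", torch_dtype=torch.bfloat16
)
pipeline = FluxPipeline.from_pretrained(
    ckpt_id, transformer=transformer, torch_dtype=torch.bfloat16
)

# The LoRA targets only the transformer (no text encoder components), so it can
# be loaded on top of the device-mapped model.
pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors")
```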

View File

@@ -327,12 +327,18 @@ class LoraBaseMixin:
             tuple:
                 A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True.
         """
+        from ..pipelines.pipeline_loading_utils import model_has_device_map
+
         is_model_cpu_offload = False
         is_sequential_cpu_offload = False
         if _pipeline is not None and _pipeline.hf_device_map is None:
             for _, component in _pipeline.components.items():
-                if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
+                if (
+                    isinstance(component, nn.Module)
+                    and hasattr(component, "_hf_hook")
+                    and not model_has_device_map(component)
+                ):
                     if not is_model_cpu_offload:
                         is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload)
                     if not is_sequential_cpu_offload:

View File

@@ -400,12 +400,18 @@ class UNet2DConditionLoadersMixin:
             tuple:
                 A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True.
         """
+        from ..pipelines.pipeline_loading_utils import model_has_device_map
+
         is_model_cpu_offload = False
         is_sequential_cpu_offload = False
         if _pipeline is not None and _pipeline.hf_device_map is None:
             for _, component in _pipeline.components.items():
-                if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
+                if (
+                    isinstance(component, nn.Module)
+                    and hasattr(component, "_hf_hook")
+                    and not model_has_device_map(component)
+                ):
                     if not is_model_cpu_offload:
                         is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload)
                     if not is_sequential_cpu_offload:

View File

@@ -36,6 +36,7 @@ from ..utils import (
     deprecate,
     get_class_from_dynamic_module,
     is_accelerate_available,
+    is_accelerate_version,
     is_peft_available,
     is_transformers_available,
     logging,
@@ -968,3 +969,18 @@
         )

     return ignore_patterns
+
+
+def model_has_device_map(model):
+    if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"):
+        return False
+
+    # Check if the model has a device map that is not exclusively CPU
+    # `device_map` can only contain CPU when a model has sharded checkpoints.
+    # See here: https://github.com/huggingface/diffusers/blob/41e4779d988ead99e7acd78dc8e752de88777d0f/src/diffusers/models/modeling_utils.py#L883
+    device_map = getattr(model, "hf_device_map", None)
+    if device_map is not None:
+        unique_devices = set(device_map.values())
+        return len(unique_devices) > 1 or unique_devices != {"cpu"}
+
+    return False
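As an aside, a hedged sketch of how the new helper is expected to behave; the device-map values below are hypothetical and assume `accelerate` >= 0.14.0 is installed:

```python
import torch.nn as nn

model = nn.Linear(4, 4)
print(model_has_device_map(model))  # False: no `hf_device_map` attribute at all

# accelerate attaches `hf_device_map` when it dispatches a model; any map that is
# not exclusively CPU counts as device-mapped.
model.hf_device_map = {"weight": 0, "bias": 1}  # hypothetical map spanning two GPUs
print(model_has_device_map(model))  # True

model.hf_device_map = {"": "cpu"}  # CPU-only map, e.g. from sharded checkpoints
print(model_has_device_map(model))  # False
```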

View File

@@ -84,6 +84,7 @@ from .pipeline_loading_utils import (
     _update_init_kwargs_with_connected_pipeline,
     load_sub_model,
     maybe_raise_or_warn,
+    model_has_device_map,
     variant_compatible_siblings,
     warn_deprecated_model_variant,
 )
@@ -406,6 +407,16 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
             return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.CpuOffload)

+        # device-mapped modules should not go through any device placements.
+        device_mapped_components = [
+            key for key, component in self.components.items() if model_has_device_map(component)
+        ]
+        if device_mapped_components:
+            raise ValueError(
+                "The following pipeline components have been found to use a device map: "
+                f"{device_mapped_components}. This is incompatible with explicitly setting the device using `to()`."
+            )
+
         # .to("cuda") would raise an error if the pipeline is sequentially offloaded, so we raise our own to make it clearer
         pipeline_is_sequentially_offloaded = any(
             module_is_sequentially_offloaded(module) for _, module in self.components.items()
@@ -1008,6 +1019,16 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                 The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
                 default to "cuda".
         """
+        # device-mapped modules should not go through any device placements.
+        device_mapped_components = [
+            key for key, component in self.components.items() if model_has_device_map(component)
+        ]
+        if device_mapped_components:
+            raise ValueError(
+                "The following pipeline components have been found to use a device map: "
+                f"{device_mapped_components}. This is incompatible with `enable_model_cpu_offload()`."
+            )
+
         is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
         if is_pipeline_device_mapped:
             raise ValueError(
@@ -1110,6 +1131,16 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                 The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
                 default to "cuda".
         """
+        # device-mapped modules should not go through any device placements.
+        device_mapped_components = [
+            key for key, component in self.components.items() if model_has_device_map(component)
+        ]
+        if device_mapped_components:
+            raise ValueError(
+                "The following pipeline components have been found to use a device map: "
+                f"{device_mapped_components}. This is incompatible with `enable_sequential_cpu_offload()`."
+            )
+
         if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
             from accelerate import cpu_offload
         else:
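To make the intent of the three guards concrete, a hedged sketch of what a caller now sees when any pipeline component carries a device map; the repository id below is a placeholder:

```python
import torch
from diffusers import DiffusionPipeline

# Loading with a device map shards components across the available GPUs.
pipe = DiffusionPipeline.from_pretrained(
    "some/checkpoint-id",  # placeholder repo id
    device_map="balanced",
    torch_dtype=torch.float16,
)

# Explicit device placement and the CPU-offloading helpers are rejected up front.
try:
    pipe.to("cuda")
except ValueError as err:
    print(err)  # "... incompatible with explicitly setting the device using `to()`."

# pipe.enable_model_cpu_offload() and pipe.enable_sequential_cpu_offload()
# raise analogous ValueErrors.
```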

View File

@@ -506,9 +506,14 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
         self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))

+    @unittest.skip("Test currently not supported.")
     def test_sequential_cpu_offload_forward_pass(self):
         pass

+    @unittest.skip("Test currently not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+

 @nightly
 class AudioLDM2PipelineSlowTests(unittest.TestCase):

View File

@@ -514,6 +514,18 @@ class StableDiffusionMultiControlNetPipelineFastTests(
         assert image.shape == (4, 64, 64, 3)

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass
+

 class StableDiffusionMultiControlNetOneModelPipelineFastTests(
     IPAdapterTesterMixin, PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
@@ -697,6 +709,18 @@ class StableDiffusionMultiControlNetOneModelPipelineFastTests(
         except NotImplementedError:
             pass

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass
+

 @slow
 @require_torch_gpu

View File

@@ -389,6 +389,18 @@ class StableDiffusionMultiControlNetPipelineFastTests(
         except NotImplementedError:
             pass

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass
+

 @slow
 @require_torch_gpu

View File

@@ -441,6 +441,18 @@ class MultiControlNetInpaintPipelineFastTests(
         except NotImplementedError:
             pass

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass
+

 @slow
 @require_torch_gpu

View File

@@ -683,6 +683,18 @@ class StableDiffusionXLMultiControlNetPipelineFastTests(
     def test_save_load_optional_components(self):
         return self._test_save_load_optional_components()

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass
+

 class StableDiffusionXLMultiControlNetOneModelPipelineFastTests(
     PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, SDXLOptionalComponentsTesterMixin, unittest.TestCase
@@ -887,6 +899,18 @@ class StableDiffusionXLMultiControlNetOneModelPipelineFastTests(
         self.assertTrue(np.abs(image_slice_without_neg_cond - image_slice_with_neg_cond).max() > 1e-2)

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass
+

 @slow
 @require_torch_gpu

View File

@@ -8,9 +8,11 @@ from huggingface_hub import hf_hub_download
 from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, T5EncoderModel

 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
+from diffusers.image_processor import VaeImageProcessor
 from diffusers.utils.testing_utils import (
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
+    require_torch_multi_gpu,
     slow,
     torch_device,
 )
@@ -296,3 +298,172 @@ class FluxPipelineSlowTests(unittest.TestCase):
         max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())
         assert max_diff < 1e-4

+    @require_torch_multi_gpu
+    @torch.no_grad()
+    def test_flux_component_sharding(self):
+        """
+        internal note: test was run on `audace`.
+        """
+
+        ckpt_id = "black-forest-labs/FLUX.1-dev"
+        dtype = torch.bfloat16
+        prompt = "a photo of a cat with tiger-like look"
+
+        pipeline = FluxPipeline.from_pretrained(
+            ckpt_id,
+            transformer=None,
+            vae=None,
+            device_map="balanced",
+            max_memory={0: "16GB", 1: "16GB"},
+            torch_dtype=dtype,
+        )
+        prompt_embeds, pooled_prompt_embeds, _ = pipeline.encode_prompt(
+            prompt=prompt, prompt_2=None, max_sequence_length=512
+        )
+
+        del pipeline.text_encoder
+        del pipeline.text_encoder_2
+        del pipeline.tokenizer
+        del pipeline.tokenizer_2
+        del pipeline
+
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        transformer = FluxTransformer2DModel.from_pretrained(
+            ckpt_id, subfolder="transformer", device_map="auto", max_memory={0: "16GB", 1: "16GB"}, torch_dtype=dtype
+        )
+        pipeline = FluxPipeline.from_pretrained(
+            ckpt_id,
+            text_encoder=None,
+            text_encoder_2=None,
+            tokenizer=None,
+            tokenizer_2=None,
+            vae=None,
+            transformer=transformer,
+            torch_dtype=dtype,
+        )
+
+        height, width = 768, 1360
+        # No need to wrap it up under `torch.no_grad()` as pipeline call method
+        # is already wrapped under that.
+        latents = pipeline(
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            num_inference_steps=10,
+            guidance_scale=3.5,
+            height=height,
+            width=width,
+            output_type="latent",
+            generator=torch.manual_seed(0),
+        ).images
+        latent_slice = latents[0, :3, :3].flatten().float().cpu().numpy()
+        expected_slice = np.array([-0.377, -0.3008, -0.5117, -0.252, 0.0615, -0.3477, -0.1309, -0.1914, 0.1533])
+
+        assert numpy_cosine_similarity_distance(latent_slice, expected_slice) < 1e-4
+
+        del pipeline.transformer
+        del pipeline
+
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=dtype).to(torch_device)
+        vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
+        image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)
+
+        latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor)
+        latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor
+
+        image = vae.decode(latents, return_dict=False)[0]
+        image = image_processor.postprocess(image, output_type="np")
+        image_slice = image[0, :3, :3, -1].flatten()
+        expected_slice = np.array([0.127, 0.1113, 0.1055, 0.1172, 0.1172, 0.1074, 0.1191, 0.1191, 0.1152])
+
+        assert numpy_cosine_similarity_distance(image_slice, expected_slice) < 1e-4
+
+    @require_torch_multi_gpu
+    @torch.no_grad()
+    def test_flux_component_sharding_with_lora(self):
+        """
+        internal note: test was run on `audace`.
+        """
+
+        ckpt_id = "black-forest-labs/FLUX.1-dev"
+        dtype = torch.bfloat16
+        prompt = "jon snow eating pizza."
+
+        pipeline = FluxPipeline.from_pretrained(
+            ckpt_id,
+            transformer=None,
+            vae=None,
+            device_map="balanced",
+            max_memory={0: "16GB", 1: "16GB"},
+            torch_dtype=dtype,
+        )
+        prompt_embeds, pooled_prompt_embeds, _ = pipeline.encode_prompt(
+            prompt=prompt, prompt_2=None, max_sequence_length=512
+        )
+
+        del pipeline.text_encoder
+        del pipeline.text_encoder_2
+        del pipeline.tokenizer
+        del pipeline.tokenizer_2
+        del pipeline
+
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        transformer = FluxTransformer2DModel.from_pretrained(
+            ckpt_id, subfolder="transformer", device_map="auto", max_memory={0: "16GB", 1: "16GB"}, torch_dtype=dtype
+        )
+        pipeline = FluxPipeline.from_pretrained(
+            ckpt_id,
+            text_encoder=None,
+            text_encoder_2=None,
+            tokenizer=None,
+            tokenizer_2=None,
+            vae=None,
+            transformer=transformer,
+            torch_dtype=dtype,
+        )
+        pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors")
+
+        height, width = 768, 1360
+        # No need to wrap it up under `torch.no_grad()` as pipeline call method
+        # is already wrapped under that.
+        latents = pipeline(
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            num_inference_steps=10,
+            guidance_scale=3.5,
+            height=height,
+            width=width,
+            output_type="latent",
+            generator=torch.manual_seed(0),
+        ).images
+        latent_slice = latents[0, :3, :3].flatten().float().cpu().numpy()
+        expected_slice = np.array([-0.6523, -0.4961, -0.9141, -0.5, -0.2129, -0.6914, -0.375, -0.5664, -0.1699])
+
+        assert numpy_cosine_similarity_distance(latent_slice, expected_slice) < 1e-4
+
+        del pipeline.transformer
+        del pipeline
+
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=dtype).to(torch_device)
+        vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
+        image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)
+
+        latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor)
+        latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor
+
+        image = vae.decode(latents, return_dict=False)[0]
+        image = image_processor.postprocess(image, output_type="np")
+        image_slice = image[0, :3, :3, -1].flatten()
+        expected_slice = np.array([0.1211, 0.1094, 0.1035, 0.1094, 0.1113, 0.1074, 0.1133, 0.1133, 0.1094])
+
+        assert numpy_cosine_similarity_distance(image_slice, expected_slice) < 1e-4

View File

@@ -139,6 +139,18 @@ class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase)
     def test_dict_tuple_outputs_equivalent(self):
         super().test_dict_tuple_outputs_equivalent(expected_max_difference=5e-4)

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass
+

 class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
     pipeline_class = KandinskyImg2ImgCombinedPipeline
@@ -248,6 +260,18 @@ class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
     def test_save_load_optional_components(self):
         super().test_save_load_optional_components(expected_max_difference=5e-4)

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass
+

 class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
     pipeline_class = KandinskyInpaintCombinedPipeline
@@ -363,3 +387,15 @@ class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
     def test_save_load_local(self):
         super().test_save_load_local(expected_max_difference=5e-3)

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass

View File

@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
+import tempfile
 import unittest

 import numpy as np
@@ -28,11 +30,16 @@ from transformers import (
 )

 from diffusers import KandinskyPriorPipeline, PriorTransformer, UnCLIPScheduler
-from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME
+from diffusers.utils.testing_utils import enable_full_determinism, is_accelerate_available, skip_mps, torch_device

 from ..test_pipelines_common import PipelineTesterMixin


+if is_accelerate_available():
+    from accelerate.utils import compute_module_sizes
+
 enable_full_determinism()
@@ -236,3 +243,31 @@ class KandinskyPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
             test_max_difference=test_max_difference,
             test_mean_pixel_difference=test_mean_pixel_difference,
         )
+
+    # It needs a different sharding ratio than the standard 0.75. So, we override it.
+    def test_sharded_components_can_be_device_placed(self):
+        components = self.get_dummy_components()
+
+        component_selected = None
+        for component_name in components:
+            if isinstance(components[component_name], ModelMixin) and hasattr(
+                components[component_name], "load_config"
+            ):
+                component_to_be_sharded = components[component_name]
+                component_cls = component_to_be_sharded.__class__
+                component_selected = component_name
+                break
+
+        assert component_selected, "No component selected that can be sharded."
+
+        model_size = compute_module_sizes(component_to_be_sharded)[""]
+        max_shard_size = int((model_size * 0.45) / (2**10))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            component_to_be_sharded.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
+
+            loaded_sharded_component = component_cls.from_pretrained(tmp_dir)
+            _ = components.pop(component_selected)
+            components.update({component_selected: loaded_sharded_component})
+            _ = self.pipeline_class(**components).to(torch_device)

View File

@@ -159,6 +159,18 @@ class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCa
     def test_callback_cfg(self):
         pass

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass
+

 class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
     pipeline_class = KandinskyV22Img2ImgCombinedPipeline
@@ -281,6 +293,18 @@ class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
     def test_callback_cfg(self):
         pass

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass
+

 class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
     pipeline_class = KandinskyV22InpaintCombinedPipeline
@@ -404,3 +428,15 @@ class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
     def test_callback_cfg(self):
         pass

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass

View File

@@ -14,6 +14,8 @@
 # limitations under the License.

 import inspect
+import os
+import tempfile
 import unittest

 import numpy as np
@@ -29,11 +31,17 @@ from transformers import (
 )

 from diffusers import KandinskyV22PriorPipeline, PriorTransformer, UnCLIPScheduler
-from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME
+from diffusers.utils.testing_utils import enable_full_determinism, is_accelerate_available, skip_mps, torch_device

 from ..test_pipelines_common import PipelineTesterMixin


+if is_accelerate_available():
+    from accelerate.utils import compute_module_sizes
+
+
 enable_full_determinism()
@@ -277,3 +285,31 @@ class KandinskyV22PriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         output = pipe(**inputs)[0]

         assert output.abs().sum() == 0
+
+    # It needs a different sharding ratio than the standard 0.75. So, we override it.
+    def test_sharded_components_can_be_device_placed(self):
+        components = self.get_dummy_components()
+
+        component_selected = None
+        for component_name in components:
+            if isinstance(components[component_name], ModelMixin) and hasattr(
+                components[component_name], "load_config"
+            ):
+                component_to_be_sharded = components[component_name]
+                component_cls = component_to_be_sharded.__class__
+                component_selected = component_name
+                break
+
+        assert component_selected, "No component selected that can be sharded."
+
+        model_size = compute_module_sizes(component_to_be_sharded)[""]
+        max_shard_size = int((model_size * 0.45) / (2**10))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            component_to_be_sharded.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
+
+            loaded_sharded_component = component_cls.from_pretrained(tmp_dir)
+            _ = components.pop(component_selected)
+            components.update({component_selected: loaded_sharded_component})
+            _ = self.pipeline_class(**components).to(torch_device)

View File

@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
 import random
+import tempfile
 import unittest

 import numpy as np
@@ -30,9 +32,12 @@ from transformers import (
 )

 from diffusers import KandinskyV22PriorEmb2EmbPipeline, PriorTransformer, UnCLIPScheduler
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
     floats_tensor,
+    is_accelerate_available,
     skip_mps,
     torch_device,
 )
@@ -40,6 +45,10 @@ from diffusers.utils.testing_utils import (
 from ..test_pipelines_common import PipelineTesterMixin


+if is_accelerate_available():
+    from accelerate.utils import compute_module_sizes
+
+
 enable_full_determinism()
@@ -240,3 +249,31 @@ class KandinskyV22PriorEmb2EmbPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
             test_max_difference=test_max_difference,
             test_mean_pixel_difference=test_mean_pixel_difference,
         )
+
+    # It needs a different sharding ratio than the standard 0.75. So, we override it.
+    def test_sharded_components_can_be_device_placed(self):
+        components = self.get_dummy_components()
+
+        component_selected = None
+        for component_name in components:
+            if isinstance(components[component_name], ModelMixin) and hasattr(
+                components[component_name], "load_config"
+            ):
+                component_to_be_sharded = components[component_name]
+                component_cls = component_to_be_sharded.__class__
+                component_selected = component_name
+                break
+
+        assert component_selected, "No component selected that can be sharded."
+
+        model_size = compute_module_sizes(component_to_be_sharded)[""]
+        max_shard_size = int((model_size * 0.45) / (2**10))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            component_to_be_sharded.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
+
+            loaded_sharded_component = component_cls.from_pretrained(tmp_dir)
+            _ = components.pop(component_selected)
+            components.update({component_selected: loaded_sharded_component})
+            _ = self.pipeline_class(**components).to(torch_device)

View File

@@ -404,6 +404,10 @@ class MusicLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
         self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))

+    @unittest.skip("Test currently not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+

 @nightly
 @require_torch_gpu

View File

@@ -279,3 +279,15 @@ class StableCascadeCombinedPipelineFastTests(PipelineTesterMixin, unittest.TestC
         )
         assert np.abs(output_prompt.images - output_prompt_embeds.images).max() < 1e-5

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass

View File

@@ -593,6 +593,18 @@ class StableDiffusionMultiAdapterPipelineFastTests(AdapterTests, PipelineTesterM
         if test_mean_pixel_difference:
             assert_mean_pixel_difference(output_batch[0][0], output[0][0])

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass
+

 @slow
 @require_torch_gpu

View File

@@ -642,9 +642,6 @@ class StableDiffusionXLMultiAdapterPipelineFastTests(
         assert image.shape == (1, 64, 64, 3)
         expected_slice = np.array([0.5313, 0.5375, 0.4942, 0.5021, 0.6142, 0.4968, 0.5434, 0.5311, 0.5448])

-        debug = [str(round(i, 4)) for i in image_slice.flatten().tolist()]
-        print(",".join(debug))
-
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

     def test_adapter_sdxl_lcm_custom_timesteps(self):
@@ -667,7 +664,16 @@
         assert image.shape == (1, 64, 64, 3)
         expected_slice = np.array([0.5313, 0.5375, 0.4942, 0.5021, 0.6142, 0.4968, 0.5434, 0.5311, 0.5448])

-        debug = [str(round(i, 4)) for i in image_slice.flatten().tolist()]
-        print(",".join(debug))
-
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass

View File

@@ -1,4 +1,6 @@
 import gc
+import os
+import tempfile
 import unittest

 import torch
@@ -12,8 +14,17 @@ from diffusers import (
     StableUnCLIPPipeline,
     UNet2DConditionModel,
 )
+from diffusers.models.modeling_utils import ModelMixin
 from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
-from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, nightly, require_torch_gpu, torch_device
+from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+    is_accelerate_available,
+    load_numpy,
+    nightly,
+    require_torch_gpu,
+    torch_device,
+)

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import (
@@ -24,6 +35,10 @@ from ..test_pipelines_common import (
 )

+if is_accelerate_available():
+    from accelerate.utils import compute_module_sizes
+
+
 enable_full_determinism()
@@ -184,6 +199,46 @@ class StableUnCLIPPipelineFastTests(
     def test_inference_batch_single_identical(self):
         self._test_inference_batch_single_identical(expected_max_diff=1e-3)

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass
+
+    # It needs a different sharding ratio than the standard 0.75. So, we override it.
+    def test_sharded_components_can_be_device_placed(self):
+        components = self.get_dummy_components()
+
+        component_selected = None
+        for component_name in components:
+            if isinstance(components[component_name], ModelMixin) and hasattr(
+                components[component_name], "load_config"
+            ):
+                component_to_be_sharded = components[component_name]
+                component_cls = component_to_be_sharded.__class__
+                component_selected = component_name
+                break
+
+        assert component_selected, "No component selected that can be sharded."
+
+        model_size = compute_module_sizes(component_to_be_sharded)[""]
+        max_shard_size = int((model_size * 0.45) / (2**10))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            component_to_be_sharded.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
+
+            loaded_sharded_component = component_cls.from_pretrained(tmp_dir)
+            _ = components.pop(component_selected)
+            components.update({component_selected: loaded_sharded_component})
+            _ = self.pipeline_class(**components).to(torch_device)
+

 @nightly
 @require_torch_gpu

View File

@@ -205,6 +205,18 @@ class StableUnCLIPImg2ImgPipelineFastTests(
     def test_xformers_attention_forwardGenerator_pass(self):
         self._test_xformers_attention_forwardGenerator_pass(test_max_difference=False)

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass
+

 @nightly
 @require_torch_gpu

View File

@@ -41,10 +41,14 @@ from diffusers.utils import logging
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    is_accelerate_available,
+    nightly,
     require_accelerate_version_greater,
     require_accelerator,
     require_torch,
+    require_torch_multi_gpu,
     skip_mps,
+    slow,
     torch_device,
 )
@@ -61,6 +65,10 @@ from ..models.unets.test_models_unet_2d_condition import (
 from ..others.test_utils import TOKEN, USER, is_staging_test


+if is_accelerate_available():
+    from accelerate.utils import compute_module_sizes
+
+
 def to_np(tensor):
     if isinstance(tensor, torch.Tensor):
         tensor = tensor.detach().cpu().numpy()
@@ -1902,6 +1910,78 @@ class PipelineTesterMixin:
             )
         )

+    @require_torch_multi_gpu
+    @slow
+    @nightly
+    def test_calling_to_raises_error_device_mapped_components(self, safe_serialization=True):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        max_model_size = max(
+            compute_module_sizes(module)[""]
+            for _, module in pipe.components.items()
+            if isinstance(module, torch.nn.Module)
+        )
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir, safe_serialization=safe_serialization)
+            max_memory = {0: max_model_size, 1: max_model_size}
+            loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory)
+
+            with self.assertRaises(ValueError) as err_context:
+                loaded_pipe.to(torch_device)
+
+        self.assertTrue(
+            "The following pipeline components have been found" in str(err_context.exception)
+            and "This is incompatible with explicitly setting the device using `to()`" in str(err_context.exception)
+        )
+
+    @require_torch_multi_gpu
+    @slow
+    @nightly
+    def test_calling_mco_raises_error_device_mapped_components(self, safe_serialization=True):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        max_model_size = max(
+            compute_module_sizes(module)[""]
+            for _, module in pipe.components.items()
+            if isinstance(module, torch.nn.Module)
+        )
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir, safe_serialization=safe_serialization)
+            max_memory = {0: max_model_size, 1: max_model_size}
+            loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory)
+
+            with self.assertRaises(ValueError) as err_context:
+                loaded_pipe.enable_model_cpu_offload()
+
+        self.assertTrue(
+            "The following pipeline components have been found" in str(err_context.exception)
+            and "This is incompatible with `enable_model_cpu_offload()`" in str(err_context.exception)
+        )
+
+    @require_torch_multi_gpu
+    @slow
+    @nightly
+    def test_calling_sco_raises_error_device_mapped_components(self, safe_serialization=True):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        max_model_size = max(
+            compute_module_sizes(module)[""]
+            for _, module in pipe.components.items()
+            if isinstance(module, torch.nn.Module)
+        )
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pipe.save_pretrained(tmpdir, safe_serialization=safe_serialization)
+            max_memory = {0: max_model_size, 1: max_model_size}
+            loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, device_map="balanced", max_memory=max_memory)
+
+            with self.assertRaises(ValueError) as err_context:
+                loaded_pipe.enable_sequential_cpu_offload()
+
+        self.assertTrue(
+            "The following pipeline components have been found" in str(err_context.exception)
+            and "This is incompatible with `enable_sequential_cpu_offload()`" in str(err_context.exception)
+        )
+

 @is_staging_test
 class PipelinePushToHubTester(unittest.TestCase):

View File

@@ -14,6 +14,8 @@
 # limitations under the License.

 import gc
+import os
+import tempfile
 import unittest

 import numpy as np
@@ -21,9 +23,12 @@ import torch
 from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer

 from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel
+from diffusers.models.modeling_utils import ModelMixin
 from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
+from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
+    is_accelerate_available,
     load_numpy,
     nightly,
     require_torch_gpu,
@@ -35,6 +40,9 @@ from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference


+if is_accelerate_available():
+    from accelerate.utils import compute_module_sizes
+
 enable_full_determinism()
@@ -418,6 +426,34 @@ class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def test_float16_inference(self):
         super().test_float16_inference(expected_max_diff=1.0)

+    # It needs a different sharding ratio than the standard 0.75. So, we override it.
+    def test_sharded_components_can_be_device_placed(self):
+        components = self.get_dummy_components()
+
+        component_selected = None
+        for component_name in components:
+            if isinstance(components[component_name], ModelMixin) and hasattr(
+                components[component_name], "load_config"
+            ):
+                component_to_be_sharded = components[component_name]
+                component_cls = component_to_be_sharded.__class__
+                component_selected = component_name
+                break
+
+        assert component_selected, "No component selected that can be sharded."
+
+        model_size = compute_module_sizes(component_to_be_sharded)[""]
+        max_shard_size = int((model_size * 0.45) / (2**10))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            component_to_be_sharded.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
+
+            loaded_sharded_component = component_cls.from_pretrained(tmp_dir)
+            _ = components.pop(component_selected)
+            components.update({component_selected: loaded_sharded_component})
+            _ = self.pipeline_class(**components).to(torch_device)
+

 @nightly
 class UnCLIPPipelineCPUIntegrationTests(unittest.TestCase):

View File

@@ -576,6 +576,15 @@ class UniDiffuserPipelineFastTests(
         expected_text_prefix = '" This This'
         assert text[0][: len(expected_text_prefix)] == expected_text_prefix

+    def test_calling_mco_raises_error_device_mapped_components(self):
+        super().test_calling_mco_raises_error_device_mapped_components(safe_serialization=False)
+
+    def test_calling_to_raises_error_device_mapped_components(self):
+        super().test_calling_to_raises_error_device_mapped_components(safe_serialization=False)
+
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        super().test_calling_sco_raises_error_device_mapped_components(safe_serialization=False)
+

 @nightly
 @require_torch_gpu

View File

@@ -237,3 +237,15 @@ class WuerstchenCombinedPipelineFastTests(PipelineTesterMixin, unittest.TestCase
     def test_callback_cfg(self):
         pass

+    @unittest.skip("Test not supported.")
+    def test_calling_mco_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_to_raises_error_device_mapped_components(self):
+        pass
+
+    @unittest.skip("Test not supported.")
+    def test_calling_sco_raises_error_device_mapped_components(self):
+        pass