update

Merge branch 'main' into compile-test-fix
update
2025-12-10 14:34:55 +08:00 · 2023-12-08 12:28:50 +00:00 · 2023-12-08 11:10:18 +00:00 · 2023-12-08 09:58:13 +00:00
56 changed files with 245 additions and 2042 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -1,52 +0,0 @@
-name: Benchmarking tests
-
-on:
-  schedule:
-    - cron: "30 1 1,15 * *" # every 2 weeks on the 1st and the 15th of every month at 1:30 AM
-
-env:
-  DIFFUSERS_IS_CI: yes
-  HF_HOME: /mnt/cache
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
-
-jobs:
-  torch_pipelines_cuda_benchmark_tests:
-    name: Torch Core Pipelines CUDA Benchmarking Tests
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-    runs-on: [single-gpu, nvidia-gpu, a10, ci]
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-      - name: Install dependencies
-        run: |
-          apt-get update && apt-get install libsndfile1-dev libgl1 -y
-          python -m pip install -e .[quality,test]
-          python -m pip install pandas
-      - name: Environment
-        run: |
-          python utils/print_env.py
-      - name: Diffusers Benchmarking
-        env:
-            HUGGING_FACE_HUB_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
-            BASE_PATH: benchmark_outputs
-        run: |
-          export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))")
-          cd benchmarks && mkdir ${BASE_PATH} && python run_all.py && python push_results.py
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v2
-        with:
-          name: benchmark_test_reports
-          path: benchmarks/benchmark_outputs
--- a/2
+++ b/2
@@ -3,7 +3,7 @@
 # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
 export PYTHONPATH = src

-check_dirs := examples scripts src tests utils benchmarks
+check_dirs := examples scripts src tests utils

 modified_only_fixup:
 	$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi

 ## Quickstart

-Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 16000+ checkpoints):
+Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 15000+ checkpoints):

 ```python
 from diffusers import DiffusionPipeline
@@ -219,7 +219,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
 - https://github.com/deep-floyd/IF
 - https://github.com/bentoml/BentoML
 - https://github.com/bmaltais/kohya_ss
- +7000 other amazing GitHub repositories 💪
+- +6000 other amazing GitHub repositories 💪

 Thank you for using us ❤️.

--- a/benchmarks/base_classes.py
+++ b/benchmarks/base_classes.py
@@ -1,297 +0,0 @@
-import os
-import sys
-
-import torch
-
-from diffusers import (
-    AutoPipelineForImage2Image,
-    AutoPipelineForInpainting,
-    AutoPipelineForText2Image,
-    ControlNetModel,
-    LCMScheduler,
-    StableDiffusionAdapterPipeline,
-    StableDiffusionControlNetPipeline,
-    StableDiffusionXLAdapterPipeline,
-    StableDiffusionXLControlNetPipeline,
-    T2IAdapter,
-    WuerstchenCombinedPipeline,
-)
-from diffusers.utils import load_image
-
-
-sys.path.append(".")
-
-from utils import (  # noqa: E402
-    BASE_PATH,
-    PROMPT,
-    BenchmarkInfo,
-    benchmark_fn,
-    bytes_to_giga_bytes,
-    flush,
-    generate_csv_dict,
-    write_to_csv,
-)
-
-
-RESOLUTION_MAPPING = {
-    "runwayml/stable-diffusion-v1-5": (512, 512),
-    "lllyasviel/sd-controlnet-canny": (512, 512),
-    "diffusers/controlnet-canny-sdxl-1.0": (1024, 1024),
-    "TencentARC/t2iadapter_canny_sd14v1": (512, 512),
-    "TencentARC/t2i-adapter-canny-sdxl-1.0": (1024, 1024),
-    "stabilityai/stable-diffusion-2-1": (768, 768),
-    "stabilityai/stable-diffusion-xl-base-1.0": (1024, 1024),
-    "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024),
-    "stabilityai/sdxl-turbo": (512, 512),
-}
-
-
-class BaseBenchmak:
-    pipeline_class = None
-
-    def __init__(self, args):
-        super().__init__()
-
-    def run_inference(self, args):
-        raise NotImplementedError
-
-    def benchmark(self, args):
-        raise NotImplementedError
-
-    def get_result_filepath(self, args):
-        pipeline_class_name = str(self.pipe.__class__.__name__)
-        name = (
-            args.ckpt.replace("/", "_")
-            + "_"
-            + pipeline_class_name
-            + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv"
-        )
-        filepath = os.path.join(BASE_PATH, name)
-        return filepath
-
-
-class TextToImageBenchmark(BaseBenchmak):
-    pipeline_class = AutoPipelineForText2Image
-
-    def __init__(self, args):
-        pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
-        pipe = pipe.to("cuda")
-
-        if args.run_compile:
-            if not isinstance(pipe, WuerstchenCombinedPipeline):
-                pipe.unet.to(memory_format=torch.channels_last)
-                print("Run torch compile")
-                pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-
-                if hasattr(pipe, "movq") and getattr(pipe, "movq", None) is not None:
-                    pipe.movq.to(memory_format=torch.channels_last)
-                    pipe.movq = torch.compile(pipe.movq, mode="reduce-overhead", fullgraph=True)
-            else:
-                print("Run torch compile")
-                pipe.decoder = torch.compile(pipe.decoder, mode="reduce-overhead", fullgraph=True)
-                pipe.vqgan = torch.compile(pipe.vqgan, mode="reduce-overhead", fullgraph=True)
-
-        pipe.set_progress_bar_config(disable=True)
-        self.pipe = pipe
-
-    def run_inference(self, pipe, args):
-        _ = pipe(
-            prompt=PROMPT,
-            num_inference_steps=args.num_inference_steps,
-            num_images_per_prompt=args.batch_size,
-        )
-
-    def benchmark(self, args):
-        flush()
-
-        print(f"[INFO] {self.pipe.__class__.__name__}: Running benchmark with: {vars(args)}\n")
-
-        time = benchmark_fn(self.run_inference, self.pipe, args)  # in seconds.
-        memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated())  # in GBs.
-        benchmark_info = BenchmarkInfo(time=time, memory=memory)
-
-        pipeline_class_name = str(self.pipe.__class__.__name__)
-        flush()
-        csv_dict = generate_csv_dict(
-            pipeline_cls=pipeline_class_name, ckpt=args.ckpt, args=args, benchmark_info=benchmark_info
-        )
-        filepath = self.get_result_filepath(args)
-        write_to_csv(filepath, csv_dict)
-        print(f"Logs written to: {filepath}")
-        flush()
-
-
-class TurboTextToImageBenchmark(TextToImageBenchmark):
-    def __init__(self, args):
-        super().__init__(args)
-
-    def run_inference(self, pipe, args):
-        _ = pipe(
-            prompt=PROMPT,
-            num_inference_steps=args.num_inference_steps,
-            num_images_per_prompt=args.batch_size,
-            guidance_scale=0.0,
-        )
-
-
-class LCMLoRATextToImageBenchmark(TextToImageBenchmark):
-    lora_id = "latent-consistency/lcm-lora-sdxl"
-
-    def __init__(self, args):
-        super().__init__(args)
-        self.pipe.load_lora_weights(self.lora_id)
-        self.pipe.fuse_lora()
-        self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config)
-
-    def get_result_filepath(self, args):
-        pipeline_class_name = str(self.pipe.__class__.__name__)
-        name = (
-            self.lora_id.replace("/", "_")
-            + "_"
-            + pipeline_class_name
-            + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv"
-        )
-        filepath = os.path.join(BASE_PATH, name)
-        return filepath
-
-    def run_inference(self, pipe, args):
-        _ = pipe(
-            prompt=PROMPT,
-            num_inference_steps=args.num_inference_steps,
-            num_images_per_prompt=args.batch_size,
-            guidance_scale=1.0,
-        )
-
-
-class ImageToImageBenchmark(TextToImageBenchmark):
-    pipeline_class = AutoPipelineForImage2Image
-    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg"
-    image = load_image(url).convert("RGB")
-
-    def __init__(self, args):
-        super().__init__(args)
-        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
-
-    def run_inference(self, pipe, args):
-        _ = pipe(
-            prompt=PROMPT,
-            image=self.image,
-            num_inference_steps=args.num_inference_steps,
-            num_images_per_prompt=args.batch_size,
-        )
-
-
-class TurboImageToImageBenchmark(ImageToImageBenchmark):
-    def __init__(self, args):
-        super().__init__(args)
-
-    def run_inference(self, pipe, args):
-        _ = pipe(
-            prompt=PROMPT,
-            image=self.image,
-            num_inference_steps=args.num_inference_steps,
-            num_images_per_prompt=args.batch_size,
-            guidance_scale=0.0,
-            strength=0.5,
-        )
-
-
-class InpaintingBenchmark(ImageToImageBenchmark):
-    pipeline_class = AutoPipelineForInpainting
-    mask_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/overture-creations-5sI6fQgYIuo_mask.png"
-    mask = load_image(mask_url).convert("RGB")
-
-    def __init__(self, args):
-        super().__init__(args)
-        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
-        self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt])
-
-    def run_inference(self, pipe, args):
-        _ = pipe(
-            prompt=PROMPT,
-            image=self.image,
-            mask_image=self.mask,
-            num_inference_steps=args.num_inference_steps,
-            num_images_per_prompt=args.batch_size,
-        )
-
-
-class ControlNetBenchmark(TextToImageBenchmark):
-    pipeline_class = StableDiffusionControlNetPipeline
-    aux_network_class = ControlNetModel
-    root_ckpt = "runwayml/stable-diffusion-v1-5"
-
-    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png"
-    image = load_image(url).convert("RGB")
-
-    def __init__(self, args):
-        aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
-        pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16)
-        pipe = pipe.to("cuda")
-
-        pipe.set_progress_bar_config(disable=True)
-        self.pipe = pipe
-
-        if args.run_compile:
-            pipe.unet.to(memory_format=torch.channels_last)
-            pipe.controlnet.to(memory_format=torch.channels_last)
-
-            print("Run torch compile")
-            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-            pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True)
-
-        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
-
-    def run_inference(self, pipe, args):
-        _ = pipe(
-            prompt=PROMPT,
-            image=self.image,
-            num_inference_steps=args.num_inference_steps,
-            num_images_per_prompt=args.batch_size,
-        )
-
-
-class ControlNetSDXLBenchmark(ControlNetBenchmark):
-    pipeline_class = StableDiffusionXLControlNetPipeline
-    root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
-
-    def __init__(self, args):
-        super().__init__(args)
-
-
-class T2IAdapterBenchmark(ControlNetBenchmark):
-    pipeline_class = StableDiffusionAdapterPipeline
-    aux_network_class = T2IAdapter
-    root_ckpt = "CompVis/stable-diffusion-v1-4"
-
-    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png"
-    image = load_image(url).convert("L")
-
-    def __init__(self, args):
-        aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16)
-        pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16)
-        pipe = pipe.to("cuda")
-
-        pipe.set_progress_bar_config(disable=True)
-        self.pipe = pipe
-
-        if args.run_compile:
-            pipe.unet.to(memory_format=torch.channels_last)
-            pipe.adapter.to(memory_format=torch.channels_last)
-
-            print("Run torch compile")
-            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-            pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True)
-
-        self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt])
-
-
-class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark):
-    pipeline_class = StableDiffusionXLAdapterPipeline
-    root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
-
-    url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter_sdxl.png"
-    image = load_image(url)
-
-    def __init__(self, args):
-        super().__init__(args)
--- a/benchmarks/benchmark_controlnet.py
+++ b/benchmarks/benchmark_controlnet.py
@@ -1,26 +0,0 @@
-import argparse
-import sys
-
-
-sys.path.append(".")
-from base_classes import ControlNetBenchmark, ControlNetSDXLBenchmark  # noqa: E402
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--ckpt",
-        type=str,
-        default="lllyasviel/sd-controlnet-canny",
-        choices=["lllyasviel/sd-controlnet-canny", "diffusers/controlnet-canny-sdxl-1.0"],
-    )
-    parser.add_argument("--batch_size", type=int, default=1)
-    parser.add_argument("--num_inference_steps", type=int, default=50)
-    parser.add_argument("--model_cpu_offload", action="store_true")
-    parser.add_argument("--run_compile", action="store_true")
-    args = parser.parse_args()
-
-    benchmark_pipe = (
-        ControlNetBenchmark(args) if args.ckpt == "lllyasviel/sd-controlnet-canny" else ControlNetSDXLBenchmark(args)
-    )
-    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_sd_img.py
+++ b/benchmarks/benchmark_sd_img.py
@@ -1,29 +0,0 @@
-import argparse
-import sys
-
-
-sys.path.append(".")
-from base_classes import ImageToImageBenchmark, TurboImageToImageBenchmark  # noqa: E402
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--ckpt",
-        type=str,
-        default="runwayml/stable-diffusion-v1-5",
-        choices=[
-            "runwayml/stable-diffusion-v1-5",
-            "stabilityai/stable-diffusion-2-1",
-            "stabilityai/stable-diffusion-xl-refiner-1.0",
-            "stabilityai/sdxl-turbo",
-        ],
-    )
-    parser.add_argument("--batch_size", type=int, default=1)
-    parser.add_argument("--num_inference_steps", type=int, default=50)
-    parser.add_argument("--model_cpu_offload", action="store_true")
-    parser.add_argument("--run_compile", action="store_true")
-    args = parser.parse_args()
-
-    benchmark_pipe = ImageToImageBenchmark(args) if "turbo" not in args.ckpt else TurboImageToImageBenchmark(args)
-    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_sd_inpainting.py
+++ b/benchmarks/benchmark_sd_inpainting.py
@@ -1,28 +0,0 @@
-import argparse
-import sys
-
-
-sys.path.append(".")
-from base_classes import InpaintingBenchmark  # noqa: E402
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--ckpt",
-        type=str,
-        default="runwayml/stable-diffusion-v1-5",
-        choices=[
-            "runwayml/stable-diffusion-v1-5",
-            "stabilityai/stable-diffusion-2-1",
-            "stabilityai/stable-diffusion-xl-base-1.0",
-        ],
-    )
-    parser.add_argument("--batch_size", type=int, default=1)
-    parser.add_argument("--num_inference_steps", type=int, default=50)
-    parser.add_argument("--model_cpu_offload", action="store_true")
-    parser.add_argument("--run_compile", action="store_true")
-    args = parser.parse_args()
-
-    benchmark_pipe = InpaintingBenchmark(args)
-    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_t2i_adapter.py
+++ b/benchmarks/benchmark_t2i_adapter.py
@@ -1,28 +0,0 @@
-import argparse
-import sys
-
-
-sys.path.append(".")
-from base_classes import T2IAdapterBenchmark, T2IAdapterSDXLBenchmark  # noqa: E402
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--ckpt",
-        type=str,
-        default="TencentARC/t2iadapter_canny_sd14v1",
-        choices=["TencentARC/t2iadapter_canny_sd14v1", "TencentARC/t2i-adapter-canny-sdxl-1.0"],
-    )
-    parser.add_argument("--batch_size", type=int, default=1)
-    parser.add_argument("--num_inference_steps", type=int, default=50)
-    parser.add_argument("--model_cpu_offload", action="store_true")
-    parser.add_argument("--run_compile", action="store_true")
-    args = parser.parse_args()
-
-    benchmark_pipe = (
-        T2IAdapterBenchmark(args)
-        if args.ckpt == "TencentARC/t2iadapter_canny_sd14v1"
-        else T2IAdapterSDXLBenchmark(args)
-    )
-    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_t2i_lcm_lora.py
+++ b/benchmarks/benchmark_t2i_lcm_lora.py
@@ -1,23 +0,0 @@
-import argparse
-import sys
-
-
-sys.path.append(".")
-from base_classes import LCMLoRATextToImageBenchmark  # noqa: E402
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--ckpt",
-        type=str,
-        default="stabilityai/stable-diffusion-xl-base-1.0",
-    )
-    parser.add_argument("--batch_size", type=int, default=1)
-    parser.add_argument("--num_inference_steps", type=int, default=4)
-    parser.add_argument("--model_cpu_offload", action="store_true")
-    parser.add_argument("--run_compile", action="store_true")
-    args = parser.parse_args()
-
-    benchmark_pipe = LCMLoRATextToImageBenchmark(args)
-    benchmark_pipe.benchmark(args)
--- a/benchmarks/benchmark_text_to_image.py
+++ b/benchmarks/benchmark_text_to_image.py
@@ -1,40 +0,0 @@
-import argparse
-import sys
-
-
-sys.path.append(".")
-from base_classes import TextToImageBenchmark, TurboTextToImageBenchmark  # noqa: E402
-
-
-ALL_T2I_CKPTS = [
-    "runwayml/stable-diffusion-v1-5",
-    "segmind/SSD-1B",
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    "kandinsky-community/kandinsky-2-2-decoder",
-    "warp-ai/wuerstchen",
-    "stabilityai/sdxl-turbo",
-]
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--ckpt",
-        type=str,
-        default="runwayml/stable-diffusion-v1-5",
-        choices=ALL_T2I_CKPTS,
-    )
-    parser.add_argument("--batch_size", type=int, default=1)
-    parser.add_argument("--num_inference_steps", type=int, default=50)
-    parser.add_argument("--model_cpu_offload", action="store_true")
-    parser.add_argument("--run_compile", action="store_true")
-    args = parser.parse_args()
-
-    benchmark_cls = None
-    if "turbo" in args.ckpt:
-        benchmark_cls = TurboTextToImageBenchmark
-    else:
-        benchmark_cls = TextToImageBenchmark
-
-    benchmark_pipe = benchmark_cls(args)
-    benchmark_pipe.benchmark(args)
--- a/benchmarks/push_results.py
+++ b/benchmarks/push_results.py
@@ -1,72 +0,0 @@
-import glob
-import sys
-
-import pandas as pd
-from huggingface_hub import hf_hub_download, upload_file
-from huggingface_hub.utils._errors import EntryNotFoundError
-
-
-sys.path.append(".")
-from utils import BASE_PATH, FINAL_CSV_FILE, GITHUB_SHA, REPO_ID, collate_csv  # noqa: E402
-
-
-def has_previous_benchmark() -> str:
-    csv_path = None
-    try:
-        csv_path = hf_hub_download(repo_id=REPO_ID, repo_type="dataset", filename=FINAL_CSV_FILE)
-    except EntryNotFoundError:
-        csv_path = None
-    return csv_path
-
-
-def filter_float(value):
-    if isinstance(value, str):
-        return float(value.split()[0])
-    return value
-
-
-def push_to_hf_dataset():
-    all_csvs = sorted(glob.glob(f"{BASE_PATH}/*.csv"))
-    collate_csv(all_csvs, FINAL_CSV_FILE)
-
-    # If there's an existing benchmark file, we should report the changes.
-    csv_path = has_previous_benchmark()
-    if csv_path is not None:
-        current_results = pd.read_csv(FINAL_CSV_FILE)
-        previous_results = pd.read_csv(csv_path)
-
-        numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns
-        numeric_columns = [
-            c for c in numeric_columns if c not in ["batch_size", "num_inference_steps", "actual_gpu_memory (gbs)"]
-        ]
-
-        for column in numeric_columns:
-            previous_results[column] = previous_results[column].map(lambda x: filter_float(x))
-
-            # Calculate the percentage change
-            current_results[column] = current_results[column].astype(float)
-            previous_results[column] = previous_results[column].astype(float)
-            percent_change = ((current_results[column] - previous_results[column]) / previous_results[column]) * 100
-
-            # Format the values with '+' or '-' sign and append to original values
-            current_results[column] = current_results[column].map(str) + percent_change.map(
-                lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)"
-            )
-            # There might be newly added rows. So, filter out the NaNs.
-            current_results[column] = current_results[column].map(lambda x: x.replace(" (nan%)", ""))
-
-        # Overwrite the current result file.
-        current_results.to_csv(FINAL_CSV_FILE, index=False)
-
-    commit_message = f"upload from sha: {GITHUB_SHA}" if GITHUB_SHA is not None else "upload benchmark results"
-    upload_file(
-        repo_id=REPO_ID,
-        path_in_repo=FINAL_CSV_FILE,
-        path_or_fileobj=FINAL_CSV_FILE,
-        repo_type="dataset",
-        commit_message=commit_message,
-    )
-
-
-if __name__ == "__main__":
-    push_to_hf_dataset()
--- a/benchmarks/run_all.py
+++ b/benchmarks/run_all.py
@@ -1,97 +0,0 @@
-import glob
-import subprocess
-import sys
-from typing import List
-
-
-sys.path.append(".")
-from benchmark_text_to_image import ALL_T2I_CKPTS  # noqa: E402
-
-
-PATTERN = "benchmark_*.py"
-
-
-class SubprocessCallException(Exception):
-    pass
-
-
-# Taken from `test_examples_utils.py`
-def run_command(command: List[str], return_stdout=False):
-    """
-    Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture
-    if an error occurred while running `command`
-    """
-    try:
-        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
-        if return_stdout:
-            if hasattr(output, "decode"):
-                output = output.decode("utf-8")
-            return output
-    except subprocess.CalledProcessError as e:
-        raise SubprocessCallException(
-            f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}"
-        ) from e
-
-
-def main():
-    python_files = glob.glob(PATTERN)
-
-    for file in python_files:
-        print(f"****** Running file: {file} ******")
-
-        # Run with canonical settings.
-        if file != "benchmark_text_to_image.py":
-            command = f"python {file}"
-            run_command(command.split())
-
-            command += " --run_compile"
-            run_command(command.split())
-
-    # Run variants.
-    for file in python_files:
-        if file == "benchmark_text_to_image.py":
-            for ckpt in ALL_T2I_CKPTS:
-                command = f"python {file} --ckpt {ckpt}"
-
-                if "turbo" in ckpt:
-                    command += " --num_inference_steps 1"
-
-                run_command(command.split())
-
-                command += " --run_compile"
-                run_command(command.split())
-
-        elif file == "benchmark_sd_img.py":
-            for ckpt in ["stabilityai/stable-diffusion-xl-refiner-1.0", "stabilityai/sdxl-turbo"]:
-                command = f"python {file} --ckpt {ckpt}"
-
-                if ckpt == "stabilityai/sdxl-turbo":
-                    command += " --num_inference_steps 2"
-
-                run_command(command.split())
-                command += " --run_compile"
-                run_command(command.split())
-
-        elif file == "benchmark_sd_inpainting.py":
-            sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
-            command = f"python {file} --ckpt {sdxl_ckpt}"
-            run_command(command.split())
-
-            command += " --run_compile"
-            run_command(command.split())
-
-        elif file in ["benchmark_controlnet.py", "benchmark_t2i_adapter.py"]:
-            sdxl_ckpt = (
-                "diffusers/controlnet-canny-sdxl-1.0"
-                if "controlnet" in file
-                else "TencentARC/t2i-adapter-canny-sdxl-1.0"
-            )
-            command = f"python {file} --ckpt {sdxl_ckpt}"
-            run_command(command.split())
-
-            command += " --run_compile"
-            run_command(command.split())
-
-
-if __name__ == "__main__":
-    main()
--- a/benchmarks/utils.py
+++ b/benchmarks/utils.py
@@ -1,98 +0,0 @@
-import argparse
-import csv
-import gc
-import os
-from dataclasses import dataclass
-from typing import Dict, List, Union
-
-import torch
-import torch.utils.benchmark as benchmark
-
-
-GITHUB_SHA = os.getenv("GITHUB_SHA", None)
-BENCHMARK_FIELDS = [
-    "pipeline_cls",
-    "ckpt_id",
-    "batch_size",
-    "num_inference_steps",
-    "model_cpu_offload",
-    "run_compile",
-    "time (secs)",
-    "memory (gbs)",
-    "actual_gpu_memory (gbs)",
-    "github_sha",
-]
-
-PROMPT = "ghibli style, a fantasy landscape with castles"
-BASE_PATH = os.getenv("BASE_PATH", ".")
-TOTAL_GPU_MEMORY = float(os.getenv("TOTAL_GPU_MEMORY", torch.cuda.get_device_properties(0).total_memory / (1024**3)))
-
-REPO_ID = "diffusers/benchmarks"
-FINAL_CSV_FILE = "collated_results.csv"
-
-
-@dataclass
-class BenchmarkInfo:
-    time: float
-    memory: float
-
-
-def flush():
-    """Wipes off memory."""
-    gc.collect()
-    torch.cuda.empty_cache()
-    torch.cuda.reset_max_memory_allocated()
-    torch.cuda.reset_peak_memory_stats()
-
-
-def bytes_to_giga_bytes(bytes):
-    return f"{(bytes / 1024 / 1024 / 1024):.3f}"
-
-
-def benchmark_fn(f, *args, **kwargs):
-    t0 = benchmark.Timer(
-        stmt="f(*args, **kwargs)",
-        globals={"args": args, "kwargs": kwargs, "f": f},
-        num_threads=torch.get_num_threads(),
-    )
-    return f"{(t0.blocked_autorange().mean):.3f}"
-
-
-def generate_csv_dict(
-    pipeline_cls: str, ckpt: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo
-) -> Dict[str, Union[str, bool, float]]:
-    """Packs benchmarking data into a dictionary for latter serialization."""
-    data_dict = {
-        "pipeline_cls": pipeline_cls,
-        "ckpt_id": ckpt,
-        "batch_size": args.batch_size,
-        "num_inference_steps": args.num_inference_steps,
-        "model_cpu_offload": args.model_cpu_offload,
-        "run_compile": args.run_compile,
-        "time (secs)": benchmark_info.time,
-        "memory (gbs)": benchmark_info.memory,
-        "actual_gpu_memory (gbs)": f"{(TOTAL_GPU_MEMORY):.3f}",
-        "github_sha": GITHUB_SHA,
-    }
-    return data_dict
-
-
-def write_to_csv(file_name: str, data_dict: Dict[str, Union[str, bool, float]]):
-    """Serializes a dictionary into a CSV file."""
-    with open(file_name, mode="w", newline="") as csvfile:
-        writer = csv.DictWriter(csvfile, fieldnames=BENCHMARK_FIELDS)
-        writer.writeheader()
-        writer.writerow(data_dict)
-
-
-def collate_csv(input_files: List[str], output_file: str):
-    """Collates multiple identically structured CSVs into a single CSV file."""
-    with open(output_file, mode="w", newline="") as outfile:
-        writer = csv.DictWriter(outfile, fieldnames=BENCHMARK_FIELDS)
-        writer.writeheader()
-
-        for file in input_files:
-            with open(file, mode="r") as infile:
-                reader = csv.DictReader(infile)
-                for row in reader:
-                    writer.writerow(row)
--- a/examples/README.md
+++ b/examples/README.md
@@ -18,7 +18,8 @@ limitations under the License.
 Diffusers examples are a collection of scripts to demonstrate how to effectively use the `diffusers` library
 for a variety of use cases involving training or fine-tuning.

-**Note**: If you are looking for **official** examples on how to use `diffusers` for inference, please have a look at [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).
+**Note**: If you are looking for **official** examples on how to use `diffusers` for inference, 
+please have a look at [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).

 Our examples aspire to be **self-contained**, **easy-to-tweak**, **beginner-friendly** and for **one-purpose-only**.
 More specifically, this means:
@@ -26,7 +27,8 @@ More specifically, this means:
 - **Self-contained**: An example script shall only depend on "pip-install-able" Python packages that can be found in a `requirements.txt` file. Example scripts shall **not** depend on any local files. This means that one can simply download an example script, *e.g.* [train_unconditional.py](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/train_unconditional.py), install the required dependencies, *e.g.* [requirements.txt](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/requirements.txt) and execute the example script.
 - **Easy-to-tweak**: While we strive to present as many use cases as possible, the example scripts are just that - examples. It is expected that they won't work out-of-the box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs. To help you with that, most of the examples fully expose the preprocessing of the data and the training loop to allow you to tweak and edit them as required.
 - **Beginner-friendly**: We do not aim for providing state-of-the-art training scripts for the newest models, but rather examples that can be used as a way to better understand diffusion models and how to use them with the `diffusers` library. We often purposefully leave out certain state-of-the-art methods if we consider them too complex for beginners.
- **One-purpose-only**: Examples should show one task and one task only. Even if a task is from a modeling point of view very similar, *e.g.* image super-resolution and image modification tend to use the same model and training method, we want examples to showcase only one task to keep them as readable and easy-to-understand as possible.
+- **One-purpose-only**: Examples should show one task and one task only. Even if a task is from a modeling 
+point of view very similar, *e.g.* image super-resolution and image modification tend to use the same model and training method, we want examples to showcase only one task to keep them as readable and easy-to-understand as possible.

 We provide **official** examples that cover the most popular tasks of diffusion models.
 *Official* examples are **actively** maintained by the `diffusers` maintainers and we try to rigorously follow our example philosophy as defined above. 
--- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
+++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
@@ -135,8 +135,8 @@ from safetensors.torch import load_file
        """
        diffusers_example_pivotal = f"""embedding_path = hf_hub_download(repo_id='{repo_id}', filename="embeddings.safetensors", repo_type="model")
 state_dict = load_file(embedding_path)
-pipeline.load_textual_inversion(state_dict["clip_l"], token=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer)
-pipeline.load_textual_inversion(state_dict["clip_g"], token=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2)
+pipeline.load_textual_inversion(state_dict["clip_l"], token=["<s0>", "<s1>"], text_encoder=pipe.text_encoder, tokenizer=pipe.tokenizer)
+pipeline.load_textual_inversion(state_dict["clip_g"], token=["<s0>", "<s1>"], text_encoder=pipe.text_encoder_2, tokenizer=pipe.tokenizer_2)
        """
        if token_abstraction_dict:
            for key, value in token_abstraction_dict.items():
@@ -157,8 +157,6 @@ tags:
 base_model: {base_model}
 instance_prompt: {instance_prompt}
 license: openrail++
-widget:
-    - text: '{validation_prompt if validation_prompt else instance_prompt}'
 ---
 """

--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -48,7 +48,6 @@ prompt-to-prompt | change parts of a prompt and retain image structure (see [pap
 |   Latent Consistency Pipeline                                                                                                    | Implementation of [Latent Consistency Models: Synthesizing High-Resolution Images with Few-Step Inference](https://arxiv.org/abs/2310.04378)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Latent Consistency Pipeline](#latent-consistency-pipeline)      | - |              [Simian Luo](https://github.com/luosiallen) |
 |   Latent Consistency Img2img Pipeline                                                                                                    | Img2img pipeline for Latent Consistency Models                                                                                                                                                                                                                                                                                                                                                                                                                                    | [Latent Consistency Img2Img Pipeline](#latent-consistency-img2img-pipeline)      | - |              [Logan Zoellner](https://github.com/nagolinc) |
 |   Latent Consistency Interpolation Pipeline                                                                                                    | Interpolate the latent space of Latent Consistency Models with multiple prompts                                                                                                                                                                                                                                                                                                                                                                                                                                    | [Latent Consistency Interpolation Pipeline](#latent-consistency-interpolation-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1pK3NrLWJSiJsBynLns1K1-IDTW9zbPvl?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) |
-| SDE Drag Pipeline                                                                                                                         | The pipeline supports drag editing of images using stochastic differential equations                                                                                                                                                                                                                                                                                                                                                                                                                | [SDE Drag Pipeline](#sde-drag-pipeline)                                                     | - | [NieShen](https://github.com/NieShenRuc) [Fengqi Zhu](https://github.com/Monohydroxides) |
 |   Regional Prompting Pipeline                                                                                               | Assign multiple prompts for different regions                                                                                                                                                                                                                                                                                                                                                    |  [Regional Prompting Pipeline](#regional-prompting-pipeline) | - | [hako-mikan](https://github.com/hako-mikan) |
 | LDM3D-sr (LDM3D upscaler)                                                                                                             | Upscale low resolution RGB and depth inputs to high resolution                                                                                                                                                                                                                                                                                                                                                                                                                              | [StableDiffusionUpscaleLDM3D Pipeline](https://github.com/estelleafl/diffusers/tree/ldm3d_upscaler_community/examples/community#stablediffusionupscaleldm3d-pipeline)                                                                             | -                                                                                                                                                                                                             |                                                        [Estelle Aflalo](https://github.com/estelleafl) |
 | AnimateDiff ControlNet Pipeline                                                                                                    | Combines AnimateDiff with precise motion control using ControlNets                                                                                                                                                                                                                                                                                                                                                                                                                                    | [AnimateDiff ControlNet Pipeline](#animatediff-controlnet-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SKboYeGjEQmQPWoFC0aLYpBlYdHXkvAu?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) and [Edoardo Botta](https://github.com/EdoardoBotta) |
@@ -2931,7 +2930,7 @@ The original repo can be found at [repo](https://github.com/PRIS-CV/DemoFusion).

 - `show_image` (`bool`, defaults to False):
  Determine whether to show intermediate results during generation.
-```py
+```
 from diffusers import DiffusionPipeline

 pipe = DiffusionPipeline.from_pretrained(
@@ -2963,7 +2962,7 @@ images = pipe(
 )
 ```
 You can display and save the generated images as:
-```py
+```
 def image_grid(imgs, save_path=None):

    w = 0
@@ -2987,42 +2986,3 @@ def image_grid(imgs, save_path=None):
 image_grid(images, save_path="./outputs/")
 ```
 ![output_example](https://github.com/PRIS-CV/DemoFusion/blob/main/output_example.png)
-
-### SDE Drag pipeline
-
-This pipeline provides drag-and-drop image editing using stochastic differential equations. It enables image editing by inputting prompt, image, mask_image, source_points, and target_points.
-
-![SDE Drag Image](https://github.com/huggingface/diffusers/assets/75928535/bd54f52f-f002-4951-9934-b2a4592771a5)
-
-See [paper](https://arxiv.org/abs/2311.01410), [paper page](https://ml-gsai.github.io/SDE-Drag-demo/), [original repo](https://github.com/ML-GSAI/SDE-Drag) for more infomation.
-
-```py
-import PIL
-import torch
-from diffusers import DDIMScheduler, DiffusionPipeline
-
-# Load the pipeline
-model_path = "runwayml/stable-diffusion-v1-5"
-scheduler = DDIMScheduler.from_pretrained(model_path, subfolder="scheduler")
-pipe = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, custom_pipeline="sde_drag")
-pipe.to('cuda')
-
-# To save GPU memory, torch.float16 can be used, but it may compromise image quality.
-# If not training LoRA, please avoid using torch.float16
-# pipe.to(torch.float16)
-
-# Provide prompt, image, mask image, and the starting and target points for drag editing.
-prompt = "prompt of the image"
-image = PIL.Image.open('/path/to/image')
-mask_image = PIL.Image.open('/path/to/mask_image')
-source_points = [[123, 456]]
-target_points = [[234, 567]]
-
-# train_lora is optional, and in most cases, using train_lora can better preserve consistency with the original image.
-pipe.train_lora(prompt, image)
-
-output = pipe(prompt, image, mask_image, source_points, target_points)
-output_image = PIL.Image.fromarray(output)
-output_image.save("./output.png")
-
-```
--- a/examples/community/sde_drag.py
+++ b/examples/community/sde_drag.py
@@ -1,594 +0,0 @@
-import math
-import tempfile
-from typing import List, Optional
-
-import numpy as np
-import PIL.Image
-import torch
-from accelerate import Accelerator
-from torchvision import transforms
-from tqdm.auto import tqdm
-from transformers import CLIPTextModel, CLIPTokenizer
-
-from diffusers import AutoencoderKL, DiffusionPipeline, DPMSolverMultistepScheduler, UNet2DConditionModel
-from diffusers.loaders import AttnProcsLayers, LoraLoaderMixin
-from diffusers.models.attention_processor import (
-    AttnAddedKVProcessor,
-    AttnAddedKVProcessor2_0,
-    LoRAAttnAddedKVProcessor,
-    LoRAAttnProcessor,
-    LoRAAttnProcessor2_0,
-    SlicedAttnAddedKVProcessor,
-)
-from diffusers.optimization import get_scheduler
-
-
-class SdeDragPipeline(DiffusionPipeline):
-    r"""
-    Pipeline for image drag-and-drop editing using stochastic differential equations: https://arxiv.org/abs/2311.01410.
-    Please refer to the [official repository](https://github.com/ML-GSAI/SDE-Drag) for more information.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder. Stable Diffusion uses the text portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Please use
-            [`DDIMScheduler`].
-    """
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        scheduler: DPMSolverMultistepScheduler,
-    ):
-        super().__init__()
-
-        self.register_modules(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: str,
-        image: PIL.Image.Image,
-        mask_image: PIL.Image.Image,
-        source_points: List[List[int]],
-        target_points: List[List[int]],
-        t0: Optional[float] = 0.6,
-        steps: Optional[int] = 200,
-        step_size: Optional[int] = 2,
-        image_scale: Optional[float] = 0.3,
-        adapt_radius: Optional[int] = 5,
-        min_lora_scale: Optional[float] = 0.5,
-        generator: Optional[torch.Generator] = None,
-    ):
-        r"""
-        Function invoked when calling the pipeline for image editing.
-        Args:
-            prompt (`str`, *required*):
-                The prompt to guide the image editing.
-            image (`PIL.Image.Image`, *required*):
-                Which will be edited, parts of the image will be masked out with `mask_image` and edited
-                according to `prompt`.
-            mask_image (`PIL.Image.Image`, *required*):
-                To mask `image`. White pixels in the mask will be edited, while black pixels will be preserved.
-            source_points (`List[List[int]]`, *required*):
-                Used to mark the starting positions of drag editing in the image, with each pixel represented as a
-                `List[int]` of length 2.
-            target_points (`List[List[int]]`, *required*):
-                Used to mark the target positions of drag editing in the image, with each pixel represented as a
-                `List[int]` of length 2.
-            t0 (`float`, *optional*, defaults to 0.6):
-                The time parameter. Higher t0 improves the fidelity while lowering the faithfulness of the edited images
-                and vice versa.
-            steps (`int`, *optional*, defaults to 200):
-                The number of sampling iterations.
-            step_size (`int`, *optional*, defaults to 2):
-                The drag diatance of each drag step.
-            image_scale (`float`, *optional*, defaults to 0.3):
-                To avoid duplicating the content, use image_scale to perturbs the source.
-            adapt_radius (`int`, *optional*, defaults to 5):
-                The size of the region for copy and paste operations during each step of the drag process.
-            min_lora_scale (`float`, *optional*, defaults to 0.5):
-                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
-                min_lora_scale specifies the minimum LoRA scale during the image drag-editing process.
-            generator ('torch.Generator', *optional*, defaults to None):
-                To make generation deterministic(https://pytorch.org/docs/stable/generated/torch.Generator.html).
-        Examples:
-        ```py
-        >>> import PIL
-        >>> import torch
-        >>> from diffusers import DDIMScheduler, DiffusionPipeline
-
-        >>> # Load the pipeline
-        >>> model_path = "runwayml/stable-diffusion-v1-5"
-        >>> scheduler = DDIMScheduler.from_pretrained(model_path, subfolder="scheduler")
-        >>> pipe = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, custom_pipeline="sde_drag")
-        >>> pipe.to('cuda')
-
-        >>> # To save GPU memory, torch.float16 can be used, but it may compromise image quality.
-        >>> # If not training LoRA, please avoid using torch.float16
-        >>> # pipe.to(torch.float16)
-
-        >>> # Provide prompt, image, mask image, and the starting and target points for drag editing.
-        >>> prompt = "prompt of the image"
-        >>> image = PIL.Image.open('/path/to/image')
-        >>> mask_image = PIL.Image.open('/path/to/mask_image')
-        >>> source_points = [[123, 456]]
-        >>> target_points = [[234, 567]]
-
-        >>> # train_lora is optional, and in most cases, using train_lora can better preserve consistency with the original image.
-        >>> pipe.train_lora(prompt, image)
-
-        >>> output = pipe(prompt, image, mask_image, source_points, target_points)
-        >>> output_image = PIL.Image.fromarray(output)
-        >>> output_image.save("./output.png")
-        ```
-        """
-
-        self.scheduler.set_timesteps(steps)
-
-        noise_scale = (1 - image_scale**2) ** (0.5)
-
-        text_embeddings = self._get_text_embed(prompt)
-        uncond_embeddings = self._get_text_embed([""])
-        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-        latent = self._get_img_latent(image)
-
-        mask = mask_image.resize((latent.shape[3], latent.shape[2]))
-        mask = torch.tensor(np.array(mask))
-        mask = mask.unsqueeze(0).expand_as(latent).to(self.device)
-
-        source_points = torch.tensor(source_points).div(torch.tensor([8]), rounding_mode="trunc")
-        target_points = torch.tensor(target_points).div(torch.tensor([8]), rounding_mode="trunc")
-
-        distance = target_points - source_points
-        distance_norm_max = torch.norm(distance.float(), dim=1, keepdim=True).max()
-
-        if distance_norm_max <= step_size:
-            drag_num = 1
-        else:
-            drag_num = distance_norm_max.div(torch.tensor([step_size]), rounding_mode="trunc")
-            if (distance_norm_max / drag_num - step_size).abs() > (
-                distance_norm_max / (drag_num + 1) - step_size
-            ).abs():
-                drag_num += 1
-
-        latents = []
-        for i in tqdm(range(int(drag_num)), desc="SDE Drag"):
-            source_new = source_points + (i / drag_num * distance).to(torch.int)
-            target_new = source_points + ((i + 1) / drag_num * distance).to(torch.int)
-
-            latent, noises, hook_latents, lora_scales, cfg_scales = self._forward(
-                latent, steps, t0, min_lora_scale, text_embeddings, generator
-            )
-            latent = self._copy_and_paste(
-                latent,
-                source_new,
-                target_new,
-                adapt_radius,
-                latent.shape[2] - 1,
-                latent.shape[3] - 1,
-                image_scale,
-                noise_scale,
-                generator,
-            )
-            latent = self._backward(
-                latent, mask, steps, t0, noises, hook_latents, lora_scales, cfg_scales, text_embeddings, generator
-            )
-
-            latents.append(latent)
-
-        result_image = 1 / 0.18215 * latents[-1]
-
-        with torch.no_grad():
-            result_image = self.vae.decode(result_image).sample
-
-        result_image = (result_image / 2 + 0.5).clamp(0, 1)
-        result_image = result_image.cpu().permute(0, 2, 3, 1).numpy()[0]
-        result_image = (result_image * 255).astype(np.uint8)
-
-        return result_image
-
-    def train_lora(self, prompt, image, lora_step=100, lora_rank=16, generator=None):
-        accelerator = Accelerator(gradient_accumulation_steps=1, mixed_precision="fp16")
-
-        self.vae.requires_grad_(False)
-        self.text_encoder.requires_grad_(False)
-        self.unet.requires_grad_(False)
-
-        unet_lora_attn_procs = {}
-        for name, attn_processor in self.unet.attn_processors.items():
-            cross_attention_dim = None if name.endswith("attn1.processor") else self.unet.config.cross_attention_dim
-            if name.startswith("mid_block"):
-                hidden_size = self.unet.config.block_out_channels[-1]
-            elif name.startswith("up_blocks"):
-                block_id = int(name[len("up_blocks.")])
-                hidden_size = list(reversed(self.unet.config.block_out_channels))[block_id]
-            elif name.startswith("down_blocks"):
-                block_id = int(name[len("down_blocks.")])
-                hidden_size = self.unet.config.block_out_channels[block_id]
-            else:
-                raise NotImplementedError("name must start with up_blocks, mid_blocks, or down_blocks")
-
-            if isinstance(attn_processor, (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_0)):
-                lora_attn_processor_class = LoRAAttnAddedKVProcessor
-            else:
-                lora_attn_processor_class = (
-                    LoRAAttnProcessor2_0
-                    if hasattr(torch.nn.functional, "scaled_dot_product_attention")
-                    else LoRAAttnProcessor
-                )
-            unet_lora_attn_procs[name] = lora_attn_processor_class(
-                hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=lora_rank
-            )
-
-        self.unet.set_attn_processor(unet_lora_attn_procs)
-        unet_lora_layers = AttnProcsLayers(self.unet.attn_processors)
-        params_to_optimize = unet_lora_layers.parameters()
-
-        optimizer = torch.optim.AdamW(
-            params_to_optimize,
-            lr=2e-4,
-            betas=(0.9, 0.999),
-            weight_decay=1e-2,
-            eps=1e-08,
-        )
-
-        lr_scheduler = get_scheduler(
-            "constant",
-            optimizer=optimizer,
-            num_warmup_steps=0,
-            num_training_steps=lora_step,
-            num_cycles=1,
-            power=1.0,
-        )
-
-        unet_lora_layers = accelerator.prepare_model(unet_lora_layers)
-        optimizer = accelerator.prepare_optimizer(optimizer)
-        lr_scheduler = accelerator.prepare_scheduler(lr_scheduler)
-
-        with torch.no_grad():
-            text_inputs = self._tokenize_prompt(prompt, tokenizer_max_length=None)
-            text_embedding = self._encode_prompt(
-                text_inputs.input_ids, text_inputs.attention_mask, text_encoder_use_attention_mask=False
-            )
-
-        image_transforms = transforms.Compose(
-            [
-                transforms.ToTensor(),
-                transforms.Normalize([0.5], [0.5]),
-            ]
-        )
-
-        image = image_transforms(image).to(self.device, dtype=self.vae.dtype)
-        image = image.unsqueeze(dim=0)
-        latents_dist = self.vae.encode(image).latent_dist
-
-        for _ in tqdm(range(lora_step), desc="Train LoRA"):
-            self.unet.train()
-            model_input = latents_dist.sample() * self.vae.config.scaling_factor
-
-            # Sample noise that we'll add to the latents
-            noise = torch.randn(
-                model_input.size(),
-                dtype=model_input.dtype,
-                layout=model_input.layout,
-                device=model_input.device,
-                generator=generator,
-            )
-            bsz, channels, height, width = model_input.shape
-
-            # Sample a random timestep for each image
-            timesteps = torch.randint(
-                0, self.scheduler.config.num_train_timesteps, (bsz,), device=model_input.device, generator=generator
-            )
-            timesteps = timesteps.long()
-
-            # Add noise to the model input according to the noise magnitude at each timestep
-            # (this is the forward diffusion process)
-            noisy_model_input = self.scheduler.add_noise(model_input, noise, timesteps)
-
-            # Predict the noise residual
-            model_pred = self.unet(noisy_model_input, timesteps, text_embedding).sample
-
-            # Get the target for loss depending on the prediction type
-            if self.scheduler.config.prediction_type == "epsilon":
-                target = noise
-            elif self.scheduler.config.prediction_type == "v_prediction":
-                target = self.scheduler.get_velocity(model_input, noise, timesteps)
-            else:
-                raise ValueError(f"Unknown prediction type {self.scheduler.config.prediction_type}")
-
-            loss = torch.nn.functional.mse_loss(model_pred.float(), target.float(), reduction="mean")
-            accelerator.backward(loss)
-            optimizer.step()
-            lr_scheduler.step()
-            optimizer.zero_grad()
-
-        with tempfile.TemporaryDirectory() as save_lora_dir:
-            LoraLoaderMixin.save_lora_weights(
-                save_directory=save_lora_dir,
-                unet_lora_layers=unet_lora_layers,
-                text_encoder_lora_layers=None,
-            )
-
-            self.unet.load_attn_procs(save_lora_dir)
-
-    def _tokenize_prompt(self, prompt, tokenizer_max_length=None):
-        if tokenizer_max_length is not None:
-            max_length = tokenizer_max_length
-        else:
-            max_length = self.tokenizer.model_max_length
-
-        text_inputs = self.tokenizer(
-            prompt,
-            truncation=True,
-            padding="max_length",
-            max_length=max_length,
-            return_tensors="pt",
-        )
-
-        return text_inputs
-
-    def _encode_prompt(self, input_ids, attention_mask, text_encoder_use_attention_mask=False):
-        text_input_ids = input_ids.to(self.device)
-
-        if text_encoder_use_attention_mask:
-            attention_mask = attention_mask.to(self.device)
-        else:
-            attention_mask = None
-
-        prompt_embeds = self.text_encoder(
-            text_input_ids,
-            attention_mask=attention_mask,
-        )
-        prompt_embeds = prompt_embeds[0]
-
-        return prompt_embeds
-
-    @torch.no_grad()
-    def _get_text_embed(self, prompt):
-        text_input = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
-        return text_embeddings
-
-    def _copy_and_paste(
-        self, latent, source_new, target_new, adapt_radius, max_height, max_width, image_scale, noise_scale, generator
-    ):
-        def adaption_r(source, target, adapt_radius, max_height, max_width):
-            r_x_lower = min(adapt_radius, source[0], target[0])
-            r_x_upper = min(adapt_radius, max_width - source[0], max_width - target[0])
-            r_y_lower = min(adapt_radius, source[1], target[1])
-            r_y_upper = min(adapt_radius, max_height - source[1], max_height - target[1])
-            return r_x_lower, r_x_upper, r_y_lower, r_y_upper
-
-        for source_, target_ in zip(source_new, target_new):
-            r_x_lower, r_x_upper, r_y_lower, r_y_upper = adaption_r(
-                source_, target_, adapt_radius, max_height, max_width
-            )
-
-            source_feature = latent[
-                :, :, source_[1] - r_y_lower : source_[1] + r_y_upper, source_[0] - r_x_lower : source_[0] + r_x_upper
-            ].clone()
-
-            latent[
-                :, :, source_[1] - r_y_lower : source_[1] + r_y_upper, source_[0] - r_x_lower : source_[0] + r_x_upper
-            ] = image_scale * source_feature + noise_scale * torch.randn(
-                latent.shape[0],
-                4,
-                r_y_lower + r_y_upper,
-                r_x_lower + r_x_upper,
-                device=self.device,
-                generator=generator,
-            )
-
-            latent[
-                :, :, target_[1] - r_y_lower : target_[1] + r_y_upper, target_[0] - r_x_lower : target_[0] + r_x_upper
-            ] = source_feature * 1.1
-        return latent
-
-    @torch.no_grad()
-    def _get_img_latent(self, image, height=None, weight=None):
-        data = image.convert("RGB")
-        if height is not None:
-            data = data.resize((weight, height))
-        transform = transforms.ToTensor()
-        data = transform(data).unsqueeze(0)
-        data = (data * 2.0) - 1.0
-        data = data.to(self.device, dtype=self.vae.dtype)
-        latent = self.vae.encode(data).latent_dist.sample()
-        latent = 0.18215 * latent
-        return latent
-
-    @torch.no_grad()
-    def _get_eps(self, latent, timestep, guidance_scale, text_embeddings, lora_scale=None):
-        latent_model_input = torch.cat([latent] * 2) if guidance_scale > 1.0 else latent
-        text_embeddings = text_embeddings if guidance_scale > 1.0 else text_embeddings.chunk(2)[1]
-
-        cross_attention_kwargs = None if lora_scale is None else {"scale": lora_scale}
-
-        with torch.no_grad():
-            noise_pred = self.unet(
-                latent_model_input,
-                timestep,
-                encoder_hidden_states=text_embeddings,
-                cross_attention_kwargs=cross_attention_kwargs,
-            ).sample
-
-        if guidance_scale > 1.0:
-            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-        elif guidance_scale == 1.0:
-            noise_pred_text = noise_pred
-            noise_pred_uncond = 0.0
-        else:
-            raise NotImplementedError(guidance_scale)
-        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-        return noise_pred
-
-    def _forward_sde(
-        self, timestep, sample, guidance_scale, text_embeddings, steps, eta=1.0, lora_scale=None, generator=None
-    ):
-        num_train_timesteps = len(self.scheduler)
-        alphas_cumprod = self.scheduler.alphas_cumprod
-        initial_alpha_cumprod = torch.tensor(1.0)
-
-        prev_timestep = timestep + num_train_timesteps // steps
-
-        alpha_prod_t = alphas_cumprod[timestep] if timestep >= 0 else initial_alpha_cumprod
-        alpha_prod_t_prev = alphas_cumprod[prev_timestep]
-
-        beta_prod_t_prev = 1 - alpha_prod_t_prev
-
-        x_prev = (alpha_prod_t_prev / alpha_prod_t) ** (0.5) * sample + (1 - alpha_prod_t_prev / alpha_prod_t) ** (
-            0.5
-        ) * torch.randn(
-            sample.size(), dtype=sample.dtype, layout=sample.layout, device=self.device, generator=generator
-        )
-        eps = self._get_eps(x_prev, prev_timestep, guidance_scale, text_embeddings, lora_scale)
-
-        sigma_t_prev = (
-            eta
-            * (1 - alpha_prod_t) ** (0.5)
-            * (1 - alpha_prod_t_prev / (1 - alpha_prod_t_prev) * (1 - alpha_prod_t) / alpha_prod_t) ** (0.5)
-        )
-
-        pred_original_sample = (x_prev - beta_prod_t_prev ** (0.5) * eps) / alpha_prod_t_prev ** (0.5)
-        pred_sample_direction_coeff = (1 - alpha_prod_t - sigma_t_prev**2) ** (0.5)
-
-        noise = (
-            sample - alpha_prod_t ** (0.5) * pred_original_sample - pred_sample_direction_coeff * eps
-        ) / sigma_t_prev
-
-        return x_prev, noise
-
-    def _sample(
-        self,
-        timestep,
-        sample,
-        guidance_scale,
-        text_embeddings,
-        steps,
-        sde=False,
-        noise=None,
-        eta=1.0,
-        lora_scale=None,
-        generator=None,
-    ):
-        num_train_timesteps = len(self.scheduler)
-        alphas_cumprod = self.scheduler.alphas_cumprod
-        final_alpha_cumprod = torch.tensor(1.0)
-
-        eps = self._get_eps(sample, timestep, guidance_scale, text_embeddings, lora_scale)
-
-        prev_timestep = timestep - num_train_timesteps // steps
-
-        alpha_prod_t = alphas_cumprod[timestep]
-        alpha_prod_t_prev = alphas_cumprod[prev_timestep] if prev_timestep >= 0 else final_alpha_cumprod
-
-        beta_prod_t = 1 - alpha_prod_t
-
-        sigma_t = (
-            eta
-            * ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) ** (0.5)
-            * (1 - alpha_prod_t / alpha_prod_t_prev) ** (0.5)
-            if sde
-            else 0
-        )
-
-        pred_original_sample = (sample - beta_prod_t ** (0.5) * eps) / alpha_prod_t ** (0.5)
-        pred_sample_direction_coeff = (1 - alpha_prod_t_prev - sigma_t**2) ** (0.5)
-
-        noise = (
-            torch.randn(
-                sample.size(), dtype=sample.dtype, layout=sample.layout, device=self.device, generator=generator
-            )
-            if noise is None
-            else noise
-        )
-        latent = (
-            alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction_coeff * eps + sigma_t * noise
-        )
-
-        return latent
-
-    def _forward(self, latent, steps, t0, lora_scale_min, text_embeddings, generator):
-        def scale_schedule(begin, end, n, length, type="linear"):
-            if type == "constant":
-                return end
-            elif type == "linear":
-                return begin + (end - begin) * n / length
-            elif type == "cos":
-                factor = (1 - math.cos(n * math.pi / length)) / 2
-                return (1 - factor) * begin + factor * end
-            else:
-                raise NotImplementedError(type)
-
-        noises = []
-        latents = []
-        lora_scales = []
-        cfg_scales = []
-        latents.append(latent)
-        t0 = int(t0 * steps)
-        t_begin = steps - t0
-
-        length = len(self.scheduler.timesteps[t_begin - 1 : -1]) - 1
-        index = 1
-        for t in self.scheduler.timesteps[t_begin:].flip(dims=[0]):
-            lora_scale = scale_schedule(1, lora_scale_min, index, length, type="cos")
-            cfg_scale = scale_schedule(1, 3.0, index, length, type="linear")
-            latent, noise = self._forward_sde(
-                t, latent, cfg_scale, text_embeddings, steps, lora_scale=lora_scale, generator=generator
-            )
-
-            noises.append(noise)
-            latents.append(latent)
-            lora_scales.append(lora_scale)
-            cfg_scales.append(cfg_scale)
-            index += 1
-        return latent, noises, latents, lora_scales, cfg_scales
-
-    def _backward(
-        self, latent, mask, steps, t0, noises, hook_latents, lora_scales, cfg_scales, text_embeddings, generator
-    ):
-        t0 = int(t0 * steps)
-        t_begin = steps - t0
-
-        hook_latent = hook_latents.pop()
-        latent = torch.where(mask > 128, latent, hook_latent)
-        for t in self.scheduler.timesteps[t_begin - 1 : -1]:
-            latent = self._sample(
-                t,
-                latent,
-                cfg_scales.pop(),
-                text_embeddings,
-                steps,
-                sde=True,
-                noise=noises.pop(),
-                lora_scale=lora_scales.pop(),
-                generator=generator,
-            )
-            hook_latent = hook_latents.pop()
-            latent = torch.where(mask > 128, latent, hook_latent)
-        return latent
--- a/examples/research_projects/mulit_token_textual_inversion/README.md
+++ b/examples/research_projects/mulit_token_textual_inversion/README.md
@@ -1,6 +1,6 @@
 ## [Deprecated] Multi Token Textual Inversion

-**IMPORTART: This research project is deprecated. Multi Token Textual Inversion is now supported natively in [the official textual inversion example](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion#running-locally-with-pytorch).**
+**IMPORTART: This research project is deprecated. Multi Token Textual Inversion is now supported natively in [the officail textual inversion example](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion#running-locally-with-pytorch).**

 The author of this project is [Isamu Isozaki](https://github.com/isamu-isozaki) - please make sure to tag the author for issue and PRs as well as @patrickvonplaten.

--- a/examples/research_projects/mulit_token_textual_inversion/multi_token_clip.py
+++ b/examples/research_projects/mulit_token_textual_inversion/multi_token_clip.py
--- a/examples/research_projects/mulit_token_textual_inversion/requirements.txt
+++ b/examples/research_projects/mulit_token_textual_inversion/requirements.txt
--- a/examples/research_projects/mulit_token_textual_inversion/requirements_flax.txt
+++ b/examples/research_projects/mulit_token_textual_inversion/requirements_flax.txt
--- a/examples/research_projects/mulit_token_textual_inversion/textual_inversion.py
+++ b/examples/research_projects/mulit_token_textual_inversion/textual_inversion.py
--- a/examples/research_projects/mulit_token_textual_inversion/textual_inversion_flax.py
+++ b/examples/research_projects/mulit_token_textual_inversion/textual_inversion_flax.py
--- a/setup.py
+++ b/setup.py
@@ -204,7 +204,7 @@ class DepsTableUpdateCommand(Command):
 extras = {}
 extras["quality"] = deps_list("urllib3", "isort", "ruff", "hf-doc-builder")
 extras["docs"] = deps_list("hf-doc-builder")
-extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2", "peft")
+extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2")
 extras["test"] = deps_list(
    "compel",
    "GitPython",
--- a/src/diffusers/image_processor.py
+++ b/src/diffusers/image_processor.py
@@ -88,7 +88,7 @@ class VaeImageProcessor(ConfigMixin):
            self.config.do_convert_rgb = False

    @staticmethod
-    def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
+    def numpy_to_pil(images: np.ndarray) -> PIL.Image.Image:
        """
        Convert a numpy image or a batch of images to a PIL image.
        """
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
@@ -19,10 +19,10 @@ import numpy as np
 import PIL.Image
 import torch
 import torch.nn.functional as F
-from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
@@ -130,7 +130,7 @@ def prepare_image(image):


 class StableDiffusionControlNetImg2ImgPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
 ):
    r"""
    Pipeline for image-to-image generation using Stable Diffusion with ControlNet guidance.
@@ -140,7 +140,7 @@ class StableDiffusionControlNetImg2ImgPipeline(

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
+
    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
@@ -166,7 +166,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
    """

    model_cpu_offload_seq = "text_encoder->unet->vae"
-    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _optional_components = ["safety_checker", "feature_extractor"]
    _exclude_from_cpu_offload = ["safety_checker"]
    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]

@@ -180,7 +180,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
-        image_encoder: CLIPVisionModelWithProjection = None,
        requires_safety_checker: bool = True,
    ):
        super().__init__()
@@ -213,7 +212,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
-            image_encoder=image_encoder,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
@@ -470,31 +468,6 @@ class StableDiffusionControlNetImg2ImgPipeline(

        return prompt_embeds, negative_prompt_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
-    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
-        dtype = next(self.image_encoder.parameters()).dtype
-
-        if not isinstance(image, torch.Tensor):
-            image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
-        image = image.to(device=device, dtype=dtype)
-        if output_hidden_states:
-            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_enc_hidden_states = self.image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
-                num_images_per_prompt, dim=0
-            )
-            return image_enc_hidden_states, uncond_image_enc_hidden_states
-        else:
-            image_embeds = self.image_encoder(image).image_embeds
-            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_embeds = torch.zeros_like(image_embeds)
-
-            return image_embeds, uncond_image_embeds
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is None:
@@ -888,7 +861,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -950,7 +922,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
@@ -1082,11 +1053,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
        if self.do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

-        if ip_adapter_image is not None:
-            image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
-            if self.do_classifier_free_guidance:
-                image_embeds = torch.cat([negative_image_embeds, image_embeds])
-
        # 4. Prepare image
        image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32)

@@ -1145,10 +1111,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

-        # 7.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
-
-        # 7.2 Create tensor stating which controlnets to keep
+        # 7.1 Create tensor stating which controlnets to keep
        controlnet_keep = []
        for i in range(len(timesteps)):
            keeps = [
@@ -1208,7 +1171,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
                    cross_attention_kwargs=self.cross_attention_kwargs,
                    down_block_additional_residuals=down_block_res_samples,
                    mid_block_additional_residual=mid_block_res_sample,
-                    added_cond_kwargs=added_cond_kwargs,
                    return_dict=False,
                )[0]

--- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
+++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
@@ -20,11 +20,11 @@ from typing import Any, Callable, Dict, List, Optional, Union

 import PIL.Image
 import torch
-from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
-from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import LCMScheduler
 from ...utils import (
@@ -129,7 +129,7 @@ EXAMPLE_DOC_STRING = """


 class LatentConsistencyModelImg2ImgPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin
+    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
 ):
    r"""
    Pipeline for image-to-image generation using a latent consistency model.
@@ -142,7 +142,6 @@ class LatentConsistencyModelImg2ImgPipeline(
        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
-        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
@@ -167,7 +166,7 @@ class LatentConsistencyModelImg2ImgPipeline(
    """

    model_cpu_offload_seq = "text_encoder->unet->vae"
-    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _optional_components = ["safety_checker", "feature_extractor"]
    _exclude_from_cpu_offload = ["safety_checker"]
    _callback_tensor_inputs = ["latents", "denoised", "prompt_embeds", "w_embedding"]

@@ -180,7 +179,6 @@ class LatentConsistencyModelImg2ImgPipeline(
        scheduler: LCMScheduler,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
-        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
        requires_safety_checker: bool = True,
    ):
        super().__init__()
@@ -193,7 +191,6 @@ class LatentConsistencyModelImg2ImgPipeline(
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
-            image_encoder=image_encoder,
        )

        if safety_checker is None and requires_safety_checker:
@@ -452,31 +449,6 @@ class LatentConsistencyModelImg2ImgPipeline(

        return prompt_embeds, negative_prompt_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
-    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
-        dtype = next(self.image_encoder.parameters()).dtype
-
-        if not isinstance(image, torch.Tensor):
-            image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
-        image = image.to(device=device, dtype=dtype)
-        if output_hidden_states:
-            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_enc_hidden_states = self.image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
-                num_images_per_prompt, dim=0
-            )
-            return image_enc_hidden_states, uncond_image_enc_hidden_states
-        else:
-            image_embeds = self.image_encoder(image).image_embeds
-            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_embeds = torch.zeros_like(image_embeds)
-
-            return image_embeds, uncond_image_embeds
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is None:
@@ -675,7 +647,6 @@ class LatentConsistencyModelImg2ImgPipeline(
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -724,8 +695,6 @@ class LatentConsistencyModelImg2ImgPipeline(
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
-            ip_adapter_image: (`PipelineImageInput`, *optional*):
-                Optional image input to work with IP Adapters.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
@@ -789,12 +758,6 @@ class LatentConsistencyModelImg2ImgPipeline(
        device = self._execution_device
        # do_classifier_free_guidance = guidance_scale > 1.0

-        if ip_adapter_image is not None:
-            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-            image_embeds, negative_image_embeds = self.encode_image(
-                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
-            )
-
        # 3. Encode input prompt
        lora_scale = (
            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
@@ -852,9 +815,6 @@ class LatentConsistencyModelImg2ImgPipeline(
        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None)

-        # 7.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
-
        # 8. LCM Multistep Sampling Loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)
@@ -869,7 +829,6 @@ class LatentConsistencyModelImg2ImgPipeline(
                    timestep_cond=w_embedding,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=self.cross_attention_kwargs,
-                    added_cond_kwargs=added_cond_kwargs,
                    return_dict=False,
                )[0]

--- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
+++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
@@ -19,11 +19,11 @@ import inspect
 from typing import Any, Callable, Dict, List, Optional, Union

 import torch
-from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

-from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
-from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...image_processor import VaeImageProcessor
+from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import LCMScheduler
 from ...utils import (
@@ -107,7 +107,7 @@ def retrieve_timesteps(


 class LatentConsistencyModelPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin
+    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
 ):
    r"""
    Pipeline for text-to-image generation using a latent consistency model.
@@ -120,7 +120,6 @@ class LatentConsistencyModelPipeline(
        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
-        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
@@ -145,7 +144,7 @@ class LatentConsistencyModelPipeline(
    """

    model_cpu_offload_seq = "text_encoder->unet->vae"
-    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _optional_components = ["safety_checker", "feature_extractor"]
    _exclude_from_cpu_offload = ["safety_checker"]
    _callback_tensor_inputs = ["latents", "denoised", "prompt_embeds", "w_embedding"]

@@ -158,7 +157,6 @@ class LatentConsistencyModelPipeline(
        scheduler: LCMScheduler,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
-        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
        requires_safety_checker: bool = True,
    ):
        super().__init__()
@@ -187,7 +185,6 @@ class LatentConsistencyModelPipeline(
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
-            image_encoder=image_encoder,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
@@ -436,31 +433,6 @@ class LatentConsistencyModelPipeline(

        return prompt_embeds, negative_prompt_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
-    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
-        dtype = next(self.image_encoder.parameters()).dtype
-
-        if not isinstance(image, torch.Tensor):
-            image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
-        image = image.to(device=device, dtype=dtype)
-        if output_hidden_states:
-            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_enc_hidden_states = self.image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
-                num_images_per_prompt, dim=0
-            )
-            return image_enc_hidden_states, uncond_image_enc_hidden_states
-        else:
-            image_embeds = self.image_encoder(image).image_embeds
-            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_embeds = torch.zeros_like(image_embeds)
-
-            return image_embeds, uncond_image_embeds
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is None:
@@ -609,7 +581,6 @@ class LatentConsistencyModelPipeline(
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -658,8 +629,6 @@ class LatentConsistencyModelPipeline(
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
-            ip_adapter_image: (`PipelineImageInput`, *optional*):
-                Optional image input to work with IP Adapters.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
@@ -728,12 +697,6 @@ class LatentConsistencyModelPipeline(
        device = self._execution_device
        # do_classifier_free_guidance = guidance_scale > 1.0

-        if ip_adapter_image is not None:
-            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-            image_embeds, negative_image_embeds = self.encode_image(
-                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
-            )
-
        # 3. Encode input prompt
        lora_scale = (
            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
@@ -785,9 +748,6 @@ class LatentConsistencyModelPipeline(
        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None)

-        # 7.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
-
        # 8. LCM MultiStep Sampling Loop:
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)
@@ -802,7 +762,6 @@ class LatentConsistencyModelPipeline(
                    timestep_cond=w_embedding,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=self.cross_attention_kwargs,
-                    added_cond_kwargs=added_cond_kwargs,
                    return_dict=False,
                )[0]

--- a/src/diffusers/pipelines/spectrogram_diffusion/continuous_encoder.py
+++ b/src/diffusers/pipelines/spectrogram_diffusion/continuous_encoder.py
--- a/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py
+++ b/src/diffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py
@@ -29,7 +29,7 @@ if is_onnx_available():
    from ..onnx_utils import OnnxRuntimeModel

 from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
-from .continuous_encoder import SpectrogramContEncoder
+from .continous_encoder import SpectrogramContEncoder
 from .notes_encoder import SpectrogramNotesEncoder


--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
@@ -18,11 +18,11 @@ from typing import Callable, Dict, List, Optional, Union
 import numpy as np
 import PIL.Image
 import torch
-from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
-from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import PIL_INTERPOLATION, deprecate, logging
 from ...utils.torch_utils import randn_tensor
@@ -72,9 +72,7 @@ def retrieve_latents(
        raise AttributeError("Could not access latents of provided encoder_output")


-class StableDiffusionInstructPix2PixPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin
-):
+class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
    r"""
    Pipeline for pixel-level image editing by following text instructions (based on Stable Diffusion).

@@ -85,7 +83,6 @@ class StableDiffusionInstructPix2PixPipeline(
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
-        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
@@ -108,7 +105,7 @@ class StableDiffusionInstructPix2PixPipeline(
    """

    model_cpu_offload_seq = "text_encoder->unet->vae"
-    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _optional_components = ["safety_checker", "feature_extractor"]
    _exclude_from_cpu_offload = ["safety_checker"]
    _callback_tensor_inputs = ["latents", "prompt_embeds", "image_latents"]

@@ -121,7 +118,6 @@ class StableDiffusionInstructPix2PixPipeline(
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
-        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
        requires_safety_checker: bool = True,
    ):
        super().__init__()
@@ -150,7 +146,6 @@ class StableDiffusionInstructPix2PixPipeline(
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
-            image_encoder=image_encoder,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
@@ -171,7 +166,6 @@ class StableDiffusionInstructPix2PixPipeline(
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
@@ -219,8 +213,6 @@ class StableDiffusionInstructPix2PixPipeline(
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            ip_adapter_image: (`PipelineImageInput`, *optional*):
-                Optional image input to work with IP Adapters.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
@@ -301,16 +293,6 @@ class StableDiffusionInstructPix2PixPipeline(
        self._guidance_scale = guidance_scale
        self._image_guidance_scale = image_guidance_scale

-        device = self._execution_device
-
-        if ip_adapter_image is not None:
-            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-            image_embeds, negative_image_embeds = self.encode_image(
-                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
-            )
-            if self.do_classifier_free_guidance:
-                image_embeds = torch.cat([image_embeds, negative_image_embeds, negative_image_embeds])
-
        if image is None:
            raise ValueError("`image` input cannot be undefined.")

@@ -385,9 +367,6 @@ class StableDiffusionInstructPix2PixPipeline(
        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

-        # 8.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
-
        # 9. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)
@@ -404,11 +383,7 @@ class StableDiffusionInstructPix2PixPipeline(

                # predict the noise residual
                noise_pred = self.unet(
-                    scaled_latent_model_input,
-                    t,
-                    encoder_hidden_states=prompt_embeds,
-                    added_cond_kwargs=added_cond_kwargs,
-                    return_dict=False,
+                    scaled_latent_model_input, t, encoder_hidden_states=prompt_embeds, return_dict=False
                )[0]

                # Hack:
@@ -628,31 +603,6 @@ class StableDiffusionInstructPix2PixPipeline(

        return prompt_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
-    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
-        dtype = next(self.image_encoder.parameters()).dtype
-
-        if not isinstance(image, torch.Tensor):
-            image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
-        image = image.to(device=device, dtype=dtype)
-        if output_hidden_states:
-            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_enc_hidden_states = self.image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
-                num_images_per_prompt, dim=0
-            )
-            return image_enc_hidden_states, uncond_image_enc_hidden_states
-        else:
-            image_embeds = self.image_encoder(image).image_embeds
-            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_embeds = torch.zeros_like(image_embeds)
-
-            return image_embeds, uncond_image_embeds
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is None:
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py
@@ -19,11 +19,11 @@ from typing import Any, Callable, Dict, List, Optional, Union
 import numpy as np
 import PIL.Image
 import torch
-from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

-from ...image_processor import PipelineImageInput, VaeImageProcessorLDM3D
-from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
-from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...image_processor import VaeImageProcessorLDM3D
+from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
@@ -82,7 +82,7 @@ class LDM3DPipelineOutput(BaseOutput):


 class StableDiffusionLDM3DPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin
+    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
 ):
    r"""
    Pipeline for text-to-image and 3D generation using LDM3D.
@@ -95,7 +95,6 @@ class StableDiffusionLDM3DPipeline(
        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
-        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
@@ -118,7 +117,7 @@ class StableDiffusionLDM3DPipeline(
    """

    model_cpu_offload_seq = "text_encoder->unet->vae"
-    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _optional_components = ["safety_checker", "feature_extractor"]
    _exclude_from_cpu_offload = ["safety_checker"]

    def __init__(
@@ -130,7 +129,6 @@ class StableDiffusionLDM3DPipeline(
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
-        image_encoder: Optional[CLIPVisionModelWithProjection],
        requires_safety_checker: bool = True,
    ):
        super().__init__()
@@ -159,7 +157,6 @@ class StableDiffusionLDM3DPipeline(
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
-            image_encoder=image_encoder,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessorLDM3D(vae_scale_factor=self.vae_scale_factor)
@@ -413,31 +410,6 @@ class StableDiffusionLDM3DPipeline(

        return prompt_embeds, negative_prompt_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
-    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
-        dtype = next(self.image_encoder.parameters()).dtype
-
-        if not isinstance(image, torch.Tensor):
-            image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
-        image = image.to(device=device, dtype=dtype)
-        if output_hidden_states:
-            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_enc_hidden_states = self.image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
-                num_images_per_prompt, dim=0
-            )
-            return image_enc_hidden_states, uncond_image_enc_hidden_states
-        else:
-            image_embeds = self.image_encoder(image).image_embeds
-            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_embeds = torch.zeros_like(image_embeds)
-
-            return image_embeds, uncond_image_embeds
-
    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is None:
            has_nsfw_concept = None
@@ -557,7 +529,6 @@ class StableDiffusionLDM3DPipeline(
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -602,8 +573,6 @@ class StableDiffusionLDM3DPipeline(
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            ip_adapter_image: (`PipelineImageInput`, *optional*):
-                Optional image input to work with IP Adapters.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
@@ -653,14 +622,6 @@ class StableDiffusionLDM3DPipeline(
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

-        if ip_adapter_image is not None:
-            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-            image_embeds, negative_image_embeds = self.encode_image(
-                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
-            )
-            if do_classifier_free_guidance:
-                image_embeds = torch.cat([negative_image_embeds, image_embeds])
-
        # 3. Encode input prompt
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
@@ -698,9 +659,6 @@ class StableDiffusionLDM3DPipeline(
        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

-        # 6.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
-
        # 7. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
@@ -715,7 +673,6 @@ class StableDiffusionLDM3DPipeline(
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
-                    added_cond_kwargs=added_cond_kwargs,
                    return_dict=False,
                )[0]

--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py
@@ -16,11 +16,11 @@ import inspect
 from typing import Any, Callable, Dict, List, Optional, Union

 import torch
-from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

-from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
-from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...image_processor import VaeImageProcessor
+from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import DDIMScheduler
 from ...utils import (
@@ -59,19 +59,13 @@ EXAMPLE_DOC_STRING = """
 """


-class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin):
+class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
    r"""
    Pipeline for text-to-image generation using MultiDiffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

-    The pipeline also inherits the following loading methods:
-        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
-        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
-
    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
@@ -93,7 +87,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
    """

    model_cpu_offload_seq = "text_encoder->unet->vae"
-    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _optional_components = ["safety_checker", "feature_extractor"]
    _exclude_from_cpu_offload = ["safety_checker"]

    def __init__(
@@ -105,7 +99,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
        scheduler: DDIMScheduler,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
-        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
        requires_safety_checker: bool = True,
    ):
        super().__init__()
@@ -134,7 +127,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
-            image_encoder=image_encoder,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
@@ -371,31 +363,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM

        return prompt_embeds, negative_prompt_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
-    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
-        dtype = next(self.image_encoder.parameters()).dtype
-
-        if not isinstance(image, torch.Tensor):
-            image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
-        image = image.to(device=device, dtype=dtype)
-        if output_hidden_states:
-            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_enc_hidden_states = self.image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
-                num_images_per_prompt, dim=0
-            )
-            return image_enc_hidden_states, uncond_image_enc_hidden_states
-        else:
-            image_embeds = self.image_encoder(image).image_embeds
-            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_embeds = torch.zeros_like(image_embeds)
-
-            return image_embeds, uncond_image_embeds
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is None:
@@ -562,7 +529,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -612,8 +578,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            ip_adapter_image: (`PipelineImageInput`, *optional*):
-                Optional image input to work with IP Adapters.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
@@ -668,14 +632,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

-        if ip_adapter_image is not None:
-            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-            image_embeds, negative_image_embeds = self.encode_image(
-                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
-            )
-            if do_classifier_free_guidance:
-                image_embeds = torch.cat([negative_image_embeds, image_embeds])
-
        # 3. Encode input prompt
        text_encoder_lora_scale = (
            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
@@ -725,9 +681,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

-        # 7.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
-
        # 8. Denoising loop
        # Each denoising step also includes refinement of the latents with respect to the
        # views.
@@ -790,7 +743,6 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
                        t,
                        encoder_hidden_states=prompt_embeds_input,
                        cross_attention_kwargs=cross_attention_kwargs,
-                        added_cond_kwargs=added_cond_kwargs,
                    ).sample

                    # perform guidance
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py
@@ -17,11 +17,11 @@ from typing import Any, Callable, Dict, List, Optional, Union

 import torch
 import torch.nn.functional as F
-from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

-from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
-from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...image_processor import VaeImageProcessor
+from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
@@ -98,17 +98,13 @@ class CrossAttnStoreProcessor:


 # Modified to get self-attention guidance scale in this paper (https://arxiv.org/pdf/2210.00939.pdf) as an input
-class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin):
+class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
    r"""
    Pipeline for text-to-image generation using Stable Diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

-    The pipeline also inherits the following loading methods:
-        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
-
    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
@@ -130,7 +126,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
    """

    model_cpu_offload_seq = "text_encoder->unet->vae"
-    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _optional_components = ["safety_checker", "feature_extractor"]
    _exclude_from_cpu_offload = ["safety_checker"]

    def __init__(
@@ -142,7 +138,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
-        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
        requires_safety_checker: bool = True,
    ):
        super().__init__()
@@ -155,7 +150,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
-            image_encoder=image_encoder,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
@@ -392,31 +386,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,

        return prompt_embeds, negative_prompt_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
-    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
-        dtype = next(self.image_encoder.parameters()).dtype
-
-        if not isinstance(image, torch.Tensor):
-            image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
-        image = image.to(device=device, dtype=dtype)
-        if output_hidden_states:
-            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_enc_hidden_states = self.image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
-                num_images_per_prompt, dim=0
-            )
-            return image_enc_hidden_states, uncond_image_enc_hidden_states
-        else:
-            image_embeds = self.image_encoder(image).image_embeds
-            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_embeds = torch.zeros_like(image_embeds)
-
-            return image_embeds, uncond_image_embeds
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is None:
@@ -550,7 +519,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -597,8 +565,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            ip_adapter_image: (`PipelineImageInput`, *optional*):
-                Optional image input to work with IP Adapters.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
@@ -652,14 +618,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
        # `sag_scale = 0` means no self-attention guidance
        do_self_attention_guidance = sag_scale > 0.0

-        if ip_adapter_image is not None:
-            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-            image_embeds, negative_image_embeds = self.encode_image(
-                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
-            )
-            if do_classifier_free_guidance:
-                image_embeds = torch.cat([negative_image_embeds, image_embeds])
-
        # 3. Encode input prompt
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
@@ -697,10 +655,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

-        # 6.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
-        added_uncond_kwargs = {"image_embeds": negative_image_embeds} if ip_adapter_image is not None else None
-
        # 7. Denoising loop
        store_processor = CrossAttnStoreProcessor()
        self.unet.mid_block.attentions[0].transformer_blocks[0].attn1.processor = store_processor
@@ -726,7 +680,6 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
                        t,
                        encoder_hidden_states=prompt_embeds,
                        cross_attention_kwargs=cross_attention_kwargs,
-                        added_cond_kwargs=added_cond_kwargs,
                    ).sample

                    # perform guidance
@@ -750,12 +703,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
                            )
                            uncond_emb, _ = prompt_embeds.chunk(2)
                            # forward and give guidance
-                            degraded_pred = self.unet(
-                                degraded_latents,
-                                t,
-                                encoder_hidden_states=uncond_emb,
-                                added_cond_kwargs=added_uncond_kwargs,
-                            ).sample
+                            degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=uncond_emb).sample
                            noise_pred += sag_scale * (noise_pred_uncond - degraded_pred)
                        else:
                            # DDIM-like prediction of x0
@@ -767,12 +715,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin,
                                pred_x0, cond_attn, map_size, t, self.pred_epsilon(latents, noise_pred, t)
                            )
                            # forward and give guidance
-                            degraded_pred = self.unet(
-                                degraded_latents,
-                                t,
-                                encoder_hidden_states=prompt_embeds,
-                                added_cond_kwargs=added_cond_kwargs,
-                            ).sample
+                            degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=prompt_embeds).sample
                            noise_pred += sag_scale * (noise_pred - degraded_pred)

                    # compute the previous noisy sample x_t -> x_t-1
--- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
+++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
@@ -5,12 +5,10 @@ from typing import Callable, List, Optional, Union
 import numpy as np
 import torch
 from packaging import version
-from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from ...configuration_utils import FrozenDict
-from ...image_processor import PipelineImageInput
-from ...loaders import IPAdapterMixin
-from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models import AutoencoderKL, UNet2DConditionModel
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import deprecate, logging
 from ...utils.torch_utils import randn_tensor
@@ -22,16 +20,13 @@ from .safety_checker import SafeStableDiffusionSafetyChecker
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


-class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
+class StableDiffusionPipelineSafe(DiffusionPipeline):
    r"""
    Pipeline based on the [`StableDiffusionPipeline`] for text-to-image generation using Safe Latent Diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

-    The pipeline also inherits the following loading methods:
-        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
-
    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
@@ -53,7 +48,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
    """

    model_cpu_offload_seq = "text_encoder->unet->vae"
-    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _optional_components = ["safety_checker", "feature_extractor"]

    def __init__(
        self,
@@ -64,7 +59,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: SafeStableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
-        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
        requires_safety_checker: bool = True,
    ):
        super().__init__()
@@ -146,7 +140,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
-            image_encoder=image_encoder,
        )
        self._safety_text_concept = safety_concept
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
@@ -474,31 +467,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
                noise_guidance = noise_guidance - noise_guidance_safety
        return noise_guidance, safety_momentum

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
-    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
-        dtype = next(self.image_encoder.parameters()).dtype
-
-        if not isinstance(image, torch.Tensor):
-            image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
-        image = image.to(device=device, dtype=dtype)
-        if output_hidden_states:
-            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_enc_hidden_states = self.image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
-                num_images_per_prompt, dim=0
-            )
-            return image_enc_hidden_states, uncond_image_enc_hidden_states
-        else:
-            image_embeds = self.image_encoder(image).image_embeds
-            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_embeds = torch.zeros_like(image_embeds)
-
-            return image_embeds, uncond_image_embeds
-
    @torch.no_grad()
    def __call__(
        self,
@@ -512,7 +480,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -554,8 +521,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
-            ip_adapter_image: (`PipelineImageInput`, *optional*):
-                Optional image input to work with IP Adapters.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
@@ -623,17 +588,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
        if not enable_safety_guidance:
            warnings.warn("Safety checker disabled!")

-        if ip_adapter_image is not None:
-            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-            image_embeds, negative_image_embeds = self.encode_image(
-                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
-            )
-            if do_classifier_free_guidance:
-                if enable_safety_guidance:
-                    image_embeds = torch.cat([negative_image_embeds, image_embeds, image_embeds])
-                else:
-                    image_embeds = torch.cat([negative_image_embeds, image_embeds])
-
        # 3. Encode input prompt
        prompt_embeds = self._encode_prompt(
            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, enable_safety_guidance
@@ -659,9 +613,6 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
        # 6. Prepare extra step kwargs.
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

-        # 6.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
-
        safety_momentum = None

        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
@@ -676,9 +627,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
-                noise_pred = self.unet(
-                    latent_model_input, t, encoder_hidden_states=prompt_embeds, added_cond_kwargs=added_cond_kwargs
-                ).sample
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample

                # perform guidance
                if do_classifier_free_guidance:
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -820,7 +820,9 @@ def _is_torch_fp16_available(device):

    try:
        x = torch.zeros((2, 2), dtype=torch.float16).to(device)
-        _ = x @ x
+        _ = torch.mul(x, x)
+        return True
+
    except Exception as e:
        if device.type == "cuda":
            raise ValueError(
@@ -838,7 +840,9 @@ def _is_torch_fp64_available(device):

    try:
        x = torch.zeros((2, 2), dtype=torch.float64).to(device)
-        _ = x @ x
+        _ = torch.mul(x, x)
+        return True
+
    except Exception as e:
        if device.type == "cuda":
            raise ValueError(
--- a/tests/pipelines/controlnet/test_controlnet_img2img.py
+++ b/tests/pipelines/controlnet/test_controlnet_img2img.py
@@ -134,7 +134,6 @@ class ControlNetImg2ImgPipelineFastTests(
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
-            "image_encoder": None,
        }
        return components

@@ -274,7 +273,6 @@ class StableDiffusionMultiControlNetPipelineFastTests(
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
-            "image_encoder": None,
        }
        return components

--- a/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py
+++ b/tests/pipelines/latent_consistency_models/test_latent_consistency_models.py
@@ -87,7 +87,6 @@ class LatentConsistencyModelPipelineFastTests(PipelineLatentTesterMixin, Pipelin
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
-            "image_encoder": None,
            "requires_safety_checker": False,
        }
        return components
--- a/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py
+++ b/tests/pipelines/latent_consistency_models/test_latent_consistency_models_img2img.py
@@ -97,7 +97,6 @@ class LatentConsistencyModelImg2ImgPipelineFastTests(
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
-            "image_encoder": None,
            "requires_safety_checker": False,
        }
        return components
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
@@ -108,7 +108,6 @@ class StableDiffusionInstructPix2PixPipelineFastTests(
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
-            "image_encoder": None,
        }
        return components

--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py
@@ -93,7 +93,6 @@ class StableDiffusionLDM3DPipelineFastTests(unittest.TestCase):
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
-            "image_encoder": None,
        }
        return components

--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
@@ -91,7 +91,6 @@ class StableDiffusionPanoramaPipelineFastTests(PipelineLatentTesterMixin, Pipeli
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
-            "image_encoder": None,
        }
        return components

--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
@@ -93,7 +93,6 @@ class StableDiffusionSAGPipelineFastTests(PipelineLatentTesterMixin, PipelineTes
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
-            "image_encoder": None,
        }
        return components
Author	SHA1	Message	Date
Dhruv Nair	644846aa14	update	2023-12-08 12:28:50 +00:00
Dhruv Nair	6a90d8a023	Merge branch 'main' into compile-test-fix	2023-12-08 11:10:18 +00:00
Dhruv Nair	7270cbf421	update	2023-12-08 09:58:13 +00:00